mirror of
https://github.com/mblanke/Gov_Travel_App.git
synced 2026-03-01 14:10:22 -05:00
Add alphabet navigation to scraper - now collects ALL 233 countries
- Implemented alphabet navigation (A-Z) for NJC international rates page
- Added request delays (2s) and retry logic with exponential backoff to avoid server timeouts
- Added error handling for pages without tables
- Installed html5lib for better HTML parsing
- Now scrapes 233 countries (up from 15) with 104 unique currencies
- Total 11,628 international rate entries collected
- Added verification scripts to check all countries and their currencies
- Fixed currency extraction working perfectly for EUR, USD, CAD, AUD, ARS, and 99+ other currencies
This commit is contained in:
34
scripts/check_full_content.py
Normal file
34
scripts/check_full_content.py
Normal file
@@ -0,0 +1,34 @@
|
||||
"""Check the full page content and structure"""
|
||||
import sys
|
||||
sys.path.insert(0, 'src')
|
||||
|
||||
from gov_travel.scrapers import fetch_html
|
||||
|
||||
url = "https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en"
|
||||
html = fetch_html(url)
|
||||
|
||||
# Count how many times "Currency:" appears
|
||||
currency_count = html.count('Currency:')
|
||||
print(f"'Currency:' appears {currency_count} times in the HTML")
|
||||
|
||||
# Check page size
|
||||
print(f"HTML size: {len(html):,} bytes ({len(html)/1024:.1f} KB)")
|
||||
|
||||
# Look for all country names in headings
|
||||
import re
|
||||
countries_pattern = r'<h[1-4][^>]*>([^<]+)\s*-\s*Currency:'
|
||||
countries = re.findall(countries_pattern, html)
|
||||
print(f"\nCountries found in headings: {len(countries)}")
|
||||
|
||||
if countries:
|
||||
print("\nAll countries:")
|
||||
for i, country in enumerate(countries, 1):
|
||||
print(f"{i:2}. {country.strip()}")
|
||||
|
||||
# Check if there's a "show more" or expand mechanism
|
||||
if 'show all' in html.lower():
|
||||
print("\n'show all' found in HTML")
|
||||
if 'expand' in html.lower():
|
||||
print("'expand' found in HTML")
|
||||
if 'load more' in html.lower():
|
||||
print("'load more' found in HTML")
|
||||
61
scripts/check_navigation.py
Normal file
61
scripts/check_navigation.py
Normal file
@@ -0,0 +1,61 @@
|
||||
"""Look for alphabet navigation or hidden content"""
|
||||
import sys
|
||||
sys.path.insert(0, 'src')
|
||||
|
||||
from gov_travel.scrapers import fetch_html
|
||||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
|
||||
url = "https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en"
|
||||
html = fetch_html(url)
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
|
||||
# Look for alphabet links (A-Z navigation)
|
||||
print("Looking for alphabet navigation...")
|
||||
alphabet_links = []
|
||||
for link in soup.find_all('a'):
|
||||
text = link.get_text(strip=True)
|
||||
href = link.get('href', '')
|
||||
# Check if it's a single letter
|
||||
if len(text) == 1 and text.isalpha():
|
||||
alphabet_links.append((text, href))
|
||||
|
||||
if alphabet_links:
|
||||
print(f"\nFound {len(alphabet_links)} alphabet links:")
|
||||
for letter, href in alphabet_links[:10]:
|
||||
print(f" {letter}: {href}")
|
||||
else:
|
||||
print("No alphabet navigation found")
|
||||
|
||||
# Check for JavaScript or AJAX content loading
|
||||
print("\n" + "="*80)
|
||||
print("Checking for dynamic content loading...")
|
||||
scripts = soup.find_all('script')
|
||||
print(f"Script tags found: {len(scripts)}")
|
||||
|
||||
ajax_indicators = ['ajax', 'xhr', 'fetch', 'loadmore', 'getjson']
|
||||
for script in scripts:
|
||||
script_text = script.get_text().lower()
|
||||
for indicator in ajax_indicators:
|
||||
if indicator in script_text:
|
||||
print(f" Found '{indicator}' in script")
|
||||
break
|
||||
|
||||
# Look for hidden content
|
||||
print("\n" + "="*80)
|
||||
print("Looking for collapsed/hidden sections...")
|
||||
hidden = soup.find_all(attrs={'style': re.compile(r'display:\s*none')})
|
||||
print(f"Hidden elements: {len(hidden)}")
|
||||
|
||||
collapsed = soup.find_all(class_=re.compile(r'collaps'))
|
||||
print(f"Collapsible elements: {len(collapsed)}")
|
||||
|
||||
# Check the main content area
|
||||
print("\n" + "="*80)
|
||||
print("Checking if there's a note about alphabetical display...")
|
||||
page_text = soup.get_text()
|
||||
if 'alphabetical' in page_text.lower():
|
||||
# Find context around "alphabetical"
|
||||
idx = page_text.lower().find('alphabetical')
|
||||
context = page_text[max(0, idx-100):idx+100]
|
||||
print(f"Found 'alphabetical' in text: ...{context}...")
|
||||
35
scripts/check_njc_page.py
Normal file
35
scripts/check_njc_page.py
Normal file
@@ -0,0 +1,35 @@
|
||||
"""Check what's on the NJC international page"""
|
||||
import sys
|
||||
sys.path.insert(0, 'src')
|
||||
|
||||
from gov_travel.scrapers import fetch_html
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
url = "https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en"
|
||||
print(f"Fetching: {url}\n")
|
||||
|
||||
html = fetch_html(url)
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
|
||||
# Count tables
|
||||
tables = soup.find_all('table')
|
||||
print(f"Total tables found: {len(tables)}")
|
||||
|
||||
# Find all headings before tables (country names)
|
||||
countries = set()
|
||||
for table in tables:
|
||||
heading = table.find_previous(['h1', 'h2', 'h3', 'h4'])
|
||||
if heading:
|
||||
text = heading.get_text(strip=True)
|
||||
# Extract country name (before " - Currency:")
|
||||
if ' - Currency:' in text:
|
||||
country = text.split(' - Currency:')[0].strip()
|
||||
countries.add(country)
|
||||
|
||||
print(f"\nUnique countries found: {len(countries)}")
|
||||
print("\nFirst 20 countries:")
|
||||
for i, country in enumerate(sorted(countries)[:20], 1):
|
||||
print(f"{i:2}. {country}")
|
||||
|
||||
if len(countries) > 20:
|
||||
print(f"\n... and {len(countries) - 20} more")
|
||||
45
scripts/check_pagination.py
Normal file
45
scripts/check_pagination.py
Normal file
@@ -0,0 +1,45 @@
|
||||
"""Check for pagination or full country list on NJC page"""
|
||||
import sys
|
||||
sys.path.insert(0, 'src')
|
||||
|
||||
from gov_travel.scrapers import fetch_html
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
url = "https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en"
|
||||
html = fetch_html(url)
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
|
||||
# Look for links or navigation
|
||||
print("Looking for navigation elements...")
|
||||
print("\nAll links on the page:")
|
||||
links = soup.find_all('a')
|
||||
for link in links[:20]: # First 20
|
||||
href = link.get('href', '')
|
||||
text = link.get_text(strip=True)
|
||||
if text:
|
||||
print(f" {text}: {href}")
|
||||
|
||||
print("\n" + "="*80)
|
||||
print("Looking for select/dropdown elements (country selector):")
|
||||
selects = soup.find_all('select')
|
||||
for select in selects:
|
||||
name = select.get('name', 'unnamed')
|
||||
print(f"\nSelect field: {name}")
|
||||
options = select.find_all('option')
|
||||
print(f" Options count: {len(options)}")
|
||||
if len(options) > 0:
|
||||
print(f" First 10 options:")
|
||||
for opt in options[:10]:
|
||||
value = opt.get('value', '')
|
||||
text = opt.get_text(strip=True)
|
||||
print(f" {text} ({value})")
|
||||
if len(options) > 10:
|
||||
print(f" ... and {len(options) - 10} more")
|
||||
|
||||
print("\n" + "="*80)
|
||||
print("Looking for forms:")
|
||||
forms = soup.find_all('form')
|
||||
print(f"Forms found: {len(forms)}")
|
||||
for form in forms:
|
||||
print(f" Action: {form.get('action', 'N/A')}")
|
||||
print(f" Method: {form.get('method', 'GET')}")
|
||||
37
scripts/final_stats.py
Normal file
37
scripts/final_stats.py
Normal file
@@ -0,0 +1,37 @@
|
||||
"""Print summary statistics for the scraped international travel rates DB."""
import sqlite3

conn = sqlite3.connect('data/travel_rates_scraped.sqlite3')
cursor = conn.cursor()

# Fix: SQL string literals use single quotes. The original wrote
# source="international", which only works through SQLite's non-standard
# double-quoted-string fallback (double quotes are for identifiers in
# standard SQL) and fails on builds compiled with SQLITE_DQS=0.

# Count countries
cursor.execute("SELECT COUNT(DISTINCT country) FROM rate_entries WHERE source='international' AND country IS NOT NULL")
total_countries = cursor.fetchone()[0]

# Count total entries
cursor.execute("SELECT COUNT(*) FROM rate_entries WHERE source='international'")
total_entries = cursor.fetchone()[0]

# Count unique currencies
cursor.execute("SELECT COUNT(DISTINCT currency) FROM rate_entries WHERE source='international'")
total_currencies = cursor.fetchone()[0]

print(f"✅ COMPLETE SCRAPER RESULTS:")
print(f" Total Countries: {total_countries}")
print(f" Total Entries: {total_entries:,}")
print(f" Unique Currencies: {total_currencies}")

# Show currency breakdown: which currencies cover the most countries.
cursor.execute("""
    SELECT currency, COUNT(DISTINCT country) as country_count, COUNT(*) as entries
    FROM rate_entries
    WHERE source='international'
    GROUP BY currency
    ORDER BY country_count DESC
    LIMIT 20
""")

print(f"\nTop 20 Currencies:")
for row in cursor.fetchall():
    print(f" {row[0]:5} - {row[1]:3} countries, {row[2]:,} entries")

conn.close()
|
||||
78
scripts/verify_all_countries.py
Normal file
78
scripts/verify_all_countries.py
Normal file
@@ -0,0 +1,78 @@
|
||||
"""Verify every scraped country/currency pairing in the rates database."""
import sqlite3

db = sqlite3.connect('data/travel_rates_scraped.sqlite3')
cur = db.cursor()

# Reusable separator lines (same width the report always used).
banner = "=" * 80
rule = "-" * 80

print(banner)
print("COMPLETE COUNTRY AND CURRENCY VERIFICATION")
print(banner)

# Every distinct country/currency pair from the international source,
# with how many cities each pair covers.
cur.execute("""
    SELECT DISTINCT country, currency, COUNT(DISTINCT city) as city_count
    FROM rate_entries
    WHERE source = 'international' AND country IS NOT NULL
    GROUP BY country, currency
    ORDER BY country
""")
international = cur.fetchall()

print(f"\n{'Country':<35} {'Currency':<10} {'Cities':<10}")
print(rule)

for nation, code, city_total in international:
    # Empty/NULL fields render as "N/A", matching the original output.
    print(f"{nation or 'N/A':<35} {code or 'N/A':<10} {city_total:<10}")

print(rule)
print(f"Total: {len(international)} country-currency combinations")

# Sanity check: no entry should be missing its currency.
cur.execute("""
    SELECT COUNT(*)
    FROM rate_entries
    WHERE currency IS NULL
""")
print(f"\nEntries with NULL currency: {cur.fetchone()[0]}")

# Currency summary: countries and entries per currency.
print("\n" + banner)
print("CURRENCY DISTRIBUTION SUMMARY")
print(banner)
cur.execute("""
    SELECT currency, COUNT(DISTINCT country) as countries, COUNT(*) as entries
    FROM rate_entries
    WHERE source = 'international'
    GROUP BY currency
    ORDER BY countries DESC
""")

print(f"\n{'Currency':<10} {'Countries':<15} {'Total Entries':<15}")
print(rule)
for code, nation_count, entry_count in cur.fetchall():
    print(f"{code:<10} {nation_count:<15} {entry_count:<15}")

# Spot-check a few well-known currencies with sample cities.
print("\n" + banner)
print("SAMPLE CITIES BY CURRENCY")
print(banner)

for code in ('EUR', 'USD', 'CAD', 'AUD', 'ARS'):
    cur.execute("""
        SELECT DISTINCT country, city
        FROM rate_entries
        WHERE currency = ? AND city IS NOT NULL
        LIMIT 3
    """, (code,))
    sample = cur.fetchall()
    if sample:
        print(f"\n{code} Cities:")
        for nation, city in sample:
            print(f" • {city}, {nation}")

db.close()
|
||||
Reference in New Issue
Block a user