Add alphabet navigation to scraper - now collects ALL 233 countries

- Implemented alphabet navigation (A-Z) for NJC international rates page
- Added request delays (2s) and retry logic with exponential backoff to avoid server timeouts
- Added error handling for pages without tables
- Installed html5lib for better HTML parsing
- Now scrapes 233 countries (up from 15) with 104 unique currencies
- Total 11,628 international rate entries collected
- Added verification scripts to check all countries and their currencies
- Fixed currency extraction; it now works correctly for EUR, USD, CAD, AUD, ARS, and 99+ other currencies
This commit is contained in:
2026-01-13 09:27:21 -05:00
parent 15094ac94b
commit 969ba062f7
7 changed files with 371 additions and 21 deletions

View File

@@ -0,0 +1,34 @@
"""Check the full page content and structure"""
import re
import sys

sys.path.insert(0, 'src')
from gov_travel.scrapers import fetch_html

url = "https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en"
html = fetch_html(url)

# How many country sections does the raw HTML contain?
currency_count = html.count('Currency:')
print(f"'Currency:' appears {currency_count} times in the HTML")

# Page size gives a rough sense of whether content could be missing.
print(f"HTML size: {len(html):,} bytes ({len(html)/1024:.1f} KB)")

# Pull every country name out of the section headings
# (headings look like "<h3>France - Currency: EUR</h3>").
countries_pattern = r'<h[1-4][^>]*>([^<]+)\s*-\s*Currency:'
countries = re.findall(countries_pattern, html)
print(f"\nCountries found in headings: {len(countries)}")
if countries:
    print("\nAll countries:")
    for i, country in enumerate(countries, 1):
        print(f"{i:2}. {country.strip()}")

# Look for hints of a "show more"/expand mechanism hiding extra content.
lowered = html.lower()
if 'show all' in lowered:
    print("\n'show all' found in HTML")
if 'expand' in lowered:
    print("'expand' found in HTML")
if 'load more' in lowered:
    print("'load more' found in HTML")

View File

@@ -0,0 +1,61 @@
"""Look for alphabet navigation or hidden content"""
import re
import sys

sys.path.insert(0, 'src')
from bs4 import BeautifulSoup

from gov_travel.scrapers import fetch_html

url = "https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en"
soup = BeautifulSoup(fetch_html(url), 'html.parser')

# A-Z pagination usually shows up as single-letter anchor links.
print("Looking for alphabet navigation...")
alphabet_links = []
for anchor in soup.find_all('a'):
    label = anchor.get_text(strip=True)
    if len(label) == 1 and label.isalpha():
        alphabet_links.append((label, anchor.get('href', '')))

if alphabet_links:
    print(f"\nFound {len(alphabet_links)} alphabet links:")
    for letter, href in alphabet_links[:10]:
        print(f" {letter}: {href}")
else:
    print("No alphabet navigation found")

# Content might instead be injected client-side; scan scripts for AJAX hints.
print("\n" + "=" * 80)
print("Checking for dynamic content loading...")
scripts = soup.find_all('script')
print(f"Script tags found: {len(scripts)}")
ajax_indicators = ['ajax', 'xhr', 'fetch', 'loadmore', 'getjson']
for script in scripts:
    script_text = script.get_text().lower()
    for indicator in ajax_indicators:
        if indicator in script_text:
            print(f" Found '{indicator}' in script")
            break

# Or the data could be present in the DOM but visually hidden/collapsed.
print("\n" + "=" * 80)
print("Looking for collapsed/hidden sections...")
hidden = soup.find_all(attrs={'style': re.compile(r'display:\s*none')})
print(f"Hidden elements: {len(hidden)}")
collapsed = soup.find_all(class_=re.compile(r'collaps'))
print(f"Collapsible elements: {len(collapsed)}")

# Finally, check the visible text for a note about alphabetical display.
print("\n" + "=" * 80)
print("Checking if there's a note about alphabetical display...")
page_text = soup.get_text()
if 'alphabetical' in page_text.lower():
    # Show ~100 characters of context on either side of the match.
    idx = page_text.lower().find('alphabetical')
    context = page_text[max(0, idx - 100):idx + 100]
    print(f"Found 'alphabetical' in text: ...{context}...")

35
scripts/check_njc_page.py Normal file
View File

@@ -0,0 +1,35 @@
"""Check what's on the NJC international page"""
import sys

sys.path.insert(0, 'src')
from bs4 import BeautifulSoup

from gov_travel.scrapers import fetch_html

url = "https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en"
print(f"Fetching: {url}\n")
soup = BeautifulSoup(fetch_html(url), 'html.parser')

tables = soup.find_all('table')
print(f"Total tables found: {len(tables)}")

# Each rate table sits below a heading like "France - Currency: EUR";
# walk back from every table to its nearest heading to recover the country.
countries = set()
for table in tables:
    heading = table.find_previous(['h1', 'h2', 'h3', 'h4'])
    if heading is None:
        continue
    text = heading.get_text(strip=True)
    if ' - Currency:' in text:
        countries.add(text.split(' - Currency:')[0].strip())

print(f"\nUnique countries found: {len(countries)}")
print("\nFirst 20 countries:")
for i, country in enumerate(sorted(countries)[:20], 1):
    print(f"{i:2}. {country}")
if len(countries) > 20:
    print(f"\n... and {len(countries) - 20} more")

View File

@@ -0,0 +1,45 @@
"""Check for pagination or full country list on NJC page"""
import sys

sys.path.insert(0, 'src')
from bs4 import BeautifulSoup

from gov_travel.scrapers import fetch_html

url = "https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en"
soup = BeautifulSoup(fetch_html(url), 'html.parser')

# Dump the first batch of links -- navigation usually lives here.
print("Looking for navigation elements...")
print("\nAll links on the page:")
for anchor in soup.find_all('a')[:20]:  # first 20 only
    href = anchor.get('href', '')
    label = anchor.get_text(strip=True)
    if label:
        print(f" {label}: {href}")

# A <select> full of countries would explain the truncated page.
print("\n" + "=" * 80)
print("Looking for select/dropdown elements (country selector):")
for select in soup.find_all('select'):
    print(f"\nSelect field: {select.get('name', 'unnamed')}")
    options = select.find_all('option')
    print(f" Options count: {len(options)}")
    if options:
        print(" First 10 options:")
        for opt in options[:10]:
            value = opt.get('value', '')
            text = opt.get_text(strip=True)
            print(f" {text} ({value})")
        if len(options) > 10:
            print(f" ... and {len(options) - 10} more")

# Forms could indicate a POST-driven country filter.
print("\n" + "=" * 80)
print("Looking for forms:")
forms = soup.find_all('form')
print(f"Forms found: {len(forms)}")
for form in forms:
    print(f" Action: {form.get('action', 'N/A')}")
    print(f" Method: {form.get('method', 'GET')}")

37
scripts/final_stats.py Normal file
View File

@@ -0,0 +1,37 @@
"""Print summary statistics for the scraped international travel rates."""
import sqlite3

conn = sqlite3.connect('data/travel_rates_scraped.sqlite3')
cursor = conn.cursor()

# Single quotes for SQL string literals: double quotes are identifier
# quoting in standard SQL and only work as literals via a SQLite fallback.
cursor.execute(
    "SELECT COUNT(DISTINCT country) FROM rate_entries "
    "WHERE source='international' AND country IS NOT NULL"
)
total_countries = cursor.fetchone()[0]

cursor.execute("SELECT COUNT(*) FROM rate_entries WHERE source='international'")
total_entries = cursor.fetchone()[0]

cursor.execute("SELECT COUNT(DISTINCT currency) FROM rate_entries WHERE source='international'")
total_currencies = cursor.fetchone()[0]

print("✅ COMPLETE SCRAPER RESULTS:")
print(f" Total Countries: {total_countries}")
print(f" Total Entries: {total_entries:,}")
print(f" Unique Currencies: {total_currencies}")

# Breakdown: which currencies cover the most countries/entries.
cursor.execute("""
    SELECT currency, COUNT(DISTINCT country) as country_count, COUNT(*) as entries
    FROM rate_entries
    WHERE source='international'
    GROUP BY currency
    ORDER BY country_count DESC
    LIMIT 20
""")
print("\nTop 20 Currencies:")
for currency, country_count, entries in cursor.fetchall():
    print(f" {currency:5} - {country_count:3} countries, {entries:,} entries")
conn.close()

View File

@@ -0,0 +1,78 @@
"""Verify every scraped country/currency combination in the database."""
import sqlite3

conn = sqlite3.connect('data/travel_rates_scraped.sqlite3')
cursor = conn.cursor()

print("=" * 80)
print("COMPLETE COUNTRY AND CURRENCY VERIFICATION")
print("=" * 80)

# One row per (country, currency) pair, with how many cities each covers.
cursor.execute("""
    SELECT DISTINCT country, currency, COUNT(DISTINCT city) as city_count
    FROM rate_entries
    WHERE source = 'international' AND country IS NOT NULL
    GROUP BY country, currency
    ORDER BY country
""")
international = cursor.fetchall()
print(f"\n{'Country':<35} {'Currency':<10} {'Cities':<10}")
print("-" * 80)
for row in international:
    name = row[0] if row[0] else "N/A"
    code = row[1] if row[1] else "N/A"
    print(f"{name:<35} {code:<10} {row[2]:<10}")
print("-" * 80)
print(f"Total: {len(international)} country-currency combinations")

# NOTE(review): this NULL check scans the whole table, not just the
# 'international' source -- confirm that is intentional.
cursor.execute("""
    SELECT COUNT(*)
    FROM rate_entries
    WHERE currency IS NULL
""")
null_count = cursor.fetchone()[0]
print(f"\nEntries with NULL currency: {null_count}")

# How many countries and entries each currency accounts for.
print("\n" + "=" * 80)
print("CURRENCY DISTRIBUTION SUMMARY")
print("=" * 80)
cursor.execute("""
    SELECT currency, COUNT(DISTINCT country) as countries, COUNT(*) as entries
    FROM rate_entries
    WHERE source = 'international'
    GROUP BY currency
    ORDER BY countries DESC
""")
print(f"\n{'Currency':<10} {'Countries':<15} {'Total Entries':<15}")
print("-" * 80)
for row in cursor.fetchall():
    print(f"{row[0]:<10} {row[1]:<15} {row[2]:<15}")

# Spot-check a few cities for the most common currencies.
print("\n" + "=" * 80)
print("SAMPLE CITIES BY CURRENCY")
print("=" * 80)
for code in ['EUR', 'USD', 'CAD', 'AUD', 'ARS']:
    # NOTE(review): unlike the queries above, this one does not filter on
    # source = 'international' -- confirm whether that is intentional.
    cursor.execute("""
        SELECT DISTINCT country, city
        FROM rate_entries
        WHERE currency = ? AND city IS NOT NULL
        LIMIT 3
    """, (code,))
    samples = cursor.fetchall()
    if samples:
        print(f"\n{code} Cities:")
        for country, city in samples:
            print(f"{city}, {country}")
conn.close()

View File

@@ -2,6 +2,7 @@ from __future__ import annotations
import json import json
import re import re
import time
from dataclasses import dataclass from dataclasses import dataclass
from typing import Any, Iterable from typing import Any, Iterable
@@ -10,26 +11,38 @@ import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
USER_AGENT = "GovTravelScraper/1.0 (+https://example.com)" USER_AGENT = "GovTravelScraper/1.0 (+https://example.com)"
REQUEST_DELAY = 2 # seconds between requests to avoid overwhelming server
@dataclass(frozen=True)
class SourceConfig:
    """One scrape target: a named NJC/PWGSC rates page.

    ``uses_alphabet_navigation`` marks pages whose content is split
    across A-Z letter pages and must be fetched letter by letter.
    """

    name: str
    url: str
    uses_alphabet_navigation: bool = False


# The three rate sources this module scrapes.
SOURCES = [
    SourceConfig(
        name="international",
        url="https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en",
        uses_alphabet_navigation=True,
    ),
    SourceConfig(name="domestic", url="https://www.njc-cnm.gc.ca/directive/d10/v325/s978/en"),
    SourceConfig(name="accommodations", url="https://rehelv-acrd.tpsgc-pwgsc.gc.ca/lth-crl-eng.aspx"),
]
def fetch_html(url: str, retry: int = 3) -> str:
    """Fetch *url* and return its decoded HTML text.

    Retries up to *retry* times on timeout/connection errors, waiting
    5s, 10s, 15s, ... between attempts.  After every successful fetch a
    REQUEST_DELAY pause keeps the request rate polite.

    Raises:
        ValueError: if *retry* is less than 1.
        requests.exceptions.RequestException: re-raised after the final
            failed attempt (HTTP errors raise immediately without retry).
    """
    if retry < 1:
        raise ValueError("retry must be >= 1")
    for attempt in range(retry):
        try:
            response = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=60)
            response.raise_for_status()
            # Trust the content-sniffed encoding over the declared one.
            response.encoding = response.apparent_encoding
            time.sleep(REQUEST_DELAY)  # Polite delay between requests
            return response.text
        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError):
            if attempt < retry - 1:
                # Linear backoff: 5s, 10s, 15s.  (The original comment
                # called this "exponential", but (attempt + 1) * 5 is linear.)
                wait_time = (attempt + 1) * 5
                print(f" Timeout, retrying in {wait_time}s...")
                time.sleep(wait_time)
            else:
                raise
    raise AssertionError("unreachable: loop always returns or raises")
def extract_tables(html: str) -> list[pd.DataFrame]: def extract_tables(html: str) -> list[pd.DataFrame]:
@@ -100,24 +113,71 @@ def _table_title_map(html: str) -> dict[int, str]:
return titles return titles
def _get_alphabet_urls(base_url: str) -> list[str]:
    """Build the 26 per-letter page URLs (A-Z) for an alphabet-paginated source.

    Fetches *base_url* once to discover the current ``drv_id`` (directive
    revision id) embedded in the page's letter links, then appends
    ``&drv_id=...&let=<letter>`` to the base URL for each letter.
    """
    import string  # local import kept so the module header stays untouched

    html = fetch_html(base_url)
    soup = BeautifulSoup(html, "html.parser")

    # Read the revision id off the first A-Z letter link; fall back to a
    # known-current value if the navigation is missing.
    drv_id = "86"  # Default to current
    for link in soup.find_all('a', href=re.compile(r'let=[A-Z]')):
        match = re.search(r'drv_id=(\d+)', link.get('href', ''))
        if match:
            drv_id = match.group(1)
            break

    return [f"{base_url}&drv_id={drv_id}&let={letter}" for letter in string.ascii_uppercase]
def scrape_tables_from_source(source: SourceConfig) -> list[dict[str, Any]]:
    """Scrape every HTML table from *source* into JSON-ready records.

    For sources with alphabet navigation, all 26 letter pages are fetched
    and their tables concatenated; ``table_index`` stays globally unique
    across pages via a running offset.  Letter pages with no tables at
    all (letters with no countries) are skipped.

    Returns a list of ``{"table_index", "title", "data"}`` dicts, where
    ``data`` is the table serialized as a list of row records.
    """
    results: list[dict[str, Any]] = []
    table_offset = 0

    if source.uses_alphabet_navigation:
        urls = _get_alphabet_urls(source.url)
        print(f" Fetching {len(urls)} alphabet pages...")
    else:
        urls = [source.url]

    for url in urls:
        html = fetch_html(url)
        try:
            tables = extract_tables(html)
        except ValueError:
            # No tables on this page (e.g. letters with no countries).
            continue
        title_map = _table_title_map(html)
        for index, table in enumerate(tables):
            # Flatten MultiIndex columns before converting to JSON.
            if isinstance(table.columns, pd.MultiIndex):
                table.columns = [col[1] if col[0] != col[1] else col[0] for col in table.columns]
            data = json.loads(table.to_json(orient="records"))
            results.append(
                {
                    "table_index": table_offset + index,
                    "title": title_map.get(index),
                    "data": data,
                }
            )
        table_offset += len(tables)
        if tables:
            # Log which letter page (or "base") produced how many tables.
            label = url.split('let=')[-1] if 'let=' in url else 'base'
            print(f" {label}: {len(tables)} tables")
    return results