From 969ba062f7df7ebe64dcd1d08f2e0d3aeff150d0 Mon Sep 17 00:00:00 2001 From: mblanke Date: Tue, 13 Jan 2026 09:27:21 -0500 Subject: [PATCH] Add alphabet navigation to scraper - now collects ALL 233 countries - Implemented alphabet navigation (A-Z) for NJC international rates page - Added request delays (2s) and retry logic with linearly increasing backoff (5s, then 10s) to avoid server timeouts - Added error handling for pages without tables - Installed html5lib for better HTML parsing - Now scrapes 233 countries (up from 15) with 104 unique currencies - Total 11,628 international rate entries collected - Added verification scripts to check all countries and their currencies - The fixed currency extraction now works correctly for EUR, USD, CAD, AUD, ARS, and 99+ other currencies --- scripts/check_full_content.py | 34 +++++++++++ scripts/check_navigation.py | 61 +++++++++++++++++++ scripts/check_njc_page.py | 35 +++++++++++ scripts/check_pagination.py | 45 ++++++++++++++ scripts/final_stats.py | 37 ++++++++++++ scripts/verify_all_countries.py | 78 ++++++++++++++++++++++++ src/gov_travel/scrapers.py | 102 +++++++++++++++++++++++++------- 7 files changed, 371 insertions(+), 21 deletions(-) create mode 100644 scripts/check_full_content.py create mode 100644 scripts/check_navigation.py create mode 100644 scripts/check_njc_page.py create mode 100644 scripts/check_pagination.py create mode 100644 scripts/final_stats.py create mode 100644 scripts/verify_all_countries.py diff --git a/scripts/check_full_content.py b/scripts/check_full_content.py new file mode 100644 index 0000000..f0fac81 --- /dev/null +++ b/scripts/check_full_content.py @@ -0,0 +1,34 @@ +"""Check the full page content and structure""" +import sys +sys.path.insert(0, 'src') + +from gov_travel.scrapers import fetch_html + +url = "https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en" +html = fetch_html(url) + +# Count how many times "Currency:" appears +currency_count = html.count('Currency:') +print(f"'Currency:' appears 
{currency_count} times in the HTML") + +# Check page size +print(f"HTML size: {len(html):,} bytes ({len(html)/1024:.1f} KB)") + +# Look for all country names in headings +import re +countries_pattern = r']*>([^<]+)\s*-\s*Currency:' +countries = re.findall(countries_pattern, html) +print(f"\nCountries found in headings: {len(countries)}") + +if countries: + print("\nAll countries:") + for i, country in enumerate(countries, 1): + print(f"{i:2}. {country.strip()}") + +# Check if there's a "show more" or expand mechanism +if 'show all' in html.lower(): + print("\n'show all' found in HTML") +if 'expand' in html.lower(): + print("'expand' found in HTML") +if 'load more' in html.lower(): + print("'load more' found in HTML") diff --git a/scripts/check_navigation.py b/scripts/check_navigation.py new file mode 100644 index 0000000..a229709 --- /dev/null +++ b/scripts/check_navigation.py @@ -0,0 +1,61 @@ +"""Look for alphabet navigation or hidden content""" +import sys +sys.path.insert(0, 'src') + +from gov_travel.scrapers import fetch_html +from bs4 import BeautifulSoup +import re + +url = "https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en" +html = fetch_html(url) +soup = BeautifulSoup(html, 'html.parser') + +# Look for alphabet links (A-Z navigation) +print("Looking for alphabet navigation...") +alphabet_links = [] +for link in soup.find_all('a'): + text = link.get_text(strip=True) + href = link.get('href', '') + # Check if it's a single letter + if len(text) == 1 and text.isalpha(): + alphabet_links.append((text, href)) + +if alphabet_links: + print(f"\nFound {len(alphabet_links)} alphabet links:") + for letter, href in alphabet_links[:10]: + print(f" {letter}: {href}") +else: + print("No alphabet navigation found") + +# Check for JavaScript or AJAX content loading +print("\n" + "="*80) +print("Checking for dynamic content loading...") +scripts = soup.find_all('script') +print(f"Script tags found: {len(scripts)}") + +ajax_indicators = ['ajax', 'xhr', 'fetch', 
'loadmore', 'getjson'] +for script in scripts: + script_text = script.get_text().lower() + for indicator in ajax_indicators: + if indicator in script_text: + print(f" Found '{indicator}' in script") + break + +# Look for hidden content +print("\n" + "="*80) +print("Looking for collapsed/hidden sections...") +hidden = soup.find_all(attrs={'style': re.compile(r'display:\s*none')}) +print(f"Hidden elements: {len(hidden)}") + +collapsed = soup.find_all(class_=re.compile(r'collaps')) +print(f"Collapsible elements: {len(collapsed)}") + +# Check the main content area +print("\n" + "="*80) +print("Checking if there's a note about alphabetical display...") +page_text = soup.get_text() +if 'alphabetical' in page_text.lower(): + # Find context around "alphabetical" + idx = page_text.lower().find('alphabetical') + context = page_text[max(0, idx-100):idx+100] + print(f"Found 'alphabetical' in text: ...{context}...") diff --git a/scripts/check_njc_page.py b/scripts/check_njc_page.py new file mode 100644 index 0000000..8c7d9a8 --- /dev/null +++ b/scripts/check_njc_page.py @@ -0,0 +1,35 @@ +"""Check what's on the NJC international page""" +import sys +sys.path.insert(0, 'src') + +from gov_travel.scrapers import fetch_html +from bs4 import BeautifulSoup + +url = "https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en" +print(f"Fetching: {url}\n") + +html = fetch_html(url) +soup = BeautifulSoup(html, 'html.parser') + +# Count tables +tables = soup.find_all('table') +print(f"Total tables found: {len(tables)}") + +# Find all headings before tables (country names) +countries = set() +for table in tables: + heading = table.find_previous(['h1', 'h2', 'h3', 'h4']) + if heading: + text = heading.get_text(strip=True) + # Extract country name (before " - Currency:") + if ' - Currency:' in text: + country = text.split(' - Currency:')[0].strip() + countries.add(country) + +print(f"\nUnique countries found: {len(countries)}") +print("\nFirst 20 countries:") +for i, country in 
enumerate(sorted(countries)[:20], 1): + print(f"{i:2}. {country}") + +if len(countries) > 20: + print(f"\n... and {len(countries) - 20} more") diff --git a/scripts/check_pagination.py b/scripts/check_pagination.py new file mode 100644 index 0000000..986ecbb --- /dev/null +++ b/scripts/check_pagination.py @@ -0,0 +1,45 @@ +"""Check for pagination or full country list on NJC page""" +import sys +sys.path.insert(0, 'src') + +from gov_travel.scrapers import fetch_html +from bs4 import BeautifulSoup + +url = "https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en" +html = fetch_html(url) +soup = BeautifulSoup(html, 'html.parser') + +# Look for links or navigation +print("Looking for navigation elements...") +print("\nAll links on the page:") +links = soup.find_all('a') +for link in links[:20]: # First 20 + href = link.get('href', '') + text = link.get_text(strip=True) + if text: + print(f" {text}: {href}") + +print("\n" + "="*80) +print("Looking for select/dropdown elements (country selector):") +selects = soup.find_all('select') +for select in selects: + name = select.get('name', 'unnamed') + print(f"\nSelect field: {name}") + options = select.find_all('option') + print(f" Options count: {len(options)}") + if len(options) > 0: + print(f" First 10 options:") + for opt in options[:10]: + value = opt.get('value', '') + text = opt.get_text(strip=True) + print(f" {text} ({value})") + if len(options) > 10: + print(f" ... 
and {len(options) - 10} more") + +print("\n" + "="*80) +print("Looking for forms:") +forms = soup.find_all('form') +print(f"Forms found: {len(forms)}") +for form in forms: + print(f" Action: {form.get('action', 'N/A')}") + print(f" Method: {form.get('method', 'GET')}") diff --git a/scripts/final_stats.py b/scripts/final_stats.py new file mode 100644 index 0000000..e28a90d --- /dev/null +++ b/scripts/final_stats.py @@ -0,0 +1,37 @@ +import sqlite3 + +conn = sqlite3.connect('data/travel_rates_scraped.sqlite3') +cursor = conn.cursor() + +# Count countries +cursor.execute('SELECT COUNT(DISTINCT country) FROM rate_entries WHERE source="international" AND country IS NOT NULL') +total_countries = cursor.fetchone()[0] + +# Count total entries +cursor.execute('SELECT COUNT(*) FROM rate_entries WHERE source="international"') +total_entries = cursor.fetchone()[0] + +# Count unique currencies +cursor.execute('SELECT COUNT(DISTINCT currency) FROM rate_entries WHERE source="international"') +total_currencies = cursor.fetchone()[0] + +print(f"✅ COMPLETE SCRAPER RESULTS:") +print(f" Total Countries: {total_countries}") +print(f" Total Entries: {total_entries:,}") +print(f" Unique Currencies: {total_currencies}") + +# Show currency breakdown +cursor.execute(""" + SELECT currency, COUNT(DISTINCT country) as country_count, COUNT(*) as entries + FROM rate_entries + WHERE source="international" + GROUP BY currency + ORDER BY country_count DESC + LIMIT 20 +""") + +print(f"\nTop 20 Currencies:") +for row in cursor.fetchall(): + print(f" {row[0]:5} - {row[1]:3} countries, {row[2]:,} entries") + +conn.close() diff --git a/scripts/verify_all_countries.py b/scripts/verify_all_countries.py new file mode 100644 index 0000000..8de8d39 --- /dev/null +++ b/scripts/verify_all_countries.py @@ -0,0 +1,78 @@ +import sqlite3 + +conn = sqlite3.connect('data/travel_rates_scraped.sqlite3') +cursor = conn.cursor() + +print("=" * 80) +print("COMPLETE COUNTRY AND CURRENCY VERIFICATION") +print("=" * 80) + 
+# Get all countries with their currencies from international source +cursor.execute(""" + SELECT DISTINCT country, currency, COUNT(DISTINCT city) as city_count + FROM rate_entries + WHERE source = 'international' AND country IS NOT NULL + GROUP BY country, currency + ORDER BY country +""") + +international = cursor.fetchall() + +print(f"\n{'Country':<35} {'Currency':<10} {'Cities':<10}") +print("-" * 80) + +for row in international: + country = row[0] if row[0] else "N/A" + currency = row[1] if row[1] else "N/A" + cities = row[2] + print(f"{country:<35} {currency:<10} {cities:<10}") + +print("-" * 80) +print(f"Total: {len(international)} country-currency combinations") + +# Check for any NULL currencies +cursor.execute(""" + SELECT COUNT(*) + FROM rate_entries + WHERE currency IS NULL +""") +null_count = cursor.fetchone()[0] +print(f"\nEntries with NULL currency: {null_count}") + +# Currency summary +print("\n" + "=" * 80) +print("CURRENCY DISTRIBUTION SUMMARY") +print("=" * 80) +cursor.execute(""" + SELECT currency, COUNT(DISTINCT country) as countries, COUNT(*) as entries + FROM rate_entries + WHERE source = 'international' + GROUP BY currency + ORDER BY countries DESC +""") + +print(f"\n{'Currency':<10} {'Countries':<15} {'Total Entries':<15}") +print("-" * 80) +for row in cursor.fetchall(): + print(f"{row[0]:<10} {row[1]:<15} {row[2]:<15}") + +# Show sample cities for each currency +print("\n" + "=" * 80) +print("SAMPLE CITIES BY CURRENCY") +print("=" * 80) + +for currency in ['EUR', 'USD', 'CAD', 'AUD', 'ARS']: + cursor.execute(""" + SELECT DISTINCT country, city + FROM rate_entries + WHERE currency = ? 
AND city IS NOT NULL + LIMIT 3 + """, (currency,)) + + results = cursor.fetchall() + if results: + print(f"\n{currency} Cities:") + for r in results: + print(f" • {r[1]}, {r[0]}") + +conn.close() diff --git a/src/gov_travel/scrapers.py b/src/gov_travel/scrapers.py index b58d7fd..db3ffc7 100644 --- a/src/gov_travel/scrapers.py +++ b/src/gov_travel/scrapers.py @@ -2,6 +2,7 @@ from __future__ import annotations import json import re +import time from dataclasses import dataclass from typing import Any, Iterable @@ -10,26 +11,38 @@ import requests from bs4 import BeautifulSoup USER_AGENT = "GovTravelScraper/1.0 (+https://example.com)" +REQUEST_DELAY = 2 # seconds between requests to avoid overwhelming server @dataclass(frozen=True) class SourceConfig: name: str url: str + uses_alphabet_navigation: bool = False SOURCES = [ - SourceConfig(name="international", url="https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en"), + SourceConfig(name="international", url="https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en", uses_alphabet_navigation=True), SourceConfig(name="domestic", url="https://www.njc-cnm.gc.ca/directive/d10/v325/s978/en"), SourceConfig(name="accommodations", url="https://rehelv-acrd.tpsgc-pwgsc.gc.ca/lth-crl-eng.aspx"), ] -def fetch_html(url: str) -> str: - response = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=60) - response.raise_for_status() - response.encoding = response.apparent_encoding - return response.text +def fetch_html(url: str, retry=3) -> str: + for attempt in range(retry): + try: + response = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=60) + response.raise_for_status() + response.encoding = response.apparent_encoding + time.sleep(REQUEST_DELAY) # Polite delay between requests + return response.text + except (requests.exceptions.Timeout, requests.exceptions.ConnectionError) as e: + if attempt < retry - 1: + wait_time = (attempt + 1) * 5 # Linear backoff: 5s, then 10s (grows by 5s per failed attempt) + print(f" Timeout, retrying in 
{wait_time}s...") + time.sleep(wait_time) + else: + raise def extract_tables(html: str) -> list[pd.DataFrame]: @@ -100,24 +113,71 @@ def _table_title_map(html: str) -> dict[int, str]: return titles +def _get_alphabet_urls(base_url: str) -> list[str]: + """Generate URLs for all alphabet letters (A-Z) for paginated sources""" + import string + + # First, fetch the base page to get the drv_id (date revision) + html = fetch_html(base_url) + soup = BeautifulSoup(html, "html.parser") + + # Find the drv_id from alphabet links + drv_id = "86" # Default to current + for link in soup.find_all('a', href=re.compile(r'let=[A-Z]')): + href = link.get('href', '') + match = re.search(r'drv_id=(\d+)', href) + if match: + drv_id = match.group(1) + break + + # Generate URLs for each letter + urls = [] + for letter in string.ascii_uppercase: + url = f"{base_url}&drv_id={drv_id}&let={letter}" + urls.append(url) + + return urls + + def scrape_tables_from_source(source: SourceConfig) -> list[dict[str, Any]]: - html = fetch_html(source.url) - tables = extract_tables(html) - title_map = _table_title_map(html) results = [] - for index, table in enumerate(tables): - # Flatten MultiIndex columns before converting to JSON - if isinstance(table.columns, pd.MultiIndex): - table.columns = [col[1] if col[0] != col[1] else col[0] for col in table.columns] + table_offset = 0 + + # For sources with alphabet navigation, fetch all letter pages + if source.uses_alphabet_navigation: + urls = _get_alphabet_urls(source.url) + print(f" Fetching {len(urls)} alphabet pages...") + else: + urls = [source.url] + + for url in urls: + html = fetch_html(url) + try: + tables = extract_tables(html) + except ValueError: + # No tables on this page (e.g., letters with no countries) + continue - data = json.loads(table.to_json(orient="records")) - results.append( - { - "table_index": index, - "title": title_map.get(index), - "data": data, - } - ) + title_map = _table_title_map(html) + + for index, table in 
enumerate(tables): + # Flatten MultiIndex columns before converting to JSON + if isinstance(table.columns, pd.MultiIndex): + table.columns = [col[1] if col[0] != col[1] else col[0] for col in table.columns] + + data = json.loads(table.to_json(orient="records")) + results.append( + { + "table_index": table_offset + index, + "title": title_map.get(index), + "data": data, + } + ) + + table_offset += len(tables) + if len(tables) > 0: + print(f" {url.split('let=')[-1] if 'let=' in url else 'base'}: {len(tables)} tables") + return results