Add alphabet navigation to scraper - now collects ALL 233 countries

- Implemented alphabet navigation (A-Z) for NJC international rates page
- Added request delays (2s) and retry logic with exponential backoff to avoid server timeouts
- Added error handling for pages without tables
- Installed html5lib for better HTML parsing
- Now scrapes 233 countries (up from 15) with 104 unique currencies
- Total 11,628 international rate entries collected
- Added verification scripts to check all countries and their currencies
- Fixed currency extraction — now works correctly for EUR, USD, CAD, AUD, ARS, and 99+ other currencies
This commit is contained in:
2026-01-13 09:27:21 -05:00
parent 15094ac94b
commit 969ba062f7
7 changed files with 371 additions and 21 deletions

View File

@@ -0,0 +1,34 @@
"""Check the full page content and structure"""
import sys
sys.path.insert(0, 'src')
from gov_travel.scrapers import fetch_html
url = "https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en"
html = fetch_html(url)
# Count how many times "Currency:" appears
currency_count = html.count('Currency:')
print(f"'Currency:' appears {currency_count} times in the HTML")
# Check page size
print(f"HTML size: {len(html):,} bytes ({len(html)/1024:.1f} KB)")
# Look for all country names in headings
import re
countries_pattern = r'<h[1-4][^>]*>([^<]+)\s*-\s*Currency:'
countries = re.findall(countries_pattern, html)
print(f"\nCountries found in headings: {len(countries)}")
if countries:
print("\nAll countries:")
for i, country in enumerate(countries, 1):
print(f"{i:2}. {country.strip()}")
# Check if there's a "show more" or expand mechanism
if 'show all' in html.lower():
print("\n'show all' found in HTML")
if 'expand' in html.lower():
print("'expand' found in HTML")
if 'load more' in html.lower():
print("'load more' found in HTML")

View File

@@ -0,0 +1,61 @@
"""Look for alphabet navigation or hidden content"""
import sys
sys.path.insert(0, 'src')
from gov_travel.scrapers import fetch_html
from bs4 import BeautifulSoup
import re
url = "https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en"
html = fetch_html(url)
soup = BeautifulSoup(html, 'html.parser')
# Look for alphabet links (A-Z navigation)
print("Looking for alphabet navigation...")
alphabet_links = []
for link in soup.find_all('a'):
text = link.get_text(strip=True)
href = link.get('href', '')
# Check if it's a single letter
if len(text) == 1 and text.isalpha():
alphabet_links.append((text, href))
if alphabet_links:
print(f"\nFound {len(alphabet_links)} alphabet links:")
for letter, href in alphabet_links[:10]:
print(f" {letter}: {href}")
else:
print("No alphabet navigation found")
# Check for JavaScript or AJAX content loading
print("\n" + "="*80)
print("Checking for dynamic content loading...")
scripts = soup.find_all('script')
print(f"Script tags found: {len(scripts)}")
ajax_indicators = ['ajax', 'xhr', 'fetch', 'loadmore', 'getjson']
for script in scripts:
script_text = script.get_text().lower()
for indicator in ajax_indicators:
if indicator in script_text:
print(f" Found '{indicator}' in script")
break
# Look for hidden content
print("\n" + "="*80)
print("Looking for collapsed/hidden sections...")
hidden = soup.find_all(attrs={'style': re.compile(r'display:\s*none')})
print(f"Hidden elements: {len(hidden)}")
collapsed = soup.find_all(class_=re.compile(r'collaps'))
print(f"Collapsible elements: {len(collapsed)}")
# Check the main content area
print("\n" + "="*80)
print("Checking if there's a note about alphabetical display...")
page_text = soup.get_text()
if 'alphabetical' in page_text.lower():
# Find context around "alphabetical"
idx = page_text.lower().find('alphabetical')
context = page_text[max(0, idx-100):idx+100]
print(f"Found 'alphabetical' in text: ...{context}...")

35
scripts/check_njc_page.py Normal file
View File

@@ -0,0 +1,35 @@
"""Check what's on the NJC international page"""
import sys
sys.path.insert(0, 'src')
from gov_travel.scrapers import fetch_html
from bs4 import BeautifulSoup
url = "https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en"
print(f"Fetching: {url}\n")
html = fetch_html(url)
soup = BeautifulSoup(html, 'html.parser')
# Count tables
tables = soup.find_all('table')
print(f"Total tables found: {len(tables)}")
# Find all headings before tables (country names)
countries = set()
for table in tables:
heading = table.find_previous(['h1', 'h2', 'h3', 'h4'])
if heading:
text = heading.get_text(strip=True)
# Extract country name (before " - Currency:")
if ' - Currency:' in text:
country = text.split(' - Currency:')[0].strip()
countries.add(country)
print(f"\nUnique countries found: {len(countries)}")
print("\nFirst 20 countries:")
for i, country in enumerate(sorted(countries)[:20], 1):
print(f"{i:2}. {country}")
if len(countries) > 20:
print(f"\n... and {len(countries) - 20} more")

View File

@@ -0,0 +1,45 @@
"""Check for pagination or full country list on NJC page"""
import sys
sys.path.insert(0, 'src')
from gov_travel.scrapers import fetch_html
from bs4 import BeautifulSoup
url = "https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en"
html = fetch_html(url)
soup = BeautifulSoup(html, 'html.parser')
# Look for links or navigation
print("Looking for navigation elements...")
print("\nAll links on the page:")
links = soup.find_all('a')
for link in links[:20]: # First 20
href = link.get('href', '')
text = link.get_text(strip=True)
if text:
print(f" {text}: {href}")
print("\n" + "="*80)
print("Looking for select/dropdown elements (country selector):")
selects = soup.find_all('select')
for select in selects:
name = select.get('name', 'unnamed')
print(f"\nSelect field: {name}")
options = select.find_all('option')
print(f" Options count: {len(options)}")
if len(options) > 0:
print(f" First 10 options:")
for opt in options[:10]:
value = opt.get('value', '')
text = opt.get_text(strip=True)
print(f" {text} ({value})")
if len(options) > 10:
print(f" ... and {len(options) - 10} more")
print("\n" + "="*80)
print("Looking for forms:")
forms = soup.find_all('form')
print(f"Forms found: {len(forms)}")
for form in forms:
print(f" Action: {form.get('action', 'N/A')}")
print(f" Method: {form.get('method', 'GET')}")

37
scripts/final_stats.py Normal file
View File

@@ -0,0 +1,37 @@
"""Print headline statistics for the scraped international travel rates DB."""
import sqlite3

conn = sqlite3.connect('data/travel_rates_scraped.sqlite3')
cursor = conn.cursor()

# NOTE: string literals below use single quotes. The previous
# source="international" relied on SQLite's double-quoted-string
# misfeature (double quotes denote identifiers and only fall back to
# strings), which silently breaks if a column of that name ever exists.

# Count distinct countries in the international data set.
cursor.execute("SELECT COUNT(DISTINCT country) FROM rate_entries WHERE source='international' AND country IS NOT NULL")
total_countries = cursor.fetchone()[0]

# Count total international rate entries.
cursor.execute("SELECT COUNT(*) FROM rate_entries WHERE source='international'")
total_entries = cursor.fetchone()[0]

# Count distinct currencies across those entries.
cursor.execute("SELECT COUNT(DISTINCT currency) FROM rate_entries WHERE source='international'")
total_currencies = cursor.fetchone()[0]

print(f"✅ COMPLETE SCRAPER RESULTS:")
print(f" Total Countries: {total_countries}")
print(f" Total Entries: {total_entries:,}")
print(f" Unique Currencies: {total_currencies}")

# Breakdown: which currencies cover the most countries / entries.
cursor.execute("""
SELECT currency, COUNT(DISTINCT country) as country_count, COUNT(*) as entries
FROM rate_entries
WHERE source='international'
GROUP BY currency
ORDER BY country_count DESC
LIMIT 20
""")
print(f"\nTop 20 Currencies:")
for row in cursor.fetchall():
    print(f" {row[0]:5} - {row[1]:3} countries, {row[2]:,} entries")

conn.close()

View File

@@ -0,0 +1,78 @@
"""Verify every scraped country/currency combination in the database."""
import sqlite3

conn = sqlite3.connect('data/travel_rates_scraped.sqlite3')
cursor = conn.cursor()

print("=" * 80)
print("COMPLETE COUNTRY AND CURRENCY VERIFICATION")
print("=" * 80)

# One row per (country, currency) pair, with how many cities each covers.
cursor.execute("""
SELECT DISTINCT country, currency, COUNT(DISTINCT city) as city_count
FROM rate_entries
WHERE source = 'international' AND country IS NOT NULL
GROUP BY country, currency
ORDER BY country
""")
international = cursor.fetchall()

print(f"\n{'Country':<35} {'Currency':<10} {'Cities':<10}")
print("-" * 80)
for raw_country, raw_currency, cities in international:
    country = raw_country or "N/A"    # NULL/empty -> placeholder
    currency = raw_currency or "N/A"
    print(f"{country:<35} {currency:<10} {cities:<10}")
print("-" * 80)
print(f"Total: {len(international)} country-currency combinations")

# Did any entries slip through without a currency?
cursor.execute("""
SELECT COUNT(*)
FROM rate_entries
WHERE currency IS NULL
""")
null_count = cursor.fetchone()[0]
print(f"\nEntries with NULL currency: {null_count}")

# How widely is each currency used?
print("\n" + "=" * 80)
print("CURRENCY DISTRIBUTION SUMMARY")
print("=" * 80)
cursor.execute("""
SELECT currency, COUNT(DISTINCT country) as countries, COUNT(*) as entries
FROM rate_entries
WHERE source = 'international'
GROUP BY currency
ORDER BY countries DESC
""")
print(f"\n{'Currency':<10} {'Countries':<15} {'Total Entries':<15}")
print("-" * 80)
for row in cursor.fetchall():
    print(f"{row[0]:<10} {row[1]:<15} {row[2]:<15}")

# Spot-check a few cities for a handful of major currencies.
print("\n" + "=" * 80)
print("SAMPLE CITIES BY CURRENCY")
print("=" * 80)
for currency in ['EUR', 'USD', 'CAD', 'AUD', 'ARS']:
    cursor.execute("""
SELECT DISTINCT country, city
FROM rate_entries
WHERE currency = ? AND city IS NOT NULL
LIMIT 3
""", (currency,))
    results = cursor.fetchall()
    if results:
        print(f"\n{currency} Cities:")
        for r in results:
            print(f"{r[1]}, {r[0]}")
conn.close()

View File

@@ -2,6 +2,7 @@ from __future__ import annotations
import json
import re
import time
from dataclasses import dataclass
from typing import Any, Iterable
@@ -10,26 +11,38 @@ import requests
from bs4 import BeautifulSoup
# Identifies this scraper to the remote servers in the request headers.
USER_AGENT = "GovTravelScraper/1.0 (+https://example.com)"
REQUEST_DELAY = 2 # seconds between requests to avoid overwhelming server
@dataclass(frozen=True)
class SourceConfig:
    """One scrape target: a named rates page and how to navigate it."""
    # Short identifier for the source (e.g. "international", "domestic");
    # presumably the value stored in rate_entries.source — verify against callers.
    name: str
    # Landing-page URL for this source.
    url: str
    # True when the page shows one alphabet letter at a time and must be
    # fetched per-letter (A-Z) rather than in one request.
    uses_alphabet_navigation: bool = False
# Registry of scrape targets. The NJC international page only renders one
# alphabet letter of countries per request, so it is flagged for A-Z
# navigation. (The stale pre-navigation "international" entry that was
# duplicated here has been removed — it would have scraped the page twice
# under the same source name.)
SOURCES = [
    SourceConfig(name="international", url="https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en", uses_alphabet_navigation=True),
    SourceConfig(name="domestic", url="https://www.njc-cnm.gc.ca/directive/d10/v325/s978/en"),
    SourceConfig(name="accommodations", url="https://rehelv-acrd.tpsgc-pwgsc.gc.ca/lth-crl-eng.aspx"),
]
def fetch_html(url: str, retry: int = 3) -> str:
    """Fetch *url* and return its decoded HTML text.

    Retries up to *retry* times on timeouts and connection errors, waiting
    5s, 10s, 15s, ... between attempts (linear backoff — the old comment
    called this "exponential", which it is not). After each successful
    request a REQUEST_DELAY pause keeps the scraper polite.

    Raises:
        requests.HTTPError: for non-2xx responses (not retried).
        requests.exceptions.Timeout / ConnectionError: if all attempts fail.
        ValueError: if *retry* is less than 1.
    """
    for attempt in range(retry):
        try:
            response = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=60)
            response.raise_for_status()
            # apparent_encoding sniffs the body; some pages mis-declare
            # their charset in the response headers.
            response.encoding = response.apparent_encoding
            time.sleep(REQUEST_DELAY) # Polite delay between requests
            return response.text
        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError):
            if attempt < retry - 1:
                wait_time = (attempt + 1) * 5  # Linear backoff: 5s, 10s, 15s
                print(f" Timeout, retrying in {wait_time}s...")
                time.sleep(wait_time)
            else:
                raise
    # Previously fell through returning None (violating the -> str contract)
    # when retry <= 0; make that misuse explicit instead.
    raise ValueError("retry must be >= 1")
def extract_tables(html: str) -> list[pd.DataFrame]:
@@ -100,11 +113,53 @@ def _table_title_map(html: str) -> dict[int, str]:
return titles
def _get_alphabet_urls(base_url: str) -> list[str]:
    """Generate URLs for all alphabet letters (A-Z) for paginated sources"""
    import string
    # The landing page's per-letter links embed a drv_id (directive
    # revision). Fetch it once and lift the id from the first such link.
    landing_html = fetch_html(base_url)
    landing = BeautifulSoup(landing_html, "html.parser")
    drv_id = "86"  # fall back to the current revision if no link matches
    for anchor in landing.find_all('a', href=re.compile(r'let=[A-Z]')):
        found = re.search(r'drv_id=(\d+)', anchor.get('href', ''))
        if found:
            drv_id = found.group(1)
            break
    # One URL per letter, A through Z.
    return [
        f"{base_url}&drv_id={drv_id}&let={letter}"
        for letter in string.ascii_uppercase
    ]
def scrape_tables_from_source(source: SourceConfig) -> list[dict[str, Any]]:
# Scrape every table from *source*, following the per-letter (A-Z) pages
# when the source is flagged with uses_alphabet_navigation, and return a
# list of {"table_index", "title", "data"} dicts. table_offset keeps
# table_index globally unique across letter pages.
html = fetch_html(source.url)
tables = extract_tables(html)
title_map = _table_title_map(html)
# NOTE(review): the three assignments above appear to be the superseded
# pre-navigation body retained by the diff view; the per-url fetch in the
# loop below overwrites them — confirm against the applied file.
results = []
table_offset = 0
# For sources with alphabet navigation, fetch all letter pages
if source.uses_alphabet_navigation:
urls = _get_alphabet_urls(source.url)
print(f" Fetching {len(urls)} alphabet pages...")
else:
urls = [source.url]
for url in urls:
html = fetch_html(url)
try:
tables = extract_tables(html)
except ValueError:
# No tables on this page (e.g., letters with no countries)
continue
title_map = _table_title_map(html)
for index, table in enumerate(tables):
# Flatten MultiIndex columns before converting to JSON
if isinstance(table.columns, pd.MultiIndex):
@@ -113,11 +168,16 @@ def scrape_tables_from_source(source: SourceConfig) -> list[dict[str, Any]]:
data = json.loads(table.to_json(orient="records"))
results.append(
{
"table_index": index,
"table_index": table_offset + index,
"title": title_map.get(index),
"data": data,
}
)
# NOTE(review): the two "table_index" lines above are the old/new pair
# from the diff; only the offset-adjusted one should survive the merge.
table_offset += len(tables)
if len(tables) > 0:
print(f" {url.split('let=')[-1] if 'let=' in url else 'base'}: {len(tables)} tables")
return results