mirror of
https://github.com/mblanke/Gov_Travel_App.git
synced 2026-03-01 14:10:22 -05:00
Add alphabet navigation to scraper - now collects ALL 233 countries
- Implemented alphabet navigation (A-Z) for the NJC international rates page.
- Added request delays (2 s) and retry logic with exponential backoff to avoid server timeouts.
- Added error handling for pages without tables.
- Installed html5lib for better HTML parsing.
- Now scrapes 233 countries (up from 15) with 104 unique currencies.
- Total of 11,628 international rate entries collected.
- Added verification scripts to check all countries and their currencies.
- Fixed currency extraction; it now works for EUR, USD, CAD, AUD, ARS, and 99+ other currencies.
This commit is contained in:
35
scripts/check_njc_page.py
Normal file
35
scripts/check_njc_page.py
Normal file
@@ -0,0 +1,35 @@
|
||||
"""Report what the NJC international page contains.

Fetches the page, prints how many ``<table>`` elements it holds, and lists
the country names parsed from the headings that precede those tables.
"""
import sys

# 'src' is resolved relative to the current working directory; presumably the
# gov_travel package lives there -- run the script from the repository root.
sys.path.insert(0, 'src')

from gov_travel.scrapers import fetch_html
from bs4 import BeautifulSoup

# Text that separates the country name from the currency in a heading.
CURRENCY_MARKER = ' - Currency:'
# Heading levels searched when looking backwards from a table.
HEADING_TAGS = ['h1', 'h2', 'h3', 'h4']
# Number of country names printed before the list is truncated.
PREVIEW_LIMIT = 20

url = "https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en"
print(f"Fetching: {url}\n")

page_html = fetch_html(url)
soup = BeautifulSoup(page_html, 'html.parser')

# Count tables
tables = soup.find_all('table')
print(f"Total tables found: {len(tables)}")

# Collect the country name from the nearest heading before each table.
countries = set()
for table in tables:
    heading = table.find_previous(HEADING_TAGS)
    if not heading:
        continue
    # Only headings that contain the marker contribute a country; the name
    # is everything before the first occurrence of the marker.
    name, marker, _ = heading.get_text(strip=True).partition(CURRENCY_MARKER)
    if marker:
        countries.add(name.strip())

total = len(countries)
print(f"\nUnique countries found: {total}")
print("\nFirst 20 countries:")
for rank, name in enumerate(sorted(countries)[:PREVIEW_LIMIT], start=1):
    print(f"{rank:2}. {name}")

# Mention how many names were left out of the preview, if any.
remaining = total - PREVIEW_LIMIT
if remaining > 0:
    print(f"\n... and {remaining} more")
|
||||
Reference in New Issue
Block a user