mirror of
https://github.com/mblanke/Gov_Travel_App.git
synced 2026-03-01 14:10:22 -05:00
Add alphabet navigation to scraper - now collects ALL 233 countries
- Implemented alphabet navigation (A-Z) for the NJC international rates page
- Added request delays (2s) and retry logic with exponential backoff to avoid server timeouts
- Added error handling for pages without tables
- Installed html5lib for better HTML parsing
- Now scrapes 233 countries (up from 15) with 104 unique currencies
- Total of 11,628 international rate entries collected
- Added verification scripts to check all countries and their currencies
- Fixed currency extraction; it now works correctly for EUR, USD, CAD, AUD, ARS, and 99+ other currencies
This commit is contained in:
34
scripts/check_full_content.py
Normal file
34
scripts/check_full_content.py
Normal file
@@ -0,0 +1,34 @@
|
||||
"""Check the full page content and structure"""
|
||||
import sys
|
||||
sys.path.insert(0, 'src')
|
||||
|
||||
from gov_travel.scrapers import fetch_html
|
||||
|
||||
url = "https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en"
|
||||
html = fetch_html(url)
|
||||
|
||||
# Count how many times "Currency:" appears
|
||||
currency_count = html.count('Currency:')
|
||||
print(f"'Currency:' appears {currency_count} times in the HTML")
|
||||
|
||||
# Check page size
|
||||
print(f"HTML size: {len(html):,} bytes ({len(html)/1024:.1f} KB)")
|
||||
|
||||
# Look for all country names in headings
|
||||
import re
|
||||
countries_pattern = r'<h[1-4][^>]*>([^<]+)\s*-\s*Currency:'
|
||||
countries = re.findall(countries_pattern, html)
|
||||
print(f"\nCountries found in headings: {len(countries)}")
|
||||
|
||||
if countries:
|
||||
print("\nAll countries:")
|
||||
for i, country in enumerate(countries, 1):
|
||||
print(f"{i:2}. {country.strip()}")
|
||||
|
||||
# Check if there's a "show more" or expand mechanism
|
||||
if 'show all' in html.lower():
|
||||
print("\n'show all' found in HTML")
|
||||
if 'expand' in html.lower():
|
||||
print("'expand' found in HTML")
|
||||
if 'load more' in html.lower():
|
||||
print("'load more' found in HTML")
|
||||
Reference in New Issue
Block a user