mirror of
https://github.com/mblanke/Gov_Travel_App.git
synced 2026-03-01 14:10:22 -05:00
Add alphabet navigation to scraper - now collects ALL 233 countries
- Implemented alphabet navigation (A-Z) for the NJC international rates page
- Added request delays (2s) and retry logic with exponential backoff to avoid server timeouts
- Added error handling for pages without tables
- Installed html5lib for better HTML parsing
- Now scrapes 233 countries (up from 15) with 104 unique currencies
- Total of 11,628 international rate entries collected
- Added verification scripts to check all countries and their currencies
- Fixed currency extraction, which now works correctly for EUR, USD, CAD, AUD, ARS, and 99+ other currencies
This commit is contained in:
45
scripts/check_pagination.py
Normal file
45
scripts/check_pagination.py
Normal file
@@ -0,0 +1,45 @@
"""Check for pagination or full country list on NJC page.

Diagnostic script: fetches the NJC Appendix D page and prints the
navigation-related elements found in it (links, <select> dropdowns and
forms) so a human can see how the country list is split across pages.
"""
import sys

# Make the project package importable when the script is launched from the
# repository root (the path is relative to the current working directory).
sys.path.insert(0, 'src')

from gov_travel.scrapers import fetch_html
from bs4 import BeautifulSoup

url = "https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en"
# NOTE(review): fetch_html is defined elsewhere; presumably it returns the
# page markup as text -- confirm against gov_travel.scrapers.
html = fetch_html(url)
soup = BeautifulSoup(html, 'html.parser')

# Look for links or navigation
print("Looking for navigation elements...")
print("\nAll links on the page:")
links = soup.find_all('a')
for link in links[:20]:  # First 20 anchors; those without visible text are skipped
    href = link.get('href', '')
    text = link.get_text(strip=True)
    if text:
        print(f" {text}: {href}")

print("\n" + "=" * 80)
print("Looking for select/dropdown elements (country selector):")
selects = soup.find_all('select')
for select in selects:
    name = select.get('name', 'unnamed')
    print(f"\nSelect field: {name}")
    options = select.find_all('option')
    print(f" Options count: {len(options)}")
    if options:
        print(" First 10 options:")
        for opt in options[:10]:
            value = opt.get('value', '')
            text = opt.get_text(strip=True)
            print(f" {text} ({value})")
        # Summarise the remainder instead of dumping every option.
        if len(options) > 10:
            print(f" ... and {len(options) - 10} more")

print("\n" + "=" * 80)
print("Looking for forms:")
forms = soup.find_all('form')
print(f"Forms found: {len(forms)}")
for form in forms:
    print(f" Action: {form.get('action', 'N/A')}")
    print(f" Method: {form.get('method', 'GET')}")
Reference in New Issue
Block a user