mirror of
https://github.com/mblanke/Gov_Travel_App.git
synced 2026-03-01 14:10:22 -05:00
- Implemented alphabet navigation (A-Z) for the NJC international rates page
- Added request delays (2s) and retry logic with exponential backoff to avoid server timeouts
- Added error handling for pages without tables
- Installed html5lib for better HTML parsing
- Now scrapes 233 countries (up from 15) with 104 unique currencies
- Total of 11,628 international rate entries collected
- Added verification scripts to check all countries and their currencies
- Fixed currency extraction, which now works correctly for EUR, USD, CAD, AUD, ARS, and 99+ other currencies
62 lines
2.0 KiB
Python
62 lines
2.0 KiB
Python
"""Look for alphabet navigation or hidden content"""
import re
import sys
from typing import Iterable, List, Optional, Sequence, Tuple

# Page inspected by this script.
URL = "https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en"

# Substrings looked for (case-insensitively) inside each <script> element.
AJAX_INDICATORS = ('ajax', 'xhr', 'fetch', 'loadmore', 'getjson')

# Inline style that hides an element. CSS property names and keywords are
# case-insensitive and whitespace is allowed around the colon, so the pattern
# accepts e.g. "DISPLAY : none" as well as "display:none".
HIDDEN_STYLE_RE = re.compile(r'display\s*:\s*none', re.IGNORECASE)

# Matches class names such as "collapse", "collapsed", "collapsible".
COLLAPSIBLE_CLASS_RE = re.compile(r'collaps')

MAX_LINKS_SHOWN = 10    # only the first few alphabet links are printed
CONTEXT_RADIUS = 100    # characters kept on each side of a match start
SEPARATOR = "=" * 80


def find_alphabet_links(links: Iterable[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """Return the ``(text, href)`` pairs whose text is a single letter.

    Args:
        links: ``(text, href)`` pairs, one per anchor, with text already
            stripped of surrounding whitespace.

    Returns:
        The pairs, in input order, whose text is exactly one alphabetic
        character.
    """
    return [
        (text, href)
        for text, href in links
        if len(text) == 1 and text.isalpha()
    ]


def first_ajax_indicator(
    script_text: str,
    indicators: Sequence[str] = AJAX_INDICATORS,
) -> Optional[str]:
    """Return the first indicator contained in *script_text*, ignoring case.

    Args:
        script_text: Text content of one script element.
        indicators: Lowercase substrings to look for, in priority order.

    Returns:
        The first entry of *indicators* found in the lowercased text, or
        ``None`` when none of them occurs.
    """
    lowered = script_text.lower()
    return next((name for name in indicators if name in lowered), None)


def context_around(
    text: str,
    needle: str,
    radius: int = CONTEXT_RADIUS,
) -> Optional[str]:
    """Return the text surrounding the first case-insensitive match of *needle*.

    The search runs on *text* itself rather than on ``text.lower()``:
    lowercasing can change the length of a string (for example U+0130 becomes
    two code points), which would make an index found in the lowered copy
    point at the wrong place in the original.

    Args:
        text: Text to search.
        needle: Literal string to look for.
        radius: Number of characters kept before and after the match start.

    Returns:
        The slice ``text[start - radius:start + radius]`` (clamped at the
        beginning of the text), or ``None`` when *needle* does not occur.
    """
    match = re.search(re.escape(needle), text, re.IGNORECASE)
    if match is None:
        return None
    start = match.start()
    return text[max(0, start - radius):start + radius]


def main() -> None:
    """Fetch URL and print a report on alphabet links and hidden content."""
    # The path tweak has to happen before the project import below
    # (presumably the gov_travel package sits under ./src - run from repo root).
    sys.path.insert(0, 'src')
    from bs4 import BeautifulSoup
    from gov_travel.scrapers import fetch_html

    html = fetch_html(URL)
    soup = BeautifulSoup(html, 'html.parser')

    # Look for alphabet links (A-Z navigation)
    print("Looking for alphabet navigation...")
    anchors = (
        (anchor.get_text(strip=True), anchor.get('href', ''))
        for anchor in soup.find_all('a')
    )
    alphabet_links = find_alphabet_links(anchors)
    if alphabet_links:
        print(f"\nFound {len(alphabet_links)} alphabet links:")
        for letter, href in alphabet_links[:MAX_LINKS_SHOWN]:
            print(f" {letter}: {href}")
    else:
        print("No alphabet navigation found")

    # Check for JavaScript or AJAX content loading
    print("\n" + SEPARATOR)
    print("Checking for dynamic content loading...")
    scripts = soup.find_all('script')
    print(f"Script tags found: {len(scripts)}")
    for script in scripts:
        # Report at most one indicator per script element.
        indicator = first_ajax_indicator(script.get_text())
        if indicator is not None:
            print(f" Found '{indicator}' in script")

    # Look for hidden content
    print("\n" + SEPARATOR)
    print("Looking for collapsed/hidden sections...")
    hidden = soup.find_all(attrs={'style': HIDDEN_STYLE_RE})
    print(f"Hidden elements: {len(hidden)}")
    collapsed = soup.find_all(class_=COLLAPSIBLE_CLASS_RE)
    print(f"Collapsible elements: {len(collapsed)}")

    # Check the main content area
    print("\n" + SEPARATOR)
    print("Checking if there's a note about alphabetical display...")
    context = context_around(soup.get_text(), 'alphabetical')
    if context is not None:
        print(f"Found 'alphabetical' in text: ...{context}...")
    else:
        print("No mention of 'alphabetical' found")


if __name__ == "__main__":
    main()