mirror of
https://github.com/mblanke/Gov_Travel_App.git
synced 2026-03-01 14:10:22 -05:00
Add alphabet navigation to scraper - now collects ALL 233 countries
- Implemented alphabet navigation (A-Z) for NJC international rates page
- Added request delays (2s) and retry logic with exponential backoff to avoid server timeouts
- Added error handling for pages without tables
- Installed html5lib for better HTML parsing
- Now scrapes 233 countries (up from 15) with 104 unique currencies
- Total 11,628 international rate entries collected
- Added verification scripts to check all countries and their currencies
- Fixed currency extraction working perfectly for EUR, USD, CAD, AUD, ARS, and 99+ other currencies
This commit is contained in:
34
scripts/check_full_content.py
Normal file
34
scripts/check_full_content.py
Normal file
@@ -0,0 +1,34 @@
|
||||
"""Check the full page content and structure"""
|
||||
import sys
|
||||
sys.path.insert(0, 'src')
|
||||
|
||||
from gov_travel.scrapers import fetch_html
|
||||
|
||||
url = "https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en"
|
||||
html = fetch_html(url)
|
||||
|
||||
# Count how many times "Currency:" appears
|
||||
currency_count = html.count('Currency:')
|
||||
print(f"'Currency:' appears {currency_count} times in the HTML")
|
||||
|
||||
# Check page size
|
||||
print(f"HTML size: {len(html):,} bytes ({len(html)/1024:.1f} KB)")
|
||||
|
||||
# Look for all country names in headings
|
||||
import re
|
||||
countries_pattern = r'<h[1-4][^>]*>([^<]+)\s*-\s*Currency:'
|
||||
countries = re.findall(countries_pattern, html)
|
||||
print(f"\nCountries found in headings: {len(countries)}")
|
||||
|
||||
if countries:
|
||||
print("\nAll countries:")
|
||||
for i, country in enumerate(countries, 1):
|
||||
print(f"{i:2}. {country.strip()}")
|
||||
|
||||
# Check if there's a "show more" or expand mechanism
|
||||
if 'show all' in html.lower():
|
||||
print("\n'show all' found in HTML")
|
||||
if 'expand' in html.lower():
|
||||
print("'expand' found in HTML")
|
||||
if 'load more' in html.lower():
|
||||
print("'load more' found in HTML")
|
||||
61
scripts/check_navigation.py
Normal file
61
scripts/check_navigation.py
Normal file
@@ -0,0 +1,61 @@
|
||||
"""Look for alphabet navigation or hidden content"""
|
||||
import sys
|
||||
sys.path.insert(0, 'src')
|
||||
|
||||
from gov_travel.scrapers import fetch_html
|
||||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
|
||||
url = "https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en"
|
||||
html = fetch_html(url)
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
|
||||
# Look for alphabet links (A-Z navigation)
|
||||
print("Looking for alphabet navigation...")
|
||||
alphabet_links = []
|
||||
for link in soup.find_all('a'):
|
||||
text = link.get_text(strip=True)
|
||||
href = link.get('href', '')
|
||||
# Check if it's a single letter
|
||||
if len(text) == 1 and text.isalpha():
|
||||
alphabet_links.append((text, href))
|
||||
|
||||
if alphabet_links:
|
||||
print(f"\nFound {len(alphabet_links)} alphabet links:")
|
||||
for letter, href in alphabet_links[:10]:
|
||||
print(f" {letter}: {href}")
|
||||
else:
|
||||
print("No alphabet navigation found")
|
||||
|
||||
# Check for JavaScript or AJAX content loading
|
||||
print("\n" + "="*80)
|
||||
print("Checking for dynamic content loading...")
|
||||
scripts = soup.find_all('script')
|
||||
print(f"Script tags found: {len(scripts)}")
|
||||
|
||||
ajax_indicators = ['ajax', 'xhr', 'fetch', 'loadmore', 'getjson']
|
||||
for script in scripts:
|
||||
script_text = script.get_text().lower()
|
||||
for indicator in ajax_indicators:
|
||||
if indicator in script_text:
|
||||
print(f" Found '{indicator}' in script")
|
||||
break
|
||||
|
||||
# Look for hidden content
|
||||
print("\n" + "="*80)
|
||||
print("Looking for collapsed/hidden sections...")
|
||||
hidden = soup.find_all(attrs={'style': re.compile(r'display:\s*none')})
|
||||
print(f"Hidden elements: {len(hidden)}")
|
||||
|
||||
collapsed = soup.find_all(class_=re.compile(r'collaps'))
|
||||
print(f"Collapsible elements: {len(collapsed)}")
|
||||
|
||||
# Check the main content area
|
||||
print("\n" + "="*80)
|
||||
print("Checking if there's a note about alphabetical display...")
|
||||
page_text = soup.get_text()
|
||||
if 'alphabetical' in page_text.lower():
|
||||
# Find context around "alphabetical"
|
||||
idx = page_text.lower().find('alphabetical')
|
||||
context = page_text[max(0, idx-100):idx+100]
|
||||
print(f"Found 'alphabetical' in text: ...{context}...")
|
||||
35
scripts/check_njc_page.py
Normal file
35
scripts/check_njc_page.py
Normal file
@@ -0,0 +1,35 @@
|
||||
"""Check what's on the NJC international page"""
|
||||
import sys
|
||||
sys.path.insert(0, 'src')
|
||||
|
||||
from gov_travel.scrapers import fetch_html
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
url = "https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en"
|
||||
print(f"Fetching: {url}\n")
|
||||
|
||||
html = fetch_html(url)
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
|
||||
# Count tables
|
||||
tables = soup.find_all('table')
|
||||
print(f"Total tables found: {len(tables)}")
|
||||
|
||||
# Find all headings before tables (country names)
|
||||
countries = set()
|
||||
for table in tables:
|
||||
heading = table.find_previous(['h1', 'h2', 'h3', 'h4'])
|
||||
if heading:
|
||||
text = heading.get_text(strip=True)
|
||||
# Extract country name (before " - Currency:")
|
||||
if ' - Currency:' in text:
|
||||
country = text.split(' - Currency:')[0].strip()
|
||||
countries.add(country)
|
||||
|
||||
print(f"\nUnique countries found: {len(countries)}")
|
||||
print("\nFirst 20 countries:")
|
||||
for i, country in enumerate(sorted(countries)[:20], 1):
|
||||
print(f"{i:2}. {country}")
|
||||
|
||||
if len(countries) > 20:
|
||||
print(f"\n... and {len(countries) - 20} more")
|
||||
45
scripts/check_pagination.py
Normal file
45
scripts/check_pagination.py
Normal file
@@ -0,0 +1,45 @@
|
||||
"""Check for pagination or full country list on NJC page"""
|
||||
import sys
|
||||
sys.path.insert(0, 'src')
|
||||
|
||||
from gov_travel.scrapers import fetch_html
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
url = "https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en"
|
||||
html = fetch_html(url)
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
|
||||
# Look for links or navigation
|
||||
print("Looking for navigation elements...")
|
||||
print("\nAll links on the page:")
|
||||
links = soup.find_all('a')
|
||||
for link in links[:20]: # First 20
|
||||
href = link.get('href', '')
|
||||
text = link.get_text(strip=True)
|
||||
if text:
|
||||
print(f" {text}: {href}")
|
||||
|
||||
print("\n" + "="*80)
|
||||
print("Looking for select/dropdown elements (country selector):")
|
||||
selects = soup.find_all('select')
|
||||
for select in selects:
|
||||
name = select.get('name', 'unnamed')
|
||||
print(f"\nSelect field: {name}")
|
||||
options = select.find_all('option')
|
||||
print(f" Options count: {len(options)}")
|
||||
if len(options) > 0:
|
||||
print(f" First 10 options:")
|
||||
for opt in options[:10]:
|
||||
value = opt.get('value', '')
|
||||
text = opt.get_text(strip=True)
|
||||
print(f" {text} ({value})")
|
||||
if len(options) > 10:
|
||||
print(f" ... and {len(options) - 10} more")
|
||||
|
||||
print("\n" + "="*80)
|
||||
print("Looking for forms:")
|
||||
forms = soup.find_all('form')
|
||||
print(f"Forms found: {len(forms)}")
|
||||
for form in forms:
|
||||
print(f" Action: {form.get('action', 'N/A')}")
|
||||
print(f" Method: {form.get('method', 'GET')}")
|
||||
37
scripts/final_stats.py
Normal file
37
scripts/final_stats.py
Normal file
@@ -0,0 +1,37 @@
|
||||
"""Print summary statistics for the scraped international travel rates DB."""
import sqlite3

conn = sqlite3.connect('data/travel_rates_scraped.sqlite3')
cursor = conn.cursor()

# Fix: SQL string literals use single quotes. The original wrote
# source="international", which only works through SQLite's non-standard
# double-quoted-string fallback (double quotes are for identifiers in
# standard SQL) and fails on builds compiled with SQLITE_DQS=0.

# Count countries
cursor.execute("SELECT COUNT(DISTINCT country) FROM rate_entries WHERE source='international' AND country IS NOT NULL")
total_countries = cursor.fetchone()[0]

# Count total entries
cursor.execute("SELECT COUNT(*) FROM rate_entries WHERE source='international'")
total_entries = cursor.fetchone()[0]

# Count unique currencies
cursor.execute("SELECT COUNT(DISTINCT currency) FROM rate_entries WHERE source='international'")
total_currencies = cursor.fetchone()[0]

print(f"✅ COMPLETE SCRAPER RESULTS:")
print(f" Total Countries: {total_countries}")
print(f" Total Entries: {total_entries:,}")
print(f" Unique Currencies: {total_currencies}")

# Show currency breakdown: which currencies cover the most countries.
cursor.execute("""
    SELECT currency, COUNT(DISTINCT country) as country_count, COUNT(*) as entries
    FROM rate_entries
    WHERE source='international'
    GROUP BY currency
    ORDER BY country_count DESC
    LIMIT 20
""")

print(f"\nTop 20 Currencies:")
for row in cursor.fetchall():
    print(f" {row[0]:5} - {row[1]:3} countries, {row[2]:,} entries")

conn.close()
|
||||
78
scripts/verify_all_countries.py
Normal file
78
scripts/verify_all_countries.py
Normal file
@@ -0,0 +1,78 @@
|
||||
"""Verify every scraped country/currency pairing in the rates database."""
import sqlite3

db = sqlite3.connect('data/travel_rates_scraped.sqlite3')
cur = db.cursor()

# Reusable separator lines (same width the report always used).
banner = "=" * 80
rule = "-" * 80

print(banner)
print("COMPLETE COUNTRY AND CURRENCY VERIFICATION")
print(banner)

# Every distinct country/currency pair from the international source,
# with how many cities each pair covers.
cur.execute("""
    SELECT DISTINCT country, currency, COUNT(DISTINCT city) as city_count
    FROM rate_entries
    WHERE source = 'international' AND country IS NOT NULL
    GROUP BY country, currency
    ORDER BY country
""")
international = cur.fetchall()

print(f"\n{'Country':<35} {'Currency':<10} {'Cities':<10}")
print(rule)

for nation, code, city_total in international:
    # Empty/NULL fields render as "N/A", matching the original output.
    print(f"{nation or 'N/A':<35} {code or 'N/A':<10} {city_total:<10}")

print(rule)
print(f"Total: {len(international)} country-currency combinations")

# Sanity check: no entry should be missing its currency.
cur.execute("""
    SELECT COUNT(*)
    FROM rate_entries
    WHERE currency IS NULL
""")
print(f"\nEntries with NULL currency: {cur.fetchone()[0]}")

# Currency summary: countries and entries per currency.
print("\n" + banner)
print("CURRENCY DISTRIBUTION SUMMARY")
print(banner)
cur.execute("""
    SELECT currency, COUNT(DISTINCT country) as countries, COUNT(*) as entries
    FROM rate_entries
    WHERE source = 'international'
    GROUP BY currency
    ORDER BY countries DESC
""")

print(f"\n{'Currency':<10} {'Countries':<15} {'Total Entries':<15}")
print(rule)
for code, nation_count, entry_count in cur.fetchall():
    print(f"{code:<10} {nation_count:<15} {entry_count:<15}")

# Spot-check a few well-known currencies with sample cities.
print("\n" + banner)
print("SAMPLE CITIES BY CURRENCY")
print(banner)

for code in ('EUR', 'USD', 'CAD', 'AUD', 'ARS'):
    cur.execute("""
        SELECT DISTINCT country, city
        FROM rate_entries
        WHERE currency = ? AND city IS NOT NULL
        LIMIT 3
    """, (code,))
    sample = cur.fetchall()
    if sample:
        print(f"\n{code} Cities:")
        for nation, city in sample:
            print(f" • {city}, {nation}")

db.close()
|
||||
Reference in New Issue
Block a user