Add alphabet navigation to scraper - now collects ALL 233 countries

- Implemented alphabet navigation (A-Z) for NJC international rates page
- Added request delays (2s) and retry logic with exponential backoff to avoid server timeouts
- Added error handling for pages without tables
- Installed html5lib for better HTML parsing
- Now scrapes 233 countries (up from 15) with 104 unique currencies
- Total 11,628 international rate entries collected
- Added verification scripts to check all countries and their currencies
- Fixed currency extraction; it now works for EUR, USD, CAD, AUD, ARS, and 99+ other currencies
This commit is contained in:
2026-01-13 09:27:21 -05:00
parent 15094ac94b
commit 969ba062f7
7 changed files with 371 additions and 21 deletions

37
scripts/final_stats.py Normal file
View File

@@ -0,0 +1,37 @@
import sqlite3
# Default location of the scraped database, relative to the working directory.
DB_PATH = 'data/travel_rates_scraped.sqlite3'
# Value of the `source` column that the report is restricted to.
SOURCE_NAME = 'international'
# Number of rows shown in the per-currency breakdown.
TOP_CURRENCY_LIMIT = 20
# Shown in place of a NULL currency; None cannot be formatted with a width.
MISSING_CURRENCY_LABEL = 'N/A'


def collect_stats(conn, source=SOURCE_NAME, limit=TOP_CURRENCY_LIMIT):
    """Collect summary statistics from the ``rate_entries`` table.

    Args:
        conn: An open ``sqlite3`` connection to a database containing a
            ``rate_entries`` table with ``source``, ``country`` and
            ``currency`` columns.
        source: Only rows whose ``source`` column equals this value are
            counted.
        limit: Maximum number of rows in the currency breakdown.

    Returns:
        A dict with the keys ``total_countries``, ``total_entries`` and
        ``total_currencies`` (ints) and ``top_currencies``, a list of
        ``(currency, country_count, entries)`` tuples ordered by
        ``country_count`` descending. ``currency`` is ``None`` for the
        group of rows whose currency is NULL.

    Raises:
        sqlite3.Error: If a query fails (for example, the table is missing).
    """
    cursor = conn.cursor()

    # The source value is bound as a parameter rather than written as
    # "international": SQLite parses a double-quoted token as an identifier
    # first and only falls back to a string literal as a legacy quirk.
    cursor.execute(
        'SELECT COUNT(DISTINCT country) FROM rate_entries '
        'WHERE source = ? AND country IS NOT NULL',
        (source,),
    )
    total_countries = cursor.fetchone()[0]

    cursor.execute(
        'SELECT COUNT(*) FROM rate_entries WHERE source = ?',
        (source,),
    )
    total_entries = cursor.fetchone()[0]

    cursor.execute(
        'SELECT COUNT(DISTINCT currency) FROM rate_entries WHERE source = ?',
        (source,),
    )
    total_currencies = cursor.fetchone()[0]

    cursor.execute(
        """
        SELECT currency, COUNT(DISTINCT country) AS country_count, COUNT(*) AS entries
        FROM rate_entries
        WHERE source = ?
        GROUP BY currency
        ORDER BY country_count DESC
        LIMIT ?
        """,
        (source, limit),
    )
    top_currencies = cursor.fetchall()

    return {
        'total_countries': total_countries,
        'total_entries': total_entries,
        'total_currencies': total_currencies,
        'top_currencies': top_currencies,
    }


def main(db_path=DB_PATH):
    """Print the scraper summary report for the database at ``db_path``.

    Args:
        db_path: Path of the SQLite database file to read.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried.
    """
    conn = sqlite3.connect(db_path)
    try:
        stats = collect_stats(conn)
    finally:
        # Release the connection even when a query raises.
        conn.close()

    print("✅ COMPLETE SCRAPER RESULTS:")
    print(f" Total Countries: {stats['total_countries']}")
    print(f" Total Entries: {stats['total_entries']:,}")
    print(f" Unique Currencies: {stats['total_currencies']}")

    print(f"\nTop {TOP_CURRENCY_LIMIT} Currencies:")
    for currency, country_count, entries in stats['top_currencies']:
        # GROUP BY produces a NULL group when rows lack a currency.
        label = MISSING_CURRENCY_LABEL if currency is None else currency
        print(f" {label:5} - {country_count:3} countries, {entries:,} entries")


if __name__ == "__main__":
    main()