# Gov_Travel_App/scripts/check_njc_page.py

"""Check what's on the NJC international page"""
import sys
sys.path.insert(0, 'src')

from gov_travel.scrapers import fetch_html
from bs4 import BeautifulSoup

# Page to inspect (the NJC international page named in the module docstring).
url = "https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en"
print(f"Fetching: {url}\n")

# Download the raw markup and build a parse tree from it.
html = fetch_html(url)
soup = BeautifulSoup(html, 'html.parser')

# Report how many <table> elements the page contains.
tables = soup.find_all('table')
print(f"Total tables found: {len(tables)}")

# For every table, look backwards in the document for the closest h1-h4
# heading and keep its text; tables with no preceding heading are skipped.
nearest_headings = (tbl.find_previous(['h1', 'h2', 'h3', 'h4']) for tbl in tables)
heading_texts = [node.get_text(strip=True) for node in nearest_headings if node]

# Only headings containing " - Currency:" are treated as country headings;
# the country name is whatever precedes the first occurrence of that marker.
# A set is used so a heading shared by several tables is counted once.
countries = {
    label.partition(' - Currency:')[0].strip()
    for label in heading_texts
    if ' - Currency:' in label
}

print(f"\nUnique countries found: {len(countries)}")
print("\nFirst 20 countries:")
alphabetical = sorted(countries)
for rank, name in enumerate(alphabetical[:20], start=1):
    print(f"{rank:2}. {name}")

# Mention how many names were left out of the 20-entry preview, if any.
overflow = len(countries) - 20
if overflow > 0:
    print(f"\n... and {overflow} more")