Merge pull request #2 from mblanke/codex/scrape-data-from-multiple-websites

Improve scraper robustness and store table metadata in DB
This commit is contained in:
2026-01-13 09:33:13 -05:00
committed by GitHub
8 changed files with 630 additions and 1 deletions

5
.gitignore vendored Normal file
View File

@@ -0,0 +1,5 @@
.venv/
__pycache__/
*.pyc
data/
*.sqlite3

View File

@@ -1 +1,30 @@
# Gov_Travel_App
## Overview
This repository contains a Python scraper that collects travel rate tables from the NJC and accommodation listings, then stores the raw tables and normalized entries in a SQLite database.
## Setup
```bash
python -m venv .venv
source .venv/bin/activate
pip install -e .
```
## Run the scraper
```bash
python -m gov_travel.main --db data/travel_rates.sqlite3
```
### Optional flags
- `--sources international domestic accommodations` to limit which sources are scraped.
- `--pause 1.5` to pause between processing tables.
- `--log-level DEBUG` to increase logging verbosity.
- `GOV_TRAVEL_USER_AGENT="YourOrg/1.0"` to override the default user agent.
The database includes:
- `raw_tables` for every scraped HTML table.
- `rate_entries` for parsed rate rows (country/city/province + rate fields).
- `exchange_rates` for parsed currency rates.
- `accommodations` for parsed lodging listings.
If a field is not detected by the heuristics, the full row is still preserved in `raw_tables` and the `raw_json` columns for deeper post-processing.

22
pyproject.toml Normal file
View File

@@ -0,0 +1,22 @@
[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "gov-travel"
version = "0.1.0"
description = "Scrape NJC travel rates into SQLite"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"beautifulsoup4==4.12.3",
"lxml==5.3.0",
"pandas==2.2.3",
"requests==2.32.3",
]
[tool.setuptools]
package-dir = {"" = "src"}
[tool.setuptools.packages.find]
where = ["src"]

4
requirements.txt Normal file
View File

@@ -0,0 +1,4 @@
beautifulsoup4==4.12.3
lxml==5.3.0
pandas==2.2.3
requests==2.32.3

View File

@@ -0,0 +1 @@
"""Gov Travel Scraper."""

242
src/gov_travel/db.py Normal file
View File

@@ -0,0 +1,242 @@
from __future__ import annotations
import json
import sqlite3
from pathlib import Path
from typing import Iterable
SCHEMA_STATEMENTS = [
"""
CREATE TABLE IF NOT EXISTS raw_tables (
id INTEGER PRIMARY KEY AUTOINCREMENT,
source TEXT NOT NULL,
source_url TEXT NOT NULL,
table_index INTEGER NOT NULL,
title TEXT,
data_json TEXT NOT NULL
)
""",
"""
CREATE TABLE IF NOT EXISTS rate_entries (
id INTEGER PRIMARY KEY AUTOINCREMENT,
source TEXT NOT NULL,
source_url TEXT NOT NULL,
table_index INTEGER,
table_title TEXT,
country TEXT,
city TEXT,
province TEXT,
currency TEXT,
rate_type TEXT,
rate_amount REAL,
unit TEXT,
effective_date TEXT,
raw_json TEXT NOT NULL
)
""",
"""
CREATE TABLE IF NOT EXISTS exchange_rates (
id INTEGER PRIMARY KEY AUTOINCREMENT,
source TEXT NOT NULL,
source_url TEXT NOT NULL,
table_index INTEGER,
table_title TEXT,
currency TEXT,
rate_to_cad REAL,
effective_date TEXT,
raw_json TEXT NOT NULL
)
""",
"""
CREATE TABLE IF NOT EXISTS accommodations (
id INTEGER PRIMARY KEY AUTOINCREMENT,
source TEXT NOT NULL,
source_url TEXT NOT NULL,
table_index INTEGER,
table_title TEXT,
property_name TEXT,
address TEXT,
city TEXT,
province TEXT,
phone TEXT,
rate_amount REAL,
currency TEXT,
effective_date TEXT,
raw_json TEXT NOT NULL
)
""",
]
def connect(db_path: Path) -> sqlite3.Connection:
    """Open (creating if necessary) the SQLite database at *db_path*.

    Missing parent directories are created, and rows are returned as
    :class:`sqlite3.Row` so columns can be read by name.
    """
    db_path.parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    return conn
def init_db(connection: sqlite3.Connection) -> None:
    """Create all tables in SCHEMA_STATEMENTS (no-op if they already exist)."""
    for ddl in SCHEMA_STATEMENTS:
        connection.execute(ddl)
    connection.commit()
def insert_raw_tables(
    connection: sqlite3.Connection,
    source: str,
    source_url: str,
    tables: Iterable[dict],
) -> None:
    """Persist scraped tables verbatim into ``raw_tables``.

    Each table dict must carry ``table_index`` and ``data``; ``title`` is
    optional.  ``data`` is serialized to JSON before storage.
    """
    rows = []
    for table in tables:
        rows.append(
            (
                source,
                source_url,
                table["table_index"],
                table.get("title"),
                json.dumps(table["data"], ensure_ascii=False),
            )
        )
    connection.executemany(
        """
        INSERT INTO raw_tables (source, source_url, table_index, title, data_json)
        VALUES (?, ?, ?, ?, ?)
        """,
        rows,
    )
    connection.commit()
def insert_rate_entries(
    connection: sqlite3.Connection,
    entries: Iterable[dict],
) -> None:
    """Insert parsed rate rows into ``rate_entries``.

    ``source``, ``source_url`` and ``raw`` are required per entry; all
    other fields default to NULL.  An empty iterable is a no-op.
    """
    rows = []
    for entry in entries:
        rows.append(
            (
                entry["source"],
                entry["source_url"],
                entry.get("table_index"),
                entry.get("table_title"),
                entry.get("country"),
                entry.get("city"),
                entry.get("province"),
                entry.get("currency"),
                entry.get("rate_type"),
                entry.get("rate_amount"),
                entry.get("unit"),
                entry.get("effective_date"),
                json.dumps(entry["raw"], ensure_ascii=False),
            )
        )
    if not rows:
        return
    connection.executemany(
        """
        INSERT INTO rate_entries (
            source,
            source_url,
            table_index,
            table_title,
            country,
            city,
            province,
            currency,
            rate_type,
            rate_amount,
            unit,
            effective_date,
            raw_json
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """,
        rows,
    )
    connection.commit()
def insert_exchange_rates(
    connection: sqlite3.Connection,
    entries: Iterable[dict],
) -> None:
    """Insert parsed currency rows into ``exchange_rates``.

    Requires ``source``, ``source_url`` and ``raw`` per entry; an empty
    iterable is a no-op.
    """
    rows = []
    for entry in entries:
        rows.append(
            (
                entry["source"],
                entry["source_url"],
                entry.get("table_index"),
                entry.get("table_title"),
                entry.get("currency"),
                entry.get("rate_to_cad"),
                entry.get("effective_date"),
                json.dumps(entry["raw"], ensure_ascii=False),
            )
        )
    if not rows:
        return
    connection.executemany(
        """
        INSERT INTO exchange_rates (
            source,
            source_url,
            table_index,
            table_title,
            currency,
            rate_to_cad,
            effective_date,
            raw_json
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """,
        rows,
    )
    connection.commit()
def insert_accommodations(
    connection: sqlite3.Connection,
    entries: Iterable[dict],
) -> None:
    """Insert parsed lodging rows into ``accommodations``.

    Requires ``source``, ``source_url`` and ``raw`` per entry; an empty
    iterable is a no-op.
    """
    rows = []
    for entry in entries:
        rows.append(
            (
                entry["source"],
                entry["source_url"],
                entry.get("table_index"),
                entry.get("table_title"),
                entry.get("property_name"),
                entry.get("address"),
                entry.get("city"),
                entry.get("province"),
                entry.get("phone"),
                entry.get("rate_amount"),
                entry.get("currency"),
                entry.get("effective_date"),
                json.dumps(entry["raw"], ensure_ascii=False),
            )
        )
    if not rows:
        return
    connection.executemany(
        """
        INSERT INTO accommodations (
            source,
            source_url,
            table_index,
            table_title,
            property_name,
            address,
            city,
            province,
            phone,
            rate_amount,
            currency,
            effective_date,
            raw_json
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """,
        rows,
    )
    connection.commit()

79
src/gov_travel/main.py Normal file
View File

@@ -0,0 +1,79 @@
from __future__ import annotations
import argparse
import logging
from pathlib import Path
from gov_travel import db
from gov_travel.scrapers import (
SOURCES,
build_session,
extract_accommodations,
extract_exchange_rates,
extract_rate_entries,
scrape_tables_from_source,
)
def parse_args() -> argparse.Namespace:
    """Build the scraper CLI and parse ``sys.argv``."""
    source_names = [source.name for source in SOURCES]
    parser = argparse.ArgumentParser(description="Scrape travel rates into SQLite")
    parser.add_argument(
        "--db",
        type=Path,
        default=Path("data/travel_rates.sqlite3"),
        help="Path to the SQLite database",
    )
    parser.add_argument(
        "--sources",
        nargs="*",
        choices=source_names,
        default=source_names,
        help="Limit scraping to specific sources",
    )
    parser.add_argument(
        "--pause",
        type=float,
        default=0.0,
        help="Pause (seconds) between table processing",
    )
    parser.add_argument(
        "--log-level",
        default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        help="Logging level",
    )
    return parser.parse_args()
def main() -> None:
    """CLI entry point: scrape each selected source and persist results.

    Opens the SQLite database given by ``--db``, scrapes every source
    selected via ``--sources``, and stores raw tables plus parsed
    rate/exchange/accommodation entries.  The connection is now closed in
    a ``finally`` block so a failed fetch or parse no longer leaks it.
    """
    args = parse_args()
    logging.basicConfig(level=getattr(logging, args.log_level))
    logger = logging.getLogger(__name__)
    connection = db.connect(args.db)
    try:
        db.init_db(connection)
        session = build_session()
        selected = set(args.sources)
        for source in SOURCES:
            if source.name not in selected:
                continue
            logger.info("Scraping %s (%s)", source.name, source.url)
            tables = scrape_tables_from_source(source, session=session, pause_seconds=args.pause)
            logger.info("Found %s tables for %s", len(tables), source.name)
            db.insert_raw_tables(connection, source.name, source.url, tables)
            db.insert_rate_entries(connection, extract_rate_entries(source, tables))
            db.insert_exchange_rates(connection, extract_exchange_rates(source, tables))
            # Accommodation parsing only applies to the accommodations page.
            if source.name == "accommodations":
                db.insert_accommodations(connection, extract_accommodations(source, tables))
    finally:
        # Fix: original left the connection open if any source raised.
        connection.close()


if __name__ == "__main__":
    main()

247
src/gov_travel/scrapers.py Normal file
View File

@@ -0,0 +1,247 @@
from __future__ import annotations
import json
import logging
import os
import re
import time
from dataclasses import dataclass
from typing import Any, Iterable
import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
# Module-level logger; level is configured by the CLI via logging.basicConfig.
LOGGER = logging.getLogger(__name__)
# User-Agent header sent with every request; overridable through the
# GOV_TRAVEL_USER_AGENT environment variable.
USER_AGENT = os.getenv("GOV_TRAVEL_USER_AGENT", "GovTravelScraper/1.0 (+https://example.com)")
# Per-request timeout, in seconds, passed to requests.Session.get.
DEFAULT_TIMEOUT = 60
@dataclass(frozen=True)
class SourceConfig:
    """A scrape target: a short identifier plus the page URL to fetch."""

    # Short name used for --sources filtering and as the DB "source" value.
    name: str
    # URL of the page whose HTML tables are scraped.
    url: str
# Pages scraped by default; the CLI's --sources flag can narrow this list.
SOURCES = [
    SourceConfig(name="international", url="https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en"),
    SourceConfig(name="domestic", url="https://www.njc-cnm.gc.ca/directive/d10/v325/s978/en"),
    SourceConfig(name="accommodations", url="https://rehelv-acrd.tpsgc-pwgsc.gc.ca/lth-crl-eng.aspx"),
]
def build_session() -> requests.Session:
    """Return a Session that retries GETs on transient HTTP failures.

    Up to three retries with exponential backoff are attempted for
    429/5xx responses; non-retryable statuses are returned as-is
    (``raise_on_status=False``).
    """
    retry_policy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=(429, 500, 502, 503, 504),
        allowed_methods=("GET",),
        raise_on_status=False,
    )
    session = requests.Session()
    for prefix in ("http://", "https://"):
        session.mount(prefix, HTTPAdapter(max_retries=retry_policy))
    return session
def fetch_html(url: str, session: requests.Session | None = None) -> str:
    """Download *url* and return its decoded HTML body.

    Raises ``requests.HTTPError`` for non-2xx responses.  Uses the
    detected (apparent) encoding rather than the declared one before
    decoding the body.
    """
    http = session if session is not None else build_session()
    response = http.get(url, headers={"User-Agent": USER_AGENT}, timeout=DEFAULT_TIMEOUT)
    response.raise_for_status()
    response.encoding = response.apparent_encoding
    return response.text
def extract_tables(html: str) -> list[pd.DataFrame]:
    """Parse every ``<table>`` in *html* into a DataFrame.

    Returns an empty list when the document contains no tables instead of
    propagating the ``ValueError`` pandas raises in that case.
    """
    # Local import keeps the module's top-level dependencies unchanged.
    from io import StringIO

    try:
        # pandas >= 2.1 deprecates passing literal HTML directly to
        # read_html; wrap it in a file-like buffer instead.
        return pd.read_html(StringIO(html))
    except ValueError:
        return []
def _normalize_header(header: str) -> str:
return re.sub(r"\s+", " ", header.strip().lower())
def _parse_amount(value: Any) -> float | None:
if value is None:
return None
text = str(value)
match = re.search(r"-?\d+(?:[\.,]\d+)?", text)
if not match:
return None
amount_text = match.group(0).replace(",", "")
try:
return float(amount_text)
except ValueError:
return None
def _detect_currency(value: Any, fallback: str | None = None) -> str | None:
if value is None:
return fallback
text = str(value).upper()
if "CAD" in text:
return "CAD"
if "USD" in text:
return "USD"
match = re.search(r"\b[A-Z]{3}\b", text)
if match:
return match.group(0)
return fallback
def _table_title_map(html: str) -> dict[int, str]:
    """Map each table's document-order index to a nearby title.

    The title is the text of the closest preceding h1-h4 heading or
    caption element, when one exists.
    """
    soup = BeautifulSoup(html, "html.parser")
    result: dict[int, str] = {}
    for position, node in enumerate(soup.find_all("table")):
        label = node.find_previous(["h1", "h2", "h3", "h4", "caption"])
        # Truthiness check (not `is not None`) mirrors the original:
        # empty elements are skipped.
        if label:
            result[position] = label.get_text(strip=True)
    return result
def scrape_tables_from_source(
    source: SourceConfig,
    session: requests.Session | None = None,
    pause_seconds: float = 0.0,
) -> list[dict[str, Any]]:
    """Fetch *source* and return its tables as JSON-ready record dicts.

    Each result dict has ``table_index``, ``title`` (may be None) and
    ``data`` (list of row dicts).  Sleeps *pause_seconds* after each
    table when set, to throttle processing.
    """
    LOGGER.debug("Fetching HTML for %s", source.url)
    page = fetch_html(source.url, session=session)
    title_lookup = _table_title_map(page)
    collected: list[dict[str, Any]] = []
    for position, frame in enumerate(extract_tables(page)):
        # Round-trip through to_json so the stored rows are JSON-safe.
        records = json.loads(frame.to_json(orient="records"))
        collected.append(
            {
                "table_index": position,
                "title": title_lookup.get(position),
                "data": records,
            }
        )
        if pause_seconds:
            time.sleep(pause_seconds)
    return collected
def extract_rate_entries(
    source: SourceConfig,
    tables: Iterable[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Flatten rate tables into one entry per (row, numeric column).

    Location/date/currency columns supply shared context; every other
    column that parses as a number becomes its own entry, with the
    column header as ``rate_type``.  The original row is kept in ``raw``.
    """
    # Columns that describe the row rather than carry a rate value.
    context_keys = {
        "country", "country/territory", "city", "location",
        "province", "province/territory", "currency",
        "effective", "effective date",
    }
    results: list[dict[str, Any]] = []
    for table in tables:
        index = table["table_index"]
        title = table.get("title")
        for row in table["data"]:
            fields = {_normalize_header(key): val for key, val in row.items()}
            country = fields.get("country") or fields.get("country/territory")
            city = fields.get("city") or fields.get("location")
            province = fields.get("province") or fields.get("province/territory")
            row_currency = _detect_currency(fields.get("currency"))
            effective = fields.get("effective date") or fields.get("effective")
            for column, cell in fields.items():
                if column in context_keys:
                    continue
                amount = _parse_amount(cell)
                if amount is None:
                    continue
                results.append(
                    {
                        "source": source.name,
                        "source_url": source.url,
                        "table_index": index,
                        "table_title": title,
                        "country": country,
                        "city": city,
                        "province": province,
                        # Prefer a currency embedded in the cell itself.
                        "currency": _detect_currency(cell, fallback=row_currency),
                        "rate_type": column,
                        "rate_amount": amount,
                        "unit": None,
                        "effective_date": effective,
                        "raw": row,
                    }
                )
    return results
def extract_exchange_rates(
    source: SourceConfig,
    tables: Iterable[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Pull currency/rate pairs out of exchange-rate style tables.

    Rows lacking either a recognizable currency column or a parseable
    rate are skipped entirely.
    """
    found: list[dict[str, Any]] = []
    for table in tables:
        for row in table["data"]:
            cols = {_normalize_header(key): val for key, val in row.items()}
            currency_cell = (
                cols.get("currency")
                or cols.get("currency code")
                or cols.get("code")
            )
            rate_cell = (
                cols.get("exchange rate")
                or cols.get("rate")
                or cols.get("cad rate")
                or cols.get("rate to cad")
            )
            amount = _parse_amount(rate_cell)
            if not currency_cell or amount is None:
                continue
            found.append(
                {
                    "source": source.name,
                    "source_url": source.url,
                    "table_index": table["table_index"],
                    "table_title": table.get("title"),
                    "currency": _detect_currency(currency_cell),
                    "rate_to_cad": amount,
                    "effective_date": cols.get("effective date") or cols.get("date"),
                    "raw": row,
                }
            )
    return found
def extract_accommodations(
    source: SourceConfig,
    tables: Iterable[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Parse lodging listings out of accommodation directory tables.

    A row is kept when it names a property or at least a city; rate and
    currency are best-effort parsed from the "rate"-style columns.
    """
    listings: list[dict[str, Any]] = []
    for table in tables:
        for row in table["data"]:
            cols = {_normalize_header(key): val for key, val in row.items()}
            name = (
                cols.get("property")
                or cols.get("hotel")
                or cols.get("accommodation")
                or cols.get("name")
            )
            if not name and not cols.get("city"):
                continue
            price = _parse_amount(
                cols.get("rate")
                or cols.get("room rate")
                or cols.get("daily rate")
            )
            listings.append(
                {
                    "source": source.name,
                    "source_url": source.url,
                    "table_index": table["table_index"],
                    "table_title": table.get("title"),
                    "property_name": name,
                    "address": cols.get("address"),
                    "city": cols.get("city") or cols.get("location"),
                    "province": cols.get("province") or cols.get("province/territory"),
                    "phone": cols.get("phone") or cols.get("telephone"),
                    "rate_amount": price,
                    # Currency is only sniffed from the plain "rate" column.
                    "currency": _detect_currency(cols.get("rate")),
                    "effective_date": cols.get("effective date") or cols.get("effective"),
                    "raw": row,
                }
            )
    return listings