Merge pull request #2 from mblanke/codex/scrape-data-from-multiple-websites

Improve scraper robustness and store table metadata in DB
This commit is contained in:
2026-01-13 09:33:13 -05:00
committed by GitHub
8 changed files with 630 additions and 1 deletions

5
.gitignore vendored Normal file
View File

@@ -0,0 +1,5 @@
.venv/
__pycache__/
*.pyc
data/
*.sqlite3

View File

@@ -1 +1,30 @@
# Gov_Travel_App
## Overview
This repository contains a Python scraper that collects travel rate tables from the NJC and accommodation listings, then stores the raw tables and normalized entries in a SQLite database.
## Setup
```bash
python -m venv .venv
source .venv/bin/activate
pip install -e .
```
## Run the scraper
```bash
python -m gov_travel.main --db data/travel_rates.sqlite3
```
### Optional flags
- `--sources international domestic accommodations` to limit which sources are scraped.
- `--pause 1.5` to pause between processing tables.
- `--log-level DEBUG` to increase logging verbosity.
- `GOV_TRAVEL_USER_AGENT="YourOrg/1.0"` to override the default user agent.
The database includes:
- `raw_tables` for every scraped HTML table.
- `rate_entries` for parsed rate rows (country/city/province + rate fields).
- `exchange_rates` for parsed currency rates.
- `accommodations` for parsed lodging listings.
If a field is not detected by the heuristics, the full row is still preserved in `raw_tables` and the `raw_json` columns for deeper post-processing.

22
pyproject.toml Normal file
View File

@@ -0,0 +1,22 @@
[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "gov-travel"
version = "0.1.0"
description = "Scrape NJC travel rates into SQLite"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"beautifulsoup4==4.12.3",
"lxml==5.3.0",
"pandas==2.2.3",
"requests==2.32.3",
]
[tool.setuptools]
package-dir = {"" = "src"}
[tool.setuptools.packages.find]
where = ["src"]

4
requirements.txt Normal file
View File

@@ -0,0 +1,4 @@
beautifulsoup4==4.12.3
lxml==5.3.0
pandas==2.2.3
requests==2.32.3

View File

@@ -0,0 +1 @@
"""Gov Travel Scraper."""

242
src/gov_travel/db.py Normal file
View File

@@ -0,0 +1,242 @@
from __future__ import annotations
import json
import sqlite3
from pathlib import Path
from typing import Iterable
SCHEMA_STATEMENTS = [
"""
CREATE TABLE IF NOT EXISTS raw_tables (
id INTEGER PRIMARY KEY AUTOINCREMENT,
source TEXT NOT NULL,
source_url TEXT NOT NULL,
table_index INTEGER NOT NULL,
title TEXT,
data_json TEXT NOT NULL
)
""",
"""
CREATE TABLE IF NOT EXISTS rate_entries (
id INTEGER PRIMARY KEY AUTOINCREMENT,
source TEXT NOT NULL,
source_url TEXT NOT NULL,
table_index INTEGER,
table_title TEXT,
country TEXT,
city TEXT,
province TEXT,
currency TEXT,
rate_type TEXT,
rate_amount REAL,
unit TEXT,
effective_date TEXT,
raw_json TEXT NOT NULL
)
""",
"""
CREATE TABLE IF NOT EXISTS exchange_rates (
id INTEGER PRIMARY KEY AUTOINCREMENT,
source TEXT NOT NULL,
source_url TEXT NOT NULL,
table_index INTEGER,
table_title TEXT,
currency TEXT,
rate_to_cad REAL,
effective_date TEXT,
raw_json TEXT NOT NULL
)
""",
"""
CREATE TABLE IF NOT EXISTS accommodations (
id INTEGER PRIMARY KEY AUTOINCREMENT,
source TEXT NOT NULL,
source_url TEXT NOT NULL,
table_index INTEGER,
table_title TEXT,
property_name TEXT,
address TEXT,
city TEXT,
province TEXT,
phone TEXT,
rate_amount REAL,
currency TEXT,
effective_date TEXT,
raw_json TEXT NOT NULL
)
""",
]
def connect(db_path: Path) -> sqlite3.Connection:
    """Open (creating if necessary) the SQLite database at *db_path*.

    Missing parent directories are created, and rows are returned as
    :class:`sqlite3.Row` so columns can be read by name.
    """
    db_path.parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    return conn
def init_db(connection: sqlite3.Connection) -> None:
    """Create all tables in SCHEMA_STATEMENTS (no-op if they already exist)."""
    for ddl in SCHEMA_STATEMENTS:
        connection.execute(ddl)
    connection.commit()
def insert_raw_tables(
    connection: sqlite3.Connection,
    source: str,
    source_url: str,
    tables: Iterable[dict],
) -> None:
    """Persist scraped tables verbatim into ``raw_tables``.

    Each table dict must carry ``table_index`` and ``data``; ``title`` is
    optional.  ``data`` is serialized to JSON before storage.
    """
    rows = []
    for table in tables:
        rows.append(
            (
                source,
                source_url,
                table["table_index"],
                table.get("title"),
                json.dumps(table["data"], ensure_ascii=False),
            )
        )
    connection.executemany(
        """
        INSERT INTO raw_tables (source, source_url, table_index, title, data_json)
        VALUES (?, ?, ?, ?, ?)
        """,
        rows,
    )
    connection.commit()
def insert_rate_entries(
    connection: sqlite3.Connection,
    entries: Iterable[dict],
) -> None:
    """Insert parsed rate rows into ``rate_entries``.

    ``source``, ``source_url`` and ``raw`` are required per entry; all
    other fields default to NULL.  An empty iterable is a no-op.
    """
    rows = []
    for entry in entries:
        rows.append(
            (
                entry["source"],
                entry["source_url"],
                entry.get("table_index"),
                entry.get("table_title"),
                entry.get("country"),
                entry.get("city"),
                entry.get("province"),
                entry.get("currency"),
                entry.get("rate_type"),
                entry.get("rate_amount"),
                entry.get("unit"),
                entry.get("effective_date"),
                json.dumps(entry["raw"], ensure_ascii=False),
            )
        )
    if not rows:
        return
    connection.executemany(
        """
        INSERT INTO rate_entries (
            source,
            source_url,
            table_index,
            table_title,
            country,
            city,
            province,
            currency,
            rate_type,
            rate_amount,
            unit,
            effective_date,
            raw_json
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """,
        rows,
    )
    connection.commit()
def insert_exchange_rates(
    connection: sqlite3.Connection,
    entries: Iterable[dict],
) -> None:
    """Insert parsed currency rows into ``exchange_rates``.

    Requires ``source``, ``source_url`` and ``raw`` per entry; an empty
    iterable is a no-op.
    """
    rows = []
    for entry in entries:
        rows.append(
            (
                entry["source"],
                entry["source_url"],
                entry.get("table_index"),
                entry.get("table_title"),
                entry.get("currency"),
                entry.get("rate_to_cad"),
                entry.get("effective_date"),
                json.dumps(entry["raw"], ensure_ascii=False),
            )
        )
    if not rows:
        return
    connection.executemany(
        """
        INSERT INTO exchange_rates (
            source,
            source_url,
            table_index,
            table_title,
            currency,
            rate_to_cad,
            effective_date,
            raw_json
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
        """,
        rows,
    )
    connection.commit()
def insert_accommodations(
    connection: sqlite3.Connection,
    entries: Iterable[dict],
) -> None:
    """Insert parsed lodging rows into ``accommodations``.

    Requires ``source``, ``source_url`` and ``raw`` per entry; an empty
    iterable is a no-op.
    """
    rows = []
    for entry in entries:
        rows.append(
            (
                entry["source"],
                entry["source_url"],
                entry.get("table_index"),
                entry.get("table_title"),
                entry.get("property_name"),
                entry.get("address"),
                entry.get("city"),
                entry.get("province"),
                entry.get("phone"),
                entry.get("rate_amount"),
                entry.get("currency"),
                entry.get("effective_date"),
                json.dumps(entry["raw"], ensure_ascii=False),
            )
        )
    if not rows:
        return
    connection.executemany(
        """
        INSERT INTO accommodations (
            source,
            source_url,
            table_index,
            table_title,
            property_name,
            address,
            city,
            province,
            phone,
            rate_amount,
            currency,
            effective_date,
            raw_json
        )
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """,
        rows,
    )
    connection.commit()

79
src/gov_travel/main.py Normal file
View File

@@ -0,0 +1,79 @@
from __future__ import annotations
import argparse
import logging
from pathlib import Path
from gov_travel import db
from gov_travel.scrapers import (
SOURCES,
build_session,
extract_accommodations,
extract_exchange_rates,
extract_rate_entries,
scrape_tables_from_source,
)
def parse_args() -> argparse.Namespace:
    """Build the scraper CLI and parse ``sys.argv``."""
    source_names = [source.name for source in SOURCES]
    parser = argparse.ArgumentParser(description="Scrape travel rates into SQLite")
    parser.add_argument(
        "--db",
        type=Path,
        default=Path("data/travel_rates.sqlite3"),
        help="Path to the SQLite database",
    )
    parser.add_argument(
        "--sources",
        nargs="*",
        choices=source_names,
        default=source_names,
        help="Limit scraping to specific sources",
    )
    parser.add_argument(
        "--pause",
        type=float,
        default=0.0,
        help="Pause (seconds) between table processing",
    )
    parser.add_argument(
        "--log-level",
        default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        help="Logging level",
    )
    return parser.parse_args()
def main() -> None:
    """CLI entry point: scrape each selected source and persist results.

    Opens the SQLite database given by ``--db``, scrapes every source
    selected via ``--sources``, and stores raw tables plus parsed
    rate/exchange/accommodation entries.  The connection is now closed in
    a ``finally`` block so a failed fetch or parse no longer leaks it.
    """
    args = parse_args()
    logging.basicConfig(level=getattr(logging, args.log_level))
    logger = logging.getLogger(__name__)
    connection = db.connect(args.db)
    try:
        db.init_db(connection)
        session = build_session()
        selected = set(args.sources)
        for source in SOURCES:
            if source.name not in selected:
                continue
            logger.info("Scraping %s (%s)", source.name, source.url)
            tables = scrape_tables_from_source(source, session=session, pause_seconds=args.pause)
            logger.info("Found %s tables for %s", len(tables), source.name)
            db.insert_raw_tables(connection, source.name, source.url, tables)
            db.insert_rate_entries(connection, extract_rate_entries(source, tables))
            db.insert_exchange_rates(connection, extract_exchange_rates(source, tables))
            # Accommodation parsing only applies to the accommodations page.
            if source.name == "accommodations":
                db.insert_accommodations(connection, extract_accommodations(source, tables))
    finally:
        # Fix: original left the connection open if any source raised.
        connection.close()


if __name__ == "__main__":
    main()

247
src/gov_travel/scrapers.py Normal file
View File

@@ -0,0 +1,247 @@
from __future__ import annotations
import json
import logging
import os
import re
import time
from dataclasses import dataclass
from typing import Any, Iterable
import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
# Module-level logger; level is configured by the CLI via logging.basicConfig.
LOGGER = logging.getLogger(__name__)
# User-Agent header sent with every request; overridable through the
# GOV_TRAVEL_USER_AGENT environment variable.
USER_AGENT = os.getenv("GOV_TRAVEL_USER_AGENT", "GovTravelScraper/1.0 (+https://example.com)")
# Per-request timeout, in seconds, passed to requests.Session.get.
DEFAULT_TIMEOUT = 60
@dataclass(frozen=True)
class SourceConfig:
    """A scrape target: a short identifier plus the page URL to fetch."""

    # Short name used for --sources filtering and as the DB "source" value.
    name: str
    # URL of the page whose HTML tables are scraped.
    url: str
# Pages scraped by default; the CLI's --sources flag can narrow this list.
SOURCES = [
    SourceConfig(name="international", url="https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en"),
    SourceConfig(name="domestic", url="https://www.njc-cnm.gc.ca/directive/d10/v325/s978/en"),
    SourceConfig(name="accommodations", url="https://rehelv-acrd.tpsgc-pwgsc.gc.ca/lth-crl-eng.aspx"),
]
def build_session() -> requests.Session:
    """Return a Session that retries GETs on transient HTTP failures.

    Up to three retries with exponential backoff are attempted for
    429/5xx responses; non-retryable statuses are returned as-is
    (``raise_on_status=False``).
    """
    retry_policy = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=(429, 500, 502, 503, 504),
        allowed_methods=("GET",),
        raise_on_status=False,
    )
    session = requests.Session()
    for prefix in ("http://", "https://"):
        session.mount(prefix, HTTPAdapter(max_retries=retry_policy))
    return session
def fetch_html(url: str, session: requests.Session | None = None) -> str:
    """Download *url* and return its decoded HTML body.

    Raises ``requests.HTTPError`` for non-2xx responses.  Uses the
    detected (apparent) encoding rather than the declared one before
    decoding the body.
    """
    http = session if session is not None else build_session()
    response = http.get(url, headers={"User-Agent": USER_AGENT}, timeout=DEFAULT_TIMEOUT)
    response.raise_for_status()
    response.encoding = response.apparent_encoding
    return response.text
def extract_tables(html: str) -> list[pd.DataFrame]:
    """Parse every ``<table>`` in *html* into a DataFrame.

    Returns an empty list when the document contains no tables instead of
    propagating the ``ValueError`` pandas raises in that case.
    """
    # Local import keeps the module's top-level dependencies unchanged.
    from io import StringIO

    try:
        # pandas >= 2.1 deprecates passing literal HTML directly to
        # read_html; wrap it in a file-like buffer instead.
        return pd.read_html(StringIO(html))
    except ValueError:
        return []
def _normalize_header(header: str) -> str:
return re.sub(r"\s+", " ", header.strip().lower())
def _parse_amount(value: Any) -> float | None:
if value is None:
return None
text = str(value)
match = re.search(r"-?\d+(?:[\.,]\d+)?", text)
if not match:
return None
amount_text = match.group(0).replace(",", "")
try:
return float(amount_text)
except ValueError:
return None
def _detect_currency(value: Any, fallback: str | None = None) -> str | None:
if value is None:
return fallback
text = str(value).upper()
if "CAD" in text:
return "CAD"
if "USD" in text:
return "USD"
match = re.search(r"\b[A-Z]{3}\b", text)
if match:
return match.group(0)
return fallback
def _table_title_map(html: str) -> dict[int, str]:
    """Map each table's document-order index to a nearby title.

    The title is the text of the closest preceding h1-h4 heading or
    caption element, when one exists.
    """
    soup = BeautifulSoup(html, "html.parser")
    result: dict[int, str] = {}
    for position, node in enumerate(soup.find_all("table")):
        label = node.find_previous(["h1", "h2", "h3", "h4", "caption"])
        # Truthiness check (not `is not None`) mirrors the original:
        # empty elements are skipped.
        if label:
            result[position] = label.get_text(strip=True)
    return result
def scrape_tables_from_source(
    source: SourceConfig,
    session: requests.Session | None = None,
    pause_seconds: float = 0.0,
) -> list[dict[str, Any]]:
    """Fetch *source* and return its tables as JSON-ready record dicts.

    Each result dict has ``table_index``, ``title`` (may be None) and
    ``data`` (list of row dicts).  Sleeps *pause_seconds* after each
    table when set, to throttle processing.
    """
    LOGGER.debug("Fetching HTML for %s", source.url)
    page = fetch_html(source.url, session=session)
    title_lookup = _table_title_map(page)
    collected: list[dict[str, Any]] = []
    for position, frame in enumerate(extract_tables(page)):
        # Round-trip through to_json so the stored rows are JSON-safe.
        records = json.loads(frame.to_json(orient="records"))
        collected.append(
            {
                "table_index": position,
                "title": title_lookup.get(position),
                "data": records,
            }
        )
        if pause_seconds:
            time.sleep(pause_seconds)
    return collected
def extract_rate_entries(
    source: SourceConfig,
    tables: Iterable[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Flatten rate tables into one entry per (row, numeric column).

    Location/date/currency columns supply shared context; every other
    column that parses as a number becomes its own entry, with the
    column header as ``rate_type``.  The original row is kept in ``raw``.
    """
    # Columns that describe the row rather than carry a rate value.
    context_keys = {
        "country", "country/territory", "city", "location",
        "province", "province/territory", "currency",
        "effective", "effective date",
    }
    results: list[dict[str, Any]] = []
    for table in tables:
        index = table["table_index"]
        title = table.get("title")
        for row in table["data"]:
            fields = {_normalize_header(key): val for key, val in row.items()}
            country = fields.get("country") or fields.get("country/territory")
            city = fields.get("city") or fields.get("location")
            province = fields.get("province") or fields.get("province/territory")
            row_currency = _detect_currency(fields.get("currency"))
            effective = fields.get("effective date") or fields.get("effective")
            for column, cell in fields.items():
                if column in context_keys:
                    continue
                amount = _parse_amount(cell)
                if amount is None:
                    continue
                results.append(
                    {
                        "source": source.name,
                        "source_url": source.url,
                        "table_index": index,
                        "table_title": title,
                        "country": country,
                        "city": city,
                        "province": province,
                        # Prefer a currency embedded in the cell itself.
                        "currency": _detect_currency(cell, fallback=row_currency),
                        "rate_type": column,
                        "rate_amount": amount,
                        "unit": None,
                        "effective_date": effective,
                        "raw": row,
                    }
                )
    return results
def extract_exchange_rates(
    source: SourceConfig,
    tables: Iterable[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Pull currency/rate pairs out of exchange-rate style tables.

    Rows lacking either a recognizable currency column or a parseable
    rate are skipped entirely.
    """
    found: list[dict[str, Any]] = []
    for table in tables:
        for row in table["data"]:
            cols = {_normalize_header(key): val for key, val in row.items()}
            currency_cell = (
                cols.get("currency")
                or cols.get("currency code")
                or cols.get("code")
            )
            rate_cell = (
                cols.get("exchange rate")
                or cols.get("rate")
                or cols.get("cad rate")
                or cols.get("rate to cad")
            )
            amount = _parse_amount(rate_cell)
            if not currency_cell or amount is None:
                continue
            found.append(
                {
                    "source": source.name,
                    "source_url": source.url,
                    "table_index": table["table_index"],
                    "table_title": table.get("title"),
                    "currency": _detect_currency(currency_cell),
                    "rate_to_cad": amount,
                    "effective_date": cols.get("effective date") or cols.get("date"),
                    "raw": row,
                }
            )
    return found
def extract_accommodations(
    source: SourceConfig,
    tables: Iterable[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Parse lodging listings out of accommodation directory tables.

    A row is kept when it names a property or at least a city; rate and
    currency are best-effort parsed from the "rate"-style columns.
    """
    listings: list[dict[str, Any]] = []
    for table in tables:
        for row in table["data"]:
            cols = {_normalize_header(key): val for key, val in row.items()}
            name = (
                cols.get("property")
                or cols.get("hotel")
                or cols.get("accommodation")
                or cols.get("name")
            )
            if not name and not cols.get("city"):
                continue
            price = _parse_amount(
                cols.get("rate")
                or cols.get("room rate")
                or cols.get("daily rate")
            )
            listings.append(
                {
                    "source": source.name,
                    "source_url": source.url,
                    "table_index": table["table_index"],
                    "table_title": table.get("title"),
                    "property_name": name,
                    "address": cols.get("address"),
                    "city": cols.get("city") or cols.get("location"),
                    "province": cols.get("province") or cols.get("province/territory"),
                    "phone": cols.get("phone") or cols.get("telephone"),
                    "rate_amount": price,
                    # Currency is only sniffed from the plain "rate" column.
                    "currency": _detect_currency(cols.get("rate")),
                    "effective_date": cols.get("effective date") or cols.get("effective"),
                    "raw": row,
                }
            )
    return listings