diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..992cf81 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +.venv/ +__pycache__/ +*.pyc +data/ +*.sqlite3 diff --git a/README.md b/README.md index 5f70f21..e005923 100644 --- a/README.md +++ b/README.md @@ -1 +1,30 @@ -# Gov_Travel_App \ No newline at end of file +# Gov_Travel_App + +## Overview +This repository contains a Python scraper that collects travel rate tables from the NJC and accommodation listings, then stores the raw tables and normalized entries in a SQLite database. + +## Setup +```bash +python -m venv .venv +source .venv/bin/activate +pip install -e . +``` + +## Run the scraper +```bash +python -m gov_travel.main --db data/travel_rates.sqlite3 +``` + +### Optional flags +- `--sources international domestic accommodations` to limit which sources are scraped. +- `--pause 1.5` to pause between processing tables. +- `--log-level DEBUG` to increase logging verbosity. +- Set the environment variable `GOV_TRAVEL_USER_AGENT="YourOrg/1.0"` (not a CLI flag) to override the default user agent. + +The database includes: +- `raw_tables` for every scraped HTML table. +- `rate_entries` for parsed rate rows (country/city/province + rate fields). +- `exchange_rates` for parsed currency rates. +- `accommodations` for parsed lodging listings. + +If a field is not detected by the heuristics, the full row is still preserved in `raw_tables` and the `raw_json` columns for deeper post-processing. 
"""SQLite persistence layer for scraped NJC travel-rate tables and parsed entries."""

from __future__ import annotations

import json
import sqlite3
from pathlib import Path
from typing import Iterable, Sequence

# DDL run by init_db(); every statement is idempotent (IF NOT EXISTS), so
# init_db() may safely be called on every program start.
SCHEMA_STATEMENTS = [
    """
    CREATE TABLE IF NOT EXISTS raw_tables (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        source TEXT NOT NULL,
        source_url TEXT NOT NULL,
        table_index INTEGER NOT NULL,
        title TEXT,
        data_json TEXT NOT NULL
    )
    """,
    """
    CREATE TABLE IF NOT EXISTS rate_entries (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        source TEXT NOT NULL,
        source_url TEXT NOT NULL,
        table_index INTEGER,
        table_title TEXT,
        country TEXT,
        city TEXT,
        province TEXT,
        currency TEXT,
        rate_type TEXT,
        rate_amount REAL,
        unit TEXT,
        effective_date TEXT,
        raw_json TEXT NOT NULL
    )
    """,
    """
    CREATE TABLE IF NOT EXISTS exchange_rates (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        source TEXT NOT NULL,
        source_url TEXT NOT NULL,
        table_index INTEGER,
        table_title TEXT,
        currency TEXT,
        rate_to_cad REAL,
        effective_date TEXT,
        raw_json TEXT NOT NULL
    )
    """,
    """
    CREATE TABLE IF NOT EXISTS accommodations (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        source TEXT NOT NULL,
        source_url TEXT NOT NULL,
        table_index INTEGER,
        table_title TEXT,
        property_name TEXT,
        address TEXT,
        city TEXT,
        province TEXT,
        phone TEXT,
        rate_amount REAL,
        currency TEXT,
        effective_date TEXT,
        raw_json TEXT NOT NULL
    )
    """,
]


def connect(db_path: Path) -> sqlite3.Connection:
    """Open the SQLite database at *db_path*, creating parent directories.

    Rows are returned as :class:`sqlite3.Row` so callers can address
    columns by name.
    """
    db_path.parent.mkdir(parents=True, exist_ok=True)
    connection = sqlite3.connect(db_path)
    connection.row_factory = sqlite3.Row
    return connection


def init_db(connection: sqlite3.Connection) -> None:
    """Create all tables if they do not already exist, then commit."""
    for statement in SCHEMA_STATEMENTS:
        connection.execute(statement)
    connection.commit()


def _dump_json(value) -> str:
    """Serialize *value* to JSON, preserving non-ASCII characters verbatim."""
    return json.dumps(value, ensure_ascii=False)


def _insert_many(
    connection: sqlite3.Connection,
    table: str,
    columns: Sequence[str],
    payload: list[tuple],
) -> None:
    """Bulk-insert *payload* rows into *table* and commit.

    No-op when *payload* is empty, so callers can pass through extraction
    results unconditionally. The column list and placeholder count are
    derived from *columns* — the table/column names come only from the
    module-internal specs below, never from user input.
    """
    if not payload:
        return
    column_list = ", ".join(columns)
    placeholders = ", ".join("?" for _ in columns)
    connection.executemany(
        f"INSERT INTO {table} ({column_list}) VALUES ({placeholders})",
        payload,
    )
    connection.commit()


def insert_raw_tables(
    connection: sqlite3.Connection,
    source: str,
    source_url: str,
    tables: Iterable[dict],
) -> None:
    """Persist every scraped HTML table verbatim (as JSON) for later reparsing."""
    payload = [
        (
            source,
            source_url,
            table["table_index"],
            table.get("title"),
            _dump_json(table["data"]),
        )
        for table in tables
    ]
    _insert_many(
        connection,
        "raw_tables",
        ("source", "source_url", "table_index", "title", "data_json"),
        payload,
    )


def insert_rate_entries(
    connection: sqlite3.Connection,
    entries: Iterable[dict],
) -> None:
    """Persist normalized per-rate rows; the original row is kept in raw_json."""
    payload = [
        (
            entry["source"],
            entry["source_url"],
            entry.get("table_index"),
            entry.get("table_title"),
            entry.get("country"),
            entry.get("city"),
            entry.get("province"),
            entry.get("currency"),
            entry.get("rate_type"),
            entry.get("rate_amount"),
            entry.get("unit"),
            entry.get("effective_date"),
            _dump_json(entry["raw"]),
        )
        for entry in entries
    ]
    _insert_many(
        connection,
        "rate_entries",
        (
            "source",
            "source_url",
            "table_index",
            "table_title",
            "country",
            "city",
            "province",
            "currency",
            "rate_type",
            "rate_amount",
            "unit",
            "effective_date",
            "raw_json",
        ),
        payload,
    )


def insert_exchange_rates(
    connection: sqlite3.Connection,
    entries: Iterable[dict],
) -> None:
    """Persist parsed currency exchange rates (CAD-relative)."""
    payload = [
        (
            entry["source"],
            entry["source_url"],
            entry.get("table_index"),
            entry.get("table_title"),
            entry.get("currency"),
            entry.get("rate_to_cad"),
            entry.get("effective_date"),
            _dump_json(entry["raw"]),
        )
        for entry in entries
    ]
    _insert_many(
        connection,
        "exchange_rates",
        (
            "source",
            "source_url",
            "table_index",
            "table_title",
            "currency",
            "rate_to_cad",
            "effective_date",
            "raw_json",
        ),
        payload,
    )


def insert_accommodations(
    connection: sqlite3.Connection,
    entries: Iterable[dict],
) -> None:
    """Persist parsed lodging listings from the accommodations source."""
    payload = [
        (
            entry["source"],
            entry["source_url"],
            entry.get("table_index"),
            entry.get("table_title"),
            entry.get("property_name"),
            entry.get("address"),
            entry.get("city"),
            entry.get("province"),
            entry.get("phone"),
            entry.get("rate_amount"),
            entry.get("currency"),
            entry.get("effective_date"),
            _dump_json(entry["raw"]),
        )
        for entry in entries
    ]
    _insert_many(
        connection,
        "accommodations",
        (
            "source",
            "source_url",
            "table_index",
            "table_title",
            "property_name",
            "address",
            "city",
            "province",
            "phone",
            "rate_amount",
            "currency",
            "effective_date",
            "raw_json",
        ),
        payload,
    )
"""CLI entry point: scrape the configured sources and persist them to SQLite."""

from __future__ import annotations

import argparse
import logging
from pathlib import Path

from gov_travel import db
from gov_travel.scrapers import (
    SOURCES,
    build_session,
    extract_accommodations,
    extract_exchange_rates,
    extract_rate_entries,
    scrape_tables_from_source,
)


def parse_args() -> argparse.Namespace:
    """Parse command-line options for a scraper run."""
    parser = argparse.ArgumentParser(description="Scrape travel rates into SQLite")
    parser.add_argument(
        "--db",
        type=Path,
        default=Path("data/travel_rates.sqlite3"),
        help="Path to the SQLite database",
    )
    parser.add_argument(
        "--sources",
        nargs="*",
        choices=[source.name for source in SOURCES],
        default=[source.name for source in SOURCES],
        help="Limit scraping to specific sources",
    )
    parser.add_argument(
        "--pause",
        type=float,
        default=0.0,
        help="Pause (seconds) between table processing",
    )
    parser.add_argument(
        "--log-level",
        default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        help="Logging level",
    )
    return parser.parse_args()


def main() -> None:
    """Scrape each selected source, storing raw tables plus parsed entries."""
    args = parse_args()
    logging.basicConfig(level=getattr(logging, args.log_level))
    logger = logging.getLogger(__name__)
    connection = db.connect(args.db)
    try:
        db.init_db(connection)
        session = build_session()
        selected = set(args.sources)
        for source in SOURCES:
            if source.name not in selected:
                continue
            logger.info("Scraping %s (%s)", source.name, source.url)
            tables = scrape_tables_from_source(
                source, session=session, pause_seconds=args.pause
            )
            logger.info("Found %s tables for %s", len(tables), source.name)
            db.insert_raw_tables(connection, source.name, source.url, tables)

            db.insert_rate_entries(connection, extract_rate_entries(source, tables))
            db.insert_exchange_rates(connection, extract_exchange_rates(source, tables))

            # Accommodation-specific parsing only applies to the PWGSC listing page.
            if source.name == "accommodations":
                db.insert_accommodations(
                    connection, extract_accommodations(source, tables)
                )
    finally:
        # Bug fix: the original leaked the connection whenever a scrape raised;
        # close it on every exit path so the database file is released cleanly.
        connection.close()


if __name__ == "__main__":
    main()
"""Scrapers for NJC travel-rate pages and the PWGSC accommodation directory."""

from __future__ import annotations

import json
import logging
import os
import re
import time
from dataclasses import dataclass
from io import StringIO
from typing import Any, Iterable

import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

LOGGER = logging.getLogger(__name__)
# Overridable so deployments can identify themselves to the scraped sites.
USER_AGENT = os.getenv("GOV_TRAVEL_USER_AGENT", "GovTravelScraper/1.0 (+https://example.com)")
DEFAULT_TIMEOUT = 60

# Matches an optionally negative amount with comma thousands separators and an
# optional decimal part, e.g. "1,234.56", "-45.5", "120".
_AMOUNT_RE = re.compile(r"-?\d+(?:,\d{3})*(?:\.\d+)?")


@dataclass(frozen=True)
class SourceConfig:
    """A named scrape target (stable name + URL)."""

    name: str
    url: str


SOURCES = [
    SourceConfig(name="international", url="https://www.njc-cnm.gc.ca/directive/app_d.php?lang=en"),
    SourceConfig(name="domestic", url="https://www.njc-cnm.gc.ca/directive/d10/v325/s978/en"),
    SourceConfig(name="accommodations", url="https://rehelv-acrd.tpsgc-pwgsc.gc.ca/lth-crl-eng.aspx"),
]


def build_session() -> requests.Session:
    """Build a requests session that retries transient GET failures."""
    session = requests.Session()
    retry = Retry(
        total=3,
        backoff_factor=1,
        status_forcelist=(429, 500, 502, 503, 504),
        allowed_methods=("GET",),
        raise_on_status=False,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session


def fetch_html(url: str, session: requests.Session | None = None) -> str:
    """Fetch *url* and return decoded HTML; raises for non-2xx responses."""
    active_session = session or build_session()
    response = active_session.get(url, headers={"User-Agent": USER_AGENT}, timeout=DEFAULT_TIMEOUT)
    response.raise_for_status()
    # Trust the sniffed encoding; these pages do not always declare one.
    response.encoding = response.apparent_encoding
    return response.text


def extract_tables(html: str) -> list[pd.DataFrame]:
    """Parse every <table> in *html* into a DataFrame; [] when none found.

    Bug fix: pandas 2.x deprecates passing literal HTML to read_html, so the
    markup is wrapped in a StringIO file-like object.
    """
    try:
        return pd.read_html(StringIO(html))
    except ValueError:
        # read_html raises ValueError when the document contains no tables.
        return []


def _normalize_header(header: str) -> str:
    """Lowercase a column header and collapse internal whitespace."""
    return re.sub(r"\s+", " ", header.strip().lower())


def _parse_amount(value: Any) -> float | None:
    """Extract the first numeric amount in *value* as a float.

    Bug fix: the previous pattern truncated "1,234.56" to 1234.0 (the match
    stopped at the thousands group, dropping the cents). The pattern now
    consumes comma-grouped thousands plus an optional decimal part before
    stripping the separators.
    """
    if value is None:
        return None
    match = _AMOUNT_RE.search(str(value))
    if not match:
        return None
    try:
        return float(match.group(0).replace(",", ""))
    except ValueError:
        return None


def _detect_currency(value: Any, fallback: str | None = None) -> str | None:
    """Best-effort ISO-4217 code from free text; *fallback* when undetected."""
    if value is None:
        return fallback
    text = str(value).upper()
    if "CAD" in text:
        return "CAD"
    if "USD" in text:
        return "USD"
    match = re.search(r"\b[A-Z]{3}\b", text)
    if match:
        return match.group(0)
    return fallback


def _table_title_map(html: str) -> dict[int, str]:
    """Map table index -> best-guess title.

    Prefers the table's own <caption>. Bug fix: the original asked
    find_previous() for "caption", but a table's caption is *inside* the
    table and is never reachable backwards from the <table> tag — that
    lookup could only ever hit a previous table's caption. Fall back to
    the nearest preceding heading.
    """
    soup = BeautifulSoup(html, "html.parser")
    titles: dict[int, str] = {}
    for index, table in enumerate(soup.find_all("table")):
        caption = table.find("caption")
        if caption:
            titles[index] = caption.get_text(strip=True)
            continue
        heading = table.find_previous(["h1", "h2", "h3", "h4"])
        if heading:
            titles[index] = heading.get_text(strip=True)
    return titles


def scrape_tables_from_source(
    source: SourceConfig,
    session: requests.Session | None = None,
    pause_seconds: float = 0.0,
) -> list[dict[str, Any]]:
    """Fetch *source* and return each table as {table_index, title, data}.

    ``data`` is a list of row dicts (one per table row); round-tripping
    through DataFrame.to_json normalizes values to JSON-safe types.
    """
    LOGGER.debug("Fetching HTML for %s", source.url)
    html = fetch_html(source.url, session=session)
    tables = extract_tables(html)
    title_map = _table_title_map(html)
    results = []
    for index, table in enumerate(tables):
        data = json.loads(table.to_json(orient="records"))
        results.append(
            {
                "table_index": index,
                "title": title_map.get(index),
                "data": data,
            }
        )
        if pause_seconds:
            time.sleep(pause_seconds)
    return results


# Columns treated as row identity rather than rate values.
_IDENTITY_KEYS = {
    "country",
    "country/territory",
    "city",
    "location",
    "province",
    "province/territory",
    "currency",
    "effective",
    "effective date",
}


def extract_rate_entries(
    source: SourceConfig,
    tables: Iterable[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Heuristically turn raw table rows into one entry per numeric rate cell.

    Every remaining column that parses as a number becomes a rate entry whose
    ``rate_type`` is the column header; the full row is kept under ``raw``.
    """
    entries: list[dict[str, Any]] = []
    for table in tables:
        for row in table["data"]:
            normalized = {_normalize_header(k): v for k, v in row.items()}
            country = normalized.get("country") or normalized.get("country/territory")
            city = normalized.get("city") or normalized.get("location")
            province = normalized.get("province") or normalized.get("province/territory")
            currency = _detect_currency(normalized.get("currency"))
            effective_date = normalized.get("effective date") or normalized.get("effective")
            for key, value in normalized.items():
                if key in _IDENTITY_KEYS:
                    continue
                amount = _parse_amount(value)
                if amount is None:
                    continue
                # A cell like "125.00 USD" overrides the row-level currency.
                entry_currency = _detect_currency(value, fallback=currency)
                entries.append(
                    {
                        "source": source.name,
                        "source_url": source.url,
                        "table_index": table["table_index"],
                        "table_title": table.get("title"),
                        "country": country,
                        "city": city,
                        "province": province,
                        "currency": entry_currency,
                        "rate_type": key,
                        "rate_amount": amount,
                        "unit": None,
                        "effective_date": effective_date,
                        "raw": row,
                    }
                )
    return entries


def extract_exchange_rates(
    source: SourceConfig,
    tables: Iterable[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Parse rows that look like currency/exchange-rate pairs."""
    entries: list[dict[str, Any]] = []
    for table in tables:
        for row in table["data"]:
            normalized = {_normalize_header(k): v for k, v in row.items()}
            currency = (
                normalized.get("currency")
                or normalized.get("currency code")
                or normalized.get("code")
            )
            rate = (
                normalized.get("exchange rate")
                or normalized.get("rate")
                or normalized.get("cad rate")
                or normalized.get("rate to cad")
            )
            rate_amount = _parse_amount(rate)
            if not currency or rate_amount is None:
                continue
            entries.append(
                {
                    "source": source.name,
                    "source_url": source.url,
                    "table_index": table["table_index"],
                    "table_title": table.get("title"),
                    "currency": _detect_currency(currency),
                    "rate_to_cad": rate_amount,
                    "effective_date": normalized.get("effective date") or normalized.get("date"),
                    "raw": row,
                }
            )
    return entries


def extract_accommodations(
    source: SourceConfig,
    tables: Iterable[dict[str, Any]],
) -> list[dict[str, Any]]:
    """Parse lodging rows; rows with neither a property name nor a city are skipped."""
    entries: list[dict[str, Any]] = []
    for table in tables:
        for row in table["data"]:
            normalized = {_normalize_header(k): v for k, v in row.items()}
            property_name = (
                normalized.get("property")
                or normalized.get("hotel")
                or normalized.get("accommodation")
                or normalized.get("name")
            )
            if not property_name and not normalized.get("city"):
                continue
            rate_amount = _parse_amount(
                normalized.get("rate")
                or normalized.get("room rate")
                or normalized.get("daily rate")
            )
            currency = _detect_currency(normalized.get("rate"))
            entries.append(
                {
                    "source": source.name,
                    "source_url": source.url,
                    "table_index": table["table_index"],
                    "table_title": table.get("title"),
                    "property_name": property_name,
                    "address": normalized.get("address"),
                    "city": normalized.get("city") or normalized.get("location"),
                    "province": normalized.get("province") or normalized.get("province/territory"),
                    "phone": normalized.get("phone") or normalized.get("telephone"),
                    "rate_amount": rate_amount,
                    "currency": currency,
                    "effective_date": normalized.get("effective date") or normalized.get("effective"),
                    "raw": row,
                }
            )
    return entries