mirror of
https://github.com/mblanke/Gov_Travel_App.git
synced 2026-03-01 14:10:22 -05:00
Add Python web scraper for NJC travel rates with currency extraction
- Implemented Python scraper using BeautifulSoup and pandas to automatically collect travel rates from official NJC website - Added currency extraction from table titles (supports EUR, USD, AUD, CAD, ARS, etc.) - Added country extraction from table titles for international rates - Flatten pandas MultiIndex columns for cleaner data structure - Default to CAD for domestic Canadian sources (accommodations and domestic tables) - Created SQLite database schema (raw_tables, rate_entries, exchange_rates, accommodations) - Successfully scraped 92 tables with 17,205 rate entries covering 25 international cities - Added migration script to convert scraped data to Node.js database format - Updated .gitignore for Python files (.venv/, __pycache__, *.pyc, *.sqlite3) - Fixed city validation and currency conversion in main app - Added comprehensive debug and verification scripts This replaces manual JSON maintenance with automated data collection from official government source.
This commit is contained in:
224
documents/db.py
Normal file
224
documents/db.py
Normal file
@@ -0,0 +1,224 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
|
||||
SCHEMA_STATEMENTS = [
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS raw_tables (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
source TEXT NOT NULL,
|
||||
source_url TEXT NOT NULL,
|
||||
table_index INTEGER NOT NULL,
|
||||
title TEXT,
|
||||
data_json TEXT NOT NULL
|
||||
)
|
||||
""",
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS rate_entries (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
source TEXT NOT NULL,
|
||||
source_url TEXT NOT NULL,
|
||||
country TEXT,
|
||||
city TEXT,
|
||||
province TEXT,
|
||||
currency TEXT,
|
||||
rate_type TEXT,
|
||||
rate_amount REAL,
|
||||
unit TEXT,
|
||||
effective_date TEXT,
|
||||
raw_json TEXT NOT NULL
|
||||
)
|
||||
""",
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS exchange_rates (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
source TEXT NOT NULL,
|
||||
source_url TEXT NOT NULL,
|
||||
currency TEXT,
|
||||
rate_to_cad REAL,
|
||||
effective_date TEXT,
|
||||
raw_json TEXT NOT NULL
|
||||
)
|
||||
""",
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS accommodations (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
source TEXT NOT NULL,
|
||||
source_url TEXT NOT NULL,
|
||||
property_name TEXT,
|
||||
address TEXT,
|
||||
city TEXT,
|
||||
province TEXT,
|
||||
phone TEXT,
|
||||
rate_amount REAL,
|
||||
currency TEXT,
|
||||
effective_date TEXT,
|
||||
raw_json TEXT NOT NULL
|
||||
)
|
||||
""",
|
||||
]
|
||||
|
||||
|
||||
def connect(db_path: Path) -> sqlite3.Connection:
|
||||
db_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
connection = sqlite3.connect(db_path)
|
||||
connection.row_factory = sqlite3.Row
|
||||
return connection
|
||||
|
||||
|
||||
def init_db(connection: sqlite3.Connection) -> None:
|
||||
for statement in SCHEMA_STATEMENTS:
|
||||
connection.execute(statement)
|
||||
connection.commit()
|
||||
|
||||
|
||||
def insert_raw_tables(
|
||||
connection: sqlite3.Connection,
|
||||
source: str,
|
||||
source_url: str,
|
||||
tables: Iterable[dict],
|
||||
) -> None:
|
||||
payload = [
|
||||
(
|
||||
source,
|
||||
source_url,
|
||||
table["table_index"],
|
||||
table.get("title"),
|
||||
json.dumps(table["data"], ensure_ascii=False),
|
||||
)
|
||||
for table in tables
|
||||
]
|
||||
connection.executemany(
|
||||
"""
|
||||
INSERT INTO raw_tables (source, source_url, table_index, title, data_json)
|
||||
VALUES (?, ?, ?, ?, ?)
|
||||
""",
|
||||
payload,
|
||||
)
|
||||
connection.commit()
|
||||
|
||||
|
||||
def insert_rate_entries(
|
||||
connection: sqlite3.Connection,
|
||||
entries: Iterable[dict],
|
||||
) -> None:
|
||||
payload = [
|
||||
(
|
||||
entry["source"],
|
||||
entry["source_url"],
|
||||
entry.get("country"),
|
||||
entry.get("city"),
|
||||
entry.get("province"),
|
||||
entry.get("currency"),
|
||||
entry.get("rate_type"),
|
||||
entry.get("rate_amount"),
|
||||
entry.get("unit"),
|
||||
entry.get("effective_date"),
|
||||
json.dumps(entry["raw"], ensure_ascii=False),
|
||||
)
|
||||
for entry in entries
|
||||
]
|
||||
if not payload:
|
||||
return
|
||||
connection.executemany(
|
||||
"""
|
||||
INSERT INTO rate_entries (
|
||||
source,
|
||||
source_url,
|
||||
country,
|
||||
city,
|
||||
province,
|
||||
currency,
|
||||
rate_type,
|
||||
rate_amount,
|
||||
unit,
|
||||
effective_date,
|
||||
raw_json
|
||||
)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
""",
|
||||
payload,
|
||||
)
|
||||
connection.commit()
|
||||
|
||||
|
||||
def insert_exchange_rates(
|
||||
connection: sqlite3.Connection,
|
||||
entries: Iterable[dict],
|
||||
) -> None:
|
||||
payload = [
|
||||
(
|
||||
entry["source"],
|
||||
entry["source_url"],
|
||||
entry.get("currency"),
|
||||
entry.get("rate_to_cad"),
|
||||
entry.get("effective_date"),
|
||||
json.dumps(entry["raw"], ensure_ascii=False),
|
||||
)
|
||||
for entry in entries
|
||||
]
|
||||
if not payload:
|
||||
return
|
||||
connection.executemany(
|
||||
"""
|
||||
INSERT INTO exchange_rates (
|
||||
source,
|
||||
source_url,
|
||||
currency,
|
||||
rate_to_cad,
|
||||
effective_date,
|
||||
raw_json
|
||||
)
|
||||
VALUES (?, ?, ?, ?, ?, ?)
|
||||
""",
|
||||
payload,
|
||||
)
|
||||
connection.commit()
|
||||
|
||||
|
||||
def insert_accommodations(
|
||||
connection: sqlite3.Connection,
|
||||
entries: Iterable[dict],
|
||||
) -> None:
|
||||
payload = [
|
||||
(
|
||||
entry["source"],
|
||||
entry["source_url"],
|
||||
entry.get("property_name"),
|
||||
entry.get("address"),
|
||||
entry.get("city"),
|
||||
entry.get("province"),
|
||||
entry.get("phone"),
|
||||
entry.get("rate_amount"),
|
||||
entry.get("currency"),
|
||||
entry.get("effective_date"),
|
||||
json.dumps(entry["raw"], ensure_ascii=False),
|
||||
)
|
||||
for entry in entries
|
||||
]
|
||||
if not payload:
|
||||
return
|
||||
connection.executemany(
|
||||
"""
|
||||
INSERT INTO accommodations (
|
||||
source,
|
||||
source_url,
|
||||
property_name,
|
||||
address,
|
||||
city,
|
||||
province,
|
||||
phone,
|
||||
rate_amount,
|
||||
currency,
|
||||
effective_date,
|
||||
raw_json
|
||||
)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
""",
|
||||
payload,
|
||||
)
|
||||
connection.commit()
|
||||
Reference in New Issue
Block a user