diff --git a/README.md b/README.md index e005923..2c712ed 100644 --- a/README.md +++ b/README.md @@ -19,8 +19,31 @@ python -m gov_travel.main --db data/travel_rates.sqlite3 - `--sources international domestic accommodations` to limit which sources are scraped. - `--pause 1.5` to pause between processing tables. - `--log-level DEBUG` to increase logging verbosity. +- `--no-scrape` to skip scraping and only work with existing database data. - `GOV_TRAVEL_USER_AGENT="YourOrg/1.0"` to override the default user agent. +## Export an estimate to Excel +After data exists in SQLite (from a previous scrape), export a cost estimate workbook: + +```bash +python -m gov_travel.main \ + --db data/travel_rates.sqlite3 \ + --no-scrape \ + --export-estimate-xlsx output/travel_estimate.xlsx \ + --estimate-days 5 \ + --estimate-rate-type meal \ + --estimate-country Canada \ + --estimate-city Ottawa \ + --estimate-lodging-per-night 235 \ + --estimate-transport-total 175 \ + --estimate-misc-total 80 +``` + +Workbook sheets: +- `estimate_summary`: Days, recommended meal allowance, line item subtotals, and grand total. +- `matched_rate_entries`: Source rows used to derive the allowance recommendation. + +## Database contents The database includes: - `raw_tables` for every scraped HTML table. - `rate_entries` for parsed rate rows (country/city/province + rate fields). @@ -28,3 +51,9 @@ The database includes: - `accommodations` for parsed lodging listings. If a field is not detected by the heuristics, the full row is still preserved in `raw_tables` and the `raw_json` columns for deeper post-processing. + +## Suggested next improvements +- Add automated tests for parser heuristics and the estimate export path. +- Add currency conversion in estimate exports using `exchange_rates` so totals can be normalized to CAD. +- Add source-level freshness metadata to avoid duplicate inserts when scraping repeatedly. +- Expose estimate/export in a small web UI for non-technical users. 
# Columns read from ``rate_entries``. The same tuple drives the SELECT list,
# the keys of each match dictionary, and the header row of the
# ``matched_rate_entries`` sheet, so the three can never drift apart.
_RATE_COLUMNS = (
    "source",
    "source_url",
    "table_title",
    "country",
    "city",
    "province",
    "currency",
    "rate_type",
    "rate_amount",
    "effective_date",
)

# Currency assumed for a rate row that does not carry one.
_DEFAULT_CURRENCY = "CAD"


@dataclass(frozen=True)
class EstimateConfig:
    """Inputs for one travel cost estimate."""

    # Multiplier applied to both the recommended allowance and
    # ``lodging_per_night`` when the subtotals are computed.
    days: int
    # Manually supplied amounts; they are not looked up in the database.
    lodging_per_night: float
    transport_total: float
    misc_total: float
    # Matched as a case-insensitive substring of ``rate_entries.rate_type``.
    rate_type: str
    # Optional filters; each one, when set, must equal the corresponding
    # column case-insensitively.
    country: str | None = None
    city: str | None = None
    province: str | None = None


def _load_rate_matches(connection: sqlite3.Connection, config: EstimateConfig) -> list[dict[str, Any]]:
    """Return the ``rate_entries`` rows that match *config*.

    Rows without a ``rate_amount`` are excluded. Results are ordered by
    ``effective_date`` and then ``rate_amount``, both descending, so the first
    row is the most recent one.

    Each row is returned as a plain dict keyed by ``_RATE_COLUMNS``. The
    mapping is built positionally, so it works whether or not the connection
    uses ``sqlite3.Row`` as its row factory.
    """
    # Only fixed identifiers from _RATE_COLUMNS are interpolated into the SQL;
    # every user-supplied value is bound as a parameter.
    query = (
        f"SELECT {', '.join(_RATE_COLUMNS)}"
        " FROM rate_entries"
        " WHERE rate_amount IS NOT NULL"
        " AND LOWER(rate_type) LIKE LOWER(?)"
    )
    params: list[Any] = [f"%{config.rate_type}%"]

    optional_filters = (
        ("country", config.country),
        ("city", config.city),
        ("province", config.province),
    )
    for column, value in optional_filters:
        if value:
            query += f" AND LOWER({column}) = LOWER(?)"
            params.append(value)

    query += " ORDER BY effective_date DESC, rate_amount DESC"
    rows = connection.execute(query, params).fetchall()
    return [dict(zip(_RATE_COLUMNS, row)) for row in rows]


def _pick_recommended_rate(matches: list[dict[str, Any]]) -> tuple[float, str]:
    """Derive a recommended allowance and its currency from *matches*.

    *matches* is expected in the order produced by ``_load_rate_matches``
    (most recent first). The recommendation is the mean ``rate_amount`` of the
    rows that share the first row's ``effective_date`` and currency, rounded
    to two decimals.

    Returns ``(0.0, "CAD")`` when there are no matches, and ``(0.0, currency)``
    when none of the selected rows has an amount.
    """
    if not matches:
        return 0.0, _DEFAULT_CURRENCY

    head = matches[0]
    currency = head.get("currency") or _DEFAULT_CURRENCY
    latest_date = head.get("effective_date")

    # Normalise a missing currency the same way for every row as for the head
    # row; otherwise a head row without a currency would never match itself
    # and rows from unrelated dates/currencies would be averaged together.
    amounts = [
        float(row["rate_amount"])
        for row in matches
        if row.get("effective_date") == latest_date
        and (row.get("currency") or _DEFAULT_CURRENCY) == currency
        and row.get("rate_amount") is not None
    ]
    if not amounts:
        return 0.0, currency

    # Divide by the number of amounts actually summed, not by the number of
    # candidate rows, so rows without an amount do not drag the mean down.
    return round(sum(amounts) / len(amounts), 2), currency


def build_estimate_summary(matches: list[dict[str, Any]], config: EstimateConfig) -> list[dict[str, Any]]:
    """Compute the rows of the ``estimate_summary`` sheet.

    This is pure computation, kept separate from the workbook writing so it
    can be used and tested without any Excel I/O.

    Every monetary row is labelled with the currency of the recommended
    allowance, including the manually supplied lodging, transport and misc
    amounts; no currency conversion is performed.
    """
    recommended_rate, currency = _pick_recommended_rate(matches)

    meals_total = round(recommended_rate * config.days, 2)
    # NOTE(review): lodging is charged for ``days``, not ``days - 1`` nights.
    lodging_total = round(config.lodging_per_night * config.days, 2)
    grand_total = round(meals_total + lodging_total + config.transport_total + config.misc_total, 2)

    return [
        {"item": "Days", "value": config.days, "currency": ""},
        {"item": f"Meal allowance ({config.rate_type})", "value": recommended_rate, "currency": currency},
        {"item": "Meals subtotal", "value": meals_total, "currency": currency},
        {"item": "Lodging per night", "value": config.lodging_per_night, "currency": currency},
        {"item": "Lodging subtotal", "value": lodging_total, "currency": currency},
        {"item": "Transport total", "value": config.transport_total, "currency": currency},
        {"item": "Misc total", "value": config.misc_total, "currency": currency},
        {"item": "Grand total", "value": grand_total, "currency": currency},
    ]


def export_estimate_xlsx(
    connection: sqlite3.Connection,
    output_path: Path,
    config: EstimateConfig,
) -> Path:
    """Write a travel estimate workbook to *output_path* and return the path.

    The workbook has two sheets: ``estimate_summary`` (days, allowance, line
    item subtotals and grand total) and ``matched_rate_entries`` (the database
    rows the allowance was derived from). Missing parent directories of
    *output_path* are created.
    """
    matches = _load_rate_matches(connection, config)
    summary_rows = build_estimate_summary(matches, config)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
        pd.DataFrame(summary_rows).to_excel(writer, index=False, sheet_name="estimate_summary")
        # Passing the columns explicitly keeps the header row even when no
        # rate rows matched, instead of emitting a completely blank sheet.
        pd.DataFrame(matches, columns=list(_RATE_COLUMNS)).to_excel(
            writer,
            index=False,
            sheet_name="matched_rate_entries",
        )

    return output_path
cost per night", + ) + parser.add_argument( + "--estimate-transport-total", + type=float, + default=0.0, + help="Manual transport total", + ) + parser.add_argument( + "--estimate-misc-total", + type=float, + default=0.0, + help="Manual misc total", + ) return parser.parse_args() @@ -52,25 +104,40 @@ def main() -> None: connection = db.connect(args.db) db.init_db(connection) - session = build_session() - selected = {name for name in args.sources} - for source in SOURCES: - if source.name not in selected: - continue - logger.info("Scraping %s (%s)", source.name, source.url) - tables = scrape_tables_from_source(source, session=session, pause_seconds=args.pause) - logger.info("Found %s tables for %s", len(tables), source.name) - db.insert_raw_tables(connection, source.name, source.url, tables) + if not args.no_scrape: + session = build_session() + selected = {name for name in args.sources} + for source in SOURCES: + if source.name not in selected: + continue + logger.info("Scraping %s (%s)", source.name, source.url) + tables = scrape_tables_from_source(source, session=session, pause_seconds=args.pause) + logger.info("Found %s tables for %s", len(tables), source.name) + db.insert_raw_tables(connection, source.name, source.url, tables) - rate_entries = extract_rate_entries(source, tables) - db.insert_rate_entries(connection, rate_entries) + rate_entries = extract_rate_entries(source, tables) + db.insert_rate_entries(connection, rate_entries) - exchange_rates = extract_exchange_rates(source, tables) - db.insert_exchange_rates(connection, exchange_rates) + exchange_rates = extract_exchange_rates(source, tables) + db.insert_exchange_rates(connection, exchange_rates) - if source.name == "accommodations": - accommodations = extract_accommodations(source, tables) - db.insert_accommodations(connection, accommodations) + if source.name == "accommodations": + accommodations = extract_accommodations(source, tables) + db.insert_accommodations(connection, accommodations) + + if 
args.export_estimate_xlsx: + estimate_config = EstimateConfig( + days=args.estimate_days, + lodging_per_night=args.estimate_lodging_per_night, + transport_total=args.estimate_transport_total, + misc_total=args.estimate_misc_total, + rate_type=args.estimate_rate_type, + country=args.estimate_country, + city=args.estimate_city, + province=args.estimate_province, + ) + output_path = export_estimate_xlsx(connection, args.export_estimate_xlsx, estimate_config) + logger.info("Estimate workbook exported to %s", output_path) connection.close()