Add Excel estimate export workflow to CLI

2026-03-01 14:10:22 -05:00 · 2026-02-11 13:55:07 -05:00
parent c24d5acc8b
commit 0798d90043
5 changed files with 215 additions and 16 deletions
--- a/README.md
+++ b/README.md
@@ -19,8 +19,31 @@ python -m gov_travel.main --db data/travel_rates.sqlite3
 - `--sources international domestic accommodations` to limit which sources are scraped.
 - `--pause 1.5` to pause between processing tables.
 - `--log-level DEBUG` to increase logging verbosity.
 - `--no-scrape` to skip scraping and only work with existing database data.
 - `GOV_TRAVEL_USER_AGENT="YourOrg/1.0"` to override the default user agent.
 ## Export an estimate to Excel
 After data exists in SQLite (from a previous scrape), export a cost estimate workbook:
 ```bash
 python -m gov_travel.main \
  --db data/travel_rates.sqlite3 \
  --no-scrape \
  --export-estimate-xlsx output/travel_estimate.xlsx \
  --estimate-days 5 \
  --estimate-rate-type meal \
  --estimate-country Canada \
  --estimate-city Ottawa \
  --estimate-lodging-per-night 235 \
  --estimate-transport-total 175 \
  --estimate-misc-total 80
 ```
 Workbook sheets:
 - `estimate_summary`: Days, recommended meal allowance, line item subtotals, and grand total.
 - `matched_rate_entries`: Every rate row that matched the filters, newest first; the allowance recommendation is derived from the most recent of these rows.
 ## Database contents
 The database includes:
 - `raw_tables` for every scraped HTML table.
 - `rate_entries` for parsed rate rows (country/city/province + rate fields).
@@ -28,3 +51,9 @@ The database includes:
 - `accommodations` for parsed lodging listings.
 If a field is not detected by the heuristics, the full row is still preserved in `raw_tables` and the `raw_json` columns for deeper post-processing.
 ## Suggested next improvements
 - Add automated tests for parser heuristics and the estimate export path.
 - Add currency conversion in estimate exports using `exchange_rates` so totals can be normalized to CAD.
 - Add source-level freshness metadata to avoid duplicate inserts when scraping repeatedly.
 - Expose estimate/export in a small web UI for non-technical users.
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,6 +13,7 @@ dependencies = [
  "lxml==5.3.0",
  "pandas==2.2.3",
  "requests==2.32.3",
  "openpyxl==3.1.5",
 ]
 [tool.setuptools]
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,4 @@ beautifulsoup4==4.12.3
 lxml==5.3.0
 pandas==2.2.3
 requests==2.32.3
 openpyxl==3.1.5
--- a/src/gov_travel/estimate.py
+++ b/src/gov_travel/estimate.py
@@ -0,0 +1,101 @@
 from __future__ import annotations
 import sqlite3
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
 import pandas as pd
@dataclass(frozen=True)
class EstimateConfig:
    """Immutable inputs for a travel cost estimate.

    ``days``, the three manual cost figures and ``rate_type`` are required.
    The location fields are optional filters for the rate lookup and are
    ignored when left as ``None`` (or empty).

    Raises:
        ValueError: If ``days`` is negative. The meal and lodging subtotals
            are both multiplied by ``days``, so a negative value would
            silently produce negative amounts in the exported workbook.
    """

    # Number of travel days; multiplies the meal allowance and the lodging rate.
    days: int
    # Manually supplied lodging cost for a single night.
    lodging_per_night: float
    # Manually supplied transport cost for the whole trip.
    transport_total: float
    # Manually supplied miscellaneous cost for the whole trip.
    misc_total: float
    # Substring matched (case-insensitively) against the stored rate type.
    rate_type: str
    # Optional exact-match (case-insensitive) location filters.
    country: str | None = None
    city: str | None = None
    province: str | None = None

    def __post_init__(self) -> None:
        # Zero days is allowed (it yields zero subtotals); negative is not.
        if self.days < 0:
            raise ValueError(f"days must be non-negative, got {self.days}")
 def _load_rate_matches(connection: sqlite3.Connection, config: EstimateConfig) -> list[dict[str, Any]]:
     """Return the ``rate_entries`` rows that match the estimate filters.

     Rows must have a non-NULL ``rate_amount`` and a ``rate_type`` containing
     ``config.rate_type`` (case-insensitive substring match). ``country``,
     ``city`` and ``province`` are applied as case-insensitive equality
     filters only when they are set on ``config``.

     Args:
         connection: Open SQLite connection holding a ``rate_entries`` table.
         config: Estimate settings supplying the filter values.

     Returns:
         One dict per matching row, keyed by column name, ordered by
         ``effective_date`` descending and then ``rate_amount`` descending.
     """
     query = """
         SELECT
             source,
             source_url,
             table_title,
             country,
             city,
             province,
             currency,
             rate_type,
             rate_amount,
             effective_date
         FROM rate_entries
         WHERE rate_amount IS NOT NULL
           AND LOWER(rate_type) LIKE LOWER(?)
     """
     params: list[Any] = [f"%{config.rate_type}%"]

     # Column names below are hard-coded, so interpolating them is safe; the
     # user-supplied values always go through bound parameters.
     optional_filters = (
         ("country", config.country),
         ("city", config.city),
         ("province", config.province),
     )
     for column, value in optional_filters:
         if value:
             query += f" AND LOWER({column}) = LOWER(?)"
             params.append(value)

     query += " ORDER BY effective_date DESC, rate_amount DESC"
     cursor = connection.execute(query, params)

     # Build the dicts from the cursor metadata instead of dict(row), which
     # only works when the connection's row_factory is sqlite3.Row. Zipping
     # column names with the row values works for plain tuples and Row alike.
     column_names = [description[0] for description in cursor.description]
     return [dict(zip(column_names, row)) for row in cursor.fetchall()]
 def _pick_recommended_rate(matches: list[dict[str, Any]]) -> tuple[float, str]:
     """Pick a recommended rate and its currency from matched rate rows.

     The first row is treated as the reference row: its ``effective_date``
     and currency define the group that is averaged. A missing currency is
     read as ``"CAD"`` both for the reference row and for every row compared
     against it, so the reference row always belongs to its own group.

     Args:
         matches: Rate rows as dicts, with the preferred row first.

     Returns:
         ``(rate, currency)`` where ``rate`` is the mean of the usable
         ``rate_amount`` values in the reference group, rounded to two
         decimals. Returns ``(0.0, "CAD")`` for an empty list and
         ``(0.0, currency)`` when the group has no usable amount.
     """
     if not matches:
         return 0.0, "CAD"

     reference = matches[0]
     currency = reference.get("currency") or "CAD"
     latest_date = reference.get("effective_date")

     # Only rows that share the reference date and (normalised) currency are
     # averaged, and rows without an amount are left out of both the sum and
     # the divisor so they cannot drag the mean down.
     rates = [
         float(row["rate_amount"])
         for row in matches
         if row.get("rate_amount") is not None
         and row.get("effective_date") == latest_date
         and (row.get("currency") or "CAD") == currency
     ]
     if not rates:
         return 0.0, currency
     return round(sum(rates) / len(rates), 2), currency
 def export_estimate_xlsx(
     connection: sqlite3.Connection,
     output_path: Path,
     config: EstimateConfig,
 ) -> Path:
     """Write a two-sheet travel estimate workbook and return its path.

     The ``estimate_summary`` sheet lists the day count, the recommended
     allowance, each subtotal and the grand total. The
     ``matched_rate_entries`` sheet holds every rate row returned by the
     lookup. Meals and lodging are both multiplied by ``config.days``;
     transport and misc are taken as trip totals. Missing parent directories
     of ``output_path`` are created.

     Args:
         connection: Open SQLite connection used for the rate lookup.
         output_path: Destination ``.xlsx`` file.
         config: Estimate settings (days, manual costs, rate filters).

     Returns:
         ``output_path``, unchanged.
     """
     rate_rows = _load_rate_matches(connection, config)
     daily_allowance, currency = _pick_recommended_rate(rate_rows)

     num_days = config.days
     meals_subtotal = round(daily_allowance * num_days, 2)
     lodging_subtotal = round(config.lodging_per_night * num_days, 2)
     # Keep the plain left-to-right addition so rounding matches exactly.
     total = round(meals_subtotal + lodging_subtotal + config.transport_total + config.misc_total, 2)

     # (label, amount, currency) triples in the order they appear on the sheet.
     line_items = (
         ("Days", num_days, ""),
         (f"Meal allowance ({config.rate_type})", daily_allowance, currency),
         ("Meals subtotal", meals_subtotal, currency),
         ("Lodging per night", config.lodging_per_night, currency),
         ("Lodging subtotal", lodging_subtotal, currency),
         ("Transport total", config.transport_total, currency),
         ("Misc total", config.misc_total, currency),
         ("Grand total", total, currency),
     )
     summary_records = [
         {"item": label, "value": amount, "currency": unit}
         for label, amount, unit in line_items
     ]

     output_path.parent.mkdir(parents=True, exist_ok=True)
     with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
         summary_frame = pd.DataFrame(summary_records)
         summary_frame.to_excel(writer, index=False, sheet_name="estimate_summary")
         matches_frame = pd.DataFrame(rate_rows)
         matches_frame.to_excel(writer, index=False, sheet_name="matched_rate_entries")
     return output_path
--- a/src/gov_travel/main.py
+++ b/src/gov_travel/main.py
@@ -5,6 +5,7 @@ import logging
 from pathlib import Path
 from gov_travel import db
 from gov_travel.estimate import EstimateConfig, export_estimate_xlsx
 from gov_travel.scrapers import (
    SOURCES,
    build_session,
@@ -42,6 +43,57 @@ def parse_args() -> argparse.Namespace:
        choices=["DEBUG", "INFO", "WARNING", "ERROR"],
        help="Logging level",
    )
    parser.add_argument(
        "--no-scrape",
        action="store_true",
        help="Skip scraping and only use data already in the database",
    )
    parser.add_argument(
        "--export-estimate-xlsx",
        type=Path,
        help="Optional output path to export a travel estimate workbook",
    )
    parser.add_argument(
        "--estimate-days",
        type=int,
        default=1,
        help="Number of travel days for the estimate",
    )
    parser.add_argument(
        "--estimate-rate-type",
        default="meal",
        help="Rate type filter used to find allowance rates (e.g., meal, breakfast, dinner)",
    )
    parser.add_argument(
        "--estimate-country",
        help="Optional country filter for estimate rate lookup",
    )
    parser.add_argument(
        "--estimate-city",
        help="Optional city filter for estimate rate lookup",
    )
    parser.add_argument(
        "--estimate-province",
        help="Optional province filter for estimate rate lookup",
    )
    parser.add_argument(
        "--estimate-lodging-per-night",
        type=float,
        default=0.0,
        help="Manual lodging cost per night",
    )
    parser.add_argument(
        "--estimate-transport-total",
        type=float,
        default=0.0,
        help="Manual transport total",
    )
    parser.add_argument(
        "--estimate-misc-total",
        type=float,
        default=0.0,
        help="Manual misc total",
    )
    return parser.parse_args()
@@ -52,6 +104,7 @@ def main() -> None:
    connection = db.connect(args.db)
    db.init_db(connection)
    if not args.no_scrape:
        session = build_session()
        selected = {name for name in args.sources}
        for source in SOURCES:
@@ -72,6 +125,20 @@ def main() -> None:
                accommodations = extract_accommodations(source, tables)
                db.insert_accommodations(connection, accommodations)
    if args.export_estimate_xlsx:
        estimate_config = EstimateConfig(
            days=args.estimate_days,
            lodging_per_night=args.estimate_lodging_per_night,
            transport_total=args.estimate_transport_total,
            misc_total=args.estimate_misc_total,
            rate_type=args.estimate_rate_type,
            country=args.estimate_country,
            city=args.estimate_city,
            province=args.estimate_province,
        )
        output_path = export_estimate_xlsx(connection, args.export_estimate_xlsx, estimate_config)
        logger.info("Estimate workbook exported to %s", output_path)
    connection.close()