Add Excel estimate export workflow to CLI

This commit is contained in:
2026-02-11 13:55:07 -05:00
parent c24d5acc8b
commit 0798d90043
5 changed files with 215 additions and 16 deletions

View File

@@ -19,8 +19,31 @@ python -m gov_travel.main --db data/travel_rates.sqlite3
- `--sources international domestic accommodations` to limit which sources are scraped.
- `--pause 1.5` to pause between processing tables.
- `--log-level DEBUG` to increase logging verbosity.
- `--no-scrape` to skip scraping and only work with existing database data.
- `GOV_TRAVEL_USER_AGENT="YourOrg/1.0"` to override the default user agent.
## Export an estimate to Excel
After data exists in SQLite (from a previous scrape), export a cost estimate workbook:
```bash
python -m gov_travel.main \
--db data/travel_rates.sqlite3 \
--no-scrape \
--export-estimate-xlsx output/travel_estimate.xlsx \
--estimate-days 5 \
--estimate-rate-type meal \
--estimate-country Canada \
--estimate-city Ottawa \
--estimate-lodging-per-night 235 \
--estimate-transport-total 175 \
--estimate-misc-total 80
```
Workbook sheets:
- `estimate_summary`: Days, recommended meal allowance, line item subtotals, and grand total.
- `matched_rate_entries`: Source rows used to derive the allowance recommendation.
## Database contents
The database includes:
- `raw_tables` for every scraped HTML table.
- `rate_entries` for parsed rate rows (country/city/province + rate fields).
@@ -28,3 +51,9 @@ The database includes:
- `accommodations` for parsed lodging listings.
If a field is not detected by the heuristics, the full row is still preserved in `raw_tables` and the `raw_json` columns for deeper post-processing.
## Suggested next improvements
- Add automated tests for parser heuristics and the estimate export path.
- Add currency conversion in estimate exports using `exchange_rates` so totals can be normalized to CAD.
- Add source-level freshness metadata to avoid duplicate inserts when scraping repeatedly.
- Expose estimate/export in a small web UI for non-technical users.

View File

@@ -13,6 +13,7 @@ dependencies = [
"lxml==5.3.0", "lxml==5.3.0",
"pandas==2.2.3", "pandas==2.2.3",
"requests==2.32.3", "requests==2.32.3",
"openpyxl==3.1.5",
] ]
[tool.setuptools] [tool.setuptools]

View File

@@ -2,3 +2,4 @@ beautifulsoup4==4.12.3
lxml==5.3.0 lxml==5.3.0
pandas==2.2.3 pandas==2.2.3
requests==2.32.3 requests==2.32.3
openpyxl==3.1.5

101
src/gov_travel/estimate.py Normal file
View File

@@ -0,0 +1,101 @@
from __future__ import annotations
import sqlite3
from dataclasses import dataclass
from pathlib import Path
from typing import Any
import pandas as pd
@dataclass(frozen=True)
class EstimateConfig:
    """Immutable inputs for a travel cost estimate.

    The monetary fields are caller-supplied amounts; the ``rate_type``,
    ``country``, ``city`` and ``province`` fields are filters used to look up
    allowance rates in the database.

    Raises:
        ValueError: If ``days`` is negative.
    """

    # Number of travel days; multiplies both the daily allowance and the
    # per-night lodging cost.
    days: int
    # Manually supplied lodging cost for a single night.
    lodging_per_night: float
    # Manually supplied transport cost for the whole trip.
    transport_total: float
    # Manually supplied miscellaneous cost for the whole trip.
    misc_total: float
    # Substring matched (case-insensitively) against the stored rate type.
    rate_type: str
    # Optional exact (case-insensitive) location filters; ``None`` disables
    # the corresponding filter.
    country: str | None = None
    city: str | None = None
    province: str | None = None

    def __post_init__(self) -> None:
        # A negative day count would silently yield negative subtotals and a
        # misleading grand total, so reject it up front.
        if self.days < 0:
            raise ValueError(f"days must be non-negative, got {self.days}")
def _load_rate_matches(connection: sqlite3.Connection, config: EstimateConfig) -> list[dict[str, Any]]:
    """Return the ``rate_entries`` rows that match the estimate filters.

    Rows with a NULL ``rate_amount`` are excluded. ``config.rate_type`` is
    matched as a case-insensitive substring of the stored rate type, while
    ``country``, ``city`` and ``province`` are matched case-insensitively for
    equality and only when they are set.

    Args:
        connection: Open SQLite connection containing a ``rate_entries`` table.
        config: Estimate settings providing the filter values.

    Returns:
        One dict per matching row keyed by column name, ordered by
        ``effective_date`` descending and then ``rate_amount`` descending.
    """
    query = """
        SELECT
            source,
            source_url,
            table_title,
            country,
            city,
            province,
            currency,
            rate_type,
            rate_amount,
            effective_date
        FROM rate_entries
        WHERE rate_amount IS NOT NULL
          AND LOWER(rate_type) LIKE LOWER(?)
    """
    params: list[Any] = [f"%{config.rate_type}%"]

    # Column names come from this fixed tuple, never from user input, so
    # interpolating them into the SQL text is safe; values stay parameterized.
    optional_filters = (
        ("country", config.country),
        ("city", config.city),
        ("province", config.province),
    )
    for column, wanted in optional_filters:
        if wanted:
            query += f" AND LOWER({column}) = LOWER(?)"
            params.append(wanted)

    query += " ORDER BY effective_date DESC, rate_amount DESC"

    cursor = connection.execute(query, params)
    column_names = [description[0] for description in cursor.description]
    # Mapping-like rows (e.g. sqlite3.Row) convert directly; plain tuples from
    # the default row factory are paired with the cursor's column names so the
    # function does not depend on how the connection was configured.
    return [
        dict(row) if hasattr(row, "keys") else dict(zip(column_names, row))
        for row in cursor.fetchall()
    ]
def _pick_recommended_rate(matches: list[dict[str, Any]]) -> tuple[float, str]:
    """Derive a single recommended rate from matched rate rows.

    The first row is treated as the reference row: its ``effective_date`` and
    currency (defaulting to ``"CAD"`` when missing) select which rows take
    part in the average. Callers are expected to pass rows ordered newest
    first.

    Args:
        matches: Rate rows as dicts with ``currency``, ``effective_date`` and
            ``rate_amount`` keys.

    Returns:
        ``(average_rate, currency)`` with the average rounded to two decimals.
        Returns ``(0.0, "CAD")`` when ``matches`` is empty and
        ``(0.0, currency)`` when no selected row has a usable amount.
    """
    if not matches:
        return 0.0, "CAD"

    reference = matches[0]
    currency = reference.get("currency") or "CAD"
    latest_date = reference.get("effective_date")

    # Apply the same "CAD" default to every row's currency so rows with a
    # missing currency still match the reference row, and skip rows without an
    # amount so they do not dilute the average.
    amounts = [
        float(row["rate_amount"])
        for row in matches
        if row.get("effective_date") == latest_date
        and (row.get("currency") or "CAD") == currency
        and row.get("rate_amount") is not None
    ]
    if not amounts:
        return 0.0, currency

    return round(sum(amounts) / len(amounts), 2), currency
def export_estimate_xlsx(
    connection: sqlite3.Connection,
    output_path: Path,
    config: EstimateConfig,
) -> Path:
    """Write a two-sheet travel estimate workbook and return its path.

    The ``estimate_summary`` sheet lists the day count, the recommended
    allowance, each subtotal and the grand total. The ``matched_rate_entries``
    sheet lists the database rows the recommendation was derived from. Missing
    parent directories of ``output_path`` are created.

    Args:
        connection: Open SQLite connection used to look up matching rates.
        output_path: Destination of the ``.xlsx`` file.
        config: Estimate inputs (days, manual costs and rate filters).

    Returns:
        ``output_path``, unchanged.
    """
    rate_rows = _load_rate_matches(connection, config)
    daily_rate, currency = _pick_recommended_rate(rate_rows)

    # Both the allowance and the lodging cost are multiplied by the day count.
    meals_subtotal = round(daily_rate * config.days, 2)
    lodging_subtotal = round(config.lodging_per_night * config.days, 2)
    overall_total = round(
        meals_subtotal + lodging_subtotal + config.transport_total + config.misc_total,
        2,
    )

    # (label, amount, currency) triples in the order they appear on the sheet.
    line_items = [
        ("Days", config.days, ""),
        (f"Meal allowance ({config.rate_type})", daily_rate, currency),
        ("Meals subtotal", meals_subtotal, currency),
        ("Lodging per night", config.lodging_per_night, currency),
        ("Lodging subtotal", lodging_subtotal, currency),
        ("Transport total", config.transport_total, currency),
        ("Misc total", config.misc_total, currency),
        ("Grand total", overall_total, currency),
    ]
    summary_records = [
        {"item": label, "value": amount, "currency": unit}
        for label, amount, unit in line_items
    ]

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with pd.ExcelWriter(output_path, engine="openpyxl") as workbook:
        summary_frame = pd.DataFrame(summary_records)
        summary_frame.to_excel(workbook, index=False, sheet_name="estimate_summary")
        matches_frame = pd.DataFrame(rate_rows)
        matches_frame.to_excel(workbook, index=False, sheet_name="matched_rate_entries")
    return output_path

View File

@@ -5,6 +5,7 @@ import logging
from pathlib import Path from pathlib import Path
from gov_travel import db from gov_travel import db
from gov_travel.estimate import EstimateConfig, export_estimate_xlsx
from gov_travel.scrapers import ( from gov_travel.scrapers import (
SOURCES, SOURCES,
build_session, build_session,
@@ -42,6 +43,57 @@ def parse_args() -> argparse.Namespace:
choices=["DEBUG", "INFO", "WARNING", "ERROR"], choices=["DEBUG", "INFO", "WARNING", "ERROR"],
help="Logging level", help="Logging level",
) )
parser.add_argument(
"--no-scrape",
action="store_true",
help="Skip scraping and only use data already in the database",
)
parser.add_argument(
"--export-estimate-xlsx",
type=Path,
help="Optional output path to export a travel estimate workbook",
)
parser.add_argument(
"--estimate-days",
type=int,
default=1,
help="Number of travel days for the estimate",
)
parser.add_argument(
"--estimate-rate-type",
default="meal",
help="Rate type filter used to find allowance rates (e.g., meal, breakfast, dinner)",
)
parser.add_argument(
"--estimate-country",
help="Optional country filter for estimate rate lookup",
)
parser.add_argument(
"--estimate-city",
help="Optional city filter for estimate rate lookup",
)
parser.add_argument(
"--estimate-province",
help="Optional province filter for estimate rate lookup",
)
parser.add_argument(
"--estimate-lodging-per-night",
type=float,
default=0.0,
help="Manual lodging cost per night",
)
parser.add_argument(
"--estimate-transport-total",
type=float,
default=0.0,
help="Manual transport total",
)
parser.add_argument(
"--estimate-misc-total",
type=float,
default=0.0,
help="Manual misc total",
)
return parser.parse_args() return parser.parse_args()
@@ -52,6 +104,7 @@ def main() -> None:
connection = db.connect(args.db) connection = db.connect(args.db)
db.init_db(connection) db.init_db(connection)
if not args.no_scrape:
session = build_session() session = build_session()
selected = {name for name in args.sources} selected = {name for name in args.sources}
for source in SOURCES: for source in SOURCES:
@@ -72,6 +125,20 @@ def main() -> None:
accommodations = extract_accommodations(source, tables) accommodations = extract_accommodations(source, tables)
db.insert_accommodations(connection, accommodations) db.insert_accommodations(connection, accommodations)
if args.export_estimate_xlsx:
estimate_config = EstimateConfig(
days=args.estimate_days,
lodging_per_night=args.estimate_lodging_per_night,
transport_total=args.estimate_transport_total,
misc_total=args.estimate_misc_total,
rate_type=args.estimate_rate_type,
country=args.estimate_country,
city=args.estimate_city,
province=args.estimate_province,
)
output_path = export_estimate_xlsx(connection, args.export_estimate_xlsx, estimate_config)
logger.info("Estimate workbook exported to %s", output_path)
connection.close() connection.close()