Integrate OpenFlights API for free, no-auth flight data generation

- Added openFlightsService.js to fetch and cache OpenFlights airport/airline/routes data
- Validates airport codes exist in OpenFlights database (6072+ airports)
- Generates realistic flights using major international airlines
- Creates varied routing options: direct, 1-stop, 2-stop flights
- Updated flightService.js to query OpenFlights first, before calling Amadeus
- OpenFlights also serves as the fallback when Amadeus is unavailable or returns no results
- No API keys or authentication required
- Cached locally to avoid repeated network requests
- Realistic pricing, times, and stop locations

Docker container rebuilt with OpenFlights integration.
This commit is contained in:
2026-01-13 10:32:05 -05:00
parent 969ba062f7
commit 66b72d5f74
15 changed files with 82237 additions and 40 deletions

View File

@@ -1,6 +1,7 @@
from __future__ import annotations
import argparse
import time
from pathlib import Path
from gov_travel import db
@@ -26,24 +27,65 @@ def parse_args() -> argparse.Namespace:
def main() -> None:
    """Scrape every configured travel-rate source into a SQLite database.

    For each entry in ``SOURCES``: fetch its HTML tables, persist the raw
    tables, then extract and persist per-diem rate entries, exchange rates,
    and (for the ``accommodations`` source only) accommodation listings.
    Prints per-source progress and a final summary.
    """
    args = parse_args()
    start_time = time.time()
    print("=" * 80)
    print("🌐 Government Travel Rate Scraper")
    print("=" * 80)
    print(f"📁 Database: {args.db}")
    print()
    connection = db.connect(args.db)
    db.init_db(connection)
    # Running totals for the end-of-run summary.
    total_tables = 0
    total_rate_entries = 0
    total_accommodations = 0
    for idx, source in enumerate(SOURCES, 1):
        source_start = time.time()
        print(f"[{idx}/{len(SOURCES)}] 📥 Scraping: {source.name.upper()}")
        print(f"   🔗 {source.url}")
        tables = scrape_tables_from_source(source)
        db.insert_raw_tables(connection, source.name, source.url, tables)
        total_tables += len(tables)
        print(f"{len(tables)} tables collected")
        rate_entries = extract_rate_entries(source, tables)
        db.insert_rate_entries(connection, rate_entries)
        total_rate_entries += len(rate_entries)
        if rate_entries:
            print(f"{len(rate_entries)} per-diem entries extracted")
        exchange_rates = extract_exchange_rates(source, tables)
        db.insert_exchange_rates(connection, exchange_rates)
        if exchange_rates:
            print(f"{len(exchange_rates)} exchange rates extracted")
        # Accommodation listings only ever come from the dedicated source,
        # so the total is assigned (not accumulated) here.
        if source.name == "accommodations":
            accommodations = extract_accommodations(source, tables)
            db.insert_accommodations(connection, accommodations)
            total_accommodations = len(accommodations)
            print(f"{len(accommodations)} accommodation listings extracted")
        elapsed = time.time() - source_start
        print(f"   ⏱️ Completed in {elapsed:.1f}s")
        print()
    connection.close()
    total_time = time.time() - start_time
    print("=" * 80)
    print("✅ SCRAPING COMPLETE")
    print("=" * 80)
    print("📊 Summary:")
    print(f"   • Total tables: {total_tables:,}")
    print(f"   • Per-diem entries: {total_rate_entries:,}")
    print(f"   • Accommodation listings: {total_accommodations:,}")
    print(f"   • Total time: {total_time:.1f}s")
    print(f"   • Database: {args.db}")
    print("=" * 80)
if __name__ == "__main__":

View File

@@ -4,6 +4,7 @@ import json
import re
import time
from dataclasses import dataclass
from io import StringIO
from typing import Any, Iterable
import pandas as pd
@@ -46,7 +47,8 @@ def fetch_html(url: str, retry=3) -> str:
def extract_tables(html: str) -> list[pd.DataFrame]:
    """Parse every HTML ``<table>`` in *html* into a pandas DataFrame.

    The string is wrapped in :class:`io.StringIO` because passing literal
    HTML directly to ``pd.read_html`` raises a FutureWarning in recent
    pandas versions (string input is reserved for paths/URLs).
    """
    # The stale pre-StringIO return has been removed; it made the
    # wrapped call below unreachable.
    return pd.read_html(StringIO(html))
def _normalize_header(header: str) -> str:
@@ -146,11 +148,11 @@ def scrape_tables_from_source(source: SourceConfig) -> list[dict[str, Any]]:
# For sources with alphabet navigation, fetch all letter pages
if source.uses_alphabet_navigation:
urls = _get_alphabet_urls(source.url)
print(f" Fetching {len(urls)} alphabet pages...")
print(f" 📋 Fetching {len(urls)} alphabet pages...")
else:
urls = [source.url]
for url in urls:
for idx, url in enumerate(urls, 1):
html = fetch_html(url)
try:
tables = extract_tables(html)
@@ -175,8 +177,9 @@ def scrape_tables_from_source(source: SourceConfig) -> list[dict[str, Any]]:
)
table_offset += len(tables)
if len(tables) > 0:
print(f" {url.split('let=')[-1] if 'let=' in url else 'base'}: {len(tables)} tables")
if len(tables) > 0 and source.uses_alphabet_navigation:
letter = url.split('let=')[-1] if 'let=' in url else 'base'
print(f" {letter:>4}: {len(tables)} tables [{idx}/{len(urls)}]")
return results
@@ -185,14 +188,27 @@ def extract_rate_entries(
source: SourceConfig,
tables: Iterable[dict[str, Any]],
) -> list[dict[str, Any]]:
"""Extract per-diem meal and incidental rates (NOT accommodation listings)"""
entries: list[dict[str, Any]] = []
# Only extract per-diem rates from international and domestic sources
if source.name == "accommodations":
return entries
# Define valid per-diem rate columns
valid_rate_types = {
"breakfast", "lunch", "dinner",
"incidental amount", "incidentals",
"private accommodation", "private accom\xadmodation"
}
for table in tables:
# Extract currency and country from table title
table_currency = _extract_currency_from_title(table.get("title"))
table_country = _extract_country_from_title(table.get("title"))
# Default to CAD for domestic Canadian sources
if table_currency is None and source.name in ("domestic", "accommodations"):
if table_currency is None and source.name == "domestic":
table_currency = "CAD"
for row in table["data"]:
@@ -204,16 +220,16 @@ def extract_rate_entries(
currency = _detect_currency(normalized.get("currency"), fallback=table_currency)
effective_date = normalized.get("effective date") or normalized.get("effective")
# Process meal rate columns and other numeric columns
# Only extract per-diem meal and incidental columns
for key, value in normalized.items():
if key in {"country", "country/territory", "city", "location", "province", "province/territory",
"currency", "effective", "effective date", "type of accommodation", "accommodation type",
"meal total", "grand total", "grand total (taxes included)"}:
# Only process valid per-diem rate types
if key not in valid_rate_types:
continue
amount = _parse_amount(value)
if amount is None:
continue
# Use table currency (from title) instead of trying to detect from value
entries.append(
{
"source": source.name,