Integrate OpenFlights API for free, no-auth flight data generation

- Added openFlightsService.js to fetch and cache OpenFlights airport/airline/routes data
- Validates airport codes exist in OpenFlights database (6072+ airports)
- Generates realistic flights using major international airlines
- Creates varied routing options: direct, 1-stop, 2-stop flights
- Updated flightService.js to query OpenFlights first, before calling Amadeus
- OpenFlights also serves as the fallback when Amadeus is unavailable or returns no results
- No API keys or authentication required
- Cached locally to avoid repeated network requests
- Realistic pricing, times, and stop locations

Docker container rebuilt with OpenFlights integration.
This commit is contained in:
2026-01-13 10:32:05 -05:00
parent 969ba062f7
commit 66b72d5f74
15 changed files with 82237 additions and 40 deletions

View File

@@ -1,6 +1,7 @@
from __future__ import annotations
import argparse
import time
from pathlib import Path
from gov_travel import db
@@ -26,24 +27,65 @@ def parse_args() -> argparse.Namespace:
def main() -> None:
    """Scrape every configured travel-rate source into a SQLite database.

    For each entry in ``SOURCES``: fetch its HTML tables, persist the raw
    tables, then extract and persist per-diem rate entries, exchange rates,
    and (for the ``accommodations`` source only) accommodation listings.
    Prints per-source progress and a final summary.
    """
    args = parse_args()
    start_time = time.time()
    print("=" * 80)
    print("🌐 Government Travel Rate Scraper")
    print("=" * 80)
    print(f"📁 Database: {args.db}")
    print()
    connection = db.connect(args.db)
    db.init_db(connection)
    # Running totals for the end-of-run summary.
    total_tables = 0
    total_rate_entries = 0
    total_accommodations = 0
    for idx, source in enumerate(SOURCES, 1):
        source_start = time.time()
        print(f"[{idx}/{len(SOURCES)}] 📥 Scraping: {source.name.upper()}")
        print(f"   🔗 {source.url}")
        tables = scrape_tables_from_source(source)
        db.insert_raw_tables(connection, source.name, source.url, tables)
        total_tables += len(tables)
        print(f"{len(tables)} tables collected")
        rate_entries = extract_rate_entries(source, tables)
        db.insert_rate_entries(connection, rate_entries)
        total_rate_entries += len(rate_entries)
        if rate_entries:
            print(f"{len(rate_entries)} per-diem entries extracted")
        exchange_rates = extract_exchange_rates(source, tables)
        db.insert_exchange_rates(connection, exchange_rates)
        if exchange_rates:
            print(f"{len(exchange_rates)} exchange rates extracted")
        # Accommodation listings only ever come from the dedicated source,
        # so the total is assigned (not accumulated) here.
        if source.name == "accommodations":
            accommodations = extract_accommodations(source, tables)
            db.insert_accommodations(connection, accommodations)
            total_accommodations = len(accommodations)
            print(f"{len(accommodations)} accommodation listings extracted")
        elapsed = time.time() - source_start
        print(f"   ⏱️ Completed in {elapsed:.1f}s")
        print()
    connection.close()
    total_time = time.time() - start_time
    print("=" * 80)
    print("✅ SCRAPING COMPLETE")
    print("=" * 80)
    print("📊 Summary:")
    print(f"   • Total tables: {total_tables:,}")
    print(f"   • Per-diem entries: {total_rate_entries:,}")
    print(f"   • Accommodation listings: {total_accommodations:,}")
    print(f"   • Total time: {total_time:.1f}s")
    print(f"   • Database: {args.db}")
    print("=" * 80)
if __name__ == "__main__":

View File

@@ -4,6 +4,7 @@ import json
import re
import time
from dataclasses import dataclass
from io import StringIO
from typing import Any, Iterable
import pandas as pd
@@ -46,7 +47,8 @@ def fetch_html(url: str, retry=3) -> str:
def extract_tables(html: str) -> list[pd.DataFrame]:
    """Parse every HTML ``<table>`` in *html* into a pandas DataFrame.

    The string is wrapped in :class:`io.StringIO` because passing literal
    HTML directly to ``pd.read_html`` raises a FutureWarning in recent
    pandas versions (string input is reserved for paths/URLs).
    """
    # The stale pre-StringIO return has been removed; it made the
    # wrapped call below unreachable.
    return pd.read_html(StringIO(html))
def _normalize_header(header: str) -> str:
@@ -146,11 +148,11 @@ def scrape_tables_from_source(source: SourceConfig) -> list[dict[str, Any]]:
# For sources with alphabet navigation, fetch all letter pages
if source.uses_alphabet_navigation:
urls = _get_alphabet_urls(source.url)
print(f" Fetching {len(urls)} alphabet pages...")
print(f" 📋 Fetching {len(urls)} alphabet pages...")
else:
urls = [source.url]
for url in urls:
for idx, url in enumerate(urls, 1):
html = fetch_html(url)
try:
tables = extract_tables(html)
@@ -175,8 +177,9 @@ def scrape_tables_from_source(source: SourceConfig) -> list[dict[str, Any]]:
)
table_offset += len(tables)
if len(tables) > 0:
print(f" {url.split('let=')[-1] if 'let=' in url else 'base'}: {len(tables)} tables")
if len(tables) > 0 and source.uses_alphabet_navigation:
letter = url.split('let=')[-1] if 'let=' in url else 'base'
print(f" {letter:>4}: {len(tables)} tables [{idx}/{len(urls)}]")
return results
@@ -185,14 +188,27 @@ def extract_rate_entries(
source: SourceConfig,
tables: Iterable[dict[str, Any]],
) -> list[dict[str, Any]]:
"""Extract per-diem meal and incidental rates (NOT accommodation listings)"""
entries: list[dict[str, Any]] = []
# Only extract per-diem rates from international and domestic sources
if source.name == "accommodations":
return entries
# Define valid per-diem rate columns
valid_rate_types = {
"breakfast", "lunch", "dinner",
"incidental amount", "incidentals",
"private accommodation", "private accom\xadmodation"
}
for table in tables:
# Extract currency and country from table title
table_currency = _extract_currency_from_title(table.get("title"))
table_country = _extract_country_from_title(table.get("title"))
# Default to CAD for domestic Canadian sources
if table_currency is None and source.name in ("domestic", "accommodations"):
if table_currency is None and source.name == "domestic":
table_currency = "CAD"
for row in table["data"]:
@@ -204,16 +220,16 @@ def extract_rate_entries(
currency = _detect_currency(normalized.get("currency"), fallback=table_currency)
effective_date = normalized.get("effective date") or normalized.get("effective")
# Process meal rate columns and other numeric columns
# Only extract per-diem meal and incidental columns
for key, value in normalized.items():
if key in {"country", "country/territory", "city", "location", "province", "province/territory",
"currency", "effective", "effective date", "type of accommodation", "accommodation type",
"meal total", "grand total", "grand total (taxes included)"}:
# Only process valid per-diem rate types
if key not in valid_rate_types:
continue
amount = _parse_amount(value)
if amount is None:
continue
# Use table currency (from title) instead of trying to detect from value
entries.append(
{
"source": source.name,