mirror of
https://github.com/mblanke/Gov_Travel_App.git
synced 2026-03-01 14:10:22 -05:00
Add Python web scraper for NJC travel rates with currency extraction
- Implemented Python scraper using BeautifulSoup and pandas to automatically collect travel rates from official NJC website - Added currency extraction from table titles (supports EUR, USD, AUD, CAD, ARS, etc.) - Added country extraction from table titles for international rates - Flatten pandas MultiIndex columns for cleaner data structure - Default to CAD for domestic Canadian sources (accommodations and domestic tables) - Created SQLite database schema (raw_tables, rate_entries, exchange_rates, accommodations) - Successfully scraped 92 tables with 17,205 rate entries covering 25 international cities - Added migration script to convert scraped data to Node.js database format - Updated .gitignore for Python files (.venv/, __pycache__, *.pyc, *.sqlite3) - Fixed city validation and currency conversion in main app - Added comprehensive debug and verification scripts This replaces manual JSON maintenance with automated data collection from official government source.
This commit is contained in:
212
scripts/migrateScrapedToNodeJS.js
Normal file
212
scripts/migrateScrapedToNodeJS.js
Normal file
@@ -0,0 +1,212 @@
|
||||
/**
|
||||
* Migration script to convert scraped SQLite database to Node.js travel_rates schema
|
||||
*
|
||||
* Source DB: travel_rates_scraped.sqlite3 (from Python scraper)
|
||||
* Target DB: travel_rates.db (Node.js app schema)
|
||||
*
|
||||
* This script:
|
||||
* 1. Reads rate_entries from scraped DB
|
||||
* 2. Aggregates meal rates (breakfast, lunch, dinner) and incidentals by city
|
||||
* 3. Inserts into travel_rates table in Node.js format
|
||||
*/
|
||||
|
||||
const sqlite3 = require("sqlite3").verbose();
|
||||
const path = require("path");
|
||||
|
||||
// Database paths
|
||||
const SOURCE_DB = path.join(
|
||||
__dirname,
|
||||
"..",
|
||||
"data",
|
||||
"travel_rates_scraped.sqlite3"
|
||||
);
|
||||
const TARGET_DB = path.join(__dirname, "..", "travel_rates.db");
|
||||
|
||||
// Exchange rates for display (not used for conversion, just for reference)
|
||||
const EXCHANGE_RATES = {
|
||||
EUR: 1.54, // EUR to CAD
|
||||
USD: 1.39, // USD to CAD
|
||||
AUD: 0.92, // AUD to CAD
|
||||
CAD: 1.0,
|
||||
ARS: 0.0014, // ARS to CAD (approximate)
|
||||
};
|
||||
|
||||
async function openDatabase(dbPath) {
|
||||
return new Promise((resolve, reject) => {
|
||||
const db = new sqlite3.Database(dbPath, (err) => {
|
||||
if (err) reject(err);
|
||||
else resolve(db);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
async function queryAll(db, sql, params = []) {
|
||||
return new Promise((resolve, reject) => {
|
||||
db.all(sql, params, (err, rows) => {
|
||||
if (err) reject(err);
|
||||
else resolve(rows);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
async function runQuery(db, sql, params = []) {
|
||||
return new Promise((resolve, reject) => {
|
||||
db.run(sql, params, function (err) {
|
||||
if (err) reject(err);
|
||||
else resolve(this);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
async function aggregateCityRates(sourceDb) {
|
||||
// Get all international cities with their meal rates
|
||||
const query = `
|
||||
SELECT
|
||||
country,
|
||||
city,
|
||||
currency,
|
||||
MAX(CASE WHEN rate_type LIKE '%breakfast%' THEN rate_amount END) as breakfast,
|
||||
MAX(CASE WHEN rate_type LIKE '%lunch%' THEN rate_amount END) as lunch,
|
||||
MAX(CASE WHEN rate_type LIKE '%dinner%' THEN rate_amount END) as dinner,
|
||||
MAX(CASE WHEN rate_type LIKE '%incidental%' THEN rate_amount END) as incidentals
|
||||
FROM rate_entries
|
||||
WHERE city IS NOT NULL
|
||||
AND country IS NOT NULL
|
||||
AND source = 'international'
|
||||
GROUP BY country, city, currency
|
||||
HAVING breakfast IS NOT NULL OR lunch IS NOT NULL OR dinner IS NOT NULL
|
||||
`;
|
||||
|
||||
return await queryAll(sourceDb, query);
|
||||
}
|
||||
|
||||
async function clearTargetDatabase(targetDb) {
|
||||
await runQuery(targetDb, "DELETE FROM travel_rates");
|
||||
console.log("Cleared existing travel_rates data");
|
||||
}
|
||||
|
||||
async function insertCityRates(targetDb, cities) {
|
||||
const insertStmt = `
|
||||
INSERT INTO travel_rates (
|
||||
city_key, city_name, country, breakfast, lunch, dinner,
|
||||
incidentals, currency, standardRate, standard_rate
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
`;
|
||||
|
||||
let inserted = 0;
|
||||
let skipped = 0;
|
||||
|
||||
for (const city of cities) {
|
||||
try {
|
||||
// Create city key (lowercase, spaces to dashes)
|
||||
const cityKey = `${city.city
|
||||
.toLowerCase()
|
||||
.replace(/\s+/g, "-")}-${city.country
|
||||
.toLowerCase()
|
||||
.replace(/\s+/g, "-")}`;
|
||||
|
||||
// Standard rate is typically breakfast + lunch + dinner
|
||||
const standardRate =
|
||||
(city.breakfast || 0) + (city.lunch || 0) + (city.dinner || 0);
|
||||
|
||||
await runQuery(targetDb, insertStmt, [
|
||||
cityKey,
|
||||
city.city,
|
||||
city.country,
|
||||
city.breakfast,
|
||||
city.lunch,
|
||||
city.dinner,
|
||||
city.incidentals,
|
||||
city.currency,
|
||||
standardRate,
|
||||
standardRate, // Both standardRate and standard_rate for compatibility
|
||||
]);
|
||||
|
||||
inserted++;
|
||||
} catch (err) {
|
||||
console.error(
|
||||
`Error inserting ${city.city}, ${city.country}: ${err.message}`
|
||||
);
|
||||
skipped++;
|
||||
}
|
||||
}
|
||||
|
||||
return { inserted, skipped };
|
||||
}
|
||||
|
||||
async function migrate() {
|
||||
console.log(
|
||||
"Starting migration from scraped database to Node.js schema...\n"
|
||||
);
|
||||
|
||||
let sourceDb, targetDb;
|
||||
|
||||
try {
|
||||
// Open databases
|
||||
console.log(`Opening source database: ${SOURCE_DB}`);
|
||||
sourceDb = await openDatabase(SOURCE_DB);
|
||||
|
||||
console.log(`Opening target database: ${TARGET_DB}`);
|
||||
targetDb = await openDatabase(TARGET_DB);
|
||||
|
||||
// Aggregate city rates from scraped data
|
||||
console.log("\nAggregating city rates from scraped data...");
|
||||
const cities = await aggregateCityRates(sourceDb);
|
||||
console.log(`Found ${cities.length} cities with meal rates`);
|
||||
|
||||
// Show currency distribution
|
||||
const currencyCounts = cities.reduce((acc, city) => {
|
||||
acc[city.currency] = (acc[city.currency] || 0) + 1;
|
||||
return acc;
|
||||
}, {});
|
||||
console.log("\nCurrency distribution:");
|
||||
for (const [currency, count] of Object.entries(currencyCounts)) {
|
||||
console.log(` ${currency}: ${count} cities`);
|
||||
}
|
||||
|
||||
// Clear target database
|
||||
console.log("\nClearing target database...");
|
||||
await clearTargetDatabase(targetDb);
|
||||
|
||||
// Insert city rates
|
||||
console.log("\nInserting city rates into target database...");
|
||||
const result = await insertCityRates(targetDb, cities);
|
||||
|
||||
console.log(`\nMigration complete!`);
|
||||
console.log(` Inserted: ${result.inserted} cities`);
|
||||
console.log(` Skipped: ${result.skipped} cities`);
|
||||
|
||||
// Show sample entries
|
||||
console.log("\nSample migrated entries:");
|
||||
const samples = await queryAll(
|
||||
targetDb,
|
||||
`
|
||||
SELECT city_name, country, breakfast, lunch, dinner, incidentals, currency
|
||||
FROM travel_rates
|
||||
WHERE country IN ('Argentina', 'Albania', 'Australia')
|
||||
LIMIT 5
|
||||
`
|
||||
);
|
||||
for (const sample of samples) {
|
||||
console.log(
|
||||
` ${sample.city_name}, ${sample.country}: B:${sample.breakfast} L:${sample.lunch} D:${sample.dinner} I:${sample.incidentals} (${sample.currency})`
|
||||
);
|
||||
}
|
||||
} catch (err) {
|
||||
console.error("\nMigration failed:", err);
|
||||
process.exit(1);
|
||||
} finally {
|
||||
if (sourceDb) sourceDb.close();
|
||||
if (targetDb) targetDb.close();
|
||||
}
|
||||
}
|
||||
|
||||
// Run migration
|
||||
if (require.main === module) {
|
||||
migrate().catch((err) => {
|
||||
console.error("Fatal error:", err);
|
||||
process.exit(1);
|
||||
});
|
||||
}
|
||||
|
||||
module.exports = { migrate };
|
||||
Reference in New Issue
Block a user