Version 1.1

This commit is contained in:
2026-02-18 08:24:54 -05:00
parent 4318c8f642
commit fdba869a8d
33 changed files with 2142 additions and 1942 deletions

191
scrapers.py Normal file
View File

@@ -0,0 +1,191 @@
"""
Unified lottery jackpot scrapers with TTL caching.
Consolidates all scraping logic that was previously duplicated across
app.py, email_sender.py, and ``import requests.py``.
"""
from __future__ import annotations
import logging
import re
import time
import requests
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright
from config import load_config
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Browser-like request headers
# ---------------------------------------------------------------------------
# Sent with every plain ``requests`` fetch so the lottery sites serve us the
# same HTML a real Chrome browser would get (the default ``requests``
# User-Agent is commonly blocked or served a stripped-down page).
HEADERS = {
    # Impersonate desktop Chrome on Windows 10.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;"
        "q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    # Sec-Fetch-* values mimic a top-level browser navigation.
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    # Ask intermediaries for a fresh copy rather than a cached page.
    "Cache-Control": "max-age=0",
}
# ---------------------------------------------------------------------------
# Simple TTL cache
# ---------------------------------------------------------------------------
_cache: dict[str, tuple[float, object]] = {}
def _get_cached(key: str, ttl: int) -> object | None:
"""Return cached value if it exists and hasn't expired."""
entry = _cache.get(key)
if entry is None:
return None
ts, value = entry
if time.time() - ts > ttl:
return None
return value
def _set_cached(key: str, value: object) -> None:
    """Store *value* under *key*, stamped with the current wall-clock time."""
    now = time.time()
    _cache[key] = (now, value)
def clear_cache() -> None:
    """Drop every cached entry — handy in tests or to force a re-scrape."""
    _cache.clear()
# ---------------------------------------------------------------------------
# Individual scrapers
# ---------------------------------------------------------------------------
# Dollar amount such as "$20", "$1.5" or "$1,234,567".  The original pattern
# allowed at most ONE thousands group, so "$1,234,567" parsed as 1234.0;
# ``(?:,\d+)*`` is a strict superset that accepts any number of groups.
_AMOUNT_RE = re.compile(r"\$(\d+(?:,\d+)*(?:\.\d+)?)")


def _parse_jackpot_from_lotto_net(html: str) -> float | None:
    """Extract the *Next Jackpot* dollar amount from a lotto.net page.

    The pages render the label "Next Jackpot" on one text line and the
    amount (e.g. "$1.5 Billion", "$20 Million") on the following line.

    Returns the amount in plain dollars, or ``None`` if not found.
    """
    soup = BeautifulSoup(html, "html.parser")
    lines = soup.get_text().split("\n")
    for i, line in enumerate(lines):
        # Guard clauses: need the label, a following line, and an amount.
        if "Next Jackpot" not in line or i + 1 >= len(lines):
            continue
        next_line = lines[i + 1].strip()
        match = _AMOUNT_RE.search(next_line)
        if match is None:
            continue
        value = float(match.group(1).replace(",", ""))
        # Scale by the spelled-out magnitude, if present.
        if "Billion" in next_line:
            return value * 1_000_000_000
        if "Million" in next_line:
            return value * 1_000_000
        return value
    return None
def scrape_powerball(url: str | None = None) -> float | None:
    """Scrape the current Powerball jackpot from lotto.net.

    Falls back to the configured URL when *url* is not supplied; any
    failure is logged and reported as ``None``.
    """
    cfg = load_config()
    target = cfg.urls.powerball if not url else url
    try:
        response = requests.get(target, timeout=15, headers=HEADERS)
        response.raise_for_status()
        return _parse_jackpot_from_lotto_net(response.text)
    except Exception:
        logger.exception("Failed to scrape Powerball from %s", target)
        return None
def scrape_mega_millions(url: str | None = None) -> float | None:
    """Scrape the current Mega Millions jackpot from lotto.net.

    Falls back to the configured URL when *url* is not supplied; any
    failure is logged and reported as ``None``.
    """
    cfg = load_config()
    target = cfg.urls.mega_millions if not url else url
    try:
        response = requests.get(target, timeout=15, headers=HEADERS)
        response.raise_for_status()
        return _parse_jackpot_from_lotto_net(response.text)
    except Exception:
        logger.exception("Failed to scrape Mega Millions from %s", target)
        return None
def scrape_canadian_lotteries(url: str | None = None) -> dict[str, float | None]:
    """Scrape Lotto Max and Lotto 6/49 from OLG using Playwright.

    The OLG page is JavaScript-rendered, so a headless browser is used
    instead of plain HTTP.  Returns a dict with keys ``lottoMax`` and
    ``lotto649``; either value is ``None`` when its amount wasn't found
    or the page failed to load.
    """
    cfg = load_config()
    target = url or cfg.urls.olg
    jackpots: dict[str, float | None] = {"lottoMax": None, "lotto649": None}
    # Each pattern captures a "$NNN Million" amount while a tempered dot
    # ((?:(?!...).)*?) stops it from running into the other game's section.
    patterns = {
        "lottoMax": r"LOTTO\s*MAX(?:(?!LOTTO\s*6/49).)*?\$\s*([\d.,]+)\s*Million",
        "lotto649": r"LOTTO\s*6/49(?:(?!LOTTO\s*MAX).)*?\$\s*([\d.,]+)\s*Million",
    }
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            page = browser.new_page()
            page.goto(target, wait_until="networkidle", timeout=30000)
            # Give late JavaScript widgets a moment to settle.
            page.wait_for_timeout(3000)
            content = page.content()
            browser.close()
        for key, pattern in patterns.items():
            found = re.search(pattern, content, re.IGNORECASE | re.DOTALL)
            if found:
                jackpots[key] = float(found.group(1).replace(",", "")) * 1_000_000
    except Exception:
        logger.exception("Failed to scrape Canadian lotteries from %s", target)
    return jackpots
# ---------------------------------------------------------------------------
# Aggregated fetchers (with cache)
# ---------------------------------------------------------------------------
def get_all_jackpots(*, force_refresh: bool = False) -> dict:
    """Return all four lottery jackpots, using cache when available.

    Pass ``force_refresh=True`` to bypass the TTL cache and re-scrape.

    Returns::
        {
            "us": {"powerball": float|None, "megaMillions": float|None},
            "canadian": {"lottoMax": float|None, "lotto649": float|None},
        }
    """
    cfg = load_config()
    cache_key = "all_jackpots"
    if not force_refresh:
        cached = _get_cached(cache_key, cfg.cache_ttl)
        if cached is not None:
            return cached  # type: ignore[return-value]
    # Scrape everything fresh (US first, then the Canadian pair).
    result = {
        "us": {
            "powerball": scrape_powerball(),
            "megaMillions": scrape_mega_millions(),
        },
        "canadian": scrape_canadian_lotteries(),
    }
    _set_cached(cache_key, result)
    return result