""" Unified lottery jackpot scrapers with TTL caching. Consolidates all scraping logic that was previously duplicated across app.py, email_sender.py, and ``import requests.py``. """ from __future__ import annotations import logging import re import time import requests from bs4 import BeautifulSoup from playwright.sync_api import sync_playwright from config import load_config logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Browser-like request headers # --------------------------------------------------------------------------- HEADERS = { "User-Agent": ( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " "AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/120.0.0.0 Safari/537.36" ), "Accept": ( "text/html,application/xhtml+xml,application/xml;" "q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8" ), "Accept-Language": "en-US,en;q=0.9", "Accept-Encoding": "gzip, deflate, br", "Connection": "keep-alive", "Upgrade-Insecure-Requests": "1", "Sec-Fetch-Dest": "document", "Sec-Fetch-Mode": "navigate", "Sec-Fetch-Site": "none", "Cache-Control": "max-age=0", } # --------------------------------------------------------------------------- # Simple TTL cache # --------------------------------------------------------------------------- _cache: dict[str, tuple[float, object]] = {} def _get_cached(key: str, ttl: int) -> object | None: """Return cached value if it exists and hasn't expired.""" entry = _cache.get(key) if entry is None: return None ts, value = entry if time.time() - ts > ttl: return None return value def _set_cached(key: str, value: object) -> None: _cache[key] = (time.time(), value) def clear_cache() -> None: """Clear the scraper cache (useful for testing or forcing refresh).""" _cache.clear() # --------------------------------------------------------------------------- # Individual scrapers # --------------------------------------------------------------------------- def _parse_jackpot_from_lotto_net(html: str) -> float | None: """Extract the *Next Jackpot* dollar amount from a lotto.net page.""" soup = BeautifulSoup(html, "html.parser") text = soup.get_text() lines = text.split("\n") for i, line in enumerate(lines): if "Next Jackpot" in line and i + 1 < len(lines): next_line = lines[i + 1].strip() if "$" in next_line: match = re.search(r"\$(\d+(?:,\d+)?(?:\.\d+)?)", next_line) if match: value = float(match.group(1).replace(",", "")) if "Billion" in next_line: return value * 1_000_000_000 if "Million" in next_line: return value * 1_000_000 return value return None def scrape_powerball(url: str | None = None) -> float | None: """Scrape the current Powerball jackpot from lotto.net.""" cfg = load_config() target = url or cfg.urls.powerball try: resp = requests.get(target, timeout=15, headers=HEADERS) resp.raise_for_status() return _parse_jackpot_from_lotto_net(resp.text) except Exception: logger.exception("Failed to scrape Powerball from %s", target) return None def scrape_mega_millions(url: str | None = None) -> float | None: """Scrape the current Mega Millions jackpot from lotto.net.""" cfg = load_config() target = url or cfg.urls.mega_millions try: resp = requests.get(target, timeout=15, headers=HEADERS) resp.raise_for_status() return _parse_jackpot_from_lotto_net(resp.text) except Exception: logger.exception("Failed to scrape Mega Millions from %s", target) return None def scrape_canadian_lotteries(url: str | None = None) -> dict[str, float | None]: """Scrape Lotto Max and Lotto 6/49 from OLG using Playwright.""" cfg = load_config() target = url or cfg.urls.olg results: dict[str, float | None] = {"lottoMax": None, "lotto649": None} try: with sync_playwright() as p: browser = p.chromium.launch(headless=True) page = browser.new_page() page.goto(target, wait_until="networkidle", timeout=30000) page.wait_for_timeout(3000) content = page.content() browser.close() # Lotto Max max_match = re.search( r"LOTTO\s*MAX(?:(?!LOTTO\s*6/49).)*?\$\s*([\d.,]+)\s*Million", content, re.IGNORECASE | re.DOTALL, ) if max_match: results["lottoMax"] = float(max_match.group(1).replace(",", "")) * 1_000_000 # Lotto 6/49 match_649 = re.search( r"LOTTO\s*6/49(?:(?!LOTTO\s*MAX).)*?\$\s*([\d.,]+)\s*Million", content, re.IGNORECASE | re.DOTALL, ) if match_649: results["lotto649"] = float(match_649.group(1).replace(",", "")) * 1_000_000 except Exception: logger.exception("Failed to scrape Canadian lotteries from %s", target) return results # --------------------------------------------------------------------------- # Aggregated fetchers (with cache) # --------------------------------------------------------------------------- def get_all_jackpots(*, force_refresh: bool = False) -> dict: """Return all four lottery jackpots, using cache when available. Returns:: { "us": {"powerball": float|None, "megaMillions": float|None}, "canadian": {"lottoMax": float|None, "lotto649": float|None}, } """ cfg = load_config() cache_key = "all_jackpots" if not force_refresh: cached = _get_cached(cache_key, cfg.cache_ttl) if cached is not None: return cached # type: ignore[return-value] pb = scrape_powerball() mm = scrape_mega_millions() canadian = scrape_canadian_lotteries() result = { "us": {"powerball": pb, "megaMillions": mm}, "canadian": canadian, } _set_cached(cache_key, result) return result