Version 1.1

This commit is contained in:
2026-02-18 08:24:54 -05:00
parent 4318c8f642
commit fdba869a8d
33 changed files with 2142 additions and 1942 deletions

191
scrapers.py Normal file
View File

@@ -0,0 +1,191 @@
"""
Unified lottery jackpot scrapers with TTL caching.
Consolidates all scraping logic that was previously duplicated across
app.py, email_sender.py, and ``import requests.py``.
"""
from __future__ import annotations
import logging
import re
import time
import requests
from bs4 import BeautifulSoup
from playwright.sync_api import sync_playwright
from config import load_config
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Browser-like request headers
# ---------------------------------------------------------------------------
# Sent with every plain ``requests`` fetch so the lottery sites serve us the
# same HTML a real Chrome browser would get (the default ``requests``
# User-Agent is commonly blocked or served a stripped-down page).
HEADERS = {
    # Impersonate desktop Chrome on Windows 10.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;"
        "q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    # Sec-Fetch-* values mimic a top-level browser navigation.
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    # Ask intermediaries for a fresh copy rather than a cached page.
    "Cache-Control": "max-age=0",
}
# ---------------------------------------------------------------------------
# Simple TTL cache
# ---------------------------------------------------------------------------
_cache: dict[str, tuple[float, object]] = {}
def _get_cached(key: str, ttl: int) -> object | None:
"""Return cached value if it exists and hasn't expired."""
entry = _cache.get(key)
if entry is None:
return None
ts, value = entry
if time.time() - ts > ttl:
return None
return value
def _set_cached(key: str, value: object) -> None:
    """Store *value* under *key*, stamped with the current wall-clock time."""
    now = time.time()
    _cache[key] = (now, value)
def clear_cache() -> None:
    """Drop every cached entry — handy in tests or to force a re-scrape."""
    _cache.clear()
# ---------------------------------------------------------------------------
# Individual scrapers
# ---------------------------------------------------------------------------
# Dollar amount such as "$20", "$1.5" or "$1,234,567".  The original pattern
# allowed at most ONE thousands group, so "$1,234,567" parsed as 1234.0;
# ``(?:,\d+)*`` is a strict superset that accepts any number of groups.
_AMOUNT_RE = re.compile(r"\$(\d+(?:,\d+)*(?:\.\d+)?)")


def _parse_jackpot_from_lotto_net(html: str) -> float | None:
    """Extract the *Next Jackpot* dollar amount from a lotto.net page.

    The pages render the label "Next Jackpot" on one text line and the
    amount (e.g. "$1.5 Billion", "$20 Million") on the following line.

    Returns the amount in plain dollars, or ``None`` if not found.
    """
    soup = BeautifulSoup(html, "html.parser")
    lines = soup.get_text().split("\n")
    for i, line in enumerate(lines):
        # Guard clauses: need the label, a following line, and an amount.
        if "Next Jackpot" not in line or i + 1 >= len(lines):
            continue
        next_line = lines[i + 1].strip()
        match = _AMOUNT_RE.search(next_line)
        if match is None:
            continue
        value = float(match.group(1).replace(",", ""))
        # Scale by the spelled-out magnitude, if present.
        if "Billion" in next_line:
            return value * 1_000_000_000
        if "Million" in next_line:
            return value * 1_000_000
        return value
    return None
def scrape_powerball(url: str | None = None) -> float | None:
    """Scrape the current Powerball jackpot from lotto.net.

    Falls back to the configured URL when *url* is not supplied; any
    failure is logged and reported as ``None``.
    """
    cfg = load_config()
    target = cfg.urls.powerball if not url else url
    try:
        response = requests.get(target, timeout=15, headers=HEADERS)
        response.raise_for_status()
        return _parse_jackpot_from_lotto_net(response.text)
    except Exception:
        logger.exception("Failed to scrape Powerball from %s", target)
        return None
def scrape_mega_millions(url: str | None = None) -> float | None:
    """Scrape the current Mega Millions jackpot from lotto.net.

    Falls back to the configured URL when *url* is not supplied; any
    failure is logged and reported as ``None``.
    """
    cfg = load_config()
    target = cfg.urls.mega_millions if not url else url
    try:
        response = requests.get(target, timeout=15, headers=HEADERS)
        response.raise_for_status()
        return _parse_jackpot_from_lotto_net(response.text)
    except Exception:
        logger.exception("Failed to scrape Mega Millions from %s", target)
        return None
def scrape_canadian_lotteries(url: str | None = None) -> dict[str, float | None]:
    """Scrape Lotto Max and Lotto 6/49 from OLG using Playwright.

    The OLG page is JavaScript-rendered, so a headless browser is used
    instead of plain HTTP.  Returns a dict with keys ``lottoMax`` and
    ``lotto649``; either value is ``None`` when its amount wasn't found
    or the page failed to load.
    """
    cfg = load_config()
    target = url or cfg.urls.olg
    jackpots: dict[str, float | None] = {"lottoMax": None, "lotto649": None}
    # Each pattern captures a "$NNN Million" amount while a tempered dot
    # ((?:(?!...).)*?) stops it from running into the other game's section.
    patterns = {
        "lottoMax": r"LOTTO\s*MAX(?:(?!LOTTO\s*6/49).)*?\$\s*([\d.,]+)\s*Million",
        "lotto649": r"LOTTO\s*6/49(?:(?!LOTTO\s*MAX).)*?\$\s*([\d.,]+)\s*Million",
    }
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            page = browser.new_page()
            page.goto(target, wait_until="networkidle", timeout=30000)
            # Give late JavaScript widgets a moment to settle.
            page.wait_for_timeout(3000)
            content = page.content()
            browser.close()
        for key, pattern in patterns.items():
            found = re.search(pattern, content, re.IGNORECASE | re.DOTALL)
            if found:
                jackpots[key] = float(found.group(1).replace(",", "")) * 1_000_000
    except Exception:
        logger.exception("Failed to scrape Canadian lotteries from %s", target)
    return jackpots
# ---------------------------------------------------------------------------
# Aggregated fetchers (with cache)
# ---------------------------------------------------------------------------
def get_all_jackpots(*, force_refresh: bool = False) -> dict:
    """Return all four lottery jackpots, using cache when available.

    Pass ``force_refresh=True`` to bypass the TTL cache and re-scrape.

    Returns::
        {
            "us": {"powerball": float|None, "megaMillions": float|None},
            "canadian": {"lottoMax": float|None, "lotto649": float|None},
        }
    """
    cfg = load_config()
    cache_key = "all_jackpots"
    if not force_refresh:
        cached = _get_cached(cache_key, cfg.cache_ttl)
        if cached is not None:
            return cached  # type: ignore[return-value]
    # Scrape everything fresh (US first, then the Canadian pair).
    result = {
        "us": {
            "powerball": scrape_powerball(),
            "megaMillions": scrape_mega_millions(),
        },
        "canadian": scrape_canadian_lotteries(),
    }
    _set_cached(cache_key, result)
    return result