mirror of
https://github.com/mblanke/Lottery-Tracker.git
synced 2026-03-01 14:10:22 -05:00
192 lines
6.1 KiB
Python
192 lines
6.1 KiB
Python
"""Unified lottery jackpot scrapers with TTL caching.

Consolidates all scraping logic that was previously duplicated across
app.py, email_sender.py, and ``import requests.py``.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
import time
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from playwright.sync_api import sync_playwright
|
|
|
|
from config import load_config
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Browser-like request headers
|
|
# ---------------------------------------------------------------------------
|
|
HEADERS = {
|
|
"User-Agent": (
|
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
|
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
|
"Chrome/120.0.0.0 Safari/537.36"
|
|
),
|
|
"Accept": (
|
|
"text/html,application/xhtml+xml,application/xml;"
|
|
"q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"
|
|
),
|
|
"Accept-Language": "en-US,en;q=0.9",
|
|
"Accept-Encoding": "gzip, deflate, br",
|
|
"Connection": "keep-alive",
|
|
"Upgrade-Insecure-Requests": "1",
|
|
"Sec-Fetch-Dest": "document",
|
|
"Sec-Fetch-Mode": "navigate",
|
|
"Sec-Fetch-Site": "none",
|
|
"Cache-Control": "max-age=0",
|
|
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Simple TTL cache
|
|
# ---------------------------------------------------------------------------
|
|
_cache: dict[str, tuple[float, object]] = {}
|
|
|
|
|
|
def _get_cached(key: str, ttl: int) -> object | None:
|
|
"""Return cached value if it exists and hasn't expired."""
|
|
entry = _cache.get(key)
|
|
if entry is None:
|
|
return None
|
|
ts, value = entry
|
|
if time.time() - ts > ttl:
|
|
return None
|
|
return value
|
|
|
|
|
|
def _set_cached(key: str, value: object) -> None:
    """Store *value* under *key*, stamped with the current time."""
    now = time.time()
    _cache[key] = (now, value)
|
|
|
|
|
|
def clear_cache() -> None:
    """Drop every cached entry (useful in tests or to force a refresh)."""
    _cache.clear()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Individual scrapers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _parse_jackpot_from_lotto_net(html: str) -> float | None:
    """Extract the *Next Jackpot* dollar amount from a lotto.net page.

    Scans the page text for a line containing "Next Jackpot", then parses
    the dollar figure on the following line, scaling by a "Million" or
    "Billion" suffix when present.

    Args:
        html: Raw HTML of a lotto.net lottery page.

    Returns:
        The jackpot in dollars, or ``None`` when no amount could be found.
    """
    soup = BeautifulSoup(html, "html.parser")
    lines = soup.get_text().split("\n")
    for i, line in enumerate(lines):
        if "Next Jackpot" in line and i + 1 < len(lines):
            next_line = lines[i + 1].strip()
            if "$" not in next_line:
                continue
            # Allow any number of thousands separators; the previous pattern
            # (?:,\d+)? matched at most one comma group, truncating amounts
            # like "$1,234,567" to 1234.
            match = re.search(r"\$(\d+(?:,\d+)*(?:\.\d+)?)", next_line)
            if not match:
                continue
            value = float(match.group(1).replace(",", ""))
            if "Billion" in next_line:
                return value * 1_000_000_000
            if "Million" in next_line:
                return value * 1_000_000
            return value
    return None
|
|
|
|
|
|
def scrape_powerball(url: str | None = None) -> float | None:
    """Scrape the current Powerball jackpot from lotto.net.

    Args:
        url: Optional override for the configured Powerball URL.

    Returns:
        The jackpot in dollars, or ``None`` on any failure (logged).
    """
    cfg = load_config()
    target = url or cfg.urls.powerball
    try:
        response = requests.get(target, timeout=15, headers=HEADERS)
        response.raise_for_status()
        return _parse_jackpot_from_lotto_net(response.text)
    except Exception:
        # Best-effort scraper: record the failure and report "unknown".
        logger.exception("Failed to scrape Powerball from %s", target)
        return None
|
|
|
|
|
|
def scrape_mega_millions(url: str | None = None) -> float | None:
    """Scrape the current Mega Millions jackpot from lotto.net.

    Args:
        url: Optional override for the configured Mega Millions URL.

    Returns:
        The jackpot in dollars, or ``None`` on any failure (logged).
    """
    cfg = load_config()
    target = url or cfg.urls.mega_millions
    try:
        response = requests.get(target, timeout=15, headers=HEADERS)
        response.raise_for_status()
        return _parse_jackpot_from_lotto_net(response.text)
    except Exception:
        # Best-effort scraper: record the failure and report "unknown".
        logger.exception("Failed to scrape Mega Millions from %s", target)
        return None
|
|
|
|
|
|
def scrape_canadian_lotteries(url: str | None = None) -> dict[str, float | None]:
    """Scrape Lotto Max and Lotto 6/49 from OLG using Playwright.

    The OLG page renders jackpots client-side, so a headless browser is
    required rather than a plain HTTP fetch.

    Args:
        url: Optional override for the configured OLG URL.

    Returns:
        ``{"lottoMax": float | None, "lotto649": float | None}`` — either
        value is ``None`` when its jackpot could not be parsed.
    """
    cfg = load_config()
    target = url or cfg.urls.olg
    results: dict[str, float | None] = {"lottoMax": None, "lotto649": None}

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                page.goto(target, wait_until="networkidle", timeout=30000)
                # Give client-side rendering a moment to populate jackpots.
                page.wait_for_timeout(3000)
                content = page.content()
            finally:
                # Always release the browser: previously close() was only
                # reached on success, leaking a headless Chromium process
                # whenever goto/content raised (e.g. a navigation timeout).
                browser.close()

        # Lotto Max: first "$ N Million" after "LOTTO MAX" that appears
        # before any "LOTTO 6/49" heading (tempered-dot pattern).
        max_match = re.search(
            r"LOTTO\s*MAX(?:(?!LOTTO\s*6/49).)*?\$\s*([\d.,]+)\s*Million",
            content,
            re.IGNORECASE | re.DOTALL,
        )
        if max_match:
            results["lottoMax"] = float(max_match.group(1).replace(",", "")) * 1_000_000

        # Lotto 6/49: same idea, scoped before any "LOTTO MAX" heading.
        match_649 = re.search(
            r"LOTTO\s*6/49(?:(?!LOTTO\s*MAX).)*?\$\s*([\d.,]+)\s*Million",
            content,
            re.IGNORECASE | re.DOTALL,
        )
        if match_649:
            results["lotto649"] = float(match_649.group(1).replace(",", "")) * 1_000_000

    except Exception:
        # Best-effort: on any failure return whatever was parsed (possibly
        # both None) rather than raising to the caller.
        logger.exception("Failed to scrape Canadian lotteries from %s", target)

    return results
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Aggregated fetchers (with cache)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def get_all_jackpots(*, force_refresh: bool = False) -> dict:
    """Return all four lottery jackpots, using cache when available.

    Args:
        force_refresh: When true, skip the cache and scrape fresh values.

    Returns::

        {
            "us": {"powerball": float|None, "megaMillions": float|None},
            "canadian": {"lottoMax": float|None, "lotto649": float|None},
        }
    """
    cfg = load_config()
    cache_key = "all_jackpots"

    if not force_refresh:
        hit = _get_cached(cache_key, cfg.cache_ttl)
        if hit is not None:
            return hit  # type: ignore[return-value]

    # Cache miss (or forced refresh): scrape everything fresh.
    payload = {
        "us": {
            "powerball": scrape_powerball(),
            "megaMillions": scrape_mega_millions(),
        },
        "canadian": scrape_canadian_lotteries(),
    }
    _set_cached(cache_key, payload)
    return payload
|