mirror of
https://github.com/mblanke/Lottery-Tracker.git
synced 2026-03-01 14:10:22 -05:00
192 lines
6.1 KiB
Python
192 lines
6.1 KiB
Python
"""Unified lottery jackpot scrapers with TTL caching.

Consolidates all scraping logic that was previously duplicated across
app.py, email_sender.py, and ``import requests.py``.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
import time
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from playwright.sync_api import sync_playwright
|
|
|
|
from config import load_config
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Browser-like request headers
|
|
# ---------------------------------------------------------------------------
|
|
HEADERS = {
|
|
"User-Agent": (
|
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
|
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
|
"Chrome/120.0.0.0 Safari/537.36"
|
|
),
|
|
"Accept": (
|
|
"text/html,application/xhtml+xml,application/xml;"
|
|
"q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"
|
|
),
|
|
"Accept-Language": "en-US,en;q=0.9",
|
|
"Accept-Encoding": "gzip, deflate, br",
|
|
"Connection": "keep-alive",
|
|
"Upgrade-Insecure-Requests": "1",
|
|
"Sec-Fetch-Dest": "document",
|
|
"Sec-Fetch-Mode": "navigate",
|
|
"Sec-Fetch-Site": "none",
|
|
"Cache-Control": "max-age=0",
|
|
}
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Simple TTL cache
|
|
# ---------------------------------------------------------------------------
|
|
_cache: dict[str, tuple[float, object]] = {}
|
|
|
|
|
|
def _get_cached(key: str, ttl: int) -> object | None:
|
|
"""Return cached value if it exists and hasn't expired."""
|
|
entry = _cache.get(key)
|
|
if entry is None:
|
|
return None
|
|
ts, value = entry
|
|
if time.time() - ts > ttl:
|
|
return None
|
|
return value
|
|
|
|
|
|
def _set_cached(key: str, value: object) -> None:
    """Store *value* under *key*, stamped with the current time."""
    now = time.time()
    _cache[key] = (now, value)
|
|
|
|
|
|
def clear_cache() -> None:
    """Drop every cached entry (useful in tests or to force a refresh)."""
    _cache.clear()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Individual scrapers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _parse_jackpot_from_lotto_net(html: str) -> float | None:
    """Extract the *Next Jackpot* dollar amount from a lotto.net page.

    Scans the page text for a line containing "Next Jackpot", then parses
    the dollar figure on the following line, scaling by a "Million" or
    "Billion" suffix when present.

    Args:
        html: Raw HTML of a lotto.net lottery page.

    Returns:
        The jackpot in dollars, or ``None`` when no amount could be found.
    """
    soup = BeautifulSoup(html, "html.parser")
    lines = soup.get_text().split("\n")
    for i, line in enumerate(lines):
        if "Next Jackpot" in line and i + 1 < len(lines):
            next_line = lines[i + 1].strip()
            if "$" not in next_line:
                continue
            # Allow any number of thousands separators; the previous pattern
            # (?:,\d+)? matched at most one comma group, truncating amounts
            # like "$1,234,567" to 1234.
            match = re.search(r"\$(\d+(?:,\d+)*(?:\.\d+)?)", next_line)
            if not match:
                continue
            value = float(match.group(1).replace(",", ""))
            if "Billion" in next_line:
                return value * 1_000_000_000
            if "Million" in next_line:
                return value * 1_000_000
            return value
    return None
|
|
|
|
|
|
def scrape_powerball(url: str | None = None) -> float | None:
    """Scrape the current Powerball jackpot from lotto.net.

    Args:
        url: Optional override for the configured Powerball URL.

    Returns:
        The jackpot in dollars, or ``None`` on any failure (logged).
    """
    cfg = load_config()
    target = url or cfg.urls.powerball
    try:
        response = requests.get(target, timeout=15, headers=HEADERS)
        response.raise_for_status()
        return _parse_jackpot_from_lotto_net(response.text)
    except Exception:
        # Best-effort scraper: record the failure and report "unknown".
        logger.exception("Failed to scrape Powerball from %s", target)
        return None
|
|
|
|
|
|
def scrape_mega_millions(url: str | None = None) -> float | None:
    """Scrape the current Mega Millions jackpot from lotto.net.

    Args:
        url: Optional override for the configured Mega Millions URL.

    Returns:
        The jackpot in dollars, or ``None`` on any failure (logged).
    """
    cfg = load_config()
    target = url or cfg.urls.mega_millions
    try:
        response = requests.get(target, timeout=15, headers=HEADERS)
        response.raise_for_status()
        return _parse_jackpot_from_lotto_net(response.text)
    except Exception:
        # Best-effort scraper: record the failure and report "unknown".
        logger.exception("Failed to scrape Mega Millions from %s", target)
        return None
|
|
|
|
|
|
def scrape_canadian_lotteries(url: str | None = None) -> dict[str, float | None]:
    """Scrape Lotto Max and Lotto 6/49 from OLG using Playwright.

    The OLG page renders jackpots client-side, so a headless browser is
    required rather than a plain HTTP fetch.

    Args:
        url: Optional override for the configured OLG URL.

    Returns:
        ``{"lottoMax": float | None, "lotto649": float | None}`` — either
        value is ``None`` when its jackpot could not be parsed.
    """
    cfg = load_config()
    target = url or cfg.urls.olg
    results: dict[str, float | None] = {"lottoMax": None, "lotto649": None}

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                page.goto(target, wait_until="networkidle", timeout=30000)
                # Give client-side rendering a moment to populate jackpots.
                page.wait_for_timeout(3000)
                content = page.content()
            finally:
                # Always release the browser: previously close() was only
                # reached on success, leaking a headless Chromium process
                # whenever goto/content raised (e.g. a navigation timeout).
                browser.close()

        # Lotto Max: first "$ N Million" after "LOTTO MAX" that appears
        # before any "LOTTO 6/49" heading (tempered-dot pattern).
        max_match = re.search(
            r"LOTTO\s*MAX(?:(?!LOTTO\s*6/49).)*?\$\s*([\d.,]+)\s*Million",
            content,
            re.IGNORECASE | re.DOTALL,
        )
        if max_match:
            results["lottoMax"] = float(max_match.group(1).replace(",", "")) * 1_000_000

        # Lotto 6/49: same idea, scoped before any "LOTTO MAX" heading.
        match_649 = re.search(
            r"LOTTO\s*6/49(?:(?!LOTTO\s*MAX).)*?\$\s*([\d.,]+)\s*Million",
            content,
            re.IGNORECASE | re.DOTALL,
        )
        if match_649:
            results["lotto649"] = float(match_649.group(1).replace(",", "")) * 1_000_000

    except Exception:
        # Best-effort: on any failure return whatever was parsed (possibly
        # both None) rather than raising to the caller.
        logger.exception("Failed to scrape Canadian lotteries from %s", target)

    return results
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Aggregated fetchers (with cache)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def get_all_jackpots(*, force_refresh: bool = False) -> dict:
    """Return all four lottery jackpots, using cache when available.

    Args:
        force_refresh: When true, skip the cache and scrape fresh values.

    Returns::

        {
            "us": {"powerball": float|None, "megaMillions": float|None},
            "canadian": {"lottoMax": float|None, "lotto649": float|None},
        }
    """
    cfg = load_config()
    cache_key = "all_jackpots"

    if not force_refresh:
        hit = _get_cached(cache_key, cfg.cache_ttl)
        if hit is not None:
            return hit  # type: ignore[return-value]

    # Cache miss (or forced refresh): scrape everything fresh.
    payload = {
        "us": {
            "powerball": scrape_powerball(),
            "megaMillions": scrape_mega_millions(),
        },
        "canadian": scrape_canadian_lotteries(),
    }
    _set_cached(cache_key, payload)
    return payload
|