mirror of
https://github.com/mblanke/Lottery-Tracker.git
synced 2026-03-01 14:10:22 -05:00
Version 1.1
This commit is contained in:
191
scrapers.py
Normal file
191
scrapers.py
Normal file
@@ -0,0 +1,191 @@
|
||||
"""
|
||||
Unified lottery jackpot scrapers with TTL caching.
|
||||
|
||||
Consolidates all scraping logic that was previously duplicated across
|
||||
app.py, email_sender.py, and ``import requests.py``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from playwright.sync_api import sync_playwright
|
||||
|
||||
from config import load_config
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Browser-like request headers
|
||||
# ---------------------------------------------------------------------------
|
||||
# Request headers that imitate a desktop Chrome browser; lottery sites tend
# to reject requests whose headers look like a script rather than a browser.
HEADERS = {
    # Desktop Chrome 120 on Windows 10.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    # Standard Chrome navigation Accept header (HTML preferred over images).
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;"
        "q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    # Sec-Fetch-* values match a top-level, user-initiated page navigation.
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    # Ask intermediaries for a fresh copy rather than a cached page.
    "Cache-Control": "max-age=0",
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Simple TTL cache
|
||||
# ---------------------------------------------------------------------------
|
||||
# Maps cache key -> (insertion timestamp, stored value).
_cache: dict[str, tuple[float, object]] = {}


def _get_cached(key: str, ttl: int) -> object | None:
    """Look up *key* in the cache; return None when missing or older than *ttl* seconds."""
    try:
        stored_at, payload = _cache[key]
    except KeyError:
        return None
    # An entry is fresh while its age is at most ttl seconds.
    return payload if time.time() - stored_at <= ttl else None
|
||||
|
||||
|
||||
def _set_cached(key: str, value: object) -> None:
    """Store *value* under *key*, stamped with the current time for TTL checks."""
    now = time.time()
    _cache[key] = (now, value)
|
||||
|
||||
|
||||
def clear_cache() -> None:
    """Drop every cached entry (useful in tests or to force a refresh)."""
    _cache.clear()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Individual scrapers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Matches a dollar amount such as "$20", "$1,234,567" or "$1.5".
# NOTE: the previous pattern allowed only a single ",ddd" group, so
# "$1,234,567" was truncated to 1234.0; (?:,\d{3})* accepts any number
# of thousands separators.
_AMOUNT_RE = re.compile(r"\$(\d+(?:,\d{3})*(?:\.\d+)?)")


def _parse_jackpot_from_lotto_net(html: str) -> float | None:
    """Extract the *Next Jackpot* dollar amount from a lotto.net page.

    Scans the page's visible text for a line containing "Next Jackpot" and
    parses the dollar figure on the following line, scaling by "Million" /
    "Billion" suffixes when present.

    Returns the jackpot in dollars, or None if no amount could be found.
    """
    soup = BeautifulSoup(html, "html.parser")
    text = soup.get_text()
    lines = text.split("\n")
    for i, line in enumerate(lines):
        # The amount appears on the line immediately after the label.
        if "Next Jackpot" in line and i + 1 < len(lines):
            next_line = lines[i + 1].strip()
            if "$" in next_line:
                match = _AMOUNT_RE.search(next_line)
                if match:
                    value = float(match.group(1).replace(",", ""))
                    # Scale word-suffixed amounts ("$1.5 Billion", "$20 Million").
                    if "Billion" in next_line:
                        return value * 1_000_000_000
                    if "Million" in next_line:
                        return value * 1_000_000
                    return value
    return None
|
||||
|
||||
|
||||
def scrape_powerball(url: str | None = None) -> float | None:
    """Fetch the lotto.net Powerball page and return the current jackpot.

    Falls back to the configured Powerball URL when *url* is not given.
    Returns None (and logs the traceback) on any network or parse failure.
    """
    cfg = load_config()
    target = url or cfg.urls.powerball
    try:
        response = requests.get(target, timeout=15, headers=HEADERS)
        response.raise_for_status()
        return _parse_jackpot_from_lotto_net(response.text)
    except Exception:
        # Best-effort scraper: never propagate, just record what went wrong.
        logger.exception("Failed to scrape Powerball from %s", target)
        return None
|
||||
|
||||
|
||||
def scrape_mega_millions(url: str | None = None) -> float | None:
    """Fetch the lotto.net Mega Millions page and return the current jackpot.

    Falls back to the configured Mega Millions URL when *url* is not given.
    Returns None (and logs the traceback) on any network or parse failure.
    """
    cfg = load_config()
    target = url or cfg.urls.mega_millions
    try:
        response = requests.get(target, timeout=15, headers=HEADERS)
        response.raise_for_status()
        return _parse_jackpot_from_lotto_net(response.text)
    except Exception:
        # Best-effort scraper: never propagate, just record what went wrong.
        logger.exception("Failed to scrape Mega Millions from %s", target)
        return None
|
||||
|
||||
|
||||
def _extract_olg_millions(content: str, label: str, other_label: str) -> float | None:
    """Return the '$N Million' amount following *label*, stopping before *other_label*.

    The tempered pattern ``(?:(?!other).)*?`` keeps the match confined to one
    game's section so Lotto Max never picks up the 6/49 figure and vice versa.
    Returns the amount in dollars, or None if the pattern is absent.
    """
    match = re.search(
        rf"{label}(?:(?!{other_label}).)*?\$\s*([\d.,]+)\s*Million",
        content,
        re.IGNORECASE | re.DOTALL,
    )
    if match:
        return float(match.group(1).replace(",", "")) * 1_000_000
    return None


def scrape_canadian_lotteries(url: str | None = None) -> dict[str, float | None]:
    """Scrape Lotto Max and Lotto 6/49 jackpots from OLG using Playwright.

    Falls back to the configured OLG URL when *url* is not given.

    Returns::

        {"lottoMax": float|None, "lotto649": float|None}

    with None values for anything that could not be fetched or parsed.
    """
    cfg = load_config()
    target = url or cfg.urls.olg
    results: dict[str, float | None] = {"lottoMax": None, "lotto649": None}

    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                page.goto(target, wait_until="networkidle", timeout=30000)
                # Give client-side rendering a moment to fill in jackpot values.
                page.wait_for_timeout(3000)
                content = page.content()
            finally:
                # Previously the browser stayed open if goto()/content() raised;
                # always close it so a failed scrape doesn't leak a process.
                browser.close()

        results["lottoMax"] = _extract_olg_millions(
            content, r"LOTTO\s*MAX", r"LOTTO\s*6/49"
        )
        results["lotto649"] = _extract_olg_millions(
            content, r"LOTTO\s*6/49", r"LOTTO\s*MAX"
        )
    except Exception:
        # Best-effort scraper: log and fall through with whatever we have.
        logger.exception("Failed to scrape Canadian lotteries from %s", target)

    return results
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Aggregated fetchers (with cache)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def get_all_jackpots(*, force_refresh: bool = False) -> dict:
    """Return all four lottery jackpots, serving from cache when possible.

    Pass ``force_refresh=True`` to bypass the cache and scrape everything anew.

    Returns::

        {
            "us": {"powerball": float|None, "megaMillions": float|None},
            "canadian": {"lottoMax": float|None, "lotto649": float|None},
        }
    """
    cfg = load_config()
    cache_key = "all_jackpots"

    if not force_refresh:
        hit = _get_cached(cache_key, cfg.cache_ttl)
        if hit is not None:
            return hit  # type: ignore[return-value]

    # Cache miss (or forced refresh): run every scraper and store the result.
    fresh = {
        "us": {
            "powerball": scrape_powerball(),
            "megaMillions": scrape_mega_millions(),
        },
        "canadian": scrape_canadian_lotteries(),
    }
    _set_cached(cache_key, fresh)
    return fresh
|
||||
Reference in New Issue
Block a user