Files
ThreatHunt/backend/app/services/scanner.py
mblanke 9b98ab9614 feat: interactive network map, IOC highlighting, AUP hunt selector, type filters
- NetworkMap: hunt-scoped force-directed graph with click-to-inspect popover
- NetworkMap: zoom/pan (wheel, drag, buttons), viewport transform
- NetworkMap: clickable IP/Host/Domain/URL legend chips to filter node types
- NetworkMap: brighter colors, 20% smaller nodes
- DatasetViewer: IOC columns highlighted with colored headers + cell tinting
- AUPScanner: hunt dropdown replacing dataset checkboxes, auto-select all
- Rename 'Social Media (Personal)' theme to 'Social Media' with DB migration
- Fix /api/hunts timeout: Dataset.rows lazy='noload' (was selectin cascade)
- Add OS column mapping to normalizer
- Full backend services, DB models, alembic migrations, new routes
- New components: Dashboard, HuntManager, FileUpload, NetworkMap, etc.
- Docker Compose deployment with nginx reverse proxy
2026-02-19 15:41:15 -05:00

234 lines
8.2 KiB
Python

"""AUP Keyword Scanner — searches dataset rows, hunts, annotations, and
messages for keyword matches.
Scanning is done in Python (not SQL LIKE on JSON columns) for portability
across SQLite / PostgreSQL and to provide per-cell match context.
"""
import logging
import re
from dataclasses import dataclass, field
from sqlalchemy import select, func
from sqlalchemy.ext.asyncio import AsyncSession
from app.db.models import (
KeywordTheme,
Keyword,
DatasetRow,
Dataset,
Hunt,
Annotation,
Message,
Conversation,
)
logger = logging.getLogger(__name__)
BATCH_SIZE = 500
@dataclass
class ScanHit:
theme_name: str
theme_color: str
keyword: str
source_type: str # dataset_row | hunt | annotation | message
source_id: str | int
field: str
matched_value: str
row_index: int | None = None
dataset_name: str | None = None
@dataclass
class ScanResult:
total_hits: int = 0
hits: list[ScanHit] = field(default_factory=list)
themes_scanned: int = 0
keywords_scanned: int = 0
rows_scanned: int = 0
class KeywordScanner:
"""Scans multiple data sources for keyword/regex matches."""
def __init__(self, db: AsyncSession):
self.db = db
# ── Public API ────────────────────────────────────────────────────
async def scan(
self,
dataset_ids: list[str] | None = None,
theme_ids: list[str] | None = None,
scan_hunts: bool = True,
scan_annotations: bool = True,
scan_messages: bool = True,
) -> dict:
"""Run a full AUP scan and return dict matching ScanResponse."""
# Load themes + keywords
themes = await self._load_themes(theme_ids)
if not themes:
return ScanResult().__dict__
# Pre-compile patterns per theme
patterns = self._compile_patterns(themes)
result = ScanResult(
themes_scanned=len(themes),
keywords_scanned=sum(len(kws) for kws in patterns.values()),
)
# Scan dataset rows
await self._scan_datasets(patterns, result, dataset_ids)
# Scan hunts
if scan_hunts:
await self._scan_hunts(patterns, result)
# Scan annotations
if scan_annotations:
await self._scan_annotations(patterns, result)
# Scan messages
if scan_messages:
await self._scan_messages(patterns, result)
result.total_hits = len(result.hits)
return {
"total_hits": result.total_hits,
"hits": [h.__dict__ for h in result.hits],
"themes_scanned": result.themes_scanned,
"keywords_scanned": result.keywords_scanned,
"rows_scanned": result.rows_scanned,
}
# ── Internal ──────────────────────────────────────────────────────
async def _load_themes(self, theme_ids: list[str] | None) -> list[KeywordTheme]:
q = select(KeywordTheme).where(KeywordTheme.enabled == True) # noqa: E712
if theme_ids:
q = q.where(KeywordTheme.id.in_(theme_ids))
result = await self.db.execute(q)
return list(result.scalars().all())
def _compile_patterns(
self, themes: list[KeywordTheme]
) -> dict[tuple[str, str, str], list[tuple[str, re.Pattern]]]:
"""Returns {(theme_id, theme_name, theme_color): [(keyword_value, compiled_pattern), ...]}"""
patterns: dict[tuple[str, str, str], list[tuple[str, re.Pattern]]] = {}
for theme in themes:
key = (theme.id, theme.name, theme.color)
compiled = []
for kw in theme.keywords:
try:
if kw.is_regex:
pat = re.compile(kw.value, re.IGNORECASE)
else:
pat = re.compile(re.escape(kw.value), re.IGNORECASE)
compiled.append((kw.value, pat))
except re.error:
logger.warning("Invalid regex pattern '%s' in theme '%s', skipping",
kw.value, theme.name)
patterns[key] = compiled
return patterns
def _match_text(
self,
text: str,
patterns: dict,
source_type: str,
source_id: str | int,
field_name: str,
hits: list[ScanHit],
row_index: int | None = None,
dataset_name: str | None = None,
) -> None:
"""Check text against all compiled patterns, append hits."""
if not text:
return
for (theme_id, theme_name, theme_color), keyword_patterns in patterns.items():
for kw_value, pat in keyword_patterns:
if pat.search(text):
# Truncate matched_value for display
matched_preview = text[:200] + ("" if len(text) > 200 else "")
hits.append(ScanHit(
theme_name=theme_name,
theme_color=theme_color,
keyword=kw_value,
source_type=source_type,
source_id=source_id,
field=field_name,
matched_value=matched_preview,
row_index=row_index,
dataset_name=dataset_name,
))
async def _scan_datasets(
self, patterns: dict, result: ScanResult, dataset_ids: list[str] | None
) -> None:
"""Scan dataset rows in batches."""
# Build dataset name lookup
ds_q = select(Dataset.id, Dataset.name)
if dataset_ids:
ds_q = ds_q.where(Dataset.id.in_(dataset_ids))
ds_result = await self.db.execute(ds_q)
ds_map = {r[0]: r[1] for r in ds_result.fetchall()}
if not ds_map:
return
# Iterate rows in batches
offset = 0
row_q_base = select(DatasetRow).where(
DatasetRow.dataset_id.in_(list(ds_map.keys()))
).order_by(DatasetRow.id)
while True:
rows_result = await self.db.execute(
row_q_base.offset(offset).limit(BATCH_SIZE)
)
rows = rows_result.scalars().all()
if not rows:
break
for row in rows:
result.rows_scanned += 1
data = row.data or {}
for col_name, cell_value in data.items():
if cell_value is None:
continue
text = str(cell_value)
self._match_text(
text, patterns, "dataset_row", row.id,
col_name, result.hits,
row_index=row.row_index,
dataset_name=ds_map.get(row.dataset_id),
)
offset += BATCH_SIZE
if len(rows) < BATCH_SIZE:
break
async def _scan_hunts(self, patterns: dict, result: ScanResult) -> None:
"""Scan hunt names and descriptions."""
hunts_result = await self.db.execute(select(Hunt))
for hunt in hunts_result.scalars().all():
self._match_text(hunt.name, patterns, "hunt", hunt.id, "name", result.hits)
if hunt.description:
self._match_text(hunt.description, patterns, "hunt", hunt.id, "description", result.hits)
async def _scan_annotations(self, patterns: dict, result: ScanResult) -> None:
"""Scan annotation text."""
ann_result = await self.db.execute(select(Annotation))
for ann in ann_result.scalars().all():
self._match_text(ann.text, patterns, "annotation", ann.id, "text", result.hits)
async def _scan_messages(self, patterns: dict, result: ScanResult) -> None:
"""Scan conversation messages (user messages only)."""
msg_result = await self.db.execute(
select(Message).where(Message.role == "user")
)
for msg in msg_result.scalars().all():
self._match_text(msg.content, patterns, "message", msg.id, "content", result.hits)