mirror of
https://github.com/mblanke/ThreatHunt.git
synced 2026-03-01 14:00:20 -05:00
feat: interactive network map, IOC highlighting, AUP hunt selector, type filters
- NetworkMap: hunt-scoped force-directed graph with click-to-inspect popover - NetworkMap: zoom/pan (wheel, drag, buttons), viewport transform - NetworkMap: clickable IP/Host/Domain/URL legend chips to filter node types - NetworkMap: brighter colors, 20% smaller nodes - DatasetViewer: IOC columns highlighted with colored headers + cell tinting - AUPScanner: hunt dropdown replacing dataset checkboxes, auto-select all - Rename 'Social Media (Personal)' theme to 'Social Media' with DB migration - Fix /api/hunts timeout: Dataset.rows lazy='noload' (was selectin cascade) - Add OS column mapping to normalizer - Full backend services, DB models, alembic migrations, new routes - New components: Dashboard, HuntManager, FileUpload, NetworkMap, etc. - Docker Compose deployment with nginx reverse proxy
This commit is contained in:
233
backend/app/services/scanner.py
Normal file
233
backend/app/services/scanner.py
Normal file
@@ -0,0 +1,233 @@
|
||||
"""AUP Keyword Scanner — searches dataset rows, hunts, annotations, and
|
||||
messages for keyword matches.
|
||||
|
||||
Scanning is done in Python (not SQL LIKE on JSON columns) for portability
|
||||
across SQLite / PostgreSQL and to provide per-cell match context.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from sqlalchemy import select, func
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.db.models import (
|
||||
KeywordTheme,
|
||||
Keyword,
|
||||
DatasetRow,
|
||||
Dataset,
|
||||
Hunt,
|
||||
Annotation,
|
||||
Message,
|
||||
Conversation,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
BATCH_SIZE = 500
|
||||
|
||||
|
||||
@dataclass
|
||||
class ScanHit:
|
||||
theme_name: str
|
||||
theme_color: str
|
||||
keyword: str
|
||||
source_type: str # dataset_row | hunt | annotation | message
|
||||
source_id: str | int
|
||||
field: str
|
||||
matched_value: str
|
||||
row_index: int | None = None
|
||||
dataset_name: str | None = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class ScanResult:
|
||||
total_hits: int = 0
|
||||
hits: list[ScanHit] = field(default_factory=list)
|
||||
themes_scanned: int = 0
|
||||
keywords_scanned: int = 0
|
||||
rows_scanned: int = 0
|
||||
|
||||
|
||||
class KeywordScanner:
|
||||
"""Scans multiple data sources for keyword/regex matches."""
|
||||
|
||||
def __init__(self, db: AsyncSession):
|
||||
self.db = db
|
||||
|
||||
# ── Public API ────────────────────────────────────────────────────
|
||||
|
||||
async def scan(
|
||||
self,
|
||||
dataset_ids: list[str] | None = None,
|
||||
theme_ids: list[str] | None = None,
|
||||
scan_hunts: bool = True,
|
||||
scan_annotations: bool = True,
|
||||
scan_messages: bool = True,
|
||||
) -> dict:
|
||||
"""Run a full AUP scan and return dict matching ScanResponse."""
|
||||
# Load themes + keywords
|
||||
themes = await self._load_themes(theme_ids)
|
||||
if not themes:
|
||||
return ScanResult().__dict__
|
||||
|
||||
# Pre-compile patterns per theme
|
||||
patterns = self._compile_patterns(themes)
|
||||
result = ScanResult(
|
||||
themes_scanned=len(themes),
|
||||
keywords_scanned=sum(len(kws) for kws in patterns.values()),
|
||||
)
|
||||
|
||||
# Scan dataset rows
|
||||
await self._scan_datasets(patterns, result, dataset_ids)
|
||||
|
||||
# Scan hunts
|
||||
if scan_hunts:
|
||||
await self._scan_hunts(patterns, result)
|
||||
|
||||
# Scan annotations
|
||||
if scan_annotations:
|
||||
await self._scan_annotations(patterns, result)
|
||||
|
||||
# Scan messages
|
||||
if scan_messages:
|
||||
await self._scan_messages(patterns, result)
|
||||
|
||||
result.total_hits = len(result.hits)
|
||||
return {
|
||||
"total_hits": result.total_hits,
|
||||
"hits": [h.__dict__ for h in result.hits],
|
||||
"themes_scanned": result.themes_scanned,
|
||||
"keywords_scanned": result.keywords_scanned,
|
||||
"rows_scanned": result.rows_scanned,
|
||||
}
|
||||
|
||||
# ── Internal ──────────────────────────────────────────────────────
|
||||
|
||||
async def _load_themes(self, theme_ids: list[str] | None) -> list[KeywordTheme]:
|
||||
q = select(KeywordTheme).where(KeywordTheme.enabled == True) # noqa: E712
|
||||
if theme_ids:
|
||||
q = q.where(KeywordTheme.id.in_(theme_ids))
|
||||
result = await self.db.execute(q)
|
||||
return list(result.scalars().all())
|
||||
|
||||
def _compile_patterns(
|
||||
self, themes: list[KeywordTheme]
|
||||
) -> dict[tuple[str, str, str], list[tuple[str, re.Pattern]]]:
|
||||
"""Returns {(theme_id, theme_name, theme_color): [(keyword_value, compiled_pattern), ...]}"""
|
||||
patterns: dict[tuple[str, str, str], list[tuple[str, re.Pattern]]] = {}
|
||||
for theme in themes:
|
||||
key = (theme.id, theme.name, theme.color)
|
||||
compiled = []
|
||||
for kw in theme.keywords:
|
||||
try:
|
||||
if kw.is_regex:
|
||||
pat = re.compile(kw.value, re.IGNORECASE)
|
||||
else:
|
||||
pat = re.compile(re.escape(kw.value), re.IGNORECASE)
|
||||
compiled.append((kw.value, pat))
|
||||
except re.error:
|
||||
logger.warning("Invalid regex pattern '%s' in theme '%s', skipping",
|
||||
kw.value, theme.name)
|
||||
patterns[key] = compiled
|
||||
return patterns
|
||||
|
||||
def _match_text(
|
||||
self,
|
||||
text: str,
|
||||
patterns: dict,
|
||||
source_type: str,
|
||||
source_id: str | int,
|
||||
field_name: str,
|
||||
hits: list[ScanHit],
|
||||
row_index: int | None = None,
|
||||
dataset_name: str | None = None,
|
||||
) -> None:
|
||||
"""Check text against all compiled patterns, append hits."""
|
||||
if not text:
|
||||
return
|
||||
for (theme_id, theme_name, theme_color), keyword_patterns in patterns.items():
|
||||
for kw_value, pat in keyword_patterns:
|
||||
if pat.search(text):
|
||||
# Truncate matched_value for display
|
||||
matched_preview = text[:200] + ("…" if len(text) > 200 else "")
|
||||
hits.append(ScanHit(
|
||||
theme_name=theme_name,
|
||||
theme_color=theme_color,
|
||||
keyword=kw_value,
|
||||
source_type=source_type,
|
||||
source_id=source_id,
|
||||
field=field_name,
|
||||
matched_value=matched_preview,
|
||||
row_index=row_index,
|
||||
dataset_name=dataset_name,
|
||||
))
|
||||
|
||||
async def _scan_datasets(
|
||||
self, patterns: dict, result: ScanResult, dataset_ids: list[str] | None
|
||||
) -> None:
|
||||
"""Scan dataset rows in batches."""
|
||||
# Build dataset name lookup
|
||||
ds_q = select(Dataset.id, Dataset.name)
|
||||
if dataset_ids:
|
||||
ds_q = ds_q.where(Dataset.id.in_(dataset_ids))
|
||||
ds_result = await self.db.execute(ds_q)
|
||||
ds_map = {r[0]: r[1] for r in ds_result.fetchall()}
|
||||
|
||||
if not ds_map:
|
||||
return
|
||||
|
||||
# Iterate rows in batches
|
||||
offset = 0
|
||||
row_q_base = select(DatasetRow).where(
|
||||
DatasetRow.dataset_id.in_(list(ds_map.keys()))
|
||||
).order_by(DatasetRow.id)
|
||||
|
||||
while True:
|
||||
rows_result = await self.db.execute(
|
||||
row_q_base.offset(offset).limit(BATCH_SIZE)
|
||||
)
|
||||
rows = rows_result.scalars().all()
|
||||
if not rows:
|
||||
break
|
||||
|
||||
for row in rows:
|
||||
result.rows_scanned += 1
|
||||
data = row.data or {}
|
||||
for col_name, cell_value in data.items():
|
||||
if cell_value is None:
|
||||
continue
|
||||
text = str(cell_value)
|
||||
self._match_text(
|
||||
text, patterns, "dataset_row", row.id,
|
||||
col_name, result.hits,
|
||||
row_index=row.row_index,
|
||||
dataset_name=ds_map.get(row.dataset_id),
|
||||
)
|
||||
|
||||
offset += BATCH_SIZE
|
||||
if len(rows) < BATCH_SIZE:
|
||||
break
|
||||
|
||||
async def _scan_hunts(self, patterns: dict, result: ScanResult) -> None:
|
||||
"""Scan hunt names and descriptions."""
|
||||
hunts_result = await self.db.execute(select(Hunt))
|
||||
for hunt in hunts_result.scalars().all():
|
||||
self._match_text(hunt.name, patterns, "hunt", hunt.id, "name", result.hits)
|
||||
if hunt.description:
|
||||
self._match_text(hunt.description, patterns, "hunt", hunt.id, "description", result.hits)
|
||||
|
||||
async def _scan_annotations(self, patterns: dict, result: ScanResult) -> None:
|
||||
"""Scan annotation text."""
|
||||
ann_result = await self.db.execute(select(Annotation))
|
||||
for ann in ann_result.scalars().all():
|
||||
self._match_text(ann.text, patterns, "annotation", ann.id, "text", result.hits)
|
||||
|
||||
async def _scan_messages(self, patterns: dict, result: ScanResult) -> None:
|
||||
"""Scan conversation messages (user messages only)."""
|
||||
msg_result = await self.db.execute(
|
||||
select(Message).where(Message.role == "user")
|
||||
)
|
||||
for msg in msg_result.scalars().all():
|
||||
self._match_text(msg.content, patterns, "message", msg.id, "content", result.hits)
|
||||
Reference in New Issue
Block a user