feat: interactive network map, IOC highlighting, AUP hunt selector, type filters

- NetworkMap: hunt-scoped force-directed graph with click-to-inspect popover
- NetworkMap: zoom/pan (wheel, drag, buttons), viewport transform
- NetworkMap: clickable IP/Host/Domain/URL legend chips to filter node types
- NetworkMap: brighter colors, 20% smaller nodes
- DatasetViewer: IOC columns highlighted with colored headers + cell tinting
- AUPScanner: hunt dropdown replacing dataset checkboxes, auto-select all
- Rename 'Social Media (Personal)' theme to 'Social Media' with DB migration
- Fix /api/hunts timeout: Dataset.rows lazy='noload' (was selectin cascade)
- Add OS column mapping to normalizer
- Full backend services, DB models, alembic migrations, new routes
- New components: Dashboard, HuntManager, FileUpload, NetworkMap, etc.
- Docker Compose deployment with nginx reverse proxy
2026-03-01 14:00:20 -05:00 · 2026-02-19 15:41:15 -05:00
parent d0c9f88268
commit 9b98ab9614
92 changed files with 13042 additions and 1089 deletions
--- a/backend/app/services/scanner.py
+++ b/backend/app/services/scanner.py
@@ -0,0 +1,233 @@
+"""AUP Keyword Scanner — searches dataset rows, hunts, annotations, and
+messages for keyword matches.
+
+Scanning is done in Python (not SQL LIKE on JSON columns) for portability
+across SQLite / PostgreSQL and to provide per-cell match context.
+"""
+
+import logging
+import re
+from dataclasses import dataclass, field
+
+from sqlalchemy import select, func
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db.models import (
+    KeywordTheme,
+    Keyword,
+    DatasetRow,
+    Dataset,
+    Hunt,
+    Annotation,
+    Message,
+    Conversation,
+)
+
+logger = logging.getLogger(__name__)
+
+# Maximum number of DatasetRow records fetched per query while scanning datasets.
+BATCH_SIZE = 500
+
+
+@dataclass
+class ScanHit:
+    """One keyword match found in a piece of scanned text."""
+
+    # Name and color of the theme the matching keyword belongs to.
+    theme_name: str
+    theme_color: str
+    # Keyword value as stored on the theme (the raw regex source for regex keywords).
+    keyword: str
+    source_type: str       # dataset_row | hunt | annotation | message
+    # Id of the dataset row / hunt / annotation / message that matched.
+    source_id: str | int
+    # Column name for dataset rows; "name", "description", "text" or "content" otherwise.
+    field: str
+    # The scanned text itself, cut to its first 200 characters for display.
+    matched_value: str
+    # The two fields below are only filled in for dataset_row hits.
+    row_index: int | None = None
+    dataset_name: str | None = None
+
+
+@dataclass
+class ScanResult:
+    """Accumulator for one scan run: the hits found plus summary counters."""
+
+    # Set to len(hits) once all sources have been scanned.
+    total_hits: int = 0
+    hits: list[ScanHit] = field(default_factory=list)
+    # Number of enabled themes loaded for the scan.
+    themes_scanned: int = 0
+    # Number of keyword patterns that were compiled (invalid ones are skipped).
+    keywords_scanned: int = 0
+    # Dataset rows visited; hunts, annotations and messages are not counted here.
+    rows_scanned: int = 0
+
+
+class KeywordScanner:
+    """Scans multiple data sources for keyword/regex matches.
+
+    Every query is issued through the ``AsyncSession`` passed to the
+    constructor. Matching itself happens in Python on the fetched values.
+    """
+
+    def __init__(self, db: AsyncSession):
+        self.db = db
+
+    # ── Public API ────────────────────────────────────────────────────
+
+    async def scan(
+        self,
+        dataset_ids: list[str] | None = None,
+        theme_ids: list[str] | None = None,
+        scan_hunts: bool = True,
+        scan_annotations: bool = True,
+        scan_messages: bool = True,
+    ) -> dict:
+        """Run a full AUP scan and return dict matching ScanResponse.
+
+        Args:
+            dataset_ids: Restrict row scanning to these datasets. ``None`` or
+                an empty list scans every dataset.
+            theme_ids: Restrict the scan to these themes. ``None`` or an empty
+                list uses every enabled theme.
+            scan_hunts: Also scan hunt names and descriptions.
+            scan_annotations: Also scan annotation text.
+            scan_messages: Also scan user messages.
+
+        Returns:
+            A dict with the keys ``total_hits``, ``hits``, ``themes_scanned``,
+            ``keywords_scanned`` and ``rows_scanned``. When no enabled theme
+            is found, nothing is scanned and all counters are zero.
+        """
+        themes = await self._load_themes(theme_ids)
+        if not themes:
+            return self._to_response(ScanResult())
+
+        # Compile once up front; the same patterns are reused for every text.
+        patterns = self._compile_patterns(themes)
+        result = ScanResult(
+            themes_scanned=len(themes),
+            keywords_scanned=sum(len(kws) for kws in patterns.values()),
+        )
+
+        # Dataset rows are always scanned; the other sources are opt-out.
+        await self._scan_datasets(patterns, result, dataset_ids)
+        if scan_hunts:
+            await self._scan_hunts(patterns, result)
+        if scan_annotations:
+            await self._scan_annotations(patterns, result)
+        if scan_messages:
+            await self._scan_messages(patterns, result)
+
+        result.total_hits = len(result.hits)
+        return self._to_response(result)
+
+    # ── Internal ──────────────────────────────────────────────────────
+
+    @staticmethod
+    def _to_response(result: ScanResult) -> dict:
+        """Convert a ScanResult (and its hits) into plain dicts."""
+        return {
+            "total_hits": result.total_hits,
+            "hits": [h.__dict__ for h in result.hits],
+            "themes_scanned": result.themes_scanned,
+            "keywords_scanned": result.keywords_scanned,
+            "rows_scanned": result.rows_scanned,
+        }
+
+    async def _load_themes(self, theme_ids: list[str] | None) -> list[KeywordTheme]:
+        """Load enabled themes, optionally limited to ``theme_ids``."""
+        # NOTE(review): _compile_patterns reads ``theme.keywords``; under an
+        # AsyncSession this presumably relies on that relationship being
+        # eagerly loaded by the model configuration — confirm in the models.
+        q = select(KeywordTheme).where(KeywordTheme.enabled == True)  # noqa: E712
+        if theme_ids:
+            q = q.where(KeywordTheme.id.in_(theme_ids))
+        result = await self.db.execute(q)
+        return list(result.scalars().all())
+
+    def _compile_patterns(
+        self, themes: list[KeywordTheme]
+    ) -> dict[tuple[str, str, str], list[tuple[str, re.Pattern]]]:
+        """Returns {(theme_id, theme_name, theme_color): [(keyword_value, compiled_pattern), ...]}
+
+        All patterns are case-insensitive. Non-regex keywords are escaped so
+        they match literally. Keywords with an empty value or an invalid
+        regex are logged and left out.
+        """
+        patterns: dict[tuple[str, str, str], list[tuple[str, re.Pattern]]] = {}
+        for theme in themes:
+            compiled = []
+            for kw in theme.keywords:
+                if not kw.value:
+                    # An empty pattern matches every text and would turn each
+                    # scanned cell into a hit; a None value cannot be compiled.
+                    logger.warning("Empty keyword in theme '%s', skipping", theme.name)
+                    continue
+                source = kw.value if kw.is_regex else re.escape(kw.value)
+                try:
+                    pat = re.compile(source, re.IGNORECASE)
+                except re.error:
+                    logger.warning("Invalid regex pattern '%s' in theme '%s', skipping",
+                                   kw.value, theme.name)
+                    continue
+                compiled.append((kw.value, pat))
+            patterns[(theme.id, theme.name, theme.color)] = compiled
+        return patterns
+
+    def _match_text(
+        self,
+        text: str,
+        patterns: dict,
+        source_type: str,
+        source_id: str | int,
+        field_name: str,
+        hits: list[ScanHit],
+        row_index: int | None = None,
+        dataset_name: str | None = None,
+    ) -> None:
+        """Check text against all compiled patterns, append hits.
+
+        One ScanHit is appended per matching keyword. Empty or ``None`` text
+        is ignored.
+        """
+        if not text:
+            return
+        # Built on the first match only; identical for every hit on this text.
+        preview: str | None = None
+        for (_theme_id, theme_name, theme_color), keyword_patterns in patterns.items():
+            for kw_value, pat in keyword_patterns:
+                if not pat.search(text):
+                    continue
+                if preview is None:
+                    # Truncate matched_value for display
+                    preview = text[:200] + ("…" if len(text) > 200 else "")
+                hits.append(ScanHit(
+                    theme_name=theme_name,
+                    theme_color=theme_color,
+                    keyword=kw_value,
+                    source_type=source_type,
+                    source_id=source_id,
+                    field=field_name,
+                    matched_value=preview,
+                    row_index=row_index,
+                    dataset_name=dataset_name,
+                ))
+
+    async def _scan_datasets(
+        self, patterns: dict, result: ScanResult, dataset_ids: list[str] | None
+    ) -> None:
+        """Scan dataset rows in batches of BATCH_SIZE.
+
+        Every non-null cell is converted with ``str()`` and matched;
+        ``result.rows_scanned`` is incremented once per row.
+        """
+        # Build dataset name lookup
+        ds_q = select(Dataset.id, Dataset.name)
+        if dataset_ids:
+            ds_q = ds_q.where(Dataset.id.in_(dataset_ids))
+        ds_result = await self.db.execute(ds_q)
+        ds_map = {ds_id: ds_name for ds_id, ds_name in ds_result.all()}
+
+        if not ds_map:
+            return
+
+        # Select only the columns that are read, instead of full ORM entities.
+        row_q_base = select(
+            DatasetRow.id,
+            DatasetRow.dataset_id,
+            DatasetRow.row_index,
+            DatasetRow.data,
+        ).where(
+            DatasetRow.dataset_id.in_(list(ds_map))
+        ).order_by(DatasetRow.id)
+
+        # Keyset pagination: continue after the last id seen rather than using
+        # a growing OFFSET, which makes the database re-walk all skipped rows.
+        # Assumes DatasetRow.id is a unique key.
+        last_id = None
+        while True:
+            page_q = row_q_base
+            if last_id is not None:
+                page_q = page_q.where(DatasetRow.id > last_id)
+            rows = (await self.db.execute(page_q.limit(BATCH_SIZE))).all()
+            if not rows:
+                break
+
+            for row_id, dataset_id, row_index, data in rows:
+                result.rows_scanned += 1
+                for col_name, cell_value in (data or {}).items():
+                    if cell_value is None:
+                        continue
+                    self._match_text(
+                        str(cell_value), patterns, "dataset_row", row_id,
+                        col_name, result.hits,
+                        row_index=row_index,
+                        dataset_name=ds_map.get(dataset_id),
+                    )
+
+            # A short page means the last row has been reached.
+            if len(rows) < BATCH_SIZE:
+                break
+            last_id = rows[-1][0]
+
+    async def _scan_hunts(self, patterns: dict, result: ScanResult) -> None:
+        """Scan hunt names and descriptions."""
+        hunts_result = await self.db.execute(
+            select(Hunt.id, Hunt.name, Hunt.description)
+        )
+        for hunt_id, name, description in hunts_result.all():
+            self._match_text(name, patterns, "hunt", hunt_id, "name", result.hits)
+            # _match_text ignores an empty or missing description.
+            self._match_text(description, patterns, "hunt", hunt_id, "description", result.hits)
+
+    async def _scan_annotations(self, patterns: dict, result: ScanResult) -> None:
+        """Scan annotation text."""
+        ann_result = await self.db.execute(select(Annotation.id, Annotation.text))
+        for ann_id, text in ann_result.all():
+            self._match_text(text, patterns, "annotation", ann_id, "text", result.hits)
+
+    async def _scan_messages(self, patterns: dict, result: ScanResult) -> None:
+        """Scan conversation messages (user messages only)."""
+        msg_result = await self.db.execute(
+            select(Message.id, Message.content).where(Message.role == "user")
+        )
+        for msg_id, content in msg_result.all():
+            self._match_text(content, patterns, "message", msg_id, "content", result.hits)