feat: interactive network map, IOC highlighting, AUP hunt selector, type filters

- NetworkMap: hunt-scoped force-directed graph with click-to-inspect popover
- NetworkMap: zoom/pan (wheel, drag, buttons), viewport transform
- NetworkMap: clickable IP/Host/Domain/URL legend chips to filter node types
- NetworkMap: brighter colors, 20% smaller nodes
- DatasetViewer: IOC columns highlighted with colored headers + cell tinting
- AUPScanner: hunt dropdown replacing dataset checkboxes, auto-select all
- Rename 'Social Media (Personal)' theme to 'Social Media' with DB migration
- Fix /api/hunts timeout: Dataset.rows lazy='noload' (was selectin cascade)
- Add OS column mapping to normalizer
- Full backend services, DB models, alembic migrations, new routes
- New components: Dashboard, HuntManager, FileUpload, NetworkMap, etc.
- Docker Compose deployment with nginx reverse proxy
2026-03-01 14:00:20 -05:00 · 2026-02-19 15:41:15 -05:00
parent d0c9f88268
commit 9b98ab9614
92 changed files with 13042 additions and 1089 deletions
--- a/backend/app/services/correlation.py
+++ b/backend/app/services/correlation.py
@@ -0,0 +1,400 @@
+"""Cross-hunt correlation engine — find IOC overlaps, timeline patterns, and shared TTPs.
+
+Identifies connections between hunts by analyzing:
+1. IOC values shared by datasets that belong to different hunts
+2. Overlapping dataset time ranges across hunts
+3. MITRE ATT&CK techniques common to hypotheses of different hunts
+4. Hostnames seen in several hunts (a possible sign of lateral movement)
+"""
+
+import logging
+from collections import Counter, defaultdict
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Optional
+
+from sqlalchemy import select, func, text
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db.models import Dataset, DatasetRow, Hunt, Hypothesis, EnrichmentResult
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class IOCOverlap:
+    """Shared IOC between two or more hunts/datasets."""
+    # The IOC itself, as the whitespace-stripped string form of the cell value.
+    ioc_value: str
+    # IOC type taken from the first recorded appearance ("unknown" if absent).
+    ioc_type: str
+    # Appearance records built by the engine:
+    # [{dataset_id, dataset_name, hunt_id, column, ioc_type}]
+    datasets: list[dict] = field(default_factory=list)
+    # Sorted, de-duplicated ids of the hunts the IOC was seen in.
+    hunt_ids: list[str] = field(default_factory=list)
+    # Number of occurrences of the IOC found while scanning; used for ranking.
+    count: int = 0
+    # Verdict copied from a matching EnrichmentResult, or "" when none exists.
+    enrichment_verdict: str = ""
+
+
+@dataclass
+class TimeOverlap:
+    """Overlapping time window between datasets."""
+    # Descriptors of the two datasets: {id, name, hunt_id, start, end}, where
+    # start/end are the datasets' stored time_range_start/time_range_end values.
+    dataset_a: dict = field(default_factory=dict)
+    dataset_b: dict = field(default_factory=dict)
+    # Bounds of the shared window, as datetime.isoformat() strings.
+    overlap_start: str = ""
+    overlap_end: str = ""
+    # Length of the shared window in hours (the engine rounds it to 2 decimals).
+    overlap_hours: float = 0.0
+
+
+@dataclass
+class TechniqueOverlap:
+    """Shared MITRE ATT&CK technique across hunts."""
+    # Whitespace-stripped Hypothesis.mitre_technique value used as grouping key.
+    technique_id: str
+    # Human-readable name; the engine in this module leaves it at the default.
+    technique_name: str = ""
+    # Hypotheses citing the technique:
+    # [{hypothesis_id, hypothesis_title, hunt_id, status}]
+    hypotheses: list[dict] = field(default_factory=list)
+    # Sorted, de-duplicated ids of the hunts those hypotheses belong to.
+    hunt_ids: list[str] = field(default_factory=list)
+
+
+@dataclass
+class CorrelationResult:
+    """Complete correlation analysis result."""
+    # Hunts the analysis was run over.
+    hunt_ids: list[str]
+    # One list per correlation type; each stays empty when nothing was found.
+    ioc_overlaps: list[IOCOverlap] = field(default_factory=list)
+    time_overlaps: list[TimeOverlap] = field(default_factory=list)
+    technique_overlaps: list[TechniqueOverlap] = field(default_factory=list)
+    # Host overlaps are plain dicts:
+    # {hostname, hunt_ids, dataset_count, datasets}
+    host_overlaps: list[dict] = field(default_factory=list)
+    # Human-readable, newline-separated summary of the findings.
+    summary: str = ""
+    # Sum of the lengths of the four overlap lists above.
+    total_correlations: int = 0
+
+
+class CorrelationEngine:
+    """Engine for finding correlations across hunts and datasets."""
+
+    async def correlate_hunts(
+        self,
+        hunt_ids: list[str],
+        db: AsyncSession,
+    ) -> CorrelationResult:
+        """Correlate the given hunts on IOCs, time windows, techniques and hosts.
+
+        Args:
+            hunt_ids: Ids of the hunts to compare with each other.
+            db: Session every lookup is executed on.
+
+        Returns:
+            A CorrelationResult holding the four overlap lists, their combined
+            size in ``total_correlations`` and a text ``summary``.
+        """
+        # The four lookups are awaited one after another on the same session.
+        ioc_hits = await self._find_ioc_overlaps(hunt_ids, db)
+        time_hits = await self._find_time_overlaps(hunt_ids, db)
+        technique_hits = await self._find_technique_overlaps(hunt_ids, db)
+        host_hits = await self._find_host_overlaps(hunt_ids, db)
+
+        report = CorrelationResult(
+            hunt_ids=hunt_ids,
+            ioc_overlaps=ioc_hits,
+            time_overlaps=time_hits,
+            technique_overlaps=technique_hits,
+            host_overlaps=host_hits,
+            total_correlations=sum(
+                map(len, (ioc_hits, time_hits, technique_hits, host_hits))
+            ),
+        )
+        # The summary reads the totals, so it is built last.
+        report.summary = self._build_summary(report)
+        return report
+
+    async def correlate_all(self, db: AsyncSession) -> CorrelationResult:
+        """Run the correlation analysis over every hunt stored in the database.
+
+        Returns:
+            The result of ``correlate_hunts`` for all hunt ids, or a result with
+            only an explanatory ``summary`` when fewer than two hunts exist.
+        """
+        id_rows = (await db.execute(select(Hunt.id))).fetchall()
+        all_hunt_ids = [id_row[0] for id_row in id_rows]
+
+        # Correlation compares hunts with each other, so two is the minimum.
+        if len(all_hunt_ids) >= 2:
+            return await self.correlate_hunts(all_hunt_ids, db)
+
+        return CorrelationResult(
+            hunt_ids=all_hunt_ids,
+            summary="Need at least 2 hunts for correlation analysis.",
+        )
+
+    async def find_ioc_across_hunts(
+        self,
+        ioc_value: str,
+        db: AsyncSession,
+    ) -> list[dict]:
+        """Find dataset rows in which some cell equals ``ioc_value``.
+
+        Both ``row.data`` and ``row.normalized_data`` are searched. Values are
+        compared as whitespace-stripped strings, the same normalization
+        ``_find_ioc_overlaps`` applies when it collects IOC values, so an IOC
+        reported there can be looked up here.
+
+        Args:
+            ioc_value: IOC to look for; surrounding whitespace is ignored.
+            db: Session used to load the rows.
+
+        Returns:
+            One dict per matching row with the keys ``dataset_id``,
+            ``dataset_name``, ``hunt_id``, ``row_index`` and ``column``
+            (the first matching column of that row). Empty when
+            ``ioc_value`` is blank.
+        """
+        from itertools import chain
+
+        needle = (ioc_value or "").strip()
+        if not needle:
+            # A blank needle would otherwise "match" every empty cell.
+            return []
+
+        # NOTE: this is an in-Python scan of at most 5000 joined rows, not a
+        # database-side JSON search. The query has no ORDER BY, so which rows
+        # fall inside the cap is up to the database.
+        stmt = select(DatasetRow, Dataset).join(
+            Dataset, DatasetRow.dataset_id == Dataset.id
+        )
+        result = await db.execute(stmt.limit(5000))
+
+        occurrences = []
+        for row, dataset in result.all():
+            # Chain the two mappings instead of merging them: a merge lets a
+            # normalized value hide the raw value stored under the same key.
+            cells = chain(
+                (row.data or {}).items(),
+                (row.normalized_data or {}).items(),
+            )
+            column = next(
+                (col for col, val in cells if str(val).strip() == needle),
+                None,
+            )
+            if column is not None:
+                occurrences.append({
+                    "dataset_id": dataset.id,
+                    "dataset_name": dataset.name,
+                    "hunt_id": dataset.hunt_id,
+                    "row_index": row.row_index,
+                    "column": column,
+                })
+
+        return occurrences
+
+    # ── IOC overlap detection ─────────────────────────────────────────
+
+    async def _find_ioc_overlaps(
+        self,
+        hunt_ids: list[str],
+        db: AsyncSession,
+    ) -> list[IOCOverlap]:
+        """Find IOC values that appear in datasets from different hunts.
+
+        For every dataset of the given hunts that declares ``ioc_columns``, up
+        to 2000 rows are scanned and the non-empty, whitespace-stripped values
+        of those columns are collected.
+
+        Args:
+            hunt_ids: Hunts whose datasets are scanned.
+            db: Session used for all queries.
+
+        Returns:
+            At most 100 IOCOverlap objects for IOCs seen in two or more hunts,
+            most frequent first. ``datasets`` holds one record per
+            (dataset, column) the IOC was seen in, ``count`` the total number
+            of cells that held it.
+        """
+        stmt = select(Dataset).where(Dataset.hunt_id.in_(hunt_ids))
+        result = await db.execute(stmt)
+        datasets = result.scalars().all()
+
+        if len(datasets) < 2:
+            return []
+
+        # IOC value -> {(dataset_id, column): appearance record}. Keying by
+        # dataset and column keeps one record per place the IOC lives instead
+        # of one per row, which would bloat the result with duplicates.
+        ioc_map: dict[str, dict[tuple, dict]] = defaultdict(dict)
+        # IOC value -> number of cells holding it (ranking weight).
+        ioc_counts: Counter = Counter()
+
+        for dataset in datasets:
+            ioc_columns = dataset.ioc_columns
+            if not ioc_columns:
+                continue
+
+            rows_stmt = select(DatasetRow).where(
+                DatasetRow.dataset_id == dataset.id
+            ).limit(2000)
+            rows_result = await db.execute(rows_stmt)
+
+            for row in rows_result.scalars().all():
+                data = row.data or {}
+                for col, ioc_type in ioc_columns.items():
+                    value = str(data.get(col) or "").strip()
+                    if not value:
+                        continue
+                    ioc_counts[value] += 1
+                    seen = ioc_map[value]
+                    key = (dataset.id, col)
+                    if key not in seen:
+                        seen[key] = {
+                            "dataset_id": dataset.id,
+                            "dataset_name": dataset.name,
+                            "hunt_id": dataset.hunt_id,
+                            "column": col,
+                            "ioc_type": ioc_type,
+                        }
+
+        # Keep only IOCs that span at least two hunts.
+        overlaps = []
+        for ioc_value, seen in ioc_map.items():
+            appearances = list(seen.values())
+            hunt_set = {a["hunt_id"] for a in appearances if a["hunt_id"]}
+            if len(hunt_set) >= 2:
+                overlaps.append(IOCOverlap(
+                    ioc_value=ioc_value,
+                    ioc_type=appearances[0].get("ioc_type", "unknown"),
+                    datasets=appearances,
+                    hunt_ids=sorted(hunt_set),
+                    count=ioc_counts[ioc_value],
+                ))
+
+        # Rank and truncate BEFORE touching enrichment data, so the enrichment
+        # lookup is bounded by the 100 returned IOCs rather than by every
+        # overlapping value.
+        overlaps.sort(key=lambda x: x.count, reverse=True)
+        top = overlaps[:100]
+        if not top:
+            return []
+
+        # One IN query instead of one query per IOC.
+        enrich_stmt = select(EnrichmentResult).where(
+            EnrichmentResult.ioc_value.in_([o.ioc_value for o in top])
+        )
+        enrich_result = await db.execute(enrich_stmt)
+        verdicts: dict[str, str] = {}
+        for enrichment in enrich_result.scalars().all():
+            # Keep the first non-empty verdict found for each IOC.
+            if enrichment.verdict and enrichment.ioc_value not in verdicts:
+                verdicts[enrichment.ioc_value] = enrichment.verdict
+
+        for overlap in top:
+            overlap.enrichment_verdict = verdicts.get(overlap.ioc_value, "")
+
+        return top
+
+    # ── Time window overlap ───────────────────────────────────────────
+
+    async def _find_time_overlaps(
+        self,
+        hunt_ids: list[str],
+        db: AsyncSession,
+    ) -> list[TimeOverlap]:
+        """Find datasets across hunts with overlapping time ranges.
+
+        Only datasets with both ``time_range_start`` and ``time_range_end``
+        set are considered, and only pairs from different hunts are compared.
+        Datasets whose bounds cannot be parsed as ISO-8601 are skipped.
+
+        Args:
+            hunt_ids: Hunts whose datasets are compared.
+            db: Session used for the dataset query.
+
+        Returns:
+            At most 50 TimeOverlap objects, longest overlap first.
+        """
+        # Local import: the module only imports ``datetime`` itself.
+        from datetime import timezone
+
+        stmt = select(Dataset).where(
+            Dataset.hunt_id.in_(hunt_ids),
+            Dataset.time_range_start.isnot(None),
+            Dataset.time_range_end.isnot(None),
+        )
+        result = await db.execute(stmt)
+        datasets = result.scalars().all()
+
+        # Parse every dataset's bounds once instead of once per pair.
+        parsed = []
+        for dataset in datasets:
+            bounds = self._parse_time_range(dataset)
+            if bounds is not None:
+                parsed.append((dataset, bounds))
+
+        def as_utc(moment: datetime) -> datetime:
+            # Assumption: a timestamp without offset is UTC. Only applied when
+            # a pair mixes naive and aware values (see below).
+            if moment.tzinfo is None:
+                return moment.replace(tzinfo=timezone.utc)
+            return moment
+
+        overlaps = []
+        for i, (ds_a, range_a) in enumerate(parsed):
+            for ds_b, range_b in parsed[i + 1:]:
+                if ds_a.hunt_id == ds_b.hunt_id:
+                    continue  # Same hunt, skip
+
+                bounds = (*range_a, *range_b)
+                aware = [b.tzinfo is not None for b in bounds]
+                if any(aware) and not all(aware):
+                    # Comparing naive with aware datetimes raises TypeError,
+                    # so bring the pair onto a common footing first.
+                    bounds = tuple(as_utc(b) for b in bounds)
+                a_start, a_end, b_start, b_end = bounds
+
+                overlap_start = max(a_start, b_start)
+                overlap_end = min(a_end, b_end)
+                if overlap_start >= overlap_end:
+                    continue  # Ranges do not intersect
+
+                hours = (overlap_end - overlap_start).total_seconds() / 3600
+                overlaps.append(TimeOverlap(
+                    dataset_a={
+                        "id": ds_a.id,
+                        "name": ds_a.name,
+                        "hunt_id": ds_a.hunt_id,
+                        "start": ds_a.time_range_start,
+                        "end": ds_a.time_range_end,
+                    },
+                    dataset_b={
+                        "id": ds_b.id,
+                        "name": ds_b.name,
+                        "hunt_id": ds_b.hunt_id,
+                        "start": ds_b.time_range_start,
+                        "end": ds_b.time_range_end,
+                    },
+                    overlap_start=overlap_start.isoformat(),
+                    overlap_end=overlap_end.isoformat(),
+                    overlap_hours=round(hours, 2),
+                ))
+
+        overlaps.sort(key=lambda x: x.overlap_hours, reverse=True)
+        return overlaps[:50]
+
+    @staticmethod
+    def _parse_time_range(dataset) -> Optional[tuple[datetime, datetime]]:
+        """Parse a dataset's ISO-8601 range bounds; return None if unparseable."""
+        bounds = []
+        for raw in (dataset.time_range_start, dataset.time_range_end):
+            try:
+                # A trailing "Z" is only understood by fromisoformat() on
+                # Python 3.11+, so spell it as an explicit UTC offset.
+                iso = raw[:-1] + "+00:00" if raw.endswith("Z") else raw
+                bounds.append(datetime.fromisoformat(iso))
+            except (ValueError, TypeError, AttributeError):
+                return None
+        return bounds[0], bounds[1]
+
+    # ── MITRE technique overlap ───────────────────────────────────────
+
+    async def _find_technique_overlaps(
+        self,
+        hunt_ids: list[str],
+        db: AsyncSession,
+    ) -> list[TechniqueOverlap]:
+        """Group hypotheses by MITRE technique and keep techniques spanning hunts.
+
+        Args:
+            hunt_ids: Hunts whose hypotheses are inspected.
+            db: Session used for the hypothesis query.
+
+        Returns:
+            One TechniqueOverlap per technique cited by hypotheses of at least
+            two different hunts, in first-seen order.
+        """
+        query = select(Hypothesis).where(
+            Hypothesis.hunt_id.in_(hunt_ids),
+            Hypothesis.mitre_technique.isnot(None),
+        )
+        hypotheses = (await db.execute(query)).scalars().all()
+
+        # technique id -> hypotheses citing it
+        by_technique: dict[str, list[dict]] = defaultdict(list)
+        for hypothesis in hypotheses:
+            technique_id = hypothesis.mitre_technique.strip()
+            if not technique_id:
+                continue  # Whitespace-only technique field
+            by_technique[technique_id].append({
+                "hypothesis_id": hypothesis.id,
+                "hypothesis_title": hypothesis.title,
+                "hunt_id": hypothesis.hunt_id,
+                "status": hypothesis.status,
+            })
+
+        shared = []
+        for technique_id, entries in by_technique.items():
+            hunts = {entry["hunt_id"] for entry in entries if entry["hunt_id"]}
+            if len(hunts) < 2:
+                continue
+            shared.append(TechniqueOverlap(
+                technique_id=technique_id,
+                hypotheses=entries,
+                hunt_ids=sorted(hunts),
+            ))
+
+        return shared
+
+    # ── Host overlap ──────────────────────────────────────────────────
+
+    async def _find_host_overlaps(
+        self,
+        hunt_ids: list[str],
+        db: AsyncSession,
+    ) -> list[dict]:
+        """Find hostnames that appear in datasets from different hunts.
+
+        Useful for detecting lateral movement patterns. A dataset takes part
+        when ``normalized_columns`` maps at least one of its columns to a
+        hostname-like canonical name; up to 2000 of its rows are scanned and
+        hostnames are compared upper-cased and whitespace-stripped.
+
+        Args:
+            hunt_ids: Hunts whose datasets are scanned.
+            db: Session used for all queries.
+
+        Returns:
+            At most 50 dicts with the keys ``hostname``, ``hunt_ids`` (sorted),
+            ``dataset_count`` (distinct datasets mentioning the host) and
+            ``datasets`` (up to 10 of them), ordered by ``dataset_count``
+            descending.
+        """
+        host_canonical_names = (
+            "hostname", "host", "computer_name", "src_host", "dst_host",
+        )
+
+        stmt = select(Dataset).where(Dataset.hunt_id.in_(hunt_ids))
+        result = await db.execute(stmt)
+        datasets = result.scalars().all()
+
+        # hostname -> {dataset_id: record}. One record per dataset, however
+        # many rows mention the host; per-row records would turn
+        # ``dataset_count`` into a row count and let a single dataset fill
+        # the 10-entry ``datasets`` preview.
+        host_map: dict[str, dict] = defaultdict(dict)
+
+        for dataset in datasets:
+            norm_cols = dataset.normalized_columns or {}
+            # Original column names whose canonical name denotes a host.
+            hostname_cols = [
+                orig for orig, canon in norm_cols.items()
+                if canon in host_canonical_names
+            ]
+            if not hostname_cols:
+                continue
+
+            rows_stmt = select(DatasetRow).where(
+                DatasetRow.dataset_id == dataset.id
+            ).limit(2000)
+            rows_result = await db.execute(rows_stmt)
+
+            for row in rows_result.scalars().all():
+                data = row.data or {}
+                for col in hostname_cols:
+                    host_name = str(data.get(col) or "").strip().upper()
+                    if not host_name:
+                        continue
+                    per_dataset = host_map[host_name]
+                    if dataset.id not in per_dataset:
+                        per_dataset[dataset.id] = {
+                            "dataset_id": dataset.id,
+                            "dataset_name": dataset.name,
+                            "hunt_id": dataset.hunt_id,
+                        }
+
+        # Filter to hosts appearing in multiple hunts
+        overlaps = []
+        for host, per_dataset in host_map.items():
+            appearances = list(per_dataset.values())
+            hunt_set = {a["hunt_id"] for a in appearances if a["hunt_id"]}
+            if len(hunt_set) >= 2:
+                overlaps.append({
+                    "hostname": host,
+                    "hunt_ids": sorted(hunt_set),
+                    "dataset_count": len(appearances),
+                    "datasets": appearances[:10],
+                })
+
+        overlaps.sort(key=lambda x: x["dataset_count"], reverse=True)
+        return overlaps[:50]
+
+    # ── Summary builder ───────────────────────────────────────────────
+
+    def _build_summary(self, result: CorrelationResult) -> str:
+        """Render the correlation findings as newline-separated text.
+
+        Args:
+            result: Analysis result whose overlap lists and
+                ``total_correlations`` are already filled in.
+
+        Returns:
+            A header line followed by one indented line per finding type.
+        """
+        lines = [f"Correlation analysis across {len(result.hunt_ids)} hunts:"]
+
+        # The IOC line is always present, even when nothing was shared.
+        if not result.ioc_overlaps:
+            lines.append("  - No shared IOCs found")
+        else:
+            malicious_count = sum(
+                1 for overlap in result.ioc_overlaps
+                if overlap.enrichment_verdict == "malicious"
+            )
+            lines.append(
+                f"  - {len(result.ioc_overlaps)} shared IOCs "
+                f"({malicious_count} flagged malicious)"
+            )
+
+        # The remaining finding types are mentioned only when non-empty.
+        if result.time_overlaps:
+            lines.append(f"  - {len(result.time_overlaps)} overlapping time windows")
+        if result.technique_overlaps:
+            lines.append(f"  - {len(result.technique_overlaps)} shared MITRE techniques")
+        if result.host_overlaps:
+            lines.append(
+                f"  - {len(result.host_overlaps)} hosts appearing in multiple hunts "
+                "(potential lateral movement)"
+            )
+
+        if result.total_correlations == 0:
+            lines.append("  No significant correlations detected.")
+
+        return "\n".join(lines)
+
+
+# Singleton: shared module-level engine instance. CorrelationEngine defines no
+# instance attributes in this module, so one object can serve every caller.
+correlation_engine = CorrelationEngine()