Mirror of https://github.com/mblanke/ThreatHunt.git (synced 2026-03-01 14:00:20 -05:00)
feat: interactive network map, IOC highlighting, AUP hunt selector, type filters
- NetworkMap: hunt-scoped force-directed graph with click-to-inspect popover
- NetworkMap: zoom/pan (wheel, drag, buttons), viewport transform
- NetworkMap: clickable IP/Host/Domain/URL legend chips to filter node types
- NetworkMap: brighter colors, 20% smaller nodes
- DatasetViewer: IOC columns highlighted with colored headers + cell tinting
- AUPScanner: hunt dropdown replacing dataset checkboxes, auto-select all
- Rename 'Social Media (Personal)' theme to 'Social Media' with DB migration
- Fix /api/hunts timeout: Dataset.rows lazy='noload' (was selectin cascade); see the sketch below
- Add OS column mapping to normalizer
- Full backend services, DB models, alembic migrations, new routes
- New components: Dashboard, HuntManager, FileUpload, NetworkMap, etc.
- Docker Compose deployment with nginx reverse proxy
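The lazy-loading fix above changes how SQLAlchemy loads the Dataset.rows relationship: lazy='selectin' eagerly fetched every row of every dataset whenever /api/hunts listed hunts, while lazy='noload' skips the rows unless a caller queries them explicitly. A minimal sketch of the pattern, assuming a hypothetical two-table schema (the names below are illustrative, not the project's actual models):

    from sqlalchemy import ForeignKey
    from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship

    class Base(DeclarativeBase):
        pass

    class DatasetRow(Base):
        __tablename__ = "dataset_rows"
        id: Mapped[int] = mapped_column(primary_key=True)
        dataset_id: Mapped[int] = mapped_column(ForeignKey("datasets.id"))

    class Dataset(Base):
        __tablename__ = "datasets"
        id: Mapped[int] = mapped_column(primary_key=True)
        name: Mapped[str]
        # lazy="noload" leaves .rows empty on ordinary queries, so listing
        # endpoints never cascade a per-dataset row fetch; callers that need
        # rows query DatasetRow directly.
        rows: Mapped[list["DatasetRow"]] = relationship(lazy="noload")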
408 backend/app/agents/core_v2.py Normal file
@@ -0,0 +1,408 @@
"""Core ThreatHunt analyst-assist agent — v2.

Uses TaskRouter to select the right model/node for each query,
real LLM providers (Ollama/OpenWebUI), and structured response parsing.
Integrates SANS RAG context from Open WebUI.
"""

import asyncio
import json
import logging
import re
import time
from typing import AsyncIterator, Optional

from pydantic import BaseModel, Field

from app.config import settings
from app.services.sans_rag import sans_rag
from .router import TaskRouter, TaskType, RoutingDecision, task_router
from .providers_v2 import OllamaProvider, OpenWebUIProvider

logger = logging.getLogger(__name__)


# ── Models ────────────────────────────────────────────────────────────


class AgentContext(BaseModel):
    """Context for agent guidance requests."""

    query: str = Field(..., description="Analyst question or request for guidance")
    dataset_name: Optional[str] = Field(None, description="Name of CSV dataset")
    artifact_type: Optional[str] = Field(None, description="Artifact type")
    host_identifier: Optional[str] = Field(None, description="Host name, IP, or identifier")
    data_summary: Optional[str] = Field(None, description="Brief description of data")
    conversation_history: Optional[list[dict]] = Field(
        default_factory=list, description="Previous messages"
    )
    active_hypotheses: Optional[list[str]] = Field(
        default_factory=list, description="Active investigation hypotheses"
    )
    annotations_summary: Optional[str] = Field(
        None, description="Summary of analyst annotations"
    )
    enrichment_summary: Optional[str] = Field(
        None, description="Summary of enrichment results"
    )
    mode: str = Field(default="quick", description="quick | deep | debate")
    model_override: Optional[str] = Field(None, description="Force a specific model")


class Perspective(BaseModel):
    """A single perspective from the debate agent."""

    role: str
    content: str
    model_used: str
    node_used: str
    latency_ms: int


class AgentResponse(BaseModel):
    """Response from analyst-assist agent."""

    guidance: str = Field(..., description="Advisory guidance for analyst")
    confidence: float = Field(..., ge=0.0, le=1.0, description="Confidence (0-1)")
    suggested_pivots: list[str] = Field(default_factory=list)
    suggested_filters: list[str] = Field(default_factory=list)
    caveats: Optional[str] = None
    reasoning: Optional[str] = None
    sans_references: list[str] = Field(
        default_factory=list, description="SANS course references"
    )
    model_used: str = Field(default="", description="Model that generated the response")
    node_used: str = Field(default="", description="Node that processed the request")
    latency_ms: int = Field(default=0, description="Total latency in ms")
    perspectives: Optional[list[Perspective]] = Field(
        None, description="Debate perspectives (only in debate mode)"
    )
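
# Example (illustrative, not part of the commit): a quick-mode request and the
# shape of the structured reply the agent returns.
#
#     ctx = AgentContext(
#         query="Is this outbound traffic volume unusual for the host?",
#         dataset_name="netflow_feb.csv",
#         host_identifier="10.0.4.17",
#         mode="quick",
#     )
#     resp = await ThreatHuntAgent().assist(ctx)
#     # -> AgentResponse(guidance="...", confidence=0.8,
#     #                  suggested_pivots=[...], suggested_filters=[...], ...)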


# ── System prompt ─────────────────────────────────────────────────────

SYSTEM_PROMPT = """You are an analyst-assist agent for ThreatHunt, a threat hunting platform.
You have access to 300GB of SANS cybersecurity course material for reference.

Your role:
- Interpret and explain CSV artifact data from Velociraptor and other forensic tools
- Suggest analytical pivots, filters, and hypotheses
- Highlight anomalies, patterns, or points of interest
- Reference relevant SANS methodologies and techniques when applicable
- Guide analysts without replacing their judgment

Your constraints:
- You ONLY provide guidance and suggestions
- You do NOT execute actions or tools
- You do NOT modify data or escalate alerts
- You explain your reasoning transparently

RESPONSE FORMAT — you MUST respond with valid JSON:
{
  "guidance": "Your main guidance text here",
  "confidence": 0.85,
  "suggested_pivots": ["Pivot 1", "Pivot 2"],
  "suggested_filters": ["filter expression 1", "filter expression 2"],
  "caveats": "Any assumptions or limitations",
  "reasoning": "How you arrived at this guidance",
  "sans_references": ["SANS SEC504: ...", "SANS FOR508: ..."]
}

Respond ONLY with the JSON object. No markdown, no code fences, no extra text."""

# ── Agent ─────────────────────────────────────────────────────────────


class ThreatHuntAgent:
    """Analyst-assist agent backed by Wile + Roadrunner LLM cluster."""

    def __init__(self, router: TaskRouter | None = None):
        self.router = router or task_router
        self.system_prompt = SYSTEM_PROMPT

    async def assist(self, context: AgentContext) -> AgentResponse:
        """Provide guidance on artifact data and analysis."""
        start = time.monotonic()

        if context.mode == "debate":
            return await self._debate_assist(context)

        # Classify task and route
        task_type = self.router.classify_task(context.query)
        if context.mode == "deep":
            task_type = TaskType.DEEP_ANALYSIS

        decision = self.router.route(task_type, model_override=context.model_override)
        logger.info(f"Routing: {decision.reason}")

        # Enrich prompt with SANS RAG context
        prompt = self._build_prompt(context)
        try:
            rag_context = await sans_rag.enrich_prompt(
                context.query,
                investigation_context=context.data_summary or "",
            )
            if rag_context:
                prompt = f"{prompt}\n\n{rag_context}"
        except Exception as e:
            logger.warning(f"SANS RAG enrichment failed: {e}")

        # Call the LLM: OpenWebUI takes a chat-style message list,
        # Ollama a raw prompt plus a system string
        provider = self.router.get_provider(decision)
        if isinstance(provider, OpenWebUIProvider):
            messages = [
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": prompt},
            ]
            result = await provider.chat(
                messages,
                max_tokens=settings.AGENT_MAX_TOKENS,
                temperature=settings.AGENT_TEMPERATURE,
            )
        else:
            result = await provider.generate(
                prompt,
                system=self.system_prompt,
                max_tokens=settings.AGENT_MAX_TOKENS,
                temperature=settings.AGENT_TEMPERATURE,
            )

        raw_text = result.get("response", "")
        latency_ms = result.get("_latency_ms", 0)

        # Parse structured response
        response = self._parse_response(raw_text, context)
        response.model_used = decision.model
        response.node_used = decision.node.value
        response.latency_ms = latency_ms

        total_ms = int((time.monotonic() - start) * 1000)
        logger.info(
            f"Agent assist: {context.query[:60]}... → "
            f"{decision.model} on {decision.node.value} "
            f"({total_ms}ms total, {latency_ms}ms LLM)"
        )

        return response

    async def assist_stream(
        self,
        context: AgentContext,
    ) -> AsyncIterator[str]:
        """Stream agent response tokens."""
        # Streaming path: no SANS RAG enrichment or JSON parsing;
        # tokens are forwarded to the caller as they arrive.
        task_type = self.router.classify_task(context.query)
        decision = self.router.route(task_type, model_override=context.model_override)
        prompt = self._build_prompt(context)

        provider = self.router.get_provider(decision)
        if isinstance(provider, OllamaProvider):
            async for token in provider.generate_stream(
                prompt,
                system=self.system_prompt,
                max_tokens=settings.AGENT_MAX_TOKENS,
                temperature=settings.AGENT_TEMPERATURE,
            ):
                yield token
        elif isinstance(provider, OpenWebUIProvider):
            messages = [
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": prompt},
            ]
            async for token in provider.chat_stream(
                messages,
                max_tokens=settings.AGENT_MAX_TOKENS,
                temperature=settings.AGENT_TEMPERATURE,
            ):
                yield token

    async def _debate_assist(self, context: AgentContext) -> AgentResponse:
        """Multi-perspective analysis using diverse models on Wile."""
        start = time.monotonic()
        prompt = self._build_prompt(context)

        # Route each perspective to a different heavy model
        roles = {
            TaskType.DEBATE_PLANNER: (
                "Planner",
                "You are the Planner for a threat hunting investigation.\n"
                "Provide a structured investigation strategy. Reference SANS methodologies.\n"
                "Focus on: investigation steps, data sources to examine, MITRE ATT&CK mapping.\n"
                "Be specific to the data context provided.\n\n",
            ),
            TaskType.DEBATE_CRITIC: (
                "Critic",
                "You are the Critic for a threat hunting investigation.\n"
                "Identify risks, false positive scenarios, missing evidence, and assumptions.\n"
                "Reference SANS training on common analyst mistakes.\n"
                "Challenge the obvious interpretation.\n\n",
            ),
            TaskType.DEBATE_PRAGMATIST: (
                "Pragmatist",
                "You are the Pragmatist for a threat hunting investigation.\n"
                "Suggest the most actionable, efficient next steps.\n"
                "Reference SANS incident response playbooks.\n"
                "Focus on: quick wins, triage priorities, what to escalate.\n\n",
            ),
        }

        async def _call_perspective(task_type: TaskType, role_name: str, prefix: str):
            decision = self.router.route(task_type)
            provider = self.router.get_provider(decision)
            full_prompt = prefix + prompt

            # Both provider types expose the same generate() signature here,
            # so one call covers Ollama and OpenWebUI alike.
            result = await provider.generate(
                full_prompt,
                system=f"You are the {role_name}. Provide analysis only. No execution.",
                max_tokens=settings.AGENT_MAX_TOKENS,
                temperature=0.4,
            )

            return Perspective(
                role=role_name,
                content=result.get("response", ""),
                model_used=decision.model,
                node_used=decision.node.value,
                latency_ms=result.get("_latency_ms", 0),
            )

        # Run perspectives in parallel
        perspective_tasks = [
            _call_perspective(tt, name, prefix)
            for tt, (name, prefix) in roles.items()
        ]
        perspectives = await asyncio.gather(*perspective_tasks)

        # Judge merges the perspectives
        judge_prompt = (
            "You are the Judge. Merge these three threat hunting perspectives into "
            "ONE final advisory answer.\n\n"
            "Rules:\n"
            "- Advisory only — no execution\n"
            "- Clearly list risks and assumptions\n"
            "- Highlight where perspectives agree and disagree\n"
            "- Provide a unified recommendation\n"
            "- Reference SANS methodologies where relevant\n\n"
        )
        for p in perspectives:
            judge_prompt += f"=== {p.role} (via {p.model_used}) ===\n{p.content}\n\n"

        judge_prompt += (
            f"\nOriginal analyst query:\n{context.query}\n\n"
            "Respond with the merged analysis in this JSON format:\n"
            '{"guidance": "...", "confidence": 0.85, "suggested_pivots": [...], '
            '"suggested_filters": [...], "caveats": "...", "reasoning": "...", '
            '"sans_references": [...]}'
        )

        judge_decision = self.router.route(TaskType.DEBATE_JUDGE)
        judge_provider = self.router.get_provider(judge_decision)

        # Same provider-agnostic generate() call for the judge pass
        judge_result = await judge_provider.generate(
            judge_prompt,
            system="You are the Judge. Merge perspectives into a final advisory answer. Respond with JSON only.",
            max_tokens=settings.AGENT_MAX_TOKENS,
            temperature=0.2,
        )

        raw_text = judge_result.get("response", "")
        response = self._parse_response(raw_text, context)
        response.model_used = judge_decision.model
        response.node_used = judge_decision.node.value
        response.latency_ms = int((time.monotonic() - start) * 1000)
        response.perspectives = list(perspectives)

        return response

    def _build_prompt(self, context: AgentContext) -> str:
        """Build the prompt with all available context."""
        parts = [f"Analyst query: {context.query}"]

        if context.dataset_name:
            parts.append(f"Dataset: {context.dataset_name}")
        if context.artifact_type:
            parts.append(f"Artifact type: {context.artifact_type}")
        if context.host_identifier:
            parts.append(f"Host: {context.host_identifier}")
        if context.data_summary:
            parts.append(f"Data summary: {context.data_summary}")
        if context.active_hypotheses:
            parts.append(f"Active hypotheses: {'; '.join(context.active_hypotheses)}")
        if context.annotations_summary:
            parts.append(f"Analyst annotations: {context.annotations_summary}")
        if context.enrichment_summary:
            parts.append(f"Enrichment data: {context.enrichment_summary}")
        if context.conversation_history:
            parts.append("\nRecent conversation:")
            # Keep only the most recent turns, each truncated to 500 chars
            for msg in context.conversation_history[-settings.AGENT_HISTORY_LENGTH:]:
                parts.append(f"  {msg.get('role', 'unknown')}: {msg.get('content', '')[:500]}")

        return "\n".join(parts)
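
    # Example (illustrative): for a populated context, the assembled prompt
    # looks like:
    #
    #     Analyst query: Is this outbound traffic volume unusual for the host?
    #     Dataset: netflow_feb.csv
    #     Host: 10.0.4.17
    #     Active hypotheses: C2 beaconing over HTTPS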

    def _parse_response(self, raw: str, context: AgentContext) -> AgentResponse:
        """Parse LLM output into structured AgentResponse.

        Tries JSON extraction first, falls back to raw text with defaults.
        """
        parsed = self._try_parse_json(raw)
        if parsed:
            return AgentResponse(
                guidance=parsed.get("guidance", raw),
                confidence=min(max(float(parsed.get("confidence", 0.7)), 0.0), 1.0),
                suggested_pivots=parsed.get("suggested_pivots", [])[:6],
                suggested_filters=parsed.get("suggested_filters", [])[:6],
                caveats=parsed.get("caveats"),
                reasoning=parsed.get("reasoning"),
                sans_references=parsed.get("sans_references", []),
            )

        # Fallback: use raw text as guidance
        return AgentResponse(
            guidance=raw.strip() or "No guidance generated. Please try rephrasing your question.",
            confidence=0.5,
            suggested_pivots=[],
            suggested_filters=[],
            caveats="Response was not in structured format. Pivots and filters may be embedded in the guidance text.",
            reasoning=None,
            sans_references=[],
        )

    def _try_parse_json(self, text: str) -> dict | None:
        """Try to extract JSON from LLM output."""
        # Direct parse
        try:
            return json.loads(text.strip())
        except json.JSONDecodeError:
            pass

        # Extract from code fences, then from the first brace-balanced object
        patterns = [
            r"```json\s*(.*?)\s*```",
            r"```\s*(.*?)\s*```",
            r"\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}",
        ]
        for pattern in patterns:
            match = re.search(pattern, text, re.DOTALL)
            if match:
                try:
                    # Fenced patterns capture group 1; the bare-object pattern
                    # has no capturing group, so use the whole match.
                    return json.loads(match.group(1) if match.lastindex else match.group(0))
                except (json.JSONDecodeError, IndexError):
                    continue

        return None
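

# Illustrative smoke test, not part of the original commit: drives one
# quick-mode request end to end. Assumes a configured router/providers and
# reachable LLM nodes; run as `python -m app.agents.core_v2`.
if __name__ == "__main__":
    async def _demo() -> None:
        agent = ThreatHuntAgent()
        ctx = AgentContext(
            query="What should I pivot on in this Velociraptor process list?",
            artifact_type="Windows.System.Pslist",
            mode="quick",
        )
        response = await agent.assist(ctx)
        print(response.guidance)
        print("pivots:", response.suggested_pivots)

    asyncio.run(_demo())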