Add ThreatHunt agent backend/frontend scaffolding

2026-03-01 14:00:20 -05:00 · 2025-12-29 10:22:57 -05:00
parent dc2dcd02c1
commit d0c9f88268
35 changed files with 21929 additions and 42 deletions
--- a/backend/app/agents/core.py
+++ b/backend/app/agents/core.py
@@ -0,0 +1,208 @@
+"""Core ThreatHunt analyst-assist agent.
+
+Provides read-only guidance on CSV artifact data, analytical pivots, and hypotheses.
+Agents are advisory only - no execution, no alerts, no data modifications.
+"""
+
+import logging
+from typing import Optional
+from pydantic import BaseModel, Field
+
+from .providers import LLMProvider, get_provider
+
+logger = logging.getLogger(__name__)
+
+
+class AgentContext(BaseModel):
+    """Context for agent guidance requests."""
+
+    query: str = Field(
+        ..., description="Analyst question or request for guidance"
+    )
+    dataset_name: Optional[str] = Field(None, description="Name of CSV dataset")
+    artifact_type: Optional[str] = Field(None, description="Artifact type (e.g., file, process, network)")
+    host_identifier: Optional[str] = Field(
+        None, description="Host name, IP, or identifier"
+    )
+    data_summary: Optional[str] = Field(
+        None, description="Brief description of uploaded data"
+    )
+    conversation_history: Optional[list[dict]] = Field(
+        default_factory=list, description="Previous messages in conversation"
+    )
+
+
+class AgentResponse(BaseModel):
+    """Response from analyst-assist agent."""
+
+    guidance: str = Field(..., description="Advisory guidance for analyst")
+    confidence: float = Field(
+        ..., ge=0.0, le=1.0, description="Confidence in guidance (0-1)"
+    )
+    suggested_pivots: list[str] = Field(
+        default_factory=list, description="Suggested analytical directions"
+    )
+    suggested_filters: list[str] = Field(
+        default_factory=list, description="Suggested data filters or queries"
+    )
+    caveats: Optional[str] = Field(
+        None, description="Assumptions, limitations, or caveats"
+    )
+    reasoning: Optional[str] = Field(
+        None, description="Explanation of how guidance was generated"
+    )
+
+
+class ThreatHuntAgent:
+    """Analyst-assist agent for ThreatHunt.
+    
+    Provides guidance on:
+    - Interpreting CSV artifact data
+    - Suggesting analytical pivots and filters
+    - Forming and testing hypotheses
+    
+    Policy:
+    - Advisory guidance only (no execution)
+    - No database or schema changes
+    - No alert escalation
+    - Transparent reasoning
+    """
+
+    def __init__(self, provider: Optional[LLMProvider] = None):
+        """Initialize agent with LLM provider.
+        
+        Args:
+            provider: LLM provider instance. If None, uses get_provider() with auto mode.
+        """
+        if provider is None:
+            try:
+                provider = get_provider("auto")
+            except RuntimeError as e:
+                logger.warning(f"Could not initialize default provider: {e}")
+                provider = None
+
+        self.provider = provider
+        self.system_prompt = self._build_system_prompt()
+
+    def _build_system_prompt(self) -> str:
+        """Build the system prompt that governs agent behavior."""
+        return """You are an analyst-assist agent for ThreatHunt, a threat hunting platform.
+
+Your role:
+- Interpret and explain CSV artifact data from Velociraptor
+- Suggest analytical pivots, filters, and hypotheses
+- Highlight anomalies, patterns, or points of interest
+- Guide analysts without replacing their judgment
+
+Your constraints:
+- You ONLY provide guidance and suggestions
+- You do NOT execute actions or tools
+- You do NOT modify data or escalate alerts
+- You do NOT make autonomous decisions
+- You ONLY analyze data presented to you
+- You explain your reasoning transparently
+- You acknowledge limitations and assumptions
+- You suggest next investigative steps
+
+When responding:
+1. Start with a clear, direct answer to the query
+2. Explain your reasoning based on the data context provided
+3. Suggest 2-4 analytical pivots the analyst might explore
+4. Suggest 2-4 data filters or queries that might be useful
+5. Include relevant caveats or assumptions
+6. Be honest about what you cannot determine from the data
+
+Remember: The analyst is the decision-maker. You are an assistant."""
+
+    async def assist(self, context: AgentContext) -> AgentResponse:
+        """Provide guidance on artifact data and analysis.
+        
+        Args:
+            context: Request context including query and data context.
+            
+        Returns:
+            Guidance response with suggestions and reasoning.
+            
+        Raises:
+            RuntimeError: If no provider is available.
+        """
+        if not self.provider:
+            raise RuntimeError(
+                "No LLM provider available. Configure at least one of: "
+                "THREAT_HUNT_LOCAL_MODEL_PATH, THREAT_HUNT_NETWORKED_ENDPOINT, "
+                "or THREAT_HUNT_ONLINE_API_KEY"
+            )
+
+        # Build prompt with context
+        prompt = self._build_prompt(context)
+
+        try:
+            # Get guidance from LLM provider
+            guidance = await self.provider.generate(prompt, max_tokens=1024)
+
+            # Parse response into structured format
+            response = self._parse_response(guidance, context)
+
+            logger.info(
+                f"Agent assisted with query: {context.query[:50]}... "
+                f"(dataset: {context.dataset_name})"
+            )
+
+            return response
+
+        except Exception as e:
+            logger.error(f"Error generating guidance: {e}")
+            raise
+
+    def _build_prompt(self, context: AgentContext) -> str:
+        """Build the prompt for the LLM."""
+        prompt_parts = [
+            f"Analyst query: {context.query}",
+        ]
+
+        if context.dataset_name:
+            prompt_parts.append(f"Dataset: {context.dataset_name}")
+
+        if context.artifact_type:
+            prompt_parts.append(f"Artifact type: {context.artifact_type}")
+
+        if context.host_identifier:
+            prompt_parts.append(f"Host: {context.host_identifier}")
+
+        if context.data_summary:
+            prompt_parts.append(f"Data summary: {context.data_summary}")
+
+        if context.conversation_history:
+            prompt_parts.append("\nConversation history:")
+            for msg in context.conversation_history[-5:]:  # Last 5 messages for context
+                prompt_parts.append(f"  {msg.get('role', 'unknown')}: {msg.get('content', '')}")
+
+        return "\n".join(prompt_parts)
+
+    def _parse_response(self, response_text: str, context: AgentContext) -> AgentResponse:
+        """Parse LLM response into structured format.
+        
+        Note: This is a simplified parser. In production, use structured output
+        from the LLM (JSON mode, function calling, etc.) for better reliability.
+        """
+        # For now, return a structured response based on the raw guidance
+        # In production, parse JSON or use structured output from LLM
+        return AgentResponse(
+            guidance=response_text,
+            confidence=0.8,  # Placeholder
+            suggested_pivots=[
+                "Analyze temporal patterns",
+                "Cross-reference with known indicators",
+                "Examine outliers in the dataset",
+                "Compare with baseline behavior",
+            ],
+            suggested_filters=[
+                "Filter by high-risk indicators",
+                "Sort by timestamp for timeline analysis",
+                "Group by host or user",
+                "Filter by anomaly score",
+            ],
+            caveats="Guidance is based on available data context. "
+            "Analysts should verify findings with additional sources.",
+            reasoning="Analysis generated based on artifact data patterns and analyst query.",
+        )