ThreatHunt/backend/app/agents/core.py

"""Core ThreatHunt analyst-assist agent.

Provides read-only guidance on CSV artifact data, analytical pivots, and hypotheses.
Agents are advisory only - no execution, no alerts, no data modifications.
"""

import logging
from typing import Optional
from pydantic import BaseModel, Field

from .providers import LLMProvider, get_provider

logger = logging.getLogger(__name__)


class AgentContext(BaseModel):
    """Context for agent guidance requests."""

    query: str = Field(
        ..., description="Analyst question or request for guidance"
    )
    dataset_name: Optional[str] = Field(None, description="Name of CSV dataset")
    artifact_type: Optional[str] = Field(None, description="Artifact type (e.g., file, process, network)")
    host_identifier: Optional[str] = Field(
        None, description="Host name, IP, or identifier"
    )
    data_summary: Optional[str] = Field(
        None, description="Brief description of uploaded data"
    )
    conversation_history: Optional[list[dict]] = Field(
        default_factory=list, description="Previous messages in conversation"
    )


class AgentResponse(BaseModel):
    """Response from analyst-assist agent."""

    guidance: str = Field(..., description="Advisory guidance for analyst")
    confidence: float = Field(
        ..., ge=0.0, le=1.0, description="Confidence in guidance (0-1)"
    )
    suggested_pivots: list[str] = Field(
        default_factory=list, description="Suggested analytical directions"
    )
    suggested_filters: list[str] = Field(
        default_factory=list, description="Suggested data filters or queries"
    )
    caveats: Optional[str] = Field(
        None, description="Assumptions, limitations, or caveats"
    )
    reasoning: Optional[str] = Field(
        None, description="Explanation of how guidance was generated"
    )


class ThreatHuntAgent:
    """Analyst-assist agent for ThreatHunt.

    Provides guidance on:
    - Interpreting CSV artifact data
    - Suggesting analytical pivots and filters
    - Forming and testing hypotheses

    Policy:
    - Advisory guidance only (no execution)
    - No database or schema changes
    - No alert escalation
    - Transparent reasoning
    """

    def __init__(self, provider: Optional[LLMProvider] = None):
        """Initialize agent with LLM provider.

        Args:
            provider: LLM provider instance. If None, uses get_provider() with auto mode.
        """
        if provider is None:
            try:
                provider = get_provider("auto")
            except RuntimeError as e:
                logger.warning(f"Could not initialize default provider: {e}")
                provider = None

        self.provider = provider
        self.system_prompt = self._build_system_prompt()

    def _build_system_prompt(self) -> str:
        """Build the system prompt that governs agent behavior."""
        return """You are an analyst-assist agent for ThreatHunt, a threat hunting platform.

Your role:
- Interpret and explain CSV artifact data from Velociraptor
- Suggest analytical pivots, filters, and hypotheses
- Highlight anomalies, patterns, or points of interest
- Guide analysts without replacing their judgment

Your constraints:
- You ONLY provide guidance and suggestions
- You do NOT execute actions or tools
- You do NOT modify data or escalate alerts
- You do NOT make autonomous decisions
- You ONLY analyze data presented to you
- You explain your reasoning transparently
- You acknowledge limitations and assumptions
- You suggest next investigative steps

When responding:
1. Start with a clear, direct answer to the query
2. Explain your reasoning based on the data context provided
3. Suggest 2-4 analytical pivots the analyst might explore
4. Suggest 2-4 data filters or queries that might be useful
5. Include relevant caveats or assumptions
6. Be honest about what you cannot determine from the data

Remember: The analyst is the decision-maker. You are an assistant."""

    async def assist(self, context: AgentContext) -> AgentResponse:
        """Provide guidance on artifact data and analysis.

        Args:
            context: Request context including query and data context.

        Returns:
            Guidance response with suggestions and reasoning.

        Raises:
            RuntimeError: If no provider is available.
        """
        if not self.provider:
            raise RuntimeError(
                "No LLM provider available. Configure at least one of: "
                "THREAT_HUNT_LOCAL_MODEL_PATH, THREAT_HUNT_NETWORKED_ENDPOINT, "
                "or THREAT_HUNT_ONLINE_API_KEY"
            )

        # Build prompt with context
        prompt = self._build_prompt(context)

        try:
            # Get guidance from LLM provider
            guidance = await self.provider.generate(prompt, max_tokens=1024)

            # Parse response into structured format
            response = self._parse_response(guidance, context)

            logger.info(
                f"Agent assisted with query: {context.query[:50]}... "
                f"(dataset: {context.dataset_name})"
            )

            return response

        except Exception as e:
            logger.error(f"Error generating guidance: {e}")
            raise

    def _build_prompt(self, context: AgentContext) -> str:
        """Build the prompt for the LLM."""
        prompt_parts = [
            f"Analyst query: {context.query}",
        ]

        if context.dataset_name:
            prompt_parts.append(f"Dataset: {context.dataset_name}")

        if context.artifact_type:
            prompt_parts.append(f"Artifact type: {context.artifact_type}")

        if context.host_identifier:
            prompt_parts.append(f"Host: {context.host_identifier}")

        if context.data_summary:
            prompt_parts.append(f"Data summary: {context.data_summary}")

        if context.conversation_history:
            prompt_parts.append("\nConversation history:")
            for msg in context.conversation_history[-5:]:  # Last 5 messages for context
                prompt_parts.append(f"  {msg.get('role', 'unknown')}: {msg.get('content', '')}")

        return "\n".join(prompt_parts)

    def _parse_response(self, response_text: str, context: AgentContext) -> AgentResponse:
        """Parse LLM response into structured format.

        Note: This is a simplified parser. In production, use structured output
        from the LLM (JSON mode, function calling, etc.) for better reliability.
        """
        # For now, return a structured response based on the raw guidance
        # In production, parse JSON or use structured output from LLM
        return AgentResponse(
            guidance=response_text,
            confidence=0.8,  # Placeholder
            suggested_pivots=[
                "Analyze temporal patterns",
                "Cross-reference with known indicators",
                "Examine outliers in the dataset",
                "Compare with baseline behavior",
            ],
            suggested_filters=[
                "Filter by high-risk indicators",
                "Sort by timestamp for timeline analysis",
                "Group by host or user",
                "Filter by anomaly score",
            ],
            caveats="Guidance is based on available data context. "
            "Analysts should verify findings with additional sources.",
            reasoning="Analysis generated based on artifact data patterns and analyst query.",
        )