Add ThreatHunt agent backend/frontend scaffolding

This commit is contained in:
2025-12-29 10:22:57 -05:00
parent dc2dcd02c1
commit d0c9f88268
35 changed files with 21929 additions and 42 deletions

1
backend/app/__init__.py Normal file
View File

@@ -0,0 +1 @@
"""Backend initialization."""

View File

@@ -0,0 +1,67 @@
import asyncio
async def debated_generate(provider, prompt: str) -> str:
"""
Minimal behind-the-scenes debate.
Same logic for all apps.
Advisory only. No execution.
"""
planner = f"""
You are the Planner.
Give structured advisory guidance only.
No execution. No tools.
Request:
{prompt}
"""
critic = f"""
You are the Critic.
Identify risks, missing steps, and assumptions.
No execution. No tools.
Request:
{prompt}
"""
pragmatist = f"""
You are the Pragmatist.
Suggest the safest and simplest approach.
No execution. No tools.
Request:
{prompt}
"""
planner_task = provider.generate(planner)
critic_task = provider.generate(critic)
prag_task = provider.generate(pragmatist)
planner_resp, critic_resp, prag_resp = await asyncio.gather(
planner_task, critic_task, prag_task
)
judge = f"""
You are the Judge.
Merge the three responses into ONE final advisory answer.
Rules:
- Advisory only
- No execution
- Clearly list risks and assumptions
- Be concise
Planner:
{planner_resp}
Critic:
{critic_resp}
Pragmatist:
{prag_resp}
"""
final = await provider.generate(judge)
return final

View File

@@ -0,0 +1,16 @@
"""Analyst-assist agent module for ThreatHunt.
Provides read-only guidance on CSV artifact data, analytical pivots, and hypotheses.
Agents are advisory only and do not execute actions or modify data.
"""
from .core import ThreatHuntAgent
from .providers import LLMProvider, LocalProvider, NetworkedProvider, OnlineProvider
__all__ = [
"ThreatHuntAgent",
"LLMProvider",
"LocalProvider",
"NetworkedProvider",
"OnlineProvider",
]

View File

@@ -0,0 +1,59 @@
"""Configuration for agent settings."""
import os
from typing import Literal
class AgentConfig:
"""Configuration for analyst-assist agents."""
# Provider type: 'local', 'networked', 'online', or 'auto'
PROVIDER_TYPE: Literal["local", "networked", "online", "auto"] = os.getenv(
"THREAT_HUNT_AGENT_PROVIDER", "auto"
)
# Local provider settings
LOCAL_MODEL_PATH: str | None = os.getenv("THREAT_HUNT_LOCAL_MODEL_PATH")
# Networked provider settings
NETWORKED_ENDPOINT: str | None = os.getenv("THREAT_HUNT_NETWORKED_ENDPOINT")
NETWORKED_API_KEY: str | None = os.getenv("THREAT_HUNT_NETWORKED_KEY")
# Online provider settings
ONLINE_API_PROVIDER: str = os.getenv("THREAT_HUNT_ONLINE_PROVIDER", "openai")
ONLINE_API_KEY: str | None = os.getenv("THREAT_HUNT_ONLINE_API_KEY")
ONLINE_MODEL: str | None = os.getenv("THREAT_HUNT_ONLINE_MODEL")
# Agent behavior settings
MAX_RESPONSE_TOKENS: int = int(
os.getenv("THREAT_HUNT_AGENT_MAX_TOKENS", "1024")
)
ENABLE_REASONING: bool = os.getenv(
"THREAT_HUNT_AGENT_REASONING", "true"
).lower() in ("true", "1", "yes")
CONVERSATION_HISTORY_LENGTH: int = int(
os.getenv("THREAT_HUNT_AGENT_HISTORY_LENGTH", "10")
)
# Privacy settings
FILTER_SENSITIVE_DATA: bool = os.getenv(
"THREAT_HUNT_AGENT_FILTER_SENSITIVE", "true"
).lower() in ("true", "1", "yes")
@classmethod
def is_agent_enabled(cls) -> bool:
"""Check if agent is enabled and properly configured."""
# Agent is disabled if no provider can be used
if cls.PROVIDER_TYPE == "auto":
return bool(
cls.LOCAL_MODEL_PATH
or cls.NETWORKED_ENDPOINT
or cls.ONLINE_API_KEY
)
elif cls.PROVIDER_TYPE == "local":
return bool(cls.LOCAL_MODEL_PATH)
elif cls.PROVIDER_TYPE == "networked":
return bool(cls.NETWORKED_ENDPOINT)
elif cls.PROVIDER_TYPE == "online":
return bool(cls.ONLINE_API_KEY)
return False

208
backend/app/agents/core.py Normal file
View File

@@ -0,0 +1,208 @@
"""Core ThreatHunt analyst-assist agent.
Provides read-only guidance on CSV artifact data, analytical pivots, and hypotheses.
Agents are advisory only - no execution, no alerts, no data modifications.
"""
import logging
from typing import Optional
from pydantic import BaseModel, Field
from .providers import LLMProvider, get_provider
logger = logging.getLogger(__name__)
class AgentContext(BaseModel):
"""Context for agent guidance requests."""
query: str = Field(
..., description="Analyst question or request for guidance"
)
dataset_name: Optional[str] = Field(None, description="Name of CSV dataset")
artifact_type: Optional[str] = Field(None, description="Artifact type (e.g., file, process, network)")
host_identifier: Optional[str] = Field(
None, description="Host name, IP, or identifier"
)
data_summary: Optional[str] = Field(
None, description="Brief description of uploaded data"
)
conversation_history: Optional[list[dict]] = Field(
default_factory=list, description="Previous messages in conversation"
)
class AgentResponse(BaseModel):
"""Response from analyst-assist agent."""
guidance: str = Field(..., description="Advisory guidance for analyst")
confidence: float = Field(
..., ge=0.0, le=1.0, description="Confidence in guidance (0-1)"
)
suggested_pivots: list[str] = Field(
default_factory=list, description="Suggested analytical directions"
)
suggested_filters: list[str] = Field(
default_factory=list, description="Suggested data filters or queries"
)
caveats: Optional[str] = Field(
None, description="Assumptions, limitations, or caveats"
)
reasoning: Optional[str] = Field(
None, description="Explanation of how guidance was generated"
)
class ThreatHuntAgent:
"""Analyst-assist agent for ThreatHunt.
Provides guidance on:
- Interpreting CSV artifact data
- Suggesting analytical pivots and filters
- Forming and testing hypotheses
Policy:
- Advisory guidance only (no execution)
- No database or schema changes
- No alert escalation
- Transparent reasoning
"""
def __init__(self, provider: Optional[LLMProvider] = None):
"""Initialize agent with LLM provider.
Args:
provider: LLM provider instance. If None, uses get_provider() with auto mode.
"""
if provider is None:
try:
provider = get_provider("auto")
except RuntimeError as e:
logger.warning(f"Could not initialize default provider: {e}")
provider = None
self.provider = provider
self.system_prompt = self._build_system_prompt()
def _build_system_prompt(self) -> str:
"""Build the system prompt that governs agent behavior."""
return """You are an analyst-assist agent for ThreatHunt, a threat hunting platform.
Your role:
- Interpret and explain CSV artifact data from Velociraptor
- Suggest analytical pivots, filters, and hypotheses
- Highlight anomalies, patterns, or points of interest
- Guide analysts without replacing their judgment
Your constraints:
- You ONLY provide guidance and suggestions
- You do NOT execute actions or tools
- You do NOT modify data or escalate alerts
- You do NOT make autonomous decisions
- You ONLY analyze data presented to you
- You explain your reasoning transparently
- You acknowledge limitations and assumptions
- You suggest next investigative steps
When responding:
1. Start with a clear, direct answer to the query
2. Explain your reasoning based on the data context provided
3. Suggest 2-4 analytical pivots the analyst might explore
4. Suggest 2-4 data filters or queries that might be useful
5. Include relevant caveats or assumptions
6. Be honest about what you cannot determine from the data
Remember: The analyst is the decision-maker. You are an assistant."""
async def assist(self, context: AgentContext) -> AgentResponse:
"""Provide guidance on artifact data and analysis.
Args:
context: Request context including query and data context.
Returns:
Guidance response with suggestions and reasoning.
Raises:
RuntimeError: If no provider is available.
"""
if not self.provider:
raise RuntimeError(
"No LLM provider available. Configure at least one of: "
"THREAT_HUNT_LOCAL_MODEL_PATH, THREAT_HUNT_NETWORKED_ENDPOINT, "
"or THREAT_HUNT_ONLINE_API_KEY"
)
# Build prompt with context
prompt = self._build_prompt(context)
try:
# Get guidance from LLM provider
guidance = await self.provider.generate(prompt, max_tokens=1024)
# Parse response into structured format
response = self._parse_response(guidance, context)
logger.info(
f"Agent assisted with query: {context.query[:50]}... "
f"(dataset: {context.dataset_name})"
)
return response
except Exception as e:
logger.error(f"Error generating guidance: {e}")
raise
def _build_prompt(self, context: AgentContext) -> str:
"""Build the prompt for the LLM."""
prompt_parts = [
f"Analyst query: {context.query}",
]
if context.dataset_name:
prompt_parts.append(f"Dataset: {context.dataset_name}")
if context.artifact_type:
prompt_parts.append(f"Artifact type: {context.artifact_type}")
if context.host_identifier:
prompt_parts.append(f"Host: {context.host_identifier}")
if context.data_summary:
prompt_parts.append(f"Data summary: {context.data_summary}")
if context.conversation_history:
prompt_parts.append("\nConversation history:")
for msg in context.conversation_history[-5:]: # Last 5 messages for context
prompt_parts.append(f" {msg.get('role', 'unknown')}: {msg.get('content', '')}")
return "\n".join(prompt_parts)
def _parse_response(self, response_text: str, context: AgentContext) -> AgentResponse:
"""Parse LLM response into structured format.
Note: This is a simplified parser. In production, use structured output
from the LLM (JSON mode, function calling, etc.) for better reliability.
"""
# For now, return a structured response based on the raw guidance
# In production, parse JSON or use structured output from LLM
return AgentResponse(
guidance=response_text,
confidence=0.8, # Placeholder
suggested_pivots=[
"Analyze temporal patterns",
"Cross-reference with known indicators",
"Examine outliers in the dataset",
"Compare with baseline behavior",
],
suggested_filters=[
"Filter by high-risk indicators",
"Sort by timestamp for timeline analysis",
"Group by host or user",
"Filter by anomaly score",
],
caveats="Guidance is based on available data context. "
"Analysts should verify findings with additional sources.",
reasoning="Analysis generated based on artifact data patterns and analyst query.",
)

View File

@@ -0,0 +1,190 @@
"""Pluggable LLM provider interface for analyst-assist agents.
Supports three provider types:
- Local: On-device or on-prem models
- Networked: Shared internal inference services
- Online: External hosted APIs
"""
import os
from abc import ABC, abstractmethod
from typing import Optional
class LLMProvider(ABC):
"""Abstract base class for LLM providers."""
@abstractmethod
async def generate(self, prompt: str, max_tokens: int = 1024) -> str:
"""Generate a response from the LLM.
Args:
prompt: The input prompt
max_tokens: Maximum tokens in response
Returns:
Generated text response
"""
pass
@abstractmethod
def is_available(self) -> bool:
"""Check if provider backend is available."""
pass
class LocalProvider(LLMProvider):
"""Local LLM provider (on-device or on-prem models)."""
def __init__(self, model_path: Optional[str] = None):
"""Initialize local provider.
Args:
model_path: Path to local model. If None, uses THREAT_HUNT_LOCAL_MODEL_PATH env var.
"""
self.model_path = model_path or os.getenv("THREAT_HUNT_LOCAL_MODEL_PATH")
self.model = None
def is_available(self) -> bool:
"""Check if local model is available."""
if not self.model_path:
return False
# In production, would verify model file exists and can be loaded
return os.path.exists(str(self.model_path))
async def generate(self, prompt: str, max_tokens: int = 1024) -> str:
"""Generate response using local model.
Note: This is a placeholder. In production, integrate with:
- llama-cpp-python for GGML models
- Ollama API
- vLLM
- Other local inference engines
"""
if not self.is_available():
raise RuntimeError("Local model not available")
# Placeholder implementation
return f"[Local model response to: {prompt[:50]}...]"
class NetworkedProvider(LLMProvider):
"""Networked LLM provider (shared internal inference services)."""
def __init__(
self,
api_endpoint: Optional[str] = None,
api_key: Optional[str] = None,
model_name: str = "default",
):
"""Initialize networked provider.
Args:
api_endpoint: URL to inference service. Defaults to env var THREAT_HUNT_NETWORKED_ENDPOINT.
api_key: API key for service. Defaults to env var THREAT_HUNT_NETWORKED_KEY.
model_name: Model name/ID on the service.
"""
self.api_endpoint = api_endpoint or os.getenv("THREAT_HUNT_NETWORKED_ENDPOINT")
self.api_key = api_key or os.getenv("THREAT_HUNT_NETWORKED_KEY")
self.model_name = model_name
def is_available(self) -> bool:
"""Check if networked service is available."""
return bool(self.api_endpoint)
async def generate(self, prompt: str, max_tokens: int = 1024) -> str:
"""Generate response using networked service.
Note: This is a placeholder. In production, integrate with:
- Internal inference service API
- LLM inference container cluster
- Enterprise inference gateway
"""
if not self.is_available():
raise RuntimeError("Networked service not available")
# Placeholder implementation
return f"[Networked response from {self.model_name}: {prompt[:50]}...]"
class OnlineProvider(LLMProvider):
"""Online LLM provider (external hosted APIs)."""
def __init__(
self,
api_provider: str = "openai",
api_key: Optional[str] = None,
model_name: Optional[str] = None,
):
"""Initialize online provider.
Args:
api_provider: Provider name (openai, anthropic, google, etc.)
api_key: API key. Defaults to env var THREAT_HUNT_ONLINE_API_KEY.
model_name: Model name. Defaults to env var THREAT_HUNT_ONLINE_MODEL.
"""
self.api_provider = api_provider
self.api_key = api_key or os.getenv("THREAT_HUNT_ONLINE_API_KEY")
self.model_name = model_name or os.getenv(
"THREAT_HUNT_ONLINE_MODEL", f"{api_provider}-default"
)
def is_available(self) -> bool:
"""Check if online API is available."""
return bool(self.api_key)
async def generate(self, prompt: str, max_tokens: int = 1024) -> str:
"""Generate response using online API.
Note: This is a placeholder. In production, integrate with:
- OpenAI API (GPT-3.5, GPT-4, etc.)
- Anthropic Claude API
- Google Gemini API
- Other hosted LLM services
"""
if not self.is_available():
raise RuntimeError("Online API not available or API key not set")
# Placeholder implementation
return f"[Online {self.api_provider} response: {prompt[:50]}...]"
def get_provider(provider_type: str = "auto") -> LLMProvider:
"""Get an LLM provider based on configuration.
Args:
provider_type: Type of provider to use: 'local', 'networked', 'online', or 'auto'.
'auto' attempts to use the first available provider in order:
local -> networked -> online.
Returns:
Configured LLM provider instance.
Raises:
RuntimeError: If no provider is available.
"""
# Explicit provider selection
if provider_type == "local":
provider = LocalProvider()
elif provider_type == "networked":
provider = NetworkedProvider()
elif provider_type == "online":
provider = OnlineProvider()
elif provider_type == "auto":
# Try providers in order of preference
for Provider in [LocalProvider, NetworkedProvider, OnlineProvider]:
provider = Provider()
if provider.is_available():
return provider
raise RuntimeError(
"No LLM provider available. Configure at least one of: "
"THREAT_HUNT_LOCAL_MODEL_PATH, THREAT_HUNT_NETWORKED_ENDPOINT, "
"or THREAT_HUNT_ONLINE_API_KEY"
)
else:
raise ValueError(f"Unknown provider type: {provider_type}")
if not provider.is_available():
raise RuntimeError(f"{provider_type} provider not available")
return provider

View File

@@ -0,0 +1 @@
"""API routes initialization."""

View File

@@ -0,0 +1 @@
"""API route modules."""

View File

@@ -0,0 +1,170 @@
"""API routes for analyst-assist agent."""
import logging
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel, Field
from app.agents.core import ThreatHuntAgent, AgentContext, AgentResponse
from app.agents.config import AgentConfig
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/agent", tags=["agent"])
# Global agent instance (lazy-loaded)
_agent: ThreatHuntAgent | None = None
def get_agent() -> ThreatHuntAgent:
"""Get or create the agent instance."""
global _agent
if _agent is None:
if not AgentConfig.is_agent_enabled():
raise HTTPException(
status_code=503,
detail="Analyst-assist agent is not configured. "
"Please configure an LLM provider.",
)
_agent = ThreatHuntAgent()
return _agent
class AssistRequest(BaseModel):
"""Request for agent assistance."""
query: str = Field(
..., description="Analyst question or request for guidance"
)
dataset_name: str | None = Field(
None, description="Name of CSV dataset being analyzed"
)
artifact_type: str | None = Field(
None, description="Type of artifact (e.g., FileList, ProcessList, NetworkConnections)"
)
host_identifier: str | None = Field(
None, description="Host name, IP address, or identifier"
)
data_summary: str | None = Field(
None, description="Brief summary or context about the uploaded data"
)
conversation_history: list[dict] | None = Field(
None, description="Previous messages for context"
)
class AssistResponse(BaseModel):
"""Response with agent guidance."""
guidance: str
confidence: float
suggested_pivots: list[str]
suggested_filters: list[str]
caveats: str | None = None
reasoning: str | None = None
@router.post(
"/assist",
response_model=AssistResponse,
summary="Get analyst-assist guidance",
description="Request guidance on CSV artifact data, analytical pivots, and hypotheses. "
"Agent provides advisory guidance only - no execution.",
)
async def agent_assist(request: AssistRequest) -> AssistResponse:
"""Provide analyst-assist guidance on artifact data.
The agent will:
- Explain and interpret the provided data context
- Suggest analytical pivots the analyst might explore
- Suggest data filters or queries that might be useful
- Highlight assumptions, limitations, and caveats
The agent will NOT:
- Execute any tools or actions
- Escalate findings to alerts
- Modify any data or schema
- Make autonomous decisions
Args:
request: Assistance request with query and context
Returns:
Guidance response with suggestions and reasoning
Raises:
HTTPException: If agent is not configured (503) or request fails
"""
try:
agent = get_agent()
# Build context
context = AgentContext(
query=request.query,
dataset_name=request.dataset_name,
artifact_type=request.artifact_type,
host_identifier=request.host_identifier,
data_summary=request.data_summary,
conversation_history=request.conversation_history or [],
)
# Get guidance
response = await agent.assist(context)
logger.info(
f"Agent assisted analyst with query: {request.query[:50]}... "
f"(host: {request.host_identifier}, artifact: {request.artifact_type})"
)
return AssistResponse(
guidance=response.guidance,
confidence=response.confidence,
suggested_pivots=response.suggested_pivots,
suggested_filters=response.suggested_filters,
caveats=response.caveats,
reasoning=response.reasoning,
)
except RuntimeError as e:
logger.error(f"Agent error: {e}")
raise HTTPException(
status_code=503,
detail=f"Agent unavailable: {str(e)}",
)
except Exception as e:
logger.exception(f"Unexpected error in agent_assist: {e}")
raise HTTPException(
status_code=500,
detail="Error generating guidance. Please try again.",
)
@router.get(
"/health",
summary="Check agent health",
description="Check if agent is configured and ready to assist.",
)
async def agent_health() -> dict:
"""Check agent availability and configuration.
Returns:
Health status with configuration details
"""
try:
agent = get_agent()
provider_type = agent.provider.__class__.__name__ if agent.provider else "None"
return {
"status": "healthy",
"provider": provider_type,
"max_tokens": AgentConfig.MAX_RESPONSE_TOKENS,
"reasoning_enabled": AgentConfig.ENABLE_REASONING,
}
except HTTPException:
return {
"status": "unavailable",
"reason": "No LLM provider configured",
"configured_providers": {
"local": bool(AgentConfig.LOCAL_MODEL_PATH),
"networked": bool(AgentConfig.NETWORKED_ENDPOINT),
"online": bool(AgentConfig.ONLINE_API_KEY),
},
}

35
backend/app/main.py Normal file
View File

@@ -0,0 +1,35 @@
"""ThreatHunt backend application."""
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from app.api.routes import agent
# Create FastAPI application
app = FastAPI(
title="ThreatHunt API",
description="Analyst-assist threat hunting platform with agent guidance",
version="0.1.0",
)
# Configure CORS
app.add_middleware(
CORSMiddleware,
allow_origins=["*"], # In production, restrict to known domains
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Include routes
app.include_router(agent.router)
@app.get("/", tags=["health"])
async def root():
"""API health check."""
return {
"service": "ThreatHunt API",
"status": "running",
"docs": "/docs",
}

21
backend/requirements.txt Normal file
View File

@@ -0,0 +1,21 @@
fastapi==0.104.1
uvicorn[standard]==0.24.0
pydantic==2.5.0
pydantic-settings==2.1.0
# Optional LLM provider dependencies
# Uncomment based on your deployment choice:
# For local models (GGML, Ollama, etc.)
# llama-cpp-python==0.2.15
# ollama==0.0.11
# For online providers (OpenAI, Anthropic, Google)
# openai==1.3.5
# anthropic==0.7.1
# google-generativeai==0.3.0
# For development
pytest==7.4.3
pytest-asyncio==0.21.1
httpx==0.25.1

17
backend/run.py Normal file
View File

@@ -0,0 +1,17 @@
"""Entry point for backend server."""
import logging
import uvicorn
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
if __name__ == "__main__":
uvicorn.run(
"app.main:app",
host="0.0.0.0",
port=8000,
reload=True,
)