mirror of
https://github.com/mblanke/ThreatHunt.git
synced 2026-03-01 14:00:20 -05:00
- NetworkMap: hunt-scoped force-directed graph with click-to-inspect popover - NetworkMap: zoom/pan (wheel, drag, buttons), viewport transform - NetworkMap: clickable IP/Host/Domain/URL legend chips to filter node types - NetworkMap: brighter colors, 20% smaller nodes - DatasetViewer: IOC columns highlighted with colored headers + cell tinting - AUPScanner: hunt dropdown replacing dataset checkboxes, auto-select all - Rename 'Social Media (Personal)' theme to 'Social Media' with DB migration - Fix /api/hunts timeout: Dataset.rows lazy='noload' (was selectin cascade) - Add OS column mapping to normalizer - Full backend services, DB models, alembic migrations, new routes - New components: Dashboard, HuntManager, FileUpload, NetworkMap, etc. - Docker Compose deployment with nginx reverse proxy
347 lines
12 KiB
Python
347 lines
12 KiB
Python
"""SANS RAG service — queries the 300GB SANS courseware indexed in Open WebUI.
|
|
|
|
Provides contextual SANS references for threat hunting guidance.
|
|
Uses two approaches:
|
|
1. Open WebUI RAG pipeline (if configured with a knowledge collection)
|
|
2. Embedding-based semantic search against locally indexed SANS content
|
|
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
import re
|
|
import time
|
|
from dataclasses import dataclass, field
|
|
from typing import Optional
|
|
|
|
import httpx
|
|
|
|
from app.config import settings
|
|
from app.agents.providers_v2 import _get_client
|
|
from app.agents.registry import Node
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ── SANS course catalog for reference matching ────────────────────────
|
|
|
|
# Maps a SANS course ID (e.g. "FOR508") to its course title. Used to expand
# bare course IDs into "ID: Title" strings and to validate IDs found in text.
SANS_COURSES: dict[str, str] = {
    # Cyber defense / offensive operations
    "SEC401": "Security Essentials",
    "SEC504": "Hacker Tools, Techniques, and Incident Handling",
    "SEC503": "Network Monitoring and Threat Detection In-Depth",
    "SEC505": "Securing Windows and PowerShell Automation",
    "SEC506": "Securing Linux/Unix",
    "SEC510": "Public Cloud Security: AWS, Azure, and GCP",
    "SEC511": "Continuous Monitoring and Security Operations",
    "SEC530": "Defensible Security Architecture and Engineering",
    "SEC540": "Cloud Security and DevSecOps Automation",
    "SEC555": "SIEM with Tactical Analytics",
    "SEC560": "Enterprise Penetration Testing",
    "SEC565": "Red Team Operations and Adversary Emulation",
    "SEC573": "Automating Information Security with Python",
    "SEC575": "Mobile Device Security and Ethical Hacking",
    "SEC588": "Cloud Penetration Testing",
    "SEC599": "Defeating Advanced Adversaries - Purple Team Tactics",
    # Digital forensics and incident response
    "FOR408": "Windows Forensic Analysis",
    "FOR498": "Digital Acquisition and Rapid Triage",
    "FOR500": "Windows Forensic Analysis",
    "FOR508": "Advanced Incident Response, Threat Hunting, and Digital Forensics",
    "FOR509": "Enterprise Cloud Forensics and Incident Response",
    "FOR518": "Mac and iOS Forensic Analysis and Incident Response",
    "FOR572": "Advanced Network Forensics: Threat Hunting, Analysis, and Incident Response",
    "FOR578": "Cyber Threat Intelligence",
    "FOR585": "Smartphone Forensic Analysis In-Depth",
    "FOR610": "Reverse-Engineering Malware: Malware Analysis Tools and Techniques",
    "FOR710": "Reverse-Engineering Malware: Advanced Code Analysis",
    # Industrial control systems
    "ICS410": "ICS/SCADA Security Essentials",
    "ICS515": "ICS Visibility, Detection, and Response",
}
|
|
|
|
# Topic-to-course mapping for fallback recommendations
|
|
# Maps a lowercase topic keyword (or keyword stem such as "deobfusc") to the
# SANS course IDs recommended for it. Keys are matched as plain substrings of
# the lowercased query, so short keys like "c2" or "dns" match inside longer
# words as well.
TOPIC_COURSE_MAP: dict[str, list[str]] = {
    "malware": ["FOR610", "FOR710", "SEC504"],
    "reverse engineer": ["FOR610", "FOR710"],
    "incident response": ["FOR508", "SEC504"],
    "forensic": ["FOR508", "FOR500", "FOR408"],
    "windows forensic": ["FOR500", "FOR408"],
    "network forensic": ["FOR572"],
    "threat hunting": ["FOR508", "SEC504", "FOR578"],
    "threat intelligence": ["FOR578"],
    "powershell": ["SEC505", "FOR508"],
    "lateral movement": ["SEC504", "FOR508"],
    "persistence": ["FOR508", "SEC504"],
    "privilege escalation": ["SEC504", "SEC560"],
    "credential": ["SEC504", "SEC560"],
    "memory forensic": ["FOR508"],
    "disk forensic": ["FOR500", "FOR408"],
    "registry": ["FOR500", "FOR408"],
    "event log": ["FOR508", "SEC555"],
    "siem": ["SEC555"],
    "log analysis": ["SEC555", "SEC503"],
    "network monitor": ["SEC503"],
    "pcap": ["SEC503", "FOR572"],
    "cloud": ["SEC510", "SEC540", "FOR509"],
    "aws": ["SEC510", "SEC540", "FOR509"],
    "azure": ["SEC510", "FOR509"],
    "linux": ["SEC506"],
    "mobile": ["SEC575", "FOR585"],
    "penetration test": ["SEC560", "SEC565"],
    "red team": ["SEC565", "SEC599"],
    "purple team": ["SEC599"],
    "python": ["SEC573"],
    "automation": ["SEC573", "SEC540"],
    "deobfusc": ["FOR610", "SEC504"],
    "base64": ["FOR610", "SEC504"],
    "shellcode": ["FOR610", "FOR710"],
    "ransomware": ["FOR508", "FOR610"],
    "phishing": ["SEC504", "FOR578"],
    "c2": ["FOR508", "SEC504", "FOR572"],
    "command and control": ["FOR508", "SEC504"],
    "exfiltration": ["FOR508", "FOR572", "SEC503"],
    "dns": ["FOR572", "SEC503"],
    "ioc": ["FOR508", "FOR578"],
    "mitre": ["FOR508", "SEC504", "SEC599"],
    "att&ck": ["FOR508", "SEC504"],
    "velociraptor": ["FOR508"],
    "volatility": ["FOR508"],
    "scheduled task": ["FOR508", "SEC504"],
    "service": ["FOR508", "SEC504"],
    "wmi": ["FOR508", "SEC504"],
    "process": ["FOR508"],
    "dll": ["FOR610", "FOR508"],
}
|
|
|
|
|
|
@dataclass
class RAGResult:
    """Result from a RAG query.

    ``query`` and ``context`` are required; every other field has a default,
    and the list fields get a fresh list per instance.
    """

    # The question that was asked.
    query: str
    # Retrieved relevant text (empty string when nothing was retrieved).
    context: str
    # Source document names.
    sources: list[str] = field(default_factory=list)
    # SANS course IDs, optionally expanded to "ID: Title".
    course_references: list[str] = field(default_factory=list)
    # Confidence score assigned by the producer of the result.
    confidence: float = 0.0
    # Wall-clock duration of the query in milliseconds.
    latency_ms: int = 0
|
|
|
|
|
|
class SANSRAGService:
    """Service for querying SANS courseware via Open WebUI RAG pipeline.

    Sends chat-completion requests to the Open WebUI endpoint configured in
    ``settings`` and falls back to keyword-based course recommendations
    (``TOPIC_COURSE_MAP``) when the request fails.
    """

    # Course-ID shape such as SEC504, FOR508, ICS410; matched case-insensitively.
    _COURSE_ID_RE = re.compile(
        r"\b(?:SEC|FOR|ICS|MGT|AUD|DEV|LEG)\d{3}\b", re.IGNORECASE
    )

    # Upper bound on the number of source names returned by _extract_sources.
    _MAX_SOURCES = 10

    def __init__(self):
        """Read endpoint URL, API key and model name from ``settings``."""
        # Trailing slashes are stripped so paths can be appended with "/...".
        self.openwebui_url = settings.OPENWEBUI_URL.rstrip("/")
        self.api_key = settings.OPENWEBUI_API_KEY
        self.rag_model = settings.DEFAULT_FAST_MODEL
        # None until health_check() has run; then the last observed state.
        self._available: bool | None = None

    def _headers(self) -> dict:
        """Build request headers, adding a Bearer token when an API key is set."""
        h = {"Content-Type": "application/json"}
        if self.api_key:
            h["Authorization"] = f"Bearer {self.api_key}"
        return h

    async def query(
        self,
        question: str,
        context: str = "",
        max_tokens: int = 1024,
    ) -> "RAGResult":
        """Query SANS courseware for relevant context.

        Uses Open WebUI's RAG-enabled chat to retrieve from indexed SANS content.
        Falls back to topic-based course recommendations if RAG is unavailable.

        Args:
            question: The question to ask.
            context: Optional investigation context prepended to the question.
            max_tokens: Token limit forwarded to the chat-completion request.

        Returns:
            A ``RAGResult``. On RAG failure the result has an empty context,
            no sources, and a confidence of 0.3 if any course matched the
            question (0.0 otherwise). This method does not propagate errors
            from the RAG request.
        """
        start = time.monotonic()

        # Try Open WebUI RAG pipeline first
        try:
            result = await self._query_openwebui_rag(question, context, max_tokens)
        except Exception as e:
            # Deliberate best-effort boundary: any failure degrades to the
            # keyword fallback instead of surfacing to the caller.
            logger.warning("RAG query failed, using fallback: %s", e)
            courses = self._match_courses(question)
            return RAGResult(
                query=question,
                context="",
                sources=[],
                course_references=courses,
                confidence=0.3 if courses else 0.0,
                latency_ms=int((time.monotonic() - start) * 1000),
            )

        result.latency_ms = int((time.monotonic() - start) * 1000)

        # Enrich with course references if not already present
        if not result.course_references:
            result.course_references = self._match_courses(question)

        return result

    async def _query_openwebui_rag(
        self,
        question: str,
        context: str,
        max_tokens: int,
    ) -> "RAGResult":
        """Query Open WebUI with RAG context retrieval.

        Open WebUI automatically retrieves from its indexed knowledge base
        when the model is configured with a knowledge collection.

        Returns:
            A ``RAGResult`` with confidence 0.8 when the response carried
            content and 0.0 otherwise; ``latency_ms`` is left at its default.

        Raises:
            Whatever the HTTP client raises, including the error from
            ``raise_for_status()`` on a non-success response.
        """
        client = _get_client()

        system_msg = (
            "You are a SANS cybersecurity knowledge assistant. "
            "Use your indexed SANS courseware to answer the question. "
            "Always cite the specific SANS course (e.g., FOR508, SEC504) "
            "and relevant section when referencing material. "
            "If the question relates to threat hunting procedures, "
            "reference the specific SANS methodology or framework."
        )

        if context:
            user_content = f"Investigation context:\n{context}\n\nQuestion: {question}"
        else:
            user_content = question

        messages = [
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_content},
        ]

        payload = {
            "model": self.rag_model,
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": 0.2,
            "stream": False,
        }

        resp = await client.post(
            f"{self.openwebui_url}/v1/chat/completions",
            json=payload,
            headers=self._headers(),
        )
        resp.raise_for_status()
        data = resp.json()

        content = ""
        choices = data.get("choices")
        if choices:
            # "message" or "content" may be present but null; treat as empty.
            message = choices[0].get("message") or {}
            content = message.get("content") or ""

        # Extract course references from response
        course_refs = self._extract_course_refs(content)
        sources = self._extract_sources(data)

        return RAGResult(
            query=question,
            context=content,
            sources=sources,
            course_references=course_refs,
            confidence=0.8 if content else 0.0,
        )

    def _extract_course_refs(self, text: str) -> list[str]:
        """Extract SANS course references from response text.

        IDs listed in ``SANS_COURSES`` are returned as "ID: Title"; other IDs
        of the right shape are returned bare. The result is de-duplicated and
        sorted. Empty or missing text yields an empty list.
        """
        if not text:
            return []

        refs = set()
        for m in self._COURSE_ID_RE.findall(text):
            course_id = m.upper()
            if course_id in SANS_COURSES:
                refs.add(f"{course_id}: {SANS_COURSES[course_id]}")
            else:
                refs.add(course_id)
        return sorted(refs)

    @staticmethod
    def _source_label(src) -> str:
        """Return a display name for one source entry (dict or any other value)."""
        if isinstance(src, dict):
            # Prefer "name", then "title"; empty/None values fall through so
            # the result is always a usable string.
            return str(src.get("name") or src.get("title") or src)
        return str(src)

    def _extract_sources(self, api_response: dict) -> list[str]:
        """Extract source document references from Open WebUI response metadata.

        Looks at the top-level ``"sources"`` list and at ``"sources"`` inside
        each choice's ``"metadata"``. Missing or null containers are skipped.
        Names are de-duplicated in first-seen order and capped at
        ``_MAX_SOURCES`` entries.
        """
        labels: list[str] = []
        seen: set[str] = set()

        def collect(entries) -> None:
            for src in entries or ():
                label = self._source_label(src)
                if label not in seen:
                    seen.add(label)
                    labels.append(label)

        # Open WebUI may include source metadata in various formats
        collect(api_response.get("sources"))

        # Check in metadata
        for choice in api_response.get("choices") or ():
            if not isinstance(choice, dict):
                continue
            meta = choice.get("metadata")
            if isinstance(meta, dict):
                collect(meta.get("sources"))

        return labels[: self._MAX_SOURCES]  # Limit

    def _match_courses(self, query: str) -> list[str]:
        """Match query keywords to SANS courses using topic map.

        Each topic key is tested as a plain substring of the lowercased query.
        Returns at most five "ID: Title" strings, taken from the alphabetically
        sorted set of matches.
        """
        q = query.lower()
        matched = set()
        for topic, courses in TOPIC_COURSE_MAP.items():
            if topic not in q:
                continue
            for course_id in courses:
                if course_id in SANS_COURSES:
                    matched.add(f"{course_id}: {SANS_COURSES[course_id]}")
        return sorted(matched)[:5]

    async def get_course_context(self, course_id: str) -> str:
        """Get a brief course description for context injection.

        Accepts either a bare ID ("for508") or an "ID: Title" string; returns
        "ID: Title" for known courses and an empty string otherwise.
        """
        course_id = course_id.upper().split(":")[0].strip()
        if course_id in SANS_COURSES:
            return f"{course_id}: {SANS_COURSES[course_id]}"
        return ""

    async def enrich_prompt(
        self,
        query: str,
        investigation_context: str = "",
    ) -> str:
        """Generate SANS-enriched context to inject into agent prompts.

        Returns a context string with relevant SANS references, or an empty
        string when the query produced no context, courses, or sources.
        """
        result = await self.query(query, context=investigation_context, max_tokens=512)

        parts = []
        if result.context:
            parts.append(f"SANS Reference Context:\n{result.context}")
        if result.course_references:
            parts.append(f"Relevant SANS Courses: {', '.join(result.course_references)}")
        if result.sources:
            parts.append(f"Sources: {', '.join(result.sources[:5])}")

        # Joining an empty list already yields "".
        return "\n".join(parts)

    async def health_check(self) -> dict:
        """Check RAG service availability.

        Issues a GET to ``/v1/models`` with a 5-second timeout and records the
        outcome in ``self._available``. Returns a dict with ``available``,
        ``url`` and ``model``; on failure it also carries ``error``.
        """
        try:
            client = _get_client()
            resp = await client.get(
                f"{self.openwebui_url}/v1/models",
                headers=self._headers(),
                timeout=5,
            )
            available = resp.status_code == 200
            self._available = available
            return {
                "available": available,
                "url": self.openwebui_url,
                "model": self.rag_model,
            }
        except Exception as e:
            # Any failure to reach the endpoint is reported, not raised.
            self._available = False
            return {
                "available": False,
                "url": self.openwebui_url,
                "model": self.rag_model,
                "error": str(e),
            }
|
|
|
|
|
|
# Singleton
|
|
# Shared module-level instance, constructed when this module is imported.
sans_rag = SANSRAGService()
|