Files
ThreatHunt/backend/app/services/sans_rag.py
mblanke 9b98ab9614 feat: interactive network map, IOC highlighting, AUP hunt selector, type filters
- NetworkMap: hunt-scoped force-directed graph with click-to-inspect popover
- NetworkMap: zoom/pan (wheel, drag, buttons), viewport transform
- NetworkMap: clickable IP/Host/Domain/URL legend chips to filter node types
- NetworkMap: brighter colors, 20% smaller nodes
- DatasetViewer: IOC columns highlighted with colored headers + cell tinting
- AUPScanner: hunt dropdown replacing dataset checkboxes, auto-select all
- Rename 'Social Media (Personal)' theme to 'Social Media' with DB migration
- Fix /api/hunts timeout: Dataset.rows lazy='noload' (was selectin cascade)
- Add OS column mapping to normalizer
- Full backend services, DB models, alembic migrations, new routes
- New components: Dashboard, HuntManager, FileUpload, NetworkMap, etc.
- Docker Compose deployment with nginx reverse proxy
2026-02-19 15:41:15 -05:00

347 lines
12 KiB
Python

"""SANS RAG service — queries the 300GB SANS courseware indexed in Open WebUI.
Provides contextual SANS references for threat hunting guidance.
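Uses two approaches:
1. Open WebUI RAG pipeline (if configured with a knowledge collection)
2. Embedding-based semantic search against locally indexed SANS content
   (not implemented in this module; the current fallback when the RAG query
   fails is keyword-based topic-to-course matching)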
"""
import asyncio
import logging
import re
import time
from dataclasses import dataclass, field
from typing import Optional
import httpx
from app.config import settings
from app.agents.providers_v2 import _get_client
from app.agents.registry import Node
logger = logging.getLogger(__name__)

# Catalog of known SANS courses, keyed by course ID. Used to expand a bare
# course ID into an "ID: Title" reference string.
SANS_COURSES: dict[str, str] = {
    # Cyber defense / offensive operations (SEC)
    "SEC401": "Security Essentials",
    "SEC504": "Hacker Tools, Techniques, and Incident Handling",
    "SEC503": "Network Monitoring and Threat Detection In-Depth",
    "SEC505": "Securing Windows and PowerShell Automation",
    "SEC506": "Securing Linux/Unix",
    "SEC510": "Public Cloud Security: AWS, Azure, and GCP",
    "SEC511": "Continuous Monitoring and Security Operations",
    "SEC530": "Defensible Security Architecture and Engineering",
    "SEC540": "Cloud Security and DevSecOps Automation",
    "SEC555": "SIEM with Tactical Analytics",
    "SEC560": "Enterprise Penetration Testing",
    "SEC565": "Red Team Operations and Adversary Emulation",
    "SEC573": "Automating Information Security with Python",
    "SEC575": "Mobile Device Security and Ethical Hacking",
    "SEC588": "Cloud Penetration Testing",
    "SEC599": "Defeating Advanced Adversaries - Purple Team Tactics",
    # Digital forensics and incident response (FOR)
    "FOR408": "Windows Forensic Analysis",
    "FOR498": "Digital Acquisition and Rapid Triage",
    "FOR500": "Windows Forensic Analysis",
    "FOR508": "Advanced Incident Response, Threat Hunting, and Digital Forensics",
    "FOR509": "Enterprise Cloud Forensics and Incident Response",
    "FOR518": "Mac and iOS Forensic Analysis and Incident Response",
    "FOR572": "Advanced Network Forensics: Threat Hunting, Analysis, and Incident Response",
    "FOR578": "Cyber Threat Intelligence",
    "FOR585": "Smartphone Forensic Analysis In-Depth",
    "FOR610": "Reverse-Engineering Malware: Malware Analysis Tools and Techniques",
    "FOR710": "Reverse-Engineering Malware: Advanced Code Analysis",
    # Industrial control systems (ICS)
    "ICS410": "ICS/SCADA Security Essentials",
    "ICS515": "ICS Visibility, Detection, and Response",
}
# Keyword -> candidate course IDs. Consulted when course suggestions have to
# be derived from the query text alone. Keys are lowercase; several are
# deliberately stems (e.g. "forensic", "deobfusc") so inflected forms match.
TOPIC_COURSE_MAP: dict[str, list[str]] = {
    # Malware, IR and forensics
    "malware": ["FOR610", "FOR710", "SEC504"],
    "reverse engineer": ["FOR610", "FOR710"],
    "incident response": ["FOR508", "SEC504"],
    "forensic": ["FOR508", "FOR500", "FOR408"],
    "windows forensic": ["FOR500", "FOR408"],
    "network forensic": ["FOR572"],
    "threat hunting": ["FOR508", "SEC504", "FOR578"],
    "threat intelligence": ["FOR578"],
    # Attacker techniques
    "powershell": ["SEC505", "FOR508"],
    "lateral movement": ["SEC504", "FOR508"],
    "persistence": ["FOR508", "SEC504"],
    "privilege escalation": ["SEC504", "SEC560"],
    "credential": ["SEC504", "SEC560"],
    # Host artifacts
    "memory forensic": ["FOR508"],
    "disk forensic": ["FOR500", "FOR408"],
    "registry": ["FOR500", "FOR408"],
    "event log": ["FOR508", "SEC555"],
    # Logging and network monitoring
    "siem": ["SEC555"],
    "log analysis": ["SEC555", "SEC503"],
    "network monitor": ["SEC503"],
    "pcap": ["SEC503", "FOR572"],
    # Platforms
    "cloud": ["SEC510", "SEC540", "FOR509"],
    "aws": ["SEC510", "SEC540", "FOR509"],
    "azure": ["SEC510", "FOR509"],
    "linux": ["SEC506"],
    "mobile": ["SEC575", "FOR585"],
    # Offensive / purple teaming and tooling
    "penetration test": ["SEC560", "SEC565"],
    "red team": ["SEC565", "SEC599"],
    "purple team": ["SEC599"],
    "python": ["SEC573"],
    "automation": ["SEC573", "SEC540"],
    # Payload analysis
    "deobfusc": ["FOR610", "SEC504"],
    "base64": ["FOR610", "SEC504"],
    "shellcode": ["FOR610", "FOR710"],
    "ransomware": ["FOR508", "FOR610"],
    "phishing": ["SEC504", "FOR578"],
    # C2, exfiltration and indicators
    "c2": ["FOR508", "SEC504", "FOR572"],
    "command and control": ["FOR508", "SEC504"],
    "exfiltration": ["FOR508", "FOR572", "SEC503"],
    "dns": ["FOR572", "SEC503"],
    "ioc": ["FOR508", "FOR578"],
    "mitre": ["FOR508", "SEC504", "SEC599"],
    "att&ck": ["FOR508", "SEC504"],
    # Tools and Windows internals
    "velociraptor": ["FOR508"],
    "volatility": ["FOR508"],
    "scheduled task": ["FOR508", "SEC504"],
    "service": ["FOR508", "SEC504"],
    "wmi": ["FOR508", "SEC504"],
    "process": ["FOR508"],
    "dll": ["FOR610", "FOR508"],
}
@dataclass
class RAGResult:
    """Result from a RAG query.

    Attributes:
        query: The question that was asked.
        context: Retrieved relevant text (empty when nothing was retrieved).
        sources: Source document names reported alongside the answer.
        course_references: SANS course references, either ``"ID: Title"``
            for catalogued courses or a bare course ID.
        confidence: Heuristic score in the range 0.0-1.0; 0.0 means no
            usable result.
        latency_ms: Wall-clock time spent producing the result, in
            milliseconds.
    """

    query: str
    context: str  # Retrieved relevant text
    # default_factory gives every instance its own list (no shared mutable default).
    sources: list[str] = field(default_factory=list)  # Source document names
    course_references: list[str] = field(default_factory=list)  # SANS course IDs
    confidence: float = 0.0
    latency_ms: int = 0
class SANSRAGService:
    """Service for querying SANS courseware via Open WebUI RAG pipeline.

    Public entry points are ``query``, ``enrich_prompt``,
    ``get_course_context`` and ``health_check``. When the Open WebUI call
    fails, ``query`` degrades to keyword-based course suggestions built from
    ``TOPIC_COURSE_MAP``.
    """

    # Course IDs such as SEC504, FOR508 or ICS410. The prefix group is
    # non-capturing so findall() yields the full ID, not just the prefix.
    _COURSE_ID_RE = re.compile(
        r"\b(?:SEC|FOR|ICS|MGT|AUD|DEV|LEG)\d{3}\b", re.IGNORECASE
    )

    def __init__(self):
        """Read the Open WebUI endpoint, API key and model from ``settings``."""
        self.openwebui_url = settings.OPENWEBUI_URL.rstrip("/")
        self.api_key = settings.OPENWEBUI_API_KEY
        self.rag_model = settings.DEFAULT_FAST_MODEL
        # Outcome of the most recent health_check(); None until one has run.
        self._available: bool | None = None

    def _headers(self) -> dict:
        """Build request headers, adding a bearer token when an API key is set."""
        h = {"Content-Type": "application/json"}
        if self.api_key:
            h["Authorization"] = f"Bearer {self.api_key}"
        return h

    async def query(
        self,
        question: str,
        context: str = "",
        max_tokens: int = 1024,
    ) -> "RAGResult":
        """Query SANS courseware for relevant context.

        Uses Open WebUI's RAG-enabled chat to retrieve from indexed SANS content.
        Falls back to topic-based course recommendations if RAG is unavailable.

        Args:
            question: The question to ask.
            context: Optional investigation context sent along with the question.
            max_tokens: Completion token limit passed to the model.

        Returns:
            A ``RAGResult``. On fallback ``context`` and ``sources`` are empty
            and ``confidence`` is 0.3 when at least one course matched, else 0.0.
            This method does not raise on RAG failure.
        """
        start = time.monotonic()
        # Try Open WebUI RAG pipeline first
        try:
            result = await self._query_openwebui_rag(question, context, max_tokens)
            result.latency_ms = int((time.monotonic() - start) * 1000)
            # Enrich with course references if not already present
            if not result.course_references:
                result.course_references = self._match_courses(question)
            return result
        except Exception as e:
            # Deliberate best-effort boundary: any failure (network, HTTP
            # status, malformed payload) degrades to keyword matching.
            logger.warning("RAG query failed, using fallback: %s", e)
        # Fallback to topic-based matching
        courses = self._match_courses(question)
        return RAGResult(
            query=question,
            context="",
            sources=[],
            course_references=courses,
            confidence=0.3 if courses else 0.0,
            latency_ms=int((time.monotonic() - start) * 1000),
        )

    async def _query_openwebui_rag(
        self,
        question: str,
        context: str,
        max_tokens: int,
    ) -> "RAGResult":
        """Query Open WebUI with RAG context retrieval.

        Open WebUI automatically retrieves from its indexed knowledge base
        when the model is configured with a knowledge collection.

        Args:
            question: The question to ask.
            context: Investigation context; prepended to the question when
                non-empty.
            max_tokens: Completion token limit.

        Returns:
            A ``RAGResult`` with ``confidence`` 0.8 when the model returned
            text and 0.0 otherwise. ``latency_ms`` is left for the caller.

        Raises:
            Whatever the HTTP client raises for transport errors or, via
            ``raise_for_status()``, for error status codes.
        """
        client = _get_client()
        system_msg = (
            "You are a SANS cybersecurity knowledge assistant. "
            "Use your indexed SANS courseware to answer the question. "
            "Always cite the specific SANS course (e.g., FOR508, SEC504) "
            "and relevant section when referencing material. "
            "If the question relates to threat hunting procedures, "
            "reference the specific SANS methodology or framework."
        )
        if context:
            user_content = f"Investigation context:\n{context}\n\nQuestion: {question}"
        else:
            user_content = question
        messages = [
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_content},
        ]
        payload = {
            "model": self.rag_model,
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": 0.2,
            "stream": False,
        }
        resp = await client.post(
            f"{self.openwebui_url}/v1/chat/completions",
            json=payload,
            headers=self._headers(),
        )
        resp.raise_for_status()
        data = resp.json()
        content = ""
        choices = data.get("choices") or []
        if choices:
            # "message" / "content" may be present but null; treat as empty
            # rather than failing the whole query.
            message = choices[0].get("message") or {}
            content = message.get("content") or ""
        # Extract course references from response
        course_refs = self._extract_course_refs(content)
        sources = self._extract_sources(data)
        return RAGResult(
            query=question,
            context=content,
            sources=sources,
            course_references=course_refs,
            confidence=0.8 if content else 0.0,
        )

    def _extract_course_refs(self, text: str) -> list[str]:
        """Extract SANS course references from response text.

        Returns a sorted, de-duplicated list. IDs found in ``SANS_COURSES``
        are rendered as ``"ID: Title"``; other well-formed IDs are returned
        bare (upper-cased).
        """
        refs = set()
        for m in self._COURSE_ID_RE.findall(text or ""):
            course_id = m.upper()
            if course_id in SANS_COURSES:
                refs.add(f"{course_id}: {SANS_COURSES[course_id]}")
            else:
                refs.add(course_id)
        return sorted(refs)

    @staticmethod
    def _source_label(src) -> str:
        """Return a display name for one source entry (dict or scalar)."""
        if isinstance(src, dict):
            # Prefer "name", then "title"; fall back to the dict's repr.
            return str(src.get("name") or src.get("title") or src)
        return str(src)

    def _extract_sources(self, api_response: dict) -> list[str]:
        """Extract source document references from Open WebUI response metadata.

        Looks at a top-level ``sources`` list and at ``metadata.sources`` of
        each choice. Missing or null containers are treated as empty. At most
        10 entries are returned, each coerced to ``str``.
        """
        # Open WebUI may include source metadata in various formats
        sources = [
            self._source_label(src) for src in api_response.get("sources") or []
        ]
        # Check in metadata
        for choice in api_response.get("choices") or []:
            meta = choice.get("metadata") or {}
            sources.extend(
                self._source_label(src) for src in meta.get("sources") or []
            )
        return sources[:10]  # Limit

    def _match_courses(self, query: str) -> list[str]:
        """Match query keywords to SANS courses using topic map.

        A topic matches when it occurs in the lower-cased query at the start
        of a word; trailing characters are allowed so stems such as
        "forensic" still match "forensics". Returns up to 5 sorted
        ``"ID: Title"`` strings.
        """
        q = query.lower()
        matched = set()
        for topic, courses in TOPIC_COURSE_MAP.items():
            # Anchor at a word start so short topics do not fire inside
            # unrelated words ("aws" in "flaws", "c2" in "ec2").
            if not re.search(rf"(?<!\w){re.escape(topic)}", q):
                continue
            for course_id in courses:
                if course_id in SANS_COURSES:
                    matched.add(f"{course_id}: {SANS_COURSES[course_id]}")
        return sorted(matched)[:5]

    async def get_course_context(self, course_id: str) -> str:
        """Get a brief course description for context injection.

        Accepts either a bare ID or an ``"ID: Title"`` string, in any case.
        Returns ``"ID: Title"`` for catalogued courses and ``""`` otherwise.
        """
        course_id = course_id.upper().split(":")[0].strip()
        if course_id in SANS_COURSES:
            return f"{course_id}: {SANS_COURSES[course_id]}"
        return ""

    async def enrich_prompt(
        self,
        query: str,
        investigation_context: str = "",
    ) -> str:
        """Generate SANS-enriched context to inject into agent prompts.

        Returns a context string with relevant SANS references, or ``""``
        when the query produced nothing to add.
        """
        result = await self.query(query, context=investigation_context, max_tokens=512)
        parts = []
        if result.context:
            parts.append(f"SANS Reference Context:\n{result.context}")
        if result.course_references:
            parts.append(f"Relevant SANS Courses: {', '.join(result.course_references)}")
        if result.sources:
            parts.append(f"Sources: {', '.join(result.sources[:5])}")
        # Joining an empty list already yields "".
        return "\n".join(parts)

    async def health_check(self) -> dict:
        """Check RAG service availability.

        Issues a GET to the models endpoint with a 5 second timeout and
        records the outcome on ``self._available``.

        Returns:
            A dict with ``available``, ``url`` and ``model``; on failure it
            also carries ``error`` with the exception text. Never raises.
        """
        try:
            client = _get_client()
            resp = await client.get(
                f"{self.openwebui_url}/v1/models",
                headers=self._headers(),
                timeout=5,
            )
            available = resp.status_code == 200
            self._available = available
            return {
                "available": available,
                "url": self.openwebui_url,
                "model": self.rag_model,
            }
        except Exception as e:
            # Health probes must not propagate errors to the caller.
            self._available = False
            return {
                "available": False,
                "url": self.openwebui_url,
                "model": self.rag_model,
                "error": str(e),
            }
# Singleton: module-level instance created at import time (its __init__ reads
# the Open WebUI URL, API key and model from `settings`); intended to be
# imported and shared rather than constructing new SANSRAGService objects.
sans_rag = SANSRAGService()