version 0.4.0

2026-03-01 14:00:20 -05:00 · 2026-02-20 14:32:42 -05:00
parent ab8038867a
commit 365cf87c90
76 changed files with 34422 additions and 690 deletions
--- a/backend/app/api/routes/analysis.py
+++ b/backend/app/api/routes/analysis.py
@@ -0,0 +1,295 @@
+"""API routes for process trees, storyline graphs, risk scoring, LLM analysis, timeline, and field stats."""
+
+import logging
+from typing import Any, Optional
+
+from fastapi import APIRouter, Depends, HTTPException, Query, Body
+from pydantic import BaseModel, Field
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import get_db
+from app.db.repositories.datasets import DatasetRepository
+from app.services.process_tree import (
+    build_process_tree,
+    build_storyline,
+    compute_risk_scores,
+    _fetch_rows,
+)
+from app.services.llm_analysis import (
+    AnalysisRequest,
+    AnalysisResult,
+    run_llm_analysis,
+)
+from app.services.timeline import (
+    build_timeline_bins,
+    compute_field_stats,
+    search_rows,
+)
+from app.services.mitre import (
+    map_to_attack,
+    build_knowledge_graph,
+)
+
+logger = logging.getLogger(__name__)
+
+router = APIRouter(prefix="/api/analysis", tags=["analysis"])
+
+
+# ── Response models ───────────────────────────────────────────────────
+
+
+class ProcessTreeResponse(BaseModel):
+    trees: list[dict] = Field(default_factory=list)
+    total_processes: int = 0
+
+
+class StorylineResponse(BaseModel):
+    nodes: list[dict] = Field(default_factory=list)
+    edges: list[dict] = Field(default_factory=list)
+    summary: dict = Field(default_factory=dict)
+
+
+class RiskHostEntry(BaseModel):
+    hostname: str
+    score: int = 0
+    signals: list[str] = Field(default_factory=list)
+    event_count: int = 0
+    process_count: int = 0
+    network_count: int = 0
+    file_count: int = 0
+
+
+class RiskSummaryResponse(BaseModel):
+    hosts: list[RiskHostEntry] = Field(default_factory=list)
+    overall_score: int = 0
+    total_events: int = 0
+    severity_breakdown: dict[str, int] = Field(default_factory=dict)
+
+
+# ── Routes ────────────────────────────────────────────────────────────
+
+
+@router.get(
+    "/process-tree",
+    response_model=ProcessTreeResponse,
+    summary="Build process tree from dataset rows",
+    description=(
+        "Extracts parent→child process relationships from dataset rows "
+        "and returns a hierarchical forest of process nodes."
+    ),
+)
+async def get_process_tree(
+    dataset_id: str | None = Query(None, description="Dataset ID"),
+    hunt_id: str | None = Query(None, description="Hunt ID (scans all datasets in hunt)"),
+    hostname: str | None = Query(None, description="Filter by hostname"),
+    db: AsyncSession = Depends(get_db),
+):
+    """Return process tree(s) for a dataset or hunt."""
+    if not dataset_id and not hunt_id:
+        raise HTTPException(status_code=400, detail="Provide dataset_id or hunt_id")
+
+    trees = await build_process_tree(
+        db, dataset_id=dataset_id, hunt_id=hunt_id, hostname_filter=hostname,
+    )
+
+    # Count total processes recursively
+    def _count(node: dict) -> int:
+        return 1 + sum(_count(c) for c in node.get("children", []))
+
+    total = sum(_count(t) for t in trees)
+
+    return ProcessTreeResponse(trees=trees, total_processes=total)
+
+
+@router.get(
+    "/storyline",
+    response_model=StorylineResponse,
+    summary="Build CrowdStrike-style storyline attack graph",
+    description=(
+        "Creates a Cytoscape-compatible graph of events connected by "
+        "process lineage (spawned) and temporal sequence within each host."
+    ),
+)
+async def get_storyline(
+    dataset_id: str | None = Query(None, description="Dataset ID"),
+    hunt_id: str | None = Query(None, description="Hunt ID (scans all datasets in hunt)"),
+    hostname: str | None = Query(None, description="Filter by hostname"),
+    db: AsyncSession = Depends(get_db),
+):
+    """Return a storyline graph for a dataset or hunt."""
+    if not dataset_id and not hunt_id:
+        raise HTTPException(status_code=400, detail="Provide dataset_id or hunt_id")
+
+    result = await build_storyline(
+        db, dataset_id=dataset_id, hunt_id=hunt_id, hostname_filter=hostname,
+    )
+
+    return StorylineResponse(**result)
+
+
+@router.get(
+    "/risk-summary",
+    response_model=RiskSummaryResponse,
+    summary="Compute risk scores per host",
+    description=(
+        "Analyzes dataset rows for suspicious patterns (encoded PowerShell, "
+        "credential dumping, lateral movement) and produces per-host risk scores."
+    ),
+)
+async def get_risk_summary(
+    hunt_id: str | None = Query(None, description="Hunt ID"),
+    db: AsyncSession = Depends(get_db),
+):
+    """Return risk scores for all hosts in a hunt."""
+    result = await compute_risk_scores(db, hunt_id=hunt_id)
+    return RiskSummaryResponse(**result)
+
+
+# ── LLM Analysis ─────────────────────────────────────────────────────
+
+
+@router.post(
+    "/llm-analyze",
+    response_model=AnalysisResult,
+    summary="Run LLM-powered threat analysis on dataset",
+    description=(
+        "Loads dataset rows server-side, builds a summary, and sends to "
+        "Wile (deep analysis) or Roadrunner (quick) for comprehensive "
+        "threat analysis. Returns structured findings, IOCs, MITRE techniques."
+    ),
+)
+async def llm_analyze(
+    request: AnalysisRequest,
+    db: AsyncSession = Depends(get_db),
+):
+    """Run LLM analysis on a dataset or hunt."""
+    if not request.dataset_id and not request.hunt_id:
+        raise HTTPException(status_code=400, detail="Provide dataset_id or hunt_id")
+
+    # Load rows
+    rows_objs = await _fetch_rows(
+        db,
+        dataset_id=request.dataset_id,
+        hunt_id=request.hunt_id,
+        limit=2000,
+    )
+
+    if not rows_objs:
+        raise HTTPException(status_code=404, detail="No rows found for analysis")
+
+    # Extract data dicts
+    rows = [r.normalized_data or r.data for r in rows_objs]
+
+    # Get dataset name
+    ds_name = "hunt datasets"
+    if request.dataset_id:
+        repo = DatasetRepository(db)
+        ds = await repo.get_dataset(request.dataset_id)
+        if ds:
+            ds_name = ds.name
+
+    result = await run_llm_analysis(rows, request, dataset_name=ds_name)
+    return result
+
+
+# ── Timeline ──────────────────────────────────────────────────────────
+
+
+@router.get(
+    "/timeline",
+    summary="Get event timeline histogram bins",
+)
+async def get_timeline(
+    dataset_id: str | None = Query(None),
+    hunt_id: str | None = Query(None),
+    bins: int = Query(60, ge=10, le=200),
+    db: AsyncSession = Depends(get_db),
+):
+    if not dataset_id and not hunt_id:
+        raise HTTPException(status_code=400, detail="Provide dataset_id or hunt_id")
+    return await build_timeline_bins(db, dataset_id=dataset_id, hunt_id=hunt_id, bins=bins)
+
+
+@router.get(
+    "/field-stats",
+    summary="Get per-field value distributions",
+)
+async def get_field_stats(
+    dataset_id: str | None = Query(None),
+    hunt_id: str | None = Query(None),
+    fields: str | None = Query(None, description="Comma-separated field names"),
+    top_n: int = Query(20, ge=5, le=100),
+    db: AsyncSession = Depends(get_db),
+):
+    if not dataset_id and not hunt_id:
+        raise HTTPException(status_code=400, detail="Provide dataset_id or hunt_id")
+    field_list = [f.strip() for f in fields.split(",")] if fields else None
+    return await compute_field_stats(
+        db, dataset_id=dataset_id, hunt_id=hunt_id,
+        fields=field_list, top_n=top_n,
+    )
+
+
+class SearchRequest(BaseModel):
+    dataset_id: Optional[str] = None
+    hunt_id: Optional[str] = None
+    query: str = ""
+    filters: dict[str, str] = Field(default_factory=dict)
+    time_start: Optional[str] = None
+    time_end: Optional[str] = None
+    limit: int = 500
+    offset: int = 0
+
+
+@router.post(
+    "/search",
+    summary="Search and filter dataset rows",
+)
+async def search_dataset_rows(
+    request: SearchRequest,
+    db: AsyncSession = Depends(get_db),
+):
+    if not request.dataset_id and not request.hunt_id:
+        raise HTTPException(status_code=400, detail="Provide dataset_id or hunt_id")
+    return await search_rows(
+        db,
+        dataset_id=request.dataset_id,
+        hunt_id=request.hunt_id,
+        query=request.query,
+        filters=request.filters,
+        time_start=request.time_start,
+        time_end=request.time_end,
+        limit=request.limit,
+        offset=request.offset,
+    )
+
+
+# ── MITRE ATT&CK ─────────────────────────────────────────────────────
+
+
+@router.get(
+    "/mitre-map",
+    summary="Map dataset events to MITRE ATT&CK techniques",
+)
+async def get_mitre_map(
+    dataset_id: str | None = Query(None),
+    hunt_id: str | None = Query(None),
+    db: AsyncSession = Depends(get_db),
+):
+    if not dataset_id and not hunt_id:
+        raise HTTPException(status_code=400, detail="Provide dataset_id or hunt_id")
+    return await map_to_attack(db, dataset_id=dataset_id, hunt_id=hunt_id)
+
+
+@router.get(
+    "/knowledge-graph",
+    summary="Build entity-technique knowledge graph",
+)
+async def get_knowledge_graph(
+    dataset_id: str | None = Query(None),
+    hunt_id: str | None = Query(None),
+    db: AsyncSession = Depends(get_db),
+):
+    if not dataset_id and not hunt_id:
+        raise HTTPException(status_code=400, detail="Provide dataset_id or hunt_id")
+    return await build_knowledge_graph(db, dataset_id=dataset_id, hunt_id=hunt_id)