"""Report generation — JSON, HTML, and CSV export for hunt investigations.

Generates comprehensive investigation reports including:
- Hunt metadata and status
- Dataset summaries with IOC counts
- Hypotheses and their evidence
- Annotations timeline
- Enrichment verdicts
- Agent conversation history
- Cross-hunt correlations
"""

import csv
import io
import json
import logging
from dataclasses import asdict
from datetime import datetime, timezone
from html import escape
from typing import Optional

from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from app.db.models import (
    Hunt,
    Dataset,
    DatasetRow,
    Hypothesis,
    Annotation,
    Conversation,
    Message,
    EnrichmentResult,
)

logger = logging.getLogger(__name__)


def _esc(value) -> str:
    """HTML-escape an arbitrary value for safe interpolation into the report.

    All hunt/annotation/IOC fields are analyst- or attacker-influenced, so
    every value rendered into HTML must pass through here (prevents stored
    XSS in exported reports). ``None`` renders as an empty string.
    """
    return escape(str(value)) if value is not None else ""


class ReportGenerator:
    """Generates exportable investigation reports."""

    async def generate_hunt_report(
        self,
        hunt_id: str,
        db: AsyncSession,
        format: str = "json",
        include_rows: bool = False,
        max_rows: int = 500,
    ) -> dict | str:
        """Generate a comprehensive report for a hunt investigation.

        Args:
            hunt_id: Primary key of the hunt to report on.
            db: Async SQLAlchemy session.
            format: One of ``"json"`` (dict), ``"html"`` (str), ``"csv"`` (str).
                Unknown values fall back to the JSON dict.
            include_rows: If True, embed up to ``max_rows`` raw dataset rows
                per dataset in the report.
            max_rows: Cap on embedded rows per dataset.

        Returns:
            A dict for JSON output, a string for HTML/CSV, or
            ``{"error": "Hunt not found"}`` when the hunt does not exist.
        """
        report_data = await self._gather_hunt_data(
            hunt_id,
            db,
            include_rows=include_rows,
            max_rows=max_rows,
        )
        if not report_data:
            return {"error": "Hunt not found"}

        if format == "html":
            return self._render_html(report_data)
        if format == "csv":
            return self._render_csv(report_data)
        # "json" and any unrecognized format return the raw dict.
        return report_data

    async def _gather_hunt_data(
        self,
        hunt_id: str,
        db: AsyncSession,
        include_rows: bool = False,
        max_rows: int = 500,
    ) -> dict | None:
        """Gather all data for a hunt report.

        Returns ``None`` when the hunt does not exist; otherwise a dict with
        report metadata, hunt info, summary counters, and per-section data.
        """
        # Hunt metadata — bail out early if the hunt is unknown.
        result = await db.execute(select(Hunt).where(Hunt.id == hunt_id))
        hunt = result.scalar_one_or_none()
        if not hunt:
            return None

        # Datasets belonging to this hunt.
        ds_result = await db.execute(
            select(Dataset).where(Dataset.hunt_id == hunt_id)
        )
        datasets = ds_result.scalars().all()

        dataset_summaries = []
        for ds in datasets:
            summary = {
                "id": ds.id,
                "name": ds.name,
                "filename": ds.filename,
                "source_tool": ds.source_tool,
                "row_count": ds.row_count,
                "columns": list((ds.column_schema or {}).keys()),
                "ioc_columns": ds.ioc_columns or {},
                "time_range": {
                    "start": ds.time_range_start,
                    "end": ds.time_range_end,
                },
                "created_at": ds.created_at.isoformat(),
            }
            if include_rows:
                rows_result = await db.execute(
                    select(DatasetRow)
                    .where(DatasetRow.dataset_id == ds.id)
                    .order_by(DatasetRow.row_index)
                    .limit(max_rows)
                )
                rows = rows_result.scalars().all()
                summary["rows"] = [r.data for r in rows]
            dataset_summaries.append(summary)

        # Hypotheses.
        hyp_result = await db.execute(
            select(Hypothesis).where(Hypothesis.hunt_id == hunt_id)
        )
        hypotheses = hyp_result.scalars().all()
        hypotheses_data = [
            {
                "id": h.id,
                "title": h.title,
                "description": h.description,
                "mitre_technique": h.mitre_technique,
                "status": h.status,
                "evidence_row_ids": h.evidence_row_ids,
                "evidence_notes": h.evidence_notes,
                "created_at": h.created_at.isoformat(),
                "updated_at": h.updated_at.isoformat(),
            }
            for h in hypotheses
        ]

        # Annotations (across all datasets in this hunt).
        dataset_ids = [ds.id for ds in datasets]
        annotations_data = []
        if dataset_ids:
            ann_result = await db.execute(
                select(Annotation)
                .where(Annotation.dataset_id.in_(dataset_ids))
                .order_by(Annotation.created_at)
            )
            annotations = ann_result.scalars().all()
            annotations_data = [
                {
                    "id": a.id,
                    "dataset_id": a.dataset_id,
                    "row_id": a.row_id,
                    "text": a.text,
                    "severity": a.severity,
                    "tag": a.tag,
                    "created_at": a.created_at.isoformat(),
                }
                for a in annotations
            ]

        # Conversations with their full message history.
        conv_result = await db.execute(
            select(Conversation).where(Conversation.hunt_id == hunt_id)
        )
        conversations = conv_result.scalars().all()
        conversations_data = []
        for conv in conversations:
            msg_result = await db.execute(
                select(Message)
                .where(Message.conversation_id == conv.id)
                .order_by(Message.created_at)
            )
            messages = msg_result.scalars().all()
            conversations_data.append({
                "id": conv.id,
                "title": conv.title,
                "messages": [
                    {
                        "role": m.role,
                        "content": m.content,
                        "model_used": m.model_used,
                        "node_used": m.node_used,
                        "latency_ms": m.latency_ms,
                        "created_at": m.created_at.isoformat(),
                    }
                    for m in messages
                ],
            })

        # Enrichment results. The query is global (not filtered per dataset),
        # so run it exactly ONCE when any dataset declares IOC columns.
        # Previously it ran once per such dataset, appending duplicate rows
        # and inflating enrichment/malicious counters.
        # NOTE(review): the query is not scoped to this hunt's IOC values —
        # presumably intentional best-effort; confirm against EnrichmentResult
        # schema before tightening.
        enrichment_data = []
        if any(ds.ioc_columns for ds in datasets):
            enrich_result = await db.execute(
                select(EnrichmentResult)
                .where(EnrichmentResult.source.isnot(None))
                .limit(100)
            )
            for e in enrich_result.scalars().all():
                enrichment_data.append({
                    "ioc_value": e.ioc_value,
                    "ioc_type": e.ioc_type,
                    "source": e.source,
                    "verdict": e.verdict,
                    "score": e.score,
                    "tags": e.tags,
                    "country": e.country,
                })

        # Assemble the final report payload.
        now = datetime.now(timezone.utc)
        return {
            "report_metadata": {
                "generated_at": now.isoformat(),
                "format_version": "1.0",
                "generator": "ThreatHunt Report Engine",
            },
            "hunt": {
                "id": hunt.id,
                "name": hunt.name,
                "description": hunt.description,
                "status": hunt.status,
                "created_at": hunt.created_at.isoformat(),
                "updated_at": hunt.updated_at.isoformat(),
            },
            "summary": {
                "dataset_count": len(datasets),
                # `or 0` guards against NULL row_count on freshly-ingested datasets.
                "total_rows": sum(ds.row_count or 0 for ds in datasets),
                "hypothesis_count": len(hypotheses),
                "confirmed_hypotheses": len(
                    [h for h in hypotheses if h.status == "confirmed"]
                ),
                "annotation_count": len(annotations_data),
                "critical_annotations": len(
                    [a for a in annotations_data if a["severity"] == "critical"]
                ),
                "conversation_count": len(conversations_data),
                "enrichment_count": len(enrichment_data),
                "malicious_iocs": len(
                    [e for e in enrichment_data if e["verdict"] == "malicious"]
                ),
            },
            "datasets": dataset_summaries,
            "hypotheses": hypotheses_data,
            "annotations": annotations_data,
            "conversations": conversations_data,
            "enrichments": enrichment_data[:100],
        }

    def _render_html(self, data: dict) -> str:
        """Render the report as self-contained HTML.

        All user-influenced values are escaped via ``_esc`` to prevent
        stored XSS in exported reports.
        """
        hunt = data.get("hunt", {})
        summary = data.get("summary", {})
        hypotheses = data.get("hypotheses", [])
        annotations = data.get("annotations", [])
        datasets = data.get("datasets", [])
        enrichments = data.get("enrichments", [])
        meta = data.get("report_metadata", {})

        html = f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>ThreatHunt Report: {_esc(hunt.get('name', 'Unknown'))}</title>
<style>
body {{ font-family: -apple-system, "Segoe UI", sans-serif; margin: 2rem; color: #1a1a2e; }}
h1, h2 {{ color: #16213e; }}
table {{ border-collapse: collapse; width: 100%; margin: 1rem 0; }}
th, td {{ border: 1px solid #ddd; padding: 6px 10px; text-align: left; }}
th {{ background: #f4f4f8; }}
.stats {{ display: flex; gap: 1rem; flex-wrap: wrap; margin: 1rem 0; }}
.stat {{ background: #f4f4f8; border-radius: 8px; padding: 1rem; text-align: center; min-width: 100px; }}
.stat .num {{ font-size: 1.6rem; font-weight: bold; }}
.badge-confirmed, .badge-critical, .badge-malicious {{ color: #b00020; font-weight: bold; }}
.badge-active, .badge-high {{ color: #c77700; font-weight: bold; }}
.badge-medium {{ color: #8a6d00; }}
.footer {{ margin-top: 2rem; color: #888; font-size: 0.85rem; }}
</style>
</head>
<body>
<h1>🔍 ThreatHunt Report: {_esc(hunt.get('name', 'Untitled'))}</h1>
<p>
<strong>Hunt ID:</strong> {_esc(hunt.get('id', ''))}<br>
<strong>Status:</strong> {_esc(hunt.get('status', 'unknown'))}<br>
<strong>Description:</strong> {_esc(hunt.get('description') or 'N/A')}<br>
<strong>Created:</strong> {_esc(hunt.get('created_at', ''))}
</p>
<h2>Summary</h2>
<div class="stats">
<div class="stat"><div class="num">{summary.get('dataset_count', 0)}</div><div>Datasets</div></div>
<div class="stat"><div class="num">{summary.get('total_rows', 0):,}</div><div>Total Rows</div></div>
<div class="stat"><div class="num">{summary.get('hypothesis_count', 0)}</div><div>Hypotheses</div></div>
<div class="stat"><div class="num">{summary.get('confirmed_hypotheses', 0)}</div><div>Confirmed</div></div>
<div class="stat"><div class="num">{summary.get('annotation_count', 0)}</div><div>Annotations</div></div>
<div class="stat"><div class="num">{summary.get('malicious_iocs', 0)}</div><div>Malicious IOCs</div></div>
</div>
"""

        # Hypotheses section.
        if hypotheses:
            html += "<h2>Hypotheses</h2>\n<table>\n"
            html += "<tr><th>Title</th><th>MITRE</th><th>Status</th><th>Description</th></tr>\n"
            for h in hypotheses:
                status_class = (
                    f"badge-{h['status']}"
                    if h['status'] in ('confirmed', 'active')
                    else ""
                )
                html += (
                    f"<tr><td>{_esc(h['title'])}</td>"
                    f"<td>{_esc(h.get('mitre_technique') or 'N/A')}</td>"
                    f"<td class=\"{status_class}\">{_esc(h['status'])}</td>"
                    f"<td>{_esc(h.get('description') or '')}</td></tr>\n"
                )
            html += "</table>\n"

        # Datasets section.
        if datasets:
            html += "<h2>Datasets</h2>\n"
            for ds in datasets:
                time_range = ds.get('time_range', {})
                html += (
                    f"<h3>{_esc(ds['name'])} ({_esc(ds.get('filename', ''))})</h3>\n"
                    f"<p><strong>Source:</strong> {_esc(ds.get('source_tool') or 'N/A')} | "
                    f"<strong>Rows:</strong> {ds['row_count']:,} | "
                    f"<strong>IOC Columns:</strong> {len(ds.get('ioc_columns', {}))} | "
                    f"<strong>Time Range:</strong> {_esc(time_range.get('start') or 'N/A')} "
                    f"to {_esc(time_range.get('end') or 'N/A')}</p>\n"
                )

        # Annotations section (capped at 50 rows).
        if annotations:
            critical = [a for a in annotations if a['severity'] in ('critical', 'high')]
            html += (
                f"<h2>Annotations ({len(annotations)} total, "
                f"{len(critical)} critical/high)</h2>\n<table>\n"
            )
            html += "<tr><th>Severity</th><th>Tag</th><th>Text</th><th>Created</th></tr>\n"
            for a in annotations[:50]:
                sev_class = (
                    f"badge-{a['severity']}"
                    if a['severity'] in ('critical', 'high', 'medium')
                    else ""
                )
                html += (
                    f"<tr><td class=\"{sev_class}\">{_esc(a['severity'])}</td>"
                    f"<td>{_esc(a.get('tag') or 'N/A')}</td>"
                    f"<td>{_esc(a['text'][:200])}</td>"
                    f"<td>{_esc(a['created_at'][:19])}</td></tr>\n"
                )
            html += "</table>\n"

        # Enrichments section (capped at 50 rows).
        if enrichments:
            malicious = [e for e in enrichments if e['verdict'] == 'malicious']
            html += (
                f"<h2>IOC Enrichment ({len(enrichments)} results, "
                f"{len(malicious)} malicious)</h2>\n<table>\n"
            )
            html += "<tr><th>IOC</th><th>Type</th><th>Source</th><th>Verdict</th><th>Score</th></tr>\n"
            for e in enrichments[:50]:
                verdict_class = f"badge-{e['verdict']}"
                html += (
                    f"<tr><td>{_esc(e['ioc_value'])}</td>"
                    f"<td>{_esc(e['ioc_type'])}</td>"
                    f"<td>{_esc(e['source'])}</td>"
                    f"<td class=\"{verdict_class}\">{_esc(e['verdict'])}</td>"
                    f"<td>{_esc(e.get('score', 0))}</td></tr>\n"
                )
            html += "</table>\n"

        html += f"""<div class="footer">
Generated at {_esc(meta.get('generated_at', ''))} by {_esc(meta.get('generator', ''))}
(format version {_esc(meta.get('format_version', ''))})
</div>
</body>
</html>
"""
        return html

    def _render_csv(self, data: dict) -> str:
        """Render key report data as CSV.

        Emits three labeled sections (hypotheses, annotations, enrichments)
        in a single text stream, separated by ``=== NAME ===`` header lines.
        """
        output = io.StringIO()
        writer = csv.writer(output)

        output.write("=== HYPOTHESES ===\n")
        writer.writerow(
            ["Title", "MITRE Technique", "Status", "Description", "Evidence Notes"]
        )
        for h in data.get("hypotheses", []):
            writer.writerow([
                h.get("title", ""),
                h.get("mitre_technique", ""),
                h.get("status", ""),
                h.get("description", ""),
                h.get("evidence_notes", ""),
            ])

        output.write("\n=== ANNOTATIONS ===\n")
        writer.writerow(
            ["Severity", "Tag", "Text", "Dataset ID", "Row ID", "Created"]
        )
        for a in data.get("annotations", []):
            writer.writerow([
                a.get("severity", ""),
                a.get("tag", ""),
                a.get("text", ""),
                a.get("dataset_id", ""),
                a.get("row_id", ""),
                a.get("created_at", ""),
            ])

        output.write("\n=== ENRICHMENTS ===\n")
        writer.writerow(
            ["IOC Value", "IOC Type", "Source", "Verdict", "Score", "Country"]
        )
        for e in data.get("enrichments", []):
            writer.writerow([
                e.get("ioc_value", ""),
                e.get("ioc_type", ""),
                e.get("source", ""),
                e.get("verdict", ""),
                e.get("score", ""),
                e.get("country", ""),
            ])

        return output.getvalue()


# Singleton
report_generator = ReportGenerator()