version 0.4.0

2026-03-01 14:00:20 -05:00 · 2026-02-20 14:32:42 -05:00
parent ab8038867a
commit 365cf87c90
76 changed files with 34422 additions and 690 deletions
--- a/backend/app/services/mitre.py
+++ b/backend/app/services/mitre.py
@@ -0,0 +1,484 @@
+"""
+MITRE ATT&CK mapping service.
+
+Maps dataset events to ATT&CK techniques using pattern-based heuristics.
+Uses the enterprise-attack matrix (embedded patterns for offline use).
+"""
+
+import logging
+import re
+from collections import Counter, defaultdict
+from typing import Any
+
+from sqlalchemy import select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db.models import Dataset, DatasetRow
+
+logger = logging.getLogger(__name__)
+
+# ── ATT&CK Technique Patterns ────────────────────────────────────────
+# Subset of enterprise-attack techniques with detection patterns.
+# Each entry: (technique_id, name, tactic, patterns_list)
+
+TECHNIQUE_PATTERNS: list[tuple[str, str, str, list[str]]] = [
+    # Initial Access
+    ("T1566", "Phishing", "initial-access", [
+        r"phish", r"\.hta\b", r"\.lnk\b", r"mshta\.exe", r"outlook.*attachment",
+    ]),
+    ("T1190", "Exploit Public-Facing Application", "initial-access", [
+        r"exploit", r"CVE-\d{4}", r"vulnerability", r"webshell",
+    ]),
+
+    # Execution
+    ("T1059.001", "PowerShell", "execution", [
+        r"powershell", r"pwsh", r"-enc\b", r"-encodedcommand",
+        r"invoke-expression", r"iex\b", r"bypass\b.*execution",
+    ]),
+    ("T1059.003", "Windows Command Shell", "execution", [
+        r"cmd\.exe", r"/c\s+", r"command\.com",
+    ]),
+    ("T1059.005", "Visual Basic", "execution", [
+        r"wscript", r"cscript", r"\.vbs\b", r"\.vbe\b",
+    ]),
+    ("T1047", "Windows Management Instrumentation", "execution", [
+        r"wmic\b", r"winmgmt", r"wmi\b",
+    ]),
+    ("T1053.005", "Scheduled Task", "execution", [
+        r"schtasks", r"at\.exe", r"taskschd",
+    ]),
+    ("T1204", "User Execution", "execution", [
+        r"user.*click", r"open.*attachment", r"macro",
+    ]),
+
+    # Persistence
+    ("T1547.001", "Registry Run Keys", "persistence", [
+        r"CurrentVersion\\Run", r"HKLM\\Software\\Microsoft\\Windows\\CurrentVersion\\Run",
+        r"reg\s+add.*\\Run",
+    ]),
+    ("T1543.003", "Windows Service", "persistence", [
+        r"sc\s+create", r"new-service", r"service.*install",
+    ]),
+    ("T1136", "Create Account", "persistence", [
+        r"net\s+user\s+/add", r"new-localuser", r"useradd",
+    ]),
+    ("T1053.005", "Scheduled Task/Job", "persistence", [
+        r"schtasks\s+/create", r"crontab",
+    ]),
+
+    # Privilege Escalation
+    ("T1548.002", "Bypass User Access Control", "privilege-escalation", [
+        r"eventvwr", r"fodhelper", r"uac.*bypass", r"computerdefaults",
+    ]),
+    ("T1134", "Access Token Manipulation", "privilege-escalation", [
+        r"token.*impersonat", r"runas", r"adjusttokenprivileges",
+    ]),
+
+    # Defense Evasion
+    ("T1070.001", "Clear Windows Event Logs", "defense-evasion", [
+        r"wevtutil\s+cl", r"clear-eventlog", r"clearlog",
+    ]),
+    ("T1562.001", "Disable or Modify Tools", "defense-evasion", [
+        r"tamper.*protection", r"disable.*defender", r"set-mppreference",
+        r"disable.*firewall",
+    ]),
+    ("T1027", "Obfuscated Files or Information", "defense-evasion", [
+        r"base64", r"-enc\b", r"certutil.*-decode", r"frombase64",
+    ]),
+    ("T1036", "Masquerading", "defense-evasion", [
+        r"rename.*\.exe", r"masquerad", r"svchost.*unusual",
+    ]),
+    ("T1055", "Process Injection", "defense-evasion", [
+        r"inject", r"createremotethread", r"ntcreatethreadex",
+        r"virtualalloc", r"writeprocessmemory",
+    ]),
+
+    # Credential Access
+    ("T1003.001", "LSASS Memory", "credential-access", [
+        r"mimikatz", r"sekurlsa", r"lsass", r"procdump.*lsass",
+    ]),
+    ("T1003.003", "NTDS", "credential-access", [
+        r"ntds\.dit", r"vssadmin.*shadow", r"ntdsutil",
+    ]),
+    ("T1110", "Brute Force", "credential-access", [
+        r"brute.*force", r"failed.*login.*\d{3,}", r"hydra", r"medusa",
+    ]),
+    ("T1558.003", "Kerberoasting", "credential-access", [
+        r"kerberoast", r"invoke-kerberoast", r"GetUserSPNs",
+    ]),
+
+    # Discovery
+    ("T1087", "Account Discovery", "discovery", [
+        r"net\s+user", r"net\s+localgroup", r"get-aduser",
+    ]),
+    ("T1082", "System Information Discovery", "discovery", [
+        r"systeminfo", r"hostname", r"ver\b",
+    ]),
+    ("T1083", "File and Directory Discovery", "discovery", [
+        r"dir\s+/s", r"tree\s+/f", r"get-childitem.*-recurse",
+    ]),
+    ("T1057", "Process Discovery", "discovery", [
+        r"tasklist", r"get-process", r"ps\s+aux",
+    ]),
+    ("T1018", "Remote System Discovery", "discovery", [
+        r"net\s+view", r"ping\s+-", r"arp\s+-a", r"nslookup",
+    ]),
+    ("T1016", "System Network Configuration Discovery", "discovery", [
+        r"ipconfig", r"ifconfig", r"netstat",
+    ]),
+
+    # Lateral Movement
+    ("T1021.001", "Remote Desktop Protocol", "lateral-movement", [
+        r"rdp\b", r"mstsc", r"3389", r"remote\s+desktop",
+    ]),
+    ("T1021.002", "SMB/Windows Admin Shares", "lateral-movement", [
+        r"\\\\.*\\(c|admin)\$", r"psexec", r"smbclient", r"net\s+use",
+    ]),
+    ("T1021.006", "Windows Remote Management", "lateral-movement", [
+        r"winrm", r"enter-pssession", r"invoke-command.*-computername",
+        r"wsman", r"5985|5986",
+    ]),
+    ("T1570", "Lateral Tool Transfer", "lateral-movement", [
+        r"copy.*\\\\", r"xcopy.*\\\\", r"robocopy",
+    ]),
+
+    # Collection
+    ("T1560", "Archive Collected Data", "collection", [
+        r"compress-archive", r"7z\.exe", r"rar\s+a", r"tar\s+-[cz]",
+    ]),
+    ("T1005", "Data from Local System", "collection", [
+        r"type\s+.*password", r"findstr.*password", r"select-string.*credential",
+    ]),
+
+    # Command and Control
+    ("T1071.001", "Web Protocols", "command-and-control", [
+        r"http[s]?://\d+\.\d+\.\d+\.\d+", r"curl\b", r"wget\b",
+        r"invoke-webrequest", r"beacon",
+    ]),
+    ("T1573", "Encrypted Channel", "command-and-control", [
+        r"ssl\b", r"tls\b", r"encrypted.*tunnel", r"stunnel",
+    ]),
+    ("T1105", "Ingress Tool Transfer", "command-and-control", [
+        r"certutil.*-urlcache", r"bitsadmin.*transfer",
+        r"downloadfile", r"invoke-webrequest.*-outfile",
+    ]),
+    ("T1219", "Remote Access Software", "command-and-control", [
+        r"teamviewer", r"anydesk", r"logmein", r"vnc",
+    ]),
+
+    # Exfiltration
+    ("T1048", "Exfiltration Over Alternative Protocol", "exfiltration", [
+        r"dns.*tunnel", r"exfil", r"icmp.*tunnel",
+    ]),
+    ("T1041", "Exfiltration Over C2 Channel", "exfiltration", [
+        r"upload.*c2", r"exfil.*http",
+    ]),
+    ("T1567", "Exfiltration Over Web Service", "exfiltration", [
+        r"mega\.nz", r"dropbox", r"pastebin", r"transfer\.sh",
+    ]),
+
+    # Impact
+    ("T1486", "Data Encrypted for Impact", "impact", [
+        r"ransomware", r"encrypt.*files", r"\.locked\b", r"ransom",
+    ]),
+    ("T1489", "Service Stop", "impact", [
+        r"sc\s+stop", r"net\s+stop", r"stop-service",
+    ]),
+    ("T1529", "System Shutdown/Reboot", "impact", [
+        r"shutdown\s+/[rs]", r"restart-computer",
+    ]),
+]
+
+# Tactic display names and kill-chain order
+TACTIC_ORDER = [
+    "initial-access", "execution", "persistence", "privilege-escalation",
+    "defense-evasion", "credential-access", "discovery", "lateral-movement",
+    "collection", "command-and-control", "exfiltration", "impact",
+]
+TACTIC_NAMES = {
+    "initial-access": "Initial Access",
+    "execution": "Execution",
+    "persistence": "Persistence",
+    "privilege-escalation": "Privilege Escalation",
+    "defense-evasion": "Defense Evasion",
+    "credential-access": "Credential Access",
+    "discovery": "Discovery",
+    "lateral-movement": "Lateral Movement",
+    "collection": "Collection",
+    "command-and-control": "Command and Control",
+    "exfiltration": "Exfiltration",
+    "impact": "Impact",
+}
+
+
+# ── Row fetcher ───────────────────────────────────────────────────────
+
+async def _fetch_rows(
+    db: AsyncSession,
+    dataset_id: str | None = None,
+    hunt_id: str | None = None,
+    limit: int = 5000,
+) -> list[dict[str, Any]]:
+    q = select(DatasetRow).join(Dataset)
+    if dataset_id:
+        q = q.where(DatasetRow.dataset_id == dataset_id)
+    elif hunt_id:
+        q = q.where(Dataset.hunt_id == hunt_id)
+    q = q.limit(limit)
+    result = await db.execute(q)
+    return [r.data for r in result.scalars().all()]
+
+
+# ── Main functions ────────────────────────────────────────────────────
+
+async def map_to_attack(
+    db: AsyncSession,
+    dataset_id: str | None = None,
+    hunt_id: str | None = None,
+) -> dict[str, Any]:
+    """
+    Map dataset rows to MITRE ATT&CK techniques.
+    Returns a matrix-style structure + evidence list.
+    """
+    rows = await _fetch_rows(db, dataset_id, hunt_id)
+    if not rows:
+        return {"tactics": [], "techniques": [], "evidence": [], "coverage": {}, "total_rows": 0}
+
+    # Flatten all string values per row for matching
+    row_texts: list[str] = []
+    for row in rows:
+        parts = []
+        for v in row.values():
+            if v is not None:
+                parts.append(str(v).lower())
+        row_texts.append(" ".join(parts))
+
+    # Match techniques
+    technique_hits: dict[str, list[dict]] = defaultdict(list)  # tech_id -> evidence rows
+    technique_meta: dict[str, tuple[str, str]] = {}  # tech_id -> (name, tactic)
+    row_techniques: list[set[str]] = [set() for _ in rows]
+
+    for tech_id, tech_name, tactic, patterns in TECHNIQUE_PATTERNS:
+        compiled = [re.compile(p, re.IGNORECASE) for p in patterns]
+        technique_meta[tech_id] = (tech_name, tactic)
+        for i, text in enumerate(row_texts):
+            for pat in compiled:
+                if pat.search(text):
+                    row_techniques[i].add(tech_id)
+                    if len(technique_hits[tech_id]) < 10:  # limit evidence
+                        # find matching field
+                        matched_field = ""
+                        matched_value = ""
+                        for k, v in rows[i].items():
+                            if v and pat.search(str(v).lower()):
+                                matched_field = k
+                                matched_value = str(v)[:200]
+                                break
+                        technique_hits[tech_id].append({
+                            "row_index": i,
+                            "field": matched_field,
+                            "value": matched_value,
+                            "pattern": pat.pattern,
+                        })
+                    break  # one pattern match per technique per row is enough
+
+    # Build tactic → technique structure
+    tactic_techniques: dict[str, list[dict]] = defaultdict(list)
+    for tech_id, evidence_list in technique_hits.items():
+        name, tactic = technique_meta[tech_id]
+        tactic_techniques[tactic].append({
+            "id": tech_id,
+            "name": name,
+            "count": len(evidence_list),
+            "evidence": evidence_list[:5],
+        })
+
+    # Build ordered tactics list
+    tactics = []
+    for tactic_key in TACTIC_ORDER:
+        techs = tactic_techniques.get(tactic_key, [])
+        tactics.append({
+            "id": tactic_key,
+            "name": TACTIC_NAMES.get(tactic_key, tactic_key),
+            "techniques": sorted(techs, key=lambda t: -t["count"]),
+            "total_hits": sum(t["count"] for t in techs),
+        })
+
+    # Coverage stats
+    covered_tactics = sum(1 for t in tactics if t["total_hits"] > 0)
+    total_technique_hits = sum(t["total_hits"] for t in tactics)
+
+    return {
+        "tactics": tactics,
+        "coverage": {
+            "tactics_covered": covered_tactics,
+            "tactics_total": len(TACTIC_ORDER),
+            "techniques_matched": len(technique_hits),
+            "total_evidence": total_technique_hits,
+        },
+        "total_rows": len(rows),
+    }
+
+
+async def build_knowledge_graph(
+    db: AsyncSession,
+    dataset_id: str | None = None,
+    hunt_id: str | None = None,
+) -> dict[str, Any]:
+    """
+    Build a knowledge graph connecting entities (hosts, users, processes, IPs)
+    to MITRE techniques and tactics.
+    Returns Cytoscape-compatible nodes + edges.
+    """
+    rows = await _fetch_rows(db, dataset_id, hunt_id)
+    if not rows:
+        return {"nodes": [], "edges": [], "stats": {}}
+
+    # Extract entities
+    entities: dict[str, set[str]] = defaultdict(set)  # type -> set of values
+    row_entity_map: list[list[tuple[str, str]]] = []  # per-row list of (type, value)
+
+    # Field name patterns for entity extraction
+    HOST_FIELDS = re.compile(r"hostname|computer|host|machine", re.I)
+    USER_FIELDS = re.compile(r"user|account|logon.*name|subject.*name", re.I)
+    IP_FIELDS = re.compile(r"src.*ip|dst.*ip|ip.*addr|source.*ip|dest.*ip|remote.*addr", re.I)
+    PROC_FIELDS = re.compile(r"process.*name|image|parent.*image|executable|command", re.I)
+
+    for row in rows:
+        row_ents: list[tuple[str, str]] = []
+        for k, v in row.items():
+            if not v or str(v).strip() in ('', '-', 'N/A', 'None'):
+                continue
+            val = str(v).strip()
+            if HOST_FIELDS.search(k):
+                entities["host"].add(val)
+                row_ents.append(("host", val))
+            elif USER_FIELDS.search(k):
+                entities["user"].add(val)
+                row_ents.append(("user", val))
+            elif IP_FIELDS.search(k):
+                entities["ip"].add(val)
+                row_ents.append(("ip", val))
+            elif PROC_FIELDS.search(k):
+                # Clean process name
+                pname = val.split("\\")[-1].split("/")[-1][:60]
+                entities["process"].add(pname)
+                row_ents.append(("process", pname))
+        row_entity_map.append(row_ents)
+
+    # Map rows to techniques
+    row_texts = [" ".join(str(v).lower() for v in row.values() if v) for row in rows]
+    row_techniques: list[set[str]] = [set() for _ in rows]
+    tech_meta: dict[str, tuple[str, str]] = {}
+
+    for tech_id, tech_name, tactic, patterns in TECHNIQUE_PATTERNS:
+        compiled = [re.compile(p, re.I) for p in patterns]
+        tech_meta[tech_id] = (tech_name, tactic)
+        for i, text in enumerate(row_texts):
+            for pat in compiled:
+                if pat.search(text):
+                    row_techniques[i].add(tech_id)
+                    break
+
+    # Build graph
+    nodes: list[dict] = []
+    edges: list[dict] = []
+    node_ids: set[str] = set()
+    edge_counter: Counter = Counter()
+
+    # Entity nodes
+    TYPE_COLORS = {
+        "host": "#3b82f6",
+        "user": "#10b981",
+        "ip": "#8b5cf6",
+        "process": "#f59e0b",
+        "technique": "#ef4444",
+        "tactic": "#6366f1",
+    }
+    TYPE_SHAPES = {
+        "host": "roundrectangle",
+        "user": "ellipse",
+        "ip": "diamond",
+        "process": "hexagon",
+        "technique": "tag",
+        "tactic": "round-rectangle",
+    }
+
+    for ent_type, values in entities.items():
+        for val in list(values)[:50]:  # limit nodes
+            nid = f"{ent_type}:{val}"
+            if nid not in node_ids:
+                node_ids.add(nid)
+                nodes.append({
+                    "data": {
+                        "id": nid,
+                        "label": val[:40],
+                        "type": ent_type,
+                        "color": TYPE_COLORS.get(ent_type, "#666"),
+                        "shape": TYPE_SHAPES.get(ent_type, "ellipse"),
+                    },
+                })
+
+    # Technique nodes
+    seen_techniques: set[str] = set()
+    for tech_set in row_techniques:
+        seen_techniques.update(tech_set)
+
+    for tech_id in seen_techniques:
+        name, tactic = tech_meta.get(tech_id, (tech_id, "unknown"))
+        nid = f"technique:{tech_id}"
+        if nid not in node_ids:
+            node_ids.add(nid)
+            nodes.append({
+                "data": {
+                    "id": nid,
+                    "label": f"{tech_id}\n{name}",
+                    "type": "technique",
+                    "color": TYPE_COLORS["technique"],
+                    "shape": TYPE_SHAPES["technique"],
+                    "tactic": tactic,
+                },
+            })
+
+    # Edges: entity → technique (based on co-occurrence in rows)
+    for i, row_ents in enumerate(row_entity_map):
+        for ent_type, ent_val in row_ents:
+            for tech_id in row_techniques[i]:
+                src = f"{ent_type}:{ent_val}"
+                tgt = f"technique:{tech_id}"
+                if src in node_ids and tgt in node_ids:
+                    edge_key = (src, tgt)
+                    edge_counter[edge_key] += 1
+
+    # Edges: entity → entity (based on co-occurrence)
+    for row_ents in row_entity_map:
+        for j in range(len(row_ents)):
+            for k in range(j + 1, len(row_ents)):
+                src = f"{row_ents[j][0]}:{row_ents[j][1]}"
+                tgt = f"{row_ents[k][0]}:{row_ents[k][1]}"
+                if src in node_ids and tgt in node_ids and src != tgt:
+                    edge_counter[(src, tgt)] += 1
+
+    # Build edge list (filter low-weight edges)
+    for (src, tgt), weight in edge_counter.most_common(500):
+        if weight < 1:
+            continue
+        edges.append({
+            "data": {
+                "source": src,
+                "target": tgt,
+                "weight": weight,
+                "label": str(weight) if weight > 2 else "",
+            },
+        })
+
+    return {
+        "nodes": nodes,
+        "edges": edges,
+        "stats": {
+            "total_nodes": len(nodes),
+            "total_edges": len(edges),
+            "entity_counts": {t: len(v) for t, v in entities.items()},
+            "techniques_found": len(seen_techniques),
+        },
+    }