version 0.4.0

2026-03-01 14:00:20 -05:00 · 2026-02-20 14:32:42 -05:00
parent ab8038867a
commit 365cf87c90
76 changed files with 34422 additions and 690 deletions
--- a/backend/app/services/analyzers.py
+++ b/backend/app/services/analyzers.py
@@ -0,0 +1,464 @@
+"""Pluggable Analyzer Framework for ThreatHunt.
+
+Each analyzer implements a simple protocol:
+  - name / description properties
+  - async analyze(rows, config) -> list[AlertCandidate]
+
+The registry at the bottom of this module (``_ALL_ANALYZERS`` together with
+``run_all_analyzers``) runs all enabled analyzers against a dataset,
+producing alert candidates that the alert system can persist.
+"""
+
+from __future__ import annotations
+
+import logging
+import math
+import re
+from abc import ABC, abstractmethod
+from collections import Counter, defaultdict
+from dataclasses import dataclass, field
+from typing import Any, Optional, Sequence
+
+logger = logging.getLogger(__name__)
+
+
+# ── Alert Candidate DTO ──────────────────────────────────────────────
+
+
+@dataclass
+class AlertCandidate:
+    """A single finding from an analyzer, before it becomes a persisted Alert.
+
+    Only the first four fields are required. The remaining fields default to
+    empty/neutral values; ``evidence`` and ``tags`` get a fresh list per
+    instance.
+    """
+    # Identifier of the producing analyzer (analyzers in this module pass their ``name``).
+    analyzer: str
+    # Short headline for the finding.
+    title: str
+    severity: str  # critical | high | medium | low | info
+    # Longer human-readable explanation of the finding.
+    description: str
+    evidence: list[dict] = field(default_factory=list)  # [{row_index, field, value, ...}]
+    # Technique identifier such as "T1027" (presumably MITRE ATT&CK); None when not set.
+    mitre_technique: Optional[str] = None
+    # Free-form labels attached to the finding.
+    tags: list[str] = field(default_factory=list)
+    score: float = 0.0  # 0-100
+
+
+# ── Base Analyzer ────────────────────────────────────────────────────
+
+
+class BaseAnalyzer(ABC):
+    """Abstract contract shared by every analyzer.
+
+    Concrete analyzers supply ``name`` and ``description`` and implement the
+    ``analyze`` coroutine.
+    """
+
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Identifier of the analyzer."""
+
+    @property
+    @abstractmethod
+    def description(self) -> str:
+        """Human-readable summary of what the analyzer flags."""
+
+    @abstractmethod
+    async def analyze(
+        self,
+        rows: list[dict[str, Any]],
+        config: dict[str, Any] | None = None,
+    ) -> list[AlertCandidate]:
+        """Inspect *rows* and return the alert candidates found.
+
+        Args:
+            rows: Row dicts to inspect.
+            config: Optional per-run overrides; ``None`` means defaults.
+
+        Returns:
+            The alert candidates produced for *rows*.
+        """
+
+
+# ── Built-in Analyzers ──────────────────────────────────────────────
+
+
+class EntropyAnalyzer(BaseAnalyzer):
+    """Flag long field values whose character distribution has high entropy.
+
+    Such values may be encoded payloads or obfuscated commands.
+    """
+
+    name = "entropy"
+    description = "Flags fields with high Shannon entropy (possible encoding/obfuscation)"
+
+    # Row keys inspected; for a given row, alerts follow this order.
+    ENTROPY_FIELDS = [
+        "command_line", "commandline", "process_command_line", "cmdline",
+        "powershell_command", "script_block", "url", "uri", "path",
+        "file_path", "target_filename", "query", "dns_query",
+    ]
+    # Entropy (bits per character) at or above which a value is reported.
+    DEFAULT_THRESHOLD = 4.5
+
+    @staticmethod
+    def _shannon(s: str) -> float:
+        """Return the Shannon entropy of *s* in bits per character.
+
+        Falsy input and strings shorter than eight characters yield ``0.0``.
+        """
+        if not s:
+            return 0.0
+        length = len(s)
+        if length < 8:
+            return 0.0
+        probabilities = (count / length for count in Counter(s).values())
+        return -sum(p * math.log2(p) for p in probabilities)
+
+    async def analyze(self, rows, config=None):
+        """Return one alert per (row, field) whose entropy reaches the threshold.
+
+        Recognized ``config`` keys:
+            entropy_threshold: minimum entropy to report (default
+                ``DEFAULT_THRESHOLD``).
+            min_length: values shorter than this are skipped (default 20).
+        """
+        settings = config or {}
+        threshold = settings.get("entropy_threshold", self.DEFAULT_THRESHOLD)
+        min_length = settings.get("min_length", 20)
+
+        findings: list[AlertCandidate] = []
+        for row_index, row in enumerate(rows):
+            for field_name in self.ENTROPY_FIELDS:
+                text = str(row.get(field_name, ""))
+                if len(text) < min_length:
+                    continue
+                entropy = self._shannon(text)
+                if not entropy >= threshold:
+                    continue
+
+                # Severity ladder: >5.5 critical, >5.0 high, otherwise medium.
+                if entropy > 5.5:
+                    severity = "critical"
+                elif entropy > 5.0:
+                    severity = "high"
+                else:
+                    severity = "medium"
+
+                evidence_item = {
+                    "row_index": row_index,
+                    "field": field_name,
+                    "value": text[:200],
+                    "entropy": round(entropy, 3),
+                }
+                findings.append(AlertCandidate(
+                    analyzer=self.name,
+                    title=f"High-entropy string in {field_name}",
+                    severity=severity,
+                    description=f"Shannon entropy {entropy:.2f} (threshold {threshold}) in row {row_index}, field '{field_name}'",
+                    evidence=[evidence_item],
+                    mitre_technique="T1027",  # Obfuscated Files or Information
+                    tags=["obfuscation", "entropy"],
+                    score=min(100, entropy * 18),
+                ))
+        return findings
+
+
+class SuspiciousCommandAnalyzer(BaseAnalyzer):
+    """Match command-line fields against a table of known-suspicious patterns.
+
+    The table covers credential dumping, lateral movement, persistence and
+    similar activity.
+    """
+
+    name = "suspicious_commands"
+    description = "Flags processes executing known-suspicious command patterns"
+
+    PATTERNS: list[tuple[str, str, str, str]] = [
+        # (regex, title, severity, mitre_technique)
+        (r"mimikatz|sekurlsa|lsadump|kerberos::list", "Mimikatz / Credential Dumping", "critical", "T1003"),
+        (r"(?i)-enc\s+[A-Za-z0-9+/=]{40,}", "Encoded PowerShell command", "high", "T1059.001"),
+        (r"(?i)invoke-(mimikatz|expression|webrequest|shellcode)", "Suspicious PowerShell Invoke", "high", "T1059.001"),
+        (r"(?i)net\s+(user|localgroup|group)\s+/add", "Local account creation", "high", "T1136.001"),
+        (r"(?i)schtasks\s+/create", "Scheduled task creation", "medium", "T1053.005"),
+        (r"(?i)reg\s+add\s+.*\\run", "Registry Run key persistence", "high", "T1547.001"),
+        (r"(?i)wmic\s+.*(process\s+call|shadowcopy\s+delete)", "WMI abuse / shadow copy deletion", "critical", "T1047"),
+        (r"(?i)psexec|winrm|wmic\s+/node:", "Lateral movement tool", "high", "T1021"),
+        (r"(?i)certutil\s+-urlcache", "Certutil download (LOLBin)", "high", "T1105"),
+        (r"(?i)bitsadmin\s+/transfer", "BITSAdmin download", "medium", "T1197"),
+        (r"(?i)vssadmin\s+delete\s+shadows", "VSS shadow deletion (ransomware)", "critical", "T1490"),
+        (r"(?i)bcdedit.*recoveryenabled.*no", "Boot config tamper (ransomware)", "critical", "T1490"),
+        (r"(?i)attrib\s+\+h\s+\+s", "Hidden file attribute set", "low", "T1564.001"),
+        (r"(?i)netsh\s+advfirewall\s+.*disable", "Firewall disabled", "high", "T1562.004"),
+        (r"(?i)whoami\s*/priv", "Privilege enumeration", "medium", "T1033"),
+        (r"(?i)nltest\s+/dclist", "Domain controller enumeration", "medium", "T1018"),
+        (r"(?i)dsquery|ldapsearch|adfind", "Active Directory enumeration", "medium", "T1087.002"),
+        (r"(?i)procdump.*-ma\s+lsass", "LSASS memory dump", "critical", "T1003.001"),
+        (r"(?i)rundll32.*comsvcs.*MiniDump", "LSASS dump via comsvcs", "critical", "T1003.001"),
+    ]
+
+    # Row keys whose values are scanned for the patterns above.
+    CMD_FIELDS = [
+        "command_line", "commandline", "process_command_line", "cmdline",
+        "parent_command_line", "powershell_command",
+    ]
+
+    async def analyze(self, rows, config=None):
+        """Return one alert for every (row, field, pattern) match.
+
+        ``config`` is accepted for interface compatibility and is not read.
+        Values shorter than three characters are skipped. Matching is
+        case-insensitive for every pattern.
+        """
+        severity_scores = {"critical": 95, "high": 80, "medium": 60, "low": 30}
+        rules = [
+            (re.compile(expression, re.IGNORECASE), title, severity, technique)
+            for expression, title, severity, technique in self.PATTERNS
+        ]
+
+        findings: list[AlertCandidate] = []
+        for row_index, row in enumerate(rows):
+            for column in self.CMD_FIELDS:
+                command = str(row.get(column, ""))
+                if len(command) < 3:
+                    continue
+                findings.extend(
+                    AlertCandidate(
+                        analyzer=self.name,
+                        title=title,
+                        severity=severity,
+                        description=f"Suspicious command pattern in row {row_index}: {command[:200]}",
+                        evidence=[{"row_index": row_index, "field": column, "value": command[:300]}],
+                        mitre_technique=technique,
+                        tags=["command", "suspicious"],
+                        score=severity_scores.get(severity, 50),
+                    )
+                    for regex, title, severity, technique in rules
+                    if regex.search(command)
+                )
+        return findings
+
+
+class NetworkAnomalyAnalyzer(BaseAnalyzer):
+    """Detects anomalous network patterns (beaconing, unusual ports, large transfers)."""
+
+    name = "network_anomaly"
+    description = "Flags anomalous network behavior (beaconing, unusual ports, large transfers)"
+
+    SUSPICIOUS_PORTS = {4444, 5555, 6666, 8888, 9999, 1234, 31337, 12345, 54321, 1337}
+    # NOTE(review): not consulted by analyze(); kept so external references keep working.
+    C2_PORTS = {443, 8443, 8080, 4443, 9443}
+
+    # Column aliases, tried in order; the first non-null, non-empty value wins.
+    _DST_IP_KEYS = ("dst_ip", "destination_ip", "dest_ip")
+    _DST_PORT_KEYS = ("dst_port", "destination_port", "dest_port")
+    _BYTES_KEYS = ("bytes_sent", "bytes_out", "sent_bytes")
+
+    @staticmethod
+    def _first_present(row, keys, default=""):
+        """Return the first value under *keys* that is neither None nor "" (else *default*)."""
+        for key in keys:
+            value = row.get(key)
+            if value is not None and value != "":
+                return value
+        return default
+
+    async def analyze(self, rows, config=None):
+        """Scan *rows* for large transfers, repeated destinations and odd ports.
+
+        Recognized ``config`` keys:
+            large_transfer_threshold: byte count above which a row is
+                reported (default 10_000_000).
+            beacon_threshold: number of rows to the same destination that
+                triggers a beaconing alert (default 20).
+
+        Null or empty cells are treated as missing, so they never become a
+        literal ``"None"`` destination and alias columns are still consulted.
+
+        Returns:
+            Large-transfer alerts in row order, then beaconing alerts, then
+            suspicious-port alerts.
+        """
+        config = config or {}
+        large_transfer_threshold = config.get("large_transfer_threshold", 10_000_000)
+        beacon_thresh = config.get("beacon_threshold", 20)
+        alerts: list[AlertCandidate] = []
+
+        # Track destination IP frequency for beaconing detection
+        dst_freq: dict[str, list[int]] = defaultdict(list)
+        port_hits: list[tuple[int, str, int]] = []
+
+        for idx, row in enumerate(rows):
+            dst_ip = str(self._first_present(row, self._DST_IP_KEYS))
+            dst_port = self._first_present(row, self._DST_PORT_KEYS)
+
+            if dst_ip:
+                dst_freq[dst_ip].append(idx)
+
+            if dst_port:
+                try:
+                    port_num = int(dst_port)
+                except (ValueError, TypeError, OverflowError):
+                    port_num = None  # unparseable port: best-effort, skip it
+                if port_num in self.SUSPICIOUS_PORTS:
+                    port_hits.append((idx, dst_ip, port_num))
+
+            # Large transfer detection
+            bytes_val = self._first_present(row, self._BYTES_KEYS, 0)
+            try:
+                is_large = int(bytes_val or 0) > large_transfer_threshold
+            except (ValueError, TypeError, OverflowError):
+                # Non-numeric / infinite byte counts are ignored rather than
+                # aborting the whole analyzer run.
+                is_large = False
+            if is_large:
+                alerts.append(AlertCandidate(
+                    analyzer=self.name,
+                    title="Large data transfer detected",
+                    severity="medium",
+                    description=f"Row {idx}: {bytes_val} bytes sent to {dst_ip}",
+                    evidence=[{"row_index": idx, "dst_ip": dst_ip, "bytes": str(bytes_val)}],
+                    mitre_technique="T1048",
+                    tags=["exfiltration", "network"],
+                    score=65,
+                ))
+
+        # Beaconing: IPs contacted at least beacon_thresh times
+        for ip, indices in dst_freq.items():
+            if len(indices) >= beacon_thresh:
+                alerts.append(AlertCandidate(
+                    analyzer=self.name,
+                    title=f"Possible beaconing to {ip}",
+                    severity="high",
+                    description=f"Destination {ip} contacted {len(indices)} times (threshold: {beacon_thresh})",
+                    evidence=[{"dst_ip": ip, "contact_count": len(indices), "sample_rows": indices[:10]}],
+                    mitre_technique="T1071",
+                    tags=["beaconing", "c2", "network"],
+                    score=min(95, 50 + len(indices)),
+                ))
+
+        # Suspicious ports
+        for idx, ip, port in port_hits:
+            alerts.append(AlertCandidate(
+                analyzer=self.name,
+                title=f"Connection on suspicious port {port}",
+                severity="medium",
+                description=f"Row {idx}: connection to {ip}:{port}",
+                evidence=[{"row_index": idx, "dst_ip": ip, "dst_port": port}],
+                mitre_technique="T1571",
+                tags=["suspicious_port", "network"],
+                score=55,
+            ))
+
+        return alerts
+
+
+class FrequencyAnomalyAnalyzer(BaseAnalyzer):
+    """Detects statistically rare values that may indicate anomalies."""
+
+    name = "frequency_anomaly"
+    description = "Flags statistically rare field values (potential anomalies)"
+
+    FIELDS_TO_CHECK = [
+        "process_name", "image_name", "parent_process_name",
+        "user", "username", "user_name",
+        "event_type", "action", "status",
+    ]
+
+    async def analyze(self, rows, config=None):
+        """Report values that occur very rarely within a checked field.
+
+        Recognized ``config`` keys:
+            rarity_threshold: maximum share of the populated rows a value may
+                have to be reported (default 0.01).
+            min_rows: datasets smaller than this produce no alerts
+                (default 50).
+
+        A value is reported when its share is at or below the threshold and
+        it occurs at most three times. Falsy cells are ignored. The reported
+        ``rows`` are exactly the rows that were counted.
+        """
+        config = config or {}
+        rarity_threshold = config.get("rarity_threshold", 0.01)  # <=1% occurrence
+        min_rows = config.get("min_rows", 50)
+        alerts: list[AlertCandidate] = []
+
+        if len(rows) < min_rows:
+            return alerts
+
+        for fld in self.FIELDS_TO_CHECK:
+            # One pass per field: value -> row indices. This replaces a full
+            # rescan of ``rows`` for every rare value.
+            rows_by_value: dict[str, list[int]] = defaultdict(list)
+            for idx, row in enumerate(rows):
+                raw = row.get(fld)
+                if raw:
+                    rows_by_value[str(raw)].append(idx)
+
+            total = sum(len(indices) for indices in rows_by_value.values())
+            if not total:
+                continue
+
+            for val, indices in rows_by_value.items():
+                cnt = len(indices)
+                pct = cnt / total
+                if pct <= rarity_threshold and cnt <= 3:
+                    alerts.append(AlertCandidate(
+                        analyzer=self.name,
+                        title=f"Rare {fld}: {val[:80]}",
+                        severity="low",
+                        description=f"'{val}' appears {cnt}/{total} times ({pct:.2%}) in field '{fld}'",
+                        evidence=[{"field": fld, "value": val[:200], "count": cnt, "total": total, "rows": indices[:5]}],
+                        tags=["anomaly", "rare"],
+                        score=max(20, 50 - (pct * 5000)),
+                    ))
+
+        return alerts
+
+
+class AuthAnomalyAnalyzer(BaseAnalyzer):
+    """Detects authentication anomalies (brute force, unusual logon types)."""
+
+    name = "auth_anomaly"
+    description = "Flags authentication anomalies (failed logins, unusual logon types)"
+
+    # Column aliases, tried in order; the first non-null, non-empty value wins.
+    _EVENT_TYPE_KEYS = ("event_type", "action")
+    _STATUS_KEYS = ("status", "result")
+    _USER_KEYS = ("username", "user", "user_name")
+
+    @staticmethod
+    def _first_present(row, keys, default=""):
+        """Return the first value under *keys* that is neither None nor "" (else *default*)."""
+        for key in keys:
+            value = row.get(key)
+            if value is not None and value != "":
+                return value
+        return default
+
+    @staticmethod
+    def _is_failed_logon_event_id(value) -> bool:
+        """Return True when *value* is event ID 4625 (int, float or numeric string).
+
+        Compared numerically, so IDs that merely contain the digits (14625,
+        46250) no longer match.
+        """
+        try:
+            return float(str(value).strip()) == 4625
+        except ValueError:
+            return False
+
+    async def analyze(self, rows, config=None):
+        """Report repeated failed logins per user and clusters of logon types 3/10.
+
+        Recognized ``config`` keys:
+            brute_force_threshold: number of failed logins for one user that
+                triggers an alert (default 5).
+
+        Only rows whose event type mentions ``logon``, ``auth`` or ``login``
+        are considered. Null or empty cells are treated as missing, so they
+        never become a literal ``"None"`` user and alias columns are still
+        consulted.
+        """
+        config = config or {}
+        alerts: list[AlertCandidate] = []
+
+        # Track failed logins per user
+        failed_by_user: dict[str, list[int]] = defaultdict(list)
+        logon_types: dict[str, list[int]] = defaultdict(list)
+
+        for idx, row in enumerate(rows):
+            event_type = str(self._first_present(row, self._EVENT_TYPE_KEYS)).lower()
+            if not ("logon" in event_type or "auth" in event_type or "login" in event_type):
+                continue
+
+            status = str(self._first_present(row, self._STATUS_KEYS)).lower()
+            user = str(self._first_present(row, self._USER_KEYS))
+            logon_type = str(row.get("logon_type", ""))
+
+            failed = "fail" in status or self._is_failed_logon_event_id(row.get("event_id"))
+            if failed and user:
+                failed_by_user[user].append(idx)
+
+            if logon_type in ("3", "10"):  # Network/RemoteInteractive
+                logon_types[logon_type].append(idx)
+
+        # Brute force: at least brute_thresh failed logins for the same user
+        brute_thresh = config.get("brute_force_threshold", 5)
+        for user, indices in failed_by_user.items():
+            if len(indices) >= brute_thresh:
+                alerts.append(AlertCandidate(
+                    analyzer=self.name,
+                    title=f"Possible brute force: {user}",
+                    severity="high",
+                    description=f"User '{user}' had {len(indices)} failed logins",
+                    evidence=[{"user": user, "failed_count": len(indices), "rows": indices[:10]}],
+                    mitre_technique="T1110",
+                    tags=["brute_force", "authentication"],
+                    score=min(90, 50 + len(indices) * 3),
+                ))
+
+        # Unusual logon types (reported once three or more rows share a type)
+        for ltype, indices in logon_types.items():
+            label = "Network logon (Type 3)" if ltype == "3" else "Remote Desktop (Type 10)"
+            if len(indices) >= 3:
+                alerts.append(AlertCandidate(
+                    analyzer=self.name,
+                    title=f"{label} detected",
+                    severity="medium" if ltype == "3" else "high",
+                    description=f"{len(indices)} {label} events detected",
+                    evidence=[{"logon_type": ltype, "count": len(indices), "rows": indices[:10]}],
+                    mitre_technique="T1021",
+                    tags=["authentication", "lateral_movement"],
+                    score=55 if ltype == "3" else 70,
+                ))
+
+        return alerts
+
+
+class PersistenceAnalyzer(BaseAnalyzer):
+    """Flag registry paths and service-creation events associated with persistence."""
+
+    name = "persistence"
+    description = "Flags persistence mechanism installations"
+
+    # (regex, alert title, technique id) matched against the row's registry path.
+    REGISTRY_PATTERNS = [
+        (r"(?i)\\CurrentVersion\\Run", "Run key persistence", "T1547.001"),
+        (r"(?i)\\Services\\", "Service installation", "T1543.003"),
+        (r"(?i)\\Winlogon\\", "Winlogon persistence", "T1547.004"),
+        (r"(?i)\\Image File Execution Options\\", "IFEO debugger persistence", "T1546.012"),
+        (r"(?i)\\Explorer\\Shell Folders", "Shell folder hijack", "T1547.001"),
+    ]
+
+    async def analyze(self, rows, config=None):
+        """Return registry-pattern and service-creation alerts for *rows*.
+
+        ``config`` is accepted for interface compatibility and is not read.
+        For each row, registry alerts (in pattern order) come before the
+        optional service-creation alert.
+        """
+        rules = [
+            (re.compile(expression), title, technique)
+            for expression, title, technique in self.REGISTRY_PATTERNS
+        ]
+        findings: list[AlertCandidate] = []
+
+        for row_index, row in enumerate(rows):
+            # Registry path: first key present among the three aliases.
+            registry_path = str(
+                row.get("registry_key", row.get("target_object", row.get("registry_path", "")))
+            )
+            findings.extend(
+                AlertCandidate(
+                    analyzer=self.name,
+                    title=title,
+                    severity="high",
+                    description=f"Row {row_index}: {registry_path[:200]}",
+                    evidence=[{"row_index": row_index, "registry_key": registry_path[:300]}],
+                    mitre_technique=technique,
+                    tags=["persistence", "registry"],
+                    score=75,
+                )
+                for regex, title, technique in rules
+                if regex.search(registry_path)
+            )
+
+            # Service creation: event type must mention both "service" and "creat".
+            kind = str(row.get("event_type", "")).lower()
+            if "service" not in kind or "creat" not in kind:
+                continue
+            service = row.get("service_name", row.get("target_filename", "unknown"))
+            findings.append(AlertCandidate(
+                analyzer=self.name,
+                title=f"Service created: {service}",
+                severity="medium",
+                description=f"Row {row_index}: New service '{service}' created",
+                evidence=[{"row_index": row_index, "service_name": str(service)}],
+                mitre_technique="T1543.003",
+                tags=["persistence", "service"],
+                score=60,
+            ))
+
+        return findings
+
+
+# ── Analyzer Registry ────────────────────────────────────────────────
+
+
+# One shared instance per built-in analyzer; list order is the run order.
+_ALL_ANALYZERS: list[BaseAnalyzer] = [
+    analyzer_cls()
+    for analyzer_cls in (
+        EntropyAnalyzer,
+        SuspiciousCommandAnalyzer,
+        NetworkAnomalyAnalyzer,
+        FrequencyAnomalyAnalyzer,
+        AuthAnomalyAnalyzer,
+        PersistenceAnalyzer,
+    )
+]
+
+
+def get_available_analyzers() -> list[dict[str, str]]:
+    """List the name and description of every registered analyzer, in registry order."""
+    catalog: list[dict[str, str]] = []
+    for analyzer in _ALL_ANALYZERS:
+        catalog.append({"name": analyzer.name, "description": analyzer.description})
+    return catalog
+
+
+def get_analyzer(name: str) -> BaseAnalyzer | None:
+    """Return the first registered analyzer whose ``name`` equals *name*, or None."""
+    matches = (candidate for candidate in _ALL_ANALYZERS if candidate.name == name)
+    return next(matches, None)
+
+
+async def run_all_analyzers(
+    rows: list[dict[str, Any]],
+    enabled: list[str] | None = None,
+    config: dict[str, Any] | None = None,
+) -> list[AlertCandidate]:
+    """Execute the registered analyzers over *rows* and merge their findings.
+
+    Args:
+        rows: Flat list of row dicts (normalized_data or data from DatasetRow).
+        enabled: Names of the analyzers to run. ``None`` or an empty list
+            runs every registered analyzer.
+        config: Optional overrides; the same dict is handed to each analyzer.
+
+    Returns:
+        All alert candidates, highest score first. An analyzer that raises is
+        logged and skipped; it does not abort the others.
+    """
+    settings = config or {}
+    collected: list[AlertCandidate] = []
+
+    for analyzer in _ALL_ANALYZERS:
+        selected = not enabled or analyzer.name in enabled
+        if not selected:
+            continue
+        try:
+            found = await analyzer.analyze(rows, settings)
+            collected.extend(found)
+            logger.info("Analyzer %s produced %d alerts", analyzer.name, len(found))
+        except Exception:
+            logger.exception("Analyzer %s failed", analyzer.name)
+
+    # sorted() is stable, so candidates with equal scores keep the order in
+    # which they were produced, exactly as an in-place list.sort would.
+    return sorted(collected, key=lambda candidate: candidate.score, reverse=True)