"""Pluggable Analyzer Framework for ThreatHunt. Each analyzer implements a simple protocol: - name / description properties - async analyze(rows, config) -> list[AlertCandidate] The AnalyzerRegistry discovers and runs all enabled analyzers against a dataset, producing alert candidates that the alert system can persist. """ from __future__ import annotations import logging import math import re from abc import ABC, abstractmethod from collections import Counter, defaultdict from dataclasses import dataclass, field from typing import Any, Optional, Sequence logger = logging.getLogger(__name__) # ── Alert Candidate DTO ────────────────────────────────────────────── @dataclass class AlertCandidate: """A single finding from an analyzer, before it becomes a persisted Alert.""" analyzer: str title: str severity: str # critical | high | medium | low | info description: str evidence: list[dict] = field(default_factory=list) # [{row_index, field, value, ...}] mitre_technique: Optional[str] = None tags: list[str] = field(default_factory=list) score: float = 0.0 # 0-100 # ── Base Analyzer ──────────────────────────────────────────────────── class BaseAnalyzer(ABC): """Interface every analyzer must implement.""" @property @abstractmethod def name(self) -> str: ... @property @abstractmethod def description(self) -> str: ... @abstractmethod async def analyze( self, rows: list[dict[str, Any]], config: dict[str, Any] | None = None ) -> list[AlertCandidate]: ... # ── Built-in Analyzers ────────────────────────────────────────────── class EntropyAnalyzer(BaseAnalyzer): """Detects high-entropy strings (encoded payloads, obfuscated commands).""" name = "entropy" description = "Flags fields with high Shannon entropy (possible encoding/obfuscation)" ENTROPY_FIELDS = [ "command_line", "commandline", "process_command_line", "cmdline", "powershell_command", "script_block", "url", "uri", "path", "file_path", "target_filename", "query", "dns_query", ] DEFAULT_THRESHOLD = 4.5 @staticmethod def _shannon(s: str) -> float: if not s or len(s) < 8: return 0.0 freq = Counter(s) length = len(s) return -sum((c / length) * math.log2(c / length) for c in freq.values()) async def analyze(self, rows, config=None): config = config or {} threshold = config.get("entropy_threshold", self.DEFAULT_THRESHOLD) min_length = config.get("min_length", 20) alerts: list[AlertCandidate] = [] for idx, row in enumerate(rows): for field_name in self.ENTROPY_FIELDS: val = str(row.get(field_name, "")) if len(val) < min_length: continue ent = self._shannon(val) if ent >= threshold: sev = "critical" if ent > 5.5 else "high" if ent > 5.0 else "medium" alerts.append(AlertCandidate( analyzer=self.name, title=f"High-entropy string in {field_name}", severity=sev, description=f"Shannon entropy {ent:.2f} (threshold {threshold}) in row {idx}, field '{field_name}'", evidence=[{"row_index": idx, "field": field_name, "value": val[:200], "entropy": round(ent, 3)}], mitre_technique="T1027", # Obfuscated Files or Information tags=["obfuscation", "entropy"], score=min(100, ent * 18), )) return alerts class SuspiciousCommandAnalyzer(BaseAnalyzer): """Detects known-bad command patterns (credential dumping, lateral movement, persistence).""" name = "suspicious_commands" description = "Flags processes executing known-suspicious command patterns" PATTERNS: list[tuple[str, str, str, str]] = [ # (regex, title, severity, mitre_technique) (r"mimikatz|sekurlsa|lsadump|kerberos::list", "Mimikatz / Credential Dumping", "critical", "T1003"), (r"(?i)-enc\s+[A-Za-z0-9+/=]{40,}", "Encoded PowerShell command", "high", "T1059.001"), (r"(?i)invoke-(mimikatz|expression|webrequest|shellcode)", "Suspicious PowerShell Invoke", "high", "T1059.001"), (r"(?i)net\s+(user|localgroup|group)\s+/add", "Local account creation", "high", "T1136.001"), (r"(?i)schtasks\s+/create", "Scheduled task creation", "medium", "T1053.005"), (r"(?i)reg\s+add\s+.*\\run", "Registry Run key persistence", "high", "T1547.001"), (r"(?i)wmic\s+.*(process\s+call|shadowcopy\s+delete)", "WMI abuse / shadow copy deletion", "critical", "T1047"), (r"(?i)psexec|winrm|wmic\s+/node:", "Lateral movement tool", "high", "T1021"), (r"(?i)certutil\s+-urlcache", "Certutil download (LOLBin)", "high", "T1105"), (r"(?i)bitsadmin\s+/transfer", "BITSAdmin download", "medium", "T1197"), (r"(?i)vssadmin\s+delete\s+shadows", "VSS shadow deletion (ransomware)", "critical", "T1490"), (r"(?i)bcdedit.*recoveryenabled.*no", "Boot config tamper (ransomware)", "critical", "T1490"), (r"(?i)attrib\s+\+h\s+\+s", "Hidden file attribute set", "low", "T1564.001"), (r"(?i)netsh\s+advfirewall\s+.*disable", "Firewall disabled", "high", "T1562.004"), (r"(?i)whoami\s*/priv", "Privilege enumeration", "medium", "T1033"), (r"(?i)nltest\s+/dclist", "Domain controller enumeration", "medium", "T1018"), (r"(?i)dsquery|ldapsearch|adfind", "Active Directory enumeration", "medium", "T1087.002"), (r"(?i)procdump.*-ma\s+lsass", "LSASS memory dump", "critical", "T1003.001"), (r"(?i)rundll32.*comsvcs.*MiniDump", "LSASS dump via comsvcs", "critical", "T1003.001"), ] CMD_FIELDS = [ "command_line", "commandline", "process_command_line", "cmdline", "parent_command_line", "powershell_command", ] async def analyze(self, rows, config=None): alerts: list[AlertCandidate] = [] compiled = [(re.compile(p, re.IGNORECASE), t, s, m) for p, t, s, m in self.PATTERNS] for idx, row in enumerate(rows): for fld in self.CMD_FIELDS: val = str(row.get(fld, "")) if len(val) < 3: continue for pattern, title, sev, mitre in compiled: if pattern.search(val): alerts.append(AlertCandidate( analyzer=self.name, title=title, severity=sev, description=f"Suspicious command pattern in row {idx}: {val[:200]}", evidence=[{"row_index": idx, "field": fld, "value": val[:300]}], mitre_technique=mitre, tags=["command", "suspicious"], score={"critical": 95, "high": 80, "medium": 60, "low": 30}.get(sev, 50), )) return alerts class NetworkAnomalyAnalyzer(BaseAnalyzer): """Detects anomalous network patterns (beaconing, unusual ports, large transfers).""" name = "network_anomaly" description = "Flags anomalous network behavior (beaconing, unusual ports, large transfers)" SUSPICIOUS_PORTS = {4444, 5555, 6666, 8888, 9999, 1234, 31337, 12345, 54321, 1337} C2_PORTS = {443, 8443, 8080, 4443, 9443} async def analyze(self, rows, config=None): config = config or {} alerts: list[AlertCandidate] = [] # Track destination IP frequency for beaconing detection dst_freq: dict[str, list[int]] = defaultdict(list) port_hits: list[tuple[int, str, int]] = [] for idx, row in enumerate(rows): dst_ip = str(row.get("dst_ip", row.get("destination_ip", row.get("dest_ip", "")))) dst_port = row.get("dst_port", row.get("destination_port", row.get("dest_port", ""))) if dst_ip and dst_ip != "": dst_freq[dst_ip].append(idx) if dst_port: try: port_num = int(dst_port) if port_num in self.SUSPICIOUS_PORTS: port_hits.append((idx, dst_ip, port_num)) except (ValueError, TypeError): pass # Large transfer detection bytes_val = row.get("bytes_sent", row.get("bytes_out", row.get("sent_bytes", 0))) try: if int(bytes_val or 0) > config.get("large_transfer_threshold", 10_000_000): alerts.append(AlertCandidate( analyzer=self.name, title="Large data transfer detected", severity="medium", description=f"Row {idx}: {bytes_val} bytes sent to {dst_ip}", evidence=[{"row_index": idx, "dst_ip": dst_ip, "bytes": str(bytes_val)}], mitre_technique="T1048", tags=["exfiltration", "network"], score=65, )) except (ValueError, TypeError): pass # Beaconing: IPs contacted more than threshold times beacon_thresh = config.get("beacon_threshold", 20) for ip, indices in dst_freq.items(): if len(indices) >= beacon_thresh: alerts.append(AlertCandidate( analyzer=self.name, title=f"Possible beaconing to {ip}", severity="high", description=f"Destination {ip} contacted {len(indices)} times (threshold: {beacon_thresh})", evidence=[{"dst_ip": ip, "contact_count": len(indices), "sample_rows": indices[:10]}], mitre_technique="T1071", tags=["beaconing", "c2", "network"], score=min(95, 50 + len(indices)), )) # Suspicious ports for idx, ip, port in port_hits: alerts.append(AlertCandidate( analyzer=self.name, title=f"Connection on suspicious port {port}", severity="medium", description=f"Row {idx}: connection to {ip}:{port}", evidence=[{"row_index": idx, "dst_ip": ip, "dst_port": port}], mitre_technique="T1571", tags=["suspicious_port", "network"], score=55, )) return alerts class FrequencyAnomalyAnalyzer(BaseAnalyzer): """Detects statistically rare values that may indicate anomalies.""" name = "frequency_anomaly" description = "Flags statistically rare field values (potential anomalies)" FIELDS_TO_CHECK = [ "process_name", "image_name", "parent_process_name", "user", "username", "user_name", "event_type", "action", "status", ] async def analyze(self, rows, config=None): config = config or {} rarity_threshold = config.get("rarity_threshold", 0.01) # <1% occurrence min_rows = config.get("min_rows", 50) alerts: list[AlertCandidate] = [] if len(rows) < min_rows: return alerts for fld in self.FIELDS_TO_CHECK: values = [str(row.get(fld, "")) for row in rows if row.get(fld)] if not values: continue counts = Counter(values) total = len(values) for val, cnt in counts.items(): pct = cnt / total if pct <= rarity_threshold and cnt <= 3: # Find row indices indices = [i for i, r in enumerate(rows) if str(r.get(fld, "")) == val] alerts.append(AlertCandidate( analyzer=self.name, title=f"Rare {fld}: {val[:80]}", severity="low", description=f"'{val}' appears {cnt}/{total} times ({pct:.2%}) in field '{fld}'", evidence=[{"field": fld, "value": val[:200], "count": cnt, "total": total, "rows": indices[:5]}], tags=["anomaly", "rare"], score=max(20, 50 - (pct * 5000)), )) return alerts class AuthAnomalyAnalyzer(BaseAnalyzer): """Detects authentication anomalies (brute force, unusual logon types).""" name = "auth_anomaly" description = "Flags authentication anomalies (failed logins, unusual logon types)" async def analyze(self, rows, config=None): config = config or {} alerts: list[AlertCandidate] = [] # Track failed logins per user failed_by_user: dict[str, list[int]] = defaultdict(list) logon_types: dict[str, list[int]] = defaultdict(list) for idx, row in enumerate(rows): event_type = str(row.get("event_type", row.get("action", ""))).lower() status = str(row.get("status", row.get("result", ""))).lower() user = str(row.get("username", row.get("user", row.get("user_name", "")))) logon_type = str(row.get("logon_type", "")) if "logon" in event_type or "auth" in event_type or "login" in event_type: if "fail" in status or "4625" in str(row.get("event_id", "")): if user: failed_by_user[user].append(idx) if logon_type in ("3", "10"): # Network/RemoteInteractive logon_types[logon_type].append(idx) # Brute force: >5 failed logins for same user brute_thresh = config.get("brute_force_threshold", 5) for user, indices in failed_by_user.items(): if len(indices) >= brute_thresh: alerts.append(AlertCandidate( analyzer=self.name, title=f"Possible brute force: {user}", severity="high", description=f"User '{user}' had {len(indices)} failed logins", evidence=[{"user": user, "failed_count": len(indices), "rows": indices[:10]}], mitre_technique="T1110", tags=["brute_force", "authentication"], score=min(90, 50 + len(indices) * 3), )) # Unusual logon types for ltype, indices in logon_types.items(): label = "Network logon (Type 3)" if ltype == "3" else "Remote Desktop (Type 10)" if len(indices) >= 3: alerts.append(AlertCandidate( analyzer=self.name, title=f"{label} detected", severity="medium" if ltype == "3" else "high", description=f"{len(indices)} {label} events detected", evidence=[{"logon_type": ltype, "count": len(indices), "rows": indices[:10]}], mitre_technique="T1021", tags=["authentication", "lateral_movement"], score=55 if ltype == "3" else 70, )) return alerts class PersistenceAnalyzer(BaseAnalyzer): """Detects persistence mechanisms (registry keys, services, scheduled tasks).""" name = "persistence" description = "Flags persistence mechanism installations" REGISTRY_PATTERNS = [ (r"(?i)\\CurrentVersion\\Run", "Run key persistence", "T1547.001"), (r"(?i)\\Services\\", "Service installation", "T1543.003"), (r"(?i)\\Winlogon\\", "Winlogon persistence", "T1547.004"), (r"(?i)\\Image File Execution Options\\", "IFEO debugger persistence", "T1546.012"), (r"(?i)\\Explorer\\Shell Folders", "Shell folder hijack", "T1547.001"), ] async def analyze(self, rows, config=None): alerts: list[AlertCandidate] = [] compiled = [(re.compile(p), t, m) for p, t, m in self.REGISTRY_PATTERNS] for idx, row in enumerate(rows): # Check registry paths reg_path = str(row.get("registry_key", row.get("target_object", row.get("registry_path", "")))) for pattern, title, mitre in compiled: if pattern.search(reg_path): alerts.append(AlertCandidate( analyzer=self.name, title=title, severity="high", description=f"Row {idx}: {reg_path[:200]}", evidence=[{"row_index": idx, "registry_key": reg_path[:300]}], mitre_technique=mitre, tags=["persistence", "registry"], score=75, )) # Check for service creation events event_type = str(row.get("event_type", "")).lower() if "service" in event_type and "creat" in event_type: svc_name = row.get("service_name", row.get("target_filename", "unknown")) alerts.append(AlertCandidate( analyzer=self.name, title=f"Service created: {svc_name}", severity="medium", description=f"Row {idx}: New service '{svc_name}' created", evidence=[{"row_index": idx, "service_name": str(svc_name)}], mitre_technique="T1543.003", tags=["persistence", "service"], score=60, )) return alerts # ── Analyzer Registry ──────────────────────────────────────────────── _ALL_ANALYZERS: list[BaseAnalyzer] = [ EntropyAnalyzer(), SuspiciousCommandAnalyzer(), NetworkAnomalyAnalyzer(), FrequencyAnomalyAnalyzer(), AuthAnomalyAnalyzer(), PersistenceAnalyzer(), ] def get_available_analyzers() -> list[dict[str, str]]: """Return metadata about all registered analyzers.""" return [{"name": a.name, "description": a.description} for a in _ALL_ANALYZERS] def get_analyzer(name: str) -> BaseAnalyzer | None: """Get an analyzer by name.""" for a in _ALL_ANALYZERS: if a.name == name: return a return None async def run_all_analyzers( rows: list[dict[str, Any]], enabled: list[str] | None = None, config: dict[str, Any] | None = None, ) -> list[AlertCandidate]: """Run all (or selected) analyzers and return combined alert candidates. Args: rows: Flat list of row dicts (normalized_data or data from DatasetRow). enabled: Optional list of analyzer names to run. Runs all if None. config: Optional config overrides passed to each analyzer. Returns: Combined list of AlertCandidate from all analyzers, sorted by score desc. """ config = config or {} results: list[AlertCandidate] = [] for analyzer in _ALL_ANALYZERS: if enabled and analyzer.name not in enabled: continue try: candidates = await analyzer.analyze(rows, config) results.extend(candidates) logger.info("Analyzer %s produced %d alerts", analyzer.name, len(candidates)) except Exception: logger.exception("Analyzer %s failed", analyzer.name) # Sort by score descending results.sort(key=lambda a: a.score, reverse=True) return results