version 0.4.0

This commit is contained in:
2026-02-20 14:32:42 -05:00
parent ab8038867a
commit 365cf87c90
76 changed files with 34422 additions and 690 deletions

View File

@@ -0,0 +1,484 @@
"""
MITRE ATT&CK mapping service.
Maps dataset events to ATT&CK techniques using pattern-based heuristics.
Uses the enterprise-attack matrix (embedded patterns for offline use).
"""
import logging
import re
from collections import Counter, defaultdict
from typing import Any
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.db.models import Dataset, DatasetRow
logger = logging.getLogger(__name__)
# ── ATT&CK Technique Patterns ────────────────────────────────────────
# Subset of enterprise-attack techniques with detection patterns.
# Each entry: (technique_id, name, tactic, patterns_list)
TECHNIQUE_PATTERNS: list[tuple[str, str, str, list[str]]] = [
# Initial Access
("T1566", "Phishing", "initial-access", [
r"phish", r"\.hta\b", r"\.lnk\b", r"mshta\.exe", r"outlook.*attachment",
]),
("T1190", "Exploit Public-Facing Application", "initial-access", [
r"exploit", r"CVE-\d{4}", r"vulnerability", r"webshell",
]),
# Execution
("T1059.001", "PowerShell", "execution", [
r"powershell", r"pwsh", r"-enc\b", r"-encodedcommand",
r"invoke-expression", r"iex\b", r"bypass\b.*execution",
]),
("T1059.003", "Windows Command Shell", "execution", [
r"cmd\.exe", r"/c\s+", r"command\.com",
]),
("T1059.005", "Visual Basic", "execution", [
r"wscript", r"cscript", r"\.vbs\b", r"\.vbe\b",
]),
("T1047", "Windows Management Instrumentation", "execution", [
r"wmic\b", r"winmgmt", r"wmi\b",
]),
("T1053.005", "Scheduled Task", "execution", [
r"schtasks", r"at\.exe", r"taskschd",
]),
("T1204", "User Execution", "execution", [
r"user.*click", r"open.*attachment", r"macro",
]),
# Persistence
("T1547.001", "Registry Run Keys", "persistence", [
r"CurrentVersion\\Run", r"HKLM\\Software\\Microsoft\\Windows\\CurrentVersion\\Run",
r"reg\s+add.*\\Run",
]),
("T1543.003", "Windows Service", "persistence", [
r"sc\s+create", r"new-service", r"service.*install",
]),
("T1136", "Create Account", "persistence", [
r"net\s+user\s+/add", r"new-localuser", r"useradd",
]),
("T1053.005", "Scheduled Task/Job", "persistence", [
r"schtasks\s+/create", r"crontab",
]),
# Privilege Escalation
("T1548.002", "Bypass User Access Control", "privilege-escalation", [
r"eventvwr", r"fodhelper", r"uac.*bypass", r"computerdefaults",
]),
("T1134", "Access Token Manipulation", "privilege-escalation", [
r"token.*impersonat", r"runas", r"adjusttokenprivileges",
]),
# Defense Evasion
("T1070.001", "Clear Windows Event Logs", "defense-evasion", [
r"wevtutil\s+cl", r"clear-eventlog", r"clearlog",
]),
("T1562.001", "Disable or Modify Tools", "defense-evasion", [
r"tamper.*protection", r"disable.*defender", r"set-mppreference",
r"disable.*firewall",
]),
("T1027", "Obfuscated Files or Information", "defense-evasion", [
r"base64", r"-enc\b", r"certutil.*-decode", r"frombase64",
]),
("T1036", "Masquerading", "defense-evasion", [
r"rename.*\.exe", r"masquerad", r"svchost.*unusual",
]),
("T1055", "Process Injection", "defense-evasion", [
r"inject", r"createremotethread", r"ntcreatethreadex",
r"virtualalloc", r"writeprocessmemory",
]),
# Credential Access
("T1003.001", "LSASS Memory", "credential-access", [
r"mimikatz", r"sekurlsa", r"lsass", r"procdump.*lsass",
]),
("T1003.003", "NTDS", "credential-access", [
r"ntds\.dit", r"vssadmin.*shadow", r"ntdsutil",
]),
("T1110", "Brute Force", "credential-access", [
r"brute.*force", r"failed.*login.*\d{3,}", r"hydra", r"medusa",
]),
("T1558.003", "Kerberoasting", "credential-access", [
r"kerberoast", r"invoke-kerberoast", r"GetUserSPNs",
]),
# Discovery
("T1087", "Account Discovery", "discovery", [
r"net\s+user", r"net\s+localgroup", r"get-aduser",
]),
("T1082", "System Information Discovery", "discovery", [
r"systeminfo", r"hostname", r"ver\b",
]),
("T1083", "File and Directory Discovery", "discovery", [
r"dir\s+/s", r"tree\s+/f", r"get-childitem.*-recurse",
]),
("T1057", "Process Discovery", "discovery", [
r"tasklist", r"get-process", r"ps\s+aux",
]),
("T1018", "Remote System Discovery", "discovery", [
r"net\s+view", r"ping\s+-", r"arp\s+-a", r"nslookup",
]),
("T1016", "System Network Configuration Discovery", "discovery", [
r"ipconfig", r"ifconfig", r"netstat",
]),
# Lateral Movement
("T1021.001", "Remote Desktop Protocol", "lateral-movement", [
r"rdp\b", r"mstsc", r"3389", r"remote\s+desktop",
]),
("T1021.002", "SMB/Windows Admin Shares", "lateral-movement", [
r"\\\\.*\\(c|admin)\$", r"psexec", r"smbclient", r"net\s+use",
]),
("T1021.006", "Windows Remote Management", "lateral-movement", [
r"winrm", r"enter-pssession", r"invoke-command.*-computername",
r"wsman", r"5985|5986",
]),
("T1570", "Lateral Tool Transfer", "lateral-movement", [
r"copy.*\\\\", r"xcopy.*\\\\", r"robocopy",
]),
# Collection
("T1560", "Archive Collected Data", "collection", [
r"compress-archive", r"7z\.exe", r"rar\s+a", r"tar\s+-[cz]",
]),
("T1005", "Data from Local System", "collection", [
r"type\s+.*password", r"findstr.*password", r"select-string.*credential",
]),
# Command and Control
("T1071.001", "Web Protocols", "command-and-control", [
r"http[s]?://\d+\.\d+\.\d+\.\d+", r"curl\b", r"wget\b",
r"invoke-webrequest", r"beacon",
]),
("T1573", "Encrypted Channel", "command-and-control", [
r"ssl\b", r"tls\b", r"encrypted.*tunnel", r"stunnel",
]),
("T1105", "Ingress Tool Transfer", "command-and-control", [
r"certutil.*-urlcache", r"bitsadmin.*transfer",
r"downloadfile", r"invoke-webrequest.*-outfile",
]),
("T1219", "Remote Access Software", "command-and-control", [
r"teamviewer", r"anydesk", r"logmein", r"vnc",
]),
# Exfiltration
("T1048", "Exfiltration Over Alternative Protocol", "exfiltration", [
r"dns.*tunnel", r"exfil", r"icmp.*tunnel",
]),
("T1041", "Exfiltration Over C2 Channel", "exfiltration", [
r"upload.*c2", r"exfil.*http",
]),
("T1567", "Exfiltration Over Web Service", "exfiltration", [
r"mega\.nz", r"dropbox", r"pastebin", r"transfer\.sh",
]),
# Impact
("T1486", "Data Encrypted for Impact", "impact", [
r"ransomware", r"encrypt.*files", r"\.locked\b", r"ransom",
]),
("T1489", "Service Stop", "impact", [
r"sc\s+stop", r"net\s+stop", r"stop-service",
]),
("T1529", "System Shutdown/Reboot", "impact", [
r"shutdown\s+/[rs]", r"restart-computer",
]),
]
# Human-readable name for each tactic ID. The entries are declared in
# kill-chain order, and that declaration order is what TACTIC_ORDER exposes.
TACTIC_NAMES: dict[str, str] = dict(
    [
        ("initial-access", "Initial Access"),
        ("execution", "Execution"),
        ("persistence", "Persistence"),
        ("privilege-escalation", "Privilege Escalation"),
        ("defense-evasion", "Defense Evasion"),
        ("credential-access", "Credential Access"),
        ("discovery", "Discovery"),
        ("lateral-movement", "Lateral Movement"),
        ("collection", "Collection"),
        ("command-and-control", "Command and Control"),
        ("exfiltration", "Exfiltration"),
        ("impact", "Impact"),
    ]
)
# Tactic IDs in kill-chain order (dicts iterate in insertion order).
TACTIC_ORDER: list[str] = list(TACTIC_NAMES)
# ── Row fetcher ───────────────────────────────────────────────────────
async def _fetch_rows(
    db: AsyncSession,
    dataset_id: str | None = None,
    hunt_id: str | None = None,
    limit: int = 5000,
) -> list[dict[str, Any]]:
    """
    Load the ``data`` payload of up to ``limit`` dataset rows.

    The query selects DatasetRow joined to Dataset. A truthy ``dataset_id``
    restricts it to that dataset; otherwise a truthy ``hunt_id`` restricts it
    to datasets of that hunt. With neither given, no filter is applied.
    No ordering is requested, so which rows fall inside the limit is left to
    the database.
    """
    stmt = select(DatasetRow).join(Dataset)
    # dataset_id takes precedence; hunt_id is only consulted without it.
    if dataset_id:
        stmt = stmt.where(DatasetRow.dataset_id == dataset_id)
    elif hunt_id:
        stmt = stmt.where(Dataset.hunt_id == hunt_id)
    outcome = await db.execute(stmt.limit(limit))
    records = outcome.scalars().all()
    return [record.data for record in records]
# ── Main functions ────────────────────────────────────────────────────
async def map_to_attack(
    db: AsyncSession,
    dataset_id: str | None = None,
    hunt_id: str | None = None,
) -> dict[str, Any]:
    """
    Map dataset rows to MITRE ATT&CK techniques.

    Every row fetched by ``_fetch_rows`` is flattened to one lower-cased
    string (all non-None values joined by spaces) and searched with each
    entry of TECHNIQUE_PATTERNS. A row counts at most once per entry; the
    first pattern of the entry that matches is the one recorded.

    Hits are tracked per (technique ID, tactic) pair, so a technique listed
    under several tactics is reported under each of them with its own name
    and count instead of being merged into whichever entry came last.

    Returns a dict with:
      * ``tactics``    - one item per TACTIC_ORDER entry: ``id``, ``name``,
                         ``techniques`` (sorted by descending ``count``; each
                         has ``id``, ``name``, ``count``, up to 5 ``evidence``
                         samples) and ``total_hits``.
      * ``techniques`` - flat list of ``id``/``name``/``tactic``/``count``.
      * ``evidence``   - flat list of the stored samples (max 10 per
                         technique/tactic pair), each tagged with
                         ``technique_id`` and ``tactic``.
      * ``coverage``   - summary counters.
      * ``total_rows`` - number of rows analysed.

    ``count`` is the true number of matching rows; only the stored evidence
    samples are capped. ``row_index`` in a sample is the position of the row
    in the fetched list. When no rows are found the lists are empty and
    ``coverage`` is an empty dict.
    """
    rows = await _fetch_rows(db, dataset_id, hunt_id)
    if not rows:
        return {"tactics": [], "techniques": [], "evidence": [], "coverage": {}, "total_rows": 0}

    max_samples = 10  # evidence samples kept per (technique, tactic)
    max_samples_in_matrix = 5  # samples embedded in the tactic matrix
    max_value_chars = 200  # evidence values are truncated to this length

    # Flatten all values per row for matching
    row_texts = [
        " ".join(str(v).lower() for v in row.values() if v is not None)
        for row in rows
    ]

    hit_counts: Counter = Counter()  # (tech_id, tactic) -> matching rows
    samples: dict[tuple[str, str], list[dict]] = defaultdict(list)
    names: dict[tuple[str, str], str] = {}

    for tech_id, tech_name, tactic, patterns in TECHNIQUE_PATTERNS:
        key = (tech_id, tactic)
        names[key] = tech_name
        compiled = [re.compile(p, re.IGNORECASE) for p in patterns]
        for i, text in enumerate(row_texts):
            # One pattern match per technique per row is enough.
            pat = next((c for c in compiled if c.search(text)), None)
            if pat is None:
                continue
            # Count every hit; the cap below applies to stored samples only.
            hit_counts[key] += 1
            if len(samples[key]) >= max_samples:
                continue
            # Find the field that produced the match. It stays empty when the
            # match only exists across the joined text of several fields.
            matched_field = ""
            matched_value = ""
            for k, v in rows[i].items():
                if v is not None and pat.search(str(v).lower()):
                    matched_field = k
                    matched_value = str(v)[:max_value_chars]
                    break
            samples[key].append({
                "row_index": i,
                "field": matched_field,
                "value": matched_value,
                "pattern": pat.pattern,
            })

    # Build tactic → technique structure
    tactic_techniques: dict[str, list[dict]] = defaultdict(list)
    for (tech_id, tactic), count in hit_counts.items():
        tactic_techniques[tactic].append({
            "id": tech_id,
            "name": names[(tech_id, tactic)],
            "count": count,
            "evidence": samples[(tech_id, tactic)][:max_samples_in_matrix],
        })

    # Build ordered tactics list (tactics without hits are still listed)
    tactics = []
    for tactic_key in TACTIC_ORDER:
        techs = tactic_techniques.get(tactic_key, [])
        tactics.append({
            "id": tactic_key,
            "name": TACTIC_NAMES.get(tactic_key, tactic_key),
            "techniques": sorted(techs, key=lambda t: -t["count"]),
            "total_hits": sum(t["count"] for t in techs),
        })

    # Flat views, so this result carries the same keys as the empty result.
    techniques = [
        {"id": t["id"], "name": t["name"], "tactic": entry["id"], "count": t["count"]}
        for entry in tactics
        for t in entry["techniques"]
    ]
    evidence = [
        {"technique_id": tech_id, "tactic": tactic, **sample}
        for (tech_id, tactic), stored in samples.items()
        for sample in stored
    ]

    # Coverage stats
    covered_tactics = sum(1 for t in tactics if t["total_hits"] > 0)
    total_technique_hits = sum(t["total_hits"] for t in tactics)
    return {
        "tactics": tactics,
        "techniques": techniques,
        "evidence": evidence,
        "coverage": {
            "tactics_covered": covered_tactics,
            "tactics_total": len(TACTIC_ORDER),
            # Distinct technique IDs, regardless of how many tactics list them.
            "techniques_matched": len({tech_id for tech_id, _ in hit_counts}),
            "total_evidence": total_technique_hits,
        },
        "total_rows": len(rows),
    }
async def build_knowledge_graph(
    db: AsyncSession,
    dataset_id: str | None = None,
    hunt_id: str | None = None,
) -> dict[str, Any]:
    """
    Build a knowledge graph connecting entities (hosts, users, processes, IPs)
    to MITRE techniques.

    Entities are taken from row fields whose *name* matches one of the
    field-name heuristics below (first matching kind wins). Techniques are
    detected by searching TECHNIQUE_PATTERNS against the lower-cased, joined
    values of each row. Edges are weighted by the number of rows in which
    both endpoints occur together; an entity is counted once per row even if
    several fields of that row carry the same value.

    At most 50 entities per type become nodes (the ones seen in the most
    rows; ties keep first-seen order) and at most 500 edges are returned
    (heaviest first). Edges touching an entity that did not become a node are
    dropped. If a technique ID appears more than once in TECHNIQUE_PATTERNS,
    the name and tactic of its last entry are used for the node.

    Returns Cytoscape-compatible ``nodes`` + ``edges`` plus a ``stats`` dict;
    all three are empty when no rows are found.
    """
    rows = await _fetch_rows(db, dataset_id, hunt_id)
    if not rows:
        return {"nodes": [], "edges": [], "stats": {}}

    max_nodes_per_type = 50
    max_edges = 500
    max_label_chars = 40
    max_process_chars = 60
    placeholders = {"", "-", "N/A", "None"}  # values treated as "no data"

    # Field name patterns for entity extraction, tried in this order.
    field_kinds = (
        ("host", re.compile(r"hostname|computer|host|machine", re.I)),
        ("user", re.compile(r"user|account|logon.*name|subject.*name", re.I)),
        ("ip", re.compile(r"src.*ip|dst.*ip|ip.*addr|source.*ip|dest.*ip|remote.*addr", re.I)),
        ("process", re.compile(r"process.*name|image|parent.*image|executable|command", re.I)),
    )

    # Extract entities
    entity_freq: dict[str, Counter] = defaultdict(Counter)  # type -> value -> rows
    row_entity_map: list[list[tuple[str, str]]] = []  # per-row list of (type, value)
    for row in rows:
        # dict keys give an ordered, de-duplicated set of (type, value) pairs
        found: dict[tuple[str, str], None] = {}
        for k, v in row.items():
            if not v:
                continue
            val = str(v).strip()
            if val in placeholders:
                continue
            kind = next((name for name, rx in field_kinds if rx.search(k)), None)
            if kind is None:
                continue
            if kind == "process":
                # Clean process name: keep the last path component only
                val = val.split("\\")[-1].split("/")[-1][:max_process_chars]
                if not val:
                    # value ended with a path separator; nothing to show
                    continue
            found[(kind, val)] = None
        row_ents = list(found)
        for kind, val in row_ents:
            entity_freq[kind][val] += 1
        row_entity_map.append(row_ents)

    # Map rows to techniques
    row_texts = [" ".join(str(v).lower() for v in row.values() if v) for row in rows]
    row_techniques: list[set[str]] = [set() for _ in rows]
    tech_meta: dict[str, tuple[str, str]] = {}
    for tech_id, tech_name, tactic, patterns in TECHNIQUE_PATTERNS:
        tech_meta[tech_id] = (tech_name, tactic)
        compiled = [re.compile(p, re.I) for p in patterns]
        for i, text in enumerate(row_texts):
            if any(pat.search(text) for pat in compiled):
                row_techniques[i].add(tech_id)

    # Build graph
    nodes: list[dict] = []
    edges: list[dict] = []
    node_ids: set[str] = set()
    edge_counter: Counter = Counter()
    TYPE_COLORS = {
        "host": "#3b82f6",
        "user": "#10b981",
        "ip": "#8b5cf6",
        "process": "#f59e0b",
        "technique": "#ef4444",
        "tactic": "#6366f1",
    }
    TYPE_SHAPES = {
        "host": "roundrectangle",
        "user": "ellipse",
        "ip": "diamond",
        "process": "hexagon",
        "technique": "tag",
        "tactic": "round-rectangle",
    }

    # Entity nodes: keep the most frequently seen values of each type so the
    # selection is stable between runs instead of following set hash order.
    for ent_type, freq in entity_freq.items():
        for val, _seen_in_rows in freq.most_common(max_nodes_per_type):
            nid = f"{ent_type}:{val}"
            if nid in node_ids:
                continue
            node_ids.add(nid)
            nodes.append({
                "data": {
                    "id": nid,
                    "label": val[:max_label_chars],
                    "type": ent_type,
                    "color": TYPE_COLORS.get(ent_type, "#666"),
                    "shape": TYPE_SHAPES.get(ent_type, "ellipse"),
                },
            })

    # Technique nodes (sorted so the node order is deterministic)
    seen_techniques: set[str] = set().union(*row_techniques)
    for tech_id in sorted(seen_techniques):
        name, tactic = tech_meta.get(tech_id, (tech_id, "unknown"))
        nid = f"technique:{tech_id}"
        if nid in node_ids:
            continue
        node_ids.add(nid)
        nodes.append({
            "data": {
                "id": nid,
                "label": f"{tech_id}\n{name}",
                "type": "technique",
                "color": TYPE_COLORS["technique"],
                "shape": TYPE_SHAPES["technique"],
                "tactic": tactic,
            },
        })

    for i, row_ents in enumerate(row_entity_map):
        # Only entities that actually became nodes can carry edges.
        present = [f"{t}:{v}" for t, v in row_ents if f"{t}:{v}" in node_ids]
        # Edges: entity → technique (based on co-occurrence in rows)
        for src in present:
            for tech_id in row_techniques[i]:
                edge_counter[(src, f"technique:{tech_id}")] += 1
        # Edges: entity → entity (based on co-occurrence); direction follows
        # the order in which the fields appear in the row.
        for j, src in enumerate(present):
            for tgt in present[j + 1:]:
                edge_counter[(src, tgt)] += 1

    # Build edge list: heaviest edges first, capped at max_edges. Every
    # counted edge has weight >= 1, so no further weight filter is needed.
    for (src, tgt), weight in edge_counter.most_common(max_edges):
        edges.append({
            "data": {
                "source": src,
                "target": tgt,
                "weight": weight,
                "label": str(weight) if weight > 2 else "",
            },
        })

    return {
        "nodes": nodes,
        "edges": edges,
        "stats": {
            "total_nodes": len(nodes),
            "total_edges": len(edges),
            "entity_counts": {t: len(freq) for t, freq in entity_freq.items()},
            "techniques_found": len(seen_techniques),
        },
    }