mirror of
https://github.com/mblanke/ThreatHunt.git
synced 2026-03-01 14:00:20 -05:00
- Implemented PlaybookManager for creating and managing investigation playbooks with templates. - Added SavedSearches component for managing bookmarked queries and recurring scans. - Introduced TimelineView for visualizing forensic event timelines with zoomable charts. - Enhanced backend processing with auto-queued jobs for dataset uploads and improved database concurrency. - Updated frontend components for better user experience and performance optimizations. - Documented changes in update log for future reference.
397 lines
13 KiB
Python
397 lines
13 KiB
Python
"""Host Inventory Service - builds a deduplicated host-centric network view.
|
|
|
|
Scans all datasets in a hunt to identify unique hosts, their IPs, OS,
|
|
logged-in users, and network connections between them.
|
|
"""
|
|
|
|
import re
|
|
import logging
|
|
from collections import defaultdict
|
|
from typing import Any
|
|
|
|
from sqlalchemy import select, func
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from app.db.models import Dataset, DatasetRow
|
|
from app.config import settings
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# --- Column-name patterns (Velociraptor + generic forensic tools) ---
# Each pattern must match the *entire* column name, case-insensitively,
# covering the aliases emitted by common forensic collectors/exports.

# Unique agent/endpoint identifier columns (e.g. Velociraptor client_id).
_HOST_ID_RE = re.compile(
    r'^(client_?id|clientid|agent_?id|endpoint_?id|host_?id|sensor_?id)$', re.I)
# Hostname / fully-qualified computer-name columns.
_FQDN_RE = re.compile(
    r'^(fqdn|fully_?qualified|computer_?name|hostname|host_?name|host|'
    r'system_?name|machine_?name|nodename|workstation)$', re.I)
# Columns holding an account / logged-in user name.
_USERNAME_RE = re.compile(
    r'^(user|username|user_?name|logon_?name|account_?name|owner|'
    r'logged_?in_?user|sam_?account_?name|samaccountname)$', re.I)
# Local (source) address columns of a network connection.
_LOCAL_IP_RE = re.compile(
    r'^(laddr\.?ip|laddr|local_?addr(ess)?|src_?ip|source_?ip)$', re.I)
# Remote (destination) address columns of a network connection.
_REMOTE_IP_RE = re.compile(
    r'^(raddr\.?ip|raddr|remote_?addr(ess)?|dst_?ip|dest_?ip)$', re.I)
# Remote (destination) port columns of a network connection.
_REMOTE_PORT_RE = re.compile(
    r'^(raddr\.?port|rport|remote_?port|dst_?port|dest_?port)$', re.I)
# Operating-system description columns.
_OS_RE = re.compile(
    r'^(os|operating_?system|os_?version|os_?name|platform|os_?type|os_?build)$', re.I)
# Loose dotted-quad IPv4 shape check (octet value ranges are NOT validated).
_IP_VALID_RE = re.compile(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$')

# Placeholder / wildcard / loopback values that never identify a real host.
_IGNORE_IPS = frozenset({
    '0.0.0.0', '::', '::1', '127.0.0.1', '', '-', '*', 'None', 'null',
})
# Windows built-in logon domains whose accounts are system noise.
_SYSTEM_DOMAINS = frozenset({
    'NT AUTHORITY', 'NT SERVICE', 'FONT DRIVER HOST', 'WINDOW MANAGER',
})
# Well-known Windows system account names (exact values).
# NOTE(review): this set appears unused in this module — _extract_username
# relies on the pattern-based _SYSTEM_USER_RE instead; confirm before removing.
_SYSTEM_USERS = frozenset({
    'SYSTEM', 'LOCAL SERVICE', 'NETWORK SERVICE',
    'UMFD-0', 'UMFD-1', 'DWM-1', 'DWM-2', 'DWM-3',
})
|
|
|
|
|
|
def _is_valid_ip(v: str) -> bool:
    """Return True for a dotted-quad IPv4 string that is not a known placeholder."""
    if v and v not in _IGNORE_IPS:
        return _IP_VALID_RE.match(v) is not None
    return False
|
|
|
|
|
|
def _clean(v: Any) -> str:
    """Normalise a raw cell value to a stripped string; placeholders become ''."""
    text = str(v or '').strip()
    if not text or text in ('-', 'None', 'null', ''):
        return ''
    return text
|
|
|
|
|
|
# Bare account names that denote Windows system accounts, matched after any
# "DOMAIN\" prefix has been stripped; used by _extract_username below.
_SYSTEM_USER_RE = re.compile(
    r'^(SYSTEM|LOCAL SERVICE|NETWORK SERVICE|DWM-\d+|UMFD-\d+)$', re.I)
|
|
|
|
|
|
def _extract_username(raw: str) -> str:
    """Clean username, stripping domain prefixes and filtering system accounts.

    Strips a leading ``DOMAIN\\`` prefix and returns the bare account name,
    or '' when the value is empty, a well-known system account (SYSTEM,
    LOCAL/NETWORK SERVICE, DWM-*, UMFD-*), or any account under a built-in
    system logon domain (NT AUTHORITY, NT SERVICE, ...).
    """
    if not raw:
        return ''
    name = raw.strip()
    if '\\' in name:
        domain, _, name = name.rpartition('\\')
        name = name.strip()
        # Any account under a built-in system domain is noise for a host
        # inventory, regardless of the account name itself.  (The previous
        # version only rejected names here that *also* matched
        # _SYSTEM_USER_RE, which made this domain check a no-op and let
        # e.g. "NT SERVICE\<service>" accounts through.)
        if domain.strip().upper() in _SYSTEM_DOMAINS:
            return ''
    # Pattern-based filter catches system accounts written without a domain.
    if _SYSTEM_USER_RE.match(name):
        return ''
    return name or ''
|
|
|
|
|
|
|
|
|
|
# In-memory host inventory cache
|
|
# Pre-computed results stored per hunt_id, built in background after upload.
|
|
|
|
import time as _time
|
|
|
|
class _InventoryCache:
|
|
"""Simple in-memory cache for pre-computed host inventories."""
|
|
|
|
def __init__(self):
|
|
self._data: dict[str, dict] = {} # hunt_id -> result dict
|
|
self._timestamps: dict[str, float] = {} # hunt_id -> epoch
|
|
self._building: set[str] = set() # hunt_ids currently being built
|
|
|
|
def get(self, hunt_id: str) -> dict | None:
|
|
"""Return cached result if present. Never expires; only invalidated on new upload."""
|
|
return self._data.get(hunt_id)
|
|
|
|
def put(self, hunt_id: str, result: dict):
|
|
self._data[hunt_id] = result
|
|
self._timestamps[hunt_id] = _time.time()
|
|
self._building.discard(hunt_id)
|
|
logger.info(f"Cached host inventory for hunt {hunt_id} "
|
|
f"({result['stats']['total_hosts']} hosts)")
|
|
|
|
def invalidate(self, hunt_id: str):
|
|
self._data.pop(hunt_id, None)
|
|
self._timestamps.pop(hunt_id, None)
|
|
|
|
def is_building(self, hunt_id: str) -> bool:
|
|
return hunt_id in self._building
|
|
|
|
def set_building(self, hunt_id: str):
|
|
self._building.add(hunt_id)
|
|
|
|
def clear_building(self, hunt_id: str):
|
|
self._building.discard(hunt_id)
|
|
|
|
def status(self, hunt_id: str) -> str:
|
|
if hunt_id in self._building:
|
|
return "building"
|
|
if hunt_id in self._data:
|
|
return "ready"
|
|
return "none"
|
|
|
|
|
|
inventory_cache = _InventoryCache()
|
|
|
|
def _infer_os(fqdn: str) -> str:
    """Guess an OS label from hostname naming conventions; defaults to 'Windows'."""
    upper = fqdn.upper()
    # Checked in order; the first group with a matching substring wins.
    rules = (
        (('W10-', 'WIN10'), 'Windows 10'),
        (('W11-', 'WIN11'), 'Windows 11'),
        (('W7-', 'WIN7'), 'Windows 7'),
        (('SRV', 'SERVER', 'DC-'), 'Windows Server'),
        (('LINUX', 'UBUNTU', 'CENTOS', 'RHEL', 'DEBIAN'), 'Linux'),
        (('MAC', 'DARWIN'), 'macOS'),
    )
    for needles, label in rules:
        if any(needle in upper for needle in needles):
            return label
    return 'Windows'
|
|
|
|
|
|
def _identify_columns(ds: Dataset) -> dict:
    """Map each inventory role to the dataset columns that can supply it.

    Matches raw column names (and their normalized canonical names, when
    available) against the module-level patterns.  Returns a dict with keys
    host_id / fqdn / username / local_ip / remote_ip / remote_port / os,
    each holding a list of matching raw column names (possibly empty).
    """
    norm = ds.normalized_columns or {}
    schema = ds.column_schema or {}
    raw_cols = list(schema or norm)

    result: dict[str, list] = {role: [] for role in (
        'host_id', 'fqdn', 'username',
        'local_ip', 'remote_ip', 'remote_port', 'os',
    )}

    for col in raw_cols:
        canonical = (norm.get(col) or '').lower()
        lower = col.lower()

        # A column normalized to "hostname" whose raw name is NOT a plain
        # hostname alias is treated as a host identifier instead.
        if _HOST_ID_RE.match(lower) or (
                canonical == 'hostname'
                and lower not in ('hostname', 'host_name', 'host')):
            result['host_id'].append(col)

        if canonical == 'fqdn' or _FQDN_RE.match(lower):
            result['fqdn'].append(col)

        if canonical in ('username', 'user') or _USERNAME_RE.match(lower):
            result['username'].append(col)

        # A column is either a local or a remote address, never both.
        if _LOCAL_IP_RE.match(lower):
            result['local_ip'].append(col)
        elif _REMOTE_IP_RE.match(lower):
            result['remote_ip'].append(col)

        if _REMOTE_PORT_RE.match(lower):
            result['remote_port'].append(col)

        if canonical == 'os' or _OS_RE.match(lower):
            result['os'].append(col)

    return result
|
|
|
|
|
|
async def build_host_inventory(hunt_id: str, db: AsyncSession) -> dict:
    """Build a deduplicated host inventory from all datasets in a hunt.

    Pages through every dataset's rows in batches, extracting host identity
    (FQDN / client id), users, IPs, OS, and outbound connections, subject to
    per-dataset and global row budgets from settings.

    Returns dict with 'hosts', 'connections', and 'stats'.
    Each host has: id, hostname, fqdn, client_id, ips, os, users, datasets, row_count.
    """
    ds_result = await db.execute(
        select(Dataset).where(Dataset.hunt_id == hunt_id)
    )
    all_datasets = ds_result.scalars().all()

    # No datasets at all: return an empty, stats-bearing shell.
    if not all_datasets:
        return {"hosts": [], "connections": [], "stats": {
            "total_hosts": 0, "total_datasets_scanned": 0,
            "total_rows_scanned": 0,
        }}

    hosts: dict[str, dict] = {}  # host_key (fqdn or client_id) -> host record
    ip_to_host: dict[str, str] = {}  # local-ip -> host_key, for resolving connection targets
    connections: dict[tuple, int] = defaultdict(int)  # (src_host, dst_ip, dst_port) -> hit count
    total_rows = 0
    ds_with_hosts = 0
    sampled_dataset_count = 0
    # Budgets of 0 (after clamping) disable the corresponding limit.
    total_row_budget = max(0, int(settings.NETWORK_INVENTORY_MAX_TOTAL_ROWS))
    max_connections = max(0, int(settings.NETWORK_INVENTORY_MAX_CONNECTIONS))
    global_budget_reached = False
    dropped_connections = 0

    for ds in all_datasets:
        # Stop scanning further datasets once the global row budget is spent.
        if total_row_budget and total_rows >= total_row_budget:
            global_budget_reached = True
            break

        cols = _identify_columns(ds)
        # Skip datasets that can't identify a host at all.
        if not cols['fqdn'] and not cols['host_id']:
            continue
        ds_with_hosts += 1

        batch_size = 5000
        max_rows_per_dataset = max(0, int(settings.NETWORK_INVENTORY_MAX_ROWS_PER_DATASET))
        rows_scanned_this_dataset = 0
        sampled_dataset = False
        # Keyset pagination cursor: fetch rows with row_index > last_row_index.
        last_row_index = -1

        while True:
            # Re-check the global budget before fetching another batch.
            if total_row_budget and total_rows >= total_row_budget:
                sampled_dataset = True
                global_budget_reached = True
                break

            rr = await db.execute(
                select(DatasetRow)
                .where(DatasetRow.dataset_id == ds.id)
                .where(DatasetRow.row_index > last_row_index)
                .order_by(DatasetRow.row_index)
                .limit(batch_size)
            )
            rows = rr.scalars().all()
            if not rows:
                break

            for ro in rows:
                # Per-dataset sampling cap.
                if max_rows_per_dataset and rows_scanned_this_dataset >= max_rows_per_dataset:
                    sampled_dataset = True
                    break
                # Global cap may trip mid-batch as well.
                if total_row_budget and total_rows >= total_row_budget:
                    sampled_dataset = True
                    global_budget_reached = True
                    break

                data = ro.data or {}
                total_rows += 1
                rows_scanned_this_dataset += 1

                # First non-empty value wins for both identity fields.
                fqdn = ''
                for c in cols['fqdn']:
                    fqdn = _clean(data.get(c))
                    if fqdn:
                        break
                client_id = ''
                for c in cols['host_id']:
                    client_id = _clean(data.get(c))
                    if client_id:
                        break

                # A row with no host identity contributes nothing.
                if not fqdn and not client_id:
                    continue

                # Prefer the FQDN as the dedup key; fall back to client id.
                host_key = fqdn or client_id

                if host_key not in hosts:
                    # Short hostname = first label of a dotted FQDN.
                    short = fqdn.split('.')[0] if fqdn and '.' in fqdn else fqdn
                    hosts[host_key] = {
                        'id': host_key,
                        'hostname': short or client_id,
                        'fqdn': fqdn,
                        'client_id': client_id,
                        'ips': set(),
                        'os': '',
                        'users': set(),
                        'datasets': set(),
                        'row_count': 0,
                    }

                h = hosts[host_key]
                h['datasets'].add(ds.name)
                h['row_count'] += 1
                # Backfill client_id when the host was first seen without one.
                if client_id and not h['client_id']:
                    h['client_id'] = client_id

                # Accumulate non-system usernames.
                for c in cols['username']:
                    u = _extract_username(_clean(data.get(c)))
                    if u:
                        h['users'].add(u)

                # Local IPs also index the host for connection resolution.
                for c in cols['local_ip']:
                    ip = _clean(data.get(c))
                    if _is_valid_ip(ip):
                        h['ips'].add(ip)
                        ip_to_host[ip] = host_key

                # First non-empty OS value wins.
                for c in cols['os']:
                    ov = _clean(data.get(c))
                    if ov and not h['os']:
                        h['os'] = ov

                # Record outbound connections, capped at max_connections
                # distinct (src, dst_ip, dst_port) keys.
                for c in cols['remote_ip']:
                    rip = _clean(data.get(c))
                    if _is_valid_ip(rip):
                        rport = ''
                        for pc in cols['remote_port']:
                            rport = _clean(data.get(pc))
                            if rport:
                                break
                        conn_key = (host_key, rip, rport)
                        # Only NEW keys are dropped at the cap; existing keys
                        # keep counting.
                        if max_connections and len(connections) >= max_connections and conn_key not in connections:
                            dropped_connections += 1
                            continue
                        connections[conn_key] += 1

            if sampled_dataset:
                sampled_dataset_count += 1
                logger.info(
                    "Host inventory sampling for dataset %s (%d rows scanned)",
                    ds.id,
                    rows_scanned_this_dataset,
                )
                break

            # Advance the pagination cursor; a short batch means we're done.
            last_row_index = rows[-1].row_index
            if len(rows) < batch_size:
                break

        if global_budget_reached:
            logger.info(
                "Host inventory global row budget reached for hunt %s at %d rows",
                hunt_id,
                total_rows,
            )
            break

    # Post-process hosts: infer OS from the hostname when none was observed,
    # and convert accumulator sets into sorted lists for serialization.
    for h in hosts.values():
        if not h['os'] and h['fqdn']:
            h['os'] = _infer_os(h['fqdn'])
        h['ips'] = sorted(h['ips'])
        h['users'] = sorted(h['users'])
        h['datasets'] = sorted(h['datasets'])

    # Build connections, resolving IPs to host keys.  Each undirected pair is
    # emitted once (first encountered direction wins for port/count).
    conn_list = []
    seen = set()
    for (src, dst_ip, dst_port), cnt in connections.items():
        if dst_ip in _IGNORE_IPS:
            continue
        dst_host = ip_to_host.get(dst_ip, '')
        # Drop self-connections.
        if dst_host == src:
            continue
        key = tuple(sorted([src, dst_host or dst_ip]))
        if key in seen:
            continue
        seen.add(key)
        conn_list.append({
            'source': src,
            'target': dst_host or dst_ip,
            'target_ip': dst_ip,
            'port': dst_port,
            'count': cnt,
        })

    # Busiest hosts first.
    host_list = sorted(hosts.values(), key=lambda x: x['row_count'], reverse=True)

    return {
        "hosts": host_list,
        "connections": conn_list,
        "stats": {
            "total_hosts": len(host_list),
            "total_datasets_scanned": len(all_datasets),
            "datasets_with_hosts": ds_with_hosts,
            "total_rows_scanned": total_rows,
            "hosts_with_ips": sum(1 for h in host_list if h['ips']),
            "hosts_with_users": sum(1 for h in host_list if h['users']),
            "row_budget_per_dataset": settings.NETWORK_INVENTORY_MAX_ROWS_PER_DATASET,
            "row_budget_total": settings.NETWORK_INVENTORY_MAX_TOTAL_ROWS,
            "connection_budget": settings.NETWORK_INVENTORY_MAX_CONNECTIONS,
            "sampled_mode": settings.NETWORK_INVENTORY_MAX_ROWS_PER_DATASET > 0 or settings.NETWORK_INVENTORY_MAX_TOTAL_ROWS > 0,
            "sampled_datasets": sampled_dataset_count,
            "global_budget_reached": global_budget_reached,
            "dropped_connections": dropped_connections,
        },
    }
|