Files
ThreatHunt/backend/app/services/host_inventory.py
mblanke 5a2ad8ec1c feat: Add Playbook Manager, Saved Searches, and Timeline View components
- Implemented PlaybookManager for creating and managing investigation playbooks with templates.
- Added SavedSearches component for managing bookmarked queries and recurring scans.
- Introduced TimelineView for visualizing forensic event timelines with zoomable charts.
- Enhanced backend processing with auto-queued jobs for dataset uploads and improved database concurrency.
- Updated frontend components for better user experience and performance optimizations.
- Documented changes in update log for future reference.
2026-02-23 14:23:07 -05:00

397 lines
13 KiB
Python

"""Host Inventory Service - builds a deduplicated host-centric network view.
Scans all datasets in a hunt to identify unique hosts, their IPs, OS,
logged-in users, and network connections between them.
"""
import re
import logging
from collections import defaultdict
from typing import Any
from sqlalchemy import select, func
from sqlalchemy.ext.asyncio import AsyncSession
from app.db.models import Dataset, DatasetRow
from app.config import settings
logger = logging.getLogger(__name__)
# --- Column-name patterns (Velociraptor + generic forensic tools) ---
# Each pattern matches a whole column name; re.I makes the match
# case-insensitive so raw CSV/JSON headers in any casing are recognized.

# Agent/endpoint identifier columns (e.g. Velociraptor ClientId).
_HOST_ID_RE = re.compile(
    r'^(client_?id|clientid|agent_?id|endpoint_?id|host_?id|sensor_?id)$', re.I)
# Hostname / FQDN columns.
_FQDN_RE = re.compile(
    r'^(fqdn|fully_?qualified|computer_?name|hostname|host_?name|host|'
    r'system_?name|machine_?name|nodename|workstation)$', re.I)
# Logged-in user / account-name columns.
_USERNAME_RE = re.compile(
    r'^(user|username|user_?name|logon_?name|account_?name|owner|'
    r'logged_?in_?user|sam_?account_?name|samaccountname)$', re.I)
# Local (source) IP address columns, e.g. netstat-style "Laddr.IP".
_LOCAL_IP_RE = re.compile(
    r'^(laddr\.?ip|laddr|local_?addr(ess)?|src_?ip|source_?ip)$', re.I)
# Remote (destination) IP address columns.
_REMOTE_IP_RE = re.compile(
    r'^(raddr\.?ip|raddr|remote_?addr(ess)?|dst_?ip|dest_?ip)$', re.I)
# Remote (destination) port columns.
_REMOTE_PORT_RE = re.compile(
    r'^(raddr\.?port|rport|remote_?port|dst_?port|dest_?port)$', re.I)
# Operating-system columns.
_OS_RE = re.compile(
    r'^(os|operating_?system|os_?version|os_?name|platform|os_?type|os_?build)$', re.I)
# Loose dotted-quad shape check: four 1-3 digit groups (octet value range
# is NOT enforced here; callers combine this with further checks).
_IP_VALID_RE = re.compile(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$')
# Placeholder / loopback / wildcard values that never identify a real peer.
_IGNORE_IPS = frozenset({
    '0.0.0.0', '::', '::1', '127.0.0.1', '', '-', '*', 'None', 'null',
})
# Windows domain prefixes that denote built-in (non-human) accounts.
_SYSTEM_DOMAINS = frozenset({
    'NT AUTHORITY', 'NT SERVICE', 'FONT DRIVER HOST', 'WINDOW MANAGER',
})
# Well-known Windows service/session accounts to exclude from host "users".
_SYSTEM_USERS = frozenset({
    'SYSTEM', 'LOCAL SERVICE', 'NETWORK SERVICE',
    'UMFD-0', 'UMFD-1', 'DWM-1', 'DWM-2', 'DWM-3',
})
def _is_valid_ip(v: str) -> bool:
    """Return True if *v* looks like a usable IPv4 address.

    Rejects empty/placeholder values (see ``_IGNORE_IPS``), anything that
    is not a dotted quad, and — fixing a gap in the raw regex, which only
    constrains digit count — dotted quads with out-of-range octets such
    as ``999.1.1.1``.
    """
    if not v or v in _IGNORE_IPS:
        return False
    if not _IP_VALID_RE.match(v):
        return False
    # Enforce the 0-255 range the regex alone cannot express.
    return all(int(octet) <= 255 for octet in v.split('.'))
def _clean(v: Any) -> str:
s = str(v or '').strip()
return s if s and s not in ('-', 'None', 'null', '') else ''
# Matches bare built-in Windows account names (after any domain prefix has
# been stripped); case-insensitive, whole-string match.
_SYSTEM_USER_RE = re.compile(
    r'^(SYSTEM|LOCAL SERVICE|NETWORK SERVICE|DWM-\d+|UMFD-\d+)$', re.I)
def _extract_username(raw: str) -> str:
    """Clean username, stripping domain prefixes and filtering system accounts."""
    if not raw:
        return ''
    candidate = raw.strip()
    # Strip a DOMAIN\user prefix, keeping only the account portion.
    if '\\' in candidate:
        prefix, _, candidate = candidate.rpartition('\\')
        candidate = candidate.strip()
        if prefix.strip().upper() in _SYSTEM_DOMAINS:
            # Built-in domain with an empty or service-account name: drop it.
            if not candidate or _SYSTEM_USER_RE.match(candidate):
                return ''
    # Drop well-known service accounts (SYSTEM, DWM-1, ...) in any domain.
    if _SYSTEM_USER_RE.match(candidate):
        return ''
    return candidate or ''
# In-memory host inventory cache
# Pre-computed results stored per hunt_id, built in background after upload.
import time as _time
class _InventoryCache:
"""Simple in-memory cache for pre-computed host inventories."""
def __init__(self):
self._data: dict[str, dict] = {} # hunt_id -> result dict
self._timestamps: dict[str, float] = {} # hunt_id -> epoch
self._building: set[str] = set() # hunt_ids currently being built
def get(self, hunt_id: str) -> dict | None:
"""Return cached result if present. Never expires; only invalidated on new upload."""
return self._data.get(hunt_id)
def put(self, hunt_id: str, result: dict):
self._data[hunt_id] = result
self._timestamps[hunt_id] = _time.time()
self._building.discard(hunt_id)
logger.info(f"Cached host inventory for hunt {hunt_id} "
f"({result['stats']['total_hosts']} hosts)")
def invalidate(self, hunt_id: str):
self._data.pop(hunt_id, None)
self._timestamps.pop(hunt_id, None)
def is_building(self, hunt_id: str) -> bool:
return hunt_id in self._building
def set_building(self, hunt_id: str):
self._building.add(hunt_id)
def clear_building(self, hunt_id: str):
self._building.discard(hunt_id)
def status(self, hunt_id: str) -> str:
if hunt_id in self._building:
return "building"
if hunt_id in self._data:
return "ready"
return "none"
inventory_cache = _InventoryCache()
def _infer_os(fqdn: str) -> str:
u = fqdn.upper()
if 'W10-' in u or 'WIN10' in u:
return 'Windows 10'
if 'W11-' in u or 'WIN11' in u:
return 'Windows 11'
if 'W7-' in u or 'WIN7' in u:
return 'Windows 7'
if 'SRV' in u or 'SERVER' in u or 'DC-' in u:
return 'Windows Server'
if any(k in u for k in ('LINUX', 'UBUNTU', 'CENTOS', 'RHEL', 'DEBIAN')):
return 'Linux'
if 'MAC' in u or 'DARWIN' in u:
return 'macOS'
return 'Windows'
def _identify_columns(ds: Dataset) -> dict:
    """Map each inventory role to the dataset columns that look like it.

    Column names come from ``column_schema`` when present, otherwise from
    ``normalized_columns``.  Each raw name is tested against the role
    regexes and against its normalized ("canonical") name.  A column may
    land in several buckets; local vs. remote IP are mutually exclusive.
    """
    norm = ds.normalized_columns or {}
    schema = ds.column_schema or {}
    columns = list(schema) if schema else list(norm)
    buckets: dict[str, list] = {
        'host_id': [], 'fqdn': [], 'username': [],
        'local_ip': [], 'remote_ip': [], 'remote_port': [], 'os': [],
    }
    for col in columns:
        canonical = (norm.get(col) or '').lower()
        lower = col.lower()
        # Endpoint identifiers; also treat columns normalized to "hostname"
        # whose raw name is NOT itself a hostname column as agent ids.
        if _HOST_ID_RE.match(lower) or (canonical == 'hostname' and lower not in ('hostname', 'host_name', 'host')):
            buckets['host_id'].append(col)
        if _FQDN_RE.match(lower) or canonical == 'fqdn':
            buckets['fqdn'].append(col)
        if _USERNAME_RE.match(lower) or canonical in ('username', 'user'):
            buckets['username'].append(col)
        # A column is either a local or a remote address, never both.
        if _LOCAL_IP_RE.match(lower):
            buckets['local_ip'].append(col)
        elif _REMOTE_IP_RE.match(lower):
            buckets['remote_ip'].append(col)
        if _REMOTE_PORT_RE.match(lower):
            buckets['remote_port'].append(col)
        if _OS_RE.match(lower) or canonical == 'os':
            buckets['os'].append(col)
    return buckets
async def build_host_inventory(hunt_id: str, db: AsyncSession) -> dict:
    """Build a deduplicated host inventory from all datasets in a hunt.

    Scans dataset rows (bounded by the per-dataset and global row budgets
    and the connection cap from settings; a budget of 0 disables that cap),
    deduplicating hosts by FQDN — or by client id when no FQDN is present —
    and accumulating each host's IPs, users, OS and outbound connections.

    Returns dict with 'hosts', 'connections', and 'stats'.
    Each host has: id, hostname, fqdn, client_id, ips, os, users, datasets, row_count.
    """
    ds_result = await db.execute(
        select(Dataset).where(Dataset.hunt_id == hunt_id)
    )
    all_datasets = ds_result.scalars().all()
    # Empty hunt: return an empty-but-well-formed payload so callers don't
    # need a special case.
    if not all_datasets:
        return {"hosts": [], "connections": [], "stats": {
            "total_hosts": 0, "total_datasets_scanned": 0,
            "total_rows_scanned": 0,
        }}
    hosts: dict[str, dict] = {}  # fqdn -> host record
    ip_to_host: dict[str, str] = {}  # local-ip -> fqdn
    # (source host_key, remote ip, remote port) -> observation count
    connections: dict[tuple, int] = defaultdict(int)
    total_rows = 0
    ds_with_hosts = 0
    sampled_dataset_count = 0
    # Budgets: 0 (or negative, clamped) means "unlimited".
    total_row_budget = max(0, int(settings.NETWORK_INVENTORY_MAX_TOTAL_ROWS))
    max_connections = max(0, int(settings.NETWORK_INVENTORY_MAX_CONNECTIONS))
    global_budget_reached = False
    dropped_connections = 0
    for ds in all_datasets:
        # Stop scanning further datasets once the hunt-wide row budget is hit.
        if total_row_budget and total_rows >= total_row_budget:
            global_budget_reached = True
            break
        cols = _identify_columns(ds)
        # A dataset with no host-identifying column can't contribute hosts.
        if not cols['fqdn'] and not cols['host_id']:
            continue
        ds_with_hosts += 1
        batch_size = 5000
        max_rows_per_dataset = max(0, int(settings.NETWORK_INVENTORY_MAX_ROWS_PER_DATASET))
        rows_scanned_this_dataset = 0
        sampled_dataset = False  # set when either budget truncates this dataset
        last_row_index = -1
        while True:
            # Re-check the global budget before fetching the next page.
            if total_row_budget and total_rows >= total_row_budget:
                sampled_dataset = True
                global_budget_reached = True
                break
            # Keyset pagination on row_index (rather than OFFSET) keeps each
            # page query cheap on large row tables.
            rr = await db.execute(
                select(DatasetRow)
                .where(DatasetRow.dataset_id == ds.id)
                .where(DatasetRow.row_index > last_row_index)
                .order_by(DatasetRow.row_index)
                .limit(batch_size)
            )
            rows = rr.scalars().all()
            if not rows:
                break
            for ro in rows:
                # Per-dataset sampling cap.
                if max_rows_per_dataset and rows_scanned_this_dataset >= max_rows_per_dataset:
                    sampled_dataset = True
                    break
                # Global cap can also trip mid-batch.
                if total_row_budget and total_rows >= total_row_budget:
                    sampled_dataset = True
                    global_budget_reached = True
                    break
                data = ro.data or {}
                total_rows += 1
                rows_scanned_this_dataset += 1
                # First non-empty value among candidate FQDN columns.
                fqdn = ''
                for c in cols['fqdn']:
                    fqdn = _clean(data.get(c))
                    if fqdn:
                        break
                # First non-empty value among candidate client-id columns.
                client_id = ''
                for c in cols['host_id']:
                    client_id = _clean(data.get(c))
                    if client_id:
                        break
                if not fqdn and not client_id:
                    continue
                # Dedup key: prefer the FQDN, fall back to the agent id.
                host_key = fqdn or client_id
                if host_key not in hosts:
                    # Short display name: the first DNS label of the FQDN.
                    short = fqdn.split('.')[0] if fqdn and '.' in fqdn else fqdn
                    hosts[host_key] = {
                        'id': host_key,
                        'hostname': short or client_id,
                        'fqdn': fqdn,
                        'client_id': client_id,
                        'ips': set(),
                        'os': '',
                        'users': set(),
                        'datasets': set(),
                        'row_count': 0,
                    }
                h = hosts[host_key]
                h['datasets'].add(ds.name)
                h['row_count'] += 1
                # Backfill the client id if a later row supplies one.
                if client_id and not h['client_id']:
                    h['client_id'] = client_id
                # Collect (non-system) usernames seen on this host.
                for c in cols['username']:
                    u = _extract_username(_clean(data.get(c)))
                    if u:
                        h['users'].add(u)
                # Local IPs both enrich the host and feed the IP->host map
                # used later to resolve connection targets.
                for c in cols['local_ip']:
                    ip = _clean(data.get(c))
                    if _is_valid_ip(ip):
                        h['ips'].add(ip)
                        ip_to_host[ip] = host_key
                # First explicit OS value wins; inference happens later.
                for c in cols['os']:
                    ov = _clean(data.get(c))
                    if ov and not h['os']:
                        h['os'] = ov
                # Record outbound connections, capped at max_connections
                # distinct (src, dst ip, dst port) tuples.
                for c in cols['remote_ip']:
                    rip = _clean(data.get(c))
                    if _is_valid_ip(rip):
                        rport = ''
                        for pc in cols['remote_port']:
                            rport = _clean(data.get(pc))
                            if rport:
                                break
                        conn_key = (host_key, rip, rport)
                        # Existing keys may still be incremented past the cap;
                        # only NEW tuples are dropped (and counted as dropped).
                        if max_connections and len(connections) >= max_connections and conn_key not in connections:
                            dropped_connections += 1
                            continue
                        connections[conn_key] += 1
            if sampled_dataset:
                sampled_dataset_count += 1
                logger.info(
                    "Host inventory sampling for dataset %s (%d rows scanned)",
                    ds.id,
                    rows_scanned_this_dataset,
                )
                break
            last_row_index = rows[-1].row_index
            # A short page means we've reached the end of this dataset.
            if len(rows) < batch_size:
                break
        if global_budget_reached:
            logger.info(
                "Host inventory global row budget reached for hunt %s at %d rows",
                hunt_id,
                total_rows,
            )
            break
    # Post-process hosts: infer an OS when none was observed, and convert
    # the accumulator sets into sorted lists for JSON serialization.
    for h in hosts.values():
        if not h['os'] and h['fqdn']:
            h['os'] = _infer_os(h['fqdn'])
        h['ips'] = sorted(h['ips'])
        h['users'] = sorted(h['users'])
        h['datasets'] = sorted(h['datasets'])
    # Build connections, resolving IPs to host keys
    conn_list = []
    seen = set()
    for (src, dst_ip, dst_port), cnt in connections.items():
        if dst_ip in _IGNORE_IPS:
            continue
        dst_host = ip_to_host.get(dst_ip, '')
        # Skip self-connections (target resolved back to the source host).
        if dst_host == src:
            continue
        # Direction-insensitive dedup: keep the first edge seen per pair.
        key = tuple(sorted([src, dst_host or dst_ip]))
        if key in seen:
            continue
        seen.add(key)
        conn_list.append({
            'source': src,
            'target': dst_host or dst_ip,
            'target_ip': dst_ip,
            'port': dst_port,
            'count': cnt,
        })
    # Busiest hosts first.
    host_list = sorted(hosts.values(), key=lambda x: x['row_count'], reverse=True)
    return {
        "hosts": host_list,
        "connections": conn_list,
        "stats": {
            "total_hosts": len(host_list),
            "total_datasets_scanned": len(all_datasets),
            "datasets_with_hosts": ds_with_hosts,
            "total_rows_scanned": total_rows,
            "hosts_with_ips": sum(1 for h in host_list if h['ips']),
            "hosts_with_users": sum(1 for h in host_list if h['users']),
            "row_budget_per_dataset": settings.NETWORK_INVENTORY_MAX_ROWS_PER_DATASET,
            "row_budget_total": settings.NETWORK_INVENTORY_MAX_TOTAL_ROWS,
            "connection_budget": settings.NETWORK_INVENTORY_MAX_CONNECTIONS,
            "sampled_mode": settings.NETWORK_INVENTORY_MAX_ROWS_PER_DATASET > 0 or settings.NETWORK_INVENTORY_MAX_TOTAL_ROWS > 0,
            "sampled_datasets": sampled_dataset_count,
            "global_budget_reached": global_budget_reached,
            "dropped_connections": dropped_connections,
        },
    }