Mirror of https://github.com/mblanke/ThreatHunt.git, synced 2026-03-01 14:00:20 -05:00.
- Implemented PlaybookManager for creating and managing investigation playbooks with templates.
- Added SavedSearches component for managing bookmarked queries and recurring scans.
- Introduced TimelineView for visualizing forensic event timelines with zoomable charts.
- Enhanced backend processing with auto-queued jobs for dataset uploads and improved database concurrency.
- Updated frontend components for better user experience and performance optimizations.
- Documented changes in the update log for future reference.
228 lines
8.3 KiB
Python
228 lines
8.3 KiB
Python
"""One-off maintenance script.

Replaces the ``build_host_inventory`` coroutine in
``backend/app/services/host_inventory.py`` with a "hard-budget fast mode"
implementation (row budgets per dataset and globally, plus a cap on the
number of tracked connections).

Run directly:  ``python <this_script>.py``
"""
from pathlib import Path

# Absolute path of the file to patch.  Adjust if the checkout moves.
TARGET = Path(r'd:/Projects/Dev/ThreatHunt/backend/app/services/host_inventory.py')

# Header line that identifies the function to replace.
FUNC_HEADER = 'async def build_host_inventory('

# Full replacement source for the function, written back verbatim.
NEW_FUNC = '''async def build_host_inventory(hunt_id: str, db: AsyncSession) -> dict:
    """Build a deduplicated host inventory from all datasets in a hunt.

    Returns dict with 'hosts', 'connections', and 'stats'.
    Each host has: id, hostname, fqdn, client_id, ips, os, users, datasets, row_count.
    """
    ds_result = await db.execute(
        select(Dataset).where(Dataset.hunt_id == hunt_id)
    )
    all_datasets = ds_result.scalars().all()

    if not all_datasets:
        return {"hosts": [], "connections": [], "stats": {
            "total_hosts": 0, "total_datasets_scanned": 0,
            "total_rows_scanned": 0,
        }}

    hosts: dict[str, dict] = {}  # fqdn -> host record
    ip_to_host: dict[str, str] = {}  # local-ip -> fqdn
    connections: dict[tuple, int] = defaultdict(int)
    total_rows = 0
    ds_with_hosts = 0
    sampled_dataset_count = 0
    total_row_budget = max(0, int(settings.NETWORK_INVENTORY_MAX_TOTAL_ROWS))
    max_connections = max(0, int(settings.NETWORK_INVENTORY_MAX_CONNECTIONS))
    global_budget_reached = False
    dropped_connections = 0

    for ds in all_datasets:
        if total_row_budget and total_rows >= total_row_budget:
            global_budget_reached = True
            break

        cols = _identify_columns(ds)
        if not cols['fqdn'] and not cols['host_id']:
            continue
        ds_with_hosts += 1

        batch_size = 5000
        max_rows_per_dataset = max(0, int(settings.NETWORK_INVENTORY_MAX_ROWS_PER_DATASET))
        rows_scanned_this_dataset = 0
        sampled_dataset = False
        last_row_index = -1

        while True:
            if total_row_budget and total_rows >= total_row_budget:
                sampled_dataset = True
                global_budget_reached = True
                break

            rr = await db.execute(
                select(DatasetRow)
                .where(DatasetRow.dataset_id == ds.id)
                .where(DatasetRow.row_index > last_row_index)
                .order_by(DatasetRow.row_index)
                .limit(batch_size)
            )
            rows = rr.scalars().all()
            if not rows:
                break

            for ro in rows:
                if max_rows_per_dataset and rows_scanned_this_dataset >= max_rows_per_dataset:
                    sampled_dataset = True
                    break
                if total_row_budget and total_rows >= total_row_budget:
                    sampled_dataset = True
                    global_budget_reached = True
                    break

                data = ro.data or {}
                total_rows += 1
                rows_scanned_this_dataset += 1

                fqdn = ''
                for c in cols['fqdn']:
                    fqdn = _clean(data.get(c))
                    if fqdn:
                        break
                client_id = ''
                for c in cols['host_id']:
                    client_id = _clean(data.get(c))
                    if client_id:
                        break

                if not fqdn and not client_id:
                    continue

                host_key = fqdn or client_id

                if host_key not in hosts:
                    short = fqdn.split('.')[0] if fqdn and '.' in fqdn else fqdn
                    hosts[host_key] = {
                        'id': host_key,
                        'hostname': short or client_id,
                        'fqdn': fqdn,
                        'client_id': client_id,
                        'ips': set(),
                        'os': '',
                        'users': set(),
                        'datasets': set(),
                        'row_count': 0,
                    }

                h = hosts[host_key]
                h['datasets'].add(ds.name)
                h['row_count'] += 1
                if client_id and not h['client_id']:
                    h['client_id'] = client_id

                for c in cols['username']:
                    u = _extract_username(_clean(data.get(c)))
                    if u:
                        h['users'].add(u)

                for c in cols['local_ip']:
                    ip = _clean(data.get(c))
                    if _is_valid_ip(ip):
                        h['ips'].add(ip)
                        ip_to_host[ip] = host_key

                for c in cols['os']:
                    ov = _clean(data.get(c))
                    if ov and not h['os']:
                        h['os'] = ov

                for c in cols['remote_ip']:
                    rip = _clean(data.get(c))
                    if _is_valid_ip(rip):
                        rport = ''
                        for pc in cols['remote_port']:
                            rport = _clean(data.get(pc))
                            if rport:
                                break
                        conn_key = (host_key, rip, rport)
                        if max_connections and len(connections) >= max_connections and conn_key not in connections:
                            dropped_connections += 1
                            continue
                        connections[conn_key] += 1

            if sampled_dataset:
                sampled_dataset_count += 1
                logger.info(
                    "Host inventory sampling for dataset %s (%d rows scanned)",
                    ds.id,
                    rows_scanned_this_dataset,
                )
                break

            last_row_index = rows[-1].row_index
            if len(rows) < batch_size:
                break

        if global_budget_reached:
            logger.info(
                "Host inventory global row budget reached for hunt %s at %d rows",
                hunt_id,
                total_rows,
            )
            break

    # Post-process hosts
    for h in hosts.values():
        if not h['os'] and h['fqdn']:
            h['os'] = _infer_os(h['fqdn'])
        h['ips'] = sorted(h['ips'])
        h['users'] = sorted(h['users'])
        h['datasets'] = sorted(h['datasets'])

    # Build connections, resolving IPs to host keys
    conn_list = []
    seen = set()
    for (src, dst_ip, dst_port), cnt in connections.items():
        if dst_ip in _IGNORE_IPS:
            continue
        dst_host = ip_to_host.get(dst_ip, '')
        if dst_host == src:
            continue
        key = tuple(sorted([src, dst_host or dst_ip]))
        if key in seen:
            continue
        seen.add(key)
        conn_list.append({
            'source': src,
            'target': dst_host or dst_ip,
            'target_ip': dst_ip,
            'port': dst_port,
            'count': cnt,
        })

    host_list = sorted(hosts.values(), key=lambda x: x['row_count'], reverse=True)

    return {
        "hosts": host_list,
        "connections": conn_list,
        "stats": {
            "total_hosts": len(host_list),
            "total_datasets_scanned": len(all_datasets),
            "datasets_with_hosts": ds_with_hosts,
            "total_rows_scanned": total_rows,
            "hosts_with_ips": sum(1 for h in host_list if h['ips']),
            "hosts_with_users": sum(1 for h in host_list if h['users']),
            "row_budget_per_dataset": settings.NETWORK_INVENTORY_MAX_ROWS_PER_DATASET,
            "row_budget_total": settings.NETWORK_INVENTORY_MAX_TOTAL_ROWS,
            "connection_budget": settings.NETWORK_INVENTORY_MAX_CONNECTIONS,
            "sampled_mode": settings.NETWORK_INVENTORY_MAX_ROWS_PER_DATASET > 0 or settings.NETWORK_INVENTORY_MAX_TOTAL_ROWS > 0,
            "sampled_datasets": sampled_dataset_count,
            "global_budget_reached": global_budget_reached,
            "dropped_connections": dropped_connections,
        },
    }
'''


def find_function_span(text: str, header: str) -> tuple[int, int]:
    """Return ``(start, end)`` character offsets of the top-level function
    whose definition line begins with *header*.

    ``end`` is the offset of the first non-blank line that starts at
    column 0 after the header (i.e. the next top-level statement), or
    ``len(text)`` if the function runs to EOF.  Trailing blank lines
    between the function and the next top-level statement are included
    in the span.

    Raises:
        ValueError: if *header* does not occur in *text*.

    NOTE: this replaces the original heuristic (``rfind('    }')`` +
    ``find('\\n\\n')``), which scanned the whole file and could match a
    point before or after the target function, silently corrupting the
    output.
    """
    start = text.index(header)
    # Scan line by line after the header; blank / indented lines belong
    # to the function body, the first column-0 line ends it.
    pos = text.index('\n', start) + 1
    while pos < len(text):
        nl = text.find('\n', pos)
        if nl == -1:
            nl = len(text)
        line = text[pos:nl]
        if line and not line[0].isspace():
            return start, pos
        pos = nl + 1
    return start, len(text)


def main() -> None:
    """Patch TARGET in place, replacing the old function with NEW_FUNC."""
    text = TARGET.read_text(encoding='utf-8')
    start, end = find_function_span(text, FUNC_HEADER)
    # NEW_FUNC ends with a single newline; add the PEP 8 two blank lines
    # before the next top-level statement unless the function was last.
    replacement = NEW_FUNC if end == len(text) else NEW_FUNC + '\n\n'
    TARGET.write_text(text[:start] + replacement + text[end:], encoding='utf-8')
    print('replaced build_host_inventory with hard-budget fast mode')


if __name__ == '__main__':
    main()