feat: Add Playbook Manager, Saved Searches, and Timeline View components

- Implemented PlaybookManager for creating and managing investigation playbooks with templates. - Added SavedSearches component for managing bookmarked queries and recurring scans. - Introduced TimelineView for visualizing forensic event timelines with zoomable charts. - Enhanced backend processing with auto-queued jobs for dataset uploads and improved database concurrency. - Updated frontend components for better user experience and performance optimizations. - Documented changes in update log for future reference.
2026-03-01 05:50:21 -05:00 · 2026-02-23 14:23:07 -05:00
parent 37a9584d0c
commit 5a2ad8ec1c
110 changed files with 10537 additions and 1185 deletions
--- a/_perf_replace_build_host_inventory.py
+++ b/_perf_replace_build_host_inventory.py
@@ -0,0 +1,227 @@
+from pathlib import Path
+p=Path(r'd:/Projects/Dev/ThreatHunt/backend/app/services/host_inventory.py')
+t=p.read_text(encoding='utf-8')
+start=t.index('async def build_host_inventory(')
+# find end of function by locating '\n\n' before EOF after '    }\n'
+end=t.index('\n\n', start)
+# need proper end: first double newline after function may occur in docstring? compute by searching for '\n\n' after '    }\n' near end
+ret_idx=t.rfind('    }')
+# safer locate end as last occurrence of '\n    }\n' after start, then function ends next newline
+end=t.find('\n\n', ret_idx)
+if end==-1:
+    end=len(t)
+new_func='''async def build_host_inventory(hunt_id: str, db: AsyncSession) -> dict:
+    """Build a deduplicated host inventory from all datasets in a hunt.
+
+    Returns dict with 'hosts', 'connections', and 'stats'.
+    Each host has: id, hostname, fqdn, client_id, ips, os, users, datasets, row_count.
+    """
+    ds_result = await db.execute(
+        select(Dataset).where(Dataset.hunt_id == hunt_id)
+    )
+    all_datasets = ds_result.scalars().all()
+
+    if not all_datasets:
+        return {"hosts": [], "connections": [], "stats": {
+            "total_hosts": 0, "total_datasets_scanned": 0,
+            "total_rows_scanned": 0,
+        }}
+
+    hosts: dict[str, dict] = {}          # fqdn -> host record
+    ip_to_host: dict[str, str] = {}      # local-ip -> fqdn
+    connections: dict[tuple, int] = defaultdict(int)
+    total_rows = 0
+    ds_with_hosts = 0
+    sampled_dataset_count = 0
+    total_row_budget = max(0, int(settings.NETWORK_INVENTORY_MAX_TOTAL_ROWS))
+    max_connections = max(0, int(settings.NETWORK_INVENTORY_MAX_CONNECTIONS))
+    global_budget_reached = False
+    dropped_connections = 0
+
+    for ds in all_datasets:
+        if total_row_budget and total_rows >= total_row_budget:
+            global_budget_reached = True
+            break
+
+        cols = _identify_columns(ds)
+        if not cols['fqdn'] and not cols['host_id']:
+            continue
+        ds_with_hosts += 1
+
+        batch_size = 5000
+        max_rows_per_dataset = max(0, int(settings.NETWORK_INVENTORY_MAX_ROWS_PER_DATASET))
+        rows_scanned_this_dataset = 0
+        sampled_dataset = False
+        last_row_index = -1
+
+        while True:
+            if total_row_budget and total_rows >= total_row_budget:
+                sampled_dataset = True
+                global_budget_reached = True
+                break
+
+            rr = await db.execute(
+                select(DatasetRow)
+                .where(DatasetRow.dataset_id == ds.id)
+                .where(DatasetRow.row_index > last_row_index)
+                .order_by(DatasetRow.row_index)
+                .limit(batch_size)
+            )
+            rows = rr.scalars().all()
+            if not rows:
+                break
+
+            for ro in rows:
+                if max_rows_per_dataset and rows_scanned_this_dataset >= max_rows_per_dataset:
+                    sampled_dataset = True
+                    break
+                if total_row_budget and total_rows >= total_row_budget:
+                    sampled_dataset = True
+                    global_budget_reached = True
+                    break
+
+                data = ro.data or {}
+                total_rows += 1
+                rows_scanned_this_dataset += 1
+
+                fqdn = ''
+                for c in cols['fqdn']:
+                    fqdn = _clean(data.get(c))
+                    if fqdn:
+                        break
+                client_id = ''
+                for c in cols['host_id']:
+                    client_id = _clean(data.get(c))
+                    if client_id:
+                        break
+
+                if not fqdn and not client_id:
+                    continue
+
+                host_key = fqdn or client_id
+
+                if host_key not in hosts:
+                    short = fqdn.split('.')[0] if fqdn and '.' in fqdn else fqdn
+                    hosts[host_key] = {
+                        'id': host_key,
+                        'hostname': short or client_id,
+                        'fqdn': fqdn,
+                        'client_id': client_id,
+                        'ips': set(),
+                        'os': '',
+                        'users': set(),
+                        'datasets': set(),
+                        'row_count': 0,
+                    }
+
+                h = hosts[host_key]
+                h['datasets'].add(ds.name)
+                h['row_count'] += 1
+                if client_id and not h['client_id']:
+                    h['client_id'] = client_id
+
+                for c in cols['username']:
+                    u = _extract_username(_clean(data.get(c)))
+                    if u:
+                        h['users'].add(u)
+
+                for c in cols['local_ip']:
+                    ip = _clean(data.get(c))
+                    if _is_valid_ip(ip):
+                        h['ips'].add(ip)
+                        ip_to_host[ip] = host_key
+
+                for c in cols['os']:
+                    ov = _clean(data.get(c))
+                    if ov and not h['os']:
+                        h['os'] = ov
+
+                for c in cols['remote_ip']:
+                    rip = _clean(data.get(c))
+                    if _is_valid_ip(rip):
+                        rport = ''
+                        for pc in cols['remote_port']:
+                            rport = _clean(data.get(pc))
+                            if rport:
+                                break
+                        conn_key = (host_key, rip, rport)
+                        if max_connections and len(connections) >= max_connections and conn_key not in connections:
+                            dropped_connections += 1
+                            continue
+                        connections[conn_key] += 1
+
+            if sampled_dataset:
+                sampled_dataset_count += 1
+                logger.info(
+                    "Host inventory sampling for dataset %s (%d rows scanned)",
+                    ds.id,
+                    rows_scanned_this_dataset,
+                )
+                break
+
+            last_row_index = rows[-1].row_index
+            if len(rows) < batch_size:
+                break
+
+        if global_budget_reached:
+            logger.info(
+                "Host inventory global row budget reached for hunt %s at %d rows",
+                hunt_id,
+                total_rows,
+            )
+            break
+
+    # Post-process hosts
+    for h in hosts.values():
+        if not h['os'] and h['fqdn']:
+            h['os'] = _infer_os(h['fqdn'])
+        h['ips'] = sorted(h['ips'])
+        h['users'] = sorted(h['users'])
+        h['datasets'] = sorted(h['datasets'])
+
+    # Build connections, resolving IPs to host keys
+    conn_list = []
+    seen = set()
+    for (src, dst_ip, dst_port), cnt in connections.items():
+        if dst_ip in _IGNORE_IPS:
+            continue
+        dst_host = ip_to_host.get(dst_ip, '')
+        if dst_host == src:
+            continue
+        key = tuple(sorted([src, dst_host or dst_ip]))
+        if key in seen:
+            continue
+        seen.add(key)
+        conn_list.append({
+            'source': src,
+            'target': dst_host or dst_ip,
+            'target_ip': dst_ip,
+            'port': dst_port,
+            'count': cnt,
+        })
+
+    host_list = sorted(hosts.values(), key=lambda x: x['row_count'], reverse=True)
+
+    return {
+        "hosts": host_list,
+        "connections": conn_list,
+        "stats": {
+            "total_hosts": len(host_list),
+            "total_datasets_scanned": len(all_datasets),
+            "datasets_with_hosts": ds_with_hosts,
+            "total_rows_scanned": total_rows,
+            "hosts_with_ips": sum(1 for h in host_list if h['ips']),
+            "hosts_with_users": sum(1 for h in host_list if h['users']),
+            "row_budget_per_dataset": settings.NETWORK_INVENTORY_MAX_ROWS_PER_DATASET,
+            "row_budget_total": settings.NETWORK_INVENTORY_MAX_TOTAL_ROWS,
+            "connection_budget": settings.NETWORK_INVENTORY_MAX_CONNECTIONS,
+            "sampled_mode": settings.NETWORK_INVENTORY_MAX_ROWS_PER_DATASET > 0 or settings.NETWORK_INVENTORY_MAX_TOTAL_ROWS > 0,
+            "sampled_datasets": sampled_dataset_count,
+            "global_budget_reached": global_budget_reached,
+            "dropped_connections": dropped_connections,
+        },
+    }
+'''
+out=t[:start]+new_func+t[end:]
+p.write_text(out,encoding='utf-8')
+print('replaced build_host_inventory with hard-budget fast mode')