feat: host-centric network map, analysis dashboard, deduped inventory

- Rewrote NetworkMap to use deduplicated host inventory (163 hosts from 394K rows)
- New host_inventory.py service: scans datasets, groups by FQDN/ClientId, extracts IPs/users/OS
- New /api/network/host-inventory endpoint
- Added AnalysisDashboard with 6 tabs (IOC, anomaly, host profile, query, triage, reports)
- Added 16 analysis API endpoints with job queue and load balancer
- Added 4 AI/analysis ORM models (ProcessingJob, AnalysisResult, HostProfile, IOCEntry)
- Filters system accounts (DWM-*, UMFD-*, LOCAL/NETWORK SERVICE)
- Infers OS from hostname patterns (W10-* -> Windows 10)
- Canvas 2D force-directed graph with host/external-IP node types
- Click popover shows hostname, FQDN, IPs, OS, users, datasets, connections
This commit is contained in:
2026-02-20 07:16:17 -05:00
parent 9b98ab9614
commit 04a9946891
24 changed files with 4774 additions and 620 deletions

View File

@@ -0,0 +1,210 @@
"""IOC extraction service extract indicators of compromise from dataset rows.
Identifies: IPv4/IPv6 addresses, domain names, MD5/SHA1/SHA256 hashes,
email addresses, URLs, and file paths that look suspicious.
"""
import ipaddress
import logging
import re
from collections import defaultdict
from typing import Optional

from sqlalchemy import select, func
from sqlalchemy.ext.asyncio import AsyncSession

from app.db.models import Dataset, DatasetRow
logger = logging.getLogger(__name__)
# Patterns
_IPV4 = re.compile(
r'\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b'
)
_IPV6 = re.compile(r'\b(?:[0-9a-fA-F]{1,4}:){7}[0-9a-fA-F]{1,4}\b')
_DOMAIN = re.compile(
r'\b(?:[a-zA-Z0-9](?:[a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)'
r'+(?:com|net|org|io|info|biz|co|us|uk|de|ru|cn|cc|tk|xyz|top|'
r'online|site|club|win|work|download|stream|gdn|bid|review|racing|'
r'loan|date|faith|accountant|cricket|science|trade|party|men)\b',
re.IGNORECASE,
)
_MD5 = re.compile(r'\b[0-9a-fA-F]{32}\b')
_SHA1 = re.compile(r'\b[0-9a-fA-F]{40}\b')
_SHA256 = re.compile(r'\b[0-9a-fA-F]{64}\b')
_EMAIL = re.compile(r'\b[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}\b')
_URL = re.compile(r'https?://[^\s<>"\']+', re.IGNORECASE)
# Private / reserved IPs to skip
_PRIVATE_NETS = re.compile(
r'^(10\.|172\.(1[6-9]|2\d|3[01])\.|192\.168\.|127\.|0\.|255\.)'
)
PATTERNS = {
'ipv4': _IPV4,
'ipv6': _IPV6,
'domain': _DOMAIN,
'md5': _MD5,
'sha1': _SHA1,
'sha256': _SHA256,
'email': _EMAIL,
'url': _URL,
}
def _is_private_ip(ip: str) -> bool:
return bool(_PRIVATE_NETS.match(ip))
def extract_iocs_from_text(text: str, skip_private: bool = True) -> dict[str, set[str]]:
"""Extract all IOC types from a block of text."""
result: dict[str, set[str]] = defaultdict(set)
for ioc_type, pattern in PATTERNS.items():
for match in pattern.findall(text):
val = match.strip().lower() if ioc_type != 'url' else match.strip()
# Filter private IPs
if ioc_type == 'ipv4' and skip_private and _is_private_ip(val):
continue
# Filter hex strings that are too generic (< 32 chars not a hash)
result[ioc_type].add(val)
return result
async def extract_iocs_from_dataset(
dataset_id: str,
db: AsyncSession,
max_rows: int = 5000,
skip_private: bool = True,
) -> dict[str, list[str]]:
"""Extract IOCs from all rows of a dataset.
Returns {ioc_type: [sorted unique values]}.
"""
# Load rows in batches
all_iocs: dict[str, set[str]] = defaultdict(set)
offset = 0
batch_size = 500
while offset < max_rows:
result = await db.execute(
select(DatasetRow.data)
.where(DatasetRow.dataset_id == dataset_id)
.order_by(DatasetRow.row_index)
.offset(offset)
.limit(batch_size)
)
rows = result.scalars().all()
if not rows:
break
for data in rows:
# Flatten all values to a single string for scanning
text = ' '.join(str(v) for v in data.values()) if isinstance(data, dict) else str(data)
batch_iocs = extract_iocs_from_text(text, skip_private)
for ioc_type, values in batch_iocs.items():
all_iocs[ioc_type].update(values)
offset += batch_size
# Convert sets to sorted lists
return {k: sorted(v) for k, v in all_iocs.items() if v}
async def extract_host_groups(
hunt_id: str,
db: AsyncSession,
) -> list[dict]:
"""Group all data by hostname across datasets in a hunt.
Returns a list of host group dicts with dataset count, total rows,
artifact types, and time range.
"""
# Get all datasets for this hunt
result = await db.execute(
select(Dataset).where(Dataset.hunt_id == hunt_id)
)
ds_list = result.scalars().all()
if not ds_list:
return []
# Known host columns (check normalized data first, then raw)
HOST_COLS = [
'hostname', 'host', 'computer_name', 'computername', 'system',
'machine', 'device_name', 'devicename', 'endpoint',
'ClientId', 'Fqdn', 'client_id', 'fqdn',
]
hosts: dict[str, dict] = {}
for ds in ds_list:
# Sample first few rows to find host column
sample_result = await db.execute(
select(DatasetRow.data, DatasetRow.normalized_data)
.where(DatasetRow.dataset_id == ds.id)
.limit(5)
)
samples = sample_result.all()
if not samples:
continue
# Find which host column exists
host_col = None
for row_data, norm_data in samples:
check = norm_data if norm_data else row_data
if not isinstance(check, dict):
continue
for col in HOST_COLS:
if col in check and check[col]:
host_col = col
break
if host_col:
break
if not host_col:
continue
# Count rows per host in this dataset
all_rows_result = await db.execute(
select(DatasetRow.data, DatasetRow.normalized_data)
.where(DatasetRow.dataset_id == ds.id)
)
all_rows = all_rows_result.all()
for row_data, norm_data in all_rows:
check = norm_data if norm_data else row_data
if not isinstance(check, dict):
continue
host_val = check.get(host_col, '')
if not host_val or not isinstance(host_val, str):
continue
host_val = host_val.strip()
if not host_val:
continue
if host_val not in hosts:
hosts[host_val] = {
'hostname': host_val,
'dataset_ids': set(),
'total_rows': 0,
'artifact_types': set(),
'first_seen': None,
'last_seen': None,
}
hosts[host_val]['dataset_ids'].add(ds.id)
hosts[host_val]['total_rows'] += 1
if ds.artifact_type:
hosts[host_val]['artifact_types'].add(ds.artifact_type)
# Convert to output format
result_list = []
for h in sorted(hosts.values(), key=lambda x: x['total_rows'], reverse=True):
result_list.append({
'hostname': h['hostname'],
'dataset_count': len(h['dataset_ids']),
'total_rows': h['total_rows'],
'artifact_types': sorted(h['artifact_types']),
'first_seen': None, # TODO: extract from timestamp columns
'last_seen': None,
'risk_score': None, # TODO: link to host profiles
})
return result_list