Files
ThreatHunt/backend/app/services/host_inventory.py
mblanke 5a2ad8ec1c feat: Add Playbook Manager, Saved Searches, and Timeline View components
- Implemented PlaybookManager for creating and managing investigation playbooks with templates.
- Added SavedSearches component for managing bookmarked queries and recurring scans.
- Introduced TimelineView for visualizing forensic event timelines with zoomable charts.
- Enhanced backend processing with auto-queued jobs for dataset uploads and improved database concurrency.
- Updated frontend components for better user experience and performance optimizations.
- Documented changes in update log for future reference.
2026-02-23 14:23:07 -05:00

397 lines
13 KiB
Python

"""Host Inventory Service - builds a deduplicated host-centric network view.
Scans all datasets in a hunt to identify unique hosts, their IPs, OS,
logged-in users, and network connections between them.
"""
import re
import logging
from collections import defaultdict
from typing import Any
from sqlalchemy import select, func
from sqlalchemy.ext.asyncio import AsyncSession
from app.db.models import Dataset, DatasetRow
from app.config import settings
logger = logging.getLogger(__name__)
# --- Column-name patterns (Velociraptor + generic forensic tools) ---
# Each pattern matches a whole column name; re.I makes the match
# case-insensitive so raw CSV/JSON headers in any casing are recognized.

# Agent/endpoint identifier columns (e.g. Velociraptor ClientId).
_HOST_ID_RE = re.compile(
    r'^(client_?id|clientid|agent_?id|endpoint_?id|host_?id|sensor_?id)$', re.I)
# Hostname / FQDN columns.
_FQDN_RE = re.compile(
    r'^(fqdn|fully_?qualified|computer_?name|hostname|host_?name|host|'
    r'system_?name|machine_?name|nodename|workstation)$', re.I)
# Logged-in user / account-name columns.
_USERNAME_RE = re.compile(
    r'^(user|username|user_?name|logon_?name|account_?name|owner|'
    r'logged_?in_?user|sam_?account_?name|samaccountname)$', re.I)
# Local (source) IP address columns, e.g. netstat-style "Laddr.IP".
_LOCAL_IP_RE = re.compile(
    r'^(laddr\.?ip|laddr|local_?addr(ess)?|src_?ip|source_?ip)$', re.I)
# Remote (destination) IP address columns.
_REMOTE_IP_RE = re.compile(
    r'^(raddr\.?ip|raddr|remote_?addr(ess)?|dst_?ip|dest_?ip)$', re.I)
# Remote (destination) port columns.
_REMOTE_PORT_RE = re.compile(
    r'^(raddr\.?port|rport|remote_?port|dst_?port|dest_?port)$', re.I)
# Operating-system columns.
_OS_RE = re.compile(
    r'^(os|operating_?system|os_?version|os_?name|platform|os_?type|os_?build)$', re.I)
# Loose dotted-quad shape check: four 1-3 digit groups (octet value range
# is NOT enforced here; callers combine this with further checks).
_IP_VALID_RE = re.compile(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$')
# Placeholder / loopback / wildcard values that never identify a real peer.
_IGNORE_IPS = frozenset({
    '0.0.0.0', '::', '::1', '127.0.0.1', '', '-', '*', 'None', 'null',
})
# Windows domain prefixes that denote built-in (non-human) accounts.
_SYSTEM_DOMAINS = frozenset({
    'NT AUTHORITY', 'NT SERVICE', 'FONT DRIVER HOST', 'WINDOW MANAGER',
})
# Well-known Windows service/session accounts to exclude from host "users".
_SYSTEM_USERS = frozenset({
    'SYSTEM', 'LOCAL SERVICE', 'NETWORK SERVICE',
    'UMFD-0', 'UMFD-1', 'DWM-1', 'DWM-2', 'DWM-3',
})
def _is_valid_ip(v: str) -> bool:
    """Return True if *v* looks like a usable IPv4 address.

    Rejects empty/placeholder values (see ``_IGNORE_IPS``), anything that
    is not a dotted quad, and — fixing a gap in the raw regex, which only
    constrains digit count — dotted quads with out-of-range octets such
    as ``999.1.1.1``.
    """
    if not v or v in _IGNORE_IPS:
        return False
    if not _IP_VALID_RE.match(v):
        return False
    # Enforce the 0-255 range the regex alone cannot express.
    return all(int(octet) <= 255 for octet in v.split('.'))
def _clean(v: Any) -> str:
s = str(v or '').strip()
return s if s and s not in ('-', 'None', 'null', '') else ''
# Matches bare built-in Windows account names (after any domain prefix has
# been stripped); case-insensitive, whole-string match.
_SYSTEM_USER_RE = re.compile(
    r'^(SYSTEM|LOCAL SERVICE|NETWORK SERVICE|DWM-\d+|UMFD-\d+)$', re.I)
def _extract_username(raw: str) -> str:
    """Clean username, stripping domain prefixes and filtering system accounts."""
    if not raw:
        return ''
    candidate = raw.strip()
    # Strip a DOMAIN\user prefix, keeping only the account portion.
    if '\\' in candidate:
        prefix, _, candidate = candidate.rpartition('\\')
        candidate = candidate.strip()
        if prefix.strip().upper() in _SYSTEM_DOMAINS:
            # Built-in domain with an empty or service-account name: drop it.
            if not candidate or _SYSTEM_USER_RE.match(candidate):
                return ''
    # Drop well-known service accounts (SYSTEM, DWM-1, ...) in any domain.
    if _SYSTEM_USER_RE.match(candidate):
        return ''
    return candidate or ''
# In-memory host inventory cache
# Pre-computed results stored per hunt_id, built in background after upload.
import time as _time
class _InventoryCache:
"""Simple in-memory cache for pre-computed host inventories."""
def __init__(self):
self._data: dict[str, dict] = {} # hunt_id -> result dict
self._timestamps: dict[str, float] = {} # hunt_id -> epoch
self._building: set[str] = set() # hunt_ids currently being built
def get(self, hunt_id: str) -> dict | None:
"""Return cached result if present. Never expires; only invalidated on new upload."""
return self._data.get(hunt_id)
def put(self, hunt_id: str, result: dict):
self._data[hunt_id] = result
self._timestamps[hunt_id] = _time.time()
self._building.discard(hunt_id)
logger.info(f"Cached host inventory for hunt {hunt_id} "
f"({result['stats']['total_hosts']} hosts)")
def invalidate(self, hunt_id: str):
self._data.pop(hunt_id, None)
self._timestamps.pop(hunt_id, None)
def is_building(self, hunt_id: str) -> bool:
return hunt_id in self._building
def set_building(self, hunt_id: str):
self._building.add(hunt_id)
def clear_building(self, hunt_id: str):
self._building.discard(hunt_id)
def status(self, hunt_id: str) -> str:
if hunt_id in self._building:
return "building"
if hunt_id in self._data:
return "ready"
return "none"
inventory_cache = _InventoryCache()
def _infer_os(fqdn: str) -> str:
u = fqdn.upper()
if 'W10-' in u or 'WIN10' in u:
return 'Windows 10'
if 'W11-' in u or 'WIN11' in u:
return 'Windows 11'
if 'W7-' in u or 'WIN7' in u:
return 'Windows 7'
if 'SRV' in u or 'SERVER' in u or 'DC-' in u:
return 'Windows Server'
if any(k in u for k in ('LINUX', 'UBUNTU', 'CENTOS', 'RHEL', 'DEBIAN')):
return 'Linux'
if 'MAC' in u or 'DARWIN' in u:
return 'macOS'
return 'Windows'
def _identify_columns(ds: Dataset) -> dict:
    """Map each inventory role to the dataset columns that look like it.

    Column names come from ``column_schema`` when present, otherwise from
    ``normalized_columns``.  Each raw name is tested against the role
    regexes and against its normalized ("canonical") name.  A column may
    land in several buckets; local vs. remote IP are mutually exclusive.
    """
    norm = ds.normalized_columns or {}
    schema = ds.column_schema or {}
    columns = list(schema) if schema else list(norm)
    buckets: dict[str, list] = {
        'host_id': [], 'fqdn': [], 'username': [],
        'local_ip': [], 'remote_ip': [], 'remote_port': [], 'os': [],
    }
    for col in columns:
        canonical = (norm.get(col) or '').lower()
        lower = col.lower()
        # Endpoint identifiers; also treat columns normalized to "hostname"
        # whose raw name is NOT itself a hostname column as agent ids.
        if _HOST_ID_RE.match(lower) or (canonical == 'hostname' and lower not in ('hostname', 'host_name', 'host')):
            buckets['host_id'].append(col)
        if _FQDN_RE.match(lower) or canonical == 'fqdn':
            buckets['fqdn'].append(col)
        if _USERNAME_RE.match(lower) or canonical in ('username', 'user'):
            buckets['username'].append(col)
        # A column is either a local or a remote address, never both.
        if _LOCAL_IP_RE.match(lower):
            buckets['local_ip'].append(col)
        elif _REMOTE_IP_RE.match(lower):
            buckets['remote_ip'].append(col)
        if _REMOTE_PORT_RE.match(lower):
            buckets['remote_port'].append(col)
        if _OS_RE.match(lower) or canonical == 'os':
            buckets['os'].append(col)
    return buckets
async def build_host_inventory(hunt_id: str, db: AsyncSession) -> dict:
    """Build a deduplicated host inventory from all datasets in a hunt.

    Scans dataset rows (bounded by the per-dataset and global row budgets
    and the connection cap from settings; a budget of 0 disables that cap),
    deduplicating hosts by FQDN — or by client id when no FQDN is present —
    and accumulating each host's IPs, users, OS and outbound connections.

    Returns dict with 'hosts', 'connections', and 'stats'.
    Each host has: id, hostname, fqdn, client_id, ips, os, users, datasets, row_count.
    """
    ds_result = await db.execute(
        select(Dataset).where(Dataset.hunt_id == hunt_id)
    )
    all_datasets = ds_result.scalars().all()
    # Empty hunt: return an empty-but-well-formed payload so callers don't
    # need a special case.
    if not all_datasets:
        return {"hosts": [], "connections": [], "stats": {
            "total_hosts": 0, "total_datasets_scanned": 0,
            "total_rows_scanned": 0,
        }}
    hosts: dict[str, dict] = {}  # fqdn -> host record
    ip_to_host: dict[str, str] = {}  # local-ip -> fqdn
    # (source host_key, remote ip, remote port) -> observation count
    connections: dict[tuple, int] = defaultdict(int)
    total_rows = 0
    ds_with_hosts = 0
    sampled_dataset_count = 0
    # Budgets: 0 (or negative, clamped) means "unlimited".
    total_row_budget = max(0, int(settings.NETWORK_INVENTORY_MAX_TOTAL_ROWS))
    max_connections = max(0, int(settings.NETWORK_INVENTORY_MAX_CONNECTIONS))
    global_budget_reached = False
    dropped_connections = 0
    for ds in all_datasets:
        # Stop scanning further datasets once the hunt-wide row budget is hit.
        if total_row_budget and total_rows >= total_row_budget:
            global_budget_reached = True
            break
        cols = _identify_columns(ds)
        # A dataset with no host-identifying column can't contribute hosts.
        if not cols['fqdn'] and not cols['host_id']:
            continue
        ds_with_hosts += 1
        batch_size = 5000
        max_rows_per_dataset = max(0, int(settings.NETWORK_INVENTORY_MAX_ROWS_PER_DATASET))
        rows_scanned_this_dataset = 0
        sampled_dataset = False  # set when either budget truncates this dataset
        last_row_index = -1
        while True:
            # Re-check the global budget before fetching the next page.
            if total_row_budget and total_rows >= total_row_budget:
                sampled_dataset = True
                global_budget_reached = True
                break
            # Keyset pagination on row_index (rather than OFFSET) keeps each
            # page query cheap on large row tables.
            rr = await db.execute(
                select(DatasetRow)
                .where(DatasetRow.dataset_id == ds.id)
                .where(DatasetRow.row_index > last_row_index)
                .order_by(DatasetRow.row_index)
                .limit(batch_size)
            )
            rows = rr.scalars().all()
            if not rows:
                break
            for ro in rows:
                # Per-dataset sampling cap.
                if max_rows_per_dataset and rows_scanned_this_dataset >= max_rows_per_dataset:
                    sampled_dataset = True
                    break
                # Global cap can also trip mid-batch.
                if total_row_budget and total_rows >= total_row_budget:
                    sampled_dataset = True
                    global_budget_reached = True
                    break
                data = ro.data or {}
                total_rows += 1
                rows_scanned_this_dataset += 1
                # First non-empty value among candidate FQDN columns.
                fqdn = ''
                for c in cols['fqdn']:
                    fqdn = _clean(data.get(c))
                    if fqdn:
                        break
                # First non-empty value among candidate client-id columns.
                client_id = ''
                for c in cols['host_id']:
                    client_id = _clean(data.get(c))
                    if client_id:
                        break
                if not fqdn and not client_id:
                    continue
                # Dedup key: prefer the FQDN, fall back to the agent id.
                host_key = fqdn or client_id
                if host_key not in hosts:
                    # Short display name: the first DNS label of the FQDN.
                    short = fqdn.split('.')[0] if fqdn and '.' in fqdn else fqdn
                    hosts[host_key] = {
                        'id': host_key,
                        'hostname': short or client_id,
                        'fqdn': fqdn,
                        'client_id': client_id,
                        'ips': set(),
                        'os': '',
                        'users': set(),
                        'datasets': set(),
                        'row_count': 0,
                    }
                h = hosts[host_key]
                h['datasets'].add(ds.name)
                h['row_count'] += 1
                # Backfill the client id if a later row supplies one.
                if client_id and not h['client_id']:
                    h['client_id'] = client_id
                # Collect (non-system) usernames seen on this host.
                for c in cols['username']:
                    u = _extract_username(_clean(data.get(c)))
                    if u:
                        h['users'].add(u)
                # Local IPs both enrich the host and feed the IP->host map
                # used later to resolve connection targets.
                for c in cols['local_ip']:
                    ip = _clean(data.get(c))
                    if _is_valid_ip(ip):
                        h['ips'].add(ip)
                        ip_to_host[ip] = host_key
                # First explicit OS value wins; inference happens later.
                for c in cols['os']:
                    ov = _clean(data.get(c))
                    if ov and not h['os']:
                        h['os'] = ov
                # Record outbound connections, capped at max_connections
                # distinct (src, dst ip, dst port) tuples.
                for c in cols['remote_ip']:
                    rip = _clean(data.get(c))
                    if _is_valid_ip(rip):
                        rport = ''
                        for pc in cols['remote_port']:
                            rport = _clean(data.get(pc))
                            if rport:
                                break
                        conn_key = (host_key, rip, rport)
                        # Existing keys may still be incremented past the cap;
                        # only NEW tuples are dropped (and counted as dropped).
                        if max_connections and len(connections) >= max_connections and conn_key not in connections:
                            dropped_connections += 1
                            continue
                        connections[conn_key] += 1
            if sampled_dataset:
                sampled_dataset_count += 1
                logger.info(
                    "Host inventory sampling for dataset %s (%d rows scanned)",
                    ds.id,
                    rows_scanned_this_dataset,
                )
                break
            last_row_index = rows[-1].row_index
            # A short page means we've reached the end of this dataset.
            if len(rows) < batch_size:
                break
        if global_budget_reached:
            logger.info(
                "Host inventory global row budget reached for hunt %s at %d rows",
                hunt_id,
                total_rows,
            )
            break
    # Post-process hosts: infer an OS when none was observed, and convert
    # the accumulator sets into sorted lists for JSON serialization.
    for h in hosts.values():
        if not h['os'] and h['fqdn']:
            h['os'] = _infer_os(h['fqdn'])
        h['ips'] = sorted(h['ips'])
        h['users'] = sorted(h['users'])
        h['datasets'] = sorted(h['datasets'])
    # Build connections, resolving IPs to host keys
    conn_list = []
    seen = set()
    for (src, dst_ip, dst_port), cnt in connections.items():
        if dst_ip in _IGNORE_IPS:
            continue
        dst_host = ip_to_host.get(dst_ip, '')
        # Skip self-connections (target resolved back to the source host).
        if dst_host == src:
            continue
        # Direction-insensitive dedup: keep the first edge seen per pair.
        key = tuple(sorted([src, dst_host or dst_ip]))
        if key in seen:
            continue
        seen.add(key)
        conn_list.append({
            'source': src,
            'target': dst_host or dst_ip,
            'target_ip': dst_ip,
            'port': dst_port,
            'count': cnt,
        })
    # Busiest hosts first.
    host_list = sorted(hosts.values(), key=lambda x: x['row_count'], reverse=True)
    return {
        "hosts": host_list,
        "connections": conn_list,
        "stats": {
            "total_hosts": len(host_list),
            "total_datasets_scanned": len(all_datasets),
            "datasets_with_hosts": ds_with_hosts,
            "total_rows_scanned": total_rows,
            "hosts_with_ips": sum(1 for h in host_list if h['ips']),
            "hosts_with_users": sum(1 for h in host_list if h['users']),
            "row_budget_per_dataset": settings.NETWORK_INVENTORY_MAX_ROWS_PER_DATASET,
            "row_budget_total": settings.NETWORK_INVENTORY_MAX_TOTAL_ROWS,
            "connection_budget": settings.NETWORK_INVENTORY_MAX_CONNECTIONS,
            "sampled_mode": settings.NETWORK_INVENTORY_MAX_ROWS_PER_DATASET > 0 or settings.NETWORK_INVENTORY_MAX_TOTAL_ROWS > 0,
            "sampled_datasets": sampled_dataset_count,
            "global_budget_reached": global_budget_reached,
            "dropped_connections": dropped_connections,
        },
    }