feat: interactive network map, IOC highlighting, AUP hunt selector, type filters

- NetworkMap: hunt-scoped force-directed graph with click-to-inspect popover - NetworkMap: zoom/pan (wheel, drag, buttons), viewport transform - NetworkMap: clickable IP/Host/Domain/URL legend chips to filter node types - NetworkMap: brighter colors, 20% smaller nodes - DatasetViewer: IOC columns highlighted with colored headers + cell tinting - AUPScanner: hunt dropdown replacing dataset checkboxes, auto-select all - Rename 'Social Media (Personal)' theme to 'Social Media' with DB migration - Fix /api/hunts timeout: Dataset.rows lazy='noload' (was selectin cascade) - Add OS column mapping to normalizer - Full backend services, DB models, alembic migrations, new routes - New components: Dashboard, HuntManager, FileUpload, NetworkMap, etc. - Docker Compose deployment with nginx reverse proxy
2026-03-01 14:00:20 -05:00 · 2026-02-19 15:41:15 -05:00
parent d0c9f88268
commit 9b98ab9614
92 changed files with 13042 additions and 1089 deletions
--- a/backend/app/services/csv_parser.py
+++ b/backend/app/services/csv_parser.py
@@ -0,0 +1,165 @@
+"""CSV parsing engine with encoding detection, delimiter sniffing, and streaming.
+
+Handles large Velociraptor CSV exports with resilience to encoding issues,
+varied delimiters, and malformed rows.
+"""
+
+import csv
+import io
+import logging
+from pathlib import Path
+from typing import AsyncIterator
+
+import chardet
+
+logger = logging.getLogger(__name__)
+
+# Reasonable defaults
+MAX_FIELD_SIZE = 1024 * 1024  # 1 MB per field
+csv.field_size_limit(MAX_FIELD_SIZE)
+
+
+def detect_encoding(file_bytes: bytes, sample_size: int = 65536) -> str:
+    """Detect file encoding from a sample of bytes."""
+    result = chardet.detect(file_bytes[:sample_size])
+    encoding = result.get("encoding", "utf-8") or "utf-8"
+    confidence = result.get("confidence", 0)
+    logger.info(f"Detected encoding: {encoding} (confidence: {confidence:.2f})")
+    # Fall back to utf-8 if confidence is very low
+    if confidence < 0.5:
+        encoding = "utf-8"
+    return encoding
+
+
+def detect_delimiter(text_sample: str) -> str:
+    """Sniff the CSV delimiter from a text sample."""
+    try:
+        dialect = csv.Sniffer().sniff(text_sample, delimiters=",\t;|")
+        return dialect.delimiter
+    except csv.Error:
+        return ","
+
+
+def infer_column_types(rows: list[dict], sample_size: int = 100) -> dict[str, str]:
+    """Infer column types from a sample of rows.
+
+    Returns a mapping of column_name → type_hint where type_hint is one of:
+    timestamp, integer, float, ip, hash_md5, hash_sha1, hash_sha256, domain, path, string
+    """
+    import re
+
+    type_map: dict[str, dict[str, int]] = {}
+    sample = rows[:sample_size]
+
+    patterns = {
+        "ip": re.compile(
+            r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$"
+        ),
+        "hash_md5": re.compile(r"^[a-fA-F0-9]{32}$"),
+        "hash_sha1": re.compile(r"^[a-fA-F0-9]{40}$"),
+        "hash_sha256": re.compile(r"^[a-fA-F0-9]{64}$"),
+        "integer": re.compile(r"^-?\d+$"),
+        "float": re.compile(r"^-?\d+\.\d+$"),
+        "timestamp": re.compile(
+            r"^\d{4}[-/]\d{2}[-/]\d{2}[T ]\d{2}:\d{2}"
+        ),
+        "domain": re.compile(
+            r"^[a-zA-Z0-9]([a-zA-Z0-9-]*[a-zA-Z0-9])?(\.[a-zA-Z]{2,})+$"
+        ),
+        "path": re.compile(r"^([A-Z]:\\|/)", re.IGNORECASE),
+    }
+
+    for row in sample:
+        for col, val in row.items():
+            if col not in type_map:
+                type_map[col] = {}
+            val_str = str(val).strip()
+            if not val_str:
+                continue
+            matched = False
+            for type_name, pattern in patterns.items():
+                if pattern.match(val_str):
+                    type_map[col][type_name] = type_map[col].get(type_name, 0) + 1
+                    matched = True
+                    break
+            if not matched:
+                type_map[col]["string"] = type_map[col].get("string", 0) + 1
+
+    result: dict[str, str] = {}
+    for col, counts in type_map.items():
+        if counts:
+            result[col] = max(counts, key=counts.get)  # type: ignore[arg-type]
+        else:
+            result[col] = "string"
+
+    return result
+
+
+def parse_csv_bytes(
+    raw_bytes: bytes,
+    max_rows: int | None = None,
+) -> tuple[list[dict], dict]:
+    """Parse a CSV file from raw bytes.
+
+    Returns:
+        (rows, metadata) where metadata contains encoding, delimiter, columns, etc.
+    """
+    encoding = detect_encoding(raw_bytes)
+
+    try:
+        text = raw_bytes.decode(encoding, errors="replace")
+    except (UnicodeDecodeError, LookupError):
+        text = raw_bytes.decode("utf-8", errors="replace")
+        encoding = "utf-8"
+
+    # Detect delimiter from first few KB
+    delimiter = detect_delimiter(text[:8192])
+
+    reader = csv.DictReader(io.StringIO(text), delimiter=delimiter)
+    columns = reader.fieldnames or []
+
+    rows: list[dict] = []
+    for i, row in enumerate(reader):
+        if max_rows is not None and i >= max_rows:
+            break
+        rows.append(dict(row))
+
+    column_types = infer_column_types(rows) if rows else {}
+
+    metadata = {
+        "encoding": encoding,
+        "delimiter": delimiter,
+        "columns": columns,
+        "column_types": column_types,
+        "row_count": len(rows),
+        "total_rows_in_file": len(rows),  # same when no max_rows
+    }
+
+    return rows, metadata
+
+
+async def parse_csv_streaming(
+    file_path: Path,
+    chunk_size: int = 8192,
+) -> AsyncIterator[tuple[int, dict]]:
+    """Stream-parse a CSV file yielding (row_index, row_dict) tuples.
+
+    Memory-efficient for large files.
+    """
+    import aiofiles  # type: ignore[import-untyped]
+
+    # Read a sample for encoding/delimiter detection
+    with open(file_path, "rb") as f:
+        sample_bytes = f.read(65536)
+
+    encoding = detect_encoding(sample_bytes)
+    text_sample = sample_bytes.decode(encoding, errors="replace")
+    delimiter = detect_delimiter(text_sample[:8192])
+
+    # Now stream-read
+    async with aiofiles.open(file_path, mode="r", encoding=encoding, errors="replace") as f:
+        content = await f.read()  # For DictReader compatibility
+
+    reader = csv.DictReader(io.StringIO(content), delimiter=delimiter)
+    for i, row in enumerate(reader):
+        yield i, dict(row)