Mirror of https://github.com/mblanke/ThreatHunt.git (synced 2026-03-01 14:00:20 -05:00)
- NetworkMap: hunt-scoped force-directed graph with click-to-inspect popover
- NetworkMap: zoom/pan (wheel, drag, buttons), viewport transform
- NetworkMap: clickable IP/Host/Domain/URL legend chips to filter node types
- NetworkMap: brighter colors, 20% smaller nodes
- DatasetViewer: IOC columns highlighted with colored headers + cell tinting
- AUPScanner: hunt dropdown replacing dataset checkboxes, auto-select all
- Rename 'Social Media (Personal)' theme to 'Social Media' with DB migration
- Fix /api/hunts timeout: Dataset.rows lazy='noload' (was selectin cascade)
- Add OS column mapping to normalizer
- Full backend services, DB models, alembic migrations, new routes
- New components: Dashboard, HuntManager, FileUpload, NetworkMap, etc.
- Docker Compose deployment with nginx reverse proxy
166 lines
5.1 KiB
Python
"""CSV parsing engine with encoding detection, delimiter sniffing, and streaming.
|
|
|
|
Handles large Velociraptor CSV exports with resilience to encoding issues,
|
|
varied delimiters, and malformed rows.
|
|
"""
|
|
|
|
import csv
import io
import logging
import re
from pathlib import Path
from typing import AsyncIterator

import chardet

logger = logging.getLogger(__name__)

# Reasonable defaults
MAX_FIELD_SIZE = 1024 * 1024  # 1 MB per field
csv.field_size_limit(MAX_FIELD_SIZE)


def detect_encoding(file_bytes: bytes, sample_size: int = 65536) -> str:
    """Detect file encoding from a sample of bytes."""
    result = chardet.detect(file_bytes[:sample_size])
    encoding = result.get("encoding") or "utf-8"
    # Guard against a missing/None confidence so the log format below cannot fail.
    confidence = result.get("confidence") or 0.0
    logger.info(f"Detected encoding: {encoding} (confidence: {confidence:.2f})")
    # Fall back to utf-8 when chardet is not reasonably sure of its guess
    if confidence < 0.5:
        encoding = "utf-8"
    return encoding


def detect_delimiter(text_sample: str) -> str:
    """Sniff the CSV delimiter from a text sample, defaulting to comma."""
    try:
        dialect = csv.Sniffer().sniff(text_sample, delimiters=",\t;|")
        return dialect.delimiter
    except csv.Error:
        # Sniffer raises csv.Error when it cannot determine a dialect
        # (e.g. a single-column file); fall back to comma.
        return ","


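# A minimal usage sketch for the two detectors above; "export.csv" is a
# hypothetical path, not part of this module.
#
#     raw = Path("export.csv").read_bytes()
#     enc = detect_encoding(raw)                                   # e.g. "utf-8"
#     delim = detect_delimiter(raw.decode(enc, errors="replace")[:8192])
#     print(enc, repr(delim))

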
def infer_column_types(rows: list[dict], sample_size: int = 100) -> dict[str, str]:
    """Infer column types from a sample of rows.

    Returns a mapping of column_name → type_hint where type_hint is one of:
    timestamp, integer, float, ip, hash_md5, hash_sha1, hash_sha256, domain, path, string
    """
    type_map: dict[str, dict[str, int]] = {}
    sample = rows[:sample_size]

    # Order matters: the first matching pattern wins, so the more specific
    # patterns (ip, hashes) are checked before broader ones (integer, domain).
    patterns = {
        "ip": re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$"),
        "hash_md5": re.compile(r"^[a-fA-F0-9]{32}$"),
        "hash_sha1": re.compile(r"^[a-fA-F0-9]{40}$"),
        "hash_sha256": re.compile(r"^[a-fA-F0-9]{64}$"),
        "integer": re.compile(r"^-?\d+$"),
        "float": re.compile(r"^-?\d+\.\d+$"),
        "timestamp": re.compile(r"^\d{4}[-/]\d{2}[-/]\d{2}[T ]\d{2}:\d{2}"),
        "domain": re.compile(r"^[a-zA-Z0-9]([a-zA-Z0-9-]*[a-zA-Z0-9])?(\.[a-zA-Z]{2,})+$"),
        "path": re.compile(r"^([A-Z]:\\|/)", re.IGNORECASE),
    }

    # Tally which pattern each non-empty value matches, per column.
    for row in sample:
        for col, val in row.items():
            if col not in type_map:
                type_map[col] = {}
            val_str = str(val).strip()
            if not val_str:
                continue
            matched = False
            for type_name, pattern in patterns.items():
                if pattern.match(val_str):
                    type_map[col][type_name] = type_map[col].get(type_name, 0) + 1
                    matched = True
                    break
            if not matched:
                type_map[col]["string"] = type_map[col].get("string", 0) + 1

    # The most frequent hint wins; columns with no non-empty values default to string.
    result: dict[str, str] = {}
    for col, counts in type_map.items():
        if counts:
            result[col] = max(counts, key=counts.get)  # type: ignore[arg-type]
        else:
            result[col] = "string"

    return result


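# A minimal sketch of the inference above; the rows are made-up sample data.
#
#     rows = [
#         {"HostIP": "10.0.0.5", "EventTime": "2024-01-02 10:30:00", "PID": "4312"},
#         {"HostIP": "10.0.0.9", "EventTime": "2024-01-02 10:31:12", "PID": "988"},
#     ]
#     infer_column_types(rows)
#     # -> {"HostIP": "ip", "EventTime": "timestamp", "PID": "integer"}

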
def parse_csv_bytes(
    raw_bytes: bytes,
    max_rows: int | None = None,
) -> tuple[list[dict], dict]:
    """Parse a CSV file from raw bytes.

    Returns:
        (rows, metadata) where metadata contains encoding, delimiter, columns, etc.
    """
    encoding = detect_encoding(raw_bytes)

    try:
        text = raw_bytes.decode(encoding, errors="replace")
    except LookupError:
        # chardet can name a codec Python does not know; fall back to utf-8.
        # (With errors="replace", UnicodeDecodeError itself cannot occur.)
        text = raw_bytes.decode("utf-8", errors="replace")
        encoding = "utf-8"

    # Detect delimiter from the first few KB
    delimiter = detect_delimiter(text[:8192])

    reader = csv.DictReader(io.StringIO(text), delimiter=delimiter)
    columns = reader.fieldnames or []

    rows: list[dict] = []
    for i, row in enumerate(reader):
        if max_rows is not None and i >= max_rows:
            break
        rows.append(dict(row))

    column_types = infer_column_types(rows) if rows else {}

    metadata = {
        "encoding": encoding,
        "delimiter": delimiter,
        "columns": columns,
        "column_types": column_types,
        "row_count": len(rows),
        # Only equal to the true file row count when max_rows did not truncate.
        "total_rows_in_file": len(rows),
    }

    return rows, metadata


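# A minimal usage sketch; "triage.csv" is a hypothetical export path.
#
#     rows, meta = parse_csv_bytes(Path("triage.csv").read_bytes(), max_rows=10_000)
#     print(meta["delimiter"], meta["row_count"], meta["column_types"])

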
async def parse_csv_streaming(
    file_path: Path,
    chunk_size: int = 8192,  # retained for API compatibility; unused below
) -> AsyncIterator[tuple[int, dict]]:
    """Stream-parse a CSV file yielding (row_index, row_dict) tuples.

    Memory-efficient for large files: rows are parsed one line at a time
    instead of loading the whole file. Quoted fields containing embedded
    newlines are not supported in this mode.
    """
    import aiofiles  # type: ignore[import-untyped]

    # Read a sample for encoding/delimiter detection
    with open(file_path, "rb") as f:
        sample_bytes = f.read(65536)

    encoding = detect_encoding(sample_bytes)
    text_sample = sample_bytes.decode(encoding, errors="replace")
    delimiter = detect_delimiter(text_sample[:8192])

    # Stream line by line so only one row is held in memory at a time.
    async with aiofiles.open(
        file_path, mode="r", encoding=encoding, errors="replace"
    ) as f:
        header_line = await f.readline()
        columns = next(csv.reader([header_line], delimiter=delimiter), [])
        i = 0
        async for line in f:
            if not line.strip():
                continue
            values = next(csv.reader([line], delimiter=delimiter), None)
            if values is None:
                continue
            # zip() pairs values with headers, silently dropping extra fields;
            # short rows simply omit their missing trailing columns.
            yield i, dict(zip(columns, values))
            i += 1
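

# A minimal async consumer sketch; "huge_export.csv" is a hypothetical path.
#
#     import asyncio
#
#     async def main() -> None:
#         async for i, row in parse_csv_streaming(Path("huge_export.csv")):
#             if i < 3:
#                 print(i, row)
#
#     asyncio.run(main())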