mirror of
https://github.com/mblanke/ThreatHunt.git
synced 2026-03-01 05:50:21 -05:00
- Implemented PlaybookManager for creating and managing investigation playbooks with templates. - Added SavedSearches component for managing bookmarked queries and recurring scans. - Introduced TimelineView for visualizing forensic event timelines with zoomable charts. - Enhanced backend processing with auto-queued jobs for dataset uploads and improved database concurrency. - Updated frontend components for better user experience and performance optimizations. - Documented changes in update log for future reference.
107 lines
3.9 KiB
Python
107 lines
3.9 KiB
Python
"""Tests for CSV parser and normalizer services."""
|
|
|
|
import pytest
|
|
from app.services.csv_parser import parse_csv_bytes, detect_encoding, detect_delimiter, infer_column_types
|
|
from app.services.normalizer import normalize_columns, normalize_rows, detect_ioc_columns, detect_time_range
|
|
from tests.conftest import SAMPLE_CSV, SAMPLE_HASH_CSV, make_csv_bytes
|
|
|
|
|
|
class TestCSVParser:
    """Exercise the CSV parsing helpers imported from the csv_parser service."""

    def test_parse_csv_basic(self):
        """Parsing the sample CSV yields five rows plus populated metadata."""
        parsed_rows, info = parse_csv_bytes(SAMPLE_CSV)

        assert len(parsed_rows) == 5

        header = info["columns"]
        assert "timestamp" in header
        assert "hostname" in header

        assert info["encoding"] is not None
        assert info["delimiter"] == ","

    def test_parse_csv_columns(self):
        """The reported column list matches the sample header, in order."""
        expected_header = [
            "timestamp",
            "hostname",
            "src_ip",
            "dst_ip",
            "process_name",
            "command_line",
        ]

        _, info = parse_csv_bytes(SAMPLE_CSV)

        assert info["columns"] == expected_header

    def test_parse_csv_row_data(self):
        """Individual rows are addressable by column name."""
        parsed_rows, _ = parse_csv_bytes(SAMPLE_CSV)

        first_row = parsed_rows[0]
        assert first_row["hostname"] == "DESKTOP-ABC"
        assert first_row["src_ip"] == "192.168.1.100"

        third_row = parsed_rows[2]
        assert third_row["process_name"] == "chrome.exe"

    def test_parse_csv_hash_file(self):
        """The hash sample parses into two rows exposing md5 and sha256 columns."""
        parsed_rows, info = parse_csv_bytes(SAMPLE_HASH_CSV)

        assert len(parsed_rows) == 2

        header = info["columns"]
        assert "md5" in header
        assert "sha256" in header

    def test_parse_tsv(self):
        """Tab-delimited input built with make_csv_bytes parses into two rows."""
        header = ["host", "ip", "port"]
        records = [["server1", "10.0.0.1", "443"], ["server2", "10.0.0.2", "80"]]
        payload = make_csv_bytes(header, records, delimiter="\t")

        parsed_rows, _ = parse_csv_bytes(payload)

        assert len(parsed_rows) == 2

    def test_parse_empty_file(self):
        """Empty input produces no rows and a zero row_count in the metadata."""
        parsed_rows, info = parse_csv_bytes(b"")

        assert len(parsed_rows) == 0
        assert info["row_count"] == 0

    def test_detect_encoding_utf8(self):
        """The sample CSV is reported with an ASCII- or UTF-family encoding name."""
        detected = detect_encoding(SAMPLE_CSV)

        assert detected is not None

        lowered = detected.lower()
        assert "ascii" in lowered or "utf" in lowered

    def test_infer_column_types(self):
        """A column holding only IPv4 address strings is typed as "ip"."""
        samples = [
            {"src_ip": address}
            for address in ("192.168.1.1", "10.0.0.1", "8.8.8.8")
        ]

        inferred = infer_column_types(samples)

        assert inferred["src_ip"] == "ip"

    def test_infer_column_types_hash(self):
        """A column holding a 32-hex-character value is typed as "hash_md5"."""
        samples = [{"hash": "d41d8cd98f00b204e9800998ecf8427e"}]

        inferred = infer_column_types(samples)

        assert inferred["hash"] == "hash_md5"
|
|
|
|
|
|
class TestNormalizer:
    """Exercise the column/row normalization helpers from the normalizer service."""

    def test_normalize_columns(self):
        """Non-canonical column names appear as keys in the returned mapping.

        NOTE(review): the value check is deliberately permissive -- it accepts
        None or any string -- so it only pins the value's type, not a specific
        canonical name.
        """
        column_map = normalize_columns(["SourceAddr", "DestAddr", "ProcessName"])

        assert "SourceAddr" in column_map

        # Should map to canonical names
        canonical = column_map.get("SourceAddr")
        assert canonical in ("src_ip", "source_address", None) or isinstance(canonical, str)

    def test_normalize_known_columns(self):
        """Names that are already canonical map to themselves."""
        column_map = normalize_columns(["timestamp", "hostname", "src_ip"])

        assert column_map.get("timestamp") == "timestamp"
        assert column_map.get("hostname") == "hostname"
        assert column_map.get("src_ip") == "src_ip"

    def test_detect_ioc_columns(self):
        """IOC detection over the parsed sample CSV returns a dict."""
        _, info = parse_csv_bytes(SAMPLE_CSV)
        header = info["columns"]
        column_map = normalize_columns(header)

        detected = detect_ioc_columns(header, info["column_types"], column_map)

        # Should detect IP columns
        assert isinstance(detected, dict)

    def test_detect_time_range(self):
        """When a start time is found for the sample CSV, it falls in 2025.

        NOTE(review): nothing is asserted when ``start`` is falsy, so this test
        passes even if no time range is detected.
        """
        parsed_rows, info = parse_csv_bytes(SAMPLE_CSV)
        column_map = normalize_columns(info["columns"])

        start, _end = detect_time_range(parsed_rows, column_map)

        # Should detect time range from timestamp column
        if not start:
            return
        assert "2025" in str(start)

    def test_normalize_rows(self):
        """Row keys are renamed according to the supplied column mapping."""
        raw_rows = [{"SourceAddr": "10.0.0.1", "ProcessName": "cmd.exe"}]
        column_map = {"SourceAddr": "src_ip", "ProcessName": "process_name"}

        result = normalize_rows(raw_rows, column_map)

        assert len(result) == 1
        assert result[0].get("src_ip") == "10.0.0.1"
|
|
|
|
|
|
|