"""Artifact normalizer — maps Velociraptor and common tool columns to canonical schema. The canonical schema provides consistent field names regardless of which tool exported the CSV (Velociraptor, OSQuery, Sysmon, etc.). """ import logging import re from datetime import datetime from typing import Any logger = logging.getLogger(__name__) # ── Column mapping: source_column_pattern → canonical_name ───────────── # Patterns are case-insensitive regexes matched against column names. COLUMN_MAPPINGS: list[tuple[str, str]] = [ # Timestamps (r"^(timestamp|time|event_?time|date_?time|created?_?(at|time|date)|modified_?(at|time|date)|mtime|ctime|atime|start_?time|end_?time)$", "timestamp"), (r"^(eventtime|system\.timecreated)$", "timestamp"), # Host identifiers (r"^(hostname|host|fqdn|computer_?name|system_?name|machinename|clientid)$", "hostname"), # Operating system (r"^(os|operating_?system|os_?version|os_?name|platform|os_?type)$", "os"), # Source / destination IPs (r"^(source_?ip|src_?ip|srcaddr|local_?address|sourceaddress)$", "src_ip"), (r"^(dest_?ip|dst_?ip|dstaddr|remote_?address|destinationaddress|destaddress)$", "dst_ip"), (r"^(ip_?address|ipaddress|ip)$", "ip_address"), # Ports (r"^(source_?port|src_?port|localport)$", "src_port"), (r"^(dest_?port|dst_?port|remoteport|destinationport)$", "dst_port"), # Process info (r"^(process_?name|name|image|exe|executable|binary)$", "process_name"), (r"^(pid|process_?id)$", "pid"), (r"^(ppid|parent_?pid|parentprocessid)$", "ppid"), (r"^(command_?line|cmdline|commandline|cmd)$", "command_line"), (r"^(parent_?command_?line|parentcommandline)$", "parent_command_line"), # User info (r"^(user|username|user_?name|account_?name|subjectusername)$", "username"), (r"^(user_?id|uid|sid|subjectusersid)$", "user_id"), # File info (r"^(file_?path|fullpath|full_?name|path|filepath)$", "file_path"), (r"^(file_?name|filename|name)$", "file_name"), (r"^(file_?size|size|bytes|length)$", "file_size"), (r"^(extension|file_?ext)$", "file_extension"), # Hashes (r"^(md5|md5hash|hash_?md5)$", "hash_md5"), (r"^(sha1|sha1hash|hash_?sha1)$", "hash_sha1"), (r"^(sha256|sha256hash|hash_?sha256|hash|filehash)$", "hash_sha256"), # Network (r"^(protocol|proto)$", "protocol"), (r"^(domain|dns_?name|query_?name|queriedname)$", "domain"), (r"^(url|uri|request_?url)$", "url"), # Event info (r"^(event_?id|eventid|eid)$", "event_id"), (r"^(event_?type|eventtype|category|action)$", "event_type"), (r"^(description|message|msg|detail)$", "description"), (r"^(severity|level|priority)$", "severity"), # Registry (r"^(reg_?key|registry_?key|targetobject)$", "registry_key"), (r"^(reg_?value|registry_?value)$", "registry_value"), ] def normalize_columns(columns: list[str]) -> dict[str, str]: """Map raw column names to canonical names. Returns: Dict of {raw_column_name: canonical_column_name}. Columns with no match map to themselves (lowered + underscored). """ mapping: dict[str, str] = {} used_canonical: set[str] = set() for col in columns: col_lower = col.strip().lower() matched = False for pattern, canonical in COLUMN_MAPPINGS: if re.match(pattern, col_lower, re.IGNORECASE): # Avoid duplicate canonical names if canonical not in used_canonical: mapping[col] = canonical used_canonical.add(canonical) matched = True break if not matched: # Produce a clean snake_case version clean = re.sub(r"[^a-z0-9]+", "_", col_lower).strip("_") mapping[col] = clean or col return mapping def normalize_row(row: dict[str, Any], column_mapping: dict[str, str]) -> dict[str, Any]: """Apply column mapping to a single row.""" return {column_mapping.get(k, k): v for k, v in row.items()} def normalize_rows(rows: list[dict], column_mapping: dict[str, str]) -> list[dict]: """Apply column mapping to all rows.""" return [normalize_row(row, column_mapping) for row in rows] def detect_ioc_columns( columns: list[str], column_types: dict[str, str], column_mapping: dict[str, str], ) -> dict[str, str]: """Detect which columns contain IOCs (IPs, hashes, domains). Returns: Dict of {column_name: ioc_type}. """ ioc_columns: dict[str, str] = {} ioc_type_map = { "ip": "ip", "hash_md5": "hash_md5", "hash_sha1": "hash_sha1", "hash_sha256": "hash_sha256", "domain": "domain", } for col in columns: col_type = column_types.get(col) if col_type in ioc_type_map: ioc_columns[col] = ioc_type_map[col_type] # Also check canonical name canonical = column_mapping.get(col, "") if canonical in ("src_ip", "dst_ip", "ip_address"): ioc_columns[col] = "ip" elif canonical == "hash_md5": ioc_columns[col] = "hash_md5" elif canonical == "hash_sha1": ioc_columns[col] = "hash_sha1" elif canonical in ("hash_sha256",): ioc_columns[col] = "hash_sha256" elif canonical == "domain": ioc_columns[col] = "domain" elif canonical == "url": ioc_columns[col] = "url" return ioc_columns def detect_time_range( rows: list[dict], column_mapping: dict[str, str], ) -> tuple[datetime | None, datetime | None]: """Find the earliest and latest timestamps in the dataset.""" ts_col = None for raw_col, canonical in column_mapping.items(): if canonical == "timestamp": ts_col = raw_col break if not ts_col: return None, None timestamps: list[datetime] = [] for row in rows: val = row.get(ts_col) if not val: continue try: dt = _parse_timestamp(str(val)) if dt: timestamps.append(dt) except (ValueError, TypeError): continue if not timestamps: return None, None return min(timestamps), max(timestamps) def _parse_timestamp(value: str) -> datetime | None: """Try multiple timestamp formats.""" formats = [ "%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d %H:%M:%S.%f", "%Y-%m-%d %H:%M:%S", "%Y/%m/%d %H:%M:%S", "%m/%d/%Y %H:%M:%S", "%d/%m/%Y %H:%M:%S", ] for fmt in formats: try: return datetime.strptime(value.strip(), fmt) except ValueError: continue return None