version 0.4.0

This commit is contained in:
2026-02-20 14:32:42 -05:00
parent ab8038867a
commit 365cf87c90
76 changed files with 34422 additions and 690 deletions

View File

@@ -0,0 +1,254 @@
"""Timeline and field-statistics service.
Provides temporal histogram bins and per-field distribution stats
for dataset rows — used by the TimelineScrubber and QueryBar components.
"""
import logging
from collections import Counter, defaultdict
from datetime import datetime
from typing import Any, Sequence
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.db.models import Dataset, DatasetRow
logger = logging.getLogger(__name__)
# ── Timeline bins ─────────────────────────────────────────────────────
async def build_timeline_bins(
    db: AsyncSession,
    dataset_id: str | None = None,
    hunt_id: str | None = None,
    bins: int = 60,
) -> dict:
    """Create histogram bins of events over time.

    Rows are scoped by ``dataset_id`` (takes precedence) or ``hunt_id``;
    rows with a missing/unparseable ``timestamp`` field are skipped.

    Returns {bins: [{start, end, count, events_by_type}], total, range}.
    ``total`` is the number of binned events, except when no row had a
    usable timestamp, where it is the raw row count (preserved behavior).
    """
    rows = await _fetch_rows(db, dataset_id=dataset_id, hunt_id=hunt_id)
    if not rows:
        return {"bins": [], "total": 0, "range": None}
    # Extract timestamps
    events: list[dict] = []
    for r in rows:
        data = r.normalized_data or r.data
        ts_str = data.get("timestamp", "")
        if not ts_str:
            continue
        ts = _parse_ts(str(ts_str))
        if ts:
            events.append({
                "timestamp": ts,
                "event_type": _classify_type(data),
                "hostname": data.get("hostname", ""),
            })
    if not events:
        return {"bins": [], "total": len(rows), "range": None}
    events.sort(key=lambda e: e["timestamp"])
    ts_min = events[0]["timestamp"]
    ts_max = events[-1]["timestamp"]
    if ts_min == ts_max:
        # Degenerate range: a single bin holding everything.
        return {
            "bins": [{"start": ts_min.isoformat(), "end": ts_max.isoformat(),
                      "count": len(events), "events_by_type": {}}],
            "total": len(events),
            "range": {"start": ts_min.isoformat(), "end": ts_max.isoformat()},
        }
    delta = (ts_max - ts_min) / bins
    span = ts_max - ts_min
    # Assign each event to a bin by index in a single O(events) pass.
    # This replaces an O(events * bins) rescan and also fixes a dropped-event
    # bug: timedelta division rounds `delta` to whole microseconds, so
    # ts_min + delta*bins could fall short of ts_max and events in the final
    # sliver landed in no bin. Clamping the index to bins-1 keeps them.
    counts = [0] * bins
    types_per_bin: list[Counter] = [Counter() for _ in range(bins)]
    for e in events:
        # timedelta / timedelta yields an exact float fraction of the range.
        frac = (e["timestamp"] - ts_min) / span
        idx = min(int(frac * bins), bins - 1)
        counts[idx] += 1
        types_per_bin[idx][e["event_type"]] += 1
    result_bins = [
        {
            "start": (ts_min + delta * i).isoformat(),
            "end": (ts_min + delta * (i + 1)).isoformat(),
            "count": counts[i],
            "events_by_type": dict(types_per_bin[i]),
        }
        for i in range(bins)
    ]
    return {
        "bins": result_bins,
        "total": len(events),
        "range": {"start": ts_min.isoformat(), "end": ts_max.isoformat()},
    }
# ── Field stats ───────────────────────────────────────────────────────
async def compute_field_stats(
    db: AsyncSession,
    dataset_id: str | None = None,
    hunt_id: str | None = None,
    fields: list[str] | None = None,
    top_n: int = 20,
) -> dict:
    """Compute per-field value distributions.

    When ``fields`` is not given, analyzes up to 30 fields taken from the
    FIRST row only (fields appearing only in later rows are not discovered —
    preserved behavior). Values that are None or placeholder-ish
    ("", "N/A", "n/a", "-", "None") are excluded from the counts.

    Returns {fields: {name: {total, unique, top: [{value, count}]}},
             total_rows, available_fields}.
    """
    rows = await _fetch_rows(db, dataset_id=dataset_id, hunt_id=hunt_id)
    if not rows:
        return {"fields": {}, "total_rows": 0}
    # Determine which fields to analyze (schema sampled from the first row).
    sample_data = rows[0].normalized_data or rows[0].data
    all_fields = list(sample_data.keys())
    target_fields = fields if fields else all_fields[:30]
    # Single pass over the rows, one Counter per field — the previous
    # implementation re-walked every row once per field (O(fields * rows)).
    placeholders = ("", "N/A", "n/a", "-", "None")
    counters: dict[str, Counter] = {f: Counter() for f in target_fields}
    for r in rows:
        data = r.normalized_data or r.data
        for field in target_fields:
            v = data.get(field)
            if v is None:
                continue
            s = str(v)
            if s.strip() in placeholders:
                continue
            counters[field][s] += 1
    stats: dict[str, dict] = {}
    for field, counter in counters.items():
        stats[field] = {
            "total": sum(counter.values()),
            "unique": len(counter),
            "top": [{"value": v, "count": c} for v, c in counter.most_common(top_n)],
        }
    return {
        "fields": stats,
        "total_rows": len(rows),
        "available_fields": all_fields,
    }
# ── Row search with filters ──────────────────────────────────────────
async def search_rows(
    db: AsyncSession,
    dataset_id: str | None = None,
    hunt_id: str | None = None,
    query: str = "",
    filters: dict[str, str] | None = None,
    time_start: str | None = None,
    time_end: str | None = None,
    limit: int = 500,
    offset: int = 0,
) -> dict:
    """Search/filter dataset rows.

    Supports:
    - Free-text search across all fields (case-insensitive substring)
    - Field-specific filters {field: value} (case-insensitive substring, ANDed)
    - Time range filters on the "timestamp" field

    Returns {rows, total, offset, limit}; ``total`` is the match count
    before pagination.
    """
    rows = await _fetch_rows(db, dataset_id=dataset_id, hunt_id=hunt_id, limit=50000)
    if not rows:
        return {"rows": [], "total": 0, "offset": offset, "limit": limit}
    ts_start = _parse_ts(time_start) if time_start else None
    ts_end = _parse_ts(time_end) if time_end else None
    # Hoist loop-invariant lowercasing out of the (up to 50k-iteration) loop.
    q = query.lower() if query else ""
    active_filters = [(f, v.lower()) for f, v in filters.items()] if filters else []
    results: list[dict] = []
    for r in rows:
        data = r.normalized_data or r.data
        # Time filter. NOTE(review): rows whose timestamp fails to parse are
        # NOT excluded by the time range — looks like deliberate best-effort
        # behavior, preserved as-is.
        if ts_start or ts_end:
            ts = _parse_ts(str(data.get("timestamp", "")))
            if ts:
                if ts_start and ts < ts_start:
                    continue
                if ts_end and ts > ts_end:
                    continue
        # Field filters (all must match as substrings).
        if active_filters and any(
            v not in str(data.get(f, "")).lower() for f, v in active_filters
        ):
            continue
        # Free-text search across every value in the row.
        if q and not any(q in str(v).lower() for v in data.values()):
            continue
        results.append(data)
    total = len(results)
    paged = results[offset:offset + limit]
    return {"rows": paged, "total": total, "offset": offset, "limit": limit}
# ── Internal helpers ──────────────────────────────────────────────────
async def _fetch_rows(
    db: AsyncSession,
    dataset_id: str | None = None,
    hunt_id: str | None = None,
    limit: int = 50_000,
) -> Sequence[DatasetRow]:
    """Load dataset rows ordered by row_index, capped at ``limit``.

    Scope: ``dataset_id`` wins when both are given; ``hunt_id`` alone
    restricts via the joined Dataset; neither means all rows.
    """
    query = select(DatasetRow).join(Dataset)
    if dataset_id:
        query = query.where(DatasetRow.dataset_id == dataset_id)
    elif hunt_id:
        query = query.where(Dataset.hunt_id == hunt_id)
    query = query.order_by(DatasetRow.row_index).limit(limit)
    return (await db.execute(query)).scalars().all()
def _parse_ts(ts_str: str | None) -> datetime | None:
"""Best-effort timestamp parsing."""
if not ts_str:
return None
for fmt in (
"%Y-%m-%dT%H:%M:%S.%fZ",
"%Y-%m-%dT%H:%M:%SZ",
"%Y-%m-%dT%H:%M:%S.%f",
"%Y-%m-%dT%H:%M:%S",
"%Y-%m-%d %H:%M:%S.%f",
"%Y-%m-%d %H:%M:%S",
"%m/%d/%Y %H:%M:%S",
"%m/%d/%Y %I:%M:%S %p",
):
try:
return datetime.strptime(ts_str.strip(), fmt)
except (ValueError, AttributeError):
continue
return None
def _classify_type(data: dict) -> str:
if data.get("pid") or data.get("process_name"):
if data.get("dst_ip") or data.get("dst_port"):
return "network"
return "process"
if data.get("dst_ip") or data.get("src_ip"):
return "network"
if data.get("file_path"):
return "file"
return "other"