# ThreatHunt/backend/app/api/routes/timeline.py

"""API routes for forensic timeline visualization."""

import logging
from datetime import datetime

from fastapi import APIRouter, Depends, HTTPException
from sqlalchemy import select, func
from sqlalchemy.ext.asyncio import AsyncSession

from app.db import get_db
from app.db.models import Dataset, DatasetRow, Hunt

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Router for the timeline endpoints: every route registered on it is served
# under the /api/timeline prefix and carries the "timeline" tag.
router = APIRouter(prefix="/api/timeline", tags=["timeline"])


def _parse_timestamp(val: str | None) -> str | None:
    """Try to parse a timestamp string, return ISO format or None."""
    if not val:
        return None
    val = str(val).strip()
    if not val:
        return None
    # Try common formats
    for fmt in [
        "%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%dT%H:%M:%SZ",
        "%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S",
        "%Y-%m-%d %H:%M:%S.%f", "%Y-%m-%d %H:%M:%S",
        "%Y/%m/%d %H:%M:%S", "%m/%d/%Y %H:%M:%S",
    ]:
        try:
            return datetime.strptime(val, fmt).isoformat() + "Z"
        except ValueError:
            continue
    return None


# Column names treated as timestamp candidates. Entries are lower-case;
# callers lower-case a column name before testing membership.
TIME_COLUMNS = {
    "atime",
    "btime",
    "created",
    "created_at",
    "ctime",
    "date",
    "datetime",
    "end_time",
    "event_time",
    "eventtime",
    "last_modified",
    "lastmodified",
    "modified",
    "mtime",
    "sourcetime",
    "start_time",
    "time",
    "timecreated",
    "timegenerated",
    "timestamp",
    "updated_at",
}


def _find_time_columns(ds: Dataset) -> list[str]:
    """Return the column names of *ds* that look like timestamp columns.

    Normalized column names that appear in ``TIME_COLUMNS`` win. Only when
    none match does this fall back to the raw schema, where any name
    containing "time" or "date" is also accepted.
    """
    matches = [
        col
        for col in (ds.normalized_columns or {}).values()
        # Skip non-string entries (e.g. None) instead of crashing on .lower().
        if isinstance(col, str) and col.lower() in TIME_COLUMNS
    ]
    if matches:
        return matches

    fallback = []
    for col in ds.column_schema or {}:
        if not isinstance(col, str):
            continue
        lowered = col.lower()
        if lowered in TIME_COLUMNS or "time" in lowered or "date" in lowered:
            fallback.append(col)
    return fallback


def _build_event(
    ds: Dataset,
    artifact_type: str,
    row: DatasetRow,
    time_cols: list[str],
) -> dict | None:
    """Turn one dataset row into a timeline event, or ``None`` if it has no usable timestamp."""
    # A row may carry neither payload; treat that as an empty record rather
    # than failing on ``None.get``. (Payloads are assumed to be dicts.)
    data = row.normalized_data or row.data or {}

    ts = None
    for tc in time_cols:
        ts = _parse_timestamp(data.get(tc))
        if ts:
            break
    if not ts:
        return None

    hostname = data.get("hostname") or data.get("Hostname") or data.get("Fqdn") or ""
    process = data.get("process_name") or data.get("Name") or data.get("ProcessName") or ""
    summary = data.get("command_line") or data.get("CommandLine") or data.get("Details") or ""
    return {
        "timestamp": ts,
        "dataset_id": ds.id,
        "dataset_name": ds.name,
        "artifact_type": artifact_type,
        "row_index": row.row_index,
        # Free-text fields are truncated to keep the payload bounded.
        "hostname": str(hostname)[:128],
        "process": str(process)[:128],
        "summary": str(summary)[:256],
        # Preview of the first 8 fields, each value cut to 100 characters.
        "data": {k: str(v)[:100] for k, v in list(data.items())[:8]},
    }


@router.get("/hunt/{hunt_id}")
async def get_hunt_timeline(
    hunt_id: str,
    limit: int = 2000,
    db: AsyncSession = Depends(get_db),
):
    """Build a timeline of events across all datasets in a hunt.

    Args:
        hunt_id: Identifier of the hunt whose datasets are scanned.
        limit: Maximum number of events to return. The row budget is split
            evenly across the datasets that have at least one timestamp
            column (minimum one row per dataset).
        db: Async database session.

    Returns:
        A dict with ``hunt_id``, ``hunt_name``, ``event_count``, ``datasets``
        (metadata for every dataset of the hunt) and ``events`` (sorted by
        timestamp, at most ``limit`` entries).

    Raises:
        HTTPException: 400 if ``limit`` is negative, 404 if the hunt does
            not exist.
    """
    # A negative value would otherwise reach SQL LIMIT and the final slice,
    # where it either errors or silently changes meaning.
    if limit < 0:
        raise HTTPException(status_code=400, detail="limit must be non-negative")

    result = await db.execute(select(Hunt).where(Hunt.id == hunt_id))
    hunt = result.scalar_one_or_none()
    if not hunt:
        raise HTTPException(status_code=404, detail="Hunt not found")

    result = await db.execute(select(Dataset).where(Dataset.hunt_id == hunt_id))
    datasets = result.scalars().all()

    dataset_info = []
    sources = []  # (dataset, artifact_type, time_cols) for datasets worth querying
    for ds in datasets:
        artifact_type = getattr(ds, "artifact_type", None) or "Unknown"
        dataset_info.append({
            "id": ds.id, "name": ds.name, "artifact_type": artifact_type,
            "row_count": ds.row_count,
        })
        time_cols = _find_time_columns(ds)
        if time_cols:
            sources.append((ds, artifact_type, time_cols))

    events = []
    if sources and limit > 0:
        # Split the budget only across datasets that can contribute events,
        # and never let integer division starve them down to zero rows.
        per_dataset_limit = max(limit // len(sources), 1)
        for ds, artifact_type, time_cols in sources:
            rows_result = await db.execute(
                select(DatasetRow)
                .where(DatasetRow.dataset_id == ds.id)
                .order_by(DatasetRow.row_index)
                .limit(per_dataset_limit)
            )
            for row in rows_result.scalars().all():
                event = _build_event(ds, artifact_type, row, time_cols)
                if event is not None:
                    events.append(event)

    # Timestamps are ISO strings ending in "Z", with the fraction omitted when
    # it is zero. Comparing them raw puts "...:00.500000Z" before "...:00Z"
    # because "." sorts before "Z"; dropping the suffix restores
    # chronological order.
    events.sort(key=lambda e: e["timestamp"].rstrip("Z"))
    events = events[:limit]

    return {
        "hunt_id": hunt_id,
        "hunt_name": hunt.name,
        "event_count": len(events),
        "datasets": dataset_info,
        "events": events,
    }