ThreatHunt/backend/app/api/routes/datasets.py
mblanke 9b98ab9614 feat: interactive network map, IOC highlighting, AUP hunt selector, type filters
- NetworkMap: hunt-scoped force-directed graph with click-to-inspect popover
- NetworkMap: zoom/pan (wheel, drag, buttons), viewport transform
- NetworkMap: clickable IP/Host/Domain/URL legend chips to filter node types
- NetworkMap: brighter colors, 20% smaller nodes
- DatasetViewer: IOC columns highlighted with colored headers + cell tinting
- AUPScanner: hunt dropdown replacing dataset checkboxes, auto-select all
- Rename 'Social Media (Personal)' theme to 'Social Media' with DB migration
- Fix /api/hunts timeout: Dataset.rows lazy='noload' (was selectin cascade); relationship sketch below
- Add OS column mapping to normalizer
- Full backend services, DB models, alembic migrations, new routes
- New components: Dashboard, HuntManager, FileUpload, NetworkMap, etc.
- Docker Compose deployment with nginx reverse proxy
2026-02-19 15:41:15 -05:00
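
The /api/hunts timeout fix called out in the bullets above comes down to how the Dataset→rows relationship is loaded. A minimal SQLAlchemy sketch of that change follows; the model, table, and column names are assumptions for illustration and are not copied from the repo's actual DB models.

from sqlalchemy import ForeignKey
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship


class Base(DeclarativeBase):
    pass


class Dataset(Base):
    __tablename__ = "datasets"
    id: Mapped[str] = mapped_column(primary_key=True)

    # Was lazy="selectin": listing hunts cascaded into eagerly loading every
    # stored row of every dataset. "noload" leaves .rows empty unless the rows
    # are queried explicitly (as the /rows route in this file does).
    rows: Mapped[list["DatasetRow"]] = relationship(
        back_populates="dataset",
        lazy="noload",
        cascade="all, delete-orphan",
    )


class DatasetRow(Base):
    __tablename__ = "dataset_rows"
    id: Mapped[str] = mapped_column(primary_key=True)
    dataset_id: Mapped[str] = mapped_column(ForeignKey("datasets.id"))
    dataset: Mapped["Dataset"] = relationship(back_populates="rows")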


"""API routes for dataset upload, listing, and management."""
import logging
import os
from pathlib import Path
from fastapi import APIRouter, Depends, File, HTTPException, Query, UploadFile
from pydantic import BaseModel, Field
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import settings
from app.db import get_db
from app.db.repositories.datasets import DatasetRepository
from app.services.csv_parser import parse_csv_bytes, infer_column_types
from app.services.normalizer import (
normalize_columns,
normalize_rows,
detect_ioc_columns,
detect_time_range,
)
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/datasets", tags=["datasets"])
ALLOWED_EXTENSIONS = {".csv", ".tsv", ".txt"}
# ── Response models ───────────────────────────────────────────────────

class DatasetSummary(BaseModel):
    id: str
    name: str
    filename: str
    source_tool: str | None = None
    row_count: int
    column_schema: dict | None = None
    normalized_columns: dict | None = None
    ioc_columns: dict | None = None
    file_size_bytes: int
    encoding: str | None = None
    delimiter: str | None = None
    time_range_start: str | None = None
    time_range_end: str | None = None
    hunt_id: str | None = None
    created_at: str


class DatasetListResponse(BaseModel):
    datasets: list[DatasetSummary]
    total: int


class RowsResponse(BaseModel):
    rows: list[dict]
    total: int
    offset: int
    limit: int


class UploadResponse(BaseModel):
    id: str
    name: str
    row_count: int
    columns: list[str]
    column_types: dict
    normalized_columns: dict
    ioc_columns: dict
    message: str
# ── Routes ────────────────────────────────────────────────────────────

@router.post(
    "/upload",
    response_model=UploadResponse,
    summary="Upload a CSV dataset",
    description="Upload a CSV/TSV file for analysis. The file is parsed, columns normalized, "
                "IOCs auto-detected, and rows stored in the database.",
)
async def upload_dataset(
    file: UploadFile = File(...),
    name: str | None = Query(None, description="Display name for the dataset"),
    source_tool: str | None = Query(None, description="Source tool (e.g., velociraptor)"),
    hunt_id: str | None = Query(None, description="Hunt ID to associate with"),
    db: AsyncSession = Depends(get_db),
):
    """Upload and parse a CSV dataset."""
    # Validate file
    if not file.filename:
        raise HTTPException(status_code=400, detail="No filename provided")
    ext = Path(file.filename).suffix.lower()
    if ext not in ALLOWED_EXTENSIONS:
        raise HTTPException(
            status_code=400,
            detail=f"File type '{ext}' not allowed. Accepted: {', '.join(ALLOWED_EXTENSIONS)}",
        )

    # Read file bytes
    raw_bytes = await file.read()
    if len(raw_bytes) == 0:
        raise HTTPException(status_code=400, detail="File is empty")
    if len(raw_bytes) > settings.max_upload_bytes:
        raise HTTPException(
            status_code=413,
            detail=f"File too large. Max size: {settings.MAX_UPLOAD_SIZE_MB} MB",
        )

    # Parse CSV
    try:
        rows, metadata = parse_csv_bytes(raw_bytes)
    except Exception as e:
        logger.error(f"CSV parse error: {e}")
        raise HTTPException(
            status_code=422,
            detail=f"Failed to parse CSV: {str(e)}. Check encoding and format.",
        )
    if not rows:
        raise HTTPException(status_code=422, detail="CSV file contains no data rows")

    columns: list[str] = metadata["columns"]
    column_types: dict = metadata["column_types"]

    # Normalize columns
    column_mapping = normalize_columns(columns)
    normalized = normalize_rows(rows, column_mapping)

    # Detect IOCs
    ioc_columns = detect_ioc_columns(columns, column_types, column_mapping)

    # Detect time range
    time_start, time_end = detect_time_range(rows, column_mapping)

    # Store in DB
    repo = DatasetRepository(db)
    dataset = await repo.create_dataset(
        name=name or Path(file.filename).stem,
        filename=file.filename,
        source_tool=source_tool,
        row_count=len(rows),
        column_schema=column_types,
        normalized_columns=column_mapping,
        ioc_columns=ioc_columns,
        file_size_bytes=len(raw_bytes),
        encoding=metadata["encoding"],
        delimiter=metadata["delimiter"],
        time_range_start=time_start,
        time_range_end=time_end,
        hunt_id=hunt_id,
    )
    await repo.bulk_insert_rows(
        dataset_id=dataset.id,
        rows=rows,
        normalized_rows=normalized,
    )

    logger.info(
        f"Uploaded dataset '{dataset.name}': {len(rows)} rows, "
        f"{len(columns)} columns, {len(ioc_columns)} IOC columns detected"
    )
    return UploadResponse(
        id=dataset.id,
        name=dataset.name,
        row_count=len(rows),
        columns=columns,
        column_types=column_types,
        normalized_columns=column_mapping,
        ioc_columns=ioc_columns,
        message=f"Successfully uploaded {len(rows)} rows with {len(ioc_columns)} IOC columns detected",
    )
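

# Illustrative client call for the upload route above. The host, dataset name,
# hunt id, and file name are placeholders (assumptions for the example), not
# values from this repo:
#
#   curl -X POST \
#        "http://localhost:8000/api/datasets/upload?name=DNS%20Logs&hunt_id=<hunt-uuid>" \
#        -F "file=@dns_logs.csv"
#
# The multipart "file" field maps to the UploadFile parameter, while name,
# source_tool, and hunt_id arrive as query-string parameters per the Query()
# declarations above.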
@router.get(
    "",
    response_model=DatasetListResponse,
    summary="List datasets",
)
async def list_datasets(
    hunt_id: str | None = Query(None),
    limit: int = Query(100, ge=1, le=1000),
    offset: int = Query(0, ge=0),
    db: AsyncSession = Depends(get_db),
):
    repo = DatasetRepository(db)
    datasets = await repo.list_datasets(hunt_id=hunt_id, limit=limit, offset=offset)
    total = await repo.count_datasets(hunt_id=hunt_id)
    return DatasetListResponse(
        datasets=[
            DatasetSummary(
                id=ds.id,
                name=ds.name,
                filename=ds.filename,
                source_tool=ds.source_tool,
                row_count=ds.row_count,
                column_schema=ds.column_schema,
                normalized_columns=ds.normalized_columns,
                ioc_columns=ds.ioc_columns,
                file_size_bytes=ds.file_size_bytes,
                encoding=ds.encoding,
                delimiter=ds.delimiter,
                time_range_start=ds.time_range_start.isoformat() if ds.time_range_start else None,
                time_range_end=ds.time_range_end.isoformat() if ds.time_range_end else None,
                hunt_id=ds.hunt_id,
                created_at=ds.created_at.isoformat(),
            )
            for ds in datasets
        ],
        total=total,
    )
@router.get(
    "/{dataset_id}",
    response_model=DatasetSummary,
    summary="Get dataset details",
)
async def get_dataset(
    dataset_id: str,
    db: AsyncSession = Depends(get_db),
):
    repo = DatasetRepository(db)
    ds = await repo.get_dataset(dataset_id)
    if not ds:
        raise HTTPException(status_code=404, detail="Dataset not found")
    return DatasetSummary(
        id=ds.id,
        name=ds.name,
        filename=ds.filename,
        source_tool=ds.source_tool,
        row_count=ds.row_count,
        column_schema=ds.column_schema,
        normalized_columns=ds.normalized_columns,
        ioc_columns=ds.ioc_columns,
        file_size_bytes=ds.file_size_bytes,
        encoding=ds.encoding,
        delimiter=ds.delimiter,
        time_range_start=ds.time_range_start.isoformat() if ds.time_range_start else None,
        time_range_end=ds.time_range_end.isoformat() if ds.time_range_end else None,
        hunt_id=ds.hunt_id,
        created_at=ds.created_at.isoformat(),
    )
@router.get(
    "/{dataset_id}/rows",
    response_model=RowsResponse,
    summary="Get dataset rows",
)
async def get_dataset_rows(
    dataset_id: str,
    limit: int = Query(1000, ge=1, le=10000),
    offset: int = Query(0, ge=0),
    normalized: bool = Query(False, description="Return normalized column names"),
    db: AsyncSession = Depends(get_db),
):
    repo = DatasetRepository(db)
    ds = await repo.get_dataset(dataset_id)
    if not ds:
        raise HTTPException(status_code=404, detail="Dataset not found")
    rows = await repo.get_rows(dataset_id, limit=limit, offset=offset)
    total = await repo.count_rows(dataset_id)
    return RowsResponse(
        rows=[
            (r.normalized_data if normalized and r.normalized_data else r.data)
            for r in rows
        ],
        total=total,
        offset=offset,
        limit=limit,
    )
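

# Example of the normalized flag above, with illustrative column names (not
# taken from a real dataset): normalized=false returns the row under its
# original headers, e.g.
#   {"Source IP": "10.0.0.5", "Event Time": "2026-02-19T14:00:00Z"}
# while normalized=true returns the stored normalized_data instead, e.g.
#   {"src_ip": "10.0.0.5", "timestamp": "2026-02-19T14:00:00Z"}
# Rows without stored normalized_data fall back to their raw data, per the
# comprehension in get_dataset_rows.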
@router.delete(
    "/{dataset_id}",
    summary="Delete a dataset",
)
async def delete_dataset(
    dataset_id: str,
    db: AsyncSession = Depends(get_db),
):
    repo = DatasetRepository(db)
    deleted = await repo.delete_dataset(dataset_id)
    if not deleted:
        raise HTTPException(status_code=404, detail="Dataset not found")
    return {"message": "Dataset deleted", "id": dataset_id}