Files
ThreatHunt/backend/app/services/job_queue.py
mblanke 04a9946891 feat: host-centric network map, analysis dashboard, deduped inventory
- Rewrote NetworkMap to use deduplicated host inventory (163 hosts from 394K rows)
- New host_inventory.py service: scans datasets, groups by FQDN/ClientId, extracts IPs/users/OS
- New /api/network/host-inventory endpoint
- Added AnalysisDashboard with 6 tabs (IOC, anomaly, host profile, query, triage, reports)
- Added 16 analysis API endpoints with job queue and load balancer
- Added 4 AI/analysis ORM models (ProcessingJob, AnalysisResult, HostProfile, IOCEntry)
- Filters system accounts (DWM-*, UMFD-*, LOCAL/NETWORK SERVICE)
- Infers OS from hostname patterns (W10-* -> Windows 10)
- Canvas 2D force-directed graph with host/external-IP node types
- Click popover shows hostname, FQDN, IPs, OS, users, datasets, connections
2026-02-20 07:16:17 -05:00

316 lines
10 KiB
Python

"""Async job queue for background AI tasks.
Manages triage, profiling, report generation, anomaly detection,
and data queries as trackable jobs with status, progress, and
cancellation support.
"""
from __future__ import annotations
import asyncio
import logging
import time
import uuid
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Callable, Coroutine, Optional
logger = logging.getLogger(__name__)
class JobStatus(str, Enum):
    """Lifecycle states a job moves through.

    Members are also ``str`` instances, so they compare equal to their
    plain string values.
    """

    # Waiting in the queue; no worker has picked it up yet.
    QUEUED = "queued"
    # A worker is currently awaiting the job's handler.
    RUNNING = "running"
    # Terminal states.
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
class JobType(str, Enum):
    """Kinds of background work the queue can dispatch.

    Each member is used as the lookup key for a registered handler.
    Members are also ``str`` instances equal to their values.
    """

    TRIAGE = "triage"
    HOST_PROFILE = "host_profile"
    REPORT = "report"
    ANOMALY = "anomaly"
    QUERY = "query"
@dataclass
class Job:
    """One trackable unit of background work.

    Holds the job's identity, type, status, progress, outcome, timestamps
    (seconds from ``time.time()``) and the keyword parameters it was
    submitted with, plus a private event used as the cancellation flag.
    """

    id: str
    job_type: JobType
    status: JobStatus = JobStatus.QUEUED
    progress: float = 0.0  # 0-100
    message: str = ""
    result: Any = None
    error: str | None = None
    created_at: float = field(default_factory=time.time)
    started_at: float | None = None
    completed_at: float | None = None
    params: dict = field(default_factory=dict)
    _cancel_event: asyncio.Event = field(default_factory=asyncio.Event, repr=False)

    @property
    def is_cancelled(self) -> bool:
        """Whether ``cancel()`` has been called on this job."""
        return self._cancel_event.is_set()

    @property
    def elapsed_ms(self) -> int:
        """Milliseconds between start and end of the job.

        The start is ``started_at`` when set, otherwise ``created_at``;
        the end is ``completed_at`` when set, otherwise the current time.
        """
        if self.completed_at:
            finished = self.completed_at
        else:
            finished = time.time()
        if self.started_at:
            began = self.started_at
        else:
            began = self.created_at
        return int((finished - began) * 1000)

    def cancel(self):
        """Flag the job as cancelled and stamp its completion time."""
        self.status = JobStatus.CANCELLED
        self.message = "Cancelled by user"
        self.completed_at = time.time()
        self._cancel_event.set()

    def to_dict(self) -> dict:
        """Return a plain-dict snapshot of the job.

        ``result`` and the private cancel event are not included;
        ``progress`` is rounded to one decimal place.
        """
        snapshot = {
            "id": self.id,
            "job_type": self.job_type.value,
            "status": self.status.value,
            "progress": round(self.progress, 1),
            "message": self.message,
            "error": self.error,
            "created_at": self.created_at,
            "started_at": self.started_at,
            "completed_at": self.completed_at,
            "elapsed_ms": self.elapsed_ms,
            "params": self.params,
        }
        return snapshot
class JobQueue:
    """In-memory async job queue with concurrency control.

    Jobs are tracked by ID and can be listed, polled, or cancelled.
    A configurable number of workers process jobs from the queue.
    """

    def __init__(self, max_workers: int = 3):
        """Create an idle queue; no worker runs until ``start()`` is awaited.

        Args:
            max_workers: Number of worker tasks ``start()`` will create.
        """
        self._jobs: dict[str, Job] = {}
        self._queue: asyncio.Queue[str] = asyncio.Queue()
        self._max_workers = max_workers
        self._workers: list[asyncio.Task] = []
        self._handlers: dict[JobType, Callable] = {}
        self._started = False

    @staticmethod
    def _is_terminal(job: Job) -> bool:
        """Return True when the job is completed, failed, or cancelled."""
        return job.status in (
            JobStatus.COMPLETED,
            JobStatus.FAILED,
            JobStatus.CANCELLED,
        )

    def register_handler(
        self,
        job_type: JobType,
        handler: Callable[[Job], Coroutine],
    ):
        """Register an async handler for a job type.

        Handler signature: async def handler(job: Job) -> Any
        The handler can update job.progress and job.message during execution.
        It should check job.is_cancelled periodically and return early.
        Registering a second handler for the same type replaces the first.
        """
        self._handlers[job_type] = handler
        logger.info(f"Registered handler for {job_type.value}")

    async def start(self):
        """Start worker tasks. Does nothing if the queue is already started."""
        if self._started:
            return
        self._started = True
        for i in range(self._max_workers):
            task = asyncio.create_task(self._worker(i))
            self._workers.append(task)
        logger.info(f"Job queue started with {self._max_workers} workers")

    async def stop(self):
        """Stop all workers.

        Every worker task is cancelled and awaited. A job whose handler is
        interrupted by this is marked cancelled by the worker.
        """
        self._started = False
        for w in self._workers:
            w.cancel()
        # return_exceptions=True so the workers' CancelledError results do
        # not propagate out of stop().
        await asyncio.gather(*self._workers, return_exceptions=True)
        self._workers.clear()
        logger.info("Job queue stopped")

    def submit(self, job_type: JobType, **params) -> Job:
        """Submit a new job. Returns the Job object immediately.

        Args:
            job_type: Which registered handler should process the job.
            **params: Stored on the job as ``job.params`` for the handler.
        """
        job = Job(
            id=str(uuid.uuid4()),
            job_type=job_type,
            params=params,
        )
        self._jobs[job.id] = job
        self._queue.put_nowait(job.id)
        logger.info(f"Job submitted: {job.id} ({job_type.value}) params={params}")
        return job

    def get_job(self, job_id: str) -> Job | None:
        """Return the job with this ID, or None if it is unknown."""
        return self._jobs.get(job_id)

    def cancel_job(self, job_id: str) -> bool:
        """Cancel a queued or running job.

        Returns:
            True if the job was flagged as cancelled; False if the ID is
            unknown or the job had already completed, failed, or been
            cancelled.
        """
        job = self._jobs.get(job_id)
        if not job:
            return False
        if self._is_terminal(job):
            return False
        job.cancel()
        return True

    def list_jobs(
        self,
        status: JobStatus | None = None,
        job_type: JobType | None = None,
        limit: int = 50,
    ) -> list[dict]:
        """List jobs, newest first.

        Args:
            status: Only include jobs in this status, when given.
            job_type: Only include jobs of this type, when given.
            limit: Maximum number of entries, applied after filtering.

        Returns:
            ``Job.to_dict()`` snapshots ordered by ``created_at`` descending.
        """
        jobs = sorted(self._jobs.values(), key=lambda j: j.created_at, reverse=True)
        if status:
            jobs = [j for j in jobs if j.status == status]
        if job_type:
            jobs = [j for j in jobs if j.job_type == job_type]
        return [j.to_dict() for j in jobs[:limit]]

    def get_stats(self) -> dict:
        """Get queue statistics.

        ``queued`` is the number of IDs still waiting in the internal queue;
        it can include jobs cancelled before a worker picked them up, since
        workers only discard those when they dequeue them.
        """
        by_status = {}
        for j in self._jobs.values():
            by_status[j.status.value] = by_status.get(j.status.value, 0) + 1
        return {
            "total": len(self._jobs),
            "queued": self._queue.qsize(),
            "by_status": by_status,
            "workers": self._max_workers,
            "active_workers": sum(
                1 for j in self._jobs.values() if j.status == JobStatus.RUNNING
            ),
        }

    def cleanup(self, max_age_seconds: float = 3600):
        """Remove old completed/failed/cancelled jobs.

        A job's age is measured from when it finished (``completed_at``),
        falling back to ``created_at`` if no finish time was recorded, so a
        job that spent a long time queued or running is not purged the
        moment it completes.

        Args:
            max_age_seconds: Finished jobs older than this are dropped.
        """
        now = time.time()
        to_remove = []
        for jid, j in self._jobs.items():
            if not self._is_terminal(j):
                continue
            finished_at = j.completed_at if j.completed_at is not None else j.created_at
            if (now - finished_at) > max_age_seconds:
                to_remove.append(jid)
        for jid in to_remove:
            del self._jobs[jid]
        if to_remove:
            logger.info(f"Cleaned up {len(to_remove)} old jobs")

    async def _worker(self, worker_id: int):
        """Worker loop: pull jobs from queue and execute handlers.

        Runs until the queue is stopped. Handler exceptions are recorded on
        the job as a failure and do not terminate the worker; cancellation of
        the worker task itself ends the loop.
        """
        logger.info(f"Worker {worker_id} started")
        while self._started:
            try:
                # The timeout lets the loop re-check self._started regularly.
                job_id = await asyncio.wait_for(self._queue.get(), timeout=5.0)
            except asyncio.TimeoutError:
                continue
            except asyncio.CancelledError:
                break

            job = self._jobs.get(job_id)
            # Skip IDs whose job was cleaned up or cancelled while queued.
            if not job or job.is_cancelled:
                continue

            handler = self._handlers.get(job.job_type)
            if not handler:
                job.status = JobStatus.FAILED
                job.error = f"No handler for {job.job_type.value}"
                job.completed_at = time.time()
                logger.error(f"No handler for job type {job.job_type.value}")
                continue

            job.status = JobStatus.RUNNING
            job.started_at = time.time()
            job.message = "Running..."
            logger.info(f"Worker {worker_id}: executing {job.id} ({job.job_type.value})")

            try:
                result = await handler(job)
                # A job cancelled mid-run keeps its CANCELLED status; the
                # handler's result is discarded.
                if not job.is_cancelled:
                    job.status = JobStatus.COMPLETED
                    job.progress = 100.0
                    job.result = result
                    job.message = "Completed"
                    job.completed_at = time.time()
                    logger.info(
                        f"Worker {worker_id}: completed {job.id} "
                        f"in {job.elapsed_ms}ms"
                    )
            except asyncio.CancelledError:
                # The worker task was cancelled (e.g. by stop()) while the
                # handler was running. Listed before `except Exception`
                # because CancelledError is a BaseException on Python 3.8+
                # and would otherwise leave the job stuck in RUNNING.
                if not self._is_terminal(job):
                    job.cancel()
                    job.message = "Cancelled: job queue stopped"
                logger.warning(
                    f"Worker {worker_id}: interrupted {job.id} (queue stopped)"
                )
                raise
            except Exception as e:
                if not job.is_cancelled:
                    job.status = JobStatus.FAILED
                    job.error = str(e)
                    job.message = f"Failed: {e}"
                    job.completed_at = time.time()
                logger.error(
                    f"Worker {worker_id}: failed {job.id}: {e}",
                    exc_info=True,
                )
# Singleton + job handlers
# Shared module-level queue that register_all_handlers() attaches the
# handlers below to. It is constructed at import time with three workers
# configured; the worker tasks themselves are only created once start()
# is awaited.
job_queue = JobQueue(max_workers=3)
async def _handle_triage(job: Job):
    """Triage handler.

    Reads ``dataset_id`` from ``job.params``, awaits ``triage_dataset`` on
    it, and returns ``{"count": N}`` where N is the number of results
    (0 when the call returns nothing).
    """
    # Imported lazily, at call time rather than module import time.
    from app.services.triage import triage_dataset

    dataset_id = job.params.get("dataset_id")
    job.message = f"Triaging dataset {dataset_id}"

    triaged = await triage_dataset(dataset_id)
    if not triaged:
        return {"count": 0}
    return {"count": len(triaged)}
async def _handle_host_profile(job: Job):
    """Host profiling handler.

    With a ``hostname`` in ``job.params`` only that host is profiled and
    ``{"hostname": ...}`` is returned; otherwise every host in the hunt is
    profiled and ``{"hunt_id": ...}`` is returned.
    """
    # Imported lazily, at call time rather than module import time.
    from app.services.host_profiler import profile_all_hosts, profile_host

    hunt_id = job.params.get("hunt_id")
    hostname = job.params.get("hostname")

    # No (or empty) hostname: fall back to profiling the whole hunt.
    if not hostname:
        job.message = f"Profiling all hosts in hunt {hunt_id}"
        await profile_all_hosts(hunt_id)
        return {"hunt_id": hunt_id}

    job.message = f"Profiling host {hostname}"
    await profile_host(hunt_id, hostname)
    return {"hostname": hostname}
async def _handle_report(job: Job):
    """Report generation handler.

    Reads ``hunt_id`` from ``job.params``, awaits ``generate_report`` on it,
    and returns ``{"report_id": ...}`` with the report's ``id``, or ``None``
    when no report came back.
    """
    # Imported lazily, at call time rather than module import time.
    from app.services.report_generator import generate_report

    hunt_id = job.params.get("hunt_id")
    job.message = f"Generating report for hunt {hunt_id}"

    report = await generate_report(hunt_id)
    if not report:
        return {"report_id": None}
    return {"report_id": report.id}
async def _handle_anomaly(job: Job):
    """Anomaly detection handler.

    Reads ``dataset_id``, ``k`` (default 3) and ``threshold`` (default 0.35)
    from ``job.params``, awaits ``detect_anomalies`` with them, and returns
    ``{"count": N}`` where N is the number of results (0 when the call
    returns nothing).
    """
    # Imported lazily, at call time rather than module import time.
    from app.services.anomaly_detector import detect_anomalies

    options = job.params
    dataset_id = options.get("dataset_id")
    k = options.get("k", 3)
    threshold = options.get("threshold", 0.35)
    job.message = f"Detecting anomalies in dataset {dataset_id}"

    # The job's "threshold" parameter is forwarded as outlier_threshold.
    detected = await detect_anomalies(dataset_id, k=k, outlier_threshold=threshold)
    if not detected:
        return {"count": 0}
    return {"count": len(detected)}
async def _handle_query(job: Job):
    """Data query handler (non-streaming).

    Reads ``dataset_id``, ``question`` (default empty string) and ``mode``
    (default ``"quick"``) from ``job.params``, awaits ``query_dataset`` with
    them, and returns the outcome as ``{"answer": ...}``.
    """
    # Imported lazily, at call time rather than module import time.
    from app.services.data_query import query_dataset

    options = job.params
    dataset_id = options.get("dataset_id")
    question = options.get("question", "")
    mode = options.get("mode", "quick")
    job.message = f"Querying dataset {dataset_id}"

    return {"answer": await query_dataset(dataset_id, question, mode)}
def register_all_handlers():
    """Register all job handlers.

    Attaches one handler per job type to the module-level ``job_queue``.
    """
    # Kept as an ordered table so registration happens in a fixed sequence.
    bindings = (
        (JobType.TRIAGE, _handle_triage),
        (JobType.HOST_PROFILE, _handle_host_profile),
        (JobType.REPORT, _handle_report),
        (JobType.ANOMALY, _handle_anomaly),
        (JobType.QUERY, _handle_query),
    )
    for kind, handler in bindings:
        job_queue.register_handler(kind, handler)