Implement Phase 5: Distributed LLM Routing Architecture

Co-authored-by: mblanke <9078342+mblanke@users.noreply.github.com>
2026-03-01 14:00:20 -05:00 · 2025-12-09 18:01:37 +00:00
parent abe97ab26c
commit a6fe219a33
8 changed files with 1829 additions and 5 deletions
--- a/backend/app/core/job_scheduler.py
+++ b/backend/app/core/job_scheduler.py
@@ -0,0 +1,263 @@
+"""
+Job Scheduler
+
+Manages job distribution across GPU nodes based on availability and load.
+"""
+
+from typing import Dict, Any, List, Optional
+from dataclasses import dataclass
+from enum import Enum
+import asyncio
+
+
+class NodeStatus(Enum):
+    """Status of GPU node"""
+    AVAILABLE = "available"
+    BUSY = "busy"
+    OFFLINE = "offline"
+
+
+@dataclass
+class GPUNode:
+    """Represents a GPU compute node"""
+    node_id: str
+    hostname: str
+    port: int
+    vram_total_gb: int
+    vram_used_gb: int
+    compute_utilization: float  # 0.0 to 1.0
+    status: NodeStatus
+    models_loaded: List[str]
+    
+    @property
+    def vram_available_gb(self) -> int:
+        """Calculate available VRAM"""
+        return self.vram_total_gb - self.vram_used_gb
+    
+    @property
+    def is_available(self) -> bool:
+        """Check if node is available for work"""
+        return self.status == NodeStatus.AVAILABLE and self.compute_utilization < 0.9
+
+
+@dataclass
+class Job:
+    """Represents an LLM job"""
+    job_id: str
+    model: str
+    priority: int
+    estimated_vram_gb: int
+    requires_parallel: bool
+    requires_chaining: bool
+    payload: Dict[str, Any]
+
+
+class JobScheduler:
+    """
+    Job Scheduler - Manages distribution of LLM jobs across GPU nodes
+    
+    Decides:
+    - Which GB10 device is available
+    - GPU load (VRAM, compute utilization)
+    - Whether to parallelize across both nodes
+    - Whether job requires serial reasoning (chained)
+    """
+    
+    def __init__(self):
+        """Initialize job scheduler"""
+        self.nodes: Dict[str, GPUNode] = {}
+        self.job_queue: List[Job] = []
+        self._initialize_nodes()
+    
+    def _initialize_nodes(self):
+        """Initialize GPU node configuration"""
+        # GB10 Node 1
+        self.nodes["gb10-node-1"] = GPUNode(
+            node_id="gb10-node-1",
+            hostname="gb10-node-1",
+            port=8001,
+            vram_total_gb=80,
+            vram_used_gb=0,
+            compute_utilization=0.0,
+            status=NodeStatus.AVAILABLE,
+            models_loaded=["deepseek", "qwen72"]
+        )
+        
+        # GB10 Node 2
+        self.nodes["gb10-node-2"] = GPUNode(
+            node_id="gb10-node-2",
+            hostname="gb10-node-2",
+            port=8001,
+            vram_total_gb=80,
+            vram_used_gb=0,
+            compute_utilization=0.0,
+            status=NodeStatus.AVAILABLE,
+            models_loaded=["phi4", "qwen-coder", "llama31", "granite-guardian"]
+        )
+    
+    def get_available_nodes(self) -> List[GPUNode]:
+        """Get list of available nodes"""
+        return [node for node in self.nodes.values() if node.is_available]
+    
+    def find_best_node(self, job: Job) -> Optional[GPUNode]:
+        """
+        Find best node for a job based on availability and requirements
+        
+        Args:
+            job: Job to schedule
+        
+        Returns:
+            Best GPU node or None if unavailable
+        """
+        available_nodes = self.get_available_nodes()
+        
+        # Filter nodes that have required model loaded
+        suitable_nodes = [
+            node for node in available_nodes
+            if job.model in node.models_loaded
+            and node.vram_available_gb >= job.estimated_vram_gb
+        ]
+        
+        if not suitable_nodes:
+            return None
+        
+        # Sort by compute utilization (prefer less loaded nodes)
+        suitable_nodes.sort(key=lambda n: n.compute_utilization)
+        
+        return suitable_nodes[0]
+    
+    def should_parallelize(self, job: Job) -> bool:
+        """
+        Determine if job should be parallelized across multiple nodes
+        
+        Args:
+            job: Job to evaluate
+        
+        Returns:
+            True if should parallelize
+        """
+        available_nodes = self.get_available_nodes()
+        
+        # Need at least 2 nodes for parallelization
+        if len(available_nodes) < 2:
+            return False
+        
+        # Job explicitly requires parallel execution
+        if job.requires_parallel:
+            return True
+        
+        # High priority jobs with multiple available nodes
+        if job.priority >= 1 and len(available_nodes) >= 2:
+            return True
+        
+        return False
+    
+    def get_parallel_nodes(self, job: Job) -> List[GPUNode]:
+        """
+        Get nodes for parallel execution
+        
+        Args:
+            job: Job to parallelize
+        
+        Returns:
+            List of nodes to use
+        """
+        available_nodes = self.get_available_nodes()
+        
+        # Filter nodes with required model and sufficient VRAM
+        suitable_nodes = [
+            node for node in available_nodes
+            if job.model in node.models_loaded
+            and node.vram_available_gb >= job.estimated_vram_gb
+        ]
+        
+        # Return up to 2 nodes for parallel execution
+        return suitable_nodes[:2]
+    
+    async def schedule_job(self, job: Job) -> Dict[str, Any]:
+        """
+        Schedule a job for execution
+        
+        Args:
+            job: Job to schedule
+        
+        Returns:
+            Scheduling decision with node assignments
+        """
+        # Check if job should be parallelized
+        if self.should_parallelize(job):
+            nodes = self.get_parallel_nodes(job)
+            if len(nodes) >= 2:
+                return {
+                    "job_id": job.job_id,
+                    "execution_mode": "parallel",
+                    "nodes": [
+                        {"node_id": node.node_id, "endpoint": f"http://{node.hostname}:{node.port}/{job.model}"}
+                        for node in nodes
+                    ],
+                    "estimated_time": "distributed"
+                }
+        
+        # Serial execution on single node
+        node = self.find_best_node(job)
+        
+        if not node:
+            # Add to queue if no nodes available
+            self.job_queue.append(job)
+            return {
+                "job_id": job.job_id,
+                "execution_mode": "queued",
+                "status": "waiting_for_resources",
+                "queue_position": len(self.job_queue)
+            }
+        
+        return {
+            "job_id": job.job_id,
+            "execution_mode": "serial" if job.requires_chaining else "single",
+            "node": {
+                "node_id": node.node_id,
+                "endpoint": f"http://{node.hostname}:{node.port}/{job.model}"
+            },
+            "vram_allocated_gb": job.estimated_vram_gb,
+            "estimated_time": "standard"
+        }
+    
+    def update_node_status(
+        self,
+        node_id: str,
+        vram_used_gb: Optional[int] = None,
+        compute_utilization: Optional[float] = None,
+        status: Optional[NodeStatus] = None
+    ):
+        """
+        Update node status metrics
+        
+        Args:
+            node_id: Node to update
+            vram_used_gb: Current VRAM usage
+            compute_utilization: Current compute utilization (0.0-1.0)
+            status: Node status
+        """
+        if node_id not in self.nodes:
+            return
+        
+        node = self.nodes[node_id]
+        
+        if vram_used_gb is not None:
+            node.vram_used_gb = vram_used_gb
+        
+        if compute_utilization is not None:
+            node.compute_utilization = compute_utilization
+        
+        if status is not None:
+            node.status = status
+
+
+def get_job_scheduler() -> JobScheduler:
+    """
+    Factory function to create job scheduler
+    
+    Returns:
+        Configured JobScheduler instance
+    """
+    return JobScheduler()