"""
Job Scheduler

Manages job distribution across GPU nodes based on availability and load.
"""

from typing import Dict, Any, List, Optional
from dataclasses import dataclass
from enum import Enum
import asyncio


# A node at or above this compute utilization is not offered new work.
_MAX_SCHEDULABLE_UTILIZATION = 0.9


class NodeStatus(Enum):
    """Status of GPU node"""

    AVAILABLE = "available"
    BUSY = "busy"
    OFFLINE = "offline"


@dataclass
class GPUNode:
    """Represents a GPU compute node"""

    node_id: str
    hostname: str
    port: int
    vram_total_gb: int
    vram_used_gb: int
    compute_utilization: float  # 0.0 to 1.0
    status: NodeStatus
    models_loaded: List[str]

    @property
    def vram_available_gb(self) -> int:
        """Return total VRAM minus used VRAM, in GB."""
        return self.vram_total_gb - self.vram_used_gb

    @property
    def is_available(self) -> bool:
        """Return True when the node is AVAILABLE and below the load cutoff."""
        return (
            self.status == NodeStatus.AVAILABLE
            and self.compute_utilization < _MAX_SCHEDULABLE_UTILIZATION
        )


@dataclass
class Job:
    """Represents an LLM job"""

    job_id: str
    model: str
    priority: int
    estimated_vram_gb: int
    requires_parallel: bool
    requires_chaining: bool
    payload: Dict[str, Any]


class JobScheduler:
    """
    Job Scheduler - Manages distribution of LLM jobs across GPU nodes

    Decides:
    - Which GB10 device is available
    - GPU load (VRAM, compute utilization)
    - Whether to parallelize across both nodes
    - Whether job requires serial reasoning (chained)
    """

    # Number of nodes a parallel job is spread across.
    _PARALLEL_NODE_COUNT = 2

    def __init__(self) -> None:
        """Create the scheduler with its node table and an empty job queue."""
        self.nodes: Dict[str, GPUNode] = {}
        self.job_queue: List[Job] = []
        self._initialize_nodes()

    def _initialize_nodes(self) -> None:
        """Register the two GB10 nodes with their initial (idle) metrics."""
        # GB10 Node 1
        self.nodes["gb10-node-1"] = GPUNode(
            node_id="gb10-node-1",
            hostname="gb10-node-1",
            port=8001,
            vram_total_gb=80,
            vram_used_gb=0,
            compute_utilization=0.0,
            status=NodeStatus.AVAILABLE,
            models_loaded=["deepseek", "qwen72"],
        )
        # GB10 Node 2
        self.nodes["gb10-node-2"] = GPUNode(
            node_id="gb10-node-2",
            hostname="gb10-node-2",
            port=8001,
            vram_total_gb=80,
            vram_used_gb=0,
            compute_utilization=0.0,
            status=NodeStatus.AVAILABLE,
            models_loaded=["phi4", "qwen-coder", "llama31", "granite-guardian"],
        )

    @staticmethod
    def _endpoint(node: GPUNode, job: Job) -> str:
        """Build the HTTP endpoint for running the job's model on a node."""
        return f"http://{node.hostname}:{node.port}/{job.model}"

    def get_available_nodes(self) -> List[GPUNode]:
        """Get list of available nodes"""
        return [node for node in self.nodes.values() if node.is_available]

    def _suitable_nodes(self, job: Job) -> List[GPUNode]:
        """Return available nodes that have the job's model and enough VRAM."""
        return [
            node
            for node in self.get_available_nodes()
            if job.model in node.models_loaded
            and node.vram_available_gb >= job.estimated_vram_gb
        ]

    def find_best_node(self, job: Job) -> Optional[GPUNode]:
        """
        Find best node for a job based on availability and requirements

        Args:
            job: Job to schedule

        Returns:
            The suitable node with the lowest compute utilization, or None
            if no available node has the model loaded and enough free VRAM
        """
        # min() returns the first minimal element, so ties resolve to the
        # earliest registered node, same as a stable sort followed by [0].
        return min(
            self._suitable_nodes(job),
            key=lambda n: n.compute_utilization,
            default=None,
        )

    def should_parallelize(self, job: Job) -> bool:
        """
        Determine if job should be parallelized across multiple nodes

        Args:
            job: Job to evaluate

        Returns:
            True if should parallelize
        """
        # Need at least 2 available nodes for parallelization
        if len(self.get_available_nodes()) < self._PARALLEL_NODE_COUNT:
            return False

        # Job explicitly requires parallel execution
        if job.requires_parallel:
            return True

        # A job that needs chained (serial) execution must not be fanned out
        # merely because it is high priority.
        if job.requires_chaining:
            return False

        # High priority jobs are parallelized opportunistically
        return job.priority >= 1

    def get_parallel_nodes(self, job: Job) -> List[GPUNode]:
        """
        Get nodes for parallel execution

        Args:
            job: Job to parallelize

        Returns:
            Up to two suitable nodes, least loaded first
        """
        # sorted() is stable, so equally loaded nodes keep registration order.
        by_load = sorted(
            self._suitable_nodes(job), key=lambda n: n.compute_utilization
        )
        return by_load[: self._PARALLEL_NODE_COUNT]

    async def schedule_job(self, job: Job) -> Dict[str, Any]:
        """
        Schedule a job for execution

        Args:
            job: Job to schedule

        Returns:
            Scheduling decision with node assignments. The "execution_mode"
            key is one of "parallel", "serial", "single" or "queued".
        """
        # Parallel execution, when wanted and enough suitable nodes exist
        if self.should_parallelize(job):
            nodes = self.get_parallel_nodes(job)
            if len(nodes) >= self._PARALLEL_NODE_COUNT:
                return {
                    "job_id": job.job_id,
                    "execution_mode": "parallel",
                    "nodes": [
                        {
                            "node_id": node.node_id,
                            "endpoint": self._endpoint(node, job),
                        }
                        for node in nodes
                    ],
                    "estimated_time": "distributed",
                }

        # Otherwise fall back to a single node
        node = self.find_best_node(job)

        if not node:
            # Add to queue if no nodes available
            self.job_queue.append(job)
            return {
                "job_id": job.job_id,
                "execution_mode": "queued",
                "status": "waiting_for_resources",
                "queue_position": len(self.job_queue),
            }

        return {
            "job_id": job.job_id,
            "execution_mode": "serial" if job.requires_chaining else "single",
            "node": {
                "node_id": node.node_id,
                "endpoint": self._endpoint(node, job),
            },
            "vram_allocated_gb": job.estimated_vram_gb,
            "estimated_time": "standard",
        }

    def update_node_status(
        self,
        node_id: str,
        vram_used_gb: Optional[int] = None,
        compute_utilization: Optional[float] = None,
        status: Optional[NodeStatus] = None,
    ) -> None:
        """
        Update node status metrics

        Only the arguments that are not None are applied. An unknown
        node_id is ignored.

        Args:
            node_id: Node to update
            vram_used_gb: Current VRAM usage
            compute_utilization: Current compute utilization (0.0-1.0)
            status: Node status
        """
        node = self.nodes.get(node_id)
        if node is None:
            return

        if vram_used_gb is not None:
            node.vram_used_gb = vram_used_gb
        if compute_utilization is not None:
            node.compute_utilization = compute_utilization
        if status is not None:
            node.status = status


def get_job_scheduler() -> JobScheduler:
    """
    Factory function to create job scheduler

    Returns:
        A new JobScheduler instance (a fresh one on every call)
    """
    return JobScheduler()