- Add distinct 'Local Ollama' and 'Networked Ollama' options in dropdown
- Local uses localhost, Networked uses remote endpoints with load balancing
- Color-coded: Yellow for Local, Blue for Networked
- Icons: Local, Networked
- Backend supports OLLAMA_LOCAL_URL and OLLAMA_NETWORK_URLS env vars
- Updated installer to generate new env var format
- Legacy 'ollama' provider still works for backward compatibility
"""
|
|
LLM Router Service
|
|
Routes requests to different LLM providers (OpenAI, Anthropic, Ollama)
|
|
Supports multiple Ollama endpoints with load balancing
|
|
"""
|
|
from fastapi import FastAPI, HTTPException
|
|
from fastapi.middleware.cors import CORSMiddleware
|
|
from pydantic import BaseModel
|
|
from typing import Optional, Literal
|
|
import httpx
|
|
import os
|
|
import random
|
|
import asyncio
|
|
from dataclasses import dataclass
|
|
from datetime import datetime, timedelta
|
|
|
|
app = FastAPI(
|
|
title="LLM Router",
|
|
description="Routes requests to multiple LLM providers with load balancing",
|
|
version="0.2.0"
|
|
)
|
|
|
|
app.add_middleware(
|
|
CORSMiddleware,
|
|
allow_origins=["*"],
|
|
allow_credentials=True,
|
|
allow_methods=["*"],
|
|
allow_headers=["*"],
|
|
)
|
|
|
|
# Configuration from environment
|
|
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
|
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
|
|
|
|
# Separate local and networked Ollama endpoints
|
|
OLLAMA_LOCAL_URL = os.getenv("OLLAMA_LOCAL_URL", "http://localhost:11434")
|
|
OLLAMA_NETWORK_URLS_STR = os.getenv("OLLAMA_NETWORK_URLS", os.getenv("OLLAMA_ENDPOINTS", os.getenv("OLLAMA_BASE_URL", "")))
|
|
OLLAMA_NETWORK_URLS = [url.strip() for url in OLLAMA_NETWORK_URLS_STR.split(",") if url.strip()]
|
|
|
|
# Legacy support: if only OLLAMA_ENDPOINTS is set, use it for network
|
|
LOAD_BALANCE_STRATEGY = os.getenv("LOAD_BALANCE_STRATEGY", "round-robin") # round-robin, random, failover
|
|
|
|
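
# Example environment configuration (illustrative only; the hostnames below
# are hypothetical placeholders, not part of this repo):
#   OLLAMA_LOCAL_URL=http://localhost:11434
#   OLLAMA_NETWORK_URLS=http://gpu-node-1:11434,http://gpu-node-2:11434
#   LOAD_BALANCE_STRATEGY=round-robin   # or "random" / "failover"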


@dataclass
class EndpointHealth:
    url: str
    healthy: bool = True
    last_check: Optional[datetime] = None
    failure_count: int = 0
    models: Optional[list] = None


# Track endpoint health for both local and network endpoints
all_ollama_endpoints = ([OLLAMA_LOCAL_URL] + OLLAMA_NETWORK_URLS) if OLLAMA_LOCAL_URL else OLLAMA_NETWORK_URLS
endpoint_health: dict[str, EndpointHealth] = {url: EndpointHealth(url=url, models=[]) for url in all_ollama_endpoints}
current_network_endpoint_index = 0


class ChatMessage(BaseModel):
    role: Literal["system", "user", "assistant"]
    content: str


class ChatRequest(BaseModel):
    provider: Literal["openai", "anthropic", "ollama", "ollama-local", "ollama-network"] = "ollama-local"
    model: str = "llama3.2"
    messages: list[ChatMessage]
    temperature: float = 0.7
    max_tokens: int = 2048
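
# An example ChatRequest body (illustrative values):
#   {
#     "provider": "ollama-network",
#     "model": "llama3.2",
#     "messages": [{"role": "user", "content": "Hello"}],
#     "temperature": 0.7,
#     "max_tokens": 2048
#   }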


class ChatResponse(BaseModel):
    provider: str
    model: str
    content: str
    usage: Optional[dict] = None


@app.get("/health")
async def health_check():
    """Health check endpoint"""
    return {
        "status": "healthy",
        "service": "llm-router",
        "local_endpoint": OLLAMA_LOCAL_URL,
        "network_endpoints": len(OLLAMA_NETWORK_URLS)
    }


async def check_endpoint_health(url: str) -> tuple[bool, list]:
    """Check if an Ollama endpoint is healthy and get its models"""
    try:
        async with httpx.AsyncClient() as client:
            response = await client.get(f"{url}/api/tags", timeout=5.0)
            if response.status_code == 200:
                data = response.json()
                models = [m["name"] for m in data.get("models", [])]
                return True, models
    except Exception:
        pass
    return False, []
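
# Note: /api/tags is Ollama's model-listing endpoint; it returns JSON shaped
# roughly like {"models": [{"name": "llama3.2:latest", ...}, ...]}, and only
# the "name" field is used above.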


async def get_healthy_endpoint(endpoints: list[str]) -> Optional[str]:
    """Get a healthy Ollama endpoint from the given list based on load balancing strategy"""
    global current_network_endpoint_index

    if not endpoints:
        return None

    # Refresh health status for stale checks (older than 30 seconds)
    now = datetime.now()
    for url in endpoints:
        if url not in endpoint_health:
            endpoint_health[url] = EndpointHealth(url=url, models=[])
        health = endpoint_health[url]
        if health.last_check is None or (now - health.last_check) > timedelta(seconds=30):
            is_healthy, models = await check_endpoint_health(url)
            health.healthy = is_healthy
            health.models = models
            health.last_check = now
            if is_healthy:
                health.failure_count = 0

    healthy_endpoints = [url for url in endpoints if endpoint_health.get(url, EndpointHealth(url=url)).healthy]

    if not healthy_endpoints:
        return None

    if LOAD_BALANCE_STRATEGY == "random":
        return random.choice(healthy_endpoints)
    elif LOAD_BALANCE_STRATEGY == "failover":
        # Always use the first available healthy endpoint
        return healthy_endpoints[0]
    else:  # round-robin (default)
        # Find the next healthy endpoint in rotation
        for _ in range(len(endpoints)):
            current_network_endpoint_index = (current_network_endpoint_index + 1) % len(endpoints)
            url = endpoints[current_network_endpoint_index]
            if url in healthy_endpoints:
                return url
        return healthy_endpoints[0]
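
# Example: with endpoints [A, B, C] all healthy under round-robin, successive
# calls yield B, C, A, B, ... since the shared index advances before each pick.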
@app.get("/providers")
|
|
async def list_providers():
|
|
"""List available LLM providers and their status"""
|
|
providers = {
|
|
"openai": {"available": bool(OPENAI_API_KEY), "models": ["gpt-4o", "gpt-4o-mini", "gpt-4-turbo"]},
|
|
"anthropic": {"available": bool(ANTHROPIC_API_KEY), "models": ["claude-sonnet-4-20250514", "claude-3-5-haiku-20241022"]},
|
|
}
|
|
|
|
# Check local Ollama endpoint
|
|
if OLLAMA_LOCAL_URL:
|
|
is_healthy, models = await check_endpoint_health(OLLAMA_LOCAL_URL)
|
|
endpoint_health[OLLAMA_LOCAL_URL] = EndpointHealth(
|
|
url=OLLAMA_LOCAL_URL,
|
|
healthy=is_healthy,
|
|
models=models,
|
|
last_check=datetime.now()
|
|
)
|
|
providers["ollama-local"] = {
|
|
"available": is_healthy,
|
|
"endpoint": OLLAMA_LOCAL_URL,
|
|
"models": models if models else ["llama3", "mistral", "codellama"]
|
|
}
|
|
else:
|
|
providers["ollama-local"] = {"available": False, "models": []}
|
|
|
|
# Check networked Ollama endpoints
|
|
network_info = []
|
|
network_models = set()
|
|
any_network_available = False
|
|
|
|
for url in OLLAMA_NETWORK_URLS:
|
|
is_healthy, models = await check_endpoint_health(url)
|
|
endpoint_health[url] = EndpointHealth(
|
|
url=url,
|
|
healthy=is_healthy,
|
|
models=models,
|
|
last_check=datetime.now()
|
|
)
|
|
network_info.append({
|
|
"url": url,
|
|
"available": is_healthy,
|
|
"models": models
|
|
})
|
|
if is_healthy:
|
|
any_network_available = True
|
|
network_models.update(models)
|
|
|
|
providers["ollama-network"] = {
|
|
"available": any_network_available,
|
|
"endpoints": network_info,
|
|
"load_balance_strategy": LOAD_BALANCE_STRATEGY,
|
|
"models": list(network_models) if network_models else ["llama3", "mistral", "codellama"]
|
|
}
|
|
|
|
# Legacy: also provide combined "ollama" for backward compatibility
|
|
all_ollama_models = set()
|
|
if providers["ollama-local"]["available"]:
|
|
all_ollama_models.update(providers["ollama-local"]["models"])
|
|
if providers["ollama-network"]["available"]:
|
|
all_ollama_models.update(providers["ollama-network"]["models"])
|
|
|
|
providers["ollama"] = {
|
|
"available": providers["ollama-local"]["available"] or providers["ollama-network"]["available"],
|
|
"models": list(all_ollama_models) if all_ollama_models else ["llama3", "mistral", "codellama"]
|
|
}
|
|
|
|
return providers
|
|
|
|
|
|
@app.get("/endpoints")
|
|
async def list_endpoints():
|
|
"""List all Ollama endpoints with detailed status"""
|
|
results = {
|
|
"local": None,
|
|
"network": []
|
|
}
|
|
|
|
# Local endpoint
|
|
if OLLAMA_LOCAL_URL:
|
|
is_healthy, models = await check_endpoint_health(OLLAMA_LOCAL_URL)
|
|
results["local"] = {
|
|
"url": OLLAMA_LOCAL_URL,
|
|
"healthy": is_healthy,
|
|
"models": models,
|
|
"failure_count": endpoint_health.get(OLLAMA_LOCAL_URL, EndpointHealth(url=OLLAMA_LOCAL_URL)).failure_count
|
|
}
|
|
|
|
# Network endpoints
|
|
for url in OLLAMA_NETWORK_URLS:
|
|
is_healthy, models = await check_endpoint_health(url)
|
|
results["network"].append({
|
|
"url": url,
|
|
"healthy": is_healthy,
|
|
"models": models,
|
|
"failure_count": endpoint_health.get(url, EndpointHealth(url=url)).failure_count
|
|
})
|
|
|
|
return {
|
|
"strategy": LOAD_BALANCE_STRATEGY,
|
|
"endpoints": results,
|
|
"network_healthy_count": sum(1 for r in results["network"] if r["healthy"]),
|
|
"network_total_count": len(results["network"])
|
|
}
|
|
|
|
|
|
@app.post("/chat", response_model=ChatResponse)
|
|
async def chat(request: ChatRequest):
|
|
"""Route chat request to specified LLM provider"""
|
|
|
|
if request.provider == "openai":
|
|
return await _call_openai(request)
|
|
elif request.provider == "anthropic":
|
|
return await _call_anthropic(request)
|
|
elif request.provider == "ollama-local":
|
|
return await _call_ollama_local(request)
|
|
elif request.provider == "ollama-network":
|
|
return await _call_ollama_network(request)
|
|
elif request.provider == "ollama":
|
|
# Legacy: try local first, then network
|
|
if OLLAMA_LOCAL_URL:
|
|
try:
|
|
return await _call_ollama_local(request)
|
|
except HTTPException:
|
|
if OLLAMA_NETWORK_URLS:
|
|
return await _call_ollama_network(request)
|
|
raise
|
|
elif OLLAMA_NETWORK_URLS:
|
|
return await _call_ollama_network(request)
|
|
else:
|
|
raise HTTPException(status_code=503, detail="No Ollama endpoints configured")
|
|
else:
|
|
raise HTTPException(status_code=400, detail=f"Unknown provider: {request.provider}")
|
|
|
|
|
|


async def _call_openai(request: ChatRequest) -> ChatResponse:
    """Call the OpenAI API"""
    if not OPENAI_API_KEY:
        raise HTTPException(status_code=503, detail="OpenAI API key not configured")

    async with httpx.AsyncClient() as client:
        response = await client.post(
            "https://api.openai.com/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {OPENAI_API_KEY}",
                "Content-Type": "application/json"
            },
            json={
                "model": request.model,
                "messages": [m.model_dump() for m in request.messages],
                "temperature": request.temperature,
                "max_tokens": request.max_tokens
            },
            timeout=60.0
        )

    if response.status_code != 200:
        raise HTTPException(status_code=response.status_code, detail=response.text)

    data = response.json()
    return ChatResponse(
        provider="openai",
        model=request.model,
        content=data["choices"][0]["message"]["content"],
        usage=data.get("usage")
    )


async def _call_anthropic(request: ChatRequest) -> ChatResponse:
    """Call the Anthropic API"""
    if not ANTHROPIC_API_KEY:
        raise HTTPException(status_code=503, detail="Anthropic API key not configured")

    # Extract the system message if present (Anthropic takes it as a top-level field)
    system_msg = ""
    messages = []
    for msg in request.messages:
        if msg.role == "system":
            system_msg = msg.content
        else:
            messages.append({"role": msg.role, "content": msg.content})

    async with httpx.AsyncClient() as client:
        payload = {
            "model": request.model,
            "messages": messages,
            "max_tokens": request.max_tokens,
            "temperature": request.temperature
        }
        if system_msg:
            payload["system"] = system_msg

        response = await client.post(
            "https://api.anthropic.com/v1/messages",
            headers={
                "x-api-key": ANTHROPIC_API_KEY,
                "Content-Type": "application/json",
                "anthropic-version": "2023-06-01"
            },
            json=payload,
            timeout=60.0
        )

    if response.status_code != 200:
        raise HTTPException(status_code=response.status_code, detail=response.text)

    data = response.json()
    return ChatResponse(
        provider="anthropic",
        model=request.model,
        content=data["content"][0]["text"],
        usage=data.get("usage")
    )


async def _call_ollama_endpoint(request: ChatRequest, endpoint: str, provider_label: str) -> ChatResponse:
    """Call a specific Ollama endpoint"""
    async with httpx.AsyncClient() as client:
        try:
            response = await client.post(
                f"{endpoint}/api/chat",
                json={
                    "model": request.model,
                    "messages": [m.model_dump() for m in request.messages],
                    "stream": False,
                    "options": {
                        "temperature": request.temperature,
                        "num_predict": request.max_tokens
                    }
                },
                timeout=120.0
            )

            if response.status_code != 200:
                # Mark the endpoint as failed after repeated errors
                if endpoint in endpoint_health:
                    endpoint_health[endpoint].failure_count += 1
                    if endpoint_health[endpoint].failure_count >= 3:
                        endpoint_health[endpoint].healthy = False
                raise HTTPException(status_code=response.status_code, detail=response.text)

            # Reset the failure count on success
            if endpoint in endpoint_health:
                endpoint_health[endpoint].failure_count = 0

            data = response.json()
            return ChatResponse(
                provider=provider_label,
                model=request.model,
                content=data["message"]["content"],
                usage={
                    "prompt_tokens": data.get("prompt_eval_count", 0),
                    "completion_tokens": data.get("eval_count", 0),
                    "endpoint": endpoint
                }
            )
        except (httpx.ConnectError, httpx.TimeoutException):
            # Mark the endpoint as unhealthy on connection failure or timeout
            if endpoint in endpoint_health:
                endpoint_health[endpoint].healthy = False
                endpoint_health[endpoint].failure_count += 1
            raise HTTPException(status_code=503, detail=f"Ollama endpoint unavailable: {endpoint}")


async def _call_ollama_local(request: ChatRequest) -> ChatResponse:
    """Call the local Ollama instance"""
    if not OLLAMA_LOCAL_URL:
        raise HTTPException(status_code=503, detail="Local Ollama not configured")
    return await _call_ollama_endpoint(request, OLLAMA_LOCAL_URL, "ollama-local")


async def _call_ollama_network(request: ChatRequest) -> ChatResponse:
    """Call networked Ollama with load balancing across endpoints"""
    if not OLLAMA_NETWORK_URLS:
        raise HTTPException(status_code=503, detail="No networked Ollama endpoints configured")

    endpoint = await get_healthy_endpoint(OLLAMA_NETWORK_URLS)

    if not endpoint:
        raise HTTPException(status_code=503, detail="No healthy networked Ollama endpoints available")

    try:
        return await _call_ollama_endpoint(request, endpoint, "ollama-network")
    except HTTPException:
        # Retry once on a different endpoint if one is available
        other_endpoint = await get_healthy_endpoint(OLLAMA_NETWORK_URLS)
        if other_endpoint and other_endpoint != endpoint:
            return await _call_ollama_endpoint(request, other_endpoint, "ollama-network")
        raise


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
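
# Example usage once the service is running (illustrative; adjust host/port
# to your deployment):
#   curl -X POST http://localhost:8000/chat \
#     -H "Content-Type: application/json" \
#     -d '{"provider": "ollama-local", "model": "llama3.2",
#          "messages": [{"role": "user", "content": "Hello"}]}'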