mirror of
https://github.com/mblanke/Dashboard.git
synced 2026-03-01 20:10:20 -05:00
feat: add server stats, GPU stats, container CPU/memory display
- Add /api/servers endpoint querying Prometheus for CPU, RAM, disk, uptime, load - Add /api/gpu endpoint for NVIDIA Jetson GPU utilization, temp, power - Add ServerStatsWidget with animated bars for Atlas, Wile, RoadRunner - Add GPUStatsWidget with GPU util, memory, temp color-coding, power draw - Update ContainerGroup to show CPU bar and memory for running containers - Fix docker-compose.yml: traefik network external: true - Fix getTraefikUrl to scan all router labels (not just 'https')
This commit is contained in:
87
src/app/api/gpu/route.ts
Normal file
87
src/app/api/gpu/route.ts
Normal file
@@ -0,0 +1,87 @@
|
||||
import { NextResponse } from "next/server";
|
||||
|
||||
// Base URL of the Prometheus server, resolved via Docker-network DNS
// (the service name "prometheus" on the shared compose network).
const PROMETHEUS_URL = "http://prometheus:9090";

// Maps node-exporter instance IPs to the human-readable GPU node names
// shown in the dashboard widgets.
// NOTE(review): assumes these IPs are statically assigned — confirm
// against the cluster's network configuration.
const INSTANCE_MAP: Record<string, string> = {
  "192.168.1.50": "Wile",
  "192.168.1.51": "RoadRunner",
};
|
||||
|
||||
async function queryPrometheus(query: string): Promise<any[]> {
|
||||
try {
|
||||
const url = `${PROMETHEUS_URL}/api/v1/query?query=${encodeURIComponent(query)}`;
|
||||
const res = await fetch(url, { cache: "no-store" });
|
||||
if (!res.ok) return [];
|
||||
const json = await res.json();
|
||||
if (json.status === "success" && json.data?.result) {
|
||||
return json.data.result;
|
||||
}
|
||||
return [];
|
||||
} catch {
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
function extractByInstance(results: any[]): Record<string, number> {
|
||||
const map: Record<string, number> = {};
|
||||
for (const r of results) {
|
||||
const instance: string = r.metric?.instance || "";
|
||||
const ip = instance.replace(/:\d+$/, "");
|
||||
const val = parseFloat(r.value?.[1] || "0");
|
||||
if (!isNaN(val)) {
|
||||
map[ip] = val;
|
||||
}
|
||||
}
|
||||
return map;
|
||||
}
|
||||
|
||||
async function queryWithFallback(
|
||||
primaryMetric: string,
|
||||
...fallbacks: string[]
|
||||
): Promise<Record<string, number>> {
|
||||
const primary = await queryPrometheus(primaryMetric);
|
||||
if (primary.length > 0) return extractByInstance(primary);
|
||||
|
||||
for (const fb of fallbacks) {
|
||||
const res = await queryPrometheus(fb);
|
||||
if (res.length > 0) return extractByInstance(res);
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
export async function GET() {
|
||||
try {
|
||||
const [gpuUtilMap, memUtilMap, tempMap, powerMap] = await Promise.all([
|
||||
queryWithFallback(
|
||||
"DCGM_FI_DEV_GPU_UTIL",
|
||||
"nvidia_gpu_utilization_gpu",
|
||||
"gpu_utilization_percentage"
|
||||
),
|
||||
queryWithFallback(
|
||||
"DCGM_FI_DEV_MEM_COPY_UTIL",
|
||||
"nvidia_gpu_memory_used_bytes / nvidia_gpu_memory_total_bytes * 100"
|
||||
),
|
||||
queryWithFallback(
|
||||
"DCGM_FI_DEV_GPU_TEMP",
|
||||
"nvidia_gpu_temperature_gpu"
|
||||
),
|
||||
queryWithFallback(
|
||||
"DCGM_FI_DEV_POWER_USAGE",
|
||||
"nvidia_gpu_power_draw_watts"
|
||||
),
|
||||
]);
|
||||
|
||||
const gpus = Object.entries(INSTANCE_MAP).map(([ip, name]) => ({
|
||||
name,
|
||||
gpu_util: parseFloat((gpuUtilMap[ip] || 0).toFixed(1)),
|
||||
mem_util: parseFloat((memUtilMap[ip] || 0).toFixed(1)),
|
||||
temp: parseFloat((tempMap[ip] || 0).toFixed(0)),
|
||||
power_watts: parseFloat((powerMap[ip] || 0).toFixed(1)),
|
||||
}));
|
||||
|
||||
return NextResponse.json(gpus);
|
||||
} catch (error) {
|
||||
console.error("GPU API error:", error);
|
||||
return NextResponse.json([]);
|
||||
}
|
||||
}
|
||||
93
src/app/api/servers/route.ts
Normal file
93
src/app/api/servers/route.ts
Normal file
@@ -0,0 +1,93 @@
|
||||
import { NextResponse } from "next/server";
|
||||
|
||||
// Base URL of the Prometheus server, resolved via Docker-network DNS
// (the service name "prometheus" on the shared compose network).
const PROMETHEUS_URL = "http://prometheus:9090";

// Maps node-exporter instance IPs to the display name and role shown
// in the ServerStatsWidget.
// NOTE(review): assumes these IPs are statically assigned — confirm
// against the cluster's network configuration.
const INSTANCE_MAP: Record<string, { name: string; role: string }> = {
  "192.168.1.21": { name: "Atlas", role: "Control Node" },
  "192.168.1.50": { name: "Wile", role: "GPU Node - Heavy" },
  "192.168.1.51": { name: "RoadRunner", role: "GPU Node - Fast" },
};
|
||||
|
||||
async function queryPrometheus(query: string): Promise<any[]> {
|
||||
try {
|
||||
const url = `${PROMETHEUS_URL}/api/v1/query?query=${encodeURIComponent(query)}`;
|
||||
const res = await fetch(url, { cache: "no-store" });
|
||||
if (!res.ok) return [];
|
||||
const json = await res.json();
|
||||
if (json.status === "success" && json.data?.result) {
|
||||
return json.data.result;
|
||||
}
|
||||
return [];
|
||||
} catch {
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
function extractByInstance(results: any[]): Record<string, number> {
|
||||
const map: Record<string, number> = {};
|
||||
for (const r of results) {
|
||||
const instance: string = r.metric?.instance || "";
|
||||
const ip = instance.replace(/:\d+$/, "");
|
||||
const val = parseFloat(r.value?.[1] || "0");
|
||||
if (!isNaN(val)) {
|
||||
map[ip] = val;
|
||||
}
|
||||
}
|
||||
return map;
|
||||
}
|
||||
|
||||
export async function GET() {
|
||||
try {
|
||||
const [cpuRes, memPercentRes, memTotalRes, memAvailRes, diskRes, uptimeBootRes, uptimeNowRes, loadRes] =
|
||||
await Promise.all([
|
||||
queryPrometheus(
|
||||
'100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)'
|
||||
),
|
||||
queryPrometheus(
|
||||
"(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100"
|
||||
),
|
||||
queryPrometheus("node_memory_MemTotal_bytes"),
|
||||
queryPrometheus("node_memory_MemAvailable_bytes"),
|
||||
queryPrometheus(
|
||||
'100 - (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100)'
|
||||
),
|
||||
queryPrometheus("node_boot_time_seconds"),
|
||||
queryPrometheus("node_time_seconds"),
|
||||
queryPrometheus("node_load1"),
|
||||
]);
|
||||
|
||||
const cpuMap = extractByInstance(cpuRes);
|
||||
const memPercentMap = extractByInstance(memPercentRes);
|
||||
const memTotalMap = extractByInstance(memTotalRes);
|
||||
const memAvailMap = extractByInstance(memAvailRes);
|
||||
const diskMap = extractByInstance(diskRes);
|
||||
const bootMap = extractByInstance(uptimeBootRes);
|
||||
const nowMap = extractByInstance(uptimeNowRes);
|
||||
const loadMap = extractByInstance(loadRes);
|
||||
|
||||
const servers = Object.entries(INSTANCE_MAP).map(([ip, info]) => {
|
||||
const memTotalBytes = memTotalMap[ip] || 0;
|
||||
const memAvailBytes = memAvailMap[ip] || 0;
|
||||
const memUsedBytes = memTotalBytes - memAvailBytes;
|
||||
const uptimeSeconds = (nowMap[ip] || 0) - (bootMap[ip] || 0);
|
||||
|
||||
return {
|
||||
name: info.name,
|
||||
role: info.role,
|
||||
ip,
|
||||
cpu: parseFloat((cpuMap[ip] || 0).toFixed(1)),
|
||||
memoryPercent: parseFloat((memPercentMap[ip] || 0).toFixed(1)),
|
||||
memoryUsedGB: parseFloat((memUsedBytes / 1073741824).toFixed(1)),
|
||||
memoryTotalGB: parseFloat((memTotalBytes / 1073741824).toFixed(1)),
|
||||
diskPercent: parseFloat((diskMap[ip] || 0).toFixed(1)),
|
||||
uptimeSeconds: Math.floor(uptimeSeconds > 0 ? uptimeSeconds : 0),
|
||||
load1: parseFloat((loadMap[ip] || 0).toFixed(2)),
|
||||
};
|
||||
});
|
||||
|
||||
return NextResponse.json(servers);
|
||||
} catch (error) {
|
||||
console.error("Servers API error:", error);
|
||||
return NextResponse.json([]);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user