921 lines
34 KiB
Python
921 lines
34 KiB
Python
"""
|
|
Worker monitoring dashboard.

Monitors worker status, GPU usage, and session pool state in real time.
Supports both Jetson Xavier and x86 systems.
|
|
"""
|
|
import asyncio
import json
import logging
import os
import time
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional

import psutil
import uvicorn
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles

from ..core.config import settings
from ..core.session_pool import session_pool
from ..core.worker_manager import worker_manager
from ..utils.gpu_monitor import gpu_monitor
|
|
|
|
# Module-level logger for the monitoring dashboard.
logger = logging.getLogger(__name__)

# FastAPI application that serves the dashboard page, the JSON endpoints
# and the WebSocket feed.
monitor_app = FastAPI(
    version="1.0.0",
    title="인페인팅 서버 모니터링 대시보드",
    description="실시간 서버 상태 모니터링 (Jetson Xavier & x86 지원)",
)

# WebSocket clients that are currently connected and receive broadcasts.
connected_clients: List[WebSocket] = []
|
|
|
|
|
|
class MonitoringData:
    """Collect, retain and summarize monitoring snapshots for the dashboard.

    Attributes:
        history: Most recent snapshots produced by ``collect_data`` (oldest first).
        max_history: Maximum number of snapshots kept in ``history``.
        api_stats: Running API counters updated through ``update_api_stats``.
        alerts: Reserved list; not written by any method of this class.
    """

    def __init__(self):
        self.history: List[Dict[str, Any]] = []
        self.max_history = 100  # keep at most 100 data points
        self.api_stats = {
            "total_requests": 0,
            "successful_requests": 0,
            "failed_requests": 0,
            "endpoint_usage": {},
            "response_times": [],
            "errors": []
        }
        self.alerts = []

    async def collect_data(self) -> Dict[str, Any]:
        """Collect a snapshot of the current system state and store it in history.

        Returns:
            A dict with the keys ``timestamp``, ``system_type``, ``gpu``,
            ``system_memory``, ``system_performance``, ``workers``,
            ``sessions``, ``jetson``, ``api_stats`` and ``alerts``.
        """
        timestamp = datetime.now().isoformat()

        # GPU information
        gpu_info = gpu_monitor.get_gpu_memory_info()
        gpu_utilization = gpu_monitor.get_gpu_utilization()

        # System memory information
        system_memory = gpu_monitor.get_system_memory_info()

        # System performance metrics. The psutil CPU sampling blocks for about
        # one second, so it runs in the default executor instead of stalling
        # the event loop (and with it every HTTP request and WebSocket).
        loop = asyncio.get_running_loop()
        system_performance = await loop.run_in_executor(
            None, self._get_system_performance
        )

        # Worker manager status
        worker_status = worker_manager.get_status()

        # Session pool status
        session_status = await session_pool.get_pool_status()

        # Jetson-only information
        jetson_info = {}
        if settings.IS_JETSON:
            jetson_info = gpu_monitor.get_jetson_specific_info()

        # API statistics
        api_stats = self._get_api_statistics()

        # Alerts and warnings
        alerts = self._check_alerts(gpu_info, system_memory, worker_status)

        data = {
            "timestamp": timestamp,
            "system_type": "Jetson Xavier" if settings.IS_JETSON else "x86_64",
            "gpu": {
                **gpu_info,
                "utilization": gpu_utilization
            },
            "system_memory": system_memory,
            "system_performance": system_performance,
            "workers": worker_status,
            "sessions": session_status,
            "jetson": jetson_info,
            "api_stats": api_stats,
            "alerts": alerts
        }

        # Append to the history and drop the oldest entries beyond the cap.
        self.history.append(data)
        excess = len(self.history) - self.max_history
        if excess > 0:
            del self.history[:excess]

        return data

    def _get_system_performance(self) -> Dict[str, Any]:
        """Collect CPU, disk, network and process metrics via psutil.

        Blocks for roughly one second because of ``cpu_percent(interval=1)``;
        call it from a worker thread when running inside an event loop.

        Returns:
            A dict with ``cpu``, ``disk``, ``network`` and ``processes`` keys,
            or an empty dict when collection fails (the error is logged).
        """
        try:
            # CPU usage
            cpu_percent = psutil.cpu_percent(interval=1)
            cpu_count = psutil.cpu_count()
            cpu_freq = psutil.cpu_freq()

            # Disk I/O (cumulative counters)
            disk_io = psutil.disk_io_counters()

            # Network I/O (cumulative counters)
            net_io = psutil.net_io_counters()

            # Number of processes
            processes = len(psutil.pids())

            # Load average; os.getloadavg is not available on every platform.
            load_avg = os.getloadavg() if hasattr(os, 'getloadavg') else [0, 0, 0]

            return {
                "cpu": {
                    "usage_percent": cpu_percent,
                    "count": cpu_count,
                    "frequency_mhz": cpu_freq.current if cpu_freq else 0,
                    "load_average": {
                        "1min": load_avg[0],
                        "5min": load_avg[1],
                        "15min": load_avg[2]
                    }
                },
                "disk": {
                    "read_bytes": disk_io.read_bytes if disk_io else 0,
                    "write_bytes": disk_io.write_bytes if disk_io else 0,
                    "read_count": disk_io.read_count if disk_io else 0,
                    "write_count": disk_io.write_count if disk_io else 0
                },
                "network": {
                    "bytes_sent": net_io.bytes_sent if net_io else 0,
                    "bytes_recv": net_io.bytes_recv if net_io else 0,
                    "packets_sent": net_io.packets_sent if net_io else 0,
                    "packets_recv": net_io.packets_recv if net_io else 0
                },
                "processes": processes
            }
        except Exception as e:
            # Best effort: the dashboard keeps working without these metrics.
            logger.error("시스템 성능 정보 수집 실패: %s", e)
            return {}

    def _get_api_statistics(self) -> Dict[str, Any]:
        """Return a summary of the API counters held in ``api_stats``.

        The counters are only as fresh as the calls made to
        ``update_api_stats``; nothing in this class records requests itself.
        """
        stats = self.api_stats
        response_times = stats["response_times"]
        return {
            "total_requests": stats["total_requests"],
            "successful_requests": stats["successful_requests"],
            "failed_requests": stats["failed_requests"],
            # max(..., 1) avoids a division by zero before the first request.
            "success_rate": (
                (stats["successful_requests"] / max(stats["total_requests"], 1)) * 100
            ),
            "endpoint_usage": stats["endpoint_usage"],
            "average_response_time": (
                sum(response_times) / len(response_times) if response_times else 0
            ),
            "recent_errors": stats["errors"][-5:]  # five most recent errors
        }

    def _check_alerts(self, gpu_info: Dict, system_memory: Dict, worker_status: Dict) -> List[Dict]:
        """Inspect the given status dicts and build the list of active alerts.

        Args:
            gpu_info: GPU memory info; ``usage_percent`` is read.
            system_memory: System memory info; ``usage_percent`` is read.
            worker_status: Worker manager status; ``active_workers`` is read.

        Returns:
            A list of alert dicts with ``level``, ``message``, ``timestamp``
            and ``category`` keys. Missing or ``None`` metrics count as 0.
        """
        alerts = []
        timestamp = datetime.now().isoformat()

        def add_alert(level: str, message: str, category: str) -> None:
            alerts.append({
                "level": level,
                "message": message,
                "timestamp": timestamp,
                "category": category
            })

        # GPU memory: critical above 90 %, warning above 80 %.
        gpu_usage = gpu_info.get("usage_percent", 0) or 0
        if gpu_usage > 90:
            add_alert("critical", f"GPU 메모리 사용률이 높습니다: {gpu_usage:.1f}%", "gpu")
        elif gpu_usage > 80:
            add_alert("warning", f"GPU 메모리 사용률이 높습니다: {gpu_usage:.1f}%", "gpu")

        # System memory: critical above 90 %.
        memory_usage = system_memory.get("usage_percent", 0) or 0
        if memory_usage > 90:
            add_alert("critical", f"시스템 메모리 사용률이 높습니다: {memory_usage:.1f}%", "memory")

        # Workers: critical when no worker is active.
        if worker_status.get("active_workers", 0) == 0:
            add_alert("critical", "활성 워커가 없습니다", "workers")

        # TODO: Jetson-specific temperature alerts are not implemented yet.

        return alerts

    def update_api_stats(self, endpoint: str, success: bool, response_time: float, error: Optional[str] = None):
        """Record the outcome of one API request.

        Args:
            endpoint: Endpoint identifier used as the key in ``endpoint_usage``.
            success: Whether the request succeeded.
            response_time: Response time of the request (unit chosen by the caller).
            error: Optional error text; stored only for failed requests.
        """
        stats = self.api_stats
        stats["total_requests"] += 1

        if success:
            stats["successful_requests"] += 1
        else:
            stats["failed_requests"] += 1
            if error:
                stats["errors"].append({
                    "timestamp": datetime.now().isoformat(),
                    "endpoint": endpoint,
                    "error": error
                })

        # Per-endpoint usage counter
        usage = stats["endpoint_usage"]
        usage[endpoint] = usage.get(endpoint, 0) + 1

        # Response times: keep only the latest 100 samples.
        stats["response_times"].append(response_time)
        if len(stats["response_times"]) > 100:
            stats["response_times"].pop(0)

        # Error log: keep only the latest 50 entries.
        if len(stats["errors"]) > 50:
            stats["errors"] = stats["errors"][-50:]

    def get_history(self) -> List[Dict[str, Any]]:
        """Return the stored snapshot history (the internal list, not a copy)."""
        return self.history

    @staticmethod
    def _metric(sample: Dict[str, Any], section: str, key: str) -> float:
        """Return ``sample[section][key]`` as a number, or 0 if missing or non-numeric."""
        value = (sample.get(section) or {}).get(key)
        return value if isinstance(value, (int, float)) else 0

    def get_statistics(self) -> Dict[str, Any]:
        """Return averages over the ten most recent snapshots.

        Returns:
            An empty dict when no history exists; otherwise a dict with
            ``gpu_usage_avg``, ``gpu_util_avg``, ``system_memory_avg``,
            ``worker_avg`` (rounded to two decimals) and ``data_points``.
            Metrics missing from a snapshot are counted as 0 instead of
            raising, so one incomplete sample cannot break the endpoint.
        """
        if not self.history:
            return {}

        recent_data = self.history[-10:]  # last ten data points
        count = len(recent_data)

        def average(section: str, key: str) -> float:
            return sum(self._metric(d, section, key) for d in recent_data) / count

        return {
            "gpu_usage_avg": round(average("gpu", "usage_percent"), 2),
            "gpu_util_avg": round(average("gpu", "utilization"), 2),
            "system_memory_avg": round(average("system_memory", "usage_percent"), 2),
            "worker_avg": round(average("workers", "active_workers"), 2),
            "data_points": count
        }
|
|
|
|
# Process-wide MonitoringData instance shared by the HTTP endpoints and
# the WebSocket broadcaster.
monitoring_data = MonitoringData()
|
|
|
|
|
|
# HTML page served at "/". The embedded script opens a WebSocket to "/ws"
# and renders every JSON snapshot it receives into the cards and charts.
# Rendering notes for the script:
#   * numbers go through formatNumber(), which prints '-' for missing values
#     instead of "undefined%";
#   * counts use `??` so a legitimate 0 is shown rather than '-';
#   * load averages are read from the `load_average` object's "1min"/"5min"
#     keys (calling toFixed on the object itself raised a TypeError that
#     aborted every dashboard update);
#   * server-provided text is inserted with textContent, never innerHTML;
#   * the WebSocket scheme follows the page scheme (wss on https);
#   * disk figures are cumulative counters, so they are labelled MB, not MB/s.
HTML_TEMPLATE = """
<!DOCTYPE html>
<html lang="ko">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>인페인팅 서버 모니터링</title>
    <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
    <style>
        body {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            margin: 0;
            padding: 20px;
            background-color: #f5f5f5;
        }
        .container {
            max-width: 1600px;
            margin: 0 auto;
        }
        .header {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 20px;
            border-radius: 10px;
            margin-bottom: 20px;
            text-align: center;
        }
        .header h1 {
            margin: 0;
            font-size: 2.5em;
        }
        .header p {
            margin: 10px 0 0 0;
            opacity: 0.9;
        }
        .grid {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
            gap: 20px;
            margin-bottom: 20px;
        }
        .card {
            background: white;
            border-radius: 10px;
            padding: 20px;
            box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
            transition: transform 0.2s;
        }
        .card:hover {
            transform: translateY(-2px);
        }
        .card h3 {
            margin-top: 0;
            color: #333;
            border-bottom: 2px solid #667eea;
            padding-bottom: 10px;
        }
        .metric {
            display: flex;
            justify-content: space-between;
            align-items: center;
            margin: 15px 0;
            padding: 10px;
            background: #f8f9fa;
            border-radius: 5px;
        }
        .metric-label {
            font-weight: 500;
            color: #555;
        }
        .metric-value {
            font-weight: bold;
            font-size: 1.1em;
            color: #667eea;
        }
        .chart-container {
            background: white;
            border-radius: 10px;
            padding: 20px;
            box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
            margin-bottom: 20px;
        }
        .chart-container h3 {
            margin-top: 0;
            color: #333;
            border-bottom: 2px solid #667eea;
            padding-bottom: 10px;
        }
        .alerts {
            background: white;
            border-radius: 10px;
            padding: 20px;
            box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
            margin-bottom: 20px;
        }
        .alert {
            padding: 10px;
            margin: 10px 0;
            border-radius: 5px;
            border-left: 4px solid;
        }
        .alert.critical {
            background: #ffe6e6;
            border-left-color: #dc3545;
            color: #721c24;
        }
        .alert.warning {
            background: #fff3cd;
            border-left-color: #ffc107;
            color: #856404;
        }
        .alert.info {
            background: #d1ecf1;
            border-left-color: #17a2b8;
            color: #0c5460;
        }
        .endpoint-usage {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
            gap: 10px;
            margin-top: 15px;
        }
        .endpoint-item {
            background: #f8f9fa;
            padding: 10px;
            border-radius: 5px;
            text-align: center;
        }
        .endpoint-name {
            font-weight: bold;
            color: #333;
        }
        .endpoint-count {
            color: #667eea;
            font-size: 1.2em;
        }
        .system-info {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
            gap: 15px;
        }
        .status-indicator {
            display: inline-block;
            width: 12px;
            height: 12px;
            border-radius: 50%;
            margin-right: 8px;
        }
        .status-online { background: #28a745; }
        .status-offline { background: #dc3545; }
        .status-warning { background: #ffc107; }
        .refresh-time {
            text-align: center;
            color: #666;
            font-size: 0.9em;
            margin-top: 20px;
        }
    </style>
</head>
<body>
    <div class="container">
        <div class="header">
            <h1>🚀 인페인팅 서버 모니터링</h1>
            <p>실시간 서버 상태 및 성능 모니터링 대시보드</p>
        </div>

        <!-- 시스템 개요 -->
        <div class="grid">
            <div class="card">
                <h3>🖥️ 시스템 정보</h3>
                <div class="metric">
                    <span class="metric-label">시스템 타입:</span>
                    <span class="metric-value" id="system-type">-</span>
                </div>
                <div class="metric">
                    <span class="metric-label">CPU 사용률:</span>
                    <span class="metric-value" id="cpu-usage">-</span>
                </div>
                <div class="metric">
                    <span class="metric-label">시스템 메모리:</span>
                    <span class="metric-value" id="system-memory">-</span>
                </div>
                <div class="metric">
                    <span class="metric-label">프로세스 수:</span>
                    <span class="metric-value" id="process-count">-</span>
                </div>
            </div>

            <div class="card">
                <h3>🎮 GPU 상태</h3>
                <div class="metric">
                    <span class="metric-label">GPU 메모리:</span>
                    <span class="metric-value" id="gpu-memory">-</span>
                </div>
                <div class="metric">
                    <span class="metric-label">GPU 사용률:</span>
                    <span class="metric-value" id="gpu-util">-</span>
                </div>
                <div class="metric">
                    <span class="metric-label">GPU 온도:</span>
                    <span class="metric-value" id="gpu-temp">-</span>
                </div>
                <div class="metric">
                    <span class="metric-label">GPU 클럭:</span>
                    <span class="metric-value" id="gpu-clock">-</span>
                </div>
            </div>

            <div class="card">
                <h3>⚙️ 워커 상태</h3>
                <div class="metric">
                    <span class="metric-label">활성 워커:</span>
                    <span class="metric-value" id="worker-count">-</span>
                </div>
                <div class="metric">
                    <span class="metric-label">세션 풀:</span>
                    <span class="metric-value" id="session-pool">-</span>
                </div>
                <div class="metric">
                    <span class="metric-label">대기열:</span>
                    <span class="metric-value" id="queue-size">-</span>
                </div>
                <div class="metric">
                    <span class="metric-label">상태:</span>
                    <span class="metric-value" id="worker-status">-</span>
                </div>
            </div>

            <div class="card">
                <h3>📊 API 통계</h3>
                <div class="metric">
                    <span class="metric-label">총 요청:</span>
                    <span class="metric-value" id="total-requests">-</span>
                </div>
                <div class="metric">
                    <span class="metric-label">성공률:</span>
                    <span class="metric-value" id="success-rate">-</span>
                </div>
                <div class="metric">
                    <span class="metric-label">평균 응답시간:</span>
                    <span class="metric-value" id="avg-response-time">-</span>
                </div>
                <div class="metric">
                    <span class="metric-label">에러 수:</span>
                    <span class="metric-value" id="error-count">-</span>
                </div>
            </div>
        </div>

        <!-- 시스템 성능 상세 -->
        <div class="card">
            <h3>🔍 시스템 성능 상세</h3>
            <div class="system-info">
                <div>
                    <h4>CPU 정보</h4>
                    <div class="metric">
                        <span class="metric-label">코어 수:</span>
                        <span class="metric-value" id="cpu-count">-</span>
                    </div>
                    <div class="metric">
                        <span class="metric-label">클럭 속도:</span>
                        <span class="metric-value" id="cpu-freq">-</span>
                    </div>
                    <div class="metric">
                        <span class="metric-label">부하 평균 (1분):</span>
                        <span class="metric-value" id="load-1min">-</span>
                    </div>
                    <div class="metric">
                        <span class="metric-label">부하 평균 (5분):</span>
                        <span class="metric-value" id="load-5min">-</span>
                    </div>
                </div>

                <div>
                    <h4>디스크 I/O</h4>
                    <div class="metric">
                        <span class="metric-label">읽기 (MB):</span>
                        <span class="metric-value" id="disk-read">-</span>
                    </div>
                    <div class="metric">
                        <span class="metric-label">쓰기 (MB):</span>
                        <span class="metric-value" id="disk-write">-</span>
                    </div>
                    <div class="metric">
                        <span class="metric-label">읽기 횟수:</span>
                        <span class="metric-value" id="disk-read-count">-</span>
                    </div>
                    <div class="metric">
                        <span class="metric-label">쓰기 횟수:</span>
                        <span class="metric-value" id="disk-write-count">-</span>
                    </div>
                </div>

                <div>
                    <h4>네트워크 I/O</h4>
                    <div class="metric">
                        <span class="metric-label">송신 (MB):</span>
                        <span class="metric-value" id="net-sent">-</span>
                    </div>
                    <div class="metric">
                        <span class="metric-label">수신 (MB):</span>
                        <span class="metric-value" id="net-recv">-</span>
                    </div>
                    <div class="metric">
                        <span class="metric-label">송신 패킷:</span>
                        <span class="metric-value" id="net-sent-pkts">-</span>
                    </div>
                    <div class="metric">
                        <span class="metric-label">수신 패킷:</span>
                        <span class="metric-value" id="net-recv-pkts">-</span>
                    </div>
                </div>
            </div>
        </div>

        <!-- 엔드포인트 사용량 -->
        <div class="card">
            <h3>🌐 엔드포인트 사용량</h3>
            <div class="endpoint-usage" id="endpoint-usage">
                <div class="endpoint-item">
                    <div class="endpoint-name">로딩 중...</div>
                    <div class="endpoint-count">-</div>
                </div>
            </div>
        </div>

        <!-- 알림 및 경고 -->
        <div class="alerts">
            <h3>⚠️ 알림 및 경고</h3>
            <div id="alerts-container">
                <div class="alert info">
                    <strong>정보:</strong> 모니터링 데이터를 수집 중입니다...
                </div>
            </div>
        </div>

        <!-- 차트 -->
        <div class="chart-container">
            <h3>📈 실시간 성능 차트</h3>
            <canvas id="performanceChart" width="400" height="200"></canvas>
        </div>

        <div class="chart-container">
            <h3>🎯 GPU 메모리 사용량</h3>
            <canvas id="gpuChart" width="400" height="200"></canvas>
        </div>

        <div class="refresh-time">
            마지막 업데이트: <span id="last-update">-</span>
        </div>
    </div>

    <script>
        // 차트 초기화
        const performanceCtx = document.getElementById('performanceChart').getContext('2d');
        const gpuCtx = document.getElementById('gpuChart').getContext('2d');

        const performanceChart = new Chart(performanceCtx, {
            type: 'line',
            data: {
                labels: [],
                datasets: [{
                    label: 'GPU 사용률 (%)',
                    data: [],
                    borderColor: 'rgb(75, 192, 192)',
                    tension: 0.1
                }, {
                    label: '시스템 메모리 (%)',
                    data: [],
                    borderColor: 'rgb(255, 99, 132)',
                    tension: 0.1
                }]
            },
            options: {
                responsive: true,
                scales: {
                    y: {
                        beginAtZero: true,
                        max: 100
                    }
                }
            }
        });

        const gpuChart = new Chart(gpuCtx, {
            type: 'line',
            data: {
                labels: [],
                datasets: [{
                    label: 'GPU 메모리 사용률 (%)',
                    data: [],
                    borderColor: 'rgb(54, 162, 235)',
                    backgroundColor: 'rgba(54, 162, 235, 0.1)',
                    tension: 0.1
                }]
            },
            options: {
                responsive: true,
                scales: {
                    y: {
                        beginAtZero: true,
                        max: 100
                    }
                }
            }
        });

        // WebSocket 연결 (use wss when the page itself is served over https)
        const wsScheme = window.location.protocol === 'https:' ? 'wss' : 'ws';
        const ws = new WebSocket(`${wsScheme}://${window.location.host}/ws`);

        ws.onmessage = function(event) {
            const data = JSON.parse(event.data);
            updateDashboard(data);
        };

        ws.onclose = function() {
            console.log('WebSocket 연결이 종료되었습니다.');
            setTimeout(() => {
                location.reload();
            }, 5000);
        };

        // Format a number with fixed decimals; '-' when the value is missing.
        function formatNumber(value, digits, suffix = '') {
            if (typeof value !== 'number' || !isFinite(value)) {
                return '-';
            }
            return value.toFixed(digits) + suffix;
        }

        // Write a value into an element; '-' only for null/undefined (0 is shown).
        function setText(id, value) {
            document.getElementById(id).textContent = value ?? '-';
        }

        function updateDashboard(data) {
            const perf = data.system_performance || {};

            // 기본 메트릭 업데이트
            setText('system-type', data.system_type);
            setText('cpu-usage', formatNumber(perf.cpu?.usage_percent, 1, '%'));
            setText('system-memory', formatNumber(data.system_memory?.usage_percent, 1, '%'));
            setText('process-count', perf.processes);

            setText('gpu-memory', formatNumber(data.gpu?.usage_percent, 1, '%'));
            setText('gpu-util', formatNumber(data.gpu?.utilization, 1, '%'));
            setText('gpu-temp', formatNumber(data.jetson?.temperature?.gpu, 1, '°C'));
            setText('gpu-clock', formatNumber(data.jetson?.gpu_frequency, 0, 'MHz'));

            setText('worker-count', data.workers?.active_workers);
            setText('session-pool', data.sessions?.total_sessions);
            setText('queue-size', data.workers?.queue_size);
            setText('worker-status', data.workers?.status);

            // API 통계 업데이트
            setText('total-requests', data.api_stats?.total_requests);
            document.getElementById('success-rate').textContent = (data.api_stats?.success_rate?.toFixed(1) || '0') + '%';
            document.getElementById('avg-response-time').textContent = (data.api_stats?.average_response_time?.toFixed(2) || '0') + 'ms';
            setText('error-count', data.api_stats?.failed_requests);

            // 시스템 성능 상세 업데이트
            if (perf.cpu) {
                // load_average is an object keyed by "1min" / "5min" / "15min".
                const loadAvg = perf.cpu.load_average || {};
                setText('cpu-count', perf.cpu.count);
                document.getElementById('cpu-freq').textContent = (perf.cpu.frequency_mhz?.toFixed(0) || '0') + 'MHz';
                setText('load-1min', formatNumber(loadAvg['1min'], 2));
                setText('load-5min', formatNumber(loadAvg['5min'], 2));
            }

            if (perf.disk) {
                setText('disk-read', formatNumber(perf.disk.read_bytes / 1024 / 1024, 2));
                setText('disk-write', formatNumber(perf.disk.write_bytes / 1024 / 1024, 2));
                setText('disk-read-count', perf.disk.read_count);
                setText('disk-write-count', perf.disk.write_count);
            }

            if (perf.network) {
                setText('net-sent', formatNumber(perf.network.bytes_sent / 1024 / 1024, 2));
                setText('net-recv', formatNumber(perf.network.bytes_recv / 1024 / 1024, 2));
                setText('net-sent-pkts', perf.network.packets_sent);
                setText('net-recv-pkts', perf.network.packets_recv);
            }

            // 엔드포인트 사용량 업데이트
            updateEndpointUsage(data.api_stats?.endpoint_usage || {});

            // 알림 업데이트
            updateAlerts(data.alerts || []);

            // 차트 업데이트
            updateCharts(data);

            // 마지막 업데이트 시간
            document.getElementById('last-update').textContent = new Date().toLocaleTimeString();
        }

        function updateEndpointUsage(endpointUsage) {
            const container = document.getElementById('endpoint-usage');
            container.innerHTML = '';

            if (Object.keys(endpointUsage).length === 0) {
                container.innerHTML = '<div class="endpoint-item"><div class="endpoint-name">사용량 없음</div><div class="endpoint-count">-</div></div>';
                return;
            }

            Object.entries(endpointUsage).forEach(([endpoint, count]) => {
                // Endpoint names come from request data: insert as text, not HTML.
                const item = document.createElement('div');
                item.className = 'endpoint-item';

                const nameDiv = document.createElement('div');
                nameDiv.className = 'endpoint-name';
                nameDiv.textContent = endpoint;

                const countDiv = document.createElement('div');
                countDiv.className = 'endpoint-count';
                countDiv.textContent = count;

                item.append(nameDiv, countDiv);
                container.appendChild(item);
            });
        }

        function updateAlerts(alerts) {
            const container = document.getElementById('alerts-container');
            container.innerHTML = '';

            if (alerts.length === 0) {
                container.innerHTML = '<div class="alert info"><strong>정보:</strong> 현재 알림이 없습니다.</div>';
                return;
            }

            alerts.forEach(alert => {
                const level = String(alert.level ?? 'info');
                const alertDiv = document.createElement('div');
                alertDiv.className = `alert ${level}`;

                const label = document.createElement('strong');
                label.textContent = `${level.toUpperCase()}:`;

                const stamp = document.createElement('small');
                stamp.textContent = new Date(alert.timestamp).toLocaleString();

                alertDiv.append(label, ` ${alert.message}`, document.createElement('br'), stamp);
                container.appendChild(alertDiv);
            });
        }

        function updateCharts(data) {
            const timestamp = new Date().toLocaleTimeString();

            // 성능 차트 업데이트
            performanceChart.data.labels.push(timestamp);
            performanceChart.data.datasets[0].data.push(data.gpu?.utilization || 0);
            performanceChart.data.datasets[1].data.push(data.system_memory?.usage_percent || 0);

            if (performanceChart.data.labels.length > 20) {
                performanceChart.data.labels.shift();
                performanceChart.data.datasets[0].data.shift();
                performanceChart.data.datasets[1].data.shift();
            }

            // GPU 차트 업데이트
            gpuChart.data.labels.push(timestamp);
            gpuChart.data.datasets[0].data.push(data.gpu?.usage_percent || 0);

            if (gpuChart.data.labels.length > 20) {
                gpuChart.data.labels.shift();
                gpuChart.data.datasets[0].data.shift();
            }

            performanceChart.update();
            gpuChart.update();
        }
    </script>
</body>
</html>
"""
|
|
|
|
|
|
@monitor_app.get("/")
async def dashboard():
    """Serve the dashboard HTML page."""
    page = HTMLResponse(content=HTML_TEMPLATE)
    return page
|
|
|
|
|
|
@monitor_app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    """Accept a dashboard WebSocket and keep it registered while it is open.

    The socket is added to ``connected_clients`` so ``broadcast_data`` can
    push snapshots to it. Incoming messages are read only to keep the
    connection alive and to notice a disconnect.
    """
    await websocket.accept()
    connected_clients.append(websocket)

    try:
        while True:
            # Wait for client messages (keeps the connection alive).
            await websocket.receive_text()
    except WebSocketDisconnect:
        logger.info("클라이언트 연결 해제")
    finally:
        # Always unregister, whatever ended the loop. The broadcaster may have
        # pruned this client already after a failed send, so check first to
        # avoid a ValueError from list.remove.
        if websocket in connected_clients:
            connected_clients.remove(websocket)
|
|
|
|
|
|
@monitor_app.get("/api/status")
async def get_current_status():
    """Collect a fresh snapshot and return it as the JSON response."""
    snapshot = await monitoring_data.collect_data()
    return snapshot
|
|
|
|
|
|
@monitor_app.get("/api/history")
async def get_history():
    """Return the stored snapshot history together with summary statistics."""
    history = monitoring_data.get_history()
    statistics = monitoring_data.get_statistics()
    return {"history": history, "statistics": statistics}
|
|
|
|
|
|
async def broadcast_data():
    """Push a fresh snapshot to every connected WebSocket client, forever.

    Runs as a background task: every 2 seconds it collects data (only when at
    least one client is connected) and sends it as JSON text. Clients whose
    send fails are dropped. Any other error is logged and followed by a
    5 second pause before the loop continues.
    """
    while True:
        try:
            if connected_clients:
                data = await monitoring_data.collect_data()
                message = json.dumps(data, ensure_ascii=False)

                # Iterate over a copy: the list can change while we await a
                # send (clients connecting or disconnecting concurrently).
                for client in list(connected_clients):
                    try:
                        await client.send_text(message)
                    except Exception:
                        # Drop the broken client. It may already have been
                        # removed by its own endpoint handler, so check first.
                        if client in connected_clients:
                            connected_clients.remove(client)

            await asyncio.sleep(2)  # update every 2 seconds

        except Exception as e:
            logger.error(f"브로드캐스트 오류: {e}")
            await asyncio.sleep(5)
|
|
|
|
|
|
@monitor_app.on_event("startup")
async def start_monitoring():
    """Start the monitoring dashboard's background broadcaster on app startup."""
    logger.info("모니터링 대시보드 시작")

    # Jetson optimization (at startup)
    if settings.IS_JETSON:
        logger.info("Jetson Xavier 모드로 모니터링 시작")
        gpu_monitor.optimize_for_jetson()

    # Keep a strong reference to the task: the event loop only holds weak
    # references, so an unreferenced task can be garbage-collected mid-run.
    monitor_app.state.broadcast_task = asyncio.create_task(broadcast_data())
|
|
|
|
|
|
if __name__ == "__main__":
    # Configure logging before the server starts.
    logging.basicConfig(
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        level=logging.INFO,
    )

    # Run the monitoring server on all interfaces at the configured port.
    uvicorn.run(
        "app.monitoring.dashboard:monitor_app",
        port=settings.MONITORING_PORT,
        host="0.0.0.0",
        log_level="info",
    )
|