상태 저장 기능을 추가하고, 워커 및 세션 풀의 상태를 반환하는 API 엔드포인트를 구현하였습니다. 또한, 모니터링 대시보드와 관련된 코드 개선 및 예외 처리를 강화하였습니다.

This commit is contained in:
AGX 2025-08-27 21:20:22 +09:00
parent ac00287eff
commit b8474dec92
11 changed files with 647 additions and 142 deletions

View File

@ -221,6 +221,25 @@ class SessionPool:
self.pool_sizes[model_type] = new_size
def get_status(self) -> dict:
    """Return per-model usage counts for the session pool.

    Returns:
        A dict keyed by each ``ModelType`` value. Every entry holds the
        ``total``, ``in_use`` and ``available`` session counts for that
        model; a model with no sessions is still reported with zeros.
    """
    # Each value of self.pools is a collection of sessions rather than a
    # single session, so flatten one level before inspecting attributes.
    all_sessions = [
        session
        for pool in self.pools.values()
        for session in pool
    ]

    status_by_model = {}
    for model_type in ModelType:
        model_sessions = [s for s in all_sessions if s.model_type == model_type]
        in_use_count = sum(1 for s in model_sessions if s.in_use)
        status_by_model[model_type.value] = {
            "total": len(model_sessions),
            "in_use": in_use_count,
            "available": len(model_sessions) - in_use_count,
        }
    return status_by_model
# 전역 세션 풀 인스턴스
session_pool = SessionPool()

View File

@ -95,7 +95,7 @@ class WorkerManager:
await self.monitor_task
except asyncio.CancelledError:
pass
# 모든 워커 중지
await self._stop_all_workers()
@ -103,7 +103,7 @@ class WorkerManager:
self.executor.shutdown(wait=True)
logger.info("Worker manager stopped")
async def submit_task(self, task_func: Callable, *args, **kwargs) -> Any:
"""태스크를 워커에게 제출합니다."""
task_id = str(uuid.uuid4())
@ -300,29 +300,31 @@ class WorkerManager:
self.workers.clear()
def get_status(self) -> Dict[str, Any]:
"""워커 매니저 상태를 반환합니다."""
workers_by_status = {}
for status in WorkerStatus:
workers_by_status[status.value] = [
{
"id": w.worker_id,
"created_at": w.created_at,
"last_task_at": w.last_task_at,
"current_task": w.current_task,
"task_count": w.task_count,
"error_count": w.error_count
}
for w in self.workers.values() if w.status == status
]
def get_status(self) -> dict:
    """Return a snapshot of the worker manager's current state.

    Returns:
        A dict with ``running``, ``total_workers``, ``queue_size`` and
        ``workers_by_status``. The latter maps a status name to a list of
        per-worker summaries (id, status, task/error counts, last task
        time).
    """
    workers_by_status = {
        "idle": [],
        "busy": [],
        "starting": [],
        "stopping": [],
        "error": [],
    }

    # self.workers is a mapping; iterating it directly would yield the keys,
    # so walk the values to reach the worker objects.
    for worker in self.workers.values():
        status_name = worker.status.value
        # setdefault keeps a status that is missing from the table above
        # from raising KeyError.
        workers_by_status.setdefault(status_name, []).append({
            "id": worker.worker_id,
            "status": status_name,
            "task_count": worker.task_count,
            "error_count": worker.error_count,
            "last_task_at": worker.last_task_at,
        })

    return {
        "running": self.running,
        "total_workers": len(self.workers),
        "queue_size": self.task_queue.qsize(),
        "workers_by_status": workers_by_status,
    }
async def process_inpaint(self, **kwargs) -> Optional[np.ndarray]:

View File

@ -15,14 +15,53 @@ from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.staticfiles import StaticFiles
from fastapi.responses import HTMLResponse
import uvicorn
from fastapi import APIRouter, Request
from ..core.worker_manager import worker_manager
from ..core.session_pool import session_pool
from ..utils.gpu_monitor import gpu_monitor
from ..core.config import settings
# main_app = None
# def init_monitoring(app: FastAPI):
# """모니터링 앱을 초기화하고 메인 앱 객체를 설정합니다."""
# global main_app
# main_app = app
# # lifespan에서 worker_manager와 session_pool이 app.state에 설정되도록 합니다.
# @app.on_event("startup")
# async def startup_event():
# if not hasattr(app.state, 'worker_manager') or not hasattr(app.state, 'session_pool'):
# # main.py의 lifespan에서 설정되므로, 여기서는 경고만 로깅
# logger.warning("worker_manager 또는 session_pool이 app.state에 설정되지 않았습니다.")
logger = logging.getLogger(__name__)
# main.py에서 공유할 객체들 -> 이제 Request 객체를 통해 접근합니다.
# worker_manager = None
# session_pool = None
# def set_shared_objects(wm, sp):
# """메인 서버의 worker_manager와 session_pool을 설정합니다."""
# global worker_manager, session_pool
# worker_manager = wm
# session_pool = sp
def read_status_from_file(path: str = "status.json") -> dict:
    """Load the status snapshot from a JSON file.

    Args:
        path: File to read. Defaults to ``status.json`` in the current
            working directory.

    Returns:
        The decoded JSON object. When the file is missing, unreadable, not
        valid JSON, or does not contain a JSON object, an empty placeholder
        snapshot is returned instead so callers can always use ``.get``.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            status = json.load(f)
    except (OSError, ValueError):
        # OSError covers a missing or unreadable file; ValueError covers
        # json.JSONDecodeError as well as text decoding failures.
        status = None

    if isinstance(status, dict):
        return status

    return {
        "worker_status": {"running": False, "total_workers": 0, "queue_size": 0, "workers_by_status": {}},
        "session_status": {},
        "timestamp": 0
    }
# API 라우터 생성
api_router = APIRouter()
# 모니터링 앱 생성
monitor_app = FastAPI(
title="인페인팅 서버 모니터링 대시보드",
@ -49,45 +88,52 @@ class MonitoringData:
self.alerts = []
async def collect_data(self) -> Dict[str, Any]:
"""현재 시스템 상태 데이터를 수집합니다."""
timestamp = datetime.now().isoformat()
"""주기적으로 서버 상태 데이터를 수집합니다."""
status = read_status_from_file()
worker_status = status.get("worker_status", {})
session_status = status.get("session_status", {})
timestamp = status.get("timestamp", 0)
# 워커 매니저 상태 (안전하게 가져오기)
try:
worker_status = worker_manager.get_status() if worker_manager else self._get_default_worker_status()
except Exception as e:
logger.warning(f"워커 매니저 상태 조회 실패: {e}")
worker_status = self._get_default_worker_status()
# GPU 정보
gpu_info = gpu_monitor.get_gpu_memory_info()
gpu_utilization = gpu_monitor.get_gpu_utilization()
# 세션 풀 상태 (안전하게 가져오기)
try:
session_status = session_pool.get_status() if session_pool else self._get_default_session_status()
except Exception as e:
logger.warning(f"세션 풀 상태 조회 실패: {e}")
session_status = self._get_default_session_status()
# 시스템 메모리 정보
system_memory = gpu_monitor.get_system_memory_info()
# 시스템 성능 지표
system_performance = self._get_system_performance()
# 워커 매니저 상태
worker_status = worker_manager.get_status()
# 세션 풀 상태
session_status = await session_pool.get_pool_status()
# Jetson 전용 정보
# Jetson 전용 정보 (안전하게 가져오기)
jetson_info = {}
if settings.IS_JETSON:
jetson_info = gpu_monitor.get_jetson_specific_info()
try:
jetson_info = gpu_monitor.get_jetson_specific_info()
if jetson_info is None:
jetson_info = {}
except Exception as e:
logger.warning(f"Jetson 전용 정보 조회 실패: {e}")
jetson_info = {}
# API 통계
api_stats = self._get_api_statistics()
# 알림 및 경고
alerts = self._check_alerts(gpu_info, system_memory, worker_status)
alerts = self._check_alerts(worker_status)
data = {
"timestamp": timestamp,
"timestamp": datetime.now().isoformat(),
"system_type": "Jetson Xavier" if settings.IS_JETSON else "x86_64",
"gpu": {
**gpu_info,
"utilization": gpu_utilization
**gpu_monitor.get_gpu_memory_info(),
"utilization": gpu_monitor.get_gpu_utilization()
},
"system_memory": system_memory,
"system_performance": system_performance,
"system_memory": gpu_monitor.get_system_memory_info(),
"system_performance": self._get_system_performance(),
"workers": worker_status,
"sessions": session_status,
"jetson": jetson_info,
@ -168,36 +214,11 @@ class MonitoringData:
"recent_errors": self.api_stats["errors"][-5:] # 최근 5개 에러
}
def _check_alerts(self, gpu_info: Dict, system_memory: Dict, worker_status: Dict) -> List[Dict]:
def _check_alerts(self, worker_status: Dict) -> List[Dict]:
"""시스템 상태를 확인하고 알림을 생성합니다."""
alerts = []
current_time = datetime.now()
# GPU 메모리 경고
if gpu_info.get("usage_percent", 0) > 90:
alerts.append({
"level": "critical",
"message": f"GPU 메모리 사용률이 높습니다: {gpu_info.get('usage_percent', 0):.1f}%",
"timestamp": current_time.isoformat(),
"category": "gpu"
})
elif gpu_info.get("usage_percent", 0) > 80:
alerts.append({
"level": "warning",
"message": f"GPU 메모리 사용률이 높습니다: {gpu_info.get('usage_percent', 0):.1f}%",
"timestamp": current_time.isoformat(),
"category": "gpu"
})
# 시스템 메모리 경고
if system_memory.get("usage_percent", 0) > 90:
alerts.append({
"level": "critical",
"message": f"시스템 메모리 사용률이 높습니다: {system_memory.get('usage_percent', 0):.1f}%",
"timestamp": current_time.isoformat(),
"category": "memory"
})
# 워커 상태 경고
if worker_status.get("active_workers", 0) == 0:
alerts.append({
@ -272,6 +293,22 @@ class MonitoringData:
"data_points": len(recent_data)
}
def _get_default_worker_status(self):
    """Return a placeholder worker status: not running, no workers, empty queue."""
    status_names = ("idle", "busy", "starting", "stopping", "error")
    # Build a fresh empty list per status so callers never share state.
    empty_groups = {name: [] for name in status_names}

    placeholder = {
        "total_workers": 0,
        "queue_size": 0,
        "workers_by_status": empty_groups,
        "running": False,
    }
    return placeholder
def _get_default_session_status(self):
    """Return a placeholder session status with zero counts for each model.

    The keys use the underscore spelling (``simple_lama``) so the
    placeholder has the same keys as the per-model status produced from the
    model type values.
    """
    model_names = ("simple_lama", "migan", "rembg")
    return {
        name: {"total": 0, "in_use": 0, "available": 0}
        for name in model_names
    }
# 전역 모니터링 데이터 인스턴스
monitoring_data = MonitoringData()
@ -835,6 +872,112 @@ async def dashboard():
return HTMLResponse(content=HTML_TEMPLATE)
@api_router.get("/status")
async def get_status():
    """Collect and return the current monitoring snapshot."""
    snapshot = await monitoring_data.collect_data()
    return snapshot
@api_router.get("/simple")
def get_simple_status():
    """Return a minimal status payload (CPU and memory usage).

    Any failure while gathering the values is reported as an ``error``
    entry in the response body instead of being raised.
    """
    try:
        import psutil

        now = time.time()
        if settings.IS_JETSON:
            system_type = "Jetson Xavier"
        else:
            system_type = "x86_64"
        cpu_percent = psutil.cpu_percent()
        memory_percent = psutil.virtual_memory().percent
    except Exception as e:
        return {"error": f"간단한 상태 수집 실패: {str(e)}"}

    return {
        "timestamp": now,
        "system_type": system_type,
        "cpu_percent": cpu_percent,
        "memory_percent": memory_percent,
        "status": "running",
    }
@api_router.get("/test_data")
async def get_test_data():
    """Return randomized dummy monitoring data for testing.

    Derived figures are kept mutually consistent: GPU ``free`` and
    ``usage_percent`` follow from ``used``, session ``available`` follows
    from ``in_use``, and the API counters add up. Alerts are dicts with the
    same fields (level, message, timestamp, category) as real alerts.
    """
    import random

    now = datetime.now().isoformat()

    def _session_entry(total):
        # Draw in_use once so in_use + available always equals total.
        in_use = random.randint(0, total)
        return {"total": total, "in_use": in_use, "available": total - in_use}

    gpu_total = 8.0
    gpu_used = round(random.uniform(0.5, 2.0), 2)

    total_requests = random.randint(100, 500)
    failed_requests = random.randint(0, 10)
    successful_requests = total_requests - failed_requests
    response_times = [round(random.uniform(0.1, 2.0), 2) for _ in range(10)]

    alerts = []
    if random.random() > 0.7:
        alerts.append({
            "level": "info",
            "message": "정보: 모니터링 데이터를 수집 중입니다...",
            "timestamp": now,
            "category": "monitoring",
        })

    return {
        "timestamp": now,
        "system_type": "Jetson Xavier",
        "gpu": {
            "total": gpu_total,
            "used": gpu_used,
            "free": round(gpu_total - gpu_used, 2),
            "usage_percent": round(gpu_used / gpu_total * 100, 1),
            "utilization": round(random.uniform(0, 15), 1),
            "temperature": round(random.uniform(35, 45), 1),
            "clock_speed": random.randint(1100, 1300)
        },
        "system_memory": {
            "total": 30.26,
            "used": round(random.uniform(10, 15), 2),
            "free": round(random.uniform(15, 20), 2),
            "usage_percent": round(random.uniform(35, 50), 1)
        },
        "system_performance": {
            "cpu_percent": round(random.uniform(5, 20), 1),
            "cpu_count": 8,
            "cpu_freq": {"current": 2266, "min": 1190, "max": 2265},
            "load_avg": [round(random.uniform(0.1, 1.0), 2) for _ in range(3)],
            "disk_io": {"read_mb": random.randint(10, 100), "write_mb": random.randint(5, 50), "read_count": random.randint(100, 1000), "write_count": random.randint(50, 500)},
            "net_io": {"sent_mb": random.randint(1, 10), "recv_mb": random.randint(1, 10), "sent_packets": random.randint(100, 1000), "recv_packets": random.randint(100, 1000)},
            "process_count": random.randint(300, 400)
        },
        "workers": {
            "total_workers": 2,
            "queue_size": random.randint(0, 5),
            "workers_by_status": {
                "idle": [{"id": "worker_1", "task_count": random.randint(10, 50)}],
                "busy": [{"id": "worker_2", "current_task": "inpainting", "task_count": random.randint(5, 30)}] if random.random() > 0.5 else [],
                "starting": [],
                "stopping": [],
                "error": []
            },
            "running": True
        },
        "sessions": {
            "simple_lama": _session_entry(2),
            "migan": _session_entry(2),
            "rembg": _session_entry(1)
        },
        "api_stats": {
            "total_requests": total_requests,
            "successful_requests": successful_requests,
            "failed_requests": failed_requests,
            "response_times": response_times,
            "success_rate": round(successful_requests / total_requests * 100, 1),
            "avg_response_time": round(sum(response_times) / len(response_times), 2),
            "errors": []
        },
        "alerts": alerts
    }
@api_router.get("/history")
async def get_history():
    """Return the recorded monitoring history together with its statistics."""
    history = monitoring_data.get_history()
    statistics = monitoring_data.get_statistics()
    return {"history": history, "statistics": statistics}
@api_router.get("/worker-status")
def get_worker_status_api():
    """Return the ``worker_status`` section of the status file (empty dict if absent)."""
    return read_status_from_file().get("worker_status", {})
@api_router.get("/session-status")
def get_session_status_api():
    """Return the ``session_status`` section of the status file (empty dict if absent)."""
    return read_status_from_file().get("session_status", {})
# FastAPI 앱에 라우터 포함
monitor_app.include_router(api_router, prefix="/api")
# WebSocket 핸들러
@monitor_app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
"""WebSocket 연결을 처리합니다."""
@ -843,34 +986,22 @@ async def websocket_endpoint(websocket: WebSocket):
try:
while True:
# 클라이언트로부터 메시지 대기 (연결 유지용)
await websocket.receive_text()
# 주기적으로 데이터 전송
data = await monitoring_data.collect_data()
await websocket.send_json(data)
await asyncio.sleep(2) # 2초마다 업데이트
except WebSocketDisconnect:
connected_clients.remove(websocket)
logger.info("클라이언트 연결 해제")
@monitor_app.get("/api/status")
async def get_current_status():
"""현재 상태를 JSON으로 반환합니다."""
return await monitoring_data.collect_data()
@monitor_app.get("/api/history")
async def get_history():
"""데이터 히스토리를 반환합니다."""
return {
"history": monitoring_data.get_history(),
"statistics": monitoring_data.get_statistics()
}
async def broadcast_data():
"""연결된 모든 클라이언트에게 데이터를 브로드캐스트합니다."""
while True:
try:
if connected_clients:
data = await monitoring_data.collect_data()
data = await monitoring_data.collect_data() # WebSocket 연결이 없으므로 None 전달
message = json.dumps(data, ensure_ascii=False)
# 연결이 끊어진 클라이언트 제거

View File

@ -1,38 +1,62 @@
INFO: Started server process [299962]
2025-08-27 15:17:30,514 - uvicorn.error - INFO - Started server process [299962]
INFO: Started server process [396102]
2025-08-27 21:19:49,229 - uvicorn.error - INFO - Started server process [396102]
INFO: Waiting for application startup.
2025-08-27 15:17:30,514 - uvicorn.error - INFO - Waiting for application startup.
2025-08-27 15:17:30,515 - main - INFO - 🚀 인페인팅 서버 시작 중...
2025-08-27 15:17:30,515 - app.core.session_pool - INFO - Initializing session pools...
2025-08-27 15:17:30,515 - app.core.session_pool - INFO - Initializing 2 sessions for simple_lama
2025-08-27 15:17:30,616 - app.core.session_pool - INFO - Created session simple_lama_0
2025-08-27 15:17:30,717 - app.core.session_pool - INFO - Created session simple_lama_1
2025-08-27 15:17:30,718 - app.core.session_pool - INFO - Initializing 2 sessions for migan
2025-08-27 15:17:30,818 - app.core.session_pool - INFO - Created session migan_0
2025-08-27 15:17:30,920 - app.core.session_pool - INFO - Created session migan_1
2025-08-27 15:17:30,920 - app.core.session_pool - INFO - Initializing 1 sessions for rembg
2025-08-27 15:17:31,021 - app.core.session_pool - INFO - Created session rembg_0
2025-08-27 15:17:31,021 - app.core.session_pool - INFO - Session pools initialized successfully
2025-08-27 15:17:31,021 - main - INFO - ✅ 세션 풀 초기화 완료
2025-08-27 15:17:31,022 - app.core.worker_manager - INFO - Starting worker manager...
2025-08-27 15:17:31,022 - app.core.worker_manager - INFO - Worker manager started with 1 workers
2025-08-27 15:17:31,022 - main - INFO - ✅ 워커 매니저 시작 완료
2025-08-27 15:17:31,023 - main - INFO - 🎉 인페인팅 서버 시작 완료!
2025-08-27 21:19:49,230 - uvicorn.error - INFO - Waiting for application startup.
2025-08-27 21:19:49,231 - main - INFO - 🚀 인페인팅 서버 시작 중...
2025-08-27 21:19:49,231 - main - INFO - ✅ 공유 객체를 app.state에 저장 완료
2025-08-27 21:19:49,231 - app.core.session_pool - INFO - Initializing session pools...
2025-08-27 21:19:49,232 - app.core.session_pool - INFO - Initializing 2 sessions for simple_lama
2025-08-27 21:19:49,232 - main - WARNING - 상태 저장 실패: 'list' object has no attribute 'model_type'
2025-08-27 21:19:49,333 - app.core.session_pool - INFO - Created session simple_lama_0
2025-08-27 21:19:49,435 - app.core.session_pool - INFO - Created session simple_lama_1
2025-08-27 21:19:49,436 - app.core.session_pool - INFO - Initializing 2 sessions for migan
2025-08-27 21:19:49,537 - app.core.session_pool - INFO - Created session migan_0
2025-08-27 21:19:49,639 - app.core.session_pool - INFO - Created session migan_1
2025-08-27 21:19:49,639 - app.core.session_pool - INFO - Initializing 1 sessions for rembg
2025-08-27 21:19:49,741 - app.core.session_pool - INFO - Created session rembg_0
2025-08-27 21:19:49,741 - app.core.session_pool - INFO - Session pools initialized successfully
2025-08-27 21:19:49,741 - main - INFO - ✅ 세션 풀 초기화 완료
2025-08-27 21:19:49,742 - app.core.worker_manager - INFO - Starting worker manager...
2025-08-27 21:19:49,742 - app.core.worker_manager - INFO - Worker manager started with 1 workers
2025-08-27 21:19:49,743 - main - INFO - ✅ 워커 매니저 시작 완료
2025-08-27 21:19:49,743 - main - INFO - 🎉 인페인팅 서버 시작 완료!
INFO: Application startup complete.
2025-08-27 15:17:31,023 - uvicorn.error - INFO - Application startup complete.
2025-08-27 21:19:49,743 - uvicorn.error - INFO - Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
2025-08-27 15:17:31,025 - uvicorn.error - INFO - Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
INFO: 127.0.0.1:47618 - "GET /health HTTP/1.1" 200 OK
INFO: 127.0.0.1:48780 - "GET /health HTTP/1.1" 200 OK
INFO: 127.0.0.1:48790 - "GET /api/v1/server-config HTTP/1.1" 200 OK
INFO: 127.0.0.1:48792 - "GET /api/v1/samplers HTTP/1.1" 200 OK
2025-08-27 15:17:45,959 - app.core.worker_manager - ERROR - 인페인팅 처리 실패: cannot import name 'SimpleLamaModel' from 'app.models.simple_lama' (/home/ckh08045/work/inpaintServer/./app/models/simple_lama.py)
INFO: 127.0.0.1:48798 - "POST /api/v1/inpaint HTTP/1.1" 500 Internal Server Error
2025-08-27 15:17:45,986 - app.core.worker_manager - ERROR - 인페인팅 처리 실패: cannot import name 'SimpleLamaModel' from 'app.models.simple_lama' (/home/ckh08045/work/inpaintServer/./app/models/simple_lama.py)
INFO: 127.0.0.1:48800 - "POST /api/v1/inpaint HTTP/1.1" 500 Internal Server Error
2025-08-27 15:17:46,007 - app.core.worker_manager - ERROR - 배경 제거 처리 실패: cannot import name 'RembgModel' from 'app.models.rembg_model' (/home/ckh08045/work/inpaintServer/./app/models/rembg_model.py)
2025-08-27 15:17:46,008 - app.api.endpoints - ERROR - 배경 제거 처리 실패: cannot unpack non-iterable NoneType object
INFO: 127.0.0.1:48808 - "POST /api/v1/remove_bg HTTP/1.1" 500 Internal Server Error
2025-08-27 15:17:46,031 - app.core.worker_manager - ERROR - 배경 제거 처리 실패: cannot import name 'RembgModel' from 'app.models.rembg_model' (/home/ckh08045/work/inpaintServer/./app/models/rembg_model.py)
2025-08-27 15:17:46,032 - app.api.endpoints - ERROR - 플러그인 이미지 생성 실패: cannot unpack non-iterable NoneType object
INFO: 127.0.0.1:48818 - "POST /api/v1/run_plugin_gen_image HTTP/1.1" 500 Internal Server Error
2025-08-27 21:19:49,745 - uvicorn.error - INFO - Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
2025-08-27 21:19:50,233 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
INFO: 127.0.0.1:51092 - "GET /health HTTP/1.1" 200 OK
2025-08-27 21:19:51,234 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:19:52,236 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:19:53,238 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:19:54,241 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:19:55,242 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:19:56,244 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:19:57,245 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:19:58,247 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
INFO: 127.0.0.1:51116 - "GET /docs HTTP/1.1" 200 OK
INFO: 127.0.0.1:51116 - "GET /openapi.json HTTP/1.1" 200 OK
2025-08-27 21:19:59,259 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:00,260 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:01,262 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:02,264 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:03,266 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:04,268 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:05,270 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:06,271 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:07,274 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:08,275 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:09,277 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:10,278 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:11,280 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:12,281 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:13,282 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:14,284 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:15,285 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:16,286 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:17,288 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:18,290 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:19,293 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:20,294 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:21,296 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:22,298 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'

View File

@ -1 +1 @@
299962
396102

View File

@ -1,18 +1,20 @@
INFO: Started server process [300005]
INFO: Started server process [396175]
INFO: Waiting for application startup.
Fan control not available
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8001 (Press CTRL+C to quit)
INFO: 127.0.0.1:47362 - "GET / HTTP/1.1" 200 OK
INFO: 127.0.0.1:52036 - "GET / HTTP/1.1" 200 OK
INFO: 127.0.0.1:52036 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: ('127.0.0.1', 52060) - "WebSocket /ws" [accepted]
INFO: 127.0.0.1:51590 - "GET / HTTP/1.1" 200 OK
INFO: 127.0.0.1:43898 - "GET / HTTP/1.1" 200 OK
INFO: ('127.0.0.1', 43930) - "WebSocket /ws" [accepted]
세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type'
INFO: connection open
INFO: 127.0.0.1:56700 - "GET / HTTP/1.1" 200 OK
INFO: ('127.0.0.1', 56722) - "WebSocket /ws" [accepted]
INFO: connection open
INFO: connection closed
INFO: 127.0.0.1:33852 - "GET / HTTP/1.1" 200 OK
INFO: ('127.0.0.1', 33866) - "WebSocket /ws" [accepted]
INFO: connection open
INFO: connection closed
세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type'
세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type'
세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type'
세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type'
세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type'
세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type'
세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type'
세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type'
세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type'
세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type'

View File

@ -1 +1 @@
300005
396175

161
main.py
View File

@ -5,6 +5,8 @@ iopaint와 호환되는 API를 제공합니다.
"""
import time
import logging
import json
import asyncio
from contextlib import asynccontextmanager
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
@ -27,6 +29,20 @@ logger = logging.getLogger(__name__)
# 서버 시작 시간 기록
start_time = time.time()
async def save_status_periodically(interval: float = 1.0, path: str = "status.json"):
    """Periodically write worker and session-pool status to a JSON file.

    Loops until the surrounding task is cancelled. A failure while
    collecting or writing the status is logged as a warning and the loop
    continues with the next cycle.

    Args:
        interval: Seconds to sleep between writes.
        path: Destination file for the status snapshot.
    """
    import os  # local import so this function does not rely on the module header

    tmp_path = f"{path}.tmp"
    while True:
        try:
            status = {
                "worker_status": worker_manager.get_status(),
                "session_status": session_pool.get_status(),
                "timestamp": time.time()
            }
            # Write to a sibling temp file and rename it over the target so
            # a process polling the file does not read a half-written one.
            with open(tmp_path, "w", encoding="utf-8") as f:
                json.dump(status, f)
            os.replace(tmp_path, path)
        except Exception as e:
            logger.warning(f"상태 저장 실패: {e}")
        await asyncio.sleep(interval)
@asynccontextmanager
async def lifespan(app: FastAPI):
@ -34,6 +50,14 @@ async def lifespan(app: FastAPI):
# 시작 시
logger.info("🚀 인페인팅 서버 시작 중...")
# app.state에 공유 객체 저장
app.state.worker_manager = worker_manager
app.state.session_pool = session_pool
logger.info("✅ 공유 객체를 app.state에 저장 완료")
# 상태 저장 백그라운드 작업 시작
status_task = asyncio.create_task(save_status_periodically())
try:
# 세션 풀 초기화
await session_pool.initialize()
@ -54,6 +78,9 @@ async def lifespan(app: FastAPI):
# 종료 시
logger.info("🛑 인페인팅 서버 종료 중...")
# 상태 저장 백그라운드 작업 취소
status_task.cancel()
try:
# 워커 매니저 중지
await worker_manager.stop()
@ -85,8 +112,138 @@ app.add_middleware(
# API 라우터 포함
app.include_router(router)
# 모니터링 대시보드 마운트
app.mount("/monitoring", monitor_app, name="monitoring")
# 모니터링은 start_server.sh를 통해 독립적으로 실행됩니다.
# app.mount("/monitoring", monitor_app, name="monitoring")
# 모니터링 데이터 직접 제공 (완전 통합)
def _monitoring_worker_summary():
    """Summarize worker_manager state; fall back to an empty summary on failure."""
    groups = {"idle": [], "busy": [], "starting": [], "stopping": [], "error": []}
    try:
        workers = getattr(worker_manager, 'workers', None) or {}
        for worker in workers.values():
            status_name = getattr(getattr(worker, 'status', None), 'value', None)
            if status_name in groups:
                groups[status_name].append({
                    "id": getattr(worker, 'worker_id', None),
                    "task_count": getattr(worker, 'task_count', 0),
                    "error_count": getattr(worker, 'error_count', 0),
                })
        # The manager's own status code reads its queue as `task_queue`.
        task_queue = getattr(worker_manager, 'task_queue', None)
        return {
            "total_workers": len(workers),
            "queue_size": task_queue.qsize() if task_queue is not None else 0,
            "workers_by_status": groups,
            "running": getattr(worker_manager, 'running', False)
        }
    except Exception as e:
        logger.warning(f"워커 상태 수집 실패: {e}")
        return {
            "total_workers": 0,
            "queue_size": 0,
            "workers_by_status": {"idle": [], "busy": [], "starting": [], "stopping": [], "error": []},
            "running": False
        }


def _monitoring_session_summary():
    """Summarize session_pool state; fall back to an empty summary on failure."""
    try:
        # NOTE(review): assumes session_pool.pools maps each model to an
        # iterable of session objects exposing `in_use` and `model_type`,
        # as the pool's own status code reads them - confirm.
        pools = getattr(session_pool, 'pools', None) or {}
        sessions = [s for pool in pools.values() for s in pool]
        model_distribution = {"simple_lama": 0, "migan": 0, "rembg": 0}
        available_sessions = 0
        for session in sessions:
            if not getattr(session, 'in_use', True):
                available_sessions += 1
            model_name = getattr(getattr(session, 'model_type', None), 'value', None)
            if model_name is not None:
                model_distribution[model_name] = model_distribution.get(model_name, 0) + 1
        return {
            "total_sessions": len(sessions),
            "available_sessions": available_sessions,
            "model_distribution": model_distribution
        }
    except Exception as e:
        logger.warning(f"세션 상태 수집 실패: {e}")
        return {
            "total_sessions": 0,
            "available_sessions": 0,
            "model_distribution": {"simple_lama": 0, "migan": 0, "rembg": 0}
        }


@app.get("/monitoring/api/status")
async def get_monitoring_status():
    """Return system, GPU, worker, session and API statistics in one payload.

    Worker and session sections degrade to zeroed summaries when they
    cannot be collected. Any other failure is logged and reported as an
    ``error`` entry in the response body.
    """
    try:
        import psutil
        from app.utils.gpu_monitor import GPUMonitor

        # System information
        cpu_percent = psutil.cpu_percent()
        memory = psutil.virtual_memory()
        process_count = len(psutil.pids())

        # GPU information
        # NOTE(review): get_gpu_info() is awaited here, but only synchronous
        # GPUMonitor helpers are visible elsewhere - confirm it exists and
        # is a coroutine.
        gpu_monitor = GPUMonitor()
        gpu_info = await gpu_monitor.get_gpu_info()

        worker_status = _monitoring_worker_summary()
        session_status = _monitoring_session_summary()

        # API statistics (placeholder values; no real counters are wired in)
        api_stats = {
            "total_requests": 0,
            "success_rate": 100.0,
            "average_response_time": 0.0,
            "error_count": 0
        }

        return {
            "timestamp": time.time(),
            "system_type": "Jetson Xavier" if settings.IS_JETSON else "x86_64",
            "system": {
                "cpu_percent": cpu_percent,
                "memory_percent": memory.percent,
                "process_count": process_count
            },
            "gpu": gpu_info,
            "worker_status": worker_status,
            "session_status": session_status,
            "api_stats": api_stats
        }
    except Exception as e:
        logger.error(f"모니터링 데이터 수집 실패: {e}")
        return {"error": f"모니터링 데이터 수집 실패: {str(e)}"}
@app.get("/monitoring/api/simple")
async def get_simple_monitoring():
    """Return a minimal monitoring payload (CPU and memory usage).

    Any failure while gathering the values is reported as an ``error``
    entry in the response body instead of being raised.
    """
    try:
        import psutil

        now = time.time()
        if settings.IS_JETSON:
            system_type = "Jetson Xavier"
        else:
            system_type = "x86_64"
        cpu_percent = psutil.cpu_percent()
        memory_percent = psutil.virtual_memory().percent
    except Exception as e:
        return {"error": f"간단한 상태 수집 실패: {str(e)}"}

    return {
        "timestamp": now,
        "system_type": system_type,
        "cpu_percent": cpu_percent,
        "memory_percent": memory_percent,
        "status": "running",
    }
@app.get("/monitoring/api/worker-status")
async def get_worker_status():
    """Return a compact summary of the worker manager.

    Returns:
        A dict with ``total_workers``, ``queue_size``, ``running`` and a
        ``status`` string ("active" or "stopped"). On failure the error is
        logged and an ``error`` entry is returned instead.
    """
    try:
        workers = getattr(worker_manager, 'workers', {})
        # The manager's own status code reads its queue as `task_queue`;
        # `queue` is kept only as a fallback name.
        queue = getattr(worker_manager, 'task_queue', None)
        if queue is None:
            queue = getattr(worker_manager, 'queue', None)
        running = getattr(worker_manager, 'running', False)

        return {
            "total_workers": len(workers) if workers else 0,
            "queue_size": queue.qsize() if queue is not None else 0,
            "running": running,
            "status": "active" if running else "stopped"
        }
    except Exception as e:
        logger.warning(f"워커 상태 조회 실패: {e}")
        return {"error": f"워커 상태 조회 실패: {str(e)}"}
@app.get("/monitoring/api/session-status")
async def get_session_status():
    """Return a compact summary of the session pool.

    Returns:
        A dict with ``total_sessions``, ``available_sessions`` and
        ``model_distribution`` (model name -> session count). On failure
        the error is logged and an ``error`` entry is returned instead.
    """
    try:
        # NOTE(review): assumes session_pool.pools maps each model to an
        # iterable of session objects exposing `in_use` and `model_type`,
        # as the pool's own status code reads them - confirm.
        pools = getattr(session_pool, 'pools', None) or {}
        sessions = [s for pool in pools.values() for s in pool]

        model_distribution = {"simple_lama": 0, "migan": 0, "rembg": 0}
        available_sessions = 0
        for session in sessions:
            # A session without an `in_use` flag is counted as unavailable.
            if not getattr(session, 'in_use', True):
                available_sessions += 1
            model_name = getattr(getattr(session, 'model_type', None), 'value', None)
            if model_name is not None:
                model_distribution[model_name] = model_distribution.get(model_name, 0) + 1

        return {
            "total_sessions": len(sessions),
            "available_sessions": available_sessions,
            "model_distribution": model_distribution
        }
    except Exception as e:
        logger.warning(f"세션 상태 조회 실패: {e}")
        return {"error": f"세션 상태 조회 실패: {str(e)}"}
if __name__ == "__main__":

120
monitoring_debug.json Normal file
View File

@ -0,0 +1,120 @@
{
"timestamp": "2025-08-27T19:18:15.546032",
"system_type": "Jetson Xavier",
"gpu": {
"total": 8.0,
"used": 0.0,
"free": 8.0,
"usage_percent": 0.0,
"utilization": 0.0
},
"system_memory": {
"total": 30.26,
"used": 11.77,
"free": 1.96,
"usage_percent": 40.3
},
"system_performance": {
"cpu": {
"usage_percent": 7.0,
"count": 8,
"frequency_mhz": 1190.4,
"load_average": {
"1min": 0.72,
"5min": 0.82,
"15min": 0.8
}
},
"disk": {
"read_bytes": 18622774784,
"write_bytes": 8354913792,
"read_count": 299549,
"write_count": 245067
},
"network": {
"bytes_sent": 1438356649,
"bytes_recv": 2090199696,
"packets_sent": 1124215,
"packets_recv": 2485680
},
"processes": 315
},
"workers": {
"total_workers": 0,
"queue_size": 0,
"workers_by_status": {
"idle": [],
"busy": [],
"starting": [],
"stopping": [],
"error": []
},
"gpu_info": {
"total": 8.0,
"used": 0.0,
"free": 8.0,
"usage_percent": 0.0
},
"system_memory": {
"total": 30.26,
"used": 11.77,
"free": 1.96,
"usage_percent": 40.3
},
"running": false
},
"sessions": {
"simple_lama": {
"total": 0,
"in_use": 0,
"available": 0,
"sessions": []
},
"migan": {
"total": 0,
"in_use": 0,
"available": 0,
"sessions": []
},
"rembg": {
"total": 0,
"in_use": 0,
"available": 0,
"sessions": []
}
},
"jetson": {
"gpu_frequency": null,
"cpu_frequency": null,
"memory_frequency": null,
"temperature": {
"zone_zone2": 36.0,
"zone_zone0": 39.0,
"zone_zone7": 37.5,
"zone_zone5": 37.0,
"zone_zone3": 36.0,
"zone_zone1": 39.0,
"zone_zone6": 41.0,
"zone_zone4": 50.0
},
"power_consumption": null,
"power_mode": "MAXN"
},
"api_stats": {
"total_requests": 0,
"successful_requests": 0,
"failed_requests": 0,
"success_rate": 0.0,
"endpoint_usage": {},
"average_response_time": 0,
"recent_errors": []
},
"alerts": [
{
"level": "critical",
"message": "활성 워커가 없습니다",
"timestamp": "2025-08-27T19:18:17.118052",
"category": "workers"
}
]
}

1
status.json Normal file
View File

@ -0,0 +1 @@
{"worker_status": {"running": true, "total_workers": 1, "queue_size": 0, "workers_by_status": {"idle": [{"id": "worker_d5e6bd0a", "status": "idle", "task_count": 0, "error_count": 0, "last_task_at": null}], "busy": [], "starting": [], "stopping": [], "error": []}}, "session_status": {"simple_lama": {"total": 2, "in_use": 0, "available": 2}, "migan": {"total": 2, "in_use": 0, "available": 2}, "rembg": {"total": 1, "in_use": 0, "available": 1}}, "api_stats": {"total_requests": 3, "successful_requests": 2, "failed_requests": 1, "endpoint_usage": {"/": 1, "/health": 1, "/api/v1/model": 1}, "start_time": 1756296888.0927753, "uptime_seconds": 253.7529318332672, "average_response_time_ms": 2.473115921020508}, "timestamp": 1756297141.8458862}

49
test_monitoring.py Normal file
View File

@ -0,0 +1,49 @@
#!/usr/bin/env python3
"""
모니터링 데이터 수집 테스트
"""
import asyncio
import json
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from app.monitoring.dashboard import monitoring_data
async def test_monitoring():
    """Collect monitoring data once, print a summary and dump it to a file.

    The collected data is written to ``monitoring_debug.json``. Any
    exception is reported on stdout together with its traceback rather than
    propagated.
    """
    # (key in the collected data, label printed in front of its value)
    summary_fields = (
        ("system_type", "🖥️ 시스템 타입"),
        ("gpu", "🎮 GPU 정보"),
        ("workers", "⚙️ 워커 상태"),
        ("sessions", "🔗 세션 풀"),
    )

    try:
        print("모니터링 데이터 수집 테스트 시작...")

        data = await monitoring_data.collect_data()

        print("✅ 데이터 수집 성공!")
        print(f"📊 데이터 키들: {list(data.keys())}")

        # Print only the summary sections that are actually present.
        for key, label in summary_fields:
            if key in data:
                print(f"{label}: {data[key]}")

        # Persist the full payload for debugging.
        with open('monitoring_debug.json', 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False, default=str)
        print("💾 디버그 데이터가 monitoring_debug.json에 저장되었습니다.")

    except Exception as e:
        print(f"❌ 모니터링 데이터 수집 실패: {e}")
        import traceback
        traceback.print_exc()
if __name__ == "__main__":
asyncio.run(test_monitoring())