상태 저장 기능을 추가하고, 워커 및 세션 풀의 상태를 반환하는 API 엔드포인트를 구현하였습니다. 또한, 모니터링 대시보드와 관련된 코드 개선 및 예외 처리를 강화하였습니다.

This commit is contained in:
AGX 2025-08-27 21:20:22 +09:00
parent ac00287eff
commit b8474dec92
11 changed files with 647 additions and 142 deletions

View File

@ -221,6 +221,25 @@ class SessionPool:
self.pool_sizes[model_type] = new_size self.pool_sizes[model_type] = new_size
def get_status(self) -> dict:
    """Return the current state of the session pool, grouped by model type.

    Returns:
        A dict keyed by ``model_type.value`` for every member of ``ModelType``.
        Each value holds the ``total``, ``in_use`` and ``available`` session
        counts for that model.
    """
    from itertools import chain

    # Each value of self.pools is itself a list of sessions, so the lists must
    # be flattened before a session's attributes can be read. Treating the
    # lists as sessions raised "'list' object has no attribute 'model_type'".
    all_sessions = list(chain.from_iterable(self.pools.values()))

    status_by_model = {}
    for model_type in ModelType:
        model_sessions = [s for s in all_sessions if s.model_type == model_type]
        in_use_count = sum(1 for s in model_sessions if s.in_use)
        status_by_model[model_type.value] = {
            "total": len(model_sessions),
            "in_use": in_use_count,
            "available": len(model_sessions) - in_use_count,
        }
    return status_by_model
# 전역 세션 풀 인스턴스 # 전역 세션 풀 인스턴스
session_pool = SessionPool() session_pool = SessionPool()

View File

@ -300,29 +300,31 @@ class WorkerManager:
self.workers.clear() self.workers.clear()
def get_status(self) -> dict:
    """Return the current state of the worker manager.

    Returns:
        A dict with ``running``, ``total_workers``, ``queue_size`` and
        ``workers_by_status``. The last maps each status name to a list of
        per-worker summaries (id, status, task/error counts, last task time).
    """
    workers_by_status = {
        "idle": [],
        "busy": [],
        "starting": [],
        "stopping": [],
        "error": [],
    }

    # self.workers is a mapping, so iterating it directly yields the keys
    # (strings) and fails with "'str' object has no attribute 'worker_id'".
    # The worker objects are the mapping's values.
    for worker in self.workers.values():
        status_name = worker.status.value
        status_data = {
            "id": worker.worker_id,
            "status": status_name,
            "task_count": worker.task_count,
            "error_count": worker.error_count,
            "last_task_at": worker.last_task_at,
        }
        # setdefault keeps a status outside the five pre-seeded buckets from
        # raising KeyError.
        workers_by_status.setdefault(status_name, []).append(status_data)

    return {
        "running": self.running,
        "total_workers": len(self.workers),
        "queue_size": self.task_queue.qsize(),
        "workers_by_status": workers_by_status,
    }
async def process_inpaint(self, **kwargs) -> Optional[np.ndarray]: async def process_inpaint(self, **kwargs) -> Optional[np.ndarray]:

View File

@ -15,14 +15,53 @@ from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.staticfiles import StaticFiles from fastapi.staticfiles import StaticFiles
from fastapi.responses import HTMLResponse from fastapi.responses import HTMLResponse
import uvicorn import uvicorn
from fastapi import APIRouter, Request
from ..core.worker_manager import worker_manager from ..core.worker_manager import worker_manager
from ..core.session_pool import session_pool from ..core.session_pool import session_pool
from ..utils.gpu_monitor import gpu_monitor from ..utils.gpu_monitor import gpu_monitor
from ..core.config import settings from ..core.config import settings
# main_app = None
# def init_monitoring(app: FastAPI):
# """모니터링 앱을 초기화하고 메인 앱 객체를 설정합니다."""
# global main_app
# main_app = app
# # lifespan에서 worker_manager와 session_pool이 app.state에 설정되도록 합니다.
# @app.on_event("startup")
# async def startup_event():
# if not hasattr(app.state, 'worker_manager') or not hasattr(app.state, 'session_pool'):
# # main.py의 lifespan에서 설정되므로, 여기서는 경고만 로깅
# logger.warning("worker_manager 또는 session_pool이 app.state에 설정되지 않았습니다.")
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# main.py에서 공유할 객체들 -> 이제 Request 객체를 통해 접근합니다.
# worker_manager = None
# session_pool = None
# def set_shared_objects(wm, sp):
# """메인 서버의 worker_manager와 session_pool을 설정합니다."""
# global worker_manager, session_pool
# worker_manager = wm
# session_pool = sp
def read_status_from_file():
    """Load the status snapshot from ``status.json`` in the working directory.

    Returns:
        The decoded JSON object. When the file is missing, unreadable, not
        valid JSON, or does not contain a JSON object, an empty placeholder
        snapshot is returned instead so callers can always use ``.get``.
    """
    fallback = {
        "worker_status": {"running": False, "total_workers": 0, "queue_size": 0, "workers_by_status": {}},
        "session_status": {},
        "timestamp": 0
    }
    try:
        with open("status.json", "r", encoding="utf-8") as f:
            status = json.load(f)
    except (OSError, ValueError):
        # OSError covers FileNotFoundError and other read failures; ValueError
        # covers json.JSONDecodeError (e.g. a half-written file) and
        # UnicodeDecodeError.
        return fallback
    # A payload such as a list or null would break the callers' .get() calls.
    return status if isinstance(status, dict) else fallback
# API 라우터 생성
api_router = APIRouter()
# 모니터링 앱 생성 # 모니터링 앱 생성
monitor_app = FastAPI( monitor_app = FastAPI(
title="인페인팅 서버 모니터링 대시보드", title="인페인팅 서버 모니터링 대시보드",
@ -49,45 +88,52 @@ class MonitoringData:
self.alerts = [] self.alerts = []
async def collect_data(self) -> Dict[str, Any]: async def collect_data(self) -> Dict[str, Any]:
"""현재 시스템 상태 데이터를 수집합니다.""" """주기적으로 서버 상태 데이터를 수집합니다."""
timestamp = datetime.now().isoformat() status = read_status_from_file()
worker_status = status.get("worker_status", {})
session_status = status.get("session_status", {})
timestamp = status.get("timestamp", 0)
# GPU 정보 # 워커 매니저 상태 (안전하게 가져오기)
gpu_info = gpu_monitor.get_gpu_memory_info() try:
gpu_utilization = gpu_monitor.get_gpu_utilization() worker_status = worker_manager.get_status() if worker_manager else self._get_default_worker_status()
except Exception as e:
logger.warning(f"워커 매니저 상태 조회 실패: {e}")
worker_status = self._get_default_worker_status()
# 시스템 메모리 정보 # 세션 풀 상태 (안전하게 가져오기)
system_memory = gpu_monitor.get_system_memory_info() try:
session_status = session_pool.get_status() if session_pool else self._get_default_session_status()
except Exception as e:
logger.warning(f"세션 풀 상태 조회 실패: {e}")
session_status = self._get_default_session_status()
# 시스템 성능 지표 # Jetson 전용 정보 (안전하게 가져오기)
system_performance = self._get_system_performance()
# 워커 매니저 상태
worker_status = worker_manager.get_status()
# 세션 풀 상태
session_status = await session_pool.get_pool_status()
# Jetson 전용 정보
jetson_info = {} jetson_info = {}
if settings.IS_JETSON: if settings.IS_JETSON:
jetson_info = gpu_monitor.get_jetson_specific_info() try:
jetson_info = gpu_monitor.get_jetson_specific_info()
if jetson_info is None:
jetson_info = {}
except Exception as e:
logger.warning(f"Jetson 전용 정보 조회 실패: {e}")
jetson_info = {}
# API 통계 # API 통계
api_stats = self._get_api_statistics() api_stats = self._get_api_statistics()
# 알림 및 경고 # 알림 및 경고
alerts = self._check_alerts(gpu_info, system_memory, worker_status) alerts = self._check_alerts(worker_status)
data = { data = {
"timestamp": timestamp, "timestamp": datetime.now().isoformat(),
"system_type": "Jetson Xavier" if settings.IS_JETSON else "x86_64", "system_type": "Jetson Xavier" if settings.IS_JETSON else "x86_64",
"gpu": { "gpu": {
**gpu_info, **gpu_monitor.get_gpu_memory_info(),
"utilization": gpu_utilization "utilization": gpu_monitor.get_gpu_utilization()
}, },
"system_memory": system_memory, "system_memory": gpu_monitor.get_system_memory_info(),
"system_performance": system_performance, "system_performance": self._get_system_performance(),
"workers": worker_status, "workers": worker_status,
"sessions": session_status, "sessions": session_status,
"jetson": jetson_info, "jetson": jetson_info,
@ -168,36 +214,11 @@ class MonitoringData:
"recent_errors": self.api_stats["errors"][-5:] # 최근 5개 에러 "recent_errors": self.api_stats["errors"][-5:] # 최근 5개 에러
} }
def _check_alerts(self, gpu_info: Dict, system_memory: Dict, worker_status: Dict) -> List[Dict]: def _check_alerts(self, worker_status: Dict) -> List[Dict]:
"""시스템 상태를 확인하고 알림을 생성합니다.""" """시스템 상태를 확인하고 알림을 생성합니다."""
alerts = [] alerts = []
current_time = datetime.now() current_time = datetime.now()
# GPU 메모리 경고
if gpu_info.get("usage_percent", 0) > 90:
alerts.append({
"level": "critical",
"message": f"GPU 메모리 사용률이 높습니다: {gpu_info.get('usage_percent', 0):.1f}%",
"timestamp": current_time.isoformat(),
"category": "gpu"
})
elif gpu_info.get("usage_percent", 0) > 80:
alerts.append({
"level": "warning",
"message": f"GPU 메모리 사용률이 높습니다: {gpu_info.get('usage_percent', 0):.1f}%",
"timestamp": current_time.isoformat(),
"category": "gpu"
})
# 시스템 메모리 경고
if system_memory.get("usage_percent", 0) > 90:
alerts.append({
"level": "critical",
"message": f"시스템 메모리 사용률이 높습니다: {system_memory.get('usage_percent', 0):.1f}%",
"timestamp": current_time.isoformat(),
"category": "memory"
})
# 워커 상태 경고 # 워커 상태 경고
if worker_status.get("active_workers", 0) == 0: if worker_status.get("active_workers", 0) == 0:
alerts.append({ alerts.append({
@ -272,6 +293,22 @@ class MonitoringData:
"data_points": len(recent_data) "data_points": len(recent_data)
} }
def _get_default_worker_status(self):
return {
"total_workers": 0,
"queue_size": 0,
"workers_by_status": {"idle": [], "busy": [], "starting": [], "stopping": [], "error": []},
"running": False
}
def _get_default_session_status(self):
return {
"simple-lama": {"total": 0, "in_use": 0, "available": 0},
"migan": {"total": 0, "in_use": 0, "available": 0},
"rembg": {"total": 0, "in_use": 0, "available": 0}
}
# 전역 모니터링 데이터 인스턴스 # 전역 모니터링 데이터 인스턴스
monitoring_data = MonitoringData() monitoring_data = MonitoringData()
@ -835,6 +872,112 @@ async def dashboard():
return HTMLResponse(content=HTML_TEMPLATE) return HTMLResponse(content=HTML_TEMPLATE)
@api_router.get("/status")
async def get_status():
    """Return the server status snapshot collected by ``monitoring_data``."""
    snapshot = await monitoring_data.collect_data()
    return snapshot
@api_router.get("/simple")
def get_simple_status():
    """Return a lightweight snapshot: timestamp, system type, CPU and memory usage.

    Any failure is reported as an ``{"error": ...}`` payload instead of being raised.
    """
    try:
        import psutil

        now = time.time()
        system_type = "Jetson Xavier" if settings.IS_JETSON else "x86_64"
        cpu_percent = psutil.cpu_percent()
        memory_percent = psutil.virtual_memory().percent
    except Exception as e:
        message = f"간단한 상태 수집 실패: {str(e)}"
        return {"error": message}

    return {
        "timestamp": now,
        "system_type": system_type,
        "cpu_percent": cpu_percent,
        "memory_percent": memory_percent,
        "status": "running"
    }
@api_router.get("/test_data")
async def get_test_data():
    """Return randomly generated dummy data shaped like a status snapshot.

    Related fields are derived from one random draw so they agree with each
    other: used + free equals total, in_use + available equals the pool total,
    successful + failed equals total requests, and exactly ``total_workers``
    workers are listed. Previously each field was drawn independently, so the
    payload could contradict itself.
    """
    import random

    def _pool(total):
        # Draw in_use once and derive available from it.
        in_use = random.randint(0, total)
        return {"total": total, "in_use": in_use, "available": total - in_use}

    gpu_total = 8.0
    gpu_used = round(random.uniform(0.5, 2.0), 2)
    mem_total = 30.26
    mem_used = round(random.uniform(10, 15), 2)

    # worker_2 is always listed: busy half of the time, idle otherwise.
    idle_workers = [{"id": "worker_1", "task_count": random.randint(10, 50)}]
    busy_workers = []
    worker_2_tasks = random.randint(5, 30)
    if random.random() > 0.5:
        busy_workers.append({"id": "worker_2", "current_task": "inpainting", "task_count": worker_2_tasks})
    else:
        idle_workers.append({"id": "worker_2", "task_count": worker_2_tasks})

    total_requests = random.randint(100, 500)
    failed_requests = random.randint(0, 10)
    successful_requests = total_requests - failed_requests
    response_times = [round(random.uniform(0.1, 2.0), 2) for _ in range(10)]

    return {
        "timestamp": datetime.now().isoformat(),
        "system_type": "Jetson Xavier",
        "gpu": {
            "total": gpu_total,
            "used": gpu_used,
            "free": round(gpu_total - gpu_used, 2),
            "usage_percent": round(gpu_used / gpu_total * 100, 1),
            "utilization": round(random.uniform(0, 15), 1),
            "temperature": round(random.uniform(35, 45), 1),
            "clock_speed": random.randint(1100, 1300)
        },
        "system_memory": {
            "total": mem_total,
            "used": mem_used,
            "free": round(mem_total - mem_used, 2),
            "usage_percent": round(mem_used / mem_total * 100, 1)
        },
        "system_performance": {
            "cpu_percent": round(random.uniform(5, 20), 1),
            "cpu_count": 8,
            "cpu_freq": {"current": 2266, "min": 1190, "max": 2265},
            "load_avg": [round(random.uniform(0.1, 1.0), 2) for _ in range(3)],
            "disk_io": {
                "read_mb": random.randint(10, 100),
                "write_mb": random.randint(5, 50),
                "read_count": random.randint(100, 1000),
                "write_count": random.randint(50, 500)
            },
            "net_io": {
                "sent_mb": random.randint(1, 10),
                "recv_mb": random.randint(1, 10),
                "sent_packets": random.randint(100, 1000),
                "recv_packets": random.randint(100, 1000)
            },
            "process_count": random.randint(300, 400)
        },
        "workers": {
            "total_workers": len(idle_workers) + len(busy_workers),
            "queue_size": random.randint(0, 5),
            "workers_by_status": {
                "idle": idle_workers,
                "busy": busy_workers,
                "starting": [],
                "stopping": [],
                "error": []
            },
            "running": True
        },
        "sessions": {
            "simple_lama": _pool(2),
            "migan": _pool(2),
            "rembg": _pool(1)
        },
        "api_stats": {
            "total_requests": total_requests,
            "successful_requests": successful_requests,
            "failed_requests": failed_requests,
            "response_times": response_times,
            "success_rate": round(successful_requests / total_requests * 100, 1),
            "avg_response_time": round(sum(response_times) / len(response_times), 2),
            "errors": []
        },
        "alerts": ["정보: 모니터링 데이터를 수집 중입니다..."] if random.random() > 0.7 else []
    }
@api_router.get("/history")
async def get_history():
    """Return the recorded data history together with its statistics."""
    payload = {}
    payload["history"] = monitoring_data.get_history()
    payload["statistics"] = monitoring_data.get_statistics()
    return payload
@api_router.get("/worker-status")
def get_worker_status_api():
    """Return the ``worker_status`` section of the snapshot read from status.json."""
    snapshot = read_status_from_file()
    worker_section = snapshot.get("worker_status", {})
    return worker_section
@api_router.get("/session-status")
def get_session_status_api():
    """Return the ``session_status`` section of the snapshot read from status.json."""
    snapshot = read_status_from_file()
    session_section = snapshot.get("session_status", {})
    return session_section
# FastAPI 앱에 라우터 포함
monitor_app.include_router(api_router, prefix="/api")
# WebSocket 핸들러
@monitor_app.websocket("/ws") @monitor_app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket): async def websocket_endpoint(websocket: WebSocket):
"""WebSocket 연결을 처리합니다.""" """WebSocket 연결을 처리합니다."""
@ -843,34 +986,22 @@ async def websocket_endpoint(websocket: WebSocket):
try: try:
while True: while True:
# 클라이언트로부터 메시지 대기 (연결 유지용) # 주기적으로 데이터 전송
await websocket.receive_text() data = await monitoring_data.collect_data()
await websocket.send_json(data)
await asyncio.sleep(2) # 2초마다 업데이트
except WebSocketDisconnect: except WebSocketDisconnect:
connected_clients.remove(websocket) connected_clients.remove(websocket)
logger.info("클라이언트 연결 해제") logger.info("클라이언트 연결 해제")
@monitor_app.get("/api/status")
async def get_current_status():
"""현재 상태를 JSON으로 반환합니다."""
return await monitoring_data.collect_data()
@monitor_app.get("/api/history")
async def get_history():
"""데이터 히스토리를 반환합니다."""
return {
"history": monitoring_data.get_history(),
"statistics": monitoring_data.get_statistics()
}
async def broadcast_data(): async def broadcast_data():
"""연결된 모든 클라이언트에게 데이터를 브로드캐스트합니다.""" """연결된 모든 클라이언트에게 데이터를 브로드캐스트합니다."""
while True: while True:
try: try:
if connected_clients: if connected_clients:
data = await monitoring_data.collect_data() data = await monitoring_data.collect_data() # WebSocket 연결이 없으므로 None 전달
message = json.dumps(data, ensure_ascii=False) message = json.dumps(data, ensure_ascii=False)
# 연결이 끊어진 클라이언트 제거 # 연결이 끊어진 클라이언트 제거

View File

@ -1,38 +1,62 @@
INFO: Started server process [299962] INFO: Started server process [396102]
2025-08-27 15:17:30,514 - uvicorn.error - INFO - Started server process [299962] 2025-08-27 21:19:49,229 - uvicorn.error - INFO - Started server process [396102]
INFO: Waiting for application startup. INFO: Waiting for application startup.
2025-08-27 15:17:30,514 - uvicorn.error - INFO - Waiting for application startup. 2025-08-27 21:19:49,230 - uvicorn.error - INFO - Waiting for application startup.
2025-08-27 15:17:30,515 - main - INFO - 🚀 인페인팅 서버 시작 중... 2025-08-27 21:19:49,231 - main - INFO - 🚀 인페인팅 서버 시작 중...
2025-08-27 15:17:30,515 - app.core.session_pool - INFO - Initializing session pools... 2025-08-27 21:19:49,231 - main - INFO - ✅ 공유 객체를 app.state에 저장 완료
2025-08-27 15:17:30,515 - app.core.session_pool - INFO - Initializing 2 sessions for simple_lama 2025-08-27 21:19:49,231 - app.core.session_pool - INFO - Initializing session pools...
2025-08-27 15:17:30,616 - app.core.session_pool - INFO - Created session simple_lama_0 2025-08-27 21:19:49,232 - app.core.session_pool - INFO - Initializing 2 sessions for simple_lama
2025-08-27 15:17:30,717 - app.core.session_pool - INFO - Created session simple_lama_1 2025-08-27 21:19:49,232 - main - WARNING - 상태 저장 실패: 'list' object has no attribute 'model_type'
2025-08-27 15:17:30,718 - app.core.session_pool - INFO - Initializing 2 sessions for migan 2025-08-27 21:19:49,333 - app.core.session_pool - INFO - Created session simple_lama_0
2025-08-27 15:17:30,818 - app.core.session_pool - INFO - Created session migan_0 2025-08-27 21:19:49,435 - app.core.session_pool - INFO - Created session simple_lama_1
2025-08-27 15:17:30,920 - app.core.session_pool - INFO - Created session migan_1 2025-08-27 21:19:49,436 - app.core.session_pool - INFO - Initializing 2 sessions for migan
2025-08-27 15:17:30,920 - app.core.session_pool - INFO - Initializing 1 sessions for rembg 2025-08-27 21:19:49,537 - app.core.session_pool - INFO - Created session migan_0
2025-08-27 15:17:31,021 - app.core.session_pool - INFO - Created session rembg_0 2025-08-27 21:19:49,639 - app.core.session_pool - INFO - Created session migan_1
2025-08-27 15:17:31,021 - app.core.session_pool - INFO - Session pools initialized successfully 2025-08-27 21:19:49,639 - app.core.session_pool - INFO - Initializing 1 sessions for rembg
2025-08-27 15:17:31,021 - main - INFO - ✅ 세션 풀 초기화 완료 2025-08-27 21:19:49,741 - app.core.session_pool - INFO - Created session rembg_0
2025-08-27 15:17:31,022 - app.core.worker_manager - INFO - Starting worker manager... 2025-08-27 21:19:49,741 - app.core.session_pool - INFO - Session pools initialized successfully
2025-08-27 15:17:31,022 - app.core.worker_manager - INFO - Worker manager started with 1 workers 2025-08-27 21:19:49,741 - main - INFO - ✅ 세션 풀 초기화 완료
2025-08-27 15:17:31,022 - main - INFO - ✅ 워커 매니저 시작 완료 2025-08-27 21:19:49,742 - app.core.worker_manager - INFO - Starting worker manager...
2025-08-27 15:17:31,023 - main - INFO - 🎉 인페인팅 서버 시작 완료! 2025-08-27 21:19:49,742 - app.core.worker_manager - INFO - Worker manager started with 1 workers
2025-08-27 21:19:49,743 - main - INFO - ✅ 워커 매니저 시작 완료
2025-08-27 21:19:49,743 - main - INFO - 🎉 인페인팅 서버 시작 완료!
INFO: Application startup complete. INFO: Application startup complete.
2025-08-27 15:17:31,023 - uvicorn.error - INFO - Application startup complete. 2025-08-27 21:19:49,743 - uvicorn.error - INFO - Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
2025-08-27 15:17:31,025 - uvicorn.error - INFO - Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) 2025-08-27 21:19:49,745 - uvicorn.error - INFO - Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
INFO: 127.0.0.1:47618 - "GET /health HTTP/1.1" 200 OK 2025-08-27 21:19:50,233 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
INFO: 127.0.0.1:48780 - "GET /health HTTP/1.1" 200 OK INFO: 127.0.0.1:51092 - "GET /health HTTP/1.1" 200 OK
INFO: 127.0.0.1:48790 - "GET /api/v1/server-config HTTP/1.1" 200 OK 2025-08-27 21:19:51,234 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
INFO: 127.0.0.1:48792 - "GET /api/v1/samplers HTTP/1.1" 200 OK 2025-08-27 21:19:52,236 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 15:17:45,959 - app.core.worker_manager - ERROR - 인페인팅 처리 실패: cannot import name 'SimpleLamaModel' from 'app.models.simple_lama' (/home/ckh08045/work/inpaintServer/./app/models/simple_lama.py) 2025-08-27 21:19:53,238 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
INFO: 127.0.0.1:48798 - "POST /api/v1/inpaint HTTP/1.1" 500 Internal Server Error 2025-08-27 21:19:54,241 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 15:17:45,986 - app.core.worker_manager - ERROR - 인페인팅 처리 실패: cannot import name 'SimpleLamaModel' from 'app.models.simple_lama' (/home/ckh08045/work/inpaintServer/./app/models/simple_lama.py) 2025-08-27 21:19:55,242 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
INFO: 127.0.0.1:48800 - "POST /api/v1/inpaint HTTP/1.1" 500 Internal Server Error 2025-08-27 21:19:56,244 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 15:17:46,007 - app.core.worker_manager - ERROR - 배경 제거 처리 실패: cannot import name 'RembgModel' from 'app.models.rembg_model' (/home/ckh08045/work/inpaintServer/./app/models/rembg_model.py) 2025-08-27 21:19:57,245 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 15:17:46,008 - app.api.endpoints - ERROR - 배경 제거 처리 실패: cannot unpack non-iterable NoneType object 2025-08-27 21:19:58,247 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
INFO: 127.0.0.1:48808 - "POST /api/v1/remove_bg HTTP/1.1" 500 Internal Server Error INFO: 127.0.0.1:51116 - "GET /docs HTTP/1.1" 200 OK
2025-08-27 15:17:46,031 - app.core.worker_manager - ERROR - 배경 제거 처리 실패: cannot import name 'RembgModel' from 'app.models.rembg_model' (/home/ckh08045/work/inpaintServer/./app/models/rembg_model.py) INFO: 127.0.0.1:51116 - "GET /openapi.json HTTP/1.1" 200 OK
2025-08-27 15:17:46,032 - app.api.endpoints - ERROR - 플러그인 이미지 생성 실패: cannot unpack non-iterable NoneType object 2025-08-27 21:19:59,259 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
INFO: 127.0.0.1:48818 - "POST /api/v1/run_plugin_gen_image HTTP/1.1" 500 Internal Server Error 2025-08-27 21:20:00,260 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:01,262 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:02,264 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:03,266 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:04,268 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:05,270 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:06,271 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:07,274 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:08,275 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:09,277 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:10,278 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:11,280 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:12,281 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:13,282 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:14,284 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:15,285 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:16,286 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:17,288 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:18,290 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:19,293 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:20,294 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:21,296 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'
2025-08-27 21:20:22,298 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id'

View File

@ -1 +1 @@
299962 396102

View File

@ -1,18 +1,20 @@
INFO: Started server process [300005] INFO: Started server process [396175]
INFO: Waiting for application startup. INFO: Waiting for application startup.
Fan control not available Fan control not available
INFO: Application startup complete. INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8001 (Press CTRL+C to quit) INFO: Uvicorn running on http://0.0.0.0:8001 (Press CTRL+C to quit)
INFO: 127.0.0.1:47362 - "GET / HTTP/1.1" 200 OK INFO: 127.0.0.1:51590 - "GET / HTTP/1.1" 200 OK
INFO: 127.0.0.1:52036 - "GET / HTTP/1.1" 200 OK INFO: 127.0.0.1:43898 - "GET / HTTP/1.1" 200 OK
INFO: 127.0.0.1:52036 - "GET /favicon.ico HTTP/1.1" 404 Not Found INFO: ('127.0.0.1', 43930) - "WebSocket /ws" [accepted]
INFO: ('127.0.0.1', 52060) - "WebSocket /ws" [accepted] 세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type'
INFO: connection open INFO: connection open
INFO: 127.0.0.1:56700 - "GET / HTTP/1.1" 200 OK 세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type'
INFO: ('127.0.0.1', 56722) - "WebSocket /ws" [accepted] 세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type'
INFO: connection open 세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type'
INFO: connection closed 세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type'
INFO: 127.0.0.1:33852 - "GET / HTTP/1.1" 200 OK 세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type'
INFO: ('127.0.0.1', 33866) - "WebSocket /ws" [accepted] 세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type'
INFO: connection open 세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type'
INFO: connection closed 세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type'
세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type'
세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type'

View File

@ -1 +1 @@
300005 396175

161
main.py
View File

@ -5,6 +5,8 @@ iopaint와 호환되는 API를 제공합니다.
""" """
import time import time
import logging import logging
import json
import asyncio
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from fastapi import FastAPI from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
@ -27,6 +29,20 @@ logger = logging.getLogger(__name__)
# 서버 시작 시간 기록 # 서버 시작 시간 기록
start_time = time.time() start_time = time.time()
async def save_status_periodically():
    """Write the worker and session status to ``status.json`` once per second.

    Runs until the task is cancelled. A failed iteration is logged as a
    warning and retried on the next tick.
    """
    import os  # local import: only needed for the atomic rename below

    status_path = "status.json"
    tmp_path = status_path + ".tmp"
    while True:
        try:
            status = {
                "worker_status": worker_manager.get_status(),
                "session_status": session_pool.get_status(),
                "timestamp": time.time()
            }
            # Write to a temporary file and rename it into place. A reader in
            # another process then sees either the old or the new snapshot,
            # never a truncated one.
            with open(tmp_path, "w", encoding="utf-8") as f:
                json.dump(status, f)
            os.replace(tmp_path, status_path)
        except Exception as e:
            logger.warning(f"상태 저장 실패: {e}")
        await asyncio.sleep(1)
@asynccontextmanager @asynccontextmanager
async def lifespan(app: FastAPI): async def lifespan(app: FastAPI):
@ -34,6 +50,14 @@ async def lifespan(app: FastAPI):
# 시작 시 # 시작 시
logger.info("🚀 인페인팅 서버 시작 중...") logger.info("🚀 인페인팅 서버 시작 중...")
# app.state에 공유 객체 저장
app.state.worker_manager = worker_manager
app.state.session_pool = session_pool
logger.info("✅ 공유 객체를 app.state에 저장 완료")
# 상태 저장 백그라운드 작업 시작
status_task = asyncio.create_task(save_status_periodically())
try: try:
# 세션 풀 초기화 # 세션 풀 초기화
await session_pool.initialize() await session_pool.initialize()
@ -54,6 +78,9 @@ async def lifespan(app: FastAPI):
# 종료 시 # 종료 시
logger.info("🛑 인페인팅 서버 종료 중...") logger.info("🛑 인페인팅 서버 종료 중...")
# 상태 저장 백그라운드 작업 취소
status_task.cancel()
try: try:
# 워커 매니저 중지 # 워커 매니저 중지
await worker_manager.stop() await worker_manager.stop()
@ -85,8 +112,138 @@ app.add_middleware(
# API 라우터 포함 # API 라우터 포함
app.include_router(router) app.include_router(router)
# 모니터링 대시보드 마운트 # 모니터링은 start_server.sh를 통해 독립적으로 실행됩니다.
app.mount("/monitoring", monitor_app, name="monitoring") # app.mount("/monitoring", monitor_app, name="monitoring")
# 모니터링 데이터 직접 제공 (완전 통합)
@app.get("/monitoring/api/status")
async def get_monitoring_status():
    """Return system, GPU, worker, session and API figures in one payload.

    Worker and session sections fall back to zeroed defaults when their
    collection fails. Any other failure is logged and reported as an
    ``{"error": ...}`` payload instead of being raised.
    """
    try:
        import psutil
        from app.utils.gpu_monitor import GPUMonitor

        # System figures
        cpu_percent = psutil.cpu_percent()
        memory = psutil.virtual_memory()
        process_count = len(psutil.pids())

        # GPU figures
        # NOTE(review): assumes GPUMonitor exposes an awaitable get_gpu_info();
        # the dashboard code calls get_gpu_memory_info() instead. Confirm.
        gpu_monitor = GPUMonitor()
        gpu_info = await gpu_monitor.get_gpu_info()

        # Worker manager state
        try:
            # WorkerManager.get_status() reads its queue as `task_queue`;
            # looking up only `queue` always reported a size of 0. `queue` is
            # kept as a fallback.
            task_queue = getattr(worker_manager, 'task_queue', None)
            if task_queue is None:
                task_queue = getattr(worker_manager, 'queue', None)
            workers = getattr(worker_manager, 'workers', None)
            worker_status = {
                "total_workers": len(workers) if workers else 0,
                "queue_size": task_queue.qsize() if task_queue is not None else 0,
                "workers_by_status": {"idle": [], "busy": [], "starting": [], "stopping": [], "error": []},
                "running": getattr(worker_manager, 'running', False)
            }
        except Exception as e:
            logger.warning(f"워커 상태 수집 실패: {e}")
            worker_status = {
                "total_workers": 0,
                "queue_size": 0,
                "workers_by_status": {"idle": [], "busy": [], "starting": [], "stopping": [], "error": []},
                "running": False
            }

        # Session pool state
        try:
            # The pool has no `sessions` attribute and its sessions carry
            # `in_use`, not `available`, so the old lookup always counted 0.
            # Derive the totals from the pool's own per-model report.
            per_model = session_pool.get_status()
            model_distribution = {"simple-lama": 0, "migan": 0, "rembg": 0}
            for model_name, counts in per_model.items():
                # presumably the pool reports "simple_lama"; map it onto this
                # endpoint's hyphenated key. Verify against clients.
                model_distribution[model_name.replace("_", "-")] = counts.get("total", 0)
            session_status = {
                "total_sessions": sum(c.get("total", 0) for c in per_model.values()),
                "available_sessions": sum(c.get("available", 0) for c in per_model.values()),
                "model_distribution": model_distribution
            }
        except Exception as e:
            logger.warning(f"세션 상태 수집 실패: {e}")
            session_status = {
                "total_sessions": 0,
                "available_sessions": 0,
                "model_distribution": {"simple-lama": 0, "migan": 0, "rembg": 0}
            }

        # API statistics (placeholder values; a real request counter is not wired in here)
        api_stats = {
            "total_requests": 0,
            "success_rate": 100.0,
            "average_response_time": 0.0,
            "error_count": 0
        }

        return {
            "timestamp": time.time(),
            "system_type": "Jetson Xavier" if settings.IS_JETSON else "x86_64",
            "system": {
                "cpu_percent": cpu_percent,
                "memory_percent": memory.percent,
                "process_count": process_count
            },
            "gpu": gpu_info,
            "worker_status": worker_status,
            "session_status": session_status,
            "api_stats": api_stats
        }
    except Exception as e:
        logger.error(f"모니터링 데이터 수집 실패: {e}")
        return {"error": f"모니터링 데이터 수집 실패: {str(e)}"}
@app.get("/monitoring/api/simple")
async def get_simple_monitoring():
    """Return a lightweight snapshot: timestamp, system type, CPU and memory usage.

    Any failure is reported as an ``{"error": ...}`` payload instead of being raised.
    """
    try:
        import psutil

        now = time.time()
        system_type = "Jetson Xavier" if settings.IS_JETSON else "x86_64"
        cpu_percent = psutil.cpu_percent()
        memory_percent = psutil.virtual_memory().percent
    except Exception as e:
        message = f"간단한 상태 수집 실패: {str(e)}"
        return {"error": message}

    return {
        "timestamp": now,
        "system_type": system_type,
        "cpu_percent": cpu_percent,
        "memory_percent": memory_percent,
        "status": "running"
    }
@app.get("/monitoring/api/worker-status")
async def get_worker_status():
    """Return worker count, queue size and running state of the worker manager.

    A failure is logged and reported as an ``{"error": ...}`` payload
    instead of being raised.
    """
    try:
        workers = getattr(worker_manager, 'workers', {})
        # WorkerManager.get_status() reads its queue as `task_queue`; looking
        # up only `queue` always reported a size of 0. `queue` is kept as a
        # fallback.
        queue = getattr(worker_manager, 'task_queue', None)
        if queue is None:
            queue = getattr(worker_manager, 'queue', None)
        running = getattr(worker_manager, 'running', False)

        return {
            "total_workers": len(workers) if workers else 0,
            "queue_size": queue.qsize() if queue is not None else 0,
            "running": running,
            "status": "active" if running else "stopped"
        }
    except Exception as e:
        logger.warning(f"워커 상태 조회 실패: {e}")
        return {"error": f"워커 상태 조회 실패: {str(e)}"}
@app.get("/monitoring/api/session-status")
async def get_session_status():
    """Return total and available session counts plus a per-model distribution.

    A failure is logged and reported as an ``{"error": ...}`` payload
    instead of being raised.
    """
    try:
        # The pool has no `sessions` attribute and its sessions carry
        # `in_use`, not `available`, so the old lookup always counted 0.
        # Derive the totals from the pool's own per-model report.
        per_model = session_pool.get_status()

        model_distribution = {"simple-lama": 0, "migan": 0, "rembg": 0}
        for model_name, counts in per_model.items():
            # presumably the pool reports "simple_lama"; map it onto this
            # endpoint's hyphenated key. Verify against clients.
            model_distribution[model_name.replace("_", "-")] = counts.get("total", 0)

        return {
            "total_sessions": sum(c.get("total", 0) for c in per_model.values()),
            "available_sessions": sum(c.get("available", 0) for c in per_model.values()),
            "model_distribution": model_distribution
        }
    except Exception as e:
        logger.warning(f"세션 상태 조회 실패: {e}")
        return {"error": f"세션 상태 조회 실패: {str(e)}"}
if __name__ == "__main__": if __name__ == "__main__":

120
monitoring_debug.json Normal file
View File

@ -0,0 +1,120 @@
{
"timestamp": "2025-08-27T19:18:15.546032",
"system_type": "Jetson Xavier",
"gpu": {
"total": 8.0,
"used": 0.0,
"free": 8.0,
"usage_percent": 0.0,
"utilization": 0.0
},
"system_memory": {
"total": 30.26,
"used": 11.77,
"free": 1.96,
"usage_percent": 40.3
},
"system_performance": {
"cpu": {
"usage_percent": 7.0,
"count": 8,
"frequency_mhz": 1190.4,
"load_average": {
"1min": 0.72,
"5min": 0.82,
"15min": 0.8
}
},
"disk": {
"read_bytes": 18622774784,
"write_bytes": 8354913792,
"read_count": 299549,
"write_count": 245067
},
"network": {
"bytes_sent": 1438356649,
"bytes_recv": 2090199696,
"packets_sent": 1124215,
"packets_recv": 2485680
},
"processes": 315
},
"workers": {
"total_workers": 0,
"queue_size": 0,
"workers_by_status": {
"idle": [],
"busy": [],
"starting": [],
"stopping": [],
"error": []
},
"gpu_info": {
"total": 8.0,
"used": 0.0,
"free": 8.0,
"usage_percent": 0.0
},
"system_memory": {
"total": 30.26,
"used": 11.77,
"free": 1.96,
"usage_percent": 40.3
},
"running": false
},
"sessions": {
"simple_lama": {
"total": 0,
"in_use": 0,
"available": 0,
"sessions": []
},
"migan": {
"total": 0,
"in_use": 0,
"available": 0,
"sessions": []
},
"rembg": {
"total": 0,
"in_use": 0,
"available": 0,
"sessions": []
}
},
"jetson": {
"gpu_frequency": null,
"cpu_frequency": null,
"memory_frequency": null,
"temperature": {
"zone_zone2": 36.0,
"zone_zone0": 39.0,
"zone_zone7": 37.5,
"zone_zone5": 37.0,
"zone_zone3": 36.0,
"zone_zone1": 39.0,
"zone_zone6": 41.0,
"zone_zone4": 50.0
},
"power_consumption": null,
"power_mode": "MAXN"
},
"api_stats": {
"total_requests": 0,
"successful_requests": 0,
"failed_requests": 0,
"success_rate": 0.0,
"endpoint_usage": {},
"average_response_time": 0,
"recent_errors": []
},
"alerts": [
{
"level": "critical",
"message": "활성 워커가 없습니다",
"timestamp": "2025-08-27T19:18:17.118052",
"category": "workers"
}
]
}

1
status.json Normal file
View File

@ -0,0 +1 @@
{"worker_status": {"running": true, "total_workers": 1, "queue_size": 0, "workers_by_status": {"idle": [{"id": "worker_d5e6bd0a", "status": "idle", "task_count": 0, "error_count": 0, "last_task_at": null}], "busy": [], "starting": [], "stopping": [], "error": []}}, "session_status": {"simple_lama": {"total": 2, "in_use": 0, "available": 2}, "migan": {"total": 2, "in_use": 0, "available": 2}, "rembg": {"total": 1, "in_use": 0, "available": 1}}, "api_stats": {"total_requests": 3, "successful_requests": 2, "failed_requests": 1, "endpoint_usage": {"/": 1, "/health": 1, "/api/v1/model": 1}, "start_time": 1756296888.0927753, "uptime_seconds": 253.7529318332672, "average_response_time_ms": 2.473115921020508}, "timestamp": 1756297141.8458862}

49
test_monitoring.py Normal file
View File

@ -0,0 +1,49 @@
#!/usr/bin/env python3
"""
모니터링 데이터 수집 테스트
"""
import asyncio
import json
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from app.monitoring.dashboard import monitoring_data
async def test_monitoring():
    """Collect one monitoring snapshot, print a summary and dump it to monitoring_debug.json.

    Any exception is printed together with its traceback instead of being raised.
    """
    # (snapshot key, label printed in front of its value)
    summary_fields = (
        ('system_type', "🖥️ 시스템 타입"),
        ('gpu', "🎮 GPU 정보"),
        ('workers', "⚙️ 워커 상태"),
        ('sessions', "🔗 세션 풀"),
    )
    try:
        print("모니터링 데이터 수집 테스트 시작...")

        data = await monitoring_data.collect_data()

        print("✅ 데이터 수집 성공!")
        print(f"📊 데이터 키들: {list(data.keys())}")

        # Print only the sections that are present in the snapshot.
        for key, label in summary_fields:
            if key in data:
                print(f"{label}: {data[key]}")

        # Keep a JSON copy for debugging.
        with open('monitoring_debug.json', 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False, default=str)
        print("💾 디버그 데이터가 monitoring_debug.json에 저장되었습니다.")

    except Exception as e:
        print(f"❌ 모니터링 데이터 수집 실패: {e}")
        import traceback
        traceback.print_exc()
if __name__ == "__main__":
asyncio.run(test_monitoring())