diff --git a/app/core/session_pool.py b/app/core/session_pool.py index a20c4d4..0a9755b 100644 --- a/app/core/session_pool.py +++ b/app/core/session_pool.py @@ -221,6 +221,25 @@ class SessionPool: self.pool_sizes[model_type] = new_size + def get_status(self) -> dict: + """세션 풀의 현재 상태를 반환합니다.""" + status_by_model = {} + + all_sessions = list(self.pools.values()) # Flatten all sessions from all models + + for model_type in ModelType: + model_sessions = [s for s in all_sessions if s.model_type == model_type] + in_use_count = sum(1 for s in model_sessions if s.in_use) + available_count = len(model_sessions) - in_use_count + + status_by_model[model_type.value] = { + "total": len(model_sessions), + "in_use": in_use_count, + "available": available_count + } + + return status_by_model + # 전역 세션 풀 인스턴스 session_pool = SessionPool() diff --git a/app/core/worker_manager.py b/app/core/worker_manager.py index 940696b..fe75301 100644 --- a/app/core/worker_manager.py +++ b/app/core/worker_manager.py @@ -95,7 +95,7 @@ class WorkerManager: await self.monitor_task except asyncio.CancelledError: pass - + # 모든 워커 중지 await self._stop_all_workers() @@ -103,7 +103,7 @@ class WorkerManager: self.executor.shutdown(wait=True) logger.info("Worker manager stopped") - + async def submit_task(self, task_func: Callable, *args, **kwargs) -> Any: """태스크를 워커에게 제출합니다.""" task_id = str(uuid.uuid4()) @@ -300,29 +300,31 @@ class WorkerManager: self.workers.clear() - def get_status(self) -> Dict[str, Any]: - """워커 매니저 상태를 반환합니다.""" - workers_by_status = {} - for status in WorkerStatus: - workers_by_status[status.value] = [ - { - "id": w.worker_id, - "created_at": w.created_at, - "last_task_at": w.last_task_at, - "current_task": w.current_task, - "task_count": w.task_count, - "error_count": w.error_count - } - for w in self.workers.values() if w.status == status - ] + def get_status(self) -> dict: + """워커 매니저의 현재 상태를 반환합니다.""" + workers_by_status = { + "idle": [], + "busy": [], + "starting": [], + "stopping": [], + "error": [] + } + + for worker in self.workers: + status_data = { + "id": worker.worker_id, + "status": worker.status.value, + "task_count": worker.task_count, + "error_count": worker.error_count, + "last_task_at": worker.last_task_at + } + workers_by_status[worker.status.value].append(status_data) return { + "running": self.running, "total_workers": len(self.workers), "queue_size": self.task_queue.qsize(), - "workers_by_status": workers_by_status, - "gpu_info": gpu_monitor.get_gpu_memory_info(), - "system_memory": gpu_monitor.get_system_memory_info(), - "running": self.running + "workers_by_status": workers_by_status } async def process_inpaint(self, **kwargs) -> Optional[np.ndarray]: diff --git a/app/monitoring/dashboard.py b/app/monitoring/dashboard.py index 461db8c..0daed0f 100644 --- a/app/monitoring/dashboard.py +++ b/app/monitoring/dashboard.py @@ -15,14 +15,53 @@ from fastapi import FastAPI, WebSocket, WebSocketDisconnect from fastapi.staticfiles import StaticFiles from fastapi.responses import HTMLResponse import uvicorn +from fastapi import APIRouter, Request from ..core.worker_manager import worker_manager from ..core.session_pool import session_pool from ..utils.gpu_monitor import gpu_monitor from ..core.config import settings +# main_app = None + +# def init_monitoring(app: FastAPI): +# """모니터링 앱을 초기화하고 메인 앱 객체를 설정합니다.""" +# global main_app +# main_app = app +# # lifespan에서 worker_manager와 session_pool이 app.state에 설정되도록 합니다. +# @app.on_event("startup") +# async def startup_event(): +# if not hasattr(app.state, 'worker_manager') or not hasattr(app.state, 'session_pool'): +# # main.py의 lifespan에서 설정되므로, 여기서는 경고만 로깅 +# logger.warning("worker_manager 또는 session_pool이 app.state에 설정되지 않았습니다.") + logger = logging.getLogger(__name__) +# main.py에서 공유할 객체들 -> 이제 Request 객체를 통해 접근합니다. +# worker_manager = None +# session_pool = None + +# def set_shared_objects(wm, sp): +# """메인 서버의 worker_manager와 session_pool을 설정합니다.""" +# global worker_manager, session_pool +# worker_manager = wm +# session_pool = sp + +def read_status_from_file(): + """status.json 파일에서 상태를 읽어옵니다.""" + try: + with open("status.json", "r") as f: + return json.load(f) + except (FileNotFoundError, json.JSONDecodeError): + return { + "worker_status": {"running": False, "total_workers": 0, "queue_size": 0, "workers_by_status": {}}, + "session_status": {}, + "timestamp": 0 + } + +# API 라우터 생성 +api_router = APIRouter() + # 모니터링 앱 생성 monitor_app = FastAPI( title="인페인팅 서버 모니터링 대시보드", @@ -49,45 +88,52 @@ class MonitoringData: self.alerts = [] async def collect_data(self) -> Dict[str, Any]: - """현재 시스템 상태 데이터를 수집합니다.""" - timestamp = datetime.now().isoformat() + """주기적으로 서버 상태 데이터를 수집합니다.""" + status = read_status_from_file() + worker_status = status.get("worker_status", {}) + session_status = status.get("session_status", {}) + timestamp = status.get("timestamp", 0) + + # 워커 매니저 상태 (안전하게 가져오기) + try: + worker_status = worker_manager.get_status() if worker_manager else self._get_default_worker_status() + except Exception as e: + logger.warning(f"워커 매니저 상태 조회 실패: {e}") + worker_status = self._get_default_worker_status() - # GPU 정보 - gpu_info = gpu_monitor.get_gpu_memory_info() - gpu_utilization = gpu_monitor.get_gpu_utilization() + # 세션 풀 상태 (안전하게 가져오기) + try: + session_status = session_pool.get_status() if session_pool else self._get_default_session_status() + except Exception as e: + logger.warning(f"세션 풀 상태 조회 실패: {e}") + session_status = self._get_default_session_status() - # 시스템 메모리 정보 - system_memory = gpu_monitor.get_system_memory_info() - - # 시스템 성능 지표 - system_performance = self._get_system_performance() - - # 워커 매니저 상태 - worker_status = worker_manager.get_status() - - # 세션 풀 상태 - session_status = await session_pool.get_pool_status() - - # Jetson 전용 정보 + # Jetson 전용 정보 (안전하게 가져오기) jetson_info = {} if settings.IS_JETSON: - jetson_info = gpu_monitor.get_jetson_specific_info() + try: + jetson_info = gpu_monitor.get_jetson_specific_info() + if jetson_info is None: + jetson_info = {} + except Exception as e: + logger.warning(f"Jetson 전용 정보 조회 실패: {e}") + jetson_info = {} # API 통계 api_stats = self._get_api_statistics() # 알림 및 경고 - alerts = self._check_alerts(gpu_info, system_memory, worker_status) + alerts = self._check_alerts(worker_status) data = { - "timestamp": timestamp, + "timestamp": datetime.now().isoformat(), "system_type": "Jetson Xavier" if settings.IS_JETSON else "x86_64", "gpu": { - **gpu_info, - "utilization": gpu_utilization + **gpu_monitor.get_gpu_memory_info(), + "utilization": gpu_monitor.get_gpu_utilization() }, - "system_memory": system_memory, - "system_performance": system_performance, + "system_memory": gpu_monitor.get_system_memory_info(), + "system_performance": self._get_system_performance(), "workers": worker_status, "sessions": session_status, "jetson": jetson_info, @@ -168,36 +214,11 @@ class MonitoringData: "recent_errors": self.api_stats["errors"][-5:] # 최근 5개 에러 } - def _check_alerts(self, gpu_info: Dict, system_memory: Dict, worker_status: Dict) -> List[Dict]: + def _check_alerts(self, worker_status: Dict) -> List[Dict]: """시스템 상태를 확인하고 알림을 생성합니다.""" alerts = [] current_time = datetime.now() - # GPU 메모리 경고 - if gpu_info.get("usage_percent", 0) > 90: - alerts.append({ - "level": "critical", - "message": f"GPU 메모리 사용률이 높습니다: {gpu_info.get('usage_percent', 0):.1f}%", - "timestamp": current_time.isoformat(), - "category": "gpu" - }) - elif gpu_info.get("usage_percent", 0) > 80: - alerts.append({ - "level": "warning", - "message": f"GPU 메모리 사용률이 높습니다: {gpu_info.get('usage_percent', 0):.1f}%", - "timestamp": current_time.isoformat(), - "category": "gpu" - }) - - # 시스템 메모리 경고 - if system_memory.get("usage_percent", 0) > 90: - alerts.append({ - "level": "critical", - "message": f"시스템 메모리 사용률이 높습니다: {system_memory.get('usage_percent', 0):.1f}%", - "timestamp": current_time.isoformat(), - "category": "memory" - }) - # 워커 상태 경고 if worker_status.get("active_workers", 0) == 0: alerts.append({ @@ -272,6 +293,22 @@ class MonitoringData: "data_points": len(recent_data) } + def _get_default_worker_status(self): + return { + "total_workers": 0, + "queue_size": 0, + "workers_by_status": {"idle": [], "busy": [], "starting": [], "stopping": [], "error": []}, + "running": False + } + + def _get_default_session_status(self): + return { + "simple-lama": {"total": 0, "in_use": 0, "available": 0}, + "migan": {"total": 0, "in_use": 0, "available": 0}, + "rembg": {"total": 0, "in_use": 0, "available": 0} + } + + # 전역 모니터링 데이터 인스턴스 monitoring_data = MonitoringData() @@ -835,6 +872,112 @@ async def dashboard(): return HTMLResponse(content=HTML_TEMPLATE) +@api_router.get("/status") +async def get_status(): + """실시간 서버 상태 데이터를 반환합니다.""" + return await monitoring_data.collect_data() + +@api_router.get("/simple") +def get_simple_status(): + """간단한 상태 정보를 반환합니다.""" + try: + import psutil + return { + "timestamp": time.time(), + "system_type": "Jetson Xavier" if settings.IS_JETSON else "x86_64", + "cpu_percent": psutil.cpu_percent(), + "memory_percent": psutil.virtual_memory().percent, + "status": "running" + } + except Exception as e: + return {"error": f"간단한 상태 수집 실패: {str(e)}"} + +@api_router.get("/test_data") +async def get_test_data(): + """테스트용 더미 데이터를 반환합니다.""" + import random + return { + "timestamp": datetime.now().isoformat(), + "system_type": "Jetson Xavier", + "gpu": { + "total": 8.0, + "used": round(random.uniform(0.5, 2.0), 2), + "free": round(8.0 - random.uniform(0.5, 2.0), 2), + "usage_percent": round(random.uniform(5, 25), 1), + "utilization": round(random.uniform(0, 15), 1), + "temperature": round(random.uniform(35, 45), 1), + "clock_speed": random.randint(1100, 1300) + }, + "system_memory": { + "total": 30.26, + "used": round(random.uniform(10, 15), 2), + "free": round(random.uniform(15, 20), 2), + "usage_percent": round(random.uniform(35, 50), 1) + }, + "system_performance": { + "cpu_percent": round(random.uniform(5, 20), 1), + "cpu_count": 8, + "cpu_freq": {"current": 2266, "min": 1190, "max": 2265}, + "load_avg": [round(random.uniform(0.1, 1.0), 2), round(random.uniform(0.1, 1.0), 2), round(random.uniform(0.1, 1.0), 2)], + "disk_io": {"read_mb": random.randint(10, 100), "write_mb": random.randint(5, 50), "read_count": random.randint(100, 1000), "write_count": random.randint(50, 500)}, + "net_io": {"sent_mb": random.randint(1, 10), "recv_mb": random.randint(1, 10), "sent_packets": random.randint(100, 1000), "recv_packets": random.randint(100, 1000)}, + "process_count": random.randint(300, 400) + }, + "workers": { + "total_workers": 2, + "queue_size": random.randint(0, 5), + "workers_by_status": { + "idle": [{"id": "worker_1", "task_count": random.randint(10, 50)}], + "busy": [{"id": "worker_2", "current_task": "inpainting", "task_count": random.randint(5, 30)}] if random.random() > 0.5 else [], + "starting": [], + "stopping": [], + "error": [] + }, + "running": True + }, + "sessions": { + "simple_lama": {"total": 2, "in_use": random.randint(0, 2), "available": 2 - random.randint(0, 2)}, + "migan": {"total": 2, "in_use": random.randint(0, 2), "available": 2 - random.randint(0, 2)}, + "rembg": {"total": 1, "in_use": random.randint(0, 1), "available": 1 - random.randint(0, 1)} + }, + "api_stats": { + "total_requests": random.randint(100, 500), + "successful_requests": random.randint(90, 480), + "failed_requests": random.randint(0, 10), + "response_times": [round(random.uniform(0.1, 2.0), 2) for _ in range(10)], + "success_rate": round(random.uniform(85, 98), 1), + "avg_response_time": round(random.uniform(0.5, 1.5), 2), + "errors": [] + }, + "alerts": ["정보: 모니터링 데이터를 수집 중입니다..."] if random.random() > 0.7 else [] + } + + +@api_router.get("/history") +async def get_history(): + """데이터 히스토리를 반환합니다.""" + return { + "history": monitoring_data.get_history(), + "statistics": monitoring_data.get_statistics() + } + +@api_router.get("/worker-status") +def get_worker_status_api(): + """워커 상태를 반환합니다.""" + status = read_status_from_file() + return status.get("worker_status", {}) + +@api_router.get("/session-status") +def get_session_status_api(): + """세션 풀 상태를 반환합니다.""" + status = read_status_from_file() + return status.get("session_status", {}) + + +# FastAPI 앱에 라우터 포함 +monitor_app.include_router(api_router, prefix="/api") + +# WebSocket 핸들러 @monitor_app.websocket("/ws") async def websocket_endpoint(websocket: WebSocket): """WebSocket 연결을 처리합니다.""" @@ -843,34 +986,22 @@ async def websocket_endpoint(websocket: WebSocket): try: while True: - # 클라이언트로부터 메시지 대기 (연결 유지용) - await websocket.receive_text() + # 주기적으로 데이터 전송 + data = await monitoring_data.collect_data() + await websocket.send_json(data) + await asyncio.sleep(2) # 2초마다 업데이트 + except WebSocketDisconnect: connected_clients.remove(websocket) logger.info("클라이언트 연결 해제") -@monitor_app.get("/api/status") -async def get_current_status(): - """현재 상태를 JSON으로 반환합니다.""" - return await monitoring_data.collect_data() - - -@monitor_app.get("/api/history") -async def get_history(): - """데이터 히스토리를 반환합니다.""" - return { - "history": monitoring_data.get_history(), - "statistics": monitoring_data.get_statistics() - } - - async def broadcast_data(): """연결된 모든 클라이언트에게 데이터를 브로드캐스트합니다.""" while True: try: if connected_clients: - data = await monitoring_data.collect_data() + data = await monitoring_data.collect_data() # WebSocket 연결이 없으므로 None 전달 message = json.dumps(data, ensure_ascii=False) # 연결이 끊어진 클라이언트 제거 diff --git a/logs/main_server.log b/logs/main_server.log index b19dd8f..558abe8 100644 --- a/logs/main_server.log +++ b/logs/main_server.log @@ -1,38 +1,62 @@ -INFO: Started server process [299962] -2025-08-27 15:17:30,514 - uvicorn.error - INFO - Started server process [299962] +INFO: Started server process [396102] +2025-08-27 21:19:49,229 - uvicorn.error - INFO - Started server process [396102] INFO: Waiting for application startup. -2025-08-27 15:17:30,514 - uvicorn.error - INFO - Waiting for application startup. -2025-08-27 15:17:30,515 - main - INFO - 🚀 인페인팅 서버 시작 중... -2025-08-27 15:17:30,515 - app.core.session_pool - INFO - Initializing session pools... -2025-08-27 15:17:30,515 - app.core.session_pool - INFO - Initializing 2 sessions for simple_lama -2025-08-27 15:17:30,616 - app.core.session_pool - INFO - Created session simple_lama_0 -2025-08-27 15:17:30,717 - app.core.session_pool - INFO - Created session simple_lama_1 -2025-08-27 15:17:30,718 - app.core.session_pool - INFO - Initializing 2 sessions for migan -2025-08-27 15:17:30,818 - app.core.session_pool - INFO - Created session migan_0 -2025-08-27 15:17:30,920 - app.core.session_pool - INFO - Created session migan_1 -2025-08-27 15:17:30,920 - app.core.session_pool - INFO - Initializing 1 sessions for rembg -2025-08-27 15:17:31,021 - app.core.session_pool - INFO - Created session rembg_0 -2025-08-27 15:17:31,021 - app.core.session_pool - INFO - Session pools initialized successfully -2025-08-27 15:17:31,021 - main - INFO - ✅ 세션 풀 초기화 완료 -2025-08-27 15:17:31,022 - app.core.worker_manager - INFO - Starting worker manager... -2025-08-27 15:17:31,022 - app.core.worker_manager - INFO - Worker manager started with 1 workers -2025-08-27 15:17:31,022 - main - INFO - ✅ 워커 매니저 시작 완료 -2025-08-27 15:17:31,023 - main - INFO - 🎉 인페인팅 서버 시작 완료! +2025-08-27 21:19:49,230 - uvicorn.error - INFO - Waiting for application startup. +2025-08-27 21:19:49,231 - main - INFO - 🚀 인페인팅 서버 시작 중... +2025-08-27 21:19:49,231 - main - INFO - ✅ 공유 객체를 app.state에 저장 완료 +2025-08-27 21:19:49,231 - app.core.session_pool - INFO - Initializing session pools... +2025-08-27 21:19:49,232 - app.core.session_pool - INFO - Initializing 2 sessions for simple_lama +2025-08-27 21:19:49,232 - main - WARNING - 상태 저장 실패: 'list' object has no attribute 'model_type' +2025-08-27 21:19:49,333 - app.core.session_pool - INFO - Created session simple_lama_0 +2025-08-27 21:19:49,435 - app.core.session_pool - INFO - Created session simple_lama_1 +2025-08-27 21:19:49,436 - app.core.session_pool - INFO - Initializing 2 sessions for migan +2025-08-27 21:19:49,537 - app.core.session_pool - INFO - Created session migan_0 +2025-08-27 21:19:49,639 - app.core.session_pool - INFO - Created session migan_1 +2025-08-27 21:19:49,639 - app.core.session_pool - INFO - Initializing 1 sessions for rembg +2025-08-27 21:19:49,741 - app.core.session_pool - INFO - Created session rembg_0 +2025-08-27 21:19:49,741 - app.core.session_pool - INFO - Session pools initialized successfully +2025-08-27 21:19:49,741 - main - INFO - ✅ 세션 풀 초기화 완료 +2025-08-27 21:19:49,742 - app.core.worker_manager - INFO - Starting worker manager... +2025-08-27 21:19:49,742 - app.core.worker_manager - INFO - Worker manager started with 1 workers +2025-08-27 21:19:49,743 - main - INFO - ✅ 워커 매니저 시작 완료 +2025-08-27 21:19:49,743 - main - INFO - 🎉 인페인팅 서버 시작 완료! INFO: Application startup complete. -2025-08-27 15:17:31,023 - uvicorn.error - INFO - Application startup complete. +2025-08-27 21:19:49,743 - uvicorn.error - INFO - Application startup complete. INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) -2025-08-27 15:17:31,025 - uvicorn.error - INFO - Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) -INFO: 127.0.0.1:47618 - "GET /health HTTP/1.1" 200 OK -INFO: 127.0.0.1:48780 - "GET /health HTTP/1.1" 200 OK -INFO: 127.0.0.1:48790 - "GET /api/v1/server-config HTTP/1.1" 200 OK -INFO: 127.0.0.1:48792 - "GET /api/v1/samplers HTTP/1.1" 200 OK -2025-08-27 15:17:45,959 - app.core.worker_manager - ERROR - 인페인팅 처리 실패: cannot import name 'SimpleLamaModel' from 'app.models.simple_lama' (/home/ckh08045/work/inpaintServer/./app/models/simple_lama.py) -INFO: 127.0.0.1:48798 - "POST /api/v1/inpaint HTTP/1.1" 500 Internal Server Error -2025-08-27 15:17:45,986 - app.core.worker_manager - ERROR - 인페인팅 처리 실패: cannot import name 'SimpleLamaModel' from 'app.models.simple_lama' (/home/ckh08045/work/inpaintServer/./app/models/simple_lama.py) -INFO: 127.0.0.1:48800 - "POST /api/v1/inpaint HTTP/1.1" 500 Internal Server Error -2025-08-27 15:17:46,007 - app.core.worker_manager - ERROR - 배경 제거 처리 실패: cannot import name 'RembgModel' from 'app.models.rembg_model' (/home/ckh08045/work/inpaintServer/./app/models/rembg_model.py) -2025-08-27 15:17:46,008 - app.api.endpoints - ERROR - 배경 제거 처리 실패: cannot unpack non-iterable NoneType object -INFO: 127.0.0.1:48808 - "POST /api/v1/remove_bg HTTP/1.1" 500 Internal Server Error -2025-08-27 15:17:46,031 - app.core.worker_manager - ERROR - 배경 제거 처리 실패: cannot import name 'RembgModel' from 'app.models.rembg_model' (/home/ckh08045/work/inpaintServer/./app/models/rembg_model.py) -2025-08-27 15:17:46,032 - app.api.endpoints - ERROR - 플러그인 이미지 생성 실패: cannot unpack non-iterable NoneType object -INFO: 127.0.0.1:48818 - "POST /api/v1/run_plugin_gen_image HTTP/1.1" 500 Internal Server Error +2025-08-27 21:19:49,745 - uvicorn.error - INFO - Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) +2025-08-27 21:19:50,233 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +INFO: 127.0.0.1:51092 - "GET /health HTTP/1.1" 200 OK +2025-08-27 21:19:51,234 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:19:52,236 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:19:53,238 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:19:54,241 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:19:55,242 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:19:56,244 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:19:57,245 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:19:58,247 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +INFO: 127.0.0.1:51116 - "GET /docs HTTP/1.1" 200 OK +INFO: 127.0.0.1:51116 - "GET /openapi.json HTTP/1.1" 200 OK +2025-08-27 21:19:59,259 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:20:00,260 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:20:01,262 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:20:02,264 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:20:03,266 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:20:04,268 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:20:05,270 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:20:06,271 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:20:07,274 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:20:08,275 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:20:09,277 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:20:10,278 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:20:11,280 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:20:12,281 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:20:13,282 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:20:14,284 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:20:15,285 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:20:16,286 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:20:17,288 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:20:18,290 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:20:19,293 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:20:20,294 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:20:21,296 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' +2025-08-27 21:20:22,298 - main - WARNING - 상태 저장 실패: 'str' object has no attribute 'worker_id' diff --git a/logs/main_server.pid b/logs/main_server.pid index c05cdb1..c50c54b 100644 --- a/logs/main_server.pid +++ b/logs/main_server.pid @@ -1 +1 @@ -299962 +396102 diff --git a/logs/monitoring.log b/logs/monitoring.log index d677026..33ef6cf 100644 --- a/logs/monitoring.log +++ b/logs/monitoring.log @@ -1,18 +1,20 @@ -INFO: Started server process [300005] +INFO: Started server process [396175] INFO: Waiting for application startup. Fan control not available INFO: Application startup complete. INFO: Uvicorn running on http://0.0.0.0:8001 (Press CTRL+C to quit) -INFO: 127.0.0.1:47362 - "GET / HTTP/1.1" 200 OK -INFO: 127.0.0.1:52036 - "GET / HTTP/1.1" 200 OK -INFO: 127.0.0.1:52036 - "GET /favicon.ico HTTP/1.1" 404 Not Found -INFO: ('127.0.0.1', 52060) - "WebSocket /ws" [accepted] +INFO: 127.0.0.1:51590 - "GET / HTTP/1.1" 200 OK +INFO: 127.0.0.1:43898 - "GET / HTTP/1.1" 200 OK +INFO: ('127.0.0.1', 43930) - "WebSocket /ws" [accepted] +세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type' INFO: connection open -INFO: 127.0.0.1:56700 - "GET / HTTP/1.1" 200 OK -INFO: ('127.0.0.1', 56722) - "WebSocket /ws" [accepted] -INFO: connection open -INFO: connection closed -INFO: 127.0.0.1:33852 - "GET / HTTP/1.1" 200 OK -INFO: ('127.0.0.1', 33866) - "WebSocket /ws" [accepted] -INFO: connection open -INFO: connection closed +세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type' +세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type' +세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type' +세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type' +세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type' +세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type' +세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type' +세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type' +세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type' +세션 풀 상태 조회 실패: 'list' object has no attribute 'model_type' diff --git a/logs/monitoring.pid b/logs/monitoring.pid index d4143b2..bf2e77f 100644 --- a/logs/monitoring.pid +++ b/logs/monitoring.pid @@ -1 +1 @@ -300005 +396175 diff --git a/main.py b/main.py index fcc81bb..6c613fb 100644 --- a/main.py +++ b/main.py @@ -5,6 +5,8 @@ iopaint와 호환되는 API를 제공합니다. """ import time import logging +import json +import asyncio from contextlib import asynccontextmanager from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware @@ -27,6 +29,20 @@ logger = logging.getLogger(__name__) # 서버 시작 시간 기록 start_time = time.time() +async def save_status_periodically(): + """주기적으로 워커와 세션 상태를 파일에 저장합니다.""" + while True: + try: + status = { + "worker_status": worker_manager.get_status(), + "session_status": session_pool.get_status(), + "timestamp": time.time() + } + with open("status.json", "w") as f: + json.dump(status, f) + except Exception as e: + logger.warning(f"상태 저장 실패: {e}") + await asyncio.sleep(1) @asynccontextmanager async def lifespan(app: FastAPI): @@ -34,6 +50,14 @@ async def lifespan(app: FastAPI): # 시작 시 logger.info("🚀 인페인팅 서버 시작 중...") + # app.state에 공유 객체 저장 + app.state.worker_manager = worker_manager + app.state.session_pool = session_pool + logger.info("✅ 공유 객체를 app.state에 저장 완료") + + # 상태 저장 백그라운드 작업 시작 + status_task = asyncio.create_task(save_status_periodically()) + try: # 세션 풀 초기화 await session_pool.initialize() @@ -54,6 +78,9 @@ async def lifespan(app: FastAPI): # 종료 시 logger.info("🛑 인페인팅 서버 종료 중...") + # 상태 저장 백그라운드 작업 취소 + status_task.cancel() + try: # 워커 매니저 중지 await worker_manager.stop() @@ -85,8 +112,138 @@ app.add_middleware( # API 라우터 포함 app.include_router(router) -# 모니터링 대시보드 마운트 -app.mount("/monitoring", monitor_app, name="monitoring") +# 모니터링은 start_server.sh를 통해 독립적으로 실행됩니다. +# app.mount("/monitoring", monitor_app, name="monitoring") + + +# 모니터링 데이터 직접 제공 (완전 통합) +@app.get("/monitoring/api/status") +async def get_monitoring_status(): + """모니터링 상태를 직접 반환합니다.""" + try: + import psutil + from app.utils.gpu_monitor import GPUMonitor + + # 시스템 정보 수집 + cpu_percent = psutil.cpu_percent() + memory = psutil.virtual_memory() + process_count = len(psutil.pids()) + + # GPU 정보 수집 + gpu_monitor = GPUMonitor() + gpu_info = await gpu_monitor.get_gpu_info() + + # 워커 매니저 상태 (안전한 방식) + try: + worker_status = { + "total_workers": getattr(worker_manager, 'workers', {}).__len__() if hasattr(worker_manager, 'workers') else 0, + "queue_size": getattr(worker_manager, 'queue', None).qsize() if hasattr(worker_manager, 'queue') else 0, + "workers_by_status": {"idle": [], "busy": [], "starting": [], "stopping": [], "error": []}, + "running": getattr(worker_manager, 'running', False) + } + except Exception as e: + logger.warning(f"워커 상태 수집 실패: {e}") + worker_status = { + "total_workers": 0, + "queue_size": 0, + "workers_by_status": {"idle": [], "busy": [], "starting": [], "stopping": [], "error": []}, + "running": False + } + + # 세션 풀 상태 (안전한 방식) + try: + sessions = getattr(session_pool, 'sessions', {}) + total_sessions = len(sessions) if sessions else 0 + available_sessions = len([s for s in sessions.values() if getattr(s, 'available', False)]) if sessions else 0 + + session_status = { + "total_sessions": total_sessions, + "available_sessions": available_sessions, + "model_distribution": {"simple-lama": 0, "migan": 0, "rembg": 0} + } + except Exception as e: + logger.warning(f"세션 상태 수집 실패: {e}") + session_status = { + "total_sessions": 0, + "available_sessions": 0, + "model_distribution": {"simple-lama": 0, "migan": 0, "rembg": 0} + } + + # API 통계 (간단한 버전) + api_stats = { + "total_requests": 0, # 실제로는 카운터가 필요 + "success_rate": 100.0, + "average_response_time": 0.0, + "error_count": 0 + } + + return { + "timestamp": time.time(), + "system_type": "Jetson Xavier" if settings.IS_JETSON else "x86_64", + "system": { + "cpu_percent": cpu_percent, + "memory_percent": memory.percent, + "process_count": process_count + }, + "gpu": gpu_info, + "worker_status": worker_status, + "session_status": session_status, + "api_stats": api_stats + } + + except Exception as e: + logger.error(f"모니터링 데이터 수집 실패: {e}") + return {"error": f"모니터링 데이터 수집 실패: {str(e)}"} + +@app.get("/monitoring/api/simple") +async def get_simple_monitoring(): + """간단한 모니터링 상태를 반환합니다.""" + try: + import psutil + return { + "timestamp": time.time(), + "system_type": "Jetson Xavier" if settings.IS_JETSON else "x86_64", + "cpu_percent": psutil.cpu_percent(), + "memory_percent": psutil.virtual_memory().percent, + "status": "running" + } + except Exception as e: + return {"error": f"간단한 상태 수집 실패: {str(e)}"} + +@app.get("/monitoring/api/worker-status") +async def get_worker_status(): + """워커 상태를 반환합니다.""" + try: + workers = getattr(worker_manager, 'workers', {}) + queue = getattr(worker_manager, 'queue', None) + running = getattr(worker_manager, 'running', False) + + return { + "total_workers": len(workers) if workers else 0, + "queue_size": queue.qsize() if queue else 0, + "running": running, + "status": "active" if running else "stopped" + } + except Exception as e: + logger.warning(f"워커 상태 조회 실패: {e}") + return {"error": f"워커 상태 조회 실패: {str(e)}"} + +@app.get("/monitoring/api/session-status") +async def get_session_status(): + """세션 풀 상태를 반환합니다.""" + try: + sessions = getattr(session_pool, 'sessions', {}) + total_sessions = len(sessions) if sessions else 0 + available_sessions = len([s for s in sessions.values() if getattr(s, 'available', False)]) if sessions else 0 + + return { + "total_sessions": total_sessions, + "available_sessions": available_sessions, + "model_distribution": {"simple-lama": 0, "migan": 0, "rembg": 0} + } + except Exception as e: + logger.warning(f"세션 상태 조회 실패: {e}") + return {"error": f"세션 상태 조회 실패: {str(e)}"} if __name__ == "__main__": diff --git a/monitoring_debug.json b/monitoring_debug.json new file mode 100644 index 0000000..4f8e494 --- /dev/null +++ b/monitoring_debug.json @@ -0,0 +1,120 @@ +{ + "timestamp": "2025-08-27T19:18:15.546032", + "system_type": "Jetson Xavier", + "gpu": { + "total": 8.0, + "used": 0.0, + "free": 8.0, + "usage_percent": 0.0, + "utilization": 0.0 + }, + "system_memory": { + "total": 30.26, + "used": 11.77, + "free": 1.96, + "usage_percent": 40.3 + }, + "system_performance": { + "cpu": { + "usage_percent": 7.0, + "count": 8, + "frequency_mhz": 1190.4, + "load_average": { + "1min": 0.72, + "5min": 0.82, + "15min": 0.8 + } + }, + "disk": { + "read_bytes": 18622774784, + "write_bytes": 8354913792, + "read_count": 299549, + "write_count": 245067 + }, + "network": { + "bytes_sent": 1438356649, + "bytes_recv": 2090199696, + "packets_sent": 1124215, + "packets_recv": 2485680 + }, + "processes": 315 + }, + "workers": { + "total_workers": 0, + "queue_size": 0, + "workers_by_status": { + "idle": [], + "busy": [], + "starting": [], + "stopping": [], + "error": [] + }, + "gpu_info": { + "total": 8.0, + "used": 0.0, + "free": 8.0, + "usage_percent": 0.0 + }, + "system_memory": { + "total": 30.26, + "used": 11.77, + "free": 1.96, + "usage_percent": 40.3 + }, + "running": false + }, + "sessions": { + "simple_lama": { + "total": 0, + "in_use": 0, + "available": 0, + "sessions": [] + }, + "migan": { + "total": 0, + "in_use": 0, + "available": 0, + "sessions": [] + }, + "rembg": { + "total": 0, + "in_use": 0, + "available": 0, + "sessions": [] + } + }, + "jetson": { + "gpu_frequency": null, + "cpu_frequency": null, + "memory_frequency": null, + "temperature": { + "zone_zone2": 36.0, + "zone_zone0": 39.0, + "zone_zone7": 37.5, + "zone_zone5": 37.0, + "zone_zone3": 36.0, + "zone_zone1": 39.0, + "zone_zone6": 41.0, + "zone_zone4": 50.0 + }, + "power_consumption": null, + "power_mode": "MAXN" + }, + "api_stats": { + "total_requests": 0, + "successful_requests": 0, + "failed_requests": 0, + "success_rate": 0.0, + "endpoint_usage": {}, + "average_response_time": 0, + "recent_errors": [] + }, + "alerts": [ + { + "level": "critical", + "message": "활성 워커가 없습니다", + "timestamp": "2025-08-27T19:18:17.118052", + "category": "workers" + } + ] +} \ No newline at end of file diff --git a/status.json b/status.json new file mode 100644 index 0000000..ef1a7a3 --- /dev/null +++ b/status.json @@ -0,0 +1 @@ +{"worker_status": {"running": true, "total_workers": 1, "queue_size": 0, "workers_by_status": {"idle": [{"id": "worker_d5e6bd0a", "status": "idle", "task_count": 0, "error_count": 0, "last_task_at": null}], "busy": [], "starting": [], "stopping": [], "error": []}}, "session_status": {"simple_lama": {"total": 2, "in_use": 0, "available": 2}, "migan": {"total": 2, "in_use": 0, "available": 2}, "rembg": {"total": 1, "in_use": 0, "available": 1}}, "api_stats": {"total_requests": 3, "successful_requests": 2, "failed_requests": 1, "endpoint_usage": {"/": 1, "/health": 1, "/api/v1/model": 1}, "start_time": 1756296888.0927753, "uptime_seconds": 253.7529318332672, "average_response_time_ms": 2.473115921020508}, "timestamp": 1756297141.8458862} \ No newline at end of file diff --git a/test_monitoring.py b/test_monitoring.py new file mode 100644 index 0000000..7b6f1a0 --- /dev/null +++ b/test_monitoring.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python3 +""" +모니터링 데이터 수집 테스트 +""" +import asyncio +import json +import sys +import os +sys.path.append(os.path.dirname(os.path.abspath(__file__))) + +from app.monitoring.dashboard import monitoring_data + +async def test_monitoring(): + """모니터링 데이터 수집을 테스트합니다.""" + try: + print("모니터링 데이터 수집 테스트 시작...") + + # 데이터 수집 + data = await monitoring_data.collect_data() + + print("✅ 데이터 수집 성공!") + print(f"📊 데이터 키들: {list(data.keys())}") + + # 주요 정보 출력 + if 'system_type' in data: + print(f"🖥️ 시스템 타입: {data['system_type']}") + + if 'gpu' in data: + print(f"🎮 GPU 정보: {data['gpu']}") + + if 'workers' in data: + print(f"⚙️ 워커 상태: {data['workers']}") + + if 'sessions' in data: + print(f"🔗 세션 풀: {data['sessions']}") + + # JSON 형태로 저장 (디버깅용) + with open('monitoring_debug.json', 'w', encoding='utf-8') as f: + json.dump(data, f, indent=2, ensure_ascii=False, default=str) + + print("💾 디버그 데이터가 monitoring_debug.json에 저장되었습니다.") + + except Exception as e: + print(f"❌ 모니터링 데이터 수집 실패: {e}") + import traceback + traceback.print_exc() + +if __name__ == "__main__": + asyncio.run(test_monitoring())