From a8e0c166a323e4339a1cfbc4c4cdd365e5a3caa4 Mon Sep 17 00:00:00 2001
From: AGX
Date: Fri, 29 Aug 2025 23:01:55 +0900
Subject: [PATCH] =?UTF-8?q?=EC=84=A4=EC=A0=95=20=ED=8C=8C=EC=9D=BC?=
 =?UTF-8?q?=EC=97=90=20Discord=20=EC=9B=B9=ED=9B=85=20=EC=95=8C=EB=A6=BC?=
 =?UTF-8?q?=20URL=20=EC=B6=94=EA=B0=80=20=EB=B0=8F=20=EC=84=9C=EB=B2=84=20?=
 =?UTF-8?q?=EC=83=81=ED=83=9C=20=EA=B0=90=EC=8B=9C=20=EB=B0=8F=20=EC=9E=90?=
 =?UTF-8?q?=EB=8F=99=20=EC=9E=AC=EC=8B=9C=EC=9E=91=20=EA=B8=B0=EB=8A=A5?=
 =?UTF-8?q?=EC=9D=84=20=EA=B5=AC=ED=98=84=ED=95=98=EC=98=80=EC=8A=B5?=
 =?UTF-8?q?=EB=8B=88=EB=8B=A4.=20=EB=98=90=ED=95=9C,=20=EB=AA=A8=EB=8D=B8?=
 =?UTF-8?q?=20=EB=A1=9C=EB=94=A9=20=EC=8B=9C=EA=B0=84=20=ED=86=B5=EA=B3=84?=
 =?UTF-8?q?=20=EB=A1=9C=EC=A7=81=EC=9D=84=20=EA=B0=9C=EC=84=A0=ED=95=98?=
 =?UTF-8?q?=EA=B3=A0,=20=ED=99=98=EA=B2=BD=20=ED=8C=8C=EC=9D=BC=20?=
 =?UTF-8?q?=EC=9D=B8=EC=BD=94=EB=94=A9=EC=9D=84=20UTF-8=EB=A1=9C=20?=
 =?UTF-8?q?=EC=84=A4=EC=A0=95=ED=95=98=EC=98=80=EC=8A=B5=EB=8B=88=EB=8B=A4?=
 =?UTF-8?q?.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Reviewer notes (this section is ignored by git am):

* Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch. Context-line whitespace (in particular the nesting depth
  of the get_performance_stats hunk) is a best guess: apply with
  "git apply --ignore-whitespace" and re-check the indentation afterwards.
* Added lines were revised in review, so the blob ids on the "index" lines no
  longer describe the resulting files.
* Review changes: Discord alerts are sent from a worker thread and can no
  longer stop the watchdog; the restart script is started without a shell;
  unexpected probe errors are logged instead of killing the task; the probe
  uses 127.0.0.1 when HOST is the 0.0.0.0 wildcard; alert text is truncated to
  Discord's embed description limit.
* NOTE(review): settings.PROJECT_ROOT is not defined in the config.py hunks of
  this patch - confirm it already exists in Settings.

 app/core/config.py            |  10 ++-
 app/monitoring/dashboard.py   | 116 ++++++++++++++++++++++++++++++--
 app/utils/discord_notifier.py |  80 ++++++++++++++++++++++
 3 files changed, 197 insertions(+), 9 deletions(-)
 create mode 100644 app/utils/discord_notifier.py

diff --git a/app/core/config.py b/app/core/config.py
index 281c33e..f90a386 100644
--- a/app/core/config.py
+++ b/app/core/config.py
@@ -15,8 +15,8 @@ class Settings(BaseSettings):
     # Server settings
     HOST: str = "0.0.0.0"
     PORT: int = 8008
-    WORKERS: int = 1
-    
+    WORKERS: int = 1  # fixed at 1 in development; gunicorn manages workers in production
+
     # GPU settings (Jetson Xavier 최적화)
     CUDA_DEVICE: int = 0
     USE_CUDA: bool = True  # CUDA 사용 여부 (Jetson에서 항상 True)
@@ -71,7 +71,10 @@ class Settings(BaseSettings):
     # Monitoring
     ENABLE_MONITORING: bool = True
     MONITORING_PORT: int = 8888
-    
+
+    # Discord webhook notification settings
+    DISCORD_WEBHOOK_URL: Optional[str] = None
+
     # Jetson performance settings
     JETSON_GPU_FREQ: int = 1200  # MHz
     JETSON_CPU_FREQ: int = 1900  # MHz
@@ -79,6 +82,7 @@ class Settings(BaseSettings):

     class Config:
         env_file = ".env"
+        env_file_encoding = 'utf-8'


 settings = Settings()
diff --git a/app/monitoring/dashboard.py b/app/monitoring/dashboard.py
index c79311b..37e63ec 100644
--- a/app/monitoring/dashboard.py
+++ b/app/monitoring/dashboard.py
@@ -18,6 +18,8 @@ import uvicorn
 from fastapi import APIRouter, Request
 import websockets.exceptions
 import requests
+import subprocess
+from ..utils.discord_notifier import send_discord_notification

 from ..core.worker_manager import worker_manager
 from ..core.session_pool import session_pool
@@ -1788,16 +1790,17 @@ async def get_performance_stats():

                             duration_ms = (success_total - loading_total) * 1000

-                            if 0 < duration_ms < 10000:  # 0-10초 범위만 유효
-                                model_name = "Simple LAMA"  # 기본값
-                                if "simple_lama" in line.lower():
+                            if 0 < duration_ms < 60000:  # accept only 0-60 s (allows for model download time)
+                                model_name = None
+                                if "simple lama model loaded successfully" in lines[j].lower():
                                     model_name = "Simple LAMA"
-                                elif "migan" in line.lower():
+                                elif "migan onnx model loaded successfully" in lines[j].lower():
                                     model_name = "MIGAN"
-                                elif "rembg" in line.lower():
+                                elif "rembg model" in lines[j].lower() and "loaded successfully" in lines[j].lower():
                                     model_name = "RemBG"

-                                model_load_times[model_name].append(duration_ms)
+                                if model_name:
+                                    model_load_times[model_name].append(duration_ms)
                             break
                 except Exception as parse_error:
                     continue
@@ -2076,6 +2079,106 @@ async def broadcast_data():
         await asyncio.sleep(5)


+# --- Main-server watchdog and automatic restart ---
+HEALTH_CHECK_INTERVAL = 30  # seconds between health probes
+RESTART_COOLDOWN = 180  # minimum seconds between two restart attempts
+last_restart_time = 0  # time.time() of the most recent restart attempt
+
+
+async def _notify(message: str, level: str) -> None:
+    """Send a Discord alert from a worker thread; never raises.
+
+    send_discord_notification() performs a blocking HTTP POST, so it is run
+    via asyncio.to_thread() to keep the event loop responsive.
+    """
+    try:
+        await asyncio.to_thread(send_discord_notification, message, level=level)
+    except Exception:
+        # Alerting is best-effort and must never stop the watchdog.
+        logger.exception("Discord notification failed")
+
+
+async def health_check_and_restart():
+    """Poll the main server's health endpoint and restart it when it is down.
+
+    Runs forever as a background task. Every HEALTH_CHECK_INTERVAL seconds it
+    requests /api/v1/health; on a non-200 response or a connection error it
+    sends a Discord alert and runs scripts/start_server.sh, at most once per
+    RESTART_COOLDOWN seconds.
+    """
+    global last_restart_time
+    logger.info("🩺 메인 서버 상태 감시 백그라운드 작업 시작...")
+
+    # settings.HOST is a bind address; the wildcard 0.0.0.0 is not a portable
+    # connect target, so probe the loopback interface in that case.
+    probe_host = "127.0.0.1" if settings.HOST == "0.0.0.0" else settings.HOST
+    health_url = f"http://{probe_host}:{settings.PORT}/api/v1/health"
+
+    while True:
+        await asyncio.sleep(HEALTH_CHECK_INTERVAL)
+
+        try:
+            # requests is blocking, so the probe runs in a worker thread.
+            response = await asyncio.to_thread(requests.get, health_url, timeout=10)
+        except requests.RequestException as e:
+            logger.error(f"❌ 메인 서버 연결 실패: {e}")
+        except Exception:
+            # An unexpected probe failure says nothing about the server and must
+            # not kill the watchdog task; log it and probe again next round.
+            logger.exception("Unexpected error during health probe")
+            continue
+        else:
+            if response.status_code == 200:
+                logger.debug(f"✅ 메인 서버 정상 응답 (상태 코드: {response.status_code})")
+                continue
+            logger.warning(f"메인 서버 비정상 응답 (상태 코드: {response.status_code})")
+
+        # --- Server considered down: restart, rate-limited by the cooldown ---
+        current_time = time.time()
+        if current_time - last_restart_time < RESTART_COOLDOWN:
+            logger.warning(f"재시작 대기 시간({RESTART_COOLDOWN}초)이 지나지 않아 재시작을 건너뜁니다.")
+            continue
+
+        logger.info("메인 서버 다운 감지. 재시작 절차를 시작합니다.")
+        last_restart_time = current_time
+
+        # 1. Send the Discord alert.
+        error_message = f"🚨 메인 서버(http://{settings.HOST}:{settings.PORT})가 응답하지 않습니다. 자동 재시작을 시도합니다."
+        await _notify(error_message, "error")
+
+        # 2. Run the restart script.
+        try:
+            script_path = os.path.join(settings.PROJECT_ROOT, "scripts", "start_server.sh")
+            logger.info(f"'{script_path}' 스크립트를 실행하여 서버를 재시작합니다.")
+
+            # Pass argv directly instead of a shell string so a path containing
+            # spaces or shell metacharacters reaches bash unmodified.
+            process = await asyncio.create_subprocess_exec(
+                "bash",
+                script_path,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+            )
+            # NOTE(review): this waits for the script to exit - presumably
+            # start_server.sh launches the server in the background; confirm.
+            _, stderr = await process.communicate()
+
+            if process.returncode == 0:
+                success_message = "✅ 메인 서버 재시작 스크립트가 성공적으로 실행되었습니다."
+                logger.info(success_message)
+                await _notify(success_message, "success")
+            else:
+                error_log = stderr.decode(errors='ignore')
+                fail_message = f"❌ 서버 재시작 스크립트 실행 실패 (코드: {process.returncode})\n```\n{error_log}\n```"
+                logger.error(fail_message)
+                await _notify(fail_message, "error")
+
+        except Exception as e:
+            restart_fail_message = f"🔥 서버 재시작 중 치명적인 오류 발생: {e}"
+            logger.critical(restart_fail_message, exc_info=True)
+            await _notify(restart_fail_message, "error")
+
+
 @monitor_app.on_event("startup")
 async def start_monitoring():
     """모니터링 시작"""
@@ -2087,6 +2190,7 @@ async def start_monitoring():
         gpu_monitor.optimize_for_jetson()

     asyncio.create_task(broadcast_data())
+    asyncio.create_task(health_check_and_restart())


 if __name__ == "__main__":
diff --git a/app/utils/discord_notifier.py b/app/utils/discord_notifier.py
new file mode 100644
index 0000000..8da4b3c
--- /dev/null
+++ b/app/utils/discord_notifier.py
@@ -0,0 +1,80 @@
+"""
+Discord webhook notification utility.
+"""
+import logging
+import requests
+import socket
+from datetime import datetime
+
+from ..core.config import settings
+
+logger = logging.getLogger(__name__)
+
+# Discord rejects an embed whose description is longer than 4096 characters.
+MAX_DESCRIPTION_LENGTH = 4096
+
+
+def send_discord_notification(message: str, level: str = "info") -> None:
+    """Send ``message`` to the configured Discord webhook as an embed.
+
+    Best-effort: logs a warning and returns when settings.DISCORD_WEBHOOK_URL
+    is unset, and logs (does not raise) requests errors. The HTTP call is
+    blocking, so async callers should run this function in a worker thread.
+
+    Args:
+        message: Embed description; truncated to MAX_DESCRIPTION_LENGTH.
+        level: "info", "warning", "error" or "success" (case-insensitive).
+            Selects the embed colour; unknown values use the "info" colour.
+    """
+    webhook_url = settings.DISCORD_WEBHOOK_URL
+    if not webhook_url:
+        logger.warning("Discord 웹훅 URL이 설정되지 않아 알림을 보낼 수 없습니다.")
+        return
+
+    if len(message) > MAX_DESCRIPTION_LENGTH:
+        # An over-long description (e.g. a large stderr dump) would make Discord
+        # reject the whole alert, so keep the head and mark the cut.
+        message = message[:MAX_DESCRIPTION_LENGTH - 3] + "..."
+
+    hostname = socket.gethostname()
+
+    color_map = {
+        "info": 3447003,  # Blue
+        "warning": 16776960,  # Yellow
+        "error": 15158332,  # Red
+        "success": 3066993,  # Green
+    }
+    color = color_map.get(level.lower(), color_map["info"])
+
+    payload = {
+        "embeds": [
+            {
+                "title": f"🚨 서버 알림 ({level.upper()})",
+                "description": message,
+                "color": color,
+                "footer": {
+                    "text": f"Host: {hostname} | {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
+                }
+            }
+        ]
+    }
+
+    try:
+        response = requests.post(webhook_url, json=payload, timeout=5)
+        response.raise_for_status()
+        logger.info("Discord로 알림을 성공적으로 보냈습니다.")
+    except requests.RequestException as e:
+        logger.error(f"Discord 알림 전송 실패: {e}")
+
+
+if __name__ == '__main__':
+    # Manual smoke test; run as "python -m app.utils.discord_notifier" because
+    # the relative import above needs the package context.
+    settings.DISCORD_WEBHOOK_URL = "YOUR_TEST_WEBHOOK_URL"  # Put a test webhook URL here.
+    if settings.DISCORD_WEBHOOK_URL == "YOUR_TEST_WEBHOOK_URL":
+        print("Please replace 'YOUR_TEST_WEBHOOK_URL' with your actual Discord webhook URL to test.")
+    else:
+        send_discord_notification("✅ 테스트: 서버가 성공적으로 시작되었습니다.", level="success")
+        send_discord_notification("ℹ️ 정보: 모델 캐시를 업데이트하고 있습니다.", level="info")
+        send_discord_notification("⚠️ 경고: GPU 온도가 85°C를 초과했습니다.", level="warning")
+        send_discord_notification("❌ 오류: 메인 서버가 응답하지 않습니다. 재시작을 시도합니다.", level="error")