환경 설정 파일에서 세션 및 워커 수를 조정하고, API 에러 로그 기록 방식을 개선하였습니다. 대시보드에서 스케일 및 세션 이벤트를 시각화할 수 있는 기능을 추가하였으며, 관련 HTML 구조를 업데이트하였습니다. 서버 로그에서 프로세스 ID 변경 사항을 반영하였습니다.

This commit is contained in:
vast 2025-10-02 03:25:40 +00:00
parent cebf7612c6
commit 47ba96e148
15 changed files with 281668 additions and 159 deletions

10
.env
View File

@ -37,13 +37,13 @@ JETSON_CPU_FREQ=1900 # MHz
JETSON_MEMORY_FREQ=1600 # MHz
# 세션 풀 설정 (시스템별 자동 조정)
SIMPLE_LAMA_SESSIONS=4
MIGAN_SESSIONS=4
REMBG_SESSIONS=1
SIMPLE_LAMA_SESSIONS=6
MIGAN_SESSIONS=6
REMBG_SESSIONS=3
# 워커 설정 (Jetson은 더 적은 워커 사용)
MAX_WORKERS=8 # Jetson: 4, x86: 8
MIN_WORKERS=2 # Jetson: 1, x86: 2
MAX_WORKERS=12 # Jetson: 4, x86: 8
MIN_WORKERS=6 # Jetson: 1, x86: 2
WORKER_TIMEOUT=300
# VRAM 관리 (Jetson은 더 보수적인 설정)

View File

@ -157,6 +157,11 @@ async def health_check(request: Request):
version="1.0.0"
)
# Compatibility alias: some clients request /health directly, so this path
# returns exactly what the primary health-check handler returns.
# The route is registered with include_in_schema=False.
@router.get("/health", response_model=HealthResponse, include_in_schema=False)
async def health_check_compat(request: Request):
    # Delegate to the canonical handler so both paths always stay in sync.
    payload = await health_check(request)
    return payload
@router.get("/api/v1/server-config", response_model=ServerConfigResponse)
async def get_server_config():
@ -292,6 +297,7 @@ async def inpaint_image(
)
except HTTPException:
# HTTPException은 상세 사유를 에러 로그에 남길 수 있도록 재전파됨 (미들웨어에서 잡혀 JSONL 기록)
raise
except Exception as e:
logger.error(f"인페인팅 처리 실패: {e}")
@ -396,6 +402,7 @@ async def remove_background(
})
except HTTPException:
# HTTPException은 상세 사유를 에러 로그에 남길 수 있도록 재전파됨
raise
except Exception as e:
logger.error(f"배경 제거 처리 실패: {e}")
@ -524,7 +531,7 @@ async def root():
"message": "인페인팅 서버 API (iopaint 호환)",
"version": "1.0.0",
"docs": "/docs",
"health": "/health"
"health": "/api/v1/health"
}

View File

@ -89,15 +89,15 @@ class Settings(BaseSettings):
# =========================
# 동적 세션 풀/메모리
# =========================
SIMPLE_LAMA_MIN_SESSIONS: int = 2
SIMPLE_LAMA_MAX_SESSIONS: int = 4
SIMPLE_LAMA_MIN_SESSIONS: int = 4
SIMPLE_LAMA_MAX_SESSIONS: int = 8
# x86에서는 MIGAN 미로딩(지연 로딩) 기본 → MIN=0
MIGAN_MIN_SESSIONS: int = 2 if IS_JETSON else 1
MIGAN_MAX_SESSIONS: int = 4
MIGAN_MIN_SESSIONS: int = 4 if IS_JETSON else 1
MIGAN_MAX_SESSIONS: int = 8
REMBG_MIN_SESSIONS: int = 2 if IS_JETSON else 1
REMBG_MAX_SESSIONS: int = 4 if IS_JETSON else 3
REMBG_MIN_SESSIONS: int = 3 if IS_JETSON else 1
REMBG_MAX_SESSIONS: int = 6 if IS_JETSON else 4
# 여유 VRAM 비율(남은 VRAM이 이 값보다 커야 세션 추가)
SESSION_VRAM_THRESHOLD: float = 0.30
@ -105,8 +105,8 @@ class Settings(BaseSettings):
# 마이크로 배치(SimpleLAMA)
USE_MICRO_BATCHING: bool = True
MICRO_BATCH_SIZE: int = 4
MICRO_BATCH_TIMEOUT_MS: int = 100
MICRO_BATCH_SIZE: int = 8
MICRO_BATCH_TIMEOUT_MS: int = 80
# 사전 확정 세션(플랫폼 감안 기본치)
SIMPLE_LAMA_SESSIONS: int = 4
@ -114,8 +114,8 @@ class Settings(BaseSettings):
REMBG_SESSIONS: int = 3 if IS_JETSON else 2
# 워커(내부 큐/스레드 워커, 프로세스는 WORKERS)
MAX_WORKERS: int = 4 if IS_JETSON else 8
MIN_WORKERS: int = 1 if IS_JETSON else 4
MAX_WORKERS: int = 6 if IS_JETSON else 12
MIN_WORKERS: int = 3 if IS_JETSON else 4
WORKER_TIMEOUT: int = 120
# =========================
@ -123,7 +123,7 @@ class Settings(BaseSettings):
# =========================
VRAM_THRESHOLD_HIGH: float = 0.70 if IS_JETSON else 0.80
VRAM_THRESHOLD_LOW: float = 0.30 if IS_JETSON else 0.40
VRAM_CHECK_INTERVAL: int = 20 if IS_JETSON else 15 # 초
VRAM_CHECK_INTERVAL: int = 10 if IS_JETSON else 5 # 초
# =========================
# 모델/경로

View File

@ -14,6 +14,7 @@ from collections import defaultdict
from ..core.config import settings
from ..utils.gpu_monitor import gpu_monitor
from ..utils.monitor_events import append_event
logger = logging.getLogger(__name__)
@ -118,6 +119,16 @@ class SessionPool:
)
logger.info(f"Successfully created session {session_id}")
self._log_pool_status("create", model_type.value)
try:
append_event({
"type": "session",
"action": "create",
"model": model_type.value,
"session_id": session_id,
"pool_size": len(self.pools[model_type]) + 1,
})
except Exception:
pass
return session
except Exception as e:
logger.error(f"Failed to create session {session_id}: {e}", exc_info=True)
@ -253,6 +264,15 @@ class SessionPool:
reaped_counts[session.model_type.value] += 1
del session.model
del session
try:
append_event({
"type": "session",
"action": "reap",
"model": model_type.value,
"pool_size": len(pool),
})
except Exception:
pass
self.conditions[model_type].notify_all()

View File

@ -16,6 +16,7 @@ from ..utils.gpu_monitor import gpu_monitor
from ..core.config import settings
from ..core.stats_manager import stats_manager
from ..core.session_pool import ModelType
from ..utils.monitor_events import append_event
logger = logging.getLogger(__name__)
@ -230,12 +231,32 @@ class WorkerManager:
await self._scale_workers(new_count)
self.last_scale_time = current_time
logger.info(f"Scaled up to {new_count} workers (VRAM: {vram_usage:.2f})")
try:
append_event({
"type": "worker_scale",
"action": "up",
"new_count": new_count,
"queue_size": queue_size,
"vram_usage": vram_usage,
})
except Exception:
pass
elif should_scale_down:
new_count = max(total_workers - 1, settings.MIN_WORKERS)
await self._scale_workers(new_count)
self.last_scale_time = current_time
logger.info(f"Scaled down to {new_count} workers (VRAM: {vram_usage:.2f})")
try:
append_event({
"type": "worker_scale",
"action": "down",
"new_count": new_count,
"queue_size": queue_size,
"vram_usage": vram_usage,
})
except Exception:
pass
async def _scale_workers(self, target_count: int):
"""워커 수를 조정합니다."""

View File

@ -243,8 +243,9 @@ class MonitoringData:
"gpu": gpu_info,
"system_memory": system_memory,
"system_performance": system_performance,
"workers": worker_status,
"sessions": session_status,
# status.json 스냅샷 외에 실시간 상태를 병합
"workers": worker_manager.get_status() or worker_status,
"sessions": session_pool.get_status() or session_status,
"jetson": jetson_info,
"api_stats": api_stats,
"model_performance_stats": model_performance_stats,
@ -984,6 +985,17 @@ HTML_TEMPLATE = """
</div>
</div>
<!-- 스케일/세션 타임라인 -->
<div class="card">
<h3>📈 워커·세션 타임라인</h3>
<div id="scale-timeline" style="font-family:'Courier New',monospace;font-size:12px;background:#f8f9fa;border-radius:6px;padding:10px;max-height:220px;overflow:auto;">
로딩 ...
</div>
<div style="margin-top:8px;text-align:right;">
<button onclick="refreshTimeline()" style="padding:5px 12px;">새로고침</button>
</div>
</div>
<!-- 최근 에러 -->
<div class="card">
<h3>🚨 최근 API 에러</h3>
@ -1117,7 +1129,8 @@ HTML_TEMPLATE = """
function connectWebSocket() {
try {
ws = new WebSocket(`ws://${window.location.host}/ws`);
const proto = (window.location.protocol === 'https:') ? 'wss' : 'ws';
ws = new WebSocket(`${proto}://${window.location.host}/ws`);
ws.onopen = function() {
console.log('WebSocket 연결이 성공했습니다.');
@ -1418,7 +1431,8 @@ HTML_TEMPLATE = """
const path = e.path || '-';
const status = e.status != null ? e.status : '-';
const rt = e.response_time_ms != null ? e.response_time_ms : '-';
return `<div class="error-row"><div>${ts}</div><div>${method}</div><div>${status}</div><div>${path}</div><div>${rt}</div></div>`;
const ip = e.client_ip ? ` <span style='color:#888'>(IP: ${e.client_ip})</span>` : '';
return `<div class="error-row"><div>${ts}</div><div>${method}</div><div>${status}</div><div>${path}${ip}</div><div>${rt}</div></div>`;
}).join('');
container.innerHTML = rows;
}
@ -1705,6 +1719,32 @@ HTML_TEMPLATE = """
});
}
// Render worker-scale and session events into #scale-timeline, newest first.
// `events` is the array delivered by /api/scale-events; anything that is not a
// non-empty array shows the "no events" placeholder instead.
//
// Each event becomes its own <div> whose text is set through textContent:
//  - event fields are never parsed as HTML;
//  - no backslash escape sequences are needed in this script, which matters
//    because it is embedded in the Python HTML_TEMPLATE string literal
//    (NOTE(review): if that literal is not a raw string, a backslash-n escape
//    here would reach the browser as a raw line break inside a JS string);
//  - rows are visually separated without relying on white-space CSS.
function renderTimeline(events) {
    const el = document.getElementById('scale-timeline');
    if (!Array.isArray(events) || events.length === 0) {
        el.innerHTML = '<div style="color:#999;">이벤트 없음</div>';
        return;
    }
    // Build the one-line summary for a single event.
    const describe = (ev) => {
        const ts = ev.timestamp ? new Date(ev.timestamp*1000).toLocaleTimeString() : '';
        // Tolerate events that were written without an action field.
        const action = String(ev.action || '').toUpperCase();
        if (ev.type === 'worker_scale') {
            return `[${ts}] WORKERS ${action} -> ${ev.new_count} (queue=${ev.queue_size}, vram=${(ev.vram_usage*100||0).toFixed(1)}%)`;
        }
        if (ev.type === 'session') {
            return `[${ts}] SESSION ${action} (${ev.model}) size=${ev.pool_size}`;
        }
        return `[${ts}] ${ev.type}`;
    };
    // Clear the previous content, then append rows in reverse (newest first).
    el.textContent = '';
    events.slice().reverse().forEach(ev => {
        const row = document.createElement('div');
        row.textContent = describe(ev);
        el.appendChild(row);
    });
}
// Fetch the recent scale/session events and redraw the timeline.
// Any failure (network error or an unparsable body) replaces the panel
// content with an inline error message.
function refreshTimeline() {
    const showFailure = () => {
        document.getElementById('scale-timeline').innerHTML = '<div style="color:#dc3545;">타임라인 로딩 실패</div>';
    };
    fetch('/api/scale-events')
        .then(response => response.json())
        .then(payload => {
            // A payload without an events field is rendered as an empty list.
            renderTimeline(payload.events || []);
        })
        .catch(showFailure);
}
// 페이지 로드 초기화
document.addEventListener('DOMContentLoaded', function() {
// 로그 성능 통계 초기 로딩
@ -2014,6 +2054,16 @@ async def get_system_alerts():
logger.error(f"시스템 알림 조회 실패: {e}")
return {"alerts": [], "error": str(e)}
@api_router.get("/scale-events")
def get_scale_events():
    """최근 스케일/세션 이벤트를 반환"""
    # Returns {"events": [...]} with up to 300 recent events; on any failure
    # the error is logged and {"events": [], "error": "<message>"} is returned.
    # The docstring above is left untouched on purpose.
    # NOTE(review): FastAPI may publish it as the operation description.
    try:
        # Imported lazily inside the try so an import failure is reported the
        # same way as a read failure.
        from ..utils.monitor_events import read_recent_events

        recent = read_recent_events(limit=300)
    except Exception as exc:
        logger.error(f"타임라인 조회 실패: {exc}")
        return {"events": [], "error": str(exc)}
    return {"events": recent}
@api_router.get("/errors", summary="최근 API 에러 목록")
def get_recent_errors(limit: int = 50):
"""최근 API 에러를 반환합니다 (logs/api_errors.jsonl 기반)."""
@ -2297,3 +2347,29 @@ if __name__ == "__main__":
port=settings.MONITORING_PORT,
log_level="info"
)
# --- 외부 런처용: 로그에 시간 포함하여 실행 ---
def _get_uvicorn_log_config():
try:
from uvicorn.config import LOGGING_CONFIG as DEFAULT
import copy
cfg = copy.deepcopy(DEFAULT)
# 포맷에 시간 추가
for fmt in ("default", "access"):
if fmt in cfg.get("formatters", {}):
cfg["formatters"][fmt]["format"] = "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
return cfg
except Exception:
return None
def run_monitor(host: str = "0.0.0.0", port: int = None):
    """Run the monitoring server with timestamped uvicorn logs.

    Args:
        host: Interface to bind.
        port: Port to bind; a falsy value falls back to
            ``settings.MONITORING_PORT``.

    The customised logging config is passed to uvicorn only when it could be
    built. Passing ``log_config=None`` explicitly would make uvicorn skip its
    logging setup altogether, so on failure the argument is omitted and
    uvicorn keeps its own default configuration.
    """
    _port = port or settings.MONITORING_PORT
    run_kwargs = {
        "host": host,
        "port": _port,
        "log_level": "info",
    }
    log_config = _get_uvicorn_log_config()
    if log_config is not None:
        run_kwargs["log_config"] = log_config
    uvicorn.run(monitor_app, **run_kwargs)

View File

@ -0,0 +1,84 @@
"""
API 에러 로깅 유틸리티 (JSONL 기록 + 로테이션 + 클라이언트 IP 추출)
"""
from __future__ import annotations
import os
import time
import json
import re
from typing import Dict, Any
from fastapi import Request
LOG_DIR = "logs"
os.makedirs(LOG_DIR, exist_ok=True)
API_ERROR_LOG_PATH = os.path.join(LOG_DIR, "api_errors.jsonl")
API_ERROR_MAX_BYTES = 10 * 1024 * 1024 # 10MB
API_ERROR_BACKUP_COUNT = 5
def _rotate_if_needed() -> None:
    """Rotate the API error log once it reaches the size limit.

    When the active file is at least API_ERROR_MAX_BYTES, it is renamed to
    ``api_errors_<YYYYmmdd-HHMMSS>.jsonl`` and only the
    API_ERROR_BACKUP_COUNT most recently modified rotated files are kept.
    Every failure is swallowed: rotation is best-effort.
    """
    try:
        if not os.path.exists(API_ERROR_LOG_PATH):
            return
        if os.path.getsize(API_ERROR_LOG_PATH) < API_ERROR_MAX_BYTES:
            return

        stamp = time.strftime("%Y%m%d-%H%M%S")
        os.replace(API_ERROR_LOG_PATH, os.path.join(LOG_DIR, f"api_errors_{stamp}.jsonl"))

        # Newest first, so everything past the retention count is stale.
        backups = sorted(
            (
                os.path.join(LOG_DIR, name)
                for name in os.listdir(LOG_DIR)
                if name.startswith("api_errors_") and name.endswith(".jsonl")
            ),
            key=os.path.getmtime,
            reverse=True,
        )
        for stale in backups[API_ERROR_BACKUP_COUNT:]:
            try:
                os.remove(stale)
            except Exception:
                pass
    except Exception:
        # A failed rotation is not fatal, so it is ignored.
        pass
def append_api_error_log(record: Dict[str, Any]) -> None:
    """Append ``record`` as one JSON line to the API error log.

    The log is rotated first if needed. Any failure (rotation, file access,
    serialisation) is swallowed so that error logging cannot raise.
    """
    try:
        _rotate_if_needed()
        with open(API_ERROR_LOG_PATH, "a", encoding="utf-8") as log_file:
            serialized = json.dumps(record, ensure_ascii=False)
            log_file.write(f"{serialized}\n")
    except Exception:
        pass
def _parse_forwarded_ip(candidate: str) -> str:
    """Return ``candidate`` as a normalised IP address, or "" if it is not one.

    Accepts the shapes proxies put in forwarding headers: a bare address, a
    quoted value, ``ipv4:port`` and ``[ipv6]:port``.
    """
    import ipaddress

    value = candidate.strip().strip('"')
    if value.startswith("["):
        # Bracketed IPv6, optionally followed by ":port".
        value = value[1:].partition("]")[0]
    elif value.count(":") == 1:
        # Exactly one colon can only be "ipv4:port".
        value = value.partition(":")[0]
    try:
        return str(ipaddress.ip_address(value))
    except ValueError:
        return ""


def extract_client_ip(request: Request) -> str:
    """Best-effort client IP for logging.

    Sources are tried in order: the first entry of X-Forwarded-For, then
    X-Real-IP, then the ``for=`` parameter of Forwarded, then the transport
    peer (``request.client.host``). Returns "" when nothing usable is found;
    this function never raises.

    Header values are supplied by the client, so each one is accepted only if
    it parses as an IP address. Anything else (including markup) is skipped
    rather than written to the log, where a viewer might render it.
    NOTE(review): a syntactically valid header can still be spoofed unless a
    trusted proxy overwrites it.
    """
    try:
        headers = request.headers

        xff = headers.get("x-forwarded-for") or headers.get("X-Forwarded-For")
        if xff:
            ip = _parse_forwarded_ip(xff.split(",")[0])
            if ip:
                return ip

        xri = headers.get("x-real-ip") or headers.get("X-Real-IP")
        if xri:
            ip = _parse_forwarded_ip(xri)
            if ip:
                return ip

        fwd = headers.get("forwarded") or headers.get("Forwarded")
        if fwd:
            # Match the parameter name case-insensitively ("for=" / "For=").
            m = re.search(r"for=([^;,\s]+)", fwd, re.IGNORECASE)
            if m:
                ip = _parse_forwarded_ip(m.group(1))
                if ip:
                    return ip

        # The peer address comes from the connection, not from a header, so
        # it is returned as-is.
        if request.client and request.client.host:
            return request.client.host
    except Exception:
        # Deliberately broad: a logging helper must not break the request.
        pass
    return ""
def get_content_length(request: Request) -> int:
    """Return the request's declared Content-Length in bytes.

    Returns 0 when the header is absent, is not an integer, or is negative,
    so callers always receive a non-negative size. Never raises for malformed
    headers.
    """
    try:
        raw = request.headers.get("content-length") or request.headers.get("Content-Length")
        if raw is None:
            return 0
        # A negative length is meaningless; report it as "unknown" (0).
        return max(int(raw), 0)
    except (AttributeError, TypeError, ValueError):
        return 0

View File

@ -0,0 +1,69 @@
"""
경량 모니터링 이벤트(JSONL) 기록/읽기 유틸
- worker 스케일 업/다운
- 세션 생성/회수
"""
from __future__ import annotations
import os
import time
import json
from typing import Dict, Any, List
LOG_DIR = "logs"
os.makedirs(LOG_DIR, exist_ok=True)
EVENT_LOG_PATH = os.path.join(LOG_DIR, "scale_events.jsonl")
MAX_BYTES = 10 * 1024 * 1024 # 10MB
BACKUP = 10
def _rotate_if_needed():
    """Rotate the event log once it grows past MAX_BYTES.

    The active file is renamed to ``scale_events_<YYYYmmdd-HHMMSS>.jsonl`` and
    only the BACKUP most recently modified rotated files are kept. Pruning
    considers only files that carry both the ``scale_events_`` prefix and the
    ``.jsonl`` suffix, so unrelated files sharing the prefix are not deleted.
    File-system errors are ignored: rotation is best-effort.
    """
    try:
        if not (os.path.exists(EVENT_LOG_PATH) and os.path.getsize(EVENT_LOG_PATH) > MAX_BYTES):
            return
        ts = time.strftime("%Y%m%d-%H%M%S")
        os.replace(EVENT_LOG_PATH, os.path.join(LOG_DIR, f"scale_events_{ts}.jsonl"))
        rotated = [
            os.path.join(LOG_DIR, f)
            for f in os.listdir(LOG_DIR)
            if f.startswith("scale_events_") and f.endswith(".jsonl")
        ]
        # Newest first; everything beyond the retention count is removed.
        rotated.sort(key=os.path.getmtime, reverse=True)
        for p in rotated[BACKUP:]:
            try:
                os.remove(p)
            except OSError:
                pass
    except OSError:
        pass
def append_event(event: Dict[str, Any]) -> None:
    """Append ``event`` as one JSON line to the event log.

    A ``timestamp`` (``time.time()``) is added to the written record when the
    event does not already have one. The caller's dict is not modified: the
    record is built from a shallow copy. The log is rotated first if needed.
    All failures are swallowed so event logging cannot raise.
    """
    try:
        # Copy so that stamping the record does not mutate the caller's dict.
        record = dict(event)
        record.setdefault("timestamp", time.time())
        _rotate_if_needed()
        with open(EVENT_LOG_PATH, "a", encoding="utf-8") as f:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
    except Exception:
        # Deliberately broad: event logging is best-effort.
        pass
def read_recent_events(limit: int = 300) -> List[Dict[str, Any]]:
    """Return up to ``limit`` of the most recent events, oldest first.

    Args:
        limit: Maximum number of trailing log lines to consider. Zero or a
            negative value yields an empty list.

    Returns:
        The parsed events in file order. Blank lines and lines that are not
        valid JSON are skipped. An empty list is returned when the log file
        does not exist or cannot be read.
    """
    from collections import deque

    try:
        if limit <= 0:
            # A slice such as lines[-0:] would return every line, so an
            # explicit guard is needed.
            return []
        if not os.path.exists(EVENT_LOG_PATH):
            return []
        with open(EVENT_LOG_PATH, "r", encoding="utf-8") as f:
            # A bounded deque keeps only the last `limit` lines while the file
            # is streamed, instead of loading the whole file into memory.
            tail = deque(f, maxlen=limit)
        events: List[Dict[str, Any]] = []
        for line in tail:
            line = line.strip()
            if not line:
                continue
            try:
                events.append(json.loads(line))
            except (ValueError, RecursionError):
                # Skip a corrupt or partially written line.
                continue
        return events
    except (OSError, TypeError, ValueError):
        return []

191867
logs/main.log

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1 +1 @@
192534
271615

View File

@ -1,7 +1,69 @@
WARNING:root:jtop library not found. Jetson monitoring will be limited. Please run 'sudo pip install jetson-stats'
INFO: Started server process [192741]
INFO: Started server process [271803]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8888 (Press CTRL+C to quit)
INFO: 122.35.47.45:63439 - "WebSocket /ws" [accepted]
INFO: 127.0.0.1:49994 - "GET /api/simple HTTP/1.1" 200 OK
INFO: 118.235.73.64:35921 - "GET /api/logs?lines=50 HTTP/1.1" 200 OK
INFO: 118.235.73.64:35815 - "GET /api/performance-stats HTTP/1.1" 200 OK
INFO: 118.235.73.64:27910 - "GET /api/system-alerts HTTP/1.1" 200 OK
INFO: 118.235.73.64:33163 - "GET /api/model-usage-stats HTTP/1.1" 200 OK
INFO: 118.235.73.64:33780 - "WebSocket /ws" [accepted]
INFO: connection open
INFO: 118.235.73.64:29039 - "GET /favicon.ico HTTP/1.1" 404 Not Found
ERROR:app.monitoring.dashboard:데이터 전송 오류:
INFO: connection closed
INFO: 118.235.73.64:36365 - "WebSocket /ws" [accepted]
ERROR:app.monitoring.dashboard:모델 성능 통계 조회 중 예외 발생: HTTPConnectionPool(host='0.0.0.0', port=8008): Read timed out. (read timeout=2)
INFO: connection open
ERROR:app.monitoring.dashboard:모델 성능 통계 조회 중 예외 발생: HTTPConnectionPool(host='0.0.0.0', port=8008): Read timed out. (read timeout=2)
INFO: 118.235.73.64:29969 - "GET /api/system-alerts HTTP/1.1" 200 OK
INFO: 118.235.73.64:36188 - "GET /api/errors HTTP/1.1" 200 OK
INFO: 118.235.73.64:34776 - "GET /api/model-usage-stats HTTP/1.1" 200 OK
ERROR:app.monitoring.dashboard:모델 성능 통계 조회 중 예외 발생: HTTPConnectionPool(host='0.0.0.0', port=8008): Read timed out. (read timeout=2)
INFO: 118.235.73.64:34776 - "GET /api/system-alerts HTTP/1.1" 200 OK
ERROR:app.monitoring.dashboard:모델 성능 통계 조회 중 예외 발생: HTTPConnectionPool(host='0.0.0.0', port=8008): Read timed out. (read timeout=2)
INFO: 118.235.73.64:36188 - "GET /api/errors HTTP/1.1" 200 OK
INFO: 122.35.47.45:52268 - "GET / HTTP/1.1" 200 OK
INFO: 118.235.73.64:36188 - "GET /api/logs?lines=50 HTTP/1.1" 200 OK
INFO: 118.235.73.64:32011 - "GET /api/model-usage-stats HTTP/1.1" 200 OK
INFO: 118.235.73.64:29864 - "GET /api/performance-stats HTTP/1.1" 200 OK
INFO: 118.235.73.64:33588 - "GET /api/system-alerts HTTP/1.1" 200 OK
INFO: 118.235.73.64:31627 - "GET /api/errors HTTP/1.1" 200 OK
INFO: 118.235.73.64:32554 - "GET /api/system-alerts HTTP/1.1" 200 OK
INFO: 118.235.73.64:36432 - "GET /api/errors HTTP/1.1" 200 OK
INFO: 118.235.73.64:36432 - "GET /api/model-usage-stats HTTP/1.1" 200 OK
INFO: 118.235.73.64:36432 - "GET /api/system-alerts HTTP/1.1" 200 OK
INFO: 118.235.73.64:31023 - "GET /api/errors HTTP/1.1" 200 OK
ERROR:app.monitoring.dashboard:모델 성능 통계 조회 중 예외 발생: HTTPConnectionPool(host='0.0.0.0', port=8008): Read timed out. (read timeout=2)
ERROR:app.monitoring.dashboard:모델 성능 통계 조회 중 예외 발생: HTTPConnectionPool(host='0.0.0.0', port=8008): Read timed out. (read timeout=2)
INFO: 118.235.73.64:30785 - "GET /api/model-usage-stats HTTP/1.1" 200 OK
INFO: 118.235.73.64:35188 - "GET /api/system-alerts HTTP/1.1" 200 OK
INFO: 118.235.73.64:30117 - "GET /api/performance-stats HTTP/1.1" 200 OK
INFO: 118.235.73.64:30102 - "GET /api/logs?lines=50 HTTP/1.1" 200 OK
INFO: 118.235.73.64:36300 - "GET /api/errors HTTP/1.1" 200 OK
INFO: 118.235.73.64:30102 - "GET / HTTP/1.1" 200 OK
INFO: connection closed
INFO: 118.235.73.64:30102 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 118.235.73.64:34126 - "GET /favicon.ico HTTP/1.1" 404 Not Found
ERROR:app.monitoring.dashboard:데이터 전송 오류:
INFO: 118.235.73.64:27884 - "GET / HTTP/1.1" 200 OK
WARNING: Invalid HTTP request received.
WARNING: Invalid HTTP request received.
INFO: 118.235.73.64:30128 - "GET / HTTP/1.1" 200 OK
INFO: 118.235.73.64:30128 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 118.235.73.64:33038 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 118.235.73.64:34281 - "GET /favicon.ico HTTP/1.1" 404 Not Found
INFO: 118.235.73.64:28205 - "GET / HTTP/1.1" 200 OK
INFO: 118.235.73.64:28205 - "GET / HTTP/1.1" 200 OK
INFO: 118.235.73.64:30339 - "GET / HTTP/1.1" 200 OK
INFO: 118.235.73.64:30775 - "GET / HTTP/1.1" 200 OK
INFO: 118.235.73.64:32870 - "GET / HTTP/1.1" 200 OK
INFO: 118.235.73.64:28161 - "GET / HTTP/1.1" 200 OK
INFO: 118.235.73.64:33203 - "GET / HTTP/1.1" 200 OK
INFO: 118.235.73.64:37097 - "GET / HTTP/1.1" 200 OK
INFO: 118.235.73.64:36624 - "GET / HTTP/1.1" 200 OK
INFO: 118.235.73.64:30357 - "GET / HTTP/1.1" 200 OK
INFO: 118.235.73.64:35740 - "GET / HTTP/1.1" 200 OK
INFO: 122.35.47.45:50214 - "GET / HTTP/1.1" 200 OK
INFO: 122.35.47.45:50214 - "GET / HTTP/1.1" 200 OK

View File

@ -1 +1 @@
192741
271803

93
main.py
View File

@ -7,9 +7,11 @@ import time
import logging
import json
import asyncio
import re
from contextlib import asynccontextmanager
from collections import defaultdict, deque
from fastapi import FastAPI, Request, Response
from fastapi import HTTPException
from fastapi.middleware.cors import CORSMiddleware
import uvicorn
@ -21,6 +23,11 @@ from app.monitoring.dashboard import monitor_app
from app.core.batch_manager import batch_manager
# from app.utils.background_task import manage_state_background # TODO: 경로 확인 필요
from app.utils.discord_notifier import send_discord_notification
from app.utils.api_error_log import (
append_api_error_log,
extract_client_ip,
get_content_length,
)
# 로깅 설정
import logging.handlers
@ -177,6 +184,7 @@ api_stats = APIStatsCollector()
# - 주기적 폴링으로 인해 실제 비즈니스 엔드포인트 통계를 왜곡시키지 않기 위함
EXCLUDED_ENDPOINTS = {
"/api/v1/health",
"/health",
"/docs",
"/openapi.json",
"/redoc",
@ -196,34 +204,19 @@ API_ERROR_LOG_PATH = os.path.join(log_dir, "api_errors.jsonl")
API_ERROR_MAX_BYTES = 10 * 1024 * 1024 # 10MB
API_ERROR_BACKUP_COUNT = 5
def _rotate_api_error_log_if_needed():
try:
if os.path.exists(API_ERROR_LOG_PATH) and os.path.getsize(API_ERROR_LOG_PATH) >= API_ERROR_MAX_BYTES:
ts = time.strftime("%Y%m%d-%H%M%S")
rotated_path = os.path.join(log_dir, f"api_errors_{ts}.jsonl")
os.replace(API_ERROR_LOG_PATH, rotated_path)
# 오래된 로테이션 파일 정리 (최신 N개만 유지)
rotated = [
os.path.join(log_dir, f) for f in os.listdir(log_dir)
if f.startswith("api_errors_") and f.endswith(".jsonl")
]
rotated.sort(key=lambda p: os.path.getmtime(p), reverse=True)
for old in rotated[API_ERROR_BACKUP_COUNT:]:
try:
os.remove(old)
except Exception:
pass
except Exception as e: # pragma: no cover
logger.warning(f"API 에러 로그 로테이션 실패: {e}")
def _append_api_error_log(record: dict):
"""에러 전용 JSONL 로그에 한 줄 추가"""
try:
_rotate_api_error_log_if_needed()
with open(API_ERROR_LOG_PATH, "a", encoding="utf-8") as f:
f.write(json.dumps(record, ensure_ascii=False) + "\n")
except Exception as e: # pragma: no cover
logger.warning(f"API 에러 로그 기록 실패: {e}")
def _append_error_record(request: Request, status: int, response_time: float, error: str | None = None):
    """Write one API error entry, including client IP and declared body size.

    Args:
        request: The request that failed.
        status: HTTP status code to record.
        response_time: Elapsed time in seconds; stored as whole milliseconds.
        error: Optional error message; stored as-is (None when omitted).
    """
    ip = extract_client_ip(request)
    body_size = get_content_length(request)
    elapsed_ms = int(response_time * 1000)
    record = dict(
        timestamp=time.time(),
        method=request.method,
        path=request.url.path,
        status=status,
        error=error,
        response_time_ms=elapsed_ms,
        client_ip=ip,
        content_length=body_size,
    )
    append_api_error_log(record)
async def save_status_periodically():
"""주기적으로 워커와 세션 상태를 파일에 저장합니다."""
@ -346,6 +339,31 @@ app = FastAPI(
lifespan=lifespan
)
# Upload size pre-check middleware (based on the declared Content-Length).
@app.middleware("http")
async def content_length_guard(request: Request, call_next):
    # Rejects POST/PUT/PATCH requests whose declared Content-Length exceeds
    # the configured limit with a 413 JSON response; everything else is passed
    # to the next handler.
    #
    # The guard fails open: if the check itself errors, the request proceeds.
    # The downstream call is made outside the try block so an exception raised
    # by the application propagates once instead of triggering a second
    # call_next invocation.
    rejection = None
    try:
        # Only methods that are likely to carry an upload are checked.
        if request.method in {"POST", "PUT", "PATCH"}:
            limit = settings.MAX_FILE_SIZE
            # The setting has historically been given either in bytes or in
            # MB. A value below 1 MiB is treated as MB and converted to bytes.
            # NOTE(review): confirm the unit of settings.MAX_FILE_SIZE; the
            # earlier comment mentioned a 1e4 cut-off while the code compared
            # against 1024 * 1024, which is kept here.
            if limit < 1024 * 1024:
                limit = limit * 1024 * 1024
            content_length = get_content_length(request)
            if content_length and content_length > limit:
                rejection = Response(
                    content=json.dumps({
                        "detail": f"Request body too large: {content_length} bytes > limit {limit} bytes"
                    }),
                    status_code=413,
                    media_type="application/json",
                )
    except Exception:
        rejection = None
    if rejection is not None:
        return rejection
    return await call_next(request)
# API 통계 수집 미들웨어
@app.middleware("http")
async def collect_api_stats(request: Request, call_next):
@ -375,15 +393,9 @@ async def collect_api_stats(request: Request, call_next):
# 통계 업데이트
api_stats.end_request(endpoint, success, response_time)
# 4xx/5xx는 에러 로그 파일에 기록
# 4xx/5xx는 에러 로그 파일에 기록 (클라이언트 IP 포함)
if not success:
_append_api_error_log({
"timestamp": time.time(),
"method": request.method,
"path": path,
"status": response.status_code,
"response_time_ms": int(response_time * 1000)
})
_append_error_record(request, response.status_code, response_time)
return response
@ -391,14 +403,7 @@ async def collect_api_stats(request: Request, call_next):
# 에러 발생 시
response_time = time.time() - start_time
api_stats.end_request(endpoint, False, response_time, str(e))
_append_api_error_log({
"timestamp": time.time(),
"method": request.method,
"path": path,
"status": 500,
"error": str(e),
"response_time_ms": int(response_time * 1000)
})
_append_error_record(request, 500, response_time, str(e))
raise
# CORS 미들웨어 추가

View File

@ -1,19 +1,47 @@
{
"worker_status": {
"running": true,
"total_workers": 2,
"total_workers": 6,
"queue_size": 0,
"workers_by_status": {
"idle": [
{
"id": "worker_eac127f0",
"id": "worker_3e3d5864",
"status": "idle",
"task_count": 0,
"error_count": 0,
"last_task_at": null
},
{
"id": "worker_3fb51aaf",
"id": "worker_7373b97c",
"status": "idle",
"task_count": 0,
"error_count": 0,
"last_task_at": null
},
{
"id": "worker_296ab3a4",
"status": "idle",
"task_count": 0,
"error_count": 0,
"last_task_at": null
},
{
"id": "worker_891dcc94",
"status": "idle",
"task_count": 0,
"error_count": 0,
"last_task_at": null
},
{
"id": "worker_411ad2e8",
"status": "idle",
"task_count": 0,
"error_count": 0,
"last_task_at": null
},
{
"id": "worker_ab487451",
"status": "idle",
"task_count": 0,
"error_count": 0,
@ -28,64 +56,75 @@
},
"session_status": {
"simple_lama": {
"min": 2,
"max": 4,
"total": 2,
"min": 4,
"max": 8,
"total": 4,
"in_use": 0,
"available": 2
"available": 4
},
"migan": {
"min": 1,
"max": 4,
"max": 8,
"total": 1,
"in_use": 0,
"available": 1
},
"rembg": {
"min": 1,
"max": 3,
"max": 4,
"total": 1,
"in_use": 0,
"available": 1
}
},
"api_stats": {
"total_requests": 0,
"successful_requests": 0,
"total_requests": 13467,
"successful_requests": 13467,
"failed_requests": 0,
"success_rate": 0.0,
"endpoint_usage": {},
"success_rate": 100.0,
"endpoint_usage": {
"GET /api/v1/model": 6740,
"POST /api/v1/inpaint": 6264,
"POST /api/v1/run_plugin_gen_image": 463
},
"endpoint_stats": {
"POST /api/v1/inpaint": {
"count": 0,
"avg_time": 0.0,
"min_time": 0.0,
"max_time": 0.0,
"GET /api/v1/model": {
"count": 6740,
"avg_time": 0.001540846824645996,
"min_time": 0.00063323974609375,
"max_time": 0.004244089126586914,
"current_concurrent": 0
},
"POST /api/v1/inpaint": {
"count": 6264,
"avg_time": 0.5280383849143981,
"min_time": 0.2597086429595947,
"max_time": 1.651228666305542,
"current_concurrent": 5
},
"POST /api/v1/run_plugin_gen_image": {
"count": 463,
"avg_time": 0.4474348998069763,
"min_time": 0.1340315341949463,
"max_time": 2.5062549114227295,
"current_concurrent": 1
},
"POST /api/v1/remove_bg": {
"count": 0,
"avg_time": 0.0,
"min_time": 0.0,
"max_time": 0.0,
"current_concurrent": 0
},
"POST /api/v1/run_plugin_gen_image": {
"count": 0,
"avg_time": 0.0,
"min_time": 0.0,
"max_time": 0.0,
"current_concurrent": 0
}
},
"average_response_time": 0,
"min_response_time": 0,
"max_response_time": 0,
"current_concurrent": 0,
"max_concurrent": 0,
"requests_per_second": 0.0,
"uptime": 5.007766008377075,
"average_response_time": 0.2757347109317779,
"min_response_time": 0.0005586147308349609,
"max_response_time": 2.8400533199310303,
"current_concurrent": 6,
"max_concurrent": 12,
"requests_per_second": 1.3742163667312552,
"uptime": 9799.76685333252,
"recent_errors": []
},
"timestamp": 1759306699.0522153
"timestamp": 1759375540.6194317
}