inpaintServer/app/utils/gpu_monitor.py

623 lines
27 KiB
Python

"""
GPU 메모리 모니터링 유틸리티
Jetson Xavier와 x86 시스템을 모두 지원합니다.
"""
import asyncio
import logging
import os
import re
import shutil
import subprocess
from typing import Dict, Optional, List
from typing import Any, Tuple

import psutil

try:
    import jtop
    JTOP_AVAILABLE = True
except ImportError:
    JTOP_AVAILABLE = False
    logging.warning("jtop library not found. Jetson monitoring will be limited. Please run 'sudo pip install jetson-stats'")

try:
    import pynvml
    NVML_AVAILABLE = True
except ImportError:
    NVML_AVAILABLE = False
    logging.warning("pynvml not available. GPU monitoring will be limited.")

from ..core.config import settings
logger = logging.getLogger(__name__)
class JetsonMonitor:
    """Collect memory, load, clock, thermal and power data on a Jetson board.

    Data sources are tried in order of preference: the ``jtop`` service (when
    the library imported successfully), sysfs/debugfs files, the
    ``tegrastats`` command line tool, and finally ``psutil``.

    Units of ``get_gpu_memory_info`` depend on which source answered: the
    jtop and psutil branches return the raw/MiB figures, every other fallback
    returns GiB rounded to two decimals. All branches return the same keys:
    ``total``, ``used``, ``free``, ``usage_percent`` and ``free_ratio``.
    """

    # Named power modes accepted by set_power_mode; numeric IDs are accepted too.
    _POWER_MODE_NAMES = ("MAXN", "5W", "10W", "15W")

    # Candidate sysfs/debugfs files probed for the GPU load, in order.
    # NOTE(review): /sys/devices/gpu.0/load is commonly reported in per-mille
    # (0-1000) on Jetson; the value is currently only clamped to 100 — confirm.
    _GPU_LOAD_PATHS = (
        "/sys/devices/gpu.0/load",
        "/sys/kernel/debug/gpu/load",
        "/sys/class/devfreq/17000000.gv11b/load",
    )

    # tegrastats never exits on its own, so it is wrapped in `timeout 3`.
    _TEGRASTATS_CMD = ("timeout", "3", "tegrastats", "--interval", "500")

    # "RAM 1024/8192MB" -> used / total in MB.
    _RAM_RE = re.compile(r"\bRAM\s+(\d+(?:\.\d+)?)/(\d+(?:\.\d+)?)MB")
    # "GR3D_FREQ 45%" -> GPU load in percent.
    _GR3D_RE = re.compile(r"GR3D_FREQ\s+(\d+(?:\.\d+)?)%")

    def __init__(self):
        """Set the sysfs base paths and start a jtop session when available."""
        self.jetson_clocks_path = "/sys/kernel/debug/clk"
        self.jetson_thermal_path = "/sys/devices/virtual/thermal"
        self.jetson_power_path = "/sys/kernel/debug/tegra_pcie/pcie_power"
        self._jtop = None
        if JTOP_AVAILABLE:
            try:
                self._jtop = jtop.jtop()
                self._jtop.start()
            except Exception as e:
                logger.error(f"Failed to initialize jtop: {e}")
                self._jtop = None

    def __del__(self):
        """Close the jtop session if it is still running."""
        try:
            if self._jtop_running():
                self._jtop.close()
        except Exception:
            # Finalizers must never raise; there is nothing left to clean up.
            pass

    # ------------------------------------------------------------------
    # internal helpers
    # ------------------------------------------------------------------
    def _jtop_running(self) -> bool:
        """Return True when a jtop session exists and reports itself alive.

        ``ok`` is a method on the jtop object, so it has to be *called*; the
        bare attribute is always truthy. ``spin=True`` requests the
        non-blocking form of the check.
        """
        session = getattr(self, "_jtop", None)
        if session is None:
            return False
        try:
            return bool(session.ok(spin=True))
        except Exception:
            return False

    @staticmethod
    def _memory_stats(total: float, used: float) -> Dict[str, float]:
        """Build the common memory dict from a total/used pair (same unit)."""
        free = total - used
        has_total = total > 0
        return {
            "total": round(total, 2),
            "used": round(used, 2),
            "free": round(free, 2),
            "usage_percent": round(used / total * 100, 2) if has_total else 0,
            "free_ratio": round(free / total, 4) if has_total else 0,
        }

    @classmethod
    def _parse_tegrastats_ram(cls, output: str) -> Optional[Tuple[float, float]]:
        """Return ``(used_mb, total_mb)`` from the last RAM field, or None."""
        matches = cls._RAM_RE.findall(output)
        if not matches:
            return None
        used_mb, total_mb = matches[-1]
        return float(used_mb), float(total_mb)

    @classmethod
    def _parse_gpu_utilization(cls, output: str) -> Optional[float]:
        """Return the first GR3D_FREQ percentage (capped at 100), or None."""
        match = cls._GR3D_RE.search(output)
        if match is None:
            return None
        return min(float(match.group(1)), 100.0)

    @staticmethod
    def _parse_debug_gpu_memory(content: str) -> Optional[Tuple[float, float]]:
        """Return ``(total_mb, used_mb)`` from debugfs text, or None.

        Looks for lines containing "Total"/"Used" together with "MB" and
        reads the second-to-last whitespace-separated token as the number.
        """
        total_mb = 0.0
        used_mb = 0.0
        for line in content.split('\n'):
            if "MB" not in line:
                continue
            is_total = "Total" in line
            if not is_total and "Used" not in line:
                continue
            try:
                value = float(line.split()[-2])
            except (ValueError, IndexError):
                continue
            if is_total:
                total_mb = value
            else:
                used_mb = value
        if total_mb > 0:
            return total_mb, used_mb
        return None

    @staticmethod
    def _log_nvidia_gpu_sysfs() -> None:
        """Log whatever /sys/class/nvidia-gpu exposes (diagnostic only).

        The values are only written to the debug log; nothing is parsed from
        them, so this probe never contributes a result.
        """
        sysfs_root = "/sys/class/nvidia-gpu"
        if not os.path.exists(sysfs_root):
            return
        try:
            gpu_devices = [d for d in os.listdir(sysfs_root) if d.startswith("nvidia")]
        except OSError as e:
            logger.debug(f"Jetson GPU class read failed: {e}")
            return
        if not gpu_devices:
            return
        logger.debug(f"Jetson GPU devices found: {gpu_devices}")
        for device in gpu_devices:
            for mem_file in ("total_memory", "memory_used", "memory_free", "memory_usage"):
                file_path = f"{sysfs_root}/{device}/{mem_file}"
                if not os.path.exists(file_path):
                    continue
                try:
                    with open(file_path, "r") as f:
                        value = f.read().strip()
                except (OSError, ValueError) as e:
                    logger.debug(f"Failed to read {file_path}: {e}")
                    continue
                logger.debug(f"{mem_file}: {value}")

    def _memory_from_jtop(self) -> Optional[Dict[str, float]]:
        """Return memory figures from the jtop stats, or None if unusable."""
        if not self._jtop_running():
            return None
        try:
            ram = self._jtop.stats.get('RAM', {})
            # NOTE(review): some jetson-stats releases appear to publish a
            # scalar under stats['RAM'] instead of a mapping — confirm and
            # switch to the proper memory accessor if so.
            if not isinstance(ram, dict):
                return None
            total_mb = ram.get('tot', 0)
            used_mb = ram.get('use', 0)
            if total_mb > 0:
                return {
                    "total": total_mb,
                    "used": used_mb,
                    "free": total_mb - used_mb,
                    "usage_percent": (used_mb / total_mb) * 100,
                    "free_ratio": (total_mb - used_mb) / total_mb
                }
        except Exception as e:
            logger.warning(f"Failed to get memory info from jtop: {e}")
        return None

    def _memory_from_debugfs(self) -> Optional[Dict[str, float]]:
        """Return memory figures (GiB) from the GPU debugfs file, or None."""
        debug_path = "/sys/kernel/debug/gpu/memory"
        if not os.path.exists(debug_path):
            return None
        try:
            with open(debug_path, "r") as f:
                content = f.read()
            logger.debug(f"GPU memory debug info: {content}")
            parsed = self._parse_debug_gpu_memory(content)
            if parsed is not None:
                total_mb, used_mb = parsed
                return self._memory_stats(total_mb / 1024, used_mb / 1024)
        except Exception as e:
            logger.debug(f"GPU memory debug read failed: {e}")
        return None

    def _run_tegrastats(self) -> Optional[str]:
        """Run tegrastats for about three seconds and return its stdout.

        Returns None when the command exits with an unexpected status. Exit
        status 124 is what ``timeout`` reports after stopping the tool and is
        therefore treated as success. Exceptions propagate to the caller.
        """
        result = subprocess.run(list(self._TEGRASTATS_CMD),
                                capture_output=True, text=True, timeout=10)
        if result.returncode in (0, 124):
            return result.stdout
        return None

    def _tegrastats_available(self) -> bool:
        """Return True when a ``tegrastats`` executable is on PATH."""
        return shutil.which("tegrastats") is not None

    def _read_clock_mhz(self, clock_name: str, label: str) -> Optional[int]:
        """Read ``<clocks>/<clock_name>/clk_rate`` (Hz) and return MHz."""
        rate_file = f"{self.jetson_clocks_path}/{clock_name}/clk_rate"
        try:
            if os.path.exists(rate_file):
                with open(rate_file, "r") as f:
                    return int(f.read().strip()) // 1000000  # Hz to MHz
        except Exception as e:
            logger.debug(f"{label} frequency read failed: {e}")
        return None

    # ------------------------------------------------------------------
    # memory / utilization
    # ------------------------------------------------------------------
    def get_gpu_memory_info(self) -> Dict[str, float]:
        """Return GPU memory usage from the first data source that answers.

        Returns:
            Dict with ``total``, ``used``, ``free``, ``usage_percent`` and
            ``free_ratio``. See the class docstring for the unit caveat.
            All values are 0 when every source raised.
        """
        jtop_info = self._memory_from_jtop()
        if jtop_info is not None:
            return jtop_info
        try:
            # 1. sysfs class entries: logged for diagnostics only.
            self._log_nvidia_gpu_sysfs()
            # 2. debugfs (needs read permission on /sys/kernel/debug).
            debug_info = self._memory_from_debugfs()
            if debug_info is not None:
                return debug_info
            # 3. tegrastats command line tool.
            if self._tegrastats_available():
                return self._get_memory_from_tegrastats()
            # 4. Device-tree node present: report a fixed 8 GB, nothing used.
            # NOTE(review): these are placeholder numbers, not measurements.
            if os.path.exists("/sys/firmware/devicetree/base/gv11b"):
                logger.info("GV11B GPU (Jetson Xavier) 감지됨")
                return self._memory_stats(8.0, 0.0)
            # 5. Last resort: system RAM as seen by psutil, in MiB.
            logger.warning("GPU 메모리 정보를 가져올 수 없습니다. 기본값을 사용합니다.")
            mem = psutil.virtual_memory()
            return {
                "total": mem.total / 1024**2,
                "used": mem.used / 1024**2,
                "free": mem.available / 1024**2,
                "usage_percent": mem.percent,
                "free_ratio": mem.available / mem.total
            }
        except Exception as e:
            logger.error(f"Jetson GPU memory read failed: {e}")
            return {"total": 0, "used": 0, "free": 0, "usage_percent": 0, "free_ratio": 0}

    def _get_memory_from_tegrastats(self) -> Dict[str, float]:
        """Return RAM usage in GiB parsed from tegrastats output.

        Falls back to a fixed 8 GB total with nothing used when the tool
        fails or prints no RAM field (placeholder values, not measurements).
        """
        total_gb = 8.0
        used_gb = 0.0
        try:
            output = self._run_tegrastats()
            parsed = self._parse_tegrastats_ram(output) if output else None
            if parsed is not None:
                used_mb, total_mb = parsed
                used_gb = used_mb / 1024
                total_gb = total_mb / 1024
        except Exception as e:
            logger.debug(f"tegrastats parsing failed: {e}")
        return self._memory_stats(total_gb, used_gb)

    def get_gpu_utilization(self) -> float:
        """Return the GPU load in percent (0.0 when it cannot be determined).

        tegrastats' GR3D_FREQ field is preferred; otherwise the first
        readable file from ``_GPU_LOAD_PATHS`` is used.
        """
        try:
            if self._tegrastats_available():
                output = self._run_tegrastats()
                if output is not None:
                    logger.debug(f"tegrastats output: {output}")
                    usage = self._parse_gpu_utilization(output)
                    if usage is not None:
                        logger.debug(f"Parsed GPU utilization: {usage}%")
                        return usage
            for load_path in self._GPU_LOAD_PATHS:
                if not os.path.exists(load_path):
                    continue
                try:
                    with open(load_path, "r") as f:
                        load_str = f.read().strip()
                    # Take the first run of digits in the file.
                    numbers = re.findall(r'\d+', load_str)
                    if numbers:
                        load = float(numbers[0])
                        logger.debug(f"GPU load from {load_path}: {load}")
                        return min(load, 100.0)
                except Exception as e:
                    logger.debug(f"Failed to read {load_path}: {e}")
                    continue
            logger.debug("No GPU utilization found, returning 0.0")
            return 0.0
        except Exception as e:
            logger.debug(f"Jetson GPU utilization read failed: {e}")
            return 0.0

    # ------------------------------------------------------------------
    # clocks / thermal / power
    # ------------------------------------------------------------------
    def get_gpu_frequency(self) -> Optional[int]:
        """Return the GPU clock in MHz, or None when it cannot be read."""
        return self._read_clock_mhz("gpcclk", "GPU")

    def get_cpu_frequency(self) -> Optional[int]:
        """Return the CPU clock in MHz, or None when it cannot be read."""
        # NOTE(review): "cpu_gpcclk" is the clock name used by the original
        # code — confirm that this node exists on the target board.
        return self._read_clock_mhz("cpu_gpcclk", "CPU")

    def get_memory_frequency(self) -> Optional[int]:
        """Return the memory (EMC) clock in MHz, or None when unreadable."""
        return self._read_clock_mhz("emc", "Memory")

    def get_temperature(self) -> Dict[str, float]:
        """Return ``{zone_name: degrees}`` for every readable thermal zone.

        Each ``thermal_zone*`` directory is read independently, so a zone
        whose ``temp`` file is unreadable or malformed is skipped instead of
        discarding the zones already collected.
        """
        temps: Dict[str, float] = {}
        if not os.path.exists(self.jetson_thermal_path):
            return temps
        try:
            entries = os.listdir(self.jetson_thermal_path)
        except OSError as e:
            logger.debug(f"Temperature read failed: {e}")
            return temps
        for item in entries:
            if not item.startswith("thermal_zone"):
                continue
            temp_file = f"{self.jetson_thermal_path}/{item}/temp"
            if not os.path.exists(temp_file):
                continue
            try:
                with open(temp_file, "r") as f:
                    millidegrees = int(f.read().strip())
            except (OSError, ValueError) as e:
                logger.debug(f"Temperature read failed: {e}")
                continue
            # Key format kept as-is for callers ("thermal_zone0" -> "zone_zone0").
            zone_name = f"zone_{item.split('_')[-1]}"
            temps[zone_name] = millidegrees / 1000.0  # mC to C
        return temps

    def get_power_consumption(self) -> Optional[float]:
        """Return the board power draw in watts, or None when unavailable."""
        power_file = "/sys/bus/i2c/devices/1-0040/iio_device/in_power0_input"
        try:
            if os.path.exists(power_file):
                with open(power_file, "r") as f:
                    return float(f.read().strip()) / 1000.0  # mW to W
        except Exception as e:
            logger.debug(f"Power consumption read failed: {e}")
        return None

    def set_power_mode(self, mode: str) -> bool:
        """Switch the power mode through ``sudo nvpmodel -m <mode>``.

        Args:
            mode: one of "MAXN", "5W", "10W", "15W", or a numeric mode ID
                given as a digit string (e.g. "0").

        Returns:
            True when nvpmodel exited with status 0, otherwise False.
        """
        # NOTE(review): nvpmodel documents -m as taking a numeric mode ID, so
        # the named modes may be rejected by the tool — confirm on the board.
        try:
            is_numeric_id = isinstance(mode, str) and mode.isdigit()
            if mode in self._POWER_MODE_NAMES or is_numeric_id:
                result = subprocess.run(
                    ["sudo", "nvpmodel", "-m", mode],
                    capture_output=True,
                    text=True,
                    timeout=10
                )
                if result.returncode == 0:
                    logger.info(f"Power mode set to {mode}")
                    return True
                logger.warning(f"Failed to set power mode: {result.stderr}")
            else:
                logger.error(f"Invalid power mode: {mode}")
        except Exception as e:
            logger.error(f"Power mode setting failed: {e}")
        return False

    def set_fan_speed(self, speed: int) -> bool:
        """Write a PWM value (0-255) to the fan control file.

        Returns:
            True when the value was written, False when it is out of range,
            the control file is missing, or the write failed.
        """
        try:
            if 0 <= speed <= 255:
                fan_path = "/sys/devices/pwm-fan/target_pwm"
                if os.path.exists(fan_path):
                    with open(fan_path, "w") as f:
                        f.write(str(speed))
                    logger.info(f"Fan speed set to {speed}")
                    return True
                logger.warning("Fan control not available")
            else:
                logger.error(f"Invalid fan speed: {speed}")
        except Exception as e:
            logger.error(f"Fan speed setting failed: {e}")
        return False

    def get_jetson_info(self) -> Dict[str, Any]:
        """Return clocks, temperatures, power draw and power mode in one dict."""
        return {
            "gpu_frequency": self.get_gpu_frequency(),
            "cpu_frequency": self.get_cpu_frequency(),
            "memory_frequency": self.get_memory_frequency(),
            "temperature": self.get_temperature(),
            "power_consumption": self.get_power_consumption(),
            "power_mode": self._get_current_power_mode()
        }

    def _get_current_power_mode(self) -> str:
        """Return the mode name printed by ``nvpmodel -q`` or "Unknown"."""
        try:
            result = subprocess.run(
                ["nvpmodel", "-q"],
                capture_output=True,
                text=True,
                timeout=5
            )
            if result.returncode == 0:
                for line in result.stdout.split('\n'):
                    if 'NV Power Mode:' in line:
                        return line.split(':')[-1].strip()
        except Exception as e:
            logger.debug(f"Power mode query failed: {e}")
        return "Unknown"
class GPUMonitor:
    """Platform-independent front end for GPU statistics.

    On Jetson (``settings.IS_JETSON``) every query is delegated to a
    ``JetsonMonitor``; otherwise NVML (``pynvml``) is used when it could be
    imported and initialised. All getters return zeroed values instead of
    raising when no backend is usable.
    """

    def __init__(self):
        """Select the backend and initialise it."""
        self.initialized = False
        self.is_jetson = settings.IS_JETSON
        self.jetson_monitor = JetsonMonitor() if self.is_jetson else None
        if NVML_AVAILABLE and not self.is_jetson:
            try:
                pynvml.nvmlInit()
                self.initialized = True
                logger.info("GPU monitoring initialized successfully")
            except Exception as e:
                logger.error(f"Failed to initialize GPU monitoring: {e}")
        elif self.is_jetson:
            logger.info("Jetson Xavier mode detected - using Jetson-specific monitoring")
            self.initialized = True

    @staticmethod
    def _empty_memory_info(unit: str) -> Dict[str, Any]:
        """Return the all-zero memory record for the given unit label."""
        return {"total": 0, "used": 0, "free": 0, "usage_percent": 0, "free_ratio": 0, "unit": unit}

    def get_gpu_memory_info(self, device_id: int = 0) -> Dict[str, Any]:
        """Return GPU memory usage in a uniform shape.

        Args:
            device_id: NVML device index (ignored on Jetson).

        Returns:
            Dict with ``total``, ``used``, ``free``, ``usage_percent``,
            ``free_ratio`` and ``unit`` ("MiB" on Jetson, "GiB" with NVML).
        """
        if self.is_jetson:
            info = self.jetson_monitor.get_gpu_memory_info() if self.jetson_monitor else None
            if not info:
                return self._empty_memory_info("MiB")
            total = info.get("total", 0)
            used = info.get("used", 0)
            # Derive free space when the backend did not report it, so that
            # free_ratio is not silently computed from 0.
            free = info.get("free", total - used)
            # Some Jetson fallbacks report GB. A total below 100 is taken to
            # be GB (heuristic) and converted so the result is always MiB.
            if total and total < 100:
                total, used, free = total * 1024, used * 1024, free * 1024
            usage_percent = info.get("usage_percent", (used / total * 100) if total else 0)
            free_ratio = info.get("free_ratio", (free / total) if total else 0)
            return {
                "total": total,
                "used": used,
                "free": free,
                "usage_percent": round(float(usage_percent), 2),
                "free_ratio": round(float(free_ratio), 4),
                "unit": "MiB",
            }
        if not self.initialized or not NVML_AVAILABLE:
            return self._empty_memory_info("GiB")
        try:
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
            total_gib = mem_info.total / 1024**3
            used_gib = mem_info.used / 1024**3
            free_gib = mem_info.free / 1024**3
            usage_percent = (used_gib / total_gib) * 100 if total_gib else 0
            return {
                "total": round(total_gib, 2),
                "used": round(used_gib, 2),
                "free": round(free_gib, 2),
                "usage_percent": round(usage_percent, 2),
                "free_ratio": round((free_gib / total_gib), 4) if total_gib else 0,
                "unit": "GiB",
            }
        except Exception as e:
            logger.error(f"Error getting GPU memory info: {e}")
            return self._empty_memory_info("GiB")

    def get_gpu_utilization(self, device_id: int = 0) -> float:
        """Return the GPU load in percent (0.0 when no backend is usable)."""
        if self.is_jetson:
            return self.jetson_monitor.get_gpu_utilization()
        if not self.initialized or not NVML_AVAILABLE:
            return 0.0
        try:
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            util = pynvml.nvmlDeviceGetUtilizationRates(handle)
            return float(util.gpu)
        except Exception as e:
            logger.error(f"Error getting GPU utilization: {e}")
            return 0.0

    def get_system_memory_info(self) -> Dict[str, float]:
        """Return host RAM usage in GiB (two decimals) plus the usage percent."""
        mem = psutil.virtual_memory()
        return {
            "total": round(mem.total / 1024**3, 2),
            "used": round(mem.used / 1024**3, 2),
            "free": round(mem.free / 1024**3, 2),
            "usage_percent": round(mem.percent, 2)
        }

    def get_jetson_specific_info(self) -> Dict[str, Any]:
        """Return the Jetson-only details, or an empty dict off Jetson."""
        if not self.is_jetson or not self.jetson_monitor:
            return {}
        return self.jetson_monitor.get_jetson_info()

    def should_scale_up(self, vram_usage: float, threshold: float) -> bool:
        """Return True when ``vram_usage`` is strictly below ``threshold``."""
        return vram_usage < threshold

    def should_scale_down(self, vram_usage: float, threshold: float) -> bool:
        """Return True when ``vram_usage`` is strictly above ``threshold``."""
        return vram_usage > threshold

    def optimize_for_jetson(self) -> bool:
        """Apply the configured power mode and a temperature-based fan speed.

        Returns:
            False off Jetson or when an exception occurred, True otherwise.
            The results of the individual power/fan calls are not checked.
        """
        if not self.is_jetson or not self.jetson_monitor:
            return False
        try:
            power_mode = settings.JETSON_POWER_MODE
            if power_mode != "MAXN":
                self.jetson_monitor.set_power_mode(power_mode)
            if settings.JETSON_FAN_CONTROL:
                # Pick the fan speed from the hottest thermal zone.
                temps = self.jetson_monitor.get_temperature()
                max_temp = max(temps.values()) if temps else 0
                if max_temp > settings.JETSON_TEMP_THRESHOLD:
                    self.jetson_monitor.set_fan_speed(255)  # full speed
                elif max_temp > 60:
                    self.jetson_monitor.set_fan_speed(128)  # medium speed
                else:
                    self.jetson_monitor.set_fan_speed(64)  # low speed
            logger.info("Jetson optimization completed")
            return True
        except Exception as e:
            logger.error(f"Jetson optimization failed: {e}")
            return False

    def get_comprehensive_gpu_info(self) -> Dict[str, Any]:
        """Return memory, utilization, host RAM and platform details together."""
        gpu_info = {
            "memory": self.get_gpu_memory_info(),
            "utilization": self.get_gpu_utilization(),
            "system_memory": self.get_system_memory_info()
        }
        if self.is_jetson:
            gpu_info["jetson"] = self.get_jetson_specific_info()
            gpu_info["platform"] = "Jetson Xavier"
        else:
            gpu_info["platform"] = "x86_64"
        return gpu_info
# Module-wide GPU monitor instance; constructed when this module is imported.
gpu_monitor = GPUMonitor()