""" GPU 메모리 모니터링 유틸리티 Jetson Xavier와 x86 시스템을 모두 지원합니다. """ import asyncio import psutil import logging import subprocess import os from typing import Dict, Optional, List try: import pynvml NVML_AVAILABLE = True except ImportError: NVML_AVAILABLE = False logging.warning("pynvml not available. GPU monitoring will be limited.") from ..core.config import settings logger = logging.getLogger(__name__) class JetsonMonitor: """Jetson Xavier 전용 모니터링 클래스""" def __init__(self): self.jetson_clocks_path = "/sys/kernel/debug/clk" self.jetson_thermal_path = "/sys/devices/virtual/thermal" self.jetson_power_path = "/sys/kernel/debug/tegra_pcie/pcie_power" def get_gpu_memory_info(self) -> Dict[str, float]: """Jetson 전용 GPU 메모리 정보를 가져옵니다.""" try: # 1. Jetson GPU 클래스에서 정보 읽기 if os.path.exists("/sys/class/nvidia-gpu"): try: # Jetson GPU 디바이스 정보 확인 gpu_devices = [d for d in os.listdir("/sys/class/nvidia-gpu") if d.startswith("nvidia")] if gpu_devices: logger.debug(f"Jetson GPU devices found: {gpu_devices}") # GPU 메모리 정보 수집 total_memory = 0 used_memory = 0 for device in gpu_devices: device_path = f"/sys/class/nvidia-gpu/{device}" # 메모리 정보 파일들 확인 memory_files = [ "total_memory", "memory_used", "memory_free", "memory_usage" ] for mem_file in memory_files: file_path = f"{device_path}/{mem_file}" if os.path.exists(file_path): try: with open(file_path, "r") as f: value = f.read().strip() logger.debug(f"{mem_file}: {value}") except: pass # 실제 메모리 정보가 있으면 반환 if total_memory > 0: return { "total": round(total_memory / 1024, 2), "used": round(used_memory / 1024, 2), "free": round((total_memory - used_memory) / 1024, 2), "usage_percent": round((used_memory / total_memory) * 100, 2) if total_memory > 0 else 0 } except Exception as e: logger.debug(f"Jetson GPU class read failed: {e}") # 2. /sys/kernel/debug/gpu/memory에서 읽기 시도 (권한이 있는 경우) if os.path.exists("/sys/kernel/debug/gpu/memory"): try: with open("/sys/kernel/debug/gpu/memory", "r") as f: content = f.read() logger.debug(f"GPU memory debug info: {content}") # 메모리 정보 파싱 lines = content.split('\n') total_mb = 0 used_mb = 0 for line in lines: if "Total" in line and "MB" in line: try: total_mb = float(line.split()[-2]) except (ValueError, IndexError): pass elif "Used" in line and "MB" in line: try: used_mb = float(line.split()[-2]) except (ValueError, IndexError): pass if total_mb > 0: free_mb = total_mb - used_mb usage_percent = (used_mb / total_mb) * 100 return { "total": round(total_mb / 1024, 2), # GB "used": round(used_mb / 1024, 2), # GB "free": round(free_mb / 1024, 2), # GB "usage_percent": round(usage_percent, 2) } except Exception as e: logger.debug(f"GPU memory debug read failed: {e}") # 3. tegrastats 사용 (가장 안정적) if self._tegrastats_available(): return self._get_memory_from_tegrastats() # 4. GV11B GPU 정보 확인 (Jetson Xavier) if os.path.exists("/sys/firmware/devicetree/base/gv11b"): logger.info("GV11B GPU (Jetson Xavier) 감지됨") # Jetson Xavier는 통합 메모리 사용 return { "total": 8.0, # 8GB 통합 메모리 "used": 0.0, "free": 8.0, "usage_percent": 0.0 } # 5. 기본값 반환 logger.warning("GPU 메모리 정보를 가져올 수 없습니다. 기본값을 사용합니다.") return { "total": 8.0, "used": 0.0, "free": 8.0, "usage_percent": 0.0 } except Exception as e: logger.debug(f"Jetson GPU memory read failed: {e}") return { "total": 8.0, "used": 0.0, "free": 8.0, "usage_percent": 0.0 } def _tegrastats_available(self) -> bool: """tegrastats 명령어 사용 가능 여부 확인""" try: result = subprocess.run(["which", "tegrastats"], capture_output=True, text=True, timeout=5) return result.returncode == 0 except: return False def _get_memory_from_tegrastats(self) -> Dict[str, float]: """tegrastats에서 메모리 정보 추출""" try: # tegrastats -1 (한 번만 실행) result = subprocess.run(["tegrastats", "-1"], capture_output=True, text=True, timeout=10) if result.returncode == 0: output = result.stdout # GPU 메모리 정보 파싱 # 예시: "GR3D_FREQ 0% @ 114MHz GR3D_FREQ 0% @ 114MHz" # "RAM 0/8192MB (lfb 0x0) @ 1600MHz" total_gb = 8.0 # Jetson Xavier 기본값 used_gb = 0.0 # RAM 사용량 파싱 for line in output.split('\n'): if "RAM" in line and "MB" in line: try: # "RAM 1024/8192MB" 형태에서 추출 parts = line.split() for part in parts: if "/" in part and "MB" in part: used_str, total_str = part.split('/') used_mb = float(used_str) total_mb = float(total_str.replace('MB', '')) used_gb = used_mb / 1024 total_gb = total_mb / 1024 break except (ValueError, IndexError): pass free_gb = total_gb - used_gb usage_percent = (used_gb / total_gb) * 100 if total_gb > 0 else 0 return { "total": round(total_gb, 2), "used": round(used_gb, 2), "free": round(free_gb, 2), "usage_percent": round(usage_percent, 2) } except Exception as e: logger.debug(f"tegrastats parsing failed: {e}") # 기본값 반환 return { "total": 8.0, "used": 0.0, "free": 8.0, "usage_percent": 0.0 } def get_gpu_utilization(self) -> float: """Jetson 전용 GPU 사용률을 가져옵니다.""" try: # tegrastats에서 GPU 사용률 추출 if self._tegrastats_available(): result = subprocess.run(["tegrastats", "-1"], capture_output=True, text=True, timeout=10) if result.returncode == 0: output = result.stdout # GR3D_FREQ (GPU 사용률) 파싱 for line in output.split('\n'): if "GR3D_FREQ" in line and "%" in line: try: # "GR3D_FREQ 45% @ 114MHz" 형태에서 추출 parts = line.split() for part in parts: if "%" in part: usage = float(part.replace('%', '')) return min(usage, 100.0) # 100% 초과 방지 except (ValueError, IndexError): pass # 대안: /sys/kernel/debug/gpu/load에서 읽기 if os.path.exists("/sys/kernel/debug/gpu/load"): try: with open("/sys/kernel/debug/gpu/load", "r") as f: load = f.read().strip() if load.isdigit(): return min(float(load), 100.0) except: pass return 0.0 except Exception as e: logger.debug(f"Jetson GPU utilization read failed: {e}") return 0.0 def get_gpu_frequency(self) -> Optional[int]: """GPU 클럭 주파수를 가져옵니다 (MHz)""" try: if os.path.exists(f"{self.jetson_clocks_path}/gpcclk/clk_rate"): with open(f"{self.jetson_clocks_path}/gpcclk/clk_rate", "r") as f: freq = int(f.read().strip()) // 1000000 # Hz to MHz return freq except Exception as e: logger.debug(f"GPU frequency read failed: {e}") return None def get_cpu_frequency(self) -> Optional[int]: """CPU 클럭 주파수를 가져옵니다 (MHz)""" try: if os.path.exists(f"{self.jetson_clocks_path}/cpu_gpcclk/clk_rate"): with open(f"{self.jetson_clocks_path}/cpu_gpcclk/clk_rate", "r") as f: freq = int(f.read().strip()) // 1000000 # Hz to MHz return freq except Exception as e: logger.debug(f"CPU frequency read failed: {e}") return None def get_memory_frequency(self) -> Optional[int]: """메모리 클럭 주파수를 가져옵니다 (MHz)""" try: if os.path.exists(f"{self.jetson_clocks_path}/emc/clk_rate"): with open(f"{self.jetson_clocks_path}/emc/clk_rate", "r") as f: freq = int(f.read().strip()) // 1000000 # Hz to MHz return freq except Exception as e: logger.debug(f"Memory frequency read failed: {e}") return None def get_temperature(self) -> Dict[str, float]: """Jetson 온도 정보를 가져옵니다""" temps = {} try: if os.path.exists(self.jetson_thermal_path): for item in os.listdir(self.jetson_thermal_path): if item.startswith("thermal_zone"): temp_file = f"{self.jetson_thermal_path}/{item}/temp" if os.path.exists(temp_file): with open(temp_file, "r") as f: temp = int(f.read().strip()) / 1000.0 # mC to C zone_name = f"zone_{item.split('_')[-1]}" temps[zone_name] = temp except Exception as e: logger.debug(f"Temperature read failed: {e}") return temps def get_power_consumption(self) -> Optional[float]: """전력 소비량을 가져옵니다 (W)""" try: # Jetson 전력 모니터링 (가능한 경우) if os.path.exists("/sys/bus/i2c/devices/1-0040/iio_device/in_power0_input"): with open("/sys/bus/i2c/devices/1-0040/iio_device/in_power0_input", "r") as f: power = float(f.read().strip()) / 1000.0 # mW to W return power except Exception as e: logger.debug(f"Power consumption read failed: {e}") return None def set_power_mode(self, mode: str) -> bool: """전력 모드를 설정합니다""" try: if mode in ["MAXN", "5W", "10W", "15W"]: result = subprocess.run( ["sudo", "nvpmodel", "-m", mode], capture_output=True, text=True, timeout=10 ) if result.returncode == 0: logger.info(f"Power mode set to {mode}") return True else: logger.warning(f"Failed to set power mode: {result.stderr}") else: logger.error(f"Invalid power mode: {mode}") except Exception as e: logger.error(f"Power mode setting failed: {e}") return False def set_fan_speed(self, speed: int) -> bool: """팬 속도를 설정합니다 (0-255)""" try: if 0 <= speed <= 255: fan_path = "/sys/devices/pwm-fan/target_pwm" if os.path.exists(fan_path): with open(fan_path, "w") as f: f.write(str(speed)) logger.info(f"Fan speed set to {speed}") return True else: logger.warning("Fan control not available") else: logger.error(f"Invalid fan speed: {speed}") except Exception as e: logger.error(f"Fan speed setting failed: {e}") return False def get_jetson_info(self) -> Dict[str, any]: """Jetson 전체 정보를 가져옵니다""" info = { "gpu_frequency": self.get_gpu_frequency(), "cpu_frequency": self.get_cpu_frequency(), "memory_frequency": self.get_memory_frequency(), "temperature": self.get_temperature(), "power_consumption": self.get_power_consumption(), "power_mode": self._get_current_power_mode() } return info def _get_current_power_mode(self) -> str: """현재 전력 모드를 가져옵니다""" try: result = subprocess.run( ["nvpmodel", "-q"], capture_output=True, text=True, timeout=5 ) if result.returncode == 0: for line in result.stdout.split('\n'): if 'NV Power Mode:' in line: return line.split(':')[-1].strip() except Exception: pass return "Unknown" class GPUMonitor: def __init__(self): self.initialized = False self.is_jetson = settings.IS_JETSON self.jetson_monitor = JetsonMonitor() if self.is_jetson else None if NVML_AVAILABLE and not self.is_jetson: try: pynvml.nvmlInit() self.initialized = True logger.info("GPU monitoring initialized successfully") except Exception as e: logger.error(f"Failed to initialize GPU monitoring: {e}") elif self.is_jetson: logger.info("Jetson Xavier mode detected - using Jetson-specific monitoring") self.initialized = True def get_gpu_memory_info(self, device_id: int = 0) -> Dict[str, float]: """GPU 메모리 정보를 반환합니다.""" if self.is_jetson: return self.jetson_monitor.get_gpu_memory_info() if not self.initialized or not NVML_AVAILABLE: return {"total": 0, "used": 0, "free": 0, "usage_percent": 0} try: handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle) total = mem_info.total / 1024**3 # GB used = mem_info.used / 1024**3 # GB free = mem_info.free / 1024**3 # GB usage_percent = (used / total) * 100 return { "total": round(total, 2), "used": round(used, 2), "free": round(free, 2), "usage_percent": round(usage_percent, 2) } except Exception as e: logger.error(f"Error getting GPU memory info: {e}") return {"total": 0, "used": 0, "free": 0, "usage_percent": 0} def get_gpu_utilization(self, device_id: int = 0) -> float: """GPU 사용률을 반환합니다.""" if self.is_jetson: return self.jetson_monitor.get_gpu_utilization() if not self.initialized or not NVML_AVAILABLE: return 0.0 try: handle = pynvml.nvmlDeviceGetHandleByIndex(device_id) util = pynvml.nvmlDeviceGetUtilizationRates(handle) return float(util.gpu) except Exception as e: logger.error(f"Error getting GPU utilization: {e}") return 0.0 def get_system_memory_info(self) -> Dict[str, float]: """시스템 메모리 정보를 반환합니다.""" mem = psutil.virtual_memory() return { "total": round(mem.total / 1024**3, 2), # GB "used": round(mem.used / 1024**3, 2), # GB "free": round(mem.free / 1024**3, 2), # GB "usage_percent": round(mem.percent, 2) } def get_jetson_specific_info(self) -> Dict[str, any]: """Jetson 전용 정보를 반환합니다.""" if not self.is_jetson or not self.jetson_monitor: return {} return self.jetson_monitor.get_jetson_info() def should_scale_up(self, vram_usage: float, threshold: float) -> bool: """스케일 업 여부를 결정합니다.""" return vram_usage < threshold def should_scale_down(self, vram_usage: float, threshold: float) -> bool: """스케일 다운 여부를 결정합니다.""" return vram_usage > threshold def optimize_for_jetson(self) -> bool: """Jetson 최적화를 수행합니다.""" if not self.is_jetson or not self.jetson_monitor: return False try: # 전력 모드 설정 power_mode = settings.JETSON_POWER_MODE if power_mode != "MAXN": self.jetson_monitor.set_power_mode(power_mode) # 팬 제어 활성화 if settings.JETSON_FAN_CONTROL: # 온도에 따른 팬 속도 조정 temps = self.jetson_monitor.get_temperature() max_temp = max(temps.values()) if temps else 0 if max_temp > settings.JETSON_TEMP_THRESHOLD: self.jetson_monitor.set_fan_speed(255) # 최대 속도 elif max_temp > 60: self.jetson_monitor.set_fan_speed(128) # 중간 속도 else: self.jetson_monitor.set_fan_speed(64) # 낮은 속도 logger.info("Jetson optimization completed") return True except Exception as e: logger.error(f"Jetson optimization failed: {e}") return False def get_comprehensive_gpu_info(self) -> Dict[str, any]: """GPU와 Jetson 정보를 종합적으로 반환합니다.""" gpu_info = { "memory": self.get_gpu_memory_info(), "utilization": self.get_gpu_utilization(), "system_memory": self.get_system_memory_info() } if self.is_jetson: gpu_info["jetson"] = self.get_jetson_specific_info() gpu_info["platform"] = "Jetson Xavier" else: gpu_info["platform"] = "x86_64" return gpu_info # 전역 GPU 모니터 인스턴스 gpu_monitor = GPUMonitor()