# inpaintServer/app/utils/gpu_monitor.py

"""
GPU 메모리 모니터링 유틸리티
Jetson Xavier와 x86 시스템을 모두 지원합니다.
"""
import asyncio
import logging
import os
import re
import shutil
import subprocess
from typing import Any, Dict, List, Optional

import psutil

try:
    import jtop
    JTOP_AVAILABLE = True
except ImportError:
    JTOP_AVAILABLE = False
    logging.warning("jtop library not found. Jetson monitoring will be limited. Please run 'sudo pip install jetson-stats'")

try:
    import pynvml
    NVML_AVAILABLE = True
except ImportError:
    NVML_AVAILABLE = False
    logging.warning("pynvml not available. GPU monitoring will be limited.")

from ..core.config import settings

logger = logging.getLogger(__name__)


class JetsonMonitor:
    """Jetson Xavier 전용 모니터링 클래스"""

    def __init__(self):
        """Record the sysfs/debugfs locations used by the readers and try to start jtop.

        ``self._jtop`` is left as ``None`` when the jtop library could not be
        imported, or when creating or starting the jtop client raises (the
        failure is logged as an error).
        """
        self._jtop = None
        self.jetson_clocks_path = "/sys/kernel/debug/clk"
        self.jetson_thermal_path = "/sys/devices/virtual/thermal"
        self.jetson_power_path = "/sys/kernel/debug/tegra_pcie/pcie_power"

        if not JTOP_AVAILABLE:
            return

        try:
            client = jtop.jtop()
            # Store the client before starting it so the attribute is set even
            # while start() is still running.
            self._jtop = client
            client.start()
        except Exception as e:
            logger.error(f"Failed to initialize jtop: {e}")
            self._jtop = None

    def get_gpu_memory_info(self) -> Dict[str, float]:
        """Return memory statistics for the Jetson, trying several sources in turn.

        Sources, in order (the first one that yields data wins):
          1. The jtop client's ``stats['RAM']`` entry.
          2. ``/sys/kernel/debug/gpu/memory`` (only when it exists and is readable).
          3. ``tegrastats`` output, when the executable is available.
          4. A fixed 8 GB placeholder when the GV11B device-tree node exists.
          5. ``psutil.virtual_memory()``.

        The files under ``/sys/class/nvidia-gpu`` are debug-logged on the way
        but are never used as a data source.

        Returns:
            Dict with ``total``, ``used``, ``free``, ``usage_percent`` and
            ``free_ratio``. The magnitude unit depends on the source: sources
            2-4 report GB, source 5 reports MiB and source 1 passes jtop's
            numbers through unchanged. Every value is 0 when the fallback
            chain raises.
        """
        info = self._memory_from_jtop()
        if info is not None:
            return info

        try:
            self._log_gpu_class_memory_files()

            info = self._memory_from_debugfs()
            if info is not None:
                return info

            if self._tegrastats_available():
                info = dict(self._get_memory_from_tegrastats())
                # Keep the key set identical across all sources.
                if "free_ratio" not in info:
                    total = info.get("total", 0)
                    info["free_ratio"] = round(info.get("free", 0) / total, 4) if total else 0
                return info

            if os.path.exists("/sys/firmware/devicetree/base/gv11b"):
                logger.info("GV11B GPU (Jetson Xavier) 감지됨")
                # Placeholder figures, not a measurement.
                return {
                    "total": 8.0,
                    "used": 0.0,
                    "free": 8.0,
                    "usage_percent": 0.0,
                    "free_ratio": 1.0,
                }

            logger.warning("GPU 메모리 정보를 가져올 수 없습니다. 기본값을 사용합니다.")
            mem = psutil.virtual_memory()
            return {
                "total": mem.total / 1024**2,
                "used": mem.used / 1024**2,
                "free": mem.available / 1024**2,
                "usage_percent": mem.percent,
                "free_ratio": mem.available / mem.total,
            }

        except Exception as e:
            logger.error(f"Jetson GPU memory read failed: {e}")
            return {"total": 0, "used": 0, "free": 0, "usage_percent": 0, "free_ratio": 0}

    def _memory_from_jtop(self) -> Optional[Dict[str, float]]:
        """Build the memory dict from the jtop client, or return None when it has no data."""
        client = self._jtop
        if not client:
            return None
        try:
            # jetson-stats exposes ``ok`` as a method, so a bare attribute
            # reference is always truthy; call it when it is callable.
            # ``spin=True`` asks for the status without waiting for a sample.
            alive = client.ok
            if callable(alive):
                alive = alive(spin=True)
            if not alive:
                return None

            ram = client.stats.get('RAM', {})
            if not isinstance(ram, dict):
                # NOTE(review): some jetson-stats releases appear to expose
                # stats['RAM'] as a plain number rather than a dict - confirm
                # which attribute carries 'tot'/'use' on the installed version.
                return None

            # NOTE(review): the unit of jtop's 'tot'/'use' figures is not
            # established in this file - confirm before relabelling them.
            total_mb = ram.get('tot', 0)
            used_mb = ram.get('use', 0)
            if total_mb > 0:
                return {
                    "total": total_mb,
                    "used": used_mb,
                    "free": total_mb - used_mb,
                    "usage_percent": (used_mb / total_mb) * 100,
                    "free_ratio": (total_mb - used_mb) / total_mb,
                }
        except Exception as e:
            logger.warning(f"Failed to get memory info from jtop: {e}")
        return None

    def _log_gpu_class_memory_files(self) -> None:
        """Debug-log the memory files found under ``/sys/class/nvidia-gpu``.

        Diagnostic only: file contents are logged and never parsed, so this
        never contributes to the returned statistics.
        """
        base = "/sys/class/nvidia-gpu"
        if not os.path.exists(base):
            return
        try:
            gpu_devices = [d for d in os.listdir(base) if d.startswith("nvidia")]
            if not gpu_devices:
                return
            logger.debug(f"Jetson GPU devices found: {gpu_devices}")

            for device in gpu_devices:
                for mem_file in ("total_memory", "memory_used", "memory_free", "memory_usage"):
                    file_path = f"{base}/{device}/{mem_file}"
                    if not os.path.exists(file_path):
                        continue
                    try:
                        with open(file_path, "r") as f:
                            value = f.read().strip()
                    except (OSError, ValueError):
                        # Unreadable or undecodable file: skip it, keep going.
                        continue
                    logger.debug(f"{mem_file}: {value}")
        except Exception as e:
            logger.debug(f"Jetson GPU class read failed: {e}")

    def _memory_from_debugfs(self) -> Optional[Dict[str, float]]:
        """Parse ``/sys/kernel/debug/gpu/memory``; None when absent, unreadable or empty."""
        path = "/sys/kernel/debug/gpu/memory"
        if not os.path.exists(path):
            return None
        try:
            with open(path, "r") as f:
                content = f.read()
            logger.debug(f"GPU memory debug info: {content}")

            total_mb = 0.0
            used_mb = 0.0
            for line in content.split('\n'):
                if "MB" not in line:
                    continue
                is_total = "Total" in line
                if not is_total and "Used" not in line:
                    continue
                try:
                    # The figure is the token just before the last one on the line.
                    value = float(line.split()[-2])
                except (ValueError, IndexError):
                    continue
                if is_total:
                    total_mb = value
                else:
                    used_mb = value

            if total_mb > 0:
                free_mb = total_mb - used_mb
                return {
                    "total": round(total_mb / 1024, 2),  # GB
                    "used": round(used_mb / 1024, 2),    # GB
                    "free": round(free_mb / 1024, 2),    # GB
                    "usage_percent": round((used_mb / total_mb) * 100, 2),
                    "free_ratio": round(free_mb / total_mb, 4),
                }
        except Exception as e:
            logger.debug(f"GPU memory debug read failed: {e}")
        return None

    def __del__(self):
        """Close the jtop client, if one was created, when the monitor is collected.

        Written defensively because a finalizer can run on an instance whose
        ``__init__`` never completed, in which case ``_jtop`` is not set, and
        because an exception escaping ``__del__`` cannot be handled by anyone.
        """
        client = getattr(self, "_jtop", None)
        if client is None:
            return
        try:
            client.close()
        except Exception:
            # Deliberately ignored: nothing useful can be done with a close
            # failure during finalization.
            pass

    def _tegrastats_available(self) -> bool:
        """Return True when a ``tegrastats`` executable can be found on ``PATH``.

        Uses ``shutil.which`` so no child process is spawned for the lookup.
        """
        return shutil.which("tegrastats") is not None

    def _get_memory_from_tegrastats(self) -> Dict[str, float]:
        """Sample ``tegrastats`` and convert its ``RAM used/totalMB`` field to GB.

        ``tegrastats`` is run under ``timeout 3`` with a 500 ms interval; the
        last ``RAM <used>/<total>MB`` field in the captured output is used.

        Returns:
            Dict with ``total``, ``used``, ``free`` (GB, rounded to 2 places),
            ``usage_percent`` and ``free_ratio``. When the command fails or no
            RAM field can be parsed, a placeholder of 8 GB total and 0 GB used
            is returned instead of a measurement.
        """
        total_gb = 8.0  # placeholder used when nothing can be parsed
        used_gb = 0.0

        try:
            result = subprocess.run(
                ["timeout", "3", "tegrastats", "--interval", "500"],
                capture_output=True,
                text=True,
                timeout=10,
            )
            # 124 is accepted too: it is the status seen when `timeout` had to
            # stop the command.
            if result.returncode in (0, 124):
                # Anchored on the RAM label so the SWAP and IRAM fields, which
                # have the same used/total shape, are never picked up.
                samples = re.findall(
                    r"\bRAM\s+(\d+(?:\.\d+)?)/(\d+(?:\.\d+)?)MB", result.stdout
                )
                if samples:
                    used_mb, total_mb = (float(v) for v in samples[-1])
                    used_gb = used_mb / 1024
                    total_gb = total_mb / 1024
        except Exception as e:
            logger.debug(f"tegrastats parsing failed: {e}")
            total_gb, used_gb = 8.0, 0.0

        free_gb = total_gb - used_gb
        return {
            "total": round(total_gb, 2),
            "used": round(used_gb, 2),
            "free": round(free_gb, 2),
            "usage_percent": round((used_gb / total_gb) * 100, 2) if total_gb > 0 else 0,
            "free_ratio": round(free_gb / total_gb, 4) if total_gb > 0 else 0,
        }

    def get_gpu_utilization(self) -> float:
        """Return the Jetson GPU utilization as a percentage capped at 100.

        Sources, in order:
          1. The first ``GR3D_FREQ <n>%`` field in ``tegrastats`` output
             (run under ``timeout 3`` with a 500 ms interval).
          2. The first integer found in one of the known sysfs ``load`` files.

        Returns:
            The utilization, or ``0.0`` when no source yields a value or an
            unexpected error occurs (logged at debug level).
        """
        try:
            if self._tegrastats_available():
                result = subprocess.run(
                    ["timeout", "3", "tegrastats", "--interval", "500"],
                    capture_output=True,
                    text=True,
                    timeout=10,
                )
                # 124 is accepted too: it is the status seen when `timeout`
                # had to stop the command.
                if result.returncode in (0, 124):
                    output = result.stdout
                    logger.debug(f"tegrastats output: {output}")
                    usage = self._parse_gr3d_utilization(output)
                    if usage is not None:
                        return min(usage, 100.0)

            gpu_load_paths = (
                "/sys/devices/gpu.0/load",
                "/sys/kernel/debug/gpu/load",
                "/sys/class/devfreq/17000000.gv11b/load",
            )
            for load_path in gpu_load_paths:
                if not os.path.exists(load_path):
                    continue
                try:
                    with open(load_path, "r") as f:
                        load_str = f.read().strip()
                    numbers = re.findall(r'\d+', load_str)
                    if numbers:
                        # NOTE(review): some Jetson kernels are reported to
                        # express this load in tenths of a percent (0-1000);
                        # the value is used as-is and capped - confirm scale.
                        load = float(numbers[0])
                        logger.debug(f"GPU load from {load_path}: {load}")
                        return min(load, 100.0)
                except Exception as e:
                    logger.debug(f"Failed to read {load_path}: {e}")

            logger.debug("No GPU utilization found, returning 0.0")
            return 0.0

        except Exception as e:
            logger.debug(f"Jetson GPU utilization read failed: {e}")
            return 0.0

    @staticmethod
    def _parse_gr3d_utilization(output: str) -> Optional[float]:
        """Return the first GR3D_FREQ percentage found in tegrastats output, else None."""
        for line in output.split('\n'):
            if "GR3D_FREQ" not in line:
                continue

            match = re.search(r'GR3D_FREQ\s+(\d+(?:\.\d+)?)%', line)
            if match:
                usage = float(match.group(1))
                logger.debug(f"Parsed GPU utilization: {usage}%")
                return usage

            # Fallback for layouts the pattern misses: use the token that
            # follows the first token mentioning GR3D_FREQ.
            parts = line.split()
            gr3d_idx = next(
                (i for i, part in enumerate(parts) if "GR3D_FREQ" in part), -1
            )
            if 0 <= gr3d_idx < len(parts) - 1 and "%" in parts[gr3d_idx + 1]:
                try:
                    usage = float(parts[gr3d_idx + 1].replace('%', ''))
                except ValueError as e:
                    logger.debug(f"GPU parsing error: {e}")
                    continue
                logger.debug(f"Alternative parsed GPU utilization: {usage}%")
                return usage
        return None

    def get_gpu_frequency(self) -> Optional[int]:
        """Read the GPU clock rate and return it in whole MHz.

        The rate is read from ``<jetson_clocks_path>/gpcclk/clk_rate`` and
        integer-divided by 1,000,000.

        Returns:
            The clock in MHz, or ``None`` when the file does not exist or
            reading/parsing it fails (the failure is logged at debug level).
        """
        try:
            clk_file = f"{self.jetson_clocks_path}/gpcclk/clk_rate"
            if not os.path.exists(clk_file):
                return None
            with open(clk_file, "r") as handle:
                hertz = int(handle.read().strip())
            return hertz // 1000000
        except Exception as e:
            logger.debug(f"GPU frequency read failed: {e}")
            return None

    def get_cpu_frequency(self) -> Optional[int]:
        """Read the CPU clock rate and return it in whole MHz.

        The rate is read from ``<jetson_clocks_path>/cpu_gpcclk/clk_rate`` and
        integer-divided by 1,000,000.

        Returns:
            The clock in MHz, or ``None`` when the file does not exist or
            reading/parsing it fails (the failure is logged at debug level).
        """
        try:
            clk_file = f"{self.jetson_clocks_path}/cpu_gpcclk/clk_rate"
            if not os.path.exists(clk_file):
                return None
            with open(clk_file, "r") as handle:
                hertz = int(handle.read().strip())
            return hertz // 1000000
        except Exception as e:
            logger.debug(f"CPU frequency read failed: {e}")
            return None

    def get_memory_frequency(self) -> Optional[int]:
        """Read the memory (EMC) clock rate and return it in whole MHz.

        The rate is read from ``<jetson_clocks_path>/emc/clk_rate`` and
        integer-divided by 1,000,000.

        Returns:
            The clock in MHz, or ``None`` when the file does not exist or
            reading/parsing it fails (the failure is logged at debug level).
        """
        try:
            clk_file = f"{self.jetson_clocks_path}/emc/clk_rate"
            if not os.path.exists(clk_file):
                return None
            with open(clk_file, "r") as handle:
                hertz = int(handle.read().strip())
            return hertz // 1000000
        except Exception as e:
            logger.debug(f"Memory frequency read failed: {e}")
            return None

    def get_temperature(self) -> Dict[str, float]:
        """Read every thermal zone under ``jetson_thermal_path``.

        Each ``thermal_zone*`` directory's ``temp`` file is read as an integer
        and divided by 1000. Zones are read independently: a zone whose file
        is missing, unreadable or malformed is skipped without affecting the
        others.

        Returns:
            Mapping of zone key to temperature. The key is ``"zone_"`` plus
            the text after the last underscore of the directory name (so
            ``thermal_zone0`` becomes ``"zone_zone0"``). Empty when the
            thermal directory is absent or cannot be listed.
        """
        temps: Dict[str, float] = {}
        try:
            if not os.path.exists(self.jetson_thermal_path):
                return temps
            zones = [
                item for item in os.listdir(self.jetson_thermal_path)
                if item.startswith("thermal_zone")
            ]
        except Exception as e:
            logger.debug(f"Temperature read failed: {e}")
            return temps

        for item in zones:
            temp_file = f"{self.jetson_thermal_path}/{item}/temp"
            try:
                with open(temp_file, "r") as f:
                    raw = f.read().strip()
                temps[f"zone_{item.split('_')[-1]}"] = int(raw) / 1000.0
            except FileNotFoundError:
                # A zone directory without a temp file is simply not reported.
                continue
            except (OSError, ValueError) as e:
                # One bad sensor must not hide the readings of the others.
                logger.debug(f"Temperature read failed ({item}): {e}")
        return temps

    def get_power_consumption(self) -> Optional[float]:
        """Read the board power sensor and return the value divided by 1000.

        The reading comes from
        ``/sys/bus/i2c/devices/1-0040/iio_device/in_power0_input``.

        Returns:
            The scaled reading, or ``None`` when the sensor file does not
            exist or reading/parsing it fails (logged at debug level).
        """
        sensor = "/sys/bus/i2c/devices/1-0040/iio_device/in_power0_input"
        try:
            if not os.path.exists(sensor):
                return None
            with open(sensor, "r") as handle:
                milli = float(handle.read().strip())
            return milli / 1000.0
        except Exception as e:
            logger.debug(f"Power consumption read failed: {e}")
            return None

    def set_power_mode(self, mode: str) -> bool:
        """Switch the Jetson power mode by running ``sudo -n nvpmodel -m <mode>``.

        Args:
            mode: One of the names ``"MAXN"``, ``"5W"``, ``"10W"``, ``"15W"``,
                or a string of ASCII digits giving a numeric nvpmodel mode ID.

        Returns:
            True when nvpmodel exits with status 0. False for a rejected
            ``mode``, a non-zero exit status, or any exception; each case is
            logged.
        """
        named_modes = ("MAXN", "5W", "10W", "15W")
        try:
            is_mode_id = isinstance(mode, str) and mode.isascii() and mode.isdigit()
            if mode not in named_modes and not is_mode_id:
                logger.error(f"Invalid power mode: {mode}")
                return False

            # NOTE(review): nvpmodel documents -m as taking a numeric mode ID;
            # whether it accepts the names above should be confirmed on the
            # target board.
            result = subprocess.run(
                # -n makes sudo fail immediately instead of waiting on a
                # password prompt that a server process can never answer.
                ["sudo", "-n", "nvpmodel", "-m", mode],
                capture_output=True,
                text=True,
                timeout=10
            )
            if result.returncode == 0:
                logger.info(f"Power mode set to {mode}")
                return True
            logger.warning(f"Failed to set power mode: {result.stderr}")
        except Exception as e:
            logger.error(f"Power mode setting failed: {e}")
        return False

    def set_fan_speed(self, speed: int) -> bool:
        """Write a PWM target to ``/sys/devices/pwm-fan/target_pwm``.

        Args:
            speed: PWM value; accepted only when ``0 <= speed <= 255``.

        Returns:
            True after the value has been written. False when ``speed`` is out
            of range, the control file does not exist, or any exception occurs
            (each case is logged).
        """
        fan_path = "/sys/devices/pwm-fan/target_pwm"
        try:
            if not (0 <= speed <= 255):
                logger.error(f"Invalid fan speed: {speed}")
                return False
            if not os.path.exists(fan_path):
                logger.warning("Fan control not available")
                return False
            with open(fan_path, "w") as pwm_file:
                pwm_file.write(str(speed))
            logger.info(f"Fan speed set to {speed}")
            return True
        except Exception as e:
            logger.error(f"Fan speed setting failed: {e}")
            return False

    def get_jetson_info(self) -> Dict[str, Any]:
        """Collect clocks, temperatures, power reading and power mode in one dict.

        Returns:
            Dict with the keys ``gpu_frequency``, ``cpu_frequency``,
            ``memory_frequency``, ``temperature``, ``power_consumption`` and
            ``power_mode``, each filled by the corresponding reader method.
        """
        return {
            "gpu_frequency": self.get_gpu_frequency(),
            "cpu_frequency": self.get_cpu_frequency(),
            "memory_frequency": self.get_memory_frequency(),
            "temperature": self.get_temperature(),
            "power_consumption": self.get_power_consumption(),
            "power_mode": self._get_current_power_mode(),
        }

    def _get_current_power_mode(self) -> str:
        """Ask ``nvpmodel -q`` for the active power mode.

        Returns:
            The text after the last colon on the first output line containing
            ``NV Power Mode:``, stripped. ``"Unknown"`` when the command
            fails, exits non-zero, prints no such line, or raises.
        """
        marker = 'NV Power Mode:'
        try:
            completed = subprocess.run(
                ["nvpmodel", "-q"],
                capture_output=True,
                text=True,
                timeout=5
            )
            if completed.returncode != 0:
                return "Unknown"
            first_hit = next(
                (ln for ln in completed.stdout.split('\n') if marker in ln), None
            )
            if first_hit is not None:
                return first_hit.split(':')[-1].strip()
        except Exception:
            pass
        return "Unknown"


class GPUMonitor:
    def __init__(self):
        """Choose the monitoring backend from ``settings.IS_JETSON``.

        On Jetson a ``JetsonMonitor`` is created and the instance is marked
        initialized. Otherwise NVML is initialized when pynvml was imported;
        ``initialized`` stays False if NVML is unavailable or its
        initialization raises (logged as an error).
        """
        self.initialized = False
        self.is_jetson = settings.IS_JETSON
        self.jetson_monitor = JetsonMonitor() if self.is_jetson else None

        if self.is_jetson:
            logger.info("Jetson Xavier mode detected - using Jetson-specific monitoring")
            self.initialized = True
            return

        if not NVML_AVAILABLE:
            return

        try:
            pynvml.nvmlInit()
            self.initialized = True
            logger.info("GPU monitoring initialized successfully")
        except Exception as e:
            logger.error(f"Failed to initialize GPU monitoring: {e}")

    def get_gpu_memory_info(self, device_id: int = 0) -> Dict[str, float]:
        """Return GPU memory statistics in one uniform shape.

        Args:
            device_id: NVML device index; only used on the non-Jetson path.

        Returns:
            Dict with the keys ``total``, ``used``, ``free``,
            ``usage_percent``, ``free_ratio`` and ``unit``. ``unit`` is
            ``"MiB"`` on Jetson and ``"GiB"`` otherwise. All numeric values
            are 0 when no data is available or the NVML query fails.
        """
        if self.is_jetson:
            raw = self.jetson_monitor.get_gpu_memory_info()
            if not raw:
                return {"total": 0, "used": 0, "free": 0, "usage_percent": 0, "free_ratio": 0, "unit": "MiB"}

            total, used, free = (raw.get(key, 0) for key in ("total", "used", "free"))
            # Some Jetson sources report GB: a total below 100 is taken to be
            # GB and all three figures are scaled to MiB.
            if total and total < 100:
                total, used, free = (value * 1024 for value in (total, used, free))

            # Only used when the Jetson monitor did not supply the key itself.
            fallback_percent = (used / total * 100) if total else 0
            fallback_ratio = (free / total) if total else 0
            return {
                "total": total,
                "used": used,
                "free": free,
                "usage_percent": round(float(raw.get("usage_percent", fallback_percent)), 2),
                "free_ratio": round(float(raw.get("free_ratio", fallback_ratio)), 4),
                "unit": "MiB",
            }

        if not (self.initialized and NVML_AVAILABLE):
            return {"total": 0, "used": 0, "free": 0, "usage_percent": 0, "free_ratio": 0, "unit": "GiB"}

        try:
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)

            gib = 1024**3
            total_gib = mem_info.total / gib
            used_gib = mem_info.used / gib
            free_gib = mem_info.free / gib

            return {
                "total": round(total_gib, 2),
                "used": round(used_gib, 2),
                "free": round(free_gib, 2),
                "usage_percent": round((used_gib / total_gib) * 100 if total_gib else 0, 2),
                "free_ratio": round((free_gib / total_gib), 4) if total_gib else 0,
                "unit": "GiB",
            }
        except Exception as e:
            logger.error(f"Error getting GPU memory info: {e}")
            return {"total": 0, "used": 0, "free": 0, "usage_percent": 0, "free_ratio": 0, "unit": "GiB"}

    def get_gpu_utilization(self, device_id: int = 0) -> float:
        """Return the GPU utilization.

        Args:
            device_id: NVML device index; only used on the non-Jetson path.

        Returns:
            On Jetson, whatever the Jetson monitor reports. Otherwise the NVML
            ``gpu`` utilization rate as a float, or ``0.0`` when NVML is not
            initialized or the query fails (logged as an error).
        """
        if self.is_jetson:
            return self.jetson_monitor.get_gpu_utilization()

        if not (self.initialized and NVML_AVAILABLE):
            return 0.0

        try:
            device = pynvml.nvmlDeviceGetHandleByIndex(device_id)
            rates = pynvml.nvmlDeviceGetUtilizationRates(device)
            return float(rates.gpu)
        except Exception as e:
            logger.error(f"Error getting GPU utilization: {e}")
            return 0.0

    def get_system_memory_info(self) -> Dict[str, float]:
        """Return system RAM statistics taken from ``psutil.virtual_memory()``.

        Returns:
            Dict with ``total``, ``used`` and ``free`` (bytes divided by
            1024**3, rounded to 2 places) and ``usage_percent``.
        """
        vm = psutil.virtual_memory()
        gib = 1024**3
        return {
            "total": round(vm.total / gib, 2),
            "used": round(vm.used / gib, 2),
            "free": round(vm.free / gib, 2),
            "usage_percent": round(vm.percent, 2),
        }

    def get_jetson_specific_info(self) -> Dict[str, Any]:
        """Return the Jetson monitor's info dict, or ``{}`` off Jetson.

        Returns:
            The result of ``jetson_monitor.get_jetson_info()`` when running on
            a Jetson with a monitor attached; an empty dict otherwise.
        """
        if not self.is_jetson or not self.jetson_monitor:
            return {}

        return self.jetson_monitor.get_jetson_info()

    def should_scale_up(self, vram_usage: float, threshold: float) -> bool:
        """Return True when ``vram_usage`` is strictly below ``threshold``."""
        below_threshold = threshold > vram_usage
        return below_threshold

    def should_scale_down(self, vram_usage: float, threshold: float) -> bool:
        """Return True when ``vram_usage`` is strictly above ``threshold``."""
        above_threshold = threshold < vram_usage
        return above_threshold

    def optimize_for_jetson(self) -> bool:
        """Apply the configured power mode and temperature-based fan speed.

        The power mode from ``settings.JETSON_POWER_MODE`` is applied unless
        it is ``"MAXN"``. When ``settings.JETSON_FAN_CONTROL`` is set, the
        hottest thermal zone selects the fan PWM: 255 above
        ``settings.JETSON_TEMP_THRESHOLD``, 128 above 60, otherwise 64.

        Returns:
            False when not running on a Jetson (or no monitor is attached) or
            when any step raises; True otherwise. The results of the
            individual set calls are not checked.
        """
        if not (self.is_jetson and self.jetson_monitor):
            return False

        try:
            requested_mode = settings.JETSON_POWER_MODE
            if requested_mode != "MAXN":
                self.jetson_monitor.set_power_mode(requested_mode)

            if settings.JETSON_FAN_CONTROL:
                zone_temps = self.jetson_monitor.get_temperature()
                hottest = max(zone_temps.values()) if zone_temps else 0

                if hottest > settings.JETSON_TEMP_THRESHOLD:
                    pwm = 255  # full speed
                elif hottest > 60:
                    pwm = 128  # medium speed
                else:
                    pwm = 64   # low speed
                self.jetson_monitor.set_fan_speed(pwm)

            logger.info("Jetson optimization completed")
            return True

        except Exception as e:
            logger.error(f"Jetson optimization failed: {e}")
            return False

    def get_comprehensive_gpu_info(self) -> Dict[str, Any]:
        """Bundle GPU memory, GPU utilization and system memory into one dict.

        Returns:
            Dict with the keys ``memory``, ``utilization``, ``system_memory``
            and ``platform``. On Jetson a ``jetson`` entry with the
            Jetson-specific info is added and ``platform`` is
            ``"Jetson Xavier"``; otherwise ``platform`` is ``"x86_64"``.
        """
        gpu_info: Dict[str, Any] = {
            "memory": self.get_gpu_memory_info(),
            "utilization": self.get_gpu_utilization(),
            "system_memory": self.get_system_memory_info(),
        }

        if self.is_jetson:
            gpu_info["jetson"] = self.get_jetson_specific_info()
            gpu_info["platform"] = "Jetson Xavier"
        else:
            gpu_info["platform"] = "x86_64"

        return gpu_info


# Module-level GPU monitor instance, created when this module is imported
gpu_monitor = GPUMonitor()