# mod/project/docker/app/gpu/nvidia.py

import sys
from collections import defaultdict
from functools import wraps

if "/www/server/panel/class" not in sys.path:
    sys.path.append('/www/server/panel/class')

import public

# Import the NVML bindings. If the import fails, run
# ``btpip install nvidia-ml-py`` through ``public.ExecShell`` and retry once;
# a second failure propagates to whoever imports this module.
try:
    import pynvml
except Exception:
    # ``Exception`` instead of a bare ``except`` so that KeyboardInterrupt and
    # SystemExit are not mistaken for a missing package.
    public.ExecShell("btpip install nvidia-ml-py")
    import pynvml

# Use the shared GPU base class when it can be imported; otherwise fall back
# to an empty stand-in so that this module stays importable on its own.
try:
    from mod.project.docker.app.gpu.base import GPUBase
except Exception:
    # ``Exception`` instead of a bare ``except`` so that KeyboardInterrupt and
    # SystemExit raised during the import are not swallowed.
    class GPUBase:
        """Empty fallback used when the project's GPU base class is unavailable."""

# Registries populated by ``register_task``: task name -> undecorated function.
# ``device_tasks`` callables are invoked as ``func(self, handle)``,
# ``system_tasks`` callables as ``func(self)``.
device_tasks = defaultdict()
system_tasks = defaultdict()


def register_task(name: str):
    """Return a decorator that files a function under a ``"<type>:<task>"`` name.

    ``name`` is split on ``':'`` when the decorator is applied and must yield
    exactly two parts (otherwise the unpacking raises ``ValueError``).
    A ``device`` type stores the function in ``device_tasks`` and a ``system``
    type stores it in ``system_tasks``; any other type registers nothing.

    The registries hold the original function, while the decorated name is
    bound to a thin ``functools.wraps`` pass-through around it.
    """

    def task_decorator(task_func):
        task_kind, task_label = name.split(':')

        # Select the registry matching the prefix; unknown prefixes map to None.
        registry = {'device': device_tasks, 'system': system_tasks}.get(task_kind)
        if registry is not None:
            registry[task_label] = task_func

        @wraps(task_func)
        def passthrough(*call_args, **call_kwargs):
            return task_func(*call_args, **call_kwargs)

        return passthrough

    return task_decorator


class NVIDIA(GPUBase):
    """Collect NVIDIA GPU metrics through the ``pynvml`` (NVML) bindings.

    Every metric is produced by a method decorated with ``register_task``.
    ``get_info_by_index`` runs all registered ``device`` tasks for one GPU and
    ``get_system_info`` runs all registered ``system`` tasks; a task that
    raises contributes ``None`` instead of aborting the whole report.
    """

    name = 'nvidia'
    # Outcome of the most recent ``is_support()`` call; ``None`` until called.
    support = None

    def __init__(self):
        """Probe for NVML support and cache the number of devices.

        The support probe also initialises the pynvml library.
        """
        self.device_count = 0
        # Remember whether this instance initialised NVML so that ``__del__``
        # can release exactly that one initialisation.
        self._nvml_initialized = self.is_support()
        if self._nvml_initialized:
            self.device_count = pynvml.nvmlDeviceGetCount()

    def __del__(self):
        """Release the NVML initialisation acquired in ``__init__``, if any."""
        # Do not call ``is_support()`` here: it would run ``nvmlInit()`` again
        # and the single shutdown below would only undo that extra call,
        # leaving the initialisation from ``__init__`` unreleased.
        if not getattr(self, '_nvml_initialized', False):
            return
        self._nvml_initialized = False
        try:
            pynvml.nvmlShutdown()
        except Exception:
            # Finalizers may run during interpreter teardown; never raise here.
            pass

    def get_all_device_info(self):
        """Return the system report plus one report per device.

        Returns:
            dict: ``'system'`` maps to ``get_system_info()``; each integer
            index in ``range(self.device_count)`` maps to
            ``get_info_by_index(index)``.
        """
        all_info = {'system': self.get_system_info()}
        for index in range(self.device_count):
            all_info[index] = self.get_info_by_index(index)
        return all_info

    def get_info_by_index(self, index=0):
        """Run every registered device task against the GPU at ``index``.

        Args:
            index: device index passed to ``nvmlDeviceGetHandleByIndex``.

        Returns:
            dict: task name -> task result, or ``None`` for a task that raised.

        Raises:
            Whatever ``nvmlDeviceGetHandleByIndex`` raises; the handle lookup
            itself is not guarded.
        """
        info = {}
        handle = pynvml.nvmlDeviceGetHandleByIndex(index)

        for t_name, t_func in device_tasks.items():
            try:
                info[t_name] = t_func(self, handle)
            except Exception:
                # Best effort: one failing metric must not hide the others.
                info[t_name] = None

        return info

    def get_system_info(self):
        """Run every registered system task.

        Returns:
            dict: task name -> task result, or ``None`` for a task that raised.
        """
        info = {}
        for t_name, t_func in system_tasks.items():
            try:
                info[t_name] = t_func(self)
            except Exception:
                # Best effort: one failing metric must not hide the others.
                info[t_name] = None
        return info

    @classmethod
    def is_support(cls):
        """Try to initialise NVML and record the outcome in ``cls.support``.

        Every call runs ``pynvml.nvmlInit()``, so each successful call adds
        one NVML initialisation that needs a matching ``nvmlShutdown()``.

        Returns:
            bool: ``True`` if ``nvmlInit()`` succeeded, ``False`` if it raised
            ``pynvml.NVMLError``.
        """
        try:
            pynvml.nvmlInit()
        except pynvml.NVMLError:
            cls.support = False
            return False

        cls.support = True
        return True

    @register_task('device:memory')
    def _get_mem_info(self, handle):
        """Return ``size``/``free``/``used`` memory, each divided by 1024**3."""
        # Query once so the three figures come from the same snapshot.
        memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
        return {
            'size': int(memory.total) / 1024 ** 3,
            'free': int(memory.free) / 1024 ** 3,
            'used': int(memory.used) / 1024 ** 3,
        }

    @register_task('device:clock')
    def _get_clock_info(self, handle):
        """Return the graphics, SM, memory and video clocks reported by NVML."""
        return {
            'graphics': pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_GRAPHICS),
            'sm': pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_SM),
            'memory': pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_MEM),
            'video': pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_VIDEO),
        }

    @register_task('device:temperature')
    def _get_temp_info(self, handle):
        """Return the GPU temperature sensor reading.

        Falls back to ``nvmlDeviceGetTemperatureV1`` when the primary call
        raises ``NVMLError`` or is missing from the installed pynvml.
        """
        try:
            return pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
        except (pynvml.NVMLError, AttributeError):
            # A tuple is required here: ``except A or B`` only catches ``A``.
            return pynvml.nvmlDeviceGetTemperatureV1(handle, pynvml.NVML_TEMPERATURE_GPU)

    @register_task('device:utilization')
    def _get_uti_info(self, handle):
        """Return the ``gpu`` and ``memory`` utilization rates."""
        # Query once so both figures come from the same sample.
        rates = pynvml.nvmlDeviceGetUtilizationRates(handle)
        return {'gpu': rates.gpu, 'memory': rates.memory}

    @register_task('device:processes')
    def _get_proc_uti(self, handle):
        """List the compute, graphics and MPS processes running on the device.

        Returns:
            list[dict]: for each process, the attribute dict of the object
            returned by pynvml, extended with ``name`` (from
            ``nvmlSystemGetProcessName``) and ``type`` (``'Compute'``,
            ``'Graphics'`` or ``'MPS'``).
        """
        # (type label, pynvml function name); resolved lazily inside the loop
        # so a missing function only fails when its turn comes.
        sources = (
            ('Compute', 'nvmlDeviceGetComputeRunningProcesses'),
            ('Graphics', 'nvmlDeviceGetGraphicsRunningProcesses'),
            ('MPS', 'nvmlDeviceGetMPSComputeRunningProcesses'),
        )

        info = []
        for proc_type, getter_name in sources:
            for proc in getattr(pynvml, getter_name)(handle):
                entry = proc.__dict__
                entry['name'] = pynvml.nvmlSystemGetProcessName(proc.pid)
                entry['type'] = proc_type
                info.append(entry)

        return info

    @register_task('device:fan')
    def _get_fan_info(self, handle):
        """Return the fan speed as ``{'speed': value}``.

        NOTE(review): the RPM query and the ``nvmlDeviceGetFanSpeed*``
        fallbacks may report different units (RPM vs. percent) — confirm what
        consumers expect.
        """
        info = {}
        try:
            info['speed'] = pynvml.nvmlDeviceGetFanSpeedRPM(handle).speed
        except AttributeError:
            # Raised when the function is missing from pynvml or when its
            # result has no ``speed`` attribute.
            info['speed'] = pynvml.nvmlDeviceGetFanSpeed(handle)
        except pynvml.NVMLError:
            info['speed'] = pynvml.nvmlDeviceGetFanSpeed_v2(handle, 0)
        except Exception:
            info['speed'] = 0
        return info

    @register_task('device:name')
    def _get_device_name(self, handle):
        """Return the device name reported by NVML."""
        return pynvml.nvmlDeviceGetName(handle)

    @register_task('device:power')
    def _get_device_power(self, handle):
        """Return the ``current`` power usage and ``max`` power management limit.

        Values are passed through exactly as pynvml returns them.
        """
        return {
            'current': pynvml.nvmlDeviceGetPowerUsage(handle),
            'max': pynvml.nvmlDeviceGetPowerManagementLimit(handle),
        }

    @register_task('system:version')
    def _get_device_version(self):
        """Return the ``driver`` version and the ``cuda`` driver version.

        Falls back to ``nvmlSystemGetCudaDriverVersion_v2`` when the primary
        call raises ``NVMLError`` or is missing from the installed pynvml.
        """
        info = {'driver': pynvml.nvmlSystemGetDriverVersion()}

        try:
            info['cuda'] = pynvml.nvmlSystemGetCudaDriverVersion()
        except (pynvml.NVMLError, AttributeError):
            # A tuple is required here: ``except A or B`` only catches ``A``.
            info['cuda'] = pynvml.nvmlSystemGetCudaDriverVersion_v2()

        return info

    @register_task('system:count')
    def _get_device_count(self):
        """Return the number of devices reported by NVML."""
        return pynvml.nvmlDeviceGetCount()


if __name__ == '__main__':
    # Manual smoke test: print the combined system and per-device report.
    nvidia = NVIDIA()
    report = nvidia.get_all_device_info()
    print(report)