Initial YakPanel commit
This commit is contained in:
23
mod/project/docker/app/gpu/__init__.py
Normal file
23
mod/project/docker/app/gpu/__init__.py
Normal file
@@ -0,0 +1,23 @@
|
||||
from typing import List
|
||||
|
||||
from .base import GPUBase
|
||||
from .nvidia import NVIDIA
|
||||
from .amd import AMD
|
||||
|
||||
class Driver:
    """Collect every GPU vendor driver that reports itself as usable.

    Each vendor class is probed through its ``is_support()`` classmethod and,
    when the probe is truthy, instantiated and stored in ``self.drivers``.
    """

    # Class-level default kept so ``Driver.drivers`` still resolves for any
    # existing caller; every instance shadows it with its own list in
    # ``__init__``.
    drivers: List[GPUBase] = []

    def __init__(self):
        # Use a per-instance list. Appending to the shared class attribute
        # made each new Driver() add duplicate entries visible to all
        # instances.
        self.drivers = []
        for vendor in (NVIDIA, AMD):
            if vendor.is_support():
                self.drivers.append(vendor())

    @property
    def support(self):
        """Return True when at least one vendor driver was registered."""
        return bool(self.drivers)

    def get_all_device_info(self, get):
        """Placeholder: iterates the drivers but collects nothing yet.

        Args:
            get: Request object supplied by the caller; currently unused.

        Returns:
            None.
        """
        # TODO: aggregate per-driver device information once every driver
        # exposes a common query method.
        for _driver in self.drivers:
            pass
|
||||
36
mod/project/docker/app/gpu/amd.py
Normal file
36
mod/project/docker/app/gpu/amd.py
Normal file
@@ -0,0 +1,36 @@
|
||||
from mod.project.docker.app.gpu.base import GPUBase
|
||||
|
||||
class AMD(GPUBase):
    """AMD GPU driver skeleton.

    Every method is an unimplemented stub that returns ``None``. Because
    ``is_support()`` therefore returns a falsy value, code that gates on it
    will not register this driver.
    """

    @classmethod
    def is_support(cls):
        """Stub: AMD detection is not implemented; returns None (falsy)."""
        return None

    def _get_device_version(self, *args, **kwargs):
        """Stub: not implemented; returns None."""
        return None

    def _get_device_name(self, *args, **kwargs):
        """Stub: not implemented; returns None."""
        return None

    def _get_fan_info(self, *args, **kwargs):
        """Stub: not implemented; returns None."""
        return None

    def main(self):
        """Stub: not implemented; returns None."""
        return None

    def get_info(self, gpu_id=0):
        """Stub: not implemented; returns None regardless of ``gpu_id``."""
        return None

    def _get_mem_info(self):
        """Stub: not implemented; returns None."""
        return None

    def _get_clock_info(self):
        """Stub: not implemented; returns None."""
        return None

    def _get_temp_info(self):
        """Stub: not implemented; returns None."""
        return None

    def _get_uti_info(self):
        """Stub: not implemented; returns None."""
        return None

    def _get_proc_uti(self, proc_name='', proc_pid=0):
        """Stub: not implemented; returns None."""
        return None
|
||||
70
mod/project/docker/app/gpu/base.py
Normal file
70
mod/project/docker/app/gpu/base.py
Normal file
@@ -0,0 +1,70 @@
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
|
||||
class GPUBase(ABC):
    """Abstract interface that every GPU vendor driver must implement.

    Concrete subclasses have to override all of the abstract methods below
    (including the ``is_support`` classmethod) before they can be
    instantiated.
    """

    # Vendor identifier; subclasses override it.
    name = 'base'
    # Support flag; left as None here for subclasses to set.
    support = None

    @abstractmethod
    def _get_mem_info(self, *args, **kwargs):
        """Get GPU memory usage."""

    @abstractmethod
    def _get_clock_info(self, *args, **kwargs):
        """Get clock information."""

    @abstractmethod
    def _get_temp_info(self, *args, **kwargs):
        """Get the temperature."""

    @abstractmethod
    def _get_uti_info(self, *args, **kwargs):
        """Get utilization."""

    @abstractmethod
    def _get_proc_uti(self, *args, **kwargs):
        """Get per-process utilization."""

    @abstractmethod
    def _get_fan_info(self, *args, **kwargs):
        """Get fan information."""

    @abstractmethod
    def _get_device_name(self, *args, **kwargs):
        """Get the device name."""

    @abstractmethod
    def _get_device_version(self, *args, **kwargs):
        """Get version information."""

    @classmethod
    @abstractmethod
    def is_support(cls):
        """Report whether this vendor's GPU stack is usable."""
|
||||
27
mod/project/docker/app/gpu/constants.py
Normal file
27
mod/project/docker/app/gpu/constants.py
Normal file
@@ -0,0 +1,27 @@
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class CMD:
    """Namespace of shell command strings used by the GPU tooling."""

    @dataclass
    class CTK:
        """Commands for the NVIDIA Container Toolkit (``nvidia-ctk``)."""

        @dataclass
        class APT:
            """Installation steps for apt-based systems."""

            GetGPGKey = (
                "curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey"
                " | sudo gpg --dearmor"
                " -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg"
            )
            AddSourcesList = (
                "curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list"
                " | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g'"
                " | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list"
            )
            APTUpdate = "sudo apt-get update"
            Install = "sudo apt-get install -y nvidia-container-toolkit"
            # All steps chained with ';' so they run one after another.
            OneInstall = ';'.join((GetGPGKey, AddSourcesList, APTUpdate, Install))

        @dataclass
        class YUM:
            """Installation steps for yum-based systems."""

            AddRepo = (
                "curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo"
                " | sudo tee /etc/yum.repos.d/nvidia-container-toolkit.repo"
            )
            Install = "sudo yum install -y nvidia-container-toolkit"
            # Both steps chained with ';'.
            OneInstall = ';'.join((AddRepo, Install))

        @dataclass
        class ConfigureDocker:
            """Commands that register the toolkit runtime with Docker."""

            Runtime = "sudo nvidia-ctk runtime configure --runtime=docker"
            Restart = "sudo systemctl restart docker"

        CheckVersion = "nvidia-ctk -v"
|
||||
199
mod/project/docker/app/gpu/nvidia.py
Normal file
199
mod/project/docker/app/gpu/nvidia.py
Normal file
@@ -0,0 +1,199 @@
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from functools import wraps
|
||||
|
||||
if "/www/server/panel/class" not in sys.path:
|
||||
sys.path.append('/www/server/panel/class')
|
||||
|
||||
import public
|
||||
|
||||
try:
    import pynvml
except ImportError:
    # The NVML bindings are missing: install them through the ``btpip``
    # command and retry. Only ImportError is handled so that Ctrl-C or
    # SystemExit during import is no longer swallowed.
    public.ExecShell("btpip install nvidia-ml-py")
    import pynvml

try:
    from mod.project.docker.app.gpu.base import GPUBase
except ImportError:
    # Fallback for when the project package is not importable (for example
    # when this file is executed directly as a script).
    class GPUBase:
        """Minimal stand-in for the project's abstract GPU base class."""

# Registries filled by ``register_task``: task name -> undecorated function.
# ``defaultdict()`` without a factory behaves like a plain dict; the type is
# kept as-is for existing importers.
device_tasks = defaultdict()
system_tasks = defaultdict()
|
||||
|
||||
|
||||
def register_task(name: str):
    """Build a decorator that records a function in a task registry.

    Args:
        name: Identifier of the form ``"<type>:<task name>"``. A type of
            ``device`` stores the function in ``device_tasks`` and ``system``
            stores it in ``system_tasks``; any other type is not recorded.

    Returns:
        A decorator. It registers the *undecorated* function and returns a
        thin pass-through wrapper around it.

    Raises:
        ValueError: At decoration time, if ``name`` does not split into
            exactly two parts on ``':'``.
    """
    def task_decorator(task_func):
        kind, key = name.split(':')
        registry = {'device': device_tasks, 'system': system_tasks}.get(kind)
        if registry is not None:
            registry[key] = task_func

        @wraps(task_func)
        def func_wrapper(*args, **kwargs):
            return task_func(*args, **kwargs)

        return func_wrapper

    return task_decorator
|
||||
|
||||
|
||||
class NVIDIA(GPUBase):
    """NVIDIA GPU driver backed by NVML through ``pynvml``.

    Query methods are registered with ``register_task``; the public
    ``get_*`` methods run every registered task and store ``None`` for any
    task that raises.
    """

    name = 'nvidia'
    # None until ``is_support()`` has run, then True/False.
    support = None

    def __init__(self):
        # Checking for support also initialises the pynvml library.
        self.device_count = 0
        # Remember whether this instance holds an NVML initialisation so that
        # __del__ releases exactly that one.
        self._nvml_ready = self.is_support()
        if self._nvml_ready:
            self.device_count = pynvml.nvmlDeviceGetCount()

    def __del__(self):
        # Calling is_support() here would run nvmlInit() again right before
        # nvmlShutdown(), so the initialisation taken in __init__ was never
        # released. Use the flag recorded at construction time instead.
        if not getattr(self, '_nvml_ready', False):
            return
        self._nvml_ready = False
        try:
            pynvml.nvmlShutdown()
        except Exception:
            # Best effort: a finaliser must not raise (e.g. while the
            # interpreter is shutting down).
            pass

    def get_all_device_info(self):
        """Return system info plus per-device info.

        Returns:
            dict: ``'system'`` maps to ``get_system_info()``; each device
            index in ``range(self.device_count)`` maps to
            ``get_info_by_index(index)``.
        """
        all_info = {'system': self.get_system_info()}
        for index in range(self.device_count):
            all_info[index] = self.get_info_by_index(index)
        return all_info

    def get_info_by_index(self, index=0):
        """Run every registered device task against one device.

        Args:
            index: NVML device index.

        Returns:
            dict: task name -> task result, or ``None`` if the task raised.
        """
        handle = pynvml.nvmlDeviceGetHandleByIndex(index)
        info = {}
        for t_name, t_func in device_tasks.items():
            try:
                info[t_name] = t_func(self, handle)
            except Exception:
                # One unsupported query must not hide the other results.
                info[t_name] = None
        return info

    def get_system_info(self):
        """Run every registered system task.

        Returns:
            dict: task name -> task result, or ``None`` if the task raised.
        """
        info = {}
        for t_name, t_func in system_tasks.items():
            try:
                info[t_name] = t_func(self)
            except Exception:
                info[t_name] = None
        return info

    @classmethod
    def is_support(cls):
        """Try to initialise NVML and record the outcome on ``cls.support``.

        Returns:
            bool: True if ``pynvml.nvmlInit()`` succeeded, False if it raised
            ``pynvml.NVMLError``.
        """
        try:
            pynvml.nvmlInit()
        except pynvml.NVMLError:
            cls.support = False
            return False
        cls.support = True
        return True

    @register_task('device:memory')
    def _get_mem_info(self, handle):
        """Return total/free/used memory, each divided by 1024 ** 3."""
        # Query once so the three figures come from the same snapshot.
        mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
        unit = 1024 ** 3
        return {
            'size': int(mem.total) / unit,
            'free': int(mem.free) / unit,
            'used': int(mem.used) / unit,
        }

    @register_task('device:clock')
    def _get_clock_info(self, handle):
        """Return the graphics, SM, memory and video clock readings."""
        return {
            'graphics': pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_GRAPHICS),
            'sm': pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_SM),
            'memory': pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_MEM),
            'video': pynvml.nvmlDeviceGetClockInfo(handle, pynvml.NVML_CLOCK_VIDEO),
        }

    @register_task('device:temperature')
    def _get_temp_info(self, handle):
        """Return the GPU temperature, falling back to the V1 call."""
        try:
            return pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
        # A tuple is required here: ``except A or B`` evaluates to
        # ``except A`` and never caught AttributeError.
        except (pynvml.NVMLError, AttributeError):
            return pynvml.nvmlDeviceGetTemperatureV1(handle, pynvml.NVML_TEMPERATURE_GPU)

    @register_task('device:utilization')
    def _get_uti_info(self, handle):
        """Return the GPU and memory utilization rates."""
        # Single query so both values belong to the same sample.
        rates = pynvml.nvmlDeviceGetUtilizationRates(handle)
        return {'gpu': rates.gpu, 'memory': rates.memory}

    @register_task('device:processes')
    def _get_proc_uti(self, handle):
        """List the running Compute, Graphics and MPS processes.

        Returns:
            list[dict]: each process object's ``__dict__`` extended with
            ``name`` (from ``nvmlSystemGetProcessName``) and ``type``.
        """
        sources = (
            ('Compute', pynvml.nvmlDeviceGetComputeRunningProcesses),
            ('Graphics', pynvml.nvmlDeviceGetGraphicsRunningProcesses),
            ('MPS', pynvml.nvmlDeviceGetMPSComputeRunningProcesses),
        )
        info = []
        for proc_type, list_processes in sources:
            for proc in list_processes(handle):
                entry = proc.__dict__
                entry['name'] = pynvml.nvmlSystemGetProcessName(proc.pid)
                entry['type'] = proc_type
                info.append(entry)
        return info

    @register_task('device:fan')
    def _get_fan_info(self, handle):
        """Return ``{'speed': ...}``, using 0 when no fan query succeeds."""
        # The fallbacks run inside ``except`` handlers, so their own errors
        # are not seen by sibling handlers. The outer ``try`` makes the
        # final fallback to 0 cover them too.
        try:
            try:
                speed = pynvml.nvmlDeviceGetFanSpeedRPM(handle).speed
            except AttributeError:
                speed = pynvml.nvmlDeviceGetFanSpeed(handle)
            except pynvml.NVMLError:
                speed = pynvml.nvmlDeviceGetFanSpeed_v2(handle, 0)
        except Exception:
            speed = 0
        return {'speed': speed}

    @register_task('device:name')
    def _get_device_name(self, handle):
        """Return the device name reported by NVML."""
        return pynvml.nvmlDeviceGetName(handle)

    @register_task('device:power')
    def _get_device_power(self, handle):
        """Return current power usage and the power management limit.

        Values are passed through unchanged from NVML.
        NOTE(review): NVML documents both as milliwatts -- confirm before
        displaying them.
        """
        return {
            'current': pynvml.nvmlDeviceGetPowerUsage(handle),
            'max': pynvml.nvmlDeviceGetPowerManagementLimit(handle),
        }

    @register_task('system:version')
    def _get_device_version(self):
        """Return the driver version and the CUDA driver version."""
        info = {'driver': pynvml.nvmlSystemGetDriverVersion()}
        try:
            info['cuda'] = pynvml.nvmlSystemGetCudaDriverVersion()
        # Tuple form so that a missing function (AttributeError) also
        # triggers the _v2 fallback.
        except (pynvml.NVMLError, AttributeError):
            info['cuda'] = pynvml.nvmlSystemGetCudaDriverVersion_v2()
        return info

    @register_task('system:count')
    def _get_device_count(self):
        """Return the number of devices reported by NVML."""
        return pynvml.nvmlDeviceGetCount()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke test: build the NVIDIA driver and print everything it can
    # report when this file is run directly as a script.
    nvidia = NVIDIA()
    print(nvidia.get_all_device_info())
|
||||
158
mod/project/docker/app/gpu/tools.py
Normal file
158
mod/project/docker/app/gpu/tools.py
Normal file
@@ -0,0 +1,158 @@
|
||||
import os
|
||||
import sys
|
||||
from typing import Tuple
|
||||
|
||||
from mod.project.docker.app.gpu.constants import CMD
|
||||
from mod.project.docker.app.gpu.nvidia import NVIDIA
|
||||
|
||||
if "/www/server/panel/class" not in sys.path:
|
||||
sys.path.append('/www/server/panel/class')
|
||||
|
||||
import public
|
||||
|
||||
|
||||
class GPUTool:
    """Helpers for GPU options on Docker apps and NVIDIA Container Toolkit setup."""

    # Cached result of ``__gpu_default_setting``; None means "not probed yet".
    gpu_option = None
    option_default = None

    @staticmethod
    def __get_linux_distribution():
        """Classify the host as Debian/Ubuntu-like or CentOS/Red Hat-like.

        ``/etc/os-release`` is parsed first (``ID``, then ``ID_LIKE``). If it
        is missing or does not identify a known family, the marker files
        ``/etc/debian_version`` and ``/etc/redhat-release`` are checked.

        Returns:
            ``"debian"``, ``"centos"``, or ``None`` when the family is unknown.

        Raises:
            ValueError: If ``/etc/os-release`` exists but cannot be read.
        """
        os_release = {}
        try:
            with open("/etc/os-release", "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line or line.startswith("#") or "=" not in line:
                        continue
                    key, value = line.split("=", 1)
                    # Values may be wrapped in double or single quotes.
                    os_release[key] = value.strip().strip("\"'")
        except FileNotFoundError:
            # Fall through to the marker-file checks below.
            pass
        except Exception as err:
            raise ValueError("System Distribution Is Unknown") from err

        dist_id = os_release.get("ID", "").lower()
        id_like = os_release.get("ID_LIKE", "").lower()

        if dist_id in ("debian", "ubuntu"):
            return "debian"
        if dist_id in ("centos", "rhel", "fedora"):
            return "centos"
        if "debian" in id_like:
            return "debian"
        if "rhel" in id_like or "fedora" in id_like or "centos" in id_like:
            return "centos"

        # os-release was absent or inconclusive: use distribution markers.
        if os.path.exists("/etc/debian_version"):
            return "debian"
        if os.path.exists("/etc/redhat-release"):
            return "centos"
        return None

    @classmethod
    def __gpu_default_setting(cls) -> Tuple[bool, bool]:
        """Decide whether to offer the GPU option and what its default is.

        The result is cached on the class after the first probe.

        Returns:
            gpu_option: Whether the GPU option should be offered.
            option_default: Whether the GPU option defaults to enabled. This
                is True only when the summed ``memory.size`` of all devices
                is greater than 3.
        """
        if cls.gpu_option is not None and cls.option_default is not None:
            return cls.gpu_option, cls.option_default

        driver = NVIDIA()
        # Not supported: hide the option entirely.
        if not driver.support:
            cls.gpu_option = False
            cls.option_default = False
            return cls.gpu_option, cls.option_default

        # Supported: sum the reported memory size of every entry.
        mem_size = 0
        for _device in driver.get_all_device_info().values():
            # A failed query is stored as None, so guard both levels instead
            # of calling .get() on None.
            memory = (_device or {}).get('memory') or {}
            mem_size += memory.get('size', 0) or 0

        cls.gpu_option = True
        cls.option_default = mem_size > 3
        return cls.gpu_option, cls.option_default

    @classmethod
    def register_app_gpu_option(cls, app):
        """Adjust the ``attr == 'gpu'`` fields of an app definition in place.

        Args:
            app: Mapping that may contain a ``'field'`` list of field dicts.

        Returns:
            The same ``app`` object. When the GPU option is unavailable all
            gpu fields are removed; otherwise each gpu field gets ``default``
            set and a note appended to its ``suffix``.
        """
        option, default = cls.__gpu_default_setting()
        fields = app.get('field')
        if not fields:
            return app

        if not option:
            # Rebuild the list in one pass. Removing items while iterating
            # skipped the element that followed each removed field.
            fields[:] = [f for f in fields if f.get('attr', '') != 'gpu']
            return app

        for field in fields:
            if field.get('attr', '') == 'gpu':
                field['default'] = default
                field['suffix'] = field.get('suffix', '') + ' | 已默认设置为{}'.format(default)
        return app

    @staticmethod
    def is_install_ctk():
        """Return True when ``nvidia-ctk -v`` runs cleanly and reports a version."""
        stdout, stderr = public.ExecShell(CMD.CTK.CheckVersion)
        if len(stderr) != 0:
            return False
        # ``not str.find(...)`` is only true when the match is at index 0, so
        # a missing "version" (-1) used to pass. Use a membership test.
        if 'version' not in stdout.lower():
            public.print_log("Not Nvidia Container Toolkit")
            return False
        return True

    @classmethod
    def __ctk_install_cmd_apt(cls, app_log):
        """Return the apt install steps, each appending its stdout to ``app_log``."""
        return ("{get_gpg_key} >> {app_log};"
                "{add_sources_list} >> {app_log};"
                "{apt_update} >> {app_log};"
                "{install} >> {app_log}"
                .format(get_gpg_key=CMD.CTK.APT.GetGPGKey,
                        add_sources_list=CMD.CTK.APT.AddSourcesList,
                        apt_update=CMD.CTK.APT.APTUpdate,
                        install=CMD.CTK.APT.Install,
                        app_log=app_log))

    @classmethod
    def __ctk_install_cmd_yum(cls, app_log):
        """Return the yum install steps, each appending its stdout to ``app_log``."""
        return ("{add_repo} >> {app_log};"
                "{install} >> {app_log}"
                .format(add_repo=CMD.CTK.YUM.AddRepo,
                        install=CMD.CTK.YUM.Install,
                        app_log=app_log))

    @classmethod
    def __config_docker(cls, app_log):
        """Return the Docker runtime configuration and restart steps."""
        return ("{runtime} >> {app_log};"
                "{restart} >> {app_log}"
                .format(runtime=CMD.CTK.ConfigureDocker.Runtime,
                        restart=CMD.CTK.ConfigureDocker.Restart,
                        app_log=app_log))

    @classmethod
    def ctk_install_cmd(cls, app_log):
        """Build the full toolkit install command for this host.

        Args:
            app_log: Path that every step's stdout is appended to.

        Returns:
            str: install steps followed by Docker configuration, or ``''``
            when the distribution family is not recognised.
        """
        dtb = cls.__get_linux_distribution()
        if dtb == 'debian':
            install_cmd = cls.__ctk_install_cmd_apt(app_log)
        elif dtb == 'centos':
            install_cmd = cls.__ctk_install_cmd_yum(app_log)
        else:
            return ''
        return "{install_cmd};{config_docker}".format(
            install_cmd=install_cmd,
            config_docker=cls.__config_docker(app_log),
        )
|
||||
0
mod/project/docker/app/gpu/type.py
Normal file
0
mod/project/docker/app/gpu/type.py
Normal file
Reference in New Issue
Block a user