获取显卡状态信息

使用 NVIDIA 管理库 NVML 获取 GPU 利用率

简介

NVIDIA Management Library (NVML) 随 CUDA 一起发布,是一个基于 C 语言的 API,用于监控和管理 NVIDIA GPU 设备的各种状态。本文简单演示如何使用它获取显卡利用率。

代码

命令行

1
2
3
4
:: Show GPU usage information
nvidia-smi
:: List the available devices
nvidia-smi -L

C++

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#include <chrono>
#include <iostream>
#include <thread>

#include "nvml.h"

#pragma comment(lib,"nvml.lib")

int main()
{
nvmlReturn_t result;
unsigned int device_count, i;
// First initialize NVML library
result = nvmlInit();

result = nvmlDeviceGetCount(&device_count);
if (NVML_SUCCESS != result)
{
std::cout << "Failed to query device count: " << nvmlErrorString(result);
}
std::cout << "Found" << device_count << " device" << endl;

std::cout << "Listing devices:";
while (true)
{
for (i = 0; i < device_count; i++)
{
nvmlDevice_t device;
result = nvmlDeviceGetHandleByIndex(i, &device);
if (NVML_SUCCESS != result) {
std::cout << "get device failed " << endl;
continue;
}

char name[NVML_DEVICE_NAME_BUFFER_SIZE];
result = nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
if (NVML_SUCCESS == result) {
std::cout << "GPU name: " << name << endl;
}

char uuid[NVML_DEVICE_UUID_BUFFER_SIZE];
result = nvmlDeviceGetUUID(device, uuid, NVML_DEVICE_UUID_BUFFER_SIZE);
if (NVML_SUCCESS == result) {
std::cout << "GPU uuid: " << uuid << endl;
}

//使用率
nvmlUtilization_t utilization;
result = nvmlDeviceGetUtilizationRates(device, &utilization);
if (NVML_SUCCESS == result)
{
std::cout << "----- 使用率 ----- " << endl;
std::cout << "GPU 使用率: " << utilization.gpu << endl;
std::cout << "显存使用率 " << utilization.memory << endl;
}
}
Sleep(1000);
}
return 0;
}

C#

使用 C# 调用 NVML 需要额外封装 nvml.dll 库。
参考 nvml-csharp 库封装一个简单的帮助类,仅用于获取 GPU 与显存使用率。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
/// <summary>
/// Polls NVML once per second on a background task and publishes the GPU and
/// memory utilization of a single device through <see cref="GpuChange"/> and
/// <see cref="MemoryChange"/>.
/// </summary>
public class NvmlStateHelper
{
    /// <summary>
    /// File name of the native NVML library.
    /// </summary>
    const string NVML_SHARED_LIBRARY_STRING = "nvml.dll";

    /// <summary>
    /// Buffer size guaranteed to be large enough for \ref nvmlDeviceGetName
    /// </summary>
    const int NVML_DEVICE_NAME_BUFFER_SIZE = 64;

    /// <summary>
    /// Buffer size guaranteed to be large enough for \ref nvmlDeviceGetUUID
    /// </summary>
    const int NVML_DEVICE_UUID_BUFFER_SIZE = 80;

    /// <summary>
    /// Delay between two utilization queries, in milliseconds.
    /// </summary>
    const int POLL_INTERVAL_MILLISECONDS = 1000;

    /// <summary>
    /// Initializes the NVML library.
    /// </summary>
    /// <returns>The NVML status code of the call.</returns>
    [DllImport(NVML_SHARED_LIBRARY_STRING, EntryPoint = "nvmlInit_v2")]
    internal static extern NvmlReturn NvmlInitV2();

    /// <summary>
    /// Retrieves the number of devices.
    /// </summary>
    /// <param name="deviceCount">Receives the device count.</param>
    /// <returns>The NVML status code of the call.</returns>
    [DllImport(NVML_SHARED_LIBRARY_STRING, CharSet = CharSet.Ansi, EntryPoint = "nvmlDeviceGetCount_v2")]
    internal static extern NvmlReturn NvmlDeviceGetCount_v2(out uint deviceCount);

    /// <summary>
    /// Retrieves the handle of the device at the given index.
    /// Bound to the <c>_v2</c> export so that it matches the <c>_v2</c>
    /// initialization and device-count entry points used by this class.
    /// </summary>
    /// <param name="index">Index of the device.</param>
    /// <param name="device">Receives the device handle.</param>
    /// <returns>The NVML status code of the call.</returns>
    [DllImport(NVML_SHARED_LIBRARY_STRING, EntryPoint = "nvmlDeviceGetHandleByIndex_v2")]
    internal static extern NvmlReturn NvmlDeviceGetHandleByIndex(uint index, out IntPtr device);

    /// <summary>
    /// Retrieves the name of a device.
    /// </summary>
    /// <param name="device">Device handle.</param>
    /// <param name="name">Buffer that receives the name.</param>
    /// <param name="length">Size of <paramref name="name"/> in bytes.</param>
    /// <returns>The NVML status code of the call.</returns>
    [DllImport(NVML_SHARED_LIBRARY_STRING, CharSet = CharSet.Ansi, EntryPoint = "nvmlDeviceGetName")]
    internal static extern NvmlReturn NvmlDeviceGetName(IntPtr device, [Out, MarshalAs(UnmanagedType.LPArray)] byte[] name, uint length);

    /// <summary>
    /// Retrieves the UUID of a device.
    /// </summary>
    /// <param name="device">Device handle.</param>
    /// <param name="uuid">Buffer that receives the UUID.</param>
    /// <param name="length">Size of <paramref name="uuid"/> in bytes.</param>
    /// <returns>The NVML status code of the call.</returns>
    [DllImport(NVML_SHARED_LIBRARY_STRING, CharSet = CharSet.Ansi, EntryPoint = "nvmlDeviceGetUUID")]
    internal static extern NvmlReturn NvmlDeviceGetUUID(IntPtr device, [Out, MarshalAs(UnmanagedType.LPArray)] byte[] uuid, uint length);

    /// <summary>
    /// Retrieves the current utilization rates of a device.
    /// </summary>
    /// <param name="device">Device handle.</param>
    /// <param name="utilization">Receives the utilization rates.</param>
    /// <returns>The NVML status code of the call.</returns>
    [DllImport(NVML_SHARED_LIBRARY_STRING, CharSet = CharSet.Ansi, EntryPoint = "nvmlDeviceGetUtilizationRates")]
    internal static extern NvmlReturn NvmlDeviceGetUtilizationRates(IntPtr device, out NvmlUtilization utilization);

    /// <summary>
    /// Shuts NVML down.
    /// </summary>
    /// <returns>The NVML status code of the call.</returns>
    [DllImport(NVML_SHARED_LIBRARY_STRING, EntryPoint = "nvmlShutdown")]
    internal static extern NvmlReturn NvmlShutdown();

    /// <summary>
    /// Starts a background task that monitors one device.
    /// The constructor itself never throws on NVML errors: when initialization,
    /// the handle lookup or a utilization query fails, the background task
    /// simply ends and no further events are raised.
    /// </summary>
    /// <param name="gpuCount">
    /// Zero-based index of the device to monitor (despite the name, this is an
    /// index, not a count).
    /// </param>
    public NvmlStateHelper(uint gpuCount = 0)
    {
        Task.Run(() => MonitorDevice(gpuCount));
    }

    /// <summary>
    /// Name of the monitored device, or <c>null</c> while it has not been read
    /// (or could not be read).
    /// </summary>
    public string GpuName { get; private set; }

    /// <summary>
    /// UUID of the monitored device, or <c>null</c> while it has not been read
    /// (or could not be read).
    /// </summary>
    public string GpuUuid { get; private set; }

    /// <summary>
    /// Raised after each successful poll with the GPU utilization value.
    /// </summary>
    public event Action<uint> GpuChange;

    /// <summary>
    /// Raised after each successful poll with the memory utilization value.
    /// </summary>
    public event Action<uint> MemoryChange;

    /// <summary>
    /// Body of the background task: initializes NVML, reads the static device
    /// information once, then polls the utilization rates until a query fails.
    /// </summary>
    /// <param name="deviceIndex">Zero-based index of the device to monitor.</param>
    private void MonitorDevice(uint deviceIndex)
    {
        if (NvmlReturn.NVML_SUCCESS != NvmlInitV2())
        {
            return;
        }

        try
        {
            IntPtr device;
            if (NvmlReturn.NVML_SUCCESS != NvmlDeviceGetHandleByIndex(deviceIndex, out device))
            {
                return;
            }

            // Name and UUID are optional: a failure here does not stop the monitoring.
            byte[] nameBuffer = new byte[NVML_DEVICE_NAME_BUFFER_SIZE];
            if (NvmlReturn.NVML_SUCCESS == NvmlDeviceGetName(device, nameBuffer, NVML_DEVICE_NAME_BUFFER_SIZE))
            {
                GpuName = DecodeNullTerminated(nameBuffer);
            }

            byte[] uuidBuffer = new byte[NVML_DEVICE_UUID_BUFFER_SIZE];
            if (NvmlReturn.NVML_SUCCESS == NvmlDeviceGetUUID(device, uuidBuffer, NVML_DEVICE_UUID_BUFFER_SIZE))
            {
                GpuUuid = DecodeNullTerminated(uuidBuffer);
            }

            while (true)
            {
                try
                {
                    NvmlUtilization utilization;
                    if (NvmlReturn.NVML_SUCCESS != NvmlDeviceGetUtilizationRates(device, out utilization))
                    {
                        return;
                    }

                    GpuChange?.Invoke(utilization.gpu);
                    MemoryChange?.Invoke(utilization.memory);
                }
                catch (Exception)
                {
                    // Deliberate best effort: an exception thrown by a subscriber
                    // (or by the query itself) must not end the monitoring loop.
                }

                Thread.Sleep(POLL_INTERVAL_MILLISECONDS);
            }
        }
        finally
        {
            // Balance the successful NvmlInitV2 call on every exit path.
            NvmlShutdown();
        }
    }

    /// <summary>
    /// Decodes the bytes of <paramref name="buffer"/> up to (not including) the
    /// first zero byte.
    /// </summary>
    /// <param name="buffer">Buffer filled by a native call.</param>
    /// <returns>The decoded text.</returns>
    private static string DecodeNullTerminated(byte[] buffer)
    {
        int end = Array.IndexOf(buffer, (byte)0);
        if (end < 0)
        {
            end = buffer.Length;
        }

        return Encoding.Default.GetString(buffer, 0, end);
    }
}

/// <summary>
/// Status codes returned by the NVML entry points.
/// Every numeric value is written out so that the mapping does not depend on
/// the declaration order of the members.
/// </summary>
public enum NvmlReturn
{
    NVML_SUCCESS = 0,
    NVML_ERROR_UNINITIALIZED = 1,
    NVML_ERROR_INVALID_ARGUMENT = 2,
    NVML_ERROR_NOT_SUPPORTED = 3,
    NVML_ERROR_NO_PERMISSION = 4,
    NVML_ERROR_ALREADY_INITIALIZED = 5,
    NVML_ERROR_NOT_FOUND = 6,
    NVML_ERROR_INSUFFICIENT_SIZE = 7,
    NVML_ERROR_INSUFFICIENT_POWER = 8,
    NVML_ERROR_DRIVER_NOT_LOADED = 9,
    NVML_ERROR_TIMEOUT = 10,
    NVML_ERROR_IRQ_ISSUE = 11,
    NVML_ERROR_LIBRARY_NOT_FOUND = 12,
    NVML_ERROR_FUNCTION_NOT_FOUND = 13,
    NVML_ERROR_CORRUPTED_INFOROM = 14,
    NVML_ERROR_GPU_IS_LOST = 15,
    NVML_ERROR_RESET_REQUIRED = 16,
    NVML_ERROR_OPERATING_SYSTEM = 17,
    NVML_ERROR_LIB_RM_VERSION_MISMATCH = 18,
    NVML_ERROR_IN_USE = 19,
    NVML_ERROR_MEMORY = 20,
    NVML_ERROR_NO_DATA = 21,
    NVML_ERROR_VGPU_ECC_NOT_SUPPORTED = 22,
    NVML_ERROR_INSUFFICIENT_RESOURCES = 23,
    NVML_ERROR_UNKNOWN = 999
}

/// <summary>
/// Utilization rates of a device, filled in by native code through the
/// <c>out</c> parameter of <c>nvmlDeviceGetUtilizationRates</c>.
/// The layout is declared sequential explicitly because native code writes
/// straight into this struct: two consecutive 32-bit unsigned values, in
/// declaration order.
/// </summary>
[System.Runtime.InteropServices.StructLayout(System.Runtime.InteropServices.LayoutKind.Sequential)]
public struct NvmlUtilization
{
    /// <summary>
    /// GPU utilization value reported by NVML.
    /// </summary>
    public uint gpu { get; }

    /// <summary>
    /// Memory utilization value reported by NVML.
    /// </summary>
    public uint memory { get; }
}