-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathgpu_util.py
154 lines (142 loc) · 6.51 KB
/
gpu_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
from subprocess import Popen, PIPE, STDOUT
import os
import numpy as np
import random
import time
class GPU:
    """Snapshot of a single GPU's state.

    Attributes:
        id: Device index, stored as passed in ``ID``.
        load: GPU load, stored as passed in.
        memoryUtil: ``memoryUsed / memoryTotal`` as a float, or NaN when
            ``memoryTotal`` is zero.
        memoryTotal: Total memory, stored as passed in.
        memoryUsed: Used memory, stored as passed in.
        memoryFree: Free memory, stored as passed in.
        driver: Driver version string.
        name: Device name (constructor argument ``gpu_name``).
        serial: Device serial.
        display_mode: Display mode field.
        display_active: Display active field.
    """

    def __init__(self, ID, load, memoryTotal, memoryUsed, memoryFree, driver, gpu_name, serial, display_mode, display_active):
        self.id = ID
        self.load = load
        total = float(memoryTotal)
        # A zero total would otherwise raise ZeroDivisionError while merely
        # constructing the record. NaN compares False against any threshold,
        # so such a device is never reported as available.
        self.memoryUtil = float(memoryUsed) / total if total else float('nan')
        self.memoryTotal = memoryTotal
        self.memoryUsed = memoryUsed
        self.memoryFree = memoryFree
        self.driver = driver
        self.name = gpu_name
        self.serial = serial
        self.display_mode = display_mode
        self.display_active = display_active

    def __repr__(self):
        return 'GPU(id={!r}, name={!r}, load={!r}, memoryUtil={!r})'.format(
            self.id, self.name, self.load, self.memoryUtil)
def getGPUs():
    """Query ``nvidia-smi`` and return one ``GPU`` object per reported device.

    Runs ``nvidia-smi --query-gpu=... --format=csv,noheader,nounits`` and
    parses each non-empty output line as ten ``', '``-separated fields, in
    the order requested by the query: index, utilization.gpu, memory.total,
    memory.used, memory.free, driver_version, name, gpu_serial,
    display_active, display_mode.

    Returns:
        list of GPU: One entry per output line, in output order. The index
        and the memory fields are parsed as ``int``; ``load`` is the
        reported utilization divided by 100.

    Raises:
        OSError: If the ``nvidia-smi`` executable cannot be started.
        ValueError: If a line does not have ten fields or a numeric field
            cannot be parsed.
    """
    query = "index,utilization.gpu,memory.total,memory.used,memory.free,driver_version,name,gpu_serial,display_active,display_mode"
    p = Popen(["nvidia-smi", "--query-gpu=" + query, "--format=csv,noheader,nounits"], stdout=PIPE)
    # communicate() reads stdout to EOF, closes the pipe and waits for the
    # child, so neither a file descriptor nor a zombie process is left behind.
    stdout, _ = p.communicate()
    output = stdout.decode('UTF-8')

    GPUs = []
    # splitlines() copes with both '\n' and '\r\n' endings, and skipping blank
    # lines removes the assumption that the output ends in exactly one newline.
    for line in output.splitlines():
        if not line.strip():
            continue
        (index, util, memTotal, memUsed, memFree,
         driver, gpu_name, serial, display_active, display_mode) = line.split(', ')[:10]
        # The query emits display_active before display_mode, whereas the GPU
        # constructor takes display_mode first.
        GPUs.append(GPU(int(index), float(util)/100, int(memTotal), int(memUsed), int(memFree),
                        driver, gpu_name, serial, display_mode, display_active))
    return GPUs
def getAvailable(order = 'first', limit = 1, maxLoad = 0.5, maxMemory = 0.5):
    """Return the IDs of the GPUs that getAvailability() flags as available.

    Args:
        order: How the available GPUs are ordered before ``limit`` is applied:
            'first'  -- ascending ID (default)
            'last'   -- descending ID
            'random' -- random order
            'load'   -- ascending load
            'memory' -- ascending memory utilization
            Any other value leaves them in the order getGPUs() returned them.
        limit: Upper bound on the number of IDs returned. May be
            ``float('inf')`` to return every available GPU. If fewer GPUs
            are available than ``limit``, only those are returned.
        maxLoad: Passed to getAvailability() as the load threshold.
        maxMemory: Passed to getAvailability() as the memory threshold.

    Returns:
        list: Device IDs of the selected GPUs (possibly empty).
    """
    # Get device IDs, load and memory usage
    GPUs = getGPUs()

    # Keep only the GPUs flagged with 1 by getAvailability()
    availability = getAvailability(GPUs, maxLoad, maxMemory)
    GPUs = [gpu for gpu, isFree in zip(GPUs, availability) if isFree == 1]

    # Order the remaining GPUs according to the order argument
    if order == 'first':
        GPUs.sort(key=lambda gpu: gpu.id)
    elif order == 'last':
        GPUs.sort(key=lambda gpu: gpu.id, reverse=True)
    elif order == 'random':
        GPUs = random.sample(GPUs, len(GPUs))
    elif order == 'load':
        GPUs.sort(key=lambda gpu: gpu.load)
    elif order == 'memory':
        GPUs.sort(key=lambda gpu: gpu.memoryUtil)

    # min() of a float limit (e.g. inf) and an int may be a float, which is
    # not a valid slice bound; int() makes the bound usable in every case.
    count = int(min(limit, len(GPUs)))
    GPUs = GPUs[0:count]

    return [gpu.id for gpu in GPUs]
def getAvailability(GPUs, maxLoad = 0.5, maxMemory = 0.5):
    """Flag each GPU as available (1) or unavailable (0).

    A GPU counts as available when its ``load`` is strictly below ``maxLoad``
    and its ``memoryUtil`` is strictly below ``maxMemory``.

    Args:
        GPUs: Sequence of objects exposing ``load`` and ``memoryUtil``.
        maxLoad: Exclusive upper bound on ``load``.
        maxMemory: Exclusive upper bound on ``memoryUtil``.

    Returns:
        numpy.ndarray: Float array with one entry per GPU, 1 where the GPU
        is available and 0 otherwise.
    """
    loads = np.array([gpu.load for gpu in GPUs], dtype=float)
    memoryUtils = np.array([gpu.memoryUtil for gpu in GPUs], dtype=float)
    withinLimits = (loads < maxLoad) & (memoryUtils < maxMemory)
    return withinLimits.astype(float)
def getFirstAvailable(order = 'first', maxLoad=0.5, maxMemory=0.5, attempts=1, interval=900, verbose=False):
    """Poll for an available GPU, retrying up to ``attempts`` times.

    Each attempt calls getAvailable() with ``limit = 1``. Between
    unsuccessful attempts (but not after the last one) the function sleeps
    for ``interval`` seconds.

    Args:
        order: Forwarded to getAvailable().
        maxLoad: Forwarded to getAvailable().
        maxMemory: Forwarded to getAvailable().
        attempts: Maximum number of polls.
        interval: Seconds to sleep between polls.
        verbose: If true, print progress messages.

    Returns:
        list: The non-empty result of getAvailable().

    Raises:
        RuntimeError: If no GPU was found within ``attempts`` polls
            (including when ``attempts`` is less than 1).
    """
    # Initialized up front so that attempts < 1 ends in the RuntimeError
    # below rather than a NameError on an unbound variable.
    available = []
    for i in range(attempts):
        if verbose:
            print('Attempting (' + str(i+1) + '/' + str(attempts) + ') to locate available GPU.')
        # Get first available GPU
        available = getAvailable(order = order, limit = 1, maxLoad = maxLoad, maxMemory = maxMemory)
        # If an available GPU was found, stop polling
        if available:
            if verbose:
                print('GPU ' + str(available) + ' located!')
            break
        # If this is not the last attempt, sleep for 'interval' seconds
        if i != attempts-1:
            time.sleep(interval)
    # Either a GPU was found or the attempts ran out
    if not available:
        raise RuntimeError('Could not find an available GPU after ' + str(attempts) + ' attempts with ' + str(interval) + ' seconds interval.')
    return available
def showUtilization(all=False):
    """Print a utilization table for every GPU returned by getGPUs().

    Args:
        all: If false (default), print only ID, GPU load and memory
            utilization. If true, print a wide table that also includes
            name, serial, memory totals and the display fields.
    """
    GPUs = getGPUs()

    if not all:
        # Compact table
        print(' ID GPU MEM')
        print('--------------')
        compactRow = ' {0:2d} {1:3.0f}% {2:3.0f}%'
        for gpu in GPUs:
            print(compactRow.format(gpu.id, gpu.load*100, gpu.memoryUtil*100))
        return

    # Wide table
    print(' ID | Name | Serial || GPU util. | Memory util. || Memory total | Memory used | Memory free || Display mode | Display active |')
    print('------------------------------------------------------------------------------------------------------------------------------')
    wideRow = ' {0:2d} | {1:s} | {2:s} || {3:3.0f}% | {4:3.0f}% || {5:d}MB | {6:d}MB | {7:d}MB || {8:s} | {9:s}'
    for gpu in GPUs:
        print(wideRow.format(
            gpu.id, gpu.name, gpu.serial,
            gpu.load*100, gpu.memoryUtil*100,
            gpu.memoryTotal, gpu.memoryUsed, gpu.memoryFree,
            gpu.display_mode, gpu.display_active))