
Commit 8c77a20

Filter out malformed nvidia-smi process_name XML tag
1 parent 20f01e0


3 files changed: +65 -86 lines changed


cwltool/cuda.py

+36 -15

@@ -8,23 +8,47 @@
 from .utils import CWLObjectType
 
 
+def cuda_device_count() -> str:
+    """Determine the number of attached CUDA GPUs."""
+    # For the number of GPUs, we can use the following query
+    cmd = ["nvidia-smi", "--query-gpu=count", "--format=csv,noheader"]
+    try:
+        # This is equivalent to subprocess.check_output, but use
+        # subprocess.run so we can use separate MagicMocks in test_cuda.py
+        proc = subprocess.run(cmd, stdout=subprocess.PIPE, check=True)  # nosec
+    except Exception as e:
+        _logger.warning("Error checking number of GPUs with nvidia-smi: %s", e)
+        return "0"
+    return proc.stdout.decode("utf-8")
+
+
 def cuda_version_and_device_count() -> Tuple[str, int]:
     """Determine the CUDA version and number of attached CUDA GPUs."""
+    count = int(cuda_device_count())
+
+    # Since there is no specific query for the cuda version, we have to use
+    # `nvidia-smi -q -x`
+    # However, apparently nvidia-smi is not safe to call concurrently.
+    # With --parallel, sometimes the returned XML will contain
+    # <process_name>\xff...\xff</process_name>
+    # (or other arbitrary bytes) and xml.dom.minidom.parseString will raise
+    # "xml.parsers.expat.ExpatError: not well-formed (invalid token)"
+    # So we either need to use `grep -v process_name` to blacklist that tag,
+    # (and hope that no other tags cause problems in the future)
+    # or better yet use `grep cuda_version` to only grab the tags we will use.
+    cmd = "nvidia-smi -q -x | grep cuda_version"
     try:
-        out = subprocess.check_output(["nvidia-smi", "-q", "-x"])  # nosec
+        out = subprocess.check_output(cmd, shell=True)  # nosec
     except Exception as e:
         _logger.warning("Error checking CUDA version with nvidia-smi: %s", e)
         return ("", 0)
-    dm = xml.dom.minidom.parseString(out)  # nosec
 
-    ag = dm.getElementsByTagName("attached_gpus")
-    if len(ag) < 1 or ag[0].firstChild is None:
-        _logger.warning(
-            "Error checking CUDA version with nvidia-smi. Missing 'attached_gpus' or it is empty.: %s",
-            out,
-        )
+    try:
+        dm = xml.dom.minidom.parseString(out)  # nosec
+    except xml.parsers.expat.ExpatError as e:
+        _logger.warning("Error parsing XML stdout of nvidia-smi: %s", e)
+        _logger.warning("stdout: %s", out)
         return ("", 0)
-    ag_element = ag[0].firstChild
 
     cv = dm.getElementsByTagName("cuda_version")
     if len(cv) < 1 or cv[0].firstChild is None:
@@ -35,13 +59,10 @@ def cuda_version_and_device_count() -> Tuple[str, int]:
         return ("", 0)
     cv_element = cv[0].firstChild
 
-    if isinstance(cv_element, xml.dom.minidom.Text) and isinstance(
-        ag_element, xml.dom.minidom.Text
-    ):
-        return (cv_element.data, int(ag_element.data))
+    if isinstance(cv_element, xml.dom.minidom.Text):
+        return (cv_element.data, count)  # type: ignore
     _logger.warning(
-        "Error checking CUDA version with nvidia-smi. "
-        "Either 'attached_gpus' or 'cuda_version' was not a text node: %s",
+        "Error checking CUDA version with nvidia-smi. 'cuda_version' was not a text node: %s",
         out,
     )
     return ("", 0)
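
The comment block in this diff describes the failure mode being worked around: raw `nvidia-smi -q -x` output can carry arbitrary bytes inside `<process_name>`, which makes `xml.dom.minidom` raise an ExpatError, while keeping only the `<cuda_version>` line still parses cleanly. Below is a minimal, self-contained sketch of that behaviour (not part of the commit): the sample bytes are fabricated, not real nvidia-smi output, and the in-Python line filter merely stands in for the `grep cuda_version` pipe.

# Minimal sketch (not part of the commit): why unfiltered nvidia-smi XML can
# fail to parse, and why filtering down to the cuda_version line is enough.
# The sample bytes below are fabricated, not real nvidia-smi output.
import xml.dom.minidom
import xml.parsers.expat

raw = (
    b"<nvidia>\n"
    b"<cuda_version>11.4</cuda_version>\n"
    b"<process_name>\xff\xff</process_name>\n"
    b"</nvidia>\n"
)

try:
    xml.dom.minidom.parseString(raw)  # nosec
except xml.parsers.expat.ExpatError as err:
    print("unfiltered output fails to parse:", err)

# Emulate `grep cuda_version`: keep only the line(s) we actually need.
filtered = b"\n".join(line for line in raw.splitlines() if b"cuda_version" in line)
dm = xml.dom.minidom.parseString(filtered)  # nosec
print(dm.getElementsByTagName("cuda_version")[0].firstChild.data)  # prints 11.4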

tests/test_cuda.py

+28 -70

@@ -90,8 +90,11 @@ def _makebuilder(cudaReq: CWLObjectType) -> Builder:
 
 
 @mock.patch("subprocess.check_output")
+@mock.patch("cwltool.cuda.cuda_device_count")
 @mock.patch("os.makedirs")
-def test_cuda_job_setup_check(makedirs: MagicMock, check_output: MagicMock) -> None:
+def test_cuda_job_setup_check(
+    makedirs: MagicMock, count: MagicMock, check_output: MagicMock
+) -> None:
     runtime_context = RuntimeContext({})
 
     cudaReq: CWLObjectType = {
@@ -101,74 +104,43 @@ def test_cuda_job_setup_check(makedirs: MagicMock, check_output: MagicMock) -> N
     }
     builder = _makebuilder(cudaReq)
 
+    count.return_value = "1"
     check_output.return_value = """
-<nvidia>
-<attached_gpus>1</attached_gpus>
 <cuda_version>1.0</cuda_version>
-</nvidia>
 """
 
     jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
     jb._setup(runtime_context)
 
 
 @mock.patch("subprocess.check_output")
+@mock.patch("cwltool.cuda.cuda_device_count")
 @mock.patch("os.makedirs")
-def test_cuda_job_setup_check_err(makedirs: MagicMock, check_output: MagicMock) -> None:
-    runtime_context = RuntimeContext({})
-
-    cudaReq: CWLObjectType = {
-        "class": "http://commonwl.org/cwltool#CUDARequirement",
-        "cudaVersionMin": "2.0",
-        "cudaComputeCapability": "1.0",
-    }
-    builder = _makebuilder(cudaReq)
-
-    check_output.return_value = """
-<nvidia>
-<attached_gpus>1</attached_gpus>
-<cuda_version>1.0</cuda_version>
-</nvidia>
-"""
-    jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
-    with pytest.raises(WorkflowException):
-        jb._setup(runtime_context)
-
-
-@mock.patch("subprocess.check_output")
-@mock.patch("os.makedirs")
-def test_cuda_job_setup_check_err_empty_attached_gpus(
-    makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
+def test_cuda_job_setup_check_err(
+    makedirs: MagicMock, count: MagicMock, check_output: MagicMock
 ) -> None:
     runtime_context = RuntimeContext({})
 
     cudaReq: CWLObjectType = {
         "class": "http://commonwl.org/cwltool#CUDARequirement",
-        "cudaVersionMin": "1.0",
+        "cudaVersionMin": "2.0",
         "cudaComputeCapability": "1.0",
     }
     builder = _makebuilder(cudaReq)
 
+    count.return_value = "1"
     check_output.return_value = """
-<nvidia>
-<attached_gpus></attached_gpus>
 <cuda_version>1.0</cuda_version>
-</nvidia>
 """
-
     jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
     with pytest.raises(WorkflowException):
         jb._setup(runtime_context)
-    assert (
-        "Error checking CUDA version with nvidia-smi. Missing 'attached_gpus' or it is empty."
-        in caplog.text
-    )
 
 
-@mock.patch("subprocess.check_output")
+@mock.patch("cwltool.cuda.cuda_device_count")
 @mock.patch("os.makedirs")
-def test_cuda_job_setup_check_err_empty_missing_attached_gpus(
-    makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
+def test_cuda_job_setup_check_err_missing_query_gpu_count(
+    makedirs: MagicMock, count: MagicMock, caplog: pytest.LogCaptureFixture
 ) -> None:
     runtime_context = RuntimeContext({})
 
@@ -179,25 +151,19 @@ def test_cuda_job_setup_check_err_empty_missing_attached_gpus(
     }
     builder = _makebuilder(cudaReq)
 
-    check_output.return_value = """
-<nvidia>
-<cuda_version>1.0</cuda_version>
-</nvidia>
-"""
+    count.return_value = ""
 
     jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
    with pytest.raises(WorkflowException):
         jb._setup(runtime_context)
-    assert (
-        "Error checking CUDA version with nvidia-smi. Missing 'attached_gpus' or it is empty."
-        in caplog.text
-    )
+    assert "invalid literal for int() with base 10" in caplog.text
 
 
 @mock.patch("subprocess.check_output")
+@mock.patch("cwltool.cuda.cuda_device_count")
 @mock.patch("os.makedirs")
 def test_cuda_job_setup_check_err_empty_cuda_version(
-    makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
+    makedirs: MagicMock, count: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
 ) -> None:
     runtime_context = RuntimeContext({})
 
@@ -208,11 +174,9 @@ def test_cuda_job_setup_check_err_empty_cuda_version(
     }
     builder = _makebuilder(cudaReq)
 
+    count.return_value = "1"
     check_output.return_value = """
-<nvidia>
-<attached_gpus>1</attached_gpus>
 <cuda_version></cuda_version>
-</nvidia>
 """
 
     jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
@@ -225,9 +189,10 @@ def test_cuda_job_setup_check_err_empty_cuda_version(
 
 
 @mock.patch("subprocess.check_output")
+@mock.patch("cwltool.cuda.cuda_device_count")
 @mock.patch("os.makedirs")
 def test_cuda_job_setup_check_err_missing_cuda_version(
-    makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
+    makedirs: MagicMock, count: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
 ) -> None:
     runtime_context = RuntimeContext({})
 
@@ -238,25 +203,20 @@ def test_cuda_job_setup_check_err_missing_cuda_version(
     }
     builder = _makebuilder(cudaReq)
 
-    check_output.return_value = """
-<nvidia>
-<attached_gpus>1</attached_gpus>
-</nvidia>
-"""
+    count.return_value = "1"
+    check_output.return_value = ""
 
     jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
     with pytest.raises(WorkflowException):
         jb._setup(runtime_context)
-    assert (
-        "Error checking CUDA version with nvidia-smi. Missing 'cuda_version' or it is empty."
-        in caplog.text
-    )
+    assert "Error parsing XML stdout of nvidia-smi" in caplog.text
 
 
 @mock.patch("subprocess.check_output")
+@mock.patch("cwltool.cuda.cuda_device_count")
 @mock.patch("os.makedirs")
 def test_cuda_job_setup_check_err_wrong_type_cuda_version(
-    makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
+    makedirs: MagicMock, count: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
 ) -> None:
     runtime_context = RuntimeContext({})
 
@@ -267,19 +227,17 @@ def test_cuda_job_setup_check_err_wrong_type_cuda_version(
     }
     builder = _makebuilder(cudaReq)
 
+    count.return_value = "1"
     check_output.return_value = """
-<nvidia>
-<attached_gpus>1</attached_gpus>
 <cuda_version><subelement /></cuda_version>
-</nvidia>
 """
 
     jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
     with pytest.raises(WorkflowException):
         jb._setup(runtime_context)
     assert (
-        "Error checking CUDA version with nvidia-smi. "
-        "Either 'attached_gpus' or 'cuda_version' was not a text node" in caplog.text
+        "Error checking CUDA version with nvidia-smi. 'cuda_version' was not a text node"
+        in caplog.text
     )
 
 
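
For reference, the updated tests lean on two conventions of `unittest.mock`: stacked `@mock.patch` decorators are applied bottom-up, so the bottom-most patch supplies the first mock argument, and patching `cwltool.cuda.cuda_device_count` replaces the name that `cuda_version_and_device_count` looks up at call time. A stripped-down sketch of the same pattern outside pytest (illustrative only, assuming a cwltool build that includes this change):

# Illustrative only: mirrors the mocking pattern used in the tests above.
from unittest import mock
from unittest.mock import MagicMock

import cwltool.cuda


@mock.patch("subprocess.check_output")         # top decorator -> second argument
@mock.patch("cwltool.cuda.cuda_device_count")  # bottom decorator -> first argument
def demo(count: MagicMock, check_output: MagicMock) -> None:
    count.return_value = "1"
    check_output.return_value = "\n<cuda_version>1.0</cuda_version>\n"
    # With both collaborators mocked, no GPU or nvidia-smi binary is needed.
    assert cwltool.cuda.cuda_version_and_device_count() == ("1.0", 1)


demo()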

tests/wf/nvidia-smi-container.cwl

+1 -1

@@ -7,7 +7,7 @@ requirements:
     cudaVersionMin: "1.0"
     cudaComputeCapability: "1.0"
   DockerRequirement:
-    dockerPull: "nvidia/cuda:11.4.2-runtime-ubuntu20.04"
+    dockerPull: "nvidia/cuda:11.4.3-runtime-ubuntu20.04"
 inputs: []
 outputs: []
 # Assume this will exit non-zero (resulting in a failing test case) if
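
This workflow is the integration check for the code path above. As a hedged usage sketch (not part of the commit), it can also be invoked through cwltool's Python entry point, assuming cwltool is installed, the cwltool extensions are enabled with `--enable-ext`, and a GPU-capable Docker setup is available; without one, the run is expected to exit non-zero, as the comment notes.

# Illustrative only: run the test workflow via cwltool's Python entry point.
# Assumes cwltool is installed and Docker with NVIDIA GPU support is present;
# otherwise the workflow is expected to fail, which is what the test checks for.
from cwltool.main import main

exit_code = main(["--enable-ext", "tests/wf/nvidia-smi-container.cwl"])
print("workflow exit code:", exit_code)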
