-
Notifications
You must be signed in to change notification settings - Fork 46
/
Copy pathtest_AllGather.py
112 lines (103 loc) · 4.71 KB
/
test_AllGather.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#################################################################################
# Copyright (C) 2019 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell cop-
# ies of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IM-
# PLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNE-
# CTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
################################################################################
import os
import subprocess
import itertools
import math
import pytest
# ---------------------------------------------------------------------------
# Test-matrix configuration shared by the AllGather tests in this module.
# ---------------------------------------------------------------------------

# Number of GPUs to size the single-process tests with.  The device-visibility
# environment variables take precedence (one comma-separated entry per device);
# otherwise the GPU entries reported by rocminfo are counted.
ngpus = 0
if os.environ.get('ROCR_VISIBLE_DEVICES') is not None:
    ngpus = len(os.environ['ROCR_VISIBLE_DEVICES'].split(","))
elif os.environ.get('HIP_VISIBLE_DEVICES') is not None:
    ngpus = len(os.environ['HIP_VISIBLE_DEVICES'].split(","))
else:
    try:
        # Raw string: "\s" must reach grep unchanged, and in a normal string
        # literal it is an invalid escape sequence that Python warns about.
        ngpus = int(subprocess.check_output(
            r'rocminfo | grep "Device Type:.\s*.GPU" | wc -l', shell=True))
    except (subprocess.CalledProcessError, OSError, ValueError):
        # The probe could not be run or did not produce a number; treat this
        # as "no GPU detected" instead of aborting test collection.
        ngpus = 0

# Largest exponent x such that 2**x <= ngpus.  math.log2(0) raises ValueError,
# so with no GPU detected use -1, which makes range(log_ngpus + 1) empty and
# leaves the single-process tests with an empty parameter set.
log_ngpus = int(math.log2(ngpus)) if ngpus > 0 else -1

# Parameter lists; the tests take the Cartesian product of these.
nthreads = ["1"]
nprocs = ["2"]
# Power-of-two GPU counts up to the detected number of GPUs: "1", "2", "4", ...
ngpus_single = [str(2**x) for x in range(log_ngpus+1)]
ngpus_mpi = ["1","2"]
# (start, end) pairs passed to the executable as the -b and -e arguments.
byte_range = [("4", "128M")]
op = ["sum", "prod", "min", "max"]
step_factor = ["2"]
datatype = ["int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"]
memory_type = ["coarse","fine", "host"]

# The test binary is looked up relative to the directory containing this file.
path = os.path.dirname(os.path.abspath(__file__))
executable = path + "/../build/all_gather_perf"
@pytest.mark.parametrize("nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type",
    itertools.product(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type))
def test_AllGatherSingleProcess(nthreads, ngpus_single, byte_range, op, step_factor, datatype, memory_type):
    """Run the all_gather_perf executable in a single process and require exit code 0.

    Each parameter is one string (or, for ``byte_range``, one ``(start, end)``
    tuple) drawn from the module-level list of the same name and forwarded to
    the executable as a command-line argument:

        nthreads     -> -t
        ngpus_single -> -g
        byte_range   -> -b (start) and -e (end)
        op           -> -o
        step_factor  -> -f
        datatype     -> -d
        memory_type  -> -y

    When ``memory_type`` is ``"fine"`` the child process is additionally run
    with ``HSA_FORCE_FINE_GRAIN_PCIE=1`` in its environment.

    On a non-zero exit code the captured standard output of the executable is
    printed and the test fails.
    """
    args = [executable,
            "-t", nthreads,
            "-g", ngpus_single,
            "-b", byte_range[0],
            "-e", byte_range[1],
            "-o", op,
            "-f", step_factor,
            "-d", datatype,
            "-y", memory_type]

    # Pass the variable through the child's environment rather than prefixing
    # it to a shell command line, so no shell is needed to launch the test.
    env = os.environ.copy()
    if memory_type == "fine":
        env["HSA_FORCE_FINE_GRAIN_PCIE"] = "1"

    try:
        rccl_test = subprocess.run(args, stdout=subprocess.PIPE,
                                   universal_newlines=True, env=env)
    except OSError as err:
        # Raised when the executable is missing or cannot be executed.
        pytest.fail("AllGather test could not be launched: {}".format(err))

    # subprocess.run() without check=True never raises CalledProcessError, so
    # the exit status has to be inspected explicitly.
    if rccl_test.returncode != 0:
        print(rccl_test.stdout)
        pytest.fail("AllGather test error(s) detected.")
@pytest.mark.parametrize("nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype",
    itertools.product(nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype))
def test_AllGatherMPI(request, nthreads, nprocs, ngpus_mpi, byte_range, op, step_factor, datatype):
    """Run the all_gather_perf executable under mpirun and require exit code 0.

    ``request`` is the pytest request fixture; it is used only to read the
    ``--hostfile`` command-line option.  When that option has a value it is
    passed to mpirun as ``-host <value>``.

    The remaining parameters are strings (or, for ``byte_range``, a
    ``(start, end)`` tuple) drawn from the module-level lists of the same name:

        nprocs      -> mpirun -np
        nthreads    -> -t
        ngpus_mpi   -> -g
        byte_range  -> -b (start) and -e (end)
        op          -> -o
        step_factor -> -f
        datatype    -> -d

    The executable is always invoked with ``-p 1``.  The command line is
    printed before it is run, and the child's output is not captured by this
    function.
    """
    mpi_hostfile = request.config.getoption('--hostfile')

    launcher = ["mpirun", "-np", nprocs]
    if mpi_hostfile:
        launcher += ["-host", mpi_hostfile]

    # NOTE(review): this test is not parametrized over memory_type, so no "-y"
    # argument is passed in either branch.  The previous hostfile branch
    # appended the module-level memory_type *list*, which made " ".join()
    # raise TypeError before mpirun was ever started.
    args = launcher + [executable,
                       "-p", "1",
                       "-t", nthreads,
                       "-g", ngpus_mpi,
                       "-b", byte_range[0],
                       "-e", byte_range[1],
                       "-o", op,
                       "-f", step_factor,
                       "-d", datatype]

    print(" ".join(args))

    try:
        rccl_test = subprocess.run(args, universal_newlines=True)
    except OSError as err:
        # Raised when mpirun is missing or cannot be executed.
        pytest.fail("AllGather MPI test could not be launched: {}".format(err))

    # subprocess.run() without check=True never raises CalledProcessError, so
    # the exit status has to be inspected explicitly.
    if rccl_test.returncode != 0:
        pytest.fail("AllGather test error(s) detected.")