@@ -21,6 +21,20 @@ def _run(self, X):
         return (1 - X,)


+class Transpose2DCastFP16(OpRun):
+    op_domain = "ai.onnx.contrib"
+
+    def _run(self, X):
+        return (X.T.astype(np.float16),)
+
+
+class Transpose2DCastFP32(OpRun):
+    op_domain = "ai.onnx.contrib"
+
+    def _run(self, X):
+        return (X.T.astype(np.float32),)
+
+
 class TestCudaOps(unittest.TestCase):
     @staticmethod
     def _create_negpos_test_model(domain="ai.onnx.contrib"):
@@ -151,8 +165,6 @@ def test_cuda_negxplus1(self):
         self._negxplus1_cuda(TensorProto.FLOAT16)

     def _addmul_shared_input_cuda(self, itype, op_type, shapea=(3, 2, 3), shapeb=(3, 2, 3), shapec=(3, 2, 3)):
-        from onnx_extended.ortops.optim.cuda import get_ort_ext_libs
-
         model1 = helper.make_model(
             helper.make_graph(
                 [
@@ -181,7 +193,7 @@ def _addmul_shared_input_cuda(self, itype, op_type, shapea=(3, 2, 3), shapeb=(3,
                         f"{op_type}SharedInput",
                         ["X", "Y", "Z"],
                         ["XY", "XZ"],
-                        domain="onnx_extended.ortops.optim.cuda",
+                        domain="ai.onnx.contrib",
                     )
                 ],
                 "nd",
@@ -197,7 +209,7 @@ def _addmul_shared_input_cuda(self, itype, op_type, shapea=(3, 2, 3), shapeb=(3,
             ),
             opset_imports=[
                 helper.make_opsetid("", 18),
-                helper.make_opsetid("onnx_extended.ortops.optim.cuda", 1),
+                helper.make_opsetid("ai.onnx.contrib", 1),
             ],
             ir_version=9,
         )
@@ -212,7 +224,7 @@ def _addmul_shared_input_cuda(self, itype, op_type, shapea=(3, 2, 3), shapeb=(3,
         expected = ref.run(None, feeds1)

         opts = _ort.SessionOptions()
-        opts.register_custom_ops_library(get_ort_ext_libs()[0])
+        opts.register_custom_ops_library(_get_library_path())
         sess = _ort.InferenceSession(model2.SerializeToString(), opts, providers=["CUDAExecutionProvider"])
         got = sess.run(None, feeds1)
         for i in range(2):
@@ -262,6 +274,62 @@ def test_add_shared_input_cuda_broadcast2(self):
             shapec=(3, 2, 3),
         )

+    def _transpose_cast_cuda(self, itype):
+        dtype = np.float32 if itype == TensorProto.FLOAT else np.float16
+        itype2 = TensorProto.FLOAT if itype == TensorProto.FLOAT16 else TensorProto.FLOAT16
+        model1 = helper.make_model(
+            helper.make_graph(
+                [
+                    helper.make_node("Transpose", ["X"], ["t"], perm=[1, 0]),
+                    helper.make_node("Cast", ["t"], ["Y"], to=itype2),
+                ],
+                "nd",
+                [helper.make_tensor_value_info("X", itype, [None, None])],
+                [helper.make_tensor_value_info("Y", itype2, [None, None])],
+            ),
+            opset_imports=[helper.make_opsetid("", 18)],
+            ir_version=9,
+        )
+
+        model2 = helper.make_model(
+            helper.make_graph(
+                [
+                    helper.make_node(
+                        ("Transpose2DCastFP16" if itype2 == TensorProto.FLOAT16 else "Transpose2DCastFP32"),
+                        ["X"],
+                        ["Y"],
+                        domain="ai.onnx.contrib",
+                    )
+                ],
+                "nd",
+                [helper.make_tensor_value_info("X", itype, [None, None])],
+                [helper.make_tensor_value_info("Y", itype2, [None, None])],
+            ),
+            opset_imports=[
+                helper.make_opsetid("", 18),
+                helper.make_opsetid("ai.onnx.contrib", 1),
+            ],
+            ir_version=9,
+        )
+
+        dtype = np.float32 if itype == TensorProto.FLOAT else np.float16
+        x = (np.arange(32 * 32 * 3) + 1).reshape((32, 32 * 3)).astype(dtype)
+
+        feeds1 = dict(X=x)
+        ref = ReferenceEvaluator(model1, new_ops=[Transpose2DCastFP16, Transpose2DCastFP32])
+        expected = ref.run(None, feeds1)[0]
+
+        opts = _ort.SessionOptions()
+        opts.register_custom_ops_library(_get_library_path())
+        sess = _ort.InferenceSession(model2.SerializeToString(), opts, providers=["CUDAExecutionProvider"])
+        got = sess.run(None, feeds1)[0]
+        assert_almost_equal(expected, got, decimal=5)
+
+    @unittest.skipIf(not has_cuda(), reason="cuda not available")
+    def test_transpose_cast_cuda(self):
+        self._transpose_cast_cuda(TensorProto.FLOAT)
+        self._transpose_cast_cuda(TensorProto.FLOAT16)
+

 if __name__ == "__main__":
-    unittest.main()
+    unittest.main(verbosity=2)