# Need https://github.com/JuliaLang/julia/pull/33970
# and https://github.com/JuliaLang/julia/pull/34043
-if VERSION >= v"1.4.0-DEV.666" && capability(device()) >= v"7.0"
+if VERSION >= v"1.5.0-DEV.437" && capability(device()) >= v"7.0"

using CUDAnative.WMMA

-is_debug = ccall(:jl_is_debugbuild, Cint, ()) != 0
-(is_debug && VERSION < v"1.5.0-DEV.437") ? @warn("Skipping WMMA tests due to incompatible Julia") : @testset "WMMA" begin
+@testset "WMMA" begin

################################################################################

@@ -231,20 +230,18 @@ is_debug = ccall(:jl_is_debugbuild, Cint, ()) != 0
        return
    end

-    @test_broken_if VERSION >= v"1.5.0-DEV.393" begin
-        @cuda threads=32 kernel(a_dev, b_dev, c_dev, d_dev, alpha, beta)
-        d = Array(d_dev)
+    @cuda threads=32 kernel(a_dev, b_dev, c_dev, d_dev, alpha, beta)
+    d = Array(d_dev)

-        new_a = (a_layout == ColMajor) ? a : transpose(a)
-        new_b = (b_layout == ColMajor) ? b : transpose(b)
-        new_c = (c_layout == ColMajor) ? c : transpose(c)
-        new_d = (d_layout == ColMajor) ? d : transpose(d)
+    new_a = (a_layout == ColMajor) ? a : transpose(a)
+    new_b = (b_layout == ColMajor) ? b : transpose(b)
+    new_c = (c_layout == ColMajor) ? c : transpose(c)
+    new_d = (d_layout == ColMajor) ? d : transpose(d)

-        if do_mac
-            all(isapprox.(alpha * new_a * new_b + beta * new_c, new_d; rtol=sqrt(eps(Float16))))
-        else
-            all(isapprox.(alpha * new_a * new_b, new_d; rtol=sqrt(eps(Float16))))
-        end
+    if do_mac
+        @test_broken all(isapprox.(alpha * new_a * new_b + beta * new_c, new_d; rtol=sqrt(eps(Float16))))
+    else
+        @test_broken all(isapprox.(alpha * new_a * new_b, new_d; rtol=sqrt(eps(Float16))))
    end
end

@@ -254,40 +251,38 @@ is_debug = ccall(:jl_is_debugbuild, Cint, ()) != 0

# Need https://github.com/JuliaLang/julia/pull/34760
# See https://github.com/JuliaGPU/CUDAnative.jl/issues/548
-if VERSION >= v"1.5.0-DEV.324"
-    @testset "Codegen addressing" begin
-        @testset "Global" begin
-            function kernel(d)
-                conf = WMMA.Config{16, 16, 16, Float32}
-
-                d_frag = WMMA.fill_c(Float32(0), conf)
-                WMMA.store_d(pointer(d), d_frag, 16, WMMA.ColMajor, conf)
-
-                return
-            end
+@testset "Codegen addressing" begin
+    @testset "Global" begin
+        function kernel(d)
+            conf = WMMA.Config{16, 16, 16, Float32}

-            ptx = sprint(io -> CUDAnative.code_ptx(io, kernel, (CuDeviceArray{Float32,1,CUDAnative.AS.Global},)))
+            d_frag = WMMA.fill_c(Float32(0), conf)
+            WMMA.store_d(pointer(d), d_frag, 16, WMMA.ColMajor, conf)

-            @test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx)
-            @test occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.global.f32", ptx)
+            return
        end

-        @testset "Shared" begin
-            function kernel()
-                shmem = @cuStaticSharedMem(Float32, (16, 16))
-                conf = WMMA.Config{16, 16, 16, Float32}
+        ptx = sprint(io -> CUDAnative.code_ptx(io, kernel, (CuDeviceArray{Float32,1,CUDAnative.AS.Global},)))

-                d_frag = WMMA.fill_c(Float32(0), conf)
-                WMMA.store_d(pointer(shmem), d_frag, 16, WMMA.ColMajor, conf)
+        @test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx)
+        @test occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.global.f32", ptx)
+    end

-                return
-            end
+    @testset "Shared" begin
+        function kernel()
+            shmem = @cuStaticSharedMem(Float32, (16, 16))
+            conf = WMMA.Config{16, 16, 16, Float32}

-            ptx = sprint(io -> CUDAnative.code_ptx(io, kernel, ()))
+            d_frag = WMMA.fill_c(Float32(0), conf)
+            WMMA.store_d(pointer(shmem), d_frag, 16, WMMA.ColMajor, conf)

-            @test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx)
-            @test occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.shared.f32", ptx)
+            return
        end
+
+        ptx = sprint(io -> CUDAnative.code_ptx(io, kernel, ()))
+
+        @test !occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.f32", ptx)
+        @test occursin(r"wmma.store.d.sync(.aligned)?.col.m16n16k16.shared.f32", ptx)
    end
end
