Reintroduce workspace_size

thomasfaingnaert · thomasfaingnaert · commit be21dfaa3aeb · 2020-05-25T09:25:28.000-04:00
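Pass the workspace size explicitly to Layout.load and Layout.store!
instead of querying Base.size(workspace) inside the layout. This ensures
that the size of the array in global memory is known statically.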
diff --git a/src/device/matmul_kernels/epilogue.jl b/src/device/matmul_kernels/epilogue.jl
@@ -25,9 +25,9 @@ struct Default end
     # Cooperatively store a BLOCK_SHAPE.M x BLOCK_SHAPE.N tile of D from shared to global memory within one threadblock
     @unroll for warp_tile = parallellise(block_tile.MN, Tile(MEM_CD_WARP), warpId, WARPS_PER_BLOCK)
         @unroll for thread_tile = parallellise(warp_tile, Tile(MEM_CD_THREAD), laneId, 32)
-            x = Layout.load(SHARED_D_LAYOUT, shmem_d, thread_tile)
+            x = Layout.load(SHARED_D_LAYOUT, shmem_d, thread_tile, block_tile.MN.size)
             x = transform(x, thread_tile)
-            Layout.store!(GLOBAL_D_LAYOUT, d, x, translate(thread_tile, (M = block_i, N = block_j)))
+            Layout.store!(GLOBAL_D_LAYOUT, d, x, translate(thread_tile, (M = block_i, N = block_j)), gemm_sz.MN.size)
         end
     end
 end
diff --git a/src/device/matmul_kernels/kernel.jl b/src/device/matmul_kernels/kernel.jl
@@ -30,9 +30,9 @@ function matmul_impl(a, b, c, d,
 
     @unroll for warp_tile = parallellise(block_tile.MN, Tile(MEM_CD_WARP), warpId, WARPS_PER_BLOCK)
         @unroll for thread_tile = parallellise(warp_tile, Tile(MEM_CD_THREAD), laneId, 32)
-            x = Layout.load(GLOBAL_C_LAYOUT, c, translate(thread_tile, (M = block_i, N = block_j)))
+            x = Layout.load(GLOBAL_C_LAYOUT, c, translate(thread_tile, (M = block_i, N = block_j)), gemm_sz.MN.size)
             x = transf_gl2sh_c(x, thread_tile)
-            Layout.store!(SHARED_C_LAYOUT, shmem_c, x, thread_tile)
+            Layout.store!(SHARED_C_LAYOUT, shmem_c, x, thread_tile, block_tile.MN.size)
         end
     end
 
@@ -61,18 +61,18 @@ function matmul_impl(a, b, c, d,
         # (3.1) Cooperatively load a BLOCK_SHAPE.M x BLOCK_SHAPE.K tile of A from global to shared memory within one threadblock
         @unroll for warp_tile = parallellise(block_tile.MK, Tile(MEM_A_WARP), warpId, WARPS_PER_BLOCK)
             @unroll for thread_tile = parallellise(warp_tile, Tile(MEM_A_THREAD), laneId, 32)
-                x = Layout.load(GLOBAL_A_LAYOUT, a, translate(thread_tile, (M = block_i, K = block_k)))
+                x = Layout.load(GLOBAL_A_LAYOUT, a, translate(thread_tile, (M = block_i, K = block_k)), gemm_sz.MK.size)
                 x = transf_gl2sh_a(x, thread_tile)
-                Layout.store!(SHARED_A_LAYOUT, shmem_a, x, thread_tile)
+                Layout.store!(SHARED_A_LAYOUT, shmem_a, x, thread_tile, block_tile.MK.size)
             end
         end
 
         # (3.2) Cooperatively load a BLOCK_SHAPE.K x BLOCK_SHAPE.N tile of B from global to shared memory within one threadblock
         @unroll for warp_tile = parallellise(block_tile.KN, Tile(MEM_B_WARP), warpId, WARPS_PER_BLOCK)
             @unroll for thread_tile = parallellise(warp_tile, Tile(MEM_B_THREAD), laneId, 32)
-                x = Layout.load(GLOBAL_B_LAYOUT, b, translate(thread_tile, (K = block_k, N = block_j)))
+                x = Layout.load(GLOBAL_B_LAYOUT, b, translate(thread_tile, (K = block_k, N = block_j)), gemm_sz.KN.size)
                 x = transf_gl2sh_b(x, thread_tile)
-                Layout.store!(SHARED_B_LAYOUT, shmem_b, x, thread_tile)
+                Layout.store!(SHARED_B_LAYOUT, shmem_b, x, thread_tile, block_tile.KN.size)
             end
         end
 
diff --git a/src/device/matmul_kernels/layout.jl b/src/device/matmul_kernels/layout.jl
@@ -28,8 +28,8 @@ end
 
 @inline eltype(::Type{Padded{L, P}}) where {L, P} = eltype(L)
 @inline size(::Type{Padded{L, P}}, logical_size::NamedTuple) where {L, P} = size(L, pad_logical_coord(Padded{L, P}, logical_size))
-@inline load(::Type{Padded{L, P}}, workspace, tile::Tile, logical_size::NamedTuple) where {L, P} = load(L, workspace, tile)
-@inline store!(::Type{Padded{L, P}}, workspace, value, tile::Tile) where {L, P} = store!(L, workspace, value, tile::Tile)
+@inline load(::Type{Padded{L, P}}, workspace, tile::Tile, workspace_size::NamedTuple) where {L, P} = load(L, workspace, tile, pad_logical_coord(Padded{L, P}, workspace_size))
+@inline store!(::Type{Padded{L, P}}, workspace, value, tile::Tile, workspace_size::NamedTuple) where {L, P} = store!(L, workspace, value, tile::Tile, pad_logical_coord(Padded{L, P}, workspace_size))
 
 # ---------------
 # AlignedColMajor
@@ -38,7 +38,7 @@ end
 struct AlignedColMajor{T} <: LayoutBase{T} end
 
 # TODO: cleanup vectorisation
-@inline function load(::Type{AlignedColMajor{T}}, workspace, tile::Tile{size}) where {T, size}
+@inline function load(::Type{AlignedColMajor{T}}, workspace, tile::Tile{size}, workspace_size::NamedTuple) where {T, size}
     vec_len = 16 ÷ sizeof(T)
     N = (sizeof(T) * vec_len) ÷ sizeof(Float32)
     res = MArray{Tuple{size[1] ÷ vec_len, size[2]}, NTuple{N, VecElement{Float32}}}(undef)
@@ -47,8 +47,8 @@ struct AlignedColMajor{T} <: LayoutBase{T} end
         @unroll for i = 1 : vec_len : size[1]
             t = translate(tile, (i - 1, j - 1))
 
-            linear_base = linearise(t.base, Base.size(workspace))
-            linear_offset = linearise(t.offset, Base.size(workspace))
+            linear_base = linearise(t.base, workspace_size)
+            linear_offset = linearise(t.offset, workspace_size)
 
             @inbounds res[i, j] = vloada(Vec{vec_len, T}, pointer(workspace, linear_base), linear_offset)
         end
@@ -57,15 +57,15 @@ struct AlignedColMajor{T} <: LayoutBase{T} end
     return res
 end
 
-@inline function store!(::Type{AlignedColMajor{T}}, workspace, value, tile::Tile{size}) where {T, size}
+@inline function store!(::Type{AlignedColMajor{T}}, workspace, value, tile::Tile{size}, workspace_size::NamedTuple) where {T, size}
     vec_len = 16 ÷ sizeof(T)
 
     @unroll for j = 1 : size[2]
         @unroll for i = 1 : vec_len : size[1]
             t = translate(tile, (i - 1, j - 1))
 
-            linear_base = linearise(t.base, Base.size(workspace))
-            linear_offset = linearise(t.offset, Base.size(workspace))
+            linear_base = linearise(t.base, workspace_size)
+            linear_offset = linearise(t.offset, workspace_size)
 
             vstorea!(Vec{vec_len, T}, pointer(workspace, linear_base), value[i, j], linear_offset)
         end