|
86 | 86 | # @inline function solve_Wx3W!(ap::AbstractStridedPointer{T}, bp::AbstractStridedPointer{T}, U, rowoffset, coloffset, m::VectorizationBase.AbstractMask) where {T}
|
87 | 87 | # WS = VectorizationBase.pick_vector_width(T)
|
88 | 88 | # W = Int(WS)
|
89 |
| -# A11 = vload(bp, Unroll{2,1,W,1,W,0xffffffffffffffff,1}((rowoffset,coloffset)), m) |
90 |
| -# A12 = vload(bp, Unroll{2,1,W,1,W,0xffffffffffffffff,1}((rowoffset,coloffset+WS)), m) |
91 |
| -# A13 = vload(bp, Unroll{2,1,W,1,W,0xffffffffffffffff,1}((rowoffset,coloffset+WS+WS)), m) |
| 89 | +# A11 = vload(bp, Unroll{2,1,W,1,W,(-1%UInt),1}((rowoffset,coloffset)), m) |
| 90 | +# A12 = vload(bp, Unroll{2,1,W,1,W,(-1%UInt),1}((rowoffset,coloffset+WS)), m) |
| 91 | +# A13 = vload(bp, Unroll{2,1,W,1,W,(-1%UInt),1}((rowoffset,coloffset+WS+WS)), m) |
92 | 92 |
|
93 | 93 | # A11, A12, A13 = solve_Wx3W(A11, A12, A13, U, WS)
|
94 | 94 |
|
95 |
| -# vstore!(ap, A11, Unroll{2,1,W,1,W,0xffffffffffffffff,1}((rowoffset,coloffset)), m) |
96 |
| -# vstore!(ap, A12, Unroll{2,1,W,1,W,0xffffffffffffffff,1}((rowoffset,coloffset+WS)), m) |
97 |
| -# vstore!(ap, A13, Unroll{2,1,W,1,W,0xffffffffffffffff,1}((rowoffset,coloffset+WS+WS)), m) |
| 95 | +# vstore!(ap, A11, Unroll{2,1,W,1,W,(-1%UInt),1}((rowoffset,coloffset)), m) |
| 96 | +# vstore!(ap, A12, Unroll{2,1,W,1,W,(-1%UInt),1}((rowoffset,coloffset+WS)), m) |
| 97 | +# vstore!(ap, A13, Unroll{2,1,W,1,W,(-1%UInt),1}((rowoffset,coloffset+WS+WS)), m) |
98 | 98 | # end
|
99 | 99 |
|
100 | 100 | # solve_3Wx3W!(A,B,U::UpperTriangular) = solve_3Wx3W!(A,B,parent(U))
|
@@ -226,15 +226,15 @@ end
|
226 | 226 | quote
|
227 | 227 | $(Expr(:meta,:inline))
|
228 | 228 | # here, we just want to load the vectors
|
229 |
| - C11 = VectorizationBase.data(vload(spa, Unroll{2,1,$W,1,$W,0xffffffffffffffff,1}((StaticInt(0),n)), mask)) |
| 229 | + C11 = VectorizationBase.data(vload(spa, Unroll{2,1,$W,1,$W,(-1%UInt),1}((StaticInt(0),n)), mask)) |
230 | 230 | Base.Cartesian.@nexprs $W c -> C11_c = C11[c]
|
231 | 231 | for nk ∈ SafeCloseOpen(n) # nmuladd
|
232 | 232 | A11 = vload(spc, (MM{$W}(StaticInt(0)),nk), mask)
|
233 | 233 | Base.Cartesian.@nexprs $W c -> C11_c = vfnmadd_fast(A11, vload(spu, (nk,n+(c-1))), C11_c)
|
234 | 234 | end
|
235 | 235 | C11 = VecUnroll((Base.Cartesian.@ntuple $W C11))
|
236 | 236 | C11 = solve_AU(C11, spu, n, Val{$UNIT}())
|
237 |
| - i = Unroll{2,1,$W,1,$W,0xffffffffffffffff,1}((StaticInt(0),n)) |
| 237 | + i = Unroll{2,1,$W,1,$W,(-1%UInt),1}((StaticInt(0),n)) |
238 | 238 | $storecexpr
|
239 | 239 | maybestore!(spb, C11, i, mask)
|
240 | 240 | end
|
|
0 commit comments