SciML · ChrisRackauckas · Nov 12, 2023 · Nov 12, 2023 · May 7, 2024 · Jul 30, 2025
diff --git a/Project.toml b/Project.toml
@@ -26,6 +26,8 @@ SciMLOperators = "c0aeaf25-5076-4817-a8d5-81caf7dfa961"
 Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
 StaticArraysCore = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
 UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
+blis_jll = "6136c539-28a5-5bf0-87cc-b183200dce32"
+libflame_jll = "8e9d65e3-b2b8-5a9c-baa2-617b4576f0b9"
 
 [weakdeps]
 BandedMatrices = "aae01518-5342-5314-be14-df237901396f"
@@ -47,6 +49,7 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 Sparspak = "e56a9233-b9d6-4f03-8d0f-1825330902ac"
 
 [extensions]
+LinearSolveBLISExt = ["blis_jll", "libflame_jll"]
 LinearSolveBandedMatricesExt = "BandedMatrices"
 LinearSolveBlockDiagonalsExt = "BlockDiagonals"
 LinearSolveCUDAExt = "CUDA"
@@ -77,8 +80,8 @@ ChainRulesCore = "1.22"
 ConcreteStructs = "0.2.3"
 DocStringExtensions = "0.9.3"
 EnumX = "1.0.4"
-ExplicitImports = "1"
 EnzymeCore = "0.8.1"
+ExplicitImports = "1"
 FastAlmostBandedMatrices = "0.1"
 FastLapackInterface = "2"
 FiniteDiff = "2.22"
@@ -118,14 +121,16 @@ StaticArraysCore = "1.4.2"
 Test = "1"
 UnPack = "1"
 Zygote = "0.7"
+blis_jll = "0.9.0"
 julia = "1.10"
+libflame_jll = "5.2.0"
 
 [extras]
 AllocCheck = "9b6a8646-10ed-4001-bbdc-1d2f46dfbb1a"
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
-ExplicitImports = "7d51a73a-1435-4ff3-83d9-f097790105c7"
 BandedMatrices = "aae01518-5342-5314-be14-df237901396f"
 BlockDiagonals = "0a1fb500-61f7-11e9-3c65-f5ef3456f9f0"
+ExplicitImports = "7d51a73a-1435-4ff3-83d9-f097790105c7"
 FastAlmostBandedMatrices = "9d29842c-ecb8-4973-b1e9-a27b1157504e"
 FastLapackInterface = "29a986be-02c6-4525-aec4-84b980013641"
 FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41"
@@ -150,6 +155,8 @@ StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
+blis_jll = "6136c539-28a5-5bf0-87cc-b183200dce32"
+libflame_jll = "8e9d65e3-b2b8-5a9c-baa2-617b4576f0b9"
 
 [targets]
-test = ["Aqua", "Test", "IterativeSolvers", "InteractiveUtils", "KrylovKit", "KrylovPreconditioners", "Pkg", "Random", "SafeTestsets", "MultiFloats", "ForwardDiff", "HYPRE", "MPI", "BlockDiagonals", "FiniteDiff", "BandedMatrices", "FastAlmostBandedMatrices", "StaticArrays", "AllocCheck", "StableRNGs", "Zygote", "RecursiveFactorization", "Sparspak", "FastLapackInterface", "SparseArrays", "ExplicitImports"]
+test = ["Aqua", "Test", "IterativeSolvers", "InteractiveUtils", "KrylovKit", "KrylovPreconditioners", "Pkg", "Random", "SafeTestsets", "MultiFloats", "ForwardDiff", "HYPRE", "MPI", "BlockDiagonals", "FiniteDiff", "BandedMatrices", "blis_jll", "libflame_jll", "FastAlmostBandedMatrices", "StaticArrays", "AllocCheck", "StableRNGs", "Zygote", "RecursiveFactorization", "Sparspak", "FastLapackInterface", "SparseArrays", "ExplicitImports"]
diff --git a/docs/src/solvers/solvers.md b/docs/src/solvers/solvers.md
@@ -16,10 +16,12 @@ the best choices, with SVD being the slowest but most precise.
 
 For efficiency, `RFLUFactorization` is the fastest for dense LU-factorizations until around
 150x150 matrices, though this can be dependent on the exact details of the hardware. After this
-point, `MKLLUFactorization` is usually faster on most hardware. Note that on Mac computers
-that `AppleAccelerateLUFactorization` is generally always the fastest. `LUFactorization` will
-use your base system BLAS which can be fast or slow depending on the hardware configuration.
-`SimpleLUFactorization` will be fast only on very small matrices but can cut down on compile times.
+point, `MKLLUFactorization` is usually faster on most hardware. `BLISLUFactorization` provides
+another high-performance option that combines optimized BLAS operations from BLIS with optimized LAPACK routines from libflame.
+Note that on Mac computers that `AppleAccelerateLUFactorization` is generally always the fastest. 
+`LUFactorization` will use your base system BLAS which can be fast or slow depending on the hardware 
+configuration. `SimpleLUFactorization` will be fast only on very small matrices but can cut down on 
-Note that on Mac computers that `AppleAccelerateLUFactorization` is generally always the fastest. 
-`LUFactorization` will use your base system BLAS which can be fast or slow depending on the hardware 
-configuration. `SimpleLUFactorization` will be fast only on very small matrices but can cut down on 
+Note that on Mac computers that `AppleAccelerateLUFactorization` is generally always the fastest.
+`LUFactorization` will use your base system BLAS which can be fast or slow depending on the hardware
+configuration. `SimpleLUFactorization` will be fast only on very small matrices but can cut down on
-Note that on Mac computers that `AppleAccelerateLUFactorization` is generally always the fastest. 
-`LUFactorization` will use your base system BLAS which can be fast or slow depending on the hardware 
-configuration. `SimpleLUFactorization` will be fast only on very small matrices but can cut down on 
+Note that on Mac computers that `AppleAccelerateLUFactorization` is generally always the fastest.
+`LUFactorization` will use your base system BLAS which can be fast or slow depending on the hardware
+configuration. `SimpleLUFactorization` will be fast only on very small matrices but can cut down on
+compile times.
 
 For very large dense factorizations, offloading to the GPU can be preferred. Metal.jl can be used
 on Mac hardware to offload, and has a cutoff point of being faster at around size 20,000 x 20,000
@@ -185,6 +187,18 @@ KrylovJL
 MKLLUFactorization
 ```
 
+### BLIS.jl
+
+!!! note
+
+    Using this solver requires that both blis_jll and libflame_jll packages are available. 
+    The solver will be automatically available when both packages are loaded, i.e., 
-    Using this solver requires that both blis_jll and libflame_jll packages are available. 
-    The solver will be automatically available when both packages are loaded, i.e., 
+    Using this solver requires that both blis_jll and libflame_jll packages are available.
+    The solver will be automatically available when both packages are loaded, i.e.,
-    Using this solver requires that both blis_jll and libflame_jll packages are available. 
-    The solver will be automatically available when both packages are loaded, i.e., 
+    Using this solver requires that both blis_jll and libflame_jll packages are available.
+    The solver will be automatically available when both packages are loaded, i.e.,
+    `using blis_jll, libflame_jll`.
+
+```@docs
+BLISLUFactorization
+```
+
 ### AppleAccelerate.jl
 
 !!! note

diff --git a/ext/LinearSolveBLISExt.jl b/ext/LinearSolveBLISExt.jl
@@ -0,0 +1,278 @@
+"""
+LinearSolveBLISExt
+
+Extension module that provides BLIS (BLAS-like Library Instantiation Software) integration
+for LinearSolve.jl. This extension combines BLIS for optimized BLAS operations with 
+libflame for optimized LAPACK operations, providing a fully optimized linear algebra 
-for LinearSolve.jl. This extension combines BLIS for optimized BLAS operations with 
-libflame for optimized LAPACK operations, providing a fully optimized linear algebra 
+for LinearSolve.jl. This extension combines BLIS for optimized BLAS operations with
+libflame for optimized LAPACK operations, providing a fully optimized linear algebra
-for LinearSolve.jl. This extension combines BLIS for optimized BLAS operations with 
-libflame for optimized LAPACK operations, providing a fully optimized linear algebra 
+for LinearSolve.jl. This extension combines BLIS for optimized BLAS operations with
+libflame for optimized LAPACK operations, providing a fully optimized linear algebra
+backend.
+
+Key features:
+- Uses BLIS for BLAS operations (matrix multiplication, etc.)
+- Uses libflame for LAPACK operations (LU factorization, solve, etc.)
+- Supports all standard numeric types (Float32/64, ComplexF32/64)
+- Follows MKL-style ccall patterns for consistency
- Uses BLIS for BLAS operations (matrix multiplication, etc.)
- Uses libflame for LAPACK operations (LU factorization, solve, etc.)
- Supports all standard numeric types (Float32/64, ComplexF32/64)
- Follows MKL-style ccall patterns for consistency
+
+  - Uses BLIS for BLAS operations (matrix multiplication, etc.)
+  - Uses libflame for LAPACK operations (LU factorization, solve, etc.)
+  - Supports all standard numeric types (Float32/64, ComplexF32/64)
+  - Follows MKL-style ccall patterns for consistency
- Uses BLIS for BLAS operations (matrix multiplication, etc.)
- Uses libflame for LAPACK operations (LU factorization, solve, etc.)
- Supports all standard numeric types (Float32/64, ComplexF32/64)
- Follows MKL-style ccall patterns for consistency
+
+  - Uses BLIS for BLAS operations (matrix multiplication, etc.)
+  - Uses libflame for LAPACK operations (LU factorization, solve, etc.)
+  - Supports all standard numeric types (Float32/64, ComplexF32/64)
+  - Follows MKL-style ccall patterns for consistency
+"""
+module LinearSolveBLISExt
+
+using Libdl
+using blis_jll
+using libflame_jll
+using LinearAlgebra
+using LinearSolve
+
+using LinearAlgebra: BlasInt, LU, libblastrampoline
+using LinearAlgebra.LAPACK: require_one_based_indexing, chkfinite, chkstride1, 
-using LinearAlgebra.LAPACK: require_one_based_indexing, chkfinite, chkstride1, 
+using LinearAlgebra.LAPACK: require_one_based_indexing, chkfinite, chkstride1,
-using LinearAlgebra.LAPACK: require_one_based_indexing, chkfinite, chkstride1, 
+using LinearAlgebra.LAPACK: require_one_based_indexing, chkfinite, chkstride1,
+                            @blasfunc, chkargsok
+using LinearSolve: ArrayInterface, BLISLUFactorization, @get_cacheval, LinearCache, SciMLBase, do_factorization
-using LinearSolve: ArrayInterface, BLISLUFactorization, @get_cacheval, LinearCache, SciMLBase, do_factorization
+using LinearSolve: ArrayInterface, BLISLUFactorization, @get_cacheval, LinearCache,
+                   SciMLBase, do_factorization
-using LinearSolve: ArrayInterface, BLISLUFactorization, @get_cacheval, LinearCache, SciMLBase, do_factorization
+using LinearSolve: ArrayInterface, BLISLUFactorization, @get_cacheval, LinearCache,
+                   SciMLBase, do_factorization
+
+const global libblis = blis_jll.blis
+const global libflame = libflame_jll.libflame
+
+"""
+    LinearSolve.do_factorization(alg::BLISLUFactorization, A, b, u)
+
+Perform LU factorization using BLIS for the underlying BLAS operations.
+This method converts the matrix to a standard format and calls the BLIS-backed getrf! routine.
+"""
+function LinearSolve.do_factorization(alg::BLISLUFactorization, A, b, u)
+    A = convert(AbstractMatrix, A)
+    ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2)))
+    info = Ref{BlasInt}()
+    A, ipiv, info_val, info_ref = getrf!(A; ipiv=ipiv, info=info)
-    A, ipiv, info_val, info_ref = getrf!(A; ipiv=ipiv, info=info)
+    A, ipiv, info_val, info_ref = getrf!(A; ipiv = ipiv, info = info)
-    A, ipiv, info_val, info_ref = getrf!(A; ipiv=ipiv, info=info)
+    A, ipiv, info_val, info_ref = getrf!(A; ipiv = ipiv, info = info)
+    return LU(A, ipiv, info_val)
+end
+
+function getrf!(A::AbstractMatrix{<:ComplexF64};
+    ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
+    info = Ref{BlasInt}(),
+    check = false)
-    ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
-    info = Ref{BlasInt}(),
-    check = false)
+        ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
+        info = Ref{BlasInt}(),
+        check = false)
-    ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
-    info = Ref{BlasInt}(),
-    check = false)
+        ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
+        info = Ref{BlasInt}(),
+        check = false)
+    require_one_based_indexing(A)
+    check && chkfinite(A)
+    chkstride1(A)
+    m, n = size(A)
+    lda = max(1, stride(A, 2))
+    if isempty(ipiv)
+        ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2)))
+    end
+    ccall(("zgetrf_", libflame), Cvoid,
+        (Ref{BlasInt}, Ref{BlasInt}, Ptr{ComplexF64},
+            Ref{BlasInt}, Ptr{BlasInt}, Ptr{BlasInt}),
+        m, n, A, lda, ipiv, info)
+    chkargsok(info[])
+    A, ipiv, info[], info #Error code is stored in LU factorization type
+end
+
+function getrf!(A::AbstractMatrix{<:ComplexF32};
+    ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
+    info = Ref{BlasInt}(),
+    check = false)
-    ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
-    info = Ref{BlasInt}(),
-    check = false)
+        ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
+        info = Ref{BlasInt}(),
+        check = false)
-    ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
-    info = Ref{BlasInt}(),
-    check = false)
+        ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
+        info = Ref{BlasInt}(),
+        check = false)
+    require_one_based_indexing(A)
+    check && chkfinite(A)
+    chkstride1(A)
+    m, n = size(A)
+    lda = max(1, stride(A, 2))
+    if isempty(ipiv)
+        ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2)))
+    end
+    ccall(("cgetrf_", libflame), Cvoid,
+        (Ref{BlasInt}, Ref{BlasInt}, Ptr{ComplexF32},
+            Ref{BlasInt}, Ptr{BlasInt}, Ptr{BlasInt}),
+        m, n, A, lda, ipiv, info)
+    chkargsok(info[])
+    A, ipiv, info[], info #Error code is stored in LU factorization type
+end
+
+function getrf!(A::AbstractMatrix{<:Float64};
+    ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
+    info = Ref{BlasInt}(),
+    check = false)
-    ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
-    info = Ref{BlasInt}(),
-    check = false)
+        ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
+        info = Ref{BlasInt}(),
+        check = false)
-    ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
-    info = Ref{BlasInt}(),
-    check = false)
+        ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
+        info = Ref{BlasInt}(),
+        check = false)
+    require_one_based_indexing(A)
+    check && chkfinite(A)
+    chkstride1(A)
+    m, n = size(A)
+    lda = max(1, stride(A, 2))
+    if isempty(ipiv)
+        ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2)))
+    end
+    ccall(("dgetrf_", libflame), Cvoid,
+        (Ref{BlasInt}, Ref{BlasInt}, Ptr{Float64},
+            Ref{BlasInt}, Ptr{BlasInt}, Ptr{BlasInt}),
+        m, n, A, lda, ipiv, info)
+    chkargsok(info[])
+    A, ipiv, info[], info #Error code is stored in LU factorization type
+end
+
+function getrf!(A::AbstractMatrix{<:Float32};
+    ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
+    info = Ref{BlasInt}(),
+    check = false)
-    ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
-    info = Ref{BlasInt}(),
-    check = false)
+        ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
+        info = Ref{BlasInt}(),
+        check = false)
-    ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
-    info = Ref{BlasInt}(),
-    check = false)
+        ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
+        info = Ref{BlasInt}(),
+        check = false)
+    require_one_based_indexing(A)
+    check && chkfinite(A)
+    chkstride1(A)
+    m, n = size(A)
+    lda = max(1, stride(A, 2))
+    if isempty(ipiv)
+        ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2)))
+    end
+    ccall(("sgetrf_", libflame), Cvoid,
+        (Ref{BlasInt}, Ref{BlasInt}, Ptr{Float32},
+            Ref{BlasInt}, Ptr{BlasInt}, Ptr{BlasInt}),
+        m, n, A, lda, ipiv, info)
+    chkargsok(info[])
+    A, ipiv, info[], info #Error code is stored in LU factorization type
+end
+
+function getrs!(trans::AbstractChar,
+    A::AbstractMatrix{<:ComplexF64},
+    ipiv::AbstractVector{BlasInt},
+    B::AbstractVecOrMat{<:ComplexF64};
+    info = Ref{BlasInt}())
-    A::AbstractMatrix{<:ComplexF64},
-    ipiv::AbstractVector{BlasInt},
-    B::AbstractVecOrMat{<:ComplexF64};
-    info = Ref{BlasInt}())
+        A::AbstractMatrix{<:ComplexF64},
+        ipiv::AbstractVector{BlasInt},
+        B::AbstractVecOrMat{<:ComplexF64};
+        info = Ref{BlasInt}())
-    A::AbstractMatrix{<:ComplexF64},
-    ipiv::AbstractVector{BlasInt},
-    B::AbstractVecOrMat{<:ComplexF64};
-    info = Ref{BlasInt}())
+        A::AbstractMatrix{<:ComplexF64},
+        ipiv::AbstractVector{BlasInt},
+        B::AbstractVecOrMat{<:ComplexF64};
+        info = Ref{BlasInt}())
+    require_one_based_indexing(A, ipiv, B)
+    LinearAlgebra.LAPACK.chktrans(trans)
+    chkstride1(A, B, ipiv)
+    n = LinearAlgebra.checksquare(A)
+    if n != size(B, 1)
+        throw(DimensionMismatch("B has leading dimension $(size(B,1)), but needs $n"))
+    end
+    if n != length(ipiv)
+        throw(DimensionMismatch("ipiv has length $(length(ipiv)), but needs to be $n"))
+    end
+    nrhs = size(B, 2)
+    ccall((@blasfunc(zgetrs_), libblis), Cvoid,
+        (Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{ComplexF64}, Ref{BlasInt},
+            Ptr{BlasInt}, Ptr{ComplexF64}, Ref{BlasInt}, Ptr{BlasInt}, Clong),
+        trans, n, size(B, 2), A, max(1, stride(A, 2)), ipiv, B, max(1, stride(B, 2)), info,
+        1)
+    LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[]))
+    B
+end
+
+function getrs!(trans::AbstractChar,
+    A::AbstractMatrix{<:ComplexF32},
+    ipiv::AbstractVector{BlasInt},
+    B::AbstractVecOrMat{<:ComplexF32};
+    info = Ref{BlasInt}())
-    A::AbstractMatrix{<:ComplexF32},
-    ipiv::AbstractVector{BlasInt},
-    B::AbstractVecOrMat{<:ComplexF32};
-    info = Ref{BlasInt}())
+        A::AbstractMatrix{<:ComplexF32},
+        ipiv::AbstractVector{BlasInt},
+        B::AbstractVecOrMat{<:ComplexF32};
+        info = Ref{BlasInt}())
-    A::AbstractMatrix{<:ComplexF32},
-    ipiv::AbstractVector{BlasInt},
-    B::AbstractVecOrMat{<:ComplexF32};
-    info = Ref{BlasInt}())
+        A::AbstractMatrix{<:ComplexF32},
+        ipiv::AbstractVector{BlasInt},
+        B::AbstractVecOrMat{<:ComplexF32};
+        info = Ref{BlasInt}())
+    require_one_based_indexing(A, ipiv, B)
+    LinearAlgebra.LAPACK.chktrans(trans)
+    chkstride1(A, B, ipiv)
+    n = LinearAlgebra.checksquare(A)
+    if n != size(B, 1)
+        throw(DimensionMismatch("B has leading dimension $(size(B,1)), but needs $n"))
+    end
+    if n != length(ipiv)
+        throw(DimensionMismatch("ipiv has length $(length(ipiv)), but needs to be $n"))
+    end
+    nrhs = size(B, 2)
+    ccall((@blasfunc(cgetrs_), libblis), Cvoid,
+        (Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{ComplexF32}, Ref{BlasInt},
+            Ptr{BlasInt}, Ptr{ComplexF32}, Ref{BlasInt}, Ptr{BlasInt}, Clong),
+        trans, n, size(B, 2), A, max(1, stride(A, 2)), ipiv, B, max(1, stride(B, 2)), info,
+        1)
+    LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[]))
+    B
+end
+
+function getrs!(trans::AbstractChar,
+    A::AbstractMatrix{<:Float64},
+    ipiv::AbstractVector{BlasInt},
+    B::AbstractVecOrMat{<:Float64};
+    info = Ref{BlasInt}())
-    A::AbstractMatrix{<:Float64},
-    ipiv::AbstractVector{BlasInt},
-    B::AbstractVecOrMat{<:Float64};
-    info = Ref{BlasInt}())
+        A::AbstractMatrix{<:Float64},
+        ipiv::AbstractVector{BlasInt},
+        B::AbstractVecOrMat{<:Float64};
+        info = Ref{BlasInt}())
-    A::AbstractMatrix{<:Float64},
-    ipiv::AbstractVector{BlasInt},
-    B::AbstractVecOrMat{<:Float64};
-    info = Ref{BlasInt}())
+        A::AbstractMatrix{<:Float64},
+        ipiv::AbstractVector{BlasInt},
+        B::AbstractVecOrMat{<:Float64};
+        info = Ref{BlasInt}())
+    require_one_based_indexing(A, ipiv, B)
+    LinearAlgebra.LAPACK.chktrans(trans)
+    chkstride1(A, B, ipiv)
+    n = LinearAlgebra.checksquare(A)
+    if n != size(B, 1)
+        throw(DimensionMismatch("B has leading dimension $(size(B,1)), but needs $n"))
+    end
+    if n != length(ipiv)
+        throw(DimensionMismatch("ipiv has length $(length(ipiv)), but needs to be $n"))
+    end
+    nrhs = size(B, 2)
+    ccall((@blasfunc(dgetrs_), libblis), Cvoid,
+        (Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{Float64}, Ref{BlasInt},
+            Ptr{BlasInt}, Ptr{Float64}, Ref{BlasInt}, Ptr{BlasInt}, Clong),
+        trans, n, size(B, 2), A, max(1, stride(A, 2)), ipiv, B, max(1, stride(B, 2)), info,
+        1)
+    LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[]))
+    B
+end
+
+function getrs!(trans::AbstractChar,
+    A::AbstractMatrix{<:Float32},
+    ipiv::AbstractVector{BlasInt},
+    B::AbstractVecOrMat{<:Float32};
+    info = Ref{BlasInt}())
-    A::AbstractMatrix{<:Float32},
-    ipiv::AbstractVector{BlasInt},
-    B::AbstractVecOrMat{<:Float32};
-    info = Ref{BlasInt}())
+        A::AbstractMatrix{<:Float32},
+        ipiv::AbstractVector{BlasInt},
+        B::AbstractVecOrMat{<:Float32};
+        info = Ref{BlasInt}())
-    A::AbstractMatrix{<:Float32},
-    ipiv::AbstractVector{BlasInt},
-    B::AbstractVecOrMat{<:Float32};
-    info = Ref{BlasInt}())
+        A::AbstractMatrix{<:Float32},
+        ipiv::AbstractVector{BlasInt},
+        B::AbstractVecOrMat{<:Float32};
+        info = Ref{BlasInt}())
+    require_one_based_indexing(A, ipiv, B)
+    LinearAlgebra.LAPACK.chktrans(trans)
+    chkstride1(A, B, ipiv)
+    n = LinearAlgebra.checksquare(A)
+    if n != size(B, 1)
+        throw(DimensionMismatch("B has leading dimension $(size(B,1)), but needs $n"))
+    end
+    if n != length(ipiv)
+        throw(DimensionMismatch("ipiv has length $(length(ipiv)), but needs to be $n"))
+    end
+    nrhs = size(B, 2)
+    ccall((@blasfunc(sgetrs_), libblis), Cvoid,
+        (Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{Float32}, Ref{BlasInt},
+            Ptr{BlasInt}, Ptr{Float32}, Ref{BlasInt}, Ptr{BlasInt}, Clong),
+        trans, n, size(B, 2), A, max(1, stride(A, 2)), ipiv, B, max(1, stride(B, 2)), info,
+        1)
+    LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[]))
+    B
+end
+
+default_alias_A(::BLISLUFactorization, ::Any, ::Any) = false
+default_alias_b(::BLISLUFactorization, ::Any, ::Any) = false
+
+const PREALLOCATED_BLIS_LU = begin
+    A = rand(0, 0)
+    luinst = ArrayInterface.lu_instance(A), Ref{BlasInt}()
+end
+
+function LinearSolve.init_cacheval(alg::BLISLUFactorization, A, b, u, Pl, Pr,
+    maxiters::Int, abstol, reltol, verbose::Bool,
+    assumptions::OperatorAssumptions)
-    maxiters::Int, abstol, reltol, verbose::Bool,
-    assumptions::OperatorAssumptions)
+        maxiters::Int, abstol, reltol, verbose::Bool,
+        assumptions::OperatorAssumptions)
-    maxiters::Int, abstol, reltol, verbose::Bool,
-    assumptions::OperatorAssumptions)
+        maxiters::Int, abstol, reltol, verbose::Bool,
+        assumptions::OperatorAssumptions)
+    PREALLOCATED_BLIS_LU
+end
+
+function LinearSolve.init_cacheval(alg::BLISLUFactorization, A::AbstractMatrix{<:Union{Float32,ComplexF32,ComplexF64}}, b, u, Pl, Pr,
+    maxiters::Int, abstol, reltol, verbose::Bool,
+    assumptions::OperatorAssumptions)
-function LinearSolve.init_cacheval(alg::BLISLUFactorization, A::AbstractMatrix{<:Union{Float32,ComplexF32,ComplexF64}}, b, u, Pl, Pr,
-    maxiters::Int, abstol, reltol, verbose::Bool,
-    assumptions::OperatorAssumptions)
+function LinearSolve.init_cacheval(alg::BLISLUFactorization,
+        A::AbstractMatrix{<:Union{Float32, ComplexF32, ComplexF64}}, b, u, Pl, Pr,
+        maxiters::Int, abstol, reltol, verbose::Bool,
+        assumptions::OperatorAssumptions)
-function LinearSolve.init_cacheval(alg::BLISLUFactorization, A::AbstractMatrix{<:Union{Float32,ComplexF32,ComplexF64}}, b, u, Pl, Pr,
-    maxiters::Int, abstol, reltol, verbose::Bool,
-    assumptions::OperatorAssumptions)
+function LinearSolve.init_cacheval(alg::BLISLUFactorization,
+        A::AbstractMatrix{<:Union{Float32, ComplexF32, ComplexF64}}, b, u, Pl, Pr,
+        maxiters::Int, abstol, reltol, verbose::Bool,
+        assumptions::OperatorAssumptions)
+    A = rand(eltype(A), 0, 0)
+    ArrayInterface.lu_instance(A), Ref{BlasInt}()
+end
+
+function SciMLBase.solve!(cache::LinearCache, alg::BLISLUFactorization;
+    kwargs...)
-    kwargs...)
+        kwargs...)
-    kwargs...)
+        kwargs...)
+    A = cache.A
+    A = convert(AbstractMatrix, A)
+    if cache.isfresh
+        cacheval = @get_cacheval(cache, :BLISLUFactorization)
+        res = getrf!(A; ipiv = cacheval[1].ipiv, info = cacheval[2])
+        fact = LU(res[1:3]...), res[4]
+        cache.cacheval = fact
+        cache.isfresh = false
+    end
+
+    y = ldiv!(cache.u, @get_cacheval(cache, :BLISLUFactorization)[1], cache.b)
+    SciMLBase.build_linear_solution(alg, y, nothing, cache)
+
+    #=
+    A, info = @get_cacheval(cache, :BLISLUFactorization)
+    LinearAlgebra.require_one_based_indexing(cache.u, cache.b)
+    m, n = size(A, 1), size(A, 2)
+    if m > n
+        Bc = copy(cache.b)
+        getrs!('N', A.factors, A.ipiv, Bc; info)
+        return copyto!(cache.u, 1, Bc, 1, n)
+    else
+        copyto!(cache.u, cache.b)
+        getrs!('N', A.factors, A.ipiv, cache.u; info)
+    end
+
+    SciMLBase.build_linear_solution(alg, cache.u, nothing, cache)
+    =#
+end
+
+end
-end
+end
-end
+end