Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 12 additions & 12 deletions ggml/src/ggml-cpu/arch/loongarch/quants.c
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,18 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128

return ((v4f32)res)[0];
}

// multiply int8_t, add results pairwise twice
static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
// Get absolute values of x vectors
const __m128i ax = __lsx_vsigncov_b(x, x);
// Sign the values of the y vectors
const __m128i sy = __lsx_vsigncov_b(x, y);
// Perform multiplication and create 16-bit values
const __m128i dot = lsx_maddubs_h(ax, sy);
const __m128i ones = __lsx_vreplgr2vr_h(1);
return lsx_madd_h(ones, dot);
}
#endif

#if defined(__loongarch_asx)
Expand Down Expand Up @@ -323,18 +335,6 @@ static inline __m256i lasx_xvandi_b_bit(__m256i a, const unsigned int b) {
}
}

// multiply int8_t, add results pairwise twice
static inline __m128i mul_sum_i8_pairs(const __m128i x, const __m128i y) {
// Get absolute values of x vectors
const __m128i ax = __lsx_vsigncov_b(x, x);
// Sign the values of the y vectors
const __m128i sy = __lsx_vsigncov_b(x, y);
// Perform multiplication and create 16-bit values
const __m128i dot = lsx_maddubs_h(ax, sy);
const __m128i ones = __lsx_vreplgr2vr_h(1);
return lsx_madd_h(ones, dot);
}

// horizontally add 8 floats
static inline float hsum_float_8(const __m256 x) {
__m128 res = lasx_extractf128(x, 1);
Expand Down
16 changes: 8 additions & 8 deletions ggml/src/ggml-cpu/simd-mappings.h
Original file line number Diff line number Diff line change
Expand Up @@ -998,9 +998,9 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
#define GGML_F32_EPR 4

#define GGML_F32x4 __m128
#define GGML_F32x4_ZERO __lsx_vldi(0)
#define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
#define GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
#define GGML_F32x4_ZERO (__m128)__lsx_vldi(0)
#define GGML_F32x4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
#define GGML_F32x4_LOAD(x) (__m128)__lsx_vld((x), 0)
#define GGML_F32x4_STORE(x, y) __lsx_vst(y, x, 0)
#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
#define GGML_F32x4_ADD __lsx_vfadd_s
Expand All @@ -1022,7 +1022,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
__m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
const __m128 t0 = (__m128)__lsx_vshuf4i_w(tmp, 0x88); \
tmp = __lsx_vsrli_d((__m128i) t0, 32); \
tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
Expand Down Expand Up @@ -1052,7 +1052,7 @@ static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
tmp[2] = GGML_CPU_FP16_TO_FP32(x[2]);
tmp[3] = GGML_CPU_FP16_TO_FP32(x[3]);

return __lsx_vld(tmp, 0);
return (__m128)__lsx_vld(tmp, 0);
}

static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
Expand All @@ -1067,9 +1067,9 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
}

#define GGML_F32Cx4 __m128
#define GGML_F32Cx4_ZERO __lsx_vldi(0)
#define GGML_F32Cx4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
#define GGML_F32Cx4_LOAD(x) __lsx_f16x4_load(x)
#define GGML_F32Cx4_ZERO (__m128)__lsx_vldi(0)
#define GGML_F32Cx4_SET1(x) (__m128)__lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
#define GGML_F32Cx4_LOAD(x) (__m128)__lsx_f16x4_load(x)
#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
#define GGML_F32Cx4_FMA GGML_F32x4_FMA
#define GGML_F32Cx4_ADD __lsx_vfadd_s
Expand Down
Loading