Skip to content

Commit

Permalink
adding fast/low precision option
Browse files Browse the repository at this point in the history
  • Loading branch information
Geolm committed Jan 25, 2024
1 parent 22cbc5b commit 71e4d1f
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 17 deletions.
9 changes: 6 additions & 3 deletions .github/workflows/cmake-multi-platform.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ jobs:

- name: Test
working-directory: ${{github.workspace}}/
run: ./test
run: ./test_precision
./test_fast

build-macos:
name: macos
Expand All @@ -39,7 +40,8 @@ jobs:

- name: Test
working-directory: ${{github.workspace}}/
run: ./test
run: ./test_precision
./test_fast

build-windows:
name: windows
Expand All @@ -58,4 +60,5 @@ jobs:

- name: Test
working-directory: ${{github.workspace}}\tests\Debug
run: ./test
run: ./test_precision
./test_fast
56 changes: 55 additions & 1 deletion math_intrinsics.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ extern "C" {
static inline simd_vector simd_inv_mant_mask(void){return vreinterpretq_u32_f32(vdupq_n_u32(~0x7f800000));}
static inline simd_vector simd_mant_mask(void){return vreinterpretq_u32_f32(vdupq_n_u32(0x7f800000));}
static inline simd_vector simd_floor(simd_vector a) {return vrndmq_f32(a);}
static inline simd_vector simd_round(simd_vector a) {return vrndnq_f32(a);}
static inline simd_vector simd_neg(simd_vector a) {return vnegq_f32(a);}
static inline simd_vector simd_sqrt(simd_vector a) {return vsqrtq_f32(a);}
static inline simd_vector simd_rcp(simd_vector a) {simd_vector out = vrecpeq_f32(a); return vmulq_f32(out, vrecpsq_f32(out, a));}
Expand All @@ -166,6 +167,7 @@ extern "C" {
#define simd_asin vasinq_f32
#define simd_atan vatanq_f32
#define simd_sincos vsincosq_f32
#define simd_sin vsinq_f32

#else
typedef __m256 simd_vector;
Expand Down Expand Up @@ -202,6 +204,7 @@ extern "C" {
static inline simd_vector simd_inv_mant_mask(void){return _mm256_castsi256_ps(_mm256_set1_epi32(~0x7f800000));}
static inline simd_vector simd_mant_mask(void){return _mm256_castsi256_ps(_mm256_set1_epi32(0x7f800000));}
static inline simd_vector simd_floor(simd_vector a) {return _mm256_floor_ps(a);}
static inline simd_vector simd_round(simd_vector a) {return _mm256_round_ps(a, _MM_FROUND_NINT);}
static inline simd_vector simd_cmp_gt(simd_vector a, simd_vector b) {return _mm256_cmp_ps(a, b, _CMP_GT_OQ);}
static inline simd_vector simd_cmp_ge(simd_vector a, simd_vector b) {return _mm256_cmp_ps(a, b, _CMP_GE_OQ);}
static inline simd_vector simd_cmp_lt(simd_vector a, simd_vector b) {return _mm256_cmp_ps(a, b, _CMP_LT_OQ);}
Expand Down Expand Up @@ -232,6 +235,7 @@ extern "C" {
#define simd_asin mm256_asin_ps
#define simd_atan mm256_atan_ps
#define simd_sincos mm256_sincos_ps
#define simd_sin mm256_sin_ps

#endif

Expand Down Expand Up @@ -605,9 +609,36 @@ float32x4_t vsinq_f32(float32x4_t x)
__m256 mm256_sin_ps(__m256 x)
#endif
{
#ifdef __MATH_INTRINSINCS_FAST__
// range reduction from hlslpp, polynomial computed by lolremez
simd_vector invtau = simd_splat(1.f/SIMD_MATH_TAU);
simd_vector tau = simd_splat(SIMD_MATH_TAU);
simd_vector pi2 = simd_splat(SIMD_MATH_PI2);

// Range reduction (into [-pi, pi] range)
// Formula is x = x - round(x / 2pi) * 2pi
x = simd_sub(x, simd_mul(simd_round(simd_mul(x, invtau)), tau));

simd_vector gt_pi2 = simd_cmp_gt(x, pi2);
simd_vector lt_minus_pi2 = simd_cmp_lt(x, simd_neg(pi2));
simd_vector ox = x;

// Use identities/mirroring to remap into the range of the minimax polynomial
simd_vector pi = simd_splat(SIMD_MATH_PI);
x = simd_select(x, simd_sub(pi, ox), gt_pi2);
x = simd_select(x, simd_sub(simd_neg(pi), ox), lt_minus_pi2);

simd_vector x_squared = simd_mul(x, x);
simd_vector result = simd_polynomial4(x_squared, (float[]){2.6000548e-6f, -1.9806615e-4f, 8.3330173e-3f, -1.6666657e-1f});
result = simd_mul(result, x_squared);
result = simd_fmad(result, x, x);

return result;
#else
simd_vector sinus, cosinus;
simd_sincos(x, &sinus, &cosinus);
return sinus;
#endif
}

//----------------------------------------------------------------------------------------------------------------------
Expand All @@ -617,19 +648,32 @@ float32x4_t vcosq_f32(float32x4_t x)
__m256 mm256_cos_ps(__m256 x)
#endif
{
#ifdef __MATH_INTRINSINCS_FAST__
return simd_sin(simd_sub(simd_splat(SIMD_MATH_PI2), x));
#else
simd_vector sinus, cosinus;
simd_sincos(x, &sinus, &cosinus);
return cosinus;
#endif
}

//----------------------------------------------------------------------------------------------------------------------
// based on https://github.com/jeremybarnes/cephes/blob/master/single/asinf.c
#ifdef __MATH__INTRINSICS__NEON__
float32x4_t vasinq_f32(float32x4_t xx)
#else
__m256 mm256_asin_ps(__m256 xx)
#endif
{
#ifdef __MATH_INTRINSINCS_FAST__
// based on https://developer.download.nvidia.com/cg/asin.html
simd_vector x = xx;
simd_vector negate = simd_select(simd_splat_zero(), simd_splat(1.f), simd_cmp_lt(x, simd_splat_zero()));
x = simd_abs(x);
simd_vector result = simd_polynomial4(x, (float[]){-0.0187293f, 0.0742610f, -0.2121144f, 1.5707288f});
result = simd_sub(simd_splat(SIMD_MATH_PI2), simd_mul(simd_sqrt(simd_sub(simd_splat(1.f), x)), result));
return simd_sub(result, simd_mul(simd_mul(simd_splat(2.f), result), negate));
#else
// based on https://github.com/jeremybarnes/cephes/blob/master/single/asinf.c
simd_vector x = xx;
simd_vector sign = simd_sign(xx);
simd_vector a = simd_abs(x);
Expand All @@ -656,6 +700,7 @@ __m256 mm256_cos_ps(__m256 x)
z = simd_select(z, simd_splat_zero(), greater_one);
z = simd_mul(z, sign);
return z;
#endif
}

//----------------------------------------------------------------------------------------------------------------------
Expand All @@ -666,10 +711,19 @@ __m256 mm256_cos_ps(__m256 x)
__m256 mm256_acos_ps(__m256 x)
#endif
{
#ifdef __MATH_INTRINSINCS_FAST__
simd_vector negate = simd_select(simd_splat_zero(), simd_splat(1.f), simd_cmp_lt(x, simd_splat_zero()));
x = simd_abs(x);
simd_vector result = simd_polynomial4(x, (float[]){-0.0187293f, 0.0742610f, -0.2121144f, 1.5707288f});
result = simd_mul(result, simd_sqrt(simd_sub(simd_splat(1.f), x)));
result = simd_sub(result, simd_mul(simd_mul(simd_splat(2.f), negate), result));
return simd_fmad(negate, simd_splat(SIMD_MATH_PI), result);
#else
simd_vector out_of_bound = simd_cmp_gt(simd_abs(x), simd_splat(1.f));
simd_vector result = simd_sub(simd_splat(SIMD_MATH_PI2), simd_asin(x));
result = simd_select(result, simd_splat_zero(), out_of_bound);
return result;
#endif
}

//----------------------------------------------------------------------------------------------------------------------
Expand Down
9 changes: 6 additions & 3 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,17 @@ set(CMAKE_OSX_ARCHITECTURES arm64;x86_64)

project(math_intrinsics_unit_tests)

add_executable(test test.c math_intrinsics.c)
add_executable(test_precision test.c math_intrinsics.c)
add_executable(test_fast test.c math_intrinsics.c)

if(LINUX)
set(CMAKE_EXE_LINKER_FLAGS "-lm")
endif()

if(MSVC)
target_compile_options(test PRIVATE /W4 /WX /std:c17)
target_compile_options(test_precision PRIVATE /W4 /WX /std:c17)
target_compile_options(test_fast PRIVATE /W4 /WX /std:c17 /D__MATH_INTRINSINCS_FAST__)
else()
target_compile_options(test PRIVATE -Wall -Wextra -Wpedantic -Werror -mavx2 -mfma)
target_compile_options(test_precision PRIVATE -Wall -Wextra -Wpedantic -Werror -mavx2 -mfma)
target_compile_options(test_fast PRIVATE -Wall -Wextra -Wpedantic -Werror -mavx2 -mfma -D__MATH_INTRINSINCS_FAST__)
endif()
29 changes: 19 additions & 10 deletions tests/test.c
Original file line number Diff line number Diff line change
Expand Up @@ -107,27 +107,36 @@ float32x4_t simd_atan2(float32x4_t angle)

#define NUM_SAMPLES (1024)

#ifdef __MATH_INTRINSINCS_FAST__
static const float trigo_threshold = 1.e-06f;
static const float arc_theshold = 1.e-04;
#else
static const float trigo_threshold = FLT_EPSILON;
static const float arc_theshold = 1.e-06f;
#endif


SUITE(trigonometry)
{
printf(".");

#ifdef __MATH__INTRINSICS__AVX__
RUN_TESTp(generic_test, sinf, mm256_sin_ps, -10.f, 10.f, FLT_EPSILON, NUM_SAMPLES, false, "mm256_sin_ps");
RUN_TESTp(generic_test, cosf, mm256_cos_ps, -10.f, 10.f, FLT_EPSILON, NUM_SAMPLES, false, "mm256_cos_ps");
RUN_TESTp(generic_test, acosf, mm256_acos_ps, -1.f, 1.f, 1.e-06f, NUM_SAMPLES, false, "mm256_acos_ps");
RUN_TESTp(generic_test, asinf, mm256_asin_ps, -1.f, 1.f, 1.e-06f, NUM_SAMPLES, false, "mm256_asin_ps");
RUN_TESTp(generic_test, sinf, mm256_sin_ps, -10.f, 10.f, trigo_threshold, NUM_SAMPLES, false, "mm256_sin_ps");
RUN_TESTp(generic_test, cosf, mm256_cos_ps, -10.f, 10.f, trigo_threshold, NUM_SAMPLES, false, "mm256_cos_ps");
RUN_TESTp(generic_test, acosf, mm256_acos_ps, -1.f, 1.f, arc_theshold, NUM_SAMPLES, false, "mm256_acos_ps");
RUN_TESTp(generic_test, asinf, mm256_asin_ps, -1.f, 1.f, arc_theshold, NUM_SAMPLES, false, "mm256_asin_ps");
RUN_TESTp(generic_test, atanf, mm256_atan_ps, -10.f, 10.f, 1.e-04f, NUM_SAMPLES, false, "mm256_atan_ps");

// this task fails on linux and I don't have this OS to debug
#if !defined(__linux__)
RUN_TESTp(generic_test, atan2_angle, simd_atan2, 0.f, 6.28318530f, 3.e-07f, 32768, false, "mm256_atan2_ps");
RUN_TESTp(generic_test, atan2_angle, simd_atan2, 0.f, 6.28318530f, arc_theshold, 32768, false, "mm256_atan2_ps");
#endif
#else
RUN_TESTp(generic_test, sinf, vsinq_f32, -10.f, 10.f, FLT_EPSILON, NUM_SAMPLES, false, "vsinq_f32");
RUN_TESTp(generic_test, cosf, vcosq_f32, -10.f, 10.f, FLT_EPSILON, NUM_SAMPLES, false, "vcosq_f32");
RUN_TESTp(generic_test, acosf, vacosq_f32, -1.f, 1.f, 1.e-06f, NUM_SAMPLES, false, "vacosq_f32");
RUN_TESTp(generic_test, asinf, vasinq_f32, -1.f, 1.f, 1.e-06f, NUM_SAMPLES, false, "vasinq_f32");
RUN_TESTp(generic_test, atanf, vatanq_f32, -10.f, 10.f, 1.e-04f, NUM_SAMPLES, false, "vatanq_f32");
RUN_TESTp(generic_test, sinf, vsinq_f32, -10.f, 10.f, trigo_threshold, NUM_SAMPLES, false, "vsinq_f32");
RUN_TESTp(generic_test, cosf, vcosq_f32, -10.f, 10.f, trigo_threshold, NUM_SAMPLES, false, "vcosq_f32");
RUN_TESTp(generic_test, acosf, vacosq_f32, -1.f, 1.f, arc_theshold, NUM_SAMPLES, false, "vacosq_f32");
RUN_TESTp(generic_test, asinf, vasinq_f32, -1.f, 1.f, arc_theshold, NUM_SAMPLES, false, "vasinq_f32");
RUN_TESTp(generic_test, atanf, vatanq_f32, -10.f, 10.f, arc_theshold, NUM_SAMPLES, false, "vatanq_f32");
RUN_TESTp(generic_test, atan2_angle, simd_atan2, 0.f, 6.28318530f, 3.e-07f, 32768, false, "vatan2q_f32");
#endif
}
Expand Down

0 comments on commit 71e4d1f

Please sign in to comment.