This repository was archived by the owner on Apr 28, 2025. It is now read-only.

Port the CORE-MATH version of cbrt #475

Merged: 3 commits, Feb 7, 2025
5 changes: 3 additions & 2 deletions crates/libm-test/src/precision.rs
@@ -41,7 +41,7 @@ pub fn default_ulp(ctx: &CheckCtx) -> u32 {
| Bn::Trunc => 0,

// Operations that aren't required to be exact, but our implementations are.
Bn::Cbrt if ctx.fn_ident != Id::Cbrt => 0,
Bn::Cbrt => 0,

// Bessel functions have large inaccuracies.
Bn::J0 | Bn::J1 | Bn::Y0 | Bn::Y1 | Bn::Jn | Bn::Yn => 8_000_000,
@@ -54,7 +54,6 @@ pub fn default_ulp(ctx: &CheckCtx) -> u32 {
Bn::Atan => 1,
Bn::Atan2 => 2,
Bn::Atanh => 2,
Bn::Cbrt => 1,
Bn::Cos => 1,
Bn::Cosh => 1,
Bn::Erf => 1,
@@ -92,6 +91,7 @@ pub fn default_ulp(ctx: &CheckCtx) -> u32 {
}

match ctx.fn_ident {
Id::Cbrt => ulp = 2,
// FIXME(#401): musl has an incorrect result here.
Id::Fdim => ulp = 2,
Id::Sincosf => ulp = 500,
@@ -119,6 +119,7 @@ pub fn default_ulp(ctx: &CheckCtx) -> u32 {

Id::Asinh => ulp = 3,
Id::Asinhf => ulp = 3,
Id::Cbrt => ulp = 1,
Id::Exp10 | Id::Exp10f => ulp = 1_000_000,
Id::Exp2 | Id::Exp2f => ulp = 10_000_000,
Id::Log1p | Id::Log1pf => ulp = 2,
319 changes: 216 additions & 103 deletions src/math/cbrt.rs
@@ -1,113 +1,226 @@
/* origin: FreeBSD /usr/src/lib/msun/src/s_cbrt.c */
/*
* ====================================================
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunPro, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*
* Optimized by Bruce D. Evans.
/* SPDX-License-Identifier: MIT */
/* origin: core-math/src/binary64/cbrt/cbrt.c
* Copyright (c) 2021-2022 Alexei Sibidanov.
* Ported to Rust in 2025 by Trevor Gross.
*/
/* cbrt(x)
* Return cube root of x
*/

use core::f64;

const B1: u32 = 715094163; /* B1 = (1023-1023/3-0.03306235651)*2**20 */
const B2: u32 = 696219795; /* B2 = (1023-1023/3-54/3-0.03306235651)*2**20 */
use super::Float;
use super::fenv::Rounding;
use super::support::cold_path;

/* |1/cbrt(x) - p(x)| < 2**-23.5 (~[-7.93e-8, 7.929e-8]). */
const P0: f64 = 1.87595182427177009643; /* 0x3ffe03e6, 0x0f61e692 */
const P1: f64 = -1.88497979543377169875; /* 0xbffe28e0, 0x92f02420 */
const P2: f64 = 1.621429720105354466140; /* 0x3ff9f160, 0x4a49d6c2 */
const P3: f64 = -0.758397934778766047437; /* 0xbfe844cb, 0xbee751d9 */
const P4: f64 = 0.145996192886612446982; /* 0x3fc2b000, 0xd4e4edd7 */

/// Cube root (f64)
///
/// Computes the cube root of the argument.
/// Compute the cube root of the argument.
#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
pub fn cbrt(x: f64) -> f64 {
let x1p54 = f64::from_bits(0x4350000000000000); // 0x1p54 === 2 ^ 54

let mut ui: u64 = x.to_bits();
let mut r: f64;
let s: f64;
let mut t: f64;
let w: f64;
let mut hx: u32 = (ui >> 32) as u32 & 0x7fffffff;

if hx >= 0x7ff00000 {
/* cbrt(NaN,INF) is itself */
return x + x;
const ESCALE: [f64; 3] = [
1.0,
hf64!("0x1.428a2f98d728bp+0"), /* 2^(1/3) */
hf64!("0x1.965fea53d6e3dp+0"), /* 2^(2/3) */
];

/* the polynomial c0+c1*x+c2*x^2+c3*x^3 approximates x^(1/3) on [1,2]
with maximal error < 9.2e-5 (attained at x=2) */
const C: [f64; 4] = [
hf64!("0x1.1b0babccfef9cp-1"),
hf64!("0x1.2c9a3e94d1da5p-1"),
hf64!("-0x1.4dc30b1a1ddbap-3"),
hf64!("0x1.7a8d3e4ec9b07p-6"),
];

let u0: f64 = hf64!("0x1.5555555555555p-2");
let u1: f64 = hf64!("0x1.c71c71c71c71cp-3");

let rsc = [1.0, -1.0, 0.5, -0.5, 0.25, -0.25];

let off = [hf64!("0x1p-53"), 0.0, 0.0, 0.0];

let rm = Rounding::get();

/* rm=0 for rounding to nearest, and other values for directed roundings */
let hx: u64 = x.to_bits();
Comment on lines +36 to +39

Contributor (reviewer):

nit: this comment should be next to the let rm above, not the let hx below. Also, the comment may need updating now that rm is an enum, not an integer?

@tgross35 (Contributor, author), Jan 25, 2025:

Good catch, I will update this. Thank you for reviewing!

Before merging I still want to include the .wc tests from core-math. Or maybe download/submodule those similar to musl, since each is ~100k entries.

@tgross35 (Contributor, author):

I haven't gotten around to this, unfortunately. There is now https://github.com/rust-lang/libm/blob/e66ec88df8325fbe151939c4dc0a9f7c25759fdf/crates/libm-test/src/gen/case_list.rs; it should be reasonably easy to parse .wc files there from a core-math submodule.

That being said, our tests are thorough enough that I think I can just merge this now and add that later.
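A minimal sketch, not part of this PR, of what collecting those core-math .wc worst-case inputs for case_list.rs could look like. It assumes each non-comment line starts with one hex-float input field; turning that field into an f64 would go through whatever runtime hex-float parsing the test crate provides.

```rust
use std::io;
use std::path::Path;

/// Hypothetical helper: gather the raw input fields from a core-math `.wc`
/// worst-case file. Assumes `#` starts a comment and that the first
/// whitespace-separated field on each remaining line is the test input.
fn wc_inputs(path: &Path) -> io::Result<Vec<String>> {
    let text = std::fs::read_to_string(path)?;
    let inputs = text
        .lines()
        .map(str::trim)
        // Drop blank lines and comment lines.
        .filter(|l| !l.is_empty() && !l.starts_with('#'))
        // Keep only the first field, i.e. the input value.
        .filter_map(|l| l.split_whitespace().next())
        .map(|s| s.to_string())
        .collect();
    Ok(inputs)
}
```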

let mut mant: u64 = hx & f64::SIG_MASK;
let sign: u64 = hx >> 63;

let mut e: u32 = (hx >> f64::SIG_BITS) as u32 & f64::EXP_SAT;

if ((e + 1) & f64::EXP_SAT) < 2 {
cold_path();

let ix: u64 = hx & !f64::SIGN_MASK;

/* 0, inf, nan: we return x + x instead of simply x,
so that for x a signaling NaN, it correctly triggers
the invalid exception. */
if e == f64::EXP_SAT || ix == 0 {
return x + x;
}

let nz = ix.leading_zeros() - 11; /* subnormal */
mant <<= nz;
mant &= f64::SIG_MASK;
e = e.wrapping_sub(nz - 1);
}

e = e.wrapping_add(3072);
let cvt1: u64 = mant | (0x3ffu64 << 52);
let mut cvt5: u64 = cvt1;

let et: u32 = e / 3;
let it: u32 = e % 3;

/* 2^(3k+it) <= x < 2^(3k+it+1), with 0 <= it <= 3 */
cvt5 += u64::from(it) << f64::SIG_BITS;
cvt5 |= sign << 63;
let zz: f64 = f64::from_bits(cvt5);

/* cbrt(x) = cbrt(zz)*2^(et-1365) where 1 <= zz < 8 */
let mut isc: u64 = ESCALE[it as usize].to_bits(); // todo: index
isc |= sign << 63;
let cvt2: u64 = isc;
let z: f64 = f64::from_bits(cvt1);

/* cbrt(zz) = cbrt(z)*isc, where isc encodes 1, 2^(1/3) or 2^(2/3),
and 1 <= z < 2 */
let r: f64 = 1.0 / z;
let rr: f64 = r * rsc[((it as usize) << 1) | sign as usize];
let z2: f64 = z * z;
let c0: f64 = C[0] + z * C[1];
let c2: f64 = C[2] + z * C[3];
let mut y: f64 = c0 + z2 * c2;
let mut y2: f64 = y * y;

/* y is an approximation of z^(1/3) */
let mut h: f64 = y2 * (y * r) - 1.0;

/* h determines the error between y and z^(1/3) */
y -= (h * y) * (u0 - u1 * h);

/* The correction y -= (h*y)*(u0 - u1*h) corresponds to a cubic variant
of Newton's method, with the function f(y) = 1-z/y^3. */
y *= f64::from_bits(cvt2);

/* Now y is an approximation of zz^(1/3),
* and rr an approximation of 1/zz. We now perform another iteration of
* Newton-Raphson, this time with a linear approximation only. */
y2 = y * y;
let mut y2l: f64 = fmaf64(y, y, -y2);

/* y2 + y2l = y^2 exactly */
let mut y3: f64 = y2 * y;
let mut y3l: f64 = fmaf64(y, y2, -y3) + y * y2l;

/* y3 + y3l approximates y^3 with about 106 bits of accuracy */
h = ((y3 - zz) + y3l) * rr;
let mut dy: f64 = h * (y * u0);

/* the approximation of zz^(1/3) is y - dy */
let mut y1: f64 = y - dy;
dy = (y - y1) - dy;

/* the approximation of zz^(1/3) is now y1 + dy, where |dy| < 1/2 ulp(y)
* (for rounding to nearest) */
let mut ady: f64 = dy.abs();

/* For directed roundings, ady0 is tiny when dy is tiny, or ady0 is close
* to ulp(1);
* for rounding to nearest, ady0 is tiny when dy is close to 1/2 ulp(1),
* or to 3/2 ulp(1). */
let mut ady0: f64 = (ady - off[rm as usize]).abs();
let mut ady1: f64 = (ady - (hf64!("0x1p-52") + off[rm as usize])).abs();

if ady0 < hf64!("0x1p-75") || ady1 < hf64!("0x1p-75") {
cold_path();

y2 = y1 * y1;
y2l = fmaf64(y1, y1, -y2);
y3 = y2 * y1;
y3l = fmaf64(y1, y2, -y3) + y1 * y2l;
h = ((y3 - zz) + y3l) * rr;
dy = h * (y1 * u0);
y = y1 - dy;
dy = (y1 - y) - dy;
y1 = y;
ady = dy.abs();
ady0 = (ady - off[rm as usize]).abs();
ady1 = (ady - (hf64!("0x1p-52") + off[rm as usize])).abs();

if ady0 < hf64!("0x1p-98") || ady1 < hf64!("0x1p-98") {
cold_path();
let azz: f64 = zz.abs();

// ~ 0x1.79d15d0e8d59b80000000000000ffc3dp+0
if azz == hf64!("0x1.9b78223aa307cp+1") {
y1 = hf64!("0x1.79d15d0e8d59cp+0").copysign(zz);
}

// ~ 0x1.de87aa837820e80000000000001c0f08p+0
if azz == hf64!("0x1.a202bfc89ddffp+2") {
y1 = hf64!("0x1.de87aa837820fp+0").copysign(zz);
}

if rm != Rounding::Nearest {
let wlist = [
(hf64!("0x1.3a9ccd7f022dbp+0"), hf64!("0x1.1236160ba9b93p+0")), // ~ 0x1.1236160ba9b930000000000001e7e8fap+0
(hf64!("0x1.7845d2faac6fep+0"), hf64!("0x1.23115e657e49cp+0")), // ~ 0x1.23115e657e49c0000000000001d7a799p+0
(hf64!("0x1.d1ef81cbbbe71p+0"), hf64!("0x1.388fb44cdcf5ap+0")), // ~ 0x1.388fb44cdcf5a0000000000002202c55p+0
(hf64!("0x1.0a2014f62987cp+1"), hf64!("0x1.46bcbf47dc1e8p+0")), // ~ 0x1.46bcbf47dc1e8000000000000303aa2dp+0
(hf64!("0x1.fe18a044a5501p+1"), hf64!("0x1.95decfec9c904p+0")), // ~ 0x1.95decfec9c9040000000000000159e8ep+0
(hf64!("0x1.a6bb8c803147bp+2"), hf64!("0x1.e05335a6401dep+0")), // ~ 0x1.e05335a6401de00000000000027ca017p+0
(hf64!("0x1.ac8538a031cbdp+2"), hf64!("0x1.e281d87098de8p+0")), // ~ 0x1.e281d87098de80000000000000ee9314p+0
];

for (a, b) in wlist {
if azz == a {
let tmp = if rm as u64 + sign == 2 { hf64!("0x1p-52") } else { 0.0 };
y1 = (b + tmp).copysign(zz);
}
}
}
}
}

let mut cvt3: u64 = y1.to_bits();
cvt3 = cvt3.wrapping_add(((et.wrapping_sub(342).wrapping_sub(1023)) as u64) << 52);
let m0: u64 = cvt3 << 30;
let m1 = m0 >> 63;

if (m0 ^ m1) <= (1u64 << 30) {
cold_path();

let mut cvt4: u64 = y1.to_bits();
cvt4 = (cvt4 + (164 << 15)) & 0xffffffffffff0000u64;

if ((f64::from_bits(cvt4) - y1) - dy).abs() < hf64!("0x1p-60") || (zz).abs() == 1.0 {
cvt3 = (cvt3 + (1u64 << 15)) & 0xffffffffffff0000u64;
}
}

/*
* Rough cbrt to 5 bits:
cbrt(2**e*(1+m)) ~= 2**(e/3)*(1+(e%3+m)/3)
* where e is integral and >= 0, m is real and in [0, 1), and "/" and
* "%" are integer division and modulus with rounding towards minus
* infinity. The RHS is always >= the LHS and has a maximum relative
* error of about 1 in 16. Adding a bias of -0.03306235651 to the
* (e%3+m)/3 term reduces the error to about 1 in 32. With the IEEE
* floating point representation, for finite positive normal values,
ordinary integer division of the value in bits magically gives
* almost exactly the RHS of the above provided we first subtract the
* exponent bias (1023 for doubles) and later add it back. We do the
* subtraction virtually to keep e >= 0 so that ordinary integer
* division rounds towards minus infinity; this is also efficient.
*/
if hx < 0x00100000 {
/* zero or subnormal? */
ui = (x * x1p54).to_bits();
hx = (ui >> 32) as u32 & 0x7fffffff;
if hx == 0 {
return x; /* cbrt(0) is itself */
f64::from_bits(cvt3)
}

fn fmaf64(x: f64, y: f64, z: f64) -> f64 {
#[cfg(intrinsics_enabled)]
{
return unsafe { core::intrinsics::fmaf64(x, y, z) };
}

#[cfg(not(intrinsics_enabled))]
{
return super::fma(x, y, z);
}
}
Comment on lines +200 to +210

Contributor (reviewer):

Would this be better as a method on support::Float, similar to abs and copysign? That way the implementation could be shared between this and other (future) users.

@tgross35 (Contributor, author):

Yeah, that would be preferable. I just did this as a temporary workaround until f16 and f128 also have an implementation, to keep the impl_float macro a bit simpler.

(I am hoping it will be possible to make this generic by putting the magic numbers in a helper trait and recalculating the polynomials for f128. But I'll get this cleaned up to merge before starting on that.)
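For reference, a rough sketch of the reviewer's suggestion, hypothetical and not the crate's actual API: expose fma through a shared trait so this port and future users pick between the intrinsic and the soft-float fallback in one place, mirroring the PR's local fmaf64 helper.

```rust
/// Hypothetical extension trait; the real change would fold this into
/// `support::Float` next to `abs` and `copysign`.
trait FloatFma: Copy {
    fn fma(self, y: Self, z: Self) -> Self;
}

impl FloatFma for f64 {
    fn fma(self, y: Self, z: Self) -> Self {
        #[cfg(intrinsics_enabled)]
        {
            // Same path the PR's `fmaf64` takes when intrinsics are enabled.
            return unsafe { core::intrinsics::fmaf64(self, y, z) };
        }

        #[cfg(not(intrinsics_enabled))]
        {
            // Otherwise defer to the crate's own software `fma`.
            return super::fma(self, y, z);
        }
    }
}
```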


#[cfg(test)]
mod tests {
use super::*;

#[test]
fn spot_checks() {
if !cfg!(x86_no_sse) {
// Exposes a rounding mode problem. Ignored on i586 because of inaccurate FMA.
assert_biteq!(
cbrt(f64::from_bits(0xf7f792b28f600000)),
f64::from_bits(0xd29ce68655d962f3)
);
}
hx = hx / 3 + B2;
} else {
hx = hx / 3 + B1;
}
ui &= 1 << 63;
ui |= (hx as u64) << 32;
t = f64::from_bits(ui);

/*
* New cbrt to 23 bits:
* cbrt(x) = t*cbrt(x/t**3) ~= t*P(t**3/x)
* where P(r) is a polynomial of degree 4 that approximates 1/cbrt(r)
* to within 2**-23.5 when |r - 1| < 1/10. The rough approximation
has produced t such that |t/cbrt(x) - 1| ~< 1/32, and cubing this
* gives us bounds for r = t**3/x.
*
* Try to optimize for parallel evaluation as in __tanf.c.
*/
r = (t * t) * (t / x);
t = t * ((P0 + r * (P1 + r * P2)) + ((r * r) * r) * (P3 + r * P4));

/*
* Round t away from zero to 23 bits (sloppily except for ensuring that
* the result is larger in magnitude than cbrt(x) but not much more than
* 2 23-bit ulps larger). With rounding towards zero, the error bound
* would be ~5/6 instead of ~4/6. With a maximum error of 2 23-bit ulps
* in the rounded t, the infinite-precision error in the Newton
* approximation barely affects third digit in the final error
* 0.667; the error in the rounded t can be up to about 3 23-bit ulps
* before the final error is larger than 0.667 ulps.
*/
ui = t.to_bits();
ui = (ui + 0x80000000) & 0xffffffffc0000000;
t = f64::from_bits(ui);

/* one step Newton iteration to 53 bits with error < 0.667 ulps */
s = t * t; /* t*t is exact */
r = x / s; /* error <= 0.5 ulps; |r| < |t| */
w = t + t; /* t+t is exact */
r = (r - t) / (w + r); /* r-t is exact; w+r ~= 3*t */
t = t + t * r; /* error <= 0.5 + 0.5/3 + epsilon */
t
}
22 changes: 22 additions & 0 deletions src/math/fenv.rs
@@ -5,6 +5,9 @@ pub(crate) const FE_UNDERFLOW: i32 = 0;
pub(crate) const FE_INEXACT: i32 = 0;

pub(crate) const FE_TONEAREST: i32 = 0;
pub(crate) const FE_DOWNWARD: i32 = 1;
pub(crate) const FE_UPWARD: i32 = 2;
pub(crate) const FE_TOWARDZERO: i32 = 3;

#[inline]
pub(crate) fn feclearexcept(_mask: i32) -> i32 {
@@ -25,3 +28,22 @@ pub(crate) fn fetestexcept(_mask: i32) -> i32 {
pub(crate) fn fegetround() -> i32 {
FE_TONEAREST
}

#[derive(Clone, Copy, Debug, PartialEq)]
pub(crate) enum Rounding {
Nearest = FE_TONEAREST as isize,
Downward = FE_DOWNWARD as isize,
Upward = FE_UPWARD as isize,
ToZero = FE_TOWARDZERO as isize,
}

impl Rounding {
pub(crate) fn get() -> Self {
match fegetround() {
x if x == FE_DOWNWARD => Self::Downward,
x if x == FE_UPWARD => Self::Upward,
x if x == FE_TOWARDZERO => Self::ToZero,
_ => Self::Nearest,
}
}
}
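As a usage note, a tiny self-contained sketch of the pattern the new cbrt follows with this enum: read the rounding mode once, then index a per-mode table by the discriminant. The names here are illustrative and not part of the patch.

```rust
// Illustrative only: a rounding-dependent offset selected the way cbrt.rs
// selects from its `off` table (non-zero correction only for Nearest).
fn rounding_offset(rm: Rounding) -> f64 {
    // Indexed by discriminant: Nearest = 0, Downward = 1, Upward = 2, ToZero = 3.
    let off = [1.0 / 9007199254740992.0, 0.0, 0.0, 0.0]; // 0x1p-53, then zeros
    off[rm as usize]
}

// let adj = rounding_offset(Rounding::get());
```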