-
Notifications
You must be signed in to change notification settings - Fork 102
Port the CORE-MATH version of cbrt
#475
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,113 +1,226 @@ | ||
/* origin: FreeBSD /usr/src/lib/msun/src/s_cbrt.c */ | ||
/* | ||
* ==================================================== | ||
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. | ||
* | ||
* Developed at SunPro, a Sun Microsystems, Inc. business. | ||
* Permission to use, copy, modify, and distribute this | ||
* software is freely granted, provided that this notice | ||
* is preserved. | ||
* ==================================================== | ||
* | ||
* Optimized by Bruce D. Evans. | ||
/* SPDX-License-Identifier: MIT */ | ||
/* origin: core-math/src/binary64/cbrt/cbrt.c | ||
* Copyright (c) 2021-2022 Alexei Sibidanov. | ||
* Ported to Rust in 2025 by Trevor Gross. | ||
*/ | ||
/* cbrt(x) | ||
* Return cube root of x | ||
*/ | ||
|
||
use core::f64; | ||
|
||
const B1: u32 = 715094163; /* B1 = (1023-1023/3-0.03306235651)*2**20 */ | ||
const B2: u32 = 696219795; /* B2 = (1023-1023/3-54/3-0.03306235651)*2**20 */ | ||
use super::Float; | ||
use super::fenv::Rounding; | ||
use super::support::cold_path; | ||
|
||
/* |1/cbrt(x) - p(x)| < 2**-23.5 (~[-7.93e-8, 7.929e-8]). */ | ||
const P0: f64 = 1.87595182427177009643; /* 0x3ffe03e6, 0x0f61e692 */ | ||
const P1: f64 = -1.88497979543377169875; /* 0xbffe28e0, 0x92f02420 */ | ||
const P2: f64 = 1.621429720105354466140; /* 0x3ff9f160, 0x4a49d6c2 */ | ||
const P3: f64 = -0.758397934778766047437; /* 0xbfe844cb, 0xbee751d9 */ | ||
const P4: f64 = 0.145996192886612446982; /* 0x3fc2b000, 0xd4e4edd7 */ | ||
|
||
// Cube root (f64) | ||
/// | ||
/// Computes the cube root of the argument. | ||
/// Compute the cube root of the argument. | ||
#[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)] | ||
pub fn cbrt(x: f64) -> f64 { | ||
let x1p54 = f64::from_bits(0x4350000000000000); // 0x1p54 === 2 ^ 54 | ||
|
||
let mut ui: u64 = x.to_bits(); | ||
let mut r: f64; | ||
let s: f64; | ||
let mut t: f64; | ||
let w: f64; | ||
let mut hx: u32 = (ui >> 32) as u32 & 0x7fffffff; | ||
|
||
if hx >= 0x7ff00000 { | ||
/* cbrt(NaN,INF) is itself */ | ||
return x + x; | ||
const ESCALE: [f64; 3] = [ | ||
1.0, | ||
hf64!("0x1.428a2f98d728bp+0"), /* 2^(1/3) */ | ||
hf64!("0x1.965fea53d6e3dp+0"), /* 2^(2/3) */ | ||
]; | ||
|
||
/* the polynomial c0+c1*x+c2*x^2+c3*x^3 approximates x^(1/3) on [1,2] | ||
with maximal error < 9.2e-5 (attained at x=2) */ | ||
const C: [f64; 4] = [ | ||
hf64!("0x1.1b0babccfef9cp-1"), | ||
hf64!("0x1.2c9a3e94d1da5p-1"), | ||
hf64!("-0x1.4dc30b1a1ddbap-3"), | ||
hf64!("0x1.7a8d3e4ec9b07p-6"), | ||
]; | ||
|
||
let u0: f64 = hf64!("0x1.5555555555555p-2"); | ||
let u1: f64 = hf64!("0x1.c71c71c71c71cp-3"); | ||
|
||
let rsc = [1.0, -1.0, 0.5, -0.5, 0.25, -0.25]; | ||
|
||
let off = [hf64!("0x1p-53"), 0.0, 0.0, 0.0]; | ||
|
||
let rm = Rounding::get(); | ||
|
||
/* rm=0 for rounding to nearest, and other values for directed roundings */ | ||
let hx: u64 = x.to_bits(); | ||
let mut mant: u64 = hx & f64::SIG_MASK; | ||
let sign: u64 = hx >> 63; | ||
|
||
let mut e: u32 = (hx >> f64::SIG_BITS) as u32 & f64::EXP_SAT; | ||
|
||
if ((e + 1) & f64::EXP_SAT) < 2 { | ||
cold_path(); | ||
|
||
let ix: u64 = hx & !f64::SIGN_MASK; | ||
|
||
/* 0, inf, nan: we return x + x instead of simply x, | ||
to that for x a signaling NaN, it correctly triggers | ||
the invalid exception. */ | ||
if e == f64::EXP_SAT || ix == 0 { | ||
return x + x; | ||
} | ||
|
||
let nz = ix.leading_zeros() - 11; /* subnormal */ | ||
mant <<= nz; | ||
mant &= f64::SIG_MASK; | ||
e = e.wrapping_sub(nz - 1); | ||
} | ||
|
||
e = e.wrapping_add(3072); | ||
let cvt1: u64 = mant | (0x3ffu64 << 52); | ||
let mut cvt5: u64 = cvt1; | ||
|
||
let et: u32 = e / 3; | ||
let it: u32 = e % 3; | ||
|
||
/* 2^(3k+it) <= x < 2^(3k+it+1), with 0 <= it <= 3 */ | ||
cvt5 += u64::from(it) << f64::SIG_BITS; | ||
cvt5 |= sign << 63; | ||
let zz: f64 = f64::from_bits(cvt5); | ||
|
||
/* cbrt(x) = cbrt(zz)*2^(et-1365) where 1 <= zz < 8 */ | ||
let mut isc: u64 = ESCALE[it as usize].to_bits(); // todo: index | ||
isc |= sign << 63; | ||
let cvt2: u64 = isc; | ||
let z: f64 = f64::from_bits(cvt1); | ||
|
||
/* cbrt(zz) = cbrt(z)*isc, where isc encodes 1, 2^(1/3) or 2^(2/3), | ||
and 1 <= z < 2 */ | ||
let r: f64 = 1.0 / z; | ||
let rr: f64 = r * rsc[((it as usize) << 1) | sign as usize]; | ||
let z2: f64 = z * z; | ||
let c0: f64 = C[0] + z * C[1]; | ||
let c2: f64 = C[2] + z * C[3]; | ||
let mut y: f64 = c0 + z2 * c2; | ||
let mut y2: f64 = y * y; | ||
|
||
/* y is an approximation of z^(1/3) */ | ||
let mut h: f64 = y2 * (y * r) - 1.0; | ||
|
||
/* h determines the error between y and z^(1/3) */ | ||
y -= (h * y) * (u0 - u1 * h); | ||
|
||
/* The correction y -= (h*y)*(u0 - u1*h) corresponds to a cubic variant | ||
of Newton's method, with the function f(y) = 1-z/y^3. */ | ||
y *= f64::from_bits(cvt2); | ||
|
||
/* Now y is an approximation of zz^(1/3), | ||
* and rr an approximation of 1/zz. We now perform another iteration of | ||
* Newton-Raphson, this time with a linear approximation only. */ | ||
y2 = y * y; | ||
let mut y2l: f64 = fmaf64(y, y, -y2); | ||
|
||
/* y2 + y2l = y^2 exactly */ | ||
let mut y3: f64 = y2 * y; | ||
let mut y3l: f64 = fmaf64(y, y2, -y3) + y * y2l; | ||
|
||
/* y3 + y3l approximates y^3 with about 106 bits of accuracy */ | ||
h = ((y3 - zz) + y3l) * rr; | ||
let mut dy: f64 = h * (y * u0); | ||
|
||
/* the approximation of zz^(1/3) is y - dy */ | ||
let mut y1: f64 = y - dy; | ||
dy = (y - y1) - dy; | ||
|
||
/* the approximation of zz^(1/3) is now y1 + dy, where |dy| < 1/2 ulp(y) | ||
* (for rounding to nearest) */ | ||
let mut ady: f64 = dy.abs(); | ||
|
||
/* For directed roundings, ady0 is tiny when dy is tiny, or ady0 is near | ||
* from ulp(1); | ||
* for rounding to nearest, ady0 is tiny when dy is near from 1/2 ulp(1), | ||
* or from 3/2 ulp(1). */ | ||
let mut ady0: f64 = (ady - off[rm as usize]).abs(); | ||
let mut ady1: f64 = (ady - (hf64!("0x1p-52") + off[rm as usize])).abs(); | ||
|
||
if ady0 < hf64!("0x1p-75") || ady1 < hf64!("0x1p-75") { | ||
cold_path(); | ||
|
||
y2 = y1 * y1; | ||
y2l = fmaf64(y1, y1, -y2); | ||
y3 = y2 * y1; | ||
y3l = fmaf64(y1, y2, -y3) + y1 * y2l; | ||
h = ((y3 - zz) + y3l) * rr; | ||
dy = h * (y1 * u0); | ||
y = y1 - dy; | ||
dy = (y1 - y) - dy; | ||
y1 = y; | ||
ady = dy.abs(); | ||
ady0 = (ady - off[rm as usize]).abs(); | ||
ady1 = (ady - (hf64!("0x1p-52") + off[rm as usize])).abs(); | ||
|
||
if ady0 < hf64!("0x1p-98") || ady1 < hf64!("0x1p-98") { | ||
cold_path(); | ||
let azz: f64 = zz.abs(); | ||
|
||
// ~ 0x1.79d15d0e8d59b80000000000000ffc3dp+0 | ||
if azz == hf64!("0x1.9b78223aa307cp+1") { | ||
y1 = hf64!("0x1.79d15d0e8d59cp+0").copysign(zz); | ||
} | ||
|
||
// ~ 0x1.de87aa837820e80000000000001c0f08p+0 | ||
if azz == hf64!("0x1.a202bfc89ddffp+2") { | ||
y1 = hf64!("0x1.de87aa837820fp+0").copysign(zz); | ||
} | ||
|
||
if rm != Rounding::Nearest { | ||
let wlist = [ | ||
(hf64!("0x1.3a9ccd7f022dbp+0"), hf64!("0x1.1236160ba9b93p+0")), // ~ 0x1.1236160ba9b930000000000001e7e8fap+0 | ||
(hf64!("0x1.7845d2faac6fep+0"), hf64!("0x1.23115e657e49cp+0")), // ~ 0x1.23115e657e49c0000000000001d7a799p+0 | ||
(hf64!("0x1.d1ef81cbbbe71p+0"), hf64!("0x1.388fb44cdcf5ap+0")), // ~ 0x1.388fb44cdcf5a0000000000002202c55p+0 | ||
(hf64!("0x1.0a2014f62987cp+1"), hf64!("0x1.46bcbf47dc1e8p+0")), // ~ 0x1.46bcbf47dc1e8000000000000303aa2dp+0 | ||
(hf64!("0x1.fe18a044a5501p+1"), hf64!("0x1.95decfec9c904p+0")), // ~ 0x1.95decfec9c9040000000000000159e8ep+0 | ||
(hf64!("0x1.a6bb8c803147bp+2"), hf64!("0x1.e05335a6401dep+0")), // ~ 0x1.e05335a6401de00000000000027ca017p+0 | ||
(hf64!("0x1.ac8538a031cbdp+2"), hf64!("0x1.e281d87098de8p+0")), // ~ 0x1.e281d87098de80000000000000ee9314p+0 | ||
]; | ||
|
||
for (a, b) in wlist { | ||
if azz == a { | ||
let tmp = if rm as u64 + sign == 2 { hf64!("0x1p-52") } else { 0.0 }; | ||
y1 = (b + tmp).copysign(zz); | ||
} | ||
} | ||
} | ||
} | ||
} | ||
|
||
let mut cvt3: u64 = y1.to_bits(); | ||
cvt3 = cvt3.wrapping_add(((et.wrapping_sub(342).wrapping_sub(1023)) as u64) << 52); | ||
let m0: u64 = cvt3 << 30; | ||
let m1 = m0 >> 63; | ||
|
||
if (m0 ^ m1) <= (1u64 << 30) { | ||
cold_path(); | ||
|
||
let mut cvt4: u64 = y1.to_bits(); | ||
cvt4 = (cvt4 + (164 << 15)) & 0xffffffffffff0000u64; | ||
|
||
if ((f64::from_bits(cvt4) - y1) - dy).abs() < hf64!("0x1p-60") || (zz).abs() == 1.0 { | ||
cvt3 = (cvt3 + (1u64 << 15)) & 0xffffffffffff0000u64; | ||
} | ||
} | ||
|
||
/* | ||
* Rough cbrt to 5 bits: | ||
* cbrt(2**e*(1+m) ~= 2**(e/3)*(1+(e%3+m)/3) | ||
* where e is integral and >= 0, m is real and in [0, 1), and "/" and | ||
* "%" are integer division and modulus with rounding towards minus | ||
* infinity. The RHS is always >= the LHS and has a maximum relative | ||
* error of about 1 in 16. Adding a bias of -0.03306235651 to the | ||
* (e%3+m)/3 term reduces the error to about 1 in 32. With the IEEE | ||
* floating point representation, for finite positive normal values, | ||
* ordinary integer divison of the value in bits magically gives | ||
* almost exactly the RHS of the above provided we first subtract the | ||
* exponent bias (1023 for doubles) and later add it back. We do the | ||
* subtraction virtually to keep e >= 0 so that ordinary integer | ||
* division rounds towards minus infinity; this is also efficient. | ||
*/ | ||
if hx < 0x00100000 { | ||
/* zero or subnormal? */ | ||
ui = (x * x1p54).to_bits(); | ||
hx = (ui >> 32) as u32 & 0x7fffffff; | ||
if hx == 0 { | ||
return x; /* cbrt(0) is itself */ | ||
f64::from_bits(cvt3) | ||
} | ||
|
||
fn fmaf64(x: f64, y: f64, z: f64) -> f64 { | ||
#[cfg(intrinsics_enabled)] | ||
{ | ||
return unsafe { core::intrinsics::fmaf64(x, y, z) }; | ||
} | ||
|
||
#[cfg(not(intrinsics_enabled))] | ||
{ | ||
return super::fma(x, y, z); | ||
} | ||
} | ||
Comment on lines
+200
to
+210
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would this be better as a method on There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, that would be preferable. I just did this as a temporary workaround until (I am hoping it will be possible to make this generic by putting the magic numbers in a helper trait and recalculating the polynomials for |
||
|
||
#[cfg(test)] | ||
mod tests { | ||
use super::*; | ||
|
||
#[test] | ||
fn spot_checks() { | ||
if !cfg!(x86_no_sse) { | ||
// Exposes a rounding mode problem. Ignored on i586 because of inaccurate FMA. | ||
assert_biteq!( | ||
cbrt(f64::from_bits(0xf7f792b28f600000)), | ||
f64::from_bits(0xd29ce68655d962f3) | ||
); | ||
} | ||
hx = hx / 3 + B2; | ||
} else { | ||
hx = hx / 3 + B1; | ||
} | ||
ui &= 1 << 63; | ||
ui |= (hx as u64) << 32; | ||
t = f64::from_bits(ui); | ||
|
||
/* | ||
* New cbrt to 23 bits: | ||
* cbrt(x) = t*cbrt(x/t**3) ~= t*P(t**3/x) | ||
* where P(r) is a polynomial of degree 4 that approximates 1/cbrt(r) | ||
* to within 2**-23.5 when |r - 1| < 1/10. The rough approximation | ||
* has produced t such than |t/cbrt(x) - 1| ~< 1/32, and cubing this | ||
* gives us bounds for r = t**3/x. | ||
* | ||
* Try to optimize for parallel evaluation as in __tanf.c. | ||
*/ | ||
r = (t * t) * (t / x); | ||
t = t * ((P0 + r * (P1 + r * P2)) + ((r * r) * r) * (P3 + r * P4)); | ||
|
||
/* | ||
* Round t away from zero to 23 bits (sloppily except for ensuring that | ||
* the result is larger in magnitude than cbrt(x) but not much more than | ||
* 2 23-bit ulps larger). With rounding towards zero, the error bound | ||
* would be ~5/6 instead of ~4/6. With a maximum error of 2 23-bit ulps | ||
* in the rounded t, the infinite-precision error in the Newton | ||
* approximation barely affects third digit in the final error | ||
* 0.667; the error in the rounded t can be up to about 3 23-bit ulps | ||
* before the final error is larger than 0.667 ulps. | ||
*/ | ||
ui = t.to_bits(); | ||
ui = (ui + 0x80000000) & 0xffffffffc0000000; | ||
t = f64::from_bits(ui); | ||
|
||
/* one step Newton iteration to 53 bits with error < 0.667 ulps */ | ||
s = t * t; /* t*t is exact */ | ||
r = x / s; /* error <= 0.5 ulps; |r| < |t| */ | ||
w = t + t; /* t+t is exact */ | ||
r = (r - t) / (w + r); /* r-t is exact; w+r ~= 3*t */ | ||
t = t + t * r; /* error <= 0.5 + 0.5/3 + epsilon */ | ||
t | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: this comment should be next to the
let rm
above, not thelet hx
below. Also the comment maybe needs modifying now thatrm
is an enum, not an integer?Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good catch, I will update this. Thank you for reviewing!
Before merging I still want to include the
.wc
tests from core-math. Or maybe download/submodule those similar to musl since each is ~100k entries.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I haven't gotten around to this unfortunately. There is now https://github.com/rust-lang/libm/blob/e66ec88df8325fbe151939c4dc0a9f7c25759fdf/crates/libm-test/src/gen/case_list.rs, it should be reasonably easy to parse
.wc
files there from a core-math submodule.That being said, I our tests are thorough enough that I think I can just merge this now and add that later.