Add casting from f64 to f32 that preserves values and avoids UB

danakj · danakj · commit 3e70ce5c8f96 · 2023-07-30T18:31:45.000-04:00
diff --git a/subspace/num/__private/intrinsics.h b/subspace/num/__private/intrinsics.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <fenv.h>
 #include <stddef.h>
 #include <stdint.h>
 
@@ -618,7 +619,7 @@ sus_pure_const inline constexpr OverflowOut<T> add_with_overflow(T x,
   };
 }
 
-template <class T, class U = decltype(to_signed(std::declval<T>()))>
+template <class T, class U = decltype(into_signed(std::declval<T>()))>
   requires(std::is_integral_v<T> && !std::is_signed_v<T> &&
            ::sus::mem::size_of<T>() <= 8 &&
            ::sus::mem::size_of<T>() == ::sus::mem::size_of<U>())
@@ -631,7 +632,7 @@ sus_pure_const inline constexpr OverflowOut<T> add_with_overflow_signed(
   };
 }
 
-template <class T, class U = decltype(to_unsigned(std::declval<T>()))>
+template <class T, class U = decltype(into_unsigned(std::declval<T>()))>
   requires(std::is_integral_v<T> && std::is_signed_v<T> &&
            ::sus::mem::size_of<T>() <= 8 &&
            ::sus::mem::size_of<T>() == ::sus::mem::size_of<U>())
@@ -668,7 +669,7 @@ sus_pure_const inline constexpr OverflowOut<T> sub_with_overflow(T x,
   };
 }
 
-template <class T, class U = decltype(to_unsigned(std::declval<T>()))>
+template <class T, class U = decltype(into_unsigned(std::declval<T>()))>
   requires(std::is_integral_v<T> && std::is_signed_v<T> &&
            ::sus::mem::size_of<T>() <= 8 &&
            ::sus::mem::size_of<T>() == ::sus::mem::size_of<U>())
@@ -1258,12 +1259,14 @@ sus_pure_const sus_always_inline constexpr int32_t exponent_bits(
       unchecked_shr(into_unsigned_integer(x) & mask, 52));
 }
 
-sus_pure_const sus_always_inline constexpr int32_t exponent_value(
+/// This function requires that `x` is a normal value to produce a valid result.
+sus_pure_const sus_always_inline constexpr int32_t float_normal_exponent_value(
     float x) noexcept {
   return exponent_bits(x) - int32_t{127};
 }
 
-sus_pure_const sus_always_inline constexpr int32_t exponent_value(
+/// This function requires that `x` is a normal value to produce a valid result.
+sus_pure_const sus_always_inline constexpr int32_t float_normal_exponent_value(
     double x) noexcept {
   return exponent_bits(x) - int32_t{1023};
 }
@@ -1399,8 +1402,10 @@ sus_pure_const inline constexpr T truncate_float(T x) noexcept {
                                                                : uint32_t{52};
 
   if (float_is_inf_or_nan(x) || float_is_zero(x)) return x;
+  if (float_nonzero_is_subnormal(x)) [[unlikely]]
+    return T{0};
 
-  const int32_t exponent = exponent_value(x);
+  const int32_t exponent = float_normal_exponent_value(x);
 
   // If the exponent is greater than the most negative mantissa
   // exponent, then x is already an integer.
@@ -1521,4 +1526,31 @@ sus_pure_const inline T next_toward(T from, T to) {
     return std::nexttoward(from, to);
 }
 
+#pragma warning(push)
+// MSVC claims that "overflow in constant arithmetic" occurs on the static_cast
+// in `into_smaller_float()` but we check for overflow first, the conversion is
+// in range.
+#pragma warning(disable : 4756)
+
+// Not constexpr as rounding is always toward zero in a constexpr context.
+template <class Out, class T>
+  requires(std::is_floating_point_v<T> && ::sus::mem::size_of<T>() == 8 &&
+           ::sus::mem::size_of<Out>() == 4)
+sus_pure_const inline Out into_smaller_float(T x) noexcept {
+  if (x <= T{max_value<Out>()} && x >= T{min_value<Out>()}) [[likely]] {
+    // SAFETY: Because the value `x` is at or between two valid values of type
+    // `Out`, the static_cast does not cause UB.
+    return static_cast<Out>(x);  // Handles values in range.
+  }
+  if (x > T{max_value<Out>()}) {
+    return infinity<Out>();  // Handles large values and INFINITY.
+  }
+  if (x < T{min_value<Out>()}) {
+    return negative_infinity<Out>();  // Handles small values and NEG_INFINITY.
+  }
+  return nan<Out>();  // All that's left are NaNs.
+}
+
+#pragma warning(pop)
+
 }  // namespace sus::num::__private
diff --git a/subspace/num/convert.h b/subspace/num/convert.h
@@ -18,17 +18,21 @@
 #include <type_traits>
 
 #include "subspace/construct/to_bits.h"
+#include "subspace/num/__private/intrinsics.h"
 #include "subspace/num/float.h"
 #include "subspace/num/signed_integer.h"
 #include "subspace/num/unsigned_integer.h"
 
-/// Casting from a float to an integer will round the float towards zero,
-/// except:
-/// * NaN will return 0.
-/// * Values larger than the maximum integer value, including `INFINITY`, will
+/// * Casting from a float to an integer will round the float towards zero,
+///   except:
+///   * NaN will return 0.
+///   * Values larger than the maximum integer value, including `INFINITY`, will
 ///   saturate to the maximum value of the integer type.
-/// * Values smaller than the minimum integer value, including `NEG_INFINITY`,
-///   will saturate to the minimum value of the integer type.
+///   * Values smaller than the minimum integer value, including `NEG_INFINITY`,
+///     will saturate to the minimum value of the integer type.
+/// * Casting from an f32 to an f64 preserves the value unchanged.
+/// * Casting f64 to f32 maps out-of-range values to +/-INFINITY; NaN stays NaN.
+
 
 // # ================ From signed integers. ============================
 
@@ -335,13 +339,13 @@ struct sus::construct::ToBitsImpl<T, F> {
       if constexpr (::sus::mem::size_of<F>() == 4u) {
         return from;
       } else {
-        return std::bit_cast<float>(
-            static_cast<uint32_t>(std::bit_cast<uint64_t>(from)));
+        return ::sus::num::__private::into_smaller_float<float>(from);
       }
     } else {
       if constexpr (::sus::mem::size_of<F>() == 4u) {
-        return std::bit_cast<double>(
-            static_cast<uint64_t>(std::bit_cast<uint32_t>(from)));
+        // C++20 Section 7.3.7: A prvalue of type float can be converted to a
+        // prvalue of type double. The value is unchanged.
+        return T{from};
       } else {
         return from;
       }
diff --git a/subspace/num/convert_unittest.cc b/subspace/num/convert_unittest.cc
@@ -354,6 +354,37 @@ TEST(ConvertToBits, isize) {
   }
 }
 
+TEST(ConvertToBits, LosslessFloatConversion) {
+  EXPECT_EQ(sus::to_bits<f64>(-1.8949651689383756e-14_f32),
+            -1.8949651689383756e-14_f64);
+  EXPECT_EQ(sus::to_bits<f32>(-1.8949651689383756e-14_f32),
+            -1.8949651689383756e-14_f32);
+  EXPECT_EQ(sus::to_bits<f64>(-4.59218127443847370761468605771e-102_f64),
+            -4.59218127443847370761468605771e-102_f64);
+}
+
+TEST(ConvertToBits, f64tof32) {
+  EXPECT_EQ(sus::to_bits<f32>(f64::NAN).is_nan(), true);
+  EXPECT_EQ(sus::to_bits<f32>(f64::INFINITY), f32::INFINITY);
+  EXPECT_EQ(sus::to_bits<f32>(f64::NEG_INFINITY), f32::NEG_INFINITY);
+  EXPECT_EQ(sus::to_bits<f32>(f64::MAX), f32::INFINITY);
+  EXPECT_EQ(sus::to_bits<f32>(f64::MIN), f32::NEG_INFINITY);
+
+  // Just past the valid range of values for f32 in either direction. A
+  // static_cast<float>(double) for these values would cause UB.
+  EXPECT_EQ(sus::to_bits<f32>(
+                sus::to_bits<f64>(f32::MIN).next_toward(f64::NEG_INFINITY)),
+            f32::NEG_INFINITY);
+  EXPECT_EQ(
+      sus::to_bits<f32>(sus::to_bits<f64>(f32::MAX).next_toward(f64::INFINITY)),
+      f32::INFINITY);
+
+  // This is a value with bits set throughout the exponent and mantissa. Its
+  // binary exponent (about -337) is far below f32's minimum, so it underflows.
+  EXPECT_EQ(sus::to_bits<f32>(-4.59218127443847370761468605771e-102_f64),
+            -4.59218127443847370761468605771e-102_f32);
+}
+
 TEST(ConvertToBits, f32) {
   static_assert(std::same_as<decltype(sus::to_bits<u16>(0_f32)), u16>);
 
@@ -546,7 +577,8 @@ TEST(ConvertToBits, f64) {
     EXPECT_EQ(sus::to_bits<i64>(0.51_f64), 0_i64);
     EXPECT_EQ(sus::to_bits<i64>(0.9999_f64), 0_i64);
     EXPECT_EQ(sus::to_bits<i64>(1_f64), 1_i64);
-    EXPECT_LT(sus::to_bits<i64>((9223372036854775807_f64).next_toward(0_f64)), i64::MAX);
+    EXPECT_LT(sus::to_bits<i64>((9223372036854775807_f64).next_toward(0_f64)),
+              i64::MAX);
     EXPECT_EQ(sus::to_bits<i64>(9223372036854775807_f64), i64::MAX);
     EXPECT_EQ(sus::to_bits<i64>(9223372036854775807.00001_f64), i64::MAX);
     EXPECT_EQ(sus::to_bits<i64>(9223372036854775807_f64 * 2_f64), i64::MAX);