[Runtime] Preserve denormals in floating point conversions (#20252)

bjacob · web-flow · commit de4031d63f7b · 2025-03-14T20:03:39.000-04:00
These helper functions are used notably for printing and parsing, so flushing denormals to zero there was not a good idea, even if actual codegen would flush to zero. For example, flushing to zero when printing could hide denormals, and parsing to zero could prevent reproducing issues with denormals. A couple of unrelated changes are lumped into this PR: dropping some redundant branches that handle `nan_as_neg_zero` (which is used in FP8 types that have FN in their name). I just spotted that, and it is redundant because it is already handled at the end of the function; the tests still passing confirm that. As rightly called out by @krzysz00 in #20242 (review). Signed-off-by: Benoit Jacob <jacob.benoit.1@gmail.com>
diff --git a/runtime/src/iree/base/internal/math.h b/runtime/src/iree/base/internal/math.h
@@ -304,12 +304,8 @@ static inline float iree_math_make_f32_from_bits(uint32_t src, int exp_bits,
   const uint32_t f32_sign = src_sign << (f32_sign_shift - src_sign_shift);
   const uint32_t src_exp = src & src_exp_mask;
   const uint32_t src_mantissa = src & src_mantissa_mask;
-  // Initializing f32_exp and f32_mantissa for the case of normal finite values.
-  // Below we will overload that in other cases.
-  uint32_t f32_exp = ((src_exp >> src_exp_shift) + f32_exp_bias - src_exp_bias)
-                     << f32_exp_shift;
-  uint32_t f32_mantissa = src_mantissa
-                          << (f32_mantissa_bits - src_mantissa_bits);
+  uint32_t f32_exp = 0;
+  uint32_t f32_mantissa = 0;
   if (src_exp == src_exp_mask) {
     // Top exponent value normally means infinity or NaN.
     if (have_infinity) {
@@ -333,16 +329,16 @@ static inline float iree_math_make_f32_from_bits(uint32_t src, int exp_bits,
         f32_mantissa = f32_mantissa_mask;
       }
     }
-  } else if (src_exp == 0) {
-    // Zero or subnormal. Generate zero, except in one case: if the source type
-    // encodes NaN as signed zero, we handle that now.
-    if (nan_as_neg_zero && src == src_sign_mask) {
-      f32_exp = f32_exp_mask;
-      f32_mantissa = f32_mantissa_mask;
-    } else {
-      f32_exp = 0;
-      f32_mantissa = 0;
-    }
+  } else if (nan_as_neg_zero && src == src_sign_mask) {
+    // Source is NaN encoded as negative zero. Generate NaN.
+    f32_exp = f32_exp_mask;
+    f32_mantissa = f32_mantissa_mask;
+  } else if (src_exp == 0 && src_mantissa == 0) {
+    // Zero. Leave f32_exp and f32_mantissa as zero.
+  } else {
+    f32_exp = ((src_exp >> src_exp_shift) + f32_exp_bias - src_exp_bias)
+              << f32_exp_shift;
+    f32_mantissa = src_mantissa << (f32_mantissa_bits - src_mantissa_bits);
   }
   const uint32_t u32_value = f32_sign | f32_exp | f32_mantissa;
   float f32_value;
@@ -378,11 +374,16 @@ static inline uint32_t iree_math_truncate_f32_to_bits_rounding_to_nearest_even(
       // Inf. Leave zero mantissa.
     }
   } else if (f32_exp == 0) {
-    // Zero or subnormal. Generate zero. Leave zero mantissa.
-    if (nan_as_neg_zero) {
-      // The destination has no signed zero. Avoid accidentally generating NaN.
-      dst_sign = 0;
+    // Zero or subnormal.
+    if (dst_exp_bits == f32_exp_bits) {
+      // When the destination type still has as many exponent bits, denormals
+      // can remain nonzero. This happens only with the bf16 type.
+      // Just truncate the mantissa. Not worth bothering with round-to-nearest
+      // for denormals for bf16 only.
+      dst_mantissa = f32_mantissa >> (f32_mantissa_bits - dst_mantissa_bits);
     }
+    // The destination type has fewer exponent bits, so f32 subnormal values
+    // become exactly zero. Leave the mantissa zero.
   } else {
     // Normal finite value.
     int arithmetic_exp = (f32_exp >> f32_exp_shift) - f32_exp_bias;
@@ -397,8 +398,19 @@ static inline uint32_t iree_math_truncate_f32_to_bits_rounding_to_nearest_even(
         generate_nan = true;
       }
     } else if (arithmetic_exp + dst_exp_bias <= 0) {
-      // Underflow. Generate zero. Leave zero mantissa.
+      // Underflow. Generate a subnormal or zero.
       dst_exp = 0;
+      // The exponent has to be clamped to 0 when the value
+      // (arithmetic_exp + dst_exp_bias) is negative. This has to be compensated
+      // by right-shifting the subnormal mantissa.
+      int exp_to_encode_as_bitshift = -(arithmetic_exp + dst_exp_bias);
+      int shift_amount =
+          f32_mantissa_bits - dst_mantissa_bits + exp_to_encode_as_bitshift;
+      if (shift_amount >= f32_mantissa_bits) {
+        dst_mantissa = 0;
+      } else {
+        dst_mantissa = f32_mantissa >> shift_amount;
+      }
     } else {
       // Normal case.
       // Implement round-to-nearest-even, by adding a bias before truncating.
diff --git a/runtime/src/iree/base/internal/math_test.cc b/runtime/src/iree/base/internal/math_test.cc
@@ -192,10 +192,10 @@ TEST(F16ConversionTest, F32ToF16) {
   // Underflow
   EXPECT_EQ(0, iree_math_f32_to_f16(FLT_MIN));
   EXPECT_EQ(0x8000, iree_math_f32_to_f16(-FLT_MIN));
-  EXPECT_EQ(0, iree_math_f32_to_f16(1.0e-05));
-  EXPECT_EQ(0x8000, iree_math_f32_to_f16(-1.0e-05));
-  EXPECT_EQ(0, iree_math_f32_to_f16(6.1e-05));  // Near largest denormal
-  EXPECT_EQ(0x8000, iree_math_f32_to_f16(-6.1e-05));
+  EXPECT_EQ(0x004F, iree_math_f32_to_f16(1.0e-05));
+  EXPECT_EQ(0x804F, iree_math_f32_to_f16(-1.0e-05));
+  EXPECT_EQ(0x03FE, iree_math_f32_to_f16(6.1e-05));  // Near largest denormal
+  EXPECT_EQ(0x83FE, iree_math_f32_to_f16(-6.1e-05));
 
   // Denormals may or may not get flushed to zero. Accept both ways.
   uint16_t positive_denormal = iree_math_f32_to_f16(kF16Min / 2);
@@ -319,7 +319,8 @@ TEST(BF16ConversionTest, F32ToBF16ToF32) {
   EXPECT_EQ(FLT_MIN, iree_math_bf16_to_f32(iree_math_f32_to_bf16(FLT_MIN)));
   EXPECT_EQ(-FLT_MIN, iree_math_bf16_to_f32(iree_math_f32_to_bf16(-FLT_MIN)));
   // Denormals
-  EXPECT_EQ(0.0f, iree_math_bf16_to_f32(iree_math_f32_to_bf16(2.0e-40f)));
+  EXPECT_EQ(1.83670992e-40f,
+            iree_math_bf16_to_f32(iree_math_f32_to_bf16(2.0e-40f)));
   // Inf and Nan
   EXPECT_EQ(INFINITY, iree_math_bf16_to_f32(iree_math_f32_to_bf16(INFINITY)));
   EXPECT_EQ(-INFINITY, iree_math_bf16_to_f32(iree_math_f32_to_bf16(-INFINITY)));
@@ -362,10 +363,10 @@ TEST(F8E5M2ConversionTest, F32ToF8E5M2) {
   // Underflow
   EXPECT_EQ(0, iree_math_f32_to_f8e5m2(FLT_MIN));
   EXPECT_EQ(0x80, iree_math_f32_to_f8e5m2(-FLT_MIN));
-  EXPECT_EQ(0, iree_math_f32_to_f8e5m2(kF8E5M2Min * 0.5f));
+  EXPECT_EQ(0x00, iree_math_f32_to_f8e5m2(kF8E5M2Min * 0.5f));
   EXPECT_EQ(0x80, iree_math_f32_to_f8e5m2(-kF8E5M2Min * 0.5f));
-  EXPECT_EQ(0, iree_math_f32_to_f8e5m2(kF8E5M2Min * 0.75f));
-  EXPECT_EQ(0x80, iree_math_f32_to_f8e5m2(-kF8E5M2Min * 0.75f));
+  EXPECT_EQ(0x02, iree_math_f32_to_f8e5m2(kF8E5M2Min * 0.75f));
+  EXPECT_EQ(0x82, iree_math_f32_to_f8e5m2(-kF8E5M2Min * 0.75f));
 
   // Denormals may or may not get flushed to zero. Accept both ways.
   uint16_t positive_denormal = iree_math_f32_to_f8e5m2(kF8E5M2Min / 2);