-
Notifications
You must be signed in to change notification settings - Fork 13k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[APFloat] Add APFloat support for FP4 data type #95392
[APFloat] Add APFloat support for FP4 data type #95392
Conversation
@llvm/pr-subscribers-clang @llvm/pr-subscribers-llvm-support Author: Durgadoss R (durga4github) Changes: This patch adds APFloat type support for the E2M1 (FP4) data type. Patch is 20.06 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/95392.diff — 4 Files Affected:
diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp
index ffc5d2d4cd8fc..a863ec7a529b9 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -901,6 +901,7 @@ void MicrosoftCXXNameMangler::mangleFloat(llvm::APFloat Number) {
case APFloat::S_FloatTF32:
case APFloat::S_Float6E3M2FN:
case APFloat::S_Float6E2M3FN:
+ case APFloat::S_Float4E2M1FN:
llvm_unreachable("Tried to mangle unexpected APFloat semantics");
}
diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h
index a9bb6cc9999b1..c24eae8da3797 100644
--- a/llvm/include/llvm/ADT/APFloat.h
+++ b/llvm/include/llvm/ADT/APFloat.h
@@ -197,6 +197,10 @@ struct APFloatBase {
// types, there are no infinity or NaN values. The format is detailed in
// https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
S_Float6E2M3FN,
+ // 4-bit floating point number with bit layout S1E2M1. Unlike IEEE-754
+ // types, there are no infinity or NaN values. The format is detailed in
+ // https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+ S_Float4E2M1FN,
S_x87DoubleExtended,
S_MaxSemantics = S_x87DoubleExtended,
@@ -219,6 +223,7 @@ struct APFloatBase {
static const fltSemantics &FloatTF32() LLVM_READNONE;
static const fltSemantics &Float6E3M2FN() LLVM_READNONE;
static const fltSemantics &Float6E2M3FN() LLVM_READNONE;
+ static const fltSemantics &Float4E2M1FN() LLVM_READNONE;
static const fltSemantics &x87DoubleExtended() LLVM_READNONE;
/// A Pseudo fltsemantic used to construct APFloats that cannot conflict with
@@ -639,6 +644,7 @@ class IEEEFloat final : public APFloatBase {
APInt convertFloatTF32APFloatToAPInt() const;
APInt convertFloat6E3M2FNAPFloatToAPInt() const;
APInt convertFloat6E2M3FNAPFloatToAPInt() const;
+ APInt convertFloat4E2M1FNAPFloatToAPInt() const;
void initFromAPInt(const fltSemantics *Sem, const APInt &api);
template <const fltSemantics &S> void initFromIEEEAPInt(const APInt &api);
void initFromHalfAPInt(const APInt &api);
@@ -656,6 +662,7 @@ class IEEEFloat final : public APFloatBase {
void initFromFloatTF32APInt(const APInt &api);
void initFromFloat6E3M2FNAPInt(const APInt &api);
void initFromFloat6E2M3FNAPInt(const APInt &api);
+ void initFromFloat4E2M1FNAPInt(const APInt &api);
void assign(const IEEEFloat &);
void copySignificand(const IEEEFloat &);
@@ -1067,6 +1074,7 @@ class APFloat : public APFloatBase {
// Below Semantics do not support {NaN or Inf}
case APFloat::S_Float6E3M2FN:
case APFloat::S_Float6E2M3FN:
+ case APFloat::S_Float4E2M1FN:
return false;
}
}
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 1209bf71a287d..fab3052a9c02e 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -69,8 +69,8 @@ enum class fltNonfiniteBehavior {
// encodings do not distinguish between signalling and quiet NaN.
NanOnly,
- // This behavior is present in Float6E3M2FN and Float6E2M3FN types,
- // which do not support Inf or NaN values.
+ // This behavior is present in Float6E3M2FN, Float6E2M3FN and
+ // Float4E2M1FN types, which do not support Inf or NaN values.
FiniteOnly,
};
@@ -147,6 +147,8 @@ static constexpr fltSemantics semFloat6E3M2FN = {
4, -2, 3, 6, fltNonfiniteBehavior::FiniteOnly};
static constexpr fltSemantics semFloat6E2M3FN = {
2, 0, 4, 6, fltNonfiniteBehavior::FiniteOnly};
+static constexpr fltSemantics semFloat4E2M1FN = {
+ 2, 0, 2, 4, fltNonfiniteBehavior::FiniteOnly};
static constexpr fltSemantics semX87DoubleExtended = {16383, -16382, 64, 80};
static constexpr fltSemantics semBogus = {0, 0, 0, 0};
@@ -218,6 +220,8 @@ const llvm::fltSemantics &APFloatBase::EnumToSemantics(Semantics S) {
return Float6E3M2FN();
case S_Float6E2M3FN:
return Float6E2M3FN();
+ case S_Float4E2M1FN:
+ return Float4E2M1FN();
case S_x87DoubleExtended:
return x87DoubleExtended();
}
@@ -254,6 +258,8 @@ APFloatBase::SemanticsToEnum(const llvm::fltSemantics &Sem) {
return S_Float6E3M2FN;
else if (&Sem == &llvm::APFloat::Float6E2M3FN())
return S_Float6E2M3FN;
+ else if (&Sem == &llvm::APFloat::Float4E2M1FN())
+ return S_Float4E2M1FN;
else if (&Sem == &llvm::APFloat::x87DoubleExtended())
return S_x87DoubleExtended;
else
@@ -278,6 +284,7 @@ const fltSemantics &APFloatBase::Float8E4M3B11FNUZ() {
const fltSemantics &APFloatBase::FloatTF32() { return semFloatTF32; }
const fltSemantics &APFloatBase::Float6E3M2FN() { return semFloat6E3M2FN; }
const fltSemantics &APFloatBase::Float6E2M3FN() { return semFloat6E2M3FN; }
+const fltSemantics &APFloatBase::Float4E2M1FN() { return semFloat4E2M1FN; }
const fltSemantics &APFloatBase::x87DoubleExtended() {
return semX87DoubleExtended;
}
@@ -3640,6 +3647,11 @@ APInt IEEEFloat::convertFloat6E2M3FNAPFloatToAPInt() const {
return convertIEEEFloatToAPInt<semFloat6E2M3FN>();
}
+APInt IEEEFloat::convertFloat4E2M1FNAPFloatToAPInt() const {
+ assert(partCount() == 1);
+ return convertIEEEFloatToAPInt<semFloat4E2M1FN>();
+}
+
// This function creates an APInt that is just a bit map of the floating
// point constant as it would appear in memory. It is not a conversion,
// and treating the result as a normal integer is unlikely to be useful.
@@ -3687,6 +3699,9 @@ APInt IEEEFloat::bitcastToAPInt() const {
if (semantics == (const llvm::fltSemantics *)&semFloat6E2M3FN)
return convertFloat6E2M3FNAPFloatToAPInt();
+ if (semantics == (const llvm::fltSemantics *)&semFloat4E2M1FN)
+ return convertFloat4E2M1FNAPFloatToAPInt();
+
assert(semantics == (const llvm::fltSemantics*)&semX87DoubleExtended &&
"unknown format!");
return convertF80LongDoubleAPFloatToAPInt();
@@ -3911,6 +3926,10 @@ void IEEEFloat::initFromFloat6E2M3FNAPInt(const APInt &api) {
initFromIEEEAPInt<semFloat6E2M3FN>(api);
}
+void IEEEFloat::initFromFloat4E2M1FNAPInt(const APInt &api) {
+ initFromIEEEAPInt<semFloat4E2M1FN>(api);
+}
+
/// Treat api as containing the bits of a floating point number.
void IEEEFloat::initFromAPInt(const fltSemantics *Sem, const APInt &api) {
assert(api.getBitWidth() == Sem->sizeInBits);
@@ -3944,6 +3963,8 @@ void IEEEFloat::initFromAPInt(const fltSemantics *Sem, const APInt &api) {
return initFromFloat6E3M2FNAPInt(api);
if (Sem == &semFloat6E2M3FN)
return initFromFloat6E2M3FNAPInt(api);
+ if (Sem == &semFloat4E2M1FN)
+ return initFromFloat4E2M1FNAPInt(api);
llvm_unreachable(nullptr);
}
diff --git a/llvm/unittests/ADT/APFloatTest.cpp b/llvm/unittests/ADT/APFloatTest.cpp
index 7007d944801a7..004face53e2d3 100644
--- a/llvm/unittests/ADT/APFloatTest.cpp
+++ b/llvm/unittests/ADT/APFloatTest.cpp
@@ -1828,6 +1828,7 @@ TEST(APFloatTest, getLargest) {
EXPECT_EQ(28, APFloat::getLargest(APFloat::Float6E3M2FN()).convertToDouble());
EXPECT_EQ(7.5,
APFloat::getLargest(APFloat::Float6E2M3FN()).convertToDouble());
+ EXPECT_EQ(6, APFloat::getLargest(APFloat::Float4E2M1FN()).convertToDouble());
}
TEST(APFloatTest, getSmallest) {
@@ -1900,6 +1901,13 @@ TEST(APFloatTest, getSmallest) {
EXPECT_TRUE(test.isFiniteNonZero());
EXPECT_TRUE(test.isDenormal());
EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+ test = APFloat::getSmallest(APFloat::Float4E2M1FN(), false);
+ expected = APFloat(APFloat::Float4E2M1FN(), "0x0.8p0");
+ EXPECT_FALSE(test.isNegative());
+ EXPECT_TRUE(test.isFiniteNonZero());
+ EXPECT_TRUE(test.isDenormal());
+ EXPECT_TRUE(test.bitwiseIsEqual(expected));
}
TEST(APFloatTest, getSmallestNormalized) {
@@ -1984,6 +1992,14 @@ TEST(APFloatTest, getSmallestNormalized) {
EXPECT_TRUE(test.isSmallestNormalized());
test = APFloat::getSmallestNormalized(APFloat::Float6E3M2FN(), false);
expected = APFloat(APFloat::Float6E3M2FN(), "0x1p-2");
+
+ test = APFloat::getSmallestNormalized(APFloat::Float4E2M1FN(), false);
+ expected = APFloat(APFloat::Float4E2M1FN(), "0x1p0");
+ EXPECT_FALSE(test.isNegative());
+ EXPECT_TRUE(test.isFiniteNonZero());
+ EXPECT_FALSE(test.isDenormal());
+ EXPECT_TRUE(test.bitwiseIsEqual(expected));
+ EXPECT_TRUE(test.isSmallestNormalized());
EXPECT_FALSE(test.isNegative());
EXPECT_TRUE(test.isFiniteNonZero());
EXPECT_FALSE(test.isDenormal());
@@ -2034,7 +2050,9 @@ TEST(APFloatTest, getZero) {
{&APFloat::Float6E3M2FN(), false, true, {0, 0}, 1},
{&APFloat::Float6E3M2FN(), true, true, {0x20ULL, 0}, 1},
{&APFloat::Float6E2M3FN(), false, true, {0, 0}, 1},
- {&APFloat::Float6E2M3FN(), true, true, {0x20ULL, 0}, 1}};
+ {&APFloat::Float6E2M3FN(), true, true, {0x20ULL, 0}, 1},
+ {&APFloat::Float4E2M1FN(), false, true, {0, 0}, 1},
+ {&APFloat::Float4E2M1FN(), true, true, {0x8ULL, 0}, 1}};
const unsigned NumGetZeroTests = std::size(GetZeroTest);
for (unsigned i = 0; i < NumGetZeroTests; ++i) {
APFloat test = APFloat::getZero(*GetZeroTest[i].semantics,
@@ -5283,6 +5301,89 @@ TEST(APFloatTest, Float6ExhaustivePair) {
}
}
+TEST(APFloatTest, Float4ExhaustivePair) {
+ // Test each pair of 4-bit floats with non-standard semantics
+ for (APFloat::Semantics Sem : {APFloat::S_Float4E2M1FN}) {
+ const llvm::fltSemantics &S = APFloat::EnumToSemantics(Sem);
+ for (int i = 0; i < 16; i++) {
+ for (int j = 0; j < 16; j++) {
+ SCOPED_TRACE("sem=" + std::to_string(Sem) + ",i=" + std::to_string(i) +
+ ",j=" + std::to_string(j));
+ APFloat x(S, APInt(4, i));
+ APFloat y(S, APInt(4, j));
+
+ bool losesInfo;
+ APFloat x16 = x;
+ x16.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven,
+ &losesInfo);
+ EXPECT_FALSE(losesInfo);
+ APFloat y16 = y;
+ y16.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven,
+ &losesInfo);
+ EXPECT_FALSE(losesInfo);
+
+ // Add
+ APFloat z = x;
+ z.add(y, APFloat::rmNearestTiesToEven);
+ APFloat z16 = x16;
+ z16.add(y16, APFloat::rmNearestTiesToEven);
+ z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+ EXPECT_TRUE(z.bitwiseIsEqual(z16))
+ << "sem=" << Sem << ", i=" << i << ", j=" << j;
+
+ // Subtract
+ z = x;
+ z.subtract(y, APFloat::rmNearestTiesToEven);
+ z16 = x16;
+ z16.subtract(y16, APFloat::rmNearestTiesToEven);
+ z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+ EXPECT_TRUE(z.bitwiseIsEqual(z16))
+ << "sem=" << Sem << ", i=" << i << ", j=" << j;
+
+ // Multiply
+ z = x;
+ z.multiply(y, APFloat::rmNearestTiesToEven);
+ z16 = x16;
+ z16.multiply(y16, APFloat::rmNearestTiesToEven);
+ z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+ EXPECT_TRUE(z.bitwiseIsEqual(z16))
+ << "sem=" << Sem << ", i=" << i << ", j=" << j;
+
+ // Skip divide by 0
+ if (j == 0 || j == 8)
+ continue;
+
+ // Divide
+ z = x;
+ z.divide(y, APFloat::rmNearestTiesToEven);
+ z16 = x16;
+ z16.divide(y16, APFloat::rmNearestTiesToEven);
+ z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+ EXPECT_TRUE(z.bitwiseIsEqual(z16))
+ << "sem=" << Sem << ", i=" << i << ", j=" << j;
+
+ // Mod
+ z = x;
+ z.mod(y);
+ z16 = x16;
+ z16.mod(y16);
+ z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+ EXPECT_TRUE(z.bitwiseIsEqual(z16))
+ << "sem=" << Sem << ", i=" << i << ", j=" << j;
+
+ // Remainder
+ z = x;
+ z.remainder(y);
+ z16 = x16;
+ z16.remainder(y16);
+ z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+ EXPECT_TRUE(z.bitwiseIsEqual(z16))
+ << "sem=" << Sem << ", i=" << i << ", j=" << j;
+ }
+ }
+ }
+}
+
TEST(APFloatTest, ConvertE4M3FNToE5M2) {
bool losesInfo;
APFloat test(APFloat::Float8E4M3FN(), "1.0");
@@ -6743,7 +6844,7 @@ TEST(APFloatTest, getExactLog2) {
EXPECT_EQ(INT_MIN, APFloat(Semantics, "3.0").getExactLog2Abs());
EXPECT_EQ(INT_MIN, APFloat(Semantics, "-3.0").getExactLog2Abs());
- if (I == APFloat::S_Float6E2M3FN) {
+ if (I == APFloat::S_Float6E2M3FN || I == APFloat::S_Float4E2M1FN) {
EXPECT_EQ(2, APFloat(Semantics, "4.0").getExactLog2());
EXPECT_EQ(INT_MIN, APFloat(Semantics, "-4.0").getExactLog2());
EXPECT_EQ(2, APFloat(Semantics, "4.0").getExactLog2Abs());
@@ -6831,6 +6932,25 @@ TEST(APFloatTest, Float6E2M3FNFromString) {
EXPECT_TRUE(APFloat(APFloat::Float6E2M3FN(), "-0").isNegZero());
}
+TEST(APFloatTest, Float4E2M1FNFromString) {
+ // Exactly representable
+ EXPECT_EQ(6, APFloat(APFloat::Float4E2M1FN(), "6").convertToDouble());
+ // Round down to maximum value
+ EXPECT_EQ(6, APFloat(APFloat::Float4E2M1FN(), "32").convertToDouble());
+
+#ifdef GTEST_HAS_DEATH_TEST
+#ifndef NDEBUG
+ EXPECT_DEATH(APFloat(APFloat::Float4E2M1FN(), "inf"),
+ "This floating point format does not support Inf");
+ EXPECT_DEATH(APFloat(APFloat::Float4E2M1FN(), "nan"),
+ "This floating point format does not support NaN");
+#endif
+#endif
+
+ EXPECT_TRUE(APFloat(APFloat::Float4E2M1FN(), "0").isPosZero());
+ EXPECT_TRUE(APFloat(APFloat::Float4E2M1FN(), "-0").isNegZero());
+}
+
TEST(APFloatTest, ConvertE3M2FToE2M3F) {
bool losesInfo;
APFloat test(APFloat::Float6E3M2FN(), "1.0");
@@ -6857,6 +6977,7 @@ TEST(APFloatTest, ConvertE3M2FToE2M3F) {
EXPECT_EQ(status, APFloat::opInexact);
// Test underflow
+ losesInfo = false;
test = APFloat(APFloat::Float6E3M2FN(), ".0625");
status = test.convert(APFloat::Float6E2M3FN(), APFloat::rmNearestTiesToEven,
&losesInfo);
@@ -6907,6 +7028,42 @@ TEST(APFloatTest, ConvertE2M3FToE3M2F) {
EXPECT_EQ(status, APFloat::opInexact);
}
+TEST(APFloatTest, ConvertDoubleToE2M1F) {
+ bool losesInfo;
+ APFloat test(APFloat::IEEEdouble(), "1.0");
+ APFloat::opStatus status = test.convert(
+ APFloat::Float4E2M1FN(), APFloat::rmNearestTiesToEven, &losesInfo);
+ EXPECT_EQ(1.0, test.convertToDouble());
+ EXPECT_FALSE(losesInfo);
+ EXPECT_EQ(status, APFloat::opOK);
+
+ test = APFloat(APFloat::IEEEdouble(), "0.0");
+ status = test.convert(APFloat::Float4E2M1FN(), APFloat::rmNearestTiesToEven,
+ &losesInfo);
+ EXPECT_EQ(0.0f, test.convertToDouble());
+ EXPECT_FALSE(losesInfo);
+ EXPECT_EQ(status, APFloat::opOK);
+
+ // Test overflow
+ losesInfo = false;
+ test = APFloat(APFloat::IEEEdouble(), "8");
+ status = test.convert(APFloat::Float4E2M1FN(), APFloat::rmNearestTiesToEven,
+ &losesInfo);
+ EXPECT_EQ(6, test.convertToDouble());
+ EXPECT_TRUE(losesInfo);
+ EXPECT_EQ(status, APFloat::opInexact);
+
+ // Test underflow
+ losesInfo = false;
+ test = APFloat(APFloat::IEEEdouble(), "0.25");
+ status = test.convert(APFloat::Float4E2M1FN(), APFloat::rmNearestTiesToEven,
+ &losesInfo);
+ EXPECT_EQ(0., test.convertToDouble());
+ EXPECT_TRUE(losesInfo);
+ EXPECT_FALSE(test.isDenormal());
+ EXPECT_EQ(status, APFloat::opUnderflow | APFloat::opInexact);
+}
+
TEST(APFloatTest, Float6E3M2FNNext) {
APFloat test(APFloat::Float6E3M2FN(), APFloat::uninitialized);
APFloat expected(APFloat::Float6E3M2FN(), APFloat::uninitialized);
@@ -6983,6 +7140,44 @@ TEST(APFloatTest, Float6E2M3FNNext) {
EXPECT_TRUE(test.bitwiseIsEqual(expected));
}
+TEST(APFloatTest, Float4E2M1FNNext) {
+ APFloat test(APFloat::Float4E2M1FN(), APFloat::uninitialized);
+ APFloat expected(APFloat::Float4E2M1FN(), APFloat::uninitialized);
+
+ // 1. NextUp of largest bit pattern is the same
+ test = APFloat::getLargest(APFloat::Float4E2M1FN());
+ expected = APFloat::getLargest(APFloat::Float4E2M1FN());
+ EXPECT_EQ(test.next(false), APFloat::opOK);
+ EXPECT_FALSE(test.isInfinity());
+ EXPECT_FALSE(test.isZero());
+ EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+ // 2. NextUp of smallest negative denormal is -0
+ test = APFloat::getSmallest(APFloat::Float4E2M1FN(), true);
+ expected = APFloat::getZero(APFloat::Float4E2M1FN(), true);
+ EXPECT_EQ(test.next(false), APFloat::opOK);
+ EXPECT_TRUE(test.isNegZero());
+ EXPECT_FALSE(test.isPosZero());
+ EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+ // 3. nextDown of negative of largest value is the same
+ test = APFloat::getLargest(APFloat::Float4E2M1FN(), true);
+ expected = test;
+ EXPECT_EQ(test.next(true), APFloat::opOK);
+ EXPECT_FALSE(test.isInfinity());
+ EXPECT_FALSE(test.isZero());
+ EXPECT_FALSE(test.isNaN());
+ EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+ // 4. nextDown of +0 is smallest negative denormal
+ test = APFloat::getZero(APFloat::Float4E2M1FN(), false);
+ expected = APFloat::getSmallest(APFloat::Float4E2M1FN(), true);
+ EXPECT_EQ(test.next(true), APFloat::opOK);
+ EXPECT_FALSE(test.isZero());
+ EXPECT_TRUE(test.isDenormal());
+ EXPECT_TRUE(test.bitwiseIsEqual(expected));
+}
+
#ifdef GTEST_HAS_DEATH_TEST
#ifndef NDEBUG
TEST(APFloatTest, Float6E3M2FNGetInfNaN) {
@@ -6998,6 +7193,13 @@ TEST(APFloatTest, Float6E2M3FNGetInfNaN) {
EXPECT_DEATH(APFloat::getNaN(APFloat::Float6E2M3FN()),
"This floating point format does not support NaN");
}
+
+TEST(APFloatTest, Float4E2M1FNGetInfNaN) {
+ EXPECT_DEATH(APFloat::getInf(APFloat::Float4E2M1FN()),
+ "This floating point format does not support Inf");
+ EXPECT_DEATH(APFloat::getNaN(APFloat::Float4E2M1FN()),
+ "This floating point format does not support NaN");
+}
#endif
#endif
@@ -7043,6 +7245,27 @@ TEST(APFloatTest, Float6E2M3FNToDouble) {
EXPECT_EQ(0x0.2p0, SmallestDenorm.convertToDouble());
}
+TEST(APFloatTest, Float4E2M1FNToDouble) {
+ APFloat One(APFloat::Float4E2M1FN(), "1.0");
+ EXPECT_EQ(1.0, One.convertToDouble());
+ APFloat Two(APFloat::Float4E2M1FN(), "2.0");
+ EXPECT_EQ(2.0, Two.convertToDouble());
+ APFloat PosLargest = APFloat::getLargest(APFloat::Float4E2M1FN(), false);
+ EXPECT_EQ(6, PosLargest.convertToDouble());
+ APFloat NegLargest = APFloat::getLargest(APFloat::Float4E2M1FN(), true);
+ EXPECT_EQ(-6, NegLargest.convertToDouble());
+ APFloat PosSmallest =
+ APFloat::getSmallestNormalized(APFloat::Float4E2M1FN(), false);
+ EXPECT_EQ(0x1p0, PosSmallest.convertToDouble());
+ APFloat NegSmallest =
+ APFloat::getSmallestNormalized(APFloat::Float4E2M1FN(), true);
+ EXPECT_EQ(-0x1p0, NegSmallest.convertToDouble());
+
+ APFloat SmallestDenorm = APFloat::getSmallest(APFloat::Float4E2M1FN(), false);
+ EXPECT_TRUE(SmallestDenorm.isDenormal());
+ EXPECT_EQ(0x0.8p0, SmallestDenorm.convertToDouble());
+}
+
TEST(APFloatTest, Float6E3M2FNToFloat) {
APFloat PosZero = APFloat::getZero(APFloat::Float6E3M2FN());
APFloat PosZeroToFloat(PosZero.convertToFloat());
@@ -7100,4 +7323,33 @@ TEST(APFloatTest, Float6E2M3FNToFloat) {
EXPECT_TRUE(SmallestDenorm.isDenormal());
EXPECT_EQ(0x0.2p0, SmallestDenorm.convertToFloat());
}
+
+TEST(APFloatTest, Float4E2M1FNToFloat) {
+ APFloat PosZero = APFloat::getZero(APFloat::Float4E2M1FN());
+ APFloat PosZeroToFloat(PosZero.convertToFloat());
+ EXPECT_TRUE(PosZeroToFloat.isPosZero());
+ APFloat NegZero = APFloat::getZero(APFloat::Float4E2M1FN(), true);
+ APFloat NegZeroToFloat(NegZero.convertToFloat());
+ EXPECT_TRUE(NegZeroToFloat.isNegZero());
+
+ APFloat One(APFloat::Float4E2M1FN(), "1.0");
+ EXPECT_EQ(1.0F, One.convertToFloat());
+ APFloat Two(APFloat::Float4E2M1FN(), "2.0");
+ EXPECT_EQ(2.0F, Two.convertToFloat());
+
+ APFloat PosLargest = APFloat::getLargest(APFloat::Float4E2M1FN(), false);
+ EXPECT_EQ(6, PosLargest.convertToFloat());
+ APFloat NegLargest = APFloat::getLargest(APFloat::Float4E2M1FN(), true);
+ EXPECT_EQ(-6, NegLargest.convertToFloat());
+ APFloat PosSmallest =
+ APFloat::getSmallestNormalized(APFloat::Float4E2M1FN(), false);
+ EXPECT_EQ(0x1p0, PosSmallest.convertToFloat());
+ APFloat NegSmallest =
+ APFloat::getSmallestNormalized(APFloat::Float4E2M1FN(), true);
+ EXPECT_EQ(-0x1p0, NegSmallest.convertToFloat());
+
+ APFloat SmallestDenorm = APFloat::getSmallest(APFloat::Float4E2M1FN(), false);
+ EXPECT_TRUE(SmallestDenorm.isDenormal());
+ EXPECT_EQ(...
[truncated]
|
@llvm/pr-subscribers-llvm-adt Author: Durgadoss R (durga4github) Changes: This patch adds APFloat type support for the E2M1 (FP4) data type. Patch is 20.06 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/95392.diff — 4 Files Affected:
diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp
index ffc5d2d4cd8fc..a863ec7a529b9 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -901,6 +901,7 @@ void MicrosoftCXXNameMangler::mangleFloat(llvm::APFloat Number) {
case APFloat::S_FloatTF32:
case APFloat::S_Float6E3M2FN:
case APFloat::S_Float6E2M3FN:
+ case APFloat::S_Float4E2M1FN:
llvm_unreachable("Tried to mangle unexpected APFloat semantics");
}
diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h
index a9bb6cc9999b1..c24eae8da3797 100644
--- a/llvm/include/llvm/ADT/APFloat.h
+++ b/llvm/include/llvm/ADT/APFloat.h
@@ -197,6 +197,10 @@ struct APFloatBase {
// types, there are no infinity or NaN values. The format is detailed in
// https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
S_Float6E2M3FN,
+ // 4-bit floating point number with bit layout S1E2M1. Unlike IEEE-754
+ // types, there are no infinity or NaN values. The format is detailed in
+ // https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+ S_Float4E2M1FN,
S_x87DoubleExtended,
S_MaxSemantics = S_x87DoubleExtended,
@@ -219,6 +223,7 @@ struct APFloatBase {
static const fltSemantics &FloatTF32() LLVM_READNONE;
static const fltSemantics &Float6E3M2FN() LLVM_READNONE;
static const fltSemantics &Float6E2M3FN() LLVM_READNONE;
+ static const fltSemantics &Float4E2M1FN() LLVM_READNONE;
static const fltSemantics &x87DoubleExtended() LLVM_READNONE;
/// A Pseudo fltsemantic used to construct APFloats that cannot conflict with
@@ -639,6 +644,7 @@ class IEEEFloat final : public APFloatBase {
APInt convertFloatTF32APFloatToAPInt() const;
APInt convertFloat6E3M2FNAPFloatToAPInt() const;
APInt convertFloat6E2M3FNAPFloatToAPInt() const;
+ APInt convertFloat4E2M1FNAPFloatToAPInt() const;
void initFromAPInt(const fltSemantics *Sem, const APInt &api);
template <const fltSemantics &S> void initFromIEEEAPInt(const APInt &api);
void initFromHalfAPInt(const APInt &api);
@@ -656,6 +662,7 @@ class IEEEFloat final : public APFloatBase {
void initFromFloatTF32APInt(const APInt &api);
void initFromFloat6E3M2FNAPInt(const APInt &api);
void initFromFloat6E2M3FNAPInt(const APInt &api);
+ void initFromFloat4E2M1FNAPInt(const APInt &api);
void assign(const IEEEFloat &);
void copySignificand(const IEEEFloat &);
@@ -1067,6 +1074,7 @@ class APFloat : public APFloatBase {
// Below Semantics do not support {NaN or Inf}
case APFloat::S_Float6E3M2FN:
case APFloat::S_Float6E2M3FN:
+ case APFloat::S_Float4E2M1FN:
return false;
}
}
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 1209bf71a287d..fab3052a9c02e 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -69,8 +69,8 @@ enum class fltNonfiniteBehavior {
// encodings do not distinguish between signalling and quiet NaN.
NanOnly,
- // This behavior is present in Float6E3M2FN and Float6E2M3FN types,
- // which do not support Inf or NaN values.
+ // This behavior is present in Float6E3M2FN, Float6E2M3FN and
+ // Float4E2M1FN types, which do not support Inf or NaN values.
FiniteOnly,
};
@@ -147,6 +147,8 @@ static constexpr fltSemantics semFloat6E3M2FN = {
4, -2, 3, 6, fltNonfiniteBehavior::FiniteOnly};
static constexpr fltSemantics semFloat6E2M3FN = {
2, 0, 4, 6, fltNonfiniteBehavior::FiniteOnly};
+static constexpr fltSemantics semFloat4E2M1FN = {
+ 2, 0, 2, 4, fltNonfiniteBehavior::FiniteOnly};
static constexpr fltSemantics semX87DoubleExtended = {16383, -16382, 64, 80};
static constexpr fltSemantics semBogus = {0, 0, 0, 0};
@@ -218,6 +220,8 @@ const llvm::fltSemantics &APFloatBase::EnumToSemantics(Semantics S) {
return Float6E3M2FN();
case S_Float6E2M3FN:
return Float6E2M3FN();
+ case S_Float4E2M1FN:
+ return Float4E2M1FN();
case S_x87DoubleExtended:
return x87DoubleExtended();
}
@@ -254,6 +258,8 @@ APFloatBase::SemanticsToEnum(const llvm::fltSemantics &Sem) {
return S_Float6E3M2FN;
else if (&Sem == &llvm::APFloat::Float6E2M3FN())
return S_Float6E2M3FN;
+ else if (&Sem == &llvm::APFloat::Float4E2M1FN())
+ return S_Float4E2M1FN;
else if (&Sem == &llvm::APFloat::x87DoubleExtended())
return S_x87DoubleExtended;
else
@@ -278,6 +284,7 @@ const fltSemantics &APFloatBase::Float8E4M3B11FNUZ() {
const fltSemantics &APFloatBase::FloatTF32() { return semFloatTF32; }
const fltSemantics &APFloatBase::Float6E3M2FN() { return semFloat6E3M2FN; }
const fltSemantics &APFloatBase::Float6E2M3FN() { return semFloat6E2M3FN; }
+const fltSemantics &APFloatBase::Float4E2M1FN() { return semFloat4E2M1FN; }
const fltSemantics &APFloatBase::x87DoubleExtended() {
return semX87DoubleExtended;
}
@@ -3640,6 +3647,11 @@ APInt IEEEFloat::convertFloat6E2M3FNAPFloatToAPInt() const {
return convertIEEEFloatToAPInt<semFloat6E2M3FN>();
}
+APInt IEEEFloat::convertFloat4E2M1FNAPFloatToAPInt() const {
+ assert(partCount() == 1);
+ return convertIEEEFloatToAPInt<semFloat4E2M1FN>();
+}
+
// This function creates an APInt that is just a bit map of the floating
// point constant as it would appear in memory. It is not a conversion,
// and treating the result as a normal integer is unlikely to be useful.
@@ -3687,6 +3699,9 @@ APInt IEEEFloat::bitcastToAPInt() const {
if (semantics == (const llvm::fltSemantics *)&semFloat6E2M3FN)
return convertFloat6E2M3FNAPFloatToAPInt();
+ if (semantics == (const llvm::fltSemantics *)&semFloat4E2M1FN)
+ return convertFloat4E2M1FNAPFloatToAPInt();
+
assert(semantics == (const llvm::fltSemantics*)&semX87DoubleExtended &&
"unknown format!");
return convertF80LongDoubleAPFloatToAPInt();
@@ -3911,6 +3926,10 @@ void IEEEFloat::initFromFloat6E2M3FNAPInt(const APInt &api) {
initFromIEEEAPInt<semFloat6E2M3FN>(api);
}
+void IEEEFloat::initFromFloat4E2M1FNAPInt(const APInt &api) {
+ initFromIEEEAPInt<semFloat4E2M1FN>(api);
+}
+
/// Treat api as containing the bits of a floating point number.
void IEEEFloat::initFromAPInt(const fltSemantics *Sem, const APInt &api) {
assert(api.getBitWidth() == Sem->sizeInBits);
@@ -3944,6 +3963,8 @@ void IEEEFloat::initFromAPInt(const fltSemantics *Sem, const APInt &api) {
return initFromFloat6E3M2FNAPInt(api);
if (Sem == &semFloat6E2M3FN)
return initFromFloat6E2M3FNAPInt(api);
+ if (Sem == &semFloat4E2M1FN)
+ return initFromFloat4E2M1FNAPInt(api);
llvm_unreachable(nullptr);
}
diff --git a/llvm/unittests/ADT/APFloatTest.cpp b/llvm/unittests/ADT/APFloatTest.cpp
index 7007d944801a7..004face53e2d3 100644
--- a/llvm/unittests/ADT/APFloatTest.cpp
+++ b/llvm/unittests/ADT/APFloatTest.cpp
@@ -1828,6 +1828,7 @@ TEST(APFloatTest, getLargest) {
EXPECT_EQ(28, APFloat::getLargest(APFloat::Float6E3M2FN()).convertToDouble());
EXPECT_EQ(7.5,
APFloat::getLargest(APFloat::Float6E2M3FN()).convertToDouble());
+ EXPECT_EQ(6, APFloat::getLargest(APFloat::Float4E2M1FN()).convertToDouble());
}
TEST(APFloatTest, getSmallest) {
@@ -1900,6 +1901,13 @@ TEST(APFloatTest, getSmallest) {
EXPECT_TRUE(test.isFiniteNonZero());
EXPECT_TRUE(test.isDenormal());
EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+ test = APFloat::getSmallest(APFloat::Float4E2M1FN(), false);
+ expected = APFloat(APFloat::Float4E2M1FN(), "0x0.8p0");
+ EXPECT_FALSE(test.isNegative());
+ EXPECT_TRUE(test.isFiniteNonZero());
+ EXPECT_TRUE(test.isDenormal());
+ EXPECT_TRUE(test.bitwiseIsEqual(expected));
}
TEST(APFloatTest, getSmallestNormalized) {
@@ -1984,6 +1992,14 @@ TEST(APFloatTest, getSmallestNormalized) {
EXPECT_TRUE(test.isSmallestNormalized());
test = APFloat::getSmallestNormalized(APFloat::Float6E3M2FN(), false);
expected = APFloat(APFloat::Float6E3M2FN(), "0x1p-2");
+
+ test = APFloat::getSmallestNormalized(APFloat::Float4E2M1FN(), false);
+ expected = APFloat(APFloat::Float4E2M1FN(), "0x1p0");
+ EXPECT_FALSE(test.isNegative());
+ EXPECT_TRUE(test.isFiniteNonZero());
+ EXPECT_FALSE(test.isDenormal());
+ EXPECT_TRUE(test.bitwiseIsEqual(expected));
+ EXPECT_TRUE(test.isSmallestNormalized());
EXPECT_FALSE(test.isNegative());
EXPECT_TRUE(test.isFiniteNonZero());
EXPECT_FALSE(test.isDenormal());
@@ -2034,7 +2050,9 @@ TEST(APFloatTest, getZero) {
{&APFloat::Float6E3M2FN(), false, true, {0, 0}, 1},
{&APFloat::Float6E3M2FN(), true, true, {0x20ULL, 0}, 1},
{&APFloat::Float6E2M3FN(), false, true, {0, 0}, 1},
- {&APFloat::Float6E2M3FN(), true, true, {0x20ULL, 0}, 1}};
+ {&APFloat::Float6E2M3FN(), true, true, {0x20ULL, 0}, 1},
+ {&APFloat::Float4E2M1FN(), false, true, {0, 0}, 1},
+ {&APFloat::Float4E2M1FN(), true, true, {0x8ULL, 0}, 1}};
const unsigned NumGetZeroTests = std::size(GetZeroTest);
for (unsigned i = 0; i < NumGetZeroTests; ++i) {
APFloat test = APFloat::getZero(*GetZeroTest[i].semantics,
@@ -5283,6 +5301,89 @@ TEST(APFloatTest, Float6ExhaustivePair) {
}
}
+TEST(APFloatTest, Float4ExhaustivePair) {
+ // Test each pair of 4-bit floats with non-standard semantics
+ for (APFloat::Semantics Sem : {APFloat::S_Float4E2M1FN}) {
+ const llvm::fltSemantics &S = APFloat::EnumToSemantics(Sem);
+ for (int i = 0; i < 16; i++) {
+ for (int j = 0; j < 16; j++) {
+ SCOPED_TRACE("sem=" + std::to_string(Sem) + ",i=" + std::to_string(i) +
+ ",j=" + std::to_string(j));
+ APFloat x(S, APInt(4, i));
+ APFloat y(S, APInt(4, j));
+
+ bool losesInfo;
+ APFloat x16 = x;
+ x16.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven,
+ &losesInfo);
+ EXPECT_FALSE(losesInfo);
+ APFloat y16 = y;
+ y16.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven,
+ &losesInfo);
+ EXPECT_FALSE(losesInfo);
+
+ // Add
+ APFloat z = x;
+ z.add(y, APFloat::rmNearestTiesToEven);
+ APFloat z16 = x16;
+ z16.add(y16, APFloat::rmNearestTiesToEven);
+ z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+ EXPECT_TRUE(z.bitwiseIsEqual(z16))
+ << "sem=" << Sem << ", i=" << i << ", j=" << j;
+
+ // Subtract
+ z = x;
+ z.subtract(y, APFloat::rmNearestTiesToEven);
+ z16 = x16;
+ z16.subtract(y16, APFloat::rmNearestTiesToEven);
+ z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+ EXPECT_TRUE(z.bitwiseIsEqual(z16))
+ << "sem=" << Sem << ", i=" << i << ", j=" << j;
+
+ // Multiply
+ z = x;
+ z.multiply(y, APFloat::rmNearestTiesToEven);
+ z16 = x16;
+ z16.multiply(y16, APFloat::rmNearestTiesToEven);
+ z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+ EXPECT_TRUE(z.bitwiseIsEqual(z16))
+ << "sem=" << Sem << ", i=" << i << ", j=" << j;
+
+ // Skip divide by 0
+ if (j == 0 || j == 8)
+ continue;
+
+ // Divide
+ z = x;
+ z.divide(y, APFloat::rmNearestTiesToEven);
+ z16 = x16;
+ z16.divide(y16, APFloat::rmNearestTiesToEven);
+ z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+ EXPECT_TRUE(z.bitwiseIsEqual(z16))
+ << "sem=" << Sem << ", i=" << i << ", j=" << j;
+
+ // Mod
+ z = x;
+ z.mod(y);
+ z16 = x16;
+ z16.mod(y16);
+ z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+ EXPECT_TRUE(z.bitwiseIsEqual(z16))
+ << "sem=" << Sem << ", i=" << i << ", j=" << j;
+
+ // Remainder
+ z = x;
+ z.remainder(y);
+ z16 = x16;
+ z16.remainder(y16);
+ z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+ EXPECT_TRUE(z.bitwiseIsEqual(z16))
+ << "sem=" << Sem << ", i=" << i << ", j=" << j;
+ }
+ }
+ }
+}
+
TEST(APFloatTest, ConvertE4M3FNToE5M2) {
bool losesInfo;
APFloat test(APFloat::Float8E4M3FN(), "1.0");
@@ -6743,7 +6844,7 @@ TEST(APFloatTest, getExactLog2) {
EXPECT_EQ(INT_MIN, APFloat(Semantics, "3.0").getExactLog2Abs());
EXPECT_EQ(INT_MIN, APFloat(Semantics, "-3.0").getExactLog2Abs());
- if (I == APFloat::S_Float6E2M3FN) {
+ if (I == APFloat::S_Float6E2M3FN || I == APFloat::S_Float4E2M1FN) {
EXPECT_EQ(2, APFloat(Semantics, "4.0").getExactLog2());
EXPECT_EQ(INT_MIN, APFloat(Semantics, "-4.0").getExactLog2());
EXPECT_EQ(2, APFloat(Semantics, "4.0").getExactLog2Abs());
@@ -6831,6 +6932,25 @@ TEST(APFloatTest, Float6E2M3FNFromString) {
EXPECT_TRUE(APFloat(APFloat::Float6E2M3FN(), "-0").isNegZero());
}
+// Checks APFloat string parsing for Float4E2M1FN (4-bit S1E2M1 format with no
+// Inf/NaN encodings): in-range values parse exactly, out-of-range magnitudes
+// saturate to the format's maximum (6.0), and "inf"/"nan" spellings assert.
+TEST(APFloatTest, Float4E2M1FNFromString) {
+ // Exactly representable
+ EXPECT_EQ(6, APFloat(APFloat::Float4E2M1FN(), "6").convertToDouble());
+ // Round down to maximum value
+ EXPECT_EQ(6, APFloat(APFloat::Float4E2M1FN(), "32").convertToDouble());
+
+#ifdef GTEST_HAS_DEATH_TEST
+#ifndef NDEBUG
+ // The format reserves no bit patterns for Inf or NaN, so constructing them
+ // must trip the assertion. Death tests only fire in asserts (!NDEBUG) builds.
+ EXPECT_DEATH(APFloat(APFloat::Float4E2M1FN(), "inf"),
+ "This floating point format does not support Inf");
+ EXPECT_DEATH(APFloat(APFloat::Float4E2M1FN(), "nan"),
+ "This floating point format does not support NaN");
+#endif
+#endif
+
+ // Both signed zeros are representable and must keep their sign when parsed.
+ EXPECT_TRUE(APFloat(APFloat::Float4E2M1FN(), "0").isPosZero());
+ EXPECT_TRUE(APFloat(APFloat::Float4E2M1FN(), "-0").isNegZero());
+}
+
TEST(APFloatTest, ConvertE3M2FToE2M3F) {
bool losesInfo;
APFloat test(APFloat::Float6E3M2FN(), "1.0");
@@ -6857,6 +6977,7 @@ TEST(APFloatTest, ConvertE3M2FToE2M3F) {
EXPECT_EQ(status, APFloat::opInexact);
// Test underflow
+ losesInfo = false;
test = APFloat(APFloat::Float6E3M2FN(), ".0625");
status = test.convert(APFloat::Float6E2M3FN(), APFloat::rmNearestTiesToEven,
&losesInfo);
@@ -6907,6 +7028,42 @@ TEST(APFloatTest, ConvertE2M3FToE3M2F) {
EXPECT_EQ(status, APFloat::opInexact);
}
+// Conversion from IEEE double to Float4E2M1FN: representable values convert
+// losslessly (opOK), values above the max normal (6.0) saturate with
+// opInexact since there is no Inf, and values too small for the smallest
+// subnormal (0.5) flush toward zero with opUnderflow | opInexact.
+TEST(APFloatTest, ConvertDoubleToE2M1F) {
+ bool losesInfo;
+ // 1.0 is exactly representable in E2M1, so the conversion is exact.
+ APFloat test(APFloat::IEEEdouble(), "1.0");
+ APFloat::opStatus status = test.convert(
+ APFloat::Float4E2M1FN(), APFloat::rmNearestTiesToEven, &losesInfo);
+ EXPECT_EQ(1.0, test.convertToDouble());
+ EXPECT_FALSE(losesInfo);
+ EXPECT_EQ(status, APFloat::opOK);
+
+ // Zero converts exactly as well.
+ test = APFloat(APFloat::IEEEdouble(), "0.0");
+ status = test.convert(APFloat::Float4E2M1FN(), APFloat::rmNearestTiesToEven,
+ &losesInfo);
+ EXPECT_EQ(0.0f, test.convertToDouble());
+ EXPECT_FALSE(losesInfo);
+ EXPECT_EQ(status, APFloat::opOK);
+
+ // Test overflow
+ // 8 exceeds the max magnitude 6.0; with no Inf available the result
+ // saturates to the largest finite value and reports only opInexact.
+ losesInfo = false;
+ test = APFloat(APFloat::IEEEdouble(), "8");
+ status = test.convert(APFloat::Float4E2M1FN(), APFloat::rmNearestTiesToEven,
+ &losesInfo);
+ EXPECT_EQ(6, test.convertToDouble());
+ EXPECT_TRUE(losesInfo);
+ EXPECT_EQ(status, APFloat::opInexact);
+
+ // Test underflow
+ // 0.25 is below the smallest subnormal (0.5); under round-nearest-ties-to-
+ // even it rounds to zero, so the result must not be a denormal.
+ losesInfo = false;
+ test = APFloat(APFloat::IEEEdouble(), "0.25");
+ status = test.convert(APFloat::Float4E2M1FN(), APFloat::rmNearestTiesToEven,
+ &losesInfo);
+ EXPECT_EQ(0., test.convertToDouble());
+ EXPECT_TRUE(losesInfo);
+ EXPECT_FALSE(test.isDenormal());
+ EXPECT_EQ(status, APFloat::opUnderflow | APFloat::opInexact);
+}
+
TEST(APFloatTest, Float6E3M2FNNext) {
APFloat test(APFloat::Float6E3M2FN(), APFloat::uninitialized);
APFloat expected(APFloat::Float6E3M2FN(), APFloat::uninitialized);
@@ -6983,6 +7140,44 @@ TEST(APFloatTest, Float6E2M3FNNext) {
EXPECT_TRUE(test.bitwiseIsEqual(expected));
}
+// Exercises APFloat::next() (nextUp when the argument is false, nextDown when
+// true) at the Float4E2M1FN range boundaries. Because the format has no Inf,
+// stepping past the largest magnitude saturates in place instead of
+// producing an infinity.
+TEST(APFloatTest, Float4E2M1FNNext) {
+ APFloat test(APFloat::Float4E2M1FN(), APFloat::uninitialized);
+ APFloat expected(APFloat::Float4E2M1FN(), APFloat::uninitialized);
+
+ // 1. NextUp of largest bit pattern is the same
+ test = APFloat::getLargest(APFloat::Float4E2M1FN());
+ expected = APFloat::getLargest(APFloat::Float4E2M1FN());
+ EXPECT_EQ(test.next(false), APFloat::opOK);
+ EXPECT_FALSE(test.isInfinity());
+ EXPECT_FALSE(test.isZero());
+ EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+ // 2. NextUp of smallest negative denormal is -0
+ test = APFloat::getSmallest(APFloat::Float4E2M1FN(), true);
+ expected = APFloat::getZero(APFloat::Float4E2M1FN(), true);
+ EXPECT_EQ(test.next(false), APFloat::opOK);
+ EXPECT_TRUE(test.isNegZero());
+ EXPECT_FALSE(test.isPosZero());
+ EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+ // 3. nextDown of negative of largest value is the same
+ // (mirror of case 1: saturation at the negative end of the range).
+ test = APFloat::getLargest(APFloat::Float4E2M1FN(), true);
+ expected = test;
+ EXPECT_EQ(test.next(true), APFloat::opOK);
+ EXPECT_FALSE(test.isInfinity());
+ EXPECT_FALSE(test.isZero());
+ EXPECT_FALSE(test.isNaN());
+ EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+ // 4. nextDown of +0 is smallest negative denormal
+ // (crosses zero into the negative subnormal range).
+ test = APFloat::getZero(APFloat::Float4E2M1FN(), false);
+ expected = APFloat::getSmallest(APFloat::Float4E2M1FN(), true);
+ EXPECT_EQ(test.next(true), APFloat::opOK);
+ EXPECT_FALSE(test.isZero());
+ EXPECT_TRUE(test.isDenormal());
+ EXPECT_TRUE(test.bitwiseIsEqual(expected));
+}
+
#ifdef GTEST_HAS_DEATH_TEST
#ifndef NDEBUG
TEST(APFloatTest, Float6E3M2FNGetInfNaN) {
@@ -6998,6 +7193,13 @@ TEST(APFloatTest, Float6E2M3FNGetInfNaN) {
EXPECT_DEATH(APFloat::getNaN(APFloat::Float6E2M3FN()),
"This floating point format does not support NaN");
}
+
+// Requesting Inf or NaN for Float4E2M1FN must assert, since the format
+// encodes neither value. Relies on the surrounding GTEST_HAS_DEATH_TEST /
+// !NDEBUG guards, so it only runs in asserts builds with death-test support.
+TEST(APFloatTest, Float4E2M1FNGetInfNaN) {
+ EXPECT_DEATH(APFloat::getInf(APFloat::Float4E2M1FN()),
+ "This floating point format does not support Inf");
+ EXPECT_DEATH(APFloat::getNaN(APFloat::Float4E2M1FN()),
+ "This floating point format does not support NaN");
+}
#endif
#endif
@@ -7043,6 +7245,27 @@ TEST(APFloatTest, Float6E2M3FNToDouble) {
EXPECT_EQ(0x0.2p0, SmallestDenorm.convertToDouble());
}
+// Pins the Float4E2M1FN characteristic values through convertToDouble():
+// largest = +/-6.0, smallest normalized = +/-1.0 (0x1p0), and the smallest
+// (subnormal) value = 0.5 (0x0.8p0).
+TEST(APFloatTest, Float4E2M1FNToDouble) {
+ APFloat One(APFloat::Float4E2M1FN(), "1.0");
+ EXPECT_EQ(1.0, One.convertToDouble());
+ APFloat Two(APFloat::Float4E2M1FN(), "2.0");
+ EXPECT_EQ(2.0, Two.convertToDouble());
+ // Largest finite magnitude in either sign is 6.0.
+ APFloat PosLargest = APFloat::getLargest(APFloat::Float4E2M1FN(), false);
+ EXPECT_EQ(6, PosLargest.convertToDouble());
+ APFloat NegLargest = APFloat::getLargest(APFloat::Float4E2M1FN(), true);
+ EXPECT_EQ(-6, NegLargest.convertToDouble());
+ // Smallest normalized magnitude is 1.0 (hex-float 0x1p0).
+ APFloat PosSmallest =
+ APFloat::getSmallestNormalized(APFloat::Float4E2M1FN(), false);
+ EXPECT_EQ(0x1p0, PosSmallest.convertToDouble());
+ APFloat NegSmallest =
+ APFloat::getSmallestNormalized(APFloat::Float4E2M1FN(), true);
+ EXPECT_EQ(-0x1p0, NegSmallest.convertToDouble());
+
+ // Smallest representable value overall is the subnormal 0.5 (0x0.8p0).
+ APFloat SmallestDenorm = APFloat::getSmallest(APFloat::Float4E2M1FN(), false);
+ EXPECT_TRUE(SmallestDenorm.isDenormal());
+ EXPECT_EQ(0x0.8p0, SmallestDenorm.convertToDouble());
+}
+
TEST(APFloatTest, Float6E3M2FNToFloat) {
APFloat PosZero = APFloat::getZero(APFloat::Float6E3M2FN());
APFloat PosZeroToFloat(PosZero.convertToFloat());
@@ -7100,4 +7323,33 @@ TEST(APFloatTest, Float6E2M3FNToFloat) {
EXPECT_TRUE(SmallestDenorm.isDenormal());
EXPECT_EQ(0x0.2p0, SmallestDenorm.convertToFloat());
}
+
+TEST(APFloatTest, Float4E2M1FNToFloat) {
+ APFloat PosZero = APFloat::getZero(APFloat::Float4E2M1FN());
+ APFloat PosZeroToFloat(PosZero.convertToFloat());
+ EXPECT_TRUE(PosZeroToFloat.isPosZero());
+ APFloat NegZero = APFloat::getZero(APFloat::Float4E2M1FN(), true);
+ APFloat NegZeroToFloat(NegZero.convertToFloat());
+ EXPECT_TRUE(NegZeroToFloat.isNegZero());
+
+ APFloat One(APFloat::Float4E2M1FN(), "1.0");
+ EXPECT_EQ(1.0F, One.convertToFloat());
+ APFloat Two(APFloat::Float4E2M1FN(), "2.0");
+ EXPECT_EQ(2.0F, Two.convertToFloat());
+
+ APFloat PosLargest = APFloat::getLargest(APFloat::Float4E2M1FN(), false);
+ EXPECT_EQ(6, PosLargest.convertToFloat());
+ APFloat NegLargest = APFloat::getLargest(APFloat::Float4E2M1FN(), true);
+ EXPECT_EQ(-6, NegLargest.convertToFloat());
+ APFloat PosSmallest =
+ APFloat::getSmallestNormalized(APFloat::Float4E2M1FN(), false);
+ EXPECT_EQ(0x1p0, PosSmallest.convertToFloat());
+ APFloat NegSmallest =
+ APFloat::getSmallestNormalized(APFloat::Float4E2M1FN(), true);
+ EXPECT_EQ(-0x1p0, NegSmallest.convertToFloat());
+
+ APFloat SmallestDenorm = APFloat::getSmallest(APFloat::Float4E2M1FN(), false);
+ EXPECT_TRUE(SmallestDenorm.isDenormal());
+ EXPECT_EQ(...
[truncated]
|
@ThomasRaoux, could you please help review this change?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks!
This patch adds APFloat type support for the E2M1 FP4 datatype. The definitions for this format are detailed in section 5.3.3 of the OCP specification, which can be accessed here: https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf Signed-off-by: Durgadoss R <durgadossr@nvidia.com>
db104d4
to
af17388
Compare
This patch adds APFloat type support for the E2M1 FP4 datatype. The definitions for this format are detailed in section 5.3.3 of the OCP specification, which can be accessed here: https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf Signed-off-by: Durgadoss R <durgadossr@nvidia.com>
This PR adds `f4E2M1FN` type to mlir. `f4E2M1FN` type is proposed in [OpenCompute MX Specification](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf). It defines a 4-bit floating point number with bit layout S1E2M1. Unlike IEEE-754 types, there are no infinity or NaN values. ```c f4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 ``` Related PRs: - [PR-95392](#95392) [APFloat] Add APFloat support for FP4 data type - [PR-105573](#105573) [MLIR] Add f6E3M2FN type - was used as a template for this PR - [PR-107999](#107999) [MLIR] Add f6E2M3FN type
…U) (#2581) This is a proposal to add MX (microscaling) floating point types to StableHLO. Related links: - StableHLO [PR#2582](#2582) Add MX floating point types (f4E2M1FN, f6E2M3FN, f6E3M2FN, f8E8M0FNU) - LLVM [PR#95392](llvm/llvm-project#95392) [APFloat] Add APFloat support for FP4 data type - LLVM [PR#94735](llvm/llvm-project#94735) [APFloat] Add APFloat support for FP6 data types - LLVM [PR#107127](llvm/llvm-project#107127) [APFloat] Add APFloat support for E8M0 type - LLVM [PR#108877](llvm/llvm-project#108877) [MLIR] Add f4E2M1FN type - LLVM [PR#107999](llvm/llvm-project#107999) [MLIR] Add f6E2M3FN type - LLVM [PR#105573](llvm/llvm-project#105573) [MLIR] Add f6E3M2FN type - LLVM [PR#111028](llvm/llvm-project#111028) [MLIR] Add f8E8M0FNU type - JAX-ML [PR#181](jax-ml/ml_dtypes#181) Add sub-byte data types: float4_e2m1fn, float6_e2m3fn, float6_e3m2fn - JAX-ML [PR#166](jax-ml/ml_dtypes#181) Add float8_e8m0_fnu (E8M0) OCP MX scale format
Imported from GitHub PR openxla/xla#19096 This PR adds F4E2M1FN primitive type (4-bit float with 2 bits exponent and 1 bit mantissa), F8E8M0FNU primitive type (8-bit float with 8 bits exponent, no mantissa and no sign) and enables loads/stores in the same way S4/U4 type is implemented. This will enable using microscaling (MX) formats ([RFC](openxla/xla#18085)), such as MXFP4. ```c F4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 F8E8M0FNU - Exponent bias: 127 - Maximum stored exponent value: 254 (binary 1111'1110) - Maximum unbiased exponent value: 254 - 127 = 127 - Minimum stored exponent value: 0 (binary 0000'0000) - Minimum unbiased exponent value: 0 − 127 = -127 - Doesn't have zero - Doesn't have infinity - NaN is encoded as binary 1111'1111 Additional details: - Zeros cannot be represented - Negative values cannot be represented - Mantissa is always 1 ``` Related PRs: - openxla/stablehlo#2582 - jax-ml/ml_dtypes#181 - llvm/llvm-project#95392 - llvm/llvm-project#108877 - jax-ml/ml_dtypes#166 - llvm/llvm-project#107127 - llvm/llvm-project#111028 The PR is split into multiple commits just to make the review easier, it is possible that some tests could fail if only some (i.e. not all) of these commits are applied. 
Copybara import of the project: -- fa539fbde987ff6421fd2937fade495baf633630 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: import mxfloat.h -- 2c014035923e0394b2cfcb81eaf090a96621b0aa by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: primitive type -- e919ed54e825f2e905aaf0cc279dd21cd80f1ce9 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: literal support -- ca16839096feb93e0454ec380c5c707c30199346 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: conversion codegen -- eedc079ca9a4db9e611d84877a25b3da21386f16 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: python interface -- 8e0305cd47002f0c1f8668a3cbcbce5428f2a4c6 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: FFI -- aabe9c68d964609f78f29e17ee0680798ad0c6ac by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: HLO evaluator -- 87da2ebfab388f113482e852009401a9e416974a by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: add tests -- e0ee48c3a37018ba985c850931592d62eadf7c2e by Sergey Kozub <skozub@nvidia.com>: Add F8E8M0FNU type -- be2e457922e2cddeaf5aca13dd022f3ac2a1393b by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments Merging this change closes #19096 FUTURE_COPYBARA_INTEGRATE_REVIEW=openxla/xla#19096 from openxla:skozub/e2m1 be2e457922e2cddeaf5aca13dd022f3ac2a1393b PiperOrigin-RevId: 702273510
Imported from GitHub PR #19096 This PR adds F4E2M1FN primitive type (4-bit float with 2 bits exponent and 1 bit mantissa), F8E8M0FNU primitive type (8-bit float with 8 bits exponent, no mantissa and no sign) and enables loads/stores in the same way S4/U4 type is implemented. This will enable using microscaling (MX) formats ([RFC](#18085)), such as MXFP4. ```c F4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 F8E8M0FNU - Exponent bias: 127 - Maximum stored exponent value: 254 (binary 1111'1110) - Maximum unbiased exponent value: 254 - 127 = 127 - Minimum stored exponent value: 0 (binary 0000'0000) - Minimum unbiased exponent value: 0 − 127 = -127 - Doesn't have zero - Doesn't have infinity - NaN is encoded as binary 1111'1111 Additional details: - Zeros cannot be represented - Negative values cannot be represented - Mantissa is always 1 ``` Related PRs: - openxla/stablehlo#2582 - jax-ml/ml_dtypes#181 - llvm/llvm-project#95392 - llvm/llvm-project#108877 - jax-ml/ml_dtypes#166 - llvm/llvm-project#107127 - llvm/llvm-project#111028 The PR is split into multiple commits just to make the review easier, it is possible that some tests could fail if only some (i.e. not all) of these commits are applied. 
Copybara import of the project: -- fa539fb by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: import mxfloat.h -- 2c01403 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: primitive type -- e919ed5 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: literal support -- ca16839 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: conversion codegen -- eedc079 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: python interface -- 8e0305c by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: FFI -- aabe9c6 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: HLO evaluator -- 87da2eb by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: add tests -- e0ee48c by Sergey Kozub <skozub@nvidia.com>: Add F8E8M0FNU type -- be2e457 by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments Merging this change closes #19096 FUTURE_COPYBARA_INTEGRATE_REVIEW=#19096 from openxla:skozub/e2m1 be2e457 PiperOrigin-RevId: 702273510
Imported from GitHub PR openxla/xla#19096 This PR adds F4E2M1FN primitive type (4-bit float with 2 bits exponent and 1 bit mantissa), F8E8M0FNU primitive type (8-bit float with 8 bits exponent, no mantissa and no sign) and enables loads/stores in the same way S4/U4 type is implemented. This will enable using microscaling (MX) formats ([RFC](openxla/xla#18085)), such as MXFP4. ```c F4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 F8E8M0FNU - Exponent bias: 127 - Maximum stored exponent value: 254 (binary 1111'1110) - Maximum unbiased exponent value: 254 - 127 = 127 - Minimum stored exponent value: 0 (binary 0000'0000) - Minimum unbiased exponent value: 0 − 127 = -127 - Doesn't have zero - Doesn't have infinity - NaN is encoded as binary 1111'1111 Additional details: - Zeros cannot be represented - Negative values cannot be represented - Mantissa is always 1 ``` Related PRs: - openxla/stablehlo#2582 - jax-ml/ml_dtypes#181 - llvm/llvm-project#95392 - llvm/llvm-project#108877 - jax-ml/ml_dtypes#166 - llvm/llvm-project#107127 - llvm/llvm-project#111028 The PR is split into multiple commits just to make the review easier, it is possible that some tests could fail if only some (i.e. not all) of these commits are applied. 
Copybara import of the project: -- f493e4803eaa5ff3da3ceb130e9348c014b4a2e8 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: import mxfloat.h -- 87d005630b310a355d7c30b22828c35237373f17 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: primitive type -- 70ca82093faeec98f2dc5e8b82f617d99ca96849 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: literal support -- c479f0940da490e9668e2f48e14a7466f0c4a97f by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: conversion codegen -- daaa3af3ce3af456f2ef44dbc291ebeb09e86d9b by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: python interface -- 1f0e19ff14733eff790726936b68ef0cf607a766 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: FFI -- 999bf96092e57c7b3039811f2887281f347ff17a by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: HLO evaluator -- d7d5af74c5f8a94522779a121c0a4a962156fb64 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: add tests -- 9e8c7bc02849f241d0f05941221d99f1d08d9e67 by Sergey Kozub <skozub@nvidia.com>: Add F8E8M0FNU type -- 1e344174b931cea4978770ab740dfed67186c2f4 by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments -- d4de0a369d9dc853f34f3cf3bf7dcc5a47502106 by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments (round 2) Merging this change closes #19096 FUTURE_COPYBARA_INTEGRATE_REVIEW=openxla/xla#19096 from openxla:skozub/e2m1 d4de0a369d9dc853f34f3cf3bf7dcc5a47502106 PiperOrigin-RevId: 707638099
Imported from GitHub PR #19096 This PR adds F4E2M1FN primitive type (4-bit float with 2 bits exponent and 1 bit mantissa), F8E8M0FNU primitive type (8-bit float with 8 bits exponent, no mantissa and no sign) and enables loads/stores in the same way S4/U4 type is implemented. This will enable using microscaling (MX) formats ([RFC](#18085)), such as MXFP4. ```c F4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 F8E8M0FNU - Exponent bias: 127 - Maximum stored exponent value: 254 (binary 1111'1110) - Maximum unbiased exponent value: 254 - 127 = 127 - Minimum stored exponent value: 0 (binary 0000'0000) - Minimum unbiased exponent value: 0 − 127 = -127 - Doesn't have zero - Doesn't have infinity - NaN is encoded as binary 1111'1111 Additional details: - Zeros cannot be represented - Negative values cannot be represented - Mantissa is always 1 ``` Related PRs: - openxla/stablehlo#2582 - jax-ml/ml_dtypes#181 - llvm/llvm-project#95392 - llvm/llvm-project#108877 - jax-ml/ml_dtypes#166 - llvm/llvm-project#107127 - llvm/llvm-project#111028 The PR is split into multiple commits just to make the review easier, it is possible that some tests could fail if only some (i.e. not all) of these commits are applied. 
Copybara import of the project: -- f493e48 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: import mxfloat.h -- 87d0056 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: primitive type -- 70ca820 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: literal support -- c479f09 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: conversion codegen -- daaa3af by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: python interface -- 1f0e19f by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: FFI -- 999bf96 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: HLO evaluator -- d7d5af7 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: add tests -- 9e8c7bc by Sergey Kozub <skozub@nvidia.com>: Add F8E8M0FNU type -- 1e34417 by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments -- d4de0a3 by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments (round 2) Merging this change closes #19096 FUTURE_COPYBARA_INTEGRATE_REVIEW=#19096 from openxla:skozub/e2m1 d4de0a3 PiperOrigin-RevId: 707638099
Imported from GitHub PR openxla/xla#19096 This PR adds F4E2M1FN primitive type (4-bit float with 2 bits exponent and 1 bit mantissa), F8E8M0FNU primitive type (8-bit float with 8 bits exponent, no mantissa and no sign) and enables loads/stores in the same way S4/U4 type is implemented. This will enable using microscaling (MX) formats ([RFC](openxla/xla#18085)), such as MXFP4. ```c F4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 F8E8M0FNU - Exponent bias: 127 - Maximum stored exponent value: 254 (binary 1111'1110) - Maximum unbiased exponent value: 254 - 127 = 127 - Minimum stored exponent value: 0 (binary 0000'0000) - Minimum unbiased exponent value: 0 − 127 = -127 - Doesn't have zero - Doesn't have infinity - NaN is encoded as binary 1111'1111 Additional details: - Zeros cannot be represented - Negative values cannot be represented - Mantissa is always 1 ``` Related PRs: - openxla/stablehlo#2582 - jax-ml/ml_dtypes#181 - llvm/llvm-project#95392 - llvm/llvm-project#108877 - jax-ml/ml_dtypes#166 - llvm/llvm-project#107127 - llvm/llvm-project#111028 The PR is split into multiple commits just to make the review easier, it is possible that some tests could fail if only some (i.e. not all) of these commits are applied. 
Copybara import of the project: -- f493e4803eaa5ff3da3ceb130e9348c014b4a2e8 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: import mxfloat.h -- 87d005630b310a355d7c30b22828c35237373f17 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: primitive type -- 70ca82093faeec98f2dc5e8b82f617d99ca96849 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: literal support -- c479f0940da490e9668e2f48e14a7466f0c4a97f by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: conversion codegen -- daaa3af3ce3af456f2ef44dbc291ebeb09e86d9b by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: python interface -- 1f0e19ff14733eff790726936b68ef0cf607a766 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: FFI -- 999bf96092e57c7b3039811f2887281f347ff17a by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: HLO evaluator -- d7d5af74c5f8a94522779a121c0a4a962156fb64 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: add tests -- 9e8c7bc02849f241d0f05941221d99f1d08d9e67 by Sergey Kozub <skozub@nvidia.com>: Add F8E8M0FNU type -- 1e344174b931cea4978770ab740dfed67186c2f4 by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments -- d4de0a369d9dc853f34f3cf3bf7dcc5a47502106 by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments (round 2) Merging this change closes #19096 FUTURE_COPYBARA_INTEGRATE_REVIEW=openxla/xla#19096 from openxla:skozub/e2m1 d4de0a369d9dc853f34f3cf3bf7dcc5a47502106 PiperOrigin-RevId: 707638099
Imported from GitHub PR #19096 This PR adds F4E2M1FN primitive type (4-bit float with 2 bits exponent and 1 bit mantissa), F8E8M0FNU primitive type (8-bit float with 8 bits exponent, no mantissa and no sign) and enables loads/stores in the same way S4/U4 type is implemented. This will enable using microscaling (MX) formats ([RFC](#18085)), such as MXFP4. ```c F4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 F8E8M0FNU - Exponent bias: 127 - Maximum stored exponent value: 254 (binary 1111'1110) - Maximum unbiased exponent value: 254 - 127 = 127 - Minimum stored exponent value: 0 (binary 0000'0000) - Minimum unbiased exponent value: 0 − 127 = -127 - Doesn't have zero - Doesn't have infinity - NaN is encoded as binary 1111'1111 Additional details: - Zeros cannot be represented - Negative values cannot be represented - Mantissa is always 1 ``` Related PRs: - openxla/stablehlo#2582 - jax-ml/ml_dtypes#181 - llvm/llvm-project#95392 - llvm/llvm-project#108877 - jax-ml/ml_dtypes#166 - llvm/llvm-project#107127 - llvm/llvm-project#111028 The PR is split into multiple commits just to make the review easier, it is possible that some tests could fail if only some (i.e. not all) of these commits are applied. 
Copybara import of the project: -- f493e48 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: import mxfloat.h -- 87d0056 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: primitive type -- 70ca820 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: literal support -- c479f09 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: conversion codegen -- daaa3af by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: python interface -- 1f0e19f by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: FFI -- 999bf96 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: HLO evaluator -- d7d5af7 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: add tests -- 9e8c7bc by Sergey Kozub <skozub@nvidia.com>: Add F8E8M0FNU type -- 1e34417 by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments -- d4de0a3 by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments (round 2) Merging this change closes #19096 FUTURE_COPYBARA_INTEGRATE_REVIEW=#19096 from openxla:skozub/e2m1 d4de0a3 PiperOrigin-RevId: 707638099
Imported from GitHub PR #19096 This PR adds F4E2M1FN primitive type (4-bit float with 2 bits exponent and 1 bit mantissa), F8E8M0FNU primitive type (8-bit float with 8 bits exponent, no mantissa and no sign) and enables loads/stores in the same way S4/U4 type is implemented. This will enable using microscaling (MX) formats ([RFC](#18085)), such as MXFP4. ```c F4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 F8E8M0FNU - Exponent bias: 127 - Maximum stored exponent value: 254 (binary 1111'1110) - Maximum unbiased exponent value: 254 - 127 = 127 - Minimum stored exponent value: 0 (binary 0000'0000) - Minimum unbiased exponent value: 0 − 127 = -127 - Doesn't have zero - Doesn't have infinity - NaN is encoded as binary 1111'1111 Additional details: - Zeros cannot be represented - Negative values cannot be represented - Mantissa is always 1 ``` Related PRs: - openxla/stablehlo#2582 - jax-ml/ml_dtypes#181 - llvm/llvm-project#95392 - llvm/llvm-project#108877 - jax-ml/ml_dtypes#166 - llvm/llvm-project#107127 - llvm/llvm-project#111028 The PR is split into multiple commits just to make the review easier, it is possible that some tests could fail if only some (i.e. not all) of these commits are applied. 
Copybara import of the project: -- f493e48 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: import mxfloat.h -- 87d0056 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: primitive type -- 70ca820 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: literal support -- c479f09 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: conversion codegen -- daaa3af by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: python interface -- 1f0e19f by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: FFI -- 999bf96 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: HLO evaluator -- d7d5af7 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: add tests -- 9e8c7bc by Sergey Kozub <skozub@nvidia.com>: Add F8E8M0FNU type -- 1e34417 by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments -- d4de0a3 by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments (round 2) Merging this change closes #19096 FUTURE_COPYBARA_INTEGRATE_REVIEW=#19096 from openxla:skozub/e2m1 d4de0a3 PiperOrigin-RevId: 707638099
Imported from GitHub PR openxla/xla#19096 This PR adds F4E2M1FN primitive type (4-bit float with 2 bits exponent and 1 bit mantissa), F8E8M0FNU primitive type (8-bit float with 8 bits exponent, no mantissa and no sign) and enables loads/stores in the same way S4/U4 type is implemented. This will enable using microscaling (MX) formats ([RFC](openxla/xla#18085)), such as MXFP4. ```c F4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 F8E8M0FNU - Exponent bias: 127 - Maximum stored exponent value: 254 (binary 1111'1110) - Maximum unbiased exponent value: 254 - 127 = 127 - Minimum stored exponent value: 0 (binary 0000'0000) - Minimum unbiased exponent value: 0 − 127 = -127 - Doesn't have zero - Doesn't have infinity - NaN is encoded as binary 1111'1111 Additional details: - Zeros cannot be represented - Negative values cannot be represented - Mantissa is always 1 ``` Related PRs: - openxla/stablehlo#2582 - jax-ml/ml_dtypes#181 - llvm/llvm-project#95392 - llvm/llvm-project#108877 - jax-ml/ml_dtypes#166 - llvm/llvm-project#107127 - llvm/llvm-project#111028 The PR is split into multiple commits just to make the review easier, it is possible that some tests could fail if only some (i.e. not all) of these commits are applied. 
Copybara import of the project: -- f493e4803eaa5ff3da3ceb130e9348c014b4a2e8 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: import mxfloat.h -- 87d005630b310a355d7c30b22828c35237373f17 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: primitive type -- 70ca82093faeec98f2dc5e8b82f617d99ca96849 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: literal support -- c479f0940da490e9668e2f48e14a7466f0c4a97f by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: conversion codegen -- daaa3af3ce3af456f2ef44dbc291ebeb09e86d9b by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: python interface -- 1f0e19ff14733eff790726936b68ef0cf607a766 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: FFI -- 999bf96092e57c7b3039811f2887281f347ff17a by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: HLO evaluator -- d7d5af74c5f8a94522779a121c0a4a962156fb64 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: add tests -- 9e8c7bc02849f241d0f05941221d99f1d08d9e67 by Sergey Kozub <skozub@nvidia.com>: Add F8E8M0FNU type -- 1e344174b931cea4978770ab740dfed67186c2f4 by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments -- d4de0a369d9dc853f34f3cf3bf7dcc5a47502106 by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments (round 2) Merging this change closes #19096 FUTURE_COPYBARA_INTEGRATE_REVIEW=openxla/xla#19096 from openxla:skozub/e2m1 d4de0a369d9dc853f34f3cf3bf7dcc5a47502106 PiperOrigin-RevId: 707638099
Imported from GitHub PR openxla/xla#19096 This PR adds F4E2M1FN primitive type (4-bit float with 2 bits exponent and 1 bit mantissa), F8E8M0FNU primitive type (8-bit float with 8 bits exponent, no mantissa and no sign) and enables loads/stores in the same way S4/U4 type is implemented. This will enable using microscaling (MX) formats ([RFC](openxla/xla#18085)), such as MXFP4. ```c F4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 F8E8M0FNU - Exponent bias: 127 - Maximum stored exponent value: 254 (binary 1111'1110) - Maximum unbiased exponent value: 254 - 127 = 127 - Minimum stored exponent value: 0 (binary 0000'0000) - Minimum unbiased exponent value: 0 − 127 = -127 - Doesn't have zero - Doesn't have infinity - NaN is encoded as binary 1111'1111 Additional details: - Zeros cannot be represented - Negative values cannot be represented - Mantissa is always 1 ``` Related PRs: - openxla/stablehlo#2582 - jax-ml/ml_dtypes#181 - llvm/llvm-project#95392 - llvm/llvm-project#108877 - jax-ml/ml_dtypes#166 - llvm/llvm-project#107127 - llvm/llvm-project#111028 The PR is split into multiple commits just to make the review easier, it is possible that some tests could fail if only some (i.e. not all) of these commits are applied. 
Copybara import of the project: -- f493e4803eaa5ff3da3ceb130e9348c014b4a2e8 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: import mxfloat.h -- 87d005630b310a355d7c30b22828c35237373f17 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: primitive type -- 70ca82093faeec98f2dc5e8b82f617d99ca96849 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: literal support -- c479f0940da490e9668e2f48e14a7466f0c4a97f by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: conversion codegen -- daaa3af3ce3af456f2ef44dbc291ebeb09e86d9b by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: python interface -- 1f0e19ff14733eff790726936b68ef0cf607a766 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: FFI -- 999bf96092e57c7b3039811f2887281f347ff17a by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: HLO evaluator -- d7d5af74c5f8a94522779a121c0a4a962156fb64 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: add tests -- 9e8c7bc02849f241d0f05941221d99f1d08d9e67 by Sergey Kozub <skozub@nvidia.com>: Add F8E8M0FNU type -- 1e344174b931cea4978770ab740dfed67186c2f4 by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments -- d4de0a369d9dc853f34f3cf3bf7dcc5a47502106 by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments (round 2) Merging this change closes #19096 FUTURE_COPYBARA_INTEGRATE_REVIEW=openxla/xla#19096 from openxla:skozub/e2m1 d4de0a369d9dc853f34f3cf3bf7dcc5a47502106 PiperOrigin-RevId: 707638099
Imported from GitHub PR #19096 This PR adds F4E2M1FN primitive type (4-bit float with 2 bits exponent and 1 bit mantissa), F8E8M0FNU primitive type (8-bit float with 8 bits exponent, no mantissa and no sign) and enables loads/stores in the same way S4/U4 type is implemented. This will enable using microscaling (MX) formats ([RFC](#18085)), such as MXFP4. ```c F4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 F8E8M0FNU - Exponent bias: 127 - Maximum stored exponent value: 254 (binary 1111'1110) - Maximum unbiased exponent value: 254 - 127 = 127 - Minimum stored exponent value: 0 (binary 0000'0000) - Minimum unbiased exponent value: 0 − 127 = -127 - Doesn't have zero - Doesn't have infinity - NaN is encoded as binary 1111'1111 Additional details: - Zeros cannot be represented - Negative values cannot be represented - Mantissa is always 1 ``` Related PRs: - openxla/stablehlo#2582 - jax-ml/ml_dtypes#181 - llvm/llvm-project#95392 - llvm/llvm-project#108877 - jax-ml/ml_dtypes#166 - llvm/llvm-project#107127 - llvm/llvm-project#111028 The PR is split into multiple commits just to make the review easier, it is possible that some tests could fail if only some (i.e. not all) of these commits are applied. 
Copybara import of the project: -- f493e48 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: import mxfloat.h -- 87d0056 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: primitive type -- 70ca820 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: literal support -- c479f09 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: conversion codegen -- daaa3af by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: python interface -- 1f0e19f by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: FFI -- 999bf96 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: HLO evaluator -- d7d5af7 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: add tests -- 9e8c7bc by Sergey Kozub <skozub@nvidia.com>: Add F8E8M0FNU type -- 1e34417 by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments -- d4de0a3 by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments (round 2) Merging this change closes #19096 FUTURE_COPYBARA_INTEGRATE_REVIEW=#19096 from openxla:skozub/e2m1 d4de0a3 PiperOrigin-RevId: 707638099
Imported from GitHub PR openxla/xla#19096 This PR adds F4E2M1FN primitive type (4-bit float with 2 bits exponent and 1 bit mantissa), F8E8M0FNU primitive type (8-bit float with 8 bits exponent, no mantissa and no sign) and enables loads/stores in the same way S4/U4 type is implemented. This will enable using microscaling (MX) formats ([RFC](openxla/xla#18085)), such as MXFP4. ```c F4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 F8E8M0FNU - Exponent bias: 127 - Maximum stored exponent value: 254 (binary 1111'1110) - Maximum unbiased exponent value: 254 - 127 = 127 - Minimum stored exponent value: 0 (binary 0000'0000) - Minimum unbiased exponent value: 0 − 127 = -127 - Doesn't have zero - Doesn't have infinity - NaN is encoded as binary 1111'1111 Additional details: - Zeros cannot be represented - Negative values cannot be represented - Mantissa is always 1 ``` Related PRs: - openxla/stablehlo#2582 - jax-ml/ml_dtypes#181 - llvm/llvm-project#95392 - llvm/llvm-project#108877 - jax-ml/ml_dtypes#166 - llvm/llvm-project#107127 - llvm/llvm-project#111028 The PR is split into multiple commits just to make the review easier, it is possible that some tests could fail if only some (i.e. not all) of these commits are applied. 
Copybara import of the project: -- f493e4803eaa5ff3da3ceb130e9348c014b4a2e8 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: import mxfloat.h -- 87d005630b310a355d7c30b22828c35237373f17 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: primitive type -- 70ca82093faeec98f2dc5e8b82f617d99ca96849 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: literal support -- c479f0940da490e9668e2f48e14a7466f0c4a97f by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: conversion codegen -- daaa3af3ce3af456f2ef44dbc291ebeb09e86d9b by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: python interface -- 1f0e19ff14733eff790726936b68ef0cf607a766 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: FFI -- 999bf96092e57c7b3039811f2887281f347ff17a by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: HLO evaluator -- d7d5af74c5f8a94522779a121c0a4a962156fb64 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: add tests -- 9e8c7bc02849f241d0f05941221d99f1d08d9e67 by Sergey Kozub <skozub@nvidia.com>: Add F8E8M0FNU type -- 1e344174b931cea4978770ab740dfed67186c2f4 by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments -- d4de0a369d9dc853f34f3cf3bf7dcc5a47502106 by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments (round 2) Merging this change closes #19096 FUTURE_COPYBARA_INTEGRATE_REVIEW=openxla/xla#19096 from openxla:skozub/e2m1 d4de0a369d9dc853f34f3cf3bf7dcc5a47502106 PiperOrigin-RevId: 707638099
Imported from GitHub PR openxla/xla#19096 This PR adds F4E2M1FN primitive type (4-bit float with 2 bits exponent and 1 bit mantissa), F8E8M0FNU primitive type (8-bit float with 8 bits exponent, no mantissa and no sign) and enables loads/stores in the same way S4/U4 type is implemented. This will enable using microscaling (MX) formats ([RFC](openxla/xla#18085)), such as MXFP4. ```c F4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 F8E8M0FNU - Exponent bias: 127 - Maximum stored exponent value: 254 (binary 1111'1110) - Maximum unbiased exponent value: 254 - 127 = 127 - Minimum stored exponent value: 0 (binary 0000'0000) - Minimum unbiased exponent value: 0 − 127 = -127 - Doesn't have zero - Doesn't have infinity - NaN is encoded as binary 1111'1111 Additional details: - Zeros cannot be represented - Negative values cannot be represented - Mantissa is always 1 ``` Related PRs: - openxla/stablehlo#2582 - #181 - llvm/llvm-project#95392 - llvm/llvm-project#108877 - #166 - llvm/llvm-project#107127 - llvm/llvm-project#111028 The PR is split into multiple commits just to make the review easier, it is possible that some tests could fail if only some (i.e. not all) of these commits are applied. 
Copybara import of the project: -- f493e4803eaa5ff3da3ceb130e9348c014b4a2e8 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: import mxfloat.h -- 87d005630b310a355d7c30b22828c35237373f17 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: primitive type -- 70ca82093faeec98f2dc5e8b82f617d99ca96849 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: literal support -- c479f0940da490e9668e2f48e14a7466f0c4a97f by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: conversion codegen -- daaa3af3ce3af456f2ef44dbc291ebeb09e86d9b by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: python interface -- 1f0e19ff14733eff790726936b68ef0cf607a766 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: FFI -- 999bf96092e57c7b3039811f2887281f347ff17a by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: HLO evaluator -- d7d5af74c5f8a94522779a121c0a4a962156fb64 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: add tests -- 9e8c7bc02849f241d0f05941221d99f1d08d9e67 by Sergey Kozub <skozub@nvidia.com>: Add F8E8M0FNU type -- 1e344174b931cea4978770ab740dfed67186c2f4 by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments -- d4de0a369d9dc853f34f3cf3bf7dcc5a47502106 by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments (round 2) Merging this change closes #19096 PiperOrigin-RevId: 708390061
Imported from GitHub PR openxla/xla#19096 This PR adds F4E2M1FN primitive type (4-bit float with 2 bits exponent and 1 bit mantissa), F8E8M0FNU primitive type (8-bit float with 8 bits exponent, no mantissa and no sign) and enables loads/stores in the same way S4/U4 type is implemented. This will enable using microscaling (MX) formats ([RFC](openxla/xla#18085)), such as MXFP4. ```c F4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 F8E8M0FNU - Exponent bias: 127 - Maximum stored exponent value: 254 (binary 1111'1110) - Maximum unbiased exponent value: 254 - 127 = 127 - Minimum stored exponent value: 0 (binary 0000'0000) - Minimum unbiased exponent value: 0 − 127 = -127 - Doesn't have zero - Doesn't have infinity - NaN is encoded as binary 1111'1111 Additional details: - Zeros cannot be represented - Negative values cannot be represented - Mantissa is always 1 ``` Related PRs: - openxla/stablehlo#2582 - jax-ml/ml_dtypes#181 - llvm/llvm-project#95392 - llvm/llvm-project#108877 - jax-ml/ml_dtypes#166 - llvm/llvm-project#107127 - llvm/llvm-project#111028 The PR is split into multiple commits just to make the review easier, it is possible that some tests could fail if only some (i.e. not all) of these commits are applied. 
Copybara import of the project: -- f493e4803eaa5ff3da3ceb130e9348c014b4a2e8 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: import mxfloat.h -- 87d005630b310a355d7c30b22828c35237373f17 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: primitive type -- 70ca82093faeec98f2dc5e8b82f617d99ca96849 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: literal support -- c479f0940da490e9668e2f48e14a7466f0c4a97f by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: conversion codegen -- daaa3af3ce3af456f2ef44dbc291ebeb09e86d9b by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: python interface -- 1f0e19ff14733eff790726936b68ef0cf607a766 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: FFI -- 999bf96092e57c7b3039811f2887281f347ff17a by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: HLO evaluator -- d7d5af74c5f8a94522779a121c0a4a962156fb64 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: add tests -- 9e8c7bc02849f241d0f05941221d99f1d08d9e67 by Sergey Kozub <skozub@nvidia.com>: Add F8E8M0FNU type -- 1e344174b931cea4978770ab740dfed67186c2f4 by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments -- d4de0a369d9dc853f34f3cf3bf7dcc5a47502106 by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments (round 2) Merging this change closes #19096 PiperOrigin-RevId: 708390061
Imported from GitHub PR openxla/xla#19096 This PR adds F4E2M1FN primitive type (4-bit float with 2 bits exponent and 1 bit mantissa), F8E8M0FNU primitive type (8-bit float with 8 bits exponent, no mantissa and no sign) and enables loads/stores in the same way S4/U4 type is implemented. This will enable using microscaling (MX) formats ([RFC](openxla/xla#18085)), such as MXFP4. ```c F4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 F8E8M0FNU - Exponent bias: 127 - Maximum stored exponent value: 254 (binary 1111'1110) - Maximum unbiased exponent value: 254 - 127 = 127 - Minimum stored exponent value: 0 (binary 0000'0000) - Minimum unbiased exponent value: 0 − 127 = -127 - Doesn't have zero - Doesn't have infinity - NaN is encoded as binary 1111'1111 Additional details: - Zeros cannot be represented - Negative values cannot be represented - Mantissa is always 1 ``` Related PRs: - openxla/stablehlo#2582 - jax-ml/ml_dtypes#181 - llvm/llvm-project#95392 - llvm/llvm-project#108877 - jax-ml/ml_dtypes#166 - llvm/llvm-project#107127 - llvm/llvm-project#111028 The PR is split into multiple commits just to make the review easier, it is possible that some tests could fail if only some (i.e. not all) of these commits are applied. 
Copybara import of the project: -- f493e4803eaa5ff3da3ceb130e9348c014b4a2e8 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: import mxfloat.h -- 87d005630b310a355d7c30b22828c35237373f17 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: primitive type -- 70ca82093faeec98f2dc5e8b82f617d99ca96849 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: literal support -- c479f0940da490e9668e2f48e14a7466f0c4a97f by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: conversion codegen -- daaa3af3ce3af456f2ef44dbc291ebeb09e86d9b by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: python interface -- 1f0e19ff14733eff790726936b68ef0cf607a766 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: FFI -- 999bf96092e57c7b3039811f2887281f347ff17a by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: HLO evaluator -- d7d5af74c5f8a94522779a121c0a4a962156fb64 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: add tests -- 9e8c7bc02849f241d0f05941221d99f1d08d9e67 by Sergey Kozub <skozub@nvidia.com>: Add F8E8M0FNU type -- 1e344174b931cea4978770ab740dfed67186c2f4 by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments -- d4de0a369d9dc853f34f3cf3bf7dcc5a47502106 by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments (round 2) Merging this change closes #19096 PiperOrigin-RevId: 708390061
Imported from GitHub PR #19096 This PR adds F4E2M1FN primitive type (4-bit float with 2 bits exponent and 1 bit mantissa), F8E8M0FNU primitive type (8-bit float with 8 bits exponent, no mantissa and no sign) and enables loads/stores in the same way S4/U4 type is implemented. This will enable using microscaling (MX) formats ([RFC](#18085)), such as MXFP4. ```c F4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 F8E8M0FNU - Exponent bias: 127 - Maximum stored exponent value: 254 (binary 1111'1110) - Maximum unbiased exponent value: 254 - 127 = 127 - Minimum stored exponent value: 0 (binary 0000'0000) - Minimum unbiased exponent value: 0 − 127 = -127 - Doesn't have zero - Doesn't have infinity - NaN is encoded as binary 1111'1111 Additional details: - Zeros cannot be represented - Negative values cannot be represented - Mantissa is always 1 ``` Related PRs: - openxla/stablehlo#2582 - jax-ml/ml_dtypes#181 - llvm/llvm-project#95392 - llvm/llvm-project#108877 - jax-ml/ml_dtypes#166 - llvm/llvm-project#107127 - llvm/llvm-project#111028 The PR is split into multiple commits just to make the review easier, it is possible that some tests could fail if only some (i.e. not all) of these commits are applied. 
Copybara import of the project: -- f493e48 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: import mxfloat.h -- 87d0056 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: primitive type -- 70ca820 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: literal support -- c479f09 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: conversion codegen -- daaa3af by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: python interface -- 1f0e19f by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: FFI -- 999bf96 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: HLO evaluator -- d7d5af7 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: add tests -- 9e8c7bc by Sergey Kozub <skozub@nvidia.com>: Add F8E8M0FNU type -- 1e34417 by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments -- d4de0a3 by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments (round 2) Merging this change closes #19096 COPYBARA_INTEGRATE_REVIEW=#19096 from openxla:skozub/e2m1 d4de0a3 PiperOrigin-RevId: 708390061
Imported from GitHub PR openxla/xla#19096 This PR adds F4E2M1FN primitive type (4-bit float with 2 bits exponent and 1 bit mantissa), F8E8M0FNU primitive type (8-bit float with 8 bits exponent, no mantissa and no sign) and enables loads/stores in the same way S4/U4 type is implemented. This will enable using microscaling (MX) formats ([RFC](openxla/xla#18085)), such as MXFP4. ```c F4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 F8E8M0FNU - Exponent bias: 127 - Maximum stored exponent value: 254 (binary 1111'1110) - Maximum unbiased exponent value: 254 - 127 = 127 - Minimum stored exponent value: 0 (binary 0000'0000) - Minimum unbiased exponent value: 0 − 127 = -127 - Doesn't have zero - Doesn't have infinity - NaN is encoded as binary 1111'1111 Additional details: - Zeros cannot be represented - Negative values cannot be represented - Mantissa is always 1 ``` Related PRs: - openxla/stablehlo#2582 - jax-ml/ml_dtypes#181 - llvm/llvm-project#95392 - llvm/llvm-project#108877 - jax-ml/ml_dtypes#166 - llvm/llvm-project#107127 - llvm/llvm-project#111028 The PR is split into multiple commits just to make the review easier, it is possible that some tests could fail if only some (i.e. not all) of these commits are applied. 
Copybara import of the project: -- f493e4803eaa5ff3da3ceb130e9348c014b4a2e8 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: import mxfloat.h -- 87d005630b310a355d7c30b22828c35237373f17 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: primitive type -- 70ca82093faeec98f2dc5e8b82f617d99ca96849 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: literal support -- c479f0940da490e9668e2f48e14a7466f0c4a97f by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: conversion codegen -- daaa3af3ce3af456f2ef44dbc291ebeb09e86d9b by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: python interface -- 1f0e19ff14733eff790726936b68ef0cf607a766 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: FFI -- 999bf96092e57c7b3039811f2887281f347ff17a by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: HLO evaluator -- d7d5af74c5f8a94522779a121c0a4a962156fb64 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN type: add tests -- 9e8c7bc02849f241d0f05941221d99f1d08d9e67 by Sergey Kozub <skozub@nvidia.com>: Add F8E8M0FNU type -- 1e344174b931cea4978770ab740dfed67186c2f4 by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments -- d4de0a369d9dc853f34f3cf3bf7dcc5a47502106 by Sergey Kozub <skozub@nvidia.com>: Addressing PR#19096 review comments (round 2) Merging this change closes #19096 PiperOrigin-RevId: 708390061
Imported from GitHub PR #21380 Previous PR #19096 was rolled back, re-trying. This PR adds F4E2M1FN primitive type (4-bit float with 2 bits exponent and 1 bit mantissa), F8E8M0FNU primitive type (8-bit float with 8 bits exponent, no mantissa and no sign) and enables loads/stores in the same way S4/U4 type is implemented. This will enable using microscaling (MX) formats ([RFC](#18085)), such as MXFP4. ```c F4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 F8E8M0FNU - Exponent bias: 127 - Maximum stored exponent value: 254 (binary 1111'1110) - Maximum unbiased exponent value: 254 - 127 = 127 - Minimum stored exponent value: 0 (binary 0000'0000) - Minimum unbiased exponent value: 0 − 127 = -127 - Doesn't have zero - Doesn't have infinity - NaN is encoded as binary 1111'1111 Additional details: - Zeros cannot be represented - Negative values cannot be represented - Mantissa is always 1 ``` Related PRs: - openxla/stablehlo#2582 - jax-ml/ml_dtypes#181 - llvm/llvm-project#95392 - llvm/llvm-project#108877 - jax-ml/ml_dtypes#166 - llvm/llvm-project#107127 - llvm/llvm-project#111028 Copybara import of the project: -- d7e00c4 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN and F8E8M0FNU types Merging this change closes #21380 FUTURE_COPYBARA_INTEGRATE_REVIEW=#21380 from openxla:skozub/e2m1_e8m0 d7e00c4 PiperOrigin-RevId: 715070992
Imported from GitHub PR openxla/xla#21380 Previous PR openxla/xla#19096 was rolled back, re-trying. This PR adds F4E2M1FN primitive type (4-bit float with 2 bits exponent and 1 bit mantissa), F8E8M0FNU primitive type (8-bit float with 8 bits exponent, no mantissa and no sign) and enables loads/stores in the same way S4/U4 type is implemented. This will enable using microscaling (MX) formats ([RFC](openxla/xla#18085)), such as MXFP4. ```c F4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 F8E8M0FNU - Exponent bias: 127 - Maximum stored exponent value: 254 (binary 1111'1110) - Maximum unbiased exponent value: 254 - 127 = 127 - Minimum stored exponent value: 0 (binary 0000'0000) - Minimum unbiased exponent value: 0 − 127 = -127 - Doesn't have zero - Doesn't have infinity - NaN is encoded as binary 1111'1111 Additional details: - Zeros cannot be represented - Negative values cannot be represented - Mantissa is always 1 ``` Related PRs: - openxla/stablehlo#2582 - jax-ml/ml_dtypes#181 - llvm/llvm-project#95392 - llvm/llvm-project#108877 - jax-ml/ml_dtypes#166 - llvm/llvm-project#107127 - llvm/llvm-project#111028 Copybara import of the project: -- d7e00c49a4b4f26c06266d6bb941275e67464c01 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN and F8E8M0FNU types Merging this change closes #21380 FUTURE_COPYBARA_INTEGRATE_REVIEW=openxla/xla#21380 from openxla:skozub/e2m1_e8m0 d7e00c49a4b4f26c06266d6bb941275e67464c01 PiperOrigin-RevId: 715070992
Imported from GitHub PR #21380 Previous PR #19096 was rolled back, re-trying. This PR adds F4E2M1FN primitive type (4-bit float with 2 bits exponent and 1 bit mantissa), F8E8M0FNU primitive type (8-bit float with 8 bits exponent, no mantissa and no sign) and enables loads/stores in the same way S4/U4 type is implemented. This will enable using microscaling (MX) formats ([RFC](#18085)), such as MXFP4. ```c F4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 F8E8M0FNU - Exponent bias: 127 - Maximum stored exponent value: 254 (binary 1111'1110) - Maximum unbiased exponent value: 254 - 127 = 127 - Minimum stored exponent value: 0 (binary 0000'0000) - Minimum unbiased exponent value: 0 − 127 = -127 - Doesn't have zero - Doesn't have infinity - NaN is encoded as binary 1111'1111 Additional details: - Zeros cannot be represented - Negative values cannot be represented - Mantissa is always 1 ``` Related PRs: - openxla/stablehlo#2582 - jax-ml/ml_dtypes#181 - llvm/llvm-project#95392 - llvm/llvm-project#108877 - jax-ml/ml_dtypes#166 - llvm/llvm-project#107127 - llvm/llvm-project#111028 Copybara import of the project: -- d7e00c4 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN and F8E8M0FNU types Merging this change closes #21380 FUTURE_COPYBARA_INTEGRATE_REVIEW=#21380 from openxla:skozub/e2m1_e8m0 d7e00c4 PiperOrigin-RevId: 715070992
Imported from GitHub PR openxla/xla#21380 Previous PR openxla/xla#19096 was rolled back, re-trying. This PR adds F4E2M1FN primitive type (4-bit float with 2 bits exponent and 1 bit mantissa), F8E8M0FNU primitive type (8-bit float with 8 bits exponent, no mantissa and no sign) and enables loads/stores in the same way S4/U4 type is implemented. This will enable using microscaling (MX) formats ([RFC](openxla/xla#18085)), such as MXFP4. ```c F4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 F8E8M0FNU - Exponent bias: 127 - Maximum stored exponent value: 254 (binary 1111'1110) - Maximum unbiased exponent value: 254 - 127 = 127 - Minimum stored exponent value: 0 (binary 0000'0000) - Minimum unbiased exponent value: 0 − 127 = -127 - Doesn't have zero - Doesn't have infinity - NaN is encoded as binary 1111'1111 Additional details: - Zeros cannot be represented - Negative values cannot be represented - Mantissa is always 1 ``` Related PRs: - openxla/stablehlo#2582 - jax-ml/ml_dtypes#181 - llvm/llvm-project#95392 - llvm/llvm-project#108877 - jax-ml/ml_dtypes#166 - llvm/llvm-project#107127 - llvm/llvm-project#111028 Copybara import of the project: -- d7e00c49a4b4f26c06266d6bb941275e67464c01 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN and F8E8M0FNU types Merging this change closes #21380 FUTURE_COPYBARA_INTEGRATE_REVIEW=openxla/xla#21380 from openxla:skozub/e2m1_e8m0 d7e00c49a4b4f26c06266d6bb941275e67464c01 PiperOrigin-RevId: 715070992
Imported from GitHub PR #21380 Previous PR #19096 was rolled back, re-trying. This PR adds F4E2M1FN primitive type (4-bit float with 2 bits exponent and 1 bit mantissa), F8E8M0FNU primitive type (8-bit float with 8 bits exponent, no mantissa and no sign) and enables loads/stores in the same way S4/U4 type is implemented. This will enable using microscaling (MX) formats ([RFC](#18085)), such as MXFP4. ```c F4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 F8E8M0FNU - Exponent bias: 127 - Maximum stored exponent value: 254 (binary 1111'1110) - Maximum unbiased exponent value: 254 - 127 = 127 - Minimum stored exponent value: 0 (binary 0000'0000) - Minimum unbiased exponent value: 0 − 127 = -127 - Doesn't have zero - Doesn't have infinity - NaN is encoded as binary 1111'1111 Additional details: - Zeros cannot be represented - Negative values cannot be represented - Mantissa is always 1 ``` Related PRs: - openxla/stablehlo#2582 - jax-ml/ml_dtypes#181 - llvm/llvm-project#95392 - llvm/llvm-project#108877 - jax-ml/ml_dtypes#166 - llvm/llvm-project#107127 - llvm/llvm-project#111028 Copybara import of the project: -- d7e00c4 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN and F8E8M0FNU types Merging this change closes #21380 FUTURE_COPYBARA_INTEGRATE_REVIEW=#21380 from openxla:skozub/e2m1_e8m0 d7e00c4 PiperOrigin-RevId: 715070992
Imported from GitHub PR openxla/xla#21380 Previous PR openxla/xla#19096 was rolled back, re-trying. This PR adds F4E2M1FN primitive type (4-bit float with 2 bits exponent and 1 bit mantissa), F8E8M0FNU primitive type (8-bit float with 8 bits exponent, no mantissa and no sign) and enables loads/stores in the same way S4/U4 type is implemented. This will enable using microscaling (MX) formats ([RFC](openxla/xla#18085)), such as MXFP4. ```c F4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 F8E8M0FNU - Exponent bias: 127 - Maximum stored exponent value: 254 (binary 1111'1110) - Maximum unbiased exponent value: 254 - 127 = 127 - Minimum stored exponent value: 0 (binary 0000'0000) - Minimum unbiased exponent value: 0 − 127 = -127 - Doesn't have zero - Doesn't have infinity - NaN is encoded as binary 1111'1111 Additional details: - Zeros cannot be represented - Negative values cannot be represented - Mantissa is always 1 ``` Related PRs: - openxla/stablehlo#2582 - jax-ml/ml_dtypes#181 - llvm/llvm-project#95392 - llvm/llvm-project#108877 - jax-ml/ml_dtypes#166 - llvm/llvm-project#107127 - llvm/llvm-project#111028 Copybara import of the project: -- d7e00c49a4b4f26c06266d6bb941275e67464c01 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN and F8E8M0FNU types Merging this change closes #21380 FUTURE_COPYBARA_INTEGRATE_REVIEW=openxla/xla#21380 from openxla:skozub/e2m1_e8m0 d7e00c49a4b4f26c06266d6bb941275e67464c01 PiperOrigin-RevId: 715070992
Imported from GitHub PR #21380 Previous PR #19096 was rolled back, re-trying. This PR adds F4E2M1FN primitive type (4-bit float with 2 bits exponent and 1 bit mantissa), F8E8M0FNU primitive type (8-bit float with 8 bits exponent, no mantissa and no sign) and enables loads/stores in the same way S4/U4 type is implemented. This will enable using microscaling (MX) formats ([RFC](#18085)), such as MXFP4. ```c F4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 F8E8M0FNU - Exponent bias: 127 - Maximum stored exponent value: 254 (binary 1111'1110) - Maximum unbiased exponent value: 254 - 127 = 127 - Minimum stored exponent value: 0 (binary 0000'0000) - Minimum unbiased exponent value: 0 − 127 = -127 - Doesn't have zero - Doesn't have infinity - NaN is encoded as binary 1111'1111 Additional details: - Zeros cannot be represented - Negative values cannot be represented - Mantissa is always 1 ``` Related PRs: - openxla/stablehlo#2582 - jax-ml/ml_dtypes#181 - llvm/llvm-project#95392 - llvm/llvm-project#108877 - jax-ml/ml_dtypes#166 - llvm/llvm-project#107127 - llvm/llvm-project#111028 Copybara import of the project: -- d7e00c4 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN and F8E8M0FNU types Merging this change closes #21380 FUTURE_COPYBARA_INTEGRATE_REVIEW=#21380 from openxla:skozub/e2m1_e8m0 d7e00c4 PiperOrigin-RevId: 715070992
Imported from GitHub PR openxla/xla#21380 Previous PR openxla/xla#19096 was rolled back, re-trying. This PR adds F4E2M1FN primitive type (4-bit float with 2 bits exponent and 1 bit mantissa), F8E8M0FNU primitive type (8-bit float with 8 bits exponent, no mantissa and no sign) and enables loads/stores in the same way S4/U4 type is implemented. This will enable using microscaling (MX) formats ([RFC](openxla/xla#18085)), such as MXFP4. ```c F4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 F8E8M0FNU - Exponent bias: 127 - Maximum stored exponent value: 254 (binary 1111'1110) - Maximum unbiased exponent value: 254 - 127 = 127 - Minimum stored exponent value: 0 (binary 0000'0000) - Minimum unbiased exponent value: 0 − 127 = -127 - Doesn't have zero - Doesn't have infinity - NaN is encoded as binary 1111'1111 Additional details: - Zeros cannot be represented - Negative values cannot be represented - Mantissa is always 1 ``` Related PRs: - openxla/stablehlo#2582 - jax-ml/ml_dtypes#181 - llvm/llvm-project#95392 - llvm/llvm-project#108877 - jax-ml/ml_dtypes#166 - llvm/llvm-project#107127 - llvm/llvm-project#111028 Copybara import of the project: -- d7e00c49a4b4f26c06266d6bb941275e67464c01 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN and F8E8M0FNU types Merging this change closes #21380 FUTURE_COPYBARA_INTEGRATE_REVIEW=openxla/xla#21380 from openxla:skozub/e2m1_e8m0 d7e00c49a4b4f26c06266d6bb941275e67464c01 PiperOrigin-RevId: 715070992
Imported from GitHub PR #21380 Previous PR #19096 was rolled back, re-trying. This PR adds F4E2M1FN primitive type (4-bit float with 2 bits exponent and 1 bit mantissa), F8E8M0FNU primitive type (8-bit float with 8 bits exponent, no mantissa and no sign) and enables loads/stores in the same way S4/U4 type is implemented. This will enable using microscaling (MX) formats ([RFC](#18085)), such as MXFP4. ```c F4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 F8E8M0FNU - Exponent bias: 127 - Maximum stored exponent value: 254 (binary 1111'1110) - Maximum unbiased exponent value: 254 - 127 = 127 - Minimum stored exponent value: 0 (binary 0000'0000) - Minimum unbiased exponent value: 0 − 127 = -127 - Doesn't have zero - Doesn't have infinity - NaN is encoded as binary 1111'1111 Additional details: - Zeros cannot be represented - Negative values cannot be represented - Mantissa is always 1 ``` Related PRs: - openxla/stablehlo#2582 - jax-ml/ml_dtypes#181 - llvm/llvm-project#95392 - llvm/llvm-project#108877 - jax-ml/ml_dtypes#166 - llvm/llvm-project#107127 - llvm/llvm-project#111028 Copybara import of the project: -- d7e00c4 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN and F8E8M0FNU types Merging this change closes #21380 FUTURE_COPYBARA_INTEGRATE_REVIEW=#21380 from openxla:skozub/e2m1_e8m0 d7e00c4 PiperOrigin-RevId: 715070992
Imported from GitHub PR openxla/xla#21380 Previous PR openxla/xla#19096 was rolled back, re-trying. This PR adds F4E2M1FN primitive type (4-bit float with 2 bits exponent and 1 bit mantissa), F8E8M0FNU primitive type (8-bit float with 8 bits exponent, no mantissa and no sign) and enables loads/stores in the same way S4/U4 type is implemented. This will enable using microscaling (MX) formats ([RFC](openxla/xla#18085)), such as MXFP4. ```c F4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 F8E8M0FNU - Exponent bias: 127 - Maximum stored exponent value: 254 (binary 1111'1110) - Maximum unbiased exponent value: 254 - 127 = 127 - Minimum stored exponent value: 0 (binary 0000'0000) - Minimum unbiased exponent value: 0 − 127 = -127 - Doesn't have zero - Doesn't have infinity - NaN is encoded as binary 1111'1111 Additional details: - Zeros cannot be represented - Negative values cannot be represented - Mantissa is always 1 ``` Related PRs: - openxla/stablehlo#2582 - jax-ml/ml_dtypes#181 - llvm/llvm-project#95392 - llvm/llvm-project#108877 - jax-ml/ml_dtypes#166 - llvm/llvm-project#107127 - llvm/llvm-project#111028 Copybara import of the project: -- d7e00c49a4b4f26c06266d6bb941275e67464c01 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN and F8E8M0FNU types Merging this change closes #21380 FUTURE_COPYBARA_INTEGRATE_REVIEW=openxla/xla#21380 from openxla:skozub/e2m1_e8m0 d7e00c49a4b4f26c06266d6bb941275e67464c01 PiperOrigin-RevId: 715070992
Imported from GitHub PR openxla/xla#21380 Previous PR openxla/xla#19096 was rolled back, re-trying. This PR adds F4E2M1FN primitive type (4-bit float with 2 bits exponent and 1 bit mantissa), F8E8M0FNU primitive type (8-bit float with 8 bits exponent, no mantissa and no sign) and enables loads/stores in the same way S4/U4 type is implemented. This will enable using microscaling (MX) formats ([RFC](openxla/xla#18085)), such as MXFP4. ```c F4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 F8E8M0FNU - Exponent bias: 127 - Maximum stored exponent value: 254 (binary 1111'1110) - Maximum unbiased exponent value: 254 - 127 = 127 - Minimum stored exponent value: 0 (binary 0000'0000) - Minimum unbiased exponent value: 0 − 127 = -127 - Doesn't have zero - Doesn't have infinity - NaN is encoded as binary 1111'1111 Additional details: - Zeros cannot be represented - Negative values cannot be represented - Mantissa is always 1 ``` Related PRs: - openxla/stablehlo#2582 - jax-ml/ml_dtypes#181 - llvm/llvm-project#95392 - llvm/llvm-project#108877 - jax-ml/ml_dtypes#166 - llvm/llvm-project#107127 - llvm/llvm-project#111028 Copybara import of the project: -- d7e00c49a4b4f26c06266d6bb941275e67464c01 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN and F8E8M0FNU types Merging this change closes #21380 PiperOrigin-RevId: 715434229
Imported from GitHub PR #21380 Previous PR #19096 was rolled back, re-trying. This PR adds F4E2M1FN primitive type (4-bit float with 2 bits exponent and 1 bit mantissa), F8E8M0FNU primitive type (8-bit float with 8 bits exponent, no mantissa and no sign) and enables loads/stores in the same way S4/U4 type is implemented. This will enable using microscaling (MX) formats ([RFC](#18085)), such as MXFP4. ```c F4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 F8E8M0FNU - Exponent bias: 127 - Maximum stored exponent value: 254 (binary 1111'1110) - Maximum unbiased exponent value: 254 - 127 = 127 - Minimum stored exponent value: 0 (binary 0000'0000) - Minimum unbiased exponent value: 0 − 127 = -127 - Doesn't have zero - Doesn't have infinity - NaN is encoded as binary 1111'1111 Additional details: - Zeros cannot be represented - Negative values cannot be represented - Mantissa is always 1 ``` Related PRs: - openxla/stablehlo#2582 - jax-ml/ml_dtypes#181 - llvm/llvm-project#95392 - llvm/llvm-project#108877 - jax-ml/ml_dtypes#166 - llvm/llvm-project#107127 - llvm/llvm-project#111028 Copybara import of the project: -- d7e00c4 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN and F8E8M0FNU types Merging this change closes #21380 COPYBARA_INTEGRATE_REVIEW=#21380 from openxla:skozub/e2m1_e8m0 d7e00c4 PiperOrigin-RevId: 715434229
Imported from GitHub PR openxla/xla#21380 Previous PR openxla/xla#19096 was rolled back, re-trying. This PR adds F4E2M1FN primitive type (4-bit float with 2 bits exponent and 1 bit mantissa), F8E8M0FNU primitive type (8-bit float with 8 bits exponent, no mantissa and no sign) and enables loads/stores in the same way S4/U4 type is implemented. This will enable using microscaling (MX) formats ([RFC](openxla/xla#18085)), such as MXFP4. ```c F4E2M1FN - Exponent bias: 1 - Maximum stored exponent value: 3 (binary 11) - Maximum unbiased exponent value: 3 - 1 = 2 - Minimum stored exponent value: 1 (binary 01) - Minimum unbiased exponent value: 1 − 1 = 0 - Has Positive and Negative zero - Doesn't have infinity - Doesn't have NaNs Additional details: - Zeros (+/-): S.00.0 - Max normal number: S.11.1 = ±2^(2) x (1 + 0.5) = ±6.0 - Min normal number: S.01.0 = ±2^(0) = ±1.0 - Min subnormal number: S.00.1 = ±2^(0) x 0.5 = ±0.5 F8E8M0FNU - Exponent bias: 127 - Maximum stored exponent value: 254 (binary 1111'1110) - Maximum unbiased exponent value: 254 - 127 = 127 - Minimum stored exponent value: 0 (binary 0000'0000) - Minimum unbiased exponent value: 0 − 127 = -127 - Doesn't have zero - Doesn't have infinity - NaN is encoded as binary 1111'1111 Additional details: - Zeros cannot be represented - Negative values cannot be represented - Mantissa is always 1 ``` Related PRs: - openxla/stablehlo#2582 - jax-ml/ml_dtypes#181 - llvm/llvm-project#95392 - llvm/llvm-project#108877 - jax-ml/ml_dtypes#166 - llvm/llvm-project#107127 - llvm/llvm-project#111028 Copybara import of the project: -- d7e00c49a4b4f26c06266d6bb941275e67464c01 by Sergey Kozub <skozub@nvidia.com>: Add F4E2M1FN and F8E8M0FNU types Merging this change closes #21380 PiperOrigin-RevId: 715434229
This patch adds APFloat type support for the E2M1
FP4 data type. The definitions for this format are
detailed in section 5.3.3 of the OCP specification,
which can be accessed here:
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
Since the condition checks specific to the OCP formats were added in an earlier PR (#94735),
this patch simply adds the FP4 type and its tests.