shantanu-gontia · shantanu-gontia · Mar 9, 2025 · Mar 9, 2025 · Mar 14, 2025 · Apr 9, 2025
diff --git a/pkg/float16bits/float16.go b/pkg/float16bits/float16.go
@@ -85,7 +85,7 @@ func (input Bits) ToFloat32() float32 {
 		// = (-1)^sign * 2^(-15) * (1 + m1/2 + m2/4 + m3/8 + ...)
 		// So, if the MSB set bit is m0, then the result exponent = Emin - 1
 		// and, we need to shift the mantissa to the right when it's in the
-		// float32 container. And, there as an extra mantissa left-shift by
+		// float32 container. And, there is an extra mantissa left-shift by
 		// 1
 		// Let's now say it's m2. In that case, we have
 		// (-1)^sign * 2^(-14) * (0/2 + 0/4 + 1/8 + m3/16 + ...)
@@ -181,7 +181,7 @@ func FromBigFloat(input big.Float, rm floatBit.RoundingMode,
 		// the underflow response in the BF16 methods.
 		// F32.PositiveMinSubnormal will trigger underflow response in BF16
 		asFloat32 = math.Float32frombits(F32.PositiveMinSubnormal)
-	} else if closestFloat32 == -0.0 && fromBigFloatAcc == big.Above {
+	} else if closestFloat32 == math.Float32frombits(F32.NegativeZero) && fromBigFloatAcc == big.Above {
 		// And for the negative case
 		// F32.NegativeMinSubnormal will trigger underflow response in BF16
 		asFloat32 = math.Float32frombits(F32.NegativeMinSubnormal)
@@ -254,7 +254,7 @@ func FromFloat32(input float32, rm floatBit.RoundingMode,
 	}
 
 	// Special Case #5: Input exceeds the maximum normal value (in magnitude)
-	// that can be represented in the float32 format. In this case, the input om
+	// that can be represented in the float16 format. In this case, the input om
 	// [floatBit.OverflowMode] determines the response.
 
 	// First off, we calculate the actual value of the exponent. To do this,
@@ -285,8 +285,8 @@ func FromFloat32(input float32, rm floatBit.RoundingMode,
 	lostPrecision := false
 
 	// Before performing any rounding, we need to make sure this exponent
-	// can actually be represented in the float32 format. If the exponent,
-	// is smaller than the minimum exponent allowed in float32 (-126), this
+	// can actually be represented in the float16 format. If the exponent
+	// is smaller than the minimum exponent allowed in float16 (-14), this
 	// either results in underflow, or it rounds up or trunc to some subnormal
 	// number in the float32 format. We will need to take special care for
 	// the cases where we truncate, because we might underflow and we need
@@ -325,7 +325,7 @@ func FromFloat32(input float32, rm floatBit.RoundingMode,
 		//
 		// In general, by following the pattern, this shift amount is equal
 		// to the difference between the minimum representable exponent (actual)
-		// and the actual value of the exponent in float64.
+		// and the actual value of the exponent in float32.
 		shiftAmount := uint32(ExponentMin - actualExponent)
 		for ; shiftAmount > 0; shiftAmount-- {
 			lastDigit := alignedMantissa & 0x1
@@ -338,9 +338,9 @@ func FromFloat32(input float32, rm floatBit.RoundingMode,
 		// Now that we have the value for the mantissa, we can determine
 		// the underflow case. There is underflow, in the case when the
 		// part of the mantissa that has the precision that can be represented
-		// in float32 is 0 (bits m52 to m30), but the rest of the mantissa has
+		// in float16 is 0 (bits m22 to m13), but the rest of the mantissa has
 		// atleast 1 bit set i.e. all of the precision in the number is higher
-		// than that could be represented in float32. In this case, the response
+		// than that could be represented in float16. In this case, the response
 		// is handled by the input um [floatBit.UnderflowMode]
 		if checkUnderflow(alignedMantissa, lostPrecision) {
 			return handleUnderflow(signBit, um)
@@ -386,9 +386,7 @@ func FromFloat32(input float32, rm floatBit.RoundingMode,
 
 // Utility function to check if the number with the given exponent and mantissa
 // bits would overflow when trying to represent it in a float16 value
-// exponentBits should correspond to bits which are encoded with the float16
-// bias in mind. mantissaBits should occupy the bits with the float32 format
-// in mind.
+// mantissaBits should occupy the bits as they would in a float32 number.
 func checkOverflow(actualExponent int, mantissaBits uint32) bool {
 	// If the exponent is larger than the max, then it's overflow
 	if actualExponent > ExponentMax {
@@ -408,11 +406,11 @@ func checkOverflow(actualExponent int, mantissaBits uint32) bool {
 }
 
 // Utility function to check if the number with the given exponent and mantissa
-// bits would overflow when trying to represent it in a float16 value
+// bits would underflow when trying to represent it in a float16 value
 // Subnormals require shifting the mantissa to align the exponents. This might
 // cause loss of precision that cannot be detected by mantissaBits alone as
 // they are already shifted. The lostPrecision parameter helps us with that.
-// If it's true then there was precision lost when mantissa was being aligned
+// If it's true then there was precision lost when mantissa was being aligned.
 func checkUnderflow(mantissaBits uint32, lostPrecision bool) bool {
 	// This assumes that the exponent is 0, so any extra precision in the
 	// mantissa means underflow.
@@ -465,7 +463,7 @@ func handleOverflow(signBit uint32, om floatBit.OverflowMode) (Bits,
 		return Bits(NegativeNaN), big.Below, floatBit.Overflow
 	case floatBit.SaturateMax:
 		if signBit == 0 {
-			// The maximum normal in float32 is smaller than any number
+			// The maximum normal in float16 is smaller than any number
 			// this function will be invoked for
 			return Bits(PositiveMaxNormal), big.Below, floatBit.Overflow
 		}
@@ -496,7 +494,7 @@ func (b *Bits) ToFloatFormat() floatBit.FloatBitFormat {
 
 	// 5 Exponent Bits
 	exponentRetVal := make([]byte, 0, 5)
-	for i := 0; i < 5; i++ {
+	for range 5 {
 		currentExponentBit := exponentBits & 0x1
 		var valueToAppend byte
 		if currentExponentBit == 0 {
@@ -510,7 +508,7 @@ func (b *Bits) ToFloatFormat() floatBit.FloatBitFormat {
 
 	// 10 Mantissa Bits
 	mantissaRetVal := make([]byte, 0, 10)
-	for i := 0; i < 10; i++ {
+	for range 10 {
 		currentMantissaBit := mantissaBits & 0x1
 		var valueToAppend byte
 		if currentMantissaBit == 0 {

diff --git a/pkg/float16bits/float16_test.go b/pkg/float16bits/float16_test.go
@@ -38,7 +38,7 @@ func TestToFloat32(t *testing.T) {
 		},
 		{
 			input:  0b1_00000_0000000000,
-			golden: -0.0,
+			golden: math.Float32frombits(F32.NegativeZero),
 		},
 		{
 			input:  0b0_00000_1111111111,
@@ -89,7 +89,7 @@ func TestHandleOverflow(t *testing.T) {
 			if (resultVal != tt.goldenVal) || (resultAcc != tt.goldenAcc) || (resultStatus != tt.goldenStatus) {
 				t.Logf("Failed Input Set:\n")
 				t.Logf("SignBit: %v\tOverflowMode: %v\n", tt.signBit, tt.om)
-				t.Errorf("Expected Result: %0#8x, Got: %0#8x\n", tt.goldenVal, resultVal)
+				t.Errorf("Expected Result: %0#4x, Got: %0#4x\n", tt.goldenVal, resultVal)
 				t.Errorf("Expected Accuracy: %v, Got: %v\n", tt.goldenAcc, resultAcc)
 				t.Errorf("Expected Status: %v, Got: %v\n", tt.goldenStatus, resultStatus)
 			}
@@ -121,7 +121,7 @@ func TestHandleUnderflow(t *testing.T) {
 			if (resultVal != tt.goldenVal) || (resultAcc != tt.goldenAcc) || (resultStatus != tt.goldenStatus) {
 				t.Logf("Failed Input Set:\n")
 				t.Logf("SignBit: %v\tUnderflowMode: %v\n", tt.signBit, tt.um)
-				t.Errorf("Expected Result: %0#8x, Got: %0#8x\n", tt.goldenVal, resultVal)
+				t.Errorf("Expected Result: %0#4x, Got: %0#4x\n", tt.goldenVal, resultVal)
 				t.Errorf("Expected Accuracy: %v, Got: %v\n", tt.goldenAcc, resultAcc)
 				t.Errorf("Expected Status: %v, Got: %v\n", tt.goldenStatus, resultStatus)
 			}
@@ -400,8 +400,8 @@ func TestRoundTowardsPositiveInf(t *testing.T) {
 
 func TestRoundTowardsNegativeInf(t *testing.T) {
 
-	// Rounding towards positive infinity involves adding 1 if the number
-	// is positive, otherwise truncating, so that the number is closer to +inf
+	// Rounding towards negative infinity involves adding 1 if the number
+	// is negative, otherwise truncating, so that the number is closer to -inf
 
 	testCases := []struct {
 		// Inputs
@@ -1021,7 +1021,7 @@ func TestRoundHalfTowardsNegativeInf(t *testing.T) {
 func TestRoundNearestEven(t *testing.T) {
 	// Rounding half towards positive infinity involves rounding to the nearest
 	// representable number, and breaking ties by rounding towards the
-	// number closer to +inf
+	// number with LSB = 0
 
 	testCases := []struct {
 		// Inputs
@@ -1240,7 +1240,7 @@ func TestRoundNearestEven(t *testing.T) {
 func TestRoundNearestOdd(t *testing.T) {
 	// Rounding half towards positive infinity involves rounding to the nearest
 	// representable number, and breaking ties by rounding towards the
-	// number closer to +inf
+	// number with LSB = 1
 
 	testCases := []struct {
 		// Inputs

diff --git a/pkg/float16bits/rounddown.go b/pkg/float16bits/rounddown.go
@@ -28,20 +28,21 @@ func roundDown(signBit, exponentBits,
 	float16Mantissa := uint16(mantissaF16Precision >> 13)
 
 	// For this rounding mode, we only need to add 1 to the Least-precision
-	// mantissa, if the input was positive, to bring it closer to +inf.
-	// For negative numbers, this is achieved by simply truncating.
+	// mantissa, if the input was negative, to bring it closer to -inf.
+	// For positive numbers, this is achieved by simply truncating.
 
 	exponentMantissaComposite := (float16Exponent | float16Mantissa)
 
-	// If positive and there is extra precision, then add 1
+	// If negative and there is extra precision, then add 1
 	if (float16Sign != 0) && (mantissaExtraPrecision != 0 || lostPrecision) {
 		exponentMantissaComposite += 1
 	}
 	// Since, we don't handle overflow, all we need to do now is attach the sign
 	resultVal := Bits(float16Sign | exponentMantissaComposite)
 
 	resultAcc := big.Exact
-	// If there was extra precision bits set, then we need to
+	// If there were extra precision bits set, then we need to update the
+	// accuracy
 	if mantissaExtraPrecision != 0 || lostPrecision {
 		// We always round to a smaller value
 		resultAcc = big.Below

diff --git a/pkg/float16bits/roundhalftowardszero.go b/pkg/float16bits/roundhalftowardszero.go
@@ -24,7 +24,7 @@ func roundHalfTowardsZero(signBit, exponentBits,
 	exponentMantissaComposite := float16Exponent | float16Mantissa
 
 	// If the extra precision bits exceed 1 0 0 0 0....
-	// we need to add 1 to LSB of F32 mantissa, otherwise truncate
+	// we need to add 1 to the LSB of the float16 mantissa.
 	// For all other cases we truncate
 	addedOne := false
 	if mantissaExtraPrecision > f32Float16HalfSubnormalLSB {

diff --git a/pkg/float16bits/roundnearesteven.go b/pkg/float16bits/roundnearesteven.go
@@ -19,12 +19,12 @@ func roundNearestEven(signBit, exponentBits, mantissaBits uint32,
 	// break ties by rounding towards the number that is even (LSB is 0)
 
 	// LSB  |  Extra Precision Bits
-	//  m9    m8 m8 m7
-	// 1. if m29 m28 m27 ... > 1 0 0 0 ... (more than half) we round up
-	// 2. if m29 m28 m27 ... < 1 0 0 0 ... (less than half) we truncate
-	// 3. if m29 m28 m27 ... == 1 0 0 0 ... (exactly half), then
-	// 	  3.1 m30 == 0, we truncate
-	//    3.2 m30 == 1, we round up
+	// m13    m12 m11 m10 ... m0
+	// 1. if m12 m11 m10 ... > 1 0 0 0 ... (more than half) we round up
+	// 2. if m12 m11 m10 ... < 1 0 0 0 ... (less than half) we truncate
+	// 3. if m12 m11 m10 ... == 1 0 0 0 ... (exactly half), then
+	//    3.1 m13 == 0, we truncate
+	//    3.2 m13 == 1, we round up
 
 	mantissaF16Precision := mantissaBits & f32Float16MantissaMask
 	mantissaExtraPrecision := mantissaBits & f32Float16HalfSubnormalMask

diff --git a/pkg/float16bits/roundnearestodd.go b/pkg/float16bits/roundnearestodd.go
@@ -22,12 +22,12 @@ func roundNearestOdd(signBit, exponentBits, mantissaBits uint32,
 	// break ties by rounding towards the number that is even (LSB is 0)
 
 	// LSB  |  Extra Precision Bits
-	//  m9    m8 m7 m6
-	// 1. if m8 m7 m6 ... > 1 0 0 0 ... (more than half) we round up
-	// 2. if m8 m7 m6 ... < 1 0 0 0 ... (less than half) we truncate
-	// 3. if m8 m7 m6 ... == 1 0 0 0 ... (exactly half), then
-	// 	  3.1 m9 == 1, we truncate
-	//    3.2 m9 == 0, we round up
+	// m13    m12 m11 m10 ... m0
+	// 1. if m12 m11 m10 ... > 1 0 0 0 ... (more than half) we round up
+	// 2. if m12 m11 m10 ... < 1 0 0 0 ... (less than half) we truncate
+	// 3. if m12 m11 m10 ... == 1 0 0 0 ... (exactly half), then
+	//    3.1 m13 == 1, we truncate
+	//    3.2 m13 == 0, we round up
 
 	mantissaF16Precision := mantissaBits & f32Float16MantissaMask
 	mantissaExtraPrecision := mantissaBits & f32Float16HalfSubnormalMask
@@ -55,7 +55,7 @@ func roundNearestOdd(signBit, exponentBits, mantissaBits uint32,
 
 	mantissaF32LSB := mantissaBits & f32Float16SubnormalLSB
 	// In the case we're at the mid-point, we only add 1, if the LSB of the
-	// float32 retained mantissa is 0
+	// float16 retained mantissa is 0
 	if (mantissaF32LSB == 0) && (mantissaExtraPrecision ==
 		f32Float16HalfSubnormalLSB) && !lostPrecision {
 		exponentMantissaComposite += 1

diff --git a/pkg/float16bits/roundtowardszero.go b/pkg/float16bits/roundtowardszero.go
@@ -31,7 +31,7 @@ func truncate(signBit, exponentBits, mantissaBits uint32,
 	resultAcc := big.Exact
 
 	// If there was extra precision, then the number did not fit in the
-	// float32 format, so we need to report the status appropriately
+	// float16 format, so we need to report the status appropriately
 	if mantissaExtraPrecision != 0 || lostPrecision {
 		if signBit == 0 {
 			resultAcc = big.Below