Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 14 additions & 16 deletions pkg/float16bits/float16.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ func (input Bits) ToFloat32() float32 {
// = (-1)^sign * 2^(-15) * (1 + m1/2 + m2/4 + m3/8 + ...)
// So, if the MSB set bit is m0, then the result exponent = Emin - 1
// and, we need to shift the mantissa to the right when it's in the
// float32 container. And, there as an extra mantissa left-shift by
// float32 container. And, there is an extra mantissa left-shift by
// 1
// Let's now say it's m2. In that case, we have
// (-1)^sign * 2^(-14) * (0/2 + 0/4 + 1/8 + m3/16 + ...)
Expand Down Expand Up @@ -181,7 +181,7 @@ func FromBigFloat(input big.Float, rm floatBit.RoundingMode,
// the underflow response in the BF16 methods.
// F32.PositiveMinSubnormal will trigger underflow response in BF16
asFloat32 = math.Float32frombits(F32.PositiveMinSubnormal)
} else if closestFloat32 == -0.0 && fromBigFloatAcc == big.Above {
} else if closestFloat32 == math.Float32frombits(F32.NegativeZero) && fromBigFloatAcc == big.Above {
// And for the negative case
// F32.NegativeMinSubnormal will trigger underflow response in BF16
asFloat32 = math.Float32frombits(F32.NegativeMinSubnormal)
Expand Down Expand Up @@ -254,7 +254,7 @@ func FromFloat32(input float32, rm floatBit.RoundingMode,
}

// Special Case #5: Input exceeds the maximum normal value (in magnitude)
// that can be represented in the float32 format. In this case, the input om
// that can be represented in the float16 format. In this case, the input om
// [floatBit.OverflowMode] determines the response.

// First off, we calculate the actual value of the exponent. To do this,
Expand Down Expand Up @@ -285,8 +285,8 @@ func FromFloat32(input float32, rm floatBit.RoundingMode,
lostPrecision := false

// Before performing any rounding, we need to make sure this exponent
// can actually be represented in the float32 format. If the exponent,
// is smaller than the minimum exponent allowed in float32 (-126), this
// can actually be represented in the float16 format. If the exponent,
// is smaller than the minimum exponent allowed in float16 (-14), this
// either results in underflow, or it rounds up or truncates to some subnormal
// number in the float16 format. We will need to take special care for
// the cases where we truncate, because we might underflow and we need
Expand Down Expand Up @@ -325,7 +325,7 @@ func FromFloat32(input float32, rm floatBit.RoundingMode,
//
// In general, by following the pattern, this shift amount is equal
// to the difference between the minimum representable exponent (actual)
// and the actual value of the exponent in float64.
// and the actual value of the exponent in float32.
shiftAmount := uint32(ExponentMin - actualExponent)
for ; shiftAmount > 0; shiftAmount-- {
lastDigit := alignedMantissa & 0x1
Expand All @@ -338,9 +338,9 @@ func FromFloat32(input float32, rm floatBit.RoundingMode,
// Now that we have the value for the mantissa, we can determine
// the underflow case. There is underflow, in the case when the
// part of the mantissa that has the precision that can be represented
// in float32 is 0 (bits m52 to m30), but the rest of the mantissa has
// in float16 is 0 (bits m22 to m13), but the rest of the mantissa has
// at least 1 bit set, i.e. all of the precision in the number is higher
// than that could be represented in float32. In this case, the response
// than that could be represented in float16. In this case, the response
// is handled by the input um [floatBit.UnderflowMode]
if checkUnderflow(alignedMantissa, lostPrecision) {
return handleUnderflow(signBit, um)
Expand Down Expand Up @@ -386,9 +386,7 @@ func FromFloat32(input float32, rm floatBit.RoundingMode,

// Utility function to check if the number with the given exponent and mantissa
// bits would overflow when trying to represent it in a float16 value
// exponentBits should correspond to bits which are encoded with the float16
// bias in mind. mantissaBits should occupy the bits with the float32 format
// in mind.
// mantissaBits should occupy the bits as they would in a float32 number
func checkOverflow(actualExponent int, mantissaBits uint32) bool {
// If the exponent is larger than the max, then it's overflow
if actualExponent > ExponentMax {
Expand All @@ -408,11 +406,11 @@ func checkOverflow(actualExponent int, mantissaBits uint32) bool {
}

// Utility function to check if the number with the given exponent and mantissa
// bits would overflow when trying to represent it in a float16 value
// bits would underflow when trying to represent it in a float16 value
// Subnormals require shifting the mantissa to align the exponents. This might
// cause loss of precision that cannot be detected by mantissaBits alone as
// they are already shifted. The lostPrecision parameter helps us with that.
// If it's true then there was precision lost when mantissa was being aligned
// If it's true then there was precision lost when mantissa was being aligned.
func checkUnderflow(mantissaBits uint32, lostPrecision bool) bool {
// This assumes that the exponent is 0, so any extra precision in the
// mantissa means underflow.
Expand Down Expand Up @@ -465,7 +463,7 @@ func handleOverflow(signBit uint32, om floatBit.OverflowMode) (Bits,
return Bits(NegativeNaN), big.Below, floatBit.Overflow
case floatBit.SaturateMax:
if signBit == 0 {
// The maximum normal in float32 is smaller than any number
// The maximum normal in float16 is smaller than any number
// this function will be invoked for
return Bits(PositiveMaxNormal), big.Below, floatBit.Overflow
}
Expand Down Expand Up @@ -496,7 +494,7 @@ func (b *Bits) ToFloatFormat() floatBit.FloatBitFormat {

// 5 Exponent Bits
exponentRetVal := make([]byte, 0, 5)
for i := 0; i < 5; i++ {
for range 5 {
currentExponentBit := exponentBits & 0x1
var valueToAppend byte
if currentExponentBit == 0 {
Expand All @@ -510,7 +508,7 @@ func (b *Bits) ToFloatFormat() floatBit.FloatBitFormat {

// 10 Mantissa Bits
mantissaRetVal := make([]byte, 0, 10)
for i := 0; i < 10; i++ {
for range 10 {
currentMantissaBit := mantissaBits & 0x1
var valueToAppend byte
if currentMantissaBit == 0 {
Expand Down
14 changes: 7 additions & 7 deletions pkg/float16bits/float16_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ func TestToFloat32(t *testing.T) {
},
{
input: 0b1_00000_0000000000,
golden: -0.0,
golden: math.Float32frombits(F32.NegativeZero),
},
{
input: 0b0_00000_1111111111,
Expand Down Expand Up @@ -89,7 +89,7 @@ func TestHandleOverflow(t *testing.T) {
if (resultVal != tt.goldenVal) || (resultAcc != tt.goldenAcc) || (resultStatus != tt.goldenStatus) {
t.Logf("Failed Input Set:\n")
t.Logf("SignBit: %v\tOverflowMode: %v\n", tt.signBit, tt.om)
t.Errorf("Expected Result: %0#8x, Got: %0#8x\n", tt.goldenVal, resultVal)
t.Errorf("Expected Result: %0#4x, Got: %0#4x\n", tt.goldenVal, resultVal)
t.Errorf("Expected Accuracy: %v, Got: %v\n", tt.goldenAcc, resultAcc)
t.Errorf("Expected Status: %v, Got: %v\n", tt.goldenStatus, resultStatus)
}
Expand Down Expand Up @@ -121,7 +121,7 @@ func TestHandleUnderflow(t *testing.T) {
if (resultVal != tt.goldenVal) || (resultAcc != tt.goldenAcc) || (resultStatus != tt.goldenStatus) {
t.Logf("Failed Input Set:\n")
t.Logf("SignBit: %v\tUnderflowMode: %v\n", tt.signBit, tt.um)
t.Errorf("Expected Result: %0#8x, Got: %0#8x\n", tt.goldenVal, resultVal)
t.Errorf("Expected Result: %0#4x, Got: %0#4x\n", tt.goldenVal, resultVal)
t.Errorf("Expected Accuracy: %v, Got: %v\n", tt.goldenAcc, resultAcc)
t.Errorf("Expected Status: %v, Got: %v\n", tt.goldenStatus, resultStatus)
}
Expand Down Expand Up @@ -400,8 +400,8 @@ func TestRoundTowardsPositiveInf(t *testing.T) {

func TestRoundTowardsNegativeInf(t *testing.T) {

// Rounding towards positive infinity involves adding 1 if the number
// is positive, otherwise truncating, so that the number is closer to +inf
// Rounding towards negative infinity involves adding 1 if the number
// is negative, otherwise truncating, so that the number is closer to -inf

testCases := []struct {
// Inputs
Expand Down Expand Up @@ -1021,7 +1021,7 @@ func TestRoundHalfTowardsNegativeInf(t *testing.T) {
func TestRoundNearestEven(t *testing.T) {
// Rounding to nearest even involves rounding to the nearest
// representable number, and breaking ties by rounding towards the
// number closer to +inf
// number with LSB = 0

testCases := []struct {
// Inputs
Expand Down Expand Up @@ -1240,7 +1240,7 @@ func TestRoundNearestEven(t *testing.T) {
func TestRoundNearestOdd(t *testing.T) {
// Rounding to nearest odd involves rounding to the nearest
// representable number, and breaking ties by rounding towards the
// number closer to +inf
// number with LSB = 1

testCases := []struct {
// Inputs
Expand Down
9 changes: 5 additions & 4 deletions pkg/float16bits/rounddown.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,20 +28,21 @@ func roundDown(signBit, exponentBits,
float16Mantissa := uint16(mantissaF16Precision >> 13)

// For this rounding mode, we only need to add 1 to the Least-precision
// mantissa, if the input was positive, to bring it closer to +inf.
// For negative numbers, this is achieved by simply truncating.
// mantissa, if the input was negative, to bring it closer to -inf.
// For positive numbers, this is achieved by simply truncating.

exponentMantissaComposite := (float16Exponent | float16Mantissa)

// If positive and there is extra precision, then add 1
// If negative and there is extra precision, then add 1
if (float16Sign != 0) && (mantissaExtraPrecision != 0 || lostPrecision) {
exponentMantissaComposite += 1
}
// Since, we don't handle overflow, all we need to do now is attach the sign
resultVal := Bits(float16Sign | exponentMantissaComposite)

resultAcc := big.Exact
// If there was extra precision bits set, then we need to
// If there were extra precision bits set, then we need to update the
// accuracy
if mantissaExtraPrecision != 0 || lostPrecision {
// We always round to a smaller value
resultAcc = big.Below
Expand Down
2 changes: 1 addition & 1 deletion pkg/float16bits/roundhalftowardszero.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ func roundHalfTowardsZero(signBit, exponentBits,
exponentMantissaComposite := float16Exponent | float16Mantissa

// If the extra precision bits exceed 1 0 0 0 0....
// we need to add 1 to LSB of F32 mantissa, otherwise truncate
// we need to add 1 to LSB of F32 mantissa.
// For all other cases we truncate
addedOne := false
if mantissaExtraPrecision > f32Float16HalfSubnormalLSB {
Expand Down
12 changes: 6 additions & 6 deletions pkg/float16bits/roundnearesteven.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,12 @@ func roundNearestEven(signBit, exponentBits, mantissaBits uint32,
// break ties by rounding towards the number that is even (LSB is 0)

// LSB | Extra Precision Bits
// m9 m8 m8 m7
// 1. if m29 m28 m27 ... > 1 0 0 0 ... (more than half) we round up
// 2. if m29 m28 m27 ... < 1 0 0 0 ... (less than half) we truncate
// 3. if m29 m28 m27 ... == 1 0 0 0 ... (exactly half), then
// 3.1 m30 == 0, we truncate
// 3.2 m30 == 1, we round up
// m13 m12 m11 m10 ... m0
// 1. if m12 m11 m10 ... > 1 0 0 0 ... (more than half) we round up
// 2. if m12 m11 m10 ... < 1 0 0 0 ... (less than half) we truncate
// 3. if m12 m11 m10 ... == 1 0 0 0 ... (exactly half), then
// 3.1 m13 == 0, we truncate
// 3.2 m13 == 1, we round up

mantissaF16Precision := mantissaBits & f32Float16MantissaMask
mantissaExtraPrecision := mantissaBits & f32Float16HalfSubnormalMask
Expand Down
14 changes: 7 additions & 7 deletions pkg/float16bits/roundnearestodd.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@ func roundNearestOdd(signBit, exponentBits, mantissaBits uint32,
// break ties by rounding towards the number that is odd (LSB is 1)

// LSB | Extra Precision Bits
// m9 m8 m7 m6
// 1. if m8 m7 m6 ... > 1 0 0 0 ... (more than half) we round up
// 2. if m8 m7 m6 ... < 1 0 0 0 ... (less than half) we truncate
// 3. if m8 m7 m6 ... == 1 0 0 0 ... (exactly half), then
// 3.1 m9 == 1, we truncate
// 3.2 m9 == 0, we round up
// m13 m12 m11 m10 ... m0
// 1. if m12 m11 m10 ... > 1 0 0 0 ... (more than half) we round up
// 2. if m12 m11 m10 ... < 1 0 0 0 ... (less than half) we truncate
// 3. if m12 m11 m10 ... == 1 0 0 0 ... (exactly half), then
// 3.1 m13 == 1, we truncate
// 3.2 m13 == 0, we round up

mantissaF16Precision := mantissaBits & f32Float16MantissaMask
mantissaExtraPrecision := mantissaBits & f32Float16HalfSubnormalMask
Expand Down Expand Up @@ -55,7 +55,7 @@ func roundNearestOdd(signBit, exponentBits, mantissaBits uint32,

mantissaF32LSB := mantissaBits & f32Float16SubnormalLSB
// In the case we're at the mid-point, we only add 1, if the LSB of the
// float32 retained mantissa is 0
// float16 retained mantissa is 0
if (mantissaF32LSB == 0) && (mantissaExtraPrecision ==
f32Float16HalfSubnormalLSB) && !lostPrecision {
exponentMantissaComposite += 1
Expand Down
2 changes: 1 addition & 1 deletion pkg/float16bits/roundtowardszero.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ func truncate(signBit, exponentBits, mantissaBits uint32,
resultAcc := big.Exact

// If there was extra precision, then the number did not fit in the
// float32 format, so we need to report the status appropriately
// float16 format, so we need to report the status appropriately
if mantissaExtraPrecision != 0 || lostPrecision {
if signBit == 0 {
resultAcc = big.Below
Expand Down
Loading