From 471dbf57a620d15402b83908a71d98003f35e718 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Thu, 31 Oct 2024 03:06:45 -0700 Subject: [PATCH 1/2] Improve the comments for hexadecimal float parsing. Replace the comments requesting better comments with better comments, and tidy up the code. --- crates/wast/src/token.rs | 105 ++++++++++++++++++++++++++------------- 1 file changed, 70 insertions(+), 35 deletions(-) diff --git a/crates/wast/src/token.rs b/crates/wast/src/token.rs index 2443c87a01..df1988b2ab 100644 --- a/crates/wast/src/token.rs +++ b/crates/wast/src/token.rs @@ -435,6 +435,7 @@ macro_rules! float { let signif_bits = width - 1 - $exp_bits; let signif_mask = (1 << exp_offset) - 1; let bias = (1 << ($exp_bits - 1)) - 1; + let msb = 1 << neg_offset; let (hex, integral, fractional, exponent_str) = match val { // Infinity is when the exponent bits are all set and @@ -497,53 +498,77 @@ macro_rules! float { return Some(float.to_bits()); } - // Parsing hex floats is... hard! I don't really know what most of - // this below does. It was copied from Gecko's implementation in - // `WasmTextToBinary.cpp`. Would love comments on this if you have - // them! - let fractional = fractional.as_ref().map(|s| &**s).unwrap_or(""); - let negative = integral.starts_with('-'); + // Parse a hexadecimal floating-point value. + // + // The main loop here is simpler than for parsing decimal floats, + // because we can just parse hexadecimal digits and then shift + // their bits into place in the significand. But in addition to + // that, we also need to handle non-normalized representations, + // where the integral part is not "1", to convert then to + // normalized results, to round, in case we get more digits than + // the target format supports, and to handle overflow and subnormal + // cases. + + // Get slices of digits for the integral and fractional parts. We + // can trivially skip any leading zeros in the integral part. + let is_negative = integral.starts_with('-'); let integral = integral.trim_start_matches('-').trim_start_matches('0'); + let fractional = fractional.as_ref().map(|s| &**s).unwrap_or(""); - // Do a bunch of work up front to locate the first non-zero digit - // to determine the initial exponent. There's a number of - // adjustments depending on where the digit was found, but the - // general idea here is that I'm not really sure why things are - // calculated the way they are but it should match Gecko. + // Locate the first non-zero digit to determine the initial exponent. + // + // If there's no integral part, skip past leading zeros so that + // something like "0x.0000000000000000000002" doesn't cause us to hit + // a shift overflow when we try to shift the value into place. We'll + // adjust the exponent below to account for these skipped zeros. let fractional_no_leading = fractional.trim_start_matches('0'); let fractional_iter = if integral.is_empty() { fractional_no_leading.chars() } else { fractional.chars() }; + + // Create a unified iterator over the digits of the integral part + // followed by the digits of the fractional part. The boolean value + // indicates which of these parts we're in. let mut digits = integral.chars() .map(|c| (to_hex(c) as $int, false)) .chain(fractional_iter.map(|c| (to_hex(c) as $int, true))); + + // Compute the number of leading zeros in the first non-zero digit, + // since if the first digit is not "1" we'll need to adjust for + // normalization. let lead_nonzero_digit = match digits.next() { Some((c, _)) => c, - // No digits? Must be `+0` or `-0`, being careful to handle the - // sign encoding here. - None if negative => return Some(1 << (width - 1)), + // No non-zero digits? Must be `+0` or `-0`, being careful to + // handle the sign encoding here. + None if is_negative => return Some(msb), None => return Some(0), }; - let mut significand = 0 as $int; + let lz = (lead_nonzero_digit as u8).leading_zeros() as i32 - 4; + + // Prepare for the main parsing loop. Calculate the initial values + // of `exponent` and `significand` based on what we've seen so far. let mut exponent = if !integral.is_empty() { 1 } else { + // Adjust the exponent digits to account for any leading zeros + // in the fractional part that we skipped above. -((fractional.len() - fractional_no_leading.len() + 1) as i32) + 1 }; - let lz = (lead_nonzero_digit as u8).leading_zeros() as i32 - 4; - exponent = exponent.checked_mul(4)?.checked_sub(lz + 1)?; let mut significand_pos = (width - (4 - (lz as usize))) as isize; - assert!(significand_pos >= 0); - significand |= lead_nonzero_digit << significand_pos; + let mut significand: $int = lead_nonzero_digit << significand_pos; + let mut discarded_extra_nonzero = false; + + assert!(significand_pos >= 0, "$int should be at least 4 bits wide"); + + // Adjust for leading zeros in the first digit. + exponent = exponent.checked_mul(4)?.checked_sub(lz + 1)?; // Now that we've got an anchor in the string we parse the remaining - // digits. Again, not entirely sure why everything is the way it is - // here! This is copied frmo gecko. - let mut discarded_extra_nonzero = false; - for (digit, is_fractional) in digits { - if !is_fractional { + // hexadecimal digits. + for (digit, in_fractional) in digits { + if !in_fractional { exponent += 4; } if significand_pos > -4 { @@ -560,12 +585,19 @@ macro_rules! float { } } + debug_assert!(significand != 0, "The case of no non-zero digits should have been handled above"); + + // Parse the exponent string, which despite this being a hexadecimal + // syntax, is a decimal number, and add it the exponent we've + // computed from the potentially non-normalized significand. exponent = exponent.checked_add(match exponent_str { Some(s) => s.parse::().ok()?, None => 0, })?; - debug_assert!(significand != 0); + // Encode the exponent and significand. Also calculate the bits of + // the significand which are discarded, as we'll use them to + // determine if we need to round up. let (encoded_exponent, encoded_significand, discarded_significand) = if exponent <= -bias { // Underflow to subnormal or zero. @@ -598,15 +630,18 @@ macro_rules! float { ) }; + // Combine the encoded exponent and encoded significand to produce + // the raw result, except for the sign bit, which we'll apply at + // the end. let bits = encoded_exponent | encoded_significand; - // Apply rounding. If this overflows the significand, it carries - // into the exponent bit according to the magic of the IEEE 754 - // encoding. - // - // Or rather, the comment above is what Gecko says so it's copied - // here too. - let msb = 1 << (width - 1); + // Apply rounding. Do an integer add of `0` or `1` on the raw + // result, depending on whether rounding is needed. Rounding can + // lead to a floating-point overflow, but we don't need to + // special-case that here because it turns out that IEEE 754 floats + // are encoded such that when an integer add of `1` carries into + // the bits of the exponent field, it produces the correct encoding + // for infinity. let bits = bits + (((discarded_significand & msb != 0) && ((discarded_significand & !msb != 0) || @@ -614,10 +649,10 @@ macro_rules! float { // ties to even (encoded_significand & 1 != 0))) as $int); - // Just before we return the bits be sure to handle the sign bit we + // Just before we return the bits, be sure to handle the sign bit we // found at the beginning. - let bits = if negative { - bits | (1 << (width - 1)) + let bits = if is_negative { + bits | msb } else { bits }; From 13be0a1dc99a16d34b8ac683b57e1c74e0d10e95 Mon Sep 17 00:00:00 2001 From: Dan Gohman Date: Thu, 31 Oct 2024 08:43:31 -0700 Subject: [PATCH 2/2] Fix a typo. --- crates/wast/src/token.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/wast/src/token.rs b/crates/wast/src/token.rs index df1988b2ab..c59d834cb0 100644 --- a/crates/wast/src/token.rs +++ b/crates/wast/src/token.rs @@ -504,7 +504,7 @@ macro_rules! float { // because we can just parse hexadecimal digits and then shift // their bits into place in the significand. But in addition to // that, we also need to handle non-normalized representations, - // where the integral part is not "1", to convert then to + // where the integral part is not "1", to convert them to // normalized results, to round, in case we get more digits than // the target format supports, and to handle overflow and subnormal // cases.