Skip to content

Commit

Permalink
Removed undefined use of union in libminifloat
Browse files Browse the repository at this point in the history
Testing showed use of memcpy produced identical assembly code.
  • Loading branch information
Dr15Jones committed Aug 27, 2021
1 parent fd04911 commit 18a6fa5
Showing 1 changed file with 40 additions and 75 deletions.
115 changes: 40 additions & 75 deletions DataFormats/Math/interface/libminifloat.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,68 +4,49 @@
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <type_traits>

// ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
class MiniFloatConverter {
public:
MiniFloatConverter();
/// Expand a 16-bit minifloat bit pattern to a 32-bit float using the lookup tables.
///
/// @param h  16-bit pattern. Its upper 6 bits (h >> 10) select the exponent and
///           offset table entries; its lower 10 bits (h & 0x3ff) are added to the
///           offset to index the mantissa table.
/// @return   The float whose 32-bit pattern is the sum of the selected mantissa
///           and exponent table entries.
inline static float float16to32(uint16_t h) {
  // Assemble the pattern as an integer and copy its bytes into a float with
  // bit_cast, instead of writing one union member and reading another.
  const uint32_t i32 = mantissatable[offsettable[h >> 10] + (h & 0x3ff)] + exponenttable[h >> 10];
  return bit_cast<float>(i32);
}
/// Default float32 -> 16-bit conversion: forwards to the rounding variant.
inline static uint16_t float32to16(float x) {
  return float32to16round(x);
}
/// Fast implementation, but it crops the number so it biases low
///
/// @param x  Value to convert.
/// @return   basetable entry for the top 9 bits of x's pattern, plus the low
///           23 bits shifted right by the matching shifttable entry (the bits
///           shifted out are simply discarded).
inline static uint16_t float32to16crop(float x) {
  // Read the float's bytes as an integer via bit_cast rather than a union.
  const uint32_t i32 = bit_cast<uint32_t>(x);
  // Bits 23..31 of the pattern index both tables.
  const uint32_t idx = (i32 >> 23) & 0x1ff;
  return basetable[idx] + ((i32 & 0x007fffff) >> shifttable[idx]);
}
/// Slower implementation, but it rounds to avoid biases
///
/// @param x  Value to convert.
/// @return   16-bit result. When the table shift for x is 13, one extra
///           mantissa bit is kept and used to round the 10-bit mantissa up
///           (never past 1023); otherwise the result is the same as cropping.
inline static uint16_t float32to16round(float x) {
  // Read the float's bytes as an integer via bit_cast rather than a union.
  const uint32_t i32 = bit_cast<uint32_t>(x);
  // Bits 23..31 of the pattern index both tables.
  const uint32_t idx = (i32 >> 23) & 0x1ff;
  const uint8_t shift = shifttable[idx];
  if (shift == 13) {
    // Keep 11 bits: the 10 that survive plus the first discarded bit.
    const uint16_t base2 = (i32 & 0x007fffff) >> 12;
    uint16_t base = base2 >> 1;
    // Round up on a set discarded bit, unless the mantissa is already at its
    // maximum (incrementing would carry out of the 10-bit field).
    if (((base2 & 1) != 0) && (base < 1023))
      base++;
    return basetable[idx] + base;
  }
  return basetable[idx] + ((i32 & 0x007fffff) >> shift);
}
/// Keep only the top `bits` of the 23 low (mantissa) bits of f's pattern,
/// zeroing the remaining low (23 - bits) bits. No rounding is applied.
///
/// @tparam bits  Number of mantissa bits to keep, 0..23 (checked at compile time).
/// @param  f     Value to truncate.
/// @return       Float whose pattern is f's pattern ANDed with the mask.
template <int bits>
inline static float reduceMantissaToNbits(const float &f) {
  static_assert(bits >= 0, "mantissa size cannot be negative");
  static_assert(bits <= 23, "max mantissa size is 23 bits");
  constexpr uint32_t mask = (0xFFFFFFFF >> (23 - bits)) << (23 - bits);
  // Mask the integer view of the float; bit_cast replaces the union punning.
  const uint32_t i32 = bit_cast<uint32_t>(f) & mask;
  return bit_cast<float>(i32);
}
/// Runtime-width variant: keep only the top `bits` of the 23 low (mantissa)
/// bits of f's pattern, zeroing the remaining low (23 - bits) bits. No rounding.
///
/// @param f     Value to truncate.
/// @param bits  Number of mantissa bits to keep; must be in 0..23.
/// @return      Float whose pattern is f's pattern ANDed with the mask.
inline static float reduceMantissaToNbits(const float &f, int bits) {
  // A shift count of (23 - bits) outside 0..31 is undefined, so reject
  // out-of-range widths (same bound the template overload enforces).
  assert(bits >= 0 && bits <= 23);
  const uint32_t mask = (0xFFFFFFFF >> (23 - bits)) << (23 - bits);
  // Mask the integer view of the float; bit_cast replaces the union punning.
  const uint32_t i32 = bit_cast<uint32_t>(f) & mask;
  return bit_cast<float>(i32);
}

class ReduceMantissaToNbitsRounding {
Expand All @@ -77,20 +58,16 @@ class MiniFloatConverter {
/// Reduce the mantissa of f, rounding instead of truncating.
///
/// Uses the members `test`, `shift`, `maxn` and `mask`, which are declared
/// outside this excerpt (NOTE(review): presumably derived from the requested
/// bit count in the constructor; confirm there).
///
/// @param f  Value to reduce.
/// @return   If any bit selected by `test` is set, the retained mantissa is
///           incremented (unless it is already >= maxn) and the bits below
///           `shift` are cleared; otherwise f's pattern is ANDed with `mask`.
float operator()(float f) const {
  constexpr uint32_t low23 = (0x007FFFFF);  // mask to keep lowest 23 bits = mantissa
  constexpr uint32_t hi9 = (0xFF800000);    // mask to keep highest 9 bits = the rest
  // Work on the integer view of the float; bit_cast replaces the union punning.
  uint32_t i32 = bit_cast<uint32_t>(f);
  if (i32 & test) {  // need to round
    uint32_t mantissa = (i32 & low23) >> shift;
    if (mantissa < maxn)
      mantissa++;
    i32 = (i32 & hi9) | (mantissa << shift);
  } else {
    i32 &= mask;
  }
  return bit_cast<float>(i32);
}

private:
Expand All @@ -114,54 +91,34 @@ class MiniFloatConverter {
}

/// Largest value of the 16-bit format, returned as a float.
/// The pattern 0x477fe000 is 65504.0f when float is IEEE-754 binary32.
inline static float max() {
  constexpr uint32_t i32 = 0x477fe000;  // = mantissatable[offsettable[0x1e]+0x3ff]+exponenttable[0x1e]
  // bit_cast copies the bytes into a float; no union type punning.
  return bit_cast<float>(i32);
}

// Maximum float32 value that gets rounded to max()
inline static float max32RoundedToMax16() {
  // 2^16 in float32 (pattern 0x8f << 23) is the first to result inf in float16,
  // so the float32 immediately below it (pattern one less, 0x477fffff) is the
  // last float32 to result max() in float16.
  constexpr uint32_t i32 = (0x8fu << 23) - 1;
  // bit_cast copies the bytes into a float; no union type punning.
  return bit_cast<float>(i32);
}

/// Smallest normalized value of the 16-bit format, returned as a float.
/// The pattern 0x38800000 is 2^-14 when float is IEEE-754 binary32.
inline static float min() {
  constexpr uint32_t i32 = 0x38800000;  // = mantissatable[offsettable[1]+0]+exponenttable[1]
  // bit_cast copies the bytes into a float; no union type punning.
  return bit_cast<float>(i32);
}

// Minimum float32 value that gets rounded to min()
inline static float min32RoundedToMin16() {
  // The float32 just below 2^-14 is the first to result denormalized in
  // float16, so 2^-14 (pattern 0x71 << 23) is the first float32 to result
  // min() in float16.
  constexpr uint32_t i32 = (0x71u << 23);
  // bit_cast copies the bytes into a float; no union type punning.
  return bit_cast<float>(i32);
}

/// Smallest denormalized value of the 16-bit format, returned as a float.
/// The pattern 0x33800000 is 2^-24 when float is IEEE-754 binary32.
inline static float denorm_min() {
  constexpr uint32_t i32 = 0x33800000;  // mantissatable[offsettable[0]+1]+exponenttable[0]
  // bit_cast copies the bytes into a float; no union type punning.
  return bit_cast<float>(i32);
}

inline static bool isdenorm(uint16_t h) {
Expand All @@ -170,6 +127,14 @@ class MiniFloatConverter {
}

private:
//in C++20 we can use std::bit_cast which is constexpr
/// Reinterpret the object representation of `src` as a value of type `To`
/// by copying its bytes, without union or pointer type punning.
///
/// @tparam To    Destination type; same size as From, trivially copyable and
///               default-constructible.
/// @tparam From  Source type; trivially copyable.
/// @param  src   Object whose bytes are copied.
/// @return       A `To` holding the same bytes as `src`.
template <class To, class From>
inline static To bit_cast(const From &src) noexcept {
  static_assert(sizeof(To) == sizeof(From), "incompatible types");
  // memcpy between objects is only defined for trivially copyable types;
  // these are the same constraints std::bit_cast imposes.
  static_assert(std::is_trivially_copyable<From>::value, "bit_cast source must be trivially copyable");
  static_assert(std::is_trivially_copyable<To>::value, "bit_cast destination must be trivially copyable");
  To dst;
  std::memcpy(&dst, &src, sizeof(To));
  return dst;
}
CMS_THREAD_SAFE static uint32_t mantissatable[2048];
CMS_THREAD_SAFE static uint32_t exponenttable[64];
CMS_THREAD_SAFE static uint16_t offsettable[64];
Expand Down

0 comments on commit 18a6fa5

Please sign in to comment.