float16.go 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302
  1. // Copyright 2019 Montgomery Edwards⁴⁴⁸ and Faye Amacker
  2. //
  3. // Special thanks to Kathryn Long for her Rust implementation
  4. // of float16 at github.com/starkat99/half-rs (MIT license)
  5. package float16
  6. import (
  7. "math"
  8. "strconv"
  9. )
  10. // Float16 represents IEEE 754 half-precision floating-point numbers (binary16).
  11. type Float16 uint16
  12. // Precision indicates whether the conversion to Float16 is
  13. // exact, subnormal without dropped bits, inexact, underflow, or overflow.
  14. type Precision int
  15. const (
  16. // PrecisionExact is for non-subnormals that don't drop bits during conversion.
  17. // All of these can round-trip. Should always convert to float16.
  18. PrecisionExact Precision = iota
  19. // PrecisionUnknown is for subnormals that don't drop bits during conversion but
  20. // not all of these can round-trip so precision is unknown without more effort.
  21. // Only 2046 of these can round-trip and the rest cannot round-trip.
  22. PrecisionUnknown
  23. // PrecisionInexact is for dropped significand bits and cannot round-trip.
  24. // Some of these are subnormals. Cannot round-trip float32->float16->float32.
  25. PrecisionInexact
  26. // PrecisionUnderflow is for Underflows. Cannot round-trip float32->float16->float32.
  27. PrecisionUnderflow
  28. // PrecisionOverflow is for Overflows. Cannot round-trip float32->float16->float32.
  29. PrecisionOverflow
  30. )
  31. // PrecisionFromfloat32 returns Precision without performing
  32. // the conversion. Conversions from both Infinity and NaN
  33. // values will always report PrecisionExact even if NaN payload
  34. // or NaN-Quiet-Bit is lost. This function is kept simple to
  35. // allow inlining and run < 0.5 ns/op, to serve as a fast filter.
  36. func PrecisionFromfloat32(f32 float32) Precision {
  37. u32 := math.Float32bits(f32)
  38. if u32 == 0 || u32 == 0x80000000 {
  39. // +- zero will always be exact conversion
  40. return PrecisionExact
  41. }
  42. const COEFMASK uint32 = 0x7fffff // 23 least significant bits
  43. const EXPSHIFT uint32 = 23
  44. const EXPBIAS uint32 = 127
  45. const EXPMASK uint32 = uint32(0xff) << EXPSHIFT
  46. const DROPMASK uint32 = COEFMASK >> 10
  47. exp := int32(((u32 & EXPMASK) >> EXPSHIFT) - EXPBIAS)
  48. coef := u32 & COEFMASK
  49. if exp == 128 {
  50. // +- infinity or NaN
  51. // apps may want to do extra checks for NaN separately
  52. return PrecisionExact
  53. }
  54. // https://en.wikipedia.org/wiki/Half-precision_floating-point_format says,
  55. // "Decimals between 2^−24 (minimum positive subnormal) and 2^−14 (maximum subnormal): fixed interval 2^−24"
  56. if exp < -24 {
  57. return PrecisionUnderflow
  58. }
  59. if exp > 15 {
  60. return PrecisionOverflow
  61. }
  62. if (coef & DROPMASK) != uint32(0) {
  63. // these include subnormals and non-subnormals that dropped bits
  64. return PrecisionInexact
  65. }
  66. if exp < -14 {
  67. // Subnormals. Caller may want to test these further.
  68. // There are 2046 subnormals that can successfully round-trip f32->f16->f32
  69. // and 20 of those 2046 have 32-bit input coef == 0.
  70. // RFC 7049 and 7049bis Draft 12 don't precisely define "preserves value"
  71. // so some protocols and libraries will choose to handle subnormals differently
  72. // when deciding to encode them to CBOR float32 vs float16.
  73. return PrecisionUnknown
  74. }
  75. return PrecisionExact
  76. }
  77. // Frombits returns the float16 number corresponding to the IEEE 754 binary16
  78. // representation u16, with the sign bit of u16 and the result in the same bit
  79. // position. Frombits(Bits(x)) == x.
  80. func Frombits(u16 uint16) Float16 {
  81. return Float16(u16)
  82. }
  83. // Fromfloat32 returns a Float16 value converted from f32. Conversion uses
  84. // IEEE default rounding (nearest int, with ties to even).
  85. func Fromfloat32(f32 float32) Float16 {
  86. return Float16(f32bitsToF16bits(math.Float32bits(f32)))
  87. }
  88. // ErrInvalidNaNValue indicates a NaN was not received.
  89. const ErrInvalidNaNValue = float16Error("float16: invalid NaN value, expected IEEE 754 NaN")
  90. type float16Error string
  91. func (e float16Error) Error() string { return string(e) }
  92. // FromNaN32ps converts nan to IEEE binary16 NaN while preserving both
  93. // signaling and payload. Unlike Fromfloat32(), which can only return
  94. // qNaN because it sets quiet bit = 1, this can return both sNaN and qNaN.
  95. // If the result is infinity (sNaN with empty payload), then the
  96. // lowest bit of payload is set to make the result a NaN.
  97. // Returns ErrInvalidNaNValue and 0x7c01 (sNaN) if nan isn't IEEE 754 NaN.
  98. // This function was kept simple to be able to inline.
  99. func FromNaN32ps(nan float32) (Float16, error) {
  100. const SNAN = Float16(uint16(0x7c01)) // signalling NaN
  101. u32 := math.Float32bits(nan)
  102. sign := u32 & 0x80000000
  103. exp := u32 & 0x7f800000
  104. coef := u32 & 0x007fffff
  105. if (exp != 0x7f800000) || (coef == 0) {
  106. return SNAN, ErrInvalidNaNValue
  107. }
  108. u16 := uint16((sign >> 16) | uint32(0x7c00) | (coef >> 13))
  109. if (u16 & 0x03ff) == 0 {
  110. // result became infinity, make it NaN by setting lowest bit in payload
  111. u16 = u16 | 0x0001
  112. }
  113. return Float16(u16), nil
  114. }
  115. // NaN returns a Float16 of IEEE 754 binary16 not-a-number (NaN).
  116. // Returned NaN value 0x7e01 has all exponent bits = 1 with the
  117. // first and last bits = 1 in the significand. This is consistent
  118. // with Go's 64-bit math.NaN(). Canonical CBOR in RFC 7049 uses 0x7e00.
  119. func NaN() Float16 {
  120. return Float16(0x7e01)
  121. }
  122. // Inf returns a Float16 with an infinity value with the specified sign.
  123. // A sign >= returns positive infinity.
  124. // A sign < 0 returns negative infinity.
  125. func Inf(sign int) Float16 {
  126. if sign >= 0 {
  127. return Float16(0x7c00)
  128. }
  129. return Float16(0x8000 | 0x7c00)
  130. }
  131. // Float32 returns a float32 converted from f (Float16).
  132. // This is a lossless conversion.
  133. func (f Float16) Float32() float32 {
  134. u32 := f16bitsToF32bits(uint16(f))
  135. return math.Float32frombits(u32)
  136. }
  137. // Bits returns the IEEE 754 binary16 representation of f, with the sign bit
  138. // of f and the result in the same bit position. Bits(Frombits(x)) == x.
  139. func (f Float16) Bits() uint16 {
  140. return uint16(f)
  141. }
  142. // IsNaN reports whether f is an IEEE 754 binary16 “not-a-number” value.
  143. func (f Float16) IsNaN() bool {
  144. return (f&0x7c00 == 0x7c00) && (f&0x03ff != 0)
  145. }
  146. // IsQuietNaN reports whether f is a quiet (non-signaling) IEEE 754 binary16
  147. // “not-a-number” value.
  148. func (f Float16) IsQuietNaN() bool {
  149. return (f&0x7c00 == 0x7c00) && (f&0x03ff != 0) && (f&0x0200 != 0)
  150. }
  151. // IsInf reports whether f is an infinity (inf).
  152. // A sign > 0 reports whether f is positive inf.
  153. // A sign < 0 reports whether f is negative inf.
  154. // A sign == 0 reports whether f is either inf.
  155. func (f Float16) IsInf(sign int) bool {
  156. return ((f == 0x7c00) && sign >= 0) ||
  157. (f == 0xfc00 && sign <= 0)
  158. }
  159. // IsFinite returns true if f is neither infinite nor NaN.
  160. func (f Float16) IsFinite() bool {
  161. return (uint16(f) & uint16(0x7c00)) != uint16(0x7c00)
  162. }
  163. // IsNormal returns true if f is neither zero, infinite, subnormal, or NaN.
  164. func (f Float16) IsNormal() bool {
  165. exp := uint16(f) & uint16(0x7c00)
  166. return (exp != uint16(0x7c00)) && (exp != 0)
  167. }
  168. // Signbit reports whether f is negative or negative zero.
  169. func (f Float16) Signbit() bool {
  170. return (uint16(f) & uint16(0x8000)) != 0
  171. }
  172. // String satisfies the fmt.Stringer interface.
  173. func (f Float16) String() string {
  174. return strconv.FormatFloat(float64(f.Float32()), 'f', -1, 32)
  175. }
  176. // f16bitsToF32bits returns uint32 (float32 bits) converted from specified uint16.
  177. func f16bitsToF32bits(in uint16) uint32 {
  178. // All 65536 conversions with this were confirmed to be correct
  179. // by Montgomery Edwards⁴⁴⁸ (github.com/x448).
  180. sign := uint32(in&0x8000) << 16 // sign for 32-bit
  181. exp := uint32(in&0x7c00) >> 10 // exponenent for 16-bit
  182. coef := uint32(in&0x03ff) << 13 // significand for 32-bit
  183. if exp == 0x1f {
  184. if coef == 0 {
  185. // infinity
  186. return sign | 0x7f800000 | coef
  187. }
  188. // NaN
  189. return sign | 0x7fc00000 | coef
  190. }
  191. if exp == 0 {
  192. if coef == 0 {
  193. // zero
  194. return sign
  195. }
  196. // normalize subnormal numbers
  197. exp++
  198. for coef&0x7f800000 == 0 {
  199. coef <<= 1
  200. exp--
  201. }
  202. coef &= 0x007fffff
  203. }
  204. return sign | ((exp + (0x7f - 0xf)) << 23) | coef
  205. }
  206. // f32bitsToF16bits returns uint16 (Float16 bits) converted from the specified float32.
  207. // Conversion rounds to nearest integer with ties to even.
  208. func f32bitsToF16bits(u32 uint32) uint16 {
  209. // Translated from Rust to Go by Montgomery Edwards⁴⁴⁸ (github.com/x448).
  210. // All 4294967296 conversions with this were confirmed to be correct by x448.
  211. // Original Rust implementation is by Kathryn Long (github.com/starkat99) with MIT license.
  212. sign := u32 & 0x80000000
  213. exp := u32 & 0x7f800000
  214. coef := u32 & 0x007fffff
  215. if exp == 0x7f800000 {
  216. // NaN or Infinity
  217. nanBit := uint32(0)
  218. if coef != 0 {
  219. nanBit = uint32(0x0200)
  220. }
  221. return uint16((sign >> 16) | uint32(0x7c00) | nanBit | (coef >> 13))
  222. }
  223. halfSign := sign >> 16
  224. unbiasedExp := int32(exp>>23) - 127
  225. halfExp := unbiasedExp + 15
  226. if halfExp >= 0x1f {
  227. return uint16(halfSign | uint32(0x7c00))
  228. }
  229. if halfExp <= 0 {
  230. if 14-halfExp > 24 {
  231. return uint16(halfSign)
  232. }
  233. coef := coef | uint32(0x00800000)
  234. halfCoef := coef >> uint32(14-halfExp)
  235. roundBit := uint32(1) << uint32(13-halfExp)
  236. if (coef&roundBit) != 0 && (coef&(3*roundBit-1)) != 0 {
  237. halfCoef++
  238. }
  239. return uint16(halfSign | halfCoef)
  240. }
  241. uHalfExp := uint32(halfExp) << 10
  242. halfCoef := coef >> 13
  243. roundBit := uint32(0x00001000)
  244. if (coef&roundBit) != 0 && (coef&(3*roundBit-1)) != 0 {
  245. return uint16((halfSign | uHalfExp | halfCoef) + 1)
  246. }
  247. return uint16(halfSign | uHalfExp | halfCoef)
  248. }