1 /** 2 * Implement IEEE 754 half-precision binary floating point format binary16. 3 * 4 * This a 16 bit type, and consists of a sign bit, a 5 bit exponent, and a 5 * 10 bit significand. 6 * All operations on HalfFloat are CTFE'able. 7 * 8 * References: 9 * $(WEB en.wikipedia.org/wiki/Half-precision_floating-point_format, Wikipedia) 10 * Copyright: Copyright Digital Mars 2012- 11 * License: $(WEB boost.org/LICENSE_1_0.txt, Boost License 1.0) 12 * Authors: $(WEB digitalmars.com, Walter Bright) 13 * Source: $(PHOBOSSRC std/_halffloat.d) 14 * Macros: 15 * WIKI=Phobos/StdHalffloat 16 */ 17 18 module std.halffloat; 19 20 /** 21 * The half precision floating point type. 22 * 23 * The only operations are: 24 * $(UL 25 * $(LI explicit conversion of float to HalfFloat) 26 * $(LI implicit conversion of HalfFloat to float) 27 * ) 28 * It operates in an analogous manner to shorts, which are converted to ints 29 * before performing any operations, and explicitly cast back to shorts. 30 * The half float is considered essentially a storage type, not a computation type. 31 * Example: 32 * --- 33 HalfFloat h = hf!27.2f; 34 HalfFloat j = cast(HalfFloat)( hf!3.5f + hf!5 ); 35 HalfFloat f = HalfFloat(0.0f); 36 * --- 37 * Bugs: 38 * The only rounding mode currently supported is Round To Nearest. 39 * The exceptions OVERFLOW, UNDERFLOW and INEXACT are not thrown. 40 */ 41 42 struct HalfFloat { 43 44 /* Provide implicit conversion of HalfFloat to float 45 */ 46 47 @property float toFloat() { return shortToFloat(s); } 48 alias toFloat this; 49 50 /* Done as a template in order to prevent implicit conversion 51 * of argument to float. 52 */ 53 54 this(T : float)(T f) 55 { 56 static assert(is(T == float)); 57 s = floatToShort(f); 58 } 59 60 /* These are done as properties to avoid 61 * circular reference problems. 62 */ 63 64 /// 65 static @property HalfFloat min_normal() { HalfFloat hf = void; hf.s = 0x0400; return hf; } 66 unittest { assert(min_normal == hf!0x1p-14); } 67 68 /// 69 static @property HalfFloat max() { HalfFloat hf = void; hf.s = 0x7BFF; return hf; } 70 unittest { assert(max == hf!0x1.FFCp+15); } 71 72 /// 73 static @property HalfFloat nan() { HalfFloat hf = void; hf.s = EXPMASK | 1; return hf; } 74 // unittest { assert(nan != hf!(float.nan)); } 75 76 /// 77 static @property HalfFloat infinity() { HalfFloat hf = void; hf.s = EXPMASK; return hf; } 78 unittest { assert(infinity == hf!(float.infinity)); } 79 80 /// 81 static @property HalfFloat epsilon() { HalfFloat hf = void; hf.s = 0x1400; return hf; } 82 unittest { assert(epsilon == hf!0x1p-10); } 83 84 enum dig = 3; /// 85 enum mant_dig = 11; /// 86 enum max_10_exp = 5; /// 87 enum max_exp = 16; /// 88 enum min_10_exp = -5; /// 89 enum min_exp = -14; /// 90 91 private: 92 ushort s = EXPMASK | 1; // .init is HalfFloat.nan 93 } 94 95 /******************** 96 * User defined literal for Half Float. 97 * Example: 98 * --- 99 * auto h = hf!1.3f; 100 * --- 101 */ 102 103 template hf(float v) 104 { 105 enum hf = HalfFloat(v); 106 } 107 108 private: 109 110 // Half float values 111 enum SIGNMASK = 0x8000; 112 enum EXPMASK = 0x7C00; 113 enum MANTMASK = 0x03FF; 114 enum HIDDENBIT = 0x0400; 115 116 // float values 117 enum FSIGNMASK = 0x80000000; 118 enum FEXPMASK = 0x7F800000; 119 enum FMANTMASK = 0x007FFFFF; 120 enum FHIDDENBIT = 0x00800000; 121 122 // Rounding mode 123 enum ROUND { TONEAREST, UPWARD, DOWNWARD, TOZERO }; 124 enum ROUNDMODE = ROUND.TONEAREST; 125 126 ushort floatToShort(float f) 127 { 128 /* If the target CPU has a conversion instruction, this code could be 129 * replaced with inline asm or a compiler intrinsic, but leave this 130 * as the CTFE path so CTFE can work on it. 131 */ 132 133 /* The code currently does not set INEXACT, UNDERFLOW, or OVERFLOW, 134 * but is marked where those would go. 135 */ 136 137 uint s = *cast(uint*)&f; 138 139 ushort u = (s & FSIGNMASK) ? SIGNMASK : 0; 140 int exp = s & FEXPMASK; 141 if (exp == FEXPMASK) // if nan or infinity 142 { 143 if ((s & FMANTMASK) == 0) // if infinity 144 { 145 u |= EXPMASK; 146 } 147 else // else nan 148 { 149 u |= EXPMASK | 1; 150 } 151 return u; 152 } 153 154 uint significand = s & FMANTMASK; 155 156 if (exp == 0) // if subnormal or zero 157 { 158 if (significand == 0) // if zero 159 return u; 160 161 /* A subnormal float is going to give us a zero result anyway, 162 * so just set UNDERFLOW and INEXACT and return +-0. 163 */ 164 return u; 165 } 166 else // else normal 167 { 168 // normalize exponent and remove bias 169 exp = (exp >> 23) - 127; 170 significand |= FHIDDENBIT; 171 } 172 173 exp += 15; // bias the exponent 174 175 bool guard = false; // guard bit 176 bool sticky = false; // sticky bit 177 178 uint shift = 13; // lop off rightmost 13 bits 179 if (exp <= 0) // if subnormal 180 { shift += -exp + 1; // more bits to lop off 181 exp = 0; 182 } 183 if (shift > 23) 184 { 185 // Set UNDERFLOW, INEXACT, return +-0 186 return u; 187 } 188 189 //printf("exp = x%x significand = x%x\n", exp, significand); 190 191 // Lop off rightmost 13 bits, but save guard and sticky bits 192 guard = (significand & (1 << (shift - 1))) != 0; 193 sticky = (significand & ((1 << (shift - 1)) - 1)) != 0; 194 significand >>= shift; 195 196 //printf("guard = %d, sticky = %d\n", guard, sticky); 197 //printf("significand = x%x\n", significand); 198 199 if (guard || sticky) 200 { 201 // Lost some bits, so set INEXACT and round the result 202 switch (ROUNDMODE) 203 { 204 case ROUND.TONEAREST: 205 if (guard && (sticky || (significand & 1))) 206 ++significand; 207 break; 208 209 case ROUND.UPWARD: 210 if (!(s & FSIGNMASK)) 211 ++significand; 212 break; 213 214 case ROUND.DOWNWARD: 215 if (s & FSIGNMASK) 216 ++significand; 217 break; 218 219 case ROUND.TOZERO: 220 break; 221 222 default: 223 assert(0); 224 } 225 if (exp == 0) // if subnormal 226 { 227 if (significand & HIDDENBIT) // and not a subnormal no more 228 ++exp; 229 } 230 else if (significand & (HIDDENBIT << 1)) 231 { 232 significand >>= 1; 233 ++exp; 234 } 235 } 236 237 if (exp > 30) 238 { // Set OVERFLOW and INEXACT, return +-infinity 239 return u | EXPMASK; 240 } 241 242 /* Add exponent and significand into result. 243 */ 244 245 u |= exp << 10; // exponent 246 u |= (significand & ~HIDDENBIT); // significand 247 248 return u; 249 } 250 251 unittest { 252 static struct S { ushort u; float f; } 253 254 static S[] tests = 255 [ 256 { 0x3C00, 1.0f }, 257 { 0x3C01, 1.0009765625f }, 258 { 0xC000, -2.0f }, 259 { 0x7BFF, 65504.0f }, 260 { 0x0400, 6.10352e-5f }, 261 { 0x03FF, 6.09756e-5f }, 262 { 0x0001, 5.9604644775e-8f }, 263 { 0x0000, 0.0f }, 264 { 0x8000, -0.0f }, 265 { 0x7C00, float.infinity }, 266 { 0xFC00, -float.infinity }, 267 { 0x3555, 0.333252f }, 268 { 0x7C01, float.nan }, 269 { 0xFC01, -float.nan }, 270 { 0x0000, 1.0e-8f }, 271 { 0x8000, -1.0e-8f }, 272 { 0x7C00, 1.0e31f }, 273 { 0xFC00, -1.0e31f }, 274 { 0x0000, 1.0e-38 }, // subnormal float 275 { 0x8000, -1.0e-38 }, 276 { 0x6800, 0x1002p-1 }, // guard 277 { 0x6801, 0x1003p-1 }, // guard && sticky 278 { 0x6802, 0x1006p-1 }, // guard && (significand & 1) 279 { 0x6802, 0x1007p-1 }, // guard && sticky && (significand & 1) 280 { 0x0400, 0x1FFFp-27 }, // round up subnormal to normal 281 { 0x0800, 0x3FFFp-27 }, // lose bit, add one to exp 282 //{ , }, 283 ]; 284 285 foreach (i, s; tests) 286 { 287 ushort u = floatToShort(s.f); 288 if (u != s.u) 289 { 290 printf("[%llu] %g %04x expected %04x\n", i, s.f, u, s.u); 291 assert(0); 292 } 293 } 294 } 295 296 float shortToFloat(ushort s) 297 { 298 /* If the target CPU has a conversion instruction, this code could be 299 * replaced with inline asm or a compiler intrinsic, but leave this 300 * as the CTFE path so CTFE can work on it. 301 */ 302 /* This one is fairly easy because there are no possible errors 303 * and no necessary rounding. 304 */ 305 306 int exp = s & EXPMASK; 307 if (exp == EXPMASK) // if nan or infinity 308 { 309 float f; 310 if ((s & MANTMASK) == 0) // if infinity 311 { 312 f = float.infinity; 313 } 314 else // else nan 315 { 316 f = float.nan; 317 } 318 return (s & SIGNMASK) ? -f : f; 319 } 320 321 uint significand = s & MANTMASK; 322 323 if (exp == 0) // if subnormal or zero 324 { 325 if (significand == 0) // if zero 326 return (s & SIGNMASK) ? -0.0f : 0.0f; 327 328 // Normalize by shifting until the hidden bit is 1 329 while (!(significand & HIDDENBIT)) 330 { 331 significand <<= 1; 332 --exp; 333 } 334 significand &= ~HIDDENBIT; // hidden bit is, well, hidden 335 exp -= 14; 336 } 337 else // else normal 338 { 339 // normalize exponent and remove bias 340 exp = (exp >> 10) - 15; 341 } 342 343 /* Assemble sign, exponent, and significand into float. 344 * Don't have to deal with overflow, inexact, or subnormal 345 * because the range of floats is big enough. 346 */ 347 348 assert(-126 <= exp && exp <= 127); // just to be sure 349 350 //printf("exp = %d, significand = x%x\n", exp, significand); 351 352 uint u = (s & SIGNMASK) << 16; // sign bit 353 u |= (exp + 127) << 23; // bias the exponent and shift into position 354 u |= significand << (23 - 10); 355 356 return *cast(float*)&u; 357 } 358 359 unittest { 360 static struct S { ushort u; float f; } 361 362 static S[] tests = 363 [ 364 { 0x3C00, 1.0f }, 365 { 0xC000, -2.0f }, 366 { 0x7BFF, 65504f }, 367 { 0x0000, 0.0f }, 368 { 0x8000, -0.0f }, 369 { 0x7C00, float.infinity}, 370 { 0xFC00, -float.infinity}, 371 //{ , }, 372 ]; 373 374 foreach (i, s; tests) 375 { 376 float f = shortToFloat(s.u); 377 if (f != s.f) 378 { 379 printf("[%llu] %04x %g expected %g\n", i, s.u, f, s.f); 380 assert(0); 381 } 382 } 383 } 384 385 386 version (unittest) import std.stdio; 387 388 unittest { 389 HalfFloat h = hf!27.2f; 390 HalfFloat j = cast(HalfFloat)( hf!3.5f + hf!5 ); 391 HalfFloat f = HalfFloat(0.0f); 392 393 f.s = 0x1400; 394 writeln("1.0009765625 ", 1.0f + cast(float)f); 395 assert(f == HalfFloat.epsilon); 396 397 f.s = 0x0400; 398 writeln("6.10352e-5 ", cast(float)f); 399 assert(f == HalfFloat.min_normal); 400 401 f.s = 0x03FF; 402 writeln("6.09756e-5 ", cast(float)f); 403 404 f.s = 1; 405 writefln("5.96046e-8 %.10e", cast(float)f); 406 407 f.s = 0; 408 writeln("0 ", cast(float)f); 409 assert(f == 0.0f); 410 411 f.s = 0x8000; 412 writeln("-0 ", cast(float)f); 413 assert(f == -0.0f); 414 415 f.s = 0x3555; 416 writeln("0.33325 ", cast(float)f); 417 418 f = HalfFloat.nan(); 419 assert(f.s == 0x7C01); 420 float fl = f; 421 writefln("%x", *cast(uint*)&fl); 422 assert(*cast(uint*)&fl == 0x7FC0_0000); 423 }