std.halffloat source code

1 /**
2  * Implement IEEE 754 half-precision binary floating point format binary16.
3  *
4  * This a 16 bit type, and consists of a sign bit, a 5 bit exponent, and a
5  * 10 bit significand.
6  * All operations on HalfFloat are CTFE'able.
7  *
8  * References:
9  *      $(WEB en.wikipedia.org/wiki/Half-precision_floating-point_format, Wikipedia)
10  * Copyright: Copyright Digital Mars 2012-
11  * License:   $(WEB boost.org/LICENSE_1_0.txt, Boost License 1.0)
12  * Authors:   $(WEB digitalmars.com, Walter Bright)
13  * Source:    $(PHOBOSSRC std/_halffloat.d)
14  * Macros:
15  *   WIKI=Phobos/StdHalffloat
16  */
17 
18 module std.halffloat;
19 
20 /**
21  * The half precision floating point type.
22  *
23  * The only operations are:
24  * $(UL
25  * $(LI explicit conversion of float to HalfFloat)
26  * $(LI implicit conversion of HalfFloat to float)
27  * )
28  * It operates in an analogous manner to shorts, which are converted to ints
29  * before performing any operations, and explicitly cast back to shorts.
30  * The half float is considered essentially a storage type, not a computation type.
31  * Example:
32  * ---
33  HalfFloat h = hf!27.2f;
34  HalfFloat j = cast(HalfFloat)( hf!3.5f + hf!5 );
35  HalfFloat f = HalfFloat(0.0f);
36  * ---
37  * Bugs:
38  *      The only rounding mode currently supported is Round To Nearest.
39  *      The exceptions OVERFLOW, UNDERFLOW and INEXACT are not thrown.
40  */
41 
42 struct HalfFloat {
43 
44     /* Provide implicit conversion of HalfFloat to float
45      */
46 
47     @property float toFloat() { return shortToFloat(s); }
48     alias toFloat this;
49 
50     /* Done as a template in order to prevent implicit conversion
51      * of argument to float.
52      */
53 
54     this(T : float)(T f)
55     {
56         static assert(is(T == float));
57         s = floatToShort(f);
58     }
59 
60     /* These are done as properties to avoid
61      * circular reference problems.
62      */
63 
64     ///
65     static @property HalfFloat min_normal() { HalfFloat hf = void; hf.s = 0x0400; return hf; }
66     unittest { assert(min_normal == hf!0x1p-14); }
67 
68     ///
69     static @property HalfFloat max()        { HalfFloat hf = void; hf.s = 0x7BFF; return hf; }
70     unittest { assert(max == hf!0x1.FFCp+15); }
71 
72     ///
73     static @property HalfFloat nan()        { HalfFloat hf = void; hf.s = EXPMASK | 1; return hf; }
74     // unittest { assert(nan != hf!(float.nan)); }
75 
76     ///
77     static @property HalfFloat infinity()   { HalfFloat hf = void; hf.s = EXPMASK; return hf; }
78     unittest { assert(infinity == hf!(float.infinity)); }
79 
80     ///
81     static @property HalfFloat epsilon()    { HalfFloat hf = void; hf.s = 0x1400; return hf; }
82     unittest { assert(epsilon == hf!0x1p-10); }
83 
84     enum dig =        3;        ///
85     enum mant_dig =   11;       ///
86     enum max_10_exp = 5;        ///
87     enum max_exp =    16;       ///
88     enum min_10_exp = -5;       ///
89     enum min_exp =    -14;      ///
90 
91 private:
92     ushort s = EXPMASK | 1;     // .init is HalfFloat.nan
93 }
94 
95 /********************
96  * User defined literal for Half Float.
97  * Example:
98  * ---
99  * auto h = hf!1.3f;
100  * ---
101  */
102 
103 template hf(float v)
104 {
105     enum hf = HalfFloat(v);
106 }
107 
108 private:
109 
110 // Half float values
111 enum SIGNMASK  = 0x8000;
112 enum EXPMASK   = 0x7C00;
113 enum MANTMASK  = 0x03FF;
114 enum HIDDENBIT = 0x0400;
115 
116 // float values
117 enum FSIGNMASK  = 0x80000000;
118 enum FEXPMASK   = 0x7F800000;
119 enum FMANTMASK  = 0x007FFFFF;
120 enum FHIDDENBIT = 0x00800000;
121 
122 // Rounding mode
123 enum ROUND { TONEAREST, UPWARD, DOWNWARD, TOZERO };
124 enum ROUNDMODE = ROUND.TONEAREST;
125 
126 ushort floatToShort(float f)
127 {
128     /* If the target CPU has a conversion instruction, this code could be
129      * replaced with inline asm or a compiler intrinsic, but leave this
130      * as the CTFE path so CTFE can work on it.
131      */
132 
133     /* The code currently does not set INEXACT, UNDERFLOW, or OVERFLOW,
134      * but is marked where those would go.
135      */
136 
137     uint s = *cast(uint*)&f;
138 
139     ushort u = (s & FSIGNMASK) ? SIGNMASK : 0;
140     int exp = s & FEXPMASK;
141     if (exp == FEXPMASK)  // if nan or infinity
142     {
143         if ((s & FMANTMASK) == 0)       // if infinity
144         {
145             u |= EXPMASK;
146         }
147         else                            // else nan
148         {
149             u |= EXPMASK | 1;
150         }
151         return u;
152     }
153 
154     uint significand = s & FMANTMASK;
155 
156     if (exp == 0)                       // if subnormal or zero
157     {
158         if (significand == 0)           // if zero
159             return u;
160 
161         /* A subnormal float is going to give us a zero result anyway,
162          * so just set UNDERFLOW and INEXACT and return +-0.
163          */
164         return u;
165     }
166     else                                // else normal
167     {
168         // normalize exponent and remove bias
169         exp = (exp >> 23) - 127;
170         significand |= FHIDDENBIT;
171     }
172 
173     exp += 15;                          // bias the exponent
174 
175     bool guard = false;                 // guard bit
176     bool sticky = false;                // sticky bit
177 
178     uint shift = 13;                    // lop off rightmost 13 bits
179     if (exp <= 0)                       // if subnormal
180     {   shift += -exp + 1;              // more bits to lop off
181         exp = 0;
182     }
183     if (shift > 23)
184     {
185         // Set UNDERFLOW, INEXACT, return +-0
186         return u;
187     }
188 
189 //printf("exp = x%x significand = x%x\n", exp, significand);
190 
191     // Lop off rightmost 13 bits, but save guard and sticky bits
192     guard = (significand & (1 << (shift - 1))) != 0;
193     sticky = (significand & ((1 << (shift - 1)) - 1)) != 0;
194     significand >>= shift;
195 
196 //printf("guard = %d, sticky = %d\n", guard, sticky);
197 //printf("significand = x%x\n", significand);
198 
199     if (guard || sticky)
200     {
201         // Lost some bits, so set INEXACT and round the result
202         switch (ROUNDMODE)
203         {
204         case ROUND.TONEAREST:
205             if (guard && (sticky || (significand & 1)))
206                 ++significand;
207             break;
208 
209         case ROUND.UPWARD:
210             if (!(s & FSIGNMASK))
211                 ++significand;
212             break;
213 
214         case ROUND.DOWNWARD:
215             if (s & FSIGNMASK)
216                 ++significand;
217             break;
218 
219         case ROUND.TOZERO:
220             break;
221 
222         default:
223             assert(0);
224         }
225         if (exp == 0)                           // if subnormal
226         {
227             if (significand & HIDDENBIT)        // and not a subnormal no more
228                 ++exp;
229         }
230         else if (significand & (HIDDENBIT << 1))
231         {
232             significand >>= 1;
233             ++exp;
234         }
235     }
236 
237     if (exp > 30)
238     {   // Set OVERFLOW and INEXACT, return +-infinity
239         return u | EXPMASK;
240     }
241 
242     /* Add exponent and significand into result.
243      */
244 
245     u |= exp << 10;                             // exponent
246     u |= (significand & ~HIDDENBIT);            // significand
247 
248     return u;
249 }
250 
251 unittest
252 {
253     static struct S { ushort u; float f; }
254 
255     static S[] tests =
256     [
257         { 0x3C00,  1.0f },
258         { 0x3C01,  1.0009765625f },
259         { 0xC000, -2.0f },
260         { 0x7BFF,  65504.0f },
261         { 0x0400,  6.10352e-5f },
262         { 0x03FF,  6.09756e-5f },
263         { 0x0001,  5.9604644775e-8f },
264         { 0x0000,  0.0f },
265         { 0x8000, -0.0f },
266         { 0x7C00,  float.infinity },
267         { 0xFC00, -float.infinity },
268         { 0x3555,  0.333252f },
269         { 0x7C01,  float.nan },
270         { 0xFC01, -float.nan },
271         { 0x0000,  1.0e-8f },
272         { 0x8000, -1.0e-8f },
273         { 0x7C00,  1.0e31f },
274         { 0xFC00, -1.0e31f },
275         { 0x0000,  1.0e-38 },   // subnormal float
276         { 0x8000, -1.0e-38 },
277         { 0x6800,  0x1002p-1 }, // guard
278         { 0x6801,  0x1003p-1 }, // guard && sticky
279         { 0x6802,  0x1006p-1 }, // guard && (significand & 1)
280         { 0x6802,  0x1007p-1 }, // guard && sticky && (significand & 1)
281         { 0x0400,  0x1FFFp-27 }, // round up subnormal to normal
282         { 0x0800,  0x3FFFp-27 }, // lose bit, add one to exp
283         //{ , },
284         ];
285 
286     foreach (i, s; tests)
287     {
288         ushort u = floatToShort(s.f);
289         if (u != s.u)
290         {
291             printf("[%llu] %g %04x expected %04x\n", i, s.f, u, s.u);
292             assert(0);
293         }
294     }
295 }
296 
297 float shortToFloat(ushort s)
298 {
299     /* If the target CPU has a conversion instruction, this code could be
300      * replaced with inline asm or a compiler intrinsic, but leave this
301      * as the CTFE path so CTFE can work on it.
302      */
303     /* This one is fairly easy because there are no possible errors
304      * and no necessary rounding.
305      */
306 
307     int exp = s & EXPMASK;
308     if (exp == EXPMASK)  // if nan or infinity
309     {
310         float f;
311         if ((s & MANTMASK) == 0)        // if infinity
312         {
313             f = float.infinity;
314         }
315         else                            // else nan
316         {
317             f = float.nan;
318         }
319         return (s & SIGNMASK) ? -f : f;
320     }
321 
322     uint significand = s & MANTMASK;
323 
324     if (exp == 0)                       // if subnormal or zero
325     {
326         if (significand == 0)           // if zero
327             return (s & SIGNMASK) ? -0.0f : 0.0f;
328 
329         // Normalize by shifting until the hidden bit is 1
330         while (!(significand & HIDDENBIT))
331         {
332             significand <<= 1;
333             --exp;
334         }
335         significand &= ~HIDDENBIT;      // hidden bit is, well, hidden
336         exp -= 14;
337     }
338     else                                // else normal
339     {
340         // normalize exponent and remove bias
341         exp = (exp >> 10) - 15;
342     }
343 
344     /* Assemble sign, exponent, and significand into float.
345      * Don't have to deal with overflow, inexact, or subnormal
346      * because the range of floats is big enough.
347      */
348 
349     assert(-126 <= exp && exp <= 127);  // just to be sure
350 
351     //printf("exp = %d, significand = x%x\n", exp, significand);
352 
353     uint u = (s & SIGNMASK) << 16;      // sign bit
354     u |= (exp + 127) << 23;             // bias the exponent and shift into position
355     u |= significand << (23 - 10);
356 
357     return *cast(float*)&u;
358 }
359 
360 unittest
361 {
362     static struct S { ushort u; float f; }
363 
364     static S[] tests =
365     [
366         { 0x3C00,  1.0f },
367         { 0xC000, -2.0f },
368         { 0x7BFF,  65504f },
369         { 0x0000,  0.0f },
370         { 0x8000, -0.0f },
371         { 0x7C00,  float.infinity},
372         { 0xFC00,  -float.infinity},
373         //{ , },
374         ];
375 
376     foreach (i, s; tests)
377     {
378         float f = shortToFloat(s.u);
379         if (f != s.f)
380         {
381             printf("[%llu] %04x %g expected %g\n", i, s.u, f, s.f);
382             assert(0);
383         }
384     }
385 }
386 
387 
388 version (unittest) import std.stdio;
389 
390 unittest
391 {
392     HalfFloat h = hf!27.2f;
393     HalfFloat j = cast(HalfFloat)( hf!3.5f + hf!5 );
394     HalfFloat f = HalfFloat(0.0f);
395 
396     f.s = 0x1400;
397     writeln("1.0009765625 ", 1.0f + cast(float)f);
398     assert(f == HalfFloat.epsilon);
399 
400     f.s = 0x0400;
401     writeln("6.10352e-5 ", cast(float)f);
402     assert(f == HalfFloat.min_normal);
403 
404     f.s = 0x03FF;
405     writeln("6.09756e-5 ", cast(float)f);
406 
407     f.s = 1;
408     writefln("5.96046e-8 %.10e", cast(float)f);
409 
410     f.s = 0;
411     writeln("0 ", cast(float)f);
412     assert(f == 0.0f);
413 
414     f.s = 0x8000;
415     writeln("-0 ", cast(float)f);
416     assert(f == -0.0f);
417 
418     f.s = 0x3555;
419     writeln("0.33325 ", cast(float)f);
420 
421     f = HalfFloat.nan();
422     assert(f.s == 0x7C01);
423     float fl = f;
424     writefln("%x", *cast(uint*)&fl);
425     assert(*cast(uint*)&fl == 0x7FC0_0000);
426 }