std.halffloat source code

1 /**
2  * Implement IEEE 754 half-precision binary floating point format binary16.
3  *
4  * This is a 16 bit type, and consists of a sign bit, a 5 bit exponent, and a
5  * 10 bit significand.
6  * All operations on HalfFloat are CTFE'able.
7  *
8  * References:
9  *	  $(WEB en.wikipedia.org/wiki/Half-precision_floating-point_format, Wikipedia)
10  * Copyright: Copyright Digital Mars 2012-
11  * License:   $(WEB boost.org/LICENSE_1_0.txt, Boost License 1.0)
12  * Authors:   $(WEB digitalmars.com, Walter Bright)
13  * Source:	$(PHOBOSSRC std/_halffloat.d)
14  * Macros:
15  *   WIKI=Phobos/StdHalffloat
16  */
17 
18 module std.halffloat;
19 
/**
 * The half precision floating point type.
 *
 * The only operations are:
 * $(UL
 * $(LI explicit conversion of float to HalfFloat)
 * $(LI implicit conversion of HalfFloat to float)
 * )
 * It operates in an analogous manner to shorts, which are converted to ints
 * before performing any operations, and explicitly cast back to shorts.
 * The half float is considered essentially a storage type, not a computation type.
 * Example:
 * ---
 * HalfFloat h = hf!27.2f;
 * HalfFloat j = cast(HalfFloat)( hf!3.5f + hf!5 );
 * HalfFloat f = HalfFloat(0.0f);
 * ---
 * Bugs:
 *	  The only rounding mode currently supported is Round To Nearest.
 *	  The exceptions OVERFLOW, UNDERFLOW and INEXACT are not thrown.
 */

struct HalfFloat {

	/* Provide implicit conversion of HalfFloat to float.
	 * The getter is const so that const and immutable HalfFloats
	 * convert as well; it only reads the stored bit pattern.
	 */

	@property float toFloat() const { return shortToFloat(s); }
	alias toFloat this;

	unittest
	{
		const HalfFloat one = HalfFloat(1.0f);
		float x = one;
		assert(x == 1.0f);
	}

	/* Done as a template in order to prevent implicit conversion
	 * of argument to float.
	 */

	this(T : float)(T f)
	{
		static assert(is(T == float));
		s = floatToShort(f);
	}

	/* These are done as properties to avoid
	 * circular reference problems.
	 */

	/// Smallest positive normalized value, 2^-14 (bit pattern 0x0400).
	static @property HalfFloat min_normal() { return fromBits(0x0400); }
	unittest { assert(min_normal == hf!0x1p-14); }

	/// Largest finite value, 65504 (bit pattern 0x7BFF).
	static @property HalfFloat max()		{ return fromBits(0x7BFF); }
	unittest { assert(max == hf!0x1.FFCp+15); }

	/// A quiet-looking nan: all exponent bits set, significand 1.
	static @property HalfFloat nan()		{ return fromBits(EXPMASK | 1); }
	// unittest { assert(nan != hf!(float.nan)); }

	/// Positive infinity: all exponent bits set, significand 0.
	static @property HalfFloat infinity()   { return fromBits(EXPMASK); }
	unittest { assert(infinity == hf!(float.infinity)); }

	/// Difference between 1 and the next representable value, 2^-10.
	static @property HalfFloat epsilon()	{ return fromBits(0x1400); }
	unittest { assert(epsilon == hf!0x1p-10); }

	/* The numeric characteristics below follow the same conventions as the
	 * corresponding properties of the built-in floating point types:
	 * 10^max_10_exp is representable and 10^min_10_exp is representable as
	 * a normalized value; 2^(max_exp-1) is representable and 2^(min_exp-1)
	 * is min_normal.
	 */
	enum dig =		3;		/// decimal digits of precision
	enum mant_dig =   11;	   /// bits in the significand, counting the hidden bit
	enum max_10_exp = 4;		/// 1e4 <= max (65504) < 1e5
	enum max_exp =	16;	   /// max < 2^16
	enum min_10_exp = -4;	   /// 1e-5 < min_normal (about 6.1e-5) <= 1e-4
	enum min_exp =	-13;	  /// min_normal == 2^(min_exp - 1) == 2^-14

	unittest
	{
		// Cross-check the decimal exponent limits against max and min_normal
		assert(1e4f <= max && max < 1e5f);
		assert(1e-5f < min_normal && min_normal <= 1e-4f);
	}

private:
	ushort s = EXPMASK | 1;	 // .init is HalfFloat.nan

	/* Build a HalfFloat directly from its 16 bit representation,
	 * bypassing the float conversion done by the constructor.
	 */
	static HalfFloat fromBits(ushort bits)
	{
		HalfFloat result = void;
		result.s = bits;
		return result;
	}
}
94 
/********************
 * User defined literal for Half Float.
 *
 * hf!v is a manifest constant holding HalfFloat(v), so the conversion
 * from float is carried out during compilation.
 * Example:
 * ---
 * auto h = hf!1.3f;
 * ---
 */

template hf(float v)
{
	enum HalfFloat hf = HalfFloat(v);
}
107 
private:

// Field masks for the 16 bit half float representation
enum SIGNMASK  = 0x8000;		// sign bit
enum EXPMASK   = 0x7C00;		// 5 exponent bits
enum MANTMASK  = 0x03FF;		// 10 stored significand bits
enum HIDDENBIT = 0x0400;		// implicit leading 1, just above the stored significand

// Field masks for the 32 bit float representation
enum FSIGNMASK  = 0x8000_0000;	// sign bit
enum FEXPMASK   = 0x7F80_0000;	// 8 exponent bits
enum FMANTMASK  = 0x007F_FFFF;	// 23 stored significand bits
enum FHIDDENBIT = 0x0080_0000;	// implicit leading 1, just above the stored significand

// Rounding modes understood by floatToShort, and the one compiled in
enum ROUND { TONEAREST, UPWARD, DOWNWARD, TOZERO }
enum ROUNDMODE = ROUND.TONEAREST;
125 
/* Convert a float to the bit pattern of a half float.
 *
 * Params:
 *	f = value to convert
 * Returns:
 *	the 16 bit representation: sign in bit 15, exponent in bits 10..14,
 *	significand in bits 0..9. Nans map to exponent all ones with
 *	significand 1, magnitudes too large for a half float map to infinity,
 *	and magnitudes that round to nothing map to a signed zero. Rounding
 *	follows ROUNDMODE.
 */
ushort floatToShort(float f)
{
	/* If the target CPU has a conversion instruction, this code could be
	 * replaced with inline asm or a compiler intrinsic, but leave this
	 * as the CTFE path so CTFE can work on it.
	 */

	/* The code currently does not set INEXACT, UNDERFLOW, or OVERFLOW,
	 * but is marked where those would go.
	 */

	uint s = *cast(uint*)&f;			// raw bits of the float

	ushort u = (s & FSIGNMASK) ? SIGNMASK : 0;	// result starts out as just the sign
	int exp = s & FEXPMASK;
	if (exp == FEXPMASK)  // if nan or infinity
	{
		if ((s & FMANTMASK) == 0)	   // if infinity
		{
			u |= EXPMASK;
		}
		else							// else nan
		{
			u |= EXPMASK | 1;
		}
		return u;
	}

	uint significand = s & FMANTMASK;

	if (exp == 0)					   // if subnormal or zero
	{
		if (significand == 0)		   // if zero
			return u;

		/* A subnormal float is going to give us a zero result anyway,
		 * so just set UNDERFLOW and INEXACT and return +-0.
		 */
		return u;
	}
	else								// else normal
	{
		// normalize exponent and remove bias
		exp = (exp >> 23) - 127;
		significand |= FHIDDENBIT;
	}

	exp += 15;						  // bias the exponent

	bool guard = false;				 // guard bit
	bool sticky = false;				// sticky bit

	uint shift = 13;					// lop off rightmost 13 bits
	if (exp <= 0)					   // if subnormal
	{   shift += -exp + 1;			  // more bits to lop off
		exp = 0;
	}

	/* The significand is 24 bits wide including the hidden bit. With a
	 * shift of exactly 24 the hidden bit lands in the guard position, so
	 * the value lies in [0.5, 1) units of the smallest subnormal and must
	 * still go through rounding: anything above the halfway point rounds
	 * up to 0x0001. Only beyond that is the value below the halfway point
	 * and certain to round to zero.
	 */
	if (shift > 24)
	{
		// Set UNDERFLOW, INEXACT, return +-0
		return u;
	}

//printf("exp = x%x significand = x%x\n", exp, significand);

	// Lop off rightmost bits, but save guard and sticky bits
	guard = (significand & (1 << (shift - 1))) != 0;		// highest bit lost
	sticky = (significand & ((1 << (shift - 1)) - 1)) != 0; // any lower bit lost
	significand >>= shift;

//printf("guard = %d, sticky = %d\n", guard, sticky);
//printf("significand = x%x\n", significand);

	if (guard || sticky)
	{
		// Lost some bits, so set INEXACT and round the result
		switch (ROUNDMODE)
		{
		case ROUND.TONEAREST:
			// above halfway, or exactly halfway and currently odd (ties to even)
			if (guard && (sticky || (significand & 1)))
				++significand;
			break;

		case ROUND.UPWARD:
			if (!(s & FSIGNMASK))
				++significand;
			break;

		case ROUND.DOWNWARD:
			if (s & FSIGNMASK)
				++significand;
			break;

		case ROUND.TOZERO:
			break;

		default:
			assert(0);
		}
		if (exp == 0)						   // if subnormal
		{
			if (significand & HIDDENBIT)		// and not a subnormal no more
				++exp;
		}
		else if (significand & (HIDDENBIT << 1))
		{
			// rounding carried out of the significand; renormalize
			significand >>= 1;
			++exp;
		}
	}

	if (exp > 30)
	{   // Set OVERFLOW and INEXACT, return +-infinity
		return u | EXPMASK;
	}

	/* Add exponent and significand into result.
	 */

	u |= exp << 10;							 // exponent
	u |= (significand & ~HIDDENBIT);			// significand

	return u;
}
250 
// Table driven check of floatToShort: each entry pairs a float with the
// half float bit pattern it is expected to convert to.
unittest {
	static struct S { ushort u; float f; }

	static S[] tests =
	[
		{ 0x3C00,  1.0f },
		{ 0x3C01,  1.0009765625f },
		{ 0xC000, -2.0f },
		{ 0x7BFF,  65504.0f },
		{ 0x0400,  6.10352e-5f },
		{ 0x03FF,  6.09756e-5f },
		{ 0x0001,  5.9604644775e-8f },
		{ 0x0000,  0.0f },
		{ 0x8000, -0.0f },
		{ 0x7C00,  float.infinity },
		{ 0xFC00, -float.infinity },
		{ 0x3555,  0.333252f },
		{ 0x7C01,  float.nan },
		{ 0xFC01, -float.nan },
		{ 0x0000,  1.0e-8f },
		{ 0x8000, -1.0e-8f },
		{ 0x7C00,  1.0e31f },
		{ 0xFC00, -1.0e31f },
		{ 0x0000,  1.0e-38 },   // subnormal float
		{ 0x8000, -1.0e-38 },
		{ 0x6800,  0x1002p-1 }, // guard
		{ 0x6801,  0x1003p-1 }, // guard && sticky
		{ 0x6802,  0x1006p-1 }, // guard && (significand & 1)
		{ 0x6802,  0x1007p-1 }, // guard && sticky && (significand & 1)
		{ 0x0400,  0x1FFFp-27 }, // round up subnormal to normal
		{ 0x0800,  0x3FFFp-27 }, // lose bit, add one to exp
		//{ , },
		];

	foreach (i, s; tests)
	{
		ushort u = floatToShort(s.f);
		if (u != s.u)
		{
			// i is a size_t, which is narrower than %llu expects on
			// 32 bit targets, so widen it explicitly
			printf("[%llu] %g %04x expected %04x\n", cast(ulong)i, s.f, u, s.u);
			assert(0);
		}
	}
}
295 
/* Expand a half float bit pattern into the float with the same value.
 *
 * Params:
 *	s = 16 bit representation: sign in bit 15, exponent in bits 10..14,
 *		significand in bits 0..9
 * Returns:
 *	the corresponding float; infinities and zeros keep their sign, and
 *	any nan pattern yields float.nan (negated when the sign bit is set)
 */
float shortToFloat(ushort s)
{
	/* If the target CPU has a conversion instruction, this code could be
	 * replaced with inline asm or a compiler intrinsic, but leave this
	 * as the CTFE path so CTFE can work on it.
	 */
	/* No rounding and no error cases here: every half float value
	 * fits in a float.
	 */

	const bool negative = (s & SIGNMASK) != 0;
	const int expField = s & EXPMASK;		   // exponent bits, still in place
	uint fraction = s & MANTMASK;

	// Exponent all ones: infinity or nan
	if (expField == EXPMASK)
	{
		float special = (fraction == 0) ? float.infinity : float.nan;
		return negative ? -special : special;
	}

	int unbiased;							   // true power of two
	if (expField != 0)
	{
		// normal number: shift the field down and remove the bias
		unbiased = (expField >> 10) - 15;
	}
	else
	{
		// Exponent all zeros: zero or subnormal
		if (fraction == 0)
			return negative ? -0.0f : 0.0f;

		// Shift left until the hidden bit shows up, lowering the
		// exponent by one for each position moved
		unbiased = -14;
		while (!(fraction & HIDDENBIT))
		{
			fraction <<= 1;
			--unbiased;
		}
		fraction &= ~HIDDENBIT;				 // float keeps the leading 1 implicit
	}

	/* Pack sign, exponent, and significand into the float layout.
	 * Overflow, inexact, and subnormal results cannot happen
	 * because the range of floats is big enough.
	 */

	assert(-126 <= unbiased && unbiased <= 127);  // just to be sure

	//printf("exp = %d, significand = x%x\n", unbiased, fraction);

	uint bits = negative ? FSIGNMASK : 0;	   // sign bit
	bits |= (unbiased + 127) << 23;			 // rebias for float and move into place
	bits |= fraction << (23 - 10);			  // left-align 10 bits in the 23 bit field

	return *cast(float*)&bits;
}
358 
// Table driven check of shortToFloat: each entry pairs a half float bit
// pattern with the float it is expected to expand to.
unittest {
	static struct S { ushort u; float f; }

	static S[] tests =
	[
		{ 0x3C00,  1.0f },
		{ 0xC000, -2.0f },
		{ 0x7BFF,  65504f },
		{ 0x0000,  0.0f },
		{ 0x8000, -0.0f },
		{ 0x7C00,  float.infinity},
		{ 0xFC00,  -float.infinity},
		//{ , },
		];

	foreach (i, s; tests)
	{
		float f = shortToFloat(s.u);
		if (f != s.f)
		{
			// i is a size_t, which is narrower than %llu expects on
			// 32 bit targets, so widen it explicitly
			printf("[%llu] %04x %g expected %g\n", cast(ulong)i, s.u, f, s.f);
			assert(0);
		}
	}
}
384 
385 
386 version (unittest) import std.stdio;
387 
// Exercises HalfFloat end to end: literals, construction, and conversion
// back to float for a series of hand-picked bit patterns. Each line
// printed starts with the expected value followed by the converted one.
unittest {
	// Construction through the hf literal, a cast, and the constructor
	HalfFloat h = hf!27.2f;
	HalfFloat j = cast(HalfFloat)( hf!3.5f + hf!5 );
	HalfFloat f = HalfFloat(0.0f);

	// epsilon
	f.s = 0x1400;
	writeln("1.0009765625 ", 1.0f + f.toFloat);
	assert(f == HalfFloat.epsilon);

	// smallest normal
	f.s = 0x0400;
	writeln("6.10352e-5 ", f.toFloat);
	assert(f == HalfFloat.min_normal);

	// largest subnormal
	f.s = 0x03FF;
	writeln("6.09756e-5 ", f.toFloat);

	// smallest subnormal
	f.s = 1;
	writefln("5.96046e-8 %.10e", f.toFloat);

	// positive and negative zero
	f.s = 0;
	writeln("0 ", f.toFloat);
	assert(f == 0.0f);

	f.s = 0x8000;
	writeln("-0 ", f.toFloat);
	assert(f == -0.0f);

	f.s = 0x3555;
	writeln("0.33325 ", f.toFloat);

	// nan must have the expected half pattern and expand to the float nan pattern
	f = HalfFloat.nan;
	assert(f.s == 0x7C01);
	float fl = f;
	uint flBits = *cast(uint*)&fl;
	writefln("%x", flBits);
	assert(flBits == 0x7FC0_0000);
}