Annotation of sys/lib/libkern/softfloat.c, Revision 1.1.1.1
1.1 nbrk 1: /* $OpenBSD: softfloat.c,v 1.1 2002/04/28 20:55:14 pvalchev Exp $ */
2: /* $NetBSD: softfloat.c,v 1.1 2001/04/26 03:10:47 ross Exp $ */
3:
4: /*
5: * This version hacked for use with gcc -msoft-float by bjh21.
6: * (Mostly a case of #ifdefing out things GCC doesn't need or provides
7: * itself).
8: */
9:
10: /*
11: * Things you may want to define:
12: *
13: * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
14: * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
15: * properly renamed.
16: */
17:
18: /*
19: ===============================================================================
20:
21: This C source file is part of the SoftFloat IEC/IEEE Floating-point
22: Arithmetic Package, Release 2a.
23:
24: Written by John R. Hauser. This work was made possible in part by the
25: International Computer Science Institute, located at Suite 600, 1947 Center
26: Street, Berkeley, California 94704. Funding was partially provided by the
27: National Science Foundation under grant MIP-9311980. The original version
28: of this code was written as part of a project to build a fixed-point vector
29: processor in collaboration with the University of California at Berkeley,
30: overseen by Profs. Nelson Morgan and John Wawrzynek. More information
31: is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
32: arithmetic/SoftFloat.html'.
33:
34: THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable
35: effort has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT
36: WILL AT TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS
37: RESTRICTED TO PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL
38: RESPONSIBILITY FOR ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM
39: THEIR OWN USE OF THE SOFTWARE, AND WHO ALSO EFFECTIVELY INDEMNIFY
40: (possibly via similar legal warning) JOHN HAUSER AND THE INTERNATIONAL
41: COMPUTER SCIENCE INSTITUTE AGAINST ALL LOSSES, COSTS, OR OTHER PROBLEMS
42: ARISING FROM THE USE OF THE SOFTWARE BY THEIR CUSTOMERS AND CLIENTS.
43:
44: Derivative works are acceptable, even for commercial purposes, so long as
45: (1) they include prominent notice that the work is derivative, and (2) they
46: include prominent notice akin to these four paragraphs for those parts of
47: this code that are retained.
48:
49: ===============================================================================
50: */
51:
52: #ifndef NO_IEEE
53:
54: #include <sys/cdefs.h>
55: #if defined(LIBC_SCCS) && !defined(lint)
56: __RCSID("$NetBSD: softfloat.c,v 1.1 2001/04/26 03:10:47 ross Exp $");
57: #endif /* LIBC_SCCS and not lint */
58:
59: #ifdef SOFTFLOAT_FOR_GCC
60: #include "softfloat-for-gcc.h"
61: #endif
62:
63: #include "milieu.h"
64: #include "softfloat.h"
65:
66: /*
67: * Conversions between floats as stored in memory and floats as
68: * SoftFloat uses them
69: */
70: #ifndef FLOAT64_DEMANGLE
71: #define FLOAT64_DEMANGLE(a) (a)
72: #endif
73: #ifndef FLOAT64_MANGLE
74: #define FLOAT64_MANGLE(a) (a)
75: #endif
76:
77: /*
78: -------------------------------------------------------------------------------
79: Floating-point rounding mode, extended double-precision rounding precision,
80: and exception flags.
81: -------------------------------------------------------------------------------
82: */
83:
84: /*
85: * XXX: This may cause options-MULTIPROCESSOR or thread problems someday.
86: * Right now, it does not. I've removed all other dynamic global
87: * variables. [ross]
88: */
89: #ifdef FLOATX80
90: int8 floatx80_rounding_precision = 80;
91: #endif
92:
93: /*
94: -------------------------------------------------------------------------------
95: Primitive arithmetic functions, including multi-word arithmetic, and
96: division and square root approximations. (Can be specialized to target if
97: desired.)
98: -------------------------------------------------------------------------------
99: */
100: #include "softfloat-macros.h"
101:
102: /*
103: -------------------------------------------------------------------------------
104: Functions and definitions to determine: (1) whether tininess for underflow
105: is detected before or after rounding by default, (2) what (if anything)
106: happens when exceptions are raised, (3) how signaling NaNs are distinguished
107: from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
108: are propagated from function inputs to output. These details are target-
109: specific.
110: -------------------------------------------------------------------------------
111: */
112: #include "softfloat-specialize.h"
113:
114: #ifndef SOFTFLOAT_FOR_GCC /* Not used */
115: /*
116: -------------------------------------------------------------------------------
117: Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
118: and 7, and returns the properly rounded 32-bit integer corresponding to the
119: input. If `zSign' is 1, the input is negated before being converted to an
120: integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
121: is simply rounded to an integer, with the inexact exception raised if the
122: input cannot be represented exactly as an integer. However, if the fixed-
123: point input is too large, the invalid exception is raised and the largest
124: positive or negative integer is returned.
125: -------------------------------------------------------------------------------
126: */
127: static int32 roundAndPackInt32( flag zSign, bits64 absZ )
128: {
129: int8 roundingMode;
130: flag roundNearestEven;
131: int8 roundIncrement, roundBits;
132: int32 z;
133:
134: roundingMode = float_rounding_mode();
135: roundNearestEven = ( roundingMode == float_round_nearest_even );
136: roundIncrement = 0x40;
137: if ( ! roundNearestEven ) {
138: if ( roundingMode == float_round_to_zero ) {
139: roundIncrement = 0;
140: }
141: else {
142: roundIncrement = 0x7F;
143: if ( zSign ) {
144: if ( roundingMode == float_round_up ) roundIncrement = 0;
145: }
146: else {
147: if ( roundingMode == float_round_down ) roundIncrement = 0;
148: }
149: }
150: }
151: roundBits = absZ & 0x7F;
152: absZ = ( absZ + roundIncrement )>>7;
153: absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
154: z = absZ;
155: if ( zSign ) z = - z;
156: if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
157: float_raise( float_flag_invalid );
158: return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
159: }
160: if ( roundBits ) float_set_inexact();
161: return z;
162:
163: }
164:
165: /*
166: -------------------------------------------------------------------------------
167: Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
168: `absZ1', with binary point between bits 63 and 64 (between the input words),
169: and returns the properly rounded 64-bit integer corresponding to the input.
170: If `zSign' is 1, the input is negated before being converted to an integer.
171: Ordinarily, the fixed-point input is simply rounded to an integer, with
172: the inexact exception raised if the input cannot be represented exactly as
173: an integer. However, if the fixed-point input is too large, the invalid
174: exception is raised and the largest positive or negative integer is
175: returned.
176: -------------------------------------------------------------------------------
177: */
178: static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
179: {
180: int8 roundingMode;
181: flag roundNearestEven, increment;
182: int64 z;
183:
184: roundingMode = float_rounding_mode();
185: roundNearestEven = ( roundingMode == float_round_nearest_even );
186: increment = ( (sbits64) absZ1 < 0 );
187: if ( ! roundNearestEven ) {
188: if ( roundingMode == float_round_to_zero ) {
189: increment = 0;
190: }
191: else {
192: if ( zSign ) {
193: increment = ( roundingMode == float_round_down ) && absZ1;
194: }
195: else {
196: increment = ( roundingMode == float_round_up ) && absZ1;
197: }
198: }
199: }
200: if ( increment ) {
201: ++absZ0;
202: if ( absZ0 == 0 ) goto overflow;
203: absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
204: }
205: z = absZ0;
206: if ( zSign ) z = - z;
207: if ( z && ( ( z < 0 ) ^ zSign ) ) {
208: overflow:
209: float_raise( float_flag_invalid );
210: return
211: zSign ? (sbits64) LIT64( 0x8000000000000000 )
212: : LIT64( 0x7FFFFFFFFFFFFFFF );
213: }
214: if ( absZ1 ) float_set_inexact();
215: return z;
216:
217: }
218: #endif
219:
220: /*
221: -------------------------------------------------------------------------------
222: Returns the fraction bits of the single-precision floating-point value `a'.
223: -------------------------------------------------------------------------------
224: */
225: INLINE bits32 extractFloat32Frac( float32 a )
226: {
227:
228: return a & 0x007FFFFF;
229:
230: }
231:
232: /*
233: -------------------------------------------------------------------------------
234: Returns the exponent bits of the single-precision floating-point value `a'.
235: -------------------------------------------------------------------------------
236: */
237: INLINE int16 extractFloat32Exp( float32 a )
238: {
239:
240: return ( a>>23 ) & 0xFF;
241:
242: }
243:
244: /*
245: -------------------------------------------------------------------------------
246: Returns the sign bit of the single-precision floating-point value `a'.
247: -------------------------------------------------------------------------------
248: */
249: INLINE flag extractFloat32Sign( float32 a )
250: {
251:
252: return a>>31;
253:
254: }
255:
256: /*
257: -------------------------------------------------------------------------------
258: Normalizes the subnormal single-precision floating-point value represented
259: by the denormalized significand `aSig'. The normalized exponent and
260: significand are stored at the locations pointed to by `zExpPtr' and
261: `zSigPtr', respectively.
262: -------------------------------------------------------------------------------
263: */
264: static void
265: normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
266: {
267: int8 shiftCount;
268:
269: shiftCount = countLeadingZeros32( aSig ) - 8;
270: *zSigPtr = aSig<<shiftCount;
271: *zExpPtr = 1 - shiftCount;
272:
273: }
274:
275: /*
276: -------------------------------------------------------------------------------
277: Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
278: single-precision floating-point value, returning the result. After being
279: shifted into the proper positions, the three fields are simply added
280: together to form the result. This means that any integer portion of `zSig'
281: will be added into the exponent. Since a properly normalized significand
282: will have an integer portion equal to 1, the `zExp' input should be 1 less
283: than the desired result exponent whenever `zSig' is a complete, normalized
284: significand.
285: -------------------------------------------------------------------------------
286: */
287: INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
288: {
289:
290: return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
291:
292: }
293:
294: /*
295: -------------------------------------------------------------------------------
296: Takes an abstract floating-point value having sign `zSign', exponent `zExp',
297: and significand `zSig', and returns the proper single-precision floating-
298: point value corresponding to the abstract input. Ordinarily, the abstract
299: value is simply rounded and packed into the single-precision format, with
300: the inexact exception raised if the abstract input cannot be represented
301: exactly. However, if the abstract value is too large, the overflow and
302: inexact exceptions are raised and an infinity or maximal finite value is
303: returned. If the abstract value is too small, the input value is rounded to
304: a subnormal number, and the underflow and inexact exceptions are raised if
305: the abstract input cannot be represented exactly as a subnormal single-
306: precision floating-point number.
307: The input significand `zSig' has its binary point between bits 30
308: and 29, which is 7 bits to the left of the usual location. This shifted
309: significand must be normalized or smaller. If `zSig' is not normalized,
310: `zExp' must be 0; in that case, the result returned is a subnormal number,
311: and it must not require rounding. In the usual case that `zSig' is
312: normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
313: The handling of underflow and overflow follows the IEC/IEEE Standard for
314: Binary Floating-Point Arithmetic.
315: -------------------------------------------------------------------------------
316: */
317: static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
318: {
319: int8 roundingMode;
320: flag roundNearestEven;
321: int8 roundIncrement, roundBits;
322: flag isTiny;
323:
324: roundingMode = float_rounding_mode();
325: roundNearestEven = ( roundingMode == float_round_nearest_even );
326: roundIncrement = 0x40;
327: if ( ! roundNearestEven ) {
328: if ( roundingMode == float_round_to_zero ) {
329: roundIncrement = 0;
330: }
331: else {
332: roundIncrement = 0x7F;
333: if ( zSign ) {
334: if ( roundingMode == float_round_up ) roundIncrement = 0;
335: }
336: else {
337: if ( roundingMode == float_round_down ) roundIncrement = 0;
338: }
339: }
340: }
341: roundBits = zSig & 0x7F;
342: if ( 0xFD <= (bits16) zExp ) {
343: if ( ( 0xFD < zExp )
344: || ( ( zExp == 0xFD )
345: && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
346: ) {
347: float_raise( float_flag_overflow | float_flag_inexact );
348: return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
349: }
350: if ( zExp < 0 ) {
351: isTiny =
352: ( float_detect_tininess == float_tininess_before_rounding )
353: || ( zExp < -1 )
354: || ( zSig + roundIncrement < 0x80000000 );
355: shift32RightJamming( zSig, - zExp, &zSig );
356: zExp = 0;
357: roundBits = zSig & 0x7F;
358: if ( isTiny && roundBits ) float_raise( float_flag_underflow );
359: }
360: }
361: if ( roundBits ) float_set_inexact();
362: zSig = ( zSig + roundIncrement )>>7;
363: zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
364: if ( zSig == 0 ) zExp = 0;
365: return packFloat32( zSign, zExp, zSig );
366:
367: }
368:
369: /*
370: -------------------------------------------------------------------------------
371: Takes an abstract floating-point value having sign `zSign', exponent `zExp',
372: and significand `zSig', and returns the proper single-precision floating-
373: point value corresponding to the abstract input. This routine is just like
374: `roundAndPackFloat32' except that `zSig' does not have to be normalized.
375: Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
376: floating-point exponent.
377: -------------------------------------------------------------------------------
378: */
379: static float32
380: normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
381: {
382: int8 shiftCount;
383:
384: shiftCount = countLeadingZeros32( zSig ) - 1;
385: return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
386:
387: }
388:
389: /*
390: -------------------------------------------------------------------------------
391: Returns the fraction bits of the double-precision floating-point value `a'.
392: -------------------------------------------------------------------------------
393: */
394: INLINE bits64 extractFloat64Frac( float64 a )
395: {
396:
397: return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
398:
399: }
400:
401: /*
402: -------------------------------------------------------------------------------
403: Returns the exponent bits of the double-precision floating-point value `a'.
404: -------------------------------------------------------------------------------
405: */
406: INLINE int16 extractFloat64Exp( float64 a )
407: {
408:
409: return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
410:
411: }
412:
413: /*
414: -------------------------------------------------------------------------------
415: Returns the sign bit of the double-precision floating-point value `a'.
416: -------------------------------------------------------------------------------
417: */
418: INLINE flag extractFloat64Sign( float64 a )
419: {
420:
421: return FLOAT64_DEMANGLE(a)>>63;
422:
423: }
424:
425: /*
426: -------------------------------------------------------------------------------
427: Normalizes the subnormal double-precision floating-point value represented
428: by the denormalized significand `aSig'. The normalized exponent and
429: significand are stored at the locations pointed to by `zExpPtr' and
430: `zSigPtr', respectively.
431: -------------------------------------------------------------------------------
432: */
433: static void
434: normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
435: {
436: int8 shiftCount;
437:
438: shiftCount = countLeadingZeros64( aSig ) - 11;
439: *zSigPtr = aSig<<shiftCount;
440: *zExpPtr = 1 - shiftCount;
441:
442: }
443:
444: /*
445: -------------------------------------------------------------------------------
446: Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
447: double-precision floating-point value, returning the result. After being
448: shifted into the proper positions, the three fields are simply added
449: together to form the result. This means that any integer portion of `zSig'
450: will be added into the exponent. Since a properly normalized significand
451: will have an integer portion equal to 1, the `zExp' input should be 1 less
452: than the desired result exponent whenever `zSig' is a complete, normalized
453: significand.
454: -------------------------------------------------------------------------------
455: */
456: INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
457: {
458:
459: return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
460: ( ( (bits64) zExp )<<52 ) + zSig );
461:
462: }
463:
464: /*
465: -------------------------------------------------------------------------------
466: Takes an abstract floating-point value having sign `zSign', exponent `zExp',
467: and significand `zSig', and returns the proper double-precision floating-
468: point value corresponding to the abstract input. Ordinarily, the abstract
469: value is simply rounded and packed into the double-precision format, with
470: the inexact exception raised if the abstract input cannot be represented
471: exactly. However, if the abstract value is too large, the overflow and
472: inexact exceptions are raised and an infinity or maximal finite value is
473: returned. If the abstract value is too small, the input value is rounded to
474: a subnormal number, and the underflow and inexact exceptions are raised if
475: the abstract input cannot be represented exactly as a subnormal double-
476: precision floating-point number.
477: The input significand `zSig' has its binary point between bits 62
478: and 61, which is 10 bits to the left of the usual location. This shifted
479: significand must be normalized or smaller. If `zSig' is not normalized,
480: `zExp' must be 0; in that case, the result returned is a subnormal number,
481: and it must not require rounding. In the usual case that `zSig' is
482: normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
483: The handling of underflow and overflow follows the IEC/IEEE Standard for
484: Binary Floating-Point Arithmetic.
485: -------------------------------------------------------------------------------
486: */
487: static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
488: {
489: int8 roundingMode;
490: flag roundNearestEven;
491: int16 roundIncrement, roundBits;
492: flag isTiny;
493:
494: roundingMode = float_rounding_mode();
495: roundNearestEven = ( roundingMode == float_round_nearest_even );
496: roundIncrement = 0x200;
497: if ( ! roundNearestEven ) {
498: if ( roundingMode == float_round_to_zero ) {
499: roundIncrement = 0;
500: }
501: else {
502: roundIncrement = 0x3FF;
503: if ( zSign ) {
504: if ( roundingMode == float_round_up ) roundIncrement = 0;
505: }
506: else {
507: if ( roundingMode == float_round_down ) roundIncrement = 0;
508: }
509: }
510: }
511: roundBits = zSig & 0x3FF;
512: if ( 0x7FD <= (bits16) zExp ) {
513: if ( ( 0x7FD < zExp )
514: || ( ( zExp == 0x7FD )
515: && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
516: ) {
517: float_raise( float_flag_overflow | float_flag_inexact );
518: return FLOAT64_MANGLE(
519: FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
520: ( roundIncrement == 0 ));
521: }
522: if ( zExp < 0 ) {
523: isTiny =
524: ( float_detect_tininess == float_tininess_before_rounding )
525: || ( zExp < -1 )
526: || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
527: shift64RightJamming( zSig, - zExp, &zSig );
528: zExp = 0;
529: roundBits = zSig & 0x3FF;
530: if ( isTiny && roundBits ) float_raise( float_flag_underflow );
531: }
532: }
533: if ( roundBits ) float_set_inexact();
534: zSig = ( zSig + roundIncrement )>>10;
535: zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
536: if ( zSig == 0 ) zExp = 0;
537: return packFloat64( zSign, zExp, zSig );
538:
539: }
540:
541: /*
542: -------------------------------------------------------------------------------
543: Takes an abstract floating-point value having sign `zSign', exponent `zExp',
544: and significand `zSig', and returns the proper double-precision floating-
545: point value corresponding to the abstract input. This routine is just like
546: `roundAndPackFloat64' except that `zSig' does not have to be normalized.
547: Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
548: floating-point exponent.
549: -------------------------------------------------------------------------------
550: */
551: static float64
552: normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
553: {
554: int8 shiftCount;
555:
556: shiftCount = countLeadingZeros64( zSig ) - 1;
557: return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
558:
559: }
560:
561: #ifdef FLOATX80
562:
563: /*
564: -------------------------------------------------------------------------------
565: Returns the fraction bits of the extended double-precision floating-point
566: value `a'.
567: -------------------------------------------------------------------------------
568: */
569: INLINE bits64 extractFloatx80Frac( floatx80 a )
570: {
571:
572: return a.low;
573:
574: }
575:
576: /*
577: -------------------------------------------------------------------------------
578: Returns the exponent bits of the extended double-precision floating-point
579: value `a'.
580: -------------------------------------------------------------------------------
581: */
582: INLINE int32 extractFloatx80Exp( floatx80 a )
583: {
584:
585: return a.high & 0x7FFF;
586:
587: }
588:
589: /*
590: -------------------------------------------------------------------------------
591: Returns the sign bit of the extended double-precision floating-point value
592: `a'.
593: -------------------------------------------------------------------------------
594: */
595: INLINE flag extractFloatx80Sign( floatx80 a )
596: {
597:
598: return a.high>>15;
599:
600: }
601:
602: /*
603: -------------------------------------------------------------------------------
604: Normalizes the subnormal extended double-precision floating-point value
605: represented by the denormalized significand `aSig'. The normalized exponent
606: and significand are stored at the locations pointed to by `zExpPtr' and
607: `zSigPtr', respectively.
608: -------------------------------------------------------------------------------
609: */
610: static void
611: normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
612: {
613: int8 shiftCount;
614:
615: shiftCount = countLeadingZeros64( aSig );
616: *zSigPtr = aSig<<shiftCount;
617: *zExpPtr = 1 - shiftCount;
618:
619: }
620:
621: /*
622: -------------------------------------------------------------------------------
623: Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
624: extended double-precision floating-point value, returning the result.
625: -------------------------------------------------------------------------------
626: */
627: INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
628: {
629: floatx80 z;
630:
631: z.low = zSig;
632: z.high = ( ( (bits16) zSign )<<15 ) + zExp;
633: return z;
634:
635: }
636:
637: /*
638: -------------------------------------------------------------------------------
639: Takes an abstract floating-point value having sign `zSign', exponent `zExp',
640: and extended significand formed by the concatenation of `zSig0' and `zSig1',
641: and returns the proper extended double-precision floating-point value
642: corresponding to the abstract input. Ordinarily, the abstract value is
643: rounded and packed into the extended double-precision format, with the
644: inexact exception raised if the abstract input cannot be represented
645: exactly. However, if the abstract value is too large, the overflow and
646: inexact exceptions are raised and an infinity or maximal finite value is
647: returned. If the abstract value is too small, the input value is rounded to
648: a subnormal number, and the underflow and inexact exceptions are raised if
649: the abstract input cannot be represented exactly as a subnormal extended
650: double-precision floating-point number.
651: If `roundingPrecision' is 32 or 64, the result is rounded to the same
652: number of bits as single or double precision, respectively. Otherwise, the
653: result is rounded to the full precision of the extended double-precision
654: format.
655: The input significand must be normalized or smaller. If the input
656: significand is not normalized, `zExp' must be 0; in that case, the result
657: returned is a subnormal number, and it must not require rounding. The
658: handling of underflow and overflow follows the IEC/IEEE Standard for Binary
659: Floating-Point Arithmetic.
660: -------------------------------------------------------------------------------
661: */
662: static floatx80
663: roundAndPackFloatx80(
664: int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
665: )
666: {
667: int8 roundingMode;
668: flag roundNearestEven, increment, isTiny;
669: int64 roundIncrement, roundMask, roundBits;
670:
671: roundingMode = float_rounding_mode();
672: roundNearestEven = ( roundingMode == float_round_nearest_even );
673: if ( roundingPrecision == 80 ) goto precision80;
674: if ( roundingPrecision == 64 ) {
675: roundIncrement = LIT64( 0x0000000000000400 );
676: roundMask = LIT64( 0x00000000000007FF );
677: }
678: else if ( roundingPrecision == 32 ) {
679: roundIncrement = LIT64( 0x0000008000000000 );
680: roundMask = LIT64( 0x000000FFFFFFFFFF );
681: }
682: else {
683: goto precision80;
684: }
685: zSig0 |= ( zSig1 != 0 );
686: if ( ! roundNearestEven ) {
687: if ( roundingMode == float_round_to_zero ) {
688: roundIncrement = 0;
689: }
690: else {
691: roundIncrement = roundMask;
692: if ( zSign ) {
693: if ( roundingMode == float_round_up ) roundIncrement = 0;
694: }
695: else {
696: if ( roundingMode == float_round_down ) roundIncrement = 0;
697: }
698: }
699: }
700: roundBits = zSig0 & roundMask;
701: if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
702: if ( ( 0x7FFE < zExp )
703: || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
704: ) {
705: goto overflow;
706: }
707: if ( zExp <= 0 ) {
708: isTiny =
709: ( float_detect_tininess == float_tininess_before_rounding )
710: || ( zExp < 0 )
711: || ( zSig0 <= zSig0 + roundIncrement );
712: shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
713: zExp = 0;
714: roundBits = zSig0 & roundMask;
715: if ( isTiny && roundBits ) float_raise( float_flag_underflow );
716: if ( roundBits ) float_set_inexact();
717: zSig0 += roundIncrement;
718: if ( (sbits64) zSig0 < 0 ) zExp = 1;
719: roundIncrement = roundMask + 1;
720: if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
721: roundMask |= roundIncrement;
722: }
723: zSig0 &= ~ roundMask;
724: return packFloatx80( zSign, zExp, zSig0 );
725: }
726: }
727: if ( roundBits ) float_set_inexact();
728: zSig0 += roundIncrement;
729: if ( zSig0 < roundIncrement ) {
730: ++zExp;
731: zSig0 = LIT64( 0x8000000000000000 );
732: }
733: roundIncrement = roundMask + 1;
734: if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
735: roundMask |= roundIncrement;
736: }
737: zSig0 &= ~ roundMask;
738: if ( zSig0 == 0 ) zExp = 0;
739: return packFloatx80( zSign, zExp, zSig0 );
740: precision80:
741: increment = ( (sbits64) zSig1 < 0 );
742: if ( ! roundNearestEven ) {
743: if ( roundingMode == float_round_to_zero ) {
744: increment = 0;
745: }
746: else {
747: if ( zSign ) {
748: increment = ( roundingMode == float_round_down ) && zSig1;
749: }
750: else {
751: increment = ( roundingMode == float_round_up ) && zSig1;
752: }
753: }
754: }
755: if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
756: if ( ( 0x7FFE < zExp )
757: || ( ( zExp == 0x7FFE )
758: && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
759: && increment
760: )
761: ) {
762: roundMask = 0;
763: overflow:
764: float_raise( float_flag_overflow | float_flag_inexact );
765: if ( ( roundingMode == float_round_to_zero )
766: || ( zSign && ( roundingMode == float_round_up ) )
767: || ( ! zSign && ( roundingMode == float_round_down ) )
768: ) {
769: return packFloatx80( zSign, 0x7FFE, ~ roundMask );
770: }
771: return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
772: }
773: if ( zExp <= 0 ) {
774: isTiny =
775: ( float_detect_tininess == float_tininess_before_rounding )
776: || ( zExp < 0 )
777: || ! increment
778: || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
779: shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
780: zExp = 0;
781: if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
782: if ( zSig1 ) float_set_inexact();
783: if ( roundNearestEven ) {
784: increment = ( (sbits64) zSig1 < 0 );
785: }
786: else {
787: if ( zSign ) {
788: increment = ( roundingMode == float_round_down ) && zSig1;
789: }
790: else {
791: increment = ( roundingMode == float_round_up ) && zSig1;
792: }
793: }
794: if ( increment ) {
795: ++zSig0;
796: zSig0 &=
797: ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
798: if ( (sbits64) zSig0 < 0 ) zExp = 1;
799: }
800: return packFloatx80( zSign, zExp, zSig0 );
801: }
802: }
803: if ( zSig1 ) float_set_inexact();
804: if ( increment ) {
805: ++zSig0;
806: if ( zSig0 == 0 ) {
807: ++zExp;
808: zSig0 = LIT64( 0x8000000000000000 );
809: }
810: else {
811: zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
812: }
813: }
814: else {
815: if ( zSig0 == 0 ) zExp = 0;
816: }
817: return packFloatx80( zSign, zExp, zSig0 );
818:
819: }
820:
821: /*
822: -------------------------------------------------------------------------------
823: Takes an abstract floating-point value having sign `zSign', exponent
824: `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
825: and returns the proper extended double-precision floating-point value
826: corresponding to the abstract input. This routine is just like
827: `roundAndPackFloatx80' except that the input significand does not have to be
828: normalized.
829: -------------------------------------------------------------------------------
830: */
831: static floatx80
832: normalizeRoundAndPackFloatx80(
833: int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
834: )
835: {
836: int8 shiftCount;
837:
838: if ( zSig0 == 0 ) {
839: zSig0 = zSig1;
840: zSig1 = 0;
841: zExp -= 64;
842: }
843: shiftCount = countLeadingZeros64( zSig0 );
844: shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
845: zExp -= shiftCount;
846: return
847: roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
848:
849: }
850:
851: #endif
852:
853: #ifdef FLOAT128
854:
855: /*
856: -------------------------------------------------------------------------------
857: Returns the least-significant 64 fraction bits of the quadruple-precision
858: floating-point value `a'.
859: -------------------------------------------------------------------------------
860: */
861: INLINE bits64 extractFloat128Frac1( float128 a )
862: {
863:
864: return a.low;
865:
866: }
867:
868: /*
869: -------------------------------------------------------------------------------
870: Returns the most-significant 48 fraction bits of the quadruple-precision
871: floating-point value `a'.
872: -------------------------------------------------------------------------------
873: */
874: INLINE bits64 extractFloat128Frac0( float128 a )
875: {
876:
877: return a.high & LIT64( 0x0000FFFFFFFFFFFF );
878:
879: }
880:
881: /*
882: -------------------------------------------------------------------------------
883: Returns the exponent bits of the quadruple-precision floating-point value
884: `a'.
885: -------------------------------------------------------------------------------
886: */
887: INLINE int32 extractFloat128Exp( float128 a )
888: {
889:
890: return ( a.high>>48 ) & 0x7FFF;
891:
892: }
893:
894: /*
895: -------------------------------------------------------------------------------
896: Returns the sign bit of the quadruple-precision floating-point value `a'.
897: -------------------------------------------------------------------------------
898: */
899: INLINE flag extractFloat128Sign( float128 a )
900: {
901:
902: return a.high>>63;
903:
904: }
905:
906: /*
907: -------------------------------------------------------------------------------
908: Normalizes the subnormal quadruple-precision floating-point value
909: represented by the denormalized significand formed by the concatenation of
910: `aSig0' and `aSig1'. The normalized exponent is stored at the location
911: pointed to by `zExpPtr'. The most significant 49 bits of the normalized
912: significand are stored at the location pointed to by `zSig0Ptr', and the
913: least significant 64 bits of the normalized significand are stored at the
914: location pointed to by `zSig1Ptr'.
915: -------------------------------------------------------------------------------
916: */
917: static void
918: normalizeFloat128Subnormal(
919: bits64 aSig0,
920: bits64 aSig1,
921: int32 *zExpPtr,
922: bits64 *zSig0Ptr,
923: bits64 *zSig1Ptr
924: )
925: {
926: int8 shiftCount;
927:
928: if ( aSig0 == 0 ) {
929: shiftCount = countLeadingZeros64( aSig1 ) - 15;
930: if ( shiftCount < 0 ) {
931: *zSig0Ptr = aSig1>>( - shiftCount );
932: *zSig1Ptr = aSig1<<( shiftCount & 63 );
933: }
934: else {
935: *zSig0Ptr = aSig1<<shiftCount;
936: *zSig1Ptr = 0;
937: }
938: *zExpPtr = - shiftCount - 63;
939: }
940: else {
941: shiftCount = countLeadingZeros64( aSig0 ) - 15;
942: shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
943: *zExpPtr = 1 - shiftCount;
944: }
945:
946: }
947:
948: /*
949: -------------------------------------------------------------------------------
950: Packs the sign `zSign', the exponent `zExp', and the significand formed
951: by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
952: floating-point value, returning the result. After being shifted into the
953: proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
954: added together to form the most significant 32 bits of the result. This
955: means that any integer portion of `zSig0' will be added into the exponent.
956: Since a properly normalized significand will have an integer portion equal
957: to 1, the `zExp' input should be 1 less than the desired result exponent
958: whenever `zSig0' and `zSig1' concatenated form a complete, normalized
959: significand.
960: -------------------------------------------------------------------------------
961: */
962: INLINE float128
963: packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
964: {
965: float128 z;
966:
967: z.low = zSig1;
968: z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
969: return z;
970:
971: }
972:
973: /*
974: -------------------------------------------------------------------------------
975: Takes an abstract floating-point value having sign `zSign', exponent `zExp',
976: and extended significand formed by the concatenation of `zSig0', `zSig1',
977: and `zSig2', and returns the proper quadruple-precision floating-point value
978: corresponding to the abstract input. Ordinarily, the abstract value is
979: simply rounded and packed into the quadruple-precision format, with the
980: inexact exception raised if the abstract input cannot be represented
981: exactly. However, if the abstract value is too large, the overflow and
982: inexact exceptions are raised and an infinity or maximal finite value is
983: returned. If the abstract value is too small, the input value is rounded to
984: a subnormal number, and the underflow and inexact exceptions are raised if
985: the abstract input cannot be represented exactly as a subnormal quadruple-
986: precision floating-point number.
987: The input significand must be normalized or smaller. If the input
988: significand is not normalized, `zExp' must be 0; in that case, the result
989: returned is a subnormal number, and it must not require rounding. In the
990: usual case that the input significand is normalized, `zExp' must be 1 less
991: than the ``true'' floating-point exponent. The handling of underflow and
992: overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
993: -------------------------------------------------------------------------------
994: */
995: static float128
996: roundAndPackFloat128(
997: flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
998: {
999: int8 roundingMode;
1000: flag roundNearestEven, increment, isTiny;
1001:
1002: roundingMode = float_rounding_mode();
1003: roundNearestEven = ( roundingMode == float_round_nearest_even );
1004: increment = ( (sbits64) zSig2 < 0 );
1005: if ( ! roundNearestEven ) {
1006: if ( roundingMode == float_round_to_zero ) {
1007: increment = 0;
1008: }
1009: else {
1010: if ( zSign ) {
1011: increment = ( roundingMode == float_round_down ) && zSig2;
1012: }
1013: else {
1014: increment = ( roundingMode == float_round_up ) && zSig2;
1015: }
1016: }
1017: }
1018: if ( 0x7FFD <= (bits32) zExp ) {
1019: if ( ( 0x7FFD < zExp )
1020: || ( ( zExp == 0x7FFD )
1021: && eq128(
1022: LIT64( 0x0001FFFFFFFFFFFF ),
1023: LIT64( 0xFFFFFFFFFFFFFFFF ),
1024: zSig0,
1025: zSig1
1026: )
1027: && increment
1028: )
1029: ) {
1030: float_raise( float_flag_overflow | float_flag_inexact );
1031: if ( ( roundingMode == float_round_to_zero )
1032: || ( zSign && ( roundingMode == float_round_up ) )
1033: || ( ! zSign && ( roundingMode == float_round_down ) )
1034: ) {
1035: return
1036: packFloat128(
1037: zSign,
1038: 0x7FFE,
1039: LIT64( 0x0000FFFFFFFFFFFF ),
1040: LIT64( 0xFFFFFFFFFFFFFFFF )
1041: );
1042: }
1043: return packFloat128( zSign, 0x7FFF, 0, 0 );
1044: }
1045: if ( zExp < 0 ) {
1046: isTiny =
1047: ( float_detect_tininess == float_tininess_before_rounding )
1048: || ( zExp < -1 )
1049: || ! increment
1050: || lt128(
1051: zSig0,
1052: zSig1,
1053: LIT64( 0x0001FFFFFFFFFFFF ),
1054: LIT64( 0xFFFFFFFFFFFFFFFF )
1055: );
1056: shift128ExtraRightJamming(
1057: zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1058: zExp = 0;
1059: if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
1060: if ( roundNearestEven ) {
1061: increment = ( (sbits64) zSig2 < 0 );
1062: }
1063: else {
1064: if ( zSign ) {
1065: increment = ( roundingMode == float_round_down ) && zSig2;
1066: }
1067: else {
1068: increment = ( roundingMode == float_round_up ) && zSig2;
1069: }
1070: }
1071: }
1072: }
1073: if ( zSig2 ) float_set_inexact();
1074: if ( increment ) {
1075: add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1076: zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1077: }
1078: else {
1079: if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1080: }
1081: return packFloat128( zSign, zExp, zSig0, zSig1 );
1082:
1083: }
1084:
1085: /*
1086: -------------------------------------------------------------------------------
1087: Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1088: and significand formed by the concatenation of `zSig0' and `zSig1', and
1089: returns the proper quadruple-precision floating-point value corresponding
1090: to the abstract input. This routine is just like `roundAndPackFloat128'
1091: except that the input significand has fewer bits and does not have to be
1092: normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1093: point exponent.
1094: -------------------------------------------------------------------------------
1095: */
1096: static float128
1097: normalizeRoundAndPackFloat128(
1098: flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1099: {
1100: int8 shiftCount;
1101: bits64 zSig2;
1102:
1103: if ( zSig0 == 0 ) {
1104: zSig0 = zSig1;
1105: zSig1 = 0;
1106: zExp -= 64;
1107: }
1108: shiftCount = countLeadingZeros64( zSig0 ) - 15;
1109: if ( 0 <= shiftCount ) {
1110: zSig2 = 0;
1111: shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1112: }
1113: else {
1114: shift128ExtraRightJamming(
1115: zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1116: }
1117: zExp -= shiftCount;
1118: return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1119:
1120: }
1121:
1122: #endif
1123:
1124: /*
1125: -------------------------------------------------------------------------------
1126: Returns the result of converting the 32-bit two's complement integer `a'
1127: to the single-precision floating-point format. The conversion is performed
1128: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1129: -------------------------------------------------------------------------------
1130: */
1131: float32 int32_to_float32( int32 a )
1132: {
1133: flag zSign;
1134:
1135: if ( a == 0 ) return 0;
1136: if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1137: zSign = ( a < 0 );
1138: return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1139:
1140: }
1141:
1142: /*
1143: -------------------------------------------------------------------------------
1144: Returns the result of converting the 32-bit two's complement integer `a'
1145: to the double-precision floating-point format. The conversion is performed
1146: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1147: -------------------------------------------------------------------------------
1148: */
1149: float64 int32_to_float64( int32 a )
1150: {
1151: flag zSign;
1152: uint32 absA;
1153: int8 shiftCount;
1154: bits64 zSig;
1155:
1156: if ( a == 0 ) return 0;
1157: zSign = ( a < 0 );
1158: absA = zSign ? - a : a;
1159: shiftCount = countLeadingZeros32( absA ) + 21;
1160: zSig = absA;
1161: return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1162:
1163: }
1164:
1165: #ifdef FLOATX80
1166:
1167: /*
1168: -------------------------------------------------------------------------------
1169: Returns the result of converting the 32-bit two's complement integer `a'
1170: to the extended double-precision floating-point format. The conversion
1171: is performed according to the IEC/IEEE Standard for Binary Floating-Point
1172: Arithmetic.
1173: -------------------------------------------------------------------------------
1174: */
1175: floatx80 int32_to_floatx80( int32 a )
1176: {
1177: flag zSign;
1178: uint32 absA;
1179: int8 shiftCount;
1180: bits64 zSig;
1181:
1182: if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1183: zSign = ( a < 0 );
1184: absA = zSign ? - a : a;
1185: shiftCount = countLeadingZeros32( absA ) + 32;
1186: zSig = absA;
1187: return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1188:
1189: }
1190:
1191: #endif
1192:
1193: #ifdef FLOAT128
1194:
1195: /*
1196: -------------------------------------------------------------------------------
1197: Returns the result of converting the 32-bit two's complement integer `a' to
1198: the quadruple-precision floating-point format. The conversion is performed
1199: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1200: -------------------------------------------------------------------------------
1201: */
1202: float128 int32_to_float128( int32 a )
1203: {
1204: flag zSign;
1205: uint32 absA;
1206: int8 shiftCount;
1207: bits64 zSig0;
1208:
1209: if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1210: zSign = ( a < 0 );
1211: absA = zSign ? - a : a;
1212: shiftCount = countLeadingZeros32( absA ) + 17;
1213: zSig0 = absA;
1214: return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1215:
1216: }
1217:
1218: #endif
1219:
1220: #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
1221: /*
1222: -------------------------------------------------------------------------------
1223: Returns the result of converting the 64-bit two's complement integer `a'
1224: to the single-precision floating-point format. The conversion is performed
1225: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1226: -------------------------------------------------------------------------------
1227: */
1228: float32 int64_to_float32( int64 a )
1229: {
1230: flag zSign;
1231: uint64 absA;
1232: int8 shiftCount;
1233:
1234: if ( a == 0 ) return 0;
1235: zSign = ( a < 0 );
1236: absA = zSign ? - a : a;
1237: shiftCount = countLeadingZeros64( absA ) - 40;
1238: if ( 0 <= shiftCount ) {
1239: return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1240: }
1241: else {
1242: shiftCount += 7;
1243: if ( shiftCount < 0 ) {
1244: shift64RightJamming( absA, - shiftCount, &absA );
1245: }
1246: else {
1247: absA <<= shiftCount;
1248: }
1249: return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1250: }
1251:
1252: }
1253:
1254: /*
1255: -------------------------------------------------------------------------------
1256: Returns the result of converting the 64-bit two's complement integer `a'
1257: to the double-precision floating-point format. The conversion is performed
1258: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1259: -------------------------------------------------------------------------------
1260: */
1261: float64 int64_to_float64( int64 a )
1262: {
1263: flag zSign;
1264:
1265: if ( a == 0 ) return 0;
1266: if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1267: return packFloat64( 1, 0x43E, 0 );
1268: }
1269: zSign = ( a < 0 );
1270: return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1271:
1272: }
1273:
1274: #ifdef FLOATX80
1275:
1276: /*
1277: -------------------------------------------------------------------------------
1278: Returns the result of converting the 64-bit two's complement integer `a'
1279: to the extended double-precision floating-point format. The conversion
1280: is performed according to the IEC/IEEE Standard for Binary Floating-Point
1281: Arithmetic.
1282: -------------------------------------------------------------------------------
1283: */
1284: floatx80 int64_to_floatx80( int64 a )
1285: {
1286: flag zSign;
1287: uint64 absA;
1288: int8 shiftCount;
1289:
1290: if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1291: zSign = ( a < 0 );
1292: absA = zSign ? - a : a;
1293: shiftCount = countLeadingZeros64( absA );
1294: return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1295:
1296: }
1297:
1298: #endif
1299:
1300: #ifdef FLOAT128
1301:
1302: /*
1303: -------------------------------------------------------------------------------
1304: Returns the result of converting the 64-bit two's complement integer `a' to
1305: the quadruple-precision floating-point format. The conversion is performed
1306: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1307: -------------------------------------------------------------------------------
1308: */
1309: float128 int64_to_float128( int64 a )
1310: {
1311: flag zSign;
1312: uint64 absA;
1313: int8 shiftCount;
1314: int32 zExp;
1315: bits64 zSig0, zSig1;
1316:
1317: if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1318: zSign = ( a < 0 );
1319: absA = zSign ? - a : a;
1320: shiftCount = countLeadingZeros64( absA ) + 49;
1321: zExp = 0x406E - shiftCount;
1322: if ( 64 <= shiftCount ) {
1323: zSig1 = 0;
1324: zSig0 = absA;
1325: shiftCount -= 64;
1326: }
1327: else {
1328: zSig1 = absA;
1329: zSig0 = 0;
1330: }
1331: shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1332: return packFloat128( zSign, zExp, zSig0, zSig1 );
1333:
1334: }
1335:
1336: #endif
1337: #endif /* !SOFTFLOAT_FOR_GCC */
1338:
1339: #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1340: /*
1341: -------------------------------------------------------------------------------
1342: Returns the result of converting the single-precision floating-point value
1343: `a' to the 32-bit two's complement integer format. The conversion is
1344: performed according to the IEC/IEEE Standard for Binary Floating-Point
1345: Arithmetic---which means in particular that the conversion is rounded
1346: according to the current rounding mode. If `a' is a NaN, the largest
1347: positive integer is returned. Otherwise, if the conversion overflows, the
1348: largest integer with the same sign as `a' is returned.
1349: -------------------------------------------------------------------------------
1350: */
1351: int32 float32_to_int32( float32 a )
1352: {
1353: flag aSign;
1354: int16 aExp, shiftCount;
1355: bits32 aSig;
1356: bits64 aSig64;
1357:
1358: aSig = extractFloat32Frac( a );
1359: aExp = extractFloat32Exp( a );
1360: aSign = extractFloat32Sign( a );
1361: if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1362: if ( aExp ) aSig |= 0x00800000;
1363: shiftCount = 0xAF - aExp;
1364: aSig64 = aSig;
1365: aSig64 <<= 32;
1366: if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1367: return roundAndPackInt32( aSign, aSig64 );
1368:
1369: }
1370: #endif /* !SOFTFLOAT_FOR_GCC */
1371:
1372: /*
1373: -------------------------------------------------------------------------------
1374: Returns the result of converting the single-precision floating-point value
1375: `a' to the 32-bit two's complement integer format. The conversion is
1376: performed according to the IEC/IEEE Standard for Binary Floating-Point
1377: Arithmetic, except that the conversion is always rounded toward zero.
1378: If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1379: the conversion overflows, the largest integer with the same sign as `a' is
1380: returned.
1381: -------------------------------------------------------------------------------
1382: */
1383: int32 float32_to_int32_round_to_zero( float32 a )
1384: {
1385: flag aSign;
1386: int16 aExp, shiftCount;
1387: bits32 aSig;
1388: int32 z;
1389:
1390: aSig = extractFloat32Frac( a );
1391: aExp = extractFloat32Exp( a );
1392: aSign = extractFloat32Sign( a );
1393: shiftCount = aExp - 0x9E;
1394: if ( 0 <= shiftCount ) {
1395: if ( a != 0xCF000000 ) {
1396: float_raise( float_flag_invalid );
1397: if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1398: }
1399: return (sbits32) 0x80000000;
1400: }
1401: else if ( aExp <= 0x7E ) {
1402: if ( aExp | aSig ) float_set_inexact();
1403: return 0;
1404: }
1405: aSig = ( aSig | 0x00800000 )<<8;
1406: z = aSig>>( - shiftCount );
1407: if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1408: float_set_inexact();
1409: }
1410: if ( aSign ) z = - z;
1411: return z;
1412:
1413: }
1414:
1415: #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
1416: /*
1417: -------------------------------------------------------------------------------
1418: Returns the result of converting the single-precision floating-point value
1419: `a' to the 64-bit two's complement integer format. The conversion is
1420: performed according to the IEC/IEEE Standard for Binary Floating-Point
1421: Arithmetic---which means in particular that the conversion is rounded
1422: according to the current rounding mode. If `a' is a NaN, the largest
1423: positive integer is returned. Otherwise, if the conversion overflows, the
1424: largest integer with the same sign as `a' is returned.
1425: -------------------------------------------------------------------------------
1426: */
1427: int64 float32_to_int64( float32 a )
1428: {
1429: flag aSign;
1430: int16 aExp, shiftCount;
1431: bits32 aSig;
1432: bits64 aSig64, aSigExtra;
1433:
1434: aSig = extractFloat32Frac( a );
1435: aExp = extractFloat32Exp( a );
1436: aSign = extractFloat32Sign( a );
1437: shiftCount = 0xBE - aExp;
1438: if ( shiftCount < 0 ) {
1439: float_raise( float_flag_invalid );
1440: if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1441: return LIT64( 0x7FFFFFFFFFFFFFFF );
1442: }
1443: return (sbits64) LIT64( 0x8000000000000000 );
1444: }
1445: if ( aExp ) aSig |= 0x00800000;
1446: aSig64 = aSig;
1447: aSig64 <<= 40;
1448: shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1449: return roundAndPackInt64( aSign, aSig64, aSigExtra );
1450:
1451: }
1452:
1453: /*
1454: -------------------------------------------------------------------------------
1455: Returns the result of converting the single-precision floating-point value
1456: `a' to the 64-bit two's complement integer format. The conversion is
1457: performed according to the IEC/IEEE Standard for Binary Floating-Point
1458: Arithmetic, except that the conversion is always rounded toward zero. If
1459: `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1460: conversion overflows, the largest integer with the same sign as `a' is
1461: returned.
1462: -------------------------------------------------------------------------------
1463: */
1464: int64 float32_to_int64_round_to_zero( float32 a )
1465: {
1466: flag aSign;
1467: int16 aExp, shiftCount;
1468: bits32 aSig;
1469: bits64 aSig64;
1470: int64 z;
1471:
1472: aSig = extractFloat32Frac( a );
1473: aExp = extractFloat32Exp( a );
1474: aSign = extractFloat32Sign( a );
1475: shiftCount = aExp - 0xBE;
1476: if ( 0 <= shiftCount ) {
1477: if ( a != 0xDF000000 ) {
1478: float_raise( float_flag_invalid );
1479: if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1480: return LIT64( 0x7FFFFFFFFFFFFFFF );
1481: }
1482: }
1483: return (sbits64) LIT64( 0x8000000000000000 );
1484: }
1485: else if ( aExp <= 0x7E ) {
1486: if ( aExp | aSig ) float_set_inexact();
1487: return 0;
1488: }
1489: aSig64 = aSig | 0x00800000;
1490: aSig64 <<= 40;
1491: z = aSig64>>( - shiftCount );
1492: if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1493: float_set_inexact();
1494: }
1495: if ( aSign ) z = - z;
1496: return z;
1497:
1498: }
1499: #endif /* !SOFTFLOAT_FOR_GCC */
1500:
1501: /*
1502: -------------------------------------------------------------------------------
1503: Returns the result of converting the single-precision floating-point value
1504: `a' to the double-precision floating-point format. The conversion is
1505: performed according to the IEC/IEEE Standard for Binary Floating-Point
1506: Arithmetic.
1507: -------------------------------------------------------------------------------
1508: */
1509: float64 float32_to_float64( float32 a )
1510: {
1511: flag aSign;
1512: int16 aExp;
1513: bits32 aSig;
1514:
1515: aSig = extractFloat32Frac( a );
1516: aExp = extractFloat32Exp( a );
1517: aSign = extractFloat32Sign( a );
1518: if ( aExp == 0xFF ) {
1519: if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1520: return packFloat64( aSign, 0x7FF, 0 );
1521: }
1522: if ( aExp == 0 ) {
1523: if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1524: normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1525: --aExp;
1526: }
1527: return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1528:
1529: }
1530:
1531: #ifdef FLOATX80
1532:
1533: /*
1534: -------------------------------------------------------------------------------
1535: Returns the result of converting the single-precision floating-point value
1536: `a' to the extended double-precision floating-point format. The conversion
1537: is performed according to the IEC/IEEE Standard for Binary Floating-Point
1538: Arithmetic.
1539: -------------------------------------------------------------------------------
1540: */
1541: floatx80 float32_to_floatx80( float32 a )
1542: {
1543: flag aSign;
1544: int16 aExp;
1545: bits32 aSig;
1546:
1547: aSig = extractFloat32Frac( a );
1548: aExp = extractFloat32Exp( a );
1549: aSign = extractFloat32Sign( a );
1550: if ( aExp == 0xFF ) {
1551: if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1552: return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1553: }
1554: if ( aExp == 0 ) {
1555: if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1556: normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1557: }
1558: aSig |= 0x00800000;
1559: return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1560:
1561: }
1562:
1563: #endif
1564:
1565: #ifdef FLOAT128
1566:
1567: /*
1568: -------------------------------------------------------------------------------
1569: Returns the result of converting the single-precision floating-point value
1570: `a' to the double-precision floating-point format. The conversion is
1571: performed according to the IEC/IEEE Standard for Binary Floating-Point
1572: Arithmetic.
1573: -------------------------------------------------------------------------------
1574: */
1575: float128 float32_to_float128( float32 a )
1576: {
1577: flag aSign;
1578: int16 aExp;
1579: bits32 aSig;
1580:
1581: aSig = extractFloat32Frac( a );
1582: aExp = extractFloat32Exp( a );
1583: aSign = extractFloat32Sign( a );
1584: if ( aExp == 0xFF ) {
1585: if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1586: return packFloat128( aSign, 0x7FFF, 0, 0 );
1587: }
1588: if ( aExp == 0 ) {
1589: if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1590: normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1591: --aExp;
1592: }
1593: return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1594:
1595: }
1596:
1597: #endif
1598:
1599: #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1600: /*
1601: -------------------------------------------------------------------------------
1602: Rounds the single-precision floating-point value `a' to an integer, and
1603: returns the result as a single-precision floating-point value. The
1604: operation is performed according to the IEC/IEEE Standard for Binary
1605: Floating-Point Arithmetic.
1606: -------------------------------------------------------------------------------
1607: */
1608: float32 float32_round_to_int( float32 a )
1609: {
1610: flag aSign;
1611: int16 aExp;
1612: bits32 lastBitMask, roundBitsMask;
1613: int8 roundingMode;
1614: float32 z;
1615:
1616: aExp = extractFloat32Exp( a );
1617: if ( 0x96 <= aExp ) {
1618: if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1619: return propagateFloat32NaN( a, a );
1620: }
1621: return a;
1622: }
1623: if ( aExp <= 0x7E ) {
1624: if ( (bits32) ( a<<1 ) == 0 ) return a;
1625: float_set_inexact();
1626: aSign = extractFloat32Sign( a );
1627: switch ( float_rounding_mode() ) {
1628: case float_round_nearest_even:
1629: if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1630: return packFloat32( aSign, 0x7F, 0 );
1631: }
1632: break;
1633: case float_round_down:
1634: return aSign ? 0xBF800000 : 0;
1635: case float_round_up:
1636: return aSign ? 0x80000000 : 0x3F800000;
1637: }
1638: return packFloat32( aSign, 0, 0 );
1639: }
1640: lastBitMask = 1;
1641: lastBitMask <<= 0x96 - aExp;
1642: roundBitsMask = lastBitMask - 1;
1643: z = a;
1644: roundingMode = float_rounding_mode();
1645: if ( roundingMode == float_round_nearest_even ) {
1646: z += lastBitMask>>1;
1647: if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1648: }
1649: else if ( roundingMode != float_round_to_zero ) {
1650: if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1651: z += roundBitsMask;
1652: }
1653: }
1654: z &= ~ roundBitsMask;
1655: if ( z != a ) float_set_inexact();
1656: return z;
1657:
1658: }
1659: #endif /* !SOFTFLOAT_FOR_GCC */
1660:
1661: /*
1662: -------------------------------------------------------------------------------
1663: Returns the result of adding the absolute values of the single-precision
1664: floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1665: before being returned. `zSign' is ignored if the result is a NaN.
1666: The addition is performed according to the IEC/IEEE Standard for Binary
1667: Floating-Point Arithmetic.
1668: -------------------------------------------------------------------------------
1669: */
1670: static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1671: {
1672: int16 aExp, bExp, zExp;
1673: bits32 aSig, bSig, zSig;
1674: int16 expDiff;
1675:
1676: aSig = extractFloat32Frac( a );
1677: aExp = extractFloat32Exp( a );
1678: bSig = extractFloat32Frac( b );
1679: bExp = extractFloat32Exp( b );
1680: expDiff = aExp - bExp;
1681: aSig <<= 6;
1682: bSig <<= 6;
1683: if ( 0 < expDiff ) {
1684: if ( aExp == 0xFF ) {
1685: if ( aSig ) return propagateFloat32NaN( a, b );
1686: return a;
1687: }
1688: if ( bExp == 0 ) {
1689: --expDiff;
1690: }
1691: else {
1692: bSig |= 0x20000000;
1693: }
1694: shift32RightJamming( bSig, expDiff, &bSig );
1695: zExp = aExp;
1696: }
1697: else if ( expDiff < 0 ) {
1698: if ( bExp == 0xFF ) {
1699: if ( bSig ) return propagateFloat32NaN( a, b );
1700: return packFloat32( zSign, 0xFF, 0 );
1701: }
1702: if ( aExp == 0 ) {
1703: ++expDiff;
1704: }
1705: else {
1706: aSig |= 0x20000000;
1707: }
1708: shift32RightJamming( aSig, - expDiff, &aSig );
1709: zExp = bExp;
1710: }
1711: else {
1712: if ( aExp == 0xFF ) {
1713: if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1714: return a;
1715: }
1716: if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1717: zSig = 0x40000000 + aSig + bSig;
1718: zExp = aExp;
1719: goto roundAndPack;
1720: }
1721: aSig |= 0x20000000;
1722: zSig = ( aSig + bSig )<<1;
1723: --zExp;
1724: if ( (sbits32) zSig < 0 ) {
1725: zSig = aSig + bSig;
1726: ++zExp;
1727: }
1728: roundAndPack:
1729: return roundAndPackFloat32( zSign, zExp, zSig );
1730:
1731: }
1732:
1733: /*
1734: -------------------------------------------------------------------------------
1735: Returns the result of subtracting the absolute values of the single-
1736: precision floating-point values `a' and `b'. If `zSign' is 1, the
1737: difference is negated before being returned. `zSign' is ignored if the
1738: result is a NaN. The subtraction is performed according to the IEC/IEEE
1739: Standard for Binary Floating-Point Arithmetic.
1740: -------------------------------------------------------------------------------
1741: */
1742: static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1743: {
1744: int16 aExp, bExp, zExp;
1745: bits32 aSig, bSig, zSig;
1746: int16 expDiff;
1747:
1748: aSig = extractFloat32Frac( a );
1749: aExp = extractFloat32Exp( a );
1750: bSig = extractFloat32Frac( b );
1751: bExp = extractFloat32Exp( b );
1752: expDiff = aExp - bExp;
1753: aSig <<= 7;
1754: bSig <<= 7;
1755: if ( 0 < expDiff ) goto aExpBigger;
1756: if ( expDiff < 0 ) goto bExpBigger;
1757: if ( aExp == 0xFF ) {
1758: if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1759: float_raise( float_flag_invalid );
1760: return float32_default_nan;
1761: }
1762: if ( aExp == 0 ) {
1763: aExp = 1;
1764: bExp = 1;
1765: }
1766: if ( bSig < aSig ) goto aBigger;
1767: if ( aSig < bSig ) goto bBigger;
1768: return packFloat32( float_rounding_mode() == float_round_down, 0, 0 );
1769: bExpBigger:
1770: if ( bExp == 0xFF ) {
1771: if ( bSig ) return propagateFloat32NaN( a, b );
1772: return packFloat32( zSign ^ 1, 0xFF, 0 );
1773: }
1774: if ( aExp == 0 ) {
1775: ++expDiff;
1776: }
1777: else {
1778: aSig |= 0x40000000;
1779: }
1780: shift32RightJamming( aSig, - expDiff, &aSig );
1781: bSig |= 0x40000000;
1782: bBigger:
1783: zSig = bSig - aSig;
1784: zExp = bExp;
1785: zSign ^= 1;
1786: goto normalizeRoundAndPack;
1787: aExpBigger:
1788: if ( aExp == 0xFF ) {
1789: if ( aSig ) return propagateFloat32NaN( a, b );
1790: return a;
1791: }
1792: if ( bExp == 0 ) {
1793: --expDiff;
1794: }
1795: else {
1796: bSig |= 0x40000000;
1797: }
1798: shift32RightJamming( bSig, expDiff, &bSig );
1799: aSig |= 0x40000000;
1800: aBigger:
1801: zSig = aSig - bSig;
1802: zExp = aExp;
1803: normalizeRoundAndPack:
1804: --zExp;
1805: return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1806:
1807: }
1808:
1809: /*
1810: -------------------------------------------------------------------------------
1811: Returns the result of adding the single-precision floating-point values `a'
1812: and `b'. The operation is performed according to the IEC/IEEE Standard for
1813: Binary Floating-Point Arithmetic.
1814: -------------------------------------------------------------------------------
1815: */
1816: float32 float32_add( float32 a, float32 b )
1817: {
1818: flag aSign, bSign;
1819:
1820: aSign = extractFloat32Sign( a );
1821: bSign = extractFloat32Sign( b );
1822: if ( aSign == bSign ) {
1823: return addFloat32Sigs( a, b, aSign );
1824: }
1825: else {
1826: return subFloat32Sigs( a, b, aSign );
1827: }
1828:
1829: }
1830:
1831: /*
1832: -------------------------------------------------------------------------------
1833: Returns the result of subtracting the single-precision floating-point values
1834: `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1835: for Binary Floating-Point Arithmetic.
1836: -------------------------------------------------------------------------------
1837: */
1838: float32 float32_sub( float32 a, float32 b )
1839: {
1840: flag aSign, bSign;
1841:
1842: aSign = extractFloat32Sign( a );
1843: bSign = extractFloat32Sign( b );
1844: if ( aSign == bSign ) {
1845: return subFloat32Sigs( a, b, aSign );
1846: }
1847: else {
1848: return addFloat32Sigs( a, b, aSign );
1849: }
1850:
1851: }
1852:
1853: /*
1854: -------------------------------------------------------------------------------
1855: Returns the result of multiplying the single-precision floating-point values
1856: `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1857: for Binary Floating-Point Arithmetic.
1858: -------------------------------------------------------------------------------
1859: */
1860: float32 float32_mul( float32 a, float32 b )
1861: {
1862: flag aSign, bSign, zSign;
1863: int16 aExp, bExp, zExp;
1864: bits32 aSig, bSig;
1865: bits64 zSig64;
1866: bits32 zSig;
1867:
1868: aSig = extractFloat32Frac( a );
1869: aExp = extractFloat32Exp( a );
1870: aSign = extractFloat32Sign( a );
1871: bSig = extractFloat32Frac( b );
1872: bExp = extractFloat32Exp( b );
1873: bSign = extractFloat32Sign( b );
1874: zSign = aSign ^ bSign;
1875: if ( aExp == 0xFF ) {
1876: if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1877: return propagateFloat32NaN( a, b );
1878: }
1879: if ( ( bExp | bSig ) == 0 ) {
1880: float_raise( float_flag_invalid );
1881: return float32_default_nan;
1882: }
1883: return packFloat32( zSign, 0xFF, 0 );
1884: }
1885: if ( bExp == 0xFF ) {
1886: if ( bSig ) return propagateFloat32NaN( a, b );
1887: if ( ( aExp | aSig ) == 0 ) {
1888: float_raise( float_flag_invalid );
1889: return float32_default_nan;
1890: }
1891: return packFloat32( zSign, 0xFF, 0 );
1892: }
1893: if ( aExp == 0 ) {
1894: if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1895: normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1896: }
1897: if ( bExp == 0 ) {
1898: if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1899: normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1900: }
1901: zExp = aExp + bExp - 0x7F;
1902: aSig = ( aSig | 0x00800000 )<<7;
1903: bSig = ( bSig | 0x00800000 )<<8;
1904: shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1905: zSig = zSig64;
1906: if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1907: zSig <<= 1;
1908: --zExp;
1909: }
1910: return roundAndPackFloat32( zSign, zExp, zSig );
1911:
1912: }
1913:
1914: /*
1915: -------------------------------------------------------------------------------
1916: Returns the result of dividing the single-precision floating-point value `a'
1917: by the corresponding value `b'. The operation is performed according to the
1918: IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1919: -------------------------------------------------------------------------------
1920: */
1921: float32 float32_div( float32 a, float32 b )
1922: {
1923: flag aSign, bSign, zSign;
1924: int16 aExp, bExp, zExp;
1925: bits32 aSig, bSig, zSig;
1926:
1927: aSig = extractFloat32Frac( a );
1928: aExp = extractFloat32Exp( a );
1929: aSign = extractFloat32Sign( a );
1930: bSig = extractFloat32Frac( b );
1931: bExp = extractFloat32Exp( b );
1932: bSign = extractFloat32Sign( b );
1933: zSign = aSign ^ bSign;
1934: if ( aExp == 0xFF ) {
1935: if ( aSig ) return propagateFloat32NaN( a, b );
1936: if ( bExp == 0xFF ) {
1937: if ( bSig ) return propagateFloat32NaN( a, b );
1938: float_raise( float_flag_invalid );
1939: return float32_default_nan;
1940: }
1941: return packFloat32( zSign, 0xFF, 0 );
1942: }
1943: if ( bExp == 0xFF ) {
1944: if ( bSig ) return propagateFloat32NaN( a, b );
1945: return packFloat32( zSign, 0, 0 );
1946: }
1947: if ( bExp == 0 ) {
1948: if ( bSig == 0 ) {
1949: if ( ( aExp | aSig ) == 0 ) {
1950: float_raise( float_flag_invalid );
1951: return float32_default_nan;
1952: }
1953: float_raise( float_flag_divbyzero );
1954: return packFloat32( zSign, 0xFF, 0 );
1955: }
1956: normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1957: }
1958: if ( aExp == 0 ) {
1959: if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1960: normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1961: }
1962: zExp = aExp - bExp + 0x7D;
1963: aSig = ( aSig | 0x00800000 )<<7;
1964: bSig = ( bSig | 0x00800000 )<<8;
1965: if ( bSig <= ( aSig + aSig ) ) {
1966: aSig >>= 1;
1967: ++zExp;
1968: }
1969: zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1970: if ( ( zSig & 0x3F ) == 0 ) {
1971: zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1972: }
1973: return roundAndPackFloat32( zSign, zExp, zSig );
1974:
1975: }
1976:
1977: #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1978: /*
1979: -------------------------------------------------------------------------------
1980: Returns the remainder of the single-precision floating-point value `a'
1981: with respect to the corresponding value `b'. The operation is performed
1982: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1983: -------------------------------------------------------------------------------
1984: */
1985: float32 float32_rem( float32 a, float32 b )
1986: {
1987: flag aSign, bSign, zSign;
1988: int16 aExp, bExp, expDiff;
1989: bits32 aSig, bSig;
1990: bits32 q;
1991: bits64 aSig64, bSig64, q64;
1992: bits32 alternateASig;
1993: sbits32 sigMean;
1994:
1995: aSig = extractFloat32Frac( a );
1996: aExp = extractFloat32Exp( a );
1997: aSign = extractFloat32Sign( a );
1998: bSig = extractFloat32Frac( b );
1999: bExp = extractFloat32Exp( b );
2000: bSign = extractFloat32Sign( b );
2001: if ( aExp == 0xFF ) {
2002: if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2003: return propagateFloat32NaN( a, b );
2004: }
2005: float_raise( float_flag_invalid );
2006: return float32_default_nan;
2007: }
2008: if ( bExp == 0xFF ) {
2009: if ( bSig ) return propagateFloat32NaN( a, b );
2010: return a;
2011: }
2012: if ( bExp == 0 ) {
2013: if ( bSig == 0 ) {
2014: float_raise( float_flag_invalid );
2015: return float32_default_nan;
2016: }
2017: normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2018: }
2019: if ( aExp == 0 ) {
2020: if ( aSig == 0 ) return a;
2021: normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2022: }
2023: expDiff = aExp - bExp;
2024: aSig |= 0x00800000;
2025: bSig |= 0x00800000;
2026: if ( expDiff < 32 ) {
2027: aSig <<= 8;
2028: bSig <<= 8;
2029: if ( expDiff < 0 ) {
2030: if ( expDiff < -1 ) return a;
2031: aSig >>= 1;
2032: }
2033: q = ( bSig <= aSig );
2034: if ( q ) aSig -= bSig;
2035: if ( 0 < expDiff ) {
2036: q = ( ( (bits64) aSig )<<32 ) / bSig;
2037: q >>= 32 - expDiff;
2038: bSig >>= 2;
2039: aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2040: }
2041: else {
2042: aSig >>= 2;
2043: bSig >>= 2;
2044: }
2045: }
2046: else {
2047: if ( bSig <= aSig ) aSig -= bSig;
2048: aSig64 = ( (bits64) aSig )<<40;
2049: bSig64 = ( (bits64) bSig )<<40;
2050: expDiff -= 64;
2051: while ( 0 < expDiff ) {
2052: q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2053: q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2054: aSig64 = - ( ( bSig * q64 )<<38 );
2055: expDiff -= 62;
2056: }
2057: expDiff += 64;
2058: q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2059: q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2060: q = q64>>( 64 - expDiff );
2061: bSig <<= 6;
2062: aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2063: }
2064: do {
2065: alternateASig = aSig;
2066: ++q;
2067: aSig -= bSig;
2068: } while ( 0 <= (sbits32) aSig );
2069: sigMean = aSig + alternateASig;
2070: if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2071: aSig = alternateASig;
2072: }
2073: zSign = ( (sbits32) aSig < 0 );
2074: if ( zSign ) aSig = - aSig;
2075: return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
2076:
2077: }
2078: #endif /* !SOFTFLOAT_FOR_GCC */
2079:
2080: #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2081: /*
2082: -------------------------------------------------------------------------------
2083: Returns the square root of the single-precision floating-point value `a'.
2084: The operation is performed according to the IEC/IEEE Standard for Binary
2085: Floating-Point Arithmetic.
2086: -------------------------------------------------------------------------------
2087: */
2088: float32 float32_sqrt( float32 a )
2089: {
2090: flag aSign;
2091: int16 aExp, zExp;
2092: bits32 aSig, zSig;
2093: bits64 rem, term;
2094:
2095: aSig = extractFloat32Frac( a );
2096: aExp = extractFloat32Exp( a );
2097: aSign = extractFloat32Sign( a );
2098: if ( aExp == 0xFF ) {
2099: if ( aSig ) return propagateFloat32NaN( a, 0 );
2100: if ( ! aSign ) return a;
2101: float_raise( float_flag_invalid );
2102: return float32_default_nan;
2103: }
2104: if ( aSign ) {
2105: if ( ( aExp | aSig ) == 0 ) return a;
2106: float_raise( float_flag_invalid );
2107: return float32_default_nan;
2108: }
2109: if ( aExp == 0 ) {
2110: if ( aSig == 0 ) return 0;
2111: normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2112: }
2113: zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2114: aSig = ( aSig | 0x00800000 )<<8;
2115: zSig = estimateSqrt32( aExp, aSig ) + 2;
2116: if ( ( zSig & 0x7F ) <= 5 ) {
2117: if ( zSig < 2 ) {
2118: zSig = 0x7FFFFFFF;
2119: goto roundAndPack;
2120: }
2121: aSig >>= aExp & 1;
2122: term = ( (bits64) zSig ) * zSig;
2123: rem = ( ( (bits64) aSig )<<32 ) - term;
2124: while ( (sbits64) rem < 0 ) {
2125: --zSig;
2126: rem += ( ( (bits64) zSig )<<1 ) | 1;
2127: }
2128: zSig |= ( rem != 0 );
2129: }
2130: shift32RightJamming( zSig, 1, &zSig );
2131: roundAndPack:
2132: return roundAndPackFloat32( 0, zExp, zSig );
2133:
2134: }
2135: #endif /* !SOFTFLOAT_FOR_GCC */
2136:
2137: /*
2138: -------------------------------------------------------------------------------
2139: Returns 1 if the single-precision floating-point value `a' is equal to
2140: the corresponding value `b', and 0 otherwise. The comparison is performed
2141: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2142: -------------------------------------------------------------------------------
2143: */
2144: flag float32_eq( float32 a, float32 b )
2145: {
2146:
2147: if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2148: || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2149: ) {
2150: if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2151: float_raise( float_flag_invalid );
2152: }
2153: return 0;
2154: }
2155: return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2156:
2157: }
2158:
2159: /*
2160: -------------------------------------------------------------------------------
2161: Returns 1 if the single-precision floating-point value `a' is less than
2162: or equal to the corresponding value `b', and 0 otherwise. The comparison
2163: is performed according to the IEC/IEEE Standard for Binary Floating-Point
2164: Arithmetic.
2165: -------------------------------------------------------------------------------
2166: */
2167: flag float32_le( float32 a, float32 b )
2168: {
2169: flag aSign, bSign;
2170:
2171: if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2172: || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2173: ) {
2174: float_raise( float_flag_invalid );
2175: return 0;
2176: }
2177: aSign = extractFloat32Sign( a );
2178: bSign = extractFloat32Sign( b );
2179: if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2180: return ( a == b ) || ( aSign ^ ( a < b ) );
2181:
2182: }
2183:
2184: /*
2185: -------------------------------------------------------------------------------
2186: Returns 1 if the single-precision floating-point value `a' is less than
2187: the corresponding value `b', and 0 otherwise. The comparison is performed
2188: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2189: -------------------------------------------------------------------------------
2190: */
2191: flag float32_lt( float32 a, float32 b )
2192: {
2193: flag aSign, bSign;
2194:
2195: if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2196: || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2197: ) {
2198: float_raise( float_flag_invalid );
2199: return 0;
2200: }
2201: aSign = extractFloat32Sign( a );
2202: bSign = extractFloat32Sign( b );
2203: if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2204: return ( a != b ) && ( aSign ^ ( a < b ) );
2205:
2206: }
2207:
2208: #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2209: /*
2210: -------------------------------------------------------------------------------
2211: Returns 1 if the single-precision floating-point value `a' is equal to
2212: the corresponding value `b', and 0 otherwise. The invalid exception is
2213: raised if either operand is a NaN. Otherwise, the comparison is performed
2214: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2215: -------------------------------------------------------------------------------
2216: */
2217: flag float32_eq_signaling( float32 a, float32 b )
2218: {
2219:
2220: if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2221: || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2222: ) {
2223: float_raise( float_flag_invalid );
2224: return 0;
2225: }
2226: return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2227:
2228: }
2229:
2230: /*
2231: -------------------------------------------------------------------------------
2232: Returns 1 if the single-precision floating-point value `a' is less than or
2233: equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2234: cause an exception. Otherwise, the comparison is performed according to the
2235: IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2236: -------------------------------------------------------------------------------
2237: */
2238: flag float32_le_quiet( float32 a, float32 b )
2239: {
2240: flag aSign, bSign;
2241:
2242: if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2243: || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2244: ) {
2245: if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2246: float_raise( float_flag_invalid );
2247: }
2248: return 0;
2249: }
2250: aSign = extractFloat32Sign( a );
2251: bSign = extractFloat32Sign( b );
2252: if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2253: return ( a == b ) || ( aSign ^ ( a < b ) );
2254:
2255: }
2256:
2257: /*
2258: -------------------------------------------------------------------------------
2259: Returns 1 if the single-precision floating-point value `a' is less than
2260: the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2261: exception. Otherwise, the comparison is performed according to the IEC/IEEE
2262: Standard for Binary Floating-Point Arithmetic.
2263: -------------------------------------------------------------------------------
2264: */
2265: flag float32_lt_quiet( float32 a, float32 b )
2266: {
2267: flag aSign, bSign;
2268:
2269: if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2270: || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2271: ) {
2272: if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2273: float_raise( float_flag_invalid );
2274: }
2275: return 0;
2276: }
2277: aSign = extractFloat32Sign( a );
2278: bSign = extractFloat32Sign( b );
2279: if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2280: return ( a != b ) && ( aSign ^ ( a < b ) );
2281:
2282: }
2283: #endif /* !SOFTFLOAT_FOR_GCC */
2284:
2285: #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2286: /*
2287: -------------------------------------------------------------------------------
2288: Returns the result of converting the double-precision floating-point value
2289: `a' to the 32-bit two's complement integer format. The conversion is
2290: performed according to the IEC/IEEE Standard for Binary Floating-Point
2291: Arithmetic---which means in particular that the conversion is rounded
2292: according to the current rounding mode. If `a' is a NaN, the largest
2293: positive integer is returned. Otherwise, if the conversion overflows, the
2294: largest integer with the same sign as `a' is returned.
2295: -------------------------------------------------------------------------------
2296: */
2297: int32 float64_to_int32( float64 a )
2298: {
2299: flag aSign;
2300: int16 aExp, shiftCount;
2301: bits64 aSig;
2302:
2303: aSig = extractFloat64Frac( a );
2304: aExp = extractFloat64Exp( a );
2305: aSign = extractFloat64Sign( a );
2306: if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2307: if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2308: shiftCount = 0x42C - aExp;
2309: if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2310: return roundAndPackInt32( aSign, aSig );
2311:
2312: }
2313: #endif /* !SOFTFLOAT_FOR_GCC */
2314:
2315: /*
2316: -------------------------------------------------------------------------------
2317: Returns the result of converting the double-precision floating-point value
2318: `a' to the 32-bit two's complement integer format. The conversion is
2319: performed according to the IEC/IEEE Standard for Binary Floating-Point
2320: Arithmetic, except that the conversion is always rounded toward zero.
2321: If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2322: the conversion overflows, the largest integer with the same sign as `a' is
2323: returned.
2324: -------------------------------------------------------------------------------
2325: */
2326: int32 float64_to_int32_round_to_zero( float64 a )
2327: {
2328: flag aSign;
2329: int16 aExp, shiftCount;
2330: bits64 aSig, savedASig;
2331: int32 z;
2332:
2333: aSig = extractFloat64Frac( a );
2334: aExp = extractFloat64Exp( a );
2335: aSign = extractFloat64Sign( a );
2336: if ( 0x41E < aExp ) {
2337: if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2338: goto invalid;
2339: }
2340: else if ( aExp < 0x3FF ) {
2341: if ( aExp || aSig ) float_set_inexact();
2342: return 0;
2343: }
2344: aSig |= LIT64( 0x0010000000000000 );
2345: shiftCount = 0x433 - aExp;
2346: savedASig = aSig;
2347: aSig >>= shiftCount;
2348: z = aSig;
2349: if ( aSign ) z = - z;
2350: if ( ( z < 0 ) ^ aSign ) {
2351: invalid:
2352: float_raise( float_flag_invalid );
2353: return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2354: }
2355: if ( ( aSig<<shiftCount ) != savedASig ) {
2356: float_set_inexact();
2357: }
2358: return z;
2359:
2360: }
2361:
2362: #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
2363: /*
2364: -------------------------------------------------------------------------------
2365: Returns the result of converting the double-precision floating-point value
2366: `a' to the 64-bit two's complement integer format. The conversion is
2367: performed according to the IEC/IEEE Standard for Binary Floating-Point
2368: Arithmetic---which means in particular that the conversion is rounded
2369: according to the current rounding mode. If `a' is a NaN, the largest
2370: positive integer is returned. Otherwise, if the conversion overflows, the
2371: largest integer with the same sign as `a' is returned.
2372: -------------------------------------------------------------------------------
2373: */
2374: int64 float64_to_int64( float64 a )
2375: {
2376: flag aSign;
2377: int16 aExp, shiftCount;
2378: bits64 aSig, aSigExtra;
2379:
2380: aSig = extractFloat64Frac( a );
2381: aExp = extractFloat64Exp( a );
2382: aSign = extractFloat64Sign( a );
2383: if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2384: shiftCount = 0x433 - aExp;
2385: if ( shiftCount <= 0 ) {
2386: if ( 0x43E < aExp ) {
2387: float_raise( float_flag_invalid );
2388: if ( ! aSign
2389: || ( ( aExp == 0x7FF )
2390: && ( aSig != LIT64( 0x0010000000000000 ) ) )
2391: ) {
2392: return LIT64( 0x7FFFFFFFFFFFFFFF );
2393: }
2394: return (sbits64) LIT64( 0x8000000000000000 );
2395: }
2396: aSigExtra = 0;
2397: aSig <<= - shiftCount;
2398: }
2399: else {
2400: shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2401: }
2402: return roundAndPackInt64( aSign, aSig, aSigExtra );
2403:
2404: }
2405:
2406: /*
2407: -------------------------------------------------------------------------------
2408: Returns the result of converting the double-precision floating-point value
2409: `a' to the 64-bit two's complement integer format. The conversion is
2410: performed according to the IEC/IEEE Standard for Binary Floating-Point
2411: Arithmetic, except that the conversion is always rounded toward zero.
2412: If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2413: the conversion overflows, the largest integer with the same sign as `a' is
2414: returned.
2415: -------------------------------------------------------------------------------
2416: */
2417: int64 float64_to_int64_round_to_zero( float64 a )
2418: {
2419: flag aSign;
2420: int16 aExp, shiftCount;
2421: bits64 aSig;
2422: int64 z;
2423:
2424: aSig = extractFloat64Frac( a );
2425: aExp = extractFloat64Exp( a );
2426: aSign = extractFloat64Sign( a );
2427: if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2428: shiftCount = aExp - 0x433;
2429: if ( 0 <= shiftCount ) {
2430: if ( 0x43E <= aExp ) {
2431: if ( a != LIT64( 0xC3E0000000000000 ) ) {
2432: float_raise( float_flag_invalid );
2433: if ( ! aSign
2434: || ( ( aExp == 0x7FF )
2435: && ( aSig != LIT64( 0x0010000000000000 ) ) )
2436: ) {
2437: return LIT64( 0x7FFFFFFFFFFFFFFF );
2438: }
2439: }
2440: return (sbits64) LIT64( 0x8000000000000000 );
2441: }
2442: z = aSig<<shiftCount;
2443: }
2444: else {
2445: if ( aExp < 0x3FE ) {
2446: if ( aExp | aSig ) float_set_inexact();
2447: return 0;
2448: }
2449: z = aSig>>( - shiftCount );
2450: if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2451: float_set_inexact();
2452: }
2453: }
2454: if ( aSign ) z = - z;
2455: return z;
2456:
2457: }
2458: #endif /* !SOFTFLOAT_FOR_GCC */
2459:
2460: /*
2461: -------------------------------------------------------------------------------
2462: Returns the result of converting the double-precision floating-point value
2463: `a' to the single-precision floating-point format. The conversion is
2464: performed according to the IEC/IEEE Standard for Binary Floating-Point
2465: Arithmetic.
2466: -------------------------------------------------------------------------------
2467: */
2468: float32 float64_to_float32( float64 a )
2469: {
2470: flag aSign;
2471: int16 aExp;
2472: bits64 aSig;
2473: bits32 zSig;
2474:
2475: aSig = extractFloat64Frac( a );
2476: aExp = extractFloat64Exp( a );
2477: aSign = extractFloat64Sign( a );
2478: if ( aExp == 0x7FF ) {
2479: if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2480: return packFloat32( aSign, 0xFF, 0 );
2481: }
2482: shift64RightJamming( aSig, 22, &aSig );
2483: zSig = aSig;
2484: if ( aExp || zSig ) {
2485: zSig |= 0x40000000;
2486: aExp -= 0x381;
2487: }
2488: return roundAndPackFloat32( aSign, aExp, zSig );
2489:
2490: }
2491:
2492: #ifdef FLOATX80
2493:
2494: /*
2495: -------------------------------------------------------------------------------
2496: Returns the result of converting the double-precision floating-point value
2497: `a' to the extended double-precision floating-point format. The conversion
2498: is performed according to the IEC/IEEE Standard for Binary Floating-Point
2499: Arithmetic.
2500: -------------------------------------------------------------------------------
2501: */
2502: floatx80 float64_to_floatx80( float64 a )
2503: {
2504: flag aSign;
2505: int16 aExp;
2506: bits64 aSig;
2507:
2508: aSig = extractFloat64Frac( a );
2509: aExp = extractFloat64Exp( a );
2510: aSign = extractFloat64Sign( a );
2511: if ( aExp == 0x7FF ) {
2512: if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2513: return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2514: }
2515: if ( aExp == 0 ) {
2516: if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2517: normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2518: }
2519: return
2520: packFloatx80(
2521: aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2522:
2523: }
2524:
2525: #endif
2526:
2527: #ifdef FLOAT128
2528:
2529: /*
2530: -------------------------------------------------------------------------------
2531: Returns the result of converting the double-precision floating-point value
2532: `a' to the quadruple-precision floating-point format. The conversion is
2533: performed according to the IEC/IEEE Standard for Binary Floating-Point
2534: Arithmetic.
2535: -------------------------------------------------------------------------------
2536: */
2537: float128 float64_to_float128( float64 a )
2538: {
2539: flag aSign;
2540: int16 aExp;
2541: bits64 aSig, zSig0, zSig1;
2542:
2543: aSig = extractFloat64Frac( a );
2544: aExp = extractFloat64Exp( a );
2545: aSign = extractFloat64Sign( a );
2546: if ( aExp == 0x7FF ) {
2547: if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2548: return packFloat128( aSign, 0x7FFF, 0, 0 );
2549: }
2550: if ( aExp == 0 ) {
2551: if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2552: normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2553: --aExp;
2554: }
2555: shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2556: return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2557:
2558: }
2559:
2560: #endif
2561:
2562: #ifndef SOFTFLOAT_FOR_GCC
2563: /*
2564: -------------------------------------------------------------------------------
2565: Rounds the double-precision floating-point value `a' to an integer, and
2566: returns the result as a double-precision floating-point value. The
2567: operation is performed according to the IEC/IEEE Standard for Binary
2568: Floating-Point Arithmetic.
2569: -------------------------------------------------------------------------------
2570: */
2571: float64 float64_round_to_int( float64 a )
2572: {
2573: flag aSign;
2574: int16 aExp;
2575: bits64 lastBitMask, roundBitsMask;
2576: int8 roundingMode;
2577: float64 z;
2578:
2579: aExp = extractFloat64Exp( a );
2580: if ( 0x433 <= aExp ) {
2581: if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2582: return propagateFloat64NaN( a, a );
2583: }
2584: return a;
2585: }
2586: if ( aExp < 0x3FF ) {
2587: if ( (bits64) ( a<<1 ) == 0 ) return a;
2588: float_set_inexact();
2589: aSign = extractFloat64Sign( a );
2590: switch ( float_rounding_mode() ) {
2591: case float_round_nearest_even:
2592: if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2593: return packFloat64( aSign, 0x3FF, 0 );
2594: }
2595: break;
2596: case float_round_down:
2597: return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2598: case float_round_up:
2599: return
2600: aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2601: }
2602: return packFloat64( aSign, 0, 0 );
2603: }
2604: lastBitMask = 1;
2605: lastBitMask <<= 0x433 - aExp;
2606: roundBitsMask = lastBitMask - 1;
2607: z = a;
2608: roundingMode = float_rounding_mode();
2609: if ( roundingMode == float_round_nearest_even ) {
2610: z += lastBitMask>>1;
2611: if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2612: }
2613: else if ( roundingMode != float_round_to_zero ) {
2614: if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2615: z += roundBitsMask;
2616: }
2617: }
2618: z &= ~ roundBitsMask;
2619: if ( z != a ) float_set_inexact();
2620: return z;
2621:
2622: }
2623: #endif
2624:
2625: /*
2626: -------------------------------------------------------------------------------
2627: Returns the result of adding the absolute values of the double-precision
2628: floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2629: before being returned. `zSign' is ignored if the result is a NaN.
2630: The addition is performed according to the IEC/IEEE Standard for Binary
2631: Floating-Point Arithmetic.
2632: -------------------------------------------------------------------------------
2633: */
2634: static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2635: {
2636: int16 aExp, bExp, zExp;
2637: bits64 aSig, bSig, zSig;
2638: int16 expDiff;
2639:
2640: aSig = extractFloat64Frac( a );
2641: aExp = extractFloat64Exp( a );
2642: bSig = extractFloat64Frac( b );
2643: bExp = extractFloat64Exp( b );
2644: expDiff = aExp - bExp;
2645: aSig <<= 9;
2646: bSig <<= 9;
2647: if ( 0 < expDiff ) {
2648: if ( aExp == 0x7FF ) {
2649: if ( aSig ) return propagateFloat64NaN( a, b );
2650: return a;
2651: }
2652: if ( bExp == 0 ) {
2653: --expDiff;
2654: }
2655: else {
2656: bSig |= LIT64( 0x2000000000000000 );
2657: }
2658: shift64RightJamming( bSig, expDiff, &bSig );
2659: zExp = aExp;
2660: }
2661: else if ( expDiff < 0 ) {
2662: if ( bExp == 0x7FF ) {
2663: if ( bSig ) return propagateFloat64NaN( a, b );
2664: return packFloat64( zSign, 0x7FF, 0 );
2665: }
2666: if ( aExp == 0 ) {
2667: ++expDiff;
2668: }
2669: else {
2670: aSig |= LIT64( 0x2000000000000000 );
2671: }
2672: shift64RightJamming( aSig, - expDiff, &aSig );
2673: zExp = bExp;
2674: }
2675: else {
2676: if ( aExp == 0x7FF ) {
2677: if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2678: return a;
2679: }
2680: if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2681: zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2682: zExp = aExp;
2683: goto roundAndPack;
2684: }
2685: aSig |= LIT64( 0x2000000000000000 );
2686: zSig = ( aSig + bSig )<<1;
2687: --zExp;
2688: if ( (sbits64) zSig < 0 ) {
2689: zSig = aSig + bSig;
2690: ++zExp;
2691: }
2692: roundAndPack:
2693: return roundAndPackFloat64( zSign, zExp, zSig );
2694:
2695: }
2696:
2697: /*
2698: -------------------------------------------------------------------------------
2699: Returns the result of subtracting the absolute values of the double-
2700: precision floating-point values `a' and `b'. If `zSign' is 1, the
2701: difference is negated before being returned. `zSign' is ignored if the
2702: result is a NaN. The subtraction is performed according to the IEC/IEEE
2703: Standard for Binary Floating-Point Arithmetic.
2704: -------------------------------------------------------------------------------
2705: */
2706: static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2707: {
2708: int16 aExp, bExp, zExp;
2709: bits64 aSig, bSig, zSig;
2710: int16 expDiff;
2711:
2712: aSig = extractFloat64Frac( a );
2713: aExp = extractFloat64Exp( a );
2714: bSig = extractFloat64Frac( b );
2715: bExp = extractFloat64Exp( b );
2716: expDiff = aExp - bExp;
2717: aSig <<= 10;
2718: bSig <<= 10;
2719: if ( 0 < expDiff ) goto aExpBigger;
2720: if ( expDiff < 0 ) goto bExpBigger;
2721: if ( aExp == 0x7FF ) {
2722: if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2723: float_raise( float_flag_invalid );
2724: return float64_default_nan;
2725: }
2726: if ( aExp == 0 ) {
2727: aExp = 1;
2728: bExp = 1;
2729: }
2730: if ( bSig < aSig ) goto aBigger;
2731: if ( aSig < bSig ) goto bBigger;
2732: return packFloat64( float_rounding_mode() == float_round_down, 0, 0 );
2733: bExpBigger:
2734: if ( bExp == 0x7FF ) {
2735: if ( bSig ) return propagateFloat64NaN( a, b );
2736: return packFloat64( zSign ^ 1, 0x7FF, 0 );
2737: }
2738: if ( aExp == 0 ) {
2739: ++expDiff;
2740: }
2741: else {
2742: aSig |= LIT64( 0x4000000000000000 );
2743: }
2744: shift64RightJamming( aSig, - expDiff, &aSig );
2745: bSig |= LIT64( 0x4000000000000000 );
2746: bBigger:
2747: zSig = bSig - aSig;
2748: zExp = bExp;
2749: zSign ^= 1;
2750: goto normalizeRoundAndPack;
2751: aExpBigger:
2752: if ( aExp == 0x7FF ) {
2753: if ( aSig ) return propagateFloat64NaN( a, b );
2754: return a;
2755: }
2756: if ( bExp == 0 ) {
2757: --expDiff;
2758: }
2759: else {
2760: bSig |= LIT64( 0x4000000000000000 );
2761: }
2762: shift64RightJamming( bSig, expDiff, &bSig );
2763: aSig |= LIT64( 0x4000000000000000 );
2764: aBigger:
2765: zSig = aSig - bSig;
2766: zExp = aExp;
2767: normalizeRoundAndPack:
2768: --zExp;
2769: return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2770:
2771: }
2772:
2773: /*
2774: -------------------------------------------------------------------------------
2775: Returns the result of adding the double-precision floating-point values `a'
2776: and `b'. The operation is performed according to the IEC/IEEE Standard for
2777: Binary Floating-Point Arithmetic.
2778: -------------------------------------------------------------------------------
2779: */
2780: float64 float64_add( float64 a, float64 b )
2781: {
2782: flag aSign, bSign;
2783:
2784: aSign = extractFloat64Sign( a );
2785: bSign = extractFloat64Sign( b );
2786: if ( aSign == bSign ) {
2787: return addFloat64Sigs( a, b, aSign );
2788: }
2789: else {
2790: return subFloat64Sigs( a, b, aSign );
2791: }
2792:
2793: }
2794:
2795: /*
2796: -------------------------------------------------------------------------------
2797: Returns the result of subtracting the double-precision floating-point values
2798: `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2799: for Binary Floating-Point Arithmetic.
2800: -------------------------------------------------------------------------------
2801: */
2802: float64 float64_sub( float64 a, float64 b )
2803: {
2804: flag aSign, bSign;
2805:
2806: aSign = extractFloat64Sign( a );
2807: bSign = extractFloat64Sign( b );
2808: if ( aSign == bSign ) {
2809: return subFloat64Sigs( a, b, aSign );
2810: }
2811: else {
2812: return addFloat64Sigs( a, b, aSign );
2813: }
2814:
2815: }
2816:
2817: /*
2818: -------------------------------------------------------------------------------
2819: Returns the result of multiplying the double-precision floating-point values
2820: `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2821: for Binary Floating-Point Arithmetic.
2822: -------------------------------------------------------------------------------
2823: */
2824: float64 float64_mul( float64 a, float64 b )
2825: {
2826: flag aSign, bSign, zSign;
2827: int16 aExp, bExp, zExp;
2828: bits64 aSig, bSig, zSig0, zSig1;
2829:
2830: aSig = extractFloat64Frac( a );
2831: aExp = extractFloat64Exp( a );
2832: aSign = extractFloat64Sign( a );
2833: bSig = extractFloat64Frac( b );
2834: bExp = extractFloat64Exp( b );
2835: bSign = extractFloat64Sign( b );
2836: zSign = aSign ^ bSign;
2837: if ( aExp == 0x7FF ) {
2838: if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2839: return propagateFloat64NaN( a, b );
2840: }
2841: if ( ( bExp | bSig ) == 0 ) {
2842: float_raise( float_flag_invalid );
2843: return float64_default_nan;
2844: }
2845: return packFloat64( zSign, 0x7FF, 0 );
2846: }
2847: if ( bExp == 0x7FF ) {
2848: if ( bSig ) return propagateFloat64NaN( a, b );
2849: if ( ( aExp | aSig ) == 0 ) {
2850: float_raise( float_flag_invalid );
2851: return float64_default_nan;
2852: }
2853: return packFloat64( zSign, 0x7FF, 0 );
2854: }
2855: if ( aExp == 0 ) {
2856: if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2857: normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2858: }
2859: if ( bExp == 0 ) {
2860: if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2861: normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2862: }
2863: zExp = aExp + bExp - 0x3FF;
2864: aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2865: bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2866: mul64To128( aSig, bSig, &zSig0, &zSig1 );
2867: zSig0 |= ( zSig1 != 0 );
2868: if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2869: zSig0 <<= 1;
2870: --zExp;
2871: }
2872: return roundAndPackFloat64( zSign, zExp, zSig0 );
2873:
2874: }
2875:
2876: /*
2877: -------------------------------------------------------------------------------
2878: Returns the result of dividing the double-precision floating-point value `a'
2879: by the corresponding value `b'. The operation is performed according to
2880: the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2881: -------------------------------------------------------------------------------
2882: */
2883: float64 float64_div( float64 a, float64 b )
2884: {
2885: flag aSign, bSign, zSign;
2886: int16 aExp, bExp, zExp;
2887: bits64 aSig, bSig, zSig;
2888: bits64 rem0, rem1;
2889: bits64 term0, term1;
2890:
2891: aSig = extractFloat64Frac( a );
2892: aExp = extractFloat64Exp( a );
2893: aSign = extractFloat64Sign( a );
2894: bSig = extractFloat64Frac( b );
2895: bExp = extractFloat64Exp( b );
2896: bSign = extractFloat64Sign( b );
2897: zSign = aSign ^ bSign;
2898: if ( aExp == 0x7FF ) {
2899: if ( aSig ) return propagateFloat64NaN( a, b );
2900: if ( bExp == 0x7FF ) {
2901: if ( bSig ) return propagateFloat64NaN( a, b );
2902: float_raise( float_flag_invalid );
2903: return float64_default_nan;
2904: }
2905: return packFloat64( zSign, 0x7FF, 0 );
2906: }
2907: if ( bExp == 0x7FF ) {
2908: if ( bSig ) return propagateFloat64NaN( a, b );
2909: return packFloat64( zSign, 0, 0 );
2910: }
2911: if ( bExp == 0 ) {
2912: if ( bSig == 0 ) {
2913: if ( ( aExp | aSig ) == 0 ) {
2914: float_raise( float_flag_invalid );
2915: return float64_default_nan;
2916: }
2917: float_raise( float_flag_divbyzero );
2918: return packFloat64( zSign, 0x7FF, 0 );
2919: }
2920: normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2921: }
2922: if ( aExp == 0 ) {
2923: if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2924: normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2925: }
2926: zExp = aExp - bExp + 0x3FD;
2927: aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2928: bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2929: if ( bSig <= ( aSig + aSig ) ) {
2930: aSig >>= 1;
2931: ++zExp;
2932: }
2933: zSig = estimateDiv128To64( aSig, 0, bSig );
2934: if ( ( zSig & 0x1FF ) <= 2 ) {
2935: mul64To128( bSig, zSig, &term0, &term1 );
2936: sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2937: while ( (sbits64) rem0 < 0 ) {
2938: --zSig;
2939: add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2940: }
2941: zSig |= ( rem1 != 0 );
2942: }
2943: return roundAndPackFloat64( zSign, zExp, zSig );
2944:
2945: }
2946:
2947: #ifndef SOFTFLOAT_FOR_GCC
2948: /*
2949: -------------------------------------------------------------------------------
2950: Returns the remainder of the double-precision floating-point value `a'
2951: with respect to the corresponding value `b'. The operation is performed
2952: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2953: -------------------------------------------------------------------------------
2954: */
2955: float64 float64_rem( float64 a, float64 b )
2956: {
2957: flag aSign, bSign, zSign;
2958: int16 aExp, bExp, expDiff;
2959: bits64 aSig, bSig;
2960: bits64 q, alternateASig;
2961: sbits64 sigMean;
2962:
2963: aSig = extractFloat64Frac( a );
2964: aExp = extractFloat64Exp( a );
2965: aSign = extractFloat64Sign( a );
2966: bSig = extractFloat64Frac( b );
2967: bExp = extractFloat64Exp( b );
2968: bSign = extractFloat64Sign( b );
2969: if ( aExp == 0x7FF ) {
2970: if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2971: return propagateFloat64NaN( a, b );
2972: }
2973: float_raise( float_flag_invalid );
2974: return float64_default_nan;
2975: }
2976: if ( bExp == 0x7FF ) {
2977: if ( bSig ) return propagateFloat64NaN( a, b );
2978: return a;
2979: }
2980: if ( bExp == 0 ) {
2981: if ( bSig == 0 ) {
2982: float_raise( float_flag_invalid );
2983: return float64_default_nan;
2984: }
2985: normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2986: }
2987: if ( aExp == 0 ) {
2988: if ( aSig == 0 ) return a;
2989: normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2990: }
2991: expDiff = aExp - bExp;
2992: aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2993: bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2994: if ( expDiff < 0 ) {
2995: if ( expDiff < -1 ) return a;
2996: aSig >>= 1;
2997: }
2998: q = ( bSig <= aSig );
2999: if ( q ) aSig -= bSig;
3000: expDiff -= 64;
3001: while ( 0 < expDiff ) {
3002: q = estimateDiv128To64( aSig, 0, bSig );
3003: q = ( 2 < q ) ? q - 2 : 0;
3004: aSig = - ( ( bSig>>2 ) * q );
3005: expDiff -= 62;
3006: }
3007: expDiff += 64;
3008: if ( 0 < expDiff ) {
3009: q = estimateDiv128To64( aSig, 0, bSig );
3010: q = ( 2 < q ) ? q - 2 : 0;
3011: q >>= 64 - expDiff;
3012: bSig >>= 2;
3013: aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3014: }
3015: else {
3016: aSig >>= 2;
3017: bSig >>= 2;
3018: }
3019: do {
3020: alternateASig = aSig;
3021: ++q;
3022: aSig -= bSig;
3023: } while ( 0 <= (sbits64) aSig );
3024: sigMean = aSig + alternateASig;
3025: if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3026: aSig = alternateASig;
3027: }
3028: zSign = ( (sbits64) aSig < 0 );
3029: if ( zSign ) aSig = - aSig;
3030: return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
3031:
3032: }
3033:
3034: /*
3035: -------------------------------------------------------------------------------
3036: Returns the square root of the double-precision floating-point value `a'.
3037: The operation is performed according to the IEC/IEEE Standard for Binary
3038: Floating-Point Arithmetic.
3039: -------------------------------------------------------------------------------
3040: */
3041: float64 float64_sqrt( float64 a )
3042: {
3043: flag aSign;
3044: int16 aExp, zExp;
3045: bits64 aSig, zSig, doubleZSig;
3046: bits64 rem0, rem1, term0, term1;
3047:
3048: aSig = extractFloat64Frac( a );
3049: aExp = extractFloat64Exp( a );
3050: aSign = extractFloat64Sign( a );
3051: if ( aExp == 0x7FF ) {
3052: if ( aSig ) return propagateFloat64NaN( a, a );
3053: if ( ! aSign ) return a;
3054: float_raise( float_flag_invalid );
3055: return float64_default_nan;
3056: }
3057: if ( aSign ) {
3058: if ( ( aExp | aSig ) == 0 ) return a;
3059: float_raise( float_flag_invalid );
3060: return float64_default_nan;
3061: }
3062: if ( aExp == 0 ) {
3063: if ( aSig == 0 ) return 0;
3064: normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3065: }
3066: zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3067: aSig |= LIT64( 0x0010000000000000 );
3068: zSig = estimateSqrt32( aExp, aSig>>21 );
3069: aSig <<= 9 - ( aExp & 1 );
3070: zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3071: if ( ( zSig & 0x1FF ) <= 5 ) {
3072: doubleZSig = zSig<<1;
3073: mul64To128( zSig, zSig, &term0, &term1 );
3074: sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3075: while ( (sbits64) rem0 < 0 ) {
3076: --zSig;
3077: doubleZSig -= 2;
3078: add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3079: }
3080: zSig |= ( ( rem0 | rem1 ) != 0 );
3081: }
3082: return roundAndPackFloat64( 0, zExp, zSig );
3083:
3084: }
3085: #endif
3086:
3087: /*
3088: -------------------------------------------------------------------------------
3089: Returns 1 if the double-precision floating-point value `a' is equal to the
3090: corresponding value `b', and 0 otherwise. The comparison is performed
3091: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3092: -------------------------------------------------------------------------------
3093: */
3094: flag float64_eq( float64 a, float64 b )
3095: {
3096:
3097: if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3098: || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3099: ) {
3100: if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3101: float_raise( float_flag_invalid );
3102: }
3103: return 0;
3104: }
3105: return ( a == b ) ||
3106: ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
3107:
3108: }
3109:
3110: /*
3111: -------------------------------------------------------------------------------
3112: Returns 1 if the double-precision floating-point value `a' is less than or
3113: equal to the corresponding value `b', and 0 otherwise. The comparison is
3114: performed according to the IEC/IEEE Standard for Binary Floating-Point
3115: Arithmetic.
3116: -------------------------------------------------------------------------------
3117: */
3118: flag float64_le( float64 a, float64 b )
3119: {
3120: flag aSign, bSign;
3121:
3122: if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3123: || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3124: ) {
3125: float_raise( float_flag_invalid );
3126: return 0;
3127: }
3128: aSign = extractFloat64Sign( a );
3129: bSign = extractFloat64Sign( b );
3130: if ( aSign != bSign )
3131: return aSign ||
3132: ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
3133: 0 );
3134: return ( a == b ) ||
3135: ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3136:
3137: }
3138:
3139: /*
3140: -------------------------------------------------------------------------------
3141: Returns 1 if the double-precision floating-point value `a' is less than
3142: the corresponding value `b', and 0 otherwise. The comparison is performed
3143: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3144: -------------------------------------------------------------------------------
3145: */
3146: flag float64_lt( float64 a, float64 b )
3147: {
3148: flag aSign, bSign;
3149:
3150: if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3151: || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3152: ) {
3153: float_raise( float_flag_invalid );
3154: return 0;
3155: }
3156: aSign = extractFloat64Sign( a );
3157: bSign = extractFloat64Sign( b );
3158: if ( aSign != bSign )
3159: return aSign &&
3160: ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
3161: 0 );
3162: return ( a != b ) &&
3163: ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
3164:
3165: }
3166:
3167: #ifndef SOFTFLOAT_FOR_GCC
3168: /*
3169: -------------------------------------------------------------------------------
3170: Returns 1 if the double-precision floating-point value `a' is equal to the
3171: corresponding value `b', and 0 otherwise. The invalid exception is raised
3172: if either operand is a NaN. Otherwise, the comparison is performed
3173: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3174: -------------------------------------------------------------------------------
3175: */
3176: flag float64_eq_signaling( float64 a, float64 b )
3177: {
3178:
3179: if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3180: || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3181: ) {
3182: float_raise( float_flag_invalid );
3183: return 0;
3184: }
3185: return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3186:
3187: }
3188:
3189: /*
3190: -------------------------------------------------------------------------------
3191: Returns 1 if the double-precision floating-point value `a' is less than or
3192: equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3193: cause an exception. Otherwise, the comparison is performed according to the
3194: IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3195: -------------------------------------------------------------------------------
3196: */
3197: flag float64_le_quiet( float64 a, float64 b )
3198: {
3199: flag aSign, bSign;
3200:
3201: if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3202: || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3203: ) {
3204: if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3205: float_raise( float_flag_invalid );
3206: }
3207: return 0;
3208: }
3209: aSign = extractFloat64Sign( a );
3210: bSign = extractFloat64Sign( b );
3211: if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3212: return ( a == b ) || ( aSign ^ ( a < b ) );
3213:
3214: }
3215:
3216: /*
3217: -------------------------------------------------------------------------------
3218: Returns 1 if the double-precision floating-point value `a' is less than
3219: the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3220: exception. Otherwise, the comparison is performed according to the IEC/IEEE
3221: Standard for Binary Floating-Point Arithmetic.
3222: -------------------------------------------------------------------------------
3223: */
3224: flag float64_lt_quiet( float64 a, float64 b )
3225: {
3226: flag aSign, bSign;
3227:
3228: if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3229: || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3230: ) {
3231: if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3232: float_raise( float_flag_invalid );
3233: }
3234: return 0;
3235: }
3236: aSign = extractFloat64Sign( a );
3237: bSign = extractFloat64Sign( b );
3238: if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3239: return ( a != b ) && ( aSign ^ ( a < b ) );
3240:
3241: }
3242: #endif
3243:
3244: #ifdef FLOATX80
3245:
3246: /*
3247: -------------------------------------------------------------------------------
3248: Returns the result of converting the extended double-precision floating-
3249: point value `a' to the 32-bit two's complement integer format. The
3250: conversion is performed according to the IEC/IEEE Standard for Binary
3251: Floating-Point Arithmetic---which means in particular that the conversion
3252: is rounded according to the current rounding mode. If `a' is a NaN, the
3253: largest positive integer is returned. Otherwise, if the conversion
3254: overflows, the largest integer with the same sign as `a' is returned.
3255: -------------------------------------------------------------------------------
3256: */
3257: int32 floatx80_to_int32( floatx80 a )
3258: {
3259: flag aSign;
3260: int32 aExp, shiftCount;
3261: bits64 aSig;
3262:
3263: aSig = extractFloatx80Frac( a );
3264: aExp = extractFloatx80Exp( a );
3265: aSign = extractFloatx80Sign( a );
3266: if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3267: shiftCount = 0x4037 - aExp;
3268: if ( shiftCount <= 0 ) shiftCount = 1;
3269: shift64RightJamming( aSig, shiftCount, &aSig );
3270: return roundAndPackInt32( aSign, aSig );
3271:
3272: }
3273:
3274: /*
3275: -------------------------------------------------------------------------------
3276: Returns the result of converting the extended double-precision floating-
3277: point value `a' to the 32-bit two's complement integer format. The
3278: conversion is performed according to the IEC/IEEE Standard for Binary
3279: Floating-Point Arithmetic, except that the conversion is always rounded
3280: toward zero. If `a' is a NaN, the largest positive integer is returned.
3281: Otherwise, if the conversion overflows, the largest integer with the same
3282: sign as `a' is returned.
3283: -------------------------------------------------------------------------------
3284: */
3285: int32 floatx80_to_int32_round_to_zero( floatx80 a )
3286: {
3287: flag aSign;
3288: int32 aExp, shiftCount;
3289: bits64 aSig, savedASig;
3290: int32 z;
3291:
3292: aSig = extractFloatx80Frac( a );
3293: aExp = extractFloatx80Exp( a );
3294: aSign = extractFloatx80Sign( a );
3295: if ( 0x401E < aExp ) {
3296: if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3297: goto invalid;
3298: }
3299: else if ( aExp < 0x3FFF ) {
3300: if ( aExp || aSig ) float_set_inexact();
3301: return 0;
3302: }
3303: shiftCount = 0x403E - aExp;
3304: savedASig = aSig;
3305: aSig >>= shiftCount;
3306: z = aSig;
3307: if ( aSign ) z = - z;
3308: if ( ( z < 0 ) ^ aSign ) {
3309: invalid:
3310: float_raise( float_flag_invalid );
3311: return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3312: }
3313: if ( ( aSig<<shiftCount ) != savedASig ) {
3314: float_set_inexact();
3315: }
3316: return z;
3317:
3318: }
3319:
3320: /*
3321: -------------------------------------------------------------------------------
3322: Returns the result of converting the extended double-precision floating-
3323: point value `a' to the 64-bit two's complement integer format. The
3324: conversion is performed according to the IEC/IEEE Standard for Binary
3325: Floating-Point Arithmetic---which means in particular that the conversion
3326: is rounded according to the current rounding mode. If `a' is a NaN,
3327: the largest positive integer is returned. Otherwise, if the conversion
3328: overflows, the largest integer with the same sign as `a' is returned.
3329: -------------------------------------------------------------------------------
3330: */
3331: int64 floatx80_to_int64( floatx80 a )
3332: {
3333: flag aSign;
3334: int32 aExp, shiftCount;
3335: bits64 aSig, aSigExtra;
3336:
3337: aSig = extractFloatx80Frac( a );
3338: aExp = extractFloatx80Exp( a );
3339: aSign = extractFloatx80Sign( a );
3340: shiftCount = 0x403E - aExp;
3341: if ( shiftCount <= 0 ) {
3342: if ( shiftCount ) {
3343: float_raise( float_flag_invalid );
3344: if ( ! aSign
3345: || ( ( aExp == 0x7FFF )
3346: && ( aSig != LIT64( 0x8000000000000000 ) ) )
3347: ) {
3348: return LIT64( 0x7FFFFFFFFFFFFFFF );
3349: }
3350: return (sbits64) LIT64( 0x8000000000000000 );
3351: }
3352: aSigExtra = 0;
3353: }
3354: else {
3355: shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3356: }
3357: return roundAndPackInt64( aSign, aSig, aSigExtra );
3358:
3359: }
3360:
3361: /*
3362: -------------------------------------------------------------------------------
3363: Returns the result of converting the extended double-precision floating-
3364: point value `a' to the 64-bit two's complement integer format. The
3365: conversion is performed according to the IEC/IEEE Standard for Binary
3366: Floating-Point Arithmetic, except that the conversion is always rounded
3367: toward zero. If `a' is a NaN, the largest positive integer is returned.
3368: Otherwise, if the conversion overflows, the largest integer with the same
3369: sign as `a' is returned.
3370: -------------------------------------------------------------------------------
3371: */
3372: int64 floatx80_to_int64_round_to_zero( floatx80 a )
3373: {
3374: flag aSign;
3375: int32 aExp, shiftCount;
3376: bits64 aSig;
3377: int64 z;
3378:
3379: aSig = extractFloatx80Frac( a );
3380: aExp = extractFloatx80Exp( a );
3381: aSign = extractFloatx80Sign( a );
3382: shiftCount = aExp - 0x403E;
3383: if ( 0 <= shiftCount ) {
3384: aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3385: if ( ( a.high != 0xC03E ) || aSig ) {
3386: float_raise( float_flag_invalid );
3387: if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3388: return LIT64( 0x7FFFFFFFFFFFFFFF );
3389: }
3390: }
3391: return (sbits64) LIT64( 0x8000000000000000 );
3392: }
3393: else if ( aExp < 0x3FFF ) {
3394: if ( aExp | aSig ) float_set_inexact();
3395: return 0;
3396: }
3397: z = aSig>>( - shiftCount );
3398: if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3399: float_set_inexact();
3400: }
3401: if ( aSign ) z = - z;
3402: return z;
3403:
3404: }
3405:
3406: /*
3407: -------------------------------------------------------------------------------
3408: Returns the result of converting the extended double-precision floating-
3409: point value `a' to the single-precision floating-point format. The
3410: conversion is performed according to the IEC/IEEE Standard for Binary
3411: Floating-Point Arithmetic.
3412: -------------------------------------------------------------------------------
3413: */
3414: float32 floatx80_to_float32( floatx80 a )
3415: {
3416: flag aSign;
3417: int32 aExp;
3418: bits64 aSig;
3419:
3420: aSig = extractFloatx80Frac( a );
3421: aExp = extractFloatx80Exp( a );
3422: aSign = extractFloatx80Sign( a );
3423: if ( aExp == 0x7FFF ) {
3424: if ( (bits64) ( aSig<<1 ) ) {
3425: return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3426: }
3427: return packFloat32( aSign, 0xFF, 0 );
3428: }
3429: shift64RightJamming( aSig, 33, &aSig );
3430: if ( aExp || aSig ) aExp -= 0x3F81;
3431: return roundAndPackFloat32( aSign, aExp, aSig );
3432:
3433: }
3434:
3435: /*
3436: -------------------------------------------------------------------------------
3437: Returns the result of converting the extended double-precision floating-
3438: point value `a' to the double-precision floating-point format. The
3439: conversion is performed according to the IEC/IEEE Standard for Binary
3440: Floating-Point Arithmetic.
3441: -------------------------------------------------------------------------------
3442: */
3443: float64 floatx80_to_float64( floatx80 a )
3444: {
3445: flag aSign;
3446: int32 aExp;
3447: bits64 aSig, zSig;
3448:
3449: aSig = extractFloatx80Frac( a );
3450: aExp = extractFloatx80Exp( a );
3451: aSign = extractFloatx80Sign( a );
3452: if ( aExp == 0x7FFF ) {
3453: if ( (bits64) ( aSig<<1 ) ) {
3454: return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3455: }
3456: return packFloat64( aSign, 0x7FF, 0 );
3457: }
3458: shift64RightJamming( aSig, 1, &zSig );
3459: if ( aExp || aSig ) aExp -= 0x3C01;
3460: return roundAndPackFloat64( aSign, aExp, zSig );
3461:
3462: }
3463:
3464: #ifdef FLOAT128
3465:
3466: /*
3467: -------------------------------------------------------------------------------
3468: Returns the result of converting the extended double-precision floating-
3469: point value `a' to the quadruple-precision floating-point format. The
3470: conversion is performed according to the IEC/IEEE Standard for Binary
3471: Floating-Point Arithmetic.
3472: -------------------------------------------------------------------------------
3473: */
3474: float128 floatx80_to_float128( floatx80 a )
3475: {
3476: flag aSign;
3477: int16 aExp;
3478: bits64 aSig, zSig0, zSig1;
3479:
3480: aSig = extractFloatx80Frac( a );
3481: aExp = extractFloatx80Exp( a );
3482: aSign = extractFloatx80Sign( a );
3483: if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3484: return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3485: }
3486: shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3487: return packFloat128( aSign, aExp, zSig0, zSig1 );
3488:
3489: }
3490:
3491: #endif
3492:
3493: /*
3494: -------------------------------------------------------------------------------
3495: Rounds the extended double-precision floating-point value `a' to an integer,
3496: and returns the result as an extended quadruple-precision floating-point
3497: value. The operation is performed according to the IEC/IEEE Standard for
3498: Binary Floating-Point Arithmetic.
3499: -------------------------------------------------------------------------------
3500: */
3501: floatx80 floatx80_round_to_int( floatx80 a )
3502: {
3503: flag aSign;
3504: int32 aExp;
3505: bits64 lastBitMask, roundBitsMask;
3506: int8 roundingMode;
3507: floatx80 z;
3508:
3509: aExp = extractFloatx80Exp( a );
3510: if ( 0x403E <= aExp ) {
3511: if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3512: return propagateFloatx80NaN( a, a );
3513: }
3514: return a;
3515: }
3516: if ( aExp < 0x3FFF ) {
3517: if ( ( aExp == 0 )
3518: && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3519: return a;
3520: }
3521: float_set_inexact();
3522: aSign = extractFloatx80Sign( a );
3523: switch ( float_rounding_mode() ) {
3524: case float_round_nearest_even:
3525: if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3526: ) {
3527: return
3528: packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3529: }
3530: break;
3531: case float_round_down:
3532: return
3533: aSign ?
3534: packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3535: : packFloatx80( 0, 0, 0 );
3536: case float_round_up:
3537: return
3538: aSign ? packFloatx80( 1, 0, 0 )
3539: : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3540: }
3541: return packFloatx80( aSign, 0, 0 );
3542: }
3543: lastBitMask = 1;
3544: lastBitMask <<= 0x403E - aExp;
3545: roundBitsMask = lastBitMask - 1;
3546: z = a;
3547: roundingMode = float_rounding_mode();
3548: if ( roundingMode == float_round_nearest_even ) {
3549: z.low += lastBitMask>>1;
3550: if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3551: }
3552: else if ( roundingMode != float_round_to_zero ) {
3553: if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3554: z.low += roundBitsMask;
3555: }
3556: }
3557: z.low &= ~ roundBitsMask;
3558: if ( z.low == 0 ) {
3559: ++z.high;
3560: z.low = LIT64( 0x8000000000000000 );
3561: }
3562: if ( z.low != a.low ) float_set_inexact();
3563: return z;
3564:
3565: }
3566:
3567: /*
3568: -------------------------------------------------------------------------------
3569: Returns the result of adding the absolute values of the extended double-
3570: precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3571: negated before being returned. `zSign' is ignored if the result is a NaN.
3572: The addition is performed according to the IEC/IEEE Standard for Binary
3573: Floating-Point Arithmetic.
3574: -------------------------------------------------------------------------------
3575: */
3576: static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3577: {
3578: int32 aExp, bExp, zExp;
3579: bits64 aSig, bSig, zSig0, zSig1;
3580: int32 expDiff;
3581:
3582: aSig = extractFloatx80Frac( a );
3583: aExp = extractFloatx80Exp( a );
3584: bSig = extractFloatx80Frac( b );
3585: bExp = extractFloatx80Exp( b );
3586: expDiff = aExp - bExp;
3587: if ( 0 < expDiff ) {
3588: if ( aExp == 0x7FFF ) {
3589: if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3590: return a;
3591: }
3592: if ( bExp == 0 ) --expDiff;
3593: shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3594: zExp = aExp;
3595: }
3596: else if ( expDiff < 0 ) {
3597: if ( bExp == 0x7FFF ) {
3598: if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3599: return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3600: }
3601: if ( aExp == 0 ) ++expDiff;
3602: shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3603: zExp = bExp;
3604: }
3605: else {
3606: if ( aExp == 0x7FFF ) {
3607: if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3608: return propagateFloatx80NaN( a, b );
3609: }
3610: return a;
3611: }
3612: zSig1 = 0;
3613: zSig0 = aSig + bSig;
3614: if ( aExp == 0 ) {
3615: normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3616: goto roundAndPack;
3617: }
3618: zExp = aExp;
3619: goto shiftRight1;
3620: }
3621: zSig0 = aSig + bSig;
3622: if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3623: shiftRight1:
3624: shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3625: zSig0 |= LIT64( 0x8000000000000000 );
3626: ++zExp;
3627: roundAndPack:
3628: return
3629: roundAndPackFloatx80(
3630: floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3631:
3632: }
3633:
3634: /*
3635: -------------------------------------------------------------------------------
3636: Returns the result of subtracting the absolute values of the extended
3637: double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3638: difference is negated before being returned. `zSign' is ignored if the
3639: result is a NaN. The subtraction is performed according to the IEC/IEEE
3640: Standard for Binary Floating-Point Arithmetic.
3641: -------------------------------------------------------------------------------
3642: */
3643: static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3644: {
3645: int32 aExp, bExp, zExp;
3646: bits64 aSig, bSig, zSig0, zSig1;
3647: int32 expDiff;
3648: floatx80 z;
3649:
3650: aSig = extractFloatx80Frac( a );
3651: aExp = extractFloatx80Exp( a );
3652: bSig = extractFloatx80Frac( b );
3653: bExp = extractFloatx80Exp( b );
3654: expDiff = aExp - bExp;
3655: if ( 0 < expDiff ) goto aExpBigger;
3656: if ( expDiff < 0 ) goto bExpBigger;
3657: if ( aExp == 0x7FFF ) {
3658: if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3659: return propagateFloatx80NaN( a, b );
3660: }
3661: float_raise( float_flag_invalid );
3662: z.low = floatx80_default_nan_low;
3663: z.high = floatx80_default_nan_high;
3664: return z;
3665: }
3666: if ( aExp == 0 ) {
3667: aExp = 1;
3668: bExp = 1;
3669: }
3670: zSig1 = 0;
3671: if ( bSig < aSig ) goto aBigger;
3672: if ( aSig < bSig ) goto bBigger;
3673: return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 );
3674: bExpBigger:
3675: if ( bExp == 0x7FFF ) {
3676: if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3677: return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3678: }
3679: if ( aExp == 0 ) ++expDiff;
3680: shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3681: bBigger:
3682: sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3683: zExp = bExp;
3684: zSign ^= 1;
3685: goto normalizeRoundAndPack;
3686: aExpBigger:
3687: if ( aExp == 0x7FFF ) {
3688: if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3689: return a;
3690: }
3691: if ( bExp == 0 ) --expDiff;
3692: shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3693: aBigger:
3694: sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3695: zExp = aExp;
3696: normalizeRoundAndPack:
3697: return
3698: normalizeRoundAndPackFloatx80(
3699: floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3700:
3701: }
3702:
3703: /*
3704: -------------------------------------------------------------------------------
3705: Returns the result of adding the extended double-precision floating-point
3706: values `a' and `b'. The operation is performed according to the IEC/IEEE
3707: Standard for Binary Floating-Point Arithmetic.
3708: -------------------------------------------------------------------------------
3709: */
3710: floatx80 floatx80_add( floatx80 a, floatx80 b )
3711: {
3712: flag aSign, bSign;
3713:
3714: aSign = extractFloatx80Sign( a );
3715: bSign = extractFloatx80Sign( b );
3716: if ( aSign == bSign ) {
3717: return addFloatx80Sigs( a, b, aSign );
3718: }
3719: else {
3720: return subFloatx80Sigs( a, b, aSign );
3721: }
3722:
3723: }
3724:
3725: /*
3726: -------------------------------------------------------------------------------
3727: Returns the result of subtracting the extended double-precision floating-
3728: point values `a' and `b'. The operation is performed according to the
3729: IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3730: -------------------------------------------------------------------------------
3731: */
3732: floatx80 floatx80_sub( floatx80 a, floatx80 b )
3733: {
3734: flag aSign, bSign;
3735:
3736: aSign = extractFloatx80Sign( a );
3737: bSign = extractFloatx80Sign( b );
3738: if ( aSign == bSign ) {
3739: return subFloatx80Sigs( a, b, aSign );
3740: }
3741: else {
3742: return addFloatx80Sigs( a, b, aSign );
3743: }
3744:
3745: }
3746:
3747: /*
3748: -------------------------------------------------------------------------------
3749: Returns the result of multiplying the extended double-precision floating-
3750: point values `a' and `b'. The operation is performed according to the
3751: IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3752: -------------------------------------------------------------------------------
3753: */
3754: floatx80 floatx80_mul( floatx80 a, floatx80 b )
3755: {
3756: flag aSign, bSign, zSign;
3757: int32 aExp, bExp, zExp;
3758: bits64 aSig, bSig, zSig0, zSig1;
3759: floatx80 z;
3760:
3761: aSig = extractFloatx80Frac( a );
3762: aExp = extractFloatx80Exp( a );
3763: aSign = extractFloatx80Sign( a );
3764: bSig = extractFloatx80Frac( b );
3765: bExp = extractFloatx80Exp( b );
3766: bSign = extractFloatx80Sign( b );
3767: zSign = aSign ^ bSign;
3768: if ( aExp == 0x7FFF ) {
3769: if ( (bits64) ( aSig<<1 )
3770: || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3771: return propagateFloatx80NaN( a, b );
3772: }
3773: if ( ( bExp | bSig ) == 0 ) goto invalid;
3774: return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3775: }
3776: if ( bExp == 0x7FFF ) {
3777: if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3778: if ( ( aExp | aSig ) == 0 ) {
3779: invalid:
3780: float_raise( float_flag_invalid );
3781: z.low = floatx80_default_nan_low;
3782: z.high = floatx80_default_nan_high;
3783: return z;
3784: }
3785: return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3786: }
3787: if ( aExp == 0 ) {
3788: if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3789: normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3790: }
3791: if ( bExp == 0 ) {
3792: if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3793: normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3794: }
3795: zExp = aExp + bExp - 0x3FFE;
3796: mul64To128( aSig, bSig, &zSig0, &zSig1 );
3797: if ( 0 < (sbits64) zSig0 ) {
3798: shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3799: --zExp;
3800: }
3801: return
3802: roundAndPackFloatx80(
3803: floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3804:
3805: }
3806:
3807: /*
3808: -------------------------------------------------------------------------------
3809: Returns the result of dividing the extended double-precision floating-point
3810: value `a' by the corresponding value `b'. The operation is performed
3811: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3812: -------------------------------------------------------------------------------
3813: */
3814: floatx80 floatx80_div( floatx80 a, floatx80 b )
3815: {
3816: flag aSign, bSign, zSign;
3817: int32 aExp, bExp, zExp;
3818: bits64 aSig, bSig, zSig0, zSig1;
3819: bits64 rem0, rem1, rem2, term0, term1, term2;
3820: floatx80 z;
3821:
3822: aSig = extractFloatx80Frac( a );
3823: aExp = extractFloatx80Exp( a );
3824: aSign = extractFloatx80Sign( a );
3825: bSig = extractFloatx80Frac( b );
3826: bExp = extractFloatx80Exp( b );
3827: bSign = extractFloatx80Sign( b );
3828: zSign = aSign ^ bSign;
3829: if ( aExp == 0x7FFF ) {
3830: if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3831: if ( bExp == 0x7FFF ) {
3832: if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3833: goto invalid;
3834: }
3835: return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3836: }
3837: if ( bExp == 0x7FFF ) {
3838: if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3839: return packFloatx80( zSign, 0, 0 );
3840: }
3841: if ( bExp == 0 ) {
3842: if ( bSig == 0 ) {
3843: if ( ( aExp | aSig ) == 0 ) {
3844: invalid:
3845: float_raise( float_flag_invalid );
3846: z.low = floatx80_default_nan_low;
3847: z.high = floatx80_default_nan_high;
3848: return z;
3849: }
3850: float_raise( float_flag_divbyzero );
3851: return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3852: }
3853: normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3854: }
3855: if ( aExp == 0 ) {
3856: if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3857: normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3858: }
3859: zExp = aExp - bExp + 0x3FFE;
3860: rem1 = 0;
3861: if ( bSig <= aSig ) {
3862: shift128Right( aSig, 0, 1, &aSig, &rem1 );
3863: ++zExp;
3864: }
3865: zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3866: mul64To128( bSig, zSig0, &term0, &term1 );
3867: sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3868: while ( (sbits64) rem0 < 0 ) {
3869: --zSig0;
3870: add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3871: }
3872: zSig1 = estimateDiv128To64( rem1, 0, bSig );
3873: if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3874: mul64To128( bSig, zSig1, &term1, &term2 );
3875: sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3876: while ( (sbits64) rem1 < 0 ) {
3877: --zSig1;
3878: add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3879: }
3880: zSig1 |= ( ( rem1 | rem2 ) != 0 );
3881: }
3882: return
3883: roundAndPackFloatx80(
3884: floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3885:
3886: }
3887:
3888: /*
3889: -------------------------------------------------------------------------------
3890: Returns the remainder of the extended double-precision floating-point value
3891: `a' with respect to the corresponding value `b'. The operation is performed
3892: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3893: -------------------------------------------------------------------------------
3894: */
3895: floatx80 floatx80_rem( floatx80 a, floatx80 b )
3896: {
3897: flag aSign, bSign, zSign;
3898: int32 aExp, bExp, expDiff;
3899: bits64 aSig0, aSig1, bSig;
3900: bits64 q, term0, term1, alternateASig0, alternateASig1;
3901: floatx80 z;
3902:
3903: aSig0 = extractFloatx80Frac( a );
3904: aExp = extractFloatx80Exp( a );
3905: aSign = extractFloatx80Sign( a );
3906: bSig = extractFloatx80Frac( b );
3907: bExp = extractFloatx80Exp( b );
3908: bSign = extractFloatx80Sign( b );
3909: if ( aExp == 0x7FFF ) {
3910: if ( (bits64) ( aSig0<<1 )
3911: || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3912: return propagateFloatx80NaN( a, b );
3913: }
3914: goto invalid;
3915: }
3916: if ( bExp == 0x7FFF ) {
3917: if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3918: return a;
3919: }
3920: if ( bExp == 0 ) {
3921: if ( bSig == 0 ) {
3922: invalid:
3923: float_raise( float_flag_invalid );
3924: z.low = floatx80_default_nan_low;
3925: z.high = floatx80_default_nan_high;
3926: return z;
3927: }
3928: normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3929: }
3930: if ( aExp == 0 ) {
3931: if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3932: normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3933: }
3934: bSig |= LIT64( 0x8000000000000000 );
3935: zSign = aSign;
3936: expDiff = aExp - bExp;
3937: aSig1 = 0;
3938: if ( expDiff < 0 ) {
3939: if ( expDiff < -1 ) return a;
3940: shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3941: expDiff = 0;
3942: }
3943: q = ( bSig <= aSig0 );
3944: if ( q ) aSig0 -= bSig;
3945: expDiff -= 64;
3946: while ( 0 < expDiff ) {
3947: q = estimateDiv128To64( aSig0, aSig1, bSig );
3948: q = ( 2 < q ) ? q - 2 : 0;
3949: mul64To128( bSig, q, &term0, &term1 );
3950: sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3951: shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3952: expDiff -= 62;
3953: }
3954: expDiff += 64;
3955: if ( 0 < expDiff ) {
3956: q = estimateDiv128To64( aSig0, aSig1, bSig );
3957: q = ( 2 < q ) ? q - 2 : 0;
3958: q >>= 64 - expDiff;
3959: mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3960: sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3961: shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3962: while ( le128( term0, term1, aSig0, aSig1 ) ) {
3963: ++q;
3964: sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3965: }
3966: }
3967: else {
3968: term1 = 0;
3969: term0 = bSig;
3970: }
3971: sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3972: if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3973: || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3974: && ( q & 1 ) )
3975: ) {
3976: aSig0 = alternateASig0;
3977: aSig1 = alternateASig1;
3978: zSign = ! zSign;
3979: }
3980: return
3981: normalizeRoundAndPackFloatx80(
3982: 80, zSign, bExp + expDiff, aSig0, aSig1 );
3983:
3984: }
3985:
3986: /*
3987: -------------------------------------------------------------------------------
3988: Returns the square root of the extended double-precision floating-point
3989: value `a'. The operation is performed according to the IEC/IEEE Standard
3990: for Binary Floating-Point Arithmetic.
3991: -------------------------------------------------------------------------------
3992: */
3993: floatx80 floatx80_sqrt( floatx80 a )
3994: {
3995: flag aSign;
3996: int32 aExp, zExp;
3997: bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3998: bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3999: floatx80 z;
4000:
4001: aSig0 = extractFloatx80Frac( a );
4002: aExp = extractFloatx80Exp( a );
4003: aSign = extractFloatx80Sign( a );
4004: if ( aExp == 0x7FFF ) {
4005: if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
4006: if ( ! aSign ) return a;
4007: goto invalid;
4008: }
4009: if ( aSign ) {
4010: if ( ( aExp | aSig0 ) == 0 ) return a;
4011: invalid:
4012: float_raise( float_flag_invalid );
4013: z.low = floatx80_default_nan_low;
4014: z.high = floatx80_default_nan_high;
4015: return z;
4016: }
4017: if ( aExp == 0 ) {
4018: if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4019: normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4020: }
4021: zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4022: zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4023: shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4024: zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4025: doubleZSig0 = zSig0<<1;
4026: mul64To128( zSig0, zSig0, &term0, &term1 );
4027: sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4028: while ( (sbits64) rem0 < 0 ) {
4029: --zSig0;
4030: doubleZSig0 -= 2;
4031: add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4032: }
4033: zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4034: if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4035: if ( zSig1 == 0 ) zSig1 = 1;
4036: mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4037: sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4038: mul64To128( zSig1, zSig1, &term2, &term3 );
4039: sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4040: while ( (sbits64) rem1 < 0 ) {
4041: --zSig1;
4042: shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4043: term3 |= 1;
4044: term2 |= doubleZSig0;
4045: add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4046: }
4047: zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4048: }
4049: shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4050: zSig0 |= doubleZSig0;
4051: return
4052: roundAndPackFloatx80(
4053: floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
4054:
4055: }
4056:
4057: /*
4058: -------------------------------------------------------------------------------
4059: Returns 1 if the extended double-precision floating-point value `a' is
4060: equal to the corresponding value `b', and 0 otherwise. The comparison is
4061: performed according to the IEC/IEEE Standard for Binary Floating-Point
4062: Arithmetic.
4063: -------------------------------------------------------------------------------
4064: */
4065: flag floatx80_eq( floatx80 a, floatx80 b )
4066: {
4067:
4068: if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4069: && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4070: || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4071: && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4072: ) {
4073: if ( floatx80_is_signaling_nan( a )
4074: || floatx80_is_signaling_nan( b ) ) {
4075: float_raise( float_flag_invalid );
4076: }
4077: return 0;
4078: }
4079: return
4080: ( a.low == b.low )
4081: && ( ( a.high == b.high )
4082: || ( ( a.low == 0 )
4083: && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4084: );
4085:
4086: }
4087:
4088: /*
4089: -------------------------------------------------------------------------------
4090: Returns 1 if the extended double-precision floating-point value `a' is
4091: less than or equal to the corresponding value `b', and 0 otherwise. The
4092: comparison is performed according to the IEC/IEEE Standard for Binary
4093: Floating-Point Arithmetic.
4094: -------------------------------------------------------------------------------
4095: */
4096: flag floatx80_le( floatx80 a, floatx80 b )
4097: {
4098: flag aSign, bSign;
4099:
4100: if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4101: && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4102: || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4103: && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4104: ) {
4105: float_raise( float_flag_invalid );
4106: return 0;
4107: }
4108: aSign = extractFloatx80Sign( a );
4109: bSign = extractFloatx80Sign( b );
4110: if ( aSign != bSign ) {
4111: return
4112: aSign
4113: || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4114: == 0 );
4115: }
4116: return
4117: aSign ? le128( b.high, b.low, a.high, a.low )
4118: : le128( a.high, a.low, b.high, b.low );
4119:
4120: }
4121:
4122: /*
4123: -------------------------------------------------------------------------------
4124: Returns 1 if the extended double-precision floating-point value `a' is
4125: less than the corresponding value `b', and 0 otherwise. The comparison
4126: is performed according to the IEC/IEEE Standard for Binary Floating-Point
4127: Arithmetic.
4128: -------------------------------------------------------------------------------
4129: */
4130: flag floatx80_lt( floatx80 a, floatx80 b )
4131: {
4132: flag aSign, bSign;
4133:
4134: if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4135: && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4136: || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4137: && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4138: ) {
4139: float_raise( float_flag_invalid );
4140: return 0;
4141: }
4142: aSign = extractFloatx80Sign( a );
4143: bSign = extractFloatx80Sign( b );
4144: if ( aSign != bSign ) {
4145: return
4146: aSign
4147: && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4148: != 0 );
4149: }
4150: return
4151: aSign ? lt128( b.high, b.low, a.high, a.low )
4152: : lt128( a.high, a.low, b.high, b.low );
4153:
4154: }
4155:
4156: /*
4157: -------------------------------------------------------------------------------
4158: Returns 1 if the extended double-precision floating-point value `a' is equal
4159: to the corresponding value `b', and 0 otherwise. The invalid exception is
4160: raised if either operand is a NaN. Otherwise, the comparison is performed
4161: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4162: -------------------------------------------------------------------------------
4163: */
4164: flag floatx80_eq_signaling( floatx80 a, floatx80 b )
4165: {
4166:
4167: if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4168: && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4169: || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4170: && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4171: ) {
4172: float_raise( float_flag_invalid );
4173: return 0;
4174: }
4175: return
4176: ( a.low == b.low )
4177: && ( ( a.high == b.high )
4178: || ( ( a.low == 0 )
4179: && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4180: );
4181:
4182: }
4183:
4184: /*
4185: -------------------------------------------------------------------------------
4186: Returns 1 if the extended double-precision floating-point value `a' is less
4187: than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4188: do not cause an exception. Otherwise, the comparison is performed according
4189: to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4190: -------------------------------------------------------------------------------
4191: */
4192: flag floatx80_le_quiet( floatx80 a, floatx80 b )
4193: {
4194: flag aSign, bSign;
4195:
4196: if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4197: && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4198: || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4199: && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4200: ) {
4201: if ( floatx80_is_signaling_nan( a )
4202: || floatx80_is_signaling_nan( b ) ) {
4203: float_raise( float_flag_invalid );
4204: }
4205: return 0;
4206: }
4207: aSign = extractFloatx80Sign( a );
4208: bSign = extractFloatx80Sign( b );
4209: if ( aSign != bSign ) {
4210: return
4211: aSign
4212: || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4213: == 0 );
4214: }
4215: return
4216: aSign ? le128( b.high, b.low, a.high, a.low )
4217: : le128( a.high, a.low, b.high, b.low );
4218:
4219: }
4220:
4221: /*
4222: -------------------------------------------------------------------------------
4223: Returns 1 if the extended double-precision floating-point value `a' is less
4224: than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4225: an exception. Otherwise, the comparison is performed according to the
4226: IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4227: -------------------------------------------------------------------------------
4228: */
4229: flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4230: {
4231: flag aSign, bSign;
4232:
4233: if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4234: && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4235: || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4236: && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4237: ) {
4238: if ( floatx80_is_signaling_nan( a )
4239: || floatx80_is_signaling_nan( b ) ) {
4240: float_raise( float_flag_invalid );
4241: }
4242: return 0;
4243: }
4244: aSign = extractFloatx80Sign( a );
4245: bSign = extractFloatx80Sign( b );
4246: if ( aSign != bSign ) {
4247: return
4248: aSign
4249: && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4250: != 0 );
4251: }
4252: return
4253: aSign ? lt128( b.high, b.low, a.high, a.low )
4254: : lt128( a.high, a.low, b.high, b.low );
4255:
4256: }
4257:
4258: #endif
4259:
4260: #ifdef FLOAT128
4261:
4262: /*
4263: -------------------------------------------------------------------------------
4264: Returns the result of converting the quadruple-precision floating-point
4265: value `a' to the 32-bit two's complement integer format. The conversion
4266: is performed according to the IEC/IEEE Standard for Binary Floating-Point
4267: Arithmetic---which means in particular that the conversion is rounded
4268: according to the current rounding mode. If `a' is a NaN, the largest
4269: positive integer is returned. Otherwise, if the conversion overflows, the
4270: largest integer with the same sign as `a' is returned.
4271: -------------------------------------------------------------------------------
4272: */
4273: int32 float128_to_int32( float128 a )
4274: {
4275: flag aSign;
4276: int32 aExp, shiftCount;
4277: bits64 aSig0, aSig1;
4278:
4279: aSig1 = extractFloat128Frac1( a );
4280: aSig0 = extractFloat128Frac0( a );
4281: aExp = extractFloat128Exp( a );
4282: aSign = extractFloat128Sign( a );
4283: if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4284: if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4285: aSig0 |= ( aSig1 != 0 );
4286: shiftCount = 0x4028 - aExp;
4287: if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4288: return roundAndPackInt32( aSign, aSig0 );
4289:
4290: }
4291:
4292: /*
4293: -------------------------------------------------------------------------------
4294: Returns the result of converting the quadruple-precision floating-point
4295: value `a' to the 32-bit two's complement integer format. The conversion
4296: is performed according to the IEC/IEEE Standard for Binary Floating-Point
4297: Arithmetic, except that the conversion is always rounded toward zero. If
4298: `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4299: conversion overflows, the largest integer with the same sign as `a' is
4300: returned.
4301: -------------------------------------------------------------------------------
4302: */
4303: int32 float128_to_int32_round_to_zero( float128 a )
4304: {
4305: flag aSign;
4306: int32 aExp, shiftCount;
4307: bits64 aSig0, aSig1, savedASig;
4308: int32 z;
4309:
4310: aSig1 = extractFloat128Frac1( a );
4311: aSig0 = extractFloat128Frac0( a );
4312: aExp = extractFloat128Exp( a );
4313: aSign = extractFloat128Sign( a );
4314: aSig0 |= ( aSig1 != 0 );
4315: if ( 0x401E < aExp ) {
4316: if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4317: goto invalid;
4318: }
4319: else if ( aExp < 0x3FFF ) {
4320: if ( aExp || aSig0 ) float_set_inexact();
4321: return 0;
4322: }
4323: aSig0 |= LIT64( 0x0001000000000000 );
4324: shiftCount = 0x402F - aExp;
4325: savedASig = aSig0;
4326: aSig0 >>= shiftCount;
4327: z = aSig0;
4328: if ( aSign ) z = - z;
4329: if ( ( z < 0 ) ^ aSign ) {
4330: invalid:
4331: float_raise( float_flag_invalid );
4332: return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4333: }
4334: if ( ( aSig0<<shiftCount ) != savedASig ) {
4335: float_set_inexact();
4336: }
4337: return z;
4338:
4339: }
4340:
4341: /*
4342: -------------------------------------------------------------------------------
4343: Returns the result of converting the quadruple-precision floating-point
4344: value `a' to the 64-bit two's complement integer format. The conversion
4345: is performed according to the IEC/IEEE Standard for Binary Floating-Point
4346: Arithmetic---which means in particular that the conversion is rounded
4347: according to the current rounding mode. If `a' is a NaN, the largest
4348: positive integer is returned. Otherwise, if the conversion overflows, the
4349: largest integer with the same sign as `a' is returned.
4350: -------------------------------------------------------------------------------
4351: */
4352: int64 float128_to_int64( float128 a )
4353: {
4354: flag aSign;
4355: int32 aExp, shiftCount;
4356: bits64 aSig0, aSig1;
4357:
4358: aSig1 = extractFloat128Frac1( a );
4359: aSig0 = extractFloat128Frac0( a );
4360: aExp = extractFloat128Exp( a );
4361: aSign = extractFloat128Sign( a );
4362: if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4363: shiftCount = 0x402F - aExp;
4364: if ( shiftCount <= 0 ) {
4365: if ( 0x403E < aExp ) {
4366: float_raise( float_flag_invalid );
4367: if ( ! aSign
4368: || ( ( aExp == 0x7FFF )
4369: && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4370: )
4371: ) {
4372: return LIT64( 0x7FFFFFFFFFFFFFFF );
4373: }
4374: return (sbits64) LIT64( 0x8000000000000000 );
4375: }
4376: shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4377: }
4378: else {
4379: shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4380: }
4381: return roundAndPackInt64( aSign, aSig0, aSig1 );
4382:
4383: }
4384:
4385: /*
4386: -------------------------------------------------------------------------------
4387: Returns the result of converting the quadruple-precision floating-point
4388: value `a' to the 64-bit two's complement integer format. The conversion
4389: is performed according to the IEC/IEEE Standard for Binary Floating-Point
4390: Arithmetic, except that the conversion is always rounded toward zero.
4391: If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4392: the conversion overflows, the largest integer with the same sign as `a' is
4393: returned.
4394: -------------------------------------------------------------------------------
4395: */
4396: int64 float128_to_int64_round_to_zero( float128 a )
4397: {
4398: flag aSign;
4399: int32 aExp, shiftCount;
4400: bits64 aSig0, aSig1;
4401: int64 z;
4402:
4403: aSig1 = extractFloat128Frac1( a );
4404: aSig0 = extractFloat128Frac0( a );
4405: aExp = extractFloat128Exp( a );
4406: aSign = extractFloat128Sign( a );
4407: if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4408: shiftCount = aExp - 0x402F;
4409: if ( 0 < shiftCount ) {
4410: if ( 0x403E <= aExp ) {
4411: aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4412: if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4413: && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4414: if ( aSig1 ) float_set_inexact();
4415: }
4416: else {
4417: float_raise( float_flag_invalid );
4418: if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4419: return LIT64( 0x7FFFFFFFFFFFFFFF );
4420: }
4421: }
4422: return (sbits64) LIT64( 0x8000000000000000 );
4423: }
4424: z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4425: if ( (bits64) ( aSig1<<shiftCount ) ) {
4426: float_set_inexact();
4427: }
4428: }
4429: else {
4430: if ( aExp < 0x3FFF ) {
4431: if ( aExp | aSig0 | aSig1 ) {
4432: float_set_inexact();
4433: }
4434: return 0;
4435: }
4436: z = aSig0>>( - shiftCount );
4437: if ( aSig1
4438: || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4439: float_set_inexact();
4440: }
4441: }
4442: if ( aSign ) z = - z;
4443: return z;
4444:
4445: }
4446:
4447: /*
4448: -------------------------------------------------------------------------------
4449: Returns the result of converting the quadruple-precision floating-point
4450: value `a' to the single-precision floating-point format. The conversion
4451: is performed according to the IEC/IEEE Standard for Binary Floating-Point
4452: Arithmetic.
4453: -------------------------------------------------------------------------------
4454: */
4455: float32 float128_to_float32( float128 a )
4456: {
4457: flag aSign;
4458: int32 aExp;
4459: bits64 aSig0, aSig1;
4460: bits32 zSig;
4461:
4462: aSig1 = extractFloat128Frac1( a );
4463: aSig0 = extractFloat128Frac0( a );
4464: aExp = extractFloat128Exp( a );
4465: aSign = extractFloat128Sign( a );
4466: if ( aExp == 0x7FFF ) {
4467: if ( aSig0 | aSig1 ) {
4468: return commonNaNToFloat32( float128ToCommonNaN( a ) );
4469: }
4470: return packFloat32( aSign, 0xFF, 0 );
4471: }
4472: aSig0 |= ( aSig1 != 0 );
4473: shift64RightJamming( aSig0, 18, &aSig0 );
4474: zSig = aSig0;
4475: if ( aExp || zSig ) {
4476: zSig |= 0x40000000;
4477: aExp -= 0x3F81;
4478: }
4479: return roundAndPackFloat32( aSign, aExp, zSig );
4480:
4481: }
4482:
4483: /*
4484: -------------------------------------------------------------------------------
4485: Returns the result of converting the quadruple-precision floating-point
4486: value `a' to the double-precision floating-point format. The conversion
4487: is performed according to the IEC/IEEE Standard for Binary Floating-Point
4488: Arithmetic.
4489: -------------------------------------------------------------------------------
4490: */
4491: float64 float128_to_float64( float128 a )
4492: {
4493: flag aSign;
4494: int32 aExp;
4495: bits64 aSig0, aSig1;
4496:
4497: aSig1 = extractFloat128Frac1( a );
4498: aSig0 = extractFloat128Frac0( a );
4499: aExp = extractFloat128Exp( a );
4500: aSign = extractFloat128Sign( a );
4501: if ( aExp == 0x7FFF ) {
4502: if ( aSig0 | aSig1 ) {
4503: return commonNaNToFloat64( float128ToCommonNaN( a ) );
4504: }
4505: return packFloat64( aSign, 0x7FF, 0 );
4506: }
4507: shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4508: aSig0 |= ( aSig1 != 0 );
4509: if ( aExp || aSig0 ) {
4510: aSig0 |= LIT64( 0x4000000000000000 );
4511: aExp -= 0x3C01;
4512: }
4513: return roundAndPackFloat64( aSign, aExp, aSig0 );
4514:
4515: }
4516:
4517: #ifdef FLOATX80
4518:
4519: /*
4520: -------------------------------------------------------------------------------
4521: Returns the result of converting the quadruple-precision floating-point
4522: value `a' to the extended double-precision floating-point format. The
4523: conversion is performed according to the IEC/IEEE Standard for Binary
4524: Floating-Point Arithmetic.
4525: -------------------------------------------------------------------------------
4526: */
4527: floatx80 float128_to_floatx80( float128 a )
4528: {
4529: flag aSign;
4530: int32 aExp;
4531: bits64 aSig0, aSig1;
4532:
4533: aSig1 = extractFloat128Frac1( a );
4534: aSig0 = extractFloat128Frac0( a );
4535: aExp = extractFloat128Exp( a );
4536: aSign = extractFloat128Sign( a );
4537: if ( aExp == 0x7FFF ) {
4538: if ( aSig0 | aSig1 ) {
4539: return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4540: }
4541: return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4542: }
4543: if ( aExp == 0 ) {
4544: if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4545: normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4546: }
4547: else {
4548: aSig0 |= LIT64( 0x0001000000000000 );
4549: }
4550: shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4551: return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4552:
4553: }
4554:
4555: #endif
4556:
4557: /*
4558: -------------------------------------------------------------------------------
4559: Rounds the quadruple-precision floating-point value `a' to an integer, and
4560: returns the result as a quadruple-precision floating-point value. The
4561: operation is performed according to the IEC/IEEE Standard for Binary
4562: Floating-Point Arithmetic.
4563: -------------------------------------------------------------------------------
4564: */
4565: float128 float128_round_to_int( float128 a )
4566: {
4567: flag aSign;
4568: int32 aExp;
4569: bits64 lastBitMask, roundBitsMask;
4570: int8 roundingMode;
4571: float128 z;
4572:
4573: aExp = extractFloat128Exp( a );
4574: if ( 0x402F <= aExp ) {
4575: if ( 0x406F <= aExp ) {
4576: if ( ( aExp == 0x7FFF )
4577: && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4578: ) {
4579: return propagateFloat128NaN( a, a );
4580: }
4581: return a;
4582: }
4583: lastBitMask = 1;
4584: lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4585: roundBitsMask = lastBitMask - 1;
4586: z = a;
4587: roundingMode = float_rounding_mode();
4588: if ( roundingMode == float_round_nearest_even ) {
4589: if ( lastBitMask ) {
4590: add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4591: if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4592: }
4593: else {
4594: if ( (sbits64) z.low < 0 ) {
4595: ++z.high;
4596: if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4597: }
4598: }
4599: }
4600: else if ( roundingMode != float_round_to_zero ) {
4601: if ( extractFloat128Sign( z )
4602: ^ ( roundingMode == float_round_up ) ) {
4603: add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4604: }
4605: }
4606: z.low &= ~ roundBitsMask;
4607: }
4608: else {
4609: if ( aExp < 0x3FFF ) {
4610: if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4611: float_set_inexact();
4612: aSign = extractFloat128Sign( a );
4613: switch ( float_rounding_mode() ) {
4614: case float_round_nearest_even:
4615: if ( ( aExp == 0x3FFE )
4616: && ( extractFloat128Frac0( a )
4617: | extractFloat128Frac1( a ) )
4618: ) {
4619: return packFloat128( aSign, 0x3FFF, 0, 0 );
4620: }
4621: break;
4622: case float_round_down:
4623: return
4624: aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4625: : packFloat128( 0, 0, 0, 0 );
4626: case float_round_up:
4627: return
4628: aSign ? packFloat128( 1, 0, 0, 0 )
4629: : packFloat128( 0, 0x3FFF, 0, 0 );
4630: }
4631: return packFloat128( aSign, 0, 0, 0 );
4632: }
4633: lastBitMask = 1;
4634: lastBitMask <<= 0x402F - aExp;
4635: roundBitsMask = lastBitMask - 1;
4636: z.low = 0;
4637: z.high = a.high;
4638: roundingMode = float_rounding_mode();
4639: if ( roundingMode == float_round_nearest_even ) {
4640: z.high += lastBitMask>>1;
4641: if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4642: z.high &= ~ lastBitMask;
4643: }
4644: }
4645: else if ( roundingMode != float_round_to_zero ) {
4646: if ( extractFloat128Sign( z )
4647: ^ ( roundingMode == float_round_up ) ) {
4648: z.high |= ( a.low != 0 );
4649: z.high += roundBitsMask;
4650: }
4651: }
4652: z.high &= ~ roundBitsMask;
4653: }
4654: if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4655: float_set_inexact();
4656: }
4657: return z;
4658:
4659: }
4660:
4661: /*
4662: -------------------------------------------------------------------------------
4663: Returns the result of adding the absolute values of the quadruple-precision
4664: floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4665: before being returned. `zSign' is ignored if the result is a NaN.
4666: The addition is performed according to the IEC/IEEE Standard for Binary
4667: Floating-Point Arithmetic.
4668: -------------------------------------------------------------------------------
4669: */
4670: static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4671: {
4672: int32 aExp, bExp, zExp;
4673: bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4674: int32 expDiff;
4675:
4676: aSig1 = extractFloat128Frac1( a );
4677: aSig0 = extractFloat128Frac0( a );
4678: aExp = extractFloat128Exp( a );
4679: bSig1 = extractFloat128Frac1( b );
4680: bSig0 = extractFloat128Frac0( b );
4681: bExp = extractFloat128Exp( b );
4682: expDiff = aExp - bExp;
4683: if ( 0 < expDiff ) {
4684: if ( aExp == 0x7FFF ) {
4685: if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4686: return a;
4687: }
4688: if ( bExp == 0 ) {
4689: --expDiff;
4690: }
4691: else {
4692: bSig0 |= LIT64( 0x0001000000000000 );
4693: }
4694: shift128ExtraRightJamming(
4695: bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4696: zExp = aExp;
4697: }
4698: else if ( expDiff < 0 ) {
4699: if ( bExp == 0x7FFF ) {
4700: if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4701: return packFloat128( zSign, 0x7FFF, 0, 0 );
4702: }
4703: if ( aExp == 0 ) {
4704: ++expDiff;
4705: }
4706: else {
4707: aSig0 |= LIT64( 0x0001000000000000 );
4708: }
4709: shift128ExtraRightJamming(
4710: aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4711: zExp = bExp;
4712: }
4713: else {
4714: if ( aExp == 0x7FFF ) {
4715: if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4716: return propagateFloat128NaN( a, b );
4717: }
4718: return a;
4719: }
4720: add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4721: if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4722: zSig2 = 0;
4723: zSig0 |= LIT64( 0x0002000000000000 );
4724: zExp = aExp;
4725: goto shiftRight1;
4726: }
4727: aSig0 |= LIT64( 0x0001000000000000 );
4728: add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4729: --zExp;
4730: if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4731: ++zExp;
4732: shiftRight1:
4733: shift128ExtraRightJamming(
4734: zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4735: roundAndPack:
4736: return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4737:
4738: }
4739:
4740: /*
4741: -------------------------------------------------------------------------------
4742: Returns the result of subtracting the absolute values of the quadruple-
4743: precision floating-point values `a' and `b'. If `zSign' is 1, the
4744: difference is negated before being returned. `zSign' is ignored if the
4745: result is a NaN. The subtraction is performed according to the IEC/IEEE
4746: Standard for Binary Floating-Point Arithmetic.
4747: -------------------------------------------------------------------------------
4748: */
4749: static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4750: {
4751: int32 aExp, bExp, zExp;
4752: bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4753: int32 expDiff;
4754: float128 z;
4755:
4756: aSig1 = extractFloat128Frac1( a );
4757: aSig0 = extractFloat128Frac0( a );
4758: aExp = extractFloat128Exp( a );
4759: bSig1 = extractFloat128Frac1( b );
4760: bSig0 = extractFloat128Frac0( b );
4761: bExp = extractFloat128Exp( b );
4762: expDiff = aExp - bExp;
4763: shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4764: shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4765: if ( 0 < expDiff ) goto aExpBigger;
4766: if ( expDiff < 0 ) goto bExpBigger;
4767: if ( aExp == 0x7FFF ) {
4768: if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4769: return propagateFloat128NaN( a, b );
4770: }
4771: float_raise( float_flag_invalid );
4772: z.low = float128_default_nan_low;
4773: z.high = float128_default_nan_high;
4774: return z;
4775: }
4776: if ( aExp == 0 ) {
4777: aExp = 1;
4778: bExp = 1;
4779: }
4780: if ( bSig0 < aSig0 ) goto aBigger;
4781: if ( aSig0 < bSig0 ) goto bBigger;
4782: if ( bSig1 < aSig1 ) goto aBigger;
4783: if ( aSig1 < bSig1 ) goto bBigger;
4784: return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 );
4785: bExpBigger:
4786: if ( bExp == 0x7FFF ) {
4787: if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4788: return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4789: }
4790: if ( aExp == 0 ) {
4791: ++expDiff;
4792: }
4793: else {
4794: aSig0 |= LIT64( 0x4000000000000000 );
4795: }
4796: shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4797: bSig0 |= LIT64( 0x4000000000000000 );
4798: bBigger:
4799: sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4800: zExp = bExp;
4801: zSign ^= 1;
4802: goto normalizeRoundAndPack;
4803: aExpBigger:
4804: if ( aExp == 0x7FFF ) {
4805: if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4806: return a;
4807: }
4808: if ( bExp == 0 ) {
4809: --expDiff;
4810: }
4811: else {
4812: bSig0 |= LIT64( 0x4000000000000000 );
4813: }
4814: shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4815: aSig0 |= LIT64( 0x4000000000000000 );
4816: aBigger:
4817: sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4818: zExp = aExp;
4819: normalizeRoundAndPack:
4820: --zExp;
4821: return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4822:
4823: }
4824:
4825: /*
4826: -------------------------------------------------------------------------------
4827: Returns the result of adding the quadruple-precision floating-point values
4828: `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4829: for Binary Floating-Point Arithmetic.
4830: -------------------------------------------------------------------------------
4831: */
4832: float128 float128_add( float128 a, float128 b )
4833: {
4834: flag aSign, bSign;
4835:
4836: aSign = extractFloat128Sign( a );
4837: bSign = extractFloat128Sign( b );
4838: if ( aSign == bSign ) {
4839: return addFloat128Sigs( a, b, aSign );
4840: }
4841: else {
4842: return subFloat128Sigs( a, b, aSign );
4843: }
4844:
4845: }
4846:
4847: /*
4848: -------------------------------------------------------------------------------
4849: Returns the result of subtracting the quadruple-precision floating-point
4850: values `a' and `b'. The operation is performed according to the IEC/IEEE
4851: Standard for Binary Floating-Point Arithmetic.
4852: -------------------------------------------------------------------------------
4853: */
4854: float128 float128_sub( float128 a, float128 b )
4855: {
4856: flag aSign, bSign;
4857:
4858: aSign = extractFloat128Sign( a );
4859: bSign = extractFloat128Sign( b );
4860: if ( aSign == bSign ) {
4861: return subFloat128Sigs( a, b, aSign );
4862: }
4863: else {
4864: return addFloat128Sigs( a, b, aSign );
4865: }
4866:
4867: }
4868:
4869: /*
4870: -------------------------------------------------------------------------------
4871: Returns the result of multiplying the quadruple-precision floating-point
4872: values `a' and `b'. The operation is performed according to the IEC/IEEE
4873: Standard for Binary Floating-Point Arithmetic.
4874: -------------------------------------------------------------------------------
4875: */
4876: float128 float128_mul( float128 a, float128 b )
4877: {
4878: flag aSign, bSign, zSign;
4879: int32 aExp, bExp, zExp;
4880: bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4881: float128 z;
4882:
4883: aSig1 = extractFloat128Frac1( a );
4884: aSig0 = extractFloat128Frac0( a );
4885: aExp = extractFloat128Exp( a );
4886: aSign = extractFloat128Sign( a );
4887: bSig1 = extractFloat128Frac1( b );
4888: bSig0 = extractFloat128Frac0( b );
4889: bExp = extractFloat128Exp( b );
4890: bSign = extractFloat128Sign( b );
4891: zSign = aSign ^ bSign;
4892: if ( aExp == 0x7FFF ) {
4893: if ( ( aSig0 | aSig1 )
4894: || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4895: return propagateFloat128NaN( a, b );
4896: }
4897: if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4898: return packFloat128( zSign, 0x7FFF, 0, 0 );
4899: }
4900: if ( bExp == 0x7FFF ) {
4901: if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4902: if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4903: invalid:
4904: float_raise( float_flag_invalid );
4905: z.low = float128_default_nan_low;
4906: z.high = float128_default_nan_high;
4907: return z;
4908: }
4909: return packFloat128( zSign, 0x7FFF, 0, 0 );
4910: }
4911: if ( aExp == 0 ) {
4912: if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4913: normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4914: }
4915: if ( bExp == 0 ) {
4916: if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4917: normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4918: }
4919: zExp = aExp + bExp - 0x4000;
4920: aSig0 |= LIT64( 0x0001000000000000 );
4921: shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4922: mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4923: add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4924: zSig2 |= ( zSig3 != 0 );
4925: if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4926: shift128ExtraRightJamming(
4927: zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4928: ++zExp;
4929: }
4930: return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4931:
4932: }
4933:
4934: /*
4935: -------------------------------------------------------------------------------
4936: Returns the result of dividing the quadruple-precision floating-point value
4937: `a' by the corresponding value `b'. The operation is performed according to
4938: the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4939: -------------------------------------------------------------------------------
4940: */
4941: float128 float128_div( float128 a, float128 b )
4942: {
4943: flag aSign, bSign, zSign;
4944: int32 aExp, bExp, zExp;
4945: bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4946: bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4947: float128 z;
4948:
4949: aSig1 = extractFloat128Frac1( a );
4950: aSig0 = extractFloat128Frac0( a );
4951: aExp = extractFloat128Exp( a );
4952: aSign = extractFloat128Sign( a );
4953: bSig1 = extractFloat128Frac1( b );
4954: bSig0 = extractFloat128Frac0( b );
4955: bExp = extractFloat128Exp( b );
4956: bSign = extractFloat128Sign( b );
4957: zSign = aSign ^ bSign;
4958: if ( aExp == 0x7FFF ) {
4959: if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4960: if ( bExp == 0x7FFF ) {
4961: if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4962: goto invalid;
4963: }
4964: return packFloat128( zSign, 0x7FFF, 0, 0 );
4965: }
4966: if ( bExp == 0x7FFF ) {
4967: if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4968: return packFloat128( zSign, 0, 0, 0 );
4969: }
4970: if ( bExp == 0 ) {
4971: if ( ( bSig0 | bSig1 ) == 0 ) {
4972: if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4973: invalid:
4974: float_raise( float_flag_invalid );
4975: z.low = float128_default_nan_low;
4976: z.high = float128_default_nan_high;
4977: return z;
4978: }
4979: float_raise( float_flag_divbyzero );
4980: return packFloat128( zSign, 0x7FFF, 0, 0 );
4981: }
4982: normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4983: }
4984: if ( aExp == 0 ) {
4985: if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4986: normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4987: }
4988: zExp = aExp - bExp + 0x3FFD;
4989: shortShift128Left(
4990: aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4991: shortShift128Left(
4992: bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4993: if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4994: shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4995: ++zExp;
4996: }
4997: zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4998: mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4999: sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5000: while ( (sbits64) rem0 < 0 ) {
5001: --zSig0;
5002: add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5003: }
5004: zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5005: if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5006: mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5007: sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5008: while ( (sbits64) rem1 < 0 ) {
5009: --zSig1;
5010: add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5011: }
5012: zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5013: }
5014: shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5015: return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
5016:
5017: }
5018:
5019: /*
5020: -------------------------------------------------------------------------------
5021: Returns the remainder of the quadruple-precision floating-point value `a'
5022: with respect to the corresponding value `b'. The operation is performed
5023: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5024: -------------------------------------------------------------------------------
5025: */
5026: float128 float128_rem( float128 a, float128 b )
5027: {
5028: flag aSign, bSign, zSign;
5029: int32 aExp, bExp, expDiff;
5030: bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5031: bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5032: sbits64 sigMean0;
5033: float128 z;
5034:
5035: aSig1 = extractFloat128Frac1( a );
5036: aSig0 = extractFloat128Frac0( a );
5037: aExp = extractFloat128Exp( a );
5038: aSign = extractFloat128Sign( a );
5039: bSig1 = extractFloat128Frac1( b );
5040: bSig0 = extractFloat128Frac0( b );
5041: bExp = extractFloat128Exp( b );
5042: bSign = extractFloat128Sign( b );
5043: if ( aExp == 0x7FFF ) {
5044: if ( ( aSig0 | aSig1 )
5045: || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5046: return propagateFloat128NaN( a, b );
5047: }
5048: goto invalid;
5049: }
5050: if ( bExp == 0x7FFF ) {
5051: if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
5052: return a;
5053: }
5054: if ( bExp == 0 ) {
5055: if ( ( bSig0 | bSig1 ) == 0 ) {
5056: invalid:
5057: float_raise( float_flag_invalid );
5058: z.low = float128_default_nan_low;
5059: z.high = float128_default_nan_high;
5060: return z;
5061: }
5062: normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5063: }
5064: if ( aExp == 0 ) {
5065: if ( ( aSig0 | aSig1 ) == 0 ) return a;
5066: normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5067: }
5068: expDiff = aExp - bExp;
5069: if ( expDiff < -1 ) return a;
5070: shortShift128Left(
5071: aSig0 | LIT64( 0x0001000000000000 ),
5072: aSig1,
5073: 15 - ( expDiff < 0 ),
5074: &aSig0,
5075: &aSig1
5076: );
5077: shortShift128Left(
5078: bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5079: q = le128( bSig0, bSig1, aSig0, aSig1 );
5080: if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5081: expDiff -= 64;
5082: while ( 0 < expDiff ) {
5083: q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5084: q = ( 4 < q ) ? q - 4 : 0;
5085: mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5086: shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5087: shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5088: sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5089: expDiff -= 61;
5090: }
5091: if ( -64 < expDiff ) {
5092: q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5093: q = ( 4 < q ) ? q - 4 : 0;
5094: q >>= - expDiff;
5095: shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5096: expDiff += 52;
5097: if ( expDiff < 0 ) {
5098: shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5099: }
5100: else {
5101: shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5102: }
5103: mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5104: sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5105: }
5106: else {
5107: shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5108: shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5109: }
5110: do {
5111: alternateASig0 = aSig0;
5112: alternateASig1 = aSig1;
5113: ++q;
5114: sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5115: } while ( 0 <= (sbits64) aSig0 );
5116: add128(
5117: aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
5118: if ( ( sigMean0 < 0 )
5119: || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5120: aSig0 = alternateASig0;
5121: aSig1 = alternateASig1;
5122: }
5123: zSign = ( (sbits64) aSig0 < 0 );
5124: if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5125: return
5126: normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
5127:
5128: }
5129:
5130: /*
5131: -------------------------------------------------------------------------------
5132: Returns the square root of the quadruple-precision floating-point value `a'.
5133: The operation is performed according to the IEC/IEEE Standard for Binary
5134: Floating-Point Arithmetic.
5135: -------------------------------------------------------------------------------
5136: */
5137: float128 float128_sqrt( float128 a )
5138: {
5139: flag aSign;
5140: int32 aExp, zExp;
5141: bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5142: bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5143: float128 z;
5144:
5145: aSig1 = extractFloat128Frac1( a );
5146: aSig0 = extractFloat128Frac0( a );
5147: aExp = extractFloat128Exp( a );
5148: aSign = extractFloat128Sign( a );
5149: if ( aExp == 0x7FFF ) {
5150: if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
5151: if ( ! aSign ) return a;
5152: goto invalid;
5153: }
5154: if ( aSign ) {
5155: if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5156: invalid:
5157: float_raise( float_flag_invalid );
5158: z.low = float128_default_nan_low;
5159: z.high = float128_default_nan_high;
5160: return z;
5161: }
5162: if ( aExp == 0 ) {
5163: if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5164: normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5165: }
5166: zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5167: aSig0 |= LIT64( 0x0001000000000000 );
5168: zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5169: shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5170: zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5171: doubleZSig0 = zSig0<<1;
5172: mul64To128( zSig0, zSig0, &term0, &term1 );
5173: sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5174: while ( (sbits64) rem0 < 0 ) {
5175: --zSig0;
5176: doubleZSig0 -= 2;
5177: add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5178: }
5179: zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5180: if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5181: if ( zSig1 == 0 ) zSig1 = 1;
5182: mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5183: sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5184: mul64To128( zSig1, zSig1, &term2, &term3 );
5185: sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5186: while ( (sbits64) rem1 < 0 ) {
5187: --zSig1;
5188: shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5189: term3 |= 1;
5190: term2 |= doubleZSig0;
5191: add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5192: }
5193: zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5194: }
5195: shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5196: return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
5197:
5198: }
5199:
5200: /*
5201: -------------------------------------------------------------------------------
5202: Returns 1 if the quadruple-precision floating-point value `a' is equal to
5203: the corresponding value `b', and 0 otherwise. The comparison is performed
5204: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5205: -------------------------------------------------------------------------------
5206: */
5207: flag float128_eq( float128 a, float128 b )
5208: {
5209:
5210: if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5211: && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5212: || ( ( extractFloat128Exp( b ) == 0x7FFF )
5213: && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5214: ) {
5215: if ( float128_is_signaling_nan( a )
5216: || float128_is_signaling_nan( b ) ) {
5217: float_raise( float_flag_invalid );
5218: }
5219: return 0;
5220: }
5221: return
5222: ( a.low == b.low )
5223: && ( ( a.high == b.high )
5224: || ( ( a.low == 0 )
5225: && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5226: );
5227:
5228: }
5229:
5230: /*
5231: -------------------------------------------------------------------------------
5232: Returns 1 if the quadruple-precision floating-point value `a' is less than
5233: or equal to the corresponding value `b', and 0 otherwise. The comparison
5234: is performed according to the IEC/IEEE Standard for Binary Floating-Point
5235: Arithmetic.
5236: -------------------------------------------------------------------------------
5237: */
5238: flag float128_le( float128 a, float128 b )
5239: {
5240: flag aSign, bSign;
5241:
5242: if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5243: && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5244: || ( ( extractFloat128Exp( b ) == 0x7FFF )
5245: && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5246: ) {
5247: float_raise( float_flag_invalid );
5248: return 0;
5249: }
5250: aSign = extractFloat128Sign( a );
5251: bSign = extractFloat128Sign( b );
5252: if ( aSign != bSign ) {
5253: return
5254: aSign
5255: || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5256: == 0 );
5257: }
5258: return
5259: aSign ? le128( b.high, b.low, a.high, a.low )
5260: : le128( a.high, a.low, b.high, b.low );
5261:
5262: }
5263:
5264: /*
5265: -------------------------------------------------------------------------------
5266: Returns 1 if the quadruple-precision floating-point value `a' is less than
5267: the corresponding value `b', and 0 otherwise. The comparison is performed
5268: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5269: -------------------------------------------------------------------------------
5270: */
5271: flag float128_lt( float128 a, float128 b )
5272: {
5273: flag aSign, bSign;
5274:
5275: if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5276: && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5277: || ( ( extractFloat128Exp( b ) == 0x7FFF )
5278: && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5279: ) {
5280: float_raise( float_flag_invalid );
5281: return 0;
5282: }
5283: aSign = extractFloat128Sign( a );
5284: bSign = extractFloat128Sign( b );
5285: if ( aSign != bSign ) {
5286: return
5287: aSign
5288: && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5289: != 0 );
5290: }
5291: return
5292: aSign ? lt128( b.high, b.low, a.high, a.low )
5293: : lt128( a.high, a.low, b.high, b.low );
5294:
5295: }
5296:
5297: /*
5298: -------------------------------------------------------------------------------
5299: Returns 1 if the quadruple-precision floating-point value `a' is equal to
5300: the corresponding value `b', and 0 otherwise. The invalid exception is
5301: raised if either operand is a NaN. Otherwise, the comparison is performed
5302: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5303: -------------------------------------------------------------------------------
5304: */
5305: flag float128_eq_signaling( float128 a, float128 b )
5306: {
5307:
5308: if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5309: && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5310: || ( ( extractFloat128Exp( b ) == 0x7FFF )
5311: && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5312: ) {
5313: float_raise( float_flag_invalid );
5314: return 0;
5315: }
5316: return
5317: ( a.low == b.low )
5318: && ( ( a.high == b.high )
5319: || ( ( a.low == 0 )
5320: && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5321: );
5322:
5323: }
5324:
5325: /*
5326: -------------------------------------------------------------------------------
5327: Returns 1 if the quadruple-precision floating-point value `a' is less than
5328: or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5329: cause an exception. Otherwise, the comparison is performed according to the
5330: IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5331: -------------------------------------------------------------------------------
5332: */
5333: flag float128_le_quiet( float128 a, float128 b )
5334: {
5335: flag aSign, bSign;
5336:
5337: if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5338: && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5339: || ( ( extractFloat128Exp( b ) == 0x7FFF )
5340: && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5341: ) {
5342: if ( float128_is_signaling_nan( a )
5343: || float128_is_signaling_nan( b ) ) {
5344: float_raise( float_flag_invalid );
5345: }
5346: return 0;
5347: }
5348: aSign = extractFloat128Sign( a );
5349: bSign = extractFloat128Sign( b );
5350: if ( aSign != bSign ) {
5351: return
5352: aSign
5353: || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5354: == 0 );
5355: }
5356: return
5357: aSign ? le128( b.high, b.low, a.high, a.low )
5358: : le128( a.high, a.low, b.high, b.low );
5359:
5360: }
5361:
5362: /*
5363: -------------------------------------------------------------------------------
5364: Returns 1 if the quadruple-precision floating-point value `a' is less than
5365: the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5366: exception. Otherwise, the comparison is performed according to the IEC/IEEE
5367: Standard for Binary Floating-Point Arithmetic.
5368: -------------------------------------------------------------------------------
5369: */
5370: flag float128_lt_quiet( float128 a, float128 b )
5371: {
5372: flag aSign, bSign;
5373:
5374: if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5375: && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5376: || ( ( extractFloat128Exp( b ) == 0x7FFF )
5377: && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5378: ) {
5379: if ( float128_is_signaling_nan( a )
5380: || float128_is_signaling_nan( b ) ) {
5381: float_raise( float_flag_invalid );
5382: }
5383: return 0;
5384: }
5385: aSign = extractFloat128Sign( a );
5386: bSign = extractFloat128Sign( b );
5387: if ( aSign != bSign ) {
5388: return
5389: aSign
5390: && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5391: != 0 );
5392: }
5393: return
5394: aSign ? lt128( b.high, b.low, a.high, a.low )
5395: : lt128( a.high, a.low, b.high, b.low );
5396:
5397: }
5398:
5399: #endif
5400:
5401:
5402: #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
5403:
5404: /*
5405: * These two routines are not part of the original softfloat distribution.
5406: *
5407: * They are based on the corresponding conversions to integer but return
5408: * unsigned numbers instead since these functions are required by GCC.
5409: *
5410: * Added by Mark Brinicombe <mark@netbsd.org> 27/09/97
5411: *
5412: * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
5413: */
5414:
5415: /*
5416: -------------------------------------------------------------------------------
5417: Returns the result of converting the double-precision floating-point value
5418: `a' to the 32-bit unsigned integer format. The conversion is
5419: performed according to the IEC/IEEE Standard for Binary Floating-point
5420: Arithmetic, except that the conversion is always rounded toward zero. If
5421: `a' is a NaN, the largest positive integer is returned. If the conversion
5422: overflows, the largest integer positive is returned.
5423: -------------------------------------------------------------------------------
5424: */
5425: uint32 float64_to_uint32_round_to_zero( float64 a )
5426: {
5427: flag aSign;
5428: int16 aExp, shiftCount;
5429: bits64 aSig, savedASig;
5430: uint32 z;
5431:
5432: aSig = extractFloat64Frac( a );
5433: aExp = extractFloat64Exp( a );
5434: aSign = extractFloat64Sign( a );
5435:
5436: if (aSign) {
5437: float_raise( float_flag_invalid );
5438: return(0);
5439: }
5440:
5441: if ( 0x41E < aExp ) {
5442: float_raise( float_flag_invalid );
5443: return 0xffffffff;
5444: }
5445: else if ( aExp < 0x3FF ) {
5446: if ( aExp || aSig ) float_set_inexact();
5447: return 0;
5448: }
5449: aSig |= LIT64( 0x0010000000000000 );
5450: shiftCount = 0x433 - aExp;
5451: savedASig = aSig;
5452: aSig >>= shiftCount;
5453: z = aSig;
5454: if ( ( aSig<<shiftCount ) != savedASig ) {
5455: float_set_inexact();
5456: }
5457: return z;
5458:
5459: }
5460:
5461: /*
5462: -------------------------------------------------------------------------------
5463: Returns the result of converting the single-precision floating-point value
5464: `a' to the 32-bit unsigned integer format. The conversion is
5465: performed according to the IEC/IEEE Standard for Binary Floating-point
5466: Arithmetic, except that the conversion is always rounded toward zero. If
5467: `a' is a NaN, the largest positive integer is returned. If the conversion
5468: overflows, the largest positive integer is returned.
5469: -------------------------------------------------------------------------------
5470: */
5471: uint32 float32_to_uint32_round_to_zero( float32 a )
5472: {
5473: flag aSign;
5474: int16 aExp, shiftCount;
5475: bits32 aSig;
5476: uint32 z;
5477:
5478: aSig = extractFloat32Frac( a );
5479: aExp = extractFloat32Exp( a );
5480: aSign = extractFloat32Sign( a );
5481: shiftCount = aExp - 0x9E;
5482:
5483: if (aSign) {
5484: float_raise( float_flag_invalid );
5485: return(0);
5486: }
5487: if ( 0 < shiftCount ) {
5488: float_raise( float_flag_invalid );
5489: return 0xFFFFFFFF;
5490: }
5491: else if ( aExp <= 0x7E ) {
5492: if ( aExp | aSig ) float_set_inexact();
5493: return 0;
5494: }
5495: aSig = ( aSig | 0x800000 )<<8;
5496: z = aSig>>( - shiftCount );
5497: if ( aSig<<( shiftCount & 31 ) ) {
5498: float_set_inexact();
5499: }
5500: return z;
5501:
5502: }
5503:
5504: #endif
5505:
5506: #endif /* !NO_IEEE */
CVSweb