Annotation of sys/lib/libkern/softfloat.c, Revision 1.1
1.1 ! nbrk 1: /* $OpenBSD: softfloat.c,v 1.1 2002/04/28 20:55:14 pvalchev Exp $ */
! 2: /* $NetBSD: softfloat.c,v 1.1 2001/04/26 03:10:47 ross Exp $ */
! 3:
! 4: /*
! 5: * This version hacked for use with gcc -msoft-float by bjh21.
! 6: * (Mostly a case of #ifdefing out things GCC doesn't need or provides
! 7: * itself).
! 8: */
! 9:
! 10: /*
! 11: * Things you may want to define:
! 12: *
! 13: * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
! 14: * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
! 15: * properly renamed.
! 16: */
! 17:
! 18: /*
! 19: ===============================================================================
! 20:
! 21: This C source file is part of the SoftFloat IEC/IEEE Floating-point
! 22: Arithmetic Package, Release 2a.
! 23:
! 24: Written by John R. Hauser. This work was made possible in part by the
! 25: International Computer Science Institute, located at Suite 600, 1947 Center
! 26: Street, Berkeley, California 94704. Funding was partially provided by the
! 27: National Science Foundation under grant MIP-9311980. The original version
! 28: of this code was written as part of a project to build a fixed-point vector
! 29: processor in collaboration with the University of California at Berkeley,
! 30: overseen by Profs. Nelson Morgan and John Wawrzynek. More information
! 31: is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
! 32: arithmetic/SoftFloat.html'.
! 33:
! 34: THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable
! 35: effort has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT
! 36: WILL AT TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS
! 37: RESTRICTED TO PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL
! 38: RESPONSIBILITY FOR ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM
! 39: THEIR OWN USE OF THE SOFTWARE, AND WHO ALSO EFFECTIVELY INDEMNIFY
! 40: (possibly via similar legal warning) JOHN HAUSER AND THE INTERNATIONAL
! 41: COMPUTER SCIENCE INSTITUTE AGAINST ALL LOSSES, COSTS, OR OTHER PROBLEMS
! 42: ARISING FROM THE USE OF THE SOFTWARE BY THEIR CUSTOMERS AND CLIENTS.
! 43:
! 44: Derivative works are acceptable, even for commercial purposes, so long as
! 45: (1) they include prominent notice that the work is derivative, and (2) they
! 46: include prominent notice akin to these four paragraphs for those parts of
! 47: this code that are retained.
! 48:
! 49: ===============================================================================
! 50: */
! 51:
! 52: #ifndef NO_IEEE
! 53:
! 54: #include <sys/cdefs.h>
! 55: #if defined(LIBC_SCCS) && !defined(lint)
! 56: __RCSID("$NetBSD: softfloat.c,v 1.1 2001/04/26 03:10:47 ross Exp $");
! 57: #endif /* LIBC_SCCS and not lint */
! 58:
! 59: #ifdef SOFTFLOAT_FOR_GCC
! 60: #include "softfloat-for-gcc.h"
! 61: #endif
! 62:
! 63: #include "milieu.h"
! 64: #include "softfloat.h"
! 65:
! 66: /*
! 67: * Conversions between floats as stored in memory and floats as
! 68: * SoftFloat uses them
! 69: */
! 70: #ifndef FLOAT64_DEMANGLE
! 71: #define FLOAT64_DEMANGLE(a) (a)
! 72: #endif
! 73: #ifndef FLOAT64_MANGLE
! 74: #define FLOAT64_MANGLE(a) (a)
! 75: #endif
! 76:
! 77: /*
! 78: -------------------------------------------------------------------------------
! 79: Floating-point rounding mode, extended double-precision rounding precision,
! 80: and exception flags.
! 81: -------------------------------------------------------------------------------
! 82: */
! 83:
! 84: /*
! 85: * XXX: This may cause options-MULTIPROCESSOR or thread problems someday.
! 86: * Right now, it does not. I've removed all other dynamic global
! 87: * variables. [ross]
! 88: */
! 89: #ifdef FLOATX80
! 90: int8 floatx80_rounding_precision = 80;
! 91: #endif
! 92:
! 93: /*
! 94: -------------------------------------------------------------------------------
! 95: Primitive arithmetic functions, including multi-word arithmetic, and
! 96: division and square root approximations. (Can be specialized to target if
! 97: desired.)
! 98: -------------------------------------------------------------------------------
! 99: */
! 100: #include "softfloat-macros.h"
! 101:
! 102: /*
! 103: -------------------------------------------------------------------------------
! 104: Functions and definitions to determine: (1) whether tininess for underflow
! 105: is detected before or after rounding by default, (2) what (if anything)
! 106: happens when exceptions are raised, (3) how signaling NaNs are distinguished
! 107: from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
! 108: are propagated from function inputs to output. These details are target-
! 109: specific.
! 110: -------------------------------------------------------------------------------
! 111: */
! 112: #include "softfloat-specialize.h"
! 113:
! 114: #ifndef SOFTFLOAT_FOR_GCC /* Not used */
! 115: /*
! 116: -------------------------------------------------------------------------------
! 117: Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
! 118: and 7, and returns the properly rounded 32-bit integer corresponding to the
! 119: input. If `zSign' is 1, the input is negated before being converted to an
! 120: integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
! 121: is simply rounded to an integer, with the inexact exception raised if the
! 122: input cannot be represented exactly as an integer. However, if the fixed-
! 123: point input is too large, the invalid exception is raised and the largest
! 124: positive or negative integer is returned.
! 125: -------------------------------------------------------------------------------
! 126: */
! 127: static int32 roundAndPackInt32( flag zSign, bits64 absZ )
! 128: {
! 129: int8 roundingMode;
! 130: flag roundNearestEven;
! 131: int8 roundIncrement, roundBits;
! 132: int32 z;
! 133:
! 134: roundingMode = float_rounding_mode();
! 135: roundNearestEven = ( roundingMode == float_round_nearest_even );
! 136: roundIncrement = 0x40;
! 137: if ( ! roundNearestEven ) {
! 138: if ( roundingMode == float_round_to_zero ) {
! 139: roundIncrement = 0;
! 140: }
! 141: else {
! 142: roundIncrement = 0x7F;
! 143: if ( zSign ) {
! 144: if ( roundingMode == float_round_up ) roundIncrement = 0;
! 145: }
! 146: else {
! 147: if ( roundingMode == float_round_down ) roundIncrement = 0;
! 148: }
! 149: }
! 150: }
! 151: roundBits = absZ & 0x7F;
! 152: absZ = ( absZ + roundIncrement )>>7;
! 153: absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
! 154: z = absZ;
! 155: if ( zSign ) z = - z;
! 156: if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
! 157: float_raise( float_flag_invalid );
! 158: return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
! 159: }
! 160: if ( roundBits ) float_set_inexact();
! 161: return z;
! 162:
! 163: }
! 164:
! 165: /*
! 166: -------------------------------------------------------------------------------
! 167: Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
! 168: `absZ1', with binary point between bits 63 and 64 (between the input words),
! 169: and returns the properly rounded 64-bit integer corresponding to the input.
! 170: If `zSign' is 1, the input is negated before being converted to an integer.
! 171: Ordinarily, the fixed-point input is simply rounded to an integer, with
! 172: the inexact exception raised if the input cannot be represented exactly as
! 173: an integer. However, if the fixed-point input is too large, the invalid
! 174: exception is raised and the largest positive or negative integer is
! 175: returned.
! 176: -------------------------------------------------------------------------------
! 177: */
! 178: static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
! 179: {
! 180: int8 roundingMode;
! 181: flag roundNearestEven, increment;
! 182: int64 z;
! 183:
! 184: roundingMode = float_rounding_mode();
! 185: roundNearestEven = ( roundingMode == float_round_nearest_even );
! 186: increment = ( (sbits64) absZ1 < 0 );
! 187: if ( ! roundNearestEven ) {
! 188: if ( roundingMode == float_round_to_zero ) {
! 189: increment = 0;
! 190: }
! 191: else {
! 192: if ( zSign ) {
! 193: increment = ( roundingMode == float_round_down ) && absZ1;
! 194: }
! 195: else {
! 196: increment = ( roundingMode == float_round_up ) && absZ1;
! 197: }
! 198: }
! 199: }
! 200: if ( increment ) {
! 201: ++absZ0;
! 202: if ( absZ0 == 0 ) goto overflow;
! 203: absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
! 204: }
! 205: z = absZ0;
! 206: if ( zSign ) z = - z;
! 207: if ( z && ( ( z < 0 ) ^ zSign ) ) {
! 208: overflow:
! 209: float_raise( float_flag_invalid );
! 210: return
! 211: zSign ? (sbits64) LIT64( 0x8000000000000000 )
! 212: : LIT64( 0x7FFFFFFFFFFFFFFF );
! 213: }
! 214: if ( absZ1 ) float_set_inexact();
! 215: return z;
! 216:
! 217: }
! 218: #endif
! 219:
! 220: /*
! 221: -------------------------------------------------------------------------------
! 222: Returns the fraction bits of the single-precision floating-point value `a'.
! 223: -------------------------------------------------------------------------------
! 224: */
! 225: INLINE bits32 extractFloat32Frac( float32 a )
! 226: {
! 227:
! 228: return a & 0x007FFFFF;
! 229:
! 230: }
! 231:
! 232: /*
! 233: -------------------------------------------------------------------------------
! 234: Returns the exponent bits of the single-precision floating-point value `a'.
! 235: -------------------------------------------------------------------------------
! 236: */
! 237: INLINE int16 extractFloat32Exp( float32 a )
! 238: {
! 239:
! 240: return ( a>>23 ) & 0xFF;
! 241:
! 242: }
! 243:
! 244: /*
! 245: -------------------------------------------------------------------------------
! 246: Returns the sign bit of the single-precision floating-point value `a'.
! 247: -------------------------------------------------------------------------------
! 248: */
! 249: INLINE flag extractFloat32Sign( float32 a )
! 250: {
! 251:
! 252: return a>>31;
! 253:
! 254: }
! 255:
! 256: /*
! 257: -------------------------------------------------------------------------------
! 258: Normalizes the subnormal single-precision floating-point value represented
! 259: by the denormalized significand `aSig'. The normalized exponent and
! 260: significand are stored at the locations pointed to by `zExpPtr' and
! 261: `zSigPtr', respectively.
! 262: -------------------------------------------------------------------------------
! 263: */
! 264: static void
! 265: normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
! 266: {
! 267: int8 shiftCount;
! 268:
! 269: shiftCount = countLeadingZeros32( aSig ) - 8;
! 270: *zSigPtr = aSig<<shiftCount;
! 271: *zExpPtr = 1 - shiftCount;
! 272:
! 273: }
! 274:
! 275: /*
! 276: -------------------------------------------------------------------------------
! 277: Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
! 278: single-precision floating-point value, returning the result. After being
! 279: shifted into the proper positions, the three fields are simply added
! 280: together to form the result. This means that any integer portion of `zSig'
! 281: will be added into the exponent. Since a properly normalized significand
! 282: will have an integer portion equal to 1, the `zExp' input should be 1 less
! 283: than the desired result exponent whenever `zSig' is a complete, normalized
! 284: significand.
! 285: -------------------------------------------------------------------------------
! 286: */
! 287: INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
! 288: {
! 289:
! 290: return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
! 291:
! 292: }
! 293:
! 294: /*
! 295: -------------------------------------------------------------------------------
! 296: Takes an abstract floating-point value having sign `zSign', exponent `zExp',
! 297: and significand `zSig', and returns the proper single-precision floating-
! 298: point value corresponding to the abstract input. Ordinarily, the abstract
! 299: value is simply rounded and packed into the single-precision format, with
! 300: the inexact exception raised if the abstract input cannot be represented
! 301: exactly. However, if the abstract value is too large, the overflow and
! 302: inexact exceptions are raised and an infinity or maximal finite value is
! 303: returned. If the abstract value is too small, the input value is rounded to
! 304: a subnormal number, and the underflow and inexact exceptions are raised if
! 305: the abstract input cannot be represented exactly as a subnormal single-
! 306: precision floating-point number.
! 307: The input significand `zSig' has its binary point between bits 30
! 308: and 29, which is 7 bits to the left of the usual location. This shifted
! 309: significand must be normalized or smaller. If `zSig' is not normalized,
! 310: `zExp' must be 0; in that case, the result returned is a subnormal number,
! 311: and it must not require rounding. In the usual case that `zSig' is
! 312: normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
! 313: The handling of underflow and overflow follows the IEC/IEEE Standard for
! 314: Binary Floating-Point Arithmetic.
! 315: -------------------------------------------------------------------------------
! 316: */
! 317: static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
! 318: {
! 319: int8 roundingMode;
! 320: flag roundNearestEven;
! 321: int8 roundIncrement, roundBits;
! 322: flag isTiny;
! 323:
! 324: roundingMode = float_rounding_mode();
! 325: roundNearestEven = ( roundingMode == float_round_nearest_even );
! 326: roundIncrement = 0x40;
! 327: if ( ! roundNearestEven ) {
! 328: if ( roundingMode == float_round_to_zero ) {
! 329: roundIncrement = 0;
! 330: }
! 331: else {
! 332: roundIncrement = 0x7F;
! 333: if ( zSign ) {
! 334: if ( roundingMode == float_round_up ) roundIncrement = 0;
! 335: }
! 336: else {
! 337: if ( roundingMode == float_round_down ) roundIncrement = 0;
! 338: }
! 339: }
! 340: }
! 341: roundBits = zSig & 0x7F;
! 342: if ( 0xFD <= (bits16) zExp ) {
! 343: if ( ( 0xFD < zExp )
! 344: || ( ( zExp == 0xFD )
! 345: && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
! 346: ) {
! 347: float_raise( float_flag_overflow | float_flag_inexact );
! 348: return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
! 349: }
! 350: if ( zExp < 0 ) {
! 351: isTiny =
! 352: ( float_detect_tininess == float_tininess_before_rounding )
! 353: || ( zExp < -1 )
! 354: || ( zSig + roundIncrement < 0x80000000 );
! 355: shift32RightJamming( zSig, - zExp, &zSig );
! 356: zExp = 0;
! 357: roundBits = zSig & 0x7F;
! 358: if ( isTiny && roundBits ) float_raise( float_flag_underflow );
! 359: }
! 360: }
! 361: if ( roundBits ) float_set_inexact();
! 362: zSig = ( zSig + roundIncrement )>>7;
! 363: zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
! 364: if ( zSig == 0 ) zExp = 0;
! 365: return packFloat32( zSign, zExp, zSig );
! 366:
! 367: }
! 368:
! 369: /*
! 370: -------------------------------------------------------------------------------
! 371: Takes an abstract floating-point value having sign `zSign', exponent `zExp',
! 372: and significand `zSig', and returns the proper single-precision floating-
! 373: point value corresponding to the abstract input. This routine is just like
! 374: `roundAndPackFloat32' except that `zSig' does not have to be normalized.
! 375: Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
! 376: floating-point exponent.
! 377: -------------------------------------------------------------------------------
! 378: */
! 379: static float32
! 380: normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
! 381: {
! 382: int8 shiftCount;
! 383:
! 384: shiftCount = countLeadingZeros32( zSig ) - 1;
! 385: return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
! 386:
! 387: }
! 388:
! 389: /*
! 390: -------------------------------------------------------------------------------
! 391: Returns the fraction bits of the double-precision floating-point value `a'.
! 392: -------------------------------------------------------------------------------
! 393: */
! 394: INLINE bits64 extractFloat64Frac( float64 a )
! 395: {
! 396:
! 397: return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
! 398:
! 399: }
! 400:
! 401: /*
! 402: -------------------------------------------------------------------------------
! 403: Returns the exponent bits of the double-precision floating-point value `a'.
! 404: -------------------------------------------------------------------------------
! 405: */
! 406: INLINE int16 extractFloat64Exp( float64 a )
! 407: {
! 408:
! 409: return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
! 410:
! 411: }
! 412:
! 413: /*
! 414: -------------------------------------------------------------------------------
! 415: Returns the sign bit of the double-precision floating-point value `a'.
! 416: -------------------------------------------------------------------------------
! 417: */
! 418: INLINE flag extractFloat64Sign( float64 a )
! 419: {
! 420:
! 421: return FLOAT64_DEMANGLE(a)>>63;
! 422:
! 423: }
! 424:
! 425: /*
! 426: -------------------------------------------------------------------------------
! 427: Normalizes the subnormal double-precision floating-point value represented
! 428: by the denormalized significand `aSig'. The normalized exponent and
! 429: significand are stored at the locations pointed to by `zExpPtr' and
! 430: `zSigPtr', respectively.
! 431: -------------------------------------------------------------------------------
! 432: */
! 433: static void
! 434: normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
! 435: {
! 436: int8 shiftCount;
! 437:
! 438: shiftCount = countLeadingZeros64( aSig ) - 11;
! 439: *zSigPtr = aSig<<shiftCount;
! 440: *zExpPtr = 1 - shiftCount;
! 441:
! 442: }
! 443:
! 444: /*
! 445: -------------------------------------------------------------------------------
! 446: Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
! 447: double-precision floating-point value, returning the result. After being
! 448: shifted into the proper positions, the three fields are simply added
! 449: together to form the result. This means that any integer portion of `zSig'
! 450: will be added into the exponent. Since a properly normalized significand
! 451: will have an integer portion equal to 1, the `zExp' input should be 1 less
! 452: than the desired result exponent whenever `zSig' is a complete, normalized
! 453: significand.
! 454: -------------------------------------------------------------------------------
! 455: */
! 456: INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
! 457: {
! 458:
! 459: return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
! 460: ( ( (bits64) zExp )<<52 ) + zSig );
! 461:
! 462: }
! 463:
! 464: /*
! 465: -------------------------------------------------------------------------------
! 466: Takes an abstract floating-point value having sign `zSign', exponent `zExp',
! 467: and significand `zSig', and returns the proper double-precision floating-
! 468: point value corresponding to the abstract input. Ordinarily, the abstract
! 469: value is simply rounded and packed into the double-precision format, with
! 470: the inexact exception raised if the abstract input cannot be represented
! 471: exactly. However, if the abstract value is too large, the overflow and
! 472: inexact exceptions are raised and an infinity or maximal finite value is
! 473: returned. If the abstract value is too small, the input value is rounded to
! 474: a subnormal number, and the underflow and inexact exceptions are raised if
! 475: the abstract input cannot be represented exactly as a subnormal double-
! 476: precision floating-point number.
! 477: The input significand `zSig' has its binary point between bits 62
! 478: and 61, which is 10 bits to the left of the usual location. This shifted
! 479: significand must be normalized or smaller. If `zSig' is not normalized,
! 480: `zExp' must be 0; in that case, the result returned is a subnormal number,
! 481: and it must not require rounding. In the usual case that `zSig' is
! 482: normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
! 483: The handling of underflow and overflow follows the IEC/IEEE Standard for
! 484: Binary Floating-Point Arithmetic.
! 485: -------------------------------------------------------------------------------
! 486: */
! 487: static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
! 488: {
! 489: int8 roundingMode;
! 490: flag roundNearestEven;
! 491: int16 roundIncrement, roundBits;
! 492: flag isTiny;
! 493:
! 494: roundingMode = float_rounding_mode();
! 495: roundNearestEven = ( roundingMode == float_round_nearest_even );
! 496: roundIncrement = 0x200;
! 497: if ( ! roundNearestEven ) {
! 498: if ( roundingMode == float_round_to_zero ) {
! 499: roundIncrement = 0;
! 500: }
! 501: else {
! 502: roundIncrement = 0x3FF;
! 503: if ( zSign ) {
! 504: if ( roundingMode == float_round_up ) roundIncrement = 0;
! 505: }
! 506: else {
! 507: if ( roundingMode == float_round_down ) roundIncrement = 0;
! 508: }
! 509: }
! 510: }
! 511: roundBits = zSig & 0x3FF;
! 512: if ( 0x7FD <= (bits16) zExp ) {
! 513: if ( ( 0x7FD < zExp )
! 514: || ( ( zExp == 0x7FD )
! 515: && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
! 516: ) {
! 517: float_raise( float_flag_overflow | float_flag_inexact );
! 518: return FLOAT64_MANGLE(
! 519: FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
! 520: ( roundIncrement == 0 ));
! 521: }
! 522: if ( zExp < 0 ) {
! 523: isTiny =
! 524: ( float_detect_tininess == float_tininess_before_rounding )
! 525: || ( zExp < -1 )
! 526: || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
! 527: shift64RightJamming( zSig, - zExp, &zSig );
! 528: zExp = 0;
! 529: roundBits = zSig & 0x3FF;
! 530: if ( isTiny && roundBits ) float_raise( float_flag_underflow );
! 531: }
! 532: }
! 533: if ( roundBits ) float_set_inexact();
! 534: zSig = ( zSig + roundIncrement )>>10;
! 535: zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
! 536: if ( zSig == 0 ) zExp = 0;
! 537: return packFloat64( zSign, zExp, zSig );
! 538:
! 539: }
! 540:
! 541: /*
! 542: -------------------------------------------------------------------------------
! 543: Takes an abstract floating-point value having sign `zSign', exponent `zExp',
! 544: and significand `zSig', and returns the proper double-precision floating-
! 545: point value corresponding to the abstract input. This routine is just like
! 546: `roundAndPackFloat64' except that `zSig' does not have to be normalized.
! 547: Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
! 548: floating-point exponent.
! 549: -------------------------------------------------------------------------------
! 550: */
! 551: static float64
! 552: normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
! 553: {
! 554: int8 shiftCount;
! 555:
! 556: shiftCount = countLeadingZeros64( zSig ) - 1;
! 557: return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
! 558:
! 559: }
! 560:
! 561: #ifdef FLOATX80
! 562:
! 563: /*
! 564: -------------------------------------------------------------------------------
! 565: Returns the fraction bits of the extended double-precision floating-point
! 566: value `a'.
! 567: -------------------------------------------------------------------------------
! 568: */
! 569: INLINE bits64 extractFloatx80Frac( floatx80 a )
! 570: {
! 571:
! 572: return a.low;
! 573:
! 574: }
! 575:
! 576: /*
! 577: -------------------------------------------------------------------------------
! 578: Returns the exponent bits of the extended double-precision floating-point
! 579: value `a'.
! 580: -------------------------------------------------------------------------------
! 581: */
! 582: INLINE int32 extractFloatx80Exp( floatx80 a )
! 583: {
! 584:
! 585: return a.high & 0x7FFF;
! 586:
! 587: }
! 588:
! 589: /*
! 590: -------------------------------------------------------------------------------
! 591: Returns the sign bit of the extended double-precision floating-point value
! 592: `a'.
! 593: -------------------------------------------------------------------------------
! 594: */
! 595: INLINE flag extractFloatx80Sign( floatx80 a )
! 596: {
! 597:
! 598: return a.high>>15;
! 599:
! 600: }
! 601:
! 602: /*
! 603: -------------------------------------------------------------------------------
! 604: Normalizes the subnormal extended double-precision floating-point value
! 605: represented by the denormalized significand `aSig'. The normalized exponent
! 606: and significand are stored at the locations pointed to by `zExpPtr' and
! 607: `zSigPtr', respectively.
! 608: -------------------------------------------------------------------------------
! 609: */
! 610: static void
! 611: normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
! 612: {
! 613: int8 shiftCount;
! 614:
! 615: shiftCount = countLeadingZeros64( aSig );
! 616: *zSigPtr = aSig<<shiftCount;
! 617: *zExpPtr = 1 - shiftCount;
! 618:
! 619: }
! 620:
! 621: /*
! 622: -------------------------------------------------------------------------------
! 623: Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
! 624: extended double-precision floating-point value, returning the result.
! 625: -------------------------------------------------------------------------------
! 626: */
! 627: INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
! 628: {
! 629: floatx80 z;
! 630:
! 631: z.low = zSig;
! 632: z.high = ( ( (bits16) zSign )<<15 ) + zExp;
! 633: return z;
! 634:
! 635: }
! 636:
! 637: /*
! 638: -------------------------------------------------------------------------------
! 639: Takes an abstract floating-point value having sign `zSign', exponent `zExp',
! 640: and extended significand formed by the concatenation of `zSig0' and `zSig1',
! 641: and returns the proper extended double-precision floating-point value
! 642: corresponding to the abstract input. Ordinarily, the abstract value is
! 643: rounded and packed into the extended double-precision format, with the
! 644: inexact exception raised if the abstract input cannot be represented
! 645: exactly. However, if the abstract value is too large, the overflow and
! 646: inexact exceptions are raised and an infinity or maximal finite value is
! 647: returned. If the abstract value is too small, the input value is rounded to
! 648: a subnormal number, and the underflow and inexact exceptions are raised if
! 649: the abstract input cannot be represented exactly as a subnormal extended
! 650: double-precision floating-point number.
! 651: If `roundingPrecision' is 32 or 64, the result is rounded to the same
! 652: number of bits as single or double precision, respectively. Otherwise, the
! 653: result is rounded to the full precision of the extended double-precision
! 654: format.
! 655: The input significand must be normalized or smaller. If the input
! 656: significand is not normalized, `zExp' must be 0; in that case, the result
! 657: returned is a subnormal number, and it must not require rounding. The
! 658: handling of underflow and overflow follows the IEC/IEEE Standard for Binary
! 659: Floating-Point Arithmetic.
! 660: -------------------------------------------------------------------------------
! 661: */
! 662: static floatx80
! 663: roundAndPackFloatx80(
! 664: int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
! 665: )
! 666: {
! 667: int8 roundingMode;
! 668: flag roundNearestEven, increment, isTiny;
! 669: int64 roundIncrement, roundMask, roundBits;
! 670:
! 671: roundingMode = float_rounding_mode();
! 672: roundNearestEven = ( roundingMode == float_round_nearest_even );
! 673: if ( roundingPrecision == 80 ) goto precision80;
! 674: if ( roundingPrecision == 64 ) {
! 675: roundIncrement = LIT64( 0x0000000000000400 );
! 676: roundMask = LIT64( 0x00000000000007FF );
! 677: }
! 678: else if ( roundingPrecision == 32 ) {
! 679: roundIncrement = LIT64( 0x0000008000000000 );
! 680: roundMask = LIT64( 0x000000FFFFFFFFFF );
! 681: }
! 682: else {
! 683: goto precision80;
! 684: }
! 685: zSig0 |= ( zSig1 != 0 );
! 686: if ( ! roundNearestEven ) {
! 687: if ( roundingMode == float_round_to_zero ) {
! 688: roundIncrement = 0;
! 689: }
! 690: else {
! 691: roundIncrement = roundMask;
! 692: if ( zSign ) {
! 693: if ( roundingMode == float_round_up ) roundIncrement = 0;
! 694: }
! 695: else {
! 696: if ( roundingMode == float_round_down ) roundIncrement = 0;
! 697: }
! 698: }
! 699: }
! 700: roundBits = zSig0 & roundMask;
! 701: if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
! 702: if ( ( 0x7FFE < zExp )
! 703: || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
! 704: ) {
! 705: goto overflow;
! 706: }
! 707: if ( zExp <= 0 ) {
! 708: isTiny =
! 709: ( float_detect_tininess == float_tininess_before_rounding )
! 710: || ( zExp < 0 )
! 711: || ( zSig0 <= zSig0 + roundIncrement );
! 712: shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
! 713: zExp = 0;
! 714: roundBits = zSig0 & roundMask;
! 715: if ( isTiny && roundBits ) float_raise( float_flag_underflow );
! 716: if ( roundBits ) float_set_inexact();
! 717: zSig0 += roundIncrement;
! 718: if ( (sbits64) zSig0 < 0 ) zExp = 1;
! 719: roundIncrement = roundMask + 1;
! 720: if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
! 721: roundMask |= roundIncrement;
! 722: }
! 723: zSig0 &= ~ roundMask;
! 724: return packFloatx80( zSign, zExp, zSig0 );
! 725: }
! 726: }
! 727: if ( roundBits ) float_set_inexact();
! 728: zSig0 += roundIncrement;
! 729: if ( zSig0 < roundIncrement ) {
! 730: ++zExp;
! 731: zSig0 = LIT64( 0x8000000000000000 );
! 732: }
! 733: roundIncrement = roundMask + 1;
! 734: if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
! 735: roundMask |= roundIncrement;
! 736: }
! 737: zSig0 &= ~ roundMask;
! 738: if ( zSig0 == 0 ) zExp = 0;
! 739: return packFloatx80( zSign, zExp, zSig0 );
! 740: precision80:
! 741: increment = ( (sbits64) zSig1 < 0 );
! 742: if ( ! roundNearestEven ) {
! 743: if ( roundingMode == float_round_to_zero ) {
! 744: increment = 0;
! 745: }
! 746: else {
! 747: if ( zSign ) {
! 748: increment = ( roundingMode == float_round_down ) && zSig1;
! 749: }
! 750: else {
! 751: increment = ( roundingMode == float_round_up ) && zSig1;
! 752: }
! 753: }
! 754: }
! 755: if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
! 756: if ( ( 0x7FFE < zExp )
! 757: || ( ( zExp == 0x7FFE )
! 758: && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
! 759: && increment
! 760: )
! 761: ) {
! 762: roundMask = 0;
! 763: overflow:
! 764: float_raise( float_flag_overflow | float_flag_inexact );
! 765: if ( ( roundingMode == float_round_to_zero )
! 766: || ( zSign && ( roundingMode == float_round_up ) )
! 767: || ( ! zSign && ( roundingMode == float_round_down ) )
! 768: ) {
! 769: return packFloatx80( zSign, 0x7FFE, ~ roundMask );
! 770: }
! 771: return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
! 772: }
! 773: if ( zExp <= 0 ) {
! 774: isTiny =
! 775: ( float_detect_tininess == float_tininess_before_rounding )
! 776: || ( zExp < 0 )
! 777: || ! increment
! 778: || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
! 779: shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
! 780: zExp = 0;
! 781: if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
! 782: if ( zSig1 ) float_set_inexact();
! 783: if ( roundNearestEven ) {
! 784: increment = ( (sbits64) zSig1 < 0 );
! 785: }
! 786: else {
! 787: if ( zSign ) {
! 788: increment = ( roundingMode == float_round_down ) && zSig1;
! 789: }
! 790: else {
! 791: increment = ( roundingMode == float_round_up ) && zSig1;
! 792: }
! 793: }
! 794: if ( increment ) {
! 795: ++zSig0;
! 796: zSig0 &=
! 797: ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
! 798: if ( (sbits64) zSig0 < 0 ) zExp = 1;
! 799: }
! 800: return packFloatx80( zSign, zExp, zSig0 );
! 801: }
! 802: }
! 803: if ( zSig1 ) float_set_inexact();
! 804: if ( increment ) {
! 805: ++zSig0;
! 806: if ( zSig0 == 0 ) {
! 807: ++zExp;
! 808: zSig0 = LIT64( 0x8000000000000000 );
! 809: }
! 810: else {
! 811: zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
! 812: }
! 813: }
! 814: else {
! 815: if ( zSig0 == 0 ) zExp = 0;
! 816: }
! 817: return packFloatx80( zSign, zExp, zSig0 );
! 818:
! 819: }
! 820:
! 821: /*
! 822: -------------------------------------------------------------------------------
! 823: Takes an abstract floating-point value having sign `zSign', exponent
! 824: `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
! 825: and returns the proper extended double-precision floating-point value
! 826: corresponding to the abstract input. This routine is just like
! 827: `roundAndPackFloatx80' except that the input significand does not have to be
! 828: normalized.
! 829: -------------------------------------------------------------------------------
! 830: */
! 831: static floatx80
! 832: normalizeRoundAndPackFloatx80(
! 833: int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
! 834: )
! 835: {
! 836: int8 shiftCount;
! 837:
! 838: if ( zSig0 == 0 ) {
! 839: zSig0 = zSig1;
! 840: zSig1 = 0;
! 841: zExp -= 64;
! 842: }
! 843: shiftCount = countLeadingZeros64( zSig0 );
! 844: shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
! 845: zExp -= shiftCount;
! 846: return
! 847: roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
! 848:
! 849: }
! 850:
! 851: #endif
! 852:
! 853: #ifdef FLOAT128
! 854:
! 855: /*
! 856: -------------------------------------------------------------------------------
! 857: Returns the least-significant 64 fraction bits of the quadruple-precision
! 858: floating-point value `a'.
! 859: -------------------------------------------------------------------------------
! 860: */
! 861: INLINE bits64 extractFloat128Frac1( float128 a )
! 862: {
! 863:
! 864: return a.low;
! 865:
! 866: }
! 867:
! 868: /*
! 869: -------------------------------------------------------------------------------
! 870: Returns the most-significant 48 fraction bits of the quadruple-precision
! 871: floating-point value `a'.
! 872: -------------------------------------------------------------------------------
! 873: */
! 874: INLINE bits64 extractFloat128Frac0( float128 a )
! 875: {
! 876:
! 877: return a.high & LIT64( 0x0000FFFFFFFFFFFF );
! 878:
! 879: }
! 880:
! 881: /*
! 882: -------------------------------------------------------------------------------
! 883: Returns the exponent bits of the quadruple-precision floating-point value
! 884: `a'.
! 885: -------------------------------------------------------------------------------
! 886: */
! 887: INLINE int32 extractFloat128Exp( float128 a )
! 888: {
! 889:
! 890: return ( a.high>>48 ) & 0x7FFF;
! 891:
! 892: }
! 893:
! 894: /*
! 895: -------------------------------------------------------------------------------
! 896: Returns the sign bit of the quadruple-precision floating-point value `a'.
! 897: -------------------------------------------------------------------------------
! 898: */
! 899: INLINE flag extractFloat128Sign( float128 a )
! 900: {
! 901:
! 902: return a.high>>63;
! 903:
! 904: }
! 905:
! 906: /*
! 907: -------------------------------------------------------------------------------
! 908: Normalizes the subnormal quadruple-precision floating-point value
! 909: represented by the denormalized significand formed by the concatenation of
! 910: `aSig0' and `aSig1'. The normalized exponent is stored at the location
! 911: pointed to by `zExpPtr'. The most significant 49 bits of the normalized
! 912: significand are stored at the location pointed to by `zSig0Ptr', and the
! 913: least significant 64 bits of the normalized significand are stored at the
! 914: location pointed to by `zSig1Ptr'.
! 915: -------------------------------------------------------------------------------
! 916: */
! 917: static void
! 918: normalizeFloat128Subnormal(
! 919: bits64 aSig0,
! 920: bits64 aSig1,
! 921: int32 *zExpPtr,
! 922: bits64 *zSig0Ptr,
! 923: bits64 *zSig1Ptr
! 924: )
! 925: {
! 926: int8 shiftCount;
! 927:
! 928: if ( aSig0 == 0 ) {
! 929: shiftCount = countLeadingZeros64( aSig1 ) - 15;
! 930: if ( shiftCount < 0 ) {
! 931: *zSig0Ptr = aSig1>>( - shiftCount );
! 932: *zSig1Ptr = aSig1<<( shiftCount & 63 );
! 933: }
! 934: else {
! 935: *zSig0Ptr = aSig1<<shiftCount;
! 936: *zSig1Ptr = 0;
! 937: }
! 938: *zExpPtr = - shiftCount - 63;
! 939: }
! 940: else {
! 941: shiftCount = countLeadingZeros64( aSig0 ) - 15;
! 942: shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
! 943: *zExpPtr = 1 - shiftCount;
! 944: }
! 945:
! 946: }
! 947:
! 948: /*
! 949: -------------------------------------------------------------------------------
! 950: Packs the sign `zSign', the exponent `zExp', and the significand formed
! 951: by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
! 952: floating-point value, returning the result. After being shifted into the
! 953: proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
! 954: added together to form the most significant 32 bits of the result. This
! 955: means that any integer portion of `zSig0' will be added into the exponent.
! 956: Since a properly normalized significand will have an integer portion equal
! 957: to 1, the `zExp' input should be 1 less than the desired result exponent
! 958: whenever `zSig0' and `zSig1' concatenated form a complete, normalized
! 959: significand.
! 960: -------------------------------------------------------------------------------
! 961: */
! 962: INLINE float128
! 963: packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
! 964: {
! 965: float128 z;
! 966:
! 967: z.low = zSig1;
! 968: z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
! 969: return z;
! 970:
! 971: }
! 972:
! 973: /*
! 974: -------------------------------------------------------------------------------
! 975: Takes an abstract floating-point value having sign `zSign', exponent `zExp',
! 976: and extended significand formed by the concatenation of `zSig0', `zSig1',
! 977: and `zSig2', and returns the proper quadruple-precision floating-point value
! 978: corresponding to the abstract input. Ordinarily, the abstract value is
! 979: simply rounded and packed into the quadruple-precision format, with the
! 980: inexact exception raised if the abstract input cannot be represented
! 981: exactly. However, if the abstract value is too large, the overflow and
! 982: inexact exceptions are raised and an infinity or maximal finite value is
! 983: returned. If the abstract value is too small, the input value is rounded to
! 984: a subnormal number, and the underflow and inexact exceptions are raised if
! 985: the abstract input cannot be represented exactly as a subnormal quadruple-
! 986: precision floating-point number.
! 987: The input significand must be normalized or smaller. If the input
! 988: significand is not normalized, `zExp' must be 0; in that case, the result
! 989: returned is a subnormal number, and it must not require rounding. In the
! 990: usual case that the input significand is normalized, `zExp' must be 1 less
! 991: than the ``true'' floating-point exponent. The handling of underflow and
! 992: overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 993: -------------------------------------------------------------------------------
! 994: */
! 995: static float128
! 996: roundAndPackFloat128(
! 997: flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
! 998: {
! 999: int8 roundingMode;
! 1000: flag roundNearestEven, increment, isTiny;
! 1001:
! 1002: roundingMode = float_rounding_mode();
! 1003: roundNearestEven = ( roundingMode == float_round_nearest_even );
! 1004: increment = ( (sbits64) zSig2 < 0 );
! 1005: if ( ! roundNearestEven ) {
! 1006: if ( roundingMode == float_round_to_zero ) {
! 1007: increment = 0;
! 1008: }
! 1009: else {
! 1010: if ( zSign ) {
! 1011: increment = ( roundingMode == float_round_down ) && zSig2;
! 1012: }
! 1013: else {
! 1014: increment = ( roundingMode == float_round_up ) && zSig2;
! 1015: }
! 1016: }
! 1017: }
! 1018: if ( 0x7FFD <= (bits32) zExp ) {
! 1019: if ( ( 0x7FFD < zExp )
! 1020: || ( ( zExp == 0x7FFD )
! 1021: && eq128(
! 1022: LIT64( 0x0001FFFFFFFFFFFF ),
! 1023: LIT64( 0xFFFFFFFFFFFFFFFF ),
! 1024: zSig0,
! 1025: zSig1
! 1026: )
! 1027: && increment
! 1028: )
! 1029: ) {
! 1030: float_raise( float_flag_overflow | float_flag_inexact );
! 1031: if ( ( roundingMode == float_round_to_zero )
! 1032: || ( zSign && ( roundingMode == float_round_up ) )
! 1033: || ( ! zSign && ( roundingMode == float_round_down ) )
! 1034: ) {
! 1035: return
! 1036: packFloat128(
! 1037: zSign,
! 1038: 0x7FFE,
! 1039: LIT64( 0x0000FFFFFFFFFFFF ),
! 1040: LIT64( 0xFFFFFFFFFFFFFFFF )
! 1041: );
! 1042: }
! 1043: return packFloat128( zSign, 0x7FFF, 0, 0 );
! 1044: }
! 1045: if ( zExp < 0 ) {
! 1046: isTiny =
! 1047: ( float_detect_tininess == float_tininess_before_rounding )
! 1048: || ( zExp < -1 )
! 1049: || ! increment
! 1050: || lt128(
! 1051: zSig0,
! 1052: zSig1,
! 1053: LIT64( 0x0001FFFFFFFFFFFF ),
! 1054: LIT64( 0xFFFFFFFFFFFFFFFF )
! 1055: );
! 1056: shift128ExtraRightJamming(
! 1057: zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
! 1058: zExp = 0;
! 1059: if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
! 1060: if ( roundNearestEven ) {
! 1061: increment = ( (sbits64) zSig2 < 0 );
! 1062: }
! 1063: else {
! 1064: if ( zSign ) {
! 1065: increment = ( roundingMode == float_round_down ) && zSig2;
! 1066: }
! 1067: else {
! 1068: increment = ( roundingMode == float_round_up ) && zSig2;
! 1069: }
! 1070: }
! 1071: }
! 1072: }
! 1073: if ( zSig2 ) float_set_inexact();
! 1074: if ( increment ) {
! 1075: add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
! 1076: zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
! 1077: }
! 1078: else {
! 1079: if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
! 1080: }
! 1081: return packFloat128( zSign, zExp, zSig0, zSig1 );
! 1082:
! 1083: }
! 1084:
! 1085: /*
! 1086: -------------------------------------------------------------------------------
! 1087: Takes an abstract floating-point value having sign `zSign', exponent `zExp',
! 1088: and significand formed by the concatenation of `zSig0' and `zSig1', and
! 1089: returns the proper quadruple-precision floating-point value corresponding
! 1090: to the abstract input. This routine is just like `roundAndPackFloat128'
! 1091: except that the input significand has fewer bits and does not have to be
! 1092: normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
! 1093: point exponent.
! 1094: -------------------------------------------------------------------------------
! 1095: */
! 1096: static float128
! 1097: normalizeRoundAndPackFloat128(
! 1098: flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
! 1099: {
! 1100: int8 shiftCount;
! 1101: bits64 zSig2;
! 1102:
! 1103: if ( zSig0 == 0 ) {
! 1104: zSig0 = zSig1;
! 1105: zSig1 = 0;
! 1106: zExp -= 64;
! 1107: }
! 1108: shiftCount = countLeadingZeros64( zSig0 ) - 15;
! 1109: if ( 0 <= shiftCount ) {
! 1110: zSig2 = 0;
! 1111: shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
! 1112: }
! 1113: else {
! 1114: shift128ExtraRightJamming(
! 1115: zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
! 1116: }
! 1117: zExp -= shiftCount;
! 1118: return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
! 1119:
! 1120: }
! 1121:
! 1122: #endif
! 1123:
! 1124: /*
! 1125: -------------------------------------------------------------------------------
! 1126: Returns the result of converting the 32-bit two's complement integer `a'
! 1127: to the single-precision floating-point format. The conversion is performed
! 1128: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 1129: -------------------------------------------------------------------------------
! 1130: */
! 1131: float32 int32_to_float32( int32 a )
! 1132: {
! 1133: flag zSign;
! 1134:
! 1135: if ( a == 0 ) return 0;
! 1136: if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
! 1137: zSign = ( a < 0 );
! 1138: return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
! 1139:
! 1140: }
! 1141:
! 1142: /*
! 1143: -------------------------------------------------------------------------------
! 1144: Returns the result of converting the 32-bit two's complement integer `a'
! 1145: to the double-precision floating-point format. The conversion is performed
! 1146: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 1147: -------------------------------------------------------------------------------
! 1148: */
! 1149: float64 int32_to_float64( int32 a )
! 1150: {
! 1151: flag zSign;
! 1152: uint32 absA;
! 1153: int8 shiftCount;
! 1154: bits64 zSig;
! 1155:
! 1156: if ( a == 0 ) return 0;
! 1157: zSign = ( a < 0 );
! 1158: absA = zSign ? - a : a;
! 1159: shiftCount = countLeadingZeros32( absA ) + 21;
! 1160: zSig = absA;
! 1161: return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
! 1162:
! 1163: }
! 1164:
! 1165: #ifdef FLOATX80
! 1166:
! 1167: /*
! 1168: -------------------------------------------------------------------------------
! 1169: Returns the result of converting the 32-bit two's complement integer `a'
! 1170: to the extended double-precision floating-point format. The conversion
! 1171: is performed according to the IEC/IEEE Standard for Binary Floating-Point
! 1172: Arithmetic.
! 1173: -------------------------------------------------------------------------------
! 1174: */
! 1175: floatx80 int32_to_floatx80( int32 a )
! 1176: {
! 1177: flag zSign;
! 1178: uint32 absA;
! 1179: int8 shiftCount;
! 1180: bits64 zSig;
! 1181:
! 1182: if ( a == 0 ) return packFloatx80( 0, 0, 0 );
! 1183: zSign = ( a < 0 );
! 1184: absA = zSign ? - a : a;
! 1185: shiftCount = countLeadingZeros32( absA ) + 32;
! 1186: zSig = absA;
! 1187: return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
! 1188:
! 1189: }
! 1190:
! 1191: #endif
! 1192:
! 1193: #ifdef FLOAT128
! 1194:
! 1195: /*
! 1196: -------------------------------------------------------------------------------
! 1197: Returns the result of converting the 32-bit two's complement integer `a' to
! 1198: the quadruple-precision floating-point format. The conversion is performed
! 1199: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 1200: -------------------------------------------------------------------------------
! 1201: */
! 1202: float128 int32_to_float128( int32 a )
! 1203: {
! 1204: flag zSign;
! 1205: uint32 absA;
! 1206: int8 shiftCount;
! 1207: bits64 zSig0;
! 1208:
! 1209: if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
! 1210: zSign = ( a < 0 );
! 1211: absA = zSign ? - a : a;
! 1212: shiftCount = countLeadingZeros32( absA ) + 17;
! 1213: zSig0 = absA;
! 1214: return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
! 1215:
! 1216: }
! 1217:
! 1218: #endif
! 1219:
! 1220: #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
! 1221: /*
! 1222: -------------------------------------------------------------------------------
! 1223: Returns the result of converting the 64-bit two's complement integer `a'
! 1224: to the single-precision floating-point format. The conversion is performed
! 1225: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 1226: -------------------------------------------------------------------------------
! 1227: */
! 1228: float32 int64_to_float32( int64 a )
! 1229: {
! 1230: flag zSign;
! 1231: uint64 absA;
! 1232: int8 shiftCount;
! 1233:
! 1234: if ( a == 0 ) return 0;
! 1235: zSign = ( a < 0 );
! 1236: absA = zSign ? - a : a;
! 1237: shiftCount = countLeadingZeros64( absA ) - 40;
! 1238: if ( 0 <= shiftCount ) {
! 1239: return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
! 1240: }
! 1241: else {
! 1242: shiftCount += 7;
! 1243: if ( shiftCount < 0 ) {
! 1244: shift64RightJamming( absA, - shiftCount, &absA );
! 1245: }
! 1246: else {
! 1247: absA <<= shiftCount;
! 1248: }
! 1249: return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
! 1250: }
! 1251:
! 1252: }
! 1253:
! 1254: /*
! 1255: -------------------------------------------------------------------------------
! 1256: Returns the result of converting the 64-bit two's complement integer `a'
! 1257: to the double-precision floating-point format. The conversion is performed
! 1258: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 1259: -------------------------------------------------------------------------------
! 1260: */
! 1261: float64 int64_to_float64( int64 a )
! 1262: {
! 1263: flag zSign;
! 1264:
! 1265: if ( a == 0 ) return 0;
! 1266: if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
! 1267: return packFloat64( 1, 0x43E, 0 );
! 1268: }
! 1269: zSign = ( a < 0 );
! 1270: return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
! 1271:
! 1272: }
! 1273:
! 1274: #ifdef FLOATX80
! 1275:
! 1276: /*
! 1277: -------------------------------------------------------------------------------
! 1278: Returns the result of converting the 64-bit two's complement integer `a'
! 1279: to the extended double-precision floating-point format. The conversion
! 1280: is performed according to the IEC/IEEE Standard for Binary Floating-Point
! 1281: Arithmetic.
! 1282: -------------------------------------------------------------------------------
! 1283: */
! 1284: floatx80 int64_to_floatx80( int64 a )
! 1285: {
! 1286: flag zSign;
! 1287: uint64 absA;
! 1288: int8 shiftCount;
! 1289:
! 1290: if ( a == 0 ) return packFloatx80( 0, 0, 0 );
! 1291: zSign = ( a < 0 );
! 1292: absA = zSign ? - a : a;
! 1293: shiftCount = countLeadingZeros64( absA );
! 1294: return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
! 1295:
! 1296: }
! 1297:
! 1298: #endif
! 1299:
! 1300: #ifdef FLOAT128
! 1301:
! 1302: /*
! 1303: -------------------------------------------------------------------------------
! 1304: Returns the result of converting the 64-bit two's complement integer `a' to
! 1305: the quadruple-precision floating-point format. The conversion is performed
! 1306: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 1307: -------------------------------------------------------------------------------
! 1308: */
! 1309: float128 int64_to_float128( int64 a )
! 1310: {
! 1311: flag zSign;
! 1312: uint64 absA;
! 1313: int8 shiftCount;
! 1314: int32 zExp;
! 1315: bits64 zSig0, zSig1;
! 1316:
! 1317: if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
! 1318: zSign = ( a < 0 );
! 1319: absA = zSign ? - a : a;
! 1320: shiftCount = countLeadingZeros64( absA ) + 49;
! 1321: zExp = 0x406E - shiftCount;
! 1322: if ( 64 <= shiftCount ) {
! 1323: zSig1 = 0;
! 1324: zSig0 = absA;
! 1325: shiftCount -= 64;
! 1326: }
! 1327: else {
! 1328: zSig1 = absA;
! 1329: zSig0 = 0;
! 1330: }
! 1331: shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
! 1332: return packFloat128( zSign, zExp, zSig0, zSig1 );
! 1333:
! 1334: }
! 1335:
! 1336: #endif
! 1337: #endif /* !SOFTFLOAT_FOR_GCC */
! 1338:
! 1339: #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
! 1340: /*
! 1341: -------------------------------------------------------------------------------
! 1342: Returns the result of converting the single-precision floating-point value
! 1343: `a' to the 32-bit two's complement integer format. The conversion is
! 1344: performed according to the IEC/IEEE Standard for Binary Floating-Point
! 1345: Arithmetic---which means in particular that the conversion is rounded
! 1346: according to the current rounding mode. If `a' is a NaN, the largest
! 1347: positive integer is returned. Otherwise, if the conversion overflows, the
! 1348: largest integer with the same sign as `a' is returned.
! 1349: -------------------------------------------------------------------------------
! 1350: */
! 1351: int32 float32_to_int32( float32 a )
! 1352: {
! 1353: flag aSign;
! 1354: int16 aExp, shiftCount;
! 1355: bits32 aSig;
! 1356: bits64 aSig64;
! 1357:
! 1358: aSig = extractFloat32Frac( a );
! 1359: aExp = extractFloat32Exp( a );
! 1360: aSign = extractFloat32Sign( a );
! 1361: if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
! 1362: if ( aExp ) aSig |= 0x00800000;
! 1363: shiftCount = 0xAF - aExp;
! 1364: aSig64 = aSig;
! 1365: aSig64 <<= 32;
! 1366: if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
! 1367: return roundAndPackInt32( aSign, aSig64 );
! 1368:
! 1369: }
! 1370: #endif /* !SOFTFLOAT_FOR_GCC */
! 1371:
! 1372: /*
! 1373: -------------------------------------------------------------------------------
! 1374: Returns the result of converting the single-precision floating-point value
! 1375: `a' to the 32-bit two's complement integer format. The conversion is
! 1376: performed according to the IEC/IEEE Standard for Binary Floating-Point
! 1377: Arithmetic, except that the conversion is always rounded toward zero.
! 1378: If `a' is a NaN, the largest positive integer is returned. Otherwise, if
! 1379: the conversion overflows, the largest integer with the same sign as `a' is
! 1380: returned.
! 1381: -------------------------------------------------------------------------------
! 1382: */
! 1383: int32 float32_to_int32_round_to_zero( float32 a )
! 1384: {
! 1385: flag aSign;
! 1386: int16 aExp, shiftCount;
! 1387: bits32 aSig;
! 1388: int32 z;
! 1389:
! 1390: aSig = extractFloat32Frac( a );
! 1391: aExp = extractFloat32Exp( a );
! 1392: aSign = extractFloat32Sign( a );
! 1393: shiftCount = aExp - 0x9E;
! 1394: if ( 0 <= shiftCount ) {
! 1395: if ( a != 0xCF000000 ) {
! 1396: float_raise( float_flag_invalid );
! 1397: if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
! 1398: }
! 1399: return (sbits32) 0x80000000;
! 1400: }
! 1401: else if ( aExp <= 0x7E ) {
! 1402: if ( aExp | aSig ) float_set_inexact();
! 1403: return 0;
! 1404: }
! 1405: aSig = ( aSig | 0x00800000 )<<8;
! 1406: z = aSig>>( - shiftCount );
! 1407: if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
! 1408: float_set_inexact();
! 1409: }
! 1410: if ( aSign ) z = - z;
! 1411: return z;
! 1412:
! 1413: }
! 1414:
! 1415: #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
! 1416: /*
! 1417: -------------------------------------------------------------------------------
! 1418: Returns the result of converting the single-precision floating-point value
! 1419: `a' to the 64-bit two's complement integer format. The conversion is
! 1420: performed according to the IEC/IEEE Standard for Binary Floating-Point
! 1421: Arithmetic---which means in particular that the conversion is rounded
! 1422: according to the current rounding mode. If `a' is a NaN, the largest
! 1423: positive integer is returned. Otherwise, if the conversion overflows, the
! 1424: largest integer with the same sign as `a' is returned.
! 1425: -------------------------------------------------------------------------------
! 1426: */
! 1427: int64 float32_to_int64( float32 a )
! 1428: {
! 1429: flag aSign;
! 1430: int16 aExp, shiftCount;
! 1431: bits32 aSig;
! 1432: bits64 aSig64, aSigExtra;
! 1433:
! 1434: aSig = extractFloat32Frac( a );
! 1435: aExp = extractFloat32Exp( a );
! 1436: aSign = extractFloat32Sign( a );
! 1437: shiftCount = 0xBE - aExp;
! 1438: if ( shiftCount < 0 ) {
! 1439: float_raise( float_flag_invalid );
! 1440: if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
! 1441: return LIT64( 0x7FFFFFFFFFFFFFFF );
! 1442: }
! 1443: return (sbits64) LIT64( 0x8000000000000000 );
! 1444: }
! 1445: if ( aExp ) aSig |= 0x00800000;
! 1446: aSig64 = aSig;
! 1447: aSig64 <<= 40;
! 1448: shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
! 1449: return roundAndPackInt64( aSign, aSig64, aSigExtra );
! 1450:
! 1451: }
! 1452:
! 1453: /*
! 1454: -------------------------------------------------------------------------------
! 1455: Returns the result of converting the single-precision floating-point value
! 1456: `a' to the 64-bit two's complement integer format. The conversion is
! 1457: performed according to the IEC/IEEE Standard for Binary Floating-Point
! 1458: Arithmetic, except that the conversion is always rounded toward zero. If
! 1459: `a' is a NaN, the largest positive integer is returned. Otherwise, if the
! 1460: conversion overflows, the largest integer with the same sign as `a' is
! 1461: returned.
! 1462: -------------------------------------------------------------------------------
! 1463: */
! 1464: int64 float32_to_int64_round_to_zero( float32 a )
! 1465: {
! 1466: flag aSign;
! 1467: int16 aExp, shiftCount;
! 1468: bits32 aSig;
! 1469: bits64 aSig64;
! 1470: int64 z;
! 1471:
! 1472: aSig = extractFloat32Frac( a );
! 1473: aExp = extractFloat32Exp( a );
! 1474: aSign = extractFloat32Sign( a );
! 1475: shiftCount = aExp - 0xBE;
! 1476: if ( 0 <= shiftCount ) {
! 1477: if ( a != 0xDF000000 ) {
! 1478: float_raise( float_flag_invalid );
! 1479: if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
! 1480: return LIT64( 0x7FFFFFFFFFFFFFFF );
! 1481: }
! 1482: }
! 1483: return (sbits64) LIT64( 0x8000000000000000 );
! 1484: }
! 1485: else if ( aExp <= 0x7E ) {
! 1486: if ( aExp | aSig ) float_set_inexact();
! 1487: return 0;
! 1488: }
! 1489: aSig64 = aSig | 0x00800000;
! 1490: aSig64 <<= 40;
! 1491: z = aSig64>>( - shiftCount );
! 1492: if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
! 1493: float_set_inexact();
! 1494: }
! 1495: if ( aSign ) z = - z;
! 1496: return z;
! 1497:
! 1498: }
! 1499: #endif /* !SOFTFLOAT_FOR_GCC */
! 1500:
! 1501: /*
! 1502: -------------------------------------------------------------------------------
! 1503: Returns the result of converting the single-precision floating-point value
! 1504: `a' to the double-precision floating-point format. The conversion is
! 1505: performed according to the IEC/IEEE Standard for Binary Floating-Point
! 1506: Arithmetic.
! 1507: -------------------------------------------------------------------------------
! 1508: */
! 1509: float64 float32_to_float64( float32 a )
! 1510: {
! 1511: flag aSign;
! 1512: int16 aExp;
! 1513: bits32 aSig;
! 1514:
! 1515: aSig = extractFloat32Frac( a );
! 1516: aExp = extractFloat32Exp( a );
! 1517: aSign = extractFloat32Sign( a );
! 1518: if ( aExp == 0xFF ) {
! 1519: if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
! 1520: return packFloat64( aSign, 0x7FF, 0 );
! 1521: }
! 1522: if ( aExp == 0 ) {
! 1523: if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
! 1524: normalizeFloat32Subnormal( aSig, &aExp, &aSig );
! 1525: --aExp;
! 1526: }
! 1527: return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
! 1528:
! 1529: }
! 1530:
! 1531: #ifdef FLOATX80
! 1532:
! 1533: /*
! 1534: -------------------------------------------------------------------------------
! 1535: Returns the result of converting the single-precision floating-point value
! 1536: `a' to the extended double-precision floating-point format. The conversion
! 1537: is performed according to the IEC/IEEE Standard for Binary Floating-Point
! 1538: Arithmetic.
! 1539: -------------------------------------------------------------------------------
! 1540: */
! 1541: floatx80 float32_to_floatx80( float32 a )
! 1542: {
! 1543: flag aSign;
! 1544: int16 aExp;
! 1545: bits32 aSig;
! 1546:
! 1547: aSig = extractFloat32Frac( a );
! 1548: aExp = extractFloat32Exp( a );
! 1549: aSign = extractFloat32Sign( a );
! 1550: if ( aExp == 0xFF ) {
! 1551: if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
! 1552: return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
! 1553: }
! 1554: if ( aExp == 0 ) {
! 1555: if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
! 1556: normalizeFloat32Subnormal( aSig, &aExp, &aSig );
! 1557: }
! 1558: aSig |= 0x00800000;
! 1559: return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
! 1560:
! 1561: }
! 1562:
! 1563: #endif
! 1564:
! 1565: #ifdef FLOAT128
! 1566:
! 1567: /*
! 1568: -------------------------------------------------------------------------------
! 1569: Returns the result of converting the single-precision floating-point value
! 1570: `a' to the double-precision floating-point format. The conversion is
! 1571: performed according to the IEC/IEEE Standard for Binary Floating-Point
! 1572: Arithmetic.
! 1573: -------------------------------------------------------------------------------
! 1574: */
! 1575: float128 float32_to_float128( float32 a )
! 1576: {
! 1577: flag aSign;
! 1578: int16 aExp;
! 1579: bits32 aSig;
! 1580:
! 1581: aSig = extractFloat32Frac( a );
! 1582: aExp = extractFloat32Exp( a );
! 1583: aSign = extractFloat32Sign( a );
! 1584: if ( aExp == 0xFF ) {
! 1585: if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
! 1586: return packFloat128( aSign, 0x7FFF, 0, 0 );
! 1587: }
! 1588: if ( aExp == 0 ) {
! 1589: if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
! 1590: normalizeFloat32Subnormal( aSig, &aExp, &aSig );
! 1591: --aExp;
! 1592: }
! 1593: return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
! 1594:
! 1595: }
! 1596:
! 1597: #endif
! 1598:
! 1599: #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
! 1600: /*
! 1601: -------------------------------------------------------------------------------
! 1602: Rounds the single-precision floating-point value `a' to an integer, and
! 1603: returns the result as a single-precision floating-point value. The
! 1604: operation is performed according to the IEC/IEEE Standard for Binary
! 1605: Floating-Point Arithmetic.
! 1606: -------------------------------------------------------------------------------
! 1607: */
! 1608: float32 float32_round_to_int( float32 a )
! 1609: {
! 1610: flag aSign;
! 1611: int16 aExp;
! 1612: bits32 lastBitMask, roundBitsMask;
! 1613: int8 roundingMode;
! 1614: float32 z;
! 1615:
! 1616: aExp = extractFloat32Exp( a );
! 1617: if ( 0x96 <= aExp ) {
! 1618: if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
! 1619: return propagateFloat32NaN( a, a );
! 1620: }
! 1621: return a;
! 1622: }
! 1623: if ( aExp <= 0x7E ) {
! 1624: if ( (bits32) ( a<<1 ) == 0 ) return a;
! 1625: float_set_inexact();
! 1626: aSign = extractFloat32Sign( a );
! 1627: switch ( float_rounding_mode() ) {
! 1628: case float_round_nearest_even:
! 1629: if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
! 1630: return packFloat32( aSign, 0x7F, 0 );
! 1631: }
! 1632: break;
! 1633: case float_round_down:
! 1634: return aSign ? 0xBF800000 : 0;
! 1635: case float_round_up:
! 1636: return aSign ? 0x80000000 : 0x3F800000;
! 1637: }
! 1638: return packFloat32( aSign, 0, 0 );
! 1639: }
! 1640: lastBitMask = 1;
! 1641: lastBitMask <<= 0x96 - aExp;
! 1642: roundBitsMask = lastBitMask - 1;
! 1643: z = a;
! 1644: roundingMode = float_rounding_mode();
! 1645: if ( roundingMode == float_round_nearest_even ) {
! 1646: z += lastBitMask>>1;
! 1647: if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
! 1648: }
! 1649: else if ( roundingMode != float_round_to_zero ) {
! 1650: if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
! 1651: z += roundBitsMask;
! 1652: }
! 1653: }
! 1654: z &= ~ roundBitsMask;
! 1655: if ( z != a ) float_set_inexact();
! 1656: return z;
! 1657:
! 1658: }
! 1659: #endif /* !SOFTFLOAT_FOR_GCC */
! 1660:
! 1661: /*
! 1662: -------------------------------------------------------------------------------
! 1663: Returns the result of adding the absolute values of the single-precision
! 1664: floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
! 1665: before being returned. `zSign' is ignored if the result is a NaN.
! 1666: The addition is performed according to the IEC/IEEE Standard for Binary
! 1667: Floating-Point Arithmetic.
! 1668: -------------------------------------------------------------------------------
! 1669: */
! 1670: static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
! 1671: {
! 1672: int16 aExp, bExp, zExp;
! 1673: bits32 aSig, bSig, zSig;
! 1674: int16 expDiff;
! 1675:
! 1676: aSig = extractFloat32Frac( a );
! 1677: aExp = extractFloat32Exp( a );
! 1678: bSig = extractFloat32Frac( b );
! 1679: bExp = extractFloat32Exp( b );
! 1680: expDiff = aExp - bExp;
! 1681: aSig <<= 6;
! 1682: bSig <<= 6;
! 1683: if ( 0 < expDiff ) {
! 1684: if ( aExp == 0xFF ) {
! 1685: if ( aSig ) return propagateFloat32NaN( a, b );
! 1686: return a;
! 1687: }
! 1688: if ( bExp == 0 ) {
! 1689: --expDiff;
! 1690: }
! 1691: else {
! 1692: bSig |= 0x20000000;
! 1693: }
! 1694: shift32RightJamming( bSig, expDiff, &bSig );
! 1695: zExp = aExp;
! 1696: }
! 1697: else if ( expDiff < 0 ) {
! 1698: if ( bExp == 0xFF ) {
! 1699: if ( bSig ) return propagateFloat32NaN( a, b );
! 1700: return packFloat32( zSign, 0xFF, 0 );
! 1701: }
! 1702: if ( aExp == 0 ) {
! 1703: ++expDiff;
! 1704: }
! 1705: else {
! 1706: aSig |= 0x20000000;
! 1707: }
! 1708: shift32RightJamming( aSig, - expDiff, &aSig );
! 1709: zExp = bExp;
! 1710: }
! 1711: else {
! 1712: if ( aExp == 0xFF ) {
! 1713: if ( aSig | bSig ) return propagateFloat32NaN( a, b );
! 1714: return a;
! 1715: }
! 1716: if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
! 1717: zSig = 0x40000000 + aSig + bSig;
! 1718: zExp = aExp;
! 1719: goto roundAndPack;
! 1720: }
! 1721: aSig |= 0x20000000;
! 1722: zSig = ( aSig + bSig )<<1;
! 1723: --zExp;
! 1724: if ( (sbits32) zSig < 0 ) {
! 1725: zSig = aSig + bSig;
! 1726: ++zExp;
! 1727: }
! 1728: roundAndPack:
! 1729: return roundAndPackFloat32( zSign, zExp, zSig );
! 1730:
! 1731: }
! 1732:
! 1733: /*
! 1734: -------------------------------------------------------------------------------
! 1735: Returns the result of subtracting the absolute values of the single-
! 1736: precision floating-point values `a' and `b'. If `zSign' is 1, the
! 1737: difference is negated before being returned. `zSign' is ignored if the
! 1738: result is a NaN. The subtraction is performed according to the IEC/IEEE
! 1739: Standard for Binary Floating-Point Arithmetic.
! 1740: -------------------------------------------------------------------------------
! 1741: */
! 1742: static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
! 1743: {
! 1744: int16 aExp, bExp, zExp;
! 1745: bits32 aSig, bSig, zSig;
! 1746: int16 expDiff;
! 1747:
! 1748: aSig = extractFloat32Frac( a );
! 1749: aExp = extractFloat32Exp( a );
! 1750: bSig = extractFloat32Frac( b );
! 1751: bExp = extractFloat32Exp( b );
! 1752: expDiff = aExp - bExp;
! 1753: aSig <<= 7;
! 1754: bSig <<= 7;
! 1755: if ( 0 < expDiff ) goto aExpBigger;
! 1756: if ( expDiff < 0 ) goto bExpBigger;
! 1757: if ( aExp == 0xFF ) {
! 1758: if ( aSig | bSig ) return propagateFloat32NaN( a, b );
! 1759: float_raise( float_flag_invalid );
! 1760: return float32_default_nan;
! 1761: }
! 1762: if ( aExp == 0 ) {
! 1763: aExp = 1;
! 1764: bExp = 1;
! 1765: }
! 1766: if ( bSig < aSig ) goto aBigger;
! 1767: if ( aSig < bSig ) goto bBigger;
! 1768: return packFloat32( float_rounding_mode() == float_round_down, 0, 0 );
! 1769: bExpBigger:
! 1770: if ( bExp == 0xFF ) {
! 1771: if ( bSig ) return propagateFloat32NaN( a, b );
! 1772: return packFloat32( zSign ^ 1, 0xFF, 0 );
! 1773: }
! 1774: if ( aExp == 0 ) {
! 1775: ++expDiff;
! 1776: }
! 1777: else {
! 1778: aSig |= 0x40000000;
! 1779: }
! 1780: shift32RightJamming( aSig, - expDiff, &aSig );
! 1781: bSig |= 0x40000000;
! 1782: bBigger:
! 1783: zSig = bSig - aSig;
! 1784: zExp = bExp;
! 1785: zSign ^= 1;
! 1786: goto normalizeRoundAndPack;
! 1787: aExpBigger:
! 1788: if ( aExp == 0xFF ) {
! 1789: if ( aSig ) return propagateFloat32NaN( a, b );
! 1790: return a;
! 1791: }
! 1792: if ( bExp == 0 ) {
! 1793: --expDiff;
! 1794: }
! 1795: else {
! 1796: bSig |= 0x40000000;
! 1797: }
! 1798: shift32RightJamming( bSig, expDiff, &bSig );
! 1799: aSig |= 0x40000000;
! 1800: aBigger:
! 1801: zSig = aSig - bSig;
! 1802: zExp = aExp;
! 1803: normalizeRoundAndPack:
! 1804: --zExp;
! 1805: return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
! 1806:
! 1807: }
! 1808:
! 1809: /*
! 1810: -------------------------------------------------------------------------------
! 1811: Returns the result of adding the single-precision floating-point values `a'
! 1812: and `b'. The operation is performed according to the IEC/IEEE Standard for
! 1813: Binary Floating-Point Arithmetic.
! 1814: -------------------------------------------------------------------------------
! 1815: */
! 1816: float32 float32_add( float32 a, float32 b )
! 1817: {
! 1818: flag aSign, bSign;
! 1819:
! 1820: aSign = extractFloat32Sign( a );
! 1821: bSign = extractFloat32Sign( b );
! 1822: if ( aSign == bSign ) {
! 1823: return addFloat32Sigs( a, b, aSign );
! 1824: }
! 1825: else {
! 1826: return subFloat32Sigs( a, b, aSign );
! 1827: }
! 1828:
! 1829: }
! 1830:
! 1831: /*
! 1832: -------------------------------------------------------------------------------
! 1833: Returns the result of subtracting the single-precision floating-point values
! 1834: `a' and `b'. The operation is performed according to the IEC/IEEE Standard
! 1835: for Binary Floating-Point Arithmetic.
! 1836: -------------------------------------------------------------------------------
! 1837: */
! 1838: float32 float32_sub( float32 a, float32 b )
! 1839: {
! 1840: flag aSign, bSign;
! 1841:
! 1842: aSign = extractFloat32Sign( a );
! 1843: bSign = extractFloat32Sign( b );
! 1844: if ( aSign == bSign ) {
! 1845: return subFloat32Sigs( a, b, aSign );
! 1846: }
! 1847: else {
! 1848: return addFloat32Sigs( a, b, aSign );
! 1849: }
! 1850:
! 1851: }
! 1852:
! 1853: /*
! 1854: -------------------------------------------------------------------------------
! 1855: Returns the result of multiplying the single-precision floating-point values
! 1856: `a' and `b'. The operation is performed according to the IEC/IEEE Standard
! 1857: for Binary Floating-Point Arithmetic.
! 1858: -------------------------------------------------------------------------------
! 1859: */
! 1860: float32 float32_mul( float32 a, float32 b )
! 1861: {
! 1862: flag aSign, bSign, zSign;
! 1863: int16 aExp, bExp, zExp;
! 1864: bits32 aSig, bSig;
! 1865: bits64 zSig64;
! 1866: bits32 zSig;
! 1867:
! 1868: aSig = extractFloat32Frac( a );
! 1869: aExp = extractFloat32Exp( a );
! 1870: aSign = extractFloat32Sign( a );
! 1871: bSig = extractFloat32Frac( b );
! 1872: bExp = extractFloat32Exp( b );
! 1873: bSign = extractFloat32Sign( b );
! 1874: zSign = aSign ^ bSign;
! 1875: if ( aExp == 0xFF ) {
! 1876: if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
! 1877: return propagateFloat32NaN( a, b );
! 1878: }
! 1879: if ( ( bExp | bSig ) == 0 ) {
! 1880: float_raise( float_flag_invalid );
! 1881: return float32_default_nan;
! 1882: }
! 1883: return packFloat32( zSign, 0xFF, 0 );
! 1884: }
! 1885: if ( bExp == 0xFF ) {
! 1886: if ( bSig ) return propagateFloat32NaN( a, b );
! 1887: if ( ( aExp | aSig ) == 0 ) {
! 1888: float_raise( float_flag_invalid );
! 1889: return float32_default_nan;
! 1890: }
! 1891: return packFloat32( zSign, 0xFF, 0 );
! 1892: }
! 1893: if ( aExp == 0 ) {
! 1894: if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
! 1895: normalizeFloat32Subnormal( aSig, &aExp, &aSig );
! 1896: }
! 1897: if ( bExp == 0 ) {
! 1898: if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
! 1899: normalizeFloat32Subnormal( bSig, &bExp, &bSig );
! 1900: }
! 1901: zExp = aExp + bExp - 0x7F;
! 1902: aSig = ( aSig | 0x00800000 )<<7;
! 1903: bSig = ( bSig | 0x00800000 )<<8;
! 1904: shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
! 1905: zSig = zSig64;
! 1906: if ( 0 <= (sbits32) ( zSig<<1 ) ) {
! 1907: zSig <<= 1;
! 1908: --zExp;
! 1909: }
! 1910: return roundAndPackFloat32( zSign, zExp, zSig );
! 1911:
! 1912: }
! 1913:
! 1914: /*
! 1915: -------------------------------------------------------------------------------
! 1916: Returns the result of dividing the single-precision floating-point value `a'
! 1917: by the corresponding value `b'. The operation is performed according to the
! 1918: IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 1919: -------------------------------------------------------------------------------
! 1920: */
! 1921: float32 float32_div( float32 a, float32 b )
! 1922: {
! 1923: flag aSign, bSign, zSign;
! 1924: int16 aExp, bExp, zExp;
! 1925: bits32 aSig, bSig, zSig;
! 1926:
! 1927: aSig = extractFloat32Frac( a );
! 1928: aExp = extractFloat32Exp( a );
! 1929: aSign = extractFloat32Sign( a );
! 1930: bSig = extractFloat32Frac( b );
! 1931: bExp = extractFloat32Exp( b );
! 1932: bSign = extractFloat32Sign( b );
! 1933: zSign = aSign ^ bSign;
! 1934: if ( aExp == 0xFF ) {
! 1935: if ( aSig ) return propagateFloat32NaN( a, b );
! 1936: if ( bExp == 0xFF ) {
! 1937: if ( bSig ) return propagateFloat32NaN( a, b );
! 1938: float_raise( float_flag_invalid );
! 1939: return float32_default_nan;
! 1940: }
! 1941: return packFloat32( zSign, 0xFF, 0 );
! 1942: }
! 1943: if ( bExp == 0xFF ) {
! 1944: if ( bSig ) return propagateFloat32NaN( a, b );
! 1945: return packFloat32( zSign, 0, 0 );
! 1946: }
! 1947: if ( bExp == 0 ) {
! 1948: if ( bSig == 0 ) {
! 1949: if ( ( aExp | aSig ) == 0 ) {
! 1950: float_raise( float_flag_invalid );
! 1951: return float32_default_nan;
! 1952: }
! 1953: float_raise( float_flag_divbyzero );
! 1954: return packFloat32( zSign, 0xFF, 0 );
! 1955: }
! 1956: normalizeFloat32Subnormal( bSig, &bExp, &bSig );
! 1957: }
! 1958: if ( aExp == 0 ) {
! 1959: if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
! 1960: normalizeFloat32Subnormal( aSig, &aExp, &aSig );
! 1961: }
! 1962: zExp = aExp - bExp + 0x7D;
! 1963: aSig = ( aSig | 0x00800000 )<<7;
! 1964: bSig = ( bSig | 0x00800000 )<<8;
! 1965: if ( bSig <= ( aSig + aSig ) ) {
! 1966: aSig >>= 1;
! 1967: ++zExp;
! 1968: }
! 1969: zSig = ( ( (bits64) aSig )<<32 ) / bSig;
! 1970: if ( ( zSig & 0x3F ) == 0 ) {
! 1971: zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
! 1972: }
! 1973: return roundAndPackFloat32( zSign, zExp, zSig );
! 1974:
! 1975: }
! 1976:
! 1977: #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
! 1978: /*
! 1979: -------------------------------------------------------------------------------
! 1980: Returns the remainder of the single-precision floating-point value `a'
! 1981: with respect to the corresponding value `b'. The operation is performed
! 1982: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 1983: -------------------------------------------------------------------------------
! 1984: */
! 1985: float32 float32_rem( float32 a, float32 b )
! 1986: {
! 1987: flag aSign, bSign, zSign;
! 1988: int16 aExp, bExp, expDiff;
! 1989: bits32 aSig, bSig;
! 1990: bits32 q;
! 1991: bits64 aSig64, bSig64, q64;
! 1992: bits32 alternateASig;
! 1993: sbits32 sigMean;
! 1994:
! 1995: aSig = extractFloat32Frac( a );
! 1996: aExp = extractFloat32Exp( a );
! 1997: aSign = extractFloat32Sign( a );
! 1998: bSig = extractFloat32Frac( b );
! 1999: bExp = extractFloat32Exp( b );
! 2000: bSign = extractFloat32Sign( b );
! 2001: if ( aExp == 0xFF ) {
! 2002: if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
! 2003: return propagateFloat32NaN( a, b );
! 2004: }
! 2005: float_raise( float_flag_invalid );
! 2006: return float32_default_nan;
! 2007: }
! 2008: if ( bExp == 0xFF ) {
! 2009: if ( bSig ) return propagateFloat32NaN( a, b );
! 2010: return a;
! 2011: }
! 2012: if ( bExp == 0 ) {
! 2013: if ( bSig == 0 ) {
! 2014: float_raise( float_flag_invalid );
! 2015: return float32_default_nan;
! 2016: }
! 2017: normalizeFloat32Subnormal( bSig, &bExp, &bSig );
! 2018: }
! 2019: if ( aExp == 0 ) {
! 2020: if ( aSig == 0 ) return a;
! 2021: normalizeFloat32Subnormal( aSig, &aExp, &aSig );
! 2022: }
! 2023: expDiff = aExp - bExp;
! 2024: aSig |= 0x00800000;
! 2025: bSig |= 0x00800000;
! 2026: if ( expDiff < 32 ) {
! 2027: aSig <<= 8;
! 2028: bSig <<= 8;
! 2029: if ( expDiff < 0 ) {
! 2030: if ( expDiff < -1 ) return a;
! 2031: aSig >>= 1;
! 2032: }
! 2033: q = ( bSig <= aSig );
! 2034: if ( q ) aSig -= bSig;
! 2035: if ( 0 < expDiff ) {
! 2036: q = ( ( (bits64) aSig )<<32 ) / bSig;
! 2037: q >>= 32 - expDiff;
! 2038: bSig >>= 2;
! 2039: aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
! 2040: }
! 2041: else {
! 2042: aSig >>= 2;
! 2043: bSig >>= 2;
! 2044: }
! 2045: }
! 2046: else {
! 2047: if ( bSig <= aSig ) aSig -= bSig;
! 2048: aSig64 = ( (bits64) aSig )<<40;
! 2049: bSig64 = ( (bits64) bSig )<<40;
! 2050: expDiff -= 64;
! 2051: while ( 0 < expDiff ) {
! 2052: q64 = estimateDiv128To64( aSig64, 0, bSig64 );
! 2053: q64 = ( 2 < q64 ) ? q64 - 2 : 0;
! 2054: aSig64 = - ( ( bSig * q64 )<<38 );
! 2055: expDiff -= 62;
! 2056: }
! 2057: expDiff += 64;
! 2058: q64 = estimateDiv128To64( aSig64, 0, bSig64 );
! 2059: q64 = ( 2 < q64 ) ? q64 - 2 : 0;
! 2060: q = q64>>( 64 - expDiff );
! 2061: bSig <<= 6;
! 2062: aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
! 2063: }
! 2064: do {
! 2065: alternateASig = aSig;
! 2066: ++q;
! 2067: aSig -= bSig;
! 2068: } while ( 0 <= (sbits32) aSig );
! 2069: sigMean = aSig + alternateASig;
! 2070: if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
! 2071: aSig = alternateASig;
! 2072: }
! 2073: zSign = ( (sbits32) aSig < 0 );
! 2074: if ( zSign ) aSig = - aSig;
! 2075: return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
! 2076:
! 2077: }
! 2078: #endif /* !SOFTFLOAT_FOR_GCC */
! 2079:
! 2080: #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
! 2081: /*
! 2082: -------------------------------------------------------------------------------
! 2083: Returns the square root of the single-precision floating-point value `a'.
! 2084: The operation is performed according to the IEC/IEEE Standard for Binary
! 2085: Floating-Point Arithmetic.
! 2086: -------------------------------------------------------------------------------
! 2087: */
! 2088: float32 float32_sqrt( float32 a )
! 2089: {
! 2090: flag aSign;
! 2091: int16 aExp, zExp;
! 2092: bits32 aSig, zSig;
! 2093: bits64 rem, term;
! 2094:
! 2095: aSig = extractFloat32Frac( a );
! 2096: aExp = extractFloat32Exp( a );
! 2097: aSign = extractFloat32Sign( a );
! 2098: if ( aExp == 0xFF ) {
! 2099: if ( aSig ) return propagateFloat32NaN( a, 0 );
! 2100: if ( ! aSign ) return a;
! 2101: float_raise( float_flag_invalid );
! 2102: return float32_default_nan;
! 2103: }
! 2104: if ( aSign ) {
! 2105: if ( ( aExp | aSig ) == 0 ) return a;
! 2106: float_raise( float_flag_invalid );
! 2107: return float32_default_nan;
! 2108: }
! 2109: if ( aExp == 0 ) {
! 2110: if ( aSig == 0 ) return 0;
! 2111: normalizeFloat32Subnormal( aSig, &aExp, &aSig );
! 2112: }
! 2113: zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
! 2114: aSig = ( aSig | 0x00800000 )<<8;
! 2115: zSig = estimateSqrt32( aExp, aSig ) + 2;
! 2116: if ( ( zSig & 0x7F ) <= 5 ) {
! 2117: if ( zSig < 2 ) {
! 2118: zSig = 0x7FFFFFFF;
! 2119: goto roundAndPack;
! 2120: }
! 2121: aSig >>= aExp & 1;
! 2122: term = ( (bits64) zSig ) * zSig;
! 2123: rem = ( ( (bits64) aSig )<<32 ) - term;
! 2124: while ( (sbits64) rem < 0 ) {
! 2125: --zSig;
! 2126: rem += ( ( (bits64) zSig )<<1 ) | 1;
! 2127: }
! 2128: zSig |= ( rem != 0 );
! 2129: }
! 2130: shift32RightJamming( zSig, 1, &zSig );
! 2131: roundAndPack:
! 2132: return roundAndPackFloat32( 0, zExp, zSig );
! 2133:
! 2134: }
! 2135: #endif /* !SOFTFLOAT_FOR_GCC */
! 2136:
! 2137: /*
! 2138: -------------------------------------------------------------------------------
! 2139: Returns 1 if the single-precision floating-point value `a' is equal to
! 2140: the corresponding value `b', and 0 otherwise. The comparison is performed
! 2141: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 2142: -------------------------------------------------------------------------------
! 2143: */
! 2144: flag float32_eq( float32 a, float32 b )
! 2145: {
! 2146:
! 2147: if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
! 2148: || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
! 2149: ) {
! 2150: if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
! 2151: float_raise( float_flag_invalid );
! 2152: }
! 2153: return 0;
! 2154: }
! 2155: return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
! 2156:
! 2157: }
! 2158:
! 2159: /*
! 2160: -------------------------------------------------------------------------------
! 2161: Returns 1 if the single-precision floating-point value `a' is less than
! 2162: or equal to the corresponding value `b', and 0 otherwise. The comparison
! 2163: is performed according to the IEC/IEEE Standard for Binary Floating-Point
! 2164: Arithmetic.
! 2165: -------------------------------------------------------------------------------
! 2166: */
! 2167: flag float32_le( float32 a, float32 b )
! 2168: {
! 2169: flag aSign, bSign;
! 2170:
! 2171: if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
! 2172: || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
! 2173: ) {
! 2174: float_raise( float_flag_invalid );
! 2175: return 0;
! 2176: }
! 2177: aSign = extractFloat32Sign( a );
! 2178: bSign = extractFloat32Sign( b );
! 2179: if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
! 2180: return ( a == b ) || ( aSign ^ ( a < b ) );
! 2181:
! 2182: }
! 2183:
! 2184: /*
! 2185: -------------------------------------------------------------------------------
! 2186: Returns 1 if the single-precision floating-point value `a' is less than
! 2187: the corresponding value `b', and 0 otherwise. The comparison is performed
! 2188: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 2189: -------------------------------------------------------------------------------
! 2190: */
! 2191: flag float32_lt( float32 a, float32 b )
! 2192: {
! 2193: flag aSign, bSign;
! 2194:
! 2195: if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
! 2196: || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
! 2197: ) {
! 2198: float_raise( float_flag_invalid );
! 2199: return 0;
! 2200: }
! 2201: aSign = extractFloat32Sign( a );
! 2202: bSign = extractFloat32Sign( b );
! 2203: if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
! 2204: return ( a != b ) && ( aSign ^ ( a < b ) );
! 2205:
! 2206: }
! 2207:
! 2208: #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
! 2209: /*
! 2210: -------------------------------------------------------------------------------
! 2211: Returns 1 if the single-precision floating-point value `a' is equal to
! 2212: the corresponding value `b', and 0 otherwise. The invalid exception is
! 2213: raised if either operand is a NaN. Otherwise, the comparison is performed
! 2214: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 2215: -------------------------------------------------------------------------------
! 2216: */
! 2217: flag float32_eq_signaling( float32 a, float32 b )
! 2218: {
! 2219:
! 2220: if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
! 2221: || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
! 2222: ) {
! 2223: float_raise( float_flag_invalid );
! 2224: return 0;
! 2225: }
! 2226: return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
! 2227:
! 2228: }
! 2229:
! 2230: /*
! 2231: -------------------------------------------------------------------------------
! 2232: Returns 1 if the single-precision floating-point value `a' is less than or
! 2233: equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
! 2234: cause an exception. Otherwise, the comparison is performed according to the
! 2235: IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 2236: -------------------------------------------------------------------------------
! 2237: */
! 2238: flag float32_le_quiet( float32 a, float32 b )
! 2239: {
! 2240: flag aSign, bSign;
! 2241:
! 2242: if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
! 2243: || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
! 2244: ) {
! 2245: if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
! 2246: float_raise( float_flag_invalid );
! 2247: }
! 2248: return 0;
! 2249: }
! 2250: aSign = extractFloat32Sign( a );
! 2251: bSign = extractFloat32Sign( b );
! 2252: if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
! 2253: return ( a == b ) || ( aSign ^ ( a < b ) );
! 2254:
! 2255: }
! 2256:
! 2257: /*
! 2258: -------------------------------------------------------------------------------
! 2259: Returns 1 if the single-precision floating-point value `a' is less than
! 2260: the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
! 2261: exception. Otherwise, the comparison is performed according to the IEC/IEEE
! 2262: Standard for Binary Floating-Point Arithmetic.
! 2263: -------------------------------------------------------------------------------
! 2264: */
! 2265: flag float32_lt_quiet( float32 a, float32 b )
! 2266: {
! 2267: flag aSign, bSign;
! 2268:
! 2269: if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
! 2270: || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
! 2271: ) {
! 2272: if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
! 2273: float_raise( float_flag_invalid );
! 2274: }
! 2275: return 0;
! 2276: }
! 2277: aSign = extractFloat32Sign( a );
! 2278: bSign = extractFloat32Sign( b );
! 2279: if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
! 2280: return ( a != b ) && ( aSign ^ ( a < b ) );
! 2281:
! 2282: }
! 2283: #endif /* !SOFTFLOAT_FOR_GCC */
! 2284:
! 2285: #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
! 2286: /*
! 2287: -------------------------------------------------------------------------------
! 2288: Returns the result of converting the double-precision floating-point value
! 2289: `a' to the 32-bit two's complement integer format. The conversion is
! 2290: performed according to the IEC/IEEE Standard for Binary Floating-Point
! 2291: Arithmetic---which means in particular that the conversion is rounded
! 2292: according to the current rounding mode. If `a' is a NaN, the largest
! 2293: positive integer is returned. Otherwise, if the conversion overflows, the
! 2294: largest integer with the same sign as `a' is returned.
! 2295: -------------------------------------------------------------------------------
! 2296: */
! 2297: int32 float64_to_int32( float64 a )
! 2298: {
! 2299: flag aSign;
! 2300: int16 aExp, shiftCount;
! 2301: bits64 aSig;
! 2302:
! 2303: aSig = extractFloat64Frac( a );
! 2304: aExp = extractFloat64Exp( a );
! 2305: aSign = extractFloat64Sign( a );
! 2306: if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
! 2307: if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
! 2308: shiftCount = 0x42C - aExp;
! 2309: if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
! 2310: return roundAndPackInt32( aSign, aSig );
! 2311:
! 2312: }
! 2313: #endif /* !SOFTFLOAT_FOR_GCC */
! 2314:
! 2315: /*
! 2316: -------------------------------------------------------------------------------
! 2317: Returns the result of converting the double-precision floating-point value
! 2318: `a' to the 32-bit two's complement integer format. The conversion is
! 2319: performed according to the IEC/IEEE Standard for Binary Floating-Point
! 2320: Arithmetic, except that the conversion is always rounded toward zero.
! 2321: If `a' is a NaN, the largest positive integer is returned. Otherwise, if
! 2322: the conversion overflows, the largest integer with the same sign as `a' is
! 2323: returned.
! 2324: -------------------------------------------------------------------------------
! 2325: */
! 2326: int32 float64_to_int32_round_to_zero( float64 a )
! 2327: {
! 2328: flag aSign;
! 2329: int16 aExp, shiftCount;
! 2330: bits64 aSig, savedASig;
! 2331: int32 z;
! 2332:
! 2333: aSig = extractFloat64Frac( a );
! 2334: aExp = extractFloat64Exp( a );
! 2335: aSign = extractFloat64Sign( a );
! 2336: if ( 0x41E < aExp ) {
! 2337: if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
! 2338: goto invalid;
! 2339: }
! 2340: else if ( aExp < 0x3FF ) {
! 2341: if ( aExp || aSig ) float_set_inexact();
! 2342: return 0;
! 2343: }
! 2344: aSig |= LIT64( 0x0010000000000000 );
! 2345: shiftCount = 0x433 - aExp;
! 2346: savedASig = aSig;
! 2347: aSig >>= shiftCount;
! 2348: z = aSig;
! 2349: if ( aSign ) z = - z;
! 2350: if ( ( z < 0 ) ^ aSign ) {
! 2351: invalid:
! 2352: float_raise( float_flag_invalid );
! 2353: return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
! 2354: }
! 2355: if ( ( aSig<<shiftCount ) != savedASig ) {
! 2356: float_set_inexact();
! 2357: }
! 2358: return z;
! 2359:
! 2360: }
! 2361:
! 2362: #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
! 2363: /*
! 2364: -------------------------------------------------------------------------------
! 2365: Returns the result of converting the double-precision floating-point value
! 2366: `a' to the 64-bit two's complement integer format. The conversion is
! 2367: performed according to the IEC/IEEE Standard for Binary Floating-Point
! 2368: Arithmetic---which means in particular that the conversion is rounded
! 2369: according to the current rounding mode. If `a' is a NaN, the largest
! 2370: positive integer is returned. Otherwise, if the conversion overflows, the
! 2371: largest integer with the same sign as `a' is returned.
! 2372: -------------------------------------------------------------------------------
! 2373: */
! 2374: int64 float64_to_int64( float64 a )
! 2375: {
! 2376: flag aSign;
! 2377: int16 aExp, shiftCount;
! 2378: bits64 aSig, aSigExtra;
! 2379:
! 2380: aSig = extractFloat64Frac( a );
! 2381: aExp = extractFloat64Exp( a );
! 2382: aSign = extractFloat64Sign( a );
! 2383: if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
! 2384: shiftCount = 0x433 - aExp;
! 2385: if ( shiftCount <= 0 ) {
! 2386: if ( 0x43E < aExp ) {
! 2387: float_raise( float_flag_invalid );
! 2388: if ( ! aSign
! 2389: || ( ( aExp == 0x7FF )
! 2390: && ( aSig != LIT64( 0x0010000000000000 ) ) )
! 2391: ) {
! 2392: return LIT64( 0x7FFFFFFFFFFFFFFF );
! 2393: }
! 2394: return (sbits64) LIT64( 0x8000000000000000 );
! 2395: }
! 2396: aSigExtra = 0;
! 2397: aSig <<= - shiftCount;
! 2398: }
! 2399: else {
! 2400: shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
! 2401: }
! 2402: return roundAndPackInt64( aSign, aSig, aSigExtra );
! 2403:
! 2404: }
! 2405:
! 2406: /*
! 2407: -------------------------------------------------------------------------------
! 2408: Returns the result of converting the double-precision floating-point value
! 2409: `a' to the 64-bit two's complement integer format. The conversion is
! 2410: performed according to the IEC/IEEE Standard for Binary Floating-Point
! 2411: Arithmetic, except that the conversion is always rounded toward zero.
! 2412: If `a' is a NaN, the largest positive integer is returned. Otherwise, if
! 2413: the conversion overflows, the largest integer with the same sign as `a' is
! 2414: returned.
! 2415: -------------------------------------------------------------------------------
! 2416: */
! 2417: int64 float64_to_int64_round_to_zero( float64 a )
! 2418: {
! 2419: flag aSign;
! 2420: int16 aExp, shiftCount;
! 2421: bits64 aSig;
! 2422: int64 z;
! 2423:
! 2424: aSig = extractFloat64Frac( a );
! 2425: aExp = extractFloat64Exp( a );
! 2426: aSign = extractFloat64Sign( a );
! 2427: if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
! 2428: shiftCount = aExp - 0x433;
! 2429: if ( 0 <= shiftCount ) {
! 2430: if ( 0x43E <= aExp ) {
! 2431: if ( a != LIT64( 0xC3E0000000000000 ) ) {
! 2432: float_raise( float_flag_invalid );
! 2433: if ( ! aSign
! 2434: || ( ( aExp == 0x7FF )
! 2435: && ( aSig != LIT64( 0x0010000000000000 ) ) )
! 2436: ) {
! 2437: return LIT64( 0x7FFFFFFFFFFFFFFF );
! 2438: }
! 2439: }
! 2440: return (sbits64) LIT64( 0x8000000000000000 );
! 2441: }
! 2442: z = aSig<<shiftCount;
! 2443: }
! 2444: else {
! 2445: if ( aExp < 0x3FE ) {
! 2446: if ( aExp | aSig ) float_set_inexact();
! 2447: return 0;
! 2448: }
! 2449: z = aSig>>( - shiftCount );
! 2450: if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
! 2451: float_set_inexact();
! 2452: }
! 2453: }
! 2454: if ( aSign ) z = - z;
! 2455: return z;
! 2456:
! 2457: }
! 2458: #endif /* !SOFTFLOAT_FOR_GCC */
! 2459:
! 2460: /*
! 2461: -------------------------------------------------------------------------------
! 2462: Returns the result of converting the double-precision floating-point value
! 2463: `a' to the single-precision floating-point format. The conversion is
! 2464: performed according to the IEC/IEEE Standard for Binary Floating-Point
! 2465: Arithmetic.
! 2466: -------------------------------------------------------------------------------
! 2467: */
! 2468: float32 float64_to_float32( float64 a )
! 2469: {
! 2470: flag aSign;
! 2471: int16 aExp;
! 2472: bits64 aSig;
! 2473: bits32 zSig;
! 2474:
! 2475: aSig = extractFloat64Frac( a );
! 2476: aExp = extractFloat64Exp( a );
! 2477: aSign = extractFloat64Sign( a );
! 2478: if ( aExp == 0x7FF ) {
! 2479: if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
! 2480: return packFloat32( aSign, 0xFF, 0 );
! 2481: }
! 2482: shift64RightJamming( aSig, 22, &aSig );
! 2483: zSig = aSig;
! 2484: if ( aExp || zSig ) {
! 2485: zSig |= 0x40000000;
! 2486: aExp -= 0x381;
! 2487: }
! 2488: return roundAndPackFloat32( aSign, aExp, zSig );
! 2489:
! 2490: }
! 2491:
! 2492: #ifdef FLOATX80
! 2493:
! 2494: /*
! 2495: -------------------------------------------------------------------------------
! 2496: Returns the result of converting the double-precision floating-point value
! 2497: `a' to the extended double-precision floating-point format. The conversion
! 2498: is performed according to the IEC/IEEE Standard for Binary Floating-Point
! 2499: Arithmetic.
! 2500: -------------------------------------------------------------------------------
! 2501: */
! 2502: floatx80 float64_to_floatx80( float64 a )
! 2503: {
! 2504: flag aSign;
! 2505: int16 aExp;
! 2506: bits64 aSig;
! 2507:
! 2508: aSig = extractFloat64Frac( a );
! 2509: aExp = extractFloat64Exp( a );
! 2510: aSign = extractFloat64Sign( a );
! 2511: if ( aExp == 0x7FF ) {
! 2512: if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
! 2513: return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
! 2514: }
! 2515: if ( aExp == 0 ) {
! 2516: if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
! 2517: normalizeFloat64Subnormal( aSig, &aExp, &aSig );
! 2518: }
! 2519: return
! 2520: packFloatx80(
! 2521: aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
! 2522:
! 2523: }
! 2524:
! 2525: #endif
! 2526:
! 2527: #ifdef FLOAT128
! 2528:
! 2529: /*
! 2530: -------------------------------------------------------------------------------
! 2531: Returns the result of converting the double-precision floating-point value
! 2532: `a' to the quadruple-precision floating-point format. The conversion is
! 2533: performed according to the IEC/IEEE Standard for Binary Floating-Point
! 2534: Arithmetic.
! 2535: -------------------------------------------------------------------------------
! 2536: */
! 2537: float128 float64_to_float128( float64 a )
! 2538: {
! 2539: flag aSign;
! 2540: int16 aExp;
! 2541: bits64 aSig, zSig0, zSig1;
! 2542:
! 2543: aSig = extractFloat64Frac( a );
! 2544: aExp = extractFloat64Exp( a );
! 2545: aSign = extractFloat64Sign( a );
! 2546: if ( aExp == 0x7FF ) {
! 2547: if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
! 2548: return packFloat128( aSign, 0x7FFF, 0, 0 );
! 2549: }
! 2550: if ( aExp == 0 ) {
! 2551: if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
! 2552: normalizeFloat64Subnormal( aSig, &aExp, &aSig );
! 2553: --aExp;
! 2554: }
! 2555: shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
! 2556: return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
! 2557:
! 2558: }
! 2559:
! 2560: #endif
! 2561:
! 2562: #ifndef SOFTFLOAT_FOR_GCC
! 2563: /*
! 2564: -------------------------------------------------------------------------------
! 2565: Rounds the double-precision floating-point value `a' to an integer, and
! 2566: returns the result as a double-precision floating-point value. The
! 2567: operation is performed according to the IEC/IEEE Standard for Binary
! 2568: Floating-Point Arithmetic.
! 2569: -------------------------------------------------------------------------------
! 2570: */
! 2571: float64 float64_round_to_int( float64 a )
! 2572: {
! 2573: flag aSign;
! 2574: int16 aExp;
! 2575: bits64 lastBitMask, roundBitsMask;
! 2576: int8 roundingMode;
! 2577: float64 z;
! 2578:
! 2579: aExp = extractFloat64Exp( a );
! 2580: if ( 0x433 <= aExp ) {
! 2581: if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
! 2582: return propagateFloat64NaN( a, a );
! 2583: }
! 2584: return a;
! 2585: }
! 2586: if ( aExp < 0x3FF ) {
! 2587: if ( (bits64) ( a<<1 ) == 0 ) return a;
! 2588: float_set_inexact();
! 2589: aSign = extractFloat64Sign( a );
! 2590: switch ( float_rounding_mode() ) {
! 2591: case float_round_nearest_even:
! 2592: if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
! 2593: return packFloat64( aSign, 0x3FF, 0 );
! 2594: }
! 2595: break;
! 2596: case float_round_down:
! 2597: return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
! 2598: case float_round_up:
! 2599: return
! 2600: aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
! 2601: }
! 2602: return packFloat64( aSign, 0, 0 );
! 2603: }
! 2604: lastBitMask = 1;
! 2605: lastBitMask <<= 0x433 - aExp;
! 2606: roundBitsMask = lastBitMask - 1;
! 2607: z = a;
! 2608: roundingMode = float_rounding_mode();
! 2609: if ( roundingMode == float_round_nearest_even ) {
! 2610: z += lastBitMask>>1;
! 2611: if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
! 2612: }
! 2613: else if ( roundingMode != float_round_to_zero ) {
! 2614: if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
! 2615: z += roundBitsMask;
! 2616: }
! 2617: }
! 2618: z &= ~ roundBitsMask;
! 2619: if ( z != a ) float_set_inexact();
! 2620: return z;
! 2621:
! 2622: }
! 2623: #endif
! 2624:
! 2625: /*
! 2626: -------------------------------------------------------------------------------
! 2627: Returns the result of adding the absolute values of the double-precision
! 2628: floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
! 2629: before being returned. `zSign' is ignored if the result is a NaN.
! 2630: The addition is performed according to the IEC/IEEE Standard for Binary
! 2631: Floating-Point Arithmetic.
! 2632: -------------------------------------------------------------------------------
! 2633: */
! 2634: static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
! 2635: {
! 2636: int16 aExp, bExp, zExp;
! 2637: bits64 aSig, bSig, zSig;
! 2638: int16 expDiff;
! 2639:
! 2640: aSig = extractFloat64Frac( a );
! 2641: aExp = extractFloat64Exp( a );
! 2642: bSig = extractFloat64Frac( b );
! 2643: bExp = extractFloat64Exp( b );
! 2644: expDiff = aExp - bExp;
! 2645: aSig <<= 9;
! 2646: bSig <<= 9;
! 2647: if ( 0 < expDiff ) {
! 2648: if ( aExp == 0x7FF ) {
! 2649: if ( aSig ) return propagateFloat64NaN( a, b );
! 2650: return a;
! 2651: }
! 2652: if ( bExp == 0 ) {
! 2653: --expDiff;
! 2654: }
! 2655: else {
! 2656: bSig |= LIT64( 0x2000000000000000 );
! 2657: }
! 2658: shift64RightJamming( bSig, expDiff, &bSig );
! 2659: zExp = aExp;
! 2660: }
! 2661: else if ( expDiff < 0 ) {
! 2662: if ( bExp == 0x7FF ) {
! 2663: if ( bSig ) return propagateFloat64NaN( a, b );
! 2664: return packFloat64( zSign, 0x7FF, 0 );
! 2665: }
! 2666: if ( aExp == 0 ) {
! 2667: ++expDiff;
! 2668: }
! 2669: else {
! 2670: aSig |= LIT64( 0x2000000000000000 );
! 2671: }
! 2672: shift64RightJamming( aSig, - expDiff, &aSig );
! 2673: zExp = bExp;
! 2674: }
! 2675: else {
! 2676: if ( aExp == 0x7FF ) {
! 2677: if ( aSig | bSig ) return propagateFloat64NaN( a, b );
! 2678: return a;
! 2679: }
! 2680: if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
! 2681: zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
! 2682: zExp = aExp;
! 2683: goto roundAndPack;
! 2684: }
! 2685: aSig |= LIT64( 0x2000000000000000 );
! 2686: zSig = ( aSig + bSig )<<1;
! 2687: --zExp;
! 2688: if ( (sbits64) zSig < 0 ) {
! 2689: zSig = aSig + bSig;
! 2690: ++zExp;
! 2691: }
! 2692: roundAndPack:
! 2693: return roundAndPackFloat64( zSign, zExp, zSig );
! 2694:
! 2695: }
! 2696:
! 2697: /*
! 2698: -------------------------------------------------------------------------------
! 2699: Returns the result of subtracting the absolute values of the double-
! 2700: precision floating-point values `a' and `b'. If `zSign' is 1, the
! 2701: difference is negated before being returned. `zSign' is ignored if the
! 2702: result is a NaN. The subtraction is performed according to the IEC/IEEE
! 2703: Standard for Binary Floating-Point Arithmetic.
! 2704: -------------------------------------------------------------------------------
! 2705: */
! 2706: static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
! 2707: {
! 2708: int16 aExp, bExp, zExp;
! 2709: bits64 aSig, bSig, zSig;
! 2710: int16 expDiff;
! 2711:
! 2712: aSig = extractFloat64Frac( a );
! 2713: aExp = extractFloat64Exp( a );
! 2714: bSig = extractFloat64Frac( b );
! 2715: bExp = extractFloat64Exp( b );
! 2716: expDiff = aExp - bExp;
! 2717: aSig <<= 10;
! 2718: bSig <<= 10;
! 2719: if ( 0 < expDiff ) goto aExpBigger;
! 2720: if ( expDiff < 0 ) goto bExpBigger;
! 2721: if ( aExp == 0x7FF ) {
! 2722: if ( aSig | bSig ) return propagateFloat64NaN( a, b );
! 2723: float_raise( float_flag_invalid );
! 2724: return float64_default_nan;
! 2725: }
! 2726: if ( aExp == 0 ) {
! 2727: aExp = 1;
! 2728: bExp = 1;
! 2729: }
! 2730: if ( bSig < aSig ) goto aBigger;
! 2731: if ( aSig < bSig ) goto bBigger;
! 2732: return packFloat64( float_rounding_mode() == float_round_down, 0, 0 );
! 2733: bExpBigger:
! 2734: if ( bExp == 0x7FF ) {
! 2735: if ( bSig ) return propagateFloat64NaN( a, b );
! 2736: return packFloat64( zSign ^ 1, 0x7FF, 0 );
! 2737: }
! 2738: if ( aExp == 0 ) {
! 2739: ++expDiff;
! 2740: }
! 2741: else {
! 2742: aSig |= LIT64( 0x4000000000000000 );
! 2743: }
! 2744: shift64RightJamming( aSig, - expDiff, &aSig );
! 2745: bSig |= LIT64( 0x4000000000000000 );
! 2746: bBigger:
! 2747: zSig = bSig - aSig;
! 2748: zExp = bExp;
! 2749: zSign ^= 1;
! 2750: goto normalizeRoundAndPack;
! 2751: aExpBigger:
! 2752: if ( aExp == 0x7FF ) {
! 2753: if ( aSig ) return propagateFloat64NaN( a, b );
! 2754: return a;
! 2755: }
! 2756: if ( bExp == 0 ) {
! 2757: --expDiff;
! 2758: }
! 2759: else {
! 2760: bSig |= LIT64( 0x4000000000000000 );
! 2761: }
! 2762: shift64RightJamming( bSig, expDiff, &bSig );
! 2763: aSig |= LIT64( 0x4000000000000000 );
! 2764: aBigger:
! 2765: zSig = aSig - bSig;
! 2766: zExp = aExp;
! 2767: normalizeRoundAndPack:
! 2768: --zExp;
! 2769: return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
! 2770:
! 2771: }
! 2772:
! 2773: /*
! 2774: -------------------------------------------------------------------------------
! 2775: Returns the result of adding the double-precision floating-point values `a'
! 2776: and `b'. The operation is performed according to the IEC/IEEE Standard for
! 2777: Binary Floating-Point Arithmetic.
! 2778: -------------------------------------------------------------------------------
! 2779: */
! 2780: float64 float64_add( float64 a, float64 b )
! 2781: {
! 2782: flag aSign, bSign;
! 2783:
! 2784: aSign = extractFloat64Sign( a );
! 2785: bSign = extractFloat64Sign( b );
! 2786: if ( aSign == bSign ) {
! 2787: return addFloat64Sigs( a, b, aSign );
! 2788: }
! 2789: else {
! 2790: return subFloat64Sigs( a, b, aSign );
! 2791: }
! 2792:
! 2793: }
! 2794:
! 2795: /*
! 2796: -------------------------------------------------------------------------------
! 2797: Returns the result of subtracting the double-precision floating-point values
! 2798: `a' and `b'. The operation is performed according to the IEC/IEEE Standard
! 2799: for Binary Floating-Point Arithmetic.
! 2800: -------------------------------------------------------------------------------
! 2801: */
! 2802: float64 float64_sub( float64 a, float64 b )
! 2803: {
! 2804: flag aSign, bSign;
! 2805:
! 2806: aSign = extractFloat64Sign( a );
! 2807: bSign = extractFloat64Sign( b );
! 2808: if ( aSign == bSign ) {
! 2809: return subFloat64Sigs( a, b, aSign );
! 2810: }
! 2811: else {
! 2812: return addFloat64Sigs( a, b, aSign );
! 2813: }
! 2814:
! 2815: }
! 2816:
! 2817: /*
! 2818: -------------------------------------------------------------------------------
! 2819: Returns the result of multiplying the double-precision floating-point values
! 2820: `a' and `b'. The operation is performed according to the IEC/IEEE Standard
! 2821: for Binary Floating-Point Arithmetic.
! 2822: -------------------------------------------------------------------------------
! 2823: */
! 2824: float64 float64_mul( float64 a, float64 b )
! 2825: {
! 2826: flag aSign, bSign, zSign;
! 2827: int16 aExp, bExp, zExp;
! 2828: bits64 aSig, bSig, zSig0, zSig1;
! 2829:
! 2830: aSig = extractFloat64Frac( a );
! 2831: aExp = extractFloat64Exp( a );
! 2832: aSign = extractFloat64Sign( a );
! 2833: bSig = extractFloat64Frac( b );
! 2834: bExp = extractFloat64Exp( b );
! 2835: bSign = extractFloat64Sign( b );
! 2836: zSign = aSign ^ bSign;
! 2837: if ( aExp == 0x7FF ) {
! 2838: if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
! 2839: return propagateFloat64NaN( a, b );
! 2840: }
! 2841: if ( ( bExp | bSig ) == 0 ) {
! 2842: float_raise( float_flag_invalid );
! 2843: return float64_default_nan;
! 2844: }
! 2845: return packFloat64( zSign, 0x7FF, 0 );
! 2846: }
! 2847: if ( bExp == 0x7FF ) {
! 2848: if ( bSig ) return propagateFloat64NaN( a, b );
! 2849: if ( ( aExp | aSig ) == 0 ) {
! 2850: float_raise( float_flag_invalid );
! 2851: return float64_default_nan;
! 2852: }
! 2853: return packFloat64( zSign, 0x7FF, 0 );
! 2854: }
! 2855: if ( aExp == 0 ) {
! 2856: if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
! 2857: normalizeFloat64Subnormal( aSig, &aExp, &aSig );
! 2858: }
! 2859: if ( bExp == 0 ) {
! 2860: if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
! 2861: normalizeFloat64Subnormal( bSig, &bExp, &bSig );
! 2862: }
! 2863: zExp = aExp + bExp - 0x3FF;
! 2864: aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
! 2865: bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
! 2866: mul64To128( aSig, bSig, &zSig0, &zSig1 );
! 2867: zSig0 |= ( zSig1 != 0 );
! 2868: if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
! 2869: zSig0 <<= 1;
! 2870: --zExp;
! 2871: }
! 2872: return roundAndPackFloat64( zSign, zExp, zSig0 );
! 2873:
! 2874: }
! 2875:
! 2876: /*
! 2877: -------------------------------------------------------------------------------
! 2878: Returns the result of dividing the double-precision floating-point value `a'
! 2879: by the corresponding value `b'. The operation is performed according to
! 2880: the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 2881: -------------------------------------------------------------------------------
! 2882: */
! 2883: float64 float64_div( float64 a, float64 b )
! 2884: {
! 2885: flag aSign, bSign, zSign;
! 2886: int16 aExp, bExp, zExp;
! 2887: bits64 aSig, bSig, zSig;
! 2888: bits64 rem0, rem1;
! 2889: bits64 term0, term1;
! 2890:
! 2891: aSig = extractFloat64Frac( a );
! 2892: aExp = extractFloat64Exp( a );
! 2893: aSign = extractFloat64Sign( a );
! 2894: bSig = extractFloat64Frac( b );
! 2895: bExp = extractFloat64Exp( b );
! 2896: bSign = extractFloat64Sign( b );
! 2897: zSign = aSign ^ bSign;
! 2898: if ( aExp == 0x7FF ) {
! 2899: if ( aSig ) return propagateFloat64NaN( a, b );
! 2900: if ( bExp == 0x7FF ) {
! 2901: if ( bSig ) return propagateFloat64NaN( a, b );
! 2902: float_raise( float_flag_invalid );
! 2903: return float64_default_nan;
! 2904: }
! 2905: return packFloat64( zSign, 0x7FF, 0 );
! 2906: }
! 2907: if ( bExp == 0x7FF ) {
! 2908: if ( bSig ) return propagateFloat64NaN( a, b );
! 2909: return packFloat64( zSign, 0, 0 );
! 2910: }
! 2911: if ( bExp == 0 ) {
! 2912: if ( bSig == 0 ) {
! 2913: if ( ( aExp | aSig ) == 0 ) {
! 2914: float_raise( float_flag_invalid );
! 2915: return float64_default_nan;
! 2916: }
! 2917: float_raise( float_flag_divbyzero );
! 2918: return packFloat64( zSign, 0x7FF, 0 );
! 2919: }
! 2920: normalizeFloat64Subnormal( bSig, &bExp, &bSig );
! 2921: }
! 2922: if ( aExp == 0 ) {
! 2923: if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
! 2924: normalizeFloat64Subnormal( aSig, &aExp, &aSig );
! 2925: }
! 2926: zExp = aExp - bExp + 0x3FD;
! 2927: aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
! 2928: bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
! 2929: if ( bSig <= ( aSig + aSig ) ) {
! 2930: aSig >>= 1;
! 2931: ++zExp;
! 2932: }
! 2933: zSig = estimateDiv128To64( aSig, 0, bSig );
! 2934: if ( ( zSig & 0x1FF ) <= 2 ) {
! 2935: mul64To128( bSig, zSig, &term0, &term1 );
! 2936: sub128( aSig, 0, term0, term1, &rem0, &rem1 );
! 2937: while ( (sbits64) rem0 < 0 ) {
! 2938: --zSig;
! 2939: add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
! 2940: }
! 2941: zSig |= ( rem1 != 0 );
! 2942: }
! 2943: return roundAndPackFloat64( zSign, zExp, zSig );
! 2944:
! 2945: }
! 2946:
! 2947: #ifndef SOFTFLOAT_FOR_GCC
! 2948: /*
! 2949: -------------------------------------------------------------------------------
! 2950: Returns the remainder of the double-precision floating-point value `a'
! 2951: with respect to the corresponding value `b'. The operation is performed
! 2952: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 2953: -------------------------------------------------------------------------------
! 2954: */
! 2955: float64 float64_rem( float64 a, float64 b )
! 2956: {
! 2957: flag aSign, bSign, zSign;
! 2958: int16 aExp, bExp, expDiff;
! 2959: bits64 aSig, bSig;
! 2960: bits64 q, alternateASig;
! 2961: sbits64 sigMean;
! 2962:
! 2963: aSig = extractFloat64Frac( a );
! 2964: aExp = extractFloat64Exp( a );
! 2965: aSign = extractFloat64Sign( a );
! 2966: bSig = extractFloat64Frac( b );
! 2967: bExp = extractFloat64Exp( b );
! 2968: bSign = extractFloat64Sign( b );
! 2969: if ( aExp == 0x7FF ) {
! 2970: if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
! 2971: return propagateFloat64NaN( a, b );
! 2972: }
! 2973: float_raise( float_flag_invalid );
! 2974: return float64_default_nan;
! 2975: }
! 2976: if ( bExp == 0x7FF ) {
! 2977: if ( bSig ) return propagateFloat64NaN( a, b );
! 2978: return a;
! 2979: }
! 2980: if ( bExp == 0 ) {
! 2981: if ( bSig == 0 ) {
! 2982: float_raise( float_flag_invalid );
! 2983: return float64_default_nan;
! 2984: }
! 2985: normalizeFloat64Subnormal( bSig, &bExp, &bSig );
! 2986: }
! 2987: if ( aExp == 0 ) {
! 2988: if ( aSig == 0 ) return a;
! 2989: normalizeFloat64Subnormal( aSig, &aExp, &aSig );
! 2990: }
! 2991: expDiff = aExp - bExp;
! 2992: aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
! 2993: bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
! 2994: if ( expDiff < 0 ) {
! 2995: if ( expDiff < -1 ) return a;
! 2996: aSig >>= 1;
! 2997: }
! 2998: q = ( bSig <= aSig );
! 2999: if ( q ) aSig -= bSig;
! 3000: expDiff -= 64;
! 3001: while ( 0 < expDiff ) {
! 3002: q = estimateDiv128To64( aSig, 0, bSig );
! 3003: q = ( 2 < q ) ? q - 2 : 0;
! 3004: aSig = - ( ( bSig>>2 ) * q );
! 3005: expDiff -= 62;
! 3006: }
! 3007: expDiff += 64;
! 3008: if ( 0 < expDiff ) {
! 3009: q = estimateDiv128To64( aSig, 0, bSig );
! 3010: q = ( 2 < q ) ? q - 2 : 0;
! 3011: q >>= 64 - expDiff;
! 3012: bSig >>= 2;
! 3013: aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
! 3014: }
! 3015: else {
! 3016: aSig >>= 2;
! 3017: bSig >>= 2;
! 3018: }
! 3019: do {
! 3020: alternateASig = aSig;
! 3021: ++q;
! 3022: aSig -= bSig;
! 3023: } while ( 0 <= (sbits64) aSig );
! 3024: sigMean = aSig + alternateASig;
! 3025: if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
! 3026: aSig = alternateASig;
! 3027: }
! 3028: zSign = ( (sbits64) aSig < 0 );
! 3029: if ( zSign ) aSig = - aSig;
! 3030: return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
! 3031:
! 3032: }
! 3033:
! 3034: /*
! 3035: -------------------------------------------------------------------------------
! 3036: Returns the square root of the double-precision floating-point value `a'.
! 3037: The operation is performed according to the IEC/IEEE Standard for Binary
! 3038: Floating-Point Arithmetic.
! 3039: -------------------------------------------------------------------------------
! 3040: */
! 3041: float64 float64_sqrt( float64 a )
! 3042: {
! 3043: flag aSign;
! 3044: int16 aExp, zExp;
! 3045: bits64 aSig, zSig, doubleZSig;
! 3046: bits64 rem0, rem1, term0, term1;
! 3047:
! 3048: aSig = extractFloat64Frac( a );
! 3049: aExp = extractFloat64Exp( a );
! 3050: aSign = extractFloat64Sign( a );
! 3051: if ( aExp == 0x7FF ) {
! 3052: if ( aSig ) return propagateFloat64NaN( a, a );
! 3053: if ( ! aSign ) return a;
! 3054: float_raise( float_flag_invalid );
! 3055: return float64_default_nan;
! 3056: }
! 3057: if ( aSign ) {
! 3058: if ( ( aExp | aSig ) == 0 ) return a;
! 3059: float_raise( float_flag_invalid );
! 3060: return float64_default_nan;
! 3061: }
! 3062: if ( aExp == 0 ) {
! 3063: if ( aSig == 0 ) return 0;
! 3064: normalizeFloat64Subnormal( aSig, &aExp, &aSig );
! 3065: }
! 3066: zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
! 3067: aSig |= LIT64( 0x0010000000000000 );
! 3068: zSig = estimateSqrt32( aExp, aSig>>21 );
! 3069: aSig <<= 9 - ( aExp & 1 );
! 3070: zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
! 3071: if ( ( zSig & 0x1FF ) <= 5 ) {
! 3072: doubleZSig = zSig<<1;
! 3073: mul64To128( zSig, zSig, &term0, &term1 );
! 3074: sub128( aSig, 0, term0, term1, &rem0, &rem1 );
! 3075: while ( (sbits64) rem0 < 0 ) {
! 3076: --zSig;
! 3077: doubleZSig -= 2;
! 3078: add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
! 3079: }
! 3080: zSig |= ( ( rem0 | rem1 ) != 0 );
! 3081: }
! 3082: return roundAndPackFloat64( 0, zExp, zSig );
! 3083:
! 3084: }
! 3085: #endif
! 3086:
! 3087: /*
! 3088: -------------------------------------------------------------------------------
! 3089: Returns 1 if the double-precision floating-point value `a' is equal to the
! 3090: corresponding value `b', and 0 otherwise. The comparison is performed
! 3091: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 3092: -------------------------------------------------------------------------------
! 3093: */
! 3094: flag float64_eq( float64 a, float64 b )
! 3095: {
! 3096:
! 3097: if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
! 3098: || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
! 3099: ) {
! 3100: if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
! 3101: float_raise( float_flag_invalid );
! 3102: }
! 3103: return 0;
! 3104: }
! 3105: return ( a == b ) ||
! 3106: ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
! 3107:
! 3108: }
! 3109:
! 3110: /*
! 3111: -------------------------------------------------------------------------------
! 3112: Returns 1 if the double-precision floating-point value `a' is less than or
! 3113: equal to the corresponding value `b', and 0 otherwise. The comparison is
! 3114: performed according to the IEC/IEEE Standard for Binary Floating-Point
! 3115: Arithmetic.
! 3116: -------------------------------------------------------------------------------
! 3117: */
! 3118: flag float64_le( float64 a, float64 b )
! 3119: {
! 3120: flag aSign, bSign;
! 3121:
! 3122: if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
! 3123: || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
! 3124: ) {
! 3125: float_raise( float_flag_invalid );
! 3126: return 0;
! 3127: }
! 3128: aSign = extractFloat64Sign( a );
! 3129: bSign = extractFloat64Sign( b );
! 3130: if ( aSign != bSign )
! 3131: return aSign ||
! 3132: ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
! 3133: 0 );
! 3134: return ( a == b ) ||
! 3135: ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
! 3136:
! 3137: }
! 3138:
! 3139: /*
! 3140: -------------------------------------------------------------------------------
! 3141: Returns 1 if the double-precision floating-point value `a' is less than
! 3142: the corresponding value `b', and 0 otherwise. The comparison is performed
! 3143: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 3144: -------------------------------------------------------------------------------
! 3145: */
! 3146: flag float64_lt( float64 a, float64 b )
! 3147: {
! 3148: flag aSign, bSign;
! 3149:
! 3150: if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
! 3151: || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
! 3152: ) {
! 3153: float_raise( float_flag_invalid );
! 3154: return 0;
! 3155: }
! 3156: aSign = extractFloat64Sign( a );
! 3157: bSign = extractFloat64Sign( b );
! 3158: if ( aSign != bSign )
! 3159: return aSign &&
! 3160: ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
! 3161: 0 );
! 3162: return ( a != b ) &&
! 3163: ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
! 3164:
! 3165: }
! 3166:
! 3167: #ifndef SOFTFLOAT_FOR_GCC
! 3168: /*
! 3169: -------------------------------------------------------------------------------
! 3170: Returns 1 if the double-precision floating-point value `a' is equal to the
! 3171: corresponding value `b', and 0 otherwise. The invalid exception is raised
! 3172: if either operand is a NaN. Otherwise, the comparison is performed
! 3173: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 3174: -------------------------------------------------------------------------------
! 3175: */
! 3176: flag float64_eq_signaling( float64 a, float64 b )
! 3177: {
! 3178:
! 3179: if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
! 3180: || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
! 3181: ) {
! 3182: float_raise( float_flag_invalid );
! 3183: return 0;
! 3184: }
! 3185: return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
! 3186:
! 3187: }
! 3188:
! 3189: /*
! 3190: -------------------------------------------------------------------------------
! 3191: Returns 1 if the double-precision floating-point value `a' is less than or
! 3192: equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
! 3193: cause an exception. Otherwise, the comparison is performed according to the
! 3194: IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 3195: -------------------------------------------------------------------------------
! 3196: */
! 3197: flag float64_le_quiet( float64 a, float64 b )
! 3198: {
! 3199: flag aSign, bSign;
! 3200:
! 3201: if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
! 3202: || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
! 3203: ) {
! 3204: if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
! 3205: float_raise( float_flag_invalid );
! 3206: }
! 3207: return 0;
! 3208: }
! 3209: aSign = extractFloat64Sign( a );
! 3210: bSign = extractFloat64Sign( b );
! 3211: if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
! 3212: return ( a == b ) || ( aSign ^ ( a < b ) );
! 3213:
! 3214: }
! 3215:
! 3216: /*
! 3217: -------------------------------------------------------------------------------
! 3218: Returns 1 if the double-precision floating-point value `a' is less than
! 3219: the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
! 3220: exception. Otherwise, the comparison is performed according to the IEC/IEEE
! 3221: Standard for Binary Floating-Point Arithmetic.
! 3222: -------------------------------------------------------------------------------
! 3223: */
! 3224: flag float64_lt_quiet( float64 a, float64 b )
! 3225: {
! 3226: flag aSign, bSign;
! 3227:
! 3228: if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
! 3229: || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
! 3230: ) {
! 3231: if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
! 3232: float_raise( float_flag_invalid );
! 3233: }
! 3234: return 0;
! 3235: }
! 3236: aSign = extractFloat64Sign( a );
! 3237: bSign = extractFloat64Sign( b );
! 3238: if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
! 3239: return ( a != b ) && ( aSign ^ ( a < b ) );
! 3240:
! 3241: }
! 3242: #endif
! 3243:
! 3244: #ifdef FLOATX80
! 3245:
! 3246: /*
! 3247: -------------------------------------------------------------------------------
! 3248: Returns the result of converting the extended double-precision floating-
! 3249: point value `a' to the 32-bit two's complement integer format. The
! 3250: conversion is performed according to the IEC/IEEE Standard for Binary
! 3251: Floating-Point Arithmetic---which means in particular that the conversion
! 3252: is rounded according to the current rounding mode. If `a' is a NaN, the
! 3253: largest positive integer is returned. Otherwise, if the conversion
! 3254: overflows, the largest integer with the same sign as `a' is returned.
! 3255: -------------------------------------------------------------------------------
! 3256: */
! 3257: int32 floatx80_to_int32( floatx80 a )
! 3258: {
! 3259: flag aSign;
! 3260: int32 aExp, shiftCount;
! 3261: bits64 aSig;
! 3262:
! 3263: aSig = extractFloatx80Frac( a );
! 3264: aExp = extractFloatx80Exp( a );
! 3265: aSign = extractFloatx80Sign( a );
! 3266: if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
! 3267: shiftCount = 0x4037 - aExp;
! 3268: if ( shiftCount <= 0 ) shiftCount = 1;
! 3269: shift64RightJamming( aSig, shiftCount, &aSig );
! 3270: return roundAndPackInt32( aSign, aSig );
! 3271:
! 3272: }
! 3273:
! 3274: /*
! 3275: -------------------------------------------------------------------------------
! 3276: Returns the result of converting the extended double-precision floating-
! 3277: point value `a' to the 32-bit two's complement integer format. The
! 3278: conversion is performed according to the IEC/IEEE Standard for Binary
! 3279: Floating-Point Arithmetic, except that the conversion is always rounded
! 3280: toward zero. If `a' is a NaN, the largest positive integer is returned.
! 3281: Otherwise, if the conversion overflows, the largest integer with the same
! 3282: sign as `a' is returned.
! 3283: -------------------------------------------------------------------------------
! 3284: */
! 3285: int32 floatx80_to_int32_round_to_zero( floatx80 a )
! 3286: {
! 3287: flag aSign;
! 3288: int32 aExp, shiftCount;
! 3289: bits64 aSig, savedASig;
! 3290: int32 z;
! 3291:
! 3292: aSig = extractFloatx80Frac( a );
! 3293: aExp = extractFloatx80Exp( a );
! 3294: aSign = extractFloatx80Sign( a );
! 3295: if ( 0x401E < aExp ) {
! 3296: if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
! 3297: goto invalid;
! 3298: }
! 3299: else if ( aExp < 0x3FFF ) {
! 3300: if ( aExp || aSig ) float_set_inexact();
! 3301: return 0;
! 3302: }
! 3303: shiftCount = 0x403E - aExp;
! 3304: savedASig = aSig;
! 3305: aSig >>= shiftCount;
! 3306: z = aSig;
! 3307: if ( aSign ) z = - z;
! 3308: if ( ( z < 0 ) ^ aSign ) {
! 3309: invalid:
! 3310: float_raise( float_flag_invalid );
! 3311: return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
! 3312: }
! 3313: if ( ( aSig<<shiftCount ) != savedASig ) {
! 3314: float_set_inexact();
! 3315: }
! 3316: return z;
! 3317:
! 3318: }
! 3319:
! 3320: /*
! 3321: -------------------------------------------------------------------------------
! 3322: Returns the result of converting the extended double-precision floating-
! 3323: point value `a' to the 64-bit two's complement integer format. The
! 3324: conversion is performed according to the IEC/IEEE Standard for Binary
! 3325: Floating-Point Arithmetic---which means in particular that the conversion
! 3326: is rounded according to the current rounding mode. If `a' is a NaN,
! 3327: the largest positive integer is returned. Otherwise, if the conversion
! 3328: overflows, the largest integer with the same sign as `a' is returned.
! 3329: -------------------------------------------------------------------------------
! 3330: */
! 3331: int64 floatx80_to_int64( floatx80 a )
! 3332: {
! 3333: flag aSign;
! 3334: int32 aExp, shiftCount;
! 3335: bits64 aSig, aSigExtra;
! 3336:
! 3337: aSig = extractFloatx80Frac( a );
! 3338: aExp = extractFloatx80Exp( a );
! 3339: aSign = extractFloatx80Sign( a );
! 3340: shiftCount = 0x403E - aExp;
! 3341: if ( shiftCount <= 0 ) {
! 3342: if ( shiftCount ) {
! 3343: float_raise( float_flag_invalid );
! 3344: if ( ! aSign
! 3345: || ( ( aExp == 0x7FFF )
! 3346: && ( aSig != LIT64( 0x8000000000000000 ) ) )
! 3347: ) {
! 3348: return LIT64( 0x7FFFFFFFFFFFFFFF );
! 3349: }
! 3350: return (sbits64) LIT64( 0x8000000000000000 );
! 3351: }
! 3352: aSigExtra = 0;
! 3353: }
! 3354: else {
! 3355: shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
! 3356: }
! 3357: return roundAndPackInt64( aSign, aSig, aSigExtra );
! 3358:
! 3359: }
! 3360:
! 3361: /*
! 3362: -------------------------------------------------------------------------------
! 3363: Returns the result of converting the extended double-precision floating-
! 3364: point value `a' to the 64-bit two's complement integer format. The
! 3365: conversion is performed according to the IEC/IEEE Standard for Binary
! 3366: Floating-Point Arithmetic, except that the conversion is always rounded
! 3367: toward zero. If `a' is a NaN, the largest positive integer is returned.
! 3368: Otherwise, if the conversion overflows, the largest integer with the same
! 3369: sign as `a' is returned.
! 3370: -------------------------------------------------------------------------------
! 3371: */
! 3372: int64 floatx80_to_int64_round_to_zero( floatx80 a )
! 3373: {
! 3374: flag aSign;
! 3375: int32 aExp, shiftCount;
! 3376: bits64 aSig;
! 3377: int64 z;
! 3378:
! 3379: aSig = extractFloatx80Frac( a );
! 3380: aExp = extractFloatx80Exp( a );
! 3381: aSign = extractFloatx80Sign( a );
! 3382: shiftCount = aExp - 0x403E;
! 3383: if ( 0 <= shiftCount ) {
! 3384: aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
! 3385: if ( ( a.high != 0xC03E ) || aSig ) {
! 3386: float_raise( float_flag_invalid );
! 3387: if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
! 3388: return LIT64( 0x7FFFFFFFFFFFFFFF );
! 3389: }
! 3390: }
! 3391: return (sbits64) LIT64( 0x8000000000000000 );
! 3392: }
! 3393: else if ( aExp < 0x3FFF ) {
! 3394: if ( aExp | aSig ) float_set_inexact();
! 3395: return 0;
! 3396: }
! 3397: z = aSig>>( - shiftCount );
! 3398: if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
! 3399: float_set_inexact();
! 3400: }
! 3401: if ( aSign ) z = - z;
! 3402: return z;
! 3403:
! 3404: }
! 3405:
! 3406: /*
! 3407: -------------------------------------------------------------------------------
! 3408: Returns the result of converting the extended double-precision floating-
! 3409: point value `a' to the single-precision floating-point format. The
! 3410: conversion is performed according to the IEC/IEEE Standard for Binary
! 3411: Floating-Point Arithmetic.
! 3412: -------------------------------------------------------------------------------
! 3413: */
! 3414: float32 floatx80_to_float32( floatx80 a )
! 3415: {
! 3416: flag aSign;
! 3417: int32 aExp;
! 3418: bits64 aSig;
! 3419:
! 3420: aSig = extractFloatx80Frac( a );
! 3421: aExp = extractFloatx80Exp( a );
! 3422: aSign = extractFloatx80Sign( a );
! 3423: if ( aExp == 0x7FFF ) {
! 3424: if ( (bits64) ( aSig<<1 ) ) {
! 3425: return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
! 3426: }
! 3427: return packFloat32( aSign, 0xFF, 0 );
! 3428: }
! 3429: shift64RightJamming( aSig, 33, &aSig );
! 3430: if ( aExp || aSig ) aExp -= 0x3F81;
! 3431: return roundAndPackFloat32( aSign, aExp, aSig );
! 3432:
! 3433: }
! 3434:
! 3435: /*
! 3436: -------------------------------------------------------------------------------
! 3437: Returns the result of converting the extended double-precision floating-
! 3438: point value `a' to the double-precision floating-point format. The
! 3439: conversion is performed according to the IEC/IEEE Standard for Binary
! 3440: Floating-Point Arithmetic.
! 3441: -------------------------------------------------------------------------------
! 3442: */
! 3443: float64 floatx80_to_float64( floatx80 a )
! 3444: {
! 3445: flag aSign;
! 3446: int32 aExp;
! 3447: bits64 aSig, zSig;
! 3448:
! 3449: aSig = extractFloatx80Frac( a );
! 3450: aExp = extractFloatx80Exp( a );
! 3451: aSign = extractFloatx80Sign( a );
! 3452: if ( aExp == 0x7FFF ) {
! 3453: if ( (bits64) ( aSig<<1 ) ) {
! 3454: return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
! 3455: }
! 3456: return packFloat64( aSign, 0x7FF, 0 );
! 3457: }
! 3458: shift64RightJamming( aSig, 1, &zSig );
! 3459: if ( aExp || aSig ) aExp -= 0x3C01;
! 3460: return roundAndPackFloat64( aSign, aExp, zSig );
! 3461:
! 3462: }
! 3463:
! 3464: #ifdef FLOAT128
! 3465:
! 3466: /*
! 3467: -------------------------------------------------------------------------------
! 3468: Returns the result of converting the extended double-precision floating-
! 3469: point value `a' to the quadruple-precision floating-point format. The
! 3470: conversion is performed according to the IEC/IEEE Standard for Binary
! 3471: Floating-Point Arithmetic.
! 3472: -------------------------------------------------------------------------------
! 3473: */
! 3474: float128 floatx80_to_float128( floatx80 a )
! 3475: {
! 3476: flag aSign;
! 3477: int16 aExp;
! 3478: bits64 aSig, zSig0, zSig1;
! 3479:
! 3480: aSig = extractFloatx80Frac( a );
! 3481: aExp = extractFloatx80Exp( a );
! 3482: aSign = extractFloatx80Sign( a );
! 3483: if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
! 3484: return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
! 3485: }
! 3486: shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
! 3487: return packFloat128( aSign, aExp, zSig0, zSig1 );
! 3488:
! 3489: }
! 3490:
! 3491: #endif
! 3492:
! 3493: /*
! 3494: -------------------------------------------------------------------------------
! 3495: Rounds the extended double-precision floating-point value `a' to an integer,
! 3496: and returns the result as an extended quadruple-precision floating-point
! 3497: value. The operation is performed according to the IEC/IEEE Standard for
! 3498: Binary Floating-Point Arithmetic.
! 3499: -------------------------------------------------------------------------------
! 3500: */
! 3501: floatx80 floatx80_round_to_int( floatx80 a )
! 3502: {
! 3503: flag aSign;
! 3504: int32 aExp;
! 3505: bits64 lastBitMask, roundBitsMask;
! 3506: int8 roundingMode;
! 3507: floatx80 z;
! 3508:
! 3509: aExp = extractFloatx80Exp( a );
! 3510: if ( 0x403E <= aExp ) {
! 3511: if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
! 3512: return propagateFloatx80NaN( a, a );
! 3513: }
! 3514: return a;
! 3515: }
! 3516: if ( aExp < 0x3FFF ) {
! 3517: if ( ( aExp == 0 )
! 3518: && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
! 3519: return a;
! 3520: }
! 3521: float_set_inexact();
! 3522: aSign = extractFloatx80Sign( a );
! 3523: switch ( float_rounding_mode() ) {
! 3524: case float_round_nearest_even:
! 3525: if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
! 3526: ) {
! 3527: return
! 3528: packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
! 3529: }
! 3530: break;
! 3531: case float_round_down:
! 3532: return
! 3533: aSign ?
! 3534: packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
! 3535: : packFloatx80( 0, 0, 0 );
! 3536: case float_round_up:
! 3537: return
! 3538: aSign ? packFloatx80( 1, 0, 0 )
! 3539: : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
! 3540: }
! 3541: return packFloatx80( aSign, 0, 0 );
! 3542: }
! 3543: lastBitMask = 1;
! 3544: lastBitMask <<= 0x403E - aExp;
! 3545: roundBitsMask = lastBitMask - 1;
! 3546: z = a;
! 3547: roundingMode = float_rounding_mode();
! 3548: if ( roundingMode == float_round_nearest_even ) {
! 3549: z.low += lastBitMask>>1;
! 3550: if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
! 3551: }
! 3552: else if ( roundingMode != float_round_to_zero ) {
! 3553: if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
! 3554: z.low += roundBitsMask;
! 3555: }
! 3556: }
! 3557: z.low &= ~ roundBitsMask;
! 3558: if ( z.low == 0 ) {
! 3559: ++z.high;
! 3560: z.low = LIT64( 0x8000000000000000 );
! 3561: }
! 3562: if ( z.low != a.low ) float_set_inexact();
! 3563: return z;
! 3564:
! 3565: }
! 3566:
! 3567: /*
! 3568: -------------------------------------------------------------------------------
! 3569: Returns the result of adding the absolute values of the extended double-
! 3570: precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
! 3571: negated before being returned. `zSign' is ignored if the result is a NaN.
! 3572: The addition is performed according to the IEC/IEEE Standard for Binary
! 3573: Floating-Point Arithmetic.
! 3574: -------------------------------------------------------------------------------
! 3575: */
! 3576: static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
! 3577: {
! 3578: int32 aExp, bExp, zExp;
! 3579: bits64 aSig, bSig, zSig0, zSig1;
! 3580: int32 expDiff;
! 3581:
! 3582: aSig = extractFloatx80Frac( a );
! 3583: aExp = extractFloatx80Exp( a );
! 3584: bSig = extractFloatx80Frac( b );
! 3585: bExp = extractFloatx80Exp( b );
! 3586: expDiff = aExp - bExp;
! 3587: if ( 0 < expDiff ) {
! 3588: if ( aExp == 0x7FFF ) {
! 3589: if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
! 3590: return a;
! 3591: }
! 3592: if ( bExp == 0 ) --expDiff;
! 3593: shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
! 3594: zExp = aExp;
! 3595: }
! 3596: else if ( expDiff < 0 ) {
! 3597: if ( bExp == 0x7FFF ) {
! 3598: if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
! 3599: return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
! 3600: }
! 3601: if ( aExp == 0 ) ++expDiff;
! 3602: shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
! 3603: zExp = bExp;
! 3604: }
! 3605: else {
! 3606: if ( aExp == 0x7FFF ) {
! 3607: if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
! 3608: return propagateFloatx80NaN( a, b );
! 3609: }
! 3610: return a;
! 3611: }
! 3612: zSig1 = 0;
! 3613: zSig0 = aSig + bSig;
! 3614: if ( aExp == 0 ) {
! 3615: normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
! 3616: goto roundAndPack;
! 3617: }
! 3618: zExp = aExp;
! 3619: goto shiftRight1;
! 3620: }
! 3621: zSig0 = aSig + bSig;
! 3622: if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
! 3623: shiftRight1:
! 3624: shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
! 3625: zSig0 |= LIT64( 0x8000000000000000 );
! 3626: ++zExp;
! 3627: roundAndPack:
! 3628: return
! 3629: roundAndPackFloatx80(
! 3630: floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
! 3631:
! 3632: }
! 3633:
! 3634: /*
! 3635: -------------------------------------------------------------------------------
! 3636: Returns the result of subtracting the absolute values of the extended
! 3637: double-precision floating-point values `a' and `b'. If `zSign' is 1, the
! 3638: difference is negated before being returned. `zSign' is ignored if the
! 3639: result is a NaN. The subtraction is performed according to the IEC/IEEE
! 3640: Standard for Binary Floating-Point Arithmetic.
! 3641: -------------------------------------------------------------------------------
! 3642: */
! 3643: static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
! 3644: {
! 3645: int32 aExp, bExp, zExp;
! 3646: bits64 aSig, bSig, zSig0, zSig1;
! 3647: int32 expDiff;
! 3648: floatx80 z;
! 3649:
! 3650: aSig = extractFloatx80Frac( a );
! 3651: aExp = extractFloatx80Exp( a );
! 3652: bSig = extractFloatx80Frac( b );
! 3653: bExp = extractFloatx80Exp( b );
! 3654: expDiff = aExp - bExp;
! 3655: if ( 0 < expDiff ) goto aExpBigger;
! 3656: if ( expDiff < 0 ) goto bExpBigger;
! 3657: if ( aExp == 0x7FFF ) {
! 3658: if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
! 3659: return propagateFloatx80NaN( a, b );
! 3660: }
! 3661: float_raise( float_flag_invalid );
! 3662: z.low = floatx80_default_nan_low;
! 3663: z.high = floatx80_default_nan_high;
! 3664: return z;
! 3665: }
! 3666: if ( aExp == 0 ) {
! 3667: aExp = 1;
! 3668: bExp = 1;
! 3669: }
! 3670: zSig1 = 0;
! 3671: if ( bSig < aSig ) goto aBigger;
! 3672: if ( aSig < bSig ) goto bBigger;
! 3673: return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 );
! 3674: bExpBigger:
! 3675: if ( bExp == 0x7FFF ) {
! 3676: if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
! 3677: return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
! 3678: }
! 3679: if ( aExp == 0 ) ++expDiff;
! 3680: shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
! 3681: bBigger:
! 3682: sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
! 3683: zExp = bExp;
! 3684: zSign ^= 1;
! 3685: goto normalizeRoundAndPack;
! 3686: aExpBigger:
! 3687: if ( aExp == 0x7FFF ) {
! 3688: if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
! 3689: return a;
! 3690: }
! 3691: if ( bExp == 0 ) --expDiff;
! 3692: shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
! 3693: aBigger:
! 3694: sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
! 3695: zExp = aExp;
! 3696: normalizeRoundAndPack:
! 3697: return
! 3698: normalizeRoundAndPackFloatx80(
! 3699: floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
! 3700:
! 3701: }
! 3702:
! 3703: /*
! 3704: -------------------------------------------------------------------------------
! 3705: Returns the result of adding the extended double-precision floating-point
! 3706: values `a' and `b'. The operation is performed according to the IEC/IEEE
! 3707: Standard for Binary Floating-Point Arithmetic.
! 3708: -------------------------------------------------------------------------------
! 3709: */
! 3710: floatx80 floatx80_add( floatx80 a, floatx80 b )
! 3711: {
! 3712: flag aSign, bSign;
! 3713:
! 3714: aSign = extractFloatx80Sign( a );
! 3715: bSign = extractFloatx80Sign( b );
! 3716: if ( aSign == bSign ) {
! 3717: return addFloatx80Sigs( a, b, aSign );
! 3718: }
! 3719: else {
! 3720: return subFloatx80Sigs( a, b, aSign );
! 3721: }
! 3722:
! 3723: }
! 3724:
! 3725: /*
! 3726: -------------------------------------------------------------------------------
! 3727: Returns the result of subtracting the extended double-precision floating-
! 3728: point values `a' and `b'. The operation is performed according to the
! 3729: IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 3730: -------------------------------------------------------------------------------
! 3731: */
! 3732: floatx80 floatx80_sub( floatx80 a, floatx80 b )
! 3733: {
! 3734: flag aSign, bSign;
! 3735:
! 3736: aSign = extractFloatx80Sign( a );
! 3737: bSign = extractFloatx80Sign( b );
! 3738: if ( aSign == bSign ) {
! 3739: return subFloatx80Sigs( a, b, aSign );
! 3740: }
! 3741: else {
! 3742: return addFloatx80Sigs( a, b, aSign );
! 3743: }
! 3744:
! 3745: }
! 3746:
! 3747: /*
! 3748: -------------------------------------------------------------------------------
! 3749: Returns the result of multiplying the extended double-precision floating-
! 3750: point values `a' and `b'. The operation is performed according to the
! 3751: IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 3752: -------------------------------------------------------------------------------
! 3753: */
! 3754: floatx80 floatx80_mul( floatx80 a, floatx80 b )
! 3755: {
! 3756: flag aSign, bSign, zSign;
! 3757: int32 aExp, bExp, zExp;
! 3758: bits64 aSig, bSig, zSig0, zSig1;
! 3759: floatx80 z;
! 3760:
! 3761: aSig = extractFloatx80Frac( a );
! 3762: aExp = extractFloatx80Exp( a );
! 3763: aSign = extractFloatx80Sign( a );
! 3764: bSig = extractFloatx80Frac( b );
! 3765: bExp = extractFloatx80Exp( b );
! 3766: bSign = extractFloatx80Sign( b );
! 3767: zSign = aSign ^ bSign;
! 3768: if ( aExp == 0x7FFF ) {
! 3769: if ( (bits64) ( aSig<<1 )
! 3770: || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
! 3771: return propagateFloatx80NaN( a, b );
! 3772: }
! 3773: if ( ( bExp | bSig ) == 0 ) goto invalid;
! 3774: return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
! 3775: }
! 3776: if ( bExp == 0x7FFF ) {
! 3777: if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
! 3778: if ( ( aExp | aSig ) == 0 ) {
! 3779: invalid:
! 3780: float_raise( float_flag_invalid );
! 3781: z.low = floatx80_default_nan_low;
! 3782: z.high = floatx80_default_nan_high;
! 3783: return z;
! 3784: }
! 3785: return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
! 3786: }
! 3787: if ( aExp == 0 ) {
! 3788: if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
! 3789: normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
! 3790: }
! 3791: if ( bExp == 0 ) {
! 3792: if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
! 3793: normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
! 3794: }
! 3795: zExp = aExp + bExp - 0x3FFE;
! 3796: mul64To128( aSig, bSig, &zSig0, &zSig1 );
! 3797: if ( 0 < (sbits64) zSig0 ) {
! 3798: shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
! 3799: --zExp;
! 3800: }
! 3801: return
! 3802: roundAndPackFloatx80(
! 3803: floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
! 3804:
! 3805: }
! 3806:
! 3807: /*
! 3808: -------------------------------------------------------------------------------
! 3809: Returns the result of dividing the extended double-precision floating-point
! 3810: value `a' by the corresponding value `b'. The operation is performed
! 3811: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 3812: -------------------------------------------------------------------------------
! 3813: */
! 3814: floatx80 floatx80_div( floatx80 a, floatx80 b )
! 3815: {
! 3816: flag aSign, bSign, zSign;
! 3817: int32 aExp, bExp, zExp;
! 3818: bits64 aSig, bSig, zSig0, zSig1;
! 3819: bits64 rem0, rem1, rem2, term0, term1, term2;
! 3820: floatx80 z;
! 3821:
! 3822: aSig = extractFloatx80Frac( a );
! 3823: aExp = extractFloatx80Exp( a );
! 3824: aSign = extractFloatx80Sign( a );
! 3825: bSig = extractFloatx80Frac( b );
! 3826: bExp = extractFloatx80Exp( b );
! 3827: bSign = extractFloatx80Sign( b );
! 3828: zSign = aSign ^ bSign;
! 3829: if ( aExp == 0x7FFF ) {
! 3830: if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
! 3831: if ( bExp == 0x7FFF ) {
! 3832: if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
! 3833: goto invalid;
! 3834: }
! 3835: return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
! 3836: }
! 3837: if ( bExp == 0x7FFF ) {
! 3838: if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
! 3839: return packFloatx80( zSign, 0, 0 );
! 3840: }
! 3841: if ( bExp == 0 ) {
! 3842: if ( bSig == 0 ) {
! 3843: if ( ( aExp | aSig ) == 0 ) {
! 3844: invalid:
! 3845: float_raise( float_flag_invalid );
! 3846: z.low = floatx80_default_nan_low;
! 3847: z.high = floatx80_default_nan_high;
! 3848: return z;
! 3849: }
! 3850: float_raise( float_flag_divbyzero );
! 3851: return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
! 3852: }
! 3853: normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
! 3854: }
! 3855: if ( aExp == 0 ) {
! 3856: if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
! 3857: normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
! 3858: }
! 3859: zExp = aExp - bExp + 0x3FFE;
! 3860: rem1 = 0;
! 3861: if ( bSig <= aSig ) {
! 3862: shift128Right( aSig, 0, 1, &aSig, &rem1 );
! 3863: ++zExp;
! 3864: }
! 3865: zSig0 = estimateDiv128To64( aSig, rem1, bSig );
! 3866: mul64To128( bSig, zSig0, &term0, &term1 );
! 3867: sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
! 3868: while ( (sbits64) rem0 < 0 ) {
! 3869: --zSig0;
! 3870: add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
! 3871: }
! 3872: zSig1 = estimateDiv128To64( rem1, 0, bSig );
! 3873: if ( (bits64) ( zSig1<<1 ) <= 8 ) {
! 3874: mul64To128( bSig, zSig1, &term1, &term2 );
! 3875: sub128( rem1, 0, term1, term2, &rem1, &rem2 );
! 3876: while ( (sbits64) rem1 < 0 ) {
! 3877: --zSig1;
! 3878: add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
! 3879: }
! 3880: zSig1 |= ( ( rem1 | rem2 ) != 0 );
! 3881: }
! 3882: return
! 3883: roundAndPackFloatx80(
! 3884: floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
! 3885:
! 3886: }
! 3887:
! 3888: /*
! 3889: -------------------------------------------------------------------------------
! 3890: Returns the remainder of the extended double-precision floating-point value
! 3891: `a' with respect to the corresponding value `b'. The operation is performed
! 3892: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 3893: -------------------------------------------------------------------------------
! 3894: */
! 3895: floatx80 floatx80_rem( floatx80 a, floatx80 b )
! 3896: {
! 3897: flag aSign, bSign, zSign;
! 3898: int32 aExp, bExp, expDiff;
! 3899: bits64 aSig0, aSig1, bSig;
! 3900: bits64 q, term0, term1, alternateASig0, alternateASig1;
! 3901: floatx80 z;
! 3902:
! 3903: aSig0 = extractFloatx80Frac( a );
! 3904: aExp = extractFloatx80Exp( a );
! 3905: aSign = extractFloatx80Sign( a );
! 3906: bSig = extractFloatx80Frac( b );
! 3907: bExp = extractFloatx80Exp( b );
! 3908: bSign = extractFloatx80Sign( b );
! 3909: if ( aExp == 0x7FFF ) {
! 3910: if ( (bits64) ( aSig0<<1 )
! 3911: || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
! 3912: return propagateFloatx80NaN( a, b );
! 3913: }
! 3914: goto invalid;
! 3915: }
! 3916: if ( bExp == 0x7FFF ) {
! 3917: if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
! 3918: return a;
! 3919: }
! 3920: if ( bExp == 0 ) {
! 3921: if ( bSig == 0 ) {
! 3922: invalid:
! 3923: float_raise( float_flag_invalid );
! 3924: z.low = floatx80_default_nan_low;
! 3925: z.high = floatx80_default_nan_high;
! 3926: return z;
! 3927: }
! 3928: normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
! 3929: }
! 3930: if ( aExp == 0 ) {
! 3931: if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
! 3932: normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
! 3933: }
! 3934: bSig |= LIT64( 0x8000000000000000 );
! 3935: zSign = aSign;
! 3936: expDiff = aExp - bExp;
! 3937: aSig1 = 0;
! 3938: if ( expDiff < 0 ) {
! 3939: if ( expDiff < -1 ) return a;
! 3940: shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
! 3941: expDiff = 0;
! 3942: }
! 3943: q = ( bSig <= aSig0 );
! 3944: if ( q ) aSig0 -= bSig;
! 3945: expDiff -= 64;
! 3946: while ( 0 < expDiff ) {
! 3947: q = estimateDiv128To64( aSig0, aSig1, bSig );
! 3948: q = ( 2 < q ) ? q - 2 : 0;
! 3949: mul64To128( bSig, q, &term0, &term1 );
! 3950: sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
! 3951: shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
! 3952: expDiff -= 62;
! 3953: }
! 3954: expDiff += 64;
! 3955: if ( 0 < expDiff ) {
! 3956: q = estimateDiv128To64( aSig0, aSig1, bSig );
! 3957: q = ( 2 < q ) ? q - 2 : 0;
! 3958: q >>= 64 - expDiff;
! 3959: mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
! 3960: sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
! 3961: shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
! 3962: while ( le128( term0, term1, aSig0, aSig1 ) ) {
! 3963: ++q;
! 3964: sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
! 3965: }
! 3966: }
! 3967: else {
! 3968: term1 = 0;
! 3969: term0 = bSig;
! 3970: }
! 3971: sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
! 3972: if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
! 3973: || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
! 3974: && ( q & 1 ) )
! 3975: ) {
! 3976: aSig0 = alternateASig0;
! 3977: aSig1 = alternateASig1;
! 3978: zSign = ! zSign;
! 3979: }
! 3980: return
! 3981: normalizeRoundAndPackFloatx80(
! 3982: 80, zSign, bExp + expDiff, aSig0, aSig1 );
! 3983:
! 3984: }
! 3985:
! 3986: /*
! 3987: -------------------------------------------------------------------------------
! 3988: Returns the square root of the extended double-precision floating-point
! 3989: value `a'. The operation is performed according to the IEC/IEEE Standard
! 3990: for Binary Floating-Point Arithmetic.
! 3991: -------------------------------------------------------------------------------
! 3992: */
! 3993: floatx80 floatx80_sqrt( floatx80 a )
! 3994: {
! 3995: flag aSign;
! 3996: int32 aExp, zExp;
! 3997: bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
! 3998: bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
! 3999: floatx80 z;
! 4000:
! 4001: aSig0 = extractFloatx80Frac( a );
! 4002: aExp = extractFloatx80Exp( a );
! 4003: aSign = extractFloatx80Sign( a );
! 4004: if ( aExp == 0x7FFF ) {
! 4005: if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
! 4006: if ( ! aSign ) return a;
! 4007: goto invalid;
! 4008: }
! 4009: if ( aSign ) {
! 4010: if ( ( aExp | aSig0 ) == 0 ) return a;
! 4011: invalid:
! 4012: float_raise( float_flag_invalid );
! 4013: z.low = floatx80_default_nan_low;
! 4014: z.high = floatx80_default_nan_high;
! 4015: return z;
! 4016: }
! 4017: if ( aExp == 0 ) {
! 4018: if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
! 4019: normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
! 4020: }
! 4021: zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
! 4022: zSig0 = estimateSqrt32( aExp, aSig0>>32 );
! 4023: shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
! 4024: zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
! 4025: doubleZSig0 = zSig0<<1;
! 4026: mul64To128( zSig0, zSig0, &term0, &term1 );
! 4027: sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
! 4028: while ( (sbits64) rem0 < 0 ) {
! 4029: --zSig0;
! 4030: doubleZSig0 -= 2;
! 4031: add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
! 4032: }
! 4033: zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
! 4034: if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
! 4035: if ( zSig1 == 0 ) zSig1 = 1;
! 4036: mul64To128( doubleZSig0, zSig1, &term1, &term2 );
! 4037: sub128( rem1, 0, term1, term2, &rem1, &rem2 );
! 4038: mul64To128( zSig1, zSig1, &term2, &term3 );
! 4039: sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
! 4040: while ( (sbits64) rem1 < 0 ) {
! 4041: --zSig1;
! 4042: shortShift128Left( 0, zSig1, 1, &term2, &term3 );
! 4043: term3 |= 1;
! 4044: term2 |= doubleZSig0;
! 4045: add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
! 4046: }
! 4047: zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
! 4048: }
! 4049: shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
! 4050: zSig0 |= doubleZSig0;
! 4051: return
! 4052: roundAndPackFloatx80(
! 4053: floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
! 4054:
! 4055: }
! 4056:
! 4057: /*
! 4058: -------------------------------------------------------------------------------
! 4059: Returns 1 if the extended double-precision floating-point value `a' is
! 4060: equal to the corresponding value `b', and 0 otherwise. The comparison is
! 4061: performed according to the IEC/IEEE Standard for Binary Floating-Point
! 4062: Arithmetic.
! 4063: -------------------------------------------------------------------------------
! 4064: */
! 4065: flag floatx80_eq( floatx80 a, floatx80 b )
! 4066: {
! 4067:
! 4068: if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
! 4069: && (bits64) ( extractFloatx80Frac( a )<<1 ) )
! 4070: || ( ( extractFloatx80Exp( b ) == 0x7FFF )
! 4071: && (bits64) ( extractFloatx80Frac( b )<<1 ) )
! 4072: ) {
! 4073: if ( floatx80_is_signaling_nan( a )
! 4074: || floatx80_is_signaling_nan( b ) ) {
! 4075: float_raise( float_flag_invalid );
! 4076: }
! 4077: return 0;
! 4078: }
! 4079: return
! 4080: ( a.low == b.low )
! 4081: && ( ( a.high == b.high )
! 4082: || ( ( a.low == 0 )
! 4083: && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
! 4084: );
! 4085:
! 4086: }
! 4087:
! 4088: /*
! 4089: -------------------------------------------------------------------------------
! 4090: Returns 1 if the extended double-precision floating-point value `a' is
! 4091: less than or equal to the corresponding value `b', and 0 otherwise. The
! 4092: comparison is performed according to the IEC/IEEE Standard for Binary
! 4093: Floating-Point Arithmetic.
! 4094: -------------------------------------------------------------------------------
! 4095: */
! 4096: flag floatx80_le( floatx80 a, floatx80 b )
! 4097: {
! 4098: flag aSign, bSign;
! 4099:
! 4100: if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
! 4101: && (bits64) ( extractFloatx80Frac( a )<<1 ) )
! 4102: || ( ( extractFloatx80Exp( b ) == 0x7FFF )
! 4103: && (bits64) ( extractFloatx80Frac( b )<<1 ) )
! 4104: ) {
! 4105: float_raise( float_flag_invalid );
! 4106: return 0;
! 4107: }
! 4108: aSign = extractFloatx80Sign( a );
! 4109: bSign = extractFloatx80Sign( b );
! 4110: if ( aSign != bSign ) {
! 4111: return
! 4112: aSign
! 4113: || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
! 4114: == 0 );
! 4115: }
! 4116: return
! 4117: aSign ? le128( b.high, b.low, a.high, a.low )
! 4118: : le128( a.high, a.low, b.high, b.low );
! 4119:
! 4120: }
! 4121:
! 4122: /*
! 4123: -------------------------------------------------------------------------------
! 4124: Returns 1 if the extended double-precision floating-point value `a' is
! 4125: less than the corresponding value `b', and 0 otherwise. The comparison
! 4126: is performed according to the IEC/IEEE Standard for Binary Floating-Point
! 4127: Arithmetic.
! 4128: -------------------------------------------------------------------------------
! 4129: */
! 4130: flag floatx80_lt( floatx80 a, floatx80 b )
! 4131: {
! 4132: flag aSign, bSign;
! 4133:
! 4134: if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
! 4135: && (bits64) ( extractFloatx80Frac( a )<<1 ) )
! 4136: || ( ( extractFloatx80Exp( b ) == 0x7FFF )
! 4137: && (bits64) ( extractFloatx80Frac( b )<<1 ) )
! 4138: ) {
! 4139: float_raise( float_flag_invalid );
! 4140: return 0;
! 4141: }
! 4142: aSign = extractFloatx80Sign( a );
! 4143: bSign = extractFloatx80Sign( b );
! 4144: if ( aSign != bSign ) {
! 4145: return
! 4146: aSign
! 4147: && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
! 4148: != 0 );
! 4149: }
! 4150: return
! 4151: aSign ? lt128( b.high, b.low, a.high, a.low )
! 4152: : lt128( a.high, a.low, b.high, b.low );
! 4153:
! 4154: }
! 4155:
! 4156: /*
! 4157: -------------------------------------------------------------------------------
! 4158: Returns 1 if the extended double-precision floating-point value `a' is equal
! 4159: to the corresponding value `b', and 0 otherwise. The invalid exception is
! 4160: raised if either operand is a NaN. Otherwise, the comparison is performed
! 4161: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 4162: -------------------------------------------------------------------------------
! 4163: */
! 4164: flag floatx80_eq_signaling( floatx80 a, floatx80 b )
! 4165: {
! 4166:
! 4167: if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
! 4168: && (bits64) ( extractFloatx80Frac( a )<<1 ) )
! 4169: || ( ( extractFloatx80Exp( b ) == 0x7FFF )
! 4170: && (bits64) ( extractFloatx80Frac( b )<<1 ) )
! 4171: ) {
! 4172: float_raise( float_flag_invalid );
! 4173: return 0;
! 4174: }
! 4175: return
! 4176: ( a.low == b.low )
! 4177: && ( ( a.high == b.high )
! 4178: || ( ( a.low == 0 )
! 4179: && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
! 4180: );
! 4181:
! 4182: }
! 4183:
! 4184: /*
! 4185: -------------------------------------------------------------------------------
! 4186: Returns 1 if the extended double-precision floating-point value `a' is less
! 4187: than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
! 4188: do not cause an exception. Otherwise, the comparison is performed according
! 4189: to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 4190: -------------------------------------------------------------------------------
! 4191: */
! 4192: flag floatx80_le_quiet( floatx80 a, floatx80 b )
! 4193: {
! 4194: flag aSign, bSign;
! 4195:
! 4196: if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
! 4197: && (bits64) ( extractFloatx80Frac( a )<<1 ) )
! 4198: || ( ( extractFloatx80Exp( b ) == 0x7FFF )
! 4199: && (bits64) ( extractFloatx80Frac( b )<<1 ) )
! 4200: ) {
! 4201: if ( floatx80_is_signaling_nan( a )
! 4202: || floatx80_is_signaling_nan( b ) ) {
! 4203: float_raise( float_flag_invalid );
! 4204: }
! 4205: return 0;
! 4206: }
! 4207: aSign = extractFloatx80Sign( a );
! 4208: bSign = extractFloatx80Sign( b );
! 4209: if ( aSign != bSign ) {
! 4210: return
! 4211: aSign
! 4212: || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
! 4213: == 0 );
! 4214: }
! 4215: return
! 4216: aSign ? le128( b.high, b.low, a.high, a.low )
! 4217: : le128( a.high, a.low, b.high, b.low );
! 4218:
! 4219: }
! 4220:
! 4221: /*
! 4222: -------------------------------------------------------------------------------
! 4223: Returns 1 if the extended double-precision floating-point value `a' is less
! 4224: than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
! 4225: an exception. Otherwise, the comparison is performed according to the
! 4226: IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 4227: -------------------------------------------------------------------------------
! 4228: */
! 4229: flag floatx80_lt_quiet( floatx80 a, floatx80 b )
! 4230: {
! 4231: flag aSign, bSign;
! 4232:
! 4233: if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
! 4234: && (bits64) ( extractFloatx80Frac( a )<<1 ) )
! 4235: || ( ( extractFloatx80Exp( b ) == 0x7FFF )
! 4236: && (bits64) ( extractFloatx80Frac( b )<<1 ) )
! 4237: ) {
! 4238: if ( floatx80_is_signaling_nan( a )
! 4239: || floatx80_is_signaling_nan( b ) ) {
! 4240: float_raise( float_flag_invalid );
! 4241: }
! 4242: return 0;
! 4243: }
! 4244: aSign = extractFloatx80Sign( a );
! 4245: bSign = extractFloatx80Sign( b );
! 4246: if ( aSign != bSign ) {
! 4247: return
! 4248: aSign
! 4249: && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
! 4250: != 0 );
! 4251: }
! 4252: return
! 4253: aSign ? lt128( b.high, b.low, a.high, a.low )
! 4254: : lt128( a.high, a.low, b.high, b.low );
! 4255:
! 4256: }
! 4257:
! 4258: #endif
! 4259:
! 4260: #ifdef FLOAT128
! 4261:
! 4262: /*
! 4263: -------------------------------------------------------------------------------
! 4264: Returns the result of converting the quadruple-precision floating-point
! 4265: value `a' to the 32-bit two's complement integer format. The conversion
! 4266: is performed according to the IEC/IEEE Standard for Binary Floating-Point
! 4267: Arithmetic---which means in particular that the conversion is rounded
! 4268: according to the current rounding mode. If `a' is a NaN, the largest
! 4269: positive integer is returned. Otherwise, if the conversion overflows, the
! 4270: largest integer with the same sign as `a' is returned.
! 4271: -------------------------------------------------------------------------------
! 4272: */
! 4273: int32 float128_to_int32( float128 a )
! 4274: {
! 4275: flag aSign;
! 4276: int32 aExp, shiftCount;
! 4277: bits64 aSig0, aSig1;
! 4278:
! 4279: aSig1 = extractFloat128Frac1( a );
! 4280: aSig0 = extractFloat128Frac0( a );
! 4281: aExp = extractFloat128Exp( a );
! 4282: aSign = extractFloat128Sign( a );
! 4283: if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
! 4284: if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
! 4285: aSig0 |= ( aSig1 != 0 );
! 4286: shiftCount = 0x4028 - aExp;
! 4287: if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
! 4288: return roundAndPackInt32( aSign, aSig0 );
! 4289:
! 4290: }
! 4291:
! 4292: /*
! 4293: -------------------------------------------------------------------------------
! 4294: Returns the result of converting the quadruple-precision floating-point
! 4295: value `a' to the 32-bit two's complement integer format. The conversion
! 4296: is performed according to the IEC/IEEE Standard for Binary Floating-Point
! 4297: Arithmetic, except that the conversion is always rounded toward zero. If
! 4298: `a' is a NaN, the largest positive integer is returned. Otherwise, if the
! 4299: conversion overflows, the largest integer with the same sign as `a' is
! 4300: returned.
! 4301: -------------------------------------------------------------------------------
! 4302: */
! 4303: int32 float128_to_int32_round_to_zero( float128 a )
! 4304: {
! 4305: flag aSign;
! 4306: int32 aExp, shiftCount;
! 4307: bits64 aSig0, aSig1, savedASig;
! 4308: int32 z;
! 4309:
! 4310: aSig1 = extractFloat128Frac1( a );
! 4311: aSig0 = extractFloat128Frac0( a );
! 4312: aExp = extractFloat128Exp( a );
! 4313: aSign = extractFloat128Sign( a );
! 4314: aSig0 |= ( aSig1 != 0 );
! 4315: if ( 0x401E < aExp ) {
! 4316: if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
! 4317: goto invalid;
! 4318: }
! 4319: else if ( aExp < 0x3FFF ) {
! 4320: if ( aExp || aSig0 ) float_set_inexact();
! 4321: return 0;
! 4322: }
! 4323: aSig0 |= LIT64( 0x0001000000000000 );
! 4324: shiftCount = 0x402F - aExp;
! 4325: savedASig = aSig0;
! 4326: aSig0 >>= shiftCount;
! 4327: z = aSig0;
! 4328: if ( aSign ) z = - z;
! 4329: if ( ( z < 0 ) ^ aSign ) {
! 4330: invalid:
! 4331: float_raise( float_flag_invalid );
! 4332: return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
! 4333: }
! 4334: if ( ( aSig0<<shiftCount ) != savedASig ) {
! 4335: float_set_inexact();
! 4336: }
! 4337: return z;
! 4338:
! 4339: }
! 4340:
! 4341: /*
! 4342: -------------------------------------------------------------------------------
! 4343: Returns the result of converting the quadruple-precision floating-point
! 4344: value `a' to the 64-bit two's complement integer format. The conversion
! 4345: is performed according to the IEC/IEEE Standard for Binary Floating-Point
! 4346: Arithmetic---which means in particular that the conversion is rounded
! 4347: according to the current rounding mode. If `a' is a NaN, the largest
! 4348: positive integer is returned. Otherwise, if the conversion overflows, the
! 4349: largest integer with the same sign as `a' is returned.
! 4350: -------------------------------------------------------------------------------
! 4351: */
! 4352: int64 float128_to_int64( float128 a )
! 4353: {
! 4354: flag aSign;
! 4355: int32 aExp, shiftCount;
! 4356: bits64 aSig0, aSig1;
! 4357:
! 4358: aSig1 = extractFloat128Frac1( a );
! 4359: aSig0 = extractFloat128Frac0( a );
! 4360: aExp = extractFloat128Exp( a );
! 4361: aSign = extractFloat128Sign( a );
! 4362: if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
! 4363: shiftCount = 0x402F - aExp;
! 4364: if ( shiftCount <= 0 ) {
! 4365: if ( 0x403E < aExp ) {
! 4366: float_raise( float_flag_invalid );
! 4367: if ( ! aSign
! 4368: || ( ( aExp == 0x7FFF )
! 4369: && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
! 4370: )
! 4371: ) {
! 4372: return LIT64( 0x7FFFFFFFFFFFFFFF );
! 4373: }
! 4374: return (sbits64) LIT64( 0x8000000000000000 );
! 4375: }
! 4376: shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
! 4377: }
! 4378: else {
! 4379: shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
! 4380: }
! 4381: return roundAndPackInt64( aSign, aSig0, aSig1 );
! 4382:
! 4383: }
! 4384:
! 4385: /*
! 4386: -------------------------------------------------------------------------------
! 4387: Returns the result of converting the quadruple-precision floating-point
! 4388: value `a' to the 64-bit two's complement integer format. The conversion
! 4389: is performed according to the IEC/IEEE Standard for Binary Floating-Point
! 4390: Arithmetic, except that the conversion is always rounded toward zero.
! 4391: If `a' is a NaN, the largest positive integer is returned. Otherwise, if
! 4392: the conversion overflows, the largest integer with the same sign as `a' is
! 4393: returned.
! 4394: -------------------------------------------------------------------------------
! 4395: */
! 4396: int64 float128_to_int64_round_to_zero( float128 a )
! 4397: {
! 4398: flag aSign;
! 4399: int32 aExp, shiftCount;
! 4400: bits64 aSig0, aSig1;
! 4401: int64 z;
! 4402:
! 4403: aSig1 = extractFloat128Frac1( a );
! 4404: aSig0 = extractFloat128Frac0( a );
! 4405: aExp = extractFloat128Exp( a );
! 4406: aSign = extractFloat128Sign( a );
! 4407: if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
! 4408: shiftCount = aExp - 0x402F;
! 4409: if ( 0 < shiftCount ) {
! 4410: if ( 0x403E <= aExp ) {
! 4411: aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
! 4412: if ( ( a.high == LIT64( 0xC03E000000000000 ) )
! 4413: && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
! 4414: if ( aSig1 ) float_set_inexact();
! 4415: }
! 4416: else {
! 4417: float_raise( float_flag_invalid );
! 4418: if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
! 4419: return LIT64( 0x7FFFFFFFFFFFFFFF );
! 4420: }
! 4421: }
! 4422: return (sbits64) LIT64( 0x8000000000000000 );
! 4423: }
! 4424: z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
! 4425: if ( (bits64) ( aSig1<<shiftCount ) ) {
! 4426: float_set_inexact();
! 4427: }
! 4428: }
! 4429: else {
! 4430: if ( aExp < 0x3FFF ) {
! 4431: if ( aExp | aSig0 | aSig1 ) {
! 4432: float_set_inexact();
! 4433: }
! 4434: return 0;
! 4435: }
! 4436: z = aSig0>>( - shiftCount );
! 4437: if ( aSig1
! 4438: || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
! 4439: float_set_inexact();
! 4440: }
! 4441: }
! 4442: if ( aSign ) z = - z;
! 4443: return z;
! 4444:
! 4445: }
! 4446:
! 4447: /*
! 4448: -------------------------------------------------------------------------------
! 4449: Returns the result of converting the quadruple-precision floating-point
! 4450: value `a' to the single-precision floating-point format. The conversion
! 4451: is performed according to the IEC/IEEE Standard for Binary Floating-Point
! 4452: Arithmetic.
! 4453: -------------------------------------------------------------------------------
! 4454: */
! 4455: float32 float128_to_float32( float128 a )
! 4456: {
! 4457: flag aSign;
! 4458: int32 aExp;
! 4459: bits64 aSig0, aSig1;
! 4460: bits32 zSig;
! 4461:
! 4462: aSig1 = extractFloat128Frac1( a );
! 4463: aSig0 = extractFloat128Frac0( a );
! 4464: aExp = extractFloat128Exp( a );
! 4465: aSign = extractFloat128Sign( a );
! 4466: if ( aExp == 0x7FFF ) {
! 4467: if ( aSig0 | aSig1 ) {
! 4468: return commonNaNToFloat32( float128ToCommonNaN( a ) );
! 4469: }
! 4470: return packFloat32( aSign, 0xFF, 0 );
! 4471: }
! 4472: aSig0 |= ( aSig1 != 0 );
! 4473: shift64RightJamming( aSig0, 18, &aSig0 );
! 4474: zSig = aSig0;
! 4475: if ( aExp || zSig ) {
! 4476: zSig |= 0x40000000;
! 4477: aExp -= 0x3F81;
! 4478: }
! 4479: return roundAndPackFloat32( aSign, aExp, zSig );
! 4480:
! 4481: }
! 4482:
! 4483: /*
! 4484: -------------------------------------------------------------------------------
! 4485: Returns the result of converting the quadruple-precision floating-point
! 4486: value `a' to the double-precision floating-point format. The conversion
! 4487: is performed according to the IEC/IEEE Standard for Binary Floating-Point
! 4488: Arithmetic.
! 4489: -------------------------------------------------------------------------------
! 4490: */
! 4491: float64 float128_to_float64( float128 a )
! 4492: {
! 4493: flag aSign;
! 4494: int32 aExp;
! 4495: bits64 aSig0, aSig1;
! 4496:
! 4497: aSig1 = extractFloat128Frac1( a );
! 4498: aSig0 = extractFloat128Frac0( a );
! 4499: aExp = extractFloat128Exp( a );
! 4500: aSign = extractFloat128Sign( a );
! 4501: if ( aExp == 0x7FFF ) {
! 4502: if ( aSig0 | aSig1 ) {
! 4503: return commonNaNToFloat64( float128ToCommonNaN( a ) );
! 4504: }
! 4505: return packFloat64( aSign, 0x7FF, 0 );
! 4506: }
! 4507: shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
! 4508: aSig0 |= ( aSig1 != 0 );
! 4509: if ( aExp || aSig0 ) {
! 4510: aSig0 |= LIT64( 0x4000000000000000 );
! 4511: aExp -= 0x3C01;
! 4512: }
! 4513: return roundAndPackFloat64( aSign, aExp, aSig0 );
! 4514:
! 4515: }
! 4516:
! 4517: #ifdef FLOATX80
! 4518:
! 4519: /*
! 4520: -------------------------------------------------------------------------------
! 4521: Returns the result of converting the quadruple-precision floating-point
! 4522: value `a' to the extended double-precision floating-point format. The
! 4523: conversion is performed according to the IEC/IEEE Standard for Binary
! 4524: Floating-Point Arithmetic.
! 4525: -------------------------------------------------------------------------------
! 4526: */
! 4527: floatx80 float128_to_floatx80( float128 a )
! 4528: {
! 4529: flag aSign;
! 4530: int32 aExp;
! 4531: bits64 aSig0, aSig1;
! 4532:
! 4533: aSig1 = extractFloat128Frac1( a );
! 4534: aSig0 = extractFloat128Frac0( a );
! 4535: aExp = extractFloat128Exp( a );
! 4536: aSign = extractFloat128Sign( a );
! 4537: if ( aExp == 0x7FFF ) {
! 4538: if ( aSig0 | aSig1 ) {
! 4539: return commonNaNToFloatx80( float128ToCommonNaN( a ) );
! 4540: }
! 4541: return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
! 4542: }
! 4543: if ( aExp == 0 ) {
! 4544: if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
! 4545: normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
! 4546: }
! 4547: else {
! 4548: aSig0 |= LIT64( 0x0001000000000000 );
! 4549: }
! 4550: shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
! 4551: return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
! 4552:
! 4553: }
! 4554:
! 4555: #endif
! 4556:
! 4557: /*
! 4558: -------------------------------------------------------------------------------
! 4559: Rounds the quadruple-precision floating-point value `a' to an integer, and
! 4560: returns the result as a quadruple-precision floating-point value. The
! 4561: operation is performed according to the IEC/IEEE Standard for Binary
! 4562: Floating-Point Arithmetic.
! 4563: -------------------------------------------------------------------------------
! 4564: */
! 4565: float128 float128_round_to_int( float128 a )
! 4566: {
! 4567: flag aSign;
! 4568: int32 aExp;
! 4569: bits64 lastBitMask, roundBitsMask;
! 4570: int8 roundingMode;
! 4571: float128 z;
! 4572:
! 4573: aExp = extractFloat128Exp( a );
! 4574: if ( 0x402F <= aExp ) {
! 4575: if ( 0x406F <= aExp ) {
! 4576: if ( ( aExp == 0x7FFF )
! 4577: && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
! 4578: ) {
! 4579: return propagateFloat128NaN( a, a );
! 4580: }
! 4581: return a;
! 4582: }
! 4583: lastBitMask = 1;
! 4584: lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
! 4585: roundBitsMask = lastBitMask - 1;
! 4586: z = a;
! 4587: roundingMode = float_rounding_mode();
! 4588: if ( roundingMode == float_round_nearest_even ) {
! 4589: if ( lastBitMask ) {
! 4590: add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
! 4591: if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
! 4592: }
! 4593: else {
! 4594: if ( (sbits64) z.low < 0 ) {
! 4595: ++z.high;
! 4596: if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
! 4597: }
! 4598: }
! 4599: }
! 4600: else if ( roundingMode != float_round_to_zero ) {
! 4601: if ( extractFloat128Sign( z )
! 4602: ^ ( roundingMode == float_round_up ) ) {
! 4603: add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
! 4604: }
! 4605: }
! 4606: z.low &= ~ roundBitsMask;
! 4607: }
! 4608: else {
! 4609: if ( aExp < 0x3FFF ) {
! 4610: if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
! 4611: float_set_inexact();
! 4612: aSign = extractFloat128Sign( a );
! 4613: switch ( float_rounding_mode() ) {
! 4614: case float_round_nearest_even:
! 4615: if ( ( aExp == 0x3FFE )
! 4616: && ( extractFloat128Frac0( a )
! 4617: | extractFloat128Frac1( a ) )
! 4618: ) {
! 4619: return packFloat128( aSign, 0x3FFF, 0, 0 );
! 4620: }
! 4621: break;
! 4622: case float_round_down:
! 4623: return
! 4624: aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
! 4625: : packFloat128( 0, 0, 0, 0 );
! 4626: case float_round_up:
! 4627: return
! 4628: aSign ? packFloat128( 1, 0, 0, 0 )
! 4629: : packFloat128( 0, 0x3FFF, 0, 0 );
! 4630: }
! 4631: return packFloat128( aSign, 0, 0, 0 );
! 4632: }
! 4633: lastBitMask = 1;
! 4634: lastBitMask <<= 0x402F - aExp;
! 4635: roundBitsMask = lastBitMask - 1;
! 4636: z.low = 0;
! 4637: z.high = a.high;
! 4638: roundingMode = float_rounding_mode();
! 4639: if ( roundingMode == float_round_nearest_even ) {
! 4640: z.high += lastBitMask>>1;
! 4641: if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
! 4642: z.high &= ~ lastBitMask;
! 4643: }
! 4644: }
! 4645: else if ( roundingMode != float_round_to_zero ) {
! 4646: if ( extractFloat128Sign( z )
! 4647: ^ ( roundingMode == float_round_up ) ) {
! 4648: z.high |= ( a.low != 0 );
! 4649: z.high += roundBitsMask;
! 4650: }
! 4651: }
! 4652: z.high &= ~ roundBitsMask;
! 4653: }
! 4654: if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
! 4655: float_set_inexact();
! 4656: }
! 4657: return z;
! 4658:
! 4659: }
! 4660:
! 4661: /*
! 4662: -------------------------------------------------------------------------------
! 4663: Returns the result of adding the absolute values of the quadruple-precision
! 4664: floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
! 4665: before being returned. `zSign' is ignored if the result is a NaN.
! 4666: The addition is performed according to the IEC/IEEE Standard for Binary
! 4667: Floating-Point Arithmetic.
! 4668: -------------------------------------------------------------------------------
! 4669: */
! 4670: static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
! 4671: {
! 4672: int32 aExp, bExp, zExp;
! 4673: bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
! 4674: int32 expDiff;
! 4675:
! 4676: aSig1 = extractFloat128Frac1( a );
! 4677: aSig0 = extractFloat128Frac0( a );
! 4678: aExp = extractFloat128Exp( a );
! 4679: bSig1 = extractFloat128Frac1( b );
! 4680: bSig0 = extractFloat128Frac0( b );
! 4681: bExp = extractFloat128Exp( b );
! 4682: expDiff = aExp - bExp;
! 4683: if ( 0 < expDiff ) {
! 4684: if ( aExp == 0x7FFF ) {
! 4685: if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
! 4686: return a;
! 4687: }
! 4688: if ( bExp == 0 ) {
! 4689: --expDiff;
! 4690: }
! 4691: else {
! 4692: bSig0 |= LIT64( 0x0001000000000000 );
! 4693: }
! 4694: shift128ExtraRightJamming(
! 4695: bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
! 4696: zExp = aExp;
! 4697: }
! 4698: else if ( expDiff < 0 ) {
! 4699: if ( bExp == 0x7FFF ) {
! 4700: if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
! 4701: return packFloat128( zSign, 0x7FFF, 0, 0 );
! 4702: }
! 4703: if ( aExp == 0 ) {
! 4704: ++expDiff;
! 4705: }
! 4706: else {
! 4707: aSig0 |= LIT64( 0x0001000000000000 );
! 4708: }
! 4709: shift128ExtraRightJamming(
! 4710: aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
! 4711: zExp = bExp;
! 4712: }
! 4713: else {
! 4714: if ( aExp == 0x7FFF ) {
! 4715: if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
! 4716: return propagateFloat128NaN( a, b );
! 4717: }
! 4718: return a;
! 4719: }
! 4720: add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
! 4721: if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
! 4722: zSig2 = 0;
! 4723: zSig0 |= LIT64( 0x0002000000000000 );
! 4724: zExp = aExp;
! 4725: goto shiftRight1;
! 4726: }
! 4727: aSig0 |= LIT64( 0x0001000000000000 );
! 4728: add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
! 4729: --zExp;
! 4730: if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
! 4731: ++zExp;
! 4732: shiftRight1:
! 4733: shift128ExtraRightJamming(
! 4734: zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
! 4735: roundAndPack:
! 4736: return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
! 4737:
! 4738: }
! 4739:
! 4740: /*
! 4741: -------------------------------------------------------------------------------
! 4742: Returns the result of subtracting the absolute values of the quadruple-
! 4743: precision floating-point values `a' and `b'. If `zSign' is 1, the
! 4744: difference is negated before being returned. `zSign' is ignored if the
! 4745: result is a NaN. The subtraction is performed according to the IEC/IEEE
! 4746: Standard for Binary Floating-Point Arithmetic.
! 4747: -------------------------------------------------------------------------------
! 4748: */
! 4749: static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
! 4750: {
! 4751: int32 aExp, bExp, zExp;
! 4752: bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
! 4753: int32 expDiff;
! 4754: float128 z;
! 4755:
! 4756: aSig1 = extractFloat128Frac1( a );
! 4757: aSig0 = extractFloat128Frac0( a );
! 4758: aExp = extractFloat128Exp( a );
! 4759: bSig1 = extractFloat128Frac1( b );
! 4760: bSig0 = extractFloat128Frac0( b );
! 4761: bExp = extractFloat128Exp( b );
! 4762: expDiff = aExp - bExp;
! 4763: shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
! 4764: shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
! 4765: if ( 0 < expDiff ) goto aExpBigger;
! 4766: if ( expDiff < 0 ) goto bExpBigger;
! 4767: if ( aExp == 0x7FFF ) {
! 4768: if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
! 4769: return propagateFloat128NaN( a, b );
! 4770: }
! 4771: float_raise( float_flag_invalid );
! 4772: z.low = float128_default_nan_low;
! 4773: z.high = float128_default_nan_high;
! 4774: return z;
! 4775: }
! 4776: if ( aExp == 0 ) {
! 4777: aExp = 1;
! 4778: bExp = 1;
! 4779: }
! 4780: if ( bSig0 < aSig0 ) goto aBigger;
! 4781: if ( aSig0 < bSig0 ) goto bBigger;
! 4782: if ( bSig1 < aSig1 ) goto aBigger;
! 4783: if ( aSig1 < bSig1 ) goto bBigger;
! 4784: return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 );
! 4785: bExpBigger:
! 4786: if ( bExp == 0x7FFF ) {
! 4787: if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
! 4788: return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
! 4789: }
! 4790: if ( aExp == 0 ) {
! 4791: ++expDiff;
! 4792: }
! 4793: else {
! 4794: aSig0 |= LIT64( 0x4000000000000000 );
! 4795: }
! 4796: shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
! 4797: bSig0 |= LIT64( 0x4000000000000000 );
! 4798: bBigger:
! 4799: sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
! 4800: zExp = bExp;
! 4801: zSign ^= 1;
! 4802: goto normalizeRoundAndPack;
! 4803: aExpBigger:
! 4804: if ( aExp == 0x7FFF ) {
! 4805: if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
! 4806: return a;
! 4807: }
! 4808: if ( bExp == 0 ) {
! 4809: --expDiff;
! 4810: }
! 4811: else {
! 4812: bSig0 |= LIT64( 0x4000000000000000 );
! 4813: }
! 4814: shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
! 4815: aSig0 |= LIT64( 0x4000000000000000 );
! 4816: aBigger:
! 4817: sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
! 4818: zExp = aExp;
! 4819: normalizeRoundAndPack:
! 4820: --zExp;
! 4821: return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
! 4822:
! 4823: }
! 4824:
! 4825: /*
! 4826: -------------------------------------------------------------------------------
! 4827: Returns the result of adding the quadruple-precision floating-point values
! 4828: `a' and `b'. The operation is performed according to the IEC/IEEE Standard
! 4829: for Binary Floating-Point Arithmetic.
! 4830: -------------------------------------------------------------------------------
! 4831: */
! 4832: float128 float128_add( float128 a, float128 b )
! 4833: {
! 4834: flag aSign, bSign;
! 4835:
! 4836: aSign = extractFloat128Sign( a );
! 4837: bSign = extractFloat128Sign( b );
! 4838: if ( aSign == bSign ) {
! 4839: return addFloat128Sigs( a, b, aSign );
! 4840: }
! 4841: else {
! 4842: return subFloat128Sigs( a, b, aSign );
! 4843: }
! 4844:
! 4845: }
! 4846:
! 4847: /*
! 4848: -------------------------------------------------------------------------------
! 4849: Returns the result of subtracting the quadruple-precision floating-point
! 4850: values `a' and `b'. The operation is performed according to the IEC/IEEE
! 4851: Standard for Binary Floating-Point Arithmetic.
! 4852: -------------------------------------------------------------------------------
! 4853: */
! 4854: float128 float128_sub( float128 a, float128 b )
! 4855: {
! 4856: flag aSign, bSign;
! 4857:
! 4858: aSign = extractFloat128Sign( a );
! 4859: bSign = extractFloat128Sign( b );
! 4860: if ( aSign == bSign ) {
! 4861: return subFloat128Sigs( a, b, aSign );
! 4862: }
! 4863: else {
! 4864: return addFloat128Sigs( a, b, aSign );
! 4865: }
! 4866:
! 4867: }
! 4868:
! 4869: /*
! 4870: -------------------------------------------------------------------------------
! 4871: Returns the result of multiplying the quadruple-precision floating-point
! 4872: values `a' and `b'. The operation is performed according to the IEC/IEEE
! 4873: Standard for Binary Floating-Point Arithmetic.
! 4874: -------------------------------------------------------------------------------
! 4875: */
! 4876: float128 float128_mul( float128 a, float128 b )
! 4877: {
! 4878: flag aSign, bSign, zSign;
! 4879: int32 aExp, bExp, zExp;
! 4880: bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
! 4881: float128 z;
! 4882:
! 4883: aSig1 = extractFloat128Frac1( a );
! 4884: aSig0 = extractFloat128Frac0( a );
! 4885: aExp = extractFloat128Exp( a );
! 4886: aSign = extractFloat128Sign( a );
! 4887: bSig1 = extractFloat128Frac1( b );
! 4888: bSig0 = extractFloat128Frac0( b );
! 4889: bExp = extractFloat128Exp( b );
! 4890: bSign = extractFloat128Sign( b );
! 4891: zSign = aSign ^ bSign;
! 4892: if ( aExp == 0x7FFF ) {
! 4893: if ( ( aSig0 | aSig1 )
! 4894: || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
! 4895: return propagateFloat128NaN( a, b );
! 4896: }
! 4897: if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
! 4898: return packFloat128( zSign, 0x7FFF, 0, 0 );
! 4899: }
! 4900: if ( bExp == 0x7FFF ) {
! 4901: if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
! 4902: if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
! 4903: invalid:
! 4904: float_raise( float_flag_invalid );
! 4905: z.low = float128_default_nan_low;
! 4906: z.high = float128_default_nan_high;
! 4907: return z;
! 4908: }
! 4909: return packFloat128( zSign, 0x7FFF, 0, 0 );
! 4910: }
! 4911: if ( aExp == 0 ) {
! 4912: if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
! 4913: normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
! 4914: }
! 4915: if ( bExp == 0 ) {
! 4916: if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
! 4917: normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
! 4918: }
! 4919: zExp = aExp + bExp - 0x4000;
! 4920: aSig0 |= LIT64( 0x0001000000000000 );
! 4921: shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
! 4922: mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
! 4923: add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
! 4924: zSig2 |= ( zSig3 != 0 );
! 4925: if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
! 4926: shift128ExtraRightJamming(
! 4927: zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
! 4928: ++zExp;
! 4929: }
! 4930: return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
! 4931:
! 4932: }
! 4933:
! 4934: /*
! 4935: -------------------------------------------------------------------------------
! 4936: Returns the result of dividing the quadruple-precision floating-point value
! 4937: `a' by the corresponding value `b'. The operation is performed according to
! 4938: the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 4939: -------------------------------------------------------------------------------
! 4940: */
! 4941: float128 float128_div( float128 a, float128 b )
! 4942: {
! 4943: flag aSign, bSign, zSign;
! 4944: int32 aExp, bExp, zExp;
! 4945: bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
! 4946: bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
! 4947: float128 z;
! 4948:
! 4949: aSig1 = extractFloat128Frac1( a );
! 4950: aSig0 = extractFloat128Frac0( a );
! 4951: aExp = extractFloat128Exp( a );
! 4952: aSign = extractFloat128Sign( a );
! 4953: bSig1 = extractFloat128Frac1( b );
! 4954: bSig0 = extractFloat128Frac0( b );
! 4955: bExp = extractFloat128Exp( b );
! 4956: bSign = extractFloat128Sign( b );
! 4957: zSign = aSign ^ bSign;
! 4958: if ( aExp == 0x7FFF ) {
! 4959: if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
! 4960: if ( bExp == 0x7FFF ) {
! 4961: if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
! 4962: goto invalid;
! 4963: }
! 4964: return packFloat128( zSign, 0x7FFF, 0, 0 );
! 4965: }
! 4966: if ( bExp == 0x7FFF ) {
! 4967: if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
! 4968: return packFloat128( zSign, 0, 0, 0 );
! 4969: }
! 4970: if ( bExp == 0 ) {
! 4971: if ( ( bSig0 | bSig1 ) == 0 ) {
! 4972: if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
! 4973: invalid:
! 4974: float_raise( float_flag_invalid );
! 4975: z.low = float128_default_nan_low;
! 4976: z.high = float128_default_nan_high;
! 4977: return z;
! 4978: }
! 4979: float_raise( float_flag_divbyzero );
! 4980: return packFloat128( zSign, 0x7FFF, 0, 0 );
! 4981: }
! 4982: normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
! 4983: }
! 4984: if ( aExp == 0 ) {
! 4985: if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
! 4986: normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
! 4987: }
! 4988: zExp = aExp - bExp + 0x3FFD;
! 4989: shortShift128Left(
! 4990: aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
! 4991: shortShift128Left(
! 4992: bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
! 4993: if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
! 4994: shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
! 4995: ++zExp;
! 4996: }
! 4997: zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
! 4998: mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
! 4999: sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
! 5000: while ( (sbits64) rem0 < 0 ) {
! 5001: --zSig0;
! 5002: add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
! 5003: }
! 5004: zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
! 5005: if ( ( zSig1 & 0x3FFF ) <= 4 ) {
! 5006: mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
! 5007: sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
! 5008: while ( (sbits64) rem1 < 0 ) {
! 5009: --zSig1;
! 5010: add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
! 5011: }
! 5012: zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
! 5013: }
! 5014: shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
! 5015: return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
! 5016:
! 5017: }
! 5018:
! 5019: /*
! 5020: -------------------------------------------------------------------------------
! 5021: Returns the remainder of the quadruple-precision floating-point value `a'
! 5022: with respect to the corresponding value `b'. The operation is performed
! 5023: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 5024: -------------------------------------------------------------------------------
! 5025: */
! 5026: float128 float128_rem( float128 a, float128 b )
! 5027: {
! 5028: flag aSign, bSign, zSign;
! 5029: int32 aExp, bExp, expDiff;
! 5030: bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
! 5031: bits64 allZero, alternateASig0, alternateASig1, sigMean1;
! 5032: sbits64 sigMean0;
! 5033: float128 z;
! 5034:
! 5035: aSig1 = extractFloat128Frac1( a );
! 5036: aSig0 = extractFloat128Frac0( a );
! 5037: aExp = extractFloat128Exp( a );
! 5038: aSign = extractFloat128Sign( a );
! 5039: bSig1 = extractFloat128Frac1( b );
! 5040: bSig0 = extractFloat128Frac0( b );
! 5041: bExp = extractFloat128Exp( b );
! 5042: bSign = extractFloat128Sign( b );
! 5043: if ( aExp == 0x7FFF ) {
! 5044: if ( ( aSig0 | aSig1 )
! 5045: || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
! 5046: return propagateFloat128NaN( a, b );
! 5047: }
! 5048: goto invalid;
! 5049: }
! 5050: if ( bExp == 0x7FFF ) {
! 5051: if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
! 5052: return a;
! 5053: }
! 5054: if ( bExp == 0 ) {
! 5055: if ( ( bSig0 | bSig1 ) == 0 ) {
! 5056: invalid:
! 5057: float_raise( float_flag_invalid );
! 5058: z.low = float128_default_nan_low;
! 5059: z.high = float128_default_nan_high;
! 5060: return z;
! 5061: }
! 5062: normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
! 5063: }
! 5064: if ( aExp == 0 ) {
! 5065: if ( ( aSig0 | aSig1 ) == 0 ) return a;
! 5066: normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
! 5067: }
! 5068: expDiff = aExp - bExp;
! 5069: if ( expDiff < -1 ) return a;
! 5070: shortShift128Left(
! 5071: aSig0 | LIT64( 0x0001000000000000 ),
! 5072: aSig1,
! 5073: 15 - ( expDiff < 0 ),
! 5074: &aSig0,
! 5075: &aSig1
! 5076: );
! 5077: shortShift128Left(
! 5078: bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
! 5079: q = le128( bSig0, bSig1, aSig0, aSig1 );
! 5080: if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
! 5081: expDiff -= 64;
! 5082: while ( 0 < expDiff ) {
! 5083: q = estimateDiv128To64( aSig0, aSig1, bSig0 );
! 5084: q = ( 4 < q ) ? q - 4 : 0;
! 5085: mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
! 5086: shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
! 5087: shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
! 5088: sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
! 5089: expDiff -= 61;
! 5090: }
! 5091: if ( -64 < expDiff ) {
! 5092: q = estimateDiv128To64( aSig0, aSig1, bSig0 );
! 5093: q = ( 4 < q ) ? q - 4 : 0;
! 5094: q >>= - expDiff;
! 5095: shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
! 5096: expDiff += 52;
! 5097: if ( expDiff < 0 ) {
! 5098: shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
! 5099: }
! 5100: else {
! 5101: shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
! 5102: }
! 5103: mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
! 5104: sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
! 5105: }
! 5106: else {
! 5107: shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
! 5108: shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
! 5109: }
! 5110: do {
! 5111: alternateASig0 = aSig0;
! 5112: alternateASig1 = aSig1;
! 5113: ++q;
! 5114: sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
! 5115: } while ( 0 <= (sbits64) aSig0 );
! 5116: add128(
! 5117: aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
! 5118: if ( ( sigMean0 < 0 )
! 5119: || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
! 5120: aSig0 = alternateASig0;
! 5121: aSig1 = alternateASig1;
! 5122: }
! 5123: zSign = ( (sbits64) aSig0 < 0 );
! 5124: if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
! 5125: return
! 5126: normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
! 5127:
! 5128: }
! 5129:
! 5130: /*
! 5131: -------------------------------------------------------------------------------
! 5132: Returns the square root of the quadruple-precision floating-point value `a'.
! 5133: The operation is performed according to the IEC/IEEE Standard for Binary
! 5134: Floating-Point Arithmetic.
! 5135: -------------------------------------------------------------------------------
! 5136: */
! 5137: float128 float128_sqrt( float128 a )
! 5138: {
! 5139: flag aSign;
! 5140: int32 aExp, zExp;
! 5141: bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
! 5142: bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
! 5143: float128 z;
! 5144:
! 5145: aSig1 = extractFloat128Frac1( a );
! 5146: aSig0 = extractFloat128Frac0( a );
! 5147: aExp = extractFloat128Exp( a );
! 5148: aSign = extractFloat128Sign( a );
! 5149: if ( aExp == 0x7FFF ) {
! 5150: if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
! 5151: if ( ! aSign ) return a;
! 5152: goto invalid;
! 5153: }
! 5154: if ( aSign ) {
! 5155: if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
! 5156: invalid:
! 5157: float_raise( float_flag_invalid );
! 5158: z.low = float128_default_nan_low;
! 5159: z.high = float128_default_nan_high;
! 5160: return z;
! 5161: }
! 5162: if ( aExp == 0 ) {
! 5163: if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
! 5164: normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
! 5165: }
! 5166: zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
! 5167: aSig0 |= LIT64( 0x0001000000000000 );
! 5168: zSig0 = estimateSqrt32( aExp, aSig0>>17 );
! 5169: shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
! 5170: zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
! 5171: doubleZSig0 = zSig0<<1;
! 5172: mul64To128( zSig0, zSig0, &term0, &term1 );
! 5173: sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
! 5174: while ( (sbits64) rem0 < 0 ) {
! 5175: --zSig0;
! 5176: doubleZSig0 -= 2;
! 5177: add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
! 5178: }
! 5179: zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
! 5180: if ( ( zSig1 & 0x1FFF ) <= 5 ) {
! 5181: if ( zSig1 == 0 ) zSig1 = 1;
! 5182: mul64To128( doubleZSig0, zSig1, &term1, &term2 );
! 5183: sub128( rem1, 0, term1, term2, &rem1, &rem2 );
! 5184: mul64To128( zSig1, zSig1, &term2, &term3 );
! 5185: sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
! 5186: while ( (sbits64) rem1 < 0 ) {
! 5187: --zSig1;
! 5188: shortShift128Left( 0, zSig1, 1, &term2, &term3 );
! 5189: term3 |= 1;
! 5190: term2 |= doubleZSig0;
! 5191: add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
! 5192: }
! 5193: zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
! 5194: }
! 5195: shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
! 5196: return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
! 5197:
! 5198: }
! 5199:
! 5200: /*
! 5201: -------------------------------------------------------------------------------
! 5202: Returns 1 if the quadruple-precision floating-point value `a' is equal to
! 5203: the corresponding value `b', and 0 otherwise. The comparison is performed
! 5204: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 5205: -------------------------------------------------------------------------------
! 5206: */
! 5207: flag float128_eq( float128 a, float128 b )
! 5208: {
! 5209:
! 5210: if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
! 5211: && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
! 5212: || ( ( extractFloat128Exp( b ) == 0x7FFF )
! 5213: && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
! 5214: ) {
! 5215: if ( float128_is_signaling_nan( a )
! 5216: || float128_is_signaling_nan( b ) ) {
! 5217: float_raise( float_flag_invalid );
! 5218: }
! 5219: return 0;
! 5220: }
! 5221: return
! 5222: ( a.low == b.low )
! 5223: && ( ( a.high == b.high )
! 5224: || ( ( a.low == 0 )
! 5225: && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
! 5226: );
! 5227:
! 5228: }
! 5229:
! 5230: /*
! 5231: -------------------------------------------------------------------------------
! 5232: Returns 1 if the quadruple-precision floating-point value `a' is less than
! 5233: or equal to the corresponding value `b', and 0 otherwise. The comparison
! 5234: is performed according to the IEC/IEEE Standard for Binary Floating-Point
! 5235: Arithmetic.
! 5236: -------------------------------------------------------------------------------
! 5237: */
! 5238: flag float128_le( float128 a, float128 b )
! 5239: {
! 5240: flag aSign, bSign;
! 5241:
! 5242: if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
! 5243: && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
! 5244: || ( ( extractFloat128Exp( b ) == 0x7FFF )
! 5245: && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
! 5246: ) {
! 5247: float_raise( float_flag_invalid );
! 5248: return 0;
! 5249: }
! 5250: aSign = extractFloat128Sign( a );
! 5251: bSign = extractFloat128Sign( b );
! 5252: if ( aSign != bSign ) {
! 5253: return
! 5254: aSign
! 5255: || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
! 5256: == 0 );
! 5257: }
! 5258: return
! 5259: aSign ? le128( b.high, b.low, a.high, a.low )
! 5260: : le128( a.high, a.low, b.high, b.low );
! 5261:
! 5262: }
! 5263:
! 5264: /*
! 5265: -------------------------------------------------------------------------------
! 5266: Returns 1 if the quadruple-precision floating-point value `a' is less than
! 5267: the corresponding value `b', and 0 otherwise. The comparison is performed
! 5268: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 5269: -------------------------------------------------------------------------------
! 5270: */
! 5271: flag float128_lt( float128 a, float128 b )
! 5272: {
! 5273: flag aSign, bSign;
! 5274:
! 5275: if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
! 5276: && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
! 5277: || ( ( extractFloat128Exp( b ) == 0x7FFF )
! 5278: && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
! 5279: ) {
! 5280: float_raise( float_flag_invalid );
! 5281: return 0;
! 5282: }
! 5283: aSign = extractFloat128Sign( a );
! 5284: bSign = extractFloat128Sign( b );
! 5285: if ( aSign != bSign ) {
! 5286: return
! 5287: aSign
! 5288: && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
! 5289: != 0 );
! 5290: }
! 5291: return
! 5292: aSign ? lt128( b.high, b.low, a.high, a.low )
! 5293: : lt128( a.high, a.low, b.high, b.low );
! 5294:
! 5295: }
! 5296:
! 5297: /*
! 5298: -------------------------------------------------------------------------------
! 5299: Returns 1 if the quadruple-precision floating-point value `a' is equal to
! 5300: the corresponding value `b', and 0 otherwise. The invalid exception is
! 5301: raised if either operand is a NaN. Otherwise, the comparison is performed
! 5302: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 5303: -------------------------------------------------------------------------------
! 5304: */
! 5305: flag float128_eq_signaling( float128 a, float128 b )
! 5306: {
! 5307:
! 5308: if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
! 5309: && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
! 5310: || ( ( extractFloat128Exp( b ) == 0x7FFF )
! 5311: && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
! 5312: ) {
! 5313: float_raise( float_flag_invalid );
! 5314: return 0;
! 5315: }
! 5316: return
! 5317: ( a.low == b.low )
! 5318: && ( ( a.high == b.high )
! 5319: || ( ( a.low == 0 )
! 5320: && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
! 5321: );
! 5322:
! 5323: }
! 5324:
! 5325: /*
! 5326: -------------------------------------------------------------------------------
! 5327: Returns 1 if the quadruple-precision floating-point value `a' is less than
! 5328: or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
! 5329: cause an exception. Otherwise, the comparison is performed according to the
! 5330: IEC/IEEE Standard for Binary Floating-Point Arithmetic.
! 5331: -------------------------------------------------------------------------------
! 5332: */
! 5333: flag float128_le_quiet( float128 a, float128 b )
! 5334: {
! 5335: flag aSign, bSign;
! 5336:
! 5337: if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
! 5338: && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
! 5339: || ( ( extractFloat128Exp( b ) == 0x7FFF )
! 5340: && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
! 5341: ) {
! 5342: if ( float128_is_signaling_nan( a )
! 5343: || float128_is_signaling_nan( b ) ) {
! 5344: float_raise( float_flag_invalid );
! 5345: }
! 5346: return 0;
! 5347: }
! 5348: aSign = extractFloat128Sign( a );
! 5349: bSign = extractFloat128Sign( b );
! 5350: if ( aSign != bSign ) {
! 5351: return
! 5352: aSign
! 5353: || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
! 5354: == 0 );
! 5355: }
! 5356: return
! 5357: aSign ? le128( b.high, b.low, a.high, a.low )
! 5358: : le128( a.high, a.low, b.high, b.low );
! 5359:
! 5360: }
! 5361:
! 5362: /*
! 5363: -------------------------------------------------------------------------------
! 5364: Returns 1 if the quadruple-precision floating-point value `a' is less than
! 5365: the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
! 5366: exception. Otherwise, the comparison is performed according to the IEC/IEEE
! 5367: Standard for Binary Floating-Point Arithmetic.
! 5368: -------------------------------------------------------------------------------
! 5369: */
! 5370: flag float128_lt_quiet( float128 a, float128 b )
! 5371: {
! 5372: flag aSign, bSign;
! 5373:
! 5374: if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
! 5375: && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
! 5376: || ( ( extractFloat128Exp( b ) == 0x7FFF )
! 5377: && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
! 5378: ) {
! 5379: if ( float128_is_signaling_nan( a )
! 5380: || float128_is_signaling_nan( b ) ) {
! 5381: float_raise( float_flag_invalid );
! 5382: }
! 5383: return 0;
! 5384: }
! 5385: aSign = extractFloat128Sign( a );
! 5386: bSign = extractFloat128Sign( b );
! 5387: if ( aSign != bSign ) {
! 5388: return
! 5389: aSign
! 5390: && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
! 5391: != 0 );
! 5392: }
! 5393: return
! 5394: aSign ? lt128( b.high, b.low, a.high, a.low )
! 5395: : lt128( a.high, a.low, b.high, b.low );
! 5396:
! 5397: }
! 5398:
! 5399: #endif
! 5400:
! 5401:
! 5402: #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
! 5403:
! 5404: /*
! 5405: * These two routines are not part of the original softfloat distribution.
! 5406: *
! 5407: * They are based on the corresponding conversions to integer but return
! 5408: * unsigned numbers instead since these functions are required by GCC.
! 5409: *
! 5410: * Added by Mark Brinicombe <mark@netbsd.org> 27/09/97
! 5411: *
! 5412: * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
! 5413: */
! 5414:
! 5415: /*
! 5416: -------------------------------------------------------------------------------
! 5417: Returns the result of converting the double-precision floating-point value
! 5418: `a' to the 32-bit unsigned integer format. The conversion is
! 5419: performed according to the IEC/IEEE Standard for Binary Floating-point
! 5420: Arithmetic, except that the conversion is always rounded toward zero. If
! 5421: `a' is a NaN, the largest positive integer is returned. If the conversion
! 5422: overflows, the largest integer positive is returned.
! 5423: -------------------------------------------------------------------------------
! 5424: */
! 5425: uint32 float64_to_uint32_round_to_zero( float64 a )
! 5426: {
! 5427: flag aSign;
! 5428: int16 aExp, shiftCount;
! 5429: bits64 aSig, savedASig;
! 5430: uint32 z;
! 5431:
! 5432: aSig = extractFloat64Frac( a );
! 5433: aExp = extractFloat64Exp( a );
! 5434: aSign = extractFloat64Sign( a );
! 5435:
! 5436: if (aSign) {
! 5437: float_raise( float_flag_invalid );
! 5438: return(0);
! 5439: }
! 5440:
! 5441: if ( 0x41E < aExp ) {
! 5442: float_raise( float_flag_invalid );
! 5443: return 0xffffffff;
! 5444: }
! 5445: else if ( aExp < 0x3FF ) {
! 5446: if ( aExp || aSig ) float_set_inexact();
! 5447: return 0;
! 5448: }
! 5449: aSig |= LIT64( 0x0010000000000000 );
! 5450: shiftCount = 0x433 - aExp;
! 5451: savedASig = aSig;
! 5452: aSig >>= shiftCount;
! 5453: z = aSig;
! 5454: if ( ( aSig<<shiftCount ) != savedASig ) {
! 5455: float_set_inexact();
! 5456: }
! 5457: return z;
! 5458:
! 5459: }
! 5460:
! 5461: /*
! 5462: -------------------------------------------------------------------------------
! 5463: Returns the result of converting the single-precision floating-point value
! 5464: `a' to the 32-bit unsigned integer format. The conversion is
! 5465: performed according to the IEC/IEEE Standard for Binary Floating-point
! 5466: Arithmetic, except that the conversion is always rounded toward zero. If
! 5467: `a' is a NaN, the largest positive integer is returned. If the conversion
! 5468: overflows, the largest positive integer is returned.
! 5469: -------------------------------------------------------------------------------
! 5470: */
! 5471: uint32 float32_to_uint32_round_to_zero( float32 a )
! 5472: {
! 5473: flag aSign;
! 5474: int16 aExp, shiftCount;
! 5475: bits32 aSig;
! 5476: uint32 z;
! 5477:
! 5478: aSig = extractFloat32Frac( a );
! 5479: aExp = extractFloat32Exp( a );
! 5480: aSign = extractFloat32Sign( a );
! 5481: shiftCount = aExp - 0x9E;
! 5482:
! 5483: if (aSign) {
! 5484: float_raise( float_flag_invalid );
! 5485: return(0);
! 5486: }
! 5487: if ( 0 < shiftCount ) {
! 5488: float_raise( float_flag_invalid );
! 5489: return 0xFFFFFFFF;
! 5490: }
! 5491: else if ( aExp <= 0x7E ) {
! 5492: if ( aExp | aSig ) float_set_inexact();
! 5493: return 0;
! 5494: }
! 5495: aSig = ( aSig | 0x800000 )<<8;
! 5496: z = aSig>>( - shiftCount );
! 5497: if ( aSig<<( shiftCount & 31 ) ) {
! 5498: float_set_inexact();
! 5499: }
! 5500: return z;
! 5501:
! 5502: }
! 5503:
! 5504: #endif
! 5505:
! 5506: #endif /* !NO_IEEE */
CVSweb