sys/lib/libkern/softfloat.c - annotate

Return to softfloat.c CVS log
Up to [local] / sys / lib / libkern
Annotation of sys/lib/libkern/softfloat.c, Revision 1.1.1.1

1.1       nbrk        1: /*     $OpenBSD: softfloat.c,v 1.1 2002/04/28 20:55:14 pvalchev Exp $  */
                      2: /*     $NetBSD: softfloat.c,v 1.1 2001/04/26 03:10:47 ross Exp $       */
                      3:
                      4: /*
                      5:  * This version hacked for use with gcc -msoft-float by bjh21.
                      6:  * (Mostly a case of #ifdefing out things GCC doesn't need or provides
                      7:  *  itself).
                      8:  */
                      9:
                     10: /*
                     11:  * Things you may want to define:
                     12:  *
                     13:  * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
                     14:  *   -msoft-float) to work.  Include "softfloat-for-gcc.h" to get them
                     15:  *   properly renamed.
                     16:  */
                     17:
                     18: /*
                     19: ===============================================================================
                     20:
                     21: This C source file is part of the SoftFloat IEC/IEEE Floating-point
                     22: Arithmetic Package, Release 2a.
                     23:
                     24: Written by John R. Hauser.  This work was made possible in part by the
                     25: International Computer Science Institute, located at Suite 600, 1947 Center
                     26: Street, Berkeley, California 94704.  Funding was partially provided by the
                     27: National Science Foundation under grant MIP-9311980.  The original version
                     28: of this code was written as part of a project to build a fixed-point vector
                     29: processor in collaboration with the University of California at Berkeley,
                     30: overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
                     31: is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
                     32: arithmetic/SoftFloat.html'.
                     33:
                     34: THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable
                     35: effort has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT
                     36: WILL AT TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS
                     37: RESTRICTED TO PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL
                     38: RESPONSIBILITY FOR ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM
                     39: THEIR OWN USE OF THE SOFTWARE, AND WHO ALSO EFFECTIVELY INDEMNIFY
                     40: (possibly via similar legal warning) JOHN HAUSER AND THE INTERNATIONAL
                     41: COMPUTER SCIENCE INSTITUTE AGAINST ALL LOSSES, COSTS, OR OTHER PROBLEMS
                     42: ARISING FROM THE USE OF THE SOFTWARE BY THEIR CUSTOMERS AND CLIENTS.
                     43:
                     44: Derivative works are acceptable, even for commercial purposes, so long as
                     45: (1) they include prominent notice that the work is derivative, and (2) they
                     46: include prominent notice akin to these four paragraphs for those parts of
                     47: this code that are retained.
                     48:
                     49: ===============================================================================
                     50: */
                     51:
                     52: #ifndef NO_IEEE
                     53:
                     54: #include <sys/cdefs.h>
                     55: #if defined(LIBC_SCCS) && !defined(lint)
                     56: __RCSID("$NetBSD: softfloat.c,v 1.1 2001/04/26 03:10:47 ross Exp $");
                     57: #endif /* LIBC_SCCS and not lint */
                     58:
                     59: #ifdef SOFTFLOAT_FOR_GCC
                     60: #include "softfloat-for-gcc.h"
                     61: #endif
                     62:
                     63: #include "milieu.h"
                     64: #include "softfloat.h"
                     65:
                     /*
                      * Hooks for translating a float64 between the form in which it is
                      * stored in memory and the form SoftFloat operates on.  Each hook
                      * defaults to the identity unless it has already been defined.
                      */
                     #if !defined(FLOAT64_DEMANGLE)
                     #define FLOAT64_DEMANGLE(a)    (a)
                     #endif
                     #if !defined(FLOAT64_MANGLE)
                     #define FLOAT64_MANGLE(a)      (a)
                     #endif
                     76:
                     77: /*
                     78: -------------------------------------------------------------------------------
                     79: Floating-point rounding mode, extended double-precision rounding precision,
                     80: and exception flags.
                     81: -------------------------------------------------------------------------------
                     82: */
                     83:
                     /*
                      * Rounding precision for extended double-precision results; only
                      * built when FLOATX80 is defined.
                      *
                      * XXX: This is the one dynamic global variable left in this file
                      *     (all others were removed [ross]).  It causes no trouble
                      *     right now, but may someday be a problem for
                      *     options-MULTIPROCESSOR or threads.
                      */
                     #if defined(FLOATX80)
                     int8 floatx80_rounding_precision = 80;
                     #endif
                     92:
                     93: /*
                     94: -------------------------------------------------------------------------------
                     95: Primitive arithmetic functions, including multi-word arithmetic, and
                     96: division and square root approximations.  (Can be specialized to target if
                     97: desired.)
                     98: -------------------------------------------------------------------------------
                     99: */
                    100: #include "softfloat-macros.h"
                    101:
                    102: /*
                    103: -------------------------------------------------------------------------------
                    104: Functions and definitions to determine:  (1) whether tininess for underflow
                    105: is detected before or after rounding by default, (2) what (if anything)
                    106: happens when exceptions are raised, (3) how signaling NaNs are distinguished
                    107: from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
                    108: are propagated from function inputs to output.  These details are target-
                    109: specific.
                    110: -------------------------------------------------------------------------------
                    111: */
                    112: #include "softfloat-specialize.h"
                    113:
                    114: #ifndef SOFTFLOAT_FOR_GCC /* Not used */
                    115: /*
                    116: -------------------------------------------------------------------------------
                    117: Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
                    118: and 7, and returns the properly rounded 32-bit integer corresponding to the
                    119: input.  If `zSign' is 1, the input is negated before being converted to an
                    120: integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
                    121: is simply rounded to an integer, with the inexact exception raised if the
                    122: input cannot be represented exactly as an integer.  However, if the fixed-
                    123: point input is too large, the invalid exception is raised and the largest
                    124: positive or negative integer is returned.
                    125: -------------------------------------------------------------------------------
                    126: */
                    127: static int32 roundAndPackInt32( flag zSign, bits64 absZ )
                    128: {
                    129:     int8 roundingMode;
                    130:     flag roundNearestEven;
                    131:     int8 roundIncrement, roundBits;
                    132:     int32 z;
                    133:
                    134:     roundingMode = float_rounding_mode();
                    135:     roundNearestEven = ( roundingMode == float_round_nearest_even );
                    136:     roundIncrement = 0x40;
                    137:     if ( ! roundNearestEven ) {
                    138:         if ( roundingMode == float_round_to_zero ) {
                    139:             roundIncrement = 0;
                    140:         }
                    141:         else {
                    142:             roundIncrement = 0x7F;
                    143:             if ( zSign ) {
                    144:                 if ( roundingMode == float_round_up ) roundIncrement = 0;
                    145:             }
                    146:             else {
                    147:                 if ( roundingMode == float_round_down ) roundIncrement = 0;
                    148:             }
                    149:         }
                    150:     }
                    151:     roundBits = absZ & 0x7F;
                    152:     absZ = ( absZ + roundIncrement )>>7;
                    153:     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
                    154:     z = absZ;
                    155:     if ( zSign ) z = - z;
                    156:     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
                    157:         float_raise( float_flag_invalid );
                    158:         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
                    159:     }
                    160:     if ( roundBits ) float_set_inexact();
                    161:     return z;
                    162:
                    163: }
                    164:
                    165: /*
                    166: -------------------------------------------------------------------------------
                    167: Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
                    168: `absZ1', with binary point between bits 63 and 64 (between the input words),
                    169: and returns the properly rounded 64-bit integer corresponding to the input.
                    170: If `zSign' is 1, the input is negated before being converted to an integer.
                    171: Ordinarily, the fixed-point input is simply rounded to an integer, with
                    172: the inexact exception raised if the input cannot be represented exactly as
                    173: an integer.  However, if the fixed-point input is too large, the invalid
                    174: exception is raised and the largest positive or negative integer is
                    175: returned.
                    176: -------------------------------------------------------------------------------
                    177: */
                    178: static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
                    179: {
                    180:     int8 roundingMode;
                    181:     flag roundNearestEven, increment;
                    182:     int64 z;
                    183:
                    184:     roundingMode = float_rounding_mode();
                    185:     roundNearestEven = ( roundingMode == float_round_nearest_even );
                    186:     increment = ( (sbits64) absZ1 < 0 );
                    187:     if ( ! roundNearestEven ) {
                    188:         if ( roundingMode == float_round_to_zero ) {
                    189:             increment = 0;
                    190:         }
                    191:         else {
                    192:             if ( zSign ) {
                    193:                 increment = ( roundingMode == float_round_down ) && absZ1;
                    194:             }
                    195:             else {
                    196:                 increment = ( roundingMode == float_round_up ) && absZ1;
                    197:             }
                    198:         }
                    199:     }
                    200:     if ( increment ) {
                    201:         ++absZ0;
                    202:         if ( absZ0 == 0 ) goto overflow;
                    203:         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
                    204:     }
                    205:     z = absZ0;
                    206:     if ( zSign ) z = - z;
                    207:     if ( z && ( ( z < 0 ) ^ zSign ) ) {
                    208:  overflow:
                    209:         float_raise( float_flag_invalid );
                    210:         return
                    211:               zSign ? (sbits64) LIT64( 0x8000000000000000 )
                    212:             : LIT64( 0x7FFFFFFFFFFFFFFF );
                    213:     }
                    214:     if ( absZ1 ) float_set_inexact();
                    215:     return z;
                    216:
                    217: }
                    218: #endif
                    219:
                    220: /*
                    221: -------------------------------------------------------------------------------
                    222: Returns the fraction bits of the single-precision floating-point value `a'.
                    223: -------------------------------------------------------------------------------
                    224: */
                    225: INLINE bits32 extractFloat32Frac( float32 a )
                    226: {
                    227:
                    228:     return a & 0x007FFFFF;
                    229:
                    230: }
                    231:
                    232: /*
                    233: -------------------------------------------------------------------------------
                    234: Returns the exponent bits of the single-precision floating-point value `a'.
                    235: -------------------------------------------------------------------------------
                    236: */
                    237: INLINE int16 extractFloat32Exp( float32 a )
                    238: {
                    239:
                    240:     return ( a>>23 ) & 0xFF;
                    241:
                    242: }
                    243:
                    244: /*
                    245: -------------------------------------------------------------------------------
                    246: Returns the sign bit of the single-precision floating-point value `a'.
                    247: -------------------------------------------------------------------------------
                    248: */
                    249: INLINE flag extractFloat32Sign( float32 a )
                    250: {
                    251:
                    252:     return a>>31;
                    253:
                    254: }
                    255:
                    256: /*
                    257: -------------------------------------------------------------------------------
                    258: Normalizes the subnormal single-precision floating-point value represented
                    259: by the denormalized significand `aSig'.  The normalized exponent and
                    260: significand are stored at the locations pointed to by `zExpPtr' and
                    261: `zSigPtr', respectively.
                    262: -------------------------------------------------------------------------------
                    263: */
                    264: static void
                    265:  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
                    266: {
                    267:     int8 shiftCount;
                    268:
                    269:     shiftCount = countLeadingZeros32( aSig ) - 8;
                    270:     *zSigPtr = aSig<<shiftCount;
                    271:     *zExpPtr = 1 - shiftCount;
                    272:
                    273: }
                    274:
                    275: /*
                    276: -------------------------------------------------------------------------------
                    277: Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
                    278: single-precision floating-point value, returning the result.  After being
                    279: shifted into the proper positions, the three fields are simply added
                    280: together to form the result.  This means that any integer portion of `zSig'
                    281: will be added into the exponent.  Since a properly normalized significand
                    282: will have an integer portion equal to 1, the `zExp' input should be 1 less
                    283: than the desired result exponent whenever `zSig' is a complete, normalized
                    284: significand.
                    285: -------------------------------------------------------------------------------
                    286: */
                    287: INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
                    288: {
                    289:
                    290:     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
                    291:
                    292: }
                    293:
                    294: /*
                    295: -------------------------------------------------------------------------------
                    296: Takes an abstract floating-point value having sign `zSign', exponent `zExp',
                    297: and significand `zSig', and returns the proper single-precision floating-
                    298: point value corresponding to the abstract input.  Ordinarily, the abstract
                    299: value is simply rounded and packed into the single-precision format, with
                    300: the inexact exception raised if the abstract input cannot be represented
                    301: exactly.  However, if the abstract value is too large, the overflow and
                    302: inexact exceptions are raised and an infinity or maximal finite value is
                    303: returned.  If the abstract value is too small, the input value is rounded to
                    304: a subnormal number, and the underflow and inexact exceptions are raised if
                    305: the abstract input cannot be represented exactly as a subnormal single-
                    306: precision floating-point number.
                    307:     The input significand `zSig' has its binary point between bits 30
                    308: and 29, which is 7 bits to the left of the usual location.  This shifted
                    309: significand must be normalized or smaller.  If `zSig' is not normalized,
                    310: `zExp' must be 0; in that case, the result returned is a subnormal number,
                    311: and it must not require rounding.  In the usual case that `zSig' is
                    312: normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
                    313: The handling of underflow and overflow follows the IEC/IEEE Standard for
                    314: Binary Floating-Point Arithmetic.
                    315: -------------------------------------------------------------------------------
                    316: */
                    317: static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
                    318: {
                    319:     int8 roundingMode;
                    320:     flag roundNearestEven;
                    321:     int8 roundIncrement, roundBits;
                    322:     flag isTiny;
                    323:
                    324:     roundingMode = float_rounding_mode();
                    325:     roundNearestEven = ( roundingMode == float_round_nearest_even );
                    326:     roundIncrement = 0x40;
                    327:     if ( ! roundNearestEven ) {
                    328:         if ( roundingMode == float_round_to_zero ) {
                    329:             roundIncrement = 0;
                    330:         }
                    331:         else {
                    332:             roundIncrement = 0x7F;
                    333:             if ( zSign ) {
                    334:                 if ( roundingMode == float_round_up ) roundIncrement = 0;
                    335:             }
                    336:             else {
                    337:                 if ( roundingMode == float_round_down ) roundIncrement = 0;
                    338:             }
                    339:         }
                    340:     }
                    341:     roundBits = zSig & 0x7F;
                    342:     if ( 0xFD <= (bits16) zExp ) {
                    343:         if (    ( 0xFD < zExp )
                    344:              || (    ( zExp == 0xFD )
                    345:                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
                    346:            ) {
                    347:             float_raise( float_flag_overflow | float_flag_inexact );
                    348:             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
                    349:         }
                    350:         if ( zExp < 0 ) {
                    351:             isTiny =
                    352:                    ( float_detect_tininess == float_tininess_before_rounding )
                    353:                 || ( zExp < -1 )
                    354:                 || ( zSig + roundIncrement < 0x80000000 );
                    355:             shift32RightJamming( zSig, - zExp, &zSig );
                    356:             zExp = 0;
                    357:             roundBits = zSig & 0x7F;
                    358:             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
                    359:         }
                    360:     }
                    361:     if ( roundBits ) float_set_inexact();
                    362:     zSig = ( zSig + roundIncrement )>>7;
                    363:     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
                    364:     if ( zSig == 0 ) zExp = 0;
                    365:     return packFloat32( zSign, zExp, zSig );
                    366:
                    367: }
                    368:
                    369: /*
                    370: -------------------------------------------------------------------------------
                    371: Takes an abstract floating-point value having sign `zSign', exponent `zExp',
                    372: and significand `zSig', and returns the proper single-precision floating-
                    373: point value corresponding to the abstract input.  This routine is just like
                    374: `roundAndPackFloat32' except that `zSig' does not have to be normalized.
                    375: Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
                    376: floating-point exponent.
                    377: -------------------------------------------------------------------------------
                    378: */
                    379: static float32
                    380:  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
                    381: {
                    382:     int8 shiftCount;
                    383:
                    384:     shiftCount = countLeadingZeros32( zSig ) - 1;
                    385:     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
                    386:
                    387: }
                    388:
                    389: /*
                    390: -------------------------------------------------------------------------------
                    391: Returns the fraction bits of the double-precision floating-point value `a'.
                    392: -------------------------------------------------------------------------------
                    393: */
                    394: INLINE bits64 extractFloat64Frac( float64 a )
                    395: {
                    396:
                    397:     return FLOAT64_DEMANGLE(a) & LIT64( 0x000FFFFFFFFFFFFF );
                    398:
                    399: }
                    400:
                    401: /*
                    402: -------------------------------------------------------------------------------
                    403: Returns the exponent bits of the double-precision floating-point value `a'.
                    404: -------------------------------------------------------------------------------
                    405: */
                    406: INLINE int16 extractFloat64Exp( float64 a )
                    407: {
                    408:
                    409:     return ( FLOAT64_DEMANGLE(a)>>52 ) & 0x7FF;
                    410:
                    411: }
                    412:
                    413: /*
                    414: -------------------------------------------------------------------------------
                    415: Returns the sign bit of the double-precision floating-point value `a'.
                    416: -------------------------------------------------------------------------------
                    417: */
                    418: INLINE flag extractFloat64Sign( float64 a )
                    419: {
                    420:
                    421:     return FLOAT64_DEMANGLE(a)>>63;
                    422:
                    423: }
                    424:
                    425: /*
                    426: -------------------------------------------------------------------------------
                    427: Normalizes the subnormal double-precision floating-point value represented
                    428: by the denormalized significand `aSig'.  The normalized exponent and
                    429: significand are stored at the locations pointed to by `zExpPtr' and
                    430: `zSigPtr', respectively.
                    431: -------------------------------------------------------------------------------
                    432: */
                    433: static void
                    434:  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
                    435: {
                    436:     int8 shiftCount;
                    437:
                    438:     shiftCount = countLeadingZeros64( aSig ) - 11;
                    439:     *zSigPtr = aSig<<shiftCount;
                    440:     *zExpPtr = 1 - shiftCount;
                    441:
                    442: }
                    443:
                    444: /*
                    445: -------------------------------------------------------------------------------
                    446: Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
                    447: double-precision floating-point value, returning the result.  After being
                    448: shifted into the proper positions, the three fields are simply added
                    449: together to form the result.  This means that any integer portion of `zSig'
                    450: will be added into the exponent.  Since a properly normalized significand
                    451: will have an integer portion equal to 1, the `zExp' input should be 1 less
                    452: than the desired result exponent whenever `zSig' is a complete, normalized
                    453: significand.
                    454: -------------------------------------------------------------------------------
                    455: */
                    456: INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
                    457: {
                    458:
                    459:     return FLOAT64_MANGLE( ( ( (bits64) zSign )<<63 ) +
                    460:                           ( ( (bits64) zExp )<<52 ) + zSig );
                    461:
                    462: }
                    463:
                    464: /*
                    465: -------------------------------------------------------------------------------
                    466: Takes an abstract floating-point value having sign `zSign', exponent `zExp',
                    467: and significand `zSig', and returns the proper double-precision floating-
                    468: point value corresponding to the abstract input.  Ordinarily, the abstract
                    469: value is simply rounded and packed into the double-precision format, with
                    470: the inexact exception raised if the abstract input cannot be represented
                    471: exactly.  However, if the abstract value is too large, the overflow and
                    472: inexact exceptions are raised and an infinity or maximal finite value is
                    473: returned.  If the abstract value is too small, the input value is rounded to
                    474: a subnormal number, and the underflow and inexact exceptions are raised if
                    475: the abstract input cannot be represented exactly as a subnormal double-
                    476: precision floating-point number.
                    477:     The input significand `zSig' has its binary point between bits 62
                    478: and 61, which is 10 bits to the left of the usual location.  This shifted
                    479: significand must be normalized or smaller.  If `zSig' is not normalized,
                    480: `zExp' must be 0; in that case, the result returned is a subnormal number,
                    481: and it must not require rounding.  In the usual case that `zSig' is
                    482: normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
                    483: The handling of underflow and overflow follows the IEC/IEEE Standard for
                    484: Binary Floating-Point Arithmetic.
                    485: -------------------------------------------------------------------------------
                    486: */
                    487: static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
                    488: {   /* Rounds away the low 10 bits of zSig and packs the result; may raise overflow, underflow or inexact. */
                    489:     int8 roundingMode;
                    490:     flag roundNearestEven;
                    491:     int16 roundIncrement, roundBits;
                    492:     flag isTiny;
                    493:     /* Choose the value added before truncation: 0x200 (half of 0x400) for nearest-even, else 0 or 0x3FF. */
                    494:     roundingMode = float_rounding_mode();
                    495:     roundNearestEven = ( roundingMode == float_round_nearest_even );
                    496:     roundIncrement = 0x200;
                    497:     if ( ! roundNearestEven ) {
                    498:         if ( roundingMode == float_round_to_zero ) {
                    499:             roundIncrement = 0; /* pure truncation */
                    500:         }
                    501:         else {
                    502:             roundIncrement = 0x3FF; /* any nonzero discarded bit bumps the result, unless cancelled below */
                    503:             if ( zSign ) {
                    504:                 if ( roundingMode == float_round_up ) roundIncrement = 0;
                    505:             }
                    506:             else {
                    507:                 if ( roundingMode == float_round_down ) roundIncrement = 0;
                    508:             }
                    509:         }
                    510:     }
                    511:     roundBits = zSig & 0x3FF; /* the 10 bits that will be shifted out */
                    512:     if ( 0x7FD <= (bits16) zExp ) { /* one test for both range ends: a negative zExp becomes a large value (assumes bits16 is unsigned) */
                    513:         if (    ( 0x7FD < zExp )
                    514:              || (    ( zExp == 0x7FD )
                    515:                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
                    516:            ) { /* exponent too large, or the increment sets bit 63 at the top exponent */
                    517:             float_raise( float_flag_overflow | float_flag_inexact );
                    518:             return FLOAT64_MANGLE(
                    519:                FLOAT64_DEMANGLE(packFloat64( zSign, 0x7FF, 0 )) -
                    520:                ( roundIncrement == 0 )); /* subtract 1 when this mode never rounds up; presumably the largest finite value - confirm MANGLE/DEMANGLE semantics */
                    521:         }
                    522:         if ( zExp < 0 ) { /* below the normal range: shift the significand down to exponent 0 */
                    523:             isTiny =
                    524:                    ( float_detect_tininess == float_tininess_before_rounding )
                    525:                 || ( zExp < -1 )
                    526:                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); /* the increment does not reach bit 63 */
                    527:             shift64RightJamming( zSig, - zExp, &zSig ); /* helper not shown; presumably keeps shifted-out bits as a sticky LSB */
                    528:             zExp = 0;
                    529:             roundBits = zSig & 0x3FF; /* recompute after the shift */
                    530:             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
                    531:         }
                    532:     }
                    533:     if ( roundBits ) float_set_inexact();
                    534:     zSig = ( zSig + roundIncrement )>>10; /* apply the increment, then drop the 10 extra bits */
                    535:     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); /* exact tie under nearest-even: clear the LSB */
                    536:     if ( zSig == 0 ) zExp = 0; /* a zero significand is packed with exponent 0 */
                    537:     return packFloat64( zSign, zExp, zSig );
                    538:
                    539: }
                    540:
                    541: /*
                    542: -------------------------------------------------------------------------------
                    543: Takes an abstract floating-point value having sign `zSign', exponent `zExp',
                    544: and significand `zSig', and returns the proper double-precision floating-
                    545: point value corresponding to the abstract input.  This routine is just like
                    546: `roundAndPackFloat64' except that `zSig' does not have to be normalized.
                    547: Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
                    548: floating-point exponent.
                    549: -------------------------------------------------------------------------------
                    550: */
                    551: static float64
                    552:  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
                    553: {
                    554:     int8 shiftCount;
                    555:     /* Leave one zero above the leading 1; a zSig with bit 63 set would make shiftCount negative. */
                    556:     shiftCount = countLeadingZeros64( zSig ) - 1;
                    557:     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount ); /* exponent lowered by the same amount the significand was raised */
                    558:
                    559: }
                    560:
                    561: #ifdef FLOATX80
                    562:
                    563: /*
                    564: -------------------------------------------------------------------------------
                    565: Returns the fraction bits of the extended double-precision floating-point
                    566: value `a'.
                    567: -------------------------------------------------------------------------------
                    568: */
                    569: INLINE bits64 extractFloatx80Frac( floatx80 a )
                    570: {
                    571:     /* The `low' member is returned unchanged; no masking is applied. */
                    572:     return a.low;
                    573:
                    574: }
                    575:
                    576: /*
                    577: -------------------------------------------------------------------------------
                    578: Returns the exponent bits of the extended double-precision floating-point
                    579: value `a'.
                    580: -------------------------------------------------------------------------------
                    581: */
                    582: INLINE int32 extractFloatx80Exp( floatx80 a )
                    583: {
                    584:     /* The exponent field is the low 15 bits of `high'. */
                    585:     return a.high & 0x7FFF;
                    586:
                    587: }
                    588:
                    589: /*
                    590: -------------------------------------------------------------------------------
                    591: Returns the sign bit of the extended double-precision floating-point value
                    592: `a'.
                    593: -------------------------------------------------------------------------------
                    594: */
                    595: INLINE flag extractFloatx80Sign( floatx80 a )
                    596: {
                    597:     /* Bit 15 of `high' is the sign; assumes `high' holds no bits above 15 - confirm against the floatx80 definition. */
                    598:     return a.high>>15;
                    599:
                    600: }
                    601:
                    602: /*
                    603: -------------------------------------------------------------------------------
                    604: Normalizes the subnormal extended double-precision floating-point value
                    605: represented by the denormalized significand `aSig'.  The normalized exponent
                    606: and significand are stored at the locations pointed to by `zExpPtr' and
                    607: `zSigPtr', respectively.
                    608: -------------------------------------------------------------------------------
                    609: */
                    610: static void
                    611:  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
                    612: {
                    613:     int8 shiftCount;
                    614:     /* Move the leading 1 of aSig up to bit 63 and lower the exponent from 1 by the same count. */
                    615:     shiftCount = countLeadingZeros64( aSig );
                    616:     *zSigPtr = aSig<<shiftCount;
                    617:     *zExpPtr = 1 - shiftCount;
                    618:
                    619: }
                    620:
                    621: /*
                    622: -------------------------------------------------------------------------------
                    623: Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
                    624: extended double-precision floating-point value, returning the result.
                    625: -------------------------------------------------------------------------------
                    626: */
                    627: INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
                    628: {
                    629:     floatx80 z;
                    630:     /* Fields are stored as given: no rounding or range checking happens here. */
                    631:     z.low = zSig;
                    632:     z.high = ( ( (bits16) zSign )<<15 ) + zExp; /* sign at bit 15, exponent added below it */
                    633:     return z;
                    634:
                    635: }
                    636:
                    637: /*
                    638: -------------------------------------------------------------------------------
                    639: Takes an abstract floating-point value having sign `zSign', exponent `zExp',
                    640: and extended significand formed by the concatenation of `zSig0' and `zSig1',
                    641: and returns the proper extended double-precision floating-point value
                    642: corresponding to the abstract input.  Ordinarily, the abstract value is
                    643: rounded and packed into the extended double-precision format, with the
                    644: inexact exception raised if the abstract input cannot be represented
                    645: exactly.  However, if the abstract value is too large, the overflow and
                    646: inexact exceptions are raised and an infinity or maximal finite value is
                    647: returned.  If the abstract value is too small, the input value is rounded to
                    648: a subnormal number, and the underflow and inexact exceptions are raised if
                    649: the abstract input cannot be represented exactly as a subnormal extended
                    650: double-precision floating-point number.
                    651:     If `roundingPrecision' is 32 or 64, the result is rounded to the same
                    652: number of bits as single or double precision, respectively.  Otherwise, the
                    653: result is rounded to the full precision of the extended double-precision
                    654: format.
                    655:     The input significand must be normalized or smaller.  If the input
                    656: significand is not normalized, `zExp' must be 0; in that case, the result
                    657: returned is a subnormal number, and it must not require rounding.  The
                    658: handling of underflow and overflow follows the IEC/IEEE Standard for Binary
                    659: Floating-Point Arithmetic.
                    660: -------------------------------------------------------------------------------
                    661: */
                    662: static floatx80
                    663:  roundAndPackFloatx80(
                    664:      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
                    665:  )
                    666: {
                    667:     int8 roundingMode;
                    668:     flag roundNearestEven, increment, isTiny;
                    669:     int64 roundIncrement, roundMask, roundBits;
                    670:     /* Precisions 64 and 32 round inside zSig0 using a mask; every other value takes the precision80 path. */
                    671:     roundingMode = float_rounding_mode();
                    672:     roundNearestEven = ( roundingMode == float_round_nearest_even );
                    673:     if ( roundingPrecision == 80 ) goto precision80;
                    674:     if ( roundingPrecision == 64 ) {
                    675:         roundIncrement = LIT64( 0x0000000000000400 ); /* half of the 11 low bits being dropped */
                    676:         roundMask = LIT64( 0x00000000000007FF );
                    677:     }
                    678:     else if ( roundingPrecision == 32 ) {
                    679:         roundIncrement = LIT64( 0x0000008000000000 ); /* half of the 40 low bits being dropped */
                    680:         roundMask = LIT64( 0x000000FFFFFFFFFF );
                    681:     }
                    682:     else {
                    683:         goto precision80;
                    684:     }
                    685:     zSig0 |= ( zSig1 != 0 ); /* fold a nonzero zSig1 into the LSB of zSig0 as a sticky bit */
                    686:     if ( ! roundNearestEven ) {
                    687:         if ( roundingMode == float_round_to_zero ) {
                    688:             roundIncrement = 0;
                    689:         }
                    690:         else {
                    691:             roundIncrement = roundMask; /* any nonzero discarded bit bumps the result, unless cancelled below */
                    692:             if ( zSign ) {
                    693:                 if ( roundingMode == float_round_up ) roundIncrement = 0;
                    694:             }
                    695:             else {
                    696:                 if ( roundingMode == float_round_down ) roundIncrement = 0;
                    697:             }
                    698:         }
                    699:     }
                    700:     roundBits = zSig0 & roundMask;
                    701:     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { /* true when zExp <= 0 or zExp >= 0x7FFE (assumes bits32 is unsigned) */
                    702:         if (    ( 0x7FFE < zExp )
                    703:              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
                    704:            ) { /* exponent too large, or the increment wraps zSig0 at the top exponent */
                    705:             goto overflow;
                    706:         }
                    707:         if ( zExp <= 0 ) {
                    708:             isTiny =
                    709:                    ( float_detect_tininess == float_tininess_before_rounding )
                    710:                 || ( zExp < 0 )
                    711:                 || ( zSig0 <= zSig0 + roundIncrement ); /* adding the increment does not wrap zSig0 */
                    712:             shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); /* helper not shown; presumably keeps shifted-out bits as a sticky LSB */
                    713:             zExp = 0;
                    714:             roundBits = zSig0 & roundMask; /* recompute after the shift */
                    715:             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
                    716:             if ( roundBits ) float_set_inexact();
                    717:             zSig0 += roundIncrement;
                    718:             if ( (sbits64) zSig0 < 0 ) zExp = 1; /* bit 63 became set after rounding: exponent becomes 1 */
                    719:             roundIncrement = roundMask + 1; /* now the value of the lowest kept bit */
                    720:             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                    721:                 roundMask |= roundIncrement; /* exact tie: clear the lowest kept bit as well */
                    722:             }
                    723:             zSig0 &= ~ roundMask;
                    724:             return packFloatx80( zSign, zExp, zSig0 );
                    725:         }
                    726:     }
                    727:     if ( roundBits ) float_set_inexact();
                    728:     zSig0 += roundIncrement;
                    729:     if ( zSig0 < roundIncrement ) { /* the addition wrapped past bit 63 */
                    730:         ++zExp;
                    731:         zSig0 = LIT64( 0x8000000000000000 );
                    732:     }
                    733:     roundIncrement = roundMask + 1; /* now the value of the lowest kept bit */
                    734:     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
                    735:         roundMask |= roundIncrement; /* exact tie: clear the lowest kept bit as well */
                    736:     }
                    737:     zSig0 &= ~ roundMask;
                    738:     if ( zSig0 == 0 ) zExp = 0;
                    739:     return packFloatx80( zSign, zExp, zSig0 );
                    740:  precision80: /* all 64 bits of zSig0 are kept; zSig1 decides the rounding */
                    741:     increment = ( (sbits64) zSig1 < 0 ); /* nearest-even default: top bit of zSig1 set */
                    742:     if ( ! roundNearestEven ) {
                    743:         if ( roundingMode == float_round_to_zero ) {
                    744:             increment = 0;
                    745:         }
                    746:         else {
                    747:             if ( zSign ) {
                    748:                 increment = ( roundingMode == float_round_down ) && zSig1;
                    749:             }
                    750:             else {
                    751:                 increment = ( roundingMode == float_round_up ) && zSig1;
                    752:             }
                    753:         }
                    754:     }
                    755:     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { /* same range test as in the reduced-precision path */
                    756:         if (    ( 0x7FFE < zExp )
                    757:              || (    ( zExp == 0x7FFE )
                    758:                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
                    759:                   && increment
                    760:                 )
                    761:            ) {
                    762:             roundMask = 0; /* makes ~roundMask below an all-ones significand */
                    763:  overflow: /* also entered by goto from the reduced-precision path, which keeps its own roundMask */
                    764:             float_raise( float_flag_overflow | float_flag_inexact );
                    765:             if (    ( roundingMode == float_round_to_zero )
                    766:                  || ( zSign && ( roundingMode == float_round_up ) )
                    767:                  || ( ! zSign && ( roundingMode == float_round_down ) )
                    768:                ) {
                    769:                 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); /* these modes return exponent 0x7FFE with every kept bit set */
                    770:             }
                    771:             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); /* otherwise exponent 0x7FFF with only bit 63 set */
                    772:         }
                    773:         if ( zExp <= 0 ) {
                    774:             isTiny =
                    775:                    ( float_detect_tininess == float_tininess_before_rounding )
                    776:                 || ( zExp < 0 )
                    777:                 || ! increment
                    778:                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); /* incrementing would not wrap zSig0 */
                    779:             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); /* helper not shown; presumably shifts the pair right with a sticky bit in zSig1 */
                    780:             zExp = 0;
                    781:             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
                    782:             if ( zSig1 ) float_set_inexact();
                    783:             if ( roundNearestEven ) { /* re-derive the increment from the shifted zSig1 */
                    784:                 increment = ( (sbits64) zSig1 < 0 );
                    785:             }
                    786:             else {
                    787:                 if ( zSign ) {
                    788:                     increment = ( roundingMode == float_round_down ) && zSig1;
                    789:                 }
                    790:                 else {
                    791:                     increment = ( roundingMode == float_round_up ) && zSig1;
                    792:                 }
                    793:             }
                    794:             if ( increment ) {
                    795:                 ++zSig0;
                    796:                 zSig0 &=
                    797:                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); /* nothing below the top bit of zSig1: exact tie, clear the LSB */
                    798:                 if ( (sbits64) zSig0 < 0 ) zExp = 1; /* bit 63 became set after rounding: exponent becomes 1 */
                    799:             }
                    800:             return packFloatx80( zSign, zExp, zSig0 );
                    801:         }
                    802:     }
                    803:     if ( zSig1 ) float_set_inexact();
                    804:     if ( increment ) {
                    805:         ++zSig0;
                    806:         if ( zSig0 == 0 ) { /* increment wrapped from all ones */
                    807:             ++zExp;
                    808:             zSig0 = LIT64( 0x8000000000000000 );
                    809:         }
                    810:         else {
                    811:             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); /* exact tie under nearest-even: clear the LSB */
                    812:         }
                    813:     }
                    814:     else {
                    815:         if ( zSig0 == 0 ) zExp = 0; /* a zero significand is packed with exponent 0 */
                    816:     }
                    817:     return packFloatx80( zSign, zExp, zSig0 );
                    818:
                    819: }
                    820:
                    821: /*
                    822: -------------------------------------------------------------------------------
                    823: Takes an abstract floating-point value having sign `zSign', exponent
                    824: `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
                    825: and returns the proper extended double-precision floating-point value
                    826: corresponding to the abstract input.  This routine is just like
                    827: `roundAndPackFloatx80' except that the input significand does not have to be
                    828: normalized.
                    829: -------------------------------------------------------------------------------
                    830: */
                    831: static floatx80
                    832:  normalizeRoundAndPackFloatx80(
                    833:      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
                    834:  )
                    835: {
                    836:     int8 shiftCount;
                    837:     /* If the upper word is empty, promote zSig1 into it and account for the 64 bit positions skipped. */
                    838:     if ( zSig0 == 0 ) {
                    839:         zSig0 = zSig1;
                    840:         zSig1 = 0;
                    841:         zExp -= 64;
                    842:     }
                    843:     shiftCount = countLeadingZeros64( zSig0 );
                    844:     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); /* bring the leading 1 of zSig0 up to bit 63 */
                    845:     zExp -= shiftCount;
                    846:     return
                    847:         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
                    848:
                    849: }
                    850:
                    851: #endif
                    852:
                    853: #ifdef FLOAT128
                    854:
                    855: /*
                    856: -------------------------------------------------------------------------------
                    857: Returns the least-significant 64 fraction bits of the quadruple-precision
                    858: floating-point value `a'.
                    859: -------------------------------------------------------------------------------
                    860: */
                    861: INLINE bits64 extractFloat128Frac1( float128 a )
                    862: {
                    863:     /* The `low' member is returned unchanged; no masking is applied. */
                    864:     return a.low;
                    865:
                    866: }
                    867:
                    868: /*
                    869: -------------------------------------------------------------------------------
                    870: Returns the most-significant 48 fraction bits of the quadruple-precision
                    871: floating-point value `a'.
                    872: -------------------------------------------------------------------------------
                    873: */
                    874: INLINE bits64 extractFloat128Frac0( float128 a )
                    875: {
                    876:     /* The mask keeps bits 0..47 of `high' and discards everything above. */
                    877:     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
                    878:
                    879: }
                    880:
                    881: /*
                    882: -------------------------------------------------------------------------------
                    883: Returns the exponent bits of the quadruple-precision floating-point value
                    884: `a'.
                    885: -------------------------------------------------------------------------------
                    886: */
                    887: INLINE int32 extractFloat128Exp( float128 a )
                    888: {
                    889:     /* The exponent field is the 15 bits of `high' starting at bit 48. */
                    890:     return ( a.high>>48 ) & 0x7FFF;
                    891:
                    892: }
                    893:
                    894: /*
                    895: -------------------------------------------------------------------------------
                    896: Returns the sign bit of the quadruple-precision floating-point value `a'.
                    897: -------------------------------------------------------------------------------
                    898: */
                    899: INLINE flag extractFloat128Sign( float128 a )
                    900: {
                    901:     /* Bit 63 of `high' is the sign; assumes `high' is an unsigned 64-bit word - confirm against the float128 definition. */
                    902:     return a.high>>63;
                    903:
                    904: }
                    905:
                    906: /*
                    907: -------------------------------------------------------------------------------
                    908: Normalizes the subnormal quadruple-precision floating-point value
                    909: represented by the denormalized significand formed by the concatenation of
                    910: `aSig0' and `aSig1'.  The normalized exponent is stored at the location
                    911: pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
                    912: significand are stored at the location pointed to by `zSig0Ptr', and the
                    913: least significant 64 bits of the normalized significand are stored at the
                    914: location pointed to by `zSig1Ptr'.
                    915: -------------------------------------------------------------------------------
                    916: */
                    917: static void
                    918:  normalizeFloat128Subnormal(
                    919:      bits64 aSig0,
                    920:      bits64 aSig1,
                    921:      int32 *zExpPtr,
                    922:      bits64 *zSig0Ptr,
                    923:      bits64 *zSig1Ptr
                    924:  )
                    925: {
                    926:     int8 shiftCount;
                    927:     /* The leading 1 is placed at bit 48 of the upper word, i.e. with 15 leading zeros above it. */
                    928:     if ( aSig0 == 0 ) {
                    929:         shiftCount = countLeadingZeros64( aSig1 ) - 15;
                    930:         if ( shiftCount < 0 ) { /* leading 1 of aSig1 is above bit 48: split aSig1 across both output words */
                    931:             *zSig0Ptr = aSig1>>( - shiftCount );
                    932:             *zSig1Ptr = aSig1<<( shiftCount & 63 ); /* shiftCount & 63 equals 64 + shiftCount here: keeps the bits the right shift dropped */
                    933:         }
                    934:         else {
                    935:             *zSig0Ptr = aSig1<<shiftCount;
                    936:             *zSig1Ptr = 0;
                    937:         }
                    938:         *zExpPtr = - shiftCount - 63; /* equals 1 - ( shiftCount + 64 ): aSig1 sits 64 bit positions lower */
                    939:     }
                    940:     else {
                    941:         shiftCount = countLeadingZeros64( aSig0 ) - 15;
                    942:         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
                    943:         *zExpPtr = 1 - shiftCount;
                    944:     }
                    945:
                    946: }
                    947:
                    948: /*
                    949: -------------------------------------------------------------------------------
                    950: Packs the sign `zSign', the exponent `zExp', and the significand formed
                    951: by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
                    952: floating-point value, returning the result.  After being shifted into the
                    953: proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
                    954: added together to form the most significant 64 bits of the result.  This
                    955: means that any integer portion of `zSig0' will be added into the exponent.
                    956: Since a properly normalized significand will have an integer portion equal
                    957: to 1, the `zExp' input should be 1 less than the desired result exponent
                    958: whenever `zSig0' and `zSig1' concatenated form a complete, normalized
                    959: significand.
                    960: -------------------------------------------------------------------------------
                    961: */
                    962: INLINE float128
                    963:  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
                    964: {
                    965:     float128 z;
                    966:     /* Fields are summed, not ORed, so any bits of zSig0 at or above bit 48 carry into the exponent field. */
                    967:     z.low = zSig1;
                    968:     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
                    969:     return z;
                    970:
                    971: }
                    972:
                    973: /*
                    974: -------------------------------------------------------------------------------
                    975: Takes an abstract floating-point value having sign `zSign', exponent `zExp',
                    976: and extended significand formed by the concatenation of `zSig0', `zSig1',
                    977: and `zSig2', and returns the proper quadruple-precision floating-point value
                    978: corresponding to the abstract input.  Ordinarily, the abstract value is
                    979: simply rounded and packed into the quadruple-precision format, with the
                    980: inexact exception raised if the abstract input cannot be represented
                    981: exactly.  However, if the abstract value is too large, the overflow and
                    982: inexact exceptions are raised and an infinity or maximal finite value is
                    983: returned.  If the abstract value is too small, the input value is rounded to
                    984: a subnormal number, and the underflow and inexact exceptions are raised if
                    985: the abstract input cannot be represented exactly as a subnormal quadruple-
                    986: precision floating-point number.
                    987:     The input significand must be normalized or smaller.  If the input
                    988: significand is not normalized, `zExp' must be 0; in that case, the result
                    989: returned is a subnormal number, and it must not require rounding.  In the
                    990: usual case that the input significand is normalized, `zExp' must be 1 less
                    991: than the ``true'' floating-point exponent.  The handling of underflow and
                    992: overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
                    993: -------------------------------------------------------------------------------
                    994: */
                    995: static float128
                    996:  roundAndPackFloat128(
                    997:      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
                    998: {
                    999:     int8 roundingMode;
                   1000:     flag roundNearestEven, increment, isTiny;
                   1001:
                   1002:     roundingMode = float_rounding_mode();
                   1003:     roundNearestEven = ( roundingMode == float_round_nearest_even );
                   1004:     increment = ( (sbits64) zSig2 < 0 );
                   1005:     if ( ! roundNearestEven ) {
                   1006:         if ( roundingMode == float_round_to_zero ) {
                   1007:             increment = 0;
                   1008:         }
                   1009:         else {
                   1010:             if ( zSign ) {
                   1011:                 increment = ( roundingMode == float_round_down ) && zSig2;
                   1012:             }
                   1013:             else {
                   1014:                 increment = ( roundingMode == float_round_up ) && zSig2;
                   1015:             }
                   1016:         }
                   1017:     }
                   1018:     if ( 0x7FFD <= (bits32) zExp ) {
                   1019:         if (    ( 0x7FFD < zExp )
                   1020:              || (    ( zExp == 0x7FFD )
                   1021:                   && eq128(
                   1022:                          LIT64( 0x0001FFFFFFFFFFFF ),
                   1023:                          LIT64( 0xFFFFFFFFFFFFFFFF ),
                   1024:                          zSig0,
                   1025:                          zSig1
                   1026:                      )
                   1027:                   && increment
                   1028:                 )
                   1029:            ) {
                   1030:             float_raise( float_flag_overflow | float_flag_inexact );
                   1031:             if (    ( roundingMode == float_round_to_zero )
                   1032:                  || ( zSign && ( roundingMode == float_round_up ) )
                   1033:                  || ( ! zSign && ( roundingMode == float_round_down ) )
                   1034:                ) {
                   1035:                 return
                   1036:                     packFloat128(
                   1037:                         zSign,
                   1038:                         0x7FFE,
                   1039:                         LIT64( 0x0000FFFFFFFFFFFF ),
                   1040:                         LIT64( 0xFFFFFFFFFFFFFFFF )
                   1041:                     );
                   1042:             }
                   1043:             return packFloat128( zSign, 0x7FFF, 0, 0 );
                   1044:         }
                   1045:         if ( zExp < 0 ) {
                   1046:             isTiny =
                   1047:                    ( float_detect_tininess == float_tininess_before_rounding )
                   1048:                 || ( zExp < -1 )
                   1049:                 || ! increment
                   1050:                 || lt128(
                   1051:                        zSig0,
                   1052:                        zSig1,
                   1053:                        LIT64( 0x0001FFFFFFFFFFFF ),
                   1054:                        LIT64( 0xFFFFFFFFFFFFFFFF )
                   1055:                    );
                   1056:             shift128ExtraRightJamming(
                   1057:                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
                   1058:             zExp = 0;
                   1059:             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
                   1060:             if ( roundNearestEven ) {
                   1061:                 increment = ( (sbits64) zSig2 < 0 );
                   1062:             }
                   1063:             else {
                   1064:                 if ( zSign ) {
                   1065:                     increment = ( roundingMode == float_round_down ) && zSig2;
                   1066:                 }
                   1067:                 else {
                   1068:                     increment = ( roundingMode == float_round_up ) && zSig2;
                   1069:                 }
                   1070:             }
                   1071:         }
                   1072:     }
                   1073:     if ( zSig2 ) float_set_inexact();
                   1074:     if ( increment ) {
                   1075:         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
                   1076:         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
                   1077:     }
                   1078:     else {
                   1079:         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
                   1080:     }
                   1081:     return packFloat128( zSign, zExp, zSig0, zSig1 );
                   1082:
                   1083: }
                   1084:
                   1085: /*
                   1086: -------------------------------------------------------------------------------
                   1087: Normalizes, rounds, and packs an abstract quadruple-precision value.  The
                   1088: value has sign `zSign', exponent `zExp', and a significand that is the
                   1089: concatenation of `zSig0' and `zSig1'.  Unlike `roundAndPackFloat128', the
                   1090: significand supplied here carries fewer bits and need not be normalized on
                   1091: entry; it is shifted into position first and the result is then handed to
                   1092: `roundAndPackFloat128'.  In all cases, `zExp' must be 1 less than the
                   1093: ``true'' floating-point exponent.
                   1094: -------------------------------------------------------------------------------
                   1095: */
                   1096: static float128
                   1097:  normalizeRoundAndPackFloat128(
                   1098:      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
                   1099: {
                   1100:     bits64 zSig2 = 0;
                   1101:     int8 shift;
                   1102:
                   1103:     /* An empty high word is dropped by moving the low word up 64 places. */
                   1104:     if ( ! zSig0 ) {
                   1105:         zExp -= 64;
                   1106:         zSig0 = zSig1;
                   1107:         zSig1 = 0;
                   1108:     }
                   1109:     shift = countLeadingZeros64( zSig0 ) - 15;
                   1110:     zExp -= shift;
                   1111:     if ( shift < 0 ) {
                   1112:         shift128ExtraRightJamming(
                   1113:             zSig0, zSig1, 0, - shift, &zSig0, &zSig1, &zSig2 );
                   1114:     }
                   1115:     else {
                   1116:         shortShift128Left( zSig0, zSig1, shift, &zSig0, &zSig1 );
                   1117:     }
                   1118:     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
                   1119:
                   1120: }
                   1121:
                   1122: #endif
                   1123:
                   1124: /*
                   1125: -------------------------------------------------------------------------------
                   1126: Converts the 32-bit two's complement integer `a' to the single-precision
                   1127: floating-point format, per the IEC/IEEE Standard for Binary Floating-Point
                   1128: Arithmetic.  Zero and the most negative input are handled as special cases.
                   1129: -------------------------------------------------------------------------------
                   1130: */
                   1131: float32 int32_to_float32( int32 a )
                   1132: {
                   1133:     flag negative = ( a < 0 );
                   1134:
                   1135:     if ( ! a ) return 0;
                   1136:     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
                   1137:     if ( negative ) a = - a;
                   1138:     return normalizeRoundAndPackFloat32( negative, 0x9C, a );
                   1139:
                   1140: }
                   1141:
                   1142: /*
                   1143: -------------------------------------------------------------------------------
                   1144: Converts the 32-bit two's complement integer `a' to the double-precision
                   1145: floating-point format, per the IEC/IEEE Standard for Binary Floating-Point
                   1146: Arithmetic.  The magnitude is taken in unsigned arithmetic (no overflow).
                   1147: -------------------------------------------------------------------------------
                   1148: */
                   1149: float64 int32_to_float64( int32 a )
                   1150: {
                   1151:     flag zSign;
                   1152:     uint32 absA;
                   1153:     int8 shiftCount;
                   1154:     bits64 zSig;
                   1155:
                   1156:     if ( a == 0 ) return 0;
                   1157:     zSign = ( a < 0 );
                   1158:     absA = zSign ? - (uint32) a : (uint32) a;  /* safe for the minimum int32 */
                   1159:     shiftCount = countLeadingZeros32( absA ) + 21;
                   1160:     zSig = absA;
                   1161:     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
                   1162:
                   1163: }
                   1164:
                   1165: #ifdef FLOATX80
                   1166:
                   1167: /*
                   1168: -------------------------------------------------------------------------------
                   1169: Converts the 32-bit two's complement integer `a' to the extended double-
                   1170: precision floating-point format, per the IEC/IEEE Standard for Binary
                   1171: Floating-Point Arithmetic.  The magnitude of `a' is taken in unsigned
                   1172: arithmetic, so negating the most negative input cannot overflow.
                   1173: -------------------------------------------------------------------------------
                   1174: */
                   1175: floatx80 int32_to_floatx80( int32 a )
                   1176: {
                   1177:     flag zSign;
                   1178:     uint32 absA;
                   1179:     int8 shiftCount;
                   1180:     bits64 zSig;
                   1181:
                   1182:     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
                   1183:     zSign = ( a < 0 );
                   1184:     absA = zSign ? - (uint32) a : (uint32) a;  /* safe for the minimum int32 */
                   1185:     shiftCount = countLeadingZeros32( absA ) + 32;
                   1186:     zSig = absA;
                   1187:     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
                   1188:
                   1189: }
                   1190:
                   1191: #endif
                   1192:
                   1193: #ifdef FLOAT128
                   1194:
                   1195: /*
                   1196: -------------------------------------------------------------------------------
                   1197: Converts the 32-bit two's complement integer `a' to the quadruple-precision
                   1198: floating-point format, per the IEC/IEEE Standard for Binary Floating-Point
                   1199: Arithmetic.  The magnitude is taken in unsigned arithmetic (no overflow).
                   1200: -------------------------------------------------------------------------------
                   1201: */
                   1202: float128 int32_to_float128( int32 a )
                   1203: {
                   1204:     flag zSign;
                   1205:     uint32 absA;
                   1206:     int8 shiftCount;
                   1207:     bits64 zSig0;
                   1208:
                   1209:     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
                   1210:     zSign = ( a < 0 );
                   1211:     absA = zSign ? - (uint32) a : (uint32) a;  /* safe for the minimum int32 */
                   1212:     shiftCount = countLeadingZeros32( absA ) + 17;
                   1213:     zSig0 = absA;
                   1214:     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
                   1215:
                   1216: }
                   1217:
                   1218: #endif
                   1219:
                   1220: #ifndef SOFTFLOAT_FOR_GCC /* __floatdi?f is in libgcc2.c */
                   1221: /*
                   1222: -------------------------------------------------------------------------------
                   1223: Converts the 64-bit two's complement integer `a' to the single-precision
                   1224: floating-point format, per the IEC/IEEE Standard for Binary Floating-Point
                   1225: Arithmetic.  The magnitude is taken in unsigned arithmetic (no overflow).
                   1226: -------------------------------------------------------------------------------
                   1227: */
                   1228: float32 int64_to_float32( int64 a )
                   1229: {
                   1230:     flag zSign;
                   1231:     uint64 absA;
                   1232:     int8 shiftCount;
                   1233:
                   1234:     if ( a == 0 ) return 0;
                   1235:     zSign = ( a < 0 );
                   1236:     absA = zSign ? - (uint64) a : (uint64) a;  /* safe for the minimum int64 */
                   1237:     shiftCount = countLeadingZeros64( absA ) - 40;
                   1238:     if ( 0 <= shiftCount ) {  /* packed directly, no rounding call */
                   1239:         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
                   1240:     }
                   1241:     else {
                   1242:         shiftCount += 7;
                   1243:         if ( shiftCount < 0 ) {
                   1244:             shift64RightJamming( absA, - shiftCount, &absA );
                   1245:         }
                   1246:         else {
                   1247:             absA <<= shiftCount;
                   1248:         }
                   1249:         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
                   1250:     }
                   1251:
                   1252: }
                   1253:
                   1254: /*
                   1255: -------------------------------------------------------------------------------
                   1256: Converts the 64-bit two's complement integer `a' to the double-precision
                   1257: floating-point format, per the IEC/IEEE Standard for Binary Floating-Point
                   1258: Arithmetic.  Zero and the most negative input are handled as special cases.
                   1259: -------------------------------------------------------------------------------
                   1260: */
                   1261: float64 int64_to_float64( int64 a )
                   1262: {
                   1263:     flag negative = ( a < 0 );
                   1264:
                   1265:     if ( ! a ) return 0;
                   1266:     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
                   1267:         return packFloat64( 1, 0x43E, 0 );
                   1268:     }
                   1269:     if ( negative ) a = - a;
                   1270:     return normalizeRoundAndPackFloat64( negative, 0x43C, a );
                   1271:
                   1272: }
                   1273:
                   1274: #ifdef FLOATX80
                   1275:
                   1276: /*
                   1277: -------------------------------------------------------------------------------
                   1278: Converts the 64-bit two's complement integer `a' to the extended double-
                   1279: precision floating-point format, per the IEC/IEEE Standard for Binary
                   1280: Floating-Point Arithmetic.  The magnitude of `a' is taken in unsigned
                   1281: arithmetic, so negating the most negative input cannot overflow.
                   1282: -------------------------------------------------------------------------------
                   1283: */
                   1284: floatx80 int64_to_floatx80( int64 a )
                   1285: {
                   1286:     flag zSign;
                   1287:     uint64 absA;
                   1288:     int8 shiftCount;
                   1289:
                   1290:     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
                   1291:     zSign = ( a < 0 );
                   1292:     absA = zSign ? - (uint64) a : (uint64) a;  /* safe for the minimum int64 */
                   1293:     shiftCount = countLeadingZeros64( absA );
                   1294:     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
                   1295:
                   1296: }
                   1297:
                   1298: #endif
                   1299:
                   1300: #ifdef FLOAT128
                   1301:
                   1302: /*
                   1303: -------------------------------------------------------------------------------
                   1304: Converts the 64-bit two's complement integer `a' to the quadruple-precision
                   1305: floating-point format, per the IEC/IEEE Standard for Binary Floating-Point
                   1306: Arithmetic.  The magnitude is taken in unsigned arithmetic (no overflow).
                   1307: -------------------------------------------------------------------------------
                   1308: */
                   1309: float128 int64_to_float128( int64 a )
                   1310: {
                   1311:     flag zSign;
                   1312:     uint64 absA;
                   1313:     int8 shiftCount;
                   1314:     int32 zExp;
                   1315:     bits64 zSig0, zSig1;
                   1316:
                   1317:     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
                   1318:     zSign = ( a < 0 );
                   1319:     absA = zSign ? - (uint64) a : (uint64) a;  /* safe for the minimum int64 */
                   1320:     shiftCount = countLeadingZeros64( absA ) + 49;
                   1321:     zExp = 0x406E - shiftCount;
                   1322:     if ( 64 <= shiftCount ) {  /* take 64 of the shift by word placement */
                   1323:         zSig1 = 0;
                   1324:         zSig0 = absA;
                   1325:         shiftCount -= 64;
                   1326:     }
                   1327:     else {
                   1328:         zSig1 = absA;
                   1329:         zSig0 = 0;
                   1330:     }
                   1331:     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
                   1332:     return packFloat128( zSign, zExp, zSig0, zSig1 );
                   1333:
                   1334: }
                   1335:
                   1336: #endif
                   1337: #endif /* !SOFTFLOAT_FOR_GCC */
                   1338:
                   1339: #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
                   1340: /*
                   1341: -------------------------------------------------------------------------------
                   1342: Converts the single-precision floating-point value `a' to the 32-bit two's
                   1343: complement integer format and returns the result.  The conversion follows
                   1344: the IEC/IEEE Standard for Binary Floating-Point Arithmetic, so it rounds
                   1345: according to the current rounding mode.  A NaN input yields the largest
                   1346: positive integer.  Any other input whose conversion overflows yields the
                   1347: largest integer with the same sign as `a'.  The significand is widened to
                   1348: 64 bits and the final rounding is delegated to `roundAndPackInt32'.
                   1349: -------------------------------------------------------------------------------
                   1350: */
                   1351: int32 float32_to_int32( float32 a )
                   1352: {
                   1353:     flag aSign = extractFloat32Sign( a );
                   1354:     int16 aExp = extractFloat32Exp( a );
                   1355:     bits32 aSig = extractFloat32Frac( a );
                   1356:     int16 shiftCount = 0xAF - aExp;
                   1357:     bits64 wide;
                   1358:
                   1359:     /* Exponent 0xFF with a nonzero fraction: clear the sign. */
                   1360:     if ( aSig && ( aExp == 0xFF ) ) aSign = 0;
                   1361:     /* A nonzero exponent field means bit 23 must be set. */
                   1362:     if ( aExp != 0 ) aSig |= 0x00800000;
                   1363:     wide = ( (bits64) aSig )<<32;
                   1364:     if ( shiftCount > 0 ) {
                   1365:         shift64RightJamming( wide, shiftCount, &wide );
                   1366:     }
                   1367:     return roundAndPackInt32( aSign, wide );
                   1368:
                   1369: }
                   1370: #endif /* !SOFTFLOAT_FOR_GCC */
                   1371:
                   1372: /*
                   1373: -------------------------------------------------------------------------------
                   1374: Converts the single-precision floating-point value `a' to the 32-bit two's
                   1375: complement integer format and returns the result.  The conversion follows
                   1376: the IEC/IEEE Standard for Binary Floating-Point Arithmetic, except that it
                   1377: always rounds toward zero regardless of the current rounding mode.  A NaN
                   1378: input yields the largest positive integer.  Any other input whose
                   1379: conversion overflows yields the largest integer with the same sign as `a'.
                   1380: The invalid flag is raised in both of those cases.
                   1381: -------------------------------------------------------------------------------
                   1382: */
                   1383: int32 float32_to_int32_round_to_zero( float32 a )
                   1384: {
                   1385:     bits32 aSig = extractFloat32Frac( a );
                   1386:     int16 aExp = extractFloat32Exp( a );
                   1387:     flag aSign = extractFloat32Sign( a );
                   1388:     int16 shiftCount = aExp - 0x9E;
                   1389:     int32 z;
                   1390:
                   1391:     if ( shiftCount >= 0 ) {
                   1392:         /* 0xCF000000 converts to 0x80000000 without raising a flag. */
                   1393:         if ( a == 0xCF000000 ) return (sbits32) 0x80000000;
                   1394:         float_raise( float_flag_invalid );
                   1395:         if ( aSign && ! ( ( aExp == 0xFF ) && aSig ) ) {
                   1396:             return (sbits32) 0x80000000;
                   1397:         }
                   1398:         return 0x7FFFFFFF;
                   1399:     }
                   1400:     if ( aExp <= 0x7E ) {
                   1401:         /* Result is zero; inexact unless exponent and fraction are zero. */
                   1402:         if ( aExp | aSig ) float_set_inexact();
                   1403:         return 0;
                   1404:     }
                   1405:     /* Set bit 23, then move the significand up 8 bit positions. */
                   1406:     aSig = ( aSig | 0x00800000 )<<8;
                   1407:     z = aSig>>( - shiftCount );
                   1408:     /* Nonzero bits shifted out on the right make the result inexact. */
                   1409:     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) float_set_inexact();
                   1410:     if ( aSign ) z = - z;
                   1411:     return z;
                   1412:
                   1413: }
                   1414:
                   1415: #ifndef SOFTFLOAT_FOR_GCC /* __fix?fdi provided by libgcc2.c */
                   1416: /*
                   1417: -------------------------------------------------------------------------------
                   1418: Converts the single-precision floating-point value `a' to the 64-bit two's
                   1419: complement integer format and returns the result.  The conversion follows
                   1420: the IEC/IEEE Standard for Binary Floating-Point Arithmetic, so it rounds
                   1421: according to the current rounding mode.  A NaN input yields the largest
                   1422: positive integer.  Any other input whose conversion overflows yields the
                   1423: largest integer with the same sign as `a'.  Final rounding is delegated to
                   1424: `roundAndPackInt64'.
                   1425: -------------------------------------------------------------------------------
                   1426: */
                   1427: int64 float32_to_int64( float32 a )
                   1428: {
                   1429:     bits32 aSig = extractFloat32Frac( a );
                   1430:     int16 aExp = extractFloat32Exp( a );
                   1431:     flag aSign = extractFloat32Sign( a );
                   1432:     int16 shiftCount = 0xBE - aExp;
                   1433:     bits64 sig;
                   1434:     bits64 sigExtra;
                   1435:
                   1436:     if ( shiftCount < 0 ) {
                   1437:         /* Exponent above 0xBE: raise invalid, return a saturated value. */
                   1438:         float_raise( float_flag_invalid );
                   1439:         if ( aSign && ! ( ( aExp == 0xFF ) && aSig ) ) {
                   1440:             return (sbits64) LIT64( 0x8000000000000000 );
                   1441:         }
                   1442:         return LIT64( 0x7FFFFFFFFFFFFFFF );
                   1443:     }
                   1444:     /* A nonzero exponent field means bit 23 must be set. */
                   1445:     if ( aExp != 0 ) aSig |= 0x00800000;
                   1446:     sig = ( (bits64) aSig )<<40;
                   1447:     shift64ExtraRightJamming( sig, 0, shiftCount, &sig, &sigExtra );
                   1448:     /* Rounding is delegated to roundAndPackInt64. */
                   1449:     return roundAndPackInt64( aSign, sig, sigExtra );
                   1450:
                   1451: }
                   1452:
                   1453: /*
                   1454: -------------------------------------------------------------------------------
                   1455: Converts the single-precision floating-point value `a' to the 64-bit two's
                   1456: complement integer format and returns the result.  The conversion follows
                   1457: the IEC/IEEE Standard for Binary Floating-Point Arithmetic, except that it
                   1458: always rounds toward zero regardless of the current rounding mode.  A NaN
                   1459: input yields the largest positive integer.  Any other input whose
                   1460: conversion overflows yields the largest integer with the same sign as `a'.
                   1461: The invalid flag is raised in both of those cases.
                   1462: -------------------------------------------------------------------------------
                   1463: */
                   1464: int64 float32_to_int64_round_to_zero( float32 a )
                   1465: {
                   1466:     bits32 aSig = extractFloat32Frac( a );
                   1467:     int16 aExp = extractFloat32Exp( a );
                   1468:     flag aSign = extractFloat32Sign( a );
                   1469:     int16 shiftCount = aExp - 0xBE;
                   1470:     bits64 sig;
                   1471:     int64 z;
                   1472:
                   1473:     if ( shiftCount >= 0 ) {
                   1474:         /* 0xDF000000 converts to 0x8000000000000000, no flag raised. */
                   1475:         if ( a == 0xDF000000 ) {
                   1476:             return (sbits64) LIT64( 0x8000000000000000 );
                   1477:         }
                   1478:         float_raise( float_flag_invalid );
                   1479:         if ( aSign && ! ( ( aExp == 0xFF ) && aSig ) ) {
                   1480:             return (sbits64) LIT64( 0x8000000000000000 );
                   1481:         }
                   1482:         return LIT64( 0x7FFFFFFFFFFFFFFF );
                   1483:     }
                   1484:     if ( aExp <= 0x7E ) {
                   1485:         /* Result is zero; inexact unless exponent and fraction are zero. */
                   1486:         if ( aExp | aSig ) float_set_inexact();
                   1487:         return 0;
                   1488:     }
                   1489:     sig = ( (bits64) ( aSig | 0x00800000 ) )<<40;
                   1490:     z = sig>>( - shiftCount );
                   1491:     /* Nonzero bits shifted out on the right make the result inexact. */
                   1492:     if ( (bits64) ( sig<<( shiftCount & 63 ) ) ) {
                   1493:         float_set_inexact();
                   1494:     }
                   1495:     if ( aSign ) z = - z;
                   1496:     return z;
                   1497:
                   1498: }
                   1499: #endif /* !SOFTFLOAT_FOR_GCC */
                   1500:
                   1501: /*
                   1502: -------------------------------------------------------------------------------
                   1503: Converts the single-precision floating-point value `a' to the double-
                   1504: precision floating-point format and returns the result.  The conversion
                   1505: follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic; no
                   1506: rounding routine is called, the fields are repacked directly.
                   1507: -------------------------------------------------------------------------------
                   1508: */
                   1509: float64 float32_to_float64( float32 a )
                   1510: {
                   1511:     bits32 aSig = extractFloat32Frac( a );
                   1512:     int16 aExp = extractFloat32Exp( a );
                   1513:     flag aSign = extractFloat32Sign( a );
                   1514:
                   1515:     if ( aExp == 0xFF ) {
                   1516:         /* Exponent field all ones: a nonzero fraction means NaN. */
                   1517:         return aSig ? commonNaNToFloat64( float32ToCommonNaN( a ) )
                   1518:                     : packFloat64( aSign, 0x7FF, 0 );
                   1519:     }
                   1520:     if ( ! aExp ) {
                   1521:         if ( ! aSig ) return packFloat64( aSign, 0, 0 );
                   1522:         /* Nonzero fraction with a zero exponent field: normalize first. */
                   1523:         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
                   1524:         aExp -= 1;
                   1525:     }
                   1526:     /* Offset the exponent by 0x380 and move the fraction up 29 bits. */
                   1527:     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
                   1528:
                   1529: }
                   1530:
                   1531: #ifdef FLOATX80
                   1532:
                   1533: /*
                   1534: -------------------------------------------------------------------------------
                   1535: Converts the single-precision floating-point value `a' to the extended
                   1536: double-precision floating-point format and returns the result.  The
                   1537: conversion follows the IEC/IEEE Standard for Binary Floating-Point
                   1538: Arithmetic; no rounding routine is called.
                   1539: -------------------------------------------------------------------------------
                   1540: */
                   1541: floatx80 float32_to_floatx80( float32 a )
                   1542: {
                   1543:     bits32 aSig = extractFloat32Frac( a );
                   1544:     int16 aExp = extractFloat32Exp( a );
                   1545:     flag aSign = extractFloat32Sign( a );
                   1546:
                   1547:     if ( aExp == 0xFF ) {
                   1548:         if ( ! aSig ) {
                   1549:             return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
                   1550:         }
                   1551:         return commonNaNToFloatx80( float32ToCommonNaN( a ) );
                   1552:     }
                   1553:     if ( ! aExp ) {
                   1554:         if ( ! aSig ) return packFloatx80( aSign, 0, 0 );
                   1555:         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
                   1556:     }
                   1557:     /* Bit 23 is set before the significand is moved up 40 bits. */
                   1558:     aSig |= 0x00800000;
                   1559:     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
                   1560:
                   1561: }
                   1562:
                   1563: #endif
                   1564:
                   1565: #ifdef FLOAT128
                   1566:
                   1567: /*
                   1568: -------------------------------------------------------------------------------
                   1569: Converts the single-precision floating-point value `a' to the quadruple-
                   1570: precision floating-point format and returns the result.  The conversion
                   1571: follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic; no
                   1572: rounding routine is called.
                   1573: -------------------------------------------------------------------------------
                   1574: */
                   1575: float128 float32_to_float128( float32 a )
                   1576: {
                   1577:     bits32 aSig = extractFloat32Frac( a );
                   1578:     int16 aExp = extractFloat32Exp( a );
                   1579:     flag aSign = extractFloat32Sign( a );
                   1580:
                   1581:     if ( aExp == 0xFF ) {
                   1582:         /* Exponent field all ones: a nonzero fraction means NaN. */
                   1583:         if ( ! aSig ) return packFloat128( aSign, 0x7FFF, 0, 0 );
                   1584:         return commonNaNToFloat128( float32ToCommonNaN( a ) );
                   1585:     }
                   1586:     if ( ! aExp ) {
                   1587:         if ( ! aSig ) return packFloat128( aSign, 0, 0, 0 );
                   1588:         /* Nonzero fraction with a zero exponent field: normalize first. */
                   1589:         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
                   1590:         aExp -= 1;
                   1591:     }
                   1592:     /* The shifted fraction is the first significand word; the second is 0. */
                   1593:     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
                   1594:
                   1595: }
                   1596:
                   1597: #endif
                   1598:
                   1599: #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
                   1600: /*
                   1601: -------------------------------------------------------------------------------
                   1602: Rounds the single-precision floating-point value `a' to an integer, and
                   1603: returns the result as a single-precision floating-point value.  The
                   1604: operation is performed according to the IEC/IEEE Standard for Binary
                   1605: Floating-Point Arithmetic.
                         Rounding follows the mode returned by float_rounding_mode().  The inexact
                         flag is set through float_set_inexact() whenever the result differs from
                         `a'.  NaN inputs are handed to propagateFloat32NaN(); zeros and values
                         whose exponent field is 0x96 or more (infinities included) are returned
                         unchanged.
                   1606: -------------------------------------------------------------------------------
                   1607: */
                   1608: float32 float32_round_to_int( float32 a )
                   1609: {
                   1610:     flag aSign;
                   1611:     int16 aExp;
                   1612:     bits32 lastBitMask, roundBitsMask;
                   1613:     int8 roundingMode;
                   1614:     float32 z;
                   1615:
                   1616:     aExp = extractFloat32Exp( a );
                   1617:     if ( 0x96 <= aExp ) {  /* |a| >= 2^23, infinity or NaN: no fraction bits left to round */
                   1618:         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
                   1619:             return propagateFloat32NaN( a, a );
                   1620:         }
                   1621:         return a;
                   1622:     }
                   1623:     if ( aExp <= 0x7E ) {  /* |a| < 1: the result has magnitude 0 or 1 */
                   1624:         if ( (bits32) ( a<<1 ) == 0 ) return a;  /* +0 or -0 (sign bit shifted out) */
                   1625:         float_set_inexact();
                   1626:         aSign = extractFloat32Sign( a );
                   1627:         switch ( float_rounding_mode() ) {
                   1628:          case float_round_nearest_even:
                   1629:             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {  /* |a| > 1/2; exactly 1/2 falls through to zero */
                   1630:                 return packFloat32( aSign, 0x7F, 0 );  /* +1 or -1 */
                   1631:             }
                   1632:             break;
                   1633:          case float_round_down:
                   1634:             return aSign ? 0xBF800000 : 0;  /* -1.0 or +0 */
                   1635:          case float_round_up:
                   1636:             return aSign ? 0x80000000 : 0x3F800000;  /* -0 or +1.0 */
                   1637:         }
                   1638:         return packFloat32( aSign, 0, 0 );  /* every remaining case yields a zero with the sign of a */
                   1639:     }
                   1640:     lastBitMask = 1;
                   1641:     lastBitMask <<= 0x96 - aExp;  /* bit of the encoding that has weight 1 (shift is 1..23 here) */
                   1642:     roundBitsMask = lastBitMask - 1;  /* all encoding bits of lower weight */
                   1643:     z = a;
                   1644:     roundingMode = float_rounding_mode();
                   1645:     if ( roundingMode == float_round_nearest_even ) {
                   1646:         z += lastBitMask>>1;  /* add one half directly to the encoding */
                   1647:         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;  /* exact tie: clear the units bit to land on even */
                   1648:     }
                   1649:     else if ( roundingMode != float_round_to_zero ) {
                   1650:         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {  /* true when the mode rounds away from zero for this sign */
                   1651:             z += roundBitsMask;
                   1652:         }
                   1653:     }
                   1654:     z &= ~ roundBitsMask;  /* drop the fraction bits */
                   1655:     if ( z != a ) float_set_inexact();
                   1656:     return z;
                   1657:
                   1658: }
                   1659: #endif /* !SOFTFLOAT_FOR_GCC */
                   1660:
                   1661: /*
                   1662: -------------------------------------------------------------------------------
                   1663: Returns the result of adding the absolute values of the single-precision
                   1664: floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
                   1665: before being returned.  `zSign' is ignored if the result is a NaN.
                   1666: The addition is performed according to the IEC/IEEE Standard for Binary
                   1667: Floating-Point Arithmetic.
                         Both fractions are shifted left 6 bits, the operand with the smaller
                         exponent is aligned with shift32RightJamming(), and the sum is rounded
                         and packed by roundAndPackFloat32().
                   1668: -------------------------------------------------------------------------------
                   1669: */
                   1670: static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
                   1671: {
                   1672:     int16 aExp, bExp, zExp;
                   1673:     bits32 aSig, bSig, zSig;
                   1674:     int16 expDiff;
                   1675:
                   1676:     aSig = extractFloat32Frac( a );
                   1677:     aExp = extractFloat32Exp( a );
                   1678:     bSig = extractFloat32Frac( b );
                   1679:     bExp = extractFloat32Exp( b );
                   1680:     expDiff = aExp - bExp;
                   1681:     aSig <<= 6;  /* after this shift the implicit leading bit is 0x20000000 */
                   1682:     bSig <<= 6;
                   1683:     if ( 0 < expDiff ) {  /* a has the larger exponent */
                   1684:         if ( aExp == 0xFF ) {  /* a is infinity or NaN */
                   1685:             if ( aSig ) return propagateFloat32NaN( a, b );
                   1686:             return a;
                   1687:         }
                   1688:         if ( bExp == 0 ) {  /* b is subnormal or zero: no implicit bit */
                   1689:             --expDiff;  /* a subnormal scales like exponent 1, so align by one place less */
                   1690:         }
                   1691:         else {
                   1692:             bSig |= 0x20000000;
                   1693:         }
                   1694:         shift32RightJamming( bSig, expDiff, &bSig );  /* align b with a */
                   1695:         zExp = aExp;
                   1696:     }
                   1697:     else if ( expDiff < 0 ) {  /* b has the larger exponent */
                   1698:         if ( bExp == 0xFF ) {  /* b is infinity or NaN */
                   1699:             if ( bSig ) return propagateFloat32NaN( a, b );
                   1700:             return packFloat32( zSign, 0xFF, 0 );  /* infinity with the requested sign */
                   1701:         }
                   1702:         if ( aExp == 0 ) {
                   1703:             ++expDiff;  /* expDiff is negative here, so this also shortens the shift by one */
                   1704:         }
                   1705:         else {
                   1706:             aSig |= 0x20000000;
                   1707:         }
                   1708:         shift32RightJamming( aSig, - expDiff, &aSig );  /* align a with b */
                   1709:         zExp = bExp;
                   1710:     }
                   1711:     else {  /* equal exponents */
                   1712:         if ( aExp == 0xFF ) {
                   1713:             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
                   1714:             return a;
                   1715:         }
                   1716:         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );  /* both subnormal or zero: add the fractions directly */
                   1717:         zSig = 0x40000000 + aSig + bSig;  /* 0x40000000 = the two implicit bits (2 * 0x20000000) */
                   1718:         zExp = aExp;
                   1719:         goto roundAndPack;
                   1720:     }
                   1721:     aSig |= 0x20000000;  /* implicit bit of the larger operand; only the sum is used, so either variable may carry it */
                   1722:     zSig = ( aSig + bSig )<<1;  /* first assume the sum did not carry */
                   1723:     --zExp;
                   1724:     if ( (sbits32) zSig < 0 ) {  /* top bit set: the sum did carry, so undo the shift and the decrement */
                   1725:         zSig = aSig + bSig;
                   1726:         ++zExp;
                   1727:     }
                   1728:  roundAndPack:
                   1729:     return roundAndPackFloat32( zSign, zExp, zSig );
                   1730:
                   1731: }
                   1732:
                   1733: /*
                   1734: -------------------------------------------------------------------------------
                   1735: Returns the result of subtracting the absolute values of the single-
                   1736: precision floating-point values `a' and `b'.  If `zSign' is 1, the
                   1737: difference is negated before being returned.  `zSign' is ignored if the
                   1738: result is a NaN.  The subtraction is performed according to the IEC/IEEE
                   1739: Standard for Binary Floating-Point Arithmetic.
                         Both fractions are shifted left 7 bits, the operand with the smaller
                         exponent is aligned with shift32RightJamming(), and the difference is
                         handed to normalizeRoundAndPackFloat32().  Infinity minus infinity
                         raises float_flag_invalid and returns float32_default_nan.
                   1740: -------------------------------------------------------------------------------
                   1741: */
                   1742: static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
                   1743: {
                   1744:     int16 aExp, bExp, zExp;
                   1745:     bits32 aSig, bSig, zSig;
                   1746:     int16 expDiff;
                   1747:
                   1748:     aSig = extractFloat32Frac( a );
                   1749:     aExp = extractFloat32Exp( a );
                   1750:     bSig = extractFloat32Frac( b );
                   1751:     bExp = extractFloat32Exp( b );
                   1752:     expDiff = aExp - bExp;
                   1753:     aSig <<= 7;  /* after this shift the implicit leading bit is 0x40000000 */
                   1754:     bSig <<= 7;
                   1755:     if ( 0 < expDiff ) goto aExpBigger;
                   1756:     if ( expDiff < 0 ) goto bExpBigger;
                   1757:     if ( aExp == 0xFF ) {  /* equal exponents: both operands are infinity or NaN */
                   1758:         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
                   1759:         float_raise( float_flag_invalid );  /* infinity minus infinity */
                   1760:         return float32_default_nan;
                   1761:     }
                   1762:     if ( aExp == 0 ) {  /* both subnormal or zero: continue with exponent 1 */
                   1763:         aExp = 1;
                   1764:         bExp = 1;
                   1765:     }
                   1766:     if ( bSig < aSig ) goto aBigger;
                   1767:     if ( aSig < bSig ) goto bBigger;
                   1768:     return packFloat32( float_rounding_mode() == float_round_down, 0, 0 );  /* exact zero: negative only when rounding down */
                   1769:  bExpBigger:
                   1770:     if ( bExp == 0xFF ) {
                   1771:         if ( bSig ) return propagateFloat32NaN( a, b );
                   1772:         return packFloat32( zSign ^ 1, 0xFF, 0 );  /* b is infinity: result is infinity with zSign inverted */
                   1773:     }
                   1774:     if ( aExp == 0 ) {  /* a is subnormal or zero: no implicit bit, align by one place less */
                   1775:         ++expDiff;
                   1776:     }
                   1777:     else {
                   1778:         aSig |= 0x40000000;
                   1779:     }
                   1780:     shift32RightJamming( aSig, - expDiff, &aSig );  /* align a with b */
                   1781:     bSig |= 0x40000000;
                   1782:  bBigger:
                   1783:     zSig = bSig - aSig;
                   1784:     zExp = bExp;
                   1785:     zSign ^= 1;  /* |b| > |a|: the difference has the opposite sign */
                   1786:     goto normalizeRoundAndPack;
                   1787:  aExpBigger:
                   1788:     if ( aExp == 0xFF ) {
                   1789:         if ( aSig ) return propagateFloat32NaN( a, b );
                   1790:         return a;
                   1791:     }
                   1792:     if ( bExp == 0 ) {  /* b is subnormal or zero: no implicit bit, align by one place less */
                   1793:         --expDiff;
                   1794:     }
                   1795:     else {
                   1796:         bSig |= 0x40000000;
                   1797:     }
                   1798:     shift32RightJamming( bSig, expDiff, &bSig );  /* align b with a */
                   1799:     aSig |= 0x40000000;
                   1800:  aBigger:
                   1801:     zSig = aSig - bSig;
                   1802:     zExp = aExp;
                   1803:  normalizeRoundAndPack:
                   1804:     --zExp;  /* NOTE(review): presumably matches the exponent convention normalizeRoundAndPackFloat32 expects - confirm there */
                   1805:     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
                   1806:
                   1807: }
                   1808:
                   1809: /*
                   1810: -------------------------------------------------------------------------------
                   1811: Adds the single-precision floating-point values `a' and `b' and returns the
                   1812: sum.  The operation follows the IEC/IEEE Standard for Binary Floating-Point
                   1813: Arithmetic.
                   1814: -------------------------------------------------------------------------------
                   1815: */
                   1816: float32 float32_add( float32 a, float32 b )
                   1817: {
                   1818:     flag signOfA = extractFloat32Sign( a );
                   1819:     flag signOfB = extractFloat32Sign( b );
                   1820:
                   1821:     /*
                   1822:      * Operands of unlike sign are handled by subtracting magnitudes,
                   1823:      * operands of like sign by adding them.  Either way the sign of `a'
                   1824:      * is handed to the helper as `zSign'.
                   1825:      */
                   1826:     if ( signOfA != signOfB ) return subFloat32Sigs( a, b, signOfA );
                   1827:     return addFloat32Sigs( a, b, signOfA );
                   1828:
                   1829: }
                   1830:
                   1831: /*
                   1832: -------------------------------------------------------------------------------
                   1833: Subtracts the single-precision floating-point value `b' from `a' and returns
                   1834: the difference.  The operation follows the IEC/IEEE Standard for Binary
                   1835: Floating-Point Arithmetic.
                   1836: -------------------------------------------------------------------------------
                   1837: */
                   1838: float32 float32_sub( float32 a, float32 b )
                   1839: {
                   1840:     flag signOfA = extractFloat32Sign( a );
                   1841:     flag signOfB = extractFloat32Sign( b );
                   1842:
                   1843:     /*
                   1844:      * a - b with unlike signs is a sum of magnitudes; with like signs it
                   1845:      * is a difference of magnitudes.  Either way the sign of `a' is handed
                   1846:      * to the helper as `zSign'.
                   1847:      */
                   1848:     if ( signOfA != signOfB ) return addFloat32Sigs( a, b, signOfA );
                   1849:     return subFloat32Sigs( a, b, signOfA );
                   1850:
                   1851: }
                   1852:
                   1853: /*
                   1854: -------------------------------------------------------------------------------
                   1855: Multiplies the single-precision floating-point values `a' and `b' and
                   1856: returns the product.  The operation follows the IEC/IEEE Standard for
                   1857: Binary Floating-Point Arithmetic.  Infinity times zero raises
                   1858: float_flag_invalid and yields float32_default_nan.
                   1859: -------------------------------------------------------------------------------
                   1860: */
                   1861: float32 float32_mul( float32 a, float32 b )
                   1862: {
                   1863:     bits32 sigA = extractFloat32Frac( a );
                   1864:     int16 expA = extractFloat32Exp( a );
                   1865:     flag signA = extractFloat32Sign( a );
                   1866:     bits32 sigB = extractFloat32Frac( b );
                   1867:     int16 expB = extractFloat32Exp( b );
                   1868:     flag signB = extractFloat32Sign( b );
                   1869:     flag signZ = signA ^ signB;
                   1870:     int16 expZ;
                   1871:     bits64 wideSig;
                   1872:     bits32 sigZ;
                   1873:
                   1874:     if ( ( expA == 0xFF ) || ( expB == 0xFF ) ) {
                   1875:         /* At least one operand is an infinity or a NaN. */
                   1876:         if ( ( ( expA == 0xFF ) && sigA ) || ( ( expB == 0xFF ) && sigB ) ) {
                   1877:             return propagateFloat32NaN( a, b );
                   1878:         }
                   1879:         /* No NaN, so one operand is infinite; the other must not be zero. */
                   1880:         if ( ( ( expA | sigA ) == 0 ) || ( ( expB | sigB ) == 0 ) ) {
                   1881:             float_raise( float_flag_invalid );
                   1882:             return float32_default_nan;
                   1883:         }
                   1884:         return packFloat32( signZ, 0xFF, 0 );
                   1885:     }
                   1886:     /* A zero factor gives a signed zero; subnormals are normalized. */
                   1887:     if ( expA == 0 ) {
                   1888:         if ( sigA == 0 ) {
                   1889:             return packFloat32( signZ, 0, 0 );
                   1890:         }
                   1891:         normalizeFloat32Subnormal( sigA, &expA, &sigA );
                   1892:     }
                   1893:     if ( expB == 0 ) {
                   1894:         if ( sigB == 0 ) {
                   1895:             return packFloat32( signZ, 0, 0 );
                   1896:         }
                   1897:         normalizeFloat32Subnormal( sigB, &expB, &sigB );
                   1898:     }
                   1899:     /* Make the implicit bits explicit and form the 64-bit product. */
                   1900:     expZ = expA + expB - 0x7F;
                   1901:     sigA = ( sigA | 0x00800000 )<<7;
                   1902:     sigB = ( sigB | 0x00800000 )<<8;
                   1903:     shift64RightJamming( ( (bits64) sigA ) * sigB, 32, &wideSig );
                   1904:     sigZ = wideSig;
                   1905:     /* If bit 30 is clear, move up one place and lower the exponent. */
                   1906:     if ( (sbits32) ( sigZ<<1 ) >= 0 ) {
                   1907:         sigZ <<= 1;
                   1908:         expZ -= 1;
                   1909:     }
                   1910:     return roundAndPackFloat32( signZ, expZ, sigZ );
                   1911:
                   1912: }
                   1913:
                   1914: /*
                   1915: -------------------------------------------------------------------------------
                   1916: Divides the single-precision floating-point value `a' by `b' and returns
                   1917: the quotient.  The operation follows the IEC/IEEE Standard for Binary
                   1918: Floating-Point Arithmetic.
                   1919: -------------------------------------------------------------------------------
                   1920: */
                   1921: float32 float32_div( float32 a, float32 b )
                   1922: {
                   1923:     bits32 sigA = extractFloat32Frac( a );
                   1924:     int16 expA = extractFloat32Exp( a );
                   1925:     flag signA = extractFloat32Sign( a );
                   1926:     bits32 sigB = extractFloat32Frac( b );
                   1927:     int16 expB = extractFloat32Exp( b );
                   1928:     flag signB = extractFloat32Sign( b );
                   1929:     flag signZ = signA ^ signB;
                   1930:     int16 expZ;
                   1931:     bits32 sigZ;
                   1932:
                   1933:     /* Any NaN operand decides the result before anything else. */
                   1934:     if ( ( ( expA == 0xFF ) && sigA ) || ( ( expB == 0xFF ) && sigB ) ) {
                   1935:         return propagateFloat32NaN( a, b );
                   1936:     }
                   1937:     if ( expA == 0xFF ) {
                   1938:         /* Infinite dividend: infinity, unless the divisor is infinite too. */
                   1939:         if ( expB != 0xFF ) return packFloat32( signZ, 0xFF, 0 );
                   1940:         float_raise( float_flag_invalid );
                   1941:         return float32_default_nan;
                   1942:     }
                   1943:     if ( expB == 0xFF ) return packFloat32( signZ, 0, 0 );
                   1944:     if ( ( expB | sigB ) == 0 ) {
                   1945:         /* Zero divisor: 0/0 is invalid, anything else divides by zero. */
                   1946:         if ( ( expA | sigA ) == 0 ) {
                   1947:             float_raise( float_flag_invalid );
                   1948:             return float32_default_nan;
                   1949:         }
                   1950:         float_raise( float_flag_divbyzero );
                   1951:         return packFloat32( signZ, 0xFF, 0 );
                   1952:     }
                   1953:     if ( expB == 0 ) {
                   1954:         normalizeFloat32Subnormal( sigB, &expB, &sigB );
                   1955:     }
                   1956:     if ( expA == 0 ) {
                   1957:         if ( sigA == 0 ) return packFloat32( signZ, 0, 0 );
                   1958:         normalizeFloat32Subnormal( sigA, &expA, &sigA );
                   1959:     }
                   1960:     expZ = expA - expB + 0x7D;
                   1961:     sigA = ( sigA | 0x00800000 )<<7;
                   1962:     sigB = ( sigB | 0x00800000 )<<8;
                   1963:     if ( sigB <= ( sigA<<1 ) ) {
                   1964:         sigA >>= 1;
                   1965:         expZ += 1;
                   1966:     }
                   1967:     sigZ = ( ( (bits64) sigA )<<32 ) / sigB;
                   1968:     /* Low six bits all zero: mark an inexact quotient in the low bit. */
                   1969:     if (    ( ( sigZ & 0x3F ) == 0 )
                   1970:          && ( (bits64) sigB * sigZ != ( (bits64) sigA )<<32 ) ) {
                   1971:         sigZ |= 1;
                   1972:     }
                   1973:     return roundAndPackFloat32( signZ, expZ, sigZ );
                   1974:
                   1975: }
                   1976:
                   1977: #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
                   1978: /*
                   1979: -------------------------------------------------------------------------------
                   1980: Returns the remainder of the single-precision floating-point value `a'
                   1981: with respect to the corresponding value `b'.  The operation is performed
                   1982: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
                         NaN operands are handed to propagateFloat32NaN().  An infinite `a' or a
                         zero `b' raises float_flag_invalid and returns float32_default_nan.
                         Otherwise `a' is returned unchanged when `b' is infinite, when `a' is
                         zero, or when the exponent of `a' is at least two below that of `b'.
                   1983: -------------------------------------------------------------------------------
                   1984: */
                   1985: float32 float32_rem( float32 a, float32 b )
                   1986: {
                   1987:     flag aSign, bSign, zSign;
                   1988:     int16 aExp, bExp, expDiff;
                   1989:     bits32 aSig, bSig;
                   1990:     bits32 q;
                   1991:     bits64 aSig64, bSig64, q64;
                   1992:     bits32 alternateASig;
                   1993:     sbits32 sigMean;
                   1994:
                   1995:     aSig = extractFloat32Frac( a );
                   1996:     aExp = extractFloat32Exp( a );
                   1997:     aSign = extractFloat32Sign( a );
                   1998:     bSig = extractFloat32Frac( b );
                   1999:     bExp = extractFloat32Exp( b );
                   2000:     bSign = extractFloat32Sign( b );  /* NOTE(review): bSign is assigned here but never read below */
                   2001:     if ( aExp == 0xFF ) {  /* a is infinity or NaN */
                   2002:         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
                   2003:             return propagateFloat32NaN( a, b );
                   2004:         }
                   2005:         float_raise( float_flag_invalid );  /* remainder of an infinite a */
                   2006:         return float32_default_nan;
                   2007:     }
                   2008:     if ( bExp == 0xFF ) {
                   2009:         if ( bSig ) return propagateFloat32NaN( a, b );
                   2010:         return a;  /* b is infinity: a is returned as the remainder */
                   2011:     }
                   2012:     if ( bExp == 0 ) {
                   2013:         if ( bSig == 0 ) {  /* b is zero */
                   2014:             float_raise( float_flag_invalid );
                   2015:             return float32_default_nan;
                   2016:         }
                   2017:         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
                   2018:     }
                   2019:     if ( aExp == 0 ) {
                   2020:         if ( aSig == 0 ) return a;  /* a is zero */
                   2021:         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
                   2022:     }
                   2023:     expDiff = aExp - bExp;
                   2024:     aSig |= 0x00800000;  /* make the implicit leading bits explicit */
                   2025:     bSig |= 0x00800000;
                   2026:     if ( expDiff < 32 ) {  /* small exponent gap: handled with 32-bit values and one 64-bit division */
                   2027:         aSig <<= 8;
                   2028:         bSig <<= 8;
                   2029:         if ( expDiff < 0 ) {
                   2030:             if ( expDiff < -1 ) return a;  /* |a| < |b|/2: a is returned unchanged */
                   2031:             aSig >>= 1;
                   2032:         }
                   2033:         q = ( bSig <= aSig );  /* first quotient bit */
                   2034:         if ( q ) aSig -= bSig;
                   2035:         if ( 0 < expDiff ) {
                   2036:             q = ( ( (bits64) aSig )<<32 ) / bSig;
                   2037:             q >>= 32 - expDiff;  /* keep only the top expDiff quotient bits */
                   2038:             bSig >>= 2;
                   2039:             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
                   2040:         }
                   2041:         else {
                   2042:             aSig >>= 2;
                   2043:             bSig >>= 2;
                   2044:         }
                   2045:     }
                   2046:     else {  /* large exponent gap: reduce using estimateDiv128To64() */
                   2047:         if ( bSig <= aSig ) aSig -= bSig;
                   2048:         aSig64 = ( (bits64) aSig )<<40;
                   2049:         bSig64 = ( (bits64) bSig )<<40;
                   2050:         expDiff -= 64;
                   2051:         while ( 0 < expDiff ) {  /* each pass lowers expDiff by 62 */
                   2052:             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
                   2053:             q64 = ( 2 < q64 ) ? q64 - 2 : 0;  /* lower the estimate by 2, clamped at 0 */
                   2054:             aSig64 = - ( ( bSig * q64 )<<38 );
                   2055:             expDiff -= 62;
                   2056:         }
                   2057:         expDiff += 64;
                   2058:         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
                   2059:         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
                   2060:         q = q64>>( 64 - expDiff );
                   2061:         bSig <<= 6;
                   2062:         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
                   2063:     }
                   2064:     do {  /* subtract bSig until the remainder turns negative, keeping the previous value */
                   2065:         alternateASig = aSig;
                   2066:         ++q;
                   2067:         aSig -= bSig;
                   2068:     } while ( 0 <= (sbits32) aSig );
                   2069:     sigMean = aSig + alternateASig;  /* its sign shows which of the two candidates is smaller in magnitude */
                   2070:     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {  /* previous value is nearer, or a tie with q odd */
                   2071:         aSig = alternateASig;
                   2072:     }
                   2073:     zSign = ( (sbits32) aSig < 0 );
                   2074:     if ( zSign ) aSig = - aSig;  /* continue with the magnitude */
                   2075:     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
                   2076:
                   2077: }
                   2078: #endif /* !SOFTFLOAT_FOR_GCC */
                   2079:
                   2080: #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
                   2081: /*
                   2082: -------------------------------------------------------------------------------
                   2083: Returns the square root of the single-precision floating-point value `a'.
                   2084: The operation is performed according to the IEC/IEEE Standard for Binary
                   2085: Floating-Point Arithmetic.
                         NaN inputs are handed to propagateFloat32NaN().  Positive infinity and
                         negative zero are returned unchanged and positive zero gives 0.  Any other
                         negative input raises float_flag_invalid and returns float32_default_nan.
                   2086: -------------------------------------------------------------------------------
                   2087: */
                   2088: float32 float32_sqrt( float32 a )
                   2089: {
                   2090:     flag aSign;
                   2091:     int16 aExp, zExp;
                   2092:     bits32 aSig, zSig;
                   2093:     bits64 rem, term;
                   2094:
                   2095:     aSig = extractFloat32Frac( a );
                   2096:     aExp = extractFloat32Exp( a );
                   2097:     aSign = extractFloat32Sign( a );
                   2098:     if ( aExp == 0xFF ) {  /* infinity or NaN */
                   2099:         if ( aSig ) return propagateFloat32NaN( a, 0 );  /* NaN; the constant 0 stands in for a second operand */
                   2100:         if ( ! aSign ) return a;  /* +infinity */
                   2101:         float_raise( float_flag_invalid );  /* -infinity */
                   2102:         return float32_default_nan;
                   2103:     }
                   2104:     if ( aSign ) {
                   2105:         if ( ( aExp | aSig ) == 0 ) return a;  /* -0 is returned as is */
                   2106:         float_raise( float_flag_invalid );  /* negative, non-zero input */
                   2107:         return float32_default_nan;
                   2108:     }
                   2109:     if ( aExp == 0 ) {
                   2110:         if ( aSig == 0 ) return 0;  /* +0 */
                   2111:         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
                   2112:     }
                   2113:     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;  /* halve the unbiased exponent; right shift of a possibly negative value */
                   2114:     aSig = ( aSig | 0x00800000 )<<8;  /* explicit leading bit, moved to the top of the word */
                   2115:     zSig = estimateSqrt32( aExp, aSig ) + 2;
                   2116:     if ( ( zSig & 0x7F ) <= 5 ) {  /* NOTE(review): presumably the only case where the estimate's error could change the rounding - confirm against estimateSqrt32 */
                   2117:         if ( zSig < 2 ) {  /* the + 2 above wrapped around */
                   2118:             zSig = 0x7FFFFFFF;
                   2119:             goto roundAndPack;
                   2120:         }
                   2121:         aSig >>= aExp & 1;
                   2122:         term = ( (bits64) zSig ) * zSig;
                   2123:         rem = ( ( (bits64) aSig )<<32 ) - term;  /* remainder left by the current root estimate */
                   2124:         while ( (sbits64) rem < 0 ) {  /* estimate too large: step it down */
                   2125:             --zSig;
                   2126:             rem += ( ( (bits64) zSig )<<1 ) | 1;  /* (z+1)^2 - z^2 = 2z + 1 with the already decremented z */
                   2127:         }
                   2128:         zSig |= ( rem != 0 );  /* record an inexact root in the low bit */
                   2129:     }
                   2130:     shift32RightJamming( zSig, 1, &zSig );
                   2131:  roundAndPack:
                   2132:     return roundAndPackFloat32( 0, zExp, zSig );
                   2133:
                   2134: }
                   2135: #endif /* !SOFTFLOAT_FOR_GCC */
                   2136:
                   2137: /*
                   2138: -------------------------------------------------------------------------------
                   2139: Returns 1 if the single-precision floating-point values `a' and `b' are
                   2140: equal, and 0 otherwise.  A NaN operand gives 0, and float_flag_invalid is
                   2141: raised only for a signaling NaN.  Follows the IEC/IEEE Standard for
                   2142: Binary Floating-Point Arithmetic.
                   2143: -------------------------------------------------------------------------------
                   2144: */
                   2145: flag float32_eq( float32 a, float32 b )
                   2146: {
                   2147:     flag aIsNaN = ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a );
                   2148:     flag bIsNaN = ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b );
                   2149:
                   2150:     if ( ! ( aIsNaN || bIsNaN ) ) {
                   2151:         return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
                   2152:     }
                   2153:     if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
                   2154:         float_raise( float_flag_invalid );
                   2155:     }
                   2156:     return 0;
                   2157: }
                   2158:
                   2159: /*
                   2160: -------------------------------------------------------------------------------
                   2161: Returns 1 if the single-precision floating-point value `a' is less than or
                   2162: equal to `b', and 0 otherwise.  A NaN operand raises float_flag_invalid
                   2163: and gives 0.  The comparison follows the IEC/IEEE Standard for Binary
                   2164: Floating-Point Arithmetic.
                   2165: -------------------------------------------------------------------------------
                   2166: */
                   2167: flag float32_le( float32 a, float32 b )
                   2168: {
                   2169:     flag signOfA = extractFloat32Sign( a );
                   2170:     flag signOfB = extractFloat32Sign( b );
                   2171:     flag aIsNaN = ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a );
                   2172:     flag bIsNaN = ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b );
                   2173:
                   2174:     if ( aIsNaN || bIsNaN ) {
                   2175:         float_raise( float_flag_invalid );
                   2176:         return 0;
                   2177:     }
                   2178:     if ( signOfA == signOfB ) return ( a == b ) || ( signOfA ^ ( a < b ) );
                   2179:     /* Signs differ: true for a negative `a', or when both are zeros. */
                   2180:     return signOfA || ( (bits32) ( ( a | b )<<1 ) == 0 );
                   2181:
                   2182: }
                   2183:
                   2184: /*
                   2185: -------------------------------------------------------------------------------
                   2186: Returns 1 if the single-precision floating-point value `a' is less than
                   2187: the corresponding value `b', and 0 otherwise.  The comparison is performed
                   2188: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
                   2189: -------------------------------------------------------------------------------
                   2190: */
                   2191: flag float32_lt( float32 a, float32 b )
                   2192: {
                   2193:     flag aSign, bSign;
                   2194:
                   2195:     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
                   2196:          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
                   2197:        ) {
                   2198:         float_raise( float_flag_invalid );
                   2199:         return 0;
                   2200:     }
                   2201:     aSign = extractFloat32Sign( a );
                   2202:     bSign = extractFloat32Sign( b );
                   2203:     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
                   2204:     return ( a != b ) && ( aSign ^ ( a < b ) );
                   2205:
                   2206: }
                   2207:
                   2208: #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
                   2209: /*
                   2210: -------------------------------------------------------------------------------
                   2211: Returns 1 if the single-precision floating-point value `a' is equal to
                   2212: the corresponding value `b', and 0 otherwise.  The invalid exception is
                   2213: raised if either operand is a NaN.  Otherwise, the comparison is performed
                   2214: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
                   2215: -------------------------------------------------------------------------------
                   2216: */
                   2217: flag float32_eq_signaling( float32 a, float32 b )
                   2218: {
                   2219:
                   2220:     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
                   2221:          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
                   2222:        ) {
                   2223:         float_raise( float_flag_invalid );
                   2224:         return 0;
                   2225:     }
                   2226:     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
                   2227:
                   2228: }
                   2229:
                   2230: /*
                   2231: -------------------------------------------------------------------------------
                   2232: Returns 1 if the single-precision floating-point value `a' is less than or
                   2233: equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
                   2234: cause an exception.  Otherwise, the comparison is performed according to the
                   2235: IEC/IEEE Standard for Binary Floating-Point Arithmetic.
                   2236: -------------------------------------------------------------------------------
                   2237: */
                   2238: flag float32_le_quiet( float32 a, float32 b )
                   2239: {
                   2240:     flag aSign, bSign;
                   2241:
                   2242:     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
                   2243:          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
                   2244:        ) {
                   2245:         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
                   2246:             float_raise( float_flag_invalid );
                   2247:         }
                   2248:         return 0;
                   2249:     }
                   2250:     aSign = extractFloat32Sign( a );
                   2251:     bSign = extractFloat32Sign( b );
                   2252:     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
                   2253:     return ( a == b ) || ( aSign ^ ( a < b ) );
                   2254:
                   2255: }
                   2256:
                   2257: /*
                   2258: -------------------------------------------------------------------------------
                   2259: Returns 1 if the single-precision floating-point value `a' is less than
                   2260: the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
                   2261: exception.  Otherwise, the comparison is performed according to the IEC/IEEE
                   2262: Standard for Binary Floating-Point Arithmetic.
                   2263: -------------------------------------------------------------------------------
                   2264: */
                   2265: flag float32_lt_quiet( float32 a, float32 b )
                   2266: {
                   2267:     flag aSign, bSign;
                   2268:
                   2269:     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
                   2270:          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
                   2271:        ) {
                   2272:         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
                   2273:             float_raise( float_flag_invalid );
                   2274:         }
                   2275:         return 0;
                   2276:     }
                   2277:     aSign = extractFloat32Sign( a );
                   2278:     bSign = extractFloat32Sign( b );
                   2279:     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
                   2280:     return ( a != b ) && ( aSign ^ ( a < b ) );
                   2281:
                   2282: }
                   2283: #endif /* !SOFTFLOAT_FOR_GCC */
                   2284:
                   2285: #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
                   2286: /*
                   2287: -------------------------------------------------------------------------------
                   2288: Returns the result of converting the double-precision floating-point value
                   2289: `a' to the 32-bit two's complement integer format.  The conversion is
                   2290: performed according to the IEC/IEEE Standard for Binary Floating-Point
                   2291: Arithmetic---which means in particular that the conversion is rounded
                   2292: according to the current rounding mode.  If `a' is a NaN, the largest
                   2293: positive integer is returned.  Otherwise, if the conversion overflows, the
                   2294: largest integer with the same sign as `a' is returned.
                   2295: -------------------------------------------------------------------------------
                   2296: */
                   2297: int32 float64_to_int32( float64 a )
                   2298: {
                   2299:     flag aSign;
                   2300:     int16 aExp, shiftCount;
                   2301:     bits64 aSig;
                   2302:
                   2303:     aSig = extractFloat64Frac( a );
                   2304:     aExp = extractFloat64Exp( a );
                   2305:     aSign = extractFloat64Sign( a );
                   2306:     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
                   2307:     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
                   2308:     shiftCount = 0x42C - aExp;
                   2309:     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
                   2310:     return roundAndPackInt32( aSign, aSig );
                   2311:
                   2312: }
                   2313: #endif /* !SOFTFLOAT_FOR_GCC */
                   2314:
                   2315: /*
                   2316: -------------------------------------------------------------------------------
                   2317: Returns the result of converting the double-precision floating-point value
                   2318: `a' to the 32-bit two's complement integer format.  The conversion is
                   2319: performed according to the IEC/IEEE Standard for Binary Floating-Point
                   2320: Arithmetic, except that the conversion is always rounded toward zero.
                   2321: If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
                   2322: the conversion overflows, the largest integer with the same sign as `a' is
                   2323: returned.
                   2324: -------------------------------------------------------------------------------
                   2325: */
                   2326: int32 float64_to_int32_round_to_zero( float64 a )
                   2327: {
                   2328:     flag aSign;
                   2329:     int16 aExp, shiftCount;
                   2330:     bits64 aSig, savedASig;
                   2331:     int32 z;
                   2332:
                   2333:     aSig = extractFloat64Frac( a );
                   2334:     aExp = extractFloat64Exp( a );
                   2335:     aSign = extractFloat64Sign( a );
                   2336:     if ( 0x41E < aExp ) {
                   2337:         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
                   2338:         goto invalid;
                   2339:     }
                   2340:     else if ( aExp < 0x3FF ) {
                   2341:         if ( aExp || aSig ) float_set_inexact();
                   2342:         return 0;
                   2343:     }
                   2344:     aSig |= LIT64( 0x0010000000000000 );
                   2345:     shiftCount = 0x433 - aExp;
                   2346:     savedASig = aSig;
                   2347:     aSig >>= shiftCount;
                   2348:     z = aSig;
                   2349:     if ( aSign ) z = - z;
                   2350:     if ( ( z < 0 ) ^ aSign ) {
                   2351:  invalid:
                   2352:         float_raise( float_flag_invalid );
                   2353:         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
                   2354:     }
                   2355:     if ( ( aSig<<shiftCount ) != savedASig ) {
                   2356:         float_set_inexact();
                   2357:     }
                   2358:     return z;
                   2359:
                   2360: }
                   2361:
                   2362: #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
                   2363: /*
                   2364: -------------------------------------------------------------------------------
                   2365: Returns the result of converting the double-precision floating-point value
                   2366: `a' to the 64-bit two's complement integer format.  The conversion is
                   2367: performed according to the IEC/IEEE Standard for Binary Floating-Point
                   2368: Arithmetic---which means in particular that the conversion is rounded
                   2369: according to the current rounding mode.  If `a' is a NaN, the largest
                   2370: positive integer is returned.  Otherwise, if the conversion overflows, the
                   2371: largest integer with the same sign as `a' is returned.
                   2372: -------------------------------------------------------------------------------
                   2373: */
                   2374: int64 float64_to_int64( float64 a )
                   2375: {
                   2376:     flag aSign;
                   2377:     int16 aExp, shiftCount;
                   2378:     bits64 aSig, aSigExtra;
                   2379:
                   2380:     aSig = extractFloat64Frac( a );
                   2381:     aExp = extractFloat64Exp( a );
                   2382:     aSign = extractFloat64Sign( a );
                   2383:     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
                   2384:     shiftCount = 0x433 - aExp;
                   2385:     if ( shiftCount <= 0 ) {
                   2386:         if ( 0x43E < aExp ) {
                   2387:             float_raise( float_flag_invalid );
                   2388:             if (    ! aSign
                   2389:                  || (    ( aExp == 0x7FF )
                   2390:                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
                   2391:                ) {
                   2392:                 return LIT64( 0x7FFFFFFFFFFFFFFF );
                   2393:             }
                   2394:             return (sbits64) LIT64( 0x8000000000000000 );
                   2395:         }
                   2396:         aSigExtra = 0;
                   2397:         aSig <<= - shiftCount;
                   2398:     }
                   2399:     else {
                   2400:         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
                   2401:     }
                   2402:     return roundAndPackInt64( aSign, aSig, aSigExtra );
                   2403:
                   2404: }
                   2405:
                   2406: /*
                   2407: -------------------------------------------------------------------------------
                   2408: Returns the result of converting the double-precision floating-point value
                   2409: `a' to the 64-bit two's complement integer format.  The conversion is
                   2410: performed according to the IEC/IEEE Standard for Binary Floating-Point
                   2411: Arithmetic, except that the conversion is always rounded toward zero.
                   2412: If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
                   2413: the conversion overflows, the largest integer with the same sign as `a' is
                   2414: returned.
                   2415: -------------------------------------------------------------------------------
                   2416: */
                   2417: int64 float64_to_int64_round_to_zero( float64 a )
                   2418: {
                   2419:     flag aSign;
                   2420:     int16 aExp, shiftCount;
                   2421:     bits64 aSig;
                   2422:     int64 z;
                   2423:
                   2424:     aSig = extractFloat64Frac( a );
                   2425:     aExp = extractFloat64Exp( a );
                   2426:     aSign = extractFloat64Sign( a );
                   2427:     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
                   2428:     shiftCount = aExp - 0x433;
                   2429:     if ( 0 <= shiftCount ) {
                   2430:         if ( 0x43E <= aExp ) {
                   2431:             if ( a != LIT64( 0xC3E0000000000000 ) ) {
                   2432:                 float_raise( float_flag_invalid );
                   2433:                 if (    ! aSign
                   2434:                      || (    ( aExp == 0x7FF )
                   2435:                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
                   2436:                    ) {
                   2437:                     return LIT64( 0x7FFFFFFFFFFFFFFF );
                   2438:                 }
                   2439:             }
                   2440:             return (sbits64) LIT64( 0x8000000000000000 );
                   2441:         }
                   2442:         z = aSig<<shiftCount;
                   2443:     }
                   2444:     else {
                   2445:         if ( aExp < 0x3FE ) {
                   2446:             if ( aExp | aSig ) float_set_inexact();
                   2447:             return 0;
                   2448:         }
                   2449:         z = aSig>>( - shiftCount );
                   2450:         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
                   2451:             float_set_inexact();
                   2452:         }
                   2453:     }
                   2454:     if ( aSign ) z = - z;
                   2455:     return z;
                   2456:
                   2457: }
                   2458: #endif /* !SOFTFLOAT_FOR_GCC */
                   2459:
                   2460: /*
                   2461: -------------------------------------------------------------------------------
                   2462: Returns the result of converting the double-precision floating-point value
                   2463: `a' to the single-precision floating-point format.  The conversion is
                   2464: performed according to the IEC/IEEE Standard for Binary Floating-Point
                   2465: Arithmetic.
                   2466: -------------------------------------------------------------------------------
                   2467: */
                   2468: float32 float64_to_float32( float64 a )
                   2469: {
                   2470:     flag aSign;
                   2471:     int16 aExp;
                   2472:     bits64 aSig;
                   2473:     bits32 zSig;
                   2474:
                   2475:     aSig = extractFloat64Frac( a );
                   2476:     aExp = extractFloat64Exp( a );
                   2477:     aSign = extractFloat64Sign( a );
                   2478:     if ( aExp == 0x7FF ) {
                   2479:         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
                   2480:         return packFloat32( aSign, 0xFF, 0 );
                   2481:     }
                   2482:     shift64RightJamming( aSig, 22, &aSig );
                   2483:     zSig = aSig;
                   2484:     if ( aExp || zSig ) {
                   2485:         zSig |= 0x40000000;
                   2486:         aExp -= 0x381;
                   2487:     }
                   2488:     return roundAndPackFloat32( aSign, aExp, zSig );
                   2489:
                   2490: }
                   2491:
                   2492: #ifdef FLOATX80
                   2493:
                   2494: /*
                   2495: -------------------------------------------------------------------------------
                   2496: Returns the result of converting the double-precision floating-point value
                   2497: `a' to the extended double-precision floating-point format.  The conversion
                   2498: is performed according to the IEC/IEEE Standard for Binary Floating-Point
                   2499: Arithmetic.
                   2500: -------------------------------------------------------------------------------
                   2501: */
                   2502: floatx80 float64_to_floatx80( float64 a )
                   2503: {
                   2504:     flag aSign;
                   2505:     int16 aExp;
                   2506:     bits64 aSig;
                   2507:
                   2508:     aSig = extractFloat64Frac( a );
                   2509:     aExp = extractFloat64Exp( a );
                   2510:     aSign = extractFloat64Sign( a );
                   2511:     if ( aExp == 0x7FF ) {
                   2512:         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
                   2513:         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
                   2514:     }
                   2515:     if ( aExp == 0 ) {
                   2516:         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
                   2517:         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
                   2518:     }
                   2519:     return
                   2520:         packFloatx80(
                   2521:             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
                   2522:
                   2523: }
                   2524:
                   2525: #endif
                   2526:
                   2527: #ifdef FLOAT128
                   2528:
                   2529: /*
                   2530: -------------------------------------------------------------------------------
                   2531: Returns the result of converting the double-precision floating-point value
                   2532: `a' to the quadruple-precision floating-point format.  The conversion is
                   2533: performed according to the IEC/IEEE Standard for Binary Floating-Point
                   2534: Arithmetic.
                   2535: -------------------------------------------------------------------------------
                   2536: */
                   2537: float128 float64_to_float128( float64 a )
                   2538: {
                   2539:     flag aSign;
                   2540:     int16 aExp;
                   2541:     bits64 aSig, zSig0, zSig1;
                   2542:
                   2543:     aSig = extractFloat64Frac( a );
                   2544:     aExp = extractFloat64Exp( a );
                   2545:     aSign = extractFloat64Sign( a );
                   2546:     if ( aExp == 0x7FF ) {
                   2547:         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
                   2548:         return packFloat128( aSign, 0x7FFF, 0, 0 );
                   2549:     }
                   2550:     if ( aExp == 0 ) {
                   2551:         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
                   2552:         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
                   2553:         --aExp;
                   2554:     }
                   2555:     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
                   2556:     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
                   2557:
                   2558: }
                   2559:
                   2560: #endif
                   2561:
                   2562: #ifndef SOFTFLOAT_FOR_GCC
                   2563: /*
                   2564: -------------------------------------------------------------------------------
                   2565: Rounds the double-precision floating-point value `a' to an integer, and
                   2566: returns the result as a double-precision floating-point value.  The
                   2567: operation is performed according to the IEC/IEEE Standard for Binary
                   2568: Floating-Point Arithmetic.
                   2569: -------------------------------------------------------------------------------
                   2570: */
                   2571: float64 float64_round_to_int( float64 a )
                   2572: {
                   2573:     flag aSign;
                   2574:     int16 aExp;
                   2575:     bits64 lastBitMask, roundBitsMask;
                   2576:     int8 roundingMode;
                   2577:     float64 z;
                   2578:
                   2579:     aExp = extractFloat64Exp( a );
                   2580:     if ( 0x433 <= aExp ) {
                   2581:         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
                   2582:             return propagateFloat64NaN( a, a );
                   2583:         }
                   2584:         return a;
                   2585:     }
                   2586:     if ( aExp < 0x3FF ) {
                   2587:         if ( (bits64) ( a<<1 ) == 0 ) return a;
                   2588:         float_set_inexact();
                   2589:         aSign = extractFloat64Sign( a );
                   2590:         switch ( float_rounding_mode() ) {
                   2591:          case float_round_nearest_even:
                   2592:             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
                   2593:                 return packFloat64( aSign, 0x3FF, 0 );
                   2594:             }
                   2595:             break;
                   2596:          case float_round_down:
                   2597:             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
                   2598:          case float_round_up:
                   2599:             return
                   2600:             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
                   2601:         }
                   2602:         return packFloat64( aSign, 0, 0 );
                   2603:     }
                   2604:     lastBitMask = 1;
                   2605:     lastBitMask <<= 0x433 - aExp;
                   2606:     roundBitsMask = lastBitMask - 1;
                   2607:     z = a;
                   2608:     roundingMode = float_rounding_mode();
                   2609:     if ( roundingMode == float_round_nearest_even ) {
                   2610:         z += lastBitMask>>1;
                   2611:         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
                   2612:     }
                   2613:     else if ( roundingMode != float_round_to_zero ) {
                   2614:         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
                   2615:             z += roundBitsMask;
                   2616:         }
                   2617:     }
                   2618:     z &= ~ roundBitsMask;
                   2619:     if ( z != a ) float_set_inexact();
                   2620:     return z;
                   2621:
                   2622: }
                   2623: #endif
                   2624:
                   2625: /*
                   2626: -------------------------------------------------------------------------------
                   2627: Returns the result of adding the absolute values of the double-precision
                   2628: floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
                   2629: before being returned.  `zSign' is ignored if the result is a NaN.
                   2630: The addition is performed according to the IEC/IEEE Standard for Binary
                   2631: Floating-Point Arithmetic.
                   2632: -------------------------------------------------------------------------------
                   2633: */
                   2634: static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
                   2635: {
                   2636:     int16 aExp, bExp, zExp;
                   2637:     bits64 aSig, bSig, zSig;
                   2638:     int16 expDiff;
                   2639:
                   2640:     aSig = extractFloat64Frac( a );
                   2641:     aExp = extractFloat64Exp( a );
                   2642:     bSig = extractFloat64Frac( b );
                   2643:     bExp = extractFloat64Exp( b );
                   2644:     expDiff = aExp - bExp;
                   2645:     aSig <<= 9;
                   2646:     bSig <<= 9;
                   2647:     if ( 0 < expDiff ) {
                   2648:         if ( aExp == 0x7FF ) {
                   2649:             if ( aSig ) return propagateFloat64NaN( a, b );
                   2650:             return a;
                   2651:         }
                   2652:         if ( bExp == 0 ) {
                   2653:             --expDiff;
                   2654:         }
                   2655:         else {
                   2656:             bSig |= LIT64( 0x2000000000000000 );
                   2657:         }
                   2658:         shift64RightJamming( bSig, expDiff, &bSig );
                   2659:         zExp = aExp;
                   2660:     }
                   2661:     else if ( expDiff < 0 ) {
                   2662:         if ( bExp == 0x7FF ) {
                   2663:             if ( bSig ) return propagateFloat64NaN( a, b );
                   2664:             return packFloat64( zSign, 0x7FF, 0 );
                   2665:         }
                   2666:         if ( aExp == 0 ) {
                   2667:             ++expDiff;
                   2668:         }
                   2669:         else {
                   2670:             aSig |= LIT64( 0x2000000000000000 );
                   2671:         }
                   2672:         shift64RightJamming( aSig, - expDiff, &aSig );
                   2673:         zExp = bExp;
                   2674:     }
                   2675:     else {
                   2676:         if ( aExp == 0x7FF ) {
                   2677:             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
                   2678:             return a;
                   2679:         }
                   2680:         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
                   2681:         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
                   2682:         zExp = aExp;
                   2683:         goto roundAndPack;
                   2684:     }
                   2685:     aSig |= LIT64( 0x2000000000000000 );
                   2686:     zSig = ( aSig + bSig )<<1;
                   2687:     --zExp;
                   2688:     if ( (sbits64) zSig < 0 ) {
                   2689:         zSig = aSig + bSig;
                   2690:         ++zExp;
                   2691:     }
                   2692:  roundAndPack:
                   2693:     return roundAndPackFloat64( zSign, zExp, zSig );
                   2694:
                   2695: }
                   2696:
                   2697: /*
                   2698: -------------------------------------------------------------------------------
                   2699: Returns the result of subtracting the absolute values of the double-
                   2700: precision floating-point values `a' and `b'.  If `zSign' is 1, the
                   2701: difference is negated before being returned.  `zSign' is ignored if the
                   2702: result is a NaN.  The subtraction is performed according to the IEC/IEEE
                   2703: Standard for Binary Floating-Point Arithmetic.
                   2704: -------------------------------------------------------------------------------
                   2705: */
                   2706: static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
                   2707: {
                             /*
                              * Computes |a| - |b|.  The result carries sign `zSign' when |a| is
                              * the larger magnitude and the opposite sign when |b| is larger
                              * (see the `zSign ^= 1' on the bBigger path).
                              */
                   2708:     int16 aExp, bExp, zExp;
                   2709:     bits64 aSig, bSig, zSig;
                   2710:     int16 expDiff;
                   2711:
                   2712:     aSig = extractFloat64Frac( a );
                   2713:     aExp = extractFloat64Exp( a );
                   2714:     bSig = extractFloat64Frac( b );
                   2715:     bExp = extractFloat64Exp( b );
                   2716:     expDiff = aExp - bExp;
                             /*
                              * Shift both fractions left by 10 bits so that the implicit
                              * leading bit, when it is ORed in below, lands on
                              * 0x4000000000000000.
                              */
                   2717:     aSig <<= 10;
                   2718:     bSig <<= 10;
                   2719:     if ( 0 < expDiff ) goto aExpBigger;
                   2720:     if ( expDiff < 0 ) goto bExpBigger;
                             /* From here on the two exponents are equal. */
                   2721:     if ( aExp == 0x7FF ) {
                                 /* Any non-zero fraction means a NaN operand; otherwise this
                                    is infinity minus infinity, which is invalid. */
                   2722:         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
                   2723:         float_raise( float_flag_invalid );
                   2724:         return float64_default_nan;
                   2725:     }
                   2726:     if ( aExp == 0 ) {
                                 /* Both exponent fields are zero: use exponent 1 for both. */
                   2727:         aExp = 1;
                   2728:         bExp = 1;
                   2729:     }
                             /* The implicit bits are not set on this path; with equal
                                exponents they would cancel in the subtraction anyway. */
                   2730:     if ( bSig < aSig ) goto aBigger;
                   2731:     if ( aSig < bSig ) goto bBigger;
                             /* Exact cancellation: return zero, negative only when the
                                current rounding mode is float_round_down. */
                   2732:     return packFloat64( float_rounding_mode() == float_round_down, 0, 0 );
                   2733:  bExpBigger:
                   2734:     if ( bExp == 0x7FF ) {
                                 /* b is a NaN (non-zero fraction) or an infinity; for an
                                    infinity the result is an infinity of the flipped sign. */
                   2735:         if ( bSig ) return propagateFloat64NaN( a, b );
                   2736:         return packFloat64( zSign ^ 1, 0x7FF, 0 );
                   2737:     }
                   2738:     if ( aExp == 0 ) {
                                 /* a has a zero exponent field and gets no implicit bit;
                                    expDiff is negative here, so this shortens the alignment
                                    shift by one. */
                   2739:         ++expDiff;
                   2740:     }
                   2741:     else {
                   2742:         aSig |= LIT64( 0x4000000000000000 );
                   2743:     }
                             /* Align a's significand to b's exponent (shift count is
                                -expDiff, which is non-negative on this path). */
                   2744:     shift64RightJamming( aSig, - expDiff, &aSig );
                   2745:     bSig |= LIT64( 0x4000000000000000 );
                   2746:  bBigger:
                             /* |b| > |a|: subtract the other way round and flip the sign. */
                   2747:     zSig = bSig - aSig;
                   2748:     zExp = bExp;
                   2749:     zSign ^= 1;
                   2750:     goto normalizeRoundAndPack;
                   2751:  aExpBigger:
                   2752:     if ( aExp == 0x7FF ) {
                                 /* a is a NaN (non-zero fraction) or an infinity; an
                                    infinite a is returned unchanged. */
                   2753:         if ( aSig ) return propagateFloat64NaN( a, b );
                   2754:         return a;
                   2755:     }
                   2756:     if ( bExp == 0 ) {
                                 /* Mirror of the aExp == 0 case above, with expDiff
                                    positive. */
                   2757:         --expDiff;
                   2758:     }
                   2759:     else {
                   2760:         bSig |= LIT64( 0x4000000000000000 );
                   2761:     }
                             /* Align b's significand to a's exponent. */
                   2762:     shift64RightJamming( bSig, expDiff, &bSig );
                   2763:     aSig |= LIT64( 0x4000000000000000 );
                   2764:  aBigger:
                   2765:     zSig = aSig - bSig;
                   2766:     zExp = aExp;
                   2767:  normalizeRoundAndPack:
                             /* Both the aBigger and bBigger paths end up here. */
                   2768:     --zExp;
                   2769:     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
                   2770:
                   2771: }
                   2772:
                   2773: /*
                   2774: -------------------------------------------------------------------------------
                   2775: Returns the result of adding the double-precision floating-point values `a'
                   2776: and `b'.  The operation is performed according to the IEC/IEEE Standard for
                   2777: Binary Floating-Point Arithmetic.
                   2778: -------------------------------------------------------------------------------
                   2779: */
                   2780: float64 float64_add( float64 a, float64 b )
                   2781: {
                   2782:     flag aSign, bSign;
                   2783:
                   2784:     aSign = extractFloat64Sign( a );
                   2785:     bSign = extractFloat64Sign( b );
                   2786:     if ( aSign == bSign ) {
                   2787:         return addFloat64Sigs( a, b, aSign );
                   2788:     }
                   2789:     else {
                   2790:         return subFloat64Sigs( a, b, aSign );
                   2791:     }
                   2792:
                   2793: }
                   2794:
                   2795: /*
                   2796: -------------------------------------------------------------------------------
                   2797: Returns the result of subtracting the double-precision floating-point values
                   2798: `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
                   2799: for Binary Floating-Point Arithmetic.
                   2800: -------------------------------------------------------------------------------
                   2801: */
                   2802: float64 float64_sub( float64 a, float64 b )
                   2803: {
                   2804:     flag aSign, bSign;
                   2805:
                   2806:     aSign = extractFloat64Sign( a );
                   2807:     bSign = extractFloat64Sign( b );
                   2808:     if ( aSign == bSign ) {
                   2809:         return subFloat64Sigs( a, b, aSign );
                   2810:     }
                   2811:     else {
                   2812:         return addFloat64Sigs( a, b, aSign );
                   2813:     }
                   2814:
                   2815: }
                   2816:
                   2817: /*
                   2818: -------------------------------------------------------------------------------
                   2819: Returns the result of multiplying the double-precision floating-point values
                   2820: `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
                   2821: for Binary Floating-Point Arithmetic.
                   2822: -------------------------------------------------------------------------------
                   2823: */
                   2824: float64 float64_mul( float64 a, float64 b )
                   2825: {
                   2826:     flag aSign, bSign, zSign;
                   2827:     int16 aExp, bExp, zExp;
                   2828:     bits64 aSig, bSig, zSig0, zSig1;
                   2829:
                   2830:     aSig = extractFloat64Frac( a );
                   2831:     aExp = extractFloat64Exp( a );
                   2832:     aSign = extractFloat64Sign( a );
                   2833:     bSig = extractFloat64Frac( b );
                   2834:     bExp = extractFloat64Exp( b );
                   2835:     bSign = extractFloat64Sign( b );
                   2836:     zSign = aSign ^ bSign;
                   2837:     if ( aExp == 0x7FF ) {
                   2838:         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
                   2839:             return propagateFloat64NaN( a, b );
                   2840:         }
                   2841:         if ( ( bExp | bSig ) == 0 ) {
                   2842:             float_raise( float_flag_invalid );
                   2843:             return float64_default_nan;
                   2844:         }
                   2845:         return packFloat64( zSign, 0x7FF, 0 );
                   2846:     }
                   2847:     if ( bExp == 0x7FF ) {
                   2848:         if ( bSig ) return propagateFloat64NaN( a, b );
                   2849:         if ( ( aExp | aSig ) == 0 ) {
                   2850:             float_raise( float_flag_invalid );
                   2851:             return float64_default_nan;
                   2852:         }
                   2853:         return packFloat64( zSign, 0x7FF, 0 );
                   2854:     }
                   2855:     if ( aExp == 0 ) {
                   2856:         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
                   2857:         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
                   2858:     }
                   2859:     if ( bExp == 0 ) {
                   2860:         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
                   2861:         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
                   2862:     }
                   2863:     zExp = aExp + bExp - 0x3FF;
                   2864:     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
                   2865:     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
                   2866:     mul64To128( aSig, bSig, &zSig0, &zSig1 );
                   2867:     zSig0 |= ( zSig1 != 0 );
                   2868:     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
                   2869:         zSig0 <<= 1;
                   2870:         --zExp;
                   2871:     }
                   2872:     return roundAndPackFloat64( zSign, zExp, zSig0 );
                   2873:
                   2874: }
                   2875:
                   2876: /*
                   2877: -------------------------------------------------------------------------------
                   2878: Returns the result of dividing the double-precision floating-point value `a'
                   2879: by the corresponding value `b'.  The operation is performed according to
                   2880: the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
                   2881: -------------------------------------------------------------------------------
                   2882: */
                   2883: float64 float64_div( float64 a, float64 b )
                   2884: {
                             /*
                              * Divides `a' by `b'.  Special operands (NaN, infinity, zero) are
                              * handled first; the remaining cases go through a 64-bit
                              * quotient estimate that is corrected when it may be inexact.
                              */
                   2885:     flag aSign, bSign, zSign;
                   2886:     int16 aExp, bExp, zExp;
                   2887:     bits64 aSig, bSig, zSig;
                   2888:     bits64 rem0, rem1;
                   2889:     bits64 term0, term1;
                   2890:
                   2891:     aSig = extractFloat64Frac( a );
                   2892:     aExp = extractFloat64Exp( a );
                   2893:     aSign = extractFloat64Sign( a );
                   2894:     bSig = extractFloat64Frac( b );
                   2895:     bExp = extractFloat64Exp( b );
                   2896:     bSign = extractFloat64Sign( b );
                   2897:     zSign = aSign ^ bSign;
                   2898:     if ( aExp == 0x7FF ) {
                                 /* a is a NaN or an infinity. */
                   2899:         if ( aSig ) return propagateFloat64NaN( a, b );
                   2900:         if ( bExp == 0x7FF ) {
                                     /* infinity / NaN propagates the NaN;
                                        infinity / infinity is invalid. */
                   2901:             if ( bSig ) return propagateFloat64NaN( a, b );
                   2902:             float_raise( float_flag_invalid );
                   2903:             return float64_default_nan;
                   2904:         }
                   2905:         return packFloat64( zSign, 0x7FF, 0 );
                   2906:     }
                   2907:     if ( bExp == 0x7FF ) {
                                 /* finite / NaN propagates the NaN;
                                    finite / infinity gives a signed zero. */
                   2908:         if ( bSig ) return propagateFloat64NaN( a, b );
                   2909:         return packFloat64( zSign, 0, 0 );
                   2910:     }
                   2911:     if ( bExp == 0 ) {
                   2912:         if ( bSig == 0 ) {
                                     /* Division by zero: 0 / 0 is invalid; anything else
                                        raises divbyzero and yields a signed infinity. */
                   2913:             if ( ( aExp | aSig ) == 0 ) {
                   2914:                 float_raise( float_flag_invalid );
                   2915:                 return float64_default_nan;
                   2916:             }
                   2917:             float_raise( float_flag_divbyzero );
                   2918:             return packFloat64( zSign, 0x7FF, 0 );
                   2919:         }
                   2920:         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
                   2921:     }
                   2922:     if ( aExp == 0 ) {
                                 /* A zero dividend gives a signed zero. */
                   2923:         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
                   2924:         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
                   2925:     }
                   2926:     zExp = aExp - bExp + 0x3FD;
                             /* Set the implicit leading bit; a's ends up at bit 62 and
                                b's at bit 63. */
                   2927:     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
                   2928:     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
                             /* Halve aSig (and bump the exponent) when bSig <= 2*aSig.
                                NOTE(review): presumably this keeps the quotient estimate
                                within 64 bits -- confirm against estimateDiv128To64. */
                   2929:     if ( bSig <= ( aSig + aSig ) ) {
                   2930:         aSig >>= 1;
                   2931:         ++zExp;
                   2932:     }
                   2933:     zSig = estimateDiv128To64( aSig, 0, bSig );
                             /* Only when the low 9 bits of the estimate are 0, 1 or 2 is
                                the exact remainder computed and the estimate corrected. */
                   2934:     if ( ( zSig & 0x1FF ) <= 2 ) {
                   2935:         mul64To128( bSig, zSig, &term0, &term1 );
                   2936:         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
                                 /* A negative remainder means the estimate was too large:
                                    step the quotient down and add bSig back. */
                   2937:         while ( (sbits64) rem0 < 0 ) {
                   2938:             --zSig;
                   2939:             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
                   2940:         }
                                 /* Record a non-zero remainder in the lowest bit. */
                   2941:         zSig |= ( rem1 != 0 );
                   2942:     }
                   2943:     return roundAndPackFloat64( zSign, zExp, zSig );
                   2944:
                   2945: }
                   2946:
                   2947: #ifndef SOFTFLOAT_FOR_GCC
                   2948: /*
                   2949: -------------------------------------------------------------------------------
                   2950: Returns the remainder of the double-precision floating-point value `a'
                   2951: with respect to the corresponding value `b'.  The operation is performed
                   2952: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
                   2953: -------------------------------------------------------------------------------
                   2954: */
                   2955: float64 float64_rem( float64 a, float64 b )
                   2956: {
                             /*
                              * Computes the remainder of `a' with respect to `b'.  After the
                              * special operands are dealt with, the significand of `a' is
                              * reduced against that of `b' in steps, `q' tracks the low
                              * bits of the quotient, and the final remainder is the one of
                              * the two candidates that is closer to zero.
                              */
                   2957:     flag aSign, bSign, zSign;
                   2958:     int16 aExp, bExp, expDiff;
                   2959:     bits64 aSig, bSig;
                   2960:     bits64 q, alternateASig;
                   2961:     sbits64 sigMean;
                   2962:
                   2963:     aSig = extractFloat64Frac( a );
                   2964:     aExp = extractFloat64Exp( a );
                   2965:     aSign = extractFloat64Sign( a );
                   2966:     bSig = extractFloat64Frac( b );
                   2967:     bExp = extractFloat64Exp( b );
                   2968:     bSign = extractFloat64Sign( b );
                   2969:     if ( aExp == 0x7FF ) {
                                 /* A NaN operand is propagated; otherwise `a' is an
                                    infinity and the operation is invalid. */
                   2970:         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
                   2971:             return propagateFloat64NaN( a, b );
                   2972:         }
                   2973:         float_raise( float_flag_invalid );
                   2974:         return float64_default_nan;
                   2975:     }
                   2976:     if ( bExp == 0x7FF ) {
                                 /* NaN `b' is propagated; with an infinite `b' the result
                                    is `a' itself. */
                   2977:         if ( bSig ) return propagateFloat64NaN( a, b );
                   2978:         return a;
                   2979:     }
                   2980:     if ( bExp == 0 ) {
                   2981:         if ( bSig == 0 ) {
                                     /* Remainder with respect to zero is invalid. */
                   2982:             float_raise( float_flag_invalid );
                   2983:             return float64_default_nan;
                   2984:         }
                   2985:         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
                   2986:     }
                   2987:     if ( aExp == 0 ) {
                                 /* A zero `a' is returned unchanged. */
                   2988:         if ( aSig == 0 ) return a;
                   2989:         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
                   2990:     }
                   2991:     expDiff = aExp - bExp;
                             /* Set the implicit leading bit and move it to bit 63. */
                   2992:     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
                   2993:     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
                   2994:     if ( expDiff < 0 ) {
                                 /* With an exponent at least two below b's, `a' is returned
                                    as is; with a difference of exactly -1, aSig is halved
                                    so both significands share b's scale. */
                   2995:         if ( expDiff < -1 ) return a;
                   2996:         aSig >>= 1;
                   2997:     }
                             /* First quotient bit: 1 when bSig fits into aSig. */
                   2998:     q = ( bSig <= aSig );
                   2999:     if ( q ) aSig -= bSig;
                   3000:     expDiff -= 64;
                             /* Each pass consumes 62 bits of exponent difference.  The
                                quotient estimate is lowered by 2 (clamped at 0) and aSig
                                becomes the negated low 64 bits of ( bSig>>2 ) * q.
                                NOTE(review): the `- 2' presumably compensates for the
                                estimate being slightly too large -- confirm against
                                estimateDiv128To64. */
                   3001:     while ( 0 < expDiff ) {
                   3002:         q = estimateDiv128To64( aSig, 0, bSig );
                   3003:         q = ( 2 < q ) ? q - 2 : 0;
                   3004:         aSig = - ( ( bSig>>2 ) * q );
                   3005:         expDiff -= 62;
                   3006:     }
                   3007:     expDiff += 64;
                   3008:     if ( 0 < expDiff ) {
                                 /* Final partial step: keep only the top expDiff bits of
                                    the lowered quotient estimate. */
                   3009:         q = estimateDiv128To64( aSig, 0, bSig );
                   3010:         q = ( 2 < q ) ? q - 2 : 0;
                   3011:         q >>= 64 - expDiff;
                   3012:         bSig >>= 2;
                   3013:         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
                   3014:     }
                   3015:     else {
                   3016:         aSig >>= 2;
                   3017:         bSig >>= 2;
                   3018:     }
                             /* Keep subtracting bSig (counting in q) until aSig goes
                                negative; alternateASig holds the last non-negative value. */
                   3019:     do {
                   3020:         alternateASig = aSig;
                   3021:         ++q;
                   3022:         aSig -= bSig;
                   3023:     } while ( 0 <= (sbits64) aSig );
                             /* Choose between the negative candidate (aSig) and the
                                non-negative one (alternateASig): a negative sum means the
                                non-negative one is smaller in magnitude; on a tie it is
                                taken only when q is odd. */
                   3024:     sigMean = aSig + alternateASig;
                   3025:     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
                   3026:         aSig = alternateASig;
                   3027:     }
                             /* A negative remainder flips the sign of the result relative
                                to `a'; the magnitude is packed with b's exponent. */
                   3028:     zSign = ( (sbits64) aSig < 0 );
                   3029:     if ( zSign ) aSig = - aSig;
                   3030:     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
                   3031:
                   3032: }
                   3033:
                   3034: /*
                   3035: -------------------------------------------------------------------------------
                   3036: Returns the square root of the double-precision floating-point value `a'.
                   3037: The operation is performed according to the IEC/IEEE Standard for Binary
                   3038: Floating-Point Arithmetic.
                   3039: -------------------------------------------------------------------------------
                   3040: */
                   3041: float64 float64_sqrt( float64 a )
                   3042: {
                             /*
                              * Computes the square root of `a'.  Special operands are handled
                              * first; otherwise an initial estimate is refined and, when it
                              * may be inexact, corrected against the exact remainder.
                              */
                   3043:     flag aSign;
                   3044:     int16 aExp, zExp;
                   3045:     bits64 aSig, zSig, doubleZSig;
                   3046:     bits64 rem0, rem1, term0, term1;
                   3047:
                   3048:     aSig = extractFloat64Frac( a );
                   3049:     aExp = extractFloat64Exp( a );
                   3050:     aSign = extractFloat64Sign( a );
                   3051:     if ( aExp == 0x7FF ) {
                                 /* NaN is propagated, positive infinity is returned as is,
                                    negative infinity is invalid. */
                   3052:         if ( aSig ) return propagateFloat64NaN( a, a );
                   3053:         if ( ! aSign ) return a;
                   3054:         float_raise( float_flag_invalid );
                   3055:         return float64_default_nan;
                   3056:     }
                   3057:     if ( aSign ) {
                                 /* Negative zero is returned unchanged; any other negative
                                    operand is invalid. */
                   3058:         if ( ( aExp | aSig ) == 0 ) return a;
                   3059:         float_raise( float_flag_invalid );
                   3060:         return float64_default_nan;
                   3061:     }
                   3062:     if ( aExp == 0 ) {
                                 /* Positive zero: the literal 0 is returned. */
                   3063:         if ( aSig == 0 ) return 0;
                   3064:         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
                   3065:     }
                             /* Halve the unbiased exponent and re-bias it. */
                   3066:     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
                   3067:     aSig |= LIT64( 0x0010000000000000 );
                             /* Initial estimate from the top bits of the significand. */
                   3068:     zSig = estimateSqrt32( aExp, aSig>>21 );
                             /* The shift is one bit shorter when the exponent is odd. */
                   3069:     aSig <<= 9 - ( aExp & 1 );
                             /* Refine the estimate with one division step. */
                   3070:     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
                             /* Only when the low 9 bits of the estimate are at most 5 is
                                the exact remainder aSig - zSig*zSig computed. */
                   3071:     if ( ( zSig & 0x1FF ) <= 5 ) {
                   3072:         doubleZSig = zSig<<1;
                   3073:         mul64To128( zSig, zSig, &term0, &term1 );
                   3074:         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
                                 /* Negative remainder: the estimate is too large.  Step
                                    zSig down by one and add back 2*zSig + 1 (using the
                                    new zSig), which is the change in zSig squared. */
                   3075:         while ( (sbits64) rem0 < 0 ) {
                   3076:             --zSig;
                   3077:             doubleZSig -= 2;
                   3078:             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
                   3079:         }
                                 /* Record a non-zero remainder in the lowest bit. */
                   3080:         zSig |= ( ( rem0 | rem1 ) != 0 );
                   3081:     }
                   3082:     return roundAndPackFloat64( 0, zExp, zSig );
                   3083:
                   3084: }
                   3085: #endif
                   3086:
                   3087: /*
                   3088: -------------------------------------------------------------------------------
                   3089: Returns 1 if the double-precision floating-point value `a' is equal to the
                   3090: corresponding value `b', and 0 otherwise.  The comparison is performed
                   3091: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
                   3092: -------------------------------------------------------------------------------
                   3093: */
                   3094: flag float64_eq( float64 a, float64 b )
                   3095: {
                   3096:
                   3097:     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
                   3098:          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
                   3099:        ) {
                   3100:         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
                   3101:             float_raise( float_flag_invalid );
                   3102:         }
                   3103:         return 0;
                   3104:     }
                   3105:     return ( a == b ) ||
                   3106:        ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) == 0 );
                   3107:
                   3108: }
                   3109:
                   3110: /*
                   3111: -------------------------------------------------------------------------------
                   3112: Returns 1 if the double-precision floating-point value `a' is less than or
                   3113: equal to the corresponding value `b', and 0 otherwise.  The comparison is
                   3114: performed according to the IEC/IEEE Standard for Binary Floating-Point
                   3115: Arithmetic.
                   3116: -------------------------------------------------------------------------------
                   3117: */
                   3118: flag float64_le( float64 a, float64 b )
                   3119: {
                   3120:     flag aSign, bSign;
                   3121:
                   3122:     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
                   3123:          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
                   3124:        ) {
                   3125:         float_raise( float_flag_invalid );
                   3126:         return 0;
                   3127:     }
                   3128:     aSign = extractFloat64Sign( a );
                   3129:     bSign = extractFloat64Sign( b );
                   3130:     if ( aSign != bSign )
                   3131:        return aSign ||
                   3132:            ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) ==
                   3133:              0 );
                   3134:     return ( a == b ) ||
                   3135:        ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
                   3136:
                   3137: }
                   3138:
                   3139: /*
                   3140: -------------------------------------------------------------------------------
                   3141: Returns 1 if the double-precision floating-point value `a' is less than
                   3142: the corresponding value `b', and 0 otherwise.  The comparison is performed
                   3143: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
                   3144: -------------------------------------------------------------------------------
                   3145: */
                   3146: flag float64_lt( float64 a, float64 b )
                   3147: {
                   3148:     flag aSign, bSign;
                   3149:
                   3150:     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
                   3151:          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
                   3152:        ) {
                   3153:         float_raise( float_flag_invalid );
                   3154:         return 0;
                   3155:     }
                   3156:     aSign = extractFloat64Sign( a );
                   3157:     bSign = extractFloat64Sign( b );
                   3158:     if ( aSign != bSign )
                   3159:        return aSign &&
                   3160:            ( (bits64) ( ( FLOAT64_DEMANGLE(a) | FLOAT64_DEMANGLE(b) )<<1 ) !=
                   3161:              0 );
                   3162:     return ( a != b ) &&
                   3163:        ( aSign ^ ( FLOAT64_DEMANGLE(a) < FLOAT64_DEMANGLE(b) ) );
                   3164:
                   3165: }
                   3166:
                   3167: #ifndef SOFTFLOAT_FOR_GCC
                   3168: /*
                   3169: -------------------------------------------------------------------------------
                   3170: Returns 1 if the double-precision floating-point value `a' is equal to the
                   3171: corresponding value `b', and 0 otherwise.  The invalid exception is raised
                   3172: if either operand is a NaN.  Otherwise, the comparison is performed
                   3173: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
                   3174: -------------------------------------------------------------------------------
                   3175: */
                   3176: flag float64_eq_signaling( float64 a, float64 b )
                   3177: {
                   3178:
                   3179:     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
                   3180:          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
                   3181:        ) {
                   3182:         float_raise( float_flag_invalid );
                   3183:         return 0;
                   3184:     }
                   3185:     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
                   3186:
                   3187: }
                   3188:
                   3189: /*
                   3190: -------------------------------------------------------------------------------
                   3191: Returns 1 if the double-precision floating-point value `a' is less than or
                   3192: equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
                   3193: cause an exception.  Otherwise, the comparison is performed according to the
                   3194: IEC/IEEE Standard for Binary Floating-Point Arithmetic.
                   3195: -------------------------------------------------------------------------------
                   3196: */
                   3197: flag float64_le_quiet( float64 a, float64 b )
                   3198: {
                   3199:     flag aSign, bSign;
                   3200:
                   3201:     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
                   3202:          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
                   3203:        ) {
                   3204:         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
                   3205:             float_raise( float_flag_invalid );
                   3206:         }
                   3207:         return 0;
                   3208:     }
                   3209:     aSign = extractFloat64Sign( a );
                   3210:     bSign = extractFloat64Sign( b );
                   3211:     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
                   3212:     return ( a == b ) || ( aSign ^ ( a < b ) );
                   3213:
                   3214: }
                   3215:
                   3216: /*
                   3217: -------------------------------------------------------------------------------
                   3218: Returns 1 if the double-precision floating-point value `a' is less than
                   3219: the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
                   3220: exception.  Otherwise, the comparison is performed according to the IEC/IEEE
                   3221: Standard for Binary Floating-Point Arithmetic.
                   3222: -------------------------------------------------------------------------------
                   3223: */
                   3224: flag float64_lt_quiet( float64 a, float64 b )
                   3225: {
                   3226:     flag aSign, bSign;
                   3227:
                   3228:     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
                   3229:          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
                   3230:        ) {
                   3231:         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
                   3232:             float_raise( float_flag_invalid );
                   3233:         }
                   3234:         return 0;
                   3235:     }
                   3236:     aSign = extractFloat64Sign( a );
                   3237:     bSign = extractFloat64Sign( b );
                   3238:     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
                   3239:     return ( a != b ) && ( aSign ^ ( a < b ) );
                   3240:
                   3241: }
                   3242: #endif
                   3243:
                   3244: #ifdef FLOATX80
                   3245:
                   3246: /*
                   3247: -------------------------------------------------------------------------------
                   3248: Returns the result of converting the extended double-precision floating-
                   3249: point value `a' to the 32-bit two's complement integer format.  The
                   3250: conversion is performed according to the IEC/IEEE Standard for Binary
                   3251: Floating-Point Arithmetic---which means in particular that the conversion
                   3252: is rounded according to the current rounding mode.  If `a' is a NaN, the
                   3253: largest positive integer is returned.  Otherwise, if the conversion
                   3254: overflows, the largest integer with the same sign as `a' is returned.
                   3255: -------------------------------------------------------------------------------
                   3256: */
                   3257: int32 floatx80_to_int32( floatx80 a )
                   3258: {
                             /* Unpack the operand: significand, biased exponent, sign. */
                             bits64 mant = extractFloatx80Frac( a );
                             int32 bexp = extractFloatx80Exp( a );
                             flag neg = extractFloatx80Sign( a );
                             int32 rightShift;

                             /*
                              * Maximum exponent with any significand bit set below the top
                              * bit is a NaN: drop the sign so the result comes out positive,
                              * as the header comment requires.
                              */
                             if ( bexp == 0x7FFF ) {
                                 if ( (bits64) ( mant<<1 ) ) {
                                     neg = 0;
                                 }
                             }

                             /* Never shift by less than one bit. */
                             rightShift = 0x4037 - bexp;
                             if ( rightShift < 1 ) {
                                 rightShift = 1;
                             }
                             shift64RightJamming( mant, rightShift, &mant );

                             /* Rounding is left to roundAndPackInt32(). */
                             return roundAndPackInt32( neg, mant );
                   3271:
                   3272: }
                   3273:
                   3274: /*
                   3275: -------------------------------------------------------------------------------
                   3276: Returns the result of converting the extended double-precision floating-
                   3277: point value `a' to the 32-bit two's complement integer format.  The
                   3278: conversion is performed according to the IEC/IEEE Standard for Binary
                   3279: Floating-Point Arithmetic, except that the conversion is always rounded
                   3280: toward zero.  If `a' is a NaN, the largest positive integer is returned.
                   3281: Otherwise, if the conversion overflows, the largest integer with the same
                   3282: sign as `a' is returned.
                   3283: -------------------------------------------------------------------------------
                   3284: */
                   3285: int32 floatx80_to_int32_round_to_zero( floatx80 a )
                   3286: {
                             bits64 mant = extractFloatx80Frac( a );
                             int32 bexp = extractFloatx80Exp( a );
                             flag neg = extractFloatx80Sign( a );
                             bits64 whole;
                             int32 rightShift, z;

                             /* Exponent above 0x401E: always handled as invalid. */
                             if ( 0x401E < bexp ) {
                                 /* A NaN is reported with the positive saturation value. */
                                 if ( ( bexp == 0x7FFF ) && (bits64) ( mant<<1 ) ) {
                                     neg = 0;
                                 }
                                 goto invalid;
                             }

                             /* Exponent below 0x3FFF: result is 0, inexact unless `a' is zero. */
                             if ( bexp < 0x3FFF ) {
                                 if ( bexp || mant ) {
                                     float_set_inexact();
                                 }
                                 return 0;
                             }

                             /* Truncate by shifting the discarded bits off the bottom. */
                             rightShift = 0x403E - bexp;
                             whole = mant>>rightShift;
                             z = whole;
                             if ( neg ) {
                                 z = - z;
                             }

                             /* A result whose sign disagrees with the operand is invalid too. */
                             if ( ( z < 0 ) ^ neg ) {
                                 goto invalid;
                             }

                             /* Shifting back reveals whether any nonzero bits were dropped. */
                             if ( ( whole<<rightShift ) != mant ) {
                                 float_set_inexact();
                             }
                             return z;

                          invalid:
                             float_raise( float_flag_invalid );
                             return neg ? (sbits32) 0x80000000 : 0x7FFFFFFF;
                   3317:
                   3318: }
                   3319:
                   3320: /*
                   3321: -------------------------------------------------------------------------------
                   3322: Returns the result of converting the extended double-precision floating-
                   3323: point value `a' to the 64-bit two's complement integer format.  The
                   3324: conversion is performed according to the IEC/IEEE Standard for Binary
                   3325: Floating-Point Arithmetic---which means in particular that the conversion
                   3326: is rounded according to the current rounding mode.  If `a' is a NaN,
                   3327: the largest positive integer is returned.  Otherwise, if the conversion
                   3328: overflows, the largest integer with the same sign as `a' is returned.
                   3329: -------------------------------------------------------------------------------
                   3330: */
                   3331: int64 floatx80_to_int64( floatx80 a )
                   3332: {
                             bits64 mant = extractFloatx80Frac( a );
                             int32 bexp = extractFloatx80Exp( a );
                             flag neg = extractFloatx80Sign( a );
                             int32 rightShift = 0x403E - bexp;
                             bits64 extra;

                             /* Exponent above 0x403E: raise invalid and saturate. */
                             if ( rightShift < 0 ) {
                                 float_raise( float_flag_invalid );
                                 /*
                                  * Only a negative operand that is not a NaN yields the most
                                  * negative integer; everything else yields the largest
                                  * positive one.
                                  */
                                 if (    neg
                                      && ! (    ( bexp == 0x7FFF )
                                             && ( mant != LIT64( 0x8000000000000000 ) ) )
                                    ) {
                                     return (sbits64) LIT64( 0x8000000000000000 );
                                 }
                                 return LIT64( 0x7FFFFFFFFFFFFFFF );
                             }

                             if ( rightShift == 0 ) {
                                 /* Nothing is shifted out, so there are no extra bits. */
                                 extra = 0;
                             }
                             else {
                                 shift64ExtraRightJamming( mant, 0, rightShift, &mant, &extra );
                             }

                             /* Rounding is left to roundAndPackInt64(). */
                             return roundAndPackInt64( neg, mant, extra );
                   3358:
                   3359: }
                   3360:
                   3361: /*
                   3362: -------------------------------------------------------------------------------
                   3363: Returns the result of converting the extended double-precision floating-
                   3364: point value `a' to the 64-bit two's complement integer format.  The
                   3365: conversion is performed according to the IEC/IEEE Standard for Binary
                   3366: Floating-Point Arithmetic, except that the conversion is always rounded
                   3367: toward zero.  If `a' is a NaN, the largest positive integer is returned.
                   3368: Otherwise, if the conversion overflows, the largest integer with the same
                   3369: sign as `a' is returned.
                   3370: -------------------------------------------------------------------------------
                   3371: */
                   3372: int64 floatx80_to_int64_round_to_zero( floatx80 a )
                   3373: {
                             bits64 mant = extractFloatx80Frac( a );
                             int32 bexp = extractFloatx80Exp( a );
                             flag neg = extractFloatx80Sign( a );
                             int32 expExcess = bexp - 0x403E;
                             bits64 lowBits;
                             int64 z;

                             if ( 0 <= expExcess ) {
                                 /* Significand with its top bit masked off. */
                                 lowBits = mant & LIT64( 0x7FFFFFFFFFFFFFFF );

                                 /*
                                  * a.high == 0xC03E with no low significand bits is the only
                                  * operand in this range returned without raising invalid.
                                  */
                                 if ( ( a.high == 0xC03E ) && ! lowBits ) {
                                     return (sbits64) LIT64( 0x8000000000000000 );
                                 }
                                 float_raise( float_flag_invalid );
                                 /* Negative non-NaN operands saturate low, the rest high. */
                                 if ( neg && ! ( ( bexp == 0x7FFF ) && lowBits ) ) {
                                     return (sbits64) LIT64( 0x8000000000000000 );
                                 }
                                 return LIT64( 0x7FFFFFFFFFFFFFFF );
                             }

                             /* Exponent below 0x3FFF: result is 0, inexact unless `a' is zero. */
                             if ( bexp < 0x3FFF ) {
                                 if ( bexp | mant ) {
                                     float_set_inexact();
                                 }
                                 return 0;
                             }

                             /* Any nonzero bit about to be shifted out makes the result inexact. */
                             if ( (bits64) ( mant<<( expExcess & 63 ) ) ) {
                                 float_set_inexact();
                             }
                             z = mant>>( - expExcess );
                             return neg ? - z : z;
                   3403:
                   3404: }
                   3405:
                   3406: /*
                   3407: -------------------------------------------------------------------------------
                   3408: Returns the result of converting the extended double-precision floating-
                   3409: point value `a' to the single-precision floating-point format.  The
                   3410: conversion is performed according to the IEC/IEEE Standard for Binary
                   3411: Floating-Point Arithmetic.
                   3412: -------------------------------------------------------------------------------
                   3413: */
                   3414: float32 floatx80_to_float32( floatx80 a )
                   3415: {
                             bits64 mant = extractFloatx80Frac( a );
                             int32 bexp = extractFloatx80Exp( a );
                             flag neg = extractFloatx80Sign( a );

                             if ( bexp != 0x7FFF ) {
                                 /* Narrow the significand, jamming lost bits into the LSB. */
                                 shift64RightJamming( mant, 33, &mant );
                                 /* Rebias the exponent unless exponent and shifted significand are both zero. */
                                 if ( bexp || mant ) {
                                     bexp -= 0x3F81;
                                 }
                                 return roundAndPackFloat32( neg, bexp, mant );
                             }

                             /* Maximum exponent: no bits below the top bit means infinity. */
                             if ( (bits64) ( mant<<1 ) == 0 ) {
                                 return packFloat32( neg, 0xFF, 0 );
                             }
                             /* Otherwise it is a NaN; convert through the common NaN form. */
                             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
                   3432:
                   3433: }
                   3434:
                   3435: /*
                   3436: -------------------------------------------------------------------------------
                   3437: Returns the result of converting the extended double-precision floating-
                   3438: point value `a' to the double-precision floating-point format.  The
                   3439: conversion is performed according to the IEC/IEEE Standard for Binary
                   3440: Floating-Point Arithmetic.
                   3441: -------------------------------------------------------------------------------
                   3442: */
                   3443: float64 floatx80_to_float64( floatx80 a )
                   3444: {
                             bits64 mant = extractFloatx80Frac( a );
                             int32 bexp = extractFloatx80Exp( a );
                             flag neg = extractFloatx80Sign( a );
                             bits64 narrowed;

                             if ( bexp != 0x7FFF ) {
                                 /* Drop one bit, jamming it into the new LSB. */
                                 shift64RightJamming( mant, 1, &narrowed );
                                 /* Rebias unless exponent and original significand are both zero. */
                                 if ( bexp || mant ) {
                                     bexp -= 0x3C01;
                                 }
                                 return roundAndPackFloat64( neg, bexp, narrowed );
                             }

                             /* Maximum exponent: no bits below the top bit means infinity. */
                             if ( (bits64) ( mant<<1 ) == 0 ) {
                                 return packFloat64( neg, 0x7FF, 0 );
                             }
                             /* Otherwise it is a NaN; convert through the common NaN form. */
                             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
                   3461:
                   3462: }
                   3463:
                   3464: #ifdef FLOAT128
                   3465:
                   3466: /*
                   3467: -------------------------------------------------------------------------------
                   3468: Returns the result of converting the extended double-precision floating-
                   3469: point value `a' to the quadruple-precision floating-point format.  The
                   3470: conversion is performed according to the IEC/IEEE Standard for Binary
                   3471: Floating-Point Arithmetic.
                   3472: -------------------------------------------------------------------------------
                   3473: */
                   3474: float128 floatx80_to_float128( floatx80 a )
                   3475: {
                             bits64 mant = extractFloatx80Frac( a );
                             int16 bexp = extractFloatx80Exp( a );
                             flag neg = extractFloatx80Sign( a );
                             bits64 hi, lo;

                             /* NaNs are converted through the common NaN form. */
                             if ( bexp == 0x7FFF ) {
                                 if ( (bits64) ( mant<<1 ) ) {
                                     return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
                                 }
                             }

                             /*
                              * Discard the top significand bit (mant<<1), then spread the
                              * remainder across the two result words, 16 bits down.  The
                              * exponent and sign are passed through unchanged.
                              */
                             shift128Right( mant<<1, 0, 16, &hi, &lo );
                             return packFloat128( neg, bexp, hi, lo );
                   3488:
                   3489: }
                   3490:
                   3491: #endif
                   3492:
                   3493: /*
                   3494: -------------------------------------------------------------------------------
                   3495: Rounds the extended double-precision floating-point value `a' to an integer,
                   3496: and returns the result as an extended double-precision floating-point
                   3497: value.  The operation is performed according to the IEC/IEEE Standard for
                   3498: Binary Floating-Point Arithmetic.
                   3499: -------------------------------------------------------------------------------
                   3500: */
                   3501: floatx80 floatx80_round_to_int( floatx80 a )
                   3502: {
                   3503:     flag aSign;
                   3504:     int32 aExp;
                   3505:     bits64 lastBitMask, roundBitsMask;
                   3506:     int8 roundingMode;
                   3507:     floatx80 z;
                   3508:
                   3509:     aExp = extractFloatx80Exp( a );
                             /*
                              * Exponent 0x403E or above: the operand is returned unchanged
                              * (presumably no fraction bits remain at that magnitude), except
                              * that a NaN -- exponent 0x7FFF with bits set below the top
                              * significand bit -- goes through propagateFloatx80NaN().
                              */
                   3510:     if ( 0x403E <= aExp ) {
                   3511:         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
                   3512:             return propagateFloatx80NaN( a, a );
                   3513:         }
                   3514:         return a;
                   3515:     }
                             /*
                              * Exponent below 0x3FFF: the result is one of the two values
                              * packed below, exponent 0 with zero significand or exponent
                              * 0x3FFF with only the top significand bit set (presumably 0 and
                              * 1 in magnitude), chosen by rounding mode.
                              */
                   3516:     if ( aExp < 0x3FFF ) {
                                 /* Zero exponent and no bits below the top bit: return as is. */
                   3517:         if (    ( aExp == 0 )
                   3518:              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
                   3519:             return a;
                   3520:         }
                                 /* Anything else here changes value, so it is inexact. */
                   3521:         float_set_inexact();
                   3522:         aSign = extractFloatx80Sign( a );
                   3523:         switch ( float_rounding_mode() ) {
                   3524:          case float_round_nearest_even:
                                     /*
                                      * Only exponent 0x3FFE with a nonzero fraction rounds
                                      * away from zero; everything else falls out of the
                                      * switch to the zero result.
                                      */
                   3525:             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
                   3526:                ) {
                   3527:                 return
                   3528:                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
                   3529:             }
                   3530:             break;
                   3531:          case float_round_down:
                                     /* Negative operands go to the nonzero value, positive to +0. */
                   3532:             return
                   3533:                   aSign ?
                   3534:                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
                   3535:                 : packFloatx80( 0, 0, 0 );
                   3536:          case float_round_up:
                                     /* Negative operands go to -0, positive to the nonzero value. */
                   3537:             return
                   3538:                   aSign ? packFloatx80( 1, 0, 0 )
                   3539:                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
                   3540:         }
                                 /* Remaining cases (including any other mode): signed zero. */
                   3541:         return packFloatx80( aSign, 0, 0 );
                   3542:     }
                             /*
                              * General case.  lastBitMask is a single bit at position
                              * 0x403E - aExp (the lowest bit kept); roundBitsMask covers every
                              * bit below it (the bits to be cleared).
                              */
                   3543:     lastBitMask = 1;
                   3544:     lastBitMask <<= 0x403E - aExp;
                   3545:     roundBitsMask = lastBitMask - 1;
                   3546:     z = a;
                   3547:     roundingMode = float_rounding_mode();
                   3548:     if ( roundingMode == float_round_nearest_even ) {
                                 /*
                                  * Add half of the last kept bit.  If the discarded bits are
                                  * then all zero the operand was exactly halfway, so the last
                                  * kept bit is cleared to make the result even.
                                  */
                   3549:         z.low += lastBitMask>>1;
                   3550:         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
                   3551:     }
                   3552:     else if ( roundingMode != float_round_to_zero ) {
                                 /*
                                  * Bump the magnitude when the sign bit and "mode is round_up"
                                  * differ: a positive operand under round_up, or a negative
                                  * operand under the remaining mode.
                                  */
                   3553:         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
                   3554:             z.low += roundBitsMask;
                   3555:         }
                   3556:     }
                             /* Clear the discarded bits. */
                   3557:     z.low &= ~ roundBitsMask;
                             /*
                              * A zero significand here means the addition above wrapped around:
                              * step z.high by one and set only the top significand bit.
                              */
                   3558:     if ( z.low == 0 ) {
                   3559:         ++z.high;
                   3560:         z.low = LIT64( 0x8000000000000000 );
                   3561:     }
                             /* Any change to the significand means the result is inexact. */
                   3562:     if ( z.low != a.low ) float_set_inexact();
                   3563:     return z;
                   3564:
                   3565: }
                   3566:
                   3567: /*
                   3568: -------------------------------------------------------------------------------
                   3569: Returns the result of adding the absolute values of the extended double-
                   3570: precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
                   3571: negated before being returned.  `zSign' is ignored if the result is a NaN.
                   3572: The addition is performed according to the IEC/IEEE Standard for Binary
                   3573: Floating-Point Arithmetic.
                   3574: -------------------------------------------------------------------------------
                   3575: */
                   3576: static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
                   3577: {
                   3578:     int32 aExp, bExp, zExp;
                   3579:     bits64 aSig, bSig, zSig0, zSig1;
                   3580:     int32 expDiff;
                   3581:
                   3582:     aSig = extractFloatx80Frac( a );
                   3583:     aExp = extractFloatx80Exp( a );
                   3584:     bSig = extractFloatx80Frac( b );
                   3585:     bExp = extractFloatx80Exp( b );
                   3586:     expDiff = aExp - bExp;
                   3587:     if ( 0 < expDiff ) {
                                 /* `a' has the larger exponent. */
                   3588:         if ( aExp == 0x7FFF ) {
                                     /* NaN propagates; otherwise `a' (exponent 0x7FFF) is returned as is. */
                   3589:             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
                   3590:             return a;
                   3591:         }
                                 /* A zero exponent on `b' reduces the alignment shift by one. */
                   3592:         if ( bExp == 0 ) --expDiff;
                                 /* Align `b' to `a'; shifted-out bits are collected in zSig1. */
                   3593:         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
                   3594:         zExp = aExp;
                   3595:     }
                   3596:     else if ( expDiff < 0 ) {
                                 /* `b' has the larger exponent: mirror image of the case above. */
                   3597:         if ( bExp == 0x7FFF ) {
                   3598:             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
                   3599:             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
                   3600:         }
                   3601:         if ( aExp == 0 ) ++expDiff;
                   3602:         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
                   3603:         zExp = bExp;
                   3604:     }
                   3605:     else {
                                 /* Equal exponents: no alignment needed. */
                   3606:         if ( aExp == 0x7FFF ) {
                   3607:             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
                   3608:                 return propagateFloatx80NaN( a, b );
                   3609:             }
                   3610:             return a;
                   3611:         }
                   3612:         zSig1 = 0;
                   3613:         zSig0 = aSig + bSig;
                   3614:         if ( aExp == 0 ) {
                                     /*
                                      * Both exponents are zero.  If the significands also sum
                                      * to zero (e.g. +0 + +0), return the signed zero directly,
                                      * as floatx80_mul() does before normalizing.  Handing a
                                      * zero significand to normalizeFloatx80Subnormal() would
                                      * presumably make it shift a 64-bit value left by its
                                      * leading-zero count of 64, which is undefined in C.
                                      * NOTE(review): a carry out of aSig + bSig on this path
                                      * (both top bits set) is not detected -- confirm whether
                                      * such operands can reach here.
                                      */
                                     if ( zSig0 == 0 ) return packFloatx80( zSign, 0, 0 );
                   3615:             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
                   3616:             goto roundAndPack;
                   3617:         }
                   3618:         zExp = aExp;
                                 /* The sum here always needs the one-bit renormalizing shift. */
                   3619:         goto shiftRight1;
                   3620:     }
                   3621:     zSig0 = aSig + bSig;
                             /* Top bit still set: the sum needs no renormalizing shift. */
                   3622:     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
                   3623:  shiftRight1:
                             /* Shift right one bit, set the top bit, bump the exponent. */
                   3624:     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
                   3625:     zSig0 |= LIT64( 0x8000000000000000 );
                   3626:     ++zExp;
                   3627:  roundAndPack:
                   3628:     return
                   3629:         roundAndPackFloatx80(
                   3630:             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
                   3631:
                   3632: }
                   3633:
                   3634: /*
                   3635: -------------------------------------------------------------------------------
                   3636: Returns the result of subtracting the absolute values of the extended
                   3637: double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
                   3638: difference is negated before being returned.  `zSign' is ignored if the
                   3639: result is a NaN.  The subtraction is performed according to the IEC/IEEE
                   3640: Standard for Binary Floating-Point Arithmetic.
                   3641: -------------------------------------------------------------------------------
                   3642: */
                   3643: static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
                   3644: {
                   3645:     int32 aExp, bExp, zExp;
                   3646:     bits64 aSig, bSig, zSig0, zSig1;
                   3647:     int32 expDiff;
                   3648:     floatx80 z;
                   3649:
                   3650:     aSig = extractFloatx80Frac( a );
                   3651:     aExp = extractFloatx80Exp( a );
                   3652:     bSig = extractFloatx80Frac( b );
                   3653:     bExp = extractFloatx80Exp( b );
                   3654:     expDiff = aExp - bExp;
                             /* Dispatch on which operand has the larger exponent. */
                   3655:     if ( 0 < expDiff ) goto aExpBigger;
                   3656:     if ( expDiff < 0 ) goto bExpBigger;
                             /* Equal exponents from here on. */
                   3657:     if ( aExp == 0x7FFF ) {
                   3658:         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
                   3659:             return propagateFloatx80NaN( a, b );
                   3660:         }
                                 /*
                                  * Both exponents are 0x7FFF and neither operand is a NaN:
                                  * raise invalid and return the default NaN.
                                  */
                   3661:         float_raise( float_flag_invalid );
                   3662:         z.low = floatx80_default_nan_low;
                   3663:         z.high = floatx80_default_nan_high;
                   3664:         return z;
                   3665:     }
                             /* Zero exponents are treated as exponent 1 for the result. */
                   3666:     if ( aExp == 0 ) {
                   3667:         aExp = 1;
                   3668:         bExp = 1;
                   3669:     }
                   3670:     zSig1 = 0;
                   3671:     if ( bSig < aSig ) goto aBigger;
                   3672:     if ( aSig < bSig ) goto bBigger;
                             /*
                              * Equal magnitudes: the difference is zero, with the sign bit set
                              * only when rounding down.
                              */
                   3673:     return packFloatx80( float_rounding_mode() == float_round_down, 0, 0 );
                   3674:  bExpBigger:
                   3675:     if ( bExp == 0x7FFF ) {
                   3676:         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
                                 /* `b' dominates, so the result takes the opposite sign. */
                   3677:         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
                   3678:     }
                             /* A zero exponent on `a' reduces the alignment shift by one. */
                   3679:     if ( aExp == 0 ) ++expDiff;
                             /* Align `a' to `b'; low-order bits spill into zSig1. */
                   3680:     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
                             /* Falls through: `b' is the larger magnitude. */
                   3681:  bBigger:
                   3682:     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
                   3683:     zExp = bExp;
                             /* Subtracting the larger `b' flips the result sign. */
                   3684:     zSign ^= 1;
                   3685:     goto normalizeRoundAndPack;
                   3686:  aExpBigger:
                   3687:     if ( aExp == 0x7FFF ) {
                   3688:         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
                   3689:         return a;
                   3690:     }
                             /* A zero exponent on `b' reduces the alignment shift by one. */
                   3691:     if ( bExp == 0 ) --expDiff;
                             /* Align `b' to `a'; low-order bits spill into zSig1. */
                   3692:     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
                             /* Falls through: `a' is the larger magnitude. */
                   3693:  aBigger:
                   3694:     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
                   3695:     zExp = aExp;
                   3696:  normalizeRoundAndPack:
                   3697:     return
                   3698:         normalizeRoundAndPackFloatx80(
                   3699:             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
                   3700:
                   3701: }
                   3702:
                   3703: /*
                   3704: -------------------------------------------------------------------------------
                   3705: Returns the result of adding the extended double-precision floating-point
                   3706: values `a' and `b'.  The operation is performed according to the IEC/IEEE
                   3707: Standard for Binary Floating-Point Arithmetic.
                   3708: -------------------------------------------------------------------------------
                   3709: */
                   3710: floatx80 floatx80_add( floatx80 a, floatx80 b )
                   3711: {
                             flag signA = extractFloatx80Sign( a );
                             flag signB = extractFloatx80Sign( b );

                             /*
                              * Operands of opposite sign are handled by the magnitude
                              * subtraction; like signs by the magnitude addition.  Either way
                              * the sign of `a' is passed as the result sign.
                              */
                             if ( signA != signB ) {
                                 return subFloatx80Sigs( a, b, signA );
                             }
                             return addFloatx80Sigs( a, b, signA );
                   3722:
                   3723: }
                   3724:
                   3725: /*
                   3726: -------------------------------------------------------------------------------
                   3727: Returns the result of subtracting the extended double-precision floating-
                   3728: point values `a' and `b'.  The operation is performed according to the
                   3729: IEC/IEEE Standard for Binary Floating-Point Arithmetic.
                   3730: -------------------------------------------------------------------------------
                   3731: */
                   3732: floatx80 floatx80_sub( floatx80 a, floatx80 b )
                   3733: {
                             flag signA = extractFloatx80Sign( a );
                             flag signB = extractFloatx80Sign( b );

                             /*
                              * Operands of opposite sign are handled by the magnitude
                              * addition; like signs by the magnitude subtraction.  Either way
                              * the sign of `a' is passed as the result sign.
                              */
                             if ( signA != signB ) {
                                 return addFloatx80Sigs( a, b, signA );
                             }
                             return subFloatx80Sigs( a, b, signA );
                   3744:
                   3745: }
                   3746:
                   3747: /*
                   3748: -------------------------------------------------------------------------------
                   3749: Returns the result of multiplying the extended double-precision floating-
                   3750: point values `a' and `b'.  The operation is performed according to the
                   3751: IEC/IEEE Standard for Binary Floating-Point Arithmetic.
                   3752: -------------------------------------------------------------------------------
                   3753: */
                   3754: floatx80 floatx80_mul( floatx80 a, floatx80 b )
                   3755: {
                   3756:     flag aSign, bSign, zSign;
                   3757:     int32 aExp, bExp, zExp;
                   3758:     bits64 aSig, bSig, zSig0, zSig1;
                   3759:     floatx80 z;
                   3760:
                   3761:     aSig = extractFloatx80Frac( a );
                   3762:     aExp = extractFloatx80Exp( a );
                   3763:     aSign = extractFloatx80Sign( a );
                   3764:     bSig = extractFloatx80Frac( b );
                   3765:     bExp = extractFloatx80Exp( b );
                   3766:     bSign = extractFloatx80Sign( b );
                             /* The product is negative exactly when the operand signs differ. */
                   3767:     zSign = aSign ^ bSign;
                             /* `a' has the maximum exponent (0x7FFF). */
                   3768:     if ( aExp == 0x7FFF ) {
                                 /* Either operand being a NaN propagates a NaN. */
                   3769:         if (    (bits64) ( aSig<<1 )
                   3770:              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
                   3771:             return propagateFloatx80NaN( a, b );
                   3772:         }
                                 /* Exponent-0x7FFF `a' times a zero `b' is an invalid operation. */
                   3773:         if ( ( bExp | bSig ) == 0 ) goto invalid;
                   3774:         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
                   3775:     }
                             /* `b' has the maximum exponent; `a' does not. */
                   3776:     if ( bExp == 0x7FFF ) {
                   3777:         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
                   3778:         if ( ( aExp | aSig ) == 0 ) {
                                     /* Shared exit: raise invalid and return the default NaN. */
                   3779:  invalid:
                   3780:             float_raise( float_flag_invalid );
                   3781:             z.low = floatx80_default_nan_low;
                   3782:             z.high = floatx80_default_nan_high;
                   3783:             return z;
                   3784:         }
                   3785:         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
                   3786:     }
                             /*
                              * Zero exponent: a zero significand gives a signed-zero product;
                              * otherwise the operand is normalized before multiplying.
                              */
                   3787:     if ( aExp == 0 ) {
                   3788:         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
                   3789:         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
                   3790:     }
                   3791:     if ( bExp == 0 ) {
                   3792:         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
                   3793:         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
                   3794:     }
                             /* Exponents add; 0x3FFE is subtracted (presumably the bias, less one). */
                   3795:     zExp = aExp + bExp - 0x3FFE;
                             /* Full 128-bit product of the two 64-bit significands. */
                   3796:     mul64To128( aSig, bSig, &zSig0, &zSig1 );
                             /*
                              * If the top bit of the product is clear, shift left one bit and
                              * lower the exponent to compensate.
                              */
                   3797:     if ( 0 < (sbits64) zSig0 ) {
                   3798:         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
                   3799:         --zExp;
                   3800:     }
                   3801:     return
                   3802:         roundAndPackFloatx80(
                   3803:             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
                   3804:
                   3805: }
                   3806:
                   3807: /*
                   3808: -------------------------------------------------------------------------------
                   3809: Returns the result of dividing the extended double-precision floating-point
                   3810: value `a' by the corresponding value `b'.  The operation is performed
                   3811: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
                         Special operands are dispatched before the general path:  NaN operands
                         are handed to propagateFloatx80NaN(); infinity/infinity and 0/0 raise
                         float_flag_invalid and return the default NaN; a finite nonzero `a'
                         divided by zero raises float_flag_divbyzero and returns an infinity;
                         infinity/finite returns an infinity and finite/infinity returns a zero.
                         The sign of every non-NaN result is the XOR of the operand signs.
                   3812: -------------------------------------------------------------------------------
                   3813: */
                   3814: floatx80 floatx80_div( floatx80 a, floatx80 b )
                   3815: {
                   3816:     flag aSign, bSign, zSign;
                   3817:     int32 aExp, bExp, zExp;
                   3818:     bits64 aSig, bSig, zSig0, zSig1;
                   3819:     bits64 rem0, rem1, rem2, term0, term1, term2;
                   3820:     floatx80 z;
                   3821:
                             /* Unpack both operands into significand, exponent and sign. */
                   3822:     aSig = extractFloatx80Frac( a );
                   3823:     aExp = extractFloatx80Exp( a );
                   3824:     aSign = extractFloatx80Sign( a );
                   3825:     bSig = extractFloatx80Frac( b );
                   3826:     bExp = extractFloatx80Exp( b );
                   3827:     bSign = extractFloatx80Sign( b );
                   3828:     zSign = aSign ^ bSign;
                             /* `a' has the maximum exponent:  it is a NaN when any of the low
                                63 significand bits is set (the shift discards bit 63),
                                otherwise an infinity. */
                   3829:     if ( aExp == 0x7FFF ) {
                   3830:         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
                   3831:         if ( bExp == 0x7FFF ) {
                   3832:             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
                                     /* infinity / infinity */
                   3833:             goto invalid;
                   3834:         }
                                 /* infinity / finite (zero included; no flag is raised here) */
                   3835:         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
                   3836:     }
                             /* `a' is finite from here on; `b' is a NaN or an infinity. */
                   3837:     if ( bExp == 0x7FFF ) {
                   3838:         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
                   3839:         return packFloatx80( zSign, 0, 0 );
                   3840:     }
                             /* `b' has a zero exponent:  either a true zero or a value that
                                must be normalized before dividing. */
                   3841:     if ( bExp == 0 ) {
                   3842:         if ( bSig == 0 ) {
                                     /* 0 / 0 is invalid; the label is shared with the
                                        infinity / infinity case above. */
                   3843:             if ( ( aExp | aSig ) == 0 ) {
                   3844:  invalid:
                   3845:                 float_raise( float_flag_invalid );
                   3846:                 z.low = floatx80_default_nan_low;
                   3847:                 z.high = floatx80_default_nan_high;
                   3848:                 return z;
                   3849:             }
                                     /* finite nonzero / 0 */
                   3850:             float_raise( float_flag_divbyzero );
                   3851:             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
                   3852:         }
                   3853:         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
                   3854:     }
                   3855:     if ( aExp == 0 ) {
                                 /* 0 / nonzero finite */
                   3856:         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
                   3857:         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
                   3858:     }
                   3859:     zExp = aExp - bExp + 0x3FFE;
                   3860:     rem1 = 0;
                             /* When the dividend significand is not smaller than the divisor's,
                                shift128Right() is applied with a count of 1 to the pair
                                (aSig, 0), the low word landing in rem1, and the exponent is
                                incremented to compensate.  Presumably this keeps the quotient
                                within 64 bits -- the helper is defined elsewhere. */
                   3861:     if ( bSig <= aSig ) {
                   3862:         shift128Right( aSig, 0, 1, &aSig, &rem1 );
                   3863:         ++zExp;
                   3864:     }
                             /* First quotient word:  take the estimate, form the remainder
                                (aSig, rem1) - bSig * zSig0, and while that remainder is
                                negative step the estimate down, adding bSig back each time. */
                   3865:     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
                   3866:     mul64To128( bSig, zSig0, &term0, &term1 );
                   3867:     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
                   3868:     while ( (sbits64) rem0 < 0 ) {
                   3869:         --zSig0;
                   3870:         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
                   3871:     }
                             /* Second quotient word, estimated from the remaining rem1.  The
                                exact remainder is only computed when the estimate, ignoring
                                its top bit, is at most 4 (( zSig1<<1 ) <= 8); in that case
                                the estimate is corrected like zSig0 and bit 0 is ORed with a
                                "remainder is nonzero" indicator. */
                   3872:     zSig1 = estimateDiv128To64( rem1, 0, bSig );
                   3873:     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
                   3874:         mul64To128( bSig, zSig1, &term1, &term2 );
                   3875:         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
                   3876:         while ( (sbits64) rem1 < 0 ) {
                   3877:             --zSig1;
                   3878:             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
                   3879:         }
                   3880:         zSig1 |= ( ( rem1 | rem2 ) != 0 );
                   3881:     }
                             /* Rounding and packing use the global floatx80_rounding_precision. */
                   3882:     return
                   3883:         roundAndPackFloatx80(
                   3884:             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
                   3885:
                   3886: }
                   3887:
                   3888: /*
                   3889: -------------------------------------------------------------------------------
                   3890: Returns the remainder of the extended double-precision floating-point value
                   3891: `a' with respect to the corresponding value `b'.  The operation is performed
                   3892: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
                         NaN operands are handed to propagateFloatx80NaN().  An infinite `a' or a
                         zero `b' raises float_flag_invalid and returns the default NaN.  `a' is
                         returned unchanged when `b' is an infinity, when `a' has a zero
                         exponent and no significand bits below bit 63, or when the exponent of
                         `a' is more than one below that of `b'.  The sign of `b' is extracted
                         but never used; the result sign starts as the sign of `a' and is
                         flipped when the alternate remainder is selected at the end.
                   3893: -------------------------------------------------------------------------------
                   3894: */
                   3895: floatx80 floatx80_rem( floatx80 a, floatx80 b )
                   3896: {
                   3897:     flag aSign, bSign, zSign;
                   3898:     int32 aExp, bExp, expDiff;
                   3899:     bits64 aSig0, aSig1, bSig;
                   3900:     bits64 q, term0, term1, alternateASig0, alternateASig1;
                   3901:     floatx80 z;
                   3902:
                   3903:     aSig0 = extractFloatx80Frac( a );
                   3904:     aExp = extractFloatx80Exp( a );
                   3905:     aSign = extractFloatx80Sign( a );
                   3906:     bSig = extractFloatx80Frac( b );
                   3907:     bExp = extractFloatx80Exp( b );
                   3908:     bSign = extractFloatx80Sign( b );
                             /* `a' has the maximum exponent.  Propagate if either operand is a
                                NaN (low 63 significand bits nonzero); otherwise `a' is an
                                infinity, which is invalid. */
                   3909:     if ( aExp == 0x7FFF ) {
                   3910:         if (    (bits64) ( aSig0<<1 )
                   3911:              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
                   3912:             return propagateFloatx80NaN( a, b );
                   3913:         }
                   3914:         goto invalid;
                   3915:     }
                             /* Finite `a' with `b' a NaN or an infinity. */
                   3916:     if ( bExp == 0x7FFF ) {
                   3917:         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
                   3918:         return a;
                   3919:     }
                   3920:     if ( bExp == 0 ) {
                                 /* Zero divisor; the label is shared with the infinite-`a'
                                    case above. */
                   3921:         if ( bSig == 0 ) {
                   3922:  invalid:
                   3923:             float_raise( float_flag_invalid );
                   3924:             z.low = floatx80_default_nan_low;
                   3925:             z.high = floatx80_default_nan_high;
                   3926:             return z;
                   3927:         }
                   3928:         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
                   3929:     }
                   3930:     if ( aExp == 0 ) {
                                 /* Note that this test ignores bit 63 of the significand. */
                   3931:         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
                   3932:         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
                   3933:     }
                             /* Force bit 63 of the divisor significand on. */
                   3934:     bSig |= LIT64( 0x8000000000000000 );
                   3935:     zSign = aSign;
                   3936:     expDiff = aExp - bExp;
                   3937:     aSig1 = 0;
                             /* Exponent of `a' below that of `b':  more than one below returns
                                `a' as is; exactly one below shifts (aSig0, 0) right by one
                                into (aSig0, aSig1) and continues with expDiff == 0. */
                   3938:     if ( expDiff < 0 ) {
                   3939:         if ( expDiff < -1 ) return a;
                   3940:         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
                   3941:         expDiff = 0;
                   3942:     }
                             /* First quotient bit:  subtract bSig once if it fits. */
                   3943:     q = ( bSig <= aSig0 );
                   3944:     if ( q ) aSig0 -= bSig;
                             /* Main reduction loop.  Each pass takes the quotient estimate
                                lowered by 2 (clamped at 0), subtracts q * bSig from the
                                running remainder, shifts the remainder left by 62 bits and
                                consumes 62 from expDiff.  Presumably the "- 2" guards against
                                an overestimate from estimateDiv128To64() -- confirm against
                                that helper, which is defined elsewhere. */
                   3945:     expDiff -= 64;
                   3946:     while ( 0 < expDiff ) {
                   3947:         q = estimateDiv128To64( aSig0, aSig1, bSig );
                   3948:         q = ( 2 < q ) ? q - 2 : 0;
                   3949:         mul64To128( bSig, q, &term0, &term1 );
                   3950:         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
                   3951:         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
                   3952:         expDiff -= 62;
                   3953:     }
                   3954:     expDiff += 64;
                             /* Final partial step for the remaining expDiff bits:  only the
                                top expDiff bits of the lowered estimate are kept, then the
                                shifted divisor (term0, term1) is subtracted repeatedly while
                                it still fits, incrementing q each time. */
                   3955:     if ( 0 < expDiff ) {
                   3956:         q = estimateDiv128To64( aSig0, aSig1, bSig );
                   3957:         q = ( 2 < q ) ? q - 2 : 0;
                   3958:         q >>= 64 - expDiff;
                   3959:         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
                   3960:         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
                   3961:         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
                   3962:         while ( le128( term0, term1, aSig0, aSig1 ) ) {
                   3963:             ++q;
                   3964:             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
                   3965:         }
                   3966:     }
                   3967:     else {
                                 /* No partial step:  the divisor itself is the term used for
                                    the alternate remainder below. */
                   3968:         term1 = 0;
                   3969:         term0 = bSig;
                   3970:     }
                             /* The alternate remainder is (term - remainder).  It replaces the
                                remainder, and the result sign is flipped, when it is strictly
                                smaller, or when the two are equal and q is odd. */
                   3971:     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
                   3972:     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
                   3973:          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
                   3974:               && ( q & 1 ) )
                   3975:        ) {
                   3976:         aSig0 = alternateASig0;
                   3977:         aSig1 = alternateASig1;
                   3978:         zSign = ! zSign;
                   3979:     }
                             /* The result is packed with a fixed precision argument of 80,
                                not with floatx80_rounding_precision. */
                   3980:     return
                   3981:         normalizeRoundAndPackFloatx80(
                   3982:             80, zSign, bExp + expDiff, aSig0, aSig1 );
                   3983:
                   3984: }
                   3985:
                   3986: /*
                   3987: -------------------------------------------------------------------------------
                   3988: Returns the square root of the extended double-precision floating-point
                   3989: value `a'.  The operation is performed according to the IEC/IEEE Standard
                   3990: for Binary Floating-Point Arithmetic.
                         A NaN operand is handed to propagateFloatx80NaN(); positive infinity and
                         a negative value whose exponent and significand are both zero are
                         returned unchanged; a positive zero returns packFloatx80( 0, 0, 0 );
                         any other negative operand (negative infinity included) raises
                         float_flag_invalid and returns the default NaN.
                   3991: -------------------------------------------------------------------------------
                   3992: */
                   3993: floatx80 floatx80_sqrt( floatx80 a )
                   3994: {
                   3995:     flag aSign;
                   3996:     int32 aExp, zExp;
                   3997:     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
                   3998:     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
                   3999:     floatx80 z;
                   4000:
                   4001:     aSig0 = extractFloatx80Frac( a );
                   4002:     aExp = extractFloatx80Exp( a );
                   4003:     aSign = extractFloatx80Sign( a );
                             /* Maximum exponent:  NaN when any of the low 63 significand bits
                                is set, otherwise an infinity (valid only when positive). */
                   4004:     if ( aExp == 0x7FFF ) {
                   4005:         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
                   4006:         if ( ! aSign ) return a;
                   4007:         goto invalid;
                   4008:     }
                   4009:     if ( aSign ) {
                                 /* Negative zero is returned as is; every other negative
                                    input is invalid. */
                   4010:         if ( ( aExp | aSig0 ) == 0 ) return a;
                   4011:  invalid:
                   4012:         float_raise( float_flag_invalid );
                   4013:         z.low = floatx80_default_nan_low;
                   4014:         z.high = floatx80_default_nan_high;
                   4015:         return z;
                   4016:     }
                   4017:     if ( aExp == 0 ) {
                   4018:         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
                   4019:         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
                   4020:     }
                             /* Halve the exponent relative to 0x3FFF. */
                   4021:     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
                             /* Initial estimate from the top 32 significand bits, then one
                                refinement that divides the significand, shifted right by 2
                                or by 3 when aExp is odd, by the estimate. */
                   4022:     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
                   4023:     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
                   4024:     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
                   4025:     doubleZSig0 = zSig0<<1;
                             /* Remainder = (aSig0, aSig1) - zSig0 * zSig0.  While it is
                                negative, step zSig0 down by one (doubleZSig0 by two) and add
                                the corresponding correction term back. */
                   4026:     mul64To128( zSig0, zSig0, &term0, &term1 );
                   4027:     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
                   4028:     while ( (sbits64) rem0 < 0 ) {
                   4029:         --zSig0;
                   4030:         doubleZSig0 -= 2;
                   4031:         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
                   4032:     }
                             /* Low result word, estimated from the remainder.  Only when the
                                estimate with its top two bits masked off is at most 5 is the
                                exact 192-bit remainder formed, the estimate corrected, and
                                bit 0 ORed with a "remainder is nonzero" indicator. */
                   4033:     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
                   4034:     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
                   4035:         if ( zSig1 == 0 ) zSig1 = 1;
                   4036:         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
                   4037:         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
                   4038:         mul64To128( zSig1, zSig1, &term2, &term3 );
                   4039:         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
                   4040:         while ( (sbits64) rem1 < 0 ) {
                   4041:             --zSig1;
                   4042:             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
                   4043:             term3 |= 1;
                   4044:             term2 |= doubleZSig0;
                   4045:             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
                   4046:         }
                   4047:         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
                   4048:     }
                             /* Assemble the 128-bit significand:  (0, zSig1) is shifted left
                                by one and doubleZSig0 is ORed into the high word.  The result
                                is always packed with sign 0. */
                   4049:     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
                   4050:     zSig0 |= doubleZSig0;
                   4051:     return
                   4052:         roundAndPackFloatx80(
                   4053:             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
                   4054:
                   4055: }
                   4056:
                   4057: /*
                   4058: -------------------------------------------------------------------------------
                   4059: Returns 1 if the extended double-precision floating-point value `a' is
                   4060: equal to the corresponding value `b', and 0 otherwise.  The comparison is
                   4061: performed according to the IEC/IEEE Standard for Binary Floating-Point
                   4062: Arithmetic.
                         If either operand is a NaN the result is 0, and float_flag_invalid is
                         raised only when floatx80_is_signaling_nan() is true for `a' or `b'.
                   4063: -------------------------------------------------------------------------------
                   4064: */
                   4065: flag floatx80_eq( floatx80 a, floatx80 b )
                   4066: {
                   4067:
                             /* NaN test for each operand:  maximum exponent together with a
                                nonzero value in the low 63 significand bits. */
                   4068:     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
                   4069:               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
                   4070:          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
                   4071:               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
                   4072:        ) {
                   4073:         if (    floatx80_is_signaling_nan( a )
                   4074:              || floatx80_is_signaling_nan( b ) ) {
                   4075:             float_raise( float_flag_invalid );
                   4076:         }
                   4077:         return 0;
                   4078:     }
                             /* Equal when both words match, or when both low words are zero
                                and the high words differ at most in the bit dropped by the
                                bits16 cast after the shift (bit 15, assuming bits16 is a
                                16-bit type) -- i.e. zeros of opposite sign compare equal. */
                   4079:     return
                   4080:            ( a.low == b.low )
                   4081:         && (    ( a.high == b.high )
                   4082:              || (    ( a.low == 0 )
                   4083:                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
                   4084:            );
                   4085:
                   4086: }
                   4087:
                   4088: /*
                   4089: -------------------------------------------------------------------------------
                   4090: Returns 1 if the extended double-precision floating-point value `a' is
                   4091: less than or equal to the corresponding value `b', and 0 otherwise.  The
                   4092: comparison is performed according to the IEC/IEEE Standard for Binary
                   4093: Floating-Point Arithmetic.
                         If either operand is a NaN, float_flag_invalid is raised unconditionally
                         and 0 is returned.
                   4094: -------------------------------------------------------------------------------
                   4095: */
                   4096: flag floatx80_le( floatx80 a, floatx80 b )
                   4097: {
                   4098:     flag aSign, bSign;
                   4099:
                             /* NaN test for each operand:  maximum exponent together with a
                                nonzero value in the low 63 significand bits. */
                   4100:     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
                   4101:               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
                   4102:          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
                   4103:               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
                   4104:        ) {
                   4105:         float_raise( float_flag_invalid );
                   4106:         return 0;
                   4107:     }
                   4108:     aSign = extractFloatx80Sign( a );
                   4109:     bSign = extractFloatx80Sign( b );
                             /* Signs differ:  true when `a' is the negative one, or when
                                every bit other than the one dropped by the bits16 cast (the
                                sign bit, assuming bits16 is 16 bits wide) is zero in both
                                operands, i.e. both are zeros. */
                   4110:     if ( aSign != bSign ) {
                   4111:         return
                   4112:                aSign
                   4113:             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
                   4114:                  == 0 );
                   4115:     }
                             /* Same sign:  compare the (high, low) pairs with le128(), with
                                the operands swapped when both are negative. */
                   4116:     return
                   4117:           aSign ? le128( b.high, b.low, a.high, a.low )
                   4118:         : le128( a.high, a.low, b.high, b.low );
                   4119:
                   4120: }
                   4121:
                   4122: /*
                   4123: -------------------------------------------------------------------------------
                   4124: Returns 1 if the extended double-precision floating-point value `a' is
                   4125: less than the corresponding value `b', and 0 otherwise.  The comparison
                   4126: is performed according to the IEC/IEEE Standard for Binary Floating-Point
                   4127: Arithmetic.
                         If either operand is a NaN, float_flag_invalid is raised unconditionally
                         and 0 is returned.
                   4128: -------------------------------------------------------------------------------
                   4129: */
                   4130: flag floatx80_lt( floatx80 a, floatx80 b )
                   4131: {
                   4132:     flag aSign, bSign;
                   4133:
                             /* NaN test for each operand:  maximum exponent together with a
                                nonzero value in the low 63 significand bits. */
                   4134:     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
                   4135:               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
                   4136:          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
                   4137:               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
                   4138:        ) {
                   4139:         float_raise( float_flag_invalid );
                   4140:         return 0;
                   4141:     }
                   4142:     aSign = extractFloatx80Sign( a );
                   4143:     bSign = extractFloatx80Sign( b );
                             /* Signs differ:  true only when `a' is the negative one and the
                                operands are not both zeros (some bit other than the one
                                dropped by the bits16 cast is set in either operand). */
                   4144:     if ( aSign != bSign ) {
                   4145:         return
                   4146:                aSign
                   4147:             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
                   4148:                  != 0 );
                   4149:     }
                             /* Same sign:  compare the (high, low) pairs with lt128(), with
                                the operands swapped when both are negative. */
                   4150:     return
                   4151:           aSign ? lt128( b.high, b.low, a.high, a.low )
                   4152:         : lt128( a.high, a.low, b.high, b.low );
                   4153:
                   4154: }
                   4155:
                   4156: /*
                   4157: -------------------------------------------------------------------------------
                   4158: Returns 1 if the extended double-precision floating-point value `a' is equal
                   4159: to the corresponding value `b', and 0 otherwise.  The invalid exception is
                   4160: raised if either operand is a NaN.  Otherwise, the comparison is performed
                   4161: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
                         Apart from raising float_flag_invalid for every NaN operand rather than
                         only for signaling ones, the body is the same as floatx80_eq().
                   4162: -------------------------------------------------------------------------------
                   4163: */
                   4164: flag floatx80_eq_signaling( floatx80 a, floatx80 b )
                   4165: {
                   4166:
                             /* NaN test for each operand:  maximum exponent together with a
                                nonzero value in the low 63 significand bits. */
                   4167:     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
                   4168:               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
                   4169:          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
                   4170:               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
                   4171:        ) {
                   4172:         float_raise( float_flag_invalid );
                   4173:         return 0;
                   4174:     }
                             /* Equal when both words match, or when both low words are zero
                                and the high words differ at most in the bit dropped by the
                                bits16 cast after the shift (bit 15, assuming bits16 is a
                                16-bit type) -- i.e. zeros of opposite sign compare equal. */
                   4175:     return
                   4176:            ( a.low == b.low )
                   4177:         && (    ( a.high == b.high )
                   4178:              || (    ( a.low == 0 )
                   4179:                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
                   4180:            );
                   4181:
                   4182: }
                   4183:
                   4184: /*
                   4185: -------------------------------------------------------------------------------
                   4186: Returns 1 if the extended double-precision floating-point value `a' is less
                   4187: than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
                   4188: do not cause an exception.  Otherwise, the comparison is performed according
                   4189: to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
                         With a NaN operand the result is 0, and float_flag_invalid is raised
                         only when floatx80_is_signaling_nan() is true for `a' or `b'.
                   4190: -------------------------------------------------------------------------------
                   4191: */
                   4192: flag floatx80_le_quiet( floatx80 a, floatx80 b )
                   4193: {
                   4194:     flag aSign, bSign;
                   4195:
                             /* NaN test for each operand:  maximum exponent together with a
                                nonzero value in the low 63 significand bits. */
                   4196:     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
                   4197:               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
                   4198:          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
                   4199:               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
                   4200:        ) {
                   4201:         if (    floatx80_is_signaling_nan( a )
                   4202:              || floatx80_is_signaling_nan( b ) ) {
                   4203:             float_raise( float_flag_invalid );
                   4204:         }
                   4205:         return 0;
                   4206:     }
                   4207:     aSign = extractFloatx80Sign( a );
                   4208:     bSign = extractFloatx80Sign( b );
                             /* Signs differ:  true when `a' is the negative one, or when
                                every bit other than the one dropped by the bits16 cast (the
                                sign bit, assuming bits16 is 16 bits wide) is zero in both
                                operands, i.e. both are zeros. */
                   4209:     if ( aSign != bSign ) {
                   4210:         return
                   4211:                aSign
                   4212:             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
                   4213:                  == 0 );
                   4214:     }
                             /* Same sign:  compare the (high, low) pairs with le128(), with
                                the operands swapped when both are negative. */
                   4215:     return
                   4216:           aSign ? le128( b.high, b.low, a.high, a.low )
                   4217:         : le128( a.high, a.low, b.high, b.low );
                   4218:
                   4219: }
                   4220:
                   4221: /*
                   4222: -------------------------------------------------------------------------------
                   4223: Returns 1 if the extended double-precision floating-point value `a' is less
                   4224: than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
                   4225: an exception.  Otherwise, the comparison is performed according to the
                   4226: IEC/IEEE Standard for Binary Floating-Point Arithmetic.
                         With a NaN operand the result is 0, and float_flag_invalid is raised
                         only when floatx80_is_signaling_nan() is true for `a' or `b'.
                   4227: -------------------------------------------------------------------------------
                   4228: */
                   4229: flag floatx80_lt_quiet( floatx80 a, floatx80 b )
                   4230: {
                   4231:     flag aSign, bSign;
                   4232:
                             /* NaN test for each operand:  maximum exponent together with a
                                nonzero value in the low 63 significand bits. */
                   4233:     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
                   4234:               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
                   4235:          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
                   4236:               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
                   4237:        ) {
                   4238:         if (    floatx80_is_signaling_nan( a )
                   4239:              || floatx80_is_signaling_nan( b ) ) {
                   4240:             float_raise( float_flag_invalid );
                   4241:         }
                   4242:         return 0;
                   4243:     }
                   4244:     aSign = extractFloatx80Sign( a );
                   4245:     bSign = extractFloatx80Sign( b );
                             /* Signs differ:  true only when `a' is the negative one and the
                                operands are not both zeros (some bit other than the one
                                dropped by the bits16 cast is set in either operand). */
                   4246:     if ( aSign != bSign ) {
                   4247:         return
                   4248:                aSign
                   4249:             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
                   4250:                  != 0 );
                   4251:     }
                             /* Same sign:  compare the (high, low) pairs with lt128(), with
                                the operands swapped when both are negative. */
                   4252:     return
                   4253:           aSign ? lt128( b.high, b.low, a.high, a.low )
                   4254:         : lt128( a.high, a.low, b.high, b.low );
                   4255:
                   4256: }
                   4257:
                   4258: #endif
                   4259:
                   4260: #ifdef FLOAT128
                   4261:
                   4262: /*
                   4263: -------------------------------------------------------------------------------
                   4264: Returns the result of converting the quadruple-precision floating-point
                   4265: value `a' to the 32-bit two's complement integer format.  The conversion
                   4266: is performed according to the IEC/IEEE Standard for Binary Floating-Point
                   4267: Arithmetic---which means in particular that the conversion is rounded
                   4268: according to the current rounding mode.  If `a' is a NaN, the largest
                   4269: positive integer is returned.  Otherwise, if the conversion overflows, the
                   4270: largest integer with the same sign as `a' is returned.
                         This routine only prepares a 64-bit significand; the final rounding and
                         the returned value come from roundAndPackInt32(), which is defined
                         elsewhere in the file.
                   4271: -------------------------------------------------------------------------------
                   4272: */
                   4273: int32 float128_to_int32( float128 a )
                   4274: {
                   4275:     flag aSign;
                   4276:     int32 aExp, shiftCount;
                   4277:     bits64 aSig0, aSig1;
                   4278:
                   4279:     aSig1 = extractFloat128Frac1( a );
                   4280:     aSig0 = extractFloat128Frac0( a );
                   4281:     aExp = extractFloat128Exp( a );
                   4282:     aSign = extractFloat128Sign( a );
                             /* NaN (maximum exponent, nonzero fraction):  drop the sign so
                                that the value is handled as a positive one. */
                   4283:     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
                             /* Nonzero exponent:  set bit 48 of the high fraction word
                                (presumably the implicit leading significand bit). */
                   4284:     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
                             /* Fold the whole low word into bit 0 so that discarded nonzero
                                bits stay visible to the rounding step. */
                   4285:     aSig0 |= ( aSig1 != 0 );
                             /* Shift right (with jamming) only when the count is positive;
                                larger exponents pass the significand through unshifted. */
                   4286:     shiftCount = 0x4028 - aExp;
                   4287:     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
                   4288:     return roundAndPackInt32( aSign, aSig0 );
                   4289:
                   4290: }
                   4291:
                   4292: /*
                   4293: -------------------------------------------------------------------------------
                   4294: Returns the result of converting the quadruple-precision floating-point
                   4295: value `a' to the 32-bit two's complement integer format.  The conversion
                   4296: is performed according to the IEC/IEEE Standard for Binary Floating-Point
                   4297: Arithmetic, except that the conversion is always rounded toward zero.  If
                   4298: `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
                   4299: conversion overflows, the largest integer with the same sign as `a' is
                   4300: returned.
                         Overflow and NaN raise float_flag_invalid; a nonzero input that loses
                         bits in the truncation calls float_set_inexact().
                   4301: -------------------------------------------------------------------------------
                   4302: */
                   4303: int32 float128_to_int32_round_to_zero( float128 a )
                   4304: {
                   4305:     flag aSign;
                   4306:     int32 aExp, shiftCount;
                   4307:     bits64 aSig0, aSig1, savedASig;
                   4308:     int32 z;
                   4309:
                   4310:     aSig1 = extractFloat128Frac1( a );
                   4311:     aSig0 = extractFloat128Frac0( a );
                   4312:     aExp = extractFloat128Exp( a );
                   4313:     aSign = extractFloat128Sign( a );
                             /* Fold the whole low fraction word into bit 0 of the high word. */
                   4314:     aSig0 |= ( aSig1 != 0 );
                             /* Exponent above 0x401E:  always invalid.  A NaN first clears
                                the sign so that the positive limit is returned. */
                   4315:     if ( 0x401E < aExp ) {
                   4316:         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
                   4317:         goto invalid;
                   4318:     }
                             /* Exponent below 0x3FFF:  the result is 0, flagged inexact
                                unless the input itself is zero. */
                   4319:     else if ( aExp < 0x3FFF ) {
                   4320:         if ( aExp || aSig0 ) float_set_inexact();
                   4321:         return 0;
                   4322:     }
                             /* Set bit 48 (presumably the implicit leading significand bit),
                                then truncate by shifting right; savedASig keeps the
                                unshifted value for the inexact test below. */
                   4323:     aSig0 |= LIT64( 0x0001000000000000 );
                   4324:     shiftCount = 0x402F - aExp;
                   4325:     savedASig = aSig0;
                   4326:     aSig0 >>= shiftCount;
                   4327:     z = aSig0;
                             /* NOTE(review): for an input of exactly -2^31, aSig0 is
                                0x80000000 here, so z is the most negative int32 and `- z'
                                relies on wraparound -- confirm this is acceptable for the
                                compilers in use. */
                   4328:     if ( aSign ) z = - z;
                             /* A result whose sign disagrees with the input sign means the
                                magnitude did not fit; the label is shared with the
                                large-exponent case above. */
                   4329:     if ( ( z < 0 ) ^ aSign ) {
                   4330:  invalid:
                   4331:         float_raise( float_flag_invalid );
                   4332:         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
                   4333:     }
                             /* Shifting back does not reproduce the original significand:
                                bits were discarded, so the conversion is inexact. */
                   4334:     if ( ( aSig0<<shiftCount ) != savedASig ) {
                   4335:         float_set_inexact();
                   4336:     }
                   4337:     return z;
                   4338:
                   4339: }
                   4340:
                   4341: /*
                   4342: -------------------------------------------------------------------------------
                   4343: Returns the result of converting the quadruple-precision floating-point
                   4344: value `a' to the 64-bit two's complement integer format.  The conversion
                   4345: is performed according to the IEC/IEEE Standard for Binary Floating-Point
                   4346: Arithmetic---which means in particular that the conversion is rounded
                   4347: according to the current rounding mode.  If `a' is a NaN, the largest
                   4348: positive integer is returned.  Otherwise, if the conversion overflows, the
                   4349: largest integer with the same sign as `a' is returned.
                         Exponents above 0x403E are rejected here with float_flag_invalid; for
                         everything else the significand is aligned and passed to
                         roundAndPackInt64(), which is defined elsewhere in the file.
                   4350: -------------------------------------------------------------------------------
                   4351: */
                   4352: int64 float128_to_int64( float128 a )
                   4353: {
                   4354:     flag aSign;
                   4355:     int32 aExp, shiftCount;
                   4356:     bits64 aSig0, aSig1;
                   4357:
                   4358:     aSig1 = extractFloat128Frac1( a );
                   4359:     aSig0 = extractFloat128Frac0( a );
                   4360:     aExp = extractFloat128Exp( a );
                   4361:     aSign = extractFloat128Sign( a );
                             /* Nonzero exponent:  set bit 48 of the high fraction word
                                (presumably the implicit leading significand bit). */
                   4362:     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
                   4363:     shiftCount = 0x402F - aExp;
                             /* Non-positive count:  the significand has to move left, or the
                                value is out of range altogether. */
                   4364:     if ( shiftCount <= 0 ) {
                   4365:         if ( 0x403E < aExp ) {
                   4366:             float_raise( float_flag_invalid );
                                     /* Return the positive limit for a positive input or for
                                        a NaN (maximum exponent with any fraction bit set:
                                        aSig1 nonzero, or aSig0 holding more than the bit
                                        ORed in above); otherwise the negative limit. */
                   4367:             if (    ! aSign
                   4368:                  || (    ( aExp == 0x7FFF )
                   4369:                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
                   4370:                     )
                   4371:                ) {
                   4372:                 return LIT64( 0x7FFFFFFFFFFFFFFF );
                   4373:             }
                   4374:             return (sbits64) LIT64( 0x8000000000000000 );
                   4375:         }
                   4376:         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
                   4377:     }
                   4378:     else {
                                 /* Positive count:  shift right through
                                    shift64ExtraRightJamming(), with aSig1 receiving the
                                    extra word. */
                   4379:         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
                   4380:     }
                   4381:     return roundAndPackInt64( aSign, aSig0, aSig1 );
                   4382:
                   4383: }
                   4384:
                   4385: /*
                   4386: -------------------------------------------------------------------------------
                   4387: Returns the result of converting the quadruple-precision floating-point
                   4388: value `a' to the 64-bit two's complement integer format.  The conversion
                   4389: is performed according to the IEC/IEEE Standard for Binary Floating-Point
                   4390: Arithmetic, except that the conversion is always rounded toward zero.
                   4391: If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
                   4392: the conversion overflows, the largest integer with the same sign as `a' is
                   4393: returned.
                         
                         The inexact exception is signalled whenever nonzero fraction bits are
                         discarded; the invalid exception is raised for NaNs and on overflow.
                   4394: -------------------------------------------------------------------------------
                   4395: */
                   4396: int64 float128_to_int64_round_to_zero( float128 a )
                   4397: {
                   4398:     flag aSign;
                   4399:     int32 aExp, shiftCount;
                   4400:     bits64 aSig0, aSig1;
                   4401:     int64 z;
                   4402:
                   4403:     aSig1 = extractFloat128Frac1( a );
                   4404:     aSig0 = extractFloat128Frac0( a );
                   4405:     aExp = extractFloat128Exp( a );
                   4406:     aSign = extractFloat128Sign( a );
                             /* Make the implied leading bit explicit (bit 48 of aSig0). */
                   4407:     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
                             /* Here a positive shiftCount means a LEFT shift of the significand. */
                   4408:     shiftCount = aExp - 0x402F;
                   4409:     if ( 0 < shiftCount ) {
                   4410:         if ( 0x403E <= aExp ) {
                                     /*
                                      * Drop the explicit leading bit again so that the NaN test
                                      * below ( aSig0 | aSig1 ) looks at the fraction only.
                                      */
                   4411:             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
                                     /*
                                      * a.high == 0xC03E000000000000 means: sign set, exponent
                                      * 0x403E, upper fraction word zero.  With aSig1 below 2^49
                                      * the bits that would land in the integer part are zero, so
                                      * the truncated value is exactly the most negative int64;
                                      * that is representable and only (possibly) inexact.
                                      */
                   4412:             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
                   4413:                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
                   4414:                 if ( aSig1 ) float_set_inexact();
                   4415:             }
                   4416:             else {
                   4417:                 float_raise( float_flag_invalid );
                                         /* Positive overflow and NaNs give the largest integer. */
                   4418:                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
                   4419:                     return LIT64( 0x7FFFFFFFFFFFFFFF );
                   4420:                 }
                   4421:             }
                                     /* Reached both for the exact case and for negative overflow. */
                   4422:             return (sbits64) LIT64( 0x8000000000000000 );
                   4423:         }
                                 /*
                                  * shiftCount is 1..14 here.  The integer is aSig0 moved up plus
                                  * the top shiftCount bits of aSig1; ( - shiftCount ) & 63 equals
                                  * 64 - shiftCount.
                                  */
                   4424:         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
                                 /* Whatever remains of aSig1 after the shift is discarded fraction. */
                   4425:         if ( (bits64) ( aSig1<<shiftCount ) ) {
                   4426:             float_set_inexact();
                   4427:         }
                   4428:     }
                   4429:     else {
                   4430:         if ( aExp < 0x3FFF ) {
                                     /* Result truncates to 0; inexact unless `a' is a zero. */
                   4431:             if ( aExp | aSig0 | aSig1 ) {
                   4432:                 float_set_inexact();
                   4433:             }
                   4434:             return 0;
                   4435:         }
                                 /* shiftCount is -48..0 here: the integer lies wholly in aSig0. */
                   4436:         z = aSig0>>( - shiftCount );
                                 /*
                                  * Inexact if any bit of aSig1 is set or if bits were shifted out
                                  * of aSig0.  For negative shiftCount, shiftCount & 63 equals
                                  * 64 - |shiftCount|, so the left shift keeps exactly the bits
                                  * that the right shift above dropped.  The `shiftCount &&' guard
                                  * is needed because a shift of 0 would test all of aSig0.
                                  */
                   4437:         if (    aSig1
                   4438:              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
                   4439:             float_set_inexact();
                   4440:         }
                   4441:     }
                   4442:     if ( aSign ) z = - z;
                   4443:     return z;
                   4444:
                   4445: }
                   4446:
                   4447: /*
                   4448: -------------------------------------------------------------------------------
                   4449: Narrows the quadruple-precision floating-point value `a' to the
                   4450: single-precision floating-point format and returns the result.  The
                   4451: conversion is performed according to the IEC/IEEE Standard for Binary
                   4452: Floating-Point Arithmetic.
                   4453: -------------------------------------------------------------------------------
                   4454: */
                   4455: float32 float128_to_float32( float128 a )
                   4456: {
                   4457:     flag sign;
                   4458:     int32 expo;
                   4459:     bits64 sigHi, sigLo;
                   4460:     bits32 sig32;
                   4461:
                   4462:     sigLo = extractFloat128Frac1( a );
                   4463:     sigHi = extractFloat128Frac0( a );
                   4464:     expo = extractFloat128Exp( a );
                   4465:     sign = extractFloat128Sign( a );
                   4466:     if ( expo == 0x7FFF ) {
                   4467:         /* Infinity keeps its sign; a NaN goes through the common-NaN form. */
                   4468:         if ( ( sigHi | sigLo ) == 0 ) return packFloat32( sign, 0xFF, 0 );
                   4469:         return commonNaNToFloat32( float128ToCommonNaN( a ) );
                   4470:     }
                   4471:     /* Fold the low word into a sticky bit, then shift out 18 more bits. */
                   4472:     if ( sigLo != 0 ) sigHi |= 1;
                   4473:     shift64RightJamming( sigHi, 18, &sigHi );
                   4474:     sig32 = sigHi;
                   4475:     if ( ( expo != 0 ) || ( sig32 != 0 ) ) {
                   4476:         expo -= 0x3F81;
                   4477:         sig32 |= 0x40000000;
                   4478:     }
                   4479:     return roundAndPackFloat32( sign, expo, sig32 );
                   4480:
                   4481: }
                   4482:
                   4483: /*
                   4484: -------------------------------------------------------------------------------
                   4485: Narrows the quadruple-precision floating-point value `a' to the
                   4486: double-precision floating-point format and returns the result.  The
                   4487: conversion is performed according to the IEC/IEEE Standard for Binary
                   4488: Floating-Point Arithmetic.
                   4489: -------------------------------------------------------------------------------
                   4490: */
                   4491: float64 float128_to_float64( float128 a )
                   4492: {
                   4493:     flag sign;
                   4494:     int32 expo;
                   4495:     bits64 sigHi, sigLo;
                   4496:
                   4497:     sigLo = extractFloat128Frac1( a );
                   4498:     sigHi = extractFloat128Frac0( a );
                   4499:     expo = extractFloat128Exp( a );
                   4500:     sign = extractFloat128Sign( a );
                   4501:     if ( expo == 0x7FFF ) {
                   4502:         /* Infinity keeps its sign; a NaN goes through the common-NaN form. */
                   4503:         if ( ( sigHi | sigLo ) == 0 ) return packFloat64( sign, 0x7FF, 0 );
                   4504:         return commonNaNToFloat64( float128ToCommonNaN( a ) );
                   4505:     }
                   4506:     /* Move the fraction up 14 bits; leftover low-word bits become sticky. */
                   4507:     shortShift128Left( sigHi, sigLo, 14, &sigHi, &sigLo );
                   4508:     if ( sigLo != 0 ) sigHi |= 1;
                   4509:     if ( ( expo != 0 ) || ( sigHi != 0 ) ) {
                   4510:         expo -= 0x3C01;
                   4511:         sigHi |= LIT64( 0x4000000000000000 );
                   4512:     }
                   4513:     return roundAndPackFloat64( sign, expo, sigHi );
                   4514:
                   4515: }
                   4516:
                   4517: #ifdef FLOATX80
                   4518:
                   4519: /*
                   4520: -------------------------------------------------------------------------------
                   4521: Converts the quadruple-precision floating-point value `a' to the extended
                   4522: double-precision floating-point format and returns the result.  The
                   4523: conversion is performed according to the IEC/IEEE Standard for Binary
                   4524: Floating-Point Arithmetic.
                   4525: -------------------------------------------------------------------------------
                   4526: */
                   4527: floatx80 float128_to_floatx80( float128 a )
                   4528: {
                   4529:     flag sign;
                   4530:     int32 expo;
                   4531:     bits64 sigHi, sigLo;
                   4532:
                   4533:     sigLo = extractFloat128Frac1( a );
                   4534:     sigHi = extractFloat128Frac0( a );
                   4535:     expo = extractFloat128Exp( a );
                   4536:     sign = extractFloat128Sign( a );
                   4537:     if ( expo == 0x7FFF ) {
                   4538:         if ( ( sigHi | sigLo ) == 0 ) {  /* infinity */
                   4539:             return packFloatx80( sign, 0x7FFF, LIT64( 0x8000000000000000 ) );
                   4540:         }
                   4541:         return commonNaNToFloatx80( float128ToCommonNaN( a ) );
                   4542:     }
                   4543:     if ( expo != 0 ) {  /* make the leading significand bit explicit */
                   4544:         sigHi |= LIT64( 0x0001000000000000 );
                   4545:     }
                   4546:     else {  /* zero or subnormal */
                   4547:         if ( ( sigHi | sigLo ) == 0 ) return packFloatx80( sign, 0, 0 );
                   4548:         normalizeFloat128Subnormal( sigHi, sigLo, &expo, &sigHi, &sigLo );
                   4549:     }
                   4550:     shortShift128Left( sigHi, sigLo, 15, &sigHi, &sigLo );
                   4551:     return roundAndPackFloatx80( 80, sign, expo, sigHi, sigLo );
                   4552:
                   4553: }
                   4554:
                   4555: #endif
                   4556:
                   4557: /*
                   4558: -------------------------------------------------------------------------------
                   4559: Rounds the quadruple-precision floating-point value `a' to an integer, and
                   4560: returns the result as a quadruple-precision floating-point value.  The
                   4561: operation is performed according to the IEC/IEEE Standard for Binary
                   4562: Floating-Point Arithmetic.
                         
                         The rounding direction is taken from `float_rounding_mode()'.  The inexact
                         exception is signalled whenever the returned value differs from `a'.
                         Three ranges of the biased exponent are handled separately: 0x402F and
                         above (rounding position in the low word), below 0x3FFF (magnitude less
                         than one), and the range in between (rounding position in the high word).
                   4563: -------------------------------------------------------------------------------
                   4564: */
                   4565: float128 float128_round_to_int( float128 a )
                   4566: {
                   4567:     flag aSign;
                   4568:     int32 aExp;
                   4569:     bits64 lastBitMask, roundBitsMask;
                   4570:     int8 roundingMode;
                   4571:     float128 z;
                   4572:
                   4573:     aExp = extractFloat128Exp( a );
                   4574:     if ( 0x402F <= aExp ) {
                                 /* Any fractional bits are confined to a.low on this path. */
                   4575:         if ( 0x406F <= aExp ) {
                                     /* No fractional bits at all, or infinity/NaN. */
                   4576:             if (    ( aExp == 0x7FFF )
                   4577:                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
                   4578:                ) {
                   4579:                 return propagateFloat128NaN( a, a );
                   4580:             }
                   4581:             return a;
                   4582:         }
                                 /*
                                  * lastBitMask marks the units bit inside z.low.  The shift is
                                  * done in two steps so that for aExp == 0x402F the total shift
                                  * of 64 yields 0 instead of an out-of-range shift; 0 then means
                                  * "the units bit is bit 0 of z.high".  roundBitsMask covers all
                                  * bits below the units bit (all ones when lastBitMask is 0).
                                  */
                   4583:         lastBitMask = 1;
                   4584:         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
                   4585:         roundBitsMask = lastBitMask - 1;
                   4586:         z = a;
                   4587:         roundingMode = float_rounding_mode();
                   4588:         if ( roundingMode == float_round_nearest_even ) {
                   4589:             if ( lastBitMask ) {
                                         /*
                                          * Add half a unit; if the round bits are then all zero
                                          * the input was an exact tie, so clear the units bit to
                                          * land on the even neighbour.
                                          */
                   4590:                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                   4591:                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
                   4592:             }
                   4593:             else {
                                         /*
                                          * The top bit of z.low is the half bit.  If set, round
                                          * the magnitude up; if the rest of z.low is zero it was
                                          * a tie, so clear bit 0 of z.high to make it even.
                                          */
                   4594:                 if ( (sbits64) z.low < 0 ) {
                   4595:                     ++z.high;
                   4596:                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
                   4597:                 }
                   4598:             }
                   4599:         }
                   4600:         else if ( roundingMode != float_round_to_zero ) {
                                     /*
                                      * True for a negative value unless the mode is round-up, and
                                      * for a positive value when the mode is round-up; in those
                                      * cases the magnitude is bumped past the next integer unless
                                      * the round bits are already zero.
                                      * NOTE(review): assumes round-down is the only other mode
                                      * that can reach this branch -- confirm.
                                      */
                   4601:             if (   extractFloat128Sign( z )
                   4602:                  ^ ( roundingMode == float_round_up ) ) {
                   4603:                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
                   4604:             }
                   4605:         }
                                 /* Discard the bits below the units position. */
                   4606:         z.low &= ~ roundBitsMask;
                   4607:     }
                   4608:     else {
                   4609:         if ( aExp < 0x3FFF ) {
                                     /*
                                      * Magnitude below one.  A zero of either sign is returned
                                      * unchanged (the shift drops the sign bit from the test).
                                      */
                   4610:             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
                   4611:             float_set_inexact();
                   4612:             aSign = extractFloat128Sign( a );
                   4613:             switch ( float_rounding_mode() ) {
                   4614:              case float_round_nearest_even:
                                         /*
                                          * Exponent 0x3FFE with a nonzero fraction gives a signed
                                          * one; exponent 0x3FFE with a zero fraction, or any
                                          * smaller exponent, falls out to the signed zero below.
                                          */
                   4615:                 if (    ( aExp == 0x3FFE )
                   4616:                      && (   extractFloat128Frac0( a )
                   4617:                           | extractFloat128Frac1( a ) )
                   4618:                    ) {
                   4619:                     return packFloat128( aSign, 0x3FFF, 0, 0 );
                   4620:                 }
                   4621:                 break;
                   4622:              case float_round_down:
                                         /* Negative input gives -1, positive input gives +0. */
                   4623:                 return
                   4624:                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                   4625:                     : packFloat128( 0, 0, 0, 0 );
                   4626:              case float_round_up:
                                         /* Negative input gives -0, positive input gives +1. */
                   4627:                 return
                   4628:                       aSign ? packFloat128( 1, 0, 0, 0 )
                   4629:                     : packFloat128( 0, 0x3FFF, 0, 0 );
                   4630:             }
                                     /* Remaining cases return a zero carrying the sign of `a'. */
                   4631:             return packFloat128( aSign, 0, 0, 0 );
                   4632:         }
                                 /*
                                  * The units bit lies inside a.high.  The low word of the result
                                  * is zero, and a.low only serves as a "nonzero below" indicator.
                                  */
                   4633:         lastBitMask = 1;
                   4634:         lastBitMask <<= 0x402F - aExp;
                   4635:         roundBitsMask = lastBitMask - 1;
                   4636:         z.low = 0;
                   4637:         z.high = a.high;
                   4638:         roundingMode = float_rounding_mode();
                   4639:         if ( roundingMode == float_round_nearest_even ) {
                                     /* Add half a unit; an exact tie also requires a.low == 0. */
                   4640:             z.high += lastBitMask>>1;
                   4641:             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                   4642:                 z.high &= ~ lastBitMask;
                   4643:             }
                   4644:         }
                   4645:         else if ( roundingMode != float_round_to_zero ) {
                   4646:             if (   extractFloat128Sign( z )
                   4647:                  ^ ( roundingMode == float_round_up ) ) {
                                         /*
                                          * Fold "a.low is nonzero" into bit 0 so that adding
                                          * roundBitsMask carries into the units bit whenever any
                                          * bit below it, in either word, is set.
                                          */
                   4648:                 z.high |= ( a.low != 0 );
                   4649:                 z.high += roundBitsMask;
                   4650:             }
                   4651:         }
                   4652:         z.high &= ~ roundBitsMask;
                   4653:     }
                             /* Any change relative to the input means the rounding was inexact. */
                   4654:     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
                   4655:         float_set_inexact();
                   4656:     }
                   4657:     return z;
                   4658:
                   4659: }
                   4660:
                   4661: /*
                   4662: -------------------------------------------------------------------------------
                   4663: Returns the result of adding the absolute values of the quadruple-precision
                   4664: floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
                   4665: before being returned.  `zSign' is ignored if the result is a NaN.
                   4666: The addition is performed according to the IEC/IEEE Standard for Binary
                   4667: Floating-Point Arithmetic.
                         
                         The operand with the smaller exponent is shifted right to align it with
                         the other one; bits shifted out are collected in `zSig2' and passed on to
                         `roundAndPackFloat128' together with the 128-bit sum.
                   4668: -------------------------------------------------------------------------------
                   4669: */
                   4670: static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
                   4671: {
                   4672:     int32 aExp, bExp, zExp;
                   4673:     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
                   4674:     int32 expDiff;
                   4675:
                   4676:     aSig1 = extractFloat128Frac1( a );
                   4677:     aSig0 = extractFloat128Frac0( a );
                   4678:     aExp = extractFloat128Exp( a );
                   4679:     bSig1 = extractFloat128Frac1( b );
                   4680:     bSig0 = extractFloat128Frac0( b );
                   4681:     bExp = extractFloat128Exp( b );
                   4682:     expDiff = aExp - bExp;
                   4683:     if ( 0 < expDiff ) {
                                 /* `a' has the larger exponent. */
                   4684:         if ( aExp == 0x7FFF ) {
                   4685:             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
                   4686:             return a;
                   4687:         }
                                 /*
                                  * A zero exponent field (zero/subnormal) has no implied leading
                                  * bit; the alignment distance is reduced by one instead.
                                  */
                   4688:         if ( bExp == 0 ) {
                   4689:             --expDiff;
                   4690:         }
                   4691:         else {
                   4692:             bSig0 |= LIT64( 0x0001000000000000 );
                   4693:         }
                   4694:         shift128ExtraRightJamming(
                   4695:             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
                   4696:         zExp = aExp;
                   4697:     }
                   4698:     else if ( expDiff < 0 ) {
                                 /* `b' has the larger exponent; mirror image of the case above. */
                   4699:         if ( bExp == 0x7FFF ) {
                   4700:             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
                                     /* `b' is an infinity: the result is infinity signed by zSign. */
                   4701:             return packFloat128( zSign, 0x7FFF, 0, 0 );
                   4702:         }
                   4703:         if ( aExp == 0 ) {
                   4704:             ++expDiff;
                   4705:         }
                   4706:         else {
                   4707:             aSig0 |= LIT64( 0x0001000000000000 );
                   4708:         }
                   4709:         shift128ExtraRightJamming(
                   4710:             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
                   4711:         zExp = bExp;
                   4712:     }
                   4713:     else {
                                 /* Equal exponents: no alignment needed. */
                   4714:         if ( aExp == 0x7FFF ) {
                   4715:             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                   4716:                 return propagateFloat128NaN( a, b );
                   4717:             }
                   4718:             return a;
                   4719:         }
                   4720:         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
                                 /*
                                  * Both exponent fields zero: the fraction sum is packed as is.
                                  * NOTE(review): presumably a carry out of the fraction ends up
                                  * in the exponent field inside packFloat128 -- confirm.
                                  */
                   4721:         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
                   4722:         zSig2 = 0;
                                 /* The two implied leading bits (bit 48 each) add up to bit 49. */
                   4723:         zSig0 |= LIT64( 0x0002000000000000 );
                   4724:         zExp = aExp;
                   4725:         goto shiftRight1;
                   4726:     }
                             /*
                              * Shared tail of the two unequal-exponent cases.  Setting bit 48 of
                              * aSig0 supplies the leading bit of the larger operand: either it is
                              * the implied bit of `a', or `a' was the shifted operand, in which
                              * case bit 48 of aSig0 is clear and the bit stands in for the
                              * leading bit of `b', which was never set in bSig0.
                              */
                   4727:     aSig0 |= LIT64( 0x0001000000000000 );
                   4728:     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
                             /*
                              * No carry into bit 49: round with the exponent reduced by one.
                              * NOTE(review): this matches a roundAndPackFloat128 convention in
                              * which zExp is one less than the exponent of a significand whose
                              * leading bit is at bit 48 -- confirm against its definition.
                              */
                   4729:     --zExp;
                   4730:     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
                   4731:     ++zExp;
                   4732:  shiftRight1:
                             /* Carry into bit 49: shift right by one, keeping lost bits in zSig2. */
                   4733:     shift128ExtraRightJamming(
                   4734:         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
                   4735:  roundAndPack:
                   4736:     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
                   4737:
                   4738: }
                   4739:
                   4740: /*
                   4741: -------------------------------------------------------------------------------
                   4742: Returns the result of subtracting the absolute values of the quadruple-
                   4743: precision floating-point values `a' and `b'.  If `zSign' is 1, the
                   4744: difference is negated before being returned.  `zSign' is ignored if the
                   4745: result is a NaN.  The subtraction is performed according to the IEC/IEEE
                   4746: Standard for Binary Floating-Point Arithmetic.
                         
                         When the magnitudes are equal the result is a zero whose sign bit is set
                         only in the round-down mode.  Subtracting two infinities raises the
                         invalid exception and returns the default NaN.
                   4747: -------------------------------------------------------------------------------
                   4748: */
                   4749: static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
                   4750: {
                   4751:     int32 aExp, bExp, zExp;
                   4752:     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
                   4753:     int32 expDiff;
                   4754:     float128 z;
                   4755:
                   4756:     aSig1 = extractFloat128Frac1( a );
                   4757:     aSig0 = extractFloat128Frac0( a );
                   4758:     aExp = extractFloat128Exp( a );
                   4759:     bSig1 = extractFloat128Frac1( b );
                   4760:     bSig0 = extractFloat128Frac0( b );
                   4761:     bExp = extractFloat128Exp( b );
                   4762:     expDiff = aExp - bExp;
                             /*
                              * Work with both fractions moved up by 14 bits; the implied leading
                              * bit then sits at 0x4000000000000000 in the high word.
                              */
                   4763:     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
                   4764:     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
                   4765:     if ( 0 < expDiff ) goto aExpBigger;
                   4766:     if ( expDiff < 0 ) goto bExpBigger;
                             /* Equal exponents from here down to the `bExpBigger' label. */
                   4767:     if ( aExp == 0x7FFF ) {
                   4768:         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                   4769:             return propagateFloat128NaN( a, b );
                   4770:         }
                                 /* Infinity minus infinity is an invalid operation. */
                   4771:         float_raise( float_flag_invalid );
                   4772:         z.low = float128_default_nan_low;
                   4773:         z.high = float128_default_nan_high;
                   4774:         return z;
                   4775:     }
                             /* Both exponent fields zero: continue with an exponent of 1. */
                   4776:     if ( aExp == 0 ) {
                   4777:         aExp = 1;
                   4778:         bExp = 1;
                   4779:     }
                             /* Compare the 128-bit magnitudes, high word first. */
                   4780:     if ( bSig0 < aSig0 ) goto aBigger;
                   4781:     if ( aSig0 < bSig0 ) goto bBigger;
                   4782:     if ( bSig1 < aSig1 ) goto aBigger;
                   4783:     if ( aSig1 < bSig1 ) goto bBigger;
                             /* Equal magnitudes: zero, with the sign bit set only when rounding down. */
                   4784:     return packFloat128( float_rounding_mode() == float_round_down, 0, 0, 0 );
                   4785:  bExpBigger:
                   4786:     if ( bExp == 0x7FFF ) {
                   4787:         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
                                 /* `b' is an infinity and dominates, so the sign is inverted. */
                   4788:         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
                   4789:     }
                             /*
                              * A zero exponent field has no implied leading bit; shorten the
                              * alignment distance by one instead of setting it.
                              */
                   4790:     if ( aExp == 0 ) {
                   4791:         ++expDiff;
                   4792:     }
                   4793:     else {
                   4794:         aSig0 |= LIT64( 0x4000000000000000 );
                   4795:     }
                   4796:     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
                   4797:     bSig0 |= LIT64( 0x4000000000000000 );
                   4798:  bBigger:
                             /* |b| > |a|: compute b - a and flip the result sign. */
                   4799:     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
                   4800:     zExp = bExp;
                   4801:     zSign ^= 1;
                   4802:     goto normalizeRoundAndPack;
                   4803:  aExpBigger:
                   4804:     if ( aExp == 0x7FFF ) {
                   4805:         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
                   4806:         return a;
                   4807:     }
                   4808:     if ( bExp == 0 ) {
                   4809:         --expDiff;
                   4810:     }
                   4811:     else {
                   4812:         bSig0 |= LIT64( 0x4000000000000000 );
                   4813:     }
                   4814:     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
                   4815:     aSig0 |= LIT64( 0x4000000000000000 );
                   4816:  aBigger:
                             /* |a| > |b|: compute a - b; the sign stays zSign. */
                   4817:     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
                   4818:     zExp = aExp;
                   4819:  normalizeRoundAndPack:
                             /*
                              * NOTE(review): the extra -14 presumably compensates for the 14-bit
                              * left shift applied to both significands at the top -- confirm
                              * against normalizeRoundAndPackFloat128.
                              */
                   4820:     --zExp;
                   4821:     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
                   4822:
                   4823: }
                   4824:
                   4825: /*
                   4826: -------------------------------------------------------------------------------
                   4827: Returns the sum of the quadruple-precision floating-point values `a' and
                   4828: `b', computed according to the IEC/IEEE Standard for Binary Floating-Point
                   4829: Arithmetic.  Like-signed operands are added in magnitude, others subtracted.
                   4830: -------------------------------------------------------------------------------
                   4831: */
                   4832: float128 float128_add( float128 a, float128 b )
                   4833: {
                   4834:     flag signOfA, signOfB;
                   4835:
                   4836:     signOfA = extractFloat128Sign( a );
                   4837:     signOfB = extractFloat128Sign( b );
                   4838:     if ( signOfA != signOfB ) {
                   4839:         /* Opposite signs: the sum is a difference of magnitudes. */
                   4840:         return subFloat128Sigs( a, b, signOfA );
                   4841:     }
                   4842:     /* Like signs: the magnitudes add; the sign of `a' is passed along. */
                   4843:     return addFloat128Sigs( a, b, signOfA );
                   4844:
                   4845: }
                   4846:
                   4847: /*
                   4848: -------------------------------------------------------------------------------
                   4849: Returns the difference `a' - `b' of two quadruple-precision floating-point
                   4850: values, computed according to the IEC/IEEE Standard for Binary Floating-
                   4851: Point Arithmetic.  Like-signed operands subtract in magnitude, others add.
                   4852: -------------------------------------------------------------------------------
                   4853: */
                   4854: float128 float128_sub( float128 a, float128 b )
                   4855: {
                   4856:     flag signOfA, signOfB;
                   4857:
                   4858:     signOfA = extractFloat128Sign( a );
                   4859:     signOfB = extractFloat128Sign( b );
                   4860:     if ( signOfA != signOfB ) {
                   4861:         /* Opposite signs: the difference is a sum of magnitudes. */
                   4862:         return addFloat128Sigs( a, b, signOfA );
                   4863:     }
                   4864:     /* Like signs: subtract magnitudes; the sign of `a' is passed along. */
                   4865:     return subFloat128Sigs( a, b, signOfA );
                   4866:
                   4867: }
                   4868:
                   4869: /*
                   4870: -------------------------------------------------------------------------------
                   4871: Returns the result of multiplying the quadruple-precision floating-point
                   4872: values `a' and `b'.  The operation is performed according to the IEC/IEEE
                   4873: Standard for Binary Floating-Point Arithmetic.
                   4874: -------------------------------------------------------------------------------
                   4875: */
                   4876: float128 float128_mul( float128 a, float128 b )
                   4877: { /* Resolves NaN, infinity and zero operands first; finite operands are multiplied at the end. */
                   4878:     flag aSign, bSign, zSign;
                   4879:     int32 aExp, bExp, zExp;
                   4880:     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
                   4881:     float128 z; /* only written on the `invalid' path, to hold the default NaN */
                   4882:
                   4883:     aSig1 = extractFloat128Frac1( a );
                   4884:     aSig0 = extractFloat128Frac0( a );
                   4885:     aExp = extractFloat128Exp( a );
                   4886:     aSign = extractFloat128Sign( a );
                   4887:     bSig1 = extractFloat128Frac1( b );
                   4888:     bSig0 = extractFloat128Frac0( b );
                   4889:     bExp = extractFloat128Exp( b );
                   4890:     bSign = extractFloat128Sign( b );
                   4891:     zSign = aSign ^ bSign; /* sign used for every non-NaN result below */
                   4892:     if ( aExp == 0x7FFF ) { /* a is a NaN (nonzero fraction) or an infinity (zero fraction) */
                   4893:         if (    ( aSig0 | aSig1 )
                   4894:              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
                   4895:             return propagateFloat128NaN( a, b ); /* at least one operand is a NaN */
                   4896:         }
                   4897:         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; /* infinity times zero */
                   4898:         return packFloat128( zSign, 0x7FFF, 0, 0 ); /* infinity times anything else: signed infinity */
                   4899:     }
                   4900:     if ( bExp == 0x7FFF ) { /* a is finite here; b is a NaN or an infinity */
                   4901:         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
                   4902:         if ( ( aExp | aSig0 | aSig1 ) == 0 ) { /* zero times infinity */
                   4903:  invalid: /* also reached by the goto in the a-is-infinity branch above */
                   4904:             float_raise( float_flag_invalid );
                   4905:             z.low = float128_default_nan_low;
                   4906:             z.high = float128_default_nan_high;
                   4907:             return z;
                   4908:         }
                   4909:         return packFloat128( zSign, 0x7FFF, 0, 0 );
                   4910:     }
                   4911:     if ( aExp == 0 ) { /* a is zero or subnormal */
                   4912:         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); /* signed zero */
                   4913:         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); /* rewrites aExp and the significand in place */
                   4914:     }
                   4915:     if ( bExp == 0 ) { /* b is zero or subnormal */
                   4916:         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
                   4917:         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
                   4918:     }
                   4919:     zExp = aExp + bExp - 0x4000; /* both exponents are biased, so the sum must be re-biased; adjusted by ++zExp below if needed */
                   4920:     aSig0 |= LIT64( 0x0001000000000000 ); /* set bit 48 of a's high word -- presumably the implicit leading 1; confirm against extractFloat128Frac0 */
                   4921:     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); /* b's bit 48 is NOT set: after this shift it would fall outside the 64-bit word */
                   4922:     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
                   4923:     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); /* adds a's significand once: the contribution of b's omitted leading bit */
                   4924:     zSig2 |= ( zSig3 != 0 ); /* fold the lowest product word into a single sticky bit */
                   4925:     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { /* product reached bit 49: shift back by one and compensate in the exponent */
                   4926:         shift128ExtraRightJamming(
                   4927:             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
                   4928:         ++zExp;
                   4929:     }
                   4930:     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 ); /* zSig2 carries the bits below the result significand */
                   4931:
                   4932: }
                   4933:
                   4934: /*
                   4935: -------------------------------------------------------------------------------
                   4936: Returns the result of dividing the quadruple-precision floating-point value
                   4937: `a' by the corresponding value `b'.  The operation is performed according to
                   4938: the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
                   4939: -------------------------------------------------------------------------------
                   4940: */
                   4941: float128 float128_div( float128 a, float128 b )
                   4942: { /* Resolves NaN, infinity and zero operands first; the quotient is then built as two 64-bit words. */
                   4943:     flag aSign, bSign, zSign;
                   4944:     int32 aExp, bExp, zExp;
                   4945:     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
                   4946:     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
                   4947:     float128 z; /* only written on the `invalid' path, to hold the default NaN */
                   4948:
                   4949:     aSig1 = extractFloat128Frac1( a );
                   4950:     aSig0 = extractFloat128Frac0( a );
                   4951:     aExp = extractFloat128Exp( a );
                   4952:     aSign = extractFloat128Sign( a );
                   4953:     bSig1 = extractFloat128Frac1( b );
                   4954:     bSig0 = extractFloat128Frac0( b );
                   4955:     bExp = extractFloat128Exp( b );
                   4956:     bSign = extractFloat128Sign( b );
                   4957:     zSign = aSign ^ bSign; /* sign used for every non-NaN result below */
                   4958:     if ( aExp == 0x7FFF ) { /* a is a NaN (nonzero fraction) or an infinity */
                   4959:         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
                   4960:         if ( bExp == 0x7FFF ) {
                   4961:             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
                   4962:             goto invalid; /* infinity divided by infinity */
                   4963:         }
                   4964:         return packFloat128( zSign, 0x7FFF, 0, 0 ); /* infinity divided by a finite value: signed infinity */
                   4965:     }
                   4966:     if ( bExp == 0x7FFF ) { /* a is finite here; b is a NaN or an infinity */
                   4967:         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
                   4968:         return packFloat128( zSign, 0, 0, 0 ); /* finite divided by infinity: signed zero */
                   4969:     }
                   4970:     if ( bExp == 0 ) { /* b is zero or subnormal */
                   4971:         if ( ( bSig0 | bSig1 ) == 0 ) { /* division by zero */
                   4972:             if ( ( aExp | aSig0 | aSig1 ) == 0 ) { /* zero divided by zero */
                   4973:  invalid: /* also reached by the goto in the infinity/infinity case above */
                   4974:                 float_raise( float_flag_invalid );
                   4975:                 z.low = float128_default_nan_low;
                   4976:                 z.high = float128_default_nan_high;
                   4977:                 return z;
                   4978:             }
                   4979:             float_raise( float_flag_divbyzero ); /* nonzero finite divided by zero */
                   4980:             return packFloat128( zSign, 0x7FFF, 0, 0 );
                   4981:         }
                   4982:         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
                   4983:     }
                   4984:     if ( aExp == 0 ) { /* a is zero or subnormal; b is finite and nonzero here */
                   4985:         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
                   4986:         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
                   4987:     }
                   4988:     zExp = aExp - bExp + 0x3FFD; /* difference of biased exponents, re-biased; adjusted by ++zExp below if needed */
                   4989:     shortShift128Left( /* set bit 48 of the high word and move it up to bit 63 */
                   4990:         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
                   4991:     shortShift128Left( /* same alignment for the divisor */
                   4992:         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
                   4993:     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { /* make the dividend smaller than the divisor before estimating */
                   4994:         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
                   4995:         ++zExp;
                   4996:     }
                   4997:     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); /* high quotient word, estimated from b's high word only */
                   4998:     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
                   4999:     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); /* remainder for the estimated quotient word */
                   5000:     while ( (sbits64) rem0 < 0 ) { /* estimate was too large: step it down and add the divisor back */
                   5001:         --zSig0;
                   5002:         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
                   5003:     }
                   5004:     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); /* low quotient word, estimated from the remainder */
                   5005:     if ( ( zSig1 & 0x3FFF ) <= 4 ) { /* exact correction only when the low 14 bits are 0..4; otherwise presumably the estimate error cannot change the rounded result -- confirm */
                   5006:         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
                   5007:         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
                   5008:         while ( (sbits64) rem1 < 0 ) { /* same overestimate correction as for zSig0 */
                   5009:             --zSig1;
                   5010:             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
                   5011:         }
                   5012:         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); /* nonzero remainder: record inexactness in the lowest bit */
                   5013:     }
                   5014:     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); /* undo the 15-bit alignment; shifted-out bits go to zSig2 */
                   5015:     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
                   5016:
                   5017: }
                   5018:
                   5019: /*
                   5020: -------------------------------------------------------------------------------
                   5021: Returns the remainder of the quadruple-precision floating-point value `a'
                   5022: with respect to the corresponding value `b'.  The operation is performed
                   5023: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
                   5024: -------------------------------------------------------------------------------
                   5025: */
                   5026: float128 float128_rem( float128 a, float128 b )
                   5027: { /* Resolves special operands first, then reduces a's significand by multiples of b's and picks the remainder of smaller magnitude. */
                   5028:     flag aSign, bSign, zSign;
                   5029:     int32 aExp, bExp, expDiff;
                   5030:     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
                   5031:     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
                   5032:     sbits64 sigMean0; /* signed so that the sign of the 128-bit sum below can be tested */
                   5033:     float128 z; /* only written on the `invalid' path, to hold the default NaN */
                   5034:
                   5035:     aSig1 = extractFloat128Frac1( a );
                   5036:     aSig0 = extractFloat128Frac0( a );
                   5037:     aExp = extractFloat128Exp( a );
                   5038:     aSign = extractFloat128Sign( a );
                   5039:     bSig1 = extractFloat128Frac1( b );
                   5040:     bSig0 = extractFloat128Frac0( b );
                   5041:     bExp = extractFloat128Exp( b );
                   5042:     bSign = extractFloat128Sign( b ); /* NOTE(review): bSign is never read afterwards in this function */
                   5043:     if ( aExp == 0x7FFF ) { /* a is a NaN or an infinity */
                   5044:         if (    ( aSig0 | aSig1 )
                   5045:              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
                   5046:             return propagateFloat128NaN( a, b );
                   5047:         }
                   5048:         goto invalid; /* remainder of an infinity */
                   5049:     }
                   5050:     if ( bExp == 0x7FFF ) { /* a is finite here; b is a NaN or an infinity */
                   5051:         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
                   5052:         return a; /* finite a with infinite b: a is returned unchanged */
                   5053:     }
                   5054:     if ( bExp == 0 ) { /* b is zero or subnormal */
                   5055:         if ( ( bSig0 | bSig1 ) == 0 ) { /* remainder with respect to zero */
                   5056:  invalid: /* also reached by the goto in the a-is-infinity case above */
                   5057:             float_raise( float_flag_invalid );
                   5058:             z.low = float128_default_nan_low;
                   5059:             z.high = float128_default_nan_high;
                   5060:             return z;
                   5061:         }
                   5062:         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
                   5063:     }
                   5064:     if ( aExp == 0 ) { /* a is zero or subnormal */
                   5065:         if ( ( aSig0 | aSig1 ) == 0 ) return a; /* zero a is returned unchanged, sign included */
                   5066:         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
                   5067:     }
                   5068:     expDiff = aExp - bExp;
                   5069:     if ( expDiff < -1 ) return a; /* a's exponent is at least two below b's: a is returned unchanged */
                   5070:     shortShift128Left( /* set bit 48 and align; one bit less when a's exponent is one below b's */
                   5071:         aSig0 | LIT64( 0x0001000000000000 ),
                   5072:         aSig1,
                   5073:         15 - ( expDiff < 0 ),
                   5074:         &aSig0,
                   5075:         &aSig1
                   5076:     );
                   5077:     shortShift128Left(
                   5078:         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
                   5079:     q = le128( bSig0, bSig1, aSig0, aSig1 ); /* first quotient bit: 1 when b's significand fits into a's */
                   5080:     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
                   5081:     expDiff -= 64;
                   5082:     while ( 0 < expDiff ) { /* consume the exponent difference 61 bits per pass */
                   5083:         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
                   5084:         q = ( 4 < q ) ? q - 4 : 0; /* lower the estimate by 4 (clamped at 0) -- presumably so it can never exceed the true quotient; confirm */
                   5085:         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
                   5086:         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
                   5087:         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); /* allZero is a discard target; its value is never read */
                   5088:         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
                   5089:         expDiff -= 61;
                   5090:     }
                   5091:     if ( -64 < expDiff ) { /* expDiff is <= 0 here; between 1 and 64 quotient bits are still needed */
                   5092:         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
                   5093:         q = ( 4 < q ) ? q - 4 : 0;
                   5094:         q >>= - expDiff; /* shift count is in 0..63 on this branch */
                   5095:         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
                   5096:         expDiff += 52;
                   5097:         if ( expDiff < 0 ) {
                   5098:             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
                   5099:         }
                   5100:         else {
                   5101:             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
                   5102:         }
                   5103:         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
                   5104:         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); /* only the low two product words are subtracted; term0 is ignored */
                   5105:     }
                   5106:     else { /* no further quotient bits: only bring both significands to the alignment used below */
                   5107:         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
                   5108:         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
                   5109:     }
                   5110:     do { /* subtract b until the remainder turns negative, remembering the last non-negative value */
                   5111:         alternateASig0 = aSig0;
                   5112:         alternateASig1 = aSig1;
                   5113:         ++q;
                   5114:         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
                   5115:     } while ( 0 <= (sbits64) aSig0 );
                   5116:     add128( /* NOTE(review): &sigMean0 is an sbits64 pointer; if add128 takes bits64 pointers this is a pointer-signedness mismatch -- confirm */
                   5117:         aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
                   5118:     if (    ( sigMean0 < 0 ) /* negative sum: the negative candidate has the larger magnitude */
                   5119:          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { /* exact tie: decided by the parity of q */
                   5120:         aSig0 = alternateASig0; /* keep the last non-negative remainder instead */
                   5121:         aSig1 = alternateASig1;
                   5122:     }
                   5123:     zSign = ( (sbits64) aSig0 < 0 );
                   5124:     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); /* negate to obtain the magnitude */
                   5125:     return
                   5126:         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 ); /* result sign flips relative to a when the negative candidate was kept */
                   5127:
                   5128: }
                   5129:
                   5130: /*
                   5131: -------------------------------------------------------------------------------
                   5132: Returns the square root of the quadruple-precision floating-point value `a'.
                   5133: The operation is performed according to the IEC/IEEE Standard for Binary
                   5134: Floating-Point Arithmetic.
                   5135: -------------------------------------------------------------------------------
                   5136: */
                   5137: float128 float128_sqrt( float128 a )
                   5138: { /* Resolves NaN, infinity, negative and zero inputs first; the root is then built as two 64-bit words. */
                   5139:     flag aSign;
                   5140:     int32 aExp, zExp;
                   5141:     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
                   5142:     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
                   5143:     float128 z; /* only written on the `invalid' path, to hold the default NaN */
                   5144:
                   5145:     aSig1 = extractFloat128Frac1( a );
                   5146:     aSig0 = extractFloat128Frac0( a );
                   5147:     aExp = extractFloat128Exp( a );
                   5148:     aSign = extractFloat128Sign( a );
                   5149:     if ( aExp == 0x7FFF ) { /* a is a NaN or an infinity */
                   5150:         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a ); /* the single operand is passed twice */
                   5151:         if ( ! aSign ) return a; /* positive infinity is returned unchanged */
                   5152:         goto invalid; /* negative infinity */
                   5153:     }
                   5154:     if ( aSign ) {
                   5155:         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; /* negative zero is returned unchanged */
                   5156:  invalid: /* any other negative input, or negative infinity via the goto above */
                   5157:         float_raise( float_flag_invalid );
                   5158:         z.low = float128_default_nan_low;
                   5159:         z.high = float128_default_nan_high;
                   5160:         return z;
                   5161:     }
                   5162:     if ( aExp == 0 ) { /* positive zero or subnormal */
                   5163:         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
                   5164:         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
                   5165:     }
                   5166:     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; /* halve the exponent offset; NOTE(review): >> on a negative int32 is implementation-defined in C -- confirm an arithmetic shift is assumed */
                   5167:     aSig0 |= LIT64( 0x0001000000000000 ); /* set bit 48 of the high word -- presumably the implicit leading 1 */
                   5168:     zSig0 = estimateSqrt32( aExp, aSig0>>17 ); /* first estimate, computed from the top bits of the significand */
                   5169:     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); /* shift one bit less when aExp is odd */
                   5170:     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); /* refine the estimate into a 64-bit high root word */
                   5171:     doubleZSig0 = zSig0<<1;
                   5172:     mul64To128( zSig0, zSig0, &term0, &term1 );
                   5173:     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); /* remainder = a - zSig0 squared */
                   5174:     while ( (sbits64) rem0 < 0 ) { /* estimate too large: step it down, adding 2*zSig0 + 1 (new zSig0) back to the remainder */
                   5175:         --zSig0;
                   5176:         doubleZSig0 -= 2;
                   5177:         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); /* zSig0>>63 supplies the bit lost from doubleZSig0 by the <<1 */
                   5178:     }
                   5179:     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); /* low root word, estimated from the remainder */
                   5180:     if ( ( zSig1 & 0x1FFF ) <= 5 ) { /* exact correction only when the low 13 bits are 0..5; otherwise presumably the estimate error cannot change the rounded result -- confirm */
                   5181:         if ( zSig1 == 0 ) zSig1 = 1; /* a zero estimate is raised to 1 before the check; the loop below may lower it again */
                   5182:         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
                   5183:         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
                   5184:         mul64To128( zSig1, zSig1, &term2, &term3 );
                   5185:         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); /* remainder now also excludes 2*zSig0*zSig1 and zSig1 squared */
                   5186:         while ( (sbits64) rem1 < 0 ) { /* low word too large: step it down and restore the remainder */
                   5187:             --zSig1;
                   5188:             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
                   5189:             term3 |= 1;
                   5190:             term2 |= doubleZSig0;
                   5191:             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
                   5192:         }
                   5193:         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); /* nonzero remainder: record inexactness in the lowest bit */
                   5194:     }
                   5195:     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); /* shifted-out bits go to zSig2 */
                   5196:     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 ); /* sign is always 0 here: negative inputs returned earlier */
                   5197:
                   5198: }
                   5199:
                   5200: /*
                   5201: -------------------------------------------------------------------------------
                   5202: Returns 1 if the quadruple-precision floating-point value `a' is equal to
                   5203: the corresponding value `b', and 0 otherwise.  The comparison is performed
                   5204: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
                   5205: -------------------------------------------------------------------------------
                   5206: */
                   5207: flag float128_eq( float128 a, float128 b )
                   5208: { /* Quiet equality: a NaN operand yields 0, and only a signaling NaN raises the invalid flag. */
                   5209:
                   5210:     if (    (    ( extractFloat128Exp( a ) == 0x7FFF ) /* a is a NaN: all-ones exponent with nonzero fraction */
                   5211:               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
                   5212:          || (    ( extractFloat128Exp( b ) == 0x7FFF ) /* same test for b */
                   5213:               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
                   5214:        ) {
                   5215:         if (    float128_is_signaling_nan( a )
                   5216:              || float128_is_signaling_nan( b ) ) {
                   5217:             float_raise( float_flag_invalid );
                   5218:         }
                   5219:         return 0; /* a NaN never compares equal */
                   5220:     }
                   5221:     return
                   5222:            ( a.low == b.low )
                   5223:         && (    ( a.high == b.high ) /* identical bit patterns */
                   5224:              || (    ( a.low == 0 ) /* or both are zero: the <<1 discards the sign bit, so +0 equals -0 */
                   5225:                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
                   5226:            );
                   5227:
                   5228: }
                   5229:
                   5230: /*
                   5231: -------------------------------------------------------------------------------
                   5232: Returns 1 if the quadruple-precision floating-point value `a' is less than
                   5233: or equal to the corresponding value `b', and 0 otherwise.  The comparison
                   5234: is performed according to the IEC/IEEE Standard for Binary Floating-Point
                   5235: Arithmetic.
                   5236: -------------------------------------------------------------------------------
                   5237: */
                   5238: flag float128_le( float128 a, float128 b )
                   5239: { /* Signaling comparison: any NaN operand raises the invalid flag and yields 0. */
                   5240:     flag aSign, bSign;
                   5241:
                   5242:     if (    (    ( extractFloat128Exp( a ) == 0x7FFF ) /* a is a NaN: all-ones exponent with nonzero fraction */
                   5243:               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
                   5244:          || (    ( extractFloat128Exp( b ) == 0x7FFF ) /* same test for b */
                   5245:               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
                   5246:        ) {
                   5247:         float_raise( float_flag_invalid ); /* raised for every NaN, with no signaling-NaN check */
                   5248:         return 0;
                   5249:     }
                   5250:     aSign = extractFloat128Sign( a );
                   5251:     bSign = extractFloat128Sign( b );
                   5252:     if ( aSign != bSign ) {
                   5253:         return
                   5254:                aSign /* a negative, b non-negative */
                   5255:             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) /* or both are zeros of opposite sign (sign bit shifted out) */
                   5256:                  == 0 );
                   5257:     }
                   5258:     return
                   5259:           aSign ? le128( b.high, b.low, a.high, a.low ) /* both negative: operand order is swapped for the word comparison */
                   5260:         : le128( a.high, a.low, b.high, b.low );
                   5261:
                   5262: }
                   5263:
                   5264: /*
                   5265: -------------------------------------------------------------------------------
                   5266: Returns 1 if the quadruple-precision floating-point value `a' is less than
                   5267: the corresponding value `b', and 0 otherwise.  The comparison is performed
                   5268: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
                   5269: -------------------------------------------------------------------------------
                   5270: */
                   5271: flag float128_lt( float128 a, float128 b )
                   5272: { /* Signaling comparison: any NaN operand raises the invalid flag and yields 0. */
                   5273:     flag aSign, bSign;
                   5274:
                   5275:     if (    (    ( extractFloat128Exp( a ) == 0x7FFF ) /* a is a NaN: all-ones exponent with nonzero fraction */
                   5276:               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
                   5277:          || (    ( extractFloat128Exp( b ) == 0x7FFF ) /* same test for b */
                   5278:               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
                   5279:        ) {
                   5280:         float_raise( float_flag_invalid ); /* raised for every NaN, with no signaling-NaN check */
                   5281:         return 0;
                   5282:     }
                   5283:     aSign = extractFloat128Sign( a );
                   5284:     bSign = extractFloat128Sign( b );
                   5285:     if ( aSign != bSign ) {
                   5286:         return
                   5287:                aSign /* a negative, b non-negative ... */
                   5288:             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) /* ... unless both are zeros (sign bit shifted out) */
                   5289:                  != 0 );
                   5290:     }
                   5291:     return
                   5292:           aSign ? lt128( b.high, b.low, a.high, a.low ) /* both negative: operand order is swapped for the word comparison */
                   5293:         : lt128( a.high, a.low, b.high, b.low );
                   5294:
                   5295: }
                   5296:
                   5297: /*
                   5298: -------------------------------------------------------------------------------
                   5299: Returns 1 if the quadruple-precision floating-point value `a' is equal to
                   5300: the corresponding value `b', and 0 otherwise.  The invalid exception is
                   5301: raised if either operand is a NaN.  Otherwise, the comparison is performed
                   5302: according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
                   5303: -------------------------------------------------------------------------------
                   5304: */
                   5305: flag float128_eq_signaling( float128 a, float128 b )
                   5306: { /* Same equality test as float128_eq, but every NaN operand raises the invalid flag. */
                   5307:
                   5308:     if (    (    ( extractFloat128Exp( a ) == 0x7FFF ) /* a is a NaN: all-ones exponent with nonzero fraction */
                   5309:               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
                   5310:          || (    ( extractFloat128Exp( b ) == 0x7FFF ) /* same test for b */
                   5311:               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
                   5312:        ) {
                   5313:         float_raise( float_flag_invalid ); /* no signaling-NaN check: quiet NaNs raise too */
                   5314:         return 0;
                   5315:     }
                   5316:     return
                   5317:            ( a.low == b.low )
                   5318:         && (    ( a.high == b.high ) /* identical bit patterns */
                   5319:              || (    ( a.low == 0 ) /* or both are zero: the <<1 discards the sign bit, so +0 equals -0 */
                   5320:                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
                   5321:            );
                   5322:
                   5323: }
                   5324:
                   5325: /*
                   5326: -------------------------------------------------------------------------------
                   5327: Returns 1 if the quadruple-precision floating-point value `a' is less than
                   5328: or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
                   5329: cause an exception.  Otherwise, the comparison is performed according to the
                   5330: IEC/IEEE Standard for Binary Floating-Point Arithmetic.
                   5331: -------------------------------------------------------------------------------
                   5332: */
                   5333: flag float128_le_quiet( float128 a, float128 b )
                   5334: { /* Same ordering test as float128_le, but only a signaling NaN raises the invalid flag. */
                   5335:     flag aSign, bSign;
                   5336:
                   5337:     if (    (    ( extractFloat128Exp( a ) == 0x7FFF ) /* a is a NaN: all-ones exponent with nonzero fraction */
                   5338:               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
                   5339:          || (    ( extractFloat128Exp( b ) == 0x7FFF ) /* same test for b */
                   5340:               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
                   5341:        ) {
                   5342:         if (    float128_is_signaling_nan( a )
                   5343:              || float128_is_signaling_nan( b ) ) {
                   5344:             float_raise( float_flag_invalid );
                   5345:         }
                   5346:         return 0; /* a NaN operand always yields 0, with or without the flag */
                   5347:     }
                   5348:     aSign = extractFloat128Sign( a );
                   5349:     bSign = extractFloat128Sign( b );
                   5350:     if ( aSign != bSign ) {
                   5351:         return
                   5352:                aSign /* a negative, b non-negative */
                   5353:             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) /* or both are zeros of opposite sign (sign bit shifted out) */
                   5354:                  == 0 );
                   5355:     }
                   5356:     return
                   5357:           aSign ? le128( b.high, b.low, a.high, a.low ) /* both negative: operand order is swapped for the word comparison */
                   5358:         : le128( a.high, a.low, b.high, b.low );
                   5359:
                   5360: }
                   5361:
                   5362: /*
                   5363: -------------------------------------------------------------------------------
                   5364: Returns 1 if the quadruple-precision floating-point value `a' is less than
                   5365: the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
                   5366: exception.  Otherwise, the comparison is performed according to the IEC/IEEE
                   5367: Standard for Binary Floating-Point Arithmetic.
                   5368: -------------------------------------------------------------------------------
                   5369: */
                   5370: flag float128_lt_quiet( float128 a, float128 b )
                   5371: { /* Same ordering test as float128_lt, but only a signaling NaN raises the invalid flag. */
                   5372:     flag aSign, bSign;
                   5373:
                   5374:     if (    (    ( extractFloat128Exp( a ) == 0x7FFF ) /* a is a NaN: all-ones exponent with nonzero fraction */
                   5375:               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
                   5376:          || (    ( extractFloat128Exp( b ) == 0x7FFF ) /* same test for b */
                   5377:               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
                   5378:        ) {
                   5379:         if (    float128_is_signaling_nan( a )
                   5380:              || float128_is_signaling_nan( b ) ) {
                   5381:             float_raise( float_flag_invalid );
                   5382:         }
                   5383:         return 0; /* a NaN operand always yields 0, with or without the flag */
                   5384:     }
                   5385:     aSign = extractFloat128Sign( a );
                   5386:     bSign = extractFloat128Sign( b );
                   5387:     if ( aSign != bSign ) {
                   5388:         return
                   5389:                aSign /* a negative, b non-negative ... */
                   5390:             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) /* ... unless both are zeros (sign bit shifted out) */
                   5391:                  != 0 );
                   5392:     }
                   5393:     return
                   5394:           aSign ? lt128( b.high, b.low, a.high, a.low ) /* both negative: operand order is swapped for the word comparison */
                   5395:         : lt128( a.high, a.low, b.high, b.low );
                   5396:
                   5397: }
                   5398:
                   5399: #endif
                   5400:
                   5401:
                   5402: #if defined(SOFTFLOAT_FOR_GCC) && defined(SOFTFLOAT_NEED_FIXUNS)
                   5403:
                   5404: /*
                   5405:  * These two routines are not part of the original softfloat distribution.
                   5406:  *
                   5407:  * They are based on the corresponding conversions to integer but return
                   5408:  * unsigned numbers instead since these functions are required by GCC.
                   5409:  *
                   5410:  * Added by Mark Brinicombe <mark@netbsd.org>  27/09/97
                   5411:  *
                   5412:  * float64 version overhauled for SoftFloat 2a [bjh21 2000-07-15]
                   5413:  */
                   5414:
                   5415: /*
                   5416: -------------------------------------------------------------------------------
                   5417: Returns the result of converting the double-precision floating-point value
                   5418: `a' to the 32-bit unsigned integer format.  The conversion is
                   5419: performed according to the IEC/IEEE Standard for Binary Floating-point
                   5420: Arithmetic, except that the conversion is always rounded toward zero.  If
                   5421: `a' is a NaN, the largest positive integer is returned.  If the conversion
                   5422: overflows, the largest integer positive is returned.
                   5423: -------------------------------------------------------------------------------
                   5424: */
                   5425: uint32 float64_to_uint32_round_to_zero( float64 a )
                   5426: {
                   5427:     flag aSign;
                   5428:     int16 aExp, shiftCount;
                   5429:     bits64 aSig, savedASig;
                   5430:     uint32 z;
                   5431:
                   5432:     aSig = extractFloat64Frac( a );
                   5433:     aExp = extractFloat64Exp( a );
                   5434:     aSign = extractFloat64Sign( a );
                   5435:
                   5436:     if (aSign) {
                   5437:         float_raise( float_flag_invalid );
                   5438:        return(0);
                   5439:     }
                   5440:
                   5441:     if ( 0x41E < aExp ) {
                   5442:         float_raise( float_flag_invalid );
                   5443:         return 0xffffffff;
                   5444:     }
                   5445:     else if ( aExp < 0x3FF ) {
                   5446:         if ( aExp || aSig ) float_set_inexact();
                   5447:         return 0;
                   5448:     }
                   5449:     aSig |= LIT64( 0x0010000000000000 );
                   5450:     shiftCount = 0x433 - aExp;
                   5451:     savedASig = aSig;
                   5452:     aSig >>= shiftCount;
                   5453:     z = aSig;
                   5454:     if ( ( aSig<<shiftCount ) != savedASig ) {
                   5455:         float_set_inexact();
                   5456:     }
                   5457:     return z;
                   5458:
                   5459: }
                   5460:
                   5461: /*
                   5462: -------------------------------------------------------------------------------
                   5463: Returns the result of converting the single-precision floating-point value
                   5464: `a' to the 32-bit unsigned integer format.  The conversion is
                   5465: performed according to the IEC/IEEE Standard for Binary Floating-point
                   5466: Arithmetic, except that the conversion is always rounded toward zero.  If
                   5467: `a' is a NaN, the largest positive integer is returned.  If the conversion
                   5468: overflows, the largest positive integer is returned.
                   5469: -------------------------------------------------------------------------------
                   5470: */
                   5471: uint32 float32_to_uint32_round_to_zero( float32 a )
                   5472: {
                   5473:     flag aSign;
                   5474:     int16 aExp, shiftCount;
                   5475:     bits32 aSig;
                   5476:     uint32 z;
                   5477:
                   5478:     aSig = extractFloat32Frac( a );
                   5479:     aExp = extractFloat32Exp( a );
                   5480:     aSign = extractFloat32Sign( a );
                   5481:     shiftCount = aExp - 0x9E;
                   5482:
                   5483:     if (aSign) {
                   5484:         float_raise( float_flag_invalid );
                   5485:        return(0);
                   5486:     }
                   5487:     if ( 0 < shiftCount ) {
                   5488:         float_raise( float_flag_invalid );
                   5489:         return 0xFFFFFFFF;
                   5490:     }
                   5491:     else if ( aExp <= 0x7E ) {
                   5492:         if ( aExp | aSig ) float_set_inexact();
                   5493:         return 0;
                   5494:     }
                   5495:     aSig = ( aSig | 0x800000 )<<8;
                   5496:     z = aSig>>( - shiftCount );
                   5497:     if ( aSig<<( shiftCount & 31 ) ) {
                   5498:         float_set_inexact();
                   5499:     }
                   5500:     return z;
                   5501:
                   5502: }
                   5503:
                   5504: #endif
                   5505:
                   5506: #endif /* !NO_IEEE */
CVSweb