sys/arch/sparc/fpu/fpu_sqrt.c - annotate

Return to fpu_sqrt.c CVS log
Up to [local] / sys / arch / sparc / fpu
Annotation of sys/arch/sparc/fpu/fpu_sqrt.c, Revision 1.1.1.1

1.1       nbrk        1: /*     $OpenBSD: fpu_sqrt.c,v 1.3 2003/06/02 23:27:54 millert Exp $    */
                      2: /*     $NetBSD: fpu_sqrt.c,v 1.2 1994/11/20 20:52:46 deraadt Exp $ */
                      3:
                      4: /*
                      5:  * Copyright (c) 1992, 1993
                      6:  *     The Regents of the University of California.  All rights reserved.
                      7:  *
                      8:  * This software was developed by the Computer Systems Engineering group
                      9:  * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
                     10:  * contributed to Berkeley.
                     11:  *
                     12:  * All advertising materials mentioning features or use of this software
                     13:  * must display the following acknowledgement:
                     14:  *     This product includes software developed by the University of
                     15:  *     California, Lawrence Berkeley Laboratory.
                     16:  *
                     17:  * Redistribution and use in source and binary forms, with or without
                     18:  * modification, are permitted provided that the following conditions
                     19:  * are met:
                     20:  * 1. Redistributions of source code must retain the above copyright
                     21:  *    notice, this list of conditions and the following disclaimer.
                     22:  * 2. Redistributions in binary form must reproduce the above copyright
                     23:  *    notice, this list of conditions and the following disclaimer in the
                     24:  *    documentation and/or other materials provided with the distribution.
                     25:  * 3. Neither the name of the University nor the names of its contributors
                     26:  *    may be used to endorse or promote products derived from this software
                     27:  *    without specific prior written permission.
                     28:  *
                     29:  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
                     30:  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
                     31:  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
                     32:  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
                     33:  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
                     34:  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
                     35:  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
                     36:  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
                     37:  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
                     38:  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
                     39:  * SUCH DAMAGE.
                     40:  *
                     41:  *     @(#)fpu_sqrt.c  8.1 (Berkeley) 6/11/93
                     42:  */
                     43:
                     44: /*
                     45:  * Perform an FPU square root (return sqrt(x)).
                     46:  */
                     47:
                     48: #include <sys/types.h>
                     49:
                     50: #include <machine/reg.h>
                     51:
                     52: #include <sparc/fpu/fpu_arith.h>
                     53: #include <sparc/fpu/fpu_emu.h>
                     54:
                     55: /*
                     56:  * Our task is to calculate the square root of a floating point number x0.
                     57:  * This number x normally has the form:
                     58:  *
                     59:  *                 exp
                     60:  *     x = mant * 2            (where 1 <= mant < 2 and exp is an integer)
                     61:  *
                     62:  * This can be left as it stands, or the mantissa can be doubled and the
                     63:  * exponent decremented:
                     64:  *
                     65:  *                       exp-1
                     66:  *     x = (2 * mant) * 2      (where 2 <= 2 * mant < 4)
                     67:  *
                     68:  * If the exponent `exp' is even, the square root of the number is best
                     69:  * handled using the first form, and is by definition equal to:
                     70:  *
                     71:  *                             exp/2
                     72:  *     sqrt(x) = sqrt(mant) * 2
                     73:  *
                     74:  * If exp is odd, on the other hand, it is convenient to use the second
                     75:  * form, giving:
                     76:  *
                     77:  *                                 (exp-1)/2
                     78:  *     sqrt(x) = sqrt(2 * mant) * 2
                     79:  *
                     80:  * In the first case, we have
                     81:  *
                     82:  *     1 <= mant < 2
                     83:  *
                     84:  * and therefore
                     85:  *
                     86:  *     sqrt(1) <= sqrt(mant) < sqrt(2)
                     87:  *
                     88:  * while in the second case we have
                     89:  *
                     90:  *     2 <= 2*mant < 4
                     91:  *
                     92:  * and therefore
                     93:  *
                     94:  *     sqrt(2) <= sqrt(2*mant) < sqrt(4)
                     95:  *
                     96:  * so that in any case, we are sure that
                     97:  *
                     98:  *     sqrt(1) <= sqrt(n * mant) < sqrt(4),    n = 1 or 2
                     99:  *
                    100:  * or
                    101:  *
                    102:  *     1 <= sqrt(n * mant) < 2,                n = 1 or 2.
                    103:  *
                    104:  * This root is therefore a properly formed mantissa for a floating
                    105:  * point number.  The exponent of sqrt(x) is either exp/2 or (exp-1)/2
                    106:  * as above.  This leaves us with the problem of finding the square root
                    107:  * of a fixed-point number in the range [1..4).
                    108:  *
                    109:  * Though it may not be instantly obvious, the following square root
                    110:  * algorithm works for any integer x of an even number of bits, provided
                    111:  * that no overflows occur:
                    112:  *
                    113:  *     let q = 0
                    114:  *     for k = NBITS-1 to 0 step -1 do -- for each digit in the answer...
                    115:  *             x *= 2                  -- multiply by radix, for next digit
                    116:  *             if x >= 2q + 2^k then   -- if adding 2^k does not
                    117:  *                     x -= 2q + 2^k   -- exceed the correct root,
                    118:  *                     q += 2^k        -- add 2^k and adjust x
                    119:  *             fi
                    120:  *     done
                    121:  *     sqrt = q / 2^(NBITS/2)          -- (and any remainder is in x)
                    122:  *
                    123:  * If NBITS is odd (so that k is initially even), we can just add another
                    124:  * zero bit at the top of x.  Doing so means that q is not going to acquire
                    125:  * a 1 bit in the first trip around the loop (since x0 < 2^NBITS).  If the
                    126:  * final value in x is not needed, or can be off by a factor of 2, this is
                    127:  * equivalant to moving the `x *= 2' step to the bottom of the loop:
                    128:  *
                    129:  *     for k = NBITS-1 to 0 step -1 do if ... fi; x *= 2; done
                    130:  *
                    131:  * and the result q will then be sqrt(x0) * 2^floor(NBITS / 2).
                    132:  * (Since the algorithm is destructive on x, we will call x's initial
                    133:  * value, for which q is some power of two times its square root, x0.)
                    134:  *
                    135:  * If we insert a loop invariant y = 2q, we can then rewrite this using
                    136:  * C notation as:
                    137:  *
                    138:  *     q = y = 0; x = x0;
                    139:  *     for (k = NBITS; --k >= 0;) {
                    140:  * #if (NBITS is even)
                    141:  *             x *= 2;
                    142:  * #endif
                    143:  *             t = y + (1 << k);
                    144:  *             if (x >= t) {
                    145:  *                     x -= t;
                    146:  *                     q += 1 << k;
                    147:  *                     y += 1 << (k + 1);
                    148:  *             }
                    149:  * #if (NBITS is odd)
                    150:  *             x *= 2;
                    151:  * #endif
                    152:  *     }
                    153:  *
                    154:  * If x0 is fixed point, rather than an integer, we can simply alter the
                    155:  * scale factor between q and sqrt(x0).  As it happens, we can easily arrange
                    156:  * for the scale factor to be 2**0 or 1, so that sqrt(x0) == q.
                    157:  *
                    158:  * In our case, however, x0 (and therefore x, y, q, and t) are multiword
                    159:  * integers, which adds some complication.  But note that q is built one
                    160:  * bit at a time, from the top down, and is not used itself in the loop
                    161:  * (we use 2q as held in y instead).  This means we can build our answer
                    162:  * in an integer, one word at a time, which saves a bit of work.  Also,
                    163:  * since 1 << k is always a `new' bit in q, 1 << k and 1 << (k+1) are
                    164:  * `new' bits in y and we can set them with an `or' operation rather than
                    165:  * a full-blown multiword add.
                    166:  *
                    167:  * We are almost done, except for one snag.  We must prove that none of our
                    168:  * intermediate calculations can overflow.  We know that x0 is in [1..4)
                    169:  * and therefore the square root in q will be in [1..2), but what about x,
                    170:  * y, and t?
                    171:  *
                    172:  * We know that y = 2q at the beginning of each loop.  (The relation only
                    173:  * fails temporarily while y and q are being updated.)  Since q < 2, y < 4.
                    174:  * The sum in t can, in our case, be as much as y+(1<<1) = y+2 < 6, and.
                    175:  * Furthermore, we can prove with a bit of work that x never exceeds y by
                    176:  * more than 2, so that even after doubling, 0 <= x < 8.  (This is left as
                    177:  * an exercise to the reader, mostly because I have become tired of working
                    178:  * on this comment.)
                    179:  *
                    180:  * If our floating point mantissas (which are of the form 1.frac) occupy
                    181:  * B+1 bits, our largest intermediary needs at most B+3 bits, or two extra.
                    182:  * In fact, we want even one more bit (for a carry, to avoid compares), or
                    183:  * three extra.  There is a comment in fpu_emu.h reminding maintainers of
                    184:  * this, so we have some justification in assuming it.
                    185:  */
                    186: struct fpn *
                    187: fpu_sqrt(fe)
                    188:        struct fpemu *fe;
                    189: {
                    190:        register struct fpn *x = &fe->fe_f1;
                    191:        register u_int bit, q, tt;
                    192:        register u_int x0, x1, x2, x3;
                    193:        register u_int y0, y1, y2, y3;
                    194:        register u_int d0, d1, d2, d3;
                    195:        register int e;
                    196:
                    197:        /*
                    198:         * Take care of special cases first.  In order:
                    199:         *
                    200:         *      sqrt(NaN) = NaN
                    201:         *      sqrt(+0) = +0
                    202:         *      sqrt(-0) = -0
                    203:         *      sqrt(x < 0) = NaN       (including sqrt(-Inf))
                    204:         *      sqrt(+Inf) = +Inf
                    205:         *
                    206:         * Then all that remains are numbers with mantissas in [1..2).
                    207:         */
                    208:        if (ISNAN(x) || ISZERO(x))
                    209:                return (x);
                    210:        if (x->fp_sign)
                    211:                return (fpu_newnan(fe));
                    212:        if (ISINF(x))
                    213:                return (x);
                    214:
                    215:        /*
                    216:         * Calculate result exponent.  As noted above, this may involve
                    217:         * doubling the mantissa.  We will also need to double x each
                    218:         * time around the loop, so we define a macro for this here, and
                    219:         * we break out the multiword mantissa.
                    220:         */
                    221: #ifdef FPU_SHL1_BY_ADD
                    222: #define        DOUBLE_X { \
                    223:        FPU_ADDS(x3, x3, x3); FPU_ADDCS(x2, x2, x2); \
                    224:        FPU_ADDCS(x1, x1, x1); FPU_ADDC(x0, x0, x0); \
                    225: }
                    226: #else
                    227: #define        DOUBLE_X { \
                    228:        x0 = (x0 << 1) | (x1 >> 31); x1 = (x1 << 1) | (x2 >> 31); \
                    229:        x2 = (x2 << 1) | (x3 >> 31); x3 <<= 1; \
                    230: }
                    231: #endif
                    232: #if (FP_NMANT & 1) != 0
                    233: # define ODD_DOUBLE    DOUBLE_X
                    234: # define EVEN_DOUBLE   /* nothing */
                    235: #else
                    236: # define ODD_DOUBLE    /* nothing */
                    237: # define EVEN_DOUBLE   DOUBLE_X
                    238: #endif
                    239:        x0 = x->fp_mant[0];
                    240:        x1 = x->fp_mant[1];
                    241:        x2 = x->fp_mant[2];
                    242:        x3 = x->fp_mant[3];
                    243:        e = x->fp_exp;
                    244:        if (e & 1)              /* exponent is odd; use sqrt(2mant) */
                    245:                DOUBLE_X;
                    246:        /* THE FOLLOWING ASSUMES THAT RIGHT SHIFT DOES SIGN EXTENSION */
                    247:        x->fp_exp = e >> 1;     /* calculates (e&1 ? (e-1)/2 : e/2 */
                    248:
                    249:        /*
                    250:         * Now calculate the mantissa root.  Since x is now in [1..4),
                    251:         * we know that the first trip around the loop will definitely
                    252:         * set the top bit in q, so we can do that manually and start
                    253:         * the loop at the next bit down instead.  We must be sure to
                    254:         * double x correctly while doing the `known q=1.0'.
                    255:         *
                    256:         * We do this one mantissa-word at a time, as noted above, to
                    257:         * save work.  To avoid `(1 << 31) << 1', we also do the top bit
                    258:         * outside of each per-word loop.
                    259:         *
                    260:         * The calculation `t = y + bit' breaks down into `t0 = y0, ...,
                    261:         * t3 = y3, t? |= bit' for the appropriate word.  Since the bit
                    262:         * is always a `new' one, this means that three of the `t?'s are
                    263:         * just the corresponding `y?'; we use `#define's here for this.
                    264:         * The variable `tt' holds the actual `t?' variable.
                    265:         */
                    266:
                    267:        /* calculate q0 */
                    268: #define        t0 tt
                    269:        bit = FP_1;
                    270:        EVEN_DOUBLE;
                    271:        /* if (x >= (t0 = y0 | bit)) { */       /* always true */
                    272:                q = bit;
                    273:                x0 -= bit;
                    274:                y0 = bit << 1;
                    275:        /* } */
                    276:        ODD_DOUBLE;
                    277:        while ((bit >>= 1) != 0) {      /* for remaining bits in q0 */
                    278:                EVEN_DOUBLE;
                    279:                t0 = y0 | bit;          /* t = y + bit */
                    280:                if (x0 >= t0) {         /* if x >= t then */
                    281:                        x0 -= t0;       /*      x -= t */
                    282:                        q |= bit;       /*      q += bit */
                    283:                        y0 |= bit << 1; /*      y += bit << 1 */
                    284:                }
                    285:                ODD_DOUBLE;
                    286:        }
                    287:        x->fp_mant[0] = q;
                    288: #undef t0
                    289:
                    290:        /* calculate q1.  note (y0&1)==0. */
                    291: #define t0 y0
                    292: #define t1 tt
                    293:        q = 0;
                    294:        y1 = 0;
                    295:        bit = 1 << 31;
                    296:        EVEN_DOUBLE;
                    297:        t1 = bit;
                    298:        FPU_SUBS(d1, x1, t1);
                    299:        FPU_SUBC(d0, x0, t0);           /* d = x - t */
                    300:        if ((int)d0 >= 0) {             /* if d >= 0 (i.e., x >= t) then */
                    301:                x0 = d0, x1 = d1;       /*      x -= t */
                    302:                q = bit;                /*      q += bit */
                    303:                y0 |= 1;                /*      y += bit << 1 */
                    304:        }
                    305:        ODD_DOUBLE;
                    306:        while ((bit >>= 1) != 0) {      /* for remaining bits in q1 */
                    307:                EVEN_DOUBLE;            /* as before */
                    308:                t1 = y1 | bit;
                    309:                FPU_SUBS(d1, x1, t1);
                    310:                FPU_SUBC(d0, x0, t0);
                    311:                if ((int)d0 >= 0) {
                    312:                        x0 = d0, x1 = d1;
                    313:                        q |= bit;
                    314:                        y1 |= bit << 1;
                    315:                }
                    316:                ODD_DOUBLE;
                    317:        }
                    318:        x->fp_mant[1] = q;
                    319: #undef t1
                    320:
                    321:        /* calculate q2.  note (y1&1)==0; y0 (aka t0) is fixed. */
                    322: #define t1 y1
                    323: #define t2 tt
                    324:        q = 0;
                    325:        y2 = 0;
                    326:        bit = 1 << 31;
                    327:        EVEN_DOUBLE;
                    328:        t2 = bit;
                    329:        FPU_SUBS(d2, x2, t2);
                    330:        FPU_SUBCS(d1, x1, t1);
                    331:        FPU_SUBC(d0, x0, t0);
                    332:        if ((int)d0 >= 0) {
                    333:                x0 = d0, x1 = d1, x2 = d2;
                    334:                q |= bit;
                    335:                y1 |= 1;                /* now t1, y1 are set in concrete */
                    336:        }
                    337:        ODD_DOUBLE;
                    338:        while ((bit >>= 1) != 0) {
                    339:                EVEN_DOUBLE;
                    340:                t2 = y2 | bit;
                    341:                FPU_SUBS(d2, x2, t2);
                    342:                FPU_SUBCS(d1, x1, t1);
                    343:                FPU_SUBC(d0, x0, t0);
                    344:                if ((int)d0 >= 0) {
                    345:                        x0 = d0, x1 = d1, x2 = d2;
                    346:                        q |= bit;
                    347:                        y2 |= bit << 1;
                    348:                }
                    349:                ODD_DOUBLE;
                    350:        }
                    351:        x->fp_mant[2] = q;
                    352: #undef t2
                    353:
                    354:        /* calculate q3.  y0, t0, y1, t1 all fixed; y2, t2, almost done. */
                    355: #define t2 y2
                    356: #define t3 tt
                    357:        q = 0;
                    358:        y3 = 0;
                    359:        bit = 1 << 31;
                    360:        EVEN_DOUBLE;
                    361:        t3 = bit;
                    362:        FPU_SUBS(d3, x3, t3);
                    363:        FPU_SUBCS(d2, x2, t2);
                    364:        FPU_SUBCS(d1, x1, t1);
                    365:        FPU_SUBC(d0, x0, t0);
                    366:        ODD_DOUBLE;
                    367:        if ((int)d0 >= 0) {
                    368:                x0 = d0, x1 = d1, x2 = d2;
                    369:                q |= bit;
                    370:                y2 |= 1;
                    371:        }
                    372:        while ((bit >>= 1) != 0) {
                    373:                EVEN_DOUBLE;
                    374:                t3 = y3 | bit;
                    375:                FPU_SUBS(d3, x3, t3);
                    376:                FPU_SUBCS(d2, x2, t2);
                    377:                FPU_SUBCS(d1, x1, t1);
                    378:                FPU_SUBC(d0, x0, t0);
                    379:                if ((int)d0 >= 0) {
                    380:                        x0 = d0, x1 = d1, x2 = d2;
                    381:                        q |= bit;
                    382:                        y3 |= bit << 1;
                    383:                }
                    384:                ODD_DOUBLE;
                    385:        }
                    386:        x->fp_mant[3] = q;
                    387:
                    388:        /*
                    389:         * The result, which includes guard and round bits, is exact iff
                    390:         * x is now zero; any nonzero bits in x represent sticky bits.
                    391:         */
                    392:        x->fp_sticky = x0 | x1 | x2 | x3;
                    393:        return (x);
                    394: }
CVSweb