sys/arch/sparc64/fpu/fpu_sqrt.c - annotate

Return to fpu_sqrt.c CVS log
Up to [local] / sys / arch / sparc64 / fpu
Annotation of sys/arch/sparc64/fpu/fpu_sqrt.c, Revision 1.1

1.1     ! nbrk        1: /*     $OpenBSD: fpu_sqrt.c,v 1.2 2003/06/02 23:27:55 millert Exp $    */
        !             2: /*     $NetBSD: fpu_sqrt.c,v 1.2 1994/11/20 20:52:46 deraadt Exp $ */
        !             3:
        !             4: /*
        !             5:  * Copyright (c) 1992, 1993
        !             6:  *     The Regents of the University of California.  All rights reserved.
        !             7:  *
        !             8:  * This software was developed by the Computer Systems Engineering group
        !             9:  * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
        !            10:  * contributed to Berkeley.
        !            11:  *
        !            12:  * All advertising materials mentioning features or use of this software
        !            13:  * must display the following acknowledgement:
        !            14:  *     This product includes software developed by the University of
        !            15:  *     California, Lawrence Berkeley Laboratory.
        !            16:  *
        !            17:  * Redistribution and use in source and binary forms, with or without
        !            18:  * modification, are permitted provided that the following conditions
        !            19:  * are met:
        !            20:  * 1. Redistributions of source code must retain the above copyright
        !            21:  *    notice, this list of conditions and the following disclaimer.
        !            22:  * 2. Redistributions in binary form must reproduce the above copyright
        !            23:  *    notice, this list of conditions and the following disclaimer in the
        !            24:  *    documentation and/or other materials provided with the distribution.
        !            25:  * 3. Neither the name of the University nor the names of its contributors
        !            26:  *    may be used to endorse or promote products derived from this software
        !            27:  *    without specific prior written permission.
        !            28:  *
        !            29:  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
        !            30:  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
        !            31:  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
        !            32:  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
        !            33:  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
        !            34:  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
        !            35:  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
        !            36:  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
        !            37:  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
        !            38:  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
        !            39:  * SUCH DAMAGE.
        !            40:  *
        !            41:  *     @(#)fpu_sqrt.c  8.1 (Berkeley) 6/11/93
        !            42:  */
        !            43:
        !            44: /*
        !            45:  * Perform an FPU square root (return sqrt(x)).
        !            46:  */
        !            47:
        !            48: #include <sys/types.h>
        !            49:
        !            50: #include <machine/reg.h>
        !            51:
        !            52: #include <sparc64/fpu/fpu_arith.h>
        !            53: #include <sparc64/fpu/fpu_emu.h>
        !            54:
        !            55: /*
        !            56:  * Our task is to calculate the square root of a floating point number x0.
        !            57:  * This number x normally has the form:
        !            58:  *
        !            59:  *                 exp
        !            60:  *     x = mant * 2            (where 1 <= mant < 2 and exp is an integer)
        !            61:  *
        !            62:  * This can be left as it stands, or the mantissa can be doubled and the
        !            63:  * exponent decremented:
        !            64:  *
        !            65:  *                       exp-1
        !            66:  *     x = (2 * mant) * 2      (where 2 <= 2 * mant < 4)
        !            67:  *
        !            68:  * If the exponent `exp' is even, the square root of the number is best
        !            69:  * handled using the first form, and is by definition equal to:
        !            70:  *
        !            71:  *                             exp/2
        !            72:  *     sqrt(x) = sqrt(mant) * 2
        !            73:  *
        !            74:  * If exp is odd, on the other hand, it is convenient to use the second
        !            75:  * form, giving:
        !            76:  *
        !            77:  *                                 (exp-1)/2
        !            78:  *     sqrt(x) = sqrt(2 * mant) * 2
        !            79:  *
        !            80:  * In the first case, we have
        !            81:  *
        !            82:  *     1 <= mant < 2
        !            83:  *
        !            84:  * and therefore
        !            85:  *
        !            86:  *     sqrt(1) <= sqrt(mant) < sqrt(2)
        !            87:  *
        !            88:  * while in the second case we have
        !            89:  *
        !            90:  *     2 <= 2*mant < 4
        !            91:  *
        !            92:  * and therefore
        !            93:  *
        !            94:  *     sqrt(2) <= sqrt(2*mant) < sqrt(4)
        !            95:  *
        !            96:  * so that in any case, we are sure that
        !            97:  *
        !            98:  *     sqrt(1) <= sqrt(n * mant) < sqrt(4),    n = 1 or 2
        !            99:  *
        !           100:  * or
        !           101:  *
        !           102:  *     1 <= sqrt(n * mant) < 2,                n = 1 or 2.
        !           103:  *
        !           104:  * This root is therefore a properly formed mantissa for a floating
        !           105:  * point number.  The exponent of sqrt(x) is either exp/2 or (exp-1)/2
        !           106:  * as above.  This leaves us with the problem of finding the square root
        !           107:  * of a fixed-point number in the range [1..4).
        !           108:  *
        !           109:  * Though it may not be instantly obvious, the following square root
        !           110:  * algorithm works for any integer x of an even number of bits, provided
        !           111:  * that no overflows occur:
        !           112:  *
        !           113:  *     let q = 0
        !           114:  *     for k = NBITS-1 to 0 step -1 do -- for each digit in the answer...
        !           115:  *             x *= 2                  -- multiply by radix, for next digit
        !           116:  *             if x >= 2q + 2^k then   -- if adding 2^k does not
        !           117:  *                     x -= 2q + 2^k   -- exceed the correct root,
        !           118:  *                     q += 2^k        -- add 2^k and adjust x
        !           119:  *             fi
        !           120:  *     done
        !           121:  *     sqrt = q / 2^(NBITS/2)          -- (and any remainder is in x)
        !           122:  *
        !           123:  * If NBITS is odd (so that k is initially even), we can just add another
        !           124:  * zero bit at the top of x.  Doing so means that q is not going to acquire
        !           125:  * a 1 bit in the first trip around the loop (since x0 < 2^NBITS).  If the
        !           126:  * final value in x is not needed, or can be off by a factor of 2, this is
        !           127:  * equivalant to moving the `x *= 2' step to the bottom of the loop:
        !           128:  *
        !           129:  *     for k = NBITS-1 to 0 step -1 do if ... fi; x *= 2; done
        !           130:  *
        !           131:  * and the result q will then be sqrt(x0) * 2^floor(NBITS / 2).
        !           132:  * (Since the algorithm is destructive on x, we will call x's initial
        !           133:  * value, for which q is some power of two times its square root, x0.)
        !           134:  *
        !           135:  * If we insert a loop invariant y = 2q, we can then rewrite this using
        !           136:  * C notation as:
        !           137:  *
        !           138:  *     q = y = 0; x = x0;
        !           139:  *     for (k = NBITS; --k >= 0;) {
        !           140:  * #if (NBITS is even)
        !           141:  *             x *= 2;
        !           142:  * #endif
        !           143:  *             t = y + (1 << k);
        !           144:  *             if (x >= t) {
        !           145:  *                     x -= t;
        !           146:  *                     q += 1 << k;
        !           147:  *                     y += 1 << (k + 1);
        !           148:  *             }
        !           149:  * #if (NBITS is odd)
        !           150:  *             x *= 2;
        !           151:  * #endif
        !           152:  *     }
        !           153:  *
        !           154:  * If x0 is fixed point, rather than an integer, we can simply alter the
        !           155:  * scale factor between q and sqrt(x0).  As it happens, we can easily arrange
        !           156:  * for the scale factor to be 2**0 or 1, so that sqrt(x0) == q.
        !           157:  *
        !           158:  * In our case, however, x0 (and therefore x, y, q, and t) are multiword
        !           159:  * integers, which adds some complication.  But note that q is built one
        !           160:  * bit at a time, from the top down, and is not used itself in the loop
        !           161:  * (we use 2q as held in y instead).  This means we can build our answer
        !           162:  * in an integer, one word at a time, which saves a bit of work.  Also,
        !           163:  * since 1 << k is always a `new' bit in q, 1 << k and 1 << (k+1) are
        !           164:  * `new' bits in y and we can set them with an `or' operation rather than
        !           165:  * a full-blown multiword add.
        !           166:  *
        !           167:  * We are almost done, except for one snag.  We must prove that none of our
        !           168:  * intermediate calculations can overflow.  We know that x0 is in [1..4)
        !           169:  * and therefore the square root in q will be in [1..2), but what about x,
        !           170:  * y, and t?
        !           171:  *
        !           172:  * We know that y = 2q at the beginning of each loop.  (The relation only
        !           173:  * fails temporarily while y and q are being updated.)  Since q < 2, y < 4.
        !           174:  * The sum in t can, in our case, be as much as y+(1<<1) = y+2 < 6, and.
        !           175:  * Furthermore, we can prove with a bit of work that x never exceeds y by
        !           176:  * more than 2, so that even after doubling, 0 <= x < 8.  (This is left as
        !           177:  * an exercise to the reader, mostly because I have become tired of working
        !           178:  * on this comment.)
        !           179:  *
        !           180:  * If our floating point mantissas (which are of the form 1.frac) occupy
        !           181:  * B+1 bits, our largest intermediary needs at most B+3 bits, or two extra.
        !           182:  * In fact, we want even one more bit (for a carry, to avoid compares), or
        !           183:  * three extra.  There is a comment in fpu_emu.h reminding maintainers of
        !           184:  * this, so we have some justification in assuming it.
        !           185:  */
        !           186: struct fpn *
        !           187: fpu_sqrt(fe)
        !           188:        struct fpemu *fe;
        !           189: {
        !           190:        register struct fpn *x = &fe->fe_f1;
        !           191:        register u_int bit, q, tt;
        !           192:        register u_int x0, x1, x2, x3;
        !           193:        register u_int y0, y1, y2, y3;
        !           194:        register u_int d0, d1, d2, d3;
        !           195:        register int e;
        !           196:
        !           197:        /*
        !           198:         * Take care of special cases first.  In order:
        !           199:         *
        !           200:         *      sqrt(NaN) = NaN
        !           201:         *      sqrt(+0) = +0
        !           202:         *      sqrt(-0) = -0
        !           203:         *      sqrt(x < 0) = NaN       (including sqrt(-Inf))
        !           204:         *      sqrt(+Inf) = +Inf
        !           205:         *
        !           206:         * Then all that remains are numbers with mantissas in [1..2).
        !           207:         */
        !           208:        if (ISNAN(x) || ISZERO(x))
        !           209:                return (x);
        !           210:        if (x->fp_sign)
        !           211:                return (fpu_newnan(fe));
        !           212:        if (ISINF(x))
        !           213:                return (x);
        !           214:
        !           215:        /*
        !           216:         * Calculate result exponent.  As noted above, this may involve
        !           217:         * doubling the mantissa.  We will also need to double x each
        !           218:         * time around the loop, so we define a macro for this here, and
        !           219:         * we break out the multiword mantissa.
        !           220:         */
        !           221: #ifdef FPU_SHL1_BY_ADD
        !           222: #define        DOUBLE_X { \
        !           223:        FPU_ADDS(x3, x3, x3); FPU_ADDCS(x2, x2, x2); \
        !           224:        FPU_ADDCS(x1, x1, x1); FPU_ADDC(x0, x0, x0); \
        !           225: }
        !           226: #else
        !           227: #define        DOUBLE_X { \
        !           228:        x0 = (x0 << 1) | (x1 >> 31); x1 = (x1 << 1) | (x2 >> 31); \
        !           229:        x2 = (x2 << 1) | (x3 >> 31); x3 <<= 1; \
        !           230: }
        !           231: #endif
        !           232: #if (FP_NMANT & 1) != 0
        !           233: # define ODD_DOUBLE    DOUBLE_X
        !           234: # define EVEN_DOUBLE   /* nothing */
        !           235: #else
        !           236: # define ODD_DOUBLE    /* nothing */
        !           237: # define EVEN_DOUBLE   DOUBLE_X
        !           238: #endif
        !           239:        x0 = x->fp_mant[0];
        !           240:        x1 = x->fp_mant[1];
        !           241:        x2 = x->fp_mant[2];
        !           242:        x3 = x->fp_mant[3];
        !           243:        e = x->fp_exp;
        !           244:        if (e & 1)              /* exponent is odd; use sqrt(2mant) */
        !           245:                DOUBLE_X;
        !           246:        /* THE FOLLOWING ASSUMES THAT RIGHT SHIFT DOES SIGN EXTENSION */
        !           247:        x->fp_exp = e >> 1;     /* calculates (e&1 ? (e-1)/2 : e/2 */
        !           248:
        !           249:        /*
        !           250:         * Now calculate the mantissa root.  Since x is now in [1..4),
        !           251:         * we know that the first trip around the loop will definitely
        !           252:         * set the top bit in q, so we can do that manually and start
        !           253:         * the loop at the next bit down instead.  We must be sure to
        !           254:         * double x correctly while doing the `known q=1.0'.
        !           255:         *
        !           256:         * We do this one mantissa-word at a time, as noted above, to
        !           257:         * save work.  To avoid `(1 << 31) << 1', we also do the top bit
        !           258:         * outside of each per-word loop.
        !           259:         *
        !           260:         * The calculation `t = y + bit' breaks down into `t0 = y0, ...,
        !           261:         * t3 = y3, t? |= bit' for the appropriate word.  Since the bit
        !           262:         * is always a `new' one, this means that three of the `t?'s are
        !           263:         * just the corresponding `y?'; we use `#define's here for this.
        !           264:         * The variable `tt' holds the actual `t?' variable.
        !           265:         */
        !           266:
        !           267:        /* calculate q0 */
        !           268: #define        t0 tt
        !           269:        bit = FP_1;
        !           270:        EVEN_DOUBLE;
        !           271:        /* if (x >= (t0 = y0 | bit)) { */       /* always true */
        !           272:                q = bit;
        !           273:                x0 -= bit;
        !           274:                y0 = bit << 1;
        !           275:        /* } */
        !           276:        ODD_DOUBLE;
        !           277:        while ((bit >>= 1) != 0) {      /* for remaining bits in q0 */
        !           278:                EVEN_DOUBLE;
        !           279:                t0 = y0 | bit;          /* t = y + bit */
        !           280:                if (x0 >= t0) {         /* if x >= t then */
        !           281:                        x0 -= t0;       /*      x -= t */
        !           282:                        q |= bit;       /*      q += bit */
        !           283:                        y0 |= bit << 1; /*      y += bit << 1 */
        !           284:                }
        !           285:                ODD_DOUBLE;
        !           286:        }
        !           287:        x->fp_mant[0] = q;
        !           288: #undef t0
        !           289:
        !           290:        /* calculate q1.  note (y0&1)==0. */
        !           291: #define t0 y0
        !           292: #define t1 tt
        !           293:        q = 0;
        !           294:        y1 = 0;
        !           295:        bit = 1 << 31;
        !           296:        EVEN_DOUBLE;
        !           297:        t1 = bit;
        !           298:        FPU_SUBS(d1, x1, t1);
        !           299:        FPU_SUBC(d0, x0, t0);           /* d = x - t */
        !           300:        if ((int)d0 >= 0) {             /* if d >= 0 (i.e., x >= t) then */
        !           301:                x0 = d0, x1 = d1;       /*      x -= t */
        !           302:                q = bit;                /*      q += bit */
        !           303:                y0 |= 1;                /*      y += bit << 1 */
        !           304:        }
        !           305:        ODD_DOUBLE;
        !           306:        while ((bit >>= 1) != 0) {      /* for remaining bits in q1 */
        !           307:                EVEN_DOUBLE;            /* as before */
        !           308:                t1 = y1 | bit;
        !           309:                FPU_SUBS(d1, x1, t1);
        !           310:                FPU_SUBC(d0, x0, t0);
        !           311:                if ((int)d0 >= 0) {
        !           312:                        x0 = d0, x1 = d1;
        !           313:                        q |= bit;
        !           314:                        y1 |= bit << 1;
        !           315:                }
        !           316:                ODD_DOUBLE;
        !           317:        }
        !           318:        x->fp_mant[1] = q;
        !           319: #undef t1
        !           320:
        !           321:        /* calculate q2.  note (y1&1)==0; y0 (aka t0) is fixed. */
        !           322: #define t1 y1
        !           323: #define t2 tt
        !           324:        q = 0;
        !           325:        y2 = 0;
        !           326:        bit = 1 << 31;
        !           327:        EVEN_DOUBLE;
        !           328:        t2 = bit;
        !           329:        FPU_SUBS(d2, x2, t2);
        !           330:        FPU_SUBCS(d1, x1, t1);
        !           331:        FPU_SUBC(d0, x0, t0);
        !           332:        if ((int)d0 >= 0) {
        !           333:                x0 = d0, x1 = d1, x2 = d2;
        !           334:                q |= bit;
        !           335:                y1 |= 1;                /* now t1, y1 are set in concrete */
        !           336:        }
        !           337:        ODD_DOUBLE;
        !           338:        while ((bit >>= 1) != 0) {
        !           339:                EVEN_DOUBLE;
        !           340:                t2 = y2 | bit;
        !           341:                FPU_SUBS(d2, x2, t2);
        !           342:                FPU_SUBCS(d1, x1, t1);
        !           343:                FPU_SUBC(d0, x0, t0);
        !           344:                if ((int)d0 >= 0) {
        !           345:                        x0 = d0, x1 = d1, x2 = d2;
        !           346:                        q |= bit;
        !           347:                        y2 |= bit << 1;
        !           348:                }
        !           349:                ODD_DOUBLE;
        !           350:        }
        !           351:        x->fp_mant[2] = q;
        !           352: #undef t2
        !           353:
        !           354:        /* calculate q3.  y0, t0, y1, t1 all fixed; y2, t2, almost done. */
        !           355: #define t2 y2
        !           356: #define t3 tt
        !           357:        q = 0;
        !           358:        y3 = 0;
        !           359:        bit = 1 << 31;
        !           360:        EVEN_DOUBLE;
        !           361:        t3 = bit;
        !           362:        FPU_SUBS(d3, x3, t3);
        !           363:        FPU_SUBCS(d2, x2, t2);
        !           364:        FPU_SUBCS(d1, x1, t1);
        !           365:        FPU_SUBC(d0, x0, t0);
        !           366:        ODD_DOUBLE;
        !           367:        if ((int)d0 >= 0) {
        !           368:                x0 = d0, x1 = d1, x2 = d2;
        !           369:                q |= bit;
        !           370:                y2 |= 1;
        !           371:        }
        !           372:        while ((bit >>= 1) != 0) {
        !           373:                EVEN_DOUBLE;
        !           374:                t3 = y3 | bit;
        !           375:                FPU_SUBS(d3, x3, t3);
        !           376:                FPU_SUBCS(d2, x2, t2);
        !           377:                FPU_SUBCS(d1, x1, t1);
        !           378:                FPU_SUBC(d0, x0, t0);
        !           379:                if ((int)d0 >= 0) {
        !           380:                        x0 = d0, x1 = d1, x2 = d2;
        !           381:                        q |= bit;
        !           382:                        y3 |= bit << 1;
        !           383:                }
        !           384:                ODD_DOUBLE;
        !           385:        }
        !           386:        x->fp_mant[3] = q;
        !           387:
        !           388:        /*
        !           389:         * The result, which includes guard and round bits, is exact iff
        !           390:         * x is now zero; any nonzero bits in x represent sticky bits.
        !           391:         */
        !           392:        x->fp_sticky = x0 | x1 | x2 | x3;
        !           393:        return (x);
        !           394: }
CVSweb