Annotation of sys/arch/arm/arm/in_cksum_arm.S, Revision 1.1.1.1
1.1 nbrk 1: /* $OpenBSD: in_cksum_arm.S,v 1.2 2005/05/10 21:32:20 brad Exp $ */
2: /* $NetBSD: in_cksum_arm.S,v 1.3 2003/11/26 10:31:53 rearnsha Exp $ */
3:
4: /*
5: * Copyright 2003 Wasabi Systems, Inc.
6: * All rights reserved.
7: *
8: * Written by Steve C. Woodford for Wasabi Systems, Inc.
9: *
10: * Redistribution and use in source and binary forms, with or without
11: * modification, are permitted provided that the following conditions
12: * are met:
13: * 1. Redistributions of source code must retain the above copyright
14: * notice, this list of conditions and the following disclaimer.
15: * 2. Redistributions in binary form must reproduce the above copyright
16: * notice, this list of conditions and the following disclaimer in the
17: * documentation and/or other materials provided with the distribution.
18: * 3. All advertising materials mentioning features or use of this software
19: * must display the following acknowledgement:
20: * This product includes software developed for the NetBSD Project by
21: * Wasabi Systems, Inc.
22: * 4. The name of Wasabi Systems, Inc. may not be used to endorse
23: * or promote products derived from this software without specific prior
24: * written permission.
25: *
26: * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
27: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28: * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
30: * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31: * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32: * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33: * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34: * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35: * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36: * POSSIBILITY OF SUCH DAMAGE.
37: */
38:
39: /*
40: * Hand-optimised in_cksum() and in4_cksum() implementations for ARM/Xscale
41: */
42:
43: #include <machine/asm.h>
44: #include "assym.h"
45:
46: /*
47: * int in_cksum(struct mbuf *m, int len)
48: *
49: * Entry:
50: * r0 m
51: * r1 len
52: *
53: * NOTE: Assumes 'm' is *never* NULL.
54: */
55: /* LINTSTUB: Func: int in_cksum(struct mbuf *, int) */
/*
 * in_cksum(m, len):
 *	Compute the 16-bit one's-complement Internet checksum of the
 *	first 'len' bytes of mbuf chain 'm' (m assumed non-NULL).
 *	Register roles across the loop:
 *	  r8  = running 32-bit sum (end-around carry folded in)
 *	  r9  = bytes still wanted
 *	  r10 = packet offset summed so far (only its low bit matters)
 *	  ip  = current mbuf
 */
56: ENTRY(in_cksum)
57: stmfd sp!, {r4-r11,lr}
58: mov r8, #0x00
59: mov r9, r1
60: mov r10, #0x00
61: mov ip, r0
62:
63: .Lin_cksum_loop:
/* Fetch this mbuf's length and data pointer, advance to successor. */
64: ldr r1, [ip, #(M_LEN)]
65: ldr r0, [ip, #(M_DATA)]
66: ldr ip, [ip, #(M_NEXT)]
67: .Lin_cksum_entry4:
/* Clamp this mbuf's contribution: r1 = min(m_len, bytes wanted). */
68: cmp r9, r1
69: movlt r1, r9
70: sub r9, r9, r1
/*
 * L_cksumdata places bytes into sum lanes by their memory-address
 * parity (see its endgame), but the packet checksum needs them by
 * packet-offset parity.  When the two parities differ — low bit of
 * (offset-so-far ^ data pointer) — the partial sum is rotated by 8
 * below to swap the byte lanes back.
 */
71: eor r11, r10, r0
72: add r10, r10, r1
/* r2 = chunk length; 'adds' sets Z so a zero-length chunk skips the
 * call and contributes r2 = 0 to the sum. */
73: adds r2, r1, #0x00
74: blne _ASM_LABEL(L_cksumdata)
75: tst r11, #0x01
76: movne r2, r2, ror #8
/* Fold the partial sum into r8 with end-around carry. */
77: adds r8, r8, r2
78: adc r8, r8, #0x00
79: cmp ip, #0x00
80: bne .Lin_cksum_loop
81:
/* Fold the 32-bit sum to 16 bits and return its one's complement. */
82: mov r1, #0xff
83: orr r1, r1, #0xff00
84: and r0, r8, r1
85: add r0, r0, r8, lsr #16
86: add r0, r0, r0, lsr #16
87: and r0, r0, r1
88: eor r0, r0, r1
89: ldmfd sp!, {r4-r11,pc}
90:
91: #ifdef INET
92: /*
93: * int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len)
94: *
95: * Entry:
96: * r0 m
97: * r1 nxt
98: * r2 off
99: * r3 len
100: */
101: /* LINTSTUB: Func: int in4_cksum(struct mbuf *, u_int8_t, int, int) */
/*
 * in4_cksum(m, nxt, off, len):
 *	Checksum 'len' bytes of mbuf chain 'm' starting 'off' bytes in.
 *	When nxt != 0, an IPv4 pseudo header is folded in first: the
 *	IP source/destination addresses read from the header at m_data,
 *	plus nxt + len.
 */
102: ENTRY(in4_cksum)
103: stmfd sp!, {r4-r11,lr}
104: mov r8, #0x00 /* Accumulate sum in r8 */
105:
106: /*
107: * First, deal with a pseudo header, if present
108: */
109: ldr r6, [r0, #(M_DATA)]
110: cmp r1, #0x00
111: beq .Lin4_cksum_skip_entry
112:
113: #ifdef __XSCALE__
114: pld [r6, #(IP_SRC)]
115: #endif
116: add r4, r6, #(IP_SRC)
117: ands r4, r4, #0x03
118: add r8, r1, r3 /* sum = nxt + len */
/*
 * Computed jump on the 2-bit alignment of the source-address field.
 * Reading pc here yields (this instruction + 8), so case k begins at
 * (addne + 8) + k*32: case 0 (aligned) is the fall-through and may
 * occupy 9 words; cases 1 and 2 must be EXACTLY 8 instructions
 * (32 bytes) each.  The nops below are load-bearing padding — do not
 * remove, add, or reorder instructions in these case blocks.
 */
119: addne pc, pc, r4, lsl #5 /* Handle alignment of pseudo header */
120:
121: /* 0x00: Data 32-bit aligned */
122: ldr r5, [r6, #(IP_SRC)]
123: ldr r4, [r6, #(IP_DST)]
124: b .Lin4_cksum_add_ips
125: nop
126: nop
127: nop
128: nop
129: nop
130: nop
131:
132: /* 0x01: Data 8-bit aligned */
133: ldr r4, [r6, #(IP_SRC - 1)] /* BE:r4 = x012 LE:r4 = 210x */
134: ldr r5, [r6, #(IP_SRC + 3)] /* BE:r5 = 3456 LE:r5 = 6543 */
135: ldrb r7, [r6, #(IP_SRC + 7)] /* r7 = ...7 */
136: #ifdef __ARMEB__
137: mov r4, r4, lsl #8 /* r4 = 012. */
138: orr r4, r4, r5, lsr #24 /* r4 = 0123 */
139: orr r5, r7, r5, lsl #8 /* r5 = 4567 */
140: b .Lin4_cksum_add_ips
141: nop
142: #else
143: mov r4, r4, lsr #8 /* r4 = .210 */
144: orr r4, r4, r5, lsl #24 /* r4 = 3210 */
145: mov r5, r5, lsr #8 /* r5 = .654 */
146: orr r5, r5, r7, lsl #24 /* r5 = 7654 */
147: b .Lin4_cksum_add_ips
148: #endif
149:
150: /* 0x02: Data 16-bit aligned */
151: #ifdef __XSCALE__
152: ldrh r5, [r6, #(IP_SRC)] /* BE:r5 = ..01 LE:r5 = ..10 */
153: ldrh r7, [r6, #(IP_DST + 2)] /* BE:r7 = ..67 LE:r7 = ..76 */
154: ldr r4, [r6, #(IP_SRC + 2)] /* BE:r4 = 2345 LE:r4 = 5432 */
155: orr r5, r7, r5, lsl #16 /* BE:r5 = 0167 LE:r5 = 1076 */
156: b .Lin4_cksum_add_ips
157: nop
158: nop
159: nop
160: #else
161: ldr r4, [r6, #(IP_SRC - 2)] /* r4 = 10xx */
162: ldr r7, [r6, #(IP_DST - 2)] /* r7 = xx76 */
163: ldr r5, [r6, #(IP_SRC + 2)] /* r5 = 5432 */
164: mov r4, r4, lsr #16 /* r4 = ..10 */
165: orr r4, r4, r7, lsl #16 /* r4 = 7610 */
166: b .Lin4_cksum_add_ips
167: nop
168: nop
169: #endif
170:
/* 0x03 needs no pad: it is the last case and falls straight through. */
171: /* 0x03: Data 8-bit aligned */
172: ldrb r4, [r6, #(IP_SRC)] /* r4 = ...0 */
173: ldr r5, [r6, #(IP_SRC + 1)] /* BE:r5 = 1234 LE:r5 = 4321 */
174: ldr r7, [r6, #(IP_SRC + 5)] /* BE:r7 = 567x LE:r7 = x765 */
175: #ifdef __ARMEB__
176: mov r4, r4, lsl #24 /* r4 = 0... */
177: orr r4, r4, r5, lsr #8 /* r4 = 0123 */
178: mov r5, r5, lsl #24 /* r5 = 4... */
179: orr r5, r5, r7, lsr #8 /* r5 = 4567 */
180: #else
181: orr r4, r4, r5, lsl #8 /* r4 = 3210 */
182: mov r5, r5, lsr #24 /* r4 = ...4 */
183: orr r5, r5, r7, lsl #8 /* r5 = 7654 */
184: #endif
185: /* FALLTHROUGH */
186:
/*
 * r4/r5 now hold the 8 bytes of IP src+dst.  Fold them into
 * r8 (= nxt + len) with end-around carry.  On little-endian the
 * lsl #8 shifts nxt/len into the byte lane matching the
 * memory-order sum (NOTE(review): lane rationale inferred from the
 * __ARMEB__ split — confirm against the C in4_cksum).
 */
187: .Lin4_cksum_add_ips:
188: adds r5, r5, r4
189: #ifndef __ARMEB__
190: adcs r8, r5, r8, lsl #8
191: #else
192: adcs r8, r5, r8
193: #endif
194: adc r8, r8, #0x00
195: mov r1, #0x00 /* so skip_entry sees "0 bytes consumed" below */
196: b .Lin4_cksum_skip_entry
197:
/*
 * Walk the chain consuming 'off' bytes.  On entry to skip_entry:
 * r2 = offset bytes still to skip, r1 = current mbuf's length
 * (0 on first entry).  Running off the end of the chain is fatal.
 */
198: .Lin4_cksum_skip_loop:
199: ldr r1, [r0, #(M_LEN)]
200: ldr r6, [r0, #(M_DATA)]
201: ldr r0, [r0, #(M_NEXT)]
202: .Lin4_cksum_skip_entry:
203: subs r2, r2, r1
204: blt .Lin4_cksum_skip_done
205: cmp r0, #0x00
206: bne .Lin4_cksum_skip_loop
207: b .Lin4_cksum_whoops
208:
/*
 * The offset lands inside this mbuf (r2 = off - consumed, negative).
 * Set up the registers in_cksum's loop expects:
 *   ip  = next mbuf,
 *   r0  = m_data + m_len + r2  (first byte to checksum),
 *   r1  = -r2                  (bytes left in this mbuf),
 *   r9  = total 'len', r10 = 0 (offset-parity base),
 * then tail-join the shared in_cksum mbuf loop.
 */
209: .Lin4_cksum_skip_done:
210: mov ip, r0
211: add r0, r2, r6
212: add r0, r0, r1
213: rsb r1, r2, #0x00
214: mov r9, r3
215: mov r10, #0x00
216: b .Lin_cksum_entry4
217:
/* Chain exhausted before reaching 'off': unrecoverable, panic. */
218: .Lin4_cksum_whoops:
219: adr r0, .Lin4_cksum_whoops_str
220: bl _C_LABEL(panic)
221: .Lin4_cksum_whoops_str:
222: .asciz "in4_cksum: out of mbufs\n"
223: .align 5
224: #endif /* INET */
225:
226: /*
227: * The main in*_cksum() workhorse...
228: *
229: * Entry parameters:
230: * r0 Pointer to buffer
231: * r1 Buffer length
232: * lr Return address
233: *
234: * Returns:
235: * r2 Accumulated 32-bit sum
236: *
237: * Clobbers:
238: * r0-r7
239: */
240: /* LINTSTUB: Ignore */
/*
 * L_cksumdata: the checksum core (see the header comment above).
 * In: r0 = buffer, r1 = length; out: r2 = 32-bit partial sum.
 * Clobbers r0-r7; leaf routine, returns via "mov pc, lr".
 */
241: ASENTRY_NP(L_cksumdata)
242: #ifdef __XSCALE__
243: pld [r0] /* Pre-fetch the start of the buffer */
244: #endif
245: mov r2, #0 /* r2 accumulates the sum */
246:
247: /* We first have to word-align the buffer. */
248: ands r7, r0, #0x03
249: beq .Lcksumdata_wordaligned
250: rsb r7, r7, #0x04 /* r7 = 1..3 head bytes needed */
251: cmp r1, r7 /* Enough bytes left to make it? */
252: blt .Lcksumdata_endgame
/* Flags from "cmp r7, #2": ge -> fetch 2nd byte, gt -> fetch 3rd. */
253: cmp r7, #0x02
254: ldrb r4, [r0], #0x01 /* Fetch 1st byte */
255: ldrgeb r5, [r0], #0x01 /* Fetch 2nd byte */
256: movlt r5, #0x00
257: ldrgtb r6, [r0], #0x01 /* Fetch 3rd byte */
258: movle r6, #0x00
/* eq <=> exactly two head bytes (start was 2-byte aligned).  Bytes
 * are placed in sum lanes matching their memory-address parity; the
 * caller (in_cksum) compensates with a ror #8 when needed. */
259: /* Combine the three bytes depending on endianness and alignment */
260: #ifdef __ARMEB__
261: orreq r2, r5, r4, lsl #8
262: orreq r2, r2, r6, lsl #24
263: orrne r2, r4, r5, lsl #8
264: orrne r2, r2, r6, lsl #16
265: #else
266: orreq r2, r4, r5, lsl #8
267: orreq r2, r2, r6, lsl #16
268: orrne r2, r5, r4, lsl #8
269: orrne r2, r2, r6, lsl #24
270: #endif
271: subs r1, r1, r7 /* Update length */
272: moveq pc, lr /* All done? */
273:
274: /* Buffer is now word aligned */
275: .Lcksumdata_wordaligned:
276: #ifdef __XSCALE__
277: cmp r1, #0x04 /* Less than 4 bytes left? */
278: blt .Lcksumdata_endgame /* Yup */
279:
/* ldrd needs 8-byte alignment: consume one word if necessary.
 * r7 is folded into the sum at bigloop_end (or by the first loop
 * iteration's adcs of r7). */
280: /* Now quad-align, if necessary */
281: ands r7, r0, #0x04
282: ldrne r7, [r0], #0x04
283: subne r1, r1, #0x04
284: subs r1, r1, #0x40
285: blt .Lcksumdata_bigloop_end /* Note: C flag clear if branch taken */
286:
287: /*
288: * Buffer is now quad aligned. Sum 64 bytes at a time.
289: * Note: First ldrd is hoisted above the loop, together with
290: * setting r6 to zero to avoid stalling for results in the
291: * loop. (r7 is live, from above).
292: */
293: ldrd r4, [r0], #0x08
294: mov r6, #0x00
/* 64 bytes per iteration; each ldrd pair is summed one step after it
 * loads so the adds never wait on the load result. */
295: .Lcksumdata_bigloop:
296: pld [r0, #0x18]
297: adds r2, r2, r6
298: adcs r2, r2, r7
299: ldrd r6, [r0], #0x08
300: adcs r2, r2, r4
301: adcs r2, r2, r5
302: ldrd r4, [r0], #0x08
303: adcs r2, r2, r6
304: adcs r2, r2, r7
305: ldrd r6, [r0], #0x08
306: adcs r2, r2, r4
307: adcs r2, r2, r5
308: ldrd r4, [r0], #0x08
309: adcs r2, r2, r6
310: adcs r2, r2, r7
311: pld [r0, #0x18]
312: ldrd r6, [r0], #0x08
313: adcs r2, r2, r4
314: adcs r2, r2, r5
315: ldrd r4, [r0], #0x08
316: adcs r2, r2, r6
317: adcs r2, r2, r7
318: ldrd r6, [r0], #0x08
319: adcs r2, r2, r4
320: adcs r2, r2, r5
321: adc r2, r2, #0x00
322: subs r1, r1, #0x40
/* Predicated pre-load of the next iteration's first pair — only when
 * the loop will actually run again. */
323: ldrged r4, [r0], #0x08
324: bge .Lcksumdata_bigloop
325:
326: adds r2, r2, r6 /* r6/r7 still need summing */
327: .Lcksumdata_bigloop_end:
328: adcs r2, r2, r7
329: adc r2, r2, #0x00
330:
331: #else /* !__XSCALE__ */
332:
/* Generic ARM: 64 bytes per iteration via four 4-register ldm's. */
333: subs r1, r1, #0x40
334: blt .Lcksumdata_bigloop_end
335:
336: .Lcksumdata_bigloop:
337: ldmia r0!, {r3, r4, r5, r6}
338: adds r2, r2, r3
339: adcs r2, r2, r4
340: adcs r2, r2, r5
341: ldmia r0!, {r3, r4, r5, r7}
342: adcs r2, r2, r6
343: adcs r2, r2, r3
344: adcs r2, r2, r4
345: adcs r2, r2, r5
346: ldmia r0!, {r3, r4, r5, r6}
347: adcs r2, r2, r7
348: adcs r2, r2, r3
349: adcs r2, r2, r4
350: adcs r2, r2, r5
351: ldmia r0!, {r3, r4, r5, r7}
352: adcs r2, r2, r6
353: adcs r2, r2, r3
354: adcs r2, r2, r4
355: adcs r2, r2, r5
356: adcs r2, r2, r7
357: adc r2, r2, #0x00
358: subs r1, r1, #0x40
359: bge .Lcksumdata_bigloop
360: .Lcksumdata_bigloop_end:
361: #endif
362:
/* Undo the loop's bias: r1 = true bytes remaining (0..0x3f). */
363: adds r1, r1, #0x40
364: moveq pc, lr
365: cmp r1, #0x20
366:
/* Optional single 32-byte block. */
367: #ifdef __XSCALE__
368: ldrged r4, [r0], #0x08 /* Avoid stalling pld and result */
369: blt .Lcksumdata_less_than_32
370: pld [r0, #0x18]
371: ldrd r6, [r0], #0x08
372: adds r2, r2, r4
373: adcs r2, r2, r5
374: ldrd r4, [r0], #0x08
375: adcs r2, r2, r6
376: adcs r2, r2, r7
377: ldrd r6, [r0], #0x08
378: adcs r2, r2, r4
379: adcs r2, r2, r5
380: adcs r2, r2, r6 /* XXX: Unavoidable result stall */
381: adcs r2, r2, r7
382: #else
383: blt .Lcksumdata_less_than_32
384: ldmia r0!, {r3, r4, r5, r6}
385: adds r2, r2, r3
386: adcs r2, r2, r4
387: adcs r2, r2, r5
388: ldmia r0!, {r3, r4, r5, r7}
389: adcs r2, r2, r6
390: adcs r2, r2, r3
391: adcs r2, r2, r4
392: adcs r2, r2, r5
393: adcs r2, r2, r7
394: #endif
395: adc r2, r2, #0x00
396: subs r1, r1, #0x20
397: moveq pc, lr
398:
399: .Lcksumdata_less_than_32:
400: /* There are less than 32 bytes left */
/*
 * Computed jump over the 8-byte handlers below.
 *   r3 = r1 & 0x18            (24/16/8/0 bytes of whole 8-byte chunks)
 *   r4 = (0x18 - r3) * 3 / 2  (byte offset: 12 bytes per skipped
 *                              handler, see padding note below)
 * "adds" both sets NE (r4 != 0 -> jump) and clears C, which the adcs
 * chain in the handlers relies on.  pc reads as (this insn + 8).
 */
401: and r3, r1, #0x18
402: rsb r4, r3, #0x18
403: sub r1, r1, r3
404: adds r4, r4, r4, lsr #1 /* Side effect: Clear carry flag */
405: addne pc, pc, r4
406:
407: /*
408: * Note: We use ldm here, even on Xscale, since the combined issue/result
409: * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs.
410: */
/* The nop pads this first handler to 16 bytes; with pc's +8 bias that
 * puts the 16- and 8-byte handlers (12 bytes each) exactly where the
 * offset arithmetic above expects them.  Position-critical. */
411: /* At least 24 bytes remaining... */
412: ldmia r0!, {r4, r5}
413: nop
414: adcs r2, r2, r4
415: adcs r2, r2, r5
416:
417: /* At least 16 bytes remaining... */
418: ldmia r0!, {r4, r5}
419: adcs r2, r2, r4
420: adcs r2, r2, r5
421:
422: /* At least 8 bytes remaining... */
423: ldmia r0!, {r4, r5}
424: adcs r2, r2, r4
425: adcs r2, r2, r5
426:
427: /* Less than 8 bytes remaining... */
428: adc r2, r2, #0x00
429: subs r1, r1, #0x04
430: blt .Lcksumdata_lessthan4
431:
432: ldr r4, [r0], #0x04
433: sub r1, r1, #0x04
434: adds r2, r2, r4
435: adc r2, r2, #0x00
436:
437: /* Deal with < 4 bytes remaining */
438: .Lcksumdata_lessthan4:
439: adds r1, r1, #0x04
440: moveq pc, lr
441:
442: /* Deal with 1 to 3 remaining bytes, possibly misaligned */
443: .Lcksumdata_endgame:
444: ldrb r3, [r0] /* Fetch first byte */
/* Flags from "cmp r1, #2": ge -> 2nd byte exists, gt -> 3rd too. */
445: cmp r1, #0x02
446: ldrgeb r4, [r0, #0x01] /* Fetch 2nd and 3rd as necessary */
447: movlt r4, #0x00
448: ldrgtb r5, [r0, #0x02]
449: movle r5, #0x00
/* Lane placement follows the bytes' memory-address parity
 * (tst r0, #1); the caller's ror #8 fixes any lane swap. */
450: /* Combine the three bytes depending on endianness and alignment */
451: tst r0, #0x01
452: #ifdef __ARMEB__
453: orreq r3, r4, r3, lsl #8
454: orreq r3, r3, r5, lsl #24
455: orrne r3, r3, r4, lsl #8
456: orrne r3, r3, r5, lsl #16
457: #else
458: orreq r3, r3, r4, lsl #8
459: orreq r3, r3, r5, lsl #16
460: orrne r3, r4, r3, lsl #8
461: orrne r3, r3, r5, lsl #24
462: #endif
463: adds r2, r2, r3
464: adc r2, r2, #0x00
465: mov pc, lr
CVSweb