/*	$OpenBSD: memcpy.S,v 1.2 2004/02/01 05:47:10 drahn Exp $	*/
/*	$NetBSD: memcpy.S,v 1.2 2001/11/20 00:29:20 chris Exp $	*/

/*-
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the NetBSD
 *        Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

/*
 * void *memcpy(void *dst, const void *src, size_t len)    -- also memmove()
 *
 * This is one fun bit of code ...
 * Some easy listening music is suggested while trying to understand this
 * code e.g. Iron Maiden
 *
 * For anyone attempting to understand it :
 *
 * The core code is implemented here with simple stubs for memcpy()
 * memmove() and bcopy().
 *
 * All local labels are prefixed with Lmemcpy_
 * Following the prefix a label starting f is used in the forward copy code
 * while a label using b is used in the backwards copy code
 * The source and destination addresses determine whether a forward or
 * backward copy is performed.
 * Separate bits of code are used to deal with the following situations
 * for both the forward and backwards copy.
 *	unaligned source address
 *	unaligned destination address
 * Separate copy routines are used to produce an optimised result for each
 * of these cases.
 * The copy code will use LDM/STM instructions to copy up to 32 bytes at
 * a time where possible.
 *
 * Register usage throughout:
 *	r0 = dest (advances as bytes are stored)
 *	r1 = src  (advances as bytes are loaded)
 *	r2 = len remaining
 *	r3, r12 (ip), lr = scratch; r4 (and r5 in the unaligned-source
 *	paths) are preserved on the stack around their use.
 *
 * Note: r12 (aka ip) can be trashed during the function along with
 * r0-r3 although r0-r2 have defined uses i.e. src, dest, len through out.
 * Additional registers are preserved prior to use i.e. r4, r5 & lr
 *
 * Apologies for the state of the comments ;-)
 */

ENTRY(memcpy)
ENTRY_NP(memmove)
	/*
	 * Determine copy direction.  C (carry clear) after the compare
	 * means src < dst, so a backwards copy is needed for overlap
	 * safety.  If src == dst there is nothing to copy; return at
	 * once with r0 (dest) untouched, as the C standard requires
	 * memcpy()/memmove() to return the destination pointer.
	 * (The historic code cleared r0 here under a bogus "len=0"
	 * comment, making memcpy(p, p, n) return NULL.)
	 */
	cmp	r1, r0
#ifdef __APCS_26__
	moveqs	pc, lr			/* src == dst: done, restore PSR */
#else
	moveq	pc, lr			/* src == dst: done */
#endif

	/* save leaf functions having to store this away */
	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */

	bcc	Lmemcpy_backwards

	/* start of forwards copy */
	subs	r2, r2, #4
	blt	Lmemcpy_fl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	Lmemcpy_fdestul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	Lmemcpy_fsrcul		/* oh unaligned source addr */

Lmemcpy_ft8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	Lmemcpy_fl12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	Lmemcpy_fl32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
Lmemcpy_floop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	Lmemcpy_floop32

	cmn	r2, #0x10
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* restore r4 */

Lmemcpy_fl32:
	adds	r2, r2, #0x14		/* get back the 0x14 bias; GE if >= 12 left */

	/* blat 12 bytes at a time */
Lmemcpy_floop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	Lmemcpy_floop12

Lmemcpy_fl12:
	adds	r2, r2, #8		/* undo the -8 bias from Lmemcpy_ft8 */
	blt	Lmemcpy_fl4

	subs	r2, r2, #4		/* LT: only one word left; GE: two */
	ldrlt	r3, [r1], #4
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

Lmemcpy_fl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4		/* undo the initial -4 bias */
#ifdef __APCS_26__			/* (was misspelled __APCS_26_) */
	ldmeqia	sp!, {r0, pc}^		/* done */
#else
	ldmeqia	sp!, {r0, pc}		/* done */
#endif
	/* copy the crud byte at a time: 1, 2 or 3 bytes (GE: >= 2, GT: 3) */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
#ifdef __APCS_26__
	ldmia	sp!, {r0, pc}^
#else
	ldmia	sp!, {r0, pc}
#endif

	/* erg - unaligned destination */
Lmemcpy_fdestul:
	rsb	r12, r12, #4		/* r12 = bytes needed to align dest */
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	Lmemcpy_fl4		/* less than 4 bytes */

	ands	r12, r1, #3
	beq	Lmemcpy_ft8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
Lmemcpy_fsrcul:
	/*
	 * r12 = src misalignment (1, 2 or 3).  Round src down to a word
	 * boundary, preload one word into lr, then shift/merge pairs of
	 * words so only aligned word accesses ever touch memory.
	 */
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	Lmemcpy_fsrcul3
	beq	Lmemcpy_fsrcul2
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_fsrcul1loop16:
	/* src is 1 byte past a word boundary: take 24 bits + next 8 */
	mov	r3, lr, lsr #8
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul1l4

Lmemcpy_fsrcul1loop4:
	mov	r12, lr, lsr #8
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #24
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul1loop4

Lmemcpy_fsrcul1l4:
	sub	r1, r1, #3		/* restore true (unaligned) src */
	b	Lmemcpy_fl4

Lmemcpy_fsrcul2:
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_fsrcul2loop16:
	/* src is 2 bytes past a word boundary: take 16 bits + next 16 */
	mov	r3, lr, lsr #16
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul2l4

Lmemcpy_fsrcul2loop4:
	mov	r12, lr, lsr #16
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #16
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul2loop4

Lmemcpy_fsrcul2l4:
	sub	r1, r1, #2		/* restore true (unaligned) src */
	b	Lmemcpy_fl4

Lmemcpy_fsrcul3:
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_fsrcul3loop16:
	/* src is 3 bytes past a word boundary: take 8 bits + next 24 */
	mov	r3, lr, lsr #24
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul3l4

Lmemcpy_fsrcul3loop4:
	mov	r12, lr, lsr #24
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #8
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul3loop4

Lmemcpy_fsrcul3l4:
	sub	r1, r1, #1		/* restore true (unaligned) src */
	b	Lmemcpy_fl4

Lmemcpy_backwards:
	/* copy from the top end down so overlapping regions are safe */
	add	r1, r1, r2
	add	r0, r0, r2
	subs	r2, r2, #4
	blt	Lmemcpy_bl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	Lmemcpy_bdestul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	Lmemcpy_bsrcul		/* oh unaligned source addr */

Lmemcpy_bt8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	Lmemcpy_bl12		/* less than 12 bytes (4 from above) */
	stmdb	sp!, {r4}
	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
	blt	Lmemcpy_bl32

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
Lmemcpy_bloop32:
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	Lmemcpy_bloop32

Lmemcpy_bl32:
	cmn	r2, #0x10
	ldmgedb	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgedb	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	adds	r2, r2, #0x14
	ldmgedb	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
	stmgedb	r0!, {r3, r12, lr}
	subge	r2, r2, #0x0c
	ldmia	sp!, {r4}

Lmemcpy_bl12:
	adds	r2, r2, #8
	blt	Lmemcpy_bl4
	subs	r2, r2, #4		/* LT: only one word left; GE: two */
	ldrlt	r3, [r1, #-4]!
	strlt	r3, [r0, #-4]!
	ldmgedb	r1!, {r3, r12}
	stmgedb	r0!, {r3, r12}
	subge	r2, r2, #4

Lmemcpy_bl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
#ifdef __APCS_26__
	ldmeqia	sp!, {r0, pc}^
#else
	ldmeqia	sp!, {r0, pc}
#endif

	/* copy the crud byte at a time: 1, 2 or 3 bytes (GE: >= 2, GT: 3) */
	cmp	r2, #2
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
#ifdef __APCS_26__
	ldmia	sp!, {r0, pc}^
#else
	ldmia	sp!, {r0, pc}
#endif

	/* erg - unaligned destination */
Lmemcpy_bdestul:
	cmp	r12, #2			/* r12 = bytes above a word boundary */

	/* align destination with byte copies */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	subs	r2, r2, r12
	blt	Lmemcpy_bl4		/* less than 4 bytes to go */
	ands	r12, r1, #3
	beq	Lmemcpy_bt8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
Lmemcpy_bsrcul:
	/*
	 * Mirror of Lmemcpy_fsrcul, working downwards: round src down to
	 * a word boundary, preload the partial word into r3, then
	 * shift/merge word pairs.  Shift directions are reversed with
	 * respect to the forward code.
	 */
	bic	r1, r1, #3
	ldr	r3, [r1, #0]
	cmp	r12, #2
	blt	Lmemcpy_bsrcul1
	beq	Lmemcpy_bsrcul2
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_bsrcul3loop16:
	mov	lr, r3, lsl #8
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r3, lsr #24
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul3l4

Lmemcpy_bsrcul3loop4:
	mov	r12, r3, lsl #8
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #24
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul3loop4

Lmemcpy_bsrcul3l4:
	add	r1, r1, #3		/* restore true (unaligned) src */
	b	Lmemcpy_bl4

Lmemcpy_bsrcul2:
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_bsrcul2loop16:
	mov	lr, r3, lsl #16
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r3, lsr #16
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul2l4

Lmemcpy_bsrcul2loop4:
	mov	r12, r3, lsl #16
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #16
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul2loop4

Lmemcpy_bsrcul2l4:
	add	r1, r1, #2		/* restore true (unaligned) src */
	b	Lmemcpy_bl4

Lmemcpy_bsrcul1:
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_bsrcul1loop32:			/* NB: copies 16 bytes per pass despite the name */
	mov	lr, r3, lsl #24
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r3, lsr #8
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul1loop32
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul1l4

Lmemcpy_bsrcul1loop4:
	mov	r12, r3, lsl #24
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #8
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul1loop4

Lmemcpy_bsrcul1l4:
	add	r1, r1, #1		/* restore true (unaligned) src */
	b	Lmemcpy_bl4

/* (CVSweb annotation footer removed) */