/*	$OpenBSD: memcpy.S,v 1.2 2004/02/01 05:47:10 drahn Exp $	*/
/*	$NetBSD: memcpy.S,v 1.2 2001/11/20 00:29:20 chris Exp $	*/

/*-
 * Copyright (c) 1997 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Neil A. Carson and Mark Brinicombe
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the NetBSD
 *        Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <machine/asm.h>

/*
 * void *memcpy(void *dst, const void *src, size_t len)    -- also memmove()
 *
 * This is one fun bit of code ...
 * Some easy listening music is suggested while trying to understand this
 * code e.g. Iron Maiden
 *
 * For anyone attempting to understand it :
 *
 * The core code is implemented here with simple stubs for memcpy()
 * memmove() and bcopy().
 *
 * All local labels are prefixed with Lmemcpy_
 * Following the prefix a label starting f is used in the forward copy code
 * while a label using b is used in the backwards copy code
 * The source and destination addresses determine whether a forward or
 * backward copy is performed.
 * Separate bits of code are used to deal with the following situations
 * for both the forward and backwards copy.
 *	unaligned source address
 *	unaligned destination address
 * Separate copy routines are used to produce an optimised result for each
 * of these cases.
 * The copy code will use LDM/STM instructions to copy up to 32 bytes at
 * a time where possible.
 *
 * Register usage throughout:
 *	r0 = dest (advances as bytes are stored)
 *	r1 = src  (advances as bytes are loaded)
 *	r2 = len remaining
 *	r3, r12 (ip), lr = scratch; r4 (and r5 in the unaligned-source
 *	paths) are preserved on the stack around their use.
 *
 * Note: r12 (aka ip) can be trashed during the function along with
 * r0-r3 although r0-r2 have defined uses i.e. src, dest, len through out.
 * Additional registers are preserved prior to use i.e. r4, r5 & lr
 *
 * Apologies for the state of the comments ;-)
 */

ENTRY(memcpy)
ENTRY_NP(memmove)
	/*
	 * Determine copy direction.  C (carry clear) after the compare
	 * means src < dst, so a backwards copy is needed for overlap
	 * safety.  If src == dst there is nothing to copy; return at
	 * once with r0 (dest) untouched, as the C standard requires
	 * memcpy()/memmove() to return the destination pointer.
	 * (The historic code cleared r0 here under a bogus "len=0"
	 * comment, making memcpy(p, p, n) return NULL.)
	 */
	cmp	r1, r0
#ifdef __APCS_26__
	moveqs	pc, lr			/* src == dst: done, restore PSR */
#else
	moveq	pc, lr			/* src == dst: done */
#endif

	/* save leaf functions having to store this away */
	stmdb	sp!, {r0, lr}		/* memcpy() returns dest addr */

	bcc	Lmemcpy_backwards

	/* start of forwards copy */
	subs	r2, r2, #4
	blt	Lmemcpy_fl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	Lmemcpy_fdestul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	Lmemcpy_fsrcul		/* oh unaligned source addr */

Lmemcpy_ft8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	Lmemcpy_fl12		/* less than 12 bytes (4 from above) */
	subs	r2, r2, #0x14
	blt	Lmemcpy_fl32		/* less than 32 bytes (12 from above) */
	stmdb	sp!, {r4}		/* borrow r4 */

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
Lmemcpy_floop32:
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	ldmia	r1!, {r3, r4, r12, lr}
	stmia	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	Lmemcpy_floop32

	cmn	r2, #0x10
	ldmgeia	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgeia	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	ldmia	sp!, {r4}		/* restore r4 */

Lmemcpy_fl32:
	adds	r2, r2, #0x14		/* get back the 0x14 bias; GE if >= 12 left */

	/* blat 12 bytes at a time */
Lmemcpy_floop12:
	ldmgeia	r1!, {r3, r12, lr}
	stmgeia	r0!, {r3, r12, lr}
	subges	r2, r2, #0x0c
	bge	Lmemcpy_floop12

Lmemcpy_fl12:
	adds	r2, r2, #8		/* undo the -8 bias from Lmemcpy_ft8 */
	blt	Lmemcpy_fl4

	subs	r2, r2, #4		/* LT: only one word left; GE: two */
	ldrlt	r3, [r1], #4
	strlt	r3, [r0], #4
	ldmgeia	r1!, {r3, r12}
	stmgeia	r0!, {r3, r12}
	subge	r2, r2, #4

Lmemcpy_fl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4		/* undo the initial -4 bias */
#ifdef __APCS_26__			/* (was misspelled __APCS_26_) */
	ldmeqia	sp!, {r0, pc}^		/* done */
#else
	ldmeqia	sp!, {r0, pc}		/* done */
#endif
	/* copy the crud byte at a time: 1, 2 or 3 bytes (GE: >= 2, GT: 3) */
	cmp	r2, #2
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
#ifdef __APCS_26__
	ldmia	sp!, {r0, pc}^
#else
	ldmia	sp!, {r0, pc}
#endif

	/* erg - unaligned destination */
Lmemcpy_fdestul:
	rsb	r12, r12, #4		/* r12 = bytes needed to align dest */
	cmp	r12, #2

	/* align destination with byte copies */
	ldrb	r3, [r1], #1
	strb	r3, [r0], #1
	ldrgeb	r3, [r1], #1
	strgeb	r3, [r0], #1
	ldrgtb	r3, [r1], #1
	strgtb	r3, [r0], #1
	subs	r2, r2, r12
	blt	Lmemcpy_fl4		/* less than 4 bytes */

	ands	r12, r1, #3
	beq	Lmemcpy_ft8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
Lmemcpy_fsrcul:
	/*
	 * r12 = src misalignment (1, 2 or 3).  Round src down to a word
	 * boundary, preload one word into lr, then shift/merge pairs of
	 * words so only aligned word accesses ever touch memory.
	 */
	bic	r1, r1, #3
	ldr	lr, [r1], #4
	cmp	r12, #2
	bgt	Lmemcpy_fsrcul3
	beq	Lmemcpy_fsrcul2
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_fsrcul1loop16:
	/* src is 1 byte past a word boundary: take 24 bits + next 8 */
	mov	r3, lr, lsr #8
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #24
	mov	r4, r4, lsr #8
	orr	r4, r4, r5, lsl #24
	mov	r5, r5, lsr #8
	orr	r5, r5, r12, lsl #24
	mov	r12, r12, lsr #8
	orr	r12, r12, lr, lsl #24
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul1loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul1l4

Lmemcpy_fsrcul1loop4:
	mov	r12, lr, lsr #8
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #24
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul1loop4

Lmemcpy_fsrcul1l4:
	sub	r1, r1, #3		/* restore true (unaligned) src */
	b	Lmemcpy_fl4

Lmemcpy_fsrcul2:
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_fsrcul2loop16:
	/* src is 2 bytes past a word boundary: take 16 bits + next 16 */
	mov	r3, lr, lsr #16
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #16
	mov	r4, r4, lsr #16
	orr	r4, r4, r5, lsl #16
	mov	r5, r5, lsr #16
	orr	r5, r5, r12, lsl #16
	mov	r12, r12, lsr #16
	orr	r12, r12, lr, lsl #16
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul2l4

Lmemcpy_fsrcul2loop4:
	mov	r12, lr, lsr #16
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #16
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul2loop4

Lmemcpy_fsrcul2l4:
	sub	r1, r1, #2		/* restore true (unaligned) src */
	b	Lmemcpy_fl4

Lmemcpy_fsrcul3:
	cmp	r2, #0x0c
	blt	Lmemcpy_fsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_fsrcul3loop16:
	/* src is 3 bytes past a word boundary: take 8 bits + next 24 */
	mov	r3, lr, lsr #24
	ldmia	r1!, {r4, r5, r12, lr}
	orr	r3, r3, r4, lsl #8
	mov	r4, r4, lsr #24
	orr	r4, r4, r5, lsl #8
	mov	r5, r5, lsr #24
	orr	r5, r5, r12, lsl #8
	mov	r12, r12, lsr #24
	orr	r12, r12, lr, lsl #8
	stmia	r0!, {r3-r5, r12}
	subs	r2, r2, #0x10
	bge	Lmemcpy_fsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_fsrcul3l4

Lmemcpy_fsrcul3loop4:
	mov	r12, lr, lsr #24
	ldr	lr, [r1], #4
	orr	r12, r12, lr, lsl #8
	str	r12, [r0], #4
	subs	r2, r2, #4
	bge	Lmemcpy_fsrcul3loop4

Lmemcpy_fsrcul3l4:
	sub	r1, r1, #1		/* restore true (unaligned) src */
	b	Lmemcpy_fl4

Lmemcpy_backwards:
	/* copy from the top end down so overlapping regions are safe */
	add	r1, r1, r2
	add	r0, r0, r2
	subs	r2, r2, #4
	blt	Lmemcpy_bl4		/* less than 4 bytes */
	ands	r12, r0, #3
	bne	Lmemcpy_bdestul		/* oh unaligned destination addr */
	ands	r12, r1, #3
	bne	Lmemcpy_bsrcul		/* oh unaligned source addr */

Lmemcpy_bt8:
	/* We have aligned source and destination */
	subs	r2, r2, #8
	blt	Lmemcpy_bl12		/* less than 12 bytes (4 from above) */
	stmdb	sp!, {r4}
	subs	r2, r2, #0x14		/* less than 32 bytes (12 from above) */
	blt	Lmemcpy_bl32

	/* blat 32 bytes at a time */
	/* XXX for really big copies perhaps we should use more registers */
Lmemcpy_bloop32:
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	ldmdb	r1!, {r3, r4, r12, lr}
	stmdb	r0!, {r3, r4, r12, lr}
	subs	r2, r2, #0x20
	bge	Lmemcpy_bloop32

Lmemcpy_bl32:
	cmn	r2, #0x10
	ldmgedb	r1!, {r3, r4, r12, lr}	/* blat a remaining 16 bytes */
	stmgedb	r0!, {r3, r4, r12, lr}
	subge	r2, r2, #0x10
	adds	r2, r2, #0x14
	ldmgedb	r1!, {r3, r12, lr}	/* blat a remaining 12 bytes */
	stmgedb	r0!, {r3, r12, lr}
	subge	r2, r2, #0x0c
	ldmia	sp!, {r4}

Lmemcpy_bl12:
	adds	r2, r2, #8
	blt	Lmemcpy_bl4
	subs	r2, r2, #4		/* LT: only one word left; GE: two */
	ldrlt	r3, [r1, #-4]!
	strlt	r3, [r0, #-4]!
	ldmgedb	r1!, {r3, r12}
	stmgedb	r0!, {r3, r12}
	subge	r2, r2, #4

Lmemcpy_bl4:
	/* less than 4 bytes to go */
	adds	r2, r2, #4
#ifdef __APCS_26__
	ldmeqia	sp!, {r0, pc}^
#else
	ldmeqia	sp!, {r0, pc}
#endif

	/* copy the crud byte at a time: 1, 2 or 3 bytes (GE: >= 2, GT: 3) */
	cmp	r2, #2
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
#ifdef __APCS_26__
	ldmia	sp!, {r0, pc}^
#else
	ldmia	sp!, {r0, pc}
#endif

	/* erg - unaligned destination */
Lmemcpy_bdestul:
	cmp	r12, #2			/* r12 = bytes above a word boundary */

	/* align destination with byte copies */
	ldrb	r3, [r1, #-1]!
	strb	r3, [r0, #-1]!
	ldrgeb	r3, [r1, #-1]!
	strgeb	r3, [r0, #-1]!
	ldrgtb	r3, [r1, #-1]!
	strgtb	r3, [r0, #-1]!
	subs	r2, r2, r12
	blt	Lmemcpy_bl4		/* less than 4 bytes to go */
	ands	r12, r1, #3
	beq	Lmemcpy_bt8		/* we have an aligned source */

	/* erg - unaligned source */
	/* This is where it gets nasty ... */
Lmemcpy_bsrcul:
	/*
	 * Mirror of Lmemcpy_fsrcul, working downwards: round src down to
	 * a word boundary, preload the partial word into r3, then
	 * shift/merge word pairs.  Shift directions are reversed with
	 * respect to the forward code.
	 */
	bic	r1, r1, #3
	ldr	r3, [r1, #0]
	cmp	r12, #2
	blt	Lmemcpy_bsrcul1
	beq	Lmemcpy_bsrcul2
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul3loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_bsrcul3loop16:
	mov	lr, r3, lsl #8
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #24
	mov	r12, r12, lsl #8
	orr	r12, r12, r5, lsr #24
	mov	r5, r5, lsl #8
	orr	r5, r5, r4, lsr #24
	mov	r4, r4, lsl #8
	orr	r4, r4, r3, lsr #24
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul3loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul3l4

Lmemcpy_bsrcul3loop4:
	mov	r12, r3, lsl #8
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #24
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul3loop4

Lmemcpy_bsrcul3l4:
	add	r1, r1, #3		/* restore true (unaligned) src */
	b	Lmemcpy_bl4

Lmemcpy_bsrcul2:
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul2loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_bsrcul2loop16:
	mov	lr, r3, lsl #16
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #16
	mov	r12, r12, lsl #16
	orr	r12, r12, r5, lsr #16
	mov	r5, r5, lsl #16
	orr	r5, r5, r4, lsr #16
	mov	r4, r4, lsl #16
	orr	r4, r4, r3, lsr #16
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul2loop16
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul2l4

Lmemcpy_bsrcul2loop4:
	mov	r12, r3, lsl #16
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #16
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul2loop4

Lmemcpy_bsrcul2l4:
	add	r1, r1, #2		/* restore true (unaligned) src */
	b	Lmemcpy_bl4

Lmemcpy_bsrcul1:
	cmp	r2, #0x0c
	blt	Lmemcpy_bsrcul1loop4
	sub	r2, r2, #0x0c
	stmdb	sp!, {r4, r5}

Lmemcpy_bsrcul1loop32:			/* NB: copies 16 bytes per pass despite the name */
	mov	lr, r3, lsl #24
	ldmdb	r1!, {r3-r5, r12}
	orr	lr, lr, r12, lsr #8
	mov	r12, r12, lsl #24
	orr	r12, r12, r5, lsr #8
	mov	r5, r5, lsl #24
	orr	r5, r5, r4, lsr #8
	mov	r4, r4, lsl #24
	orr	r4, r4, r3, lsr #8
	stmdb	r0!, {r4, r5, r12, lr}
	subs	r2, r2, #0x10
	bge	Lmemcpy_bsrcul1loop32
	ldmia	sp!, {r4, r5}
	adds	r2, r2, #0x0c
	blt	Lmemcpy_bsrcul1l4

Lmemcpy_bsrcul1loop4:
	mov	r12, r3, lsl #24
	ldr	r3, [r1, #-4]!
	orr	r12, r12, r3, lsr #8
	str	r12, [r0, #-4]!
	subs	r2, r2, #4
	bge	Lmemcpy_bsrcul1loop4

Lmemcpy_bsrcul1l4:
	add	r1, r1, #1		/* restore true (unaligned) src */
	b	Lmemcpy_bl4

/* (CVSweb annotation footer removed) */