Annotation of sys/arch/arm/arm/in_cksum_arm.S, Revision 1.1.1.1
1.1 nbrk 1: /* $OpenBSD: in_cksum_arm.S,v 1.2 2005/05/10 21:32:20 brad Exp $ */
2: /* $NetBSD: in_cksum_arm.S,v 1.3 2003/11/26 10:31:53 rearnsha Exp $ */
3:
4: /*
5: * Copyright 2003 Wasabi Systems, Inc.
6: * All rights reserved.
7: *
8: * Written by Steve C. Woodford for Wasabi Systems, Inc.
9: *
10: * Redistribution and use in source and binary forms, with or without
11: * modification, are permitted provided that the following conditions
12: * are met:
13: * 1. Redistributions of source code must retain the above copyright
14: * notice, this list of conditions and the following disclaimer.
15: * 2. Redistributions in binary form must reproduce the above copyright
16: * notice, this list of conditions and the following disclaimer in the
17: * documentation and/or other materials provided with the distribution.
18: * 3. All advertising materials mentioning features or use of this software
19: * must display the following acknowledgement:
20: * This product includes software developed for the NetBSD Project by
21: * Wasabi Systems, Inc.
22: * 4. The name of Wasabi Systems, Inc. may not be used to endorse
23: * or promote products derived from this software without specific prior
24: * written permission.
25: *
26: * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
27: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
28: * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
30: * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31: * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32: * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33: * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34: * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35: * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36: * POSSIBILITY OF SUCH DAMAGE.
37: */
38:
39: /*
40: * Hand-optimised in_cksum() and in4_cksum() implementations for ARM/Xscale
41: */
42:
43: #include <machine/asm.h>
44: #include "assym.h"
45:
46: /*
47: * int in_cksum(struct mbuf *m, int len)
48: *
49: * Entry:
50: * r0 m
51: * r1 len
52: *
53: * NOTE: Assumes 'm' is *never* NULL.
54: */
55: /* LINTSTUB: Func: int in_cksum(struct mbuf *, int) */
/*
 * in_cksum(m, len):
 *	Compute the 16-bit one's-complement Internet checksum of the
 *	first 'len' bytes of mbuf chain 'm' (m assumed non-NULL).
 *	Register roles across the loop:
 *	  r8  = running 32-bit sum (end-around carry folded in)
 *	  r9  = bytes still wanted
 *	  r10 = packet offset summed so far (only its low bit matters)
 *	  ip  = current mbuf
 */
56: ENTRY(in_cksum)
57: stmfd sp!, {r4-r11,lr}
58: mov r8, #0x00
59: mov r9, r1
60: mov r10, #0x00
61: mov ip, r0
62:
63: .Lin_cksum_loop:
/* Fetch this mbuf's length and data pointer, advance to successor. */
64: ldr r1, [ip, #(M_LEN)]
65: ldr r0, [ip, #(M_DATA)]
66: ldr ip, [ip, #(M_NEXT)]
67: .Lin_cksum_entry4:
/* Clamp this mbuf's contribution: r1 = min(m_len, bytes wanted). */
68: cmp r9, r1
69: movlt r1, r9
70: sub r9, r9, r1
/*
 * L_cksumdata places bytes into sum lanes by their memory-address
 * parity (see its endgame), but the packet checksum needs them by
 * packet-offset parity.  When the two parities differ — low bit of
 * (offset-so-far ^ data pointer) — the partial sum is rotated by 8
 * below to swap the byte lanes back.
 */
71: eor r11, r10, r0
72: add r10, r10, r1
/* r2 = chunk length; 'adds' sets Z so a zero-length chunk skips the
 * call and contributes r2 = 0 to the sum. */
73: adds r2, r1, #0x00
74: blne _ASM_LABEL(L_cksumdata)
75: tst r11, #0x01
76: movne r2, r2, ror #8
/* Fold the partial sum into r8 with end-around carry. */
77: adds r8, r8, r2
78: adc r8, r8, #0x00
79: cmp ip, #0x00
80: bne .Lin_cksum_loop
81:
/* Fold the 32-bit sum to 16 bits and return its one's complement. */
82: mov r1, #0xff
83: orr r1, r1, #0xff00
84: and r0, r8, r1
85: add r0, r0, r8, lsr #16
86: add r0, r0, r0, lsr #16
87: and r0, r0, r1
88: eor r0, r0, r1
89: ldmfd sp!, {r4-r11,pc}
90:
91: #ifdef INET
92: /*
93: * int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len)
94: *
95: * Entry:
96: * r0 m
97: * r1 nxt
98: * r2 off
99: * r3 len
100: */
101: /* LINTSTUB: Func: int in4_cksum(struct mbuf *, u_int8_t, int, int) */
/*
 * in4_cksum(m, nxt, off, len):
 *	Checksum 'len' bytes of mbuf chain 'm' starting 'off' bytes in.
 *	When nxt != 0, an IPv4 pseudo header is folded in first: the
 *	IP source/destination addresses read from the header at m_data,
 *	plus nxt + len.
 */
102: ENTRY(in4_cksum)
103: stmfd sp!, {r4-r11,lr}
104: mov r8, #0x00 /* Accumulate sum in r8 */
105:
106: /*
107: * First, deal with a pseudo header, if present
108: */
109: ldr r6, [r0, #(M_DATA)]
110: cmp r1, #0x00
111: beq .Lin4_cksum_skip_entry
112:
113: #ifdef __XSCALE__
114: pld [r6, #(IP_SRC)]
115: #endif
116: add r4, r6, #(IP_SRC)
117: ands r4, r4, #0x03
118: add r8, r1, r3 /* sum = nxt + len */
/*
 * Computed jump on the 2-bit alignment of the source-address field.
 * Reading pc here yields (this instruction + 8), so case k begins at
 * (addne + 8) + k*32: case 0 (aligned) is the fall-through and may
 * occupy 9 words; cases 1 and 2 must be EXACTLY 8 instructions
 * (32 bytes) each.  The nops below are load-bearing padding — do not
 * remove, add, or reorder instructions in these case blocks.
 */
119: addne pc, pc, r4, lsl #5 /* Handle alignment of pseudo header */
120:
121: /* 0x00: Data 32-bit aligned */
122: ldr r5, [r6, #(IP_SRC)]
123: ldr r4, [r6, #(IP_DST)]
124: b .Lin4_cksum_add_ips
125: nop
126: nop
127: nop
128: nop
129: nop
130: nop
131:
132: /* 0x01: Data 8-bit aligned */
133: ldr r4, [r6, #(IP_SRC - 1)] /* BE:r4 = x012 LE:r4 = 210x */
134: ldr r5, [r6, #(IP_SRC + 3)] /* BE:r5 = 3456 LE:r5 = 6543 */
135: ldrb r7, [r6, #(IP_SRC + 7)] /* r7 = ...7 */
136: #ifdef __ARMEB__
137: mov r4, r4, lsl #8 /* r4 = 012. */
138: orr r4, r4, r5, lsr #24 /* r4 = 0123 */
139: orr r5, r7, r5, lsl #8 /* r5 = 4567 */
140: b .Lin4_cksum_add_ips
141: nop
142: #else
143: mov r4, r4, lsr #8 /* r4 = .210 */
144: orr r4, r4, r5, lsl #24 /* r4 = 3210 */
145: mov r5, r5, lsr #8 /* r5 = .654 */
146: orr r5, r5, r7, lsl #24 /* r5 = 7654 */
147: b .Lin4_cksum_add_ips
148: #endif
149:
150: /* 0x02: Data 16-bit aligned */
151: #ifdef __XSCALE__
152: ldrh r5, [r6, #(IP_SRC)] /* BE:r5 = ..01 LE:r5 = ..10 */
153: ldrh r7, [r6, #(IP_DST + 2)] /* BE:r7 = ..67 LE:r7 = ..76 */
154: ldr r4, [r6, #(IP_SRC + 2)] /* BE:r4 = 2345 LE:r4 = 5432 */
155: orr r5, r7, r5, lsl #16 /* BE:r5 = 0167 LE:r5 = 1076 */
156: b .Lin4_cksum_add_ips
157: nop
158: nop
159: nop
160: #else
161: ldr r4, [r6, #(IP_SRC - 2)] /* r4 = 10xx */
162: ldr r7, [r6, #(IP_DST - 2)] /* r7 = xx76 */
163: ldr r5, [r6, #(IP_SRC + 2)] /* r5 = 5432 */
164: mov r4, r4, lsr #16 /* r4 = ..10 */
165: orr r4, r4, r7, lsl #16 /* r4 = 7610 */
166: b .Lin4_cksum_add_ips
167: nop
168: nop
169: #endif
170:
/* 0x03 needs no pad: it is the last case and falls straight through. */
171: /* 0x03: Data 8-bit aligned */
172: ldrb r4, [r6, #(IP_SRC)] /* r4 = ...0 */
173: ldr r5, [r6, #(IP_SRC + 1)] /* BE:r5 = 1234 LE:r5 = 4321 */
174: ldr r7, [r6, #(IP_SRC + 5)] /* BE:r7 = 567x LE:r7 = x765 */
175: #ifdef __ARMEB__
176: mov r4, r4, lsl #24 /* r4 = 0... */
177: orr r4, r4, r5, lsr #8 /* r4 = 0123 */
178: mov r5, r5, lsl #24 /* r5 = 4... */
179: orr r5, r5, r7, lsr #8 /* r5 = 4567 */
180: #else
181: orr r4, r4, r5, lsl #8 /* r4 = 3210 */
182: mov r5, r5, lsr #24 /* r4 = ...4 */
183: orr r5, r5, r7, lsl #8 /* r5 = 7654 */
184: #endif
185: /* FALLTHROUGH */
186:
/*
 * r4/r5 now hold the 8 bytes of IP src+dst.  Fold them into
 * r8 (= nxt + len) with end-around carry.  On little-endian the
 * lsl #8 shifts nxt/len into the byte lane matching the
 * memory-order sum (NOTE(review): lane rationale inferred from the
 * __ARMEB__ split — confirm against the C in4_cksum).
 */
187: .Lin4_cksum_add_ips:
188: adds r5, r5, r4
189: #ifndef __ARMEB__
190: adcs r8, r5, r8, lsl #8
191: #else
192: adcs r8, r5, r8
193: #endif
194: adc r8, r8, #0x00
195: mov r1, #0x00 /* so skip_entry sees "0 bytes consumed" below */
196: b .Lin4_cksum_skip_entry
197:
/*
 * Walk the chain consuming 'off' bytes.  On entry to skip_entry:
 * r2 = offset bytes still to skip, r1 = current mbuf's length
 * (0 on first entry).  Running off the end of the chain is fatal.
 */
198: .Lin4_cksum_skip_loop:
199: ldr r1, [r0, #(M_LEN)]
200: ldr r6, [r0, #(M_DATA)]
201: ldr r0, [r0, #(M_NEXT)]
202: .Lin4_cksum_skip_entry:
203: subs r2, r2, r1
204: blt .Lin4_cksum_skip_done
205: cmp r0, #0x00
206: bne .Lin4_cksum_skip_loop
207: b .Lin4_cksum_whoops
208:
/*
 * The offset lands inside this mbuf (r2 = off - consumed, negative).
 * Set up the registers in_cksum's loop expects:
 *   ip  = next mbuf,
 *   r0  = m_data + m_len + r2  (first byte to checksum),
 *   r1  = -r2                  (bytes left in this mbuf),
 *   r9  = total 'len', r10 = 0 (offset-parity base),
 * then tail-join the shared in_cksum mbuf loop.
 */
209: .Lin4_cksum_skip_done:
210: mov ip, r0
211: add r0, r2, r6
212: add r0, r0, r1
213: rsb r1, r2, #0x00
214: mov r9, r3
215: mov r10, #0x00
216: b .Lin_cksum_entry4
217:
/* Chain exhausted before reaching 'off': unrecoverable, panic. */
218: .Lin4_cksum_whoops:
219: adr r0, .Lin4_cksum_whoops_str
220: bl _C_LABEL(panic)
221: .Lin4_cksum_whoops_str:
222: .asciz "in4_cksum: out of mbufs\n"
223: .align 5
224: #endif /* INET */
225:
226: /*
227: * The main in*_cksum() workhorse...
228: *
229: * Entry parameters:
230: * r0 Pointer to buffer
231: * r1 Buffer length
232: * lr Return address
233: *
234: * Returns:
235: * r2 Accumulated 32-bit sum
236: *
237: * Clobbers:
238: * r0-r7
239: */
240: /* LINTSTUB: Ignore */
/*
 * L_cksumdata: the checksum core (see the header comment above).
 * In: r0 = buffer, r1 = length; out: r2 = 32-bit partial sum.
 * Clobbers r0-r7; leaf routine, returns via "mov pc, lr".
 */
241: ASENTRY_NP(L_cksumdata)
242: #ifdef __XSCALE__
243: pld [r0] /* Pre-fetch the start of the buffer */
244: #endif
245: mov r2, #0 /* r2 accumulates the sum */
246:
247: /* We first have to word-align the buffer. */
248: ands r7, r0, #0x03
249: beq .Lcksumdata_wordaligned
250: rsb r7, r7, #0x04 /* r7 = 1..3 head bytes needed */
251: cmp r1, r7 /* Enough bytes left to make it? */
252: blt .Lcksumdata_endgame
/* Flags from "cmp r7, #2": ge -> fetch 2nd byte, gt -> fetch 3rd. */
253: cmp r7, #0x02
254: ldrb r4, [r0], #0x01 /* Fetch 1st byte */
255: ldrgeb r5, [r0], #0x01 /* Fetch 2nd byte */
256: movlt r5, #0x00
257: ldrgtb r6, [r0], #0x01 /* Fetch 3rd byte */
258: movle r6, #0x00
/* eq <=> exactly two head bytes (start was 2-byte aligned).  Bytes
 * are placed in sum lanes matching their memory-address parity; the
 * caller (in_cksum) compensates with a ror #8 when needed. */
259: /* Combine the three bytes depending on endianness and alignment */
260: #ifdef __ARMEB__
261: orreq r2, r5, r4, lsl #8
262: orreq r2, r2, r6, lsl #24
263: orrne r2, r4, r5, lsl #8
264: orrne r2, r2, r6, lsl #16
265: #else
266: orreq r2, r4, r5, lsl #8
267: orreq r2, r2, r6, lsl #16
268: orrne r2, r5, r4, lsl #8
269: orrne r2, r2, r6, lsl #24
270: #endif
271: subs r1, r1, r7 /* Update length */
272: moveq pc, lr /* All done? */
273:
274: /* Buffer is now word aligned */
275: .Lcksumdata_wordaligned:
276: #ifdef __XSCALE__
277: cmp r1, #0x04 /* Less than 4 bytes left? */
278: blt .Lcksumdata_endgame /* Yup */
279:
/* ldrd needs 8-byte alignment: consume one word if necessary.
 * r7 is folded into the sum at bigloop_end (or by the first loop
 * iteration's adcs of r7). */
280: /* Now quad-align, if necessary */
281: ands r7, r0, #0x04
282: ldrne r7, [r0], #0x04
283: subne r1, r1, #0x04
284: subs r1, r1, #0x40
285: blt .Lcksumdata_bigloop_end /* Note: C flag clear if branch taken */
286:
287: /*
288: * Buffer is now quad aligned. Sum 64 bytes at a time.
289: * Note: First ldrd is hoisted above the loop, together with
290: * setting r6 to zero to avoid stalling for results in the
291: * loop. (r7 is live, from above).
292: */
293: ldrd r4, [r0], #0x08
294: mov r6, #0x00
/* 64 bytes per iteration; each ldrd pair is summed one step after it
 * loads so the adds never wait on the load result. */
295: .Lcksumdata_bigloop:
296: pld [r0, #0x18]
297: adds r2, r2, r6
298: adcs r2, r2, r7
299: ldrd r6, [r0], #0x08
300: adcs r2, r2, r4
301: adcs r2, r2, r5
302: ldrd r4, [r0], #0x08
303: adcs r2, r2, r6
304: adcs r2, r2, r7
305: ldrd r6, [r0], #0x08
306: adcs r2, r2, r4
307: adcs r2, r2, r5
308: ldrd r4, [r0], #0x08
309: adcs r2, r2, r6
310: adcs r2, r2, r7
311: pld [r0, #0x18]
312: ldrd r6, [r0], #0x08
313: adcs r2, r2, r4
314: adcs r2, r2, r5
315: ldrd r4, [r0], #0x08
316: adcs r2, r2, r6
317: adcs r2, r2, r7
318: ldrd r6, [r0], #0x08
319: adcs r2, r2, r4
320: adcs r2, r2, r5
321: adc r2, r2, #0x00
322: subs r1, r1, #0x40
/* Predicated pre-load of the next iteration's first pair — only when
 * the loop will actually run again. */
323: ldrged r4, [r0], #0x08
324: bge .Lcksumdata_bigloop
325:
326: adds r2, r2, r6 /* r6/r7 still need summing */
327: .Lcksumdata_bigloop_end:
328: adcs r2, r2, r7
329: adc r2, r2, #0x00
330:
331: #else /* !__XSCALE__ */
332:
/* Generic ARM: 64 bytes per iteration via four 4-register ldm's. */
333: subs r1, r1, #0x40
334: blt .Lcksumdata_bigloop_end
335:
336: .Lcksumdata_bigloop:
337: ldmia r0!, {r3, r4, r5, r6}
338: adds r2, r2, r3
339: adcs r2, r2, r4
340: adcs r2, r2, r5
341: ldmia r0!, {r3, r4, r5, r7}
342: adcs r2, r2, r6
343: adcs r2, r2, r3
344: adcs r2, r2, r4
345: adcs r2, r2, r5
346: ldmia r0!, {r3, r4, r5, r6}
347: adcs r2, r2, r7
348: adcs r2, r2, r3
349: adcs r2, r2, r4
350: adcs r2, r2, r5
351: ldmia r0!, {r3, r4, r5, r7}
352: adcs r2, r2, r6
353: adcs r2, r2, r3
354: adcs r2, r2, r4
355: adcs r2, r2, r5
356: adcs r2, r2, r7
357: adc r2, r2, #0x00
358: subs r1, r1, #0x40
359: bge .Lcksumdata_bigloop
360: .Lcksumdata_bigloop_end:
361: #endif
362:
/* Undo the loop's bias: r1 = true bytes remaining (0..0x3f). */
363: adds r1, r1, #0x40
364: moveq pc, lr
365: cmp r1, #0x20
366:
/* Optional single 32-byte block. */
367: #ifdef __XSCALE__
368: ldrged r4, [r0], #0x08 /* Avoid stalling pld and result */
369: blt .Lcksumdata_less_than_32
370: pld [r0, #0x18]
371: ldrd r6, [r0], #0x08
372: adds r2, r2, r4
373: adcs r2, r2, r5
374: ldrd r4, [r0], #0x08
375: adcs r2, r2, r6
376: adcs r2, r2, r7
377: ldrd r6, [r0], #0x08
378: adcs r2, r2, r4
379: adcs r2, r2, r5
380: adcs r2, r2, r6 /* XXX: Unavoidable result stall */
381: adcs r2, r2, r7
382: #else
383: blt .Lcksumdata_less_than_32
384: ldmia r0!, {r3, r4, r5, r6}
385: adds r2, r2, r3
386: adcs r2, r2, r4
387: adcs r2, r2, r5
388: ldmia r0!, {r3, r4, r5, r7}
389: adcs r2, r2, r6
390: adcs r2, r2, r3
391: adcs r2, r2, r4
392: adcs r2, r2, r5
393: adcs r2, r2, r7
394: #endif
395: adc r2, r2, #0x00
396: subs r1, r1, #0x20
397: moveq pc, lr
398:
399: .Lcksumdata_less_than_32:
400: /* There are less than 32 bytes left */
/*
 * Computed jump over the 8-byte handlers below.
 *   r3 = r1 & 0x18            (24/16/8/0 bytes of whole 8-byte chunks)
 *   r4 = (0x18 - r3) * 3 / 2  (byte offset: 12 bytes per skipped
 *                              handler, see padding note below)
 * "adds" both sets NE (r4 != 0 -> jump) and clears C, which the adcs
 * chain in the handlers relies on.  pc reads as (this insn + 8).
 */
401: and r3, r1, #0x18
402: rsb r4, r3, #0x18
403: sub r1, r1, r3
404: adds r4, r4, r4, lsr #1 /* Side effect: Clear carry flag */
405: addne pc, pc, r4
406:
407: /*
408: * Note: We use ldm here, even on Xscale, since the combined issue/result
409: * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs.
410: */
/* The nop pads this first handler to 16 bytes; with pc's +8 bias that
 * puts the 16- and 8-byte handlers (12 bytes each) exactly where the
 * offset arithmetic above expects them.  Position-critical. */
411: /* At least 24 bytes remaining... */
412: ldmia r0!, {r4, r5}
413: nop
414: adcs r2, r2, r4
415: adcs r2, r2, r5
416:
417: /* At least 16 bytes remaining... */
418: ldmia r0!, {r4, r5}
419: adcs r2, r2, r4
420: adcs r2, r2, r5
421:
422: /* At least 8 bytes remaining... */
423: ldmia r0!, {r4, r5}
424: adcs r2, r2, r4
425: adcs r2, r2, r5
426:
427: /* Less than 8 bytes remaining... */
428: adc r2, r2, #0x00
429: subs r1, r1, #0x04
430: blt .Lcksumdata_lessthan4
431:
432: ldr r4, [r0], #0x04
433: sub r1, r1, #0x04
434: adds r2, r2, r4
435: adc r2, r2, #0x00
436:
437: /* Deal with < 4 bytes remaining */
438: .Lcksumdata_lessthan4:
439: adds r1, r1, #0x04
440: moveq pc, lr
441:
442: /* Deal with 1 to 3 remaining bytes, possibly misaligned */
443: .Lcksumdata_endgame:
444: ldrb r3, [r0] /* Fetch first byte */
/* Flags from "cmp r1, #2": ge -> 2nd byte exists, gt -> 3rd too. */
445: cmp r1, #0x02
446: ldrgeb r4, [r0, #0x01] /* Fetch 2nd and 3rd as necessary */
447: movlt r4, #0x00
448: ldrgtb r5, [r0, #0x02]
449: movle r5, #0x00
/* Lane placement follows the bytes' memory-address parity
 * (tst r0, #1); the caller's ror #8 fixes any lane swap. */
450: /* Combine the three bytes depending on endianness and alignment */
451: tst r0, #0x01
452: #ifdef __ARMEB__
453: orreq r3, r4, r3, lsl #8
454: orreq r3, r3, r5, lsl #24
455: orrne r3, r3, r4, lsl #8
456: orrne r3, r3, r5, lsl #16
457: #else
458: orreq r3, r3, r4, lsl #8
459: orreq r3, r3, r5, lsl #16
460: orrne r3, r4, r3, lsl #8
461: orrne r3, r3, r5, lsl #24
462: #endif
463: adds r2, r2, r3
464: adc r2, r2, #0x00
465: mov pc, lr
CVSweb