2 * Copyright (C) 2008 The Android Open Source Project
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in
12 * the documentation and/or other materials provided with the
15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
18 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
19 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
21 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
22 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
23 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
25 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 #include <machine/cpu-features.h>
34 .type memcmp, %function
38 * Optimized memcmp() for ARM9.
39 * This would not be optimal on XScale or ARM11, where more prefetching
40 * and use of PLD will be needed.
41 * The 2 major optimizations here are
42 * (1) The main (congruent) loop compares 32 bytes at a time
43 * (2) The loads are scheduled in a way they won't stall
51 /* take care of the case where length is 0 or the buffers are the same */
64 /* since r0 holds the result, move the first source
65 * pointer somewhere else
70 /* make sure we have at least 8+4 bytes, this simplifies things below
71 * and avoids some overhead for small blocks
76 /* align first pointer to word boundary
83 /* align first pointer */
93 0: /* here the first pointer is aligned, and we have at least 4 bytes
97 /* see if the pointers are congruent */
102 /* congruent case, 32 bytes per iteration
103 * We need to make sure there are at least 32+4 bytes left
104 * because we effectively read ahead one word, and we could
105 * read past the buffer (and segfault) if we're not careful.
109 subs r2, r2, #(32 + 4)
142 /* do we have at least 4 bytes left? */
143 1: adds r2, r2, #(32 - 4 + 4)
146 /* finish off 4 bytes at a time */
159 /* finish off the remaining bytes */
162 2: /* the last 4 bytes are different, restart them */
167 /* process the last few bytes */
176 9: /* restore registers and return */
185 5: /*************** non-congruent case ***************/
190 /* here, offset is 2 (16-bits aligned, special cased) */
192 /* make sure we have at least 16 bytes to process */
197 /* align the unaligned pointer */
206 orr ip, ip, lr, lsl #16
208 moveq ip, lr, lsr #16
211 orreq ip, ip, lr, lsl #16
213 moveq ip, lr, lsr #16
216 orreq ip, ip, lr, lsl #16
218 moveq ip, lr, lsr #16
221 orreq ip, ip, lr, lsl #16
231 /* finish off the remaining bytes */
234 7: /* fix up the 2 pointers and fallthrough... */
241 4: /*************** offset is 1 or 3 (less optimized) ***************/
243 stmfd sp!, {r5, r6, r7}
249 mov r5, r0, lsl #3 /* r5 = right shift */
250 rsb r6, r5, #32 /* r6 = left shift */
252 /* align the unaligned pointer */
257 6: mov ip, r7, lsr r5
260 orr ip, ip, r7, lsl r6
265 orreq ip, ip, r7, lsl r6
271 sub r1, r1, r6, lsr #3
272 ldmfd sp!, {r5, r6, r7}
279 /* finish off the remaining bytes */
282 7: /* fix up the 2 pointers and fallthrough... */
284 sub r1, r1, r6, lsr #3
287 ldmfd sp!, {r5, r6, r7}