bionic/libc/arch-arm/bionic/memcmp.S

   1 /*
   2  * Copyright (C) 2008 The Android Open Source Project
   3  * All rights reserved.
   4  *
   5  * Redistribution and use in source and binary forms, with or without
   6  * modification, are permitted provided that the following conditions
   7  * are met:
   8  *  * Redistributions of source code must retain the above copyright
   9  *    notice, this list of conditions and the following disclaimer.
  10  *  * Redistributions in binary form must reproduce the above copyright
  11  *    notice, this list of conditions and the following disclaimer in
  12  *    the documentation and/or other materials provided with the
  13  *    distribution.
  14  *
  15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  16  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  17  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  18  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  19  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  20  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  21  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
  22  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  23  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  25  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  26  * SUCH DAMAGE.
  27  */
  28
  29 #include <machine/cpu-features.h>
  30
  31     .text
  32
  33     .global memcmp                  /* export the memcmp symbol */
  34     .type memcmp, %function         /* mark the symbol as a function */
  35     .align 4                        /* NOTE(review): GNU as on ARM presumably reads this operand as a power of two (16-byte alignment) - confirm */
  36
  37 /*
  38  * Optimized memcmp() for ARM9.
  39  * This would not be optimal on XScale or ARM11, where more prefetching
  40  * and use of PLD will be needed.
  41  * The 2 major optimizations here are
  42  * (1) The main loop compares 32 bytes per iteration when both pointers
      *     have the same alignment within a word (16 bytes per iteration
      *     when the second pointer is off by 2, 8 bytes when off by 1 or 3)
  43  * (2) The loads are scheduled in a way they won't stall
      *
      * Register usage, as read from the code below:
      *   r0      in:  first buffer
      *           out: 0 if the buffers are the same pointer, the length is 0,
      *                or all compared bytes match; otherwise
      *                (byte of first buffer) - (byte of second buffer) at the
      *                first difference, both bytes zero-extended by ldrb
      *   r1      in:  second buffer (advanced while comparing)
      *   r2      in:  length in bytes (used as the remaining-byte counter,
      *                biased by the chunk size inside the word loops)
      *   r3      number of leading bytes needed to word-align the first pointer
      *   r4      working copy of the first pointer (saved and restored)
      *   ip, lr  words/bytes loaded from the second buffer (lr saved and restored)
      *   r5-r7   used only on the "offset is 1 or 3" path (saved and restored there)
      *
      * Every word-level mismatch is re-examined by the byte loop at local
      * label 8 so that the result is always a byte difference.
  44  */
  45
  46 memcmp:
  47         .fnstart                    /* start of the ARM unwind-table region */
  48         PLD         (r0, #0)        /* PLD is a macro not defined in this file; presumably a cache preload hint from cpu-features.h - confirm */
  49         PLD         (r1, #0)
  50
  51         /* take care of the case where length is 0 or the buffers are the same */
  52         cmp         r0, r1          /* Z set if both arguments are the same pointer */
  53         cmpne       r2, #0          /* otherwise Z set if the length is 0 */
  54         moveq       r0, #0          /* in either case the result is 0 ... */
  55         bxeq        lr              /* ... and we return before touching the stack */
  56
  57         .save {r4, lr}              /* unwind annotation matching the push below */
  58         /* save registers */
  59         stmfd       sp!, {r4, lr}
  60
  61         PLD         (r0, #32)
  62         PLD         (r1, #32)
  63
  64         /* since r0 holds the result, move the first source
  65          * pointer somewhere else
  66          */
  67
  68          mov        r4, r0
  69
  70          /* make sure we have at least 8+4 bytes, this simplifies things below
  71           * and avoids some overhead for small blocks
  72           */
  73          cmp        r2, #(8+4)
  74          bmi        8f              /* short block: byte loop only. bmi tests the sign of r2-12, so lengths of about 2^31 and above also take the byte loop */
  75
  76         /* align first pointer to word boundary
  77          * offset = -src & 3
  78          */
  79         rsb         r3, r4, #0      /* r3 = -r4 */
  80         ands        r3, r3, #3      /* r3 = bytes to the next word boundary (0..3) */
  81         beq         0f              /* already aligned */
  82
  83         /* align first pointer  */
  84         sub         r2, r2, r3      /* these r3 bytes are consumed by the loop below */
  85 1:      ldrb        r0, [r4], #1
  86         ldrb        ip, [r1], #1
  87         subs        r0, r0, ip      /* r0 = first byte - second byte */
  88         bne         9f              /* mismatch: r0 already holds the result */
  89         subs        r3, r3, #1
  90         bne         1b
  91
  92
  93 0:      /* here the first pointer is aligned, and we have at least 4 bytes
  94          * to process.
  95          */
  96
  97         /* see if the pointers are congruent */
  98         eor         r0, r4, r1
  99         ands        r0, r0, #3      /* non-zero if the low 2 address bits differ, i.e. r1 is not word aligned */
 100         bne         5f
 101
 102         /* congruent case, 32 bytes per iteration
 103          * We need to make sure there are at least 32+4 bytes left
 104          * because we effectively read ahead one word, and we could
 105          * read past the buffer (and segfault) if we're not careful.
 106          */
 107
 108         ldr         ip, [r1]        /* ip = first word of the second buffer (read-ahead, r1 not advanced) */
 109         subs        r2, r2, #(32 + 4)   /* r2 = bytes left - 36 */
 110         bmi         1f
 111
 112 0:      PLD         (r4, #64)
 113         PLD         (r1, #64)
 114         ldr         r0, [r4], #4    /* r0 = next word of the first buffer */
 115         ldr         lr, [r1, #4]!   /* lr = following word of the second buffer (read-ahead) */
 116         eors        r0, r0, ip      /* compare with the word that was loaded one step earlier */
 117         ldreq       r0, [r4], #4    /* the next 7 steps run only while every word so far matched; ip and lr swap roles each step */
 118         ldreq       ip, [r1, #4]!
 119         eoreqs      r0, r0, lr
 120         ldreq       r0, [r4], #4
 121         ldreq       lr, [r1, #4]!
 122         eoreqs      r0, r0, ip
 123         ldreq       r0, [r4], #4
 124         ldreq       ip, [r1, #4]!
 125         eoreqs      r0, r0, lr
 126         ldreq       r0, [r4], #4
 127         ldreq       lr, [r1, #4]!
 128         eoreqs      r0, r0, ip
 129         ldreq       r0, [r4], #4
 130         ldreq       ip, [r1, #4]!
 131         eoreqs      r0, r0, lr
 132         ldreq       r0, [r4], #4
 133         ldreq       lr, [r1, #4]!
 134         eoreqs      r0, r0, ip
 135         ldreq       r0, [r4], #4
 136         ldreq       ip, [r1, #4]!   /* ip = read-ahead word for the next iteration */
 137         eoreqs      r0, r0, lr
 138         bne         2f              /* some word differed; both pointers are 4 bytes past it */
 139         subs        r2, r2, #32
 140         bhs         0b              /* loop while at least 32+4 bytes remain */
 141
 142         /* do we have at least 4 bytes left? */
 143 1:      adds        r2, r2, #(32 - 4 + 4)   /* r2 = bytes left - 4 */
 144         bmi         4f
 145
 146         /* finish off 4 bytes at a time */
 147 3:      ldr         r0, [r4], #4
 148         ldr         ip, [r1], #4
 149         eors        r0, r0, ip
 150         bne         2f
 151         subs        r2, r2, #4
 152         bhs         3b              /* loop while at least 4 bytes remain */
 153
 154         /* are we done? */
 155 4:      adds        r2, r2, #4      /* r2 = bytes left */
 156         moveq       r0, #0
 157         beq         9f
 158
 159         /* finish off the remaining bytes */
 160         b           8f
 161
 162 2:      /* the last 4 bytes are different, restart them one byte at a time */
 163         sub         r4, r4, #4
 164         sub         r1, r1, #4
 165         mov         r2, #4
 166
 167         /* process the last few bytes */
 168 8:      ldrb        r0, [r4], #1
 169         ldrb        ip, [r1], #1
 170         // stall
 171         subs        r0, r0, ip      /* r0 = first byte - second byte */
 172         bne         9f              /* mismatch: r0 is the result */
 173         subs        r2, r2, #1
 174         bne         8b              /* falls through with r0 == 0 when all bytes matched */
 175
 176 9:      /* restore registers and return */
 177         ldmfd       sp!, {r4, lr}
 178         bx          lr
 179         .fnend                      /* NOTE(review): the unwind region ends here, but the code at local labels 5, 6, 7 and 4 below is still part of memcmp - confirm whether .fnend should follow the last instruction */
 180
 181
 182
 183
 184
 185 5:      /*************** non-congruent case ***************/
 186         and         r0, r1, #3      /* r0 = byte offset of r1 within its word (1..3, r4 is word aligned here) */
 187         cmp         r0, #2
 188         bne         4f
 189
 190         /* here, offset is 2 (16-bits aligned, special cased) */
 191
 192         /* make sure we have at least 16 bytes to process */
 193         subs        r2, r2, #16
 194         addmi       r2, r2, #16     /* too short: undo the subtraction (addmi does not change the flags) */
 195         bmi         8b              /* ... and fall back to the byte loop */
 196
 197         /* align the unaligned pointer */
 198         bic         r1, r1, #3      /* round r1 down to a word boundary */
 199         ldr         lr, [r1], #4    /* lr = aligned word whose upper half holds the first 2 wanted bytes (shift pattern presumably assumes little-endian - confirm) */
 200
 201 6:      PLD         (r1, #64)
 202         PLD         (r4, #64)
 203         mov         ip, lr, lsr #16 /* ip low half = upper half of the previous aligned word */
 204         ldr         lr, [r1], #4    /* lr = next aligned word of the second buffer */
 205         ldr         r0, [r4], #4    /* r0 = next word of the first buffer */
 206         orr         ip, ip, lr, lsl #16 /* ip high half = lower half of the new aligned word */
 207         eors        r0, r0, ip      /* compare; the 3 repeats below run only while all words matched */
 208         moveq       ip, lr, lsr #16
 209         ldreq       lr, [r1], #4
 210         ldreq       r0, [r4], #4
 211         orreq       ip, ip, lr, lsl #16
 212         eoreqs      r0, r0, ip
 213         moveq       ip, lr, lsr #16
 214         ldreq       lr, [r1], #4
 215         ldreq       r0, [r4], #4
 216         orreq       ip, ip, lr, lsl #16
 217         eoreqs      r0, r0, ip
 218         moveq       ip, lr, lsr #16
 219         ldreq       lr, [r1], #4
 220         ldreq       r0, [r4], #4
 221         orreq       ip, ip, lr, lsl #16
 222         eoreqs      r0, r0, ip
 223         bne         7f
 224         subs        r2, r2, #16
 225         bhs         6b              /* loop while at least 16 bytes remain */
 226         sub         r1, r1, #2      /* r1 ran 2 bytes ahead of the compared data; point it at the next uncompared byte */
 227         /* are we done? */
 228         adds        r2, r2, #16     /* r2 = bytes left */
 229         moveq       r0, #0
 230         beq         9b
 231         /* finish off the remaining bytes */
 232         b           8b
 233
 234 7:      /* fix up the 2 pointers and fallthrough... */
 235         sub         r1, r1, #(4+2)  /* back to the start of the mismatching word: 4 for the word, 2 for the read-ahead */
 236         sub         r4, r4, #4
 237         mov         r2, #4          /* redo just those 4 bytes in the byte loop */
 238         b           8b
 239
 240
 241 4:      /*************** offset is 1 or 3 (less optimized) ***************/
 242
 243                 stmfd           sp!, {r5, r6, r7}   /* extra registers for this path; restored on both exits below */
 244
 245         // r5 = rhs (right-shift amount in bits)
 246         // r6 = lhs (left-shift amount in bits)
 247         // r7 = scratch (most recently loaded aligned word of the second buffer)
 248
 249         mov         r5, r0, lsl #3              /* r5 = right shift */
 250         rsb         r6, r5, #32         /* r6 = left shift */
 251
 252         /* align the unaligned pointer */
 253         bic         r1, r1, #3      /* round r1 down to a word boundary */
 254         ldr         r7, [r1], #4    /* r7 = aligned word containing the first wanted bytes */
 255         sub         r2, r2, #8      /* r2 = bytes left - 8; the loop below handles 8 bytes per pass */
 256
 257 6:      mov         ip, r7, lsr r5  /* ip = wanted bytes of the previous aligned word, shifted down */
 258         ldr         r7, [r1], #4    /* r7 = next aligned word of the second buffer */
 259         ldr         r0, [r4], #4    /* r0 = next word of the first buffer */
 260         orr         ip, ip, r7, lsl r6  /* fill the top of ip from the new aligned word */
 261         eors        r0, r0, ip      /* compare; the repeat below runs only if this word matched */
 262         moveq       ip, r7, lsr r5
 263         ldreq       r7, [r1], #4
 264         ldreq       r0, [r4], #4
 265         orreq       ip, ip, r7, lsl r6
 266         eoreqs      r0, r0, ip
 267         bne         7f
 268         subs        r2, r2, #8
 269         bhs         6b              /* loop while at least 8 bytes remain */
 270
 271         sub         r1, r1, r6, lsr #3  /* r6/8 = 4 - offset: move r1 back to the next uncompared byte */
 272                 ldmfd       sp!, {r5, r6, r7}
 273
 274         /* are we done? */
 275         adds        r2, r2, #8      /* r2 = bytes left */
 276         moveq       r0, #0
 277         beq         9b
 278
 279         /* finish off the remaining bytes */
 280         b           8b
 281
 282 7:      /* fix up the 2 pointers and fallthrough... */
 283         sub         r1, r1, #4      /* undo the last word advance ... */
 284         sub         r1, r1, r6, lsr #3  /* ... and the read-ahead of (4 - offset) bytes */
 285         sub         r4, r4, #4
 286         mov         r2, #4          /* redo just those 4 bytes in the byte loop */
 287                 ldmfd           sp!, {r5, r6, r7}
 288         b           8b