libc/string/xtensa/strcmp.S

   1 /* Optimized strcmp for Xtensa.
   2    Copyright (C) 2001, 2007 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, write to the Free
  17    Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
  18    Boston, MA 02110-1301, USA.  */
  19
  20 #include "../../sysdeps/linux/xtensa/sysdep.h"
  21 #include <bits/xtensa-config.h>
  22 #include <features.h>
  23
  24 #ifdef __XTENSA_EB__
  25 #define MASK0 0xff000000
  26 #define MASK1 0x00ff0000
  27 #define MASK2 0x0000ff00
  28 #define MASK3 0x000000ff
  29 #else
  30 #define MASK0 0x000000ff
  31 #define MASK1 0x0000ff00
  32 #define MASK2 0x00ff0000
  33 #define MASK3 0xff000000
  34 #endif
  35
  36 #define MASK4 0x40404040
  37
  38         .literal .Lmask0, MASK0
  39         .literal .Lmask1, MASK1
  40         .literal .Lmask2, MASK2
  41         .literal .Lmask3, MASK3
  42         .literal .Lmask4, MASK4
  43
  44         .text
  45 ENTRY (strcmp)
  46         /* a2 = s1, a3 = s2 */
  47
  48         l8ui    a8, a2, 0       // byte 0 from s1
  49         l8ui    a9, a3, 0       // byte 0 from s2
  50         movi    a10, 3          // mask
  51         bne     a8, a9, .Lretdiff
  52
  53         or      a11, a2, a3
  54         bnone   a11, a10, .Laligned
  55
  56         xor     a11, a2, a3     // compare low two bits of s1 and s2
  57         bany    a11, a10, .Lunaligned   // if they have different alignment
  58
  59         /* s1/s2 are not word-aligned.  */
  60         addi    a2, a2, 1       // advance s1
  61         beqz    a8, .Leq        // bytes equal, if zero, strings are equal
  62         addi    a3, a3, 1       // advance s2
  63         bnone   a2, a10, .Laligned // if s1/s2 now aligned
  64         l8ui    a8, a2, 0       // byte 1 from s1
  65         l8ui    a9, a3, 0       // byte 1 from s2
  66         addi    a2, a2, 1       // advance s1
  67         bne     a8, a9, .Lretdiff // if different, return difference
  68         beqz    a8, .Leq        // bytes equal, if zero, strings are equal
  69         addi    a3, a3, 1       // advance s2
  70         bnone   a2, a10, .Laligned // if s1/s2 now aligned
  71         l8ui    a8, a2, 0       // byte 2 from s1
  72         l8ui    a9, a3, 0       // byte 2 from s2
  73         addi    a2, a2, 1       // advance s1
  74         bne     a8, a9, .Lretdiff // if different, return difference
  75         beqz    a8, .Leq        // bytes equal, if zero, strings are equal
  76         addi    a3, a3, 1       // advance s2
  77         j       .Laligned
  78
  79 /* s1 and s2 have different alignment.
  80
  81    If the zero-overhead loop option is available, use an (almost)
  82    infinite zero-overhead loop with conditional exits so we only pay
  83    for taken branches when exiting the loop.
  84
  85    Note: It is important for this unaligned case to come before the
  86    code for aligned strings, because otherwise some of the branches
  87    above cannot reach and have to be transformed to branches around
  88    jumps.  The unaligned code is smaller and the branches can reach
  89    over it.  */
  90
  91         .align  4
  92         /* (2 mod 4) alignment for loop instruction */
  93 .Lunaligned:
  94 #if XCHAL_HAVE_LOOPS
  95         _movi.n a8, 0           // set up for the maximum loop count
  96         loop    a8, .Lretdiff   // loop forever (almost anyway)
  97 #endif
  98 .Lnextbyte:
  99         l8ui    a8, a2, 0
 100         l8ui    a9, a3, 0
 101         addi    a2, a2, 1
 102         bne     a8, a9, .Lretdiff
 103         addi    a3, a3, 1
 104 #if XCHAL_HAVE_LOOPS
 105         beqz    a8, .Lretdiff
 106 #else
 107         bnez    a8, .Lnextbyte
 108 #endif
 109 .Lretdiff:
 110         sub     a2, a8, a9
 111         retw
 112
 113 /* s1 is word-aligned; s2 is word-aligned.
 114
 115    If the zero-overhead loop option is available, use an (almost)
 116    infinite zero-overhead loop with conditional exits so we only pay
 117    for taken branches when exiting the loop.  */
 118
 119 /* New algorithm, relying on the fact that all normal ASCII is between
 120    32 and 127.
 121
 122    Rather than check all bytes for zero:
 123    Take one word (4 bytes).  Call it w1.
 124    Shift w1 left by one into w1'.
 125    Or w1 and w1'.  For all normal ASCII bit 6 will be 1; for zero it won't.
 126    Check that all 4 bit 6's (one for each byte) are one:
 127    If they are, we are definitely not done.
 128    If they are not, we are probably done, but need to check for zero.  */
 129
 130         .align  4
 131 #if XCHAL_HAVE_LOOPS
 132 .Laligned:
 133         .begin  no-transform
 134         l32r    a4, .Lmask0     // mask for byte 0
 135         l32r    a7, .Lmask4
 136         /* Loop forever.  (a4 is more than than the maximum number
 137            of iterations) */
 138         loop    a4, .Laligned_done
 139
 140         /* First unrolled loop body.  */
 141         l32i    a8, a2, 0       // get word from s1
 142         l32i    a9, a3, 0       // get word from s2
 143         slli    a5, a8, 1
 144         bne     a8, a9, .Lwne2
 145         or      a9, a8, a5
 146         bnall   a9, a7, .Lprobeq
 147
 148         /* Second unrolled loop body.  */
 149         l32i    a8, a2, 4       // get word from s1+4
 150         l32i    a9, a3, 4       // get word from s2+4
 151         slli    a5, a8, 1
 152         bne     a8, a9, .Lwne2
 153         or      a9, a8, a5
 154         bnall   a9, a7, .Lprobeq2
 155
 156         addi    a2, a2, 8       // advance s1 pointer
 157         addi    a3, a3, 8       // advance s2 pointer
 158 .Laligned_done:
 159         or      a1, a1, a1      // nop
 160
 161 .Lprobeq2:
 162         /* Adjust pointers to account for the loop unrolling.  */
 163         addi    a2, a2, 4
 164         addi    a3, a3, 4
 165
 166 #else /* !XCHAL_HAVE_LOOPS */
 167
 168 .Laligned:
 169         movi    a4, MASK0       // mask for byte 0
 170         movi    a7, MASK4
 171         j       .Lfirstword
 172 .Lnextword:
 173         addi    a2, a2, 4       // advance s1 pointer
 174         addi    a3, a3, 4       // advance s2 pointer
 175 .Lfirstword:
 176         l32i    a8, a2, 0       // get word from s1
 177         l32i    a9, a3, 0       // get word from s2
 178         slli    a5, a8, 1
 179         bne     a8, a9, .Lwne2
 180         or      a9, a8, a5
 181         ball    a9, a7, .Lnextword
 182 #endif /* !XCHAL_HAVE_LOOPS */
 183
 184         /* align (0 mod 4) */
 185 .Lprobeq:
 186         /* Words are probably equal, but check for sure.
 187            If not, loop over the rest of string using normal algorithm.  */
 188
 189         bnone   a8, a4, .Leq    // if byte 0 is zero
 190         l32r    a5, .Lmask1     // mask for byte 1
 191         l32r    a6, .Lmask2     // mask for byte 2
 192         bnone   a8, a5, .Leq    // if byte 1 is zero
 193         l32r    a7, .Lmask3     // mask for byte 3
 194         bnone   a8, a6, .Leq    // if byte 2 is zero
 195         bnone   a8, a7, .Leq    // if byte 3 is zero
 196         addi.n  a2, a2, 4       // advance s1 pointer
 197         addi.n  a3, a3, 4       // advance s2 pointer
 198 #if XCHAL_HAVE_LOOPS
 199
 200         /* align (1 mod 4) */
 201         loop    a4, .Leq        // loop forever (a4 is bigger than max iters)
 202         .end    no-transform
 203
 204         l32i    a8, a2, 0       // get word from s1
 205         l32i    a9, a3, 0       // get word from s2
 206         addi    a2, a2, 4       // advance s1 pointer
 207         bne     a8, a9, .Lwne
 208         bnone   a8, a4, .Leq    // if byte 0 is zero
 209         bnone   a8, a5, .Leq    // if byte 1 is zero
 210         bnone   a8, a6, .Leq    // if byte 2 is zero
 211         bnone   a8, a7, .Leq    // if byte 3 is zero
 212         addi    a3, a3, 4       // advance s2 pointer
 213
 214 #else /* !XCHAL_HAVE_LOOPS */
 215
 216         j       .Lfirstword2
 217 .Lnextword2:
 218         addi    a3, a3, 4       // advance s2 pointer
 219 .Lfirstword2:
 220         l32i    a8, a2, 0       // get word from s1
 221         l32i    a9, a3, 0       // get word from s2
 222         addi    a2, a2, 4       // advance s1 pointer
 223         bne     a8, a9, .Lwne
 224         bnone   a8, a4, .Leq    // if byte 0 is zero
 225         bnone   a8, a5, .Leq    // if byte 1 is zero
 226         bnone   a8, a6, .Leq    // if byte 2 is zero
 227         bany    a8, a7, .Lnextword2     // if byte 3 is zero
 228 #endif /* !XCHAL_HAVE_LOOPS */
 229
 230         /* Words are equal; some byte is zero.  */
 231 .Leq:   movi    a2, 0           // return equal
 232         retw
 233
 234 .Lwne2: /* Words are not equal.  On big-endian processors, if none of the
 235            bytes are zero, the return value can be determined by a simple
 236            comparison.  */
 237 #ifdef __XTENSA_EB__
 238         or      a10, a8, a5
 239         bnall   a10, a7, .Lsomezero
 240         bgeu    a8, a9, .Lposreturn
 241         movi    a2, -1
 242         retw
 243 .Lposreturn:
 244         movi    a2, 1
 245         retw
 246 .Lsomezero:     // There is probably some zero byte.
 247 #endif /* __XTENSA_EB__ */
 248 .Lwne:  /* Words are not equal.  */
 249         xor     a2, a8, a9      // get word with nonzero in byte that differs
 250         bany    a2, a4, .Ldiff0 // if byte 0 differs
 251         movi    a5, MASK1       // mask for byte 1
 252         bnone   a8, a4, .Leq    // if byte 0 is zero
 253         bany    a2, a5, .Ldiff1 // if byte 1 differs
 254         movi    a6, MASK2       // mask for byte 2
 255         bnone   a8, a5, .Leq    // if byte 1 is zero
 256         bany    a2, a6, .Ldiff2 // if byte 2 differs
 257         bnone   a8, a6, .Leq    // if byte 2 is zero
 258 #ifdef __XTENSA_EB__
 259 .Ldiff3:
 260 .Ldiff2:
 261 .Ldiff1:
 262         /* Byte 0 is equal (at least) and there is a difference before a zero
 263            byte.  Just subtract words to get the return value.
 264            The high order equal bytes cancel, leaving room for the sign.  */
 265         sub     a2, a8, a9
 266         retw
 267
 268 .Ldiff0:
 269         /* Need to make room for the sign, so can't subtract whole words.  */
 270         extui   a10, a8, 24, 8
 271         extui   a11, a9, 24, 8
 272         sub     a2, a10, a11
 273         retw
 274
 275 #else /* !__XTENSA_EB__ */
 276         /* Little-endian is a little more difficult because can't subtract
 277            whole words.  */
 278 .Ldiff3:
 279         /* Bytes 0-2 are equal; byte 3 is different.
 280            For little-endian need to have a sign bit for the difference.  */
 281         extui   a10, a8, 24, 8
 282         extui   a11, a9, 24, 8
 283         sub     a2, a10, a11
 284         retw
 285
 286 .Ldiff0:
 287         /* Byte 0 is different.  */
 288         extui   a10, a8, 0, 8
 289         extui   a11, a9, 0, 8
 290         sub     a2, a10, a11
 291         retw
 292
 293 .Ldiff1:
 294         /* Byte 0 is equal; byte 1 is different.  */
 295         extui   a10, a8, 8, 8
 296         extui   a11, a9, 8, 8
 297         sub     a2, a10, a11
 298         retw
 299
 300 .Ldiff2:
 301         /* Bytes 0-1 are equal; byte 2 is different.  */
 302         extui   a10, a8, 16, 8
 303         extui   a11, a9, 16, 8
 304         sub     a2, a10, a11
 305         retw
 306
 307 #endif /* !__XTENSA_EB */
 308
 309 libc_hidden_def (strcmp)
 310
 311 #ifndef __UCLIBC_HAS_LOCALE__
 312 strong_alias (strcmp, strcoll)
 313 libc_hidden_def (strcoll)
 314 #endif