libc/string/xtensa/memcpy.S

   1 /* Optimized memcpy for Xtensa.
   2    Copyright (C) 2001, 2007 Free Software Foundation, Inc.
   3    This file is part of the GNU C Library.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, write to the Free
  17    Software Foundation, Inc., 51 Franklin Street - Fifth Floor,
  18    Boston, MA 02110-1301, USA.  */
  19
  20 #include "../../sysdeps/linux/xtensa/sysdep.h"
  21 #include <bits/xtensa-config.h>
  22 /* src_b r, w0, w1: merge two adjacent source words into r with SRC, shifting by SAR.  */
  23         .macro  src_b   r, w0, w1       /* w0 = word at lower address, w1 = next word */
  24 #ifndef __XTENSA_EB__
  25         src     \r, \w1, \w0            /* little-endian operand order */
  26 #else
  27         src     \r, \w0, \w1            /* big-endian operand order */
  28 #endif
  29         .endm
  30 /* ssa8 r: set SAR from the byte offset held in address register r.  */
  31         .macro  ssa8    r               /* picks the endian-specific SSA8 form */
  32 #ifndef __XTENSA_EB__
  33         ssa8l   \r                      /* little-endian variant */
  34 #else
  35         ssa8b   \r                      /* big-endian variant */
  36 #endif
  37         .endm
  38
  39 /* If the Xtensa Unaligned Load Exception option is not used, this
  40    code can run a few cycles faster by relying on the low address bits
  41    being ignored.  However, if the code is then run with an Xtensa ISS
  42    client that checks for unaligned accesses, it will produce a lot of
  43    warning messages.  Set this flag to disable the use of unaligned
  44    accesses and keep the ISS happy.  */
  45 /* When nonzero, .Lsrcunaligned rounds a3 down to a word boundary before its l32i loads.  */
  46 #define UNALIGNED_ADDRESSES_CHECKED 1
  47
  48 /* Do not use .literal_position in the ENTRY macro (this file emits one before __memcpy_aux).  */
  49 #undef LITERAL_POSITION
  50 #define LITERAL_POSITION        /* empty; presumably expanded by ENTRY in sysdep.h -- confirm */
  51
  52
  53 /* void *memcpy (void *dst, const void *src, size_t len)
  54
  55    The algorithm is as follows:
  56
  57    If the destination is unaligned, align it by conditionally
  58    copying 1- and/or 2-byte pieces.
  59
  60    If the source is aligned, copy 16 bytes per loop iteration, and then
  61    finish up with 8-, 4-, 2-, and 1-byte copies conditional on the length.
  62
  63    Else (if source is unaligned), do the same, but use SRC to align the
  64    source data.
  65
  66    This code tries to use fall-through branches for the common
  67    case of aligned source and destination and multiple of 4 (or 8) length.  */
  68
  69
  70 /* Byte by byte copy of a4 bytes from a3 to a5, followed by retw on behalf of memcpy.  */
  71 /* Within this file it is entered only at .Lbytecopy, by branches from the dst-alignment code.  */
  72         .text
  73         .align  4
  74         .literal_position
  75 __memcpy_aux:
  76         /* The padding byte below only shifts the alignment of the code that follows.  */
  77         /* Skip a byte to get 1 mod 4 alignment for LOOPNEZ
  78            (0 mod 4 alignment for LBEG).  */
  79         .byte   0
  80
  81 .Lbytecopy:
  82 #if XCHAL_HAVE_LOOPS
  83         loopnez a4, 2f          // hardware loop of a4 iterations; skipped when a4 == 0
  84 #else
  85         beqz    a4, 2f          // nothing to copy
  86         add     a7, a3, a4      // a7 = end address for source
  87 #endif
  88 1:      l8ui    a6, a3, 0       // a6 = next source byte
  89         addi    a3, a3, 1
  90         s8i     a6, a5, 0       // store it to the destination
  91         addi    a5, a5, 1
  92 #if !XCHAL_HAVE_LOOPS
  93         bltu    a3, a7, 1b      // unsigned: signed blt exits early if the range crosses 0x80000000
  94 #endif
  95 2:      retw
  96
  97
  98 /* Destination is unaligned: entered from memcpy with a3 = src, a4 = len, a5 = dst.  */
  99 /* Copies 1 and/or 2 leading bytes until a5 is word-aligned, then rejoins .Ldstaligned.  */
 100         .align  4
 101 .Ldst1mod2: // dst is only byte aligned (bit 0 of dst is set)
 102
 103         /* Do short copies (len < 7) byte-by-byte.  */
 104         _bltui  a4, 7, .Lbytecopy
 105
 106         /* Copy 1 byte.  */
 107         l8ui    a6, a3, 0
 108         addi    a3, a3, 1
 109         addi    a4, a4, -1      // one byte fewer left to copy
 110         s8i     a6, a5, 0
 111         addi    a5, a5, 1       // bit 0 of a5 is now clear
 112
 113         /* Return to main algorithm if dst is now aligned.  */
 114         _bbci.l a5, 1, .Ldstaligned     // bit 1 clear too: a5 is a multiple of 4
 115
 116 .Ldst2mod4: // dst has 16-bit alignment (bit 1 set, bit 0 clear)
 117
 118         /* Do short copies (len < 6) byte-by-byte.  */
 119         _bltui  a4, 6, .Lbytecopy
 120
 121         /* Copy 2 bytes.  */
 122         l8ui    a6, a3, 0       // byte accesses: src alignment has not been checked yet
 123         l8ui    a7, a3, 1
 124         addi    a3, a3, 2
 125         addi    a4, a4, -2      // two bytes fewer left to copy
 126         s8i     a6, a5, 0
 127         s8i     a7, a5, 1
 128         addi    a5, a5, 2
 129
 130         /* dst is now aligned; return to main algorithm.  */
 131         j       .Ldstaligned
 132
 133
 134 ENTRY (memcpy)
 135         /* a2 = dst, a3 = src, a4 = len */
 136         /* a2 is left untouched as the return value; a5 = running dst, a6-a8 = temporaries.  */
 137         mov     a5, a2          // copy dst so that a2 is return value
 138         _bbsi.l a2, 0, .Ldst1mod2       // dst is odd: align it first
 139         _bbsi.l a2, 1, .Ldst2mod4       // dst is 2 mod 4: align it first
 140 .Ldstaligned:
 141         /* Here a5 is word-aligned and a4 is the number of bytes still to copy.  */
 142         /* Get number of loop iterations with 16B per iteration.  */
 143         srli    a7, a4, 4
 144
 145         /* Check if source is aligned.  */
 146         movi    a8, 3
 147         _bany   a3, a8, .Lsrcunaligned  // taken if either low bit of src is set
 148
 149         /* Destination and source are word-aligned, use word copy.  */
 150 #if XCHAL_HAVE_LOOPS
 151         loopnez a7, 2f          // hardware loop; skipped when there is no full 16B chunk
 152 #else
 153         beqz    a7, 2f          // no full 16B chunk
 154         slli    a8, a7, 4       // a8 = bytes covered by the 16B loop
 155         add     a8, a8, a3      // a8 = end of last 16B source chunk
 156 #endif
 157 1:      l32i    a6, a3, 0       // loads and stores alternate between a6 and a7
 158         l32i    a7, a3, 4
 159         s32i    a6, a5, 0
 160         l32i    a6, a3, 8
 161         s32i    a7, a5, 4
 162         l32i    a7, a3, 12
 163         s32i    a6, a5, 8
 164         addi    a3, a3, 16
 165         s32i    a7, a5, 12
 166         addi    a5, a5, 16
 167 #if !XCHAL_HAVE_LOOPS
 168         bltu    a3, a8, 1b      // unsigned: signed blt exits early if the range crosses 0x80000000
 169 #endif
 170
 171         /* Copy any leftover pieces smaller than 16B.  */
 172 2:      bbci.l  a4, 3, 3f       // bit 3 of len clear: no 8-byte piece
 173
 174         /* Copy 8 bytes.  */
 175         l32i    a6, a3, 0
 176         l32i    a7, a3, 4
 177         addi    a3, a3, 8
 178         s32i    a6, a5, 0
 179         s32i    a7, a5, 4
 180         addi    a5, a5, 8
 181
 182 3:      bbsi.l  a4, 2, 4f       // bit 2 of len set: 4-byte piece
 183         bbsi.l  a4, 1, 5f       // bit 1 of len set: 2-byte piece
 184         bbsi.l  a4, 0, 6f       // bit 0 of len set: final byte
 185         retw
 186
 187         /* Copy 4 bytes.  */
 188 4:      l32i    a6, a3, 0
 189         addi    a3, a3, 4
 190         s32i    a6, a5, 0
 191         addi    a5, a5, 4
 192         bbsi.l  a4, 1, 5f
 193         bbsi.l  a4, 0, 6f
 194         retw
 195
 196         /* Copy 2 bytes.  */
 197 5:      l16ui   a6, a3, 0
 198         addi    a3, a3, 2
 199         s16i    a6, a5, 0
 200         addi    a5, a5, 2
 201         bbsi.l  a4, 0, 6f
 202         retw
 203
 204         /* Copy 1 byte.  */
 205 6:      l8ui    a6, a3, 0
 206         s8i     a6, a5, 0
 207         /* Fall through to the shared return.  */
 208 .Ldone:
 209         retw                    // also the zero-length exit taken from .Lsrcunaligned
 210
 211
 212 /* Destination is aligned; source is unaligned.  */
 213 /* Each stored word is built by src_b from two adjacent aligned source words; a7 = len / 16 on entry.  */
 214         .align  4
 215 .Lsrcunaligned:
 216         /* Avoid loading anything for zero-length copies.  */
 217         _beqz   a4, .Ldone
 218
 219         /* Copy 16 bytes per iteration for word-aligned dst and
 220            unaligned src.  */
 221         ssa8    a3              // set shift amount from byte offset
 222 #if UNALIGNED_ADDRESSES_CHECKED
 223         and     a11, a3, a8     // save unalignment offset for below
 224         sub     a3, a3, a11     // align a3
 225 #endif
 226         l32i    a6, a3, 0       // load first word
 227 #if XCHAL_HAVE_LOOPS
 228         loopnez a7, 2f          // hardware loop; skipped when there is no full 16B chunk
 229 #else
 230         beqz    a7, 2f          // no full 16B chunk
 231         slli    a10, a7, 4      // a10 = bytes covered by the 16B loop
 232         add     a10, a10, a3    // a10 = end of last 16B source chunk
 233 #endif
 234 1:      l32i    a7, a3, 4       // a6 already holds the word at a3 + 0
 235         l32i    a8, a3, 8
 236         src_b   a6, a6, a7
 237         s32i    a6, a5, 0
 238         l32i    a9, a3, 12
 239         src_b   a7, a7, a8
 240         s32i    a7, a5, 4
 241         l32i    a6, a3, 16      // first word of the next group, carried over in a6
 242         src_b   a8, a8, a9
 243         s32i    a8, a5, 8
 244         addi    a3, a3, 16
 245         src_b   a9, a9, a6
 246         s32i    a9, a5, 12
 247         addi    a5, a5, 16
 248 #if !XCHAL_HAVE_LOOPS
 249         bltu    a3, a10, 1b     // unsigned: signed blt exits early if the range crosses 0x80000000
 250 #endif
 251
 252 2:      bbci.l  a4, 3, 3f       // bit 3 of len clear: no 8-byte piece
 253
 254         /* Copy 8 bytes.  */
 255         l32i    a7, a3, 4
 256         l32i    a8, a3, 8
 257         src_b   a6, a6, a7
 258         s32i    a6, a5, 0
 259         addi    a3, a3, 8
 260         src_b   a7, a7, a8
 261         s32i    a7, a5, 4
 262         addi    a5, a5, 8
 263         mov     a6, a8          // keep the last loaded word as the new first word
 264
 265 3:      bbci.l  a4, 2, 4f       // bit 2 of len clear: no 4-byte piece
 266
 267         /* Copy 4 bytes.  */
 268         l32i    a7, a3, 4
 269         addi    a3, a3, 4
 270         src_b   a6, a6, a7
 271         s32i    a6, a5, 0
 272         addi    a5, a5, 4
 273         mov     a6, a7          // keep the last loaded word as the new first word
 274 4:      // only the 2-byte and 1-byte pieces remain; they use byte accesses
 275 #if UNALIGNED_ADDRESSES_CHECKED
 276         add     a3, a3, a11     // readjust a3 with correct misalignment
 277 #endif
 278         bbsi.l  a4, 1, 5f       // bit 1 of len set: 2-byte piece
 279         bbsi.l  a4, 0, 6f       // bit 0 of len set: final byte
 280         retw
 281
 282         /* Copy 2 bytes.  */
 283 5:      l8ui    a6, a3, 0
 284         l8ui    a7, a3, 1
 285         addi    a3, a3, 2
 286         s8i     a6, a5, 0
 287         s8i     a7, a5, 1
 288         addi    a5, a5, 2
 289         bbsi.l  a4, 0, 6f
 290         retw
 291
 292         /* Copy 1 byte.  */
 293 6:      l8ui    a6, a3, 0
 294         s8i     a6, a5, 0
 295         retw
 296
 297 libc_hidden_def (memcpy)