newlib/libc/machine/sh/memcpy.S

   1 !
   2 ! Fast SH memcpy
   3 !
   4 ! by Toshiyasu Morita (tm@netcom.com)
   5 ! hacked by J"orn Rennecke (joern.rennecke@superh.com) ("o for o-umlaut)
   6 ! SH5 code Copyright 2002 SuperH Ltd.
   7 !
   8 ! Entry: ARG0: destination pointer
   9 !        ARG1: source pointer
  10 !        ARG2: byte count
  11 !
  12 ! Exit:  RESULT: destination pointer
  13 !        any other registers in the range r0-r7: trashed
  14 !
  15 ! Notes: Usually one wants to do small reads and write a longword, but
  16 !        unfortunately it is difficult in some cases to concatenate bytes
  17 !        into a longword on the SH, so this does a longword read and small
  18 !        writes.
  19 !
  20 ! This implementation makes two assumptions about how it is called:
  21 !
  22 ! 1.: If the byte count is nonzero, the address of the last byte to be
  23 !     copied is unsigned greater than the address of the first byte to
  24 !     be copied.  This could be easily swapped for a signed comparison,
  25 !     but the algorithm used needs some comparison.
  26 !
  27 ! 2.: When there are two or three bytes in the last word of an 11-or-more
  28 !     bytes memory chunk to be copied, the rest of the word can be read
  29 !     without side effects.
  30 !     This could be easily changed by increasing the minimum size of
  31 !     a fast memcpy and the amount subtracted from r7 before L_2l_loop by 2,
  32 !     however, this would cost a few extra cycles on average.
  33 !     For SHmedia, the assumption is that any quadword can be read in its
  34 !     entirety if at least one byte is included in the copy.
  35 !
  36
  37 #include "asm.h"
  38
  39 ENTRY(memcpy)
  40 /* Entry point; one of three implementations below (SHmedia, SHcompact little endian, SHcompact big endian) is selected by the preprocessor. */
  41 #if __SHMEDIA__
  42 /* SHmedia version: r2 = destination, r3 = source, r4 = byte count.  r2 is never written, so it still holds the destination on return. */
  43 #define LDUAQ(P,O,D0,D1) ldlo.q P,O,D0; ldhi.q P,O+7,D1
  44 #define STUAQ(P,O,D0,D1) stlo.q P,O,D0; sthi.q P,O+7,D1
  45 #define LDUAL(P,O,D0,D1) ldlo.l P,O,D0; ldhi.l P,O+3,D1
  46 #define STUAL(P,O,D0,D1) stlo.l P,O,D0; sthi.l P,O+3,D1
  47 /* The macros above pair a lo and a hi access for one unaligned 8-byte (Q) or 4-byte (L) transfer at P+O; the two load results D0 and D1 are combined with "or" by the users.  STUAQ and STUAL are not used in the code shown here. */
  48         ld.b r3,0,r63                  /* load the first source byte into r63, i.e. discard it: only touches the source */
  49         pta/l Large,tr0                /* tr0 = Large */
  50         movi 25,r0
  51         bgeu/u r4,r0,tr0               /* count >= 25 (unsigned): take the Large path */
  52         nsb r4,r0                      /* r0 = 63 for count 0, 62 for 1, 61 for 2..3, 60 for 4..7, 59 for 8..15, 58 for 16..24 */
  53         shlli r0,5,r0                  /* times 32, the size of one table slot */
  54         movi (L1-L0+63*32 + 1) & 0xffff,r1     /* distance from L0 to L1 plus 63 slots; NOTE(review): the + 1 presumably sets the SHmedia mode bit of the target - confirm */
  55         sub r1, r0, r0                 /* r0 = distance from L0 to the slot for this size class */
  56 L0:     ptrel r0,tr0                   /* tr0 = slot address, computed relative to L0 */
  57         add r2,r4,r5                   /* r5 = destination end */
  58         ptabs r18,tr1                  /* tr1 = return target taken from r18 */
  59         add r3,r4,r6                   /* r6 = source end */
  60         blink tr0,r63                  /* jump into the size-class table at L1 */
  61 /* Size-class table for 0..24 bytes.  The dispatch code before L1 jumps to L1 + 32 * class, so every case must start on a 32-byte slot boundary (8 instructions, assuming 4-byte instructions).  The continuation fragments L4_7, L2_3 and L8_15 fill the unused tail of the short slots; re-check the layout before adding or removing any instruction.  On entry: r5 = destination end, r6 = source end, tr1 = return target. */
  62         .balign 8
  63 L1:
  64         /* 0 byte memcpy (slot 0) */
  65         blink tr1,r63                  /* nothing to copy: return */
  66
  67 L4_7:   /* 4..7 byte memcpy continued (fills the rest of slot 0) */
  68         stlo.l r2, 0, r0               /* lo half of the store of the first 4 bytes; the hi half was stored before the jump here */
  69         or r6, r7, r6                  /* combine the two partial loads of the last 4 source bytes */
  70         sthi.l r5, -1, r6              /* store the last 4 bytes, ending at the destination end */
  71         stlo.l r5, -4, r6
  72         blink tr1,r63                  /* return */
  73
  74 L2_3:   /* 2 or 3 byte memcpy continued (fills the rest of slot 0) */
  75         st.b r5,-1,r6                  /* last byte; r6 was reloaded with the last source byte before the jump here */
  76         blink tr1,r63                  /* return */
  77
  78         /* 1 byte memcpy (slot 1) */
  79         ld.b r3,0,r0
  80         st.b r2,0,r0
  81         blink tr1,r63                  /* return */
  82
  83 L8_15:  /* 8..15 byte memcpy continued (fills the rest of slot 1) */
  84         stlo.q r2, 0, r0               /* lo half of the store of the first 8 bytes; the hi half was stored before the jump here */
  85         or r6, r7, r6                  /* combine the two partial loads of the last 8 source bytes */
  86         sthi.q r5, -1, r6              /* store the last 8 bytes, ending at the destination end */
  87         stlo.q r5, -8, r6
  88         blink tr1,r63                  /* return */
  89
  90         /* 2 or 3 byte memcpy (slot 2): first two bytes here, last byte at L2_3 */
  91         ld.b r3,0,r0                   /* first source byte */
  92         ld.b r2,0,r63                  /* load from the destination into r63, i.e. discard it: only touches the destination */
  93         ld.b r3,1,r1                   /* second source byte */
  94         st.b r2,0,r0
  95         pta/l L2_3,tr0
  96         ld.b r6,-1,r6                  /* last source byte (r6 was the source end) */
  97         st.b r2,1,r1
  98         blink tr0, r63                 /* continue at L2_3 */
  99
 100         /* 4 .. 7 byte memcpy (slot 3): copies the first 4 and the last 4 bytes, which may overlap */
 101         LDUAL (r3, 0, r0, r1)          /* first 4 source bytes, unaligned, in two parts */
 102         pta L4_7, tr0
 103         ldlo.l r6, -4, r7              /* last 4 source bytes, first part */
 104         or r0, r1, r0                  /* r0 = first 4 source bytes */
 105         sthi.l r2, 3, r0               /* hi half of the store at the destination start */
 106         ldhi.l r6, -1, r6              /* last 4 source bytes, second part */
 107         blink tr0, r63                 /* continue at L4_7 */
 108
 109         /* 8 .. 15 byte memcpy (slot 4): copies the first 8 and the last 8 bytes, which may overlap */
 110         LDUAQ (r3, 0, r0, r1)          /* first 8 source bytes, unaligned, in two parts */
 111         pta L8_15, tr0
 112         ldlo.q r6, -8, r7              /* last 8 source bytes, first part */
 113         or r0, r1, r0                  /* r0 = first 8 source bytes */
 114         sthi.q r2, 7, r0               /* hi half of the store at the destination start */
 115         ldhi.q r6, -1, r6              /* last 8 source bytes, second part */
 116         blink tr0, r63                 /* continue at L8_15 */
 117
 118         /* 16 .. 24 byte memcpy (slot 5): copies the first 16 and the last 8 bytes, which may overlap */
 119         LDUAQ (r3, 0, r0, r1)          /* source bytes 0..7 */
 120         LDUAQ (r3, 8, r8, r9)          /* source bytes 8..15 */
 121         or r0, r1, r0
 122         sthi.q r2, 7, r0
 123         or r8, r9, r8
 124         sthi.q r2, 15, r8
 125         ldlo.q r6, -8, r7              /* last 8 source bytes, in two parts */
 126         ldhi.q r6, -1, r6
 127         stlo.q r2, 8, r8
 128         stlo.q r2, 0, r0
 129         or r6, r7, r6
 130         sthi.q r5, -1, r6              /* store the last 8 bytes, ending at the destination end */
 131         stlo.q r5, -8, r6
 132         blink tr1,r63                  /* return */
 133 /* Large: copies of 25 bytes or more (r2 = destination, r3 = source, r4 = count).  The source is read in aligned quadwords via ldx.q while the destination is written with unaligned sthi.q / stlo.q pairs.  Loop_line moves 32 bytes per iteration, Loop_ua 8 bytes per iteration, and the tail stores the pending quadword plus the final 8 bytes. */
 134 Large:
 135         ld.b r2, 0, r63                /* load from the destination into r63, i.e. discard it: only touches the destination */
 136         pta/l  Loop_ua, tr1            /* tr1 = Loop_ua; the return target is reloaded from r18 in the tail */
 137         ori r3, -8, r7                 /* r7 = (r3 & 7) - 8, a value in -8 .. -1 */
 138         sub r2, r7, r22                /* r22 = destination + 8 - (source & 7): destination address paired with the next 8-byte aligned source address */
 139         sub r3, r2, r6                 /* r6 = source - destination, so r22 + r6 is the source address for r22 */
 140         add r2, r4, r5                 /* r5 = destination end */
 141         ldlo.q r3, 0, r0               /* lo part only of the unaligned source quadword at r3 */
 142         addi r5, -16, r5               /* r5 = destination end - 16 */
 143         movi 64+8, r27 // could subtract r7 from that.
 144         stlo.q r2, 0, r0               /* store r0 as an unaligned quadword at the destination start */
 145         sthi.q r2, 7, r0               /* NOTE(review): bytes past the leading part appear to be rewritten by the stores at r22 below - confirm */
 146         ldx.q r22, r6, r0              /* r0 = first aligned source quadword */
 147         bgtu/l r27, r4, tr1            /* count < 72: skip Loop_line and go to Loop_ua */
 148
 149         addi r5, -48, r27              /* r27 = destination end - 64: limit for Loop_line */
 150         pta/l Loop_line, tr0
 151         addi r6, 64, r36               /* source offset plus 64, used by the discarded look-ahead load */
 152         addi r6, -24, r19              /* source offsets for the quadwords 24, 16 and 8 bytes below the advanced r22 */
 153         addi r6, -16, r20
 154         addi r6, -8, r21
 155
 156 Loop_line:
 157         ldx.q r22, r36, r63            /* load from the source 64 bytes ahead into r63, i.e. discard it (presumably a prefetch) */
 158         alloco r22, 32                 /* cache allocate operation on the destination block at r22 + 32 */
 159         addi r22, 32, r22              /* advance the destination by 32 */
 160         ldx.q r22, r19, r23            /* source quadword for r22 - 24 */
 161         sthi.q r22, -25, r0            /* hi half of the store of the pending quadword at r22 - 32 */
 162         ldx.q r22, r20, r24            /* source quadword for r22 - 16 */
 163         ldx.q r22, r21, r25            /* source quadword for r22 - 8 */
 164         stlo.q r22, -32, r0            /* lo half of the store of the pending quadword */
 165         ldx.q r22, r6,  r0             /* next pending quadword: source for r22 */
 166         sthi.q r22, -17, r23
 167         sthi.q r22,  -9, r24
 168         sthi.q r22,  -1, r25
 169         stlo.q r22, -24, r23
 170         stlo.q r22, -16, r24
 171         stlo.q r22,  -8, r25
 172         bgeu r27, r22, tr0             /* repeat while r22 <= destination end - 64 */
 173
 174 Loop_ua:
 175         addi r22, 8, r22               /* advance the destination by 8 */
 176         sthi.q r22, -1, r0             /* store the pending quadword at r22 - 8 */
 177         stlo.q r22, -8, r0
 178         ldx.q r22, r6, r0              /* next pending quadword: source for r22 */
 179         bgtu/l r5, r22, tr1            /* repeat while r22 < destination end - 16 (tr1 = Loop_ua) */
 180
 181         add r3, r4, r7                 /* r7 = source end */
 182         ldlo.q r7, -8, r1              /* last 8 source bytes, unaligned, first part */
 183         sthi.q r22, 7, r0              /* hi half of the store of the pending quadword at r22 */
 184         ldhi.q r7, -1, r7              /* last 8 source bytes, second part */
 185         ptabs r18,tr1                  /* tr1 = return target taken from r18 */
 186         stlo.q r22, 0, r0              /* lo half of the store of the pending quadword */
 187         or r1, r7, r1                  /* r1 = last 8 source bytes */
 188         sthi.q r5, 15, r1              /* store them ending at the destination end (r5 = end - 16) */
 189         stlo.q r5, 8, r1
 190         blink tr1, r63                 /* return */
 191
 192 #else /* ! SHMEDIA, i.e. SH1 .. SH4 / SHcompact */
 193 /* Register roles shared by the little endian and big endian SHcompact versions below; the first set applies when __SH5__ is defined, the second otherwise. */
 194 #ifdef __SH5__
 195 #define DST r2 /* destination pointer argument */
 196 #define SRC r3 /* source pointer argument */
 197 #define COUNT r4 /* byte count argument */
 198 #define TMP0 r5 /* scratch */
 199 #define TMP1 r6 /* scratch */
 200 #define RESULT r2 /* register the return value is written to (same register as DST here) */
 201 #else
 202 #define DST r4 /* destination pointer argument */
 203 #define SRC r5 /* source pointer argument */
 204 #define COUNT r6 /* byte count argument */
 205 #define TMP0 r2 /* scratch */
 206 #define TMP1 r3 /* scratch */
 207 #define RESULT r0 /* register the return value is written to */
 208 #endif
 209
 210 #ifdef __LITTLE_ENDIAN__
 211         ! Little endian version copies with increasing addresses.  Counts below 11 use only the byte loop; larger copies first make the source even, then pick a loop by destination and source alignment, then finish in the byte loop.  SL comes from asm.h (not shown here); this code relies on the instruction passed as its trailing arguments being executed whether or not the branch is taken.
 212         mov DST,TMP1    ! Save return value
 213         mov #11,r0      ! Check if small number of bytes
 214         cmp/hs r0,COUNT ! T = (count >= 11), unsigned
 215                         ! COUNT becomes src end address
 216         SL(bf, L_small, add SRC,COUNT) ! below 11 bytes: byte loop only
 217         mov #1,r1
 218         tst r1,SRC      ! check if source even
 219         SL(bt, L_even, mov COUNT,r7) ! r7 = src end address, needed on both paths
 220         mov.b @SRC+,r0  ! no, make it even.
 221         mov.b r0,@DST
 222         add #1,DST
 223 L_even: tst r1,DST      ! check if destination is even
 224         add #-3,r7      ! r7 = src end address minus 3: limit for the longword loops
 225         SL(bf, L_odddst, mov #2,r1) ! r1 = 2 for the bit 1 tests, needed on both paths
 226         tst r1,DST      ! check if destination is 4-byte aligned
 227         mov DST,r0
 228         SL(bt, L_al4dst, sub SRC,r0) ! r0 = destination minus source; r0 + source tracks the store address from here on
 229         mov.w @SRC+,TMP0 ! destination is 2 mod 4: copy one word
 230         mov.w TMP0,@DST
 231         ! add #2,DST  DST is dead here.
 232 L_al4dst:
 233         tst r1,SRC      ! is the source 4-byte aligned as well?
 234         bt L_al4both
 235         mov.w @SRC+,r1  ! no: preload one word
 236         swap.w r1,r1    ! move it to the upper half, where xtrct picks it up
 237         add #-6,r0      ! bias r0: stores happen after a 4-byte post-increment, 2 bytes behind the reads
 238         add #-6,r7      ! r7 := src end address minus 9.
 239         .align 2
 240 L_2l_loop:
 241         mov.l @SRC+,TMP0 ! Read & write two longwords per iteration
 242         xtrct TMP0,r1    ! r1 = pending word in the low half, low word of the new longword in the high half
 243         mov.l r1,@(r0,SRC)
 244         cmp/hs r7,SRC    ! T = (source >= r7): this is the last iteration
 245         mov.l @SRC+,r1
 246         xtrct r1,TMP0    ! same merge with the roles of the two registers exchanged
 247         mov.l TMP0,@(r0,SRC)
 248         bf L_2l_loop
 249         add #-2,SRC      ! the upper word of r1 was read but not stored: step the source back
 250         bra  L_cleanup
 251         add #5,r0        ! delay slot: rebias r0 for the byte loop
 252 L_al4both:
 253         add #-4,r0       ! bias r0: the store happens after the 4-byte post-increment
 254         .align 2
 255 L_al4both_loop:
 256         mov.l @SRC+,DST   ! Read longword, write longword per iteration (the destination register is only a scratch value here)
 257         cmp/hs r7,SRC
 258         SL(bf, L_al4both_loop, mov.l DST,@(r0,SRC))
 259
 260         bra L_cleanup
 261         add #3,r0        ! delay slot: rebias r0 for the byte loop
 262
 263 L_odddst:
 264         tst r1,SRC       ! r1 = 2: is the source 4-byte aligned?
 265         SL(bt, L_al4src, add #-1,DST) ! destination pointer biased by -1, so the word store at offset 2 below is aligned
 266         mov.w @SRC+,r0   ! no: copy one word as two bytes
 267         mov.b r0,@(1,DST)
 268         shlr8 r0
 269         mov.b r0,@(2,DST)
 270         add #2,DST
 271 L_al4src:
 272         .align 2
 273 L_odd_loop:
 274         mov.l @SRC+,r0   ! Read longword, write byte, word, byte per iteration
 275         cmp/hs r7,SRC    ! T is kept until the branch; the shifts below do not change it
 276         mov.b r0,@(1,DST)
 277         shlr8 r0
 278         mov.w r0,@(2,DST)
 279         shlr16 r0
 280         mov.b r0,@(4,DST)
 281         SL(bf, L_odd_loop, add #4,DST)
 282         .align 2 ! avoid nop in more frequently executed code.
 283 L_cleanup2:
 284         mov     DST,r0   ! destination pointer carries a -1 bias here
 285         sub     SRC,r0   ! r0 = true destination minus source minus 1
 286 L_cleanup:
 287         cmp/eq  COUNT,SRC ! source end reached already?
 288         bt      L_ready
 289         .align 2
 290 L_cleanup_loop:
 291         mov.b   @SRC+,r1
 292         cmp/eq  COUNT,SRC
 293         mov.b   r1,@(r0,SRC) ! source is already incremented, hence the -1 bias in r0
 294         bf      L_cleanup_loop
 295 L_ready:
 296         rts
 297         mov     TMP1,RESULT ! delay slot: return the destination saved on entry
 298 L_small:
 299         bra L_cleanup2
 300         add #-1,DST      ! delay slot: apply the -1 bias expected at L_cleanup2
 301 #else /* ! __LITTLE_ENDIAN__ */
 302         ! Big endian version copies with decreasing addresses.  r0 walks down from the destination end and the source register is turned into an offset, so that @(r0,offset) addresses the source data matching the next pre-decrement store.  SL comes from asm.h (not shown here); this code relies on the instruction passed as its trailing arguments being executed whether or not the branch is taken.
 303         mov DST,r0
 304         add COUNT,r0     ! r0 = destination end
 305         sub DST,SRC      ! offset = source minus destination
 306         mov #11,r1
 307         cmp/hs r1,COUNT  ! T = (count >= 11), unsigned
 308         SL(bf, L_small, add #-1,SRC) ! offset minus 1 on both paths; below 11 bytes: byte loop only
 309         mov SRC,TMP1
 310         add r0,TMP1      ! address of the last source byte
 311         shlr TMP1        ! T = low bit of that address; the register keeps the address divided by 2
 312         SL(bt, L_even,
 313         mov DST,r7)      ! T set: the source end is even; r7 = destination start on both paths
 314         mov.b @(r0,SRC),TMP0 ! source end is odd: copy one byte
 315         add #-1,TMP1     ! keep the low bit usable for the later 4-byte alignment test
 316         mov.b TMP0,@-r0
 317 L_even:
 318         tst #1,r0        ! is the destination end even?
 319         add #-1,SRC      ! offset minus 2: @(r0,offset) now addresses the source word for r0 - 2
 320         SL(bf, L_odddst, add #8,r7) ! r7 = destination start plus 8 on both paths: loop limit
 321         tst #2,r0        ! is the destination end 4-byte aligned?
 322         bt L_al4dst
 323         add #-1,TMP1     ! account for the word copied next
 324         mov.w @(r0,SRC),r1 ! no: copy one word
 325         mov.w r1,@-r0
 326 L_al4dst:
 327         shlr TMP1        ! T set: the source end is 4-byte aligned as well
 328         bt L_al4both
 329         mov.w @(r0,SRC),r1 ! no: preload one word
 330         swap.w r1,r1     ! move it to the upper half, where xtrct picks it up
 331         add #4,r7        ! r7 = destination start plus 12
 332         add #-4,SRC      ! offset minus 6: longword reads run 2 bytes ahead of the stores
 333         .align 2
 334 L_2l_loop:
 335         mov.l @(r0,SRC),TMP0 ! read and write two longwords per iteration
 336         xtrct TMP0,r1    ! r1 = pending word in the low half, low word of the new longword in the high half
 337         mov.l r1,@-r0
 338         cmp/hs r7,r0     ! T = (r0 >= r7): another iteration follows
 339         mov.l @(r0,SRC),r1
 340         xtrct r1,TMP0    ! same merge with the roles of the two registers exchanged
 341         mov.l TMP0,@-r0
 342         bt L_2l_loop
 343         bra L_cleanup
 344         add #5,SRC       ! delay slot: back to the byte loop offset (source minus destination minus 1)
 345
 346         nop ! avoid nop in executed code.
 347 L_al4both:
 348         add #-2,SRC      ! offset minus 4: @(r0,offset) addresses the source longword for r0 - 4
 349         .align 2
 350 L_al4both_loop:
 351         mov.l @(r0,SRC),r1 ! read longword, write longword per iteration
 352         cmp/hs r7,r0
 353         SL(bt, L_al4both_loop,
 354         mov.l r1,@-r0)
 355         bra L_cleanup
 356         add #3,SRC       ! delay slot: back to the byte loop offset
 357
 358         nop ! avoid nop in executed code.
 359 L_odddst:
 360         shlr TMP1        ! T set: the source end is 4-byte aligned
 361         bt L_al4src
 362         mov.w @(r0,SRC),r1 ! no: copy one word as two bytes
 363         mov.b r1,@-r0
 364         shlr8 r1
 365         mov.b r1,@-r0
 366 L_al4src:
 367         add #-2,SRC      ! offset minus 4: @(r0,offset) addresses the source longword for r0 - 4
 368         .align 2
 369 L_odd_loop:
 370         mov.l @(r0,SRC),TMP0 ! read longword, write byte, word, byte per iteration
 371         cmp/hs r7,r0     ! T is kept until the branch; the moves and shifts below do not change it
 372         mov.b TMP0,@-r0
 373         shlr8 TMP0
 374         mov.w TMP0,@-r0
 375         shlr16 TMP0
 376         mov.b TMP0,@-r0
 377         bt L_odd_loop
 378
 379         add #3,SRC       ! back to the byte loop offset
 380 L_cleanup:
 381 L_small:
 382         cmp/eq DST,r0    ! destination start reached already?
 383         bt L_ready
 384         add #1,DST       ! the loop test below runs before the pre-decrement store, so compare against start plus 1
 385         .align 2
 386 L_cleanup_loop:
 387         mov.b @(r0,SRC),TMP0
 388         cmp/eq DST,r0
 389         mov.b TMP0,@-r0
 390         bf L_cleanup_loop
 391 L_ready:
 392         rts
 393         mov r0,RESULT    ! delay slot: r0 has come back down to the destination start
 394 #endif /* ! __LITTLE_ENDIAN__ */
 395 #endif /* ! SHMEDIA */