OSDN Git Service

Remove NEON optimizations for memcpy
author: David 'Digit' Turner <digit@google.com>
Sun, 27 Sep 2009 14:08:46 +0000 (07:08 -0700)
committer: David 'Digit' Turner <digit@google.com>
Sun, 27 Sep 2009 14:08:46 +0000 (07:08 -0700)
libc/arch-arm/bionic/memcpy.S

index 4ea2c6d..fcb58cd 100644 (file)
 
 #include <machine/cpu-features.h>
 
-#if __ARM_ARCH__ == 7 || defined(__ARM_NEON__)
-
-               .text
-               .fpu    neon
-
-               .global memcpy
-               .type memcpy, %function
-               .align 4
-
-#define NEON_MAX_PREFETCH_DISTANCE 320
-
-memcpy:
-        .fnstart
-               mov     ip, r0
-               cmp     r2, #16
-               blt     4f      @ Have less than 16 bytes to copy
-
-               @ First ensure 16 byte alignment for the destination buffer
-               tst     r0, #0xF
-               beq     2f
-               tst     r0, #1
-               ldrneb  r3, [r1], #1
-               strneb  r3, [ip], #1
-               subne   r2, r2, #1
-               tst     ip, #2
-               ldrneb  r3, [r1], #1
-               strneb  r3, [ip], #1
-               ldrneb  r3, [r1], #1
-               strneb  r3, [ip], #1
-               subne   r2, r2, #2
-
-               tst     ip, #4
-               beq     1f
-               vld4.8  {d0[0], d1[0], d2[0], d3[0]}, [r1]!
-               vst4.8  {d0[0], d1[0], d2[0], d3[0]}, [ip, :32]!
-               sub     r2, r2, #4
-1:
-               tst     ip, #8
-               beq     2f
-               vld1.8  {d0}, [r1]!
-               vst1.8  {d0}, [ip, :64]!
-               sub     r2, r2, #8
-2:
-               subs    r2, r2, #32
-               blt     3f
-               mov     r3, #32
-
-               @ Main copy loop, 32 bytes are processed per iteration.
-               @ ARM instructions are used for doing fine-grained prefetch,
-               @ increasing prefetch distance progressively up to
-               @ NEON_MAX_PREFETCH_DISTANCE at runtime
-1:
-               vld1.8  {d0-d3}, [r1]!
-               cmp     r3, #(NEON_MAX_PREFETCH_DISTANCE - 32)
-               pld     [r1, r3]
-               addle   r3, r3, #32
-               vst1.8  {d0-d3}, [ip, :128]!
-               sub     r2, r2, #32
-               cmp     r2, r3
-               bge     1b
-               cmp     r2, #0
-               blt     3f
-1:             @ Copy the remaining part of the buffer (already prefetched)
-               vld1.8  {d0-d3}, [r1]!
-               subs    r2, r2, #32
-               vst1.8  {d0-d3}, [ip, :128]!
-               bge     1b
-3:             @ Copy up to 31 remaining bytes
-               tst     r2, #16
-               beq     4f
-               vld1.8  {d0, d1}, [r1]!
-               vst1.8  {d0, d1}, [ip, :128]!
-4:
-               @ Use ARM instructions exclusively for the final trailing part
-               @ not fully fitting into full 16 byte aligned block in order
-               @ to avoid "ARM store after NEON store" hazard. Also NEON
-               @ pipeline will be (mostly) flushed by the time when the
-               @ control returns to the caller, making the use of NEON mostly
-               @ transparent (and avoiding hazards in the caller code)
-
-               movs    r3, r2, lsl #29
-               bcc     1f
-       .rept   8
-               ldrcsb  r3, [r1], #1
-               strcsb  r3, [ip], #1
-       .endr
-1:
-               bpl     1f
-       .rept   4
-               ldrmib  r3, [r1], #1
-               strmib  r3, [ip], #1
-       .endr
-1:
-               movs    r2, r2, lsl #31
-               ldrcsb  r3, [r1], #1
-               strcsb  r3, [ip], #1
-               ldrcsb  r3, [r1], #1
-               strcsb  r3, [ip], #1
-               ldrmib  r3, [r1], #1
-               strmib  r3, [ip], #1
-               bx      lr
-        .fnend
-
-#else  /* __ARM_ARCH__ < 7 */
-
        .text
 
     .global memcpy
@@ -490,5 +385,3 @@ copy_last_3_and_return:
                bx                      lr
         .fnend
 
-#endif
-