OSDN Git Service

Add optimized version of memset for Cortex A9
author Henrik Smiding <henrik.smiding@stericsson.com>
Fri, 5 Nov 2010 14:07:53 +0000 (15:07 +0100)
committer Elliott Hughes <enh@google.com>
Fri, 9 Nov 2012 23:05:32 +0000 (15:05 -0800)
Adds new code to function memset, optimized for Cortex A9.

Copyright (C) ST-Ericsson SA 2010

Added NEON implementation.

Author: Henrik Smiding henrik.smiding@stericsson.com for ST-Ericsson.

Change-Id: Id3c87767953439269040e15bd30a27aba709aef6
Signed-off-by: Christian Bejram <christian.bejram@stericsson.com>
libc/arch-arm/bionic/memset.S

index 273b9e3..54f74de 100644 (file)
  * SUCH DAMAGE.
  */
 
+#include <machine/cpu-features.h>
 #include <machine/asm.h>
-       
+
                /*
                 * Optimized memset() for ARM.
          *
          * memset() returns its first argument.
                 */
-       
+
+#if defined(__ARM_NEON__)
+    .fpu    neon
+#endif
+
 /*
  * bzero: rearranges its arguments into the registers that memset (below)
  * uses -- r2 as the byte count and r1 as the fill value -- with the fill
  * value forced to zero.  r0 is left untouched.  There is no return
  * instruction before END(bzero), so execution continues straight into
  * memset.
  * NOTE(review): this assumes anything ENTRY(memset) emits between the two
  * symbols (e.g. alignment padding) is safe to execute through -- confirm
  * against the ENTRY macro in <machine/asm.h>.
  */
 ENTRY(bzero)
         mov     r2, r1          /* r2 = count (bzero's second argument) */
         mov     r1, #0          /* r1 = 0, the byte value memset will store */
 END(bzero)
 
 ENTRY(memset)
+#if defined(__ARM_NEON__)
+
+#ifdef  NEON_MEMSET_DIVIDER
+        cmp         r2, #NEON_MEMSET_DIVIDER
+        bhi         11f
+#endif
+        .save       {r0}
+        stmfd       sp!, {r0}
+
+        vdup.8      q0, r1
+
+#ifndef NEON_UNALIGNED_ACCESS
+        /* do we have at least 16-bytes to write (needed for alignment below) */
+        cmp         r2, #16
+        blo         3f
+
+        /* align destination to 16 bytes for the write-buffer */
+        rsb         r3, r0, #0
+        ands        r3, r3, #0xF
+        beq         2f
+
+        /* write up to 15-bytes (count in r3) */
+        sub         r2, r2, r3
+        movs        ip, r3, lsl #31
+        strmib      r1, [r0], #1
+        strcsb      r1, [r0], #1
+        strcsb      r1, [r0], #1
+        movs        ip, r3, lsl #29
+        bge         1f
+
+        // writes 4 bytes, 32-bits aligned
+        vst1.32     {d0[0]}, [r0, :32]!
+1:      bcc         2f
+
+        // writes 8 bytes, 64-bits aligned
+        vst1.8      {d0}, [r0, :64]!
+2:
+#endif
+        /* make sure we have at least 32 bytes to write */
+        subs        r2, r2, #32
+        blo         2f
+        vmov        q1, q0
+
+1:      /* The main loop writes 32 bytes at a time */
+        subs        r2, r2, #32
+#ifndef NEON_UNALIGNED_ACCESS
+        vst1.8      {d0 - d3}, [r0, :128]!
+#else
+        vst1.8      {d0 - d3}, [r0]!
+#endif
+        bhs         1b
+
+2:      /* less than 32 left */
+        add         r2, r2, #32
+        tst         r2, #0x10
+        beq         3f
+
+        // writes 16 bytes, 128-bits aligned
+#ifndef NEON_UNALIGNED_ACCESS
+        vst1.8      {d0, d1}, [r0, :128]!
+#else
+        vst1.8      {d0, d1}, [r0]!
+#endif
+3:      /* write up to 15-bytes (count in r2) */
+        movs        ip, r2, lsl #29
+        bcc         1f
+        vst1.8      {d0}, [r0]!
+1:      bge         2f
+        vst1.32     {d0[0]}, [r0]!
+2:      movs        ip, r2, lsl #31
+        strmib      r1, [r0], #1
+        strcsb      r1, [r0], #1
+        strcsb      r1, [r0], #1
+        ldmfd       sp!, {r0}
+        bx          lr
+11:
+#endif
+
+        /*
+         * Optimized memset() for ARM.
+         *
+         * memset() returns its first argument.
+         */
+
                /* compute the offset to align the destination
                 * offset = (4-(src&3))&3 = -src & 3
                 */
+
         .save       {r0, r4-r7, lr}
                stmfd           sp!, {r0, r4-r7, lr}
                rsb                     r3, r0, #0
@@ -70,7 +160,7 @@ ENTRY(memset)
         mov         r5, r1
         mov         r6, r1
         mov         r7, r1
-        
+
                rsb         r3, r0, #0
                ands            r3, r3, #0x1C
                beq         3f
@@ -78,7 +168,7 @@ ENTRY(memset)
                andhi           r3, r2, #0x1C
                sub         r2, r2, r3
 
-               /* conditionnaly writes 0 to 7 words (length in r3) */
+               /* conditionally writes 0 to 7 words (length in r3) */
                movs            r3, r3, lsl #28
                stmcsia         r0!, {r1, lr}
                stmcsia         r0!, {r1, lr}
@@ -95,7 +185,7 @@ ENTRY(memset)
         bhs         1b
 2:      add         r2, r2, #32
 
-               /* conditionnaly stores 0 to 31 bytes */
+               /* conditionally stores 0 to 31 bytes */
                movs            r2, r2, lsl #28
                stmcsia         r0!, {r1,r3,r12,lr}
                stmmiia         r0!, {r1, lr}