From b7ec7cac7d48f314e5a49d727f97e23c34f9a88c Mon Sep 17 00:00:00 2001
From: Chitti Babu Theegala
Date: Fri, 16 Dec 2016 02:13:28 +0530
Subject: [PATCH] Fix streaming(memcpy) performance on Cortex-A7

Stream-mode detection for the L1 cache on the Cortex-A7 core fails for
addresses that are not cache-line-size (64 byte) aligned, so destination
data gets cached unnecessarily. ARM has confirmed this A7 behaviour. The
fix is to align the destination address to a 64-byte boundary before
entering the main loop of the memcpy routine.

Although the micro_bench memcpy score drops when the L1 cache is
bypassed, this is desirable: it avoids evicting other processes' data
from L1, which is better for overall system performance. The higher
micro_bench memcpy numbers seen with less-than-64-byte alignment come
at the cost of L1 cache pollution: during memcpy/memset, unnecessary
data is pulled into L1 and evicts other processes' data. For example,
during memset(0) the L1 cache gets filled with zeros, which should be
avoided.

Additionally, there is a second Cortex-A7 issue that hurts performance
for all alignments and all Android Wear versions: the A7 store buffer
is 32 bytes, which limits back-to-back 32-byte stores. The current
implementation issues back-to-back 32-byte writes and therefore stalls
the CPU. Interleaving the loads and stores avoids these stalls by
making efficient use of the A7's internal load and store buffers.

Bug: 31917556
Change-Id: Ie5f12f2bb5d86f627686730416279057e4f5f6d0
(cherry picked from commit cbfdc7f9054e0bc8071aaf3a70afc00273a8a869)
---
 libc/arch-arm/cortex-a7/bionic/memcpy_base.S | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

diff --git a/libc/arch-arm/cortex-a7/bionic/memcpy_base.S b/libc/arch-arm/cortex-a7/bionic/memcpy_base.S
index 1d152bbc1..4ff982b0f 100644
--- a/libc/arch-arm/cortex-a7/bionic/memcpy_base.S
+++ b/libc/arch-arm/cortex-a7/bionic/memcpy_base.S
@@ -101,16 +101,38 @@
         vld1.8  {d0}, [r1]!
         vst1.8  {d0}, [r0, :64]!
 
-2:      // Make sure we have at least 64 bytes to copy.
+2:      cmp     r2, #256
+        ble     .L_copy_loop
+
+        // Make sure DST is 64 BYTE aligned.
+        rsb     r3, r0, #0
+        ands    r3, r3, #0x30
+        beq     .L_copy_loop
+
+        sub     r2, r2, r3
+        cmp     r3, #0x10
+        beq     .L_copy_16
+
+        vld1.8  {d0 - d3}, [r1]!
+        vst1.8  {d0 - d3}, [r0, :128]!
+        ands    r3, r3, #0x10
+        beq     .L_copy_loop
+
+.L_copy_16:
+        vld1.8  {d0, d1}, [r1]!
+        vst1.8  {d0, d1}, [r0, :128]!
+
+.L_copy_loop:
+        // Make sure we have at least 64 bytes to copy.
         subs    r2, r2, #64
         blo     2f
 
 1:      // The main loop copies 64 bytes at a time.
         vld1.8  {d0 - d3}, [r1]!
-        vld1.8  {d4 - d7}, [r1]!
+        vst1.8  {d0 - d3}, [r0, :128]!
         pld     [r1, #(64*4)]
         subs    r2, r2, #64
-        vst1.8  {d0 - d3}, [r0, :128]!
+        vld1.8  {d4 - d7}, [r1]!
+        vst1.8  {d4 - d7}, [r0, :128]!
         bhs     1b

-- 
2.11.0
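
Note (not part of the patch): the destination-alignment step that the new
assembly performs with "rsb r3, r0, #0" / "ands r3, r3, #0x30" before the
64-byte main loop can be sketched in C roughly as below. The helper name
align_dst_head and its parameters are invented for illustration; the real
routine works on NEON d-registers and relies on r0 already being 16-byte
aligned when it reaches label 2.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Rough C model of the head-alignment logic added before .L_copy_loop.
     * Copies 0, 16, 32 or 48 bytes so that dst becomes 64-byte aligned,
     * and returns how many bytes were consumed. */
    static size_t align_dst_head(unsigned char *dst, const unsigned char *src,
                                 size_t len)
    {
        if (len <= 256)          /* "cmp r2, #256; ble .L_copy_loop" */
            return 0;

        /* "rsb r3, r0, #0; ands r3, r3, #0x30": bytes needed to reach the
         * next 64-byte boundary, in 16-byte units (dst is assumed to be
         * 16-byte aligned at this point, as in the assembly). */
        size_t head = (size_t)(-(uintptr_t)dst) & 0x30;

        memcpy(dst, src, head);  /* the asm does this with vld1/vst1 pairs */
        return head;             /* caller advances dst/src by this amount */
    }

After this head copy, len minus the returned value is what remains for the
64-byte main loop, matching the "sub r2, r2, r3" in the patch.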