From 2a0bffd2ae4725a89b400279a2d58f07ff610685 Mon Sep 17 00:00:00 2001 From: Varvara Rainchik Date: Wed, 30 Jul 2014 17:01:24 +0400 Subject: [PATCH] Add x86_64 optimized __memcmp16 implementation; fix tabs in 32-bit implementation. Change-Id: I7bbfb344074aed66511c1a845998dc38798116ea Signed-off-by: Varvara Rainchik --- runtime/Android.mk | 1 + runtime/arch/memcmp16.h | 2 +- runtime/arch/x86/memcmp16_x86.S | 1812 ++++++++++++++++----------------- runtime/arch/x86_64/memcmp16_x86_64.S | 1210 ++++++++++++++++++++++ 4 files changed, 2118 insertions(+), 907 deletions(-) create mode 100755 runtime/arch/x86_64/memcmp16_x86_64.S diff --git a/runtime/Android.mk b/runtime/Android.mk index 8fc5e343e..302e835e7 100644 --- a/runtime/Android.mk +++ b/runtime/Android.mk @@ -252,6 +252,7 @@ LIBART_SRC_FILES_x86_64 := \ arch/x86_64/context_x86_64.cc \ arch/x86_64/entrypoints_init_x86_64.cc \ arch/x86_64/jni_entrypoints_x86_64.S \ + arch/x86_64/memcmp16_x86_64.S \ arch/x86_64/portable_entrypoints_x86_64.S \ arch/x86_64/quick_entrypoints_x86_64.S \ arch/x86_64/thread_x86_64.cc \ diff --git a/runtime/arch/memcmp16.h b/runtime/arch/memcmp16.h index 65d2f92d7..14dc1e388 100644 --- a/runtime/arch/memcmp16.h +++ b/runtime/arch/memcmp16.h @@ -30,7 +30,7 @@ // // In both cases, MemCmp16 is declared. -#if defined(__aarch64__) || defined(__arm__) || defined(__mips) || defined(__i386__) +#if defined(__aarch64__) || defined(__arm__) || defined(__mips) || defined(__i386__) || defined(__x86_64__) extern "C" uint32_t __memcmp16(const uint16_t* s0, const uint16_t* s1, size_t count); #define MemCmp16 __memcmp16 diff --git a/runtime/arch/x86/memcmp16_x86.S b/runtime/arch/x86/memcmp16_x86.S index 17662fae6..a315a378e 100644 --- a/runtime/arch/x86/memcmp16_x86.S +++ b/runtime/arch/x86/memcmp16_x86.S @@ -21,1018 +21,1018 @@ /* int32_t memcmp16_compare(const uint16_t* s0, const uint16_t* s1, size_t count); */ #ifndef L -# define L(label) .L##label +# define L(label) .L##label #endif -#define CFI_PUSH(REG) \ - CFI_ADJUST_CFA_OFFSET(4); \ - CFI_REL_OFFSET(REG, 0) +#define CFI_PUSH(REG) \ + CFI_ADJUST_CFA_OFFSET(4); \ + CFI_REL_OFFSET(REG, 0) -#define CFI_POP(REG) \ - CFI_ADJUST_CFA_OFFSET(-4); \ - CFI_RESTORE(REG) +#define CFI_POP(REG) \ + CFI_ADJUST_CFA_OFFSET(-4); \ + CFI_RESTORE(REG) -#define PUSH(REG) pushl REG; CFI_PUSH (REG) -#define POP(REG) popl REG; CFI_POP (REG) +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) -#define PARMS 4 -#define BLK1 PARMS -#define BLK2 BLK1+4 -#define LEN BLK2+4 -#define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret -#define RETURN RETURN_END; CFI_RESTORE_STATE; CFI_REMEMBER_STATE +#define PARMS 4 +#define BLK1 PARMS +#define BLK2 BLK1+4 +#define LEN BLK2+4 +#define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret +#define RETURN RETURN_END; CFI_RESTORE_STATE; CFI_REMEMBER_STATE DEFINE_FUNCTION MEMCMP - movl LEN(%esp), %ecx + movl LEN(%esp), %ecx - shl $1, %ecx - jz L(zero) + shl $1, %ecx + jz L(zero) - movl BLK1(%esp), %eax - cmp $48, %ecx - movl BLK2(%esp), %edx - jae L(48bytesormore) + movl BLK1(%esp), %eax + cmp $48, %ecx + movl BLK2(%esp), %edx + jae L(48bytesormore) - PUSH (%ebx) - add %ecx, %edx - add %ecx, %eax - jmp L(less48bytes) + PUSH (%ebx) + add %ecx, %edx + add %ecx, %eax + jmp L(less48bytes) - CFI_POP (%ebx) + CFI_POP (%ebx) - .p2align 4 + .p2align 4 L(zero): - xor %eax, %eax - ret + xor %eax, %eax + ret - .p2align 4 + .p2align 4 L(48bytesormore): - PUSH (%ebx) - PUSH (%esi) - PUSH (%edi) - CFI_REMEMBER_STATE - movdqu (%eax), %xmm3 - 
movdqu (%edx), %xmm0 - movl %eax, %edi - movl %edx, %esi - pcmpeqb %xmm0, %xmm3 - pmovmskb %xmm3, %edx - lea 16(%edi), %edi - - sub $0xffff, %edx - lea 16(%esi), %esi - jnz L(less16bytes) - mov %edi, %edx - and $0xf, %edx - xor %edx, %edi - sub %edx, %esi - add %edx, %ecx - mov %esi, %edx - and $0xf, %edx - jz L(shr_0) - xor %edx, %esi - - cmp $0, %edx - je L(shr_0) - cmp $2, %edx - je L(shr_2) - cmp $4, %edx - je L(shr_4) - cmp $6, %edx - je L(shr_6) - cmp $8, %edx - je L(shr_8) - cmp $10, %edx - je L(shr_10) - cmp $12, %edx - je L(shr_12) - jmp L(shr_14) - - .p2align 4 + PUSH (%ebx) + PUSH (%esi) + PUSH (%edi) + CFI_REMEMBER_STATE + movdqu (%eax), %xmm3 + movdqu (%edx), %xmm0 + movl %eax, %edi + movl %edx, %esi + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx + lea 16(%edi), %edi + + sub $0xffff, %edx + lea 16(%esi), %esi + jnz L(less16bytes) + mov %edi, %edx + and $0xf, %edx + xor %edx, %edi + sub %edx, %esi + add %edx, %ecx + mov %esi, %edx + and $0xf, %edx + jz L(shr_0) + xor %edx, %esi + + cmp $0, %edx + je L(shr_0) + cmp $2, %edx + je L(shr_2) + cmp $4, %edx + je L(shr_4) + cmp $6, %edx + je L(shr_6) + cmp $8, %edx + je L(shr_8) + cmp $10, %edx + je L(shr_10) + cmp $12, %edx + je L(shr_12) + jmp L(shr_14) + + .p2align 4 L(shr_0): - cmp $80, %ecx - jae L(shr_0_gobble) - lea -48(%ecx), %ecx - xor %eax, %eax - movaps (%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - movaps 16(%esi), %xmm2 - pcmpeqb 16(%edi), %xmm2 - pand %xmm1, %xmm2 - pmovmskb %xmm2, %edx - add $32, %edi - add $32, %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea (%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + cmp $80, %ecx + jae L(shr_0_gobble) + lea -48(%ecx), %ecx + xor %eax, %eax + movaps (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + movaps 16(%esi), %xmm2 + pcmpeqb 16(%edi), %xmm2 + pand %xmm1, %xmm2 + pmovmskb %xmm2, %edx + add $32, %edi + add $32, %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea (%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_0_gobble): - lea -48(%ecx), %ecx - movdqa (%esi), %xmm0 - xor %eax, %eax - pcmpeqb (%edi), %xmm0 - sub $32, %ecx - movdqa 16(%esi), %xmm2 - pcmpeqb 16(%edi), %xmm2 + lea -48(%ecx), %ecx + movdqa (%esi), %xmm0 + xor %eax, %eax + pcmpeqb (%edi), %xmm0 + sub $32, %ecx + movdqa 16(%esi), %xmm2 + pcmpeqb 16(%edi), %xmm2 L(shr_0_gobble_loop): - pand %xmm0, %xmm2 - sub $32, %ecx - pmovmskb %xmm2, %edx - movdqa %xmm0, %xmm1 - movdqa 32(%esi), %xmm0 - movdqa 48(%esi), %xmm2 - sbb $0xffff, %edx - pcmpeqb 32(%edi), %xmm0 - pcmpeqb 48(%edi), %xmm2 - lea 32(%edi), %edi - lea 32(%esi), %esi - jz L(shr_0_gobble_loop) - - pand %xmm0, %xmm2 - cmp $0, %ecx - jge L(shr_0_gobble_loop_next) - inc %edx - add $32, %ecx + pand %xmm0, %xmm2 + sub $32, %ecx + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + movdqa 32(%esi), %xmm0 + movdqa 48(%esi), %xmm2 + sbb $0xffff, %edx + pcmpeqb 32(%edi), %xmm0 + pcmpeqb 48(%edi), %xmm2 + lea 32(%edi), %edi + lea 32(%esi), %esi + jz L(shr_0_gobble_loop) + + pand %xmm0, %xmm2 + cmp $0, %ecx + jge L(shr_0_gobble_loop_next) + inc %edx + add $32, %ecx L(shr_0_gobble_loop_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm2, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea (%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - 
.p2align 4 + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea (%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_2): - cmp $80, %ecx - lea -48(%ecx), %ecx - mov %edx, %eax - jae L(shr_2_gobble) - - movdqa 16(%esi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $2,(%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - - movdqa 32(%esi), %xmm3 - palignr $2,%xmm2, %xmm3 - pcmpeqb 16(%edi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea 2(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_2_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $2,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $2,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 2(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_2_gobble): - sub $32, %ecx - movdqa 16(%esi), %xmm0 - palignr $2,(%esi), %xmm0 - pcmpeqb (%edi), %xmm0 + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $2,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 - movdqa 32(%esi), %xmm3 - palignr $2,16(%esi), %xmm3 - pcmpeqb 16(%edi), %xmm3 + movdqa 32(%esi), %xmm3 + palignr $2,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 L(shr_2_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %ecx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%esi), %xmm3 - palignr $2,48(%esi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%esi), %xmm0 - palignr $2,32(%esi), %xmm0 - pcmpeqb 32(%edi), %xmm0 - lea 32(%esi), %esi - pcmpeqb 48(%edi), %xmm3 - - lea 32(%edi), %edi - jz L(shr_2_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %ecx - jge L(shr_2_gobble_next) - inc %edx - add $32, %ecx + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $2,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $2,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_2_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_2_gobble_next) + inc %edx + add $32, %ecx L(shr_2_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea 2(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 2(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_4): - cmp $80, %ecx - lea -48(%ecx), %ecx - mov %edx, %eax - jae L(shr_4_gobble) - - movdqa 16(%esi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $4,(%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - - movdqa 32(%esi), %xmm3 - palignr $4,%xmm2, %xmm3 - pcmpeqb 16(%edi), %xmm3 - 
- pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea 4(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_4_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $4,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $4,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 4(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_4_gobble): - sub $32, %ecx - movdqa 16(%esi), %xmm0 - palignr $4,(%esi), %xmm0 - pcmpeqb (%edi), %xmm0 + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $4,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 - movdqa 32(%esi), %xmm3 - palignr $4,16(%esi), %xmm3 - pcmpeqb 16(%edi), %xmm3 + movdqa 32(%esi), %xmm3 + palignr $4,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 L(shr_4_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %ecx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%esi), %xmm3 - palignr $4,48(%esi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%esi), %xmm0 - palignr $4,32(%esi), %xmm0 - pcmpeqb 32(%edi), %xmm0 - lea 32(%esi), %esi - pcmpeqb 48(%edi), %xmm3 - - lea 32(%edi), %edi - jz L(shr_4_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %ecx - jge L(shr_4_gobble_next) - inc %edx - add $32, %ecx + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $4,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $4,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_4_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_4_gobble_next) + inc %edx + add $32, %ecx L(shr_4_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea 4(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 4(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_6): - cmp $80, %ecx - lea -48(%ecx), %ecx - mov %edx, %eax - jae L(shr_6_gobble) - - movdqa 16(%esi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $6,(%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - - movdqa 32(%esi), %xmm3 - palignr $6,%xmm2, %xmm3 - pcmpeqb 16(%edi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea 6(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_6_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $6,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $6,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 
32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 6(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_6_gobble): - sub $32, %ecx - movdqa 16(%esi), %xmm0 - palignr $6,(%esi), %xmm0 - pcmpeqb (%edi), %xmm0 + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $6,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 - movdqa 32(%esi), %xmm3 - palignr $6,16(%esi), %xmm3 - pcmpeqb 16(%edi), %xmm3 + movdqa 32(%esi), %xmm3 + palignr $6,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 L(shr_6_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %ecx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%esi), %xmm3 - palignr $6,48(%esi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%esi), %xmm0 - palignr $6,32(%esi), %xmm0 - pcmpeqb 32(%edi), %xmm0 - lea 32(%esi), %esi - pcmpeqb 48(%edi), %xmm3 - - lea 32(%edi), %edi - jz L(shr_6_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %ecx - jge L(shr_6_gobble_next) - inc %edx - add $32, %ecx + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $6,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $6,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_6_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_6_gobble_next) + inc %edx + add $32, %ecx L(shr_6_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea 6(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 6(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_8): - cmp $80, %ecx - lea -48(%ecx), %ecx - mov %edx, %eax - jae L(shr_8_gobble) - - movdqa 16(%esi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $8,(%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - - movdqa 32(%esi), %xmm3 - palignr $8,%xmm2, %xmm3 - pcmpeqb 16(%edi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea 8(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_8_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $8,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $8,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 8(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_8_gobble): - sub $32, %ecx - movdqa 16(%esi), %xmm0 - palignr $8,(%esi), %xmm0 - pcmpeqb (%edi), %xmm0 + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $8,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 - movdqa 32(%esi), %xmm3 - palignr $8,16(%esi), %xmm3 - pcmpeqb 16(%edi), %xmm3 + movdqa 32(%esi), %xmm3 + palignr $8,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 
L(shr_8_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %ecx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%esi), %xmm3 - palignr $8,48(%esi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%esi), %xmm0 - palignr $8,32(%esi), %xmm0 - pcmpeqb 32(%edi), %xmm0 - lea 32(%esi), %esi - pcmpeqb 48(%edi), %xmm3 - - lea 32(%edi), %edi - jz L(shr_8_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %ecx - jge L(shr_8_gobble_next) - inc %edx - add $32, %ecx + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $8,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $8,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_8_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_8_gobble_next) + inc %edx + add $32, %ecx L(shr_8_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea 8(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 8(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_10): - cmp $80, %ecx - lea -48(%ecx), %ecx - mov %edx, %eax - jae L(shr_10_gobble) - - movdqa 16(%esi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $10, (%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - - movdqa 32(%esi), %xmm3 - palignr $10,%xmm2, %xmm3 - pcmpeqb 16(%edi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea 10(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_10_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $10, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $10,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 10(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_10_gobble): - sub $32, %ecx - movdqa 16(%esi), %xmm0 - palignr $10, (%esi), %xmm0 - pcmpeqb (%edi), %xmm0 + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $10, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 - movdqa 32(%esi), %xmm3 - palignr $10, 16(%esi), %xmm3 - pcmpeqb 16(%edi), %xmm3 + movdqa 32(%esi), %xmm3 + palignr $10, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 L(shr_10_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %ecx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%esi), %xmm3 - palignr $10,48(%esi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%esi), %xmm0 - palignr $10,32(%esi), %xmm0 - pcmpeqb 32(%edi), %xmm0 - lea 32(%esi), %esi - pcmpeqb 48(%edi), %xmm3 - - lea 32(%edi), %edi - jz L(shr_10_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %ecx - jge L(shr_10_gobble_next) - inc %edx - add $32, %ecx + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr 
$10,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $10,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_10_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_10_gobble_next) + inc %edx + add $32, %ecx L(shr_10_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea 10(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 10(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_12): - cmp $80, %ecx - lea -48(%ecx), %ecx - mov %edx, %eax - jae L(shr_12_gobble) - - movdqa 16(%esi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $12, (%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - - movdqa 32(%esi), %xmm3 - palignr $12, %xmm2, %xmm3 - pcmpeqb 16(%edi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea 12(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_12_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $12, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $12, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 12(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_12_gobble): - sub $32, %ecx - movdqa 16(%esi), %xmm0 - palignr $12, (%esi), %xmm0 - pcmpeqb (%edi), %xmm0 + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $12, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 - movdqa 32(%esi), %xmm3 - palignr $12, 16(%esi), %xmm3 - pcmpeqb 16(%edi), %xmm3 + movdqa 32(%esi), %xmm3 + palignr $12, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 L(shr_12_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %ecx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%esi), %xmm3 - palignr $12,48(%esi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%esi), %xmm0 - palignr $12,32(%esi), %xmm0 - pcmpeqb 32(%edi), %xmm0 - lea 32(%esi), %esi - pcmpeqb 48(%edi), %xmm3 - - lea 32(%edi), %edi - jz L(shr_12_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %ecx - jge L(shr_12_gobble_next) - inc %edx - add $32, %ecx + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $12,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $12,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_12_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_12_gobble_next) + inc %edx + add $32, %ecx L(shr_12_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea 12(%ecx, %esi,1), %edx - POP 
(%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 12(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_14): - cmp $80, %ecx - lea -48(%ecx), %ecx - mov %edx, %eax - jae L(shr_14_gobble) - - movdqa 16(%esi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $14, (%esi), %xmm1 - pcmpeqb (%edi), %xmm1 - - movdqa 32(%esi), %xmm3 - palignr $14, %xmm2, %xmm3 - pcmpeqb 16(%edi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - lea (%ecx, %edi,1), %eax - lea 14(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_14_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $14, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $14, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 14(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(shr_14_gobble): - sub $32, %ecx - movdqa 16(%esi), %xmm0 - palignr $14, (%esi), %xmm0 - pcmpeqb (%edi), %xmm0 + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $14, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 - movdqa 32(%esi), %xmm3 - palignr $14, 16(%esi), %xmm3 - pcmpeqb 16(%edi), %xmm3 + movdqa 32(%esi), %xmm3 + palignr $14, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 L(shr_14_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %ecx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%esi), %xmm3 - palignr $14,48(%esi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%esi), %xmm0 - palignr $14,32(%esi), %xmm0 - pcmpeqb 32(%edi), %xmm0 - lea 32(%esi), %esi - pcmpeqb 48(%edi), %xmm3 - - lea 32(%edi), %edi - jz L(shr_14_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %ecx - jge L(shr_14_gobble_next) - inc %edx - add $32, %ecx + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $14,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $14,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_14_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_14_gobble_next) + inc %edx + add $32, %ecx L(shr_14_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%edi), %edi - lea 32(%esi), %esi - sub $0xffff, %edx - jnz L(exit) - - lea (%ecx, %edi,1), %eax - lea 14(%ecx, %esi,1), %edx - POP (%edi) - POP (%esi) - jmp L(less48bytes) - - CFI_RESTORE_STATE - CFI_REMEMBER_STATE - .p2align 4 + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 14(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + CFI_RESTORE_STATE + CFI_REMEMBER_STATE + .p2align 4 L(exit): - pmovmskb %xmm1, %ebx - sub $0xffff, %ebx - jz L(first16bytes) - lea -16(%esi), %esi - lea -16(%edi), %edi - mov %ebx, %edx + pmovmskb 
%xmm1, %ebx + sub $0xffff, %ebx + jz L(first16bytes) + lea -16(%esi), %esi + lea -16(%edi), %edi + mov %ebx, %edx L(first16bytes): - add %eax, %esi + add %eax, %esi L(less16bytes): - test %dl, %dl - jz L(next_four_words) - test $15, %dl - jz L(second_two_words) - test $3, %dl - jz L(second_word) - movzwl -16(%edi), %eax - movzwl -16(%esi), %ebx - subl %ebx, %eax - RETURN - - .p2align 4 + test %dl, %dl + jz L(next_four_words) + test $15, %dl + jz L(second_two_words) + test $3, %dl + jz L(second_word) + movzwl -16(%edi), %eax + movzwl -16(%esi), %ebx + subl %ebx, %eax + RETURN + + .p2align 4 L(second_word): - movzwl -14(%edi), %eax - movzwl -14(%esi), %ebx - subl %ebx, %eax - RETURN + movzwl -14(%edi), %eax + movzwl -14(%esi), %ebx + subl %ebx, %eax + RETURN - .p2align 4 + .p2align 4 L(second_two_words): - test $63, %dl - jz L(fourth_word) - movzwl -12(%edi), %eax - movzwl -12(%esi), %ebx - subl %ebx, %eax - RETURN - - .p2align 4 + test $63, %dl + jz L(fourth_word) + movzwl -12(%edi), %eax + movzwl -12(%esi), %ebx + subl %ebx, %eax + RETURN + + .p2align 4 L(fourth_word): - movzwl -10(%edi), %eax - movzwl -10(%esi), %ebx - subl %ebx, %eax - RETURN + movzwl -10(%edi), %eax + movzwl -10(%esi), %ebx + subl %ebx, %eax + RETURN - .p2align 4 + .p2align 4 L(next_four_words): - test $15, %dh - jz L(fourth_two_words) - test $3, %dh - jz L(sixth_word) - movzwl -8(%edi), %eax - movzwl -8(%esi), %ebx - subl %ebx, %eax - RETURN - - .p2align 4 + test $15, %dh + jz L(fourth_two_words) + test $3, %dh + jz L(sixth_word) + movzwl -8(%edi), %eax + movzwl -8(%esi), %ebx + subl %ebx, %eax + RETURN + + .p2align 4 L(sixth_word): - movzwl -6(%edi), %eax - movzwl -6(%esi), %ebx - subl %ebx, %eax - RETURN + movzwl -6(%edi), %eax + movzwl -6(%esi), %ebx + subl %ebx, %eax + RETURN - .p2align 4 + .p2align 4 L(fourth_two_words): - test $63, %dh - jz L(eighth_word) - movzwl -4(%edi), %eax - movzwl -4(%esi), %ebx - subl %ebx, %eax - RETURN - - .p2align 4 + test $63, %dh + jz L(eighth_word) + movzwl -4(%edi), %eax + movzwl -4(%esi), %ebx + subl %ebx, %eax + RETURN + + .p2align 4 L(eighth_word): - movzwl -2(%edi), %eax - movzwl -2(%esi), %ebx - subl %ebx, %eax - RETURN + movzwl -2(%edi), %eax + movzwl -2(%esi), %ebx + subl %ebx, %eax + RETURN - CFI_PUSH (%ebx) + CFI_PUSH (%ebx) - .p2align 4 + .p2align 4 L(more8bytes): - cmp $16, %ecx - jae L(more16bytes) - cmp $8, %ecx - je L(8bytes) - cmp $10, %ecx - je L(10bytes) - cmp $12, %ecx - je L(12bytes) - jmp L(14bytes) - - .p2align 4 + cmp $16, %ecx + jae L(more16bytes) + cmp $8, %ecx + je L(8bytes) + cmp $10, %ecx + je L(10bytes) + cmp $12, %ecx + je L(12bytes) + jmp L(14bytes) + + .p2align 4 L(more16bytes): - cmp $24, %ecx - jae L(more24bytes) - cmp $16, %ecx - je L(16bytes) - cmp $18, %ecx - je L(18bytes) - cmp $20, %ecx - je L(20bytes) - jmp L(22bytes) - - .p2align 4 + cmp $24, %ecx + jae L(more24bytes) + cmp $16, %ecx + je L(16bytes) + cmp $18, %ecx + je L(18bytes) + cmp $20, %ecx + je L(20bytes) + jmp L(22bytes) + + .p2align 4 L(more24bytes): - cmp $32, %ecx - jae L(more32bytes) - cmp $24, %ecx - je L(24bytes) - cmp $26, %ecx - je L(26bytes) - cmp $28, %ecx - je L(28bytes) - jmp L(30bytes) - - .p2align 4 + cmp $32, %ecx + jae L(more32bytes) + cmp $24, %ecx + je L(24bytes) + cmp $26, %ecx + je L(26bytes) + cmp $28, %ecx + je L(28bytes) + jmp L(30bytes) + + .p2align 4 L(more32bytes): - cmp $40, %ecx - jae L(more40bytes) - cmp $32, %ecx - je L(32bytes) - cmp $34, %ecx - je L(34bytes) - cmp $36, %ecx - je L(36bytes) - jmp L(38bytes) - - .p2align 4 + cmp $40, %ecx + jae 
L(more40bytes) + cmp $32, %ecx + je L(32bytes) + cmp $34, %ecx + je L(34bytes) + cmp $36, %ecx + je L(36bytes) + jmp L(38bytes) + + .p2align 4 L(less48bytes): - cmp $8, %ecx - jae L(more8bytes) - cmp $2, %ecx - je L(2bytes) - cmp $4, %ecx - je L(4bytes) - jmp L(6bytes) - - .p2align 4 + cmp $8, %ecx + jae L(more8bytes) + cmp $2, %ecx + je L(2bytes) + cmp $4, %ecx + je L(4bytes) + jmp L(6bytes) + + .p2align 4 L(more40bytes): - cmp $40, %ecx - je L(40bytes) - cmp $42, %ecx - je L(42bytes) - cmp $44, %ecx - je L(44bytes) - jmp L(46bytes) - - .p2align 4 + cmp $40, %ecx + je L(40bytes) + cmp $42, %ecx + je L(42bytes) + cmp $44, %ecx + je L(44bytes) + jmp L(46bytes) + + .p2align 4 L(46bytes): - movzwl -46(%eax), %ecx - movzwl -46(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -46(%eax), %ecx + movzwl -46(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(44bytes): - movzwl -44(%eax), %ecx - movzwl -44(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -44(%eax), %ecx + movzwl -44(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(42bytes): - movzwl -42(%eax), %ecx - movzwl -42(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -42(%eax), %ecx + movzwl -42(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(40bytes): - movzwl -40(%eax), %ecx - movzwl -40(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -40(%eax), %ecx + movzwl -40(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(38bytes): - movzwl -38(%eax), %ecx - movzwl -38(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -38(%eax), %ecx + movzwl -38(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(36bytes): - movzwl -36(%eax), %ecx - movzwl -36(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -36(%eax), %ecx + movzwl -36(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(34bytes): - movzwl -34(%eax), %ecx - movzwl -34(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -34(%eax), %ecx + movzwl -34(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(32bytes): - movzwl -32(%eax), %ecx - movzwl -32(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -32(%eax), %ecx + movzwl -32(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(30bytes): - movzwl -30(%eax), %ecx - movzwl -30(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -30(%eax), %ecx + movzwl -30(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(28bytes): - movzwl -28(%eax), %ecx - movzwl -28(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -28(%eax), %ecx + movzwl -28(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(26bytes): - movzwl -26(%eax), %ecx - movzwl -26(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -26(%eax), %ecx + movzwl -26(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(24bytes): - movzwl -24(%eax), %ecx - movzwl -24(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -24(%eax), %ecx + movzwl -24(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(22bytes): - movzwl -22(%eax), %ecx - movzwl -22(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -22(%eax), %ecx + movzwl -22(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(20bytes): - movzwl -20(%eax), %ecx - movzwl -20(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -20(%eax), %ecx + movzwl -20(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(18bytes): - movzwl -18(%eax), %ecx - movzwl -18(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -18(%eax), %ecx + movzwl -18(%edx), %ebx + 
subl %ebx, %ecx + jne L(memcmp16_exit) L(16bytes): - movzwl -16(%eax), %ecx - movzwl -16(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -16(%eax), %ecx + movzwl -16(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(14bytes): - movzwl -14(%eax), %ecx - movzwl -14(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -14(%eax), %ecx + movzwl -14(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(12bytes): - movzwl -12(%eax), %ecx - movzwl -12(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -12(%eax), %ecx + movzwl -12(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(10bytes): - movzwl -10(%eax), %ecx - movzwl -10(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -10(%eax), %ecx + movzwl -10(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(8bytes): - movzwl -8(%eax), %ecx - movzwl -8(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -8(%eax), %ecx + movzwl -8(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(6bytes): - movzwl -6(%eax), %ecx - movzwl -6(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -6(%eax), %ecx + movzwl -6(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(4bytes): - movzwl -4(%eax), %ecx - movzwl -4(%edx), %ebx - subl %ebx, %ecx - jne L(memcmp16_exit) + movzwl -4(%eax), %ecx + movzwl -4(%edx), %ebx + subl %ebx, %ecx + jne L(memcmp16_exit) L(2bytes): - movzwl -2(%eax), %eax - movzwl -2(%edx), %ebx - subl %ebx, %eax - POP (%ebx) - ret - CFI_PUSH (%ebx) - - .p2align 4 + movzwl -2(%eax), %eax + movzwl -2(%edx), %ebx + subl %ebx, %eax + POP (%ebx) + ret + CFI_PUSH (%ebx) + + .p2align 4 L(memcmp16_exit): - POP (%ebx) - mov %ecx, %eax - ret + POP (%ebx) + mov %ecx, %eax + ret END_FUNCTION MEMCMP diff --git a/runtime/arch/x86_64/memcmp16_x86_64.S b/runtime/arch/x86_64/memcmp16_x86_64.S new file mode 100755 index 000000000..46e4ba36c --- /dev/null +++ b/runtime/arch/x86_64/memcmp16_x86_64.S @@ -0,0 +1,1210 @@ +/* + * Copyright (C) 2014 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "asm_support_x86_64.S" + +#define MEMCMP __memcmp16 + +/* + * Half of Silvermont L1 Data Cache size + *(see original file cache.h in bionic/libc/arch-x86_64/). + * This value is used for specific optimization on big lengths. 
+ */ +#define DATA_CACHE_SIZE_HALF (12*1024) + +#ifndef L +# define L(label) .L##label +#endif + +#ifndef ALIGN +# define ALIGN(n) .p2align n +#endif + +#define JMPTBL(I, B) (I - B) + +#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + lea TABLE(%rip), %r11; \ + movslq (%r11, INDEX, SCALE), %rcx; \ + add %r11, %rcx; \ + jmp *%rcx; \ + ud2 + +DEFINE_FUNCTION MEMCMP + pxor %xmm0, %xmm0 + shl $1, %rdx + cmp $79, %rdx + ja L(79bytesormore) + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) + + ALIGN (4) +L(79bytesormore): + movdqu (%rsi), %xmm1 + movdqu (%rdi), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + mov %rsi, %rcx + and $-16, %rsi + add $16, %rsi + sub %rsi, %rcx + + sub %rcx, %rdi + add %rcx, %rdx + test $0xf, %rdi + jz L(2aligned) + + cmp $128, %rdx + ja L(128bytesormore) +L(less128bytes): + sub $64, %rdx + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqu 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqu 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + cmp $32, %rdx + jb L(less32bytesin64) + + movdqu 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqu 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin64): + add $64, %rdi + add $64, %rsi + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) + +L(128bytesormore): + cmp $512, %rdx + ja L(512bytesormore) + cmp $256, %rdx + ja L(less512bytes) +L(less256bytes): + sub $128, %rdx + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqu 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqu 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqu 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqu 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqu 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqu 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + add $128, %rsi + add $128, %rdi + + cmp $64, %rdx + jae L(less128bytes) + + cmp $32, %rdx + jb L(less32bytesin128) + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin128): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) + +L(less512bytes): + sub $256, %rdx + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqu 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqu 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqu 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqu 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + 
ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqu 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqu 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + movdqu 128(%rdi), %xmm2 + pxor 128(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(144bytesin256) + + movdqu 144(%rdi), %xmm2 + pxor 144(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(160bytesin256) + + movdqu 160(%rdi), %xmm2 + pxor 160(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(176bytesin256) + + movdqu 176(%rdi), %xmm2 + pxor 176(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(192bytesin256) + + movdqu 192(%rdi), %xmm2 + pxor 192(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(208bytesin256) + + movdqu 208(%rdi), %xmm2 + pxor 208(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(224bytesin256) + + movdqu 224(%rdi), %xmm2 + pxor 224(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(240bytesin256) + + movdqu 240(%rdi), %xmm2 + pxor 240(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(256bytesin256) + + add $256, %rsi + add $256, %rdi + + cmp $128, %rdx + jae L(less256bytes) + + cmp $64, %rdx + jae L(less128bytes) + + cmp $32, %rdx + jb L(less32bytesin256) + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin256): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) + + ALIGN (4) +L(512bytesormore): +#ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %r8 +#else + mov __x86_64_data_cache_size_half(%rip), %r8 +#endif + mov %r8, %r9 + shr $1, %r8 + add %r9, %r8 + cmp %r8, %rdx + ja L(L2_L3_cache_unaglined) + sub $64, %rdx + ALIGN (4) +L(64bytesormore_loop): + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqu 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqu 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqu 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(64bytesormore_loop) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) + +L(L2_L3_cache_unaglined): + sub $64, %rdx + ALIGN (4) +L(L2_L3_unaligned_128bytes_loop): + prefetchnta 0x1c0(%rdi) + prefetchnta 0x1c0(%rsi) + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqu 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqu 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqu 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(L2_L3_unaligned_128bytes_loop) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) + +/* + * This case is for machines which are sensitive for unaligned instructions. 
+ */ + ALIGN (4) +L(2aligned): + cmp $128, %rdx + ja L(128bytesormorein2aligned) +L(less128bytesin2aligned): + sub $64, %rdx + + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqa 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqa 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + cmp $32, %rdx + jb L(less32bytesin64in2alinged) + + movdqa 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqa 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin64in2alinged): + add $64, %rdi + add $64, %rsi + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) + + ALIGN (4) +L(128bytesormorein2aligned): + cmp $512, %rdx + ja L(512bytesormorein2aligned) + cmp $256, %rdx + ja L(256bytesormorein2aligned) +L(less256bytesin2alinged): + sub $128, %rdx + + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqa 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqa 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqa 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqa 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqa 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqa 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + add $128, %rsi + add $128, %rdi + + cmp $64, %rdx + jae L(less128bytesin2aligned) + + cmp $32, %rdx + jb L(less32bytesin128in2aligned) + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin128in2aligned): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) + + ALIGN (4) +L(256bytesormorein2aligned): + + sub $256, %rdx + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqa 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqa 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqa 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqa 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqa 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqa 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + movdqa 128(%rdi), %xmm2 + pxor 128(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(144bytesin256) + + movdqa 144(%rdi), %xmm2 + pxor 144(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(160bytesin256) + + movdqa 160(%rdi), %xmm2 + pxor 160(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(176bytesin256) + + movdqa 176(%rdi), %xmm2 + pxor 176(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(192bytesin256) + + movdqa 
192(%rdi), %xmm2 + pxor 192(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(208bytesin256) + + movdqa 208(%rdi), %xmm2 + pxor 208(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(224bytesin256) + + movdqa 224(%rdi), %xmm2 + pxor 224(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(240bytesin256) + + movdqa 240(%rdi), %xmm2 + pxor 240(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(256bytesin256) + + add $256, %rsi + add $256, %rdi + + cmp $128, %rdx + jae L(less256bytesin2alinged) + + cmp $64, %rdx + jae L(less128bytesin2aligned) + + cmp $32, %rdx + jb L(less32bytesin256in2alinged) + + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin256in2alinged): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) + + ALIGN (4) +L(512bytesormorein2aligned): +#ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %r8 +#else + mov __x86_64_data_cache_size_half(%rip), %r8 +#endif + mov %r8, %r9 + shr $1, %r8 + add %r9, %r8 + cmp %r8, %rdx + ja L(L2_L3_cache_aglined) + + sub $64, %rdx + ALIGN (4) +L(64bytesormore_loopin2aligned): + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqa 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqa 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqa 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(64bytesormore_loopin2aligned) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) +L(L2_L3_cache_aglined): + sub $64, %rdx + ALIGN (4) +L(L2_L3_aligned_128bytes_loop): + prefetchnta 0x1c0(%rdi) + prefetchnta 0x1c0(%rsi) + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqa 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqa 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqa 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(L2_L3_aligned_128bytes_loop) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 2) + + + ALIGN (4) +L(64bytesormore_loop_end): + add $16, %rdi + add $16, %rsi + ptest %xmm2, %xmm0 + jnc L(16bytes) + + add $16, %rdi + add $16, %rsi + ptest %xmm3, %xmm0 + jnc L(16bytes) + + add $16, %rdi + add $16, %rsi + ptest %xmm4, %xmm0 + jnc L(16bytes) + + add $16, %rdi + add $16, %rsi + jmp L(16bytes) + +L(256bytesin256): + add $256, %rdi + add $256, %rsi + jmp L(16bytes) +L(240bytesin256): + add $240, %rdi + add $240, %rsi + jmp L(16bytes) +L(224bytesin256): + add $224, %rdi + add $224, %rsi + jmp L(16bytes) +L(208bytesin256): + add $208, %rdi + add $208, %rsi + jmp L(16bytes) +L(192bytesin256): + add $192, %rdi + add $192, %rsi + jmp L(16bytes) +L(176bytesin256): + add $176, %rdi + add $176, %rsi + jmp L(16bytes) +L(160bytesin256): + add $160, %rdi + add $160, %rsi + jmp L(16bytes) +L(144bytesin256): + add $144, %rdi + add $144, %rsi + jmp L(16bytes) +L(128bytesin256): + add $128, %rdi + add $128, %rsi + jmp L(16bytes) +L(112bytesin256): + add $112, %rdi + add $112, %rsi + jmp L(16bytes) +L(96bytesin256): + add $96, %rdi + add $96, %rsi + jmp L(16bytes) +L(80bytesin256): + add $80, %rdi + add $80, %rsi + jmp L(16bytes) 
+L(64bytesin256): + add $64, %rdi + add $64, %rsi + jmp L(16bytes) +L(48bytesin256): + add $16, %rdi + add $16, %rsi +L(32bytesin256): + add $16, %rdi + add $16, %rsi +L(16bytesin256): + add $16, %rdi + add $16, %rsi +L(16bytes): + mov -16(%rdi), %rax + mov -16(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(8bytes): + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(12bytes): + mov -12(%rdi), %rax + mov -12(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(4bytes): + mov -4(%rsi), %ecx + mov -4(%rdi), %eax + cmp %eax, %ecx + jne L(diffin4bytes) +L(0bytes): + xor %eax, %eax + ret + + ALIGN (4) +L(66bytes): + movdqu -66(%rdi), %xmm1 + movdqu -66(%rsi), %xmm2 + mov $-66, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(50bytes): + movdqu -50(%rdi), %xmm1 + movdqu -50(%rsi), %xmm2 + mov $-50, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(34bytes): + movdqu -34(%rdi), %xmm1 + movdqu -34(%rsi), %xmm2 + mov $-34, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(18bytes): + mov -18(%rdi), %rax + mov -18(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(10bytes): + mov -10(%rdi), %rax + mov -10(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + movzwl -2(%rdi), %eax + movzwl -2(%rsi), %ecx + cmp %cl, %al + jne L(end) + and $0xffff, %eax + and $0xffff, %ecx + sub %ecx, %eax + ret + + ALIGN (4) +L(14bytes): + mov -14(%rdi), %rax + mov -14(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(6bytes): + mov -6(%rdi), %eax + mov -6(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) +L(2bytes): + movzwl -2(%rsi), %ecx + movzwl -2(%rdi), %eax + cmp %cl, %al + jne L(end) + and $0xffff, %eax + and $0xffff, %ecx + sub %ecx, %eax + ret + + ALIGN (4) +L(68bytes): + movdqu -68(%rdi), %xmm2 + movdqu -68(%rsi), %xmm1 + mov $-68, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(52bytes): + movdqu -52(%rdi), %xmm2 + movdqu -52(%rsi), %xmm1 + mov $-52, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(36bytes): + movdqu -36(%rdi), %xmm2 + movdqu -36(%rsi), %xmm1 + mov $-36, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(20bytes): + movdqu -20(%rdi), %xmm2 + movdqu -20(%rsi), %xmm1 + mov $-20, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -4(%rdi), %eax + mov -4(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(70bytes): + movdqu -70(%rsi), %xmm1 + movdqu -70(%rdi), %xmm2 + mov $-70, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(54bytes): + movdqu -54(%rsi), %xmm1 + movdqu -54(%rdi), %xmm2 + mov $-54, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(38bytes): + movdqu -38(%rsi), %xmm1 + movdqu -38(%rdi), %xmm2 + mov $-38, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(22bytes): + movdqu -22(%rsi), %xmm1 + movdqu -22(%rdi), %xmm2 + mov $-22, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(72bytes): + movdqu -72(%rsi), %xmm1 + movdqu -72(%rdi), %xmm2 + mov $-72, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(56bytes): + movdqu -56(%rdi), %xmm2 + movdqu -56(%rsi), %xmm1 + mov $-56, %dl + pxor %xmm1, %xmm2 + 
ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(40bytes): + movdqu -40(%rdi), %xmm2 + movdqu -40(%rsi), %xmm1 + mov $-40, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(24bytes): + movdqu -24(%rdi), %xmm2 + movdqu -24(%rsi), %xmm1 + mov $-24, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(74bytes): + movdqu -74(%rsi), %xmm1 + movdqu -74(%rdi), %xmm2 + mov $-74, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(58bytes): + movdqu -58(%rdi), %xmm2 + movdqu -58(%rsi), %xmm1 + mov $-58, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(42bytes): + movdqu -42(%rdi), %xmm2 + movdqu -42(%rsi), %xmm1 + mov $-42, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(26bytes): + movdqu -26(%rdi), %xmm2 + movdqu -26(%rsi), %xmm1 + mov $-26, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -10(%rdi), %rax + mov -10(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + movzwl -2(%rdi), %eax + movzwl -2(%rsi), %ecx + jmp L(end) + + ALIGN (4) +L(76bytes): + movdqu -76(%rsi), %xmm1 + movdqu -76(%rdi), %xmm2 + mov $-76, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(60bytes): + movdqu -60(%rdi), %xmm2 + movdqu -60(%rsi), %xmm1 + mov $-60, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(44bytes): + movdqu -44(%rdi), %xmm2 + movdqu -44(%rsi), %xmm1 + mov $-44, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(28bytes): + movdqu -28(%rdi), %xmm2 + movdqu -28(%rsi), %xmm1 + mov $-28, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -12(%rdi), %rax + mov -12(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -4(%rdi), %eax + mov -4(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(78bytes): + movdqu -78(%rsi), %xmm1 + movdqu -78(%rdi), %xmm2 + mov $-78, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(62bytes): + movdqu -62(%rdi), %xmm2 + movdqu -62(%rsi), %xmm1 + mov $-62, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(46bytes): + movdqu -46(%rdi), %xmm2 + movdqu -46(%rsi), %xmm1 + mov $-46, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(30bytes): + movdqu -30(%rdi), %xmm2 + movdqu -30(%rsi), %xmm1 + mov $-30, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -14(%rdi), %rax + mov -14(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(64bytes): + movdqu -64(%rdi), %xmm2 + movdqu -64(%rsi), %xmm1 + mov $-64, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(48bytes): + movdqu -48(%rdi), %xmm2 + movdqu -48(%rsi), %xmm1 + mov $-48, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(32bytes): + movdqu -32(%rdi), %xmm2 + movdqu -32(%rsi), %xmm1 + mov $-32, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -16(%rdi), %rax + mov -16(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + +/* + * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block. 
+ */ + ALIGN (3) +L(less16bytes): + movsbq %dl, %rdx + mov (%rsi, %rdx), %rcx + mov (%rdi, %rdx), %rax + cmp %rax, %rcx + jne L(diffin8bytes) + mov 8(%rsi, %rdx), %rcx + mov 8(%rdi, %rdx), %rax +L(diffin8bytes): + cmp %eax, %ecx + jne L(diffin4bytes) + shr $32, %rcx + shr $32, %rax +L(diffin4bytes): + cmp %cx, %ax + jne L(end) + shr $16, %ecx + shr $16, %eax + jmp L(end) + + ALIGN (4) +L(end): + and $0xffff, %eax + and $0xffff, %ecx + sub %ecx, %eax + ret + +END_FUNCTION MEMCMP + + ALIGN (3) +L(table_64bytes): + .int JMPTBL (L(0bytes), L(table_64bytes)) + .int JMPTBL (L(2bytes), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(6bytes), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(10bytes), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL (L(14bytes), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(18bytes), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(22bytes), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(26bytes), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(30bytes), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(34bytes), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(38bytes), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(42bytes), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(46bytes), L(table_64bytes)) + .int JMPTBL (L(48bytes), L(table_64bytes)) + .int JMPTBL (L(50bytes), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(54bytes), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(58bytes), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(62bytes), L(table_64bytes)) + .int JMPTBL (L(64bytes), L(table_64bytes)) + .int JMPTBL (L(66bytes), L(table_64bytes)) + .int JMPTBL (L(68bytes), L(table_64bytes)) + .int JMPTBL (L(70bytes), L(table_64bytes)) + .int JMPTBL (L(72bytes), L(table_64bytes)) + .int JMPTBL (L(74bytes), L(table_64bytes)) + .int JMPTBL (L(76bytes), L(table_64bytes)) + .int JMPTBL (L(78bytes), L(table_64bytes)) -- 2.11.0
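
Note on semantics (not part of the patch itself): both the 32-bit and the new 64-bit routines implement the contract declared in runtime/arch/memcmp16.h, uint32_t __memcmp16(const uint16_t* s0, const uint16_t* s1, size_t count), where count is a number of 16-bit elements (hence the "shl $1" that converts it to a byte count on entry) and the return value is zero when the buffers match, or the difference of the first mismatching elements otherwise (the movzwl/subl exit paths on x86 and the "sub %ecx, %eax" path on x86_64). A minimal portable C++ sketch of that contract, useful as a cross-check when reviewing or testing the assembly; the names MemCmp16Reference and CheckAgainstAssembly are illustrative assumptions, not part of this patch:

#include <cstddef>
#include <cstdint>

// Portable reference: compare 'count' 16-bit elements; return 0 on equality,
// otherwise the difference of the first mismatching elements, which is what
// the SSE exit paths compute with movzwl/subl (x86) and "sub %ecx, %eax" (x86_64).
static uint32_t MemCmp16Reference(const uint16_t* s0, const uint16_t* s1, size_t count) {
  for (size_t i = 0; i < count; ++i) {
    if (s0[i] != s1[i]) {
      return static_cast<uint32_t>(static_cast<int32_t>(s0[i]) - static_cast<int32_t>(s1[i]));
    }
  }
  return 0;
}

// Declaration as in runtime/arch/memcmp16.h; the harness below is hypothetical.
extern "C" uint32_t __memcmp16(const uint16_t* s0, const uint16_t* s1, size_t count);

static bool CheckAgainstAssembly(const uint16_t* a, const uint16_t* b, size_t count) {
  // Both implementations return the same exact difference, so an exact compare works.
  return __memcmp16(a, b, count) == MemCmp16Reference(a, b, count);
}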
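
Note on the tail handling in the x86_64 file: most of its bulk is jump-table dispatch rather than a loop. Once at most 78 bytes remain, both pointers are advanced past the end of the region ("add %rdx, %rsi; add %rdx, %rdi") and BRANCH_TO_JMPTBL_ENTRY indexes L(table_64bytes) by the remaining byte count scaled by 2 (i.e., by the remaining element count, since entries are 4-byte JMPTBL offsets), so each L(NNbytes) entry compares exactly the last NN bytes via negative offsets. A rough, hypothetical C++ analogue of that structure follows; the names TailHandler, CompareLast, kTailTable, and DispatchTail are illustrative, not part of the patch:

#include <cstddef>
#include <cstdint>
#include <cstring>

// Hypothetical analogue of the BRANCH_TO_JMPTBL_ENTRY tail dispatch: each
// handler compares only the last N bytes, reading backwards from pointers
// that have already been advanced past the end of the buffers.
using TailHandler = uint32_t (*)(const uint8_t* end0, const uint8_t* end1);

template <size_t N>
uint32_t CompareLast(const uint8_t* end0, const uint8_t* end1) {
  for (size_t i = 0; i < N; i += 2) {  // mirrors L(<N>bytes) and its negative offsets
    uint16_t a, b;
    std::memcpy(&a, end0 - N + i, sizeof(a));
    std::memcpy(&b, end1 - N + i, sizeof(b));
    if (a != b) {
      return static_cast<uint32_t>(static_cast<int32_t>(a) - static_cast<int32_t>(b));
    }
  }
  return 0;
}

// Table indexed by remaining 16-bit elements; truncated to 4 entries here,
// where L(table_64bytes) has 40 entries covering 0..78 remaining bytes.
static const TailHandler kTailTable[] = {
  CompareLast<0>, CompareLast<2>, CompareLast<4>, CompareLast<6>,
};

// Valid for remaining_bytes <= 6 with this truncated table.
uint32_t DispatchTail(const uint8_t* p0, const uint8_t* p1, size_t remaining_bytes) {
  // The assembly does "add %rdx, %rsi; add %rdx, %rdi" and then jumps through
  // the table; passing end-of-buffer pointers here is the equivalent.
  return kTailTable[remaining_bytes / 2](p0 + remaining_bytes, p1 + remaining_bytes);
}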