arm64/lib: improve CRC32 performance for deep pipelines
author     Ard Biesheuvel <ard.biesheuvel@linaro.org>
           Tue, 27 Nov 2018 17:42:55 +0000 (18:42 +0100)
committer  0ranko0P <ranko0p@outlook.com>
           Wed, 4 Dec 2019 13:45:16 +0000 (21:45 +0800)

Improve the performance of the crc32() asm routines by getting rid of
most of the branches and small-sized loads on the common path.

Instead, use a branchless code path involving overlapping 16 byte
loads to process the first (length % 32) bytes, and process the
remainder using a loop that processes 32 bytes at a time.
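
As a rough illustration only (not the kernel code itself), the new
structure can be modelled in C with the ACLE CRC32 intrinsics from
<arm_acle.h>; crc32_le_model() is a made-up name, the csel-based
selects are written as plain conditionals, big-endian byte swapping is
omitted, and it needs a CRC-capable core built with -march=armv8-a+crc:

  #include <arm_acle.h>
  #include <stdint.h>
  #include <string.h>

  /* Illustrative little-endian model of the new code path. */
  static unsigned int crc32_le_model(unsigned int crc,
                                     const unsigned char *p, size_t len)
  {
    uint64_t d0, d1, d2, d3;

    if (len < 16) {             /* small input: 8/4/2/1 byte steps (label 8:) */
      if (len & 8) { memcpy(&d0, p, 8); crc = __crc32d(crc, d0); p += 8; }
      if (len & 4) { uint32_t w; memcpy(&w, p, 4); crc = __crc32w(crc, w); p += 4; }
      if (len & 2) { uint16_t h; memcpy(&h, p, 2); crc = __crc32h(crc, h); p += 2; }
      if (len & 1) crc = __crc32b(crc, *p);
      return crc;
    }

    size_t head = len & 0x1f;   /* length % 32, consumed before the loop */
    len &= ~(size_t)0x1f;

    if (head) {
      /* two 16-byte loads; the second overlaps the first when head < 32 */
      memcpy(&d0, p, 8);
      memcpy(&d1, p + 8, 8);
      memcpy(&d2, p + (head & 0xf), 8);
      memcpy(&d3, p + (head & 0xf) + 8, 8);
      p += head;

      /* the assembly does these conditionally with csel, not branches */
      if (head & 8)  { crc = __crc32d(crc, d0); d0 = d1; }
      if (head & 4)  { crc = __crc32w(crc, (uint32_t)d0); d0 >>= 32; }
      if (head & 2)  { crc = __crc32h(crc, (uint16_t)d0); d0 >>= 16; }
      if (head & 1)  crc = __crc32b(crc, (uint8_t)d0);
      if (head & 16) { crc = __crc32d(crc, d2); crc = __crc32d(crc, d3); }
    }

    while (len) {               /* main loop: 32 bytes per iteration */
      memcpy(&d0, p,      8);
      memcpy(&d1, p +  8, 8);
      memcpy(&d2, p + 16, 8);
      memcpy(&d3, p + 24, 8);
      crc = __crc32d(crc, d0);
      crc = __crc32d(crc, d1);
      crc = __crc32d(crc, d2);
      crc = __crc32d(crc, d3);
      p += 32;
      len -= 32;
    }
    return crc;
  }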

Tested using the following test program:

  #include <stdlib.h>

  extern unsigned int crc32_le(unsigned int, char const *, unsigned int);

  int main(void)
  {
    static const char buf[4096];

    srand(20181126);

    for (int i = 0; i < 100 * 1000 * 1000; i++)
      crc32_le(0, buf, rand() % 1024);

    return 0;
  }
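
Note that the rand() % 1024 lengths cover short inputs (the sub-16-byte
path), lengths that only need the overlapping-load handling of the
leading length % 32 bytes, and lengths that reach the 32-byte loop.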

On Cortex-A53 and Cortex-A57, the performance regresses, but only very
slightly. On Cortex-A72, however, the performance improves from

  $ time ./crc32

  real  0m10.149s
  user  0m10.149s
  sys   0m0.000s

to

  $ time ./crc32

  real  0m7.915s
  user  0m7.915s
  sys   0m0.000s
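
That is roughly a 1.28x speedup (10.149 s / 7.915 s ~= 1.28), i.e. about
22% less wall-clock time on Cortex-A72.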

Cc: Rui Sun <sunrui26@huawei.com>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
arch/arm64/lib/crc32.S

index 5bc1e85..f132f2a 100644
        .cpu            generic+crc
 
        .macro          __crc32, c
-0:     subs            x2, x2, #16
-       b.mi            8f
-       ldp             x3, x4, [x1], #16
+       cmp             x2, #16
+       b.lt            8f                      // less than 16 bytes
+
+       and             x7, x2, #0x1f
+       and             x2, x2, #~0x1f
+       cbz             x7, 32f                 // multiple of 32 bytes
+
+       and             x8, x7, #0xf
+       ldp             x3, x4, [x1]
+       add             x8, x8, x1
+       add             x1, x1, x7
+       ldp             x5, x6, [x8]
 CPU_BE(        rev             x3, x3          )
 CPU_BE(        rev             x4, x4          )
+CPU_BE(        rev             x5, x5          )
+CPU_BE(        rev             x6, x6          )
+
+       tst             x7, #8
+       crc32\c\()x     w8, w0, x3
+       csel            x3, x3, x4, eq
+       csel            w0, w0, w8, eq
+       tst             x7, #4
+       lsr             x4, x3, #32
+       crc32\c\()w     w8, w0, w3
+       csel            x3, x3, x4, eq
+       csel            w0, w0, w8, eq
+       tst             x7, #2
+       lsr             w4, w3, #16
+       crc32\c\()h     w8, w0, w3
+       csel            w3, w3, w4, eq
+       csel            w0, w0, w8, eq
+       tst             x7, #1
+       crc32\c\()b     w8, w0, w3
+       csel            w0, w0, w8, eq
+       tst             x7, #16
+       crc32\c\()x     w8, w0, x5
+       crc32\c\()x     w8, w8, x6
+       csel            w0, w0, w8, eq
+       cbz             x2, 0f
+
+32:    ldp             x3, x4, [x1], #32
+       sub             x2, x2, #32
+       ldp             x5, x6, [x1, #-16]
+CPU_BE(        rev             x3, x3          )
+CPU_BE(        rev             x4, x4          )
+CPU_BE(        rev             x5, x5          )
+CPU_BE(        rev             x6, x6          )
        crc32\c\()x     w0, w0, x3
        crc32\c\()x     w0, w0, x4
-       b.ne            0b
-       ret
+       crc32\c\()x     w0, w0, x5
+       crc32\c\()x     w0, w0, x6
+       cbnz            x2, 32b
+0:     ret
 
 8:     tbz             x2, #3, 4f
        ldr             x3, [x1], #8