crypto: x86/poly1305 - wire up faster implementations for kernel
author    Jason A. Donenfeld <Jason@zx2c4.com>
          Mon, 6 Jan 2020 03:40:48 +0000 (22:40 -0500)
committer Herbert Xu <herbert@gondor.apana.org.au>
          Thu, 16 Jan 2020 07:18:12 +0000 (15:18 +0800)

These x86_64 vectorized implementations support AVX, AVX2, and AVX-512F.
The AVX-512F implementation is disabled on Skylake due to throttling,
but it is quite fast on Cannonlake and later.

On the left are cycle counts on a Core i7 6700HQ using the AVX2
codepath, comparing this implementation ("new") to the implementation in
the current crypto API ("old"). On the right are benchmarks on a Xeon
Gold 5120 using the AVX-512 codepath. Message sizes are in bytes. The
new implementation is faster on all benchmarks.

        AVX2                   AVX-512
      ---------              -----------

    size    old     new      size   old     new
    ----    ----    ----     ----   ----    ----
    0       70      68       0      74      70
    16      92      90       16     96      92
    32      134     104      32     136     106
    48      172     120      48     184     124
    64      218     136      64     218     138
    80      254     158      80     260     160
    96      298     174      96     300     176
    112     342     192      112    342     194
    128     388     212      128    384     212
    144     428     228      144    420     226
    160     466     246      160    464     248
    176     510     264      176    504     264
    192     550     282      192    544     282
    208     594     302      208    582     300
    224     628     316      224    624     318
    240     676     334      240    662     338
    256     716     354      256    708     358
    272     764     374      272    748     372
    288     802     352      288    788     358
    304     420     366      304    422     370
    320     428     360      320    432     364
    336     484     378      336    486     380
    352     426     384      352    434     390
    368     478     400      368    480     408
    384     488     394      384    490     398
    400     542     408      400    542     412
    416     486     416      416    492     426
    432     534     430      432    538     436
    448     544     422      448    546     432
    464     600     438      464    600     448
    480     540     448      480    548     456
    496     594     464      496    594     476
    512     602     456      512    606     470
    528     656     476      528    656     480
    544     600     480      544    606     498
    560     650     494      560    652     512
    576     664     490      576    662     508
    592     714     508      592    716     522
    608     656     514      608    664     538
    624     708     532      624    710     552
    640     716     524      640    720     516
    656     770     536      656    772     526
    672     716     548      672    722     544
    688     770     562      688    768     556
    704     774     552      704    778     556
    720     826     568      720    832     568
    736     768     574      736    780     584
    752     822     592      752    826     600
    768     830     584      768    836     560
    784     884     602      784    888     572
    800     828     610      800    838     588
    816     884     628      816    884     604
    832     888     618      832    894     598
    848     942     632      848    946     612
    864     884     644      864    896     628
    880     936     660      880    942     644
    896     948     652      896    952     608
    912     1000    664      912    1004    616
    928     942     676      928    954     634
    944     994     690      944    1000    646
    960     1002    680      960    1008    646
    976     1054    694      976    1062    658
    992     1002    706      992    1012    674
    1008    1052    720      1008   1058    690

This commit wires in Andy Polyakov's prior implementation and makes the
following changes to make it suitable for kernel land.

  - Some cosmetic and structural changes, such as renaming labels to
    .Lname, adapting constants and other details to Linux conventions,
    and generally making the code easy for us to maintain moving forward.

  - CPU feature checking is done in C by the glue code.

  - We avoid jumping into the middle of functions, to appease objtool,
    and instead parameterize shared code.

  - We maintain frame pointers so that stack traces make sense.

  - We remove the dependency on the perl xlate code, which transforms
    the output for assemblers we don't care about.

Importantly, none of our changes affect the arithmetic or core code;
they only accommodate the differing environment of kernel space.
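
The poly1305_glue.c hunk is truncated from this excerpt, so the following is
only a rough, editorial sketch of what "CPU feature checking is done in C by
the glue code" means in practice. The static-key name, wrapper function, and
the argument types of the assembly entry points are illustrative assumptions,
not taken from the patch.

    /*
     * Editorial sketch only -- not taken from the patch. The static key and
     * wrapper names are hypothetical; the asm prototypes are assumed.
     */
    #include <linux/init.h>
    #include <linux/jump_label.h>
    #include <linux/linkage.h>
    #include <linux/types.h>
    #include <asm/cpufeature.h>
    #include <asm/fpu/api.h>
    #include <crypto/internal/simd.h>

    asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
                                           size_t len, u32 padbit);
    asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp,
                                         size_t len, u32 padbit);

    static DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);

    static int __init poly1305_simd_mod_init(void)
    {
            /* Probe CPU features once, in C, rather than in the assembly. */
            if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_AVX2))
                    static_branch_enable(&poly1305_use_avx2);
            return 0;
    }

    static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
                                     u32 padbit)
    {
            /* At run time the choice is a cheap static branch. */
            if (static_branch_likely(&poly1305_use_avx2) && crypto_simd_usable()) {
                    kernel_fpu_begin();
                    poly1305_blocks_avx2(ctx, inp, len, padbit);
                    kernel_fpu_end();
            } else {
                    poly1305_blocks_x86_64(ctx, inp, len, padbit);
            }
    }

The point of the bullet above is that the assembly no longer consults
OPENSSL_ia32cap_P at run time (those references are guarded by "if (!$kernel)"
in the perl changes below); the C side chooses the entry point once, and the
vector paths only run inside kernel_fpu_begin()/kernel_fpu_end().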

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
Co-developed-by: Samuel Neves <sneves@dei.uc.pt>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
arch/x86/crypto/.gitignore [new file with mode: 0644]
arch/x86/crypto/Makefile
arch/x86/crypto/poly1305-avx2-x86_64.S [deleted file]
arch/x86/crypto/poly1305-sse2-x86_64.S [deleted file]
arch/x86/crypto/poly1305-x86_64-cryptogams.pl
arch/x86/crypto/poly1305_glue.c
lib/crypto/Kconfig

diff --git a/arch/x86/crypto/.gitignore b/arch/x86/crypto/.gitignore
new file mode 100644
index 0000000..c406ea6
--- /dev/null
+++ b/arch/x86/crypto/.gitignore
@@ -0,0 +1 @@
+poly1305-x86_64.S
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 958440e..b69e00b 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -73,6 +73,10 @@ aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
 
 nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
 blake2s-x86_64-y := blake2s-core.o blake2s-glue.o
+poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
+ifneq ($(CONFIG_CRYPTO_POLY1305_X86_64),)
+targets += poly1305-x86_64-cryptogams.S
+endif
 
 ifeq ($(avx_supported),yes)
        camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
@@ -101,10 +105,8 @@ aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
 aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
 ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
 sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
-poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o
 ifeq ($(avx2_supported),yes)
 sha1-ssse3-y += sha1_avx2_x86_64_asm.o
-poly1305-x86_64-y += poly1305-avx2-x86_64.o
 endif
 ifeq ($(sha1_ni_supported),yes)
 sha1-ssse3-y += sha1_ni_asm.o
@@ -118,3 +120,8 @@ sha256-ssse3-y += sha256_ni_asm.o
 endif
 sha512-ssse3-y := sha512-ssse3-asm.o sha512-avx-asm.o sha512-avx2-asm.o sha512_ssse3_glue.o
 crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
+
+quiet_cmd_perlasm = PERLASM $@
+      cmd_perlasm = $(PERL) $< > $@
+$(obj)/%.S: $(src)/%.pl FORCE
+       $(call if_changed,perlasm)
diff --git a/arch/x86/crypto/poly1305-avx2-x86_64.S b/arch/x86/crypto/poly1305-avx2-x86_64.S
deleted file mode 100644
index 8f56989..0000000
--- a/arch/x86/crypto/poly1305-avx2-x86_64.S
+++ /dev/null
@@ -1,390 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Poly1305 authenticator algorithm, RFC7539, x64 AVX2 functions
- *
- * Copyright (C) 2015 Martin Willi
- */
-
-#include <linux/linkage.h>
-
-.section       .rodata.cst32.ANMASK, "aM", @progbits, 32
-.align 32
-ANMASK:        .octa 0x0000000003ffffff0000000003ffffff
-       .octa 0x0000000003ffffff0000000003ffffff
-
-.section       .rodata.cst32.ORMASK, "aM", @progbits, 32
-.align 32
-ORMASK:        .octa 0x00000000010000000000000001000000
-       .octa 0x00000000010000000000000001000000
-
-.text
-
-#define h0 0x00(%rdi)
-#define h1 0x04(%rdi)
-#define h2 0x08(%rdi)
-#define h3 0x0c(%rdi)
-#define h4 0x10(%rdi)
-#define r0 0x00(%rdx)
-#define r1 0x04(%rdx)
-#define r2 0x08(%rdx)
-#define r3 0x0c(%rdx)
-#define r4 0x10(%rdx)
-#define u0 0x00(%r8)
-#define u1 0x04(%r8)
-#define u2 0x08(%r8)
-#define u3 0x0c(%r8)
-#define u4 0x10(%r8)
-#define w0 0x18(%r8)
-#define w1 0x1c(%r8)
-#define w2 0x20(%r8)
-#define w3 0x24(%r8)
-#define w4 0x28(%r8)
-#define y0 0x30(%r8)
-#define y1 0x34(%r8)
-#define y2 0x38(%r8)
-#define y3 0x3c(%r8)
-#define y4 0x40(%r8)
-#define m %rsi
-#define hc0 %ymm0
-#define hc1 %ymm1
-#define hc2 %ymm2
-#define hc3 %ymm3
-#define hc4 %ymm4
-#define hc0x %xmm0
-#define hc1x %xmm1
-#define hc2x %xmm2
-#define hc3x %xmm3
-#define hc4x %xmm4
-#define t1 %ymm5
-#define t2 %ymm6
-#define t1x %xmm5
-#define t2x %xmm6
-#define ruwy0 %ymm7
-#define ruwy1 %ymm8
-#define ruwy2 %ymm9
-#define ruwy3 %ymm10
-#define ruwy4 %ymm11
-#define ruwy0x %xmm7
-#define ruwy1x %xmm8
-#define ruwy2x %xmm9
-#define ruwy3x %xmm10
-#define ruwy4x %xmm11
-#define svxz1 %ymm12
-#define svxz2 %ymm13
-#define svxz3 %ymm14
-#define svxz4 %ymm15
-#define d0 %r9
-#define d1 %r10
-#define d2 %r11
-#define d3 %r12
-#define d4 %r13
-
-SYM_FUNC_START(poly1305_4block_avx2)
-       # %rdi: Accumulator h[5]
-       # %rsi: 64 byte input block m
-       # %rdx: Poly1305 key r[5]
-       # %rcx: Quadblock count
-       # %r8:  Poly1305 derived key r^2 u[5], r^3 w[5], r^4 y[5],
-
-       # This four-block variant uses loop unrolled block processing. It
-       # requires 4 Poly1305 keys: r, r^2, r^3 and r^4:
-       # h = (h + m) * r  =>  h = (h + m1) * r^4 + m2 * r^3 + m3 * r^2 + m4 * r
-
-       vzeroupper
-       push            %rbx
-       push            %r12
-       push            %r13
-
-       # combine r0,u0,w0,y0
-       vmovd           y0,ruwy0x
-       vmovd           w0,t1x
-       vpunpcklqdq     t1,ruwy0,ruwy0
-       vmovd           u0,t1x
-       vmovd           r0,t2x
-       vpunpcklqdq     t2,t1,t1
-       vperm2i128      $0x20,t1,ruwy0,ruwy0
-
-       # combine r1,u1,w1,y1 and s1=r1*5,v1=u1*5,x1=w1*5,z1=y1*5
-       vmovd           y1,ruwy1x
-       vmovd           w1,t1x
-       vpunpcklqdq     t1,ruwy1,ruwy1
-       vmovd           u1,t1x
-       vmovd           r1,t2x
-       vpunpcklqdq     t2,t1,t1
-       vperm2i128      $0x20,t1,ruwy1,ruwy1
-       vpslld          $2,ruwy1,svxz1
-       vpaddd          ruwy1,svxz1,svxz1
-
-       # combine r2,u2,w2,y2 and s2=r2*5,v2=u2*5,x2=w2*5,z2=y2*5
-       vmovd           y2,ruwy2x
-       vmovd           w2,t1x
-       vpunpcklqdq     t1,ruwy2,ruwy2
-       vmovd           u2,t1x
-       vmovd           r2,t2x
-       vpunpcklqdq     t2,t1,t1
-       vperm2i128      $0x20,t1,ruwy2,ruwy2
-       vpslld          $2,ruwy2,svxz2
-       vpaddd          ruwy2,svxz2,svxz2
-
-       # combine r3,u3,w3,y3 and s3=r3*5,v3=u3*5,x3=w3*5,z3=y3*5
-       vmovd           y3,ruwy3x
-       vmovd           w3,t1x
-       vpunpcklqdq     t1,ruwy3,ruwy3
-       vmovd           u3,t1x
-       vmovd           r3,t2x
-       vpunpcklqdq     t2,t1,t1
-       vperm2i128      $0x20,t1,ruwy3,ruwy3
-       vpslld          $2,ruwy3,svxz3
-       vpaddd          ruwy3,svxz3,svxz3
-
-       # combine r4,u4,w4,y4 and s4=r4*5,v4=u4*5,x4=w4*5,z4=y4*5
-       vmovd           y4,ruwy4x
-       vmovd           w4,t1x
-       vpunpcklqdq     t1,ruwy4,ruwy4
-       vmovd           u4,t1x
-       vmovd           r4,t2x
-       vpunpcklqdq     t2,t1,t1
-       vperm2i128      $0x20,t1,ruwy4,ruwy4
-       vpslld          $2,ruwy4,svxz4
-       vpaddd          ruwy4,svxz4,svxz4
-
-.Ldoblock4:
-       # hc0 = [m[48-51] & 0x3ffffff, m[32-35] & 0x3ffffff,
-       #        m[16-19] & 0x3ffffff, m[ 0- 3] & 0x3ffffff + h0]
-       vmovd           0x00(m),hc0x
-       vmovd           0x10(m),t1x
-       vpunpcklqdq     t1,hc0,hc0
-       vmovd           0x20(m),t1x
-       vmovd           0x30(m),t2x
-       vpunpcklqdq     t2,t1,t1
-       vperm2i128      $0x20,t1,hc0,hc0
-       vpand           ANMASK(%rip),hc0,hc0
-       vmovd           h0,t1x
-       vpaddd          t1,hc0,hc0
-       # hc1 = [(m[51-54] >> 2) & 0x3ffffff, (m[35-38] >> 2) & 0x3ffffff,
-       #        (m[19-22] >> 2) & 0x3ffffff, (m[ 3- 6] >> 2) & 0x3ffffff + h1]
-       vmovd           0x03(m),hc1x
-       vmovd           0x13(m),t1x
-       vpunpcklqdq     t1,hc1,hc1
-       vmovd           0x23(m),t1x
-       vmovd           0x33(m),t2x
-       vpunpcklqdq     t2,t1,t1
-       vperm2i128      $0x20,t1,hc1,hc1
-       vpsrld          $2,hc1,hc1
-       vpand           ANMASK(%rip),hc1,hc1
-       vmovd           h1,t1x
-       vpaddd          t1,hc1,hc1
-       # hc2 = [(m[54-57] >> 4) & 0x3ffffff, (m[38-41] >> 4) & 0x3ffffff,
-       #        (m[22-25] >> 4) & 0x3ffffff, (m[ 6- 9] >> 4) & 0x3ffffff + h2]
-       vmovd           0x06(m),hc2x
-       vmovd           0x16(m),t1x
-       vpunpcklqdq     t1,hc2,hc2
-       vmovd           0x26(m),t1x
-       vmovd           0x36(m),t2x
-       vpunpcklqdq     t2,t1,t1
-       vperm2i128      $0x20,t1,hc2,hc2
-       vpsrld          $4,hc2,hc2
-       vpand           ANMASK(%rip),hc2,hc2
-       vmovd           h2,t1x
-       vpaddd          t1,hc2,hc2
-       # hc3 = [(m[57-60] >> 6) & 0x3ffffff, (m[41-44] >> 6) & 0x3ffffff,
-       #        (m[25-28] >> 6) & 0x3ffffff, (m[ 9-12] >> 6) & 0x3ffffff + h3]
-       vmovd           0x09(m),hc3x
-       vmovd           0x19(m),t1x
-       vpunpcklqdq     t1,hc3,hc3
-       vmovd           0x29(m),t1x
-       vmovd           0x39(m),t2x
-       vpunpcklqdq     t2,t1,t1
-       vperm2i128      $0x20,t1,hc3,hc3
-       vpsrld          $6,hc3,hc3
-       vpand           ANMASK(%rip),hc3,hc3
-       vmovd           h3,t1x
-       vpaddd          t1,hc3,hc3
-       # hc4 = [(m[60-63] >> 8) | (1<<24), (m[44-47] >> 8) | (1<<24),
-       #        (m[28-31] >> 8) | (1<<24), (m[12-15] >> 8) | (1<<24) + h4]
-       vmovd           0x0c(m),hc4x
-       vmovd           0x1c(m),t1x
-       vpunpcklqdq     t1,hc4,hc4
-       vmovd           0x2c(m),t1x
-       vmovd           0x3c(m),t2x
-       vpunpcklqdq     t2,t1,t1
-       vperm2i128      $0x20,t1,hc4,hc4
-       vpsrld          $8,hc4,hc4
-       vpor            ORMASK(%rip),hc4,hc4
-       vmovd           h4,t1x
-       vpaddd          t1,hc4,hc4
-
-       # t1 = [ hc0[3] * r0, hc0[2] * u0, hc0[1] * w0, hc0[0] * y0 ]
-       vpmuludq        hc0,ruwy0,t1
-       # t1 += [ hc1[3] * s4, hc1[2] * v4, hc1[1] * x4, hc1[0] * z4 ]
-       vpmuludq        hc1,svxz4,t2
-       vpaddq          t2,t1,t1
-       # t1 += [ hc2[3] * s3, hc2[2] * v3, hc2[1] * x3, hc2[0] * z3 ]
-       vpmuludq        hc2,svxz3,t2
-       vpaddq          t2,t1,t1
-       # t1 += [ hc3[3] * s2, hc3[2] * v2, hc3[1] * x2, hc3[0] * z2 ]
-       vpmuludq        hc3,svxz2,t2
-       vpaddq          t2,t1,t1
-       # t1 += [ hc4[3] * s1, hc4[2] * v1, hc4[1] * x1, hc4[0] * z1 ]
-       vpmuludq        hc4,svxz1,t2
-       vpaddq          t2,t1,t1
-       # d0 = t1[0] + t1[1] + t[2] + t[3]
-       vpermq          $0xee,t1,t2
-       vpaddq          t2,t1,t1
-       vpsrldq         $8,t1,t2
-       vpaddq          t2,t1,t1
-       vmovq           t1x,d0
-
-       # t1 = [ hc0[3] * r1, hc0[2] * u1,hc0[1] * w1, hc0[0] * y1 ]
-       vpmuludq        hc0,ruwy1,t1
-       # t1 += [ hc1[3] * r0, hc1[2] * u0, hc1[1] * w0, hc1[0] * y0 ]
-       vpmuludq        hc1,ruwy0,t2
-       vpaddq          t2,t1,t1
-       # t1 += [ hc2[3] * s4, hc2[2] * v4, hc2[1] * x4, hc2[0] * z4 ]
-       vpmuludq        hc2,svxz4,t2
-       vpaddq          t2,t1,t1
-       # t1 += [ hc3[3] * s3, hc3[2] * v3, hc3[1] * x3, hc3[0] * z3 ]
-       vpmuludq        hc3,svxz3,t2
-       vpaddq          t2,t1,t1
-       # t1 += [ hc4[3] * s2, hc4[2] * v2, hc4[1] * x2, hc4[0] * z2 ]
-       vpmuludq        hc4,svxz2,t2
-       vpaddq          t2,t1,t1
-       # d1 = t1[0] + t1[1] + t1[3] + t1[4]
-       vpermq          $0xee,t1,t2
-       vpaddq          t2,t1,t1
-       vpsrldq         $8,t1,t2
-       vpaddq          t2,t1,t1
-       vmovq           t1x,d1
-
-       # t1 = [ hc0[3] * r2, hc0[2] * u2, hc0[1] * w2, hc0[0] * y2 ]
-       vpmuludq        hc0,ruwy2,t1
-       # t1 += [ hc1[3] * r1, hc1[2] * u1, hc1[1] * w1, hc1[0] * y1 ]
-       vpmuludq        hc1,ruwy1,t2
-       vpaddq          t2,t1,t1
-       # t1 += [ hc2[3] * r0, hc2[2] * u0, hc2[1] * w0, hc2[0] * y0 ]
-       vpmuludq        hc2,ruwy0,t2
-       vpaddq          t2,t1,t1
-       # t1 += [ hc3[3] * s4, hc3[2] * v4, hc3[1] * x4, hc3[0] * z4 ]
-       vpmuludq        hc3,svxz4,t2
-       vpaddq          t2,t1,t1
-       # t1 += [ hc4[3] * s3, hc4[2] * v3, hc4[1] * x3, hc4[0] * z3 ]
-       vpmuludq        hc4,svxz3,t2
-       vpaddq          t2,t1,t1
-       # d2 = t1[0] + t1[1] + t1[2] + t1[3]
-       vpermq          $0xee,t1,t2
-       vpaddq          t2,t1,t1
-       vpsrldq         $8,t1,t2
-       vpaddq          t2,t1,t1
-       vmovq           t1x,d2
-
-       # t1 = [ hc0[3] * r3, hc0[2] * u3, hc0[1] * w3, hc0[0] * y3 ]
-       vpmuludq        hc0,ruwy3,t1
-       # t1 += [ hc1[3] * r2, hc1[2] * u2, hc1[1] * w2, hc1[0] * y2 ]
-       vpmuludq        hc1,ruwy2,t2
-       vpaddq          t2,t1,t1
-       # t1 += [ hc2[3] * r1, hc2[2] * u1, hc2[1] * w1, hc2[0] * y1 ]
-       vpmuludq        hc2,ruwy1,t2
-       vpaddq          t2,t1,t1
-       # t1 += [ hc3[3] * r0, hc3[2] * u0, hc3[1] * w0, hc3[0] * y0 ]
-       vpmuludq        hc3,ruwy0,t2
-       vpaddq          t2,t1,t1
-       # t1 += [ hc4[3] * s4, hc4[2] * v4, hc4[1] * x4, hc4[0] * z4 ]
-       vpmuludq        hc4,svxz4,t2
-       vpaddq          t2,t1,t1
-       # d3 = t1[0] + t1[1] + t1[2] + t1[3]
-       vpermq          $0xee,t1,t2
-       vpaddq          t2,t1,t1
-       vpsrldq         $8,t1,t2
-       vpaddq          t2,t1,t1
-       vmovq           t1x,d3
-
-       # t1 = [ hc0[3] * r4, hc0[2] * u4, hc0[1] * w4, hc0[0] * y4 ]
-       vpmuludq        hc0,ruwy4,t1
-       # t1 += [ hc1[3] * r3, hc1[2] * u3, hc1[1] * w3, hc1[0] * y3 ]
-       vpmuludq        hc1,ruwy3,t2
-       vpaddq          t2,t1,t1
-       # t1 += [ hc2[3] * r2, hc2[2] * u2, hc2[1] * w2, hc2[0] * y2 ]
-       vpmuludq        hc2,ruwy2,t2
-       vpaddq          t2,t1,t1
-       # t1 += [ hc3[3] * r1, hc3[2] * u1, hc3[1] * w1, hc3[0] * y1 ]
-       vpmuludq        hc3,ruwy1,t2
-       vpaddq          t2,t1,t1
-       # t1 += [ hc4[3] * r0, hc4[2] * u0, hc4[1] * w0, hc4[0] * y0 ]
-       vpmuludq        hc4,ruwy0,t2
-       vpaddq          t2,t1,t1
-       # d4 = t1[0] + t1[1] + t1[2] + t1[3]
-       vpermq          $0xee,t1,t2
-       vpaddq          t2,t1,t1
-       vpsrldq         $8,t1,t2
-       vpaddq          t2,t1,t1
-       vmovq           t1x,d4
-
-       # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
-       # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
-       # amount.  Careful: we must not assume the carry bits 'd0 >> 26',
-       # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
-       # integers.  It's true in a single-block implementation, but not here.
-
-       # d1 += d0 >> 26
-       mov             d0,%rax
-       shr             $26,%rax
-       add             %rax,d1
-       # h0 = d0 & 0x3ffffff
-       mov             d0,%rbx
-       and             $0x3ffffff,%ebx
-
-       # d2 += d1 >> 26
-       mov             d1,%rax
-       shr             $26,%rax
-       add             %rax,d2
-       # h1 = d1 & 0x3ffffff
-       mov             d1,%rax
-       and             $0x3ffffff,%eax
-       mov             %eax,h1
-
-       # d3 += d2 >> 26
-       mov             d2,%rax
-       shr             $26,%rax
-       add             %rax,d3
-       # h2 = d2 & 0x3ffffff
-       mov             d2,%rax
-       and             $0x3ffffff,%eax
-       mov             %eax,h2
-
-       # d4 += d3 >> 26
-       mov             d3,%rax
-       shr             $26,%rax
-       add             %rax,d4
-       # h3 = d3 & 0x3ffffff
-       mov             d3,%rax
-       and             $0x3ffffff,%eax
-       mov             %eax,h3
-
-       # h0 += (d4 >> 26) * 5
-       mov             d4,%rax
-       shr             $26,%rax
-       lea             (%rax,%rax,4),%rax
-       add             %rax,%rbx
-       # h4 = d4 & 0x3ffffff
-       mov             d4,%rax
-       and             $0x3ffffff,%eax
-       mov             %eax,h4
-
-       # h1 += h0 >> 26
-       mov             %rbx,%rax
-       shr             $26,%rax
-       add             %eax,h1
-       # h0 = h0 & 0x3ffffff
-       andl            $0x3ffffff,%ebx
-       mov             %ebx,h0
-
-       add             $0x40,m
-       dec             %rcx
-       jnz             .Ldoblock4
-
-       vzeroupper
-       pop             %r13
-       pop             %r12
-       pop             %rbx
-       ret
-SYM_FUNC_END(poly1305_4block_avx2)
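
The partial-reduction comment block in the deleted file above (carry d0
through d4, then fold the top carry back in times five) is easier to follow in
scalar form. The standalone C fragment below is an editorial illustration of
that same carry chain over base-2^26 limbs; it is not part of the patch.

    #include <stdint.h>

    /*
     * Illustration of the partial reduction mod 2^130 - 5 that the deleted
     * assembly performs after forming the 64-bit limb sums d0..d4:
     * carry h0 -> h1 -> h2 -> h3 -> h4 -> h0 -> h1.  The carries are kept in
     * 64-bit variables, matching the comment's warning that they may not fit
     * in 32 bits in the unrolled code.
     */
    static void poly1305_partial_reduce(uint32_t h[5], uint64_t d[5])
    {
            uint64_t c;

            d[1] += d[0] >> 26;  h[0] = (uint32_t)(d[0] & 0x3ffffff);
            d[2] += d[1] >> 26;  h[1] = (uint32_t)(d[1] & 0x3ffffff);
            d[3] += d[2] >> 26;  h[2] = (uint32_t)(d[2] & 0x3ffffff);
            d[4] += d[3] >> 26;  h[3] = (uint32_t)(d[3] & 0x3ffffff);
            /* 2^130 == 5 (mod 2^130 - 5), so the top carry re-enters times 5. */
            c  = (d[4] >> 26) * 5;  h[4] = (uint32_t)(d[4] & 0x3ffffff);
            c += h[0];
            h[1] += (uint32_t)(c >> 26);
            h[0]  = (uint32_t)(c & 0x3ffffff);
    }

Keeping the accumulator in five 26-bit limbs is what lets the vector code form
each limb product with 32x32 -> 64-bit multiplies (vpmuludq) without overflow;
the same carry chain appears again in the SSE2 file below.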
diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S b/arch/x86/crypto/poly1305-sse2-x86_64.S
deleted file mode 100644
index d8ea29b..0000000
--- a/arch/x86/crypto/poly1305-sse2-x86_64.S
+++ /dev/null
@@ -1,590 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
- *
- * Copyright (C) 2015 Martin Willi
- */
-
-#include <linux/linkage.h>
-
-.section       .rodata.cst16.ANMASK, "aM", @progbits, 16
-.align 16
-ANMASK:        .octa 0x0000000003ffffff0000000003ffffff
-
-.section       .rodata.cst16.ORMASK, "aM", @progbits, 16
-.align 16
-ORMASK:        .octa 0x00000000010000000000000001000000
-
-.text
-
-#define h0 0x00(%rdi)
-#define h1 0x04(%rdi)
-#define h2 0x08(%rdi)
-#define h3 0x0c(%rdi)
-#define h4 0x10(%rdi)
-#define r0 0x00(%rdx)
-#define r1 0x04(%rdx)
-#define r2 0x08(%rdx)
-#define r3 0x0c(%rdx)
-#define r4 0x10(%rdx)
-#define s1 0x00(%rsp)
-#define s2 0x04(%rsp)
-#define s3 0x08(%rsp)
-#define s4 0x0c(%rsp)
-#define m %rsi
-#define h01 %xmm0
-#define h23 %xmm1
-#define h44 %xmm2
-#define t1 %xmm3
-#define t2 %xmm4
-#define t3 %xmm5
-#define t4 %xmm6
-#define mask %xmm7
-#define d0 %r8
-#define d1 %r9
-#define d2 %r10
-#define d3 %r11
-#define d4 %r12
-
-SYM_FUNC_START(poly1305_block_sse2)
-       # %rdi: Accumulator h[5]
-       # %rsi: 16 byte input block m
-       # %rdx: Poly1305 key r[5]
-       # %rcx: Block count
-
-       # This single block variant tries to improve performance by doing two
-       # multiplications in parallel using SSE instructions. There is quite
-       # some quardword packing involved, hence the speedup is marginal.
-
-       push            %rbx
-       push            %r12
-       sub             $0x10,%rsp
-
-       # s1..s4 = r1..r4 * 5
-       mov             r1,%eax
-       lea             (%eax,%eax,4),%eax
-       mov             %eax,s1
-       mov             r2,%eax
-       lea             (%eax,%eax,4),%eax
-       mov             %eax,s2
-       mov             r3,%eax
-       lea             (%eax,%eax,4),%eax
-       mov             %eax,s3
-       mov             r4,%eax
-       lea             (%eax,%eax,4),%eax
-       mov             %eax,s4
-
-       movdqa          ANMASK(%rip),mask
-
-.Ldoblock:
-       # h01 = [0, h1, 0, h0]
-       # h23 = [0, h3, 0, h2]
-       # h44 = [0, h4, 0, h4]
-       movd            h0,h01
-       movd            h1,t1
-       movd            h2,h23
-       movd            h3,t2
-       movd            h4,h44
-       punpcklqdq      t1,h01
-       punpcklqdq      t2,h23
-       punpcklqdq      h44,h44
-
-       # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
-       movd            0x00(m),t1
-       movd            0x03(m),t2
-       psrld           $2,t2
-       punpcklqdq      t2,t1
-       pand            mask,t1
-       paddd           t1,h01
-       # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
-       movd            0x06(m),t1
-       movd            0x09(m),t2
-       psrld           $4,t1
-       psrld           $6,t2
-       punpcklqdq      t2,t1
-       pand            mask,t1
-       paddd           t1,h23
-       # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
-       mov             0x0c(m),%eax
-       shr             $8,%eax
-       or              $0x01000000,%eax
-       movd            %eax,t1
-       pshufd          $0xc4,t1,t1
-       paddd           t1,h44
-
-       # t1[0] = h0 * r0 + h2 * s3
-       # t1[1] = h1 * s4 + h3 * s2
-       movd            r0,t1
-       movd            s4,t2
-       punpcklqdq      t2,t1
-       pmuludq         h01,t1
-       movd            s3,t2
-       movd            s2,t3
-       punpcklqdq      t3,t2
-       pmuludq         h23,t2
-       paddq           t2,t1
-       # t2[0] = h0 * r1 + h2 * s4
-       # t2[1] = h1 * r0 + h3 * s3
-       movd            r1,t2
-       movd            r0,t3
-       punpcklqdq      t3,t2
-       pmuludq         h01,t2
-       movd            s4,t3
-       movd            s3,t4
-       punpcklqdq      t4,t3
-       pmuludq         h23,t3
-       paddq           t3,t2
-       # t3[0] = h4 * s1
-       # t3[1] = h4 * s2
-       movd            s1,t3
-       movd            s2,t4
-       punpcklqdq      t4,t3
-       pmuludq         h44,t3
-       # d0 = t1[0] + t1[1] + t3[0]
-       # d1 = t2[0] + t2[1] + t3[1]
-       movdqa          t1,t4
-       punpcklqdq      t2,t4
-       punpckhqdq      t2,t1
-       paddq           t4,t1
-       paddq           t3,t1
-       movq            t1,d0
-       psrldq          $8,t1
-       movq            t1,d1
-
-       # t1[0] = h0 * r2 + h2 * r0
-       # t1[1] = h1 * r1 + h3 * s4
-       movd            r2,t1
-       movd            r1,t2
-       punpcklqdq      t2,t1
-       pmuludq         h01,t1
-       movd            r0,t2
-       movd            s4,t3
-       punpcklqdq      t3,t2
-       pmuludq         h23,t2
-       paddq           t2,t1
-       # t2[0] = h0 * r3 + h2 * r1
-       # t2[1] = h1 * r2 + h3 * r0
-       movd            r3,t2
-       movd            r2,t3
-       punpcklqdq      t3,t2
-       pmuludq         h01,t2
-       movd            r1,t3
-       movd            r0,t4
-       punpcklqdq      t4,t3
-       pmuludq         h23,t3
-       paddq           t3,t2
-       # t3[0] = h4 * s3
-       # t3[1] = h4 * s4
-       movd            s3,t3
-       movd            s4,t4
-       punpcklqdq      t4,t3
-       pmuludq         h44,t3
-       # d2 = t1[0] + t1[1] + t3[0]
-       # d3 = t2[0] + t2[1] + t3[1]
-       movdqa          t1,t4
-       punpcklqdq      t2,t4
-       punpckhqdq      t2,t1
-       paddq           t4,t1
-       paddq           t3,t1
-       movq            t1,d2
-       psrldq          $8,t1
-       movq            t1,d3
-
-       # t1[0] = h0 * r4 + h2 * r2
-       # t1[1] = h1 * r3 + h3 * r1
-       movd            r4,t1
-       movd            r3,t2
-       punpcklqdq      t2,t1
-       pmuludq         h01,t1
-       movd            r2,t2
-       movd            r1,t3
-       punpcklqdq      t3,t2
-       pmuludq         h23,t2
-       paddq           t2,t1
-       # t3[0] = h4 * r0
-       movd            r0,t3
-       pmuludq         h44,t3
-       # d4 = t1[0] + t1[1] + t3[0]
-       movdqa          t1,t4
-       psrldq          $8,t4
-       paddq           t4,t1
-       paddq           t3,t1
-       movq            t1,d4
-
-       # d1 += d0 >> 26
-       mov             d0,%rax
-       shr             $26,%rax
-       add             %rax,d1
-       # h0 = d0 & 0x3ffffff
-       mov             d0,%rbx
-       and             $0x3ffffff,%ebx
-
-       # d2 += d1 >> 26
-       mov             d1,%rax
-       shr             $26,%rax
-       add             %rax,d2
-       # h1 = d1 & 0x3ffffff
-       mov             d1,%rax
-       and             $0x3ffffff,%eax
-       mov             %eax,h1
-
-       # d3 += d2 >> 26
-       mov             d2,%rax
-       shr             $26,%rax
-       add             %rax,d3
-       # h2 = d2 & 0x3ffffff
-       mov             d2,%rax
-       and             $0x3ffffff,%eax
-       mov             %eax,h2
-
-       # d4 += d3 >> 26
-       mov             d3,%rax
-       shr             $26,%rax
-       add             %rax,d4
-       # h3 = d3 & 0x3ffffff
-       mov             d3,%rax
-       and             $0x3ffffff,%eax
-       mov             %eax,h3
-
-       # h0 += (d4 >> 26) * 5
-       mov             d4,%rax
-       shr             $26,%rax
-       lea             (%rax,%rax,4),%rax
-       add             %rax,%rbx
-       # h4 = d4 & 0x3ffffff
-       mov             d4,%rax
-       and             $0x3ffffff,%eax
-       mov             %eax,h4
-
-       # h1 += h0 >> 26
-       mov             %rbx,%rax
-       shr             $26,%rax
-       add             %eax,h1
-       # h0 = h0 & 0x3ffffff
-       andl            $0x3ffffff,%ebx
-       mov             %ebx,h0
-
-       add             $0x10,m
-       dec             %rcx
-       jnz             .Ldoblock
-
-       # Zeroing of key material
-       mov             %rcx,0x00(%rsp)
-       mov             %rcx,0x08(%rsp)
-
-       add             $0x10,%rsp
-       pop             %r12
-       pop             %rbx
-       ret
-SYM_FUNC_END(poly1305_block_sse2)
-
-
-#define u0 0x00(%r8)
-#define u1 0x04(%r8)
-#define u2 0x08(%r8)
-#define u3 0x0c(%r8)
-#define u4 0x10(%r8)
-#define hc0 %xmm0
-#define hc1 %xmm1
-#define hc2 %xmm2
-#define hc3 %xmm5
-#define hc4 %xmm6
-#define ru0 %xmm7
-#define ru1 %xmm8
-#define ru2 %xmm9
-#define ru3 %xmm10
-#define ru4 %xmm11
-#define sv1 %xmm12
-#define sv2 %xmm13
-#define sv3 %xmm14
-#define sv4 %xmm15
-#undef d0
-#define d0 %r13
-
-SYM_FUNC_START(poly1305_2block_sse2)
-       # %rdi: Accumulator h[5]
-       # %rsi: 16 byte input block m
-       # %rdx: Poly1305 key r[5]
-       # %rcx: Doubleblock count
-       # %r8:  Poly1305 derived key r^2 u[5]
-
-       # This two-block variant further improves performance by using loop
-       # unrolled block processing. This is more straight forward and does
-       # less byte shuffling, but requires a second Poly1305 key r^2:
-       # h = (h + m) * r    =>    h = (h + m1) * r^2 + m2 * r
-
-       push            %rbx
-       push            %r12
-       push            %r13
-
-       # combine r0,u0
-       movd            u0,ru0
-       movd            r0,t1
-       punpcklqdq      t1,ru0
-
-       # combine r1,u1 and s1=r1*5,v1=u1*5
-       movd            u1,ru1
-       movd            r1,t1
-       punpcklqdq      t1,ru1
-       movdqa          ru1,sv1
-       pslld           $2,sv1
-       paddd           ru1,sv1
-
-       # combine r2,u2 and s2=r2*5,v2=u2*5
-       movd            u2,ru2
-       movd            r2,t1
-       punpcklqdq      t1,ru2
-       movdqa          ru2,sv2
-       pslld           $2,sv2
-       paddd           ru2,sv2
-
-       # combine r3,u3 and s3=r3*5,v3=u3*5
-       movd            u3,ru3
-       movd            r3,t1
-       punpcklqdq      t1,ru3
-       movdqa          ru3,sv3
-       pslld           $2,sv3
-       paddd           ru3,sv3
-
-       # combine r4,u4 and s4=r4*5,v4=u4*5
-       movd            u4,ru4
-       movd            r4,t1
-       punpcklqdq      t1,ru4
-       movdqa          ru4,sv4
-       pslld           $2,sv4
-       paddd           ru4,sv4
-
-.Ldoblock2:
-       # hc0 = [ m[16-19] & 0x3ffffff, h0 + m[0-3] & 0x3ffffff ]
-       movd            0x00(m),hc0
-       movd            0x10(m),t1
-       punpcklqdq      t1,hc0
-       pand            ANMASK(%rip),hc0
-       movd            h0,t1
-       paddd           t1,hc0
-       # hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + (m[3-6] >> 2) & 0x3ffffff ]
-       movd            0x03(m),hc1
-       movd            0x13(m),t1
-       punpcklqdq      t1,hc1
-       psrld           $2,hc1
-       pand            ANMASK(%rip),hc1
-       movd            h1,t1
-       paddd           t1,hc1
-       # hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + (m[6-9] >> 4) & 0x3ffffff ]
-       movd            0x06(m),hc2
-       movd            0x16(m),t1
-       punpcklqdq      t1,hc2
-       psrld           $4,hc2
-       pand            ANMASK(%rip),hc2
-       movd            h2,t1
-       paddd           t1,hc2
-       # hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + (m[9-12] >> 6) & 0x3ffffff ]
-       movd            0x09(m),hc3
-       movd            0x19(m),t1
-       punpcklqdq      t1,hc3
-       psrld           $6,hc3
-       pand            ANMASK(%rip),hc3
-       movd            h3,t1
-       paddd           t1,hc3
-       # hc4 = [ (m[28-31] >> 8) | (1<<24), h4 + (m[12-15] >> 8) | (1<<24) ]
-       movd            0x0c(m),hc4
-       movd            0x1c(m),t1
-       punpcklqdq      t1,hc4
-       psrld           $8,hc4
-       por             ORMASK(%rip),hc4
-       movd            h4,t1
-       paddd           t1,hc4
-
-       # t1 = [ hc0[1] * r0, hc0[0] * u0 ]
-       movdqa          ru0,t1
-       pmuludq         hc0,t1
-       # t1 += [ hc1[1] * s4, hc1[0] * v4 ]
-       movdqa          sv4,t2
-       pmuludq         hc1,t2
-       paddq           t2,t1
-       # t1 += [ hc2[1] * s3, hc2[0] * v3 ]
-       movdqa          sv3,t2
-       pmuludq         hc2,t2
-       paddq           t2,t1
-       # t1 += [ hc3[1] * s2, hc3[0] * v2 ]
-       movdqa          sv2,t2
-       pmuludq         hc3,t2
-       paddq           t2,t1
-       # t1 += [ hc4[1] * s1, hc4[0] * v1 ]
-       movdqa          sv1,t2
-       pmuludq         hc4,t2
-       paddq           t2,t1
-       # d0 = t1[0] + t1[1]
-       movdqa          t1,t2
-       psrldq          $8,t2
-       paddq           t2,t1
-       movq            t1,d0
-
-       # t1 = [ hc0[1] * r1, hc0[0] * u1 ]
-       movdqa          ru1,t1
-       pmuludq         hc0,t1
-       # t1 += [ hc1[1] * r0, hc1[0] * u0 ]
-       movdqa          ru0,t2
-       pmuludq         hc1,t2
-       paddq           t2,t1
-       # t1 += [ hc2[1] * s4, hc2[0] * v4 ]
-       movdqa          sv4,t2
-       pmuludq         hc2,t2
-       paddq           t2,t1
-       # t1 += [ hc3[1] * s3, hc3[0] * v3 ]
-       movdqa          sv3,t2
-       pmuludq         hc3,t2
-       paddq           t2,t1
-       # t1 += [ hc4[1] * s2, hc4[0] * v2 ]
-       movdqa          sv2,t2
-       pmuludq         hc4,t2
-       paddq           t2,t1
-       # d1 = t1[0] + t1[1]
-       movdqa          t1,t2
-       psrldq          $8,t2
-       paddq           t2,t1
-       movq            t1,d1
-
-       # t1 = [ hc0[1] * r2, hc0[0] * u2 ]
-       movdqa          ru2,t1
-       pmuludq         hc0,t1
-       # t1 += [ hc1[1] * r1, hc1[0] * u1 ]
-       movdqa          ru1,t2
-       pmuludq         hc1,t2
-       paddq           t2,t1
-       # t1 += [ hc2[1] * r0, hc2[0] * u0 ]
-       movdqa          ru0,t2
-       pmuludq         hc2,t2
-       paddq           t2,t1
-       # t1 += [ hc3[1] * s4, hc3[0] * v4 ]
-       movdqa          sv4,t2
-       pmuludq         hc3,t2
-       paddq           t2,t1
-       # t1 += [ hc4[1] * s3, hc4[0] * v3 ]
-       movdqa          sv3,t2
-       pmuludq         hc4,t2
-       paddq           t2,t1
-       # d2 = t1[0] + t1[1]
-       movdqa          t1,t2
-       psrldq          $8,t2
-       paddq           t2,t1
-       movq            t1,d2
-
-       # t1 = [ hc0[1] * r3, hc0[0] * u3 ]
-       movdqa          ru3,t1
-       pmuludq         hc0,t1
-       # t1 += [ hc1[1] * r2, hc1[0] * u2 ]
-       movdqa          ru2,t2
-       pmuludq         hc1,t2
-       paddq           t2,t1
-       # t1 += [ hc2[1] * r1, hc2[0] * u1 ]
-       movdqa          ru1,t2
-       pmuludq         hc2,t2
-       paddq           t2,t1
-       # t1 += [ hc3[1] * r0, hc3[0] * u0 ]
-       movdqa          ru0,t2
-       pmuludq         hc3,t2
-       paddq           t2,t1
-       # t1 += [ hc4[1] * s4, hc4[0] * v4 ]
-       movdqa          sv4,t2
-       pmuludq         hc4,t2
-       paddq           t2,t1
-       # d3 = t1[0] + t1[1]
-       movdqa          t1,t2
-       psrldq          $8,t2
-       paddq           t2,t1
-       movq            t1,d3
-
-       # t1 = [ hc0[1] * r4, hc0[0] * u4 ]
-       movdqa          ru4,t1
-       pmuludq         hc0,t1
-       # t1 += [ hc1[1] * r3, hc1[0] * u3 ]
-       movdqa          ru3,t2
-       pmuludq         hc1,t2
-       paddq           t2,t1
-       # t1 += [ hc2[1] * r2, hc2[0] * u2 ]
-       movdqa          ru2,t2
-       pmuludq         hc2,t2
-       paddq           t2,t1
-       # t1 += [ hc3[1] * r1, hc3[0] * u1 ]
-       movdqa          ru1,t2
-       pmuludq         hc3,t2
-       paddq           t2,t1
-       # t1 += [ hc4[1] * r0, hc4[0] * u0 ]
-       movdqa          ru0,t2
-       pmuludq         hc4,t2
-       paddq           t2,t1
-       # d4 = t1[0] + t1[1]
-       movdqa          t1,t2
-       psrldq          $8,t2
-       paddq           t2,t1
-       movq            t1,d4
-
-       # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 ->
-       # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small
-       # amount.  Careful: we must not assume the carry bits 'd0 >> 26',
-       # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
-       # integers.  It's true in a single-block implementation, but not here.
-
-       # d1 += d0 >> 26
-       mov             d0,%rax
-       shr             $26,%rax
-       add             %rax,d1
-       # h0 = d0 & 0x3ffffff
-       mov             d0,%rbx
-       and             $0x3ffffff,%ebx
-
-       # d2 += d1 >> 26
-       mov             d1,%rax
-       shr             $26,%rax
-       add             %rax,d2
-       # h1 = d1 & 0x3ffffff
-       mov             d1,%rax
-       and             $0x3ffffff,%eax
-       mov             %eax,h1
-
-       # d3 += d2 >> 26
-       mov             d2,%rax
-       shr             $26,%rax
-       add             %rax,d3
-       # h2 = d2 & 0x3ffffff
-       mov             d2,%rax
-       and             $0x3ffffff,%eax
-       mov             %eax,h2
-
-       # d4 += d3 >> 26
-       mov             d3,%rax
-       shr             $26,%rax
-       add             %rax,d4
-       # h3 = d3 & 0x3ffffff
-       mov             d3,%rax
-       and             $0x3ffffff,%eax
-       mov             %eax,h3
-
-       # h0 += (d4 >> 26) * 5
-       mov             d4,%rax
-       shr             $26,%rax
-       lea             (%rax,%rax,4),%rax
-       add             %rax,%rbx
-       # h4 = d4 & 0x3ffffff
-       mov             d4,%rax
-       and             $0x3ffffff,%eax
-       mov             %eax,h4
-
-       # h1 += h0 >> 26
-       mov             %rbx,%rax
-       shr             $26,%rax
-       add             %eax,h1
-       # h0 = h0 & 0x3ffffff
-       andl            $0x3ffffff,%ebx
-       mov             %ebx,h0
-
-       add             $0x20,m
-       dec             %rcx
-       jnz             .Ldoblock2
-
-       pop             %r13
-       pop             %r12
-       pop             %rbx
-       ret
-SYM_FUNC_END(poly1305_2block_sse2)
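
Both deleted files justify their unrolling with the identity noted in their
header comments: applying h = (h + m) * r one block at a time is algebraically
the same as consuming two blocks per iteration with h = (h + m1) * r^2 + m2 * r
(or four with powers r^4..r). The toy program below is an editorial check of
that identity using a small 61-bit prime in place of 2^130 - 5; it assumes a
compiler with __int128 support and is not part of the patch.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Toy modulus standing in for 2^130 - 5. */
    #define P ((1ULL << 61) - 1)

    static uint64_t mulmod(uint64_t a, uint64_t b)
    {
            return (uint64_t)(((__uint128_t)a * b) % P);
    }

    int main(void)
    {
            uint64_t m[4] = { 11, 22, 33, 44 };     /* toy "message blocks" */
            uint64_t r = 123456789, r2 = mulmod(r, r);
            uint64_t h_serial = 0, h_pair = 0;
            int i;

            /* One block at a time: h = (h + m) * r. */
            for (i = 0; i < 4; i++)
                    h_serial = mulmod((h_serial + m[i]) % P, r);

            /* Two blocks per iteration: h = (h + m1) * r^2 + m2 * r. */
            for (i = 0; i < 4; i += 2)
                    h_pair = (mulmod((h_pair + m[i]) % P, r2) +
                              mulmod(m[i + 1], r)) % P;

            assert(h_serial == h_pair);
            printf("serial == pairwise: %llu\n", (unsigned long long)h_serial);
            return 0;
    }

This is the property that lets the vector code multiply independent lanes by
different powers of r in parallel, which is also how the cryptogams code below
scales up to 4-way AVX2 and 8-way AVX-512 processing.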
diff --git a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
index 342ad7f..7a6b538 100644
--- a/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
+++ b/arch/x86/crypto/poly1305-x86_64-cryptogams.pl
@@ -1,11 +1,14 @@
-#! /usr/bin/env perl
-# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
+#!/usr/bin/env perl
+# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
 #
-# Licensed under the OpenSSL license (the "License").  You may not use
-# this file except in compliance with the License.  You can obtain a copy
-# in the file LICENSE in the source distribution or at
-# https://www.openssl.org/source/license.html
-
+# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
+# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
+#
+# This code is taken from the OpenSSL project but the author, Andy Polyakov,
+# has relicensed it under the licenses specified in the SPDX header above.
+# The original headers, including the original license headers, are
+# included below for completeness.
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -32,7 +35,7 @@
 # Skylake-X system performance. Since we are likely to suppress
 # AVX512F capability flag [at least on Skylake-X], conversion serves
 # as kind of "investment protection". Note that next *lake processor,
-# Cannolake, has AVX512IFMA code path to execute...
+# Cannonlake, has AVX512IFMA code path to execute...
 #
 # Numbers are cycles per processed byte with poly1305_blocks alone,
 # measured with rdtsc at fixed clock frequency.
@@ -68,39 +71,114 @@ $output  = shift;
 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
 
 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
-
-$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
-( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
-( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
-die "can't locate x86_64-xlate.pl";
-
-if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
-               =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
-       $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26);
+$kernel=0; $kernel=1 if (!$flavour && !$output);
+
+if (!$kernel) {
+       $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+       ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+       ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+       die "can't locate x86_64-xlate.pl";
+
+       open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+       *STDOUT=*OUT;
+
+       if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+           =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+               $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
+       }
+
+       if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+           `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
+               $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
+               $avx += 1 if ($1==2.11 && $2>=8);
+       }
+
+       if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+           `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+               $avx = ($1>=10) + ($1>=11);
+       }
+
+       if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
+               $avx = ($2>=3.0) + ($2>3.0);
+       }
+} else {
+       $avx = 4; # The kernel uses ifdefs for this.
 }
 
-if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
-          `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
-       $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12);
-       $avx += 2 if ($1==2.11 && $2>=8);
+sub declare_function() {
+       my ($name, $align, $nargs) = @_;
+       if($kernel) {
+               $code .= ".align $align\n";
+               $code .= "SYM_FUNC_START($name)\n";
+               $code .= ".L$name:\n";
+       } else {
+               $code .= ".globl        $name\n";
+               $code .= ".type $name,\@function,$nargs\n";
+               $code .= ".align        $align\n";
+               $code .= "$name:\n";
+       }
 }
 
-if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
-          `ml64 2>&1` =~ /Version ([0-9]+)\./) {
-       $avx = ($1>=10) + ($1>=12);
+sub end_function() {
+       my ($name) = @_;
+       if($kernel) {
+               $code .= "SYM_FUNC_END($name)\n";
+       } else {
+               $code .= ".size   $name,.-$name\n";
+       }
 }
 
-if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
-       $avx = ($2>=3.0) + ($2>3.0);
-}
+$code.=<<___ if $kernel;
+#include <linux/linkage.h>
+___
+
+if ($avx) {
+$code.=<<___ if $kernel;
+.section .rodata
+___
+$code.=<<___;
+.align 64
+.Lconst:
+.Lmask24:
+.long  0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
+.L129:
+.long  `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
+.Lmask26:
+.long  0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
+.Lpermd_avx2:
+.long  2,2,2,3,2,0,2,1
+.Lpermd_avx512:
+.long  0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
+
+.L2_44_inp_permd:
+.long  0,1,1,2,2,3,7,7
+.L2_44_inp_shift:
+.quad  0,12,24,64
+.L2_44_mask:
+.quad  0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
+.L2_44_shift_rgt:
+.quad  44,44,42,64
+.L2_44_shift_lft:
+.quad  8,8,10,64
 
-open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
-*STDOUT=*OUT;
+.align 64
+.Lx_mask44:
+.quad  0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+.quad  0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
+.Lx_mask42:
+.quad  0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
+.quad  0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
+___
+}
+$code.=<<___ if (!$kernel);
+.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 16
+___
 
 my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
 my ($mac,$nonce)=($inp,$len);  # *_emit arguments
-my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
-my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
+my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
+my ($h0,$h1,$h2)=("%r14","%rbx","%r10");
 
 sub poly1305_iteration {
 # input:       copy of $r1 in %rax, $h0-$h2, $r0-$r1
@@ -155,19 +233,19 @@ ___
 
 $code.=<<___;
 .text
-
+___
+$code.=<<___ if (!$kernel);
 .extern        OPENSSL_ia32cap_P
 
-.globl poly1305_init
-.hidden        poly1305_init
-.globl poly1305_blocks
-.hidden        poly1305_blocks
-.globl poly1305_emit
-.hidden        poly1305_emit
-
-.type  poly1305_init,\@function,3
-.align 32
-poly1305_init:
+.globl poly1305_init_x86_64
+.hidden        poly1305_init_x86_64
+.globl poly1305_blocks_x86_64
+.hidden        poly1305_blocks_x86_64
+.globl poly1305_emit_x86_64
+.hidden        poly1305_emit_x86_64
+___
+&declare_function("poly1305_init_x86_64", 32, 3);
+$code.=<<___;
        xor     %rax,%rax
        mov     %rax,0($ctx)            # initialize hash value
        mov     %rax,8($ctx)
@@ -175,11 +253,12 @@ poly1305_init:
 
        cmp     \$0,$inp
        je      .Lno_key
-
-       lea     poly1305_blocks(%rip),%r10
-       lea     poly1305_emit(%rip),%r11
 ___
-$code.=<<___   if ($avx);
+$code.=<<___ if (!$kernel);
+       lea     poly1305_blocks_x86_64(%rip),%r10
+       lea     poly1305_emit_x86_64(%rip),%r11
+___
+$code.=<<___   if (!$kernel && $avx);
        mov     OPENSSL_ia32cap_P+4(%rip),%r9
        lea     poly1305_blocks_avx(%rip),%rax
        lea     poly1305_emit_avx(%rip),%rcx
@@ -187,12 +266,12 @@ $code.=<<___      if ($avx);
        cmovc   %rax,%r10
        cmovc   %rcx,%r11
 ___
-$code.=<<___   if ($avx>1);
+$code.=<<___   if (!$kernel && $avx>1);
        lea     poly1305_blocks_avx2(%rip),%rax
        bt      \$`5+32`,%r9            # AVX2?
        cmovc   %rax,%r10
 ___
-$code.=<<___   if ($avx>3);
+$code.=<<___   if (!$kernel && $avx>3);
        mov     \$`(1<<31|1<<21|1<<16)`,%rax
        shr     \$32,%r9
        and     %rax,%r9
@@ -207,11 +286,11 @@ $code.=<<___;
        mov     %rax,24($ctx)
        mov     %rcx,32($ctx)
 ___
-$code.=<<___   if ($flavour !~ /elf32/);
+$code.=<<___   if (!$kernel && $flavour !~ /elf32/);
        mov     %r10,0(%rdx)
        mov     %r11,8(%rdx)
 ___
-$code.=<<___   if ($flavour =~ /elf32/);
+$code.=<<___   if (!$kernel && $flavour =~ /elf32/);
        mov     %r10d,0(%rdx)
        mov     %r11d,4(%rdx)
 ___
@@ -219,11 +298,11 @@ $code.=<<___;
        mov     \$1,%eax
 .Lno_key:
        ret
-.size  poly1305_init,.-poly1305_init
+___
+&end_function("poly1305_init_x86_64");
 
-.type  poly1305_blocks,\@function,4
-.align 32
-poly1305_blocks:
+&declare_function("poly1305_blocks_x86_64", 32, 4);
+$code.=<<___;
 .cfi_startproc
 .Lblocks:
        shr     \$4,$len
@@ -231,8 +310,6 @@ poly1305_blocks:
 
        push    %rbx
 .cfi_push      %rbx
-       push    %rbp
-.cfi_push      %rbp
        push    %r12
 .cfi_push      %r12
        push    %r13
@@ -241,6 +318,8 @@ poly1305_blocks:
 .cfi_push      %r14
        push    %r15
 .cfi_push      %r15
+       push    $ctx
+.cfi_push      $ctx
 .Lblocks_body:
 
        mov     $len,%r15               # reassign $len
@@ -265,26 +344,29 @@ poly1305_blocks:
        lea     16($inp),$inp
        adc     $padbit,$h2
 ___
+
        &poly1305_iteration();
+
 $code.=<<___;
        mov     $r1,%rax
        dec     %r15                    # len-=16
        jnz     .Loop
 
+       mov     0(%rsp),$ctx
+.cfi_restore   $ctx
+
        mov     $h0,0($ctx)             # store hash value
        mov     $h1,8($ctx)
        mov     $h2,16($ctx)
 
-       mov     0(%rsp),%r15
+       mov     8(%rsp),%r15
 .cfi_restore   %r15
-       mov     8(%rsp),%r14
+       mov     16(%rsp),%r14
 .cfi_restore   %r14
-       mov     16(%rsp),%r13
+       mov     24(%rsp),%r13
 .cfi_restore   %r13
-       mov     24(%rsp),%r12
+       mov     32(%rsp),%r12
 .cfi_restore   %r12
-       mov     32(%rsp),%rbp
-.cfi_restore   %rbp
        mov     40(%rsp),%rbx
 .cfi_restore   %rbx
        lea     48(%rsp),%rsp
@@ -293,11 +375,11 @@ $code.=<<___;
 .Lblocks_epilogue:
        ret
 .cfi_endproc
-.size  poly1305_blocks,.-poly1305_blocks
+___
+&end_function("poly1305_blocks_x86_64");
 
-.type  poly1305_emit,\@function,3
-.align 32
-poly1305_emit:
+&declare_function("poly1305_emit_x86_64", 32, 3);
+$code.=<<___;
 .Lemit:
        mov     0($ctx),%r8     # load hash value
        mov     8($ctx),%r9
@@ -318,10 +400,14 @@ poly1305_emit:
        mov     %rcx,8($mac)
 
        ret
-.size  poly1305_emit,.-poly1305_emit
 ___
+&end_function("poly1305_emit_x86_64");
 if ($avx) {
 
+if($kernel) {
+       $code .= "#ifdef CONFIG_AS_AVX\n";
+}
+
 ########################################################################
 # Layout of opaque area is following.
 #
@@ -342,15 +428,19 @@ $code.=<<___;
 .type  __poly1305_block,\@abi-omnipotent
 .align 32
 __poly1305_block:
+       push $ctx
 ___
        &poly1305_iteration();
 $code.=<<___;
+       pop $ctx
        ret
 .size  __poly1305_block,.-__poly1305_block
 
 .type  __poly1305_init_avx,\@abi-omnipotent
 .align 32
 __poly1305_init_avx:
+       push %rbp
+       mov %rsp,%rbp
        mov     $r0,$h0
        mov     $r1,$h1
        xor     $h2,$h2
@@ -507,12 +597,13 @@ __poly1305_init_avx:
        mov     $d1#d,`16*8+8-64`($ctx)
 
        lea     -48-64($ctx),$ctx       # size [de-]optimization
+       pop %rbp
        ret
 .size  __poly1305_init_avx,.-__poly1305_init_avx
+___
 
-.type  poly1305_blocks_avx,\@function,4
-.align 32
-poly1305_blocks_avx:
+&declare_function("poly1305_blocks_avx", 32, 4);
+$code.=<<___;
 .cfi_startproc
        mov     20($ctx),%r8d           # is_base2_26
        cmp     \$128,$len
@@ -532,10 +623,11 @@ poly1305_blocks_avx:
        test    \$31,$len
        jz      .Leven_avx
 
-       push    %rbx
-.cfi_push      %rbx
        push    %rbp
 .cfi_push      %rbp
+       mov     %rsp,%rbp
+       push    %rbx
+.cfi_push      %rbx
        push    %r12
 .cfi_push      %r12
        push    %r13
@@ -645,20 +737,18 @@ poly1305_blocks_avx:
        mov     $h2#d,16($ctx)
 .align 16
 .Ldone_avx:
-       mov     0(%rsp),%r15
+       pop             %r15
 .cfi_restore   %r15
-       mov     8(%rsp),%r14
+       pop             %r14
 .cfi_restore   %r14
-       mov     16(%rsp),%r13
+       pop             %r13
 .cfi_restore   %r13
-       mov     24(%rsp),%r12
+       pop             %r12
 .cfi_restore   %r12
-       mov     32(%rsp),%rbp
-.cfi_restore   %rbp
-       mov     40(%rsp),%rbx
+       pop             %rbx
 .cfi_restore   %rbx
-       lea     48(%rsp),%rsp
-.cfi_adjust_cfa_offset -48
+       pop             %rbp
+.cfi_restore   %rbp
 .Lno_data_avx:
 .Lblocks_avx_epilogue:
        ret
@@ -667,10 +757,11 @@ poly1305_blocks_avx:
 .align 32
 .Lbase2_64_avx:
 .cfi_startproc
-       push    %rbx
-.cfi_push      %rbx
        push    %rbp
 .cfi_push      %rbp
+       mov     %rsp,%rbp
+       push    %rbx
+.cfi_push      %rbx
        push    %r12
 .cfi_push      %r12
        push    %r13
@@ -736,22 +827,18 @@ poly1305_blocks_avx:
 
 .Lproceed_avx:
        mov     %r15,$len
-
-       mov     0(%rsp),%r15
+       pop             %r15
 .cfi_restore   %r15
-       mov     8(%rsp),%r14
+       pop             %r14
 .cfi_restore   %r14
-       mov     16(%rsp),%r13
+       pop             %r13
 .cfi_restore   %r13
-       mov     24(%rsp),%r12
+       pop             %r12
 .cfi_restore   %r12
-       mov     32(%rsp),%rbp
-.cfi_restore   %rbp
-       mov     40(%rsp),%rbx
+       pop             %rbx
 .cfi_restore   %rbx
-       lea     48(%rsp),%rax
-       lea     48(%rsp),%rsp
-.cfi_adjust_cfa_offset -48
+       pop             %rbp
+.cfi_restore   %rbp
 .Lbase2_64_avx_epilogue:
        jmp     .Ldo_avx
 .cfi_endproc
@@ -768,8 +855,11 @@ poly1305_blocks_avx:
 .Ldo_avx:
 ___
 $code.=<<___   if (!$win64);
+       lea             8(%rsp),%r10
+.cfi_def_cfa_register  %r10
+       and             \$-32,%rsp
+       sub             \$-8,%rsp
        lea             -0x58(%rsp),%r11
-.cfi_def_cfa           %r11,0x60
        sub             \$0x178,%rsp
 ___
 $code.=<<___   if ($win64);
@@ -1361,18 +1451,18 @@ $code.=<<___    if ($win64);
 .Ldo_avx_epilogue:
 ___
 $code.=<<___   if (!$win64);
-       lea             0x58(%r11),%rsp
-.cfi_def_cfa           %rsp,8
+       lea             -8(%r10),%rsp
+.cfi_def_cfa_register  %rsp
 ___
 $code.=<<___;
        vzeroupper
        ret
 .cfi_endproc
-.size  poly1305_blocks_avx,.-poly1305_blocks_avx
+___
+&end_function("poly1305_blocks_avx");
 
-.type  poly1305_emit_avx,\@function,3
-.align 32
-poly1305_emit_avx:
+&declare_function("poly1305_emit_avx", 32, 3);
+$code.=<<___;
        cmpl    \$0,20($ctx)    # is_base2_26?
        je      .Lemit
 
@@ -1423,41 +1513,51 @@ poly1305_emit_avx:
        mov     %rcx,8($mac)
 
        ret
-.size  poly1305_emit_avx,.-poly1305_emit_avx
 ___
+&end_function("poly1305_emit_avx");
+
+if ($kernel) {
+       $code .= "#endif\n";
+}
 
 if ($avx>1) {
+
+if ($kernel) {
+       $code .= "#ifdef CONFIG_AS_AVX2\n";
+}
+
 my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
     map("%ymm$_",(0..15));
 my $S4=$MASK;
 
+sub poly1305_blocks_avxN {
+       my ($avx512) = @_;
+       my $suffix = $avx512 ? "_avx512" : "";
 $code.=<<___;
-.type  poly1305_blocks_avx2,\@function,4
-.align 32
-poly1305_blocks_avx2:
 .cfi_startproc
        mov     20($ctx),%r8d           # is_base2_26
        cmp     \$128,$len
-       jae     .Lblocks_avx2
+       jae     .Lblocks_avx2$suffix
        test    %r8d,%r8d
        jz      .Lblocks
 
-.Lblocks_avx2:
+.Lblocks_avx2$suffix:
        and     \$-16,$len
-       jz      .Lno_data_avx2
+       jz      .Lno_data_avx2$suffix
 
        vzeroupper
 
        test    %r8d,%r8d
-       jz      .Lbase2_64_avx2
+       jz      .Lbase2_64_avx2$suffix
 
        test    \$63,$len
-       jz      .Leven_avx2
+       jz      .Leven_avx2$suffix
 
-       push    %rbx
-.cfi_push      %rbx
        push    %rbp
 .cfi_push      %rbp
+       mov     %rsp,%rbp
+       push    %rbx
+.cfi_push      %rbx
        push    %r12
 .cfi_push      %r12
        push    %r13
@@ -1466,7 +1566,7 @@ poly1305_blocks_avx2:
 .cfi_push      %r14
        push    %r15
 .cfi_push      %r15
-.Lblocks_avx2_body:
+.Lblocks_avx2_body$suffix:
 
        mov     $len,%r15               # reassign $len
 
@@ -1513,7 +1613,7 @@ poly1305_blocks_avx2:
        shr     \$2,$s1
        add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
 
-.Lbase2_26_pre_avx2:
+.Lbase2_26_pre_avx2$suffix:
        add     0($inp),$h0             # accumulate input
        adc     8($inp),$h1
        lea     16($inp),$inp
@@ -1524,10 +1624,10 @@ poly1305_blocks_avx2:
        mov     $r1,%rax
 
        test    \$63,%r15
-       jnz     .Lbase2_26_pre_avx2
+       jnz     .Lbase2_26_pre_avx2$suffix
 
        test    $padbit,$padbit         # if $padbit is zero,
-       jz      .Lstore_base2_64_avx2   # store hash in base 2^64 format
+       jz      .Lstore_base2_64_avx2$suffix    # store hash in base 2^64 format
 
        ################################# base 2^64 -> base 2^26
        mov     $h0,%rax
@@ -1548,57 +1648,56 @@ poly1305_blocks_avx2:
        or      $r1,$h2                 # h[4]
 
        test    %r15,%r15
-       jz      .Lstore_base2_26_avx2
+       jz      .Lstore_base2_26_avx2$suffix
 
        vmovd   %rax#d,%x#$H0
        vmovd   %rdx#d,%x#$H1
        vmovd   $h0#d,%x#$H2
        vmovd   $h1#d,%x#$H3
        vmovd   $h2#d,%x#$H4
-       jmp     .Lproceed_avx2
+       jmp     .Lproceed_avx2$suffix
 
 .align 32
-.Lstore_base2_64_avx2:
+.Lstore_base2_64_avx2$suffix:
        mov     $h0,0($ctx)
        mov     $h1,8($ctx)
        mov     $h2,16($ctx)            # note that is_base2_26 is zeroed
-       jmp     .Ldone_avx2
+       jmp     .Ldone_avx2$suffix
 
 .align 16
-.Lstore_base2_26_avx2:
+.Lstore_base2_26_avx2$suffix:
        mov     %rax#d,0($ctx)          # store hash value base 2^26
        mov     %rdx#d,4($ctx)
        mov     $h0#d,8($ctx)
        mov     $h1#d,12($ctx)
        mov     $h2#d,16($ctx)
 .align 16
-.Ldone_avx2:
-       mov     0(%rsp),%r15
+.Ldone_avx2$suffix:
+       pop             %r15
 .cfi_restore   %r15
-       mov     8(%rsp),%r14
+       pop             %r14
 .cfi_restore   %r14
-       mov     16(%rsp),%r13
+       pop             %r13
 .cfi_restore   %r13
-       mov     24(%rsp),%r12
+       pop             %r12
 .cfi_restore   %r12
-       mov     32(%rsp),%rbp
-.cfi_restore   %rbp
-       mov     40(%rsp),%rbx
+       pop             %rbx
 .cfi_restore   %rbx
-       lea     48(%rsp),%rsp
-.cfi_adjust_cfa_offset -48
-.Lno_data_avx2:
-.Lblocks_avx2_epilogue:
+       pop             %rbp
+.cfi_restore   %rbp
+.Lno_data_avx2$suffix:
+.Lblocks_avx2_epilogue$suffix:
        ret
 .cfi_endproc
 
 .align 32
-.Lbase2_64_avx2:
+.Lbase2_64_avx2$suffix:
 .cfi_startproc
-       push    %rbx
-.cfi_push      %rbx
        push    %rbp
 .cfi_push      %rbp
+       mov     %rsp,%rbp
+       push    %rbx
+.cfi_push      %rbx
        push    %r12
 .cfi_push      %r12
        push    %r13
@@ -1607,7 +1706,7 @@ poly1305_blocks_avx2:
 .cfi_push      %r14
        push    %r15
 .cfi_push      %r15
-.Lbase2_64_avx2_body:
+.Lbase2_64_avx2_body$suffix:
 
        mov     $len,%r15               # reassign $len
 
@@ -1624,9 +1723,9 @@ poly1305_blocks_avx2:
        add     $r1,$s1                 # s1 = r1 + (r1 >> 2)
 
        test    \$63,$len
-       jz      .Linit_avx2
+       jz      .Linit_avx2$suffix
 
-.Lbase2_64_pre_avx2:
+.Lbase2_64_pre_avx2$suffix:
        add     0($inp),$h0             # accumulate input
        adc     8($inp),$h1
        lea     16($inp),$inp
@@ -1637,9 +1736,9 @@ poly1305_blocks_avx2:
        mov     $r1,%rax
 
        test    \$63,%r15
-       jnz     .Lbase2_64_pre_avx2
+       jnz     .Lbase2_64_pre_avx2$suffix
 
-.Linit_avx2:
+.Linit_avx2$suffix:
        ################################# base 2^64 -> base 2^26
        mov     $h0,%rax
        mov     $h0,%rdx
@@ -1667,69 +1766,77 @@ poly1305_blocks_avx2:
 
        call    __poly1305_init_avx
 
-.Lproceed_avx2:
+.Lproceed_avx2$suffix:
        mov     %r15,$len                       # restore $len
-       mov     OPENSSL_ia32cap_P+8(%rip),%r10d
+___
+$code.=<<___ if (!$kernel);
+       mov     OPENSSL_ia32cap_P+8(%rip),%r9d
        mov     \$`(1<<31|1<<30|1<<16)`,%r11d
-
-       mov     0(%rsp),%r15
+___
+$code.=<<___;
+       pop             %r15
 .cfi_restore   %r15
-       mov     8(%rsp),%r14
+       pop             %r14
 .cfi_restore   %r14
-       mov     16(%rsp),%r13
+       pop             %r13
 .cfi_restore   %r13
-       mov     24(%rsp),%r12
+       pop             %r12
 .cfi_restore   %r12
-       mov     32(%rsp),%rbp
-.cfi_restore   %rbp
-       mov     40(%rsp),%rbx
+       pop             %rbx
 .cfi_restore   %rbx
-       lea     48(%rsp),%rax
-       lea     48(%rsp),%rsp
-.cfi_adjust_cfa_offset -48
-.Lbase2_64_avx2_epilogue:
-       jmp     .Ldo_avx2
+       pop             %rbp
+.cfi_restore   %rbp
+.Lbase2_64_avx2_epilogue$suffix:
+       jmp     .Ldo_avx2$suffix
 .cfi_endproc
 
 .align 32
-.Leven_avx2:
+.Leven_avx2$suffix:
 .cfi_startproc
-       mov             OPENSSL_ia32cap_P+8(%rip),%r10d
+___
+$code.=<<___ if (!$kernel);
+       mov             OPENSSL_ia32cap_P+8(%rip),%r9d
+___
+$code.=<<___;
        vmovd           4*0($ctx),%x#$H0        # load hash value base 2^26
        vmovd           4*1($ctx),%x#$H1
        vmovd           4*2($ctx),%x#$H2
        vmovd           4*3($ctx),%x#$H3
        vmovd           4*4($ctx),%x#$H4
 
-.Ldo_avx2:
+.Ldo_avx2$suffix:
 ___
-$code.=<<___           if ($avx>2);
+$code.=<<___           if (!$kernel && $avx>2);
        cmp             \$512,$len
        jb              .Lskip_avx512
-       and             %r11d,%r10d
-       test            \$`1<<16`,%r10d         # check for AVX512F
+       and             %r11d,%r9d
+       test            \$`1<<16`,%r9d          # check for AVX512F
        jnz             .Lblocks_avx512
-.Lskip_avx512:
+.Lskip_avx512$suffix:
+___
+$code.=<<___ if ($avx > 2 && $avx512 && $kernel);
+       cmp             \$512,$len
+       jae             .Lblocks_avx512
 ___
 $code.=<<___   if (!$win64);
-       lea             -8(%rsp),%r11
-.cfi_def_cfa           %r11,16
+       lea             8(%rsp),%r10
+.cfi_def_cfa_register  %r10
        sub             \$0x128,%rsp
 ___
 $code.=<<___   if ($win64);
-       lea             -0xf8(%rsp),%r11
+       lea             8(%rsp),%r10
        sub             \$0x1c8,%rsp
-       vmovdqa         %xmm6,0x50(%r11)
-       vmovdqa         %xmm7,0x60(%r11)
-       vmovdqa         %xmm8,0x70(%r11)
-       vmovdqa         %xmm9,0x80(%r11)
-       vmovdqa         %xmm10,0x90(%r11)
-       vmovdqa         %xmm11,0xa0(%r11)
-       vmovdqa         %xmm12,0xb0(%r11)
-       vmovdqa         %xmm13,0xc0(%r11)
-       vmovdqa         %xmm14,0xd0(%r11)
-       vmovdqa         %xmm15,0xe0(%r11)
-.Ldo_avx2_body:
+       vmovdqa         %xmm6,-0xb0(%r10)
+       vmovdqa         %xmm7,-0xa0(%r10)
+       vmovdqa         %xmm8,-0x90(%r10)
+       vmovdqa         %xmm9,-0x80(%r10)
+       vmovdqa         %xmm10,-0x70(%r10)
+       vmovdqa         %xmm11,-0x60(%r10)
+       vmovdqa         %xmm12,-0x50(%r10)
+       vmovdqa         %xmm13,-0x40(%r10)
+       vmovdqa         %xmm14,-0x30(%r10)
+       vmovdqa         %xmm15,-0x20(%r10)
+.Ldo_avx2_body$suffix:
 ___
 $code.=<<___;
        lea             .Lconst(%rip),%rcx
@@ -1794,11 +1901,11 @@ $code.=<<___;
 
        vpaddq          $H2,$T2,$H2             # accumulate input
        sub             \$64,$len
-       jz              .Ltail_avx2
-       jmp             .Loop_avx2
+       jz              .Ltail_avx2$suffix
+       jmp             .Loop_avx2$suffix
 
 .align 32
-.Loop_avx2:
+.Loop_avx2$suffix:
        ################################################################
        # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4
        # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
@@ -1946,10 +2053,10 @@ $code.=<<___;
         vpor           32(%rcx),$T4,$T4        # padbit, yes, always
 
        sub             \$64,$len
-       jnz             .Loop_avx2
+       jnz             .Loop_avx2$suffix
 
        .byte           0x66,0x90
-.Ltail_avx2:
+.Ltail_avx2$suffix:
        ################################################################
        # while above multiplications were by r^4 in all lanes, in last
        # iteration we multiply least significant lane by r^4 and most
@@ -2087,37 +2194,29 @@ $code.=<<___;
        vmovd           %x#$H4,`4*4-48-64`($ctx)
 ___
 $code.=<<___   if ($win64);
-       vmovdqa         0x50(%r11),%xmm6
-       vmovdqa         0x60(%r11),%xmm7
-       vmovdqa         0x70(%r11),%xmm8
-       vmovdqa         0x80(%r11),%xmm9
-       vmovdqa         0x90(%r11),%xmm10
-       vmovdqa         0xa0(%r11),%xmm11
-       vmovdqa         0xb0(%r11),%xmm12
-       vmovdqa         0xc0(%r11),%xmm13
-       vmovdqa         0xd0(%r11),%xmm14
-       vmovdqa         0xe0(%r11),%xmm15
-       lea             0xf8(%r11),%rsp
-.Ldo_avx2_epilogue:
+       vmovdqa         -0xb0(%r10),%xmm6
+       vmovdqa         -0xa0(%r10),%xmm7
+       vmovdqa         -0x90(%r10),%xmm8
+       vmovdqa         -0x80(%r10),%xmm9
+       vmovdqa         -0x70(%r10),%xmm10
+       vmovdqa         -0x60(%r10),%xmm11
+       vmovdqa         -0x50(%r10),%xmm12
+       vmovdqa         -0x40(%r10),%xmm13
+       vmovdqa         -0x30(%r10),%xmm14
+       vmovdqa         -0x20(%r10),%xmm15
+       lea             -8(%r10),%rsp
+.Ldo_avx2_epilogue$suffix:
 ___
 $code.=<<___   if (!$win64);
-       lea             8(%r11),%rsp
-.cfi_def_cfa           %rsp,8
+       lea             -8(%r10),%rsp
+.cfi_def_cfa_register  %rsp
 ___
 $code.=<<___;
        vzeroupper
        ret
 .cfi_endproc
-.size  poly1305_blocks_avx2,.-poly1305_blocks_avx2
 ___
-#######################################################################
-if ($avx>2) {
-# On entry we have input length divisible by 64. But since inner loop
-# processes 128 bytes per iteration, cases when length is not divisible
-# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this
-# reason stack layout is kept identical to poly1305_blocks_avx2. If not
-# for this tail, we wouldn't have to even allocate stack frame...
-
+if($avx > 2 && $avx512) {
 my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24));
 my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29));
 my $PADBIT="%zmm30";
@@ -2128,32 +2227,29 @@ map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4));
 map(s/%y/%z/,($MASK));
 
 $code.=<<___;
-.type  poly1305_blocks_avx512,\@function,4
-.align 32
-poly1305_blocks_avx512:
 .cfi_startproc
 .Lblocks_avx512:
        mov             \$15,%eax
        kmovw           %eax,%k2
 ___
 $code.=<<___   if (!$win64);
-       lea             -8(%rsp),%r11
-.cfi_def_cfa           %r11,16
+       lea             8(%rsp),%r10
+.cfi_def_cfa_register  %r10
        sub             \$0x128,%rsp
 ___
 $code.=<<___   if ($win64);
-       lea             -0xf8(%rsp),%r11
+       lea             8(%rsp),%r10
        sub             \$0x1c8,%rsp
-       vmovdqa         %xmm6,0x50(%r11)
-       vmovdqa         %xmm7,0x60(%r11)
-       vmovdqa         %xmm8,0x70(%r11)
-       vmovdqa         %xmm9,0x80(%r11)
-       vmovdqa         %xmm10,0x90(%r11)
-       vmovdqa         %xmm11,0xa0(%r11)
-       vmovdqa         %xmm12,0xb0(%r11)
-       vmovdqa         %xmm13,0xc0(%r11)
-       vmovdqa         %xmm14,0xd0(%r11)
-       vmovdqa         %xmm15,0xe0(%r11)
+       vmovdqa         %xmm6,-0xb0(%r10)
+       vmovdqa         %xmm7,-0xa0(%r10)
+       vmovdqa         %xmm8,-0x90(%r10)
+       vmovdqa         %xmm9,-0x80(%r10)
+       vmovdqa         %xmm10,-0x70(%r10)
+       vmovdqa         %xmm11,-0x60(%r10)
+       vmovdqa         %xmm12,-0x50(%r10)
+       vmovdqa         %xmm13,-0x40(%r10)
+       vmovdqa         %xmm14,-0x30(%r10)
+       vmovdqa         %xmm15,-0x20(%r10)
 .Ldo_avx512_body:
 ___
 $code.=<<___;
@@ -2679,7 +2775,7 @@ $code.=<<___;
 
        lea             0x90(%rsp),%rax         # size optimization for .Ltail_avx2
        add             \$64,$len
-       jnz             .Ltail_avx2
+       jnz             .Ltail_avx2$suffix
 
        vpsubq          $T2,$H2,$H2             # undo input accumulation
        vmovd           %x#$H0,`4*0-48-64`($ctx)# save partially reduced
@@ -2690,29 +2786,61 @@ $code.=<<___;
        vzeroall
 ___
 $code.=<<___   if ($win64);
-       movdqa          0x50(%r11),%xmm6
-       movdqa          0x60(%r11),%xmm7
-       movdqa          0x70(%r11),%xmm8
-       movdqa          0x80(%r11),%xmm9
-       movdqa          0x90(%r11),%xmm10
-       movdqa          0xa0(%r11),%xmm11
-       movdqa          0xb0(%r11),%xmm12
-       movdqa          0xc0(%r11),%xmm13
-       movdqa          0xd0(%r11),%xmm14
-       movdqa          0xe0(%r11),%xmm15
-       lea             0xf8(%r11),%rsp
+       movdqa          -0xb0(%r10),%xmm6
+       movdqa          -0xa0(%r10),%xmm7
+       movdqa          -0x90(%r10),%xmm8
+       movdqa          -0x80(%r10),%xmm9
+       movdqa          -0x70(%r10),%xmm10
+       movdqa          -0x60(%r10),%xmm11
+       movdqa          -0x50(%r10),%xmm12
+       movdqa          -0x40(%r10),%xmm13
+       movdqa          -0x30(%r10),%xmm14
+       movdqa          -0x20(%r10),%xmm15
+       lea             -8(%r10),%rsp
 .Ldo_avx512_epilogue:
 ___
 $code.=<<___   if (!$win64);
-       lea             8(%r11),%rsp
-.cfi_def_cfa           %rsp,8
+       lea             -8(%r10),%rsp
+.cfi_def_cfa_register  %rsp
 ___
 $code.=<<___;
        ret
 .cfi_endproc
-.size  poly1305_blocks_avx512,.-poly1305_blocks_avx512
 ___
-if ($avx>3) {
+
+}
+
+}
+
+&declare_function("poly1305_blocks_avx2", 32, 4);
+poly1305_blocks_avxN(0);
+&end_function("poly1305_blocks_avx2");
+
+if($kernel) {
+       $code .= "#endif\n";
+}
+
+#######################################################################
+if ($avx>2) {
+# On entry we have input length divisible by 64. But since inner loop
+# processes 128 bytes per iteration, cases when length is not divisible
+# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this
+# reason stack layout is kept identical to poly1305_blocks_avx2. If not
+# for this tail, we wouldn't have to even allocate stack frame...
+
+if($kernel) {
+       $code .= "#ifdef CONFIG_AS_AVX512\n";
+}
+
+&declare_function("poly1305_blocks_avx512", 32, 4);
+poly1305_blocks_avxN(1);
+&end_function("poly1305_blocks_avx512");
+
+if ($kernel) {
+       $code .= "#endif\n";
+}
+
+if (!$kernel && $avx>3) {
 ########################################################################
 # VPMADD52 version using 2^44 radix.
 #
@@ -3753,45 +3881,9 @@ poly1305_emit_base2_44:
 .size  poly1305_emit_base2_44,.-poly1305_emit_base2_44
 ___
 }      }       }
-$code.=<<___;
-.align 64
-.Lconst:
-.Lmask24:
-.long  0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
-.L129:
-.long  `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
-.Lmask26:
-.long  0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
-.Lpermd_avx2:
-.long  2,2,2,3,2,0,2,1
-.Lpermd_avx512:
-.long  0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
-
-.L2_44_inp_permd:
-.long  0,1,1,2,2,3,7,7
-.L2_44_inp_shift:
-.quad  0,12,24,64
-.L2_44_mask:
-.quad  0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
-.L2_44_shift_rgt:
-.quad  44,44,42,64
-.L2_44_shift_lft:
-.quad  8,8,10,64
-
-.align 64
-.Lx_mask44:
-.quad  0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
-.quad  0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
-.Lx_mask42:
-.quad  0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
-.quad  0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
-___
 }
-$code.=<<___;
-.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
-.align 16
-___
 
+if (!$kernel)
 {      # chacha20-poly1305 helpers
 my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
                                   ("%rdi","%rsi","%rdx","%rcx");  # Unix order
@@ -4038,17 +4130,17 @@ avx_handler:
 
 .section       .pdata
 .align 4
-       .rva    .LSEH_begin_poly1305_init
-       .rva    .LSEH_end_poly1305_init
-       .rva    .LSEH_info_poly1305_init
+       .rva    .LSEH_begin_poly1305_init_x86_64
+       .rva    .LSEH_end_poly1305_init_x86_64
+       .rva    .LSEH_info_poly1305_init_x86_64
 
-       .rva    .LSEH_begin_poly1305_blocks
-       .rva    .LSEH_end_poly1305_blocks
-       .rva    .LSEH_info_poly1305_blocks
+       .rva    .LSEH_begin_poly1305_blocks_x86_64
+       .rva    .LSEH_end_poly1305_blocks_x86_64
+       .rva    .LSEH_info_poly1305_blocks_x86_64
 
-       .rva    .LSEH_begin_poly1305_emit
-       .rva    .LSEH_end_poly1305_emit
-       .rva    .LSEH_info_poly1305_emit
+       .rva    .LSEH_begin_poly1305_emit_x86_64
+       .rva    .LSEH_end_poly1305_emit_x86_64
+       .rva    .LSEH_info_poly1305_emit_x86_64
 ___
 $code.=<<___ if ($avx);
        .rva    .LSEH_begin_poly1305_blocks_avx
@@ -4088,20 +4180,20 @@ ___
 $code.=<<___;
 .section       .xdata
 .align 8
-.LSEH_info_poly1305_init:
+.LSEH_info_poly1305_init_x86_64:
        .byte   9,0,0,0
        .rva    se_handler
-       .rva    .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init
+       .rva    .LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64
 
-.LSEH_info_poly1305_blocks:
+.LSEH_info_poly1305_blocks_x86_64:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lblocks_body,.Lblocks_epilogue
 
-.LSEH_info_poly1305_emit:
+.LSEH_info_poly1305_emit_x86_64:
        .byte   9,0,0,0
        .rva    se_handler
-       .rva    .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
+       .rva    .LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
 ___
 $code.=<<___ if ($avx);
 .LSEH_info_poly1305_blocks_avx_1:
@@ -4148,12 +4240,26 @@ $code.=<<___ if ($avx>2);
 ___
 }
 
+open SELF,$0;
+while(<SELF>) {
+       next if (/^#!/);
+       last if (!s/^#/\/\// and !/^$/);
+       print;
+}
+close SELF;
+
 foreach (split('\n',$code)) {
        s/\`([^\`]*)\`/eval($1)/ge;
        s/%r([a-z]+)#d/%e$1/g;
        s/%r([0-9]+)#d/%r$1d/g;
        s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;
 
+       if ($kernel) {
+               s/(^\.type.*),[0-9]+$/\1/;
+               s/(^\.type.*),\@abi-omnipotent+$/\1,\@function/;
+               next if /^\.cfi.*/;
+       }
+
        print $_,"\n";
 }
 close STDOUT;
index edb7113..6573635 100644
@@ -1,8 +1,6 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
+// SPDX-License-Identifier: GPL-2.0 OR MIT
 /*
- * Poly1305 authenticator algorithm, RFC7539, SIMD glue code
- *
- * Copyright (C) 2015 Martin Willi
+ * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
  */
 
 #include <crypto/algapi.h>
 #include <linux/jump_label.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <asm/intel-family.h>
 #include <asm/simd.h>
 
-asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src,
-                                   const u32 *r, unsigned int blocks);
-asmlinkage void poly1305_2block_sse2(u32 *h, const u8 *src, const u32 *r,
-                                    unsigned int blocks, const u32 *u);
-asmlinkage void poly1305_4block_avx2(u32 *h, const u8 *src, const u32 *r,
-                                    unsigned int blocks, const u32 *u);
-
-static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_simd);
+asmlinkage void poly1305_init_x86_64(void *ctx,
+                                    const u8 key[POLY1305_KEY_SIZE]);
+asmlinkage void poly1305_blocks_x86_64(void *ctx, const u8 *inp,
+                                      const size_t len, const u32 padbit);
+asmlinkage void poly1305_emit_x86_64(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
+                                    const u32 nonce[4]);
+asmlinkage void poly1305_emit_avx(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
+                                 const u32 nonce[4]);
+asmlinkage void poly1305_blocks_avx(void *ctx, const u8 *inp, const size_t len,
+                                   const u32 padbit);
+asmlinkage void poly1305_blocks_avx2(void *ctx, const u8 *inp, const size_t len,
+                                    const u32 padbit);
+asmlinkage void poly1305_blocks_avx512(void *ctx, const u8 *inp,
+                                      const size_t len, const u32 padbit);
+
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx);
 static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx2);
+static __ro_after_init DEFINE_STATIC_KEY_FALSE(poly1305_use_avx512);
+
+struct poly1305_arch_internal {
+       union {
+               struct {
+                       u32 h[5];
+                       u32 is_base2_26;
+               };
+               u64 hs[3];
+       };
+       u64 r[2];
+       u64 pad;
+       struct { u32 r2, r1, r4, r3; } rn[9];
+};
 
-static inline u64 mlt(u64 a, u64 b)
+/* The AVX code uses base 2^26, while the scalar code uses base 2^64. If we hit
+ * the unfortunate situation of using AVX and then having to go back to scalar
+ * -- because the user is silly and has called the update function from two
+ * separate contexts -- then we need to convert back to the original base before
+ * proceeding. It is possible to reason that the initial reduction below is
+ * sufficient given the implementation invariants. However, for an avoidance of
+ * doubt and because this is not performance critical, we do the full reduction
+ * anyway. Z3 proof of below function: https://xn--4db.cc/ltPtHCKN/py
+ */
+static void convert_to_base2_64(void *ctx)
 {
-       return a * b;
-}
+       struct poly1305_arch_internal *state = ctx;
+       u32 cy;
 
-static inline u32 sr(u64 v, u_char n)
-{
-       return v >> n;
-}
+       if (!state->is_base2_26)
+               return;
 
-static inline u32 and(u32 v, u32 mask)
-{
-       return v & mask;
+       cy = state->h[0] >> 26; state->h[0] &= 0x3ffffff; state->h[1] += cy;
+       cy = state->h[1] >> 26; state->h[1] &= 0x3ffffff; state->h[2] += cy;
+       cy = state->h[2] >> 26; state->h[2] &= 0x3ffffff; state->h[3] += cy;
+       cy = state->h[3] >> 26; state->h[3] &= 0x3ffffff; state->h[4] += cy;
+       state->hs[0] = ((u64)state->h[2] << 52) | ((u64)state->h[1] << 26) | state->h[0];
+       state->hs[1] = ((u64)state->h[4] << 40) | ((u64)state->h[3] << 14) | (state->h[2] >> 12);
+       state->hs[2] = state->h[4] >> 24;
+#define ULT(a, b) ((a ^ ((a ^ b) | ((a - b) ^ b))) >> (sizeof(a) * 8 - 1))
+       cy = (state->hs[2] >> 2) + (state->hs[2] & ~3ULL);
+       state->hs[2] &= 3;
+       state->hs[0] += cy;
+       state->hs[1] += (cy = ULT(state->hs[0], cy));
+       state->hs[2] += ULT(state->hs[1], cy);
+#undef ULT
+       state->is_base2_26 = 0;
 }
 
-static void poly1305_simd_mult(u32 *a, const u32 *b)
+static void poly1305_simd_init(void *ctx, const u8 key[POLY1305_KEY_SIZE])
 {
-       u8 m[POLY1305_BLOCK_SIZE];
-
-       memset(m, 0, sizeof(m));
-       /* The poly1305 block function adds a hi-bit to the accumulator which
-        * we don't need for key multiplication; compensate for it. */
-       a[4] -= 1 << 24;
-       poly1305_block_sse2(a, m, b, 1);
+       poly1305_init_x86_64(ctx, key);
 }
 
-static void poly1305_integer_setkey(struct poly1305_key *key, const u8 *raw_key)
+static void poly1305_simd_blocks(void *ctx, const u8 *inp, size_t len,
+                                const u32 padbit)
 {
-       /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
-       key->r[0] = (get_unaligned_le32(raw_key +  0) >> 0) & 0x3ffffff;
-       key->r[1] = (get_unaligned_le32(raw_key +  3) >> 2) & 0x3ffff03;
-       key->r[2] = (get_unaligned_le32(raw_key +  6) >> 4) & 0x3ffc0ff;
-       key->r[3] = (get_unaligned_le32(raw_key +  9) >> 6) & 0x3f03fff;
-       key->r[4] = (get_unaligned_le32(raw_key + 12) >> 8) & 0x00fffff;
-}
+       struct poly1305_arch_internal *state = ctx;
 
-static void poly1305_integer_blocks(struct poly1305_state *state,
-                                   const struct poly1305_key *key,
-                                   const void *src,
-                                   unsigned int nblocks, u32 hibit)
-{
-       u32 r0, r1, r2, r3, r4;
-       u32 s1, s2, s3, s4;
-       u32 h0, h1, h2, h3, h4;
-       u64 d0, d1, d2, d3, d4;
+       /* SIMD disables preemption, so relax after processing each page. */
+       BUILD_BUG_ON(PAGE_SIZE < POLY1305_BLOCK_SIZE ||
+                    PAGE_SIZE % POLY1305_BLOCK_SIZE);
 
-       if (!nblocks)
+       if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) ||
+           (len < (POLY1305_BLOCK_SIZE * 18) && !state->is_base2_26) ||
+           !crypto_simd_usable()) {
+               convert_to_base2_64(ctx);
+               poly1305_blocks_x86_64(ctx, inp, len, padbit);
                return;
+       }
 
-       r0 = key->r[0];
-       r1 = key->r[1];
-       r2 = key->r[2];
-       r3 = key->r[3];
-       r4 = key->r[4];
-
-       s1 = r1 * 5;
-       s2 = r2 * 5;
-       s3 = r3 * 5;
-       s4 = r4 * 5;
-
-       h0 = state->h[0];
-       h1 = state->h[1];
-       h2 = state->h[2];
-       h3 = state->h[3];
-       h4 = state->h[4];
-
-       do {
-               /* h += m[i] */
-               h0 += (get_unaligned_le32(src +  0) >> 0) & 0x3ffffff;
-               h1 += (get_unaligned_le32(src +  3) >> 2) & 0x3ffffff;
-               h2 += (get_unaligned_le32(src +  6) >> 4) & 0x3ffffff;
-               h3 += (get_unaligned_le32(src +  9) >> 6) & 0x3ffffff;
-               h4 += (get_unaligned_le32(src + 12) >> 8) | (hibit << 24);
-
-               /* h *= r */
-               d0 = mlt(h0, r0) + mlt(h1, s4) + mlt(h2, s3) +
-                    mlt(h3, s2) + mlt(h4, s1);
-               d1 = mlt(h0, r1) + mlt(h1, r0) + mlt(h2, s4) +
-                    mlt(h3, s3) + mlt(h4, s2);
-               d2 = mlt(h0, r2) + mlt(h1, r1) + mlt(h2, r0) +
-                    mlt(h3, s4) + mlt(h4, s3);
-               d3 = mlt(h0, r3) + mlt(h1, r2) + mlt(h2, r1) +
-                    mlt(h3, r0) + mlt(h4, s4);
-               d4 = mlt(h0, r4) + mlt(h1, r3) + mlt(h2, r2) +
-                    mlt(h3, r1) + mlt(h4, r0);
-
-               /* (partial) h %= p */
-               d1 += sr(d0, 26);     h0 = and(d0, 0x3ffffff);
-               d2 += sr(d1, 26);     h1 = and(d1, 0x3ffffff);
-               d3 += sr(d2, 26);     h2 = and(d2, 0x3ffffff);
-               d4 += sr(d3, 26);     h3 = and(d3, 0x3ffffff);
-               h0 += sr(d4, 26) * 5; h4 = and(d4, 0x3ffffff);
-               h1 += h0 >> 26;       h0 = h0 & 0x3ffffff;
-
-               src += POLY1305_BLOCK_SIZE;
-       } while (--nblocks);
-
-       state->h[0] = h0;
-       state->h[1] = h1;
-       state->h[2] = h2;
-       state->h[3] = h3;
-       state->h[4] = h4;
+       for (;;) {
+               const size_t bytes = min_t(size_t, len, PAGE_SIZE);
+
+               kernel_fpu_begin();
+               if (IS_ENABLED(CONFIG_AS_AVX512) && static_branch_likely(&poly1305_use_avx512))
+                       poly1305_blocks_avx512(ctx, inp, bytes, padbit);
+               else if (IS_ENABLED(CONFIG_AS_AVX2) && static_branch_likely(&poly1305_use_avx2))
+                       poly1305_blocks_avx2(ctx, inp, bytes, padbit);
+               else
+                       poly1305_blocks_avx(ctx, inp, bytes, padbit);
+               kernel_fpu_end();
+               len -= bytes;
+               if (!len)
+                       break;
+               inp += bytes;
+       }
 }
 
-static void poly1305_integer_emit(const struct poly1305_state *state, void *dst)
+static void poly1305_simd_emit(void *ctx, u8 mac[POLY1305_DIGEST_SIZE],
+                              const u32 nonce[4])
 {
-       u32 h0, h1, h2, h3, h4;
-       u32 g0, g1, g2, g3, g4;
-       u32 mask;
-
-       /* fully carry h */
-       h0 = state->h[0];
-       h1 = state->h[1];
-       h2 = state->h[2];
-       h3 = state->h[3];
-       h4 = state->h[4];
-
-       h2 += (h1 >> 26);     h1 = h1 & 0x3ffffff;
-       h3 += (h2 >> 26);     h2 = h2 & 0x3ffffff;
-       h4 += (h3 >> 26);     h3 = h3 & 0x3ffffff;
-       h0 += (h4 >> 26) * 5; h4 = h4 & 0x3ffffff;
-       h1 += (h0 >> 26);     h0 = h0 & 0x3ffffff;
-
-       /* compute h + -p */
-       g0 = h0 + 5;
-       g1 = h1 + (g0 >> 26);             g0 &= 0x3ffffff;
-       g2 = h2 + (g1 >> 26);             g1 &= 0x3ffffff;
-       g3 = h3 + (g2 >> 26);             g2 &= 0x3ffffff;
-       g4 = h4 + (g3 >> 26) - (1 << 26); g3 &= 0x3ffffff;
-
-       /* select h if h < p, or h + -p if h >= p */
-       mask = (g4 >> ((sizeof(u32) * 8) - 1)) - 1;
-       g0 &= mask;
-       g1 &= mask;
-       g2 &= mask;
-       g3 &= mask;
-       g4 &= mask;
-       mask = ~mask;
-       h0 = (h0 & mask) | g0;
-       h1 = (h1 & mask) | g1;
-       h2 = (h2 & mask) | g2;
-       h3 = (h3 & mask) | g3;
-       h4 = (h4 & mask) | g4;
-
-       /* h = h % (2^128) */
-       put_unaligned_le32((h0 >>  0) | (h1 << 26), dst +  0);
-       put_unaligned_le32((h1 >>  6) | (h2 << 20), dst +  4);
-       put_unaligned_le32((h2 >> 12) | (h3 << 14), dst +  8);
-       put_unaligned_le32((h3 >> 18) | (h4 <<  8), dst + 12);
+       struct poly1305_arch_internal *state = ctx;
+
+       if (!IS_ENABLED(CONFIG_AS_AVX) || !static_branch_likely(&poly1305_use_avx) ||
+           !state->is_base2_26 || !crypto_simd_usable()) {
+               convert_to_base2_64(ctx);
+               poly1305_emit_x86_64(ctx, mac, nonce);
+       } else
+               poly1305_emit_avx(ctx, mac, nonce);
 }
 
-void poly1305_init_arch(struct poly1305_desc_ctx *desc, const u8 *key)
+void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 *key)
 {
-       poly1305_integer_setkey(desc->opaque_r, key);
-       desc->s[0] = get_unaligned_le32(key + 16);
-       desc->s[1] = get_unaligned_le32(key + 20);
-       desc->s[2] = get_unaligned_le32(key + 24);
-       desc->s[3] = get_unaligned_le32(key + 28);
-       poly1305_core_init(&desc->h);
-       desc->buflen = 0;
-       desc->sset = true;
-       desc->rset = 1;
+       poly1305_simd_init(&dctx->h, key);
+       dctx->s[0] = get_unaligned_le32(&key[16]);
+       dctx->s[1] = get_unaligned_le32(&key[20]);
+       dctx->s[2] = get_unaligned_le32(&key[24]);
+       dctx->s[3] = get_unaligned_le32(&key[28]);
+       dctx->buflen = 0;
+       dctx->sset = true;
 }
-EXPORT_SYMBOL_GPL(poly1305_init_arch);
+EXPORT_SYMBOL(poly1305_init_arch);
 
-static unsigned int crypto_poly1305_setdesckey(struct poly1305_desc_ctx *dctx,
-                                              const u8 *src, unsigned int srclen)
+static unsigned int crypto_poly1305_setdctxkey(struct poly1305_desc_ctx *dctx,
+                                              const u8 *inp, unsigned int len)
 {
-       if (!dctx->sset) {
-               if (!dctx->rset && srclen >= POLY1305_BLOCK_SIZE) {
-                       poly1305_integer_setkey(dctx->r, src);
-                       src += POLY1305_BLOCK_SIZE;
-                       srclen -= POLY1305_BLOCK_SIZE;
+       unsigned int acc = 0;
+       if (unlikely(!dctx->sset)) {
+               if (!dctx->rset && len >= POLY1305_BLOCK_SIZE) {
+                       poly1305_simd_init(&dctx->h, inp);
+                       inp += POLY1305_BLOCK_SIZE;
+                       len -= POLY1305_BLOCK_SIZE;
+                       acc += POLY1305_BLOCK_SIZE;
                        dctx->rset = 1;
                }
-               if (srclen >= POLY1305_BLOCK_SIZE) {
-                       dctx->s[0] = get_unaligned_le32(src +  0);
-                       dctx->s[1] = get_unaligned_le32(src +  4);
-                       dctx->s[2] = get_unaligned_le32(src +  8);
-                       dctx->s[3] = get_unaligned_le32(src + 12);
-                       src += POLY1305_BLOCK_SIZE;
-                       srclen -= POLY1305_BLOCK_SIZE;
+               if (len >= POLY1305_BLOCK_SIZE) {
+                       dctx->s[0] = get_unaligned_le32(&inp[0]);
+                       dctx->s[1] = get_unaligned_le32(&inp[4]);
+                       dctx->s[2] = get_unaligned_le32(&inp[8]);
+                       dctx->s[3] = get_unaligned_le32(&inp[12]);
+                       inp += POLY1305_BLOCK_SIZE;
+                       len -= POLY1305_BLOCK_SIZE;
+                       acc += POLY1305_BLOCK_SIZE;
                        dctx->sset = true;
                }
        }
-       return srclen;
-}
-
-static unsigned int poly1305_scalar_blocks(struct poly1305_desc_ctx *dctx,
-                                          const u8 *src, unsigned int srclen)
-{
-       unsigned int datalen;
-
-       if (unlikely(!dctx->sset)) {
-               datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
-               src += srclen - datalen;
-               srclen = datalen;
-       }
-       if (srclen >= POLY1305_BLOCK_SIZE) {
-               poly1305_integer_blocks(&dctx->h, dctx->opaque_r, src,
-                                       srclen / POLY1305_BLOCK_SIZE, 1);
-               srclen %= POLY1305_BLOCK_SIZE;
-       }
-       return srclen;
-}
-
-static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx,
-                                        const u8 *src, unsigned int srclen)
-{
-       unsigned int blocks, datalen;
-
-       if (unlikely(!dctx->sset)) {
-               datalen = crypto_poly1305_setdesckey(dctx, src, srclen);
-               src += srclen - datalen;
-               srclen = datalen;
-       }
-
-       if (IS_ENABLED(CONFIG_AS_AVX2) &&
-           static_branch_likely(&poly1305_use_avx2) &&
-           srclen >= POLY1305_BLOCK_SIZE * 4) {
-               if (unlikely(dctx->rset < 4)) {
-                       if (dctx->rset < 2) {
-                               dctx->r[1] = dctx->r[0];
-                               poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r);
-                       }
-                       dctx->r[2] = dctx->r[1];
-                       poly1305_simd_mult(dctx->r[2].r, dctx->r[0].r);
-                       dctx->r[3] = dctx->r[2];
-                       poly1305_simd_mult(dctx->r[3].r, dctx->r[0].r);
-                       dctx->rset = 4;
-               }
-               blocks = srclen / (POLY1305_BLOCK_SIZE * 4);
-               poly1305_4block_avx2(dctx->h.h, src, dctx->r[0].r, blocks,
-                                    dctx->r[1].r);
-               src += POLY1305_BLOCK_SIZE * 4 * blocks;
-               srclen -= POLY1305_BLOCK_SIZE * 4 * blocks;
-       }
-
-       if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) {
-               if (unlikely(dctx->rset < 2)) {
-                       dctx->r[1] = dctx->r[0];
-                       poly1305_simd_mult(dctx->r[1].r, dctx->r[0].r);
-                       dctx->rset = 2;
-               }
-               blocks = srclen / (POLY1305_BLOCK_SIZE * 2);
-               poly1305_2block_sse2(dctx->h.h, src, dctx->r[0].r,
-                                    blocks, dctx->r[1].r);
-               src += POLY1305_BLOCK_SIZE * 2 * blocks;
-               srclen -= POLY1305_BLOCK_SIZE * 2 * blocks;
-       }
-       if (srclen >= POLY1305_BLOCK_SIZE) {
-               poly1305_block_sse2(dctx->h.h, src, dctx->r[0].r, 1);
-               srclen -= POLY1305_BLOCK_SIZE;
-       }
-       return srclen;
+       return acc;
 }
 
 void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
                          unsigned int srclen)
 {
-       unsigned int bytes;
+       unsigned int bytes, used;
 
        if (unlikely(dctx->buflen)) {
                bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen);
@@ -295,31 +184,19 @@ void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
                dctx->buflen += bytes;
 
                if (dctx->buflen == POLY1305_BLOCK_SIZE) {
-                       if (static_branch_likely(&poly1305_use_simd) &&
-                           likely(crypto_simd_usable())) {
-                               kernel_fpu_begin();
-                               poly1305_simd_blocks(dctx, dctx->buf,
-                                                    POLY1305_BLOCK_SIZE);
-                               kernel_fpu_end();
-                       } else {
-                               poly1305_scalar_blocks(dctx, dctx->buf,
-                                                      POLY1305_BLOCK_SIZE);
-                       }
+                       if (likely(!crypto_poly1305_setdctxkey(dctx, dctx->buf, POLY1305_BLOCK_SIZE)))
+                               poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1);
                        dctx->buflen = 0;
                }
        }
 
        if (likely(srclen >= POLY1305_BLOCK_SIZE)) {
-               if (static_branch_likely(&poly1305_use_simd) &&
-                   likely(crypto_simd_usable())) {
-                       kernel_fpu_begin();
-                       bytes = poly1305_simd_blocks(dctx, src, srclen);
-                       kernel_fpu_end();
-               } else {
-                       bytes = poly1305_scalar_blocks(dctx, src, srclen);
-               }
-               src += srclen - bytes;
-               srclen = bytes;
+               bytes = round_down(srclen, POLY1305_BLOCK_SIZE);
+               srclen -= bytes;
+               used = crypto_poly1305_setdctxkey(dctx, src, bytes);
+               if (likely(bytes - used))
+                       poly1305_simd_blocks(&dctx->h, src + used, bytes - used, 1);
+               src += bytes;
        }
 
        if (unlikely(srclen)) {
@@ -329,31 +206,17 @@ void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
 }
 EXPORT_SYMBOL(poly1305_update_arch);
 
-void poly1305_final_arch(struct poly1305_desc_ctx *desc, u8 *dst)
+void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
 {
-       __le32 digest[4];
-       u64 f = 0;
-
-       if (unlikely(desc->buflen)) {
-               desc->buf[desc->buflen++] = 1;
-               memset(desc->buf + desc->buflen, 0,
-                      POLY1305_BLOCK_SIZE - desc->buflen);
-               poly1305_integer_blocks(&desc->h, desc->opaque_r, desc->buf, 1, 0);
+       if (unlikely(dctx->buflen)) {
+               dctx->buf[dctx->buflen++] = 1;
+               memset(dctx->buf + dctx->buflen, 0,
+                      POLY1305_BLOCK_SIZE - dctx->buflen);
+               poly1305_simd_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
        }
 
-       poly1305_integer_emit(&desc->h, digest);
-
-       /* mac = (h + s) % (2^128) */
-       f = (f >> 32) + le32_to_cpu(digest[0]) + desc->s[0];
-       put_unaligned_le32(f, dst + 0);
-       f = (f >> 32) + le32_to_cpu(digest[1]) + desc->s[1];
-       put_unaligned_le32(f, dst + 4);
-       f = (f >> 32) + le32_to_cpu(digest[2]) + desc->s[2];
-       put_unaligned_le32(f, dst + 8);
-       f = (f >> 32) + le32_to_cpu(digest[3]) + desc->s[3];
-       put_unaligned_le32(f, dst + 12);
-
-       *desc = (struct poly1305_desc_ctx){};
+       poly1305_simd_emit(&dctx->h, dst, dctx->s);
+       *dctx = (struct poly1305_desc_ctx){};
 }
 EXPORT_SYMBOL(poly1305_final_arch);
 
@@ -361,38 +224,34 @@ static int crypto_poly1305_init(struct shash_desc *desc)
 {
        struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
 
-       poly1305_core_init(&dctx->h);
-       dctx->buflen = 0;
-       dctx->rset = 0;
-       dctx->sset = false;
-
+       *dctx = (struct poly1305_desc_ctx){};
        return 0;
 }
 
-static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
+static int crypto_poly1305_update(struct shash_desc *desc,
+                                 const u8 *src, unsigned int srclen)
 {
        struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
 
-       if (unlikely(!dctx->sset))
-               return -ENOKEY;
-
-       poly1305_final_arch(dctx, dst);
+       poly1305_update_arch(dctx, src, srclen);
        return 0;
 }
 
-static int poly1305_simd_update(struct shash_desc *desc,
-                               const u8 *src, unsigned int srclen)
+static int crypto_poly1305_final(struct shash_desc *desc, u8 *dst)
 {
        struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
 
-       poly1305_update_arch(dctx, src, srclen);
+       if (unlikely(!dctx->sset))
+               return -ENOKEY;
+
+       poly1305_final_arch(dctx, dst);
        return 0;
 }
 
 static struct shash_alg alg = {
        .digestsize     = POLY1305_DIGEST_SIZE,
        .init           = crypto_poly1305_init,
-       .update         = poly1305_simd_update,
+       .update         = crypto_poly1305_update,
        .final          = crypto_poly1305_final,
        .descsize       = sizeof(struct poly1305_desc_ctx),
        .base           = {
@@ -406,17 +265,19 @@ static struct shash_alg alg = {
 
 static int __init poly1305_simd_mod_init(void)
 {
-       if (!boot_cpu_has(X86_FEATURE_XMM2))
-               return 0;
-
-       static_branch_enable(&poly1305_use_simd);
-
-       if (IS_ENABLED(CONFIG_AS_AVX2) &&
-           boot_cpu_has(X86_FEATURE_AVX) &&
+       if (IS_ENABLED(CONFIG_AS_AVX) && boot_cpu_has(X86_FEATURE_AVX) &&
+           cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
+               static_branch_enable(&poly1305_use_avx);
+       if (IS_ENABLED(CONFIG_AS_AVX2) && boot_cpu_has(X86_FEATURE_AVX) &&
            boot_cpu_has(X86_FEATURE_AVX2) &&
            cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
                static_branch_enable(&poly1305_use_avx2);
-
+       if (IS_ENABLED(CONFIG_AS_AVX512) && boot_cpu_has(X86_FEATURE_AVX) &&
+           boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_AVX512F) &&
+           cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | XFEATURE_MASK_AVX512, NULL) &&
+           /* Skylake downclocks unacceptably much when using zmm, but later generations are fast. */
+           boot_cpu_data.x86_model != INTEL_FAM6_SKYLAKE_X)
+               static_branch_enable(&poly1305_use_avx512);
        return IS_REACHABLE(CONFIG_CRYPTO_HASH) ? crypto_register_shash(&alg) : 0;
 }
 
@@ -430,7 +291,7 @@ module_init(poly1305_simd_mod_init);
 module_exit(poly1305_simd_mod_exit);
 
 MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
+MODULE_AUTHOR("Jason A. Donenfeld <Jason@zx2c4.com>");
 MODULE_DESCRIPTION("Poly1305 authenticator");
 MODULE_ALIAS_CRYPTO("poly1305");
 MODULE_ALIAS_CRYPTO("poly1305-simd");
index 0b2c4fc..14c032d 100644
@@ -90,7 +90,7 @@ config CRYPTO_LIB_DES
 config CRYPTO_LIB_POLY1305_RSIZE
        int
        default 2 if MIPS
-       default 4 if X86_64
+       default 11 if X86_64
        default 9 if ARM || ARM64
        default 1
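
For context, the arch entry points exported by the glue code above (poly1305_init_arch, poly1305_update_arch, poly1305_final_arch) follow the usual init/update/final pattern; all dispatching between the scalar x86_64 code and the AVX/AVX2/AVX-512 paths happens inside them. A hedged sketch of how another kernel user might drive them, assuming only the declarations in <crypto/poly1305.h> (the wrapper function and variable names are illustrative, not part of the patch):

	#include <crypto/poly1305.h>

	/* Illustrative one-shot MAC over a buffer using the arch-accelerated
	 * entry points: init loads r and s from the 32-byte key, update absorbs
	 * the message, final emits the 16-byte tag and wipes the context. */
	static void poly1305_mac_oneshot(u8 mac[POLY1305_DIGEST_SIZE],
					 const u8 *data, unsigned int len,
					 const u8 key[POLY1305_KEY_SIZE])
	{
		struct poly1305_desc_ctx ctx;

		poly1305_init_arch(&ctx, key);
		poly1305_update_arch(&ctx, data, len);
		poly1305_final_arch(&ctx, mac);
	}
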