crypto: poly1305 - Add a SSE2 SIMD variant for x86_64

author Martin Willi <martin@strongswan.org>

Thu, 16 Jul 2015 17:14:06 +0000 (19:14 +0200)

committer Herbert Xu <herbert@gondor.apana.org.au>

Fri, 17 Jul 2015 13:20:27 +0000 (21:20 +0800)
author Martin Willi <martin@strongswan.org>
Thu, 16 Jul 2015 17:14:06 +0000 (19:14 +0200)
committer Herbert Xu <herbert@gondor.apana.org.au>
Fri, 17 Jul 2015 13:20:27 +0000 (21:20 +0800)
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile

index ce39b3c..5cf405c 100644 (file)
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -31,6 +31,7 @@ obj-$(CONFIG_CRYPTO_CRC32_PCLMUL) += crc32-pclmul.o
  obj-$(CONFIG_CRYPTO_SHA256_SSSE3) += sha256-ssse3.o
  obj-$(CONFIG_CRYPTO_SHA512_SSSE3) += sha512-ssse3.o
  obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o
+obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o
  
  # These modules require assembler to support AVX.
  ifeq ($(avx_supported),yes)
@@ -85,6 +86,7 @@ aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
  aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o
  ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
  sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
+poly1305-x86_64-y := poly1305-sse2-x86_64.o poly1305_glue.o
  ifeq ($(avx2_supported),yes)
  sha1-ssse3-y += sha1_avx2_x86_64_asm.o
  endif
diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S b/arch/x86/crypto/poly1305-sse2-x86_64.S

new file mode 100644 (file)

index 0000000..a3d2b5e
--- /dev/null
+++ b/arch/x86/crypto/poly1305-sse2-x86_64.S
@@ -0,0 +1,276 @@
+/*
+ * Poly1305 authenticator algorithm, RFC7539, x64 SSE2 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+.data
+.align 16 /* ANMASK is loaded with movdqa, which expects a 16-byte aligned operand */
+
+ANMASK:        .octa 0x0000000003ffffff0000000003ffffff /* 0x3ffffff (26 bits) in each 64-bit lane */
+
+.text
+
+#define h0 0x00(%rdi) /* h0..h4: 32-bit accumulator words addressed through %rdi */
+#define h1 0x04(%rdi)
+#define h2 0x08(%rdi)
+#define h3 0x0c(%rdi)
+#define h4 0x10(%rdi)
+#define r0 0x00(%rdx) /* r0..r4: 32-bit key words addressed through %rdx */
+#define r1 0x04(%rdx)
+#define r2 0x08(%rdx)
+#define r3 0x0c(%rdx)
+#define r4 0x10(%rdx)
+#define s1 0x00(%rsp) /* s1..s4: r1..r4 times 5, kept in 16 bytes of stack scratch */
+#define s2 0x04(%rsp)
+#define s3 0x08(%rsp)
+#define s4 0x0c(%rsp)
+#define m %rsi /* input pointer, advanced by 16 per block */
+#define h01 %xmm0 /* h01, h23, h44: accumulator words packed two per register */
+#define h23 %xmm1
+#define h44 %xmm2
+#define t1 %xmm3 /* t1..t4: SSE temporaries */
+#define t2 %xmm4
+#define t3 %xmm5
+#define t4 %xmm6
+#define mask %xmm7 /* holds ANMASK for the whole function */
+#define d0 %r8 /* d0..d4: 64-bit sums of products before carry propagation */
+#define d1 %r9
+#define d2 %r10
+#define d3 %r11
+#define d4 %r12 /* %r12 is pushed on entry and popped before ret */
+
+ENTRY(poly1305_block_sse2)
+       # %rdi: Accumulator h[5], five 32-bit words, read and written per block
+       # %rsi: Input m, 16 bytes per block, advanced by 0x10 after each block
+       # %rdx: Poly1305 key r[5], five 32-bit words, only read
+       # %rcx: Block count, must be non-zero (it is tested after the block body)
+
+       # This single block variant tries to improve performance by doing two
+       # multiplications in parallel using SSE instructions. There is quite
+       # some quadword packing involved, hence the speedup is marginal.
+
+       push            %rbx  # used for h0 during the carry chain, restored before ret
+       push            %r12  # aliased as d4, restored before ret
+       sub             $0x10,%rsp  # 16 bytes of stack scratch for s1..s4
+
+       # s1..s4 = r1..r4 * 5 (each lea computes x + 4 * x), stored on the stack
+       mov             r1,%eax
+       lea             (%eax,%eax,4),%eax
+       mov             %eax,s1
+       mov             r2,%eax
+       lea             (%eax,%eax,4),%eax
+       mov             %eax,s2
+       mov             r3,%eax
+       lea             (%eax,%eax,4),%eax
+       mov             %eax,s3
+       mov             r4,%eax
+       lea             (%eax,%eax,4),%eax
+       mov             %eax,s4
+
+       movdqa          ANMASK(%rip),mask  # 0x3ffffff in each 64-bit lane, loaded once
+
+.Ldoblock:
+       # h01 = [0, h1, 0, h0]
+       # h23 = [0, h3, 0, h2]
+       # h44 = [0, h4, 0, h4]
+       movd            h0,h01  # the accumulator is reloaded from memory for every block
+       movd            h1,t1
+       movd            h2,h23
+       movd            h3,t2
+       movd            h4,h44
+       punpcklqdq      t1,h01
+       punpcklqdq      t2,h23
+       punpcklqdq      h44,h44
+
+       # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
+       movd            0x00(m),t1
+       movd            0x03(m),t2  # 4-byte load at byte offset 3, overlapping the load above
+       psrld           $2,t2
+       punpcklqdq      t2,t1
+       pand            mask,t1
+       paddd           t1,h01
+       # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
+       movd            0x06(m),t1
+       movd            0x09(m),t2
+       psrld           $4,t1
+       psrld           $6,t2
+       punpcklqdq      t2,t1
+       pand            mask,t1
+       paddd           t1,h23
+       # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
+       mov             0x0c(m),%eax
+       shr             $8,%eax
+       or              $0x01000000,%eax  # sets bit 24; presumably the 2^128 pad bit of a full block - confirm against RFC7539
+       movd            %eax,t1
+       pshufd          $0xc4,t1,t1  # copy dword 0 into dword 2: [0, x, 0, x]
+       paddd           t1,h44
+
+       # t1[0] = h0 * r0 + h2 * s3
+       # t1[1] = h1 * s4 + h3 * s2
+       movd            r0,t1
+       movd            s4,t2
+       punpcklqdq      t2,t1
+       pmuludq         h01,t1
+       movd            s3,t2
+       movd            s2,t3
+       punpcklqdq      t3,t2
+       pmuludq         h23,t2
+       paddq           t2,t1
+       # t2[0] = h0 * r1 + h2 * s4
+       # t2[1] = h1 * r0 + h3 * s3
+       movd            r1,t2
+       movd            r0,t3
+       punpcklqdq      t3,t2
+       pmuludq         h01,t2
+       movd            s4,t3
+       movd            s3,t4
+       punpcklqdq      t4,t3
+       pmuludq         h23,t3
+       paddq           t3,t2
+       # t3[0] = h4 * s1
+       # t3[1] = h4 * s2
+       movd            s1,t3
+       movd            s2,t4
+       punpcklqdq      t4,t3
+       pmuludq         h44,t3
+       # d0 = t1[0] + t1[1] + t3[0]
+       # d1 = t2[0] + t2[1] + t3[1]
+       movdqa          t1,t4
+       punpcklqdq      t2,t4  # t4 = [ t2[0], t1[0] ]
+       punpckhqdq      t2,t1  # t1 = [ t2[1], t1[1] ]
+       paddq           t4,t1  # t1 = [ t2[0] + t2[1], t1[0] + t1[1] ]
+       paddq           t3,t1
+       movq            t1,d0
+       psrldq          $8,t1  # shift the high quadword down to extract it
+       movq            t1,d1
+
+       # t1[0] = h0 * r2 + h2 * r0
+       # t1[1] = h1 * r1 + h3 * s4
+       movd            r2,t1
+       movd            r1,t2
+       punpcklqdq      t2,t1
+       pmuludq         h01,t1
+       movd            r0,t2
+       movd            s4,t3
+       punpcklqdq      t3,t2
+       pmuludq         h23,t2
+       paddq           t2,t1
+       # t2[0] = h0 * r3 + h2 * r1
+       # t2[1] = h1 * r2 + h3 * r0
+       movd            r3,t2
+       movd            r2,t3
+       punpcklqdq      t3,t2
+       pmuludq         h01,t2
+       movd            r1,t3
+       movd            r0,t4
+       punpcklqdq      t4,t3
+       pmuludq         h23,t3
+       paddq           t3,t2
+       # t3[0] = h4 * s3
+       # t3[1] = h4 * s4
+       movd            s3,t3
+       movd            s4,t4
+       punpcklqdq      t4,t3
+       pmuludq         h44,t3
+       # d2 = t1[0] + t1[1] + t3[0]
+       # d3 = t2[0] + t2[1] + t3[1]
+       movdqa          t1,t4
+       punpcklqdq      t2,t4  # same lane shuffle as for d0 and d1
+       punpckhqdq      t2,t1
+       paddq           t4,t1
+       paddq           t3,t1
+       movq            t1,d2
+       psrldq          $8,t1
+       movq            t1,d3
+
+       # t1[0] = h0 * r4 + h2 * r2
+       # t1[1] = h1 * r3 + h3 * r1
+       movd            r4,t1
+       movd            r3,t2
+       punpcklqdq      t2,t1
+       pmuludq         h01,t1
+       movd            r2,t2
+       movd            r1,t3
+       punpcklqdq      t3,t2
+       pmuludq         h23,t2
+       paddq           t2,t1
+       # t3[0] = h4 * r0
+       movd            r0,t3
+       pmuludq         h44,t3
+       # d4 = t1[0] + t1[1] + t3[0]
+       movdqa          t1,t4
+       psrldq          $8,t4  # bring t1[1] down to lane 0
+       paddq           t4,t1
+       paddq           t3,t1
+       movq            t1,d4
+
+       # d1 += d0 >> 26
+       mov             d0,%rax
+       shr             $26,%rax
+       add             %rax,d1
+       # h0 = d0 & 0x3ffffff
+       mov             d0,%rbx  # h0 stays in %ebx until the carry out of d4 is folded in
+       and             $0x3ffffff,%ebx
+
+       # d2 += d1 >> 26
+       mov             d1,%rax
+       shr             $26,%rax
+       add             %rax,d2
+       # h1 = d1 & 0x3ffffff
+       mov             d1,%rax
+       and             $0x3ffffff,%eax
+       mov             %eax,h1
+
+       # d3 += d2 >> 26
+       mov             d2,%rax
+       shr             $26,%rax
+       add             %rax,d3
+       # h2 = d2 & 0x3ffffff
+       mov             d2,%rax
+       and             $0x3ffffff,%eax
+       mov             %eax,h2
+
+       # d4 += d3 >> 26
+       mov             d3,%rax
+       shr             $26,%rax
+       add             %rax,d4
+       # h3 = d3 & 0x3ffffff
+       mov             d3,%rax
+       and             $0x3ffffff,%eax
+       mov             %eax,h3
+
+       # h0 += (d4 >> 26) * 5
+       mov             d4,%rax
+       shr             $26,%rax
+       lea             (%eax,%eax,4),%eax  # multiply by 5 in 32 bits
+       add             %eax,%ebx
+       # h4 = d4 & 0x3ffffff
+       mov             d4,%rax
+       and             $0x3ffffff,%eax
+       mov             %eax,h4
+
+       # h1 += h0 >> 26
+       mov             %ebx,%eax
+       shr             $26,%eax
+       add             %eax,h1  # h1 was already stored above, so the carry is added in memory
+       # h0 = h0 & 0x3ffffff
+       andl            $0x3ffffff,%ebx
+       mov             %ebx,h0
+
+       add             $0x10,m  # next 16-byte block
+       dec             %rcx  # NOTE(review): full 64-bit count is used while the C prototype passes unsigned int - confirm the upper half is zero
+       jnz             .Ldoblock
+
+       add             $0x10,%rsp  # release the s1..s4 scratch area
+       pop             %r12
+       pop             %rbx
+       ret
+ENDPROC(poly1305_block_sse2)
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c

new file mode 100644 (file)

index 0000000..1e59274
--- /dev/null
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -0,0 +1,123 @@
+/*
+ * Poly1305 authenticator algorithm, RFC7539, SIMD glue code
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/internal/hash.h>
+#include <crypto/poly1305.h>
+#include <linux/crypto.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/simd.h>
+
+asmlinkage void poly1305_block_sse2(u32 *h, const u8 *src, /* implemented in poly1305-sse2-x86_64.S; h is updated in place */
+                                   const u32 *r, unsigned int blocks); /* blocks: count of 16-byte blocks at src; the asm loop body runs before the count is tested, so pass >= 1 */
+
+static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx, /* processes whole blocks with SSE2; returns the number of trailing bytes left unprocessed */
+                                        const u8 *src, unsigned int srclen)
+{
+       unsigned int blocks, datalen;
+
+       if (unlikely(!dctx->sset)) { /* sset is still clear: let the key setup helper look at the input first */
+               datalen = crypto_poly1305_setdesckey(dctx, src, srclen); /* presumably consumes key material from the head of src; the result is used as bytes remaining - confirm */
+               src += srclen - datalen; /* skip whatever the helper consumed */
+               srclen = datalen;
+       }
+
+       if (srclen >= POLY1305_BLOCK_SIZE) { /* ensures blocks >= 1, which the asm loop requires */
+               blocks = srclen / POLY1305_BLOCK_SIZE;
+               poly1305_block_sse2(dctx->h, src, dctx->r, blocks); /* updates dctx->h in place */
+               srclen -= POLY1305_BLOCK_SIZE * blocks; /* keep only the partial tail length */
+       }
+       return srclen;
+}
+
+static int poly1305_simd_update(struct shash_desc *desc, /* .update callback: absorbs srclen bytes; the SIMD path always returns 0 */
+                               const u8 *src, unsigned int srclen)
+{
+       struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
+       unsigned int bytes;
+
+       /* kernel_fpu_begin/end is costly, use fallback for small updates */
+       if (srclen <= 288 || !may_use_simd()) /* also falls back whenever may_use_simd() says no */
+               return crypto_poly1305_update(desc, src, srclen);
+
+       kernel_fpu_begin(); /* brackets every call into the SSE2 routine below */
+
+       if (unlikely(dctx->buflen)) { /* a partial block is buffered in dctx->buf */
+               bytes = min(srclen, POLY1305_BLOCK_SIZE - dctx->buflen); /* bytes needed to top the buffer up to one block */
+               memcpy(dctx->buf + dctx->buflen, src, bytes);
+               src += bytes;
+               srclen -= bytes;
+               dctx->buflen += bytes;
+
+               if (dctx->buflen == POLY1305_BLOCK_SIZE) { /* buffer now holds one full block: process it */
+                       poly1305_simd_blocks(dctx, dctx->buf,
+                                            POLY1305_BLOCK_SIZE);
+                       dctx->buflen = 0;
+               }
+       }
+
+       if (likely(srclen >= POLY1305_BLOCK_SIZE)) { /* bulk of the input, whole blocks only */
+               bytes = poly1305_simd_blocks(dctx, src, srclen); /* bytes = length of the unprocessed tail */
+               src += srclen - bytes; /* advance past everything that was consumed */
+               srclen = bytes;
+       }
+
+       kernel_fpu_end();
+
+       if (unlikely(srclen)) { /* stash the sub-block tail in dctx->buf for a later call */
+               dctx->buflen = srclen;
+               memcpy(dctx->buf, src, srclen);
+       }
+
+       return 0;
+}
+
+static struct shash_alg alg = { /* shash descriptor registered by this module */
+       .digestsize     = POLY1305_DIGEST_SIZE,
+       .init           = crypto_poly1305_init, /* init, final and setkey reuse the crypto_poly1305_* helpers */
+       .update         = poly1305_simd_update, /* only update is specific to this driver */
+       .final          = crypto_poly1305_final,
+       .setkey         = crypto_poly1305_setkey,
+       .descsize       = sizeof(struct poly1305_desc_ctx),
+       .base           = {
+               .cra_name               = "poly1305",
+               .cra_driver_name        = "poly1305-simd",
+               .cra_priority           = 300, /* presumably above the generic C driver so this one is preferred - confirm */
+               .cra_flags              = CRYPTO_ALG_TYPE_SHASH,
+               .cra_alignmask          = sizeof(u32) - 1, /* mask value 3, i.e. u32 alignment */
+               .cra_blocksize          = POLY1305_BLOCK_SIZE,
+               .cra_module             = THIS_MODULE,
+       },
+};
+
+static int __init poly1305_simd_mod_init(void) /* module init: registers alg, or returns -ENODEV */
+{
+       if (!cpu_has_xmm2) /* the SSE2 routine cannot be used on this CPU: refuse to register */
+               return -ENODEV;
+
+       return crypto_register_shash(&alg); /* result of the registration is passed straight back */
+}
+
+static void __exit poly1305_simd_mod_exit(void) /* module exit: undoes the registration done in poly1305_simd_mod_init */
+{
+       crypto_unregister_shash(&alg);
+}
+
+module_init(poly1305_simd_mod_init);
+module_exit(poly1305_simd_mod_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
+MODULE_DESCRIPTION("Poly1305 authenticator");
+MODULE_ALIAS_CRYPTO("poly1305"); /* same string as alg.base.cra_name */
+MODULE_ALIAS_CRYPTO("poly1305-simd"); /* same string as alg.base.cra_driver_name */
diff --git a/crypto/Kconfig b/crypto/Kconfig

index 82caab0..c57478c 100644 (file)
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -470,6 +470,18 @@ config CRYPTO_POLY1305
           It is used for the ChaCha20-Poly1305 AEAD, specified in RFC7539 for use
           in IETF protocols. This is the portable C implementation of Poly1305.
  
+config CRYPTO_POLY1305_X86_64
+       tristate "Poly1305 authenticator algorithm (x86_64/SSE2)"
+       depends on X86 && 64BIT
+       select CRYPTO_POLY1305
+       help
+         Poly1305 authenticator algorithm, RFC7539.
+
+         Poly1305 is an authenticator algorithm designed by Daniel J. Bernstein.
+         It is used for the ChaCha20-Poly1305 AEAD, specified in RFC7539 for use
+         in IETF protocols. This is the x86_64 assembler implementation using SIMD
+         instructions.
+
  config CRYPTO_MD4
         tristate "MD4 digest algorithm"
         select CRYPTO_HASH
author	Martin Willi <martin@strongswan.org>
	Thu, 16 Jul 2015 17:14:06 +0000 (19:14 +0200)
committer	Herbert Xu <herbert@gondor.apana.org.au>
	Fri, 17 Jul 2015 13:20:27 +0000 (21:20 +0800)
arch/x86/crypto/Makefile		patch \| blob \| history
arch/x86/crypto/poly1305-sse2-x86_64.S	[new file with mode: 0644]	patch \| blob
arch/x86/crypto/poly1305_glue.c	[new file with mode: 0644]	patch \| blob
crypto/Kconfig		patch \| blob \| history