crypto: arm/chacha20 - add XChaCha20 support

[uclinux-h8/linux.git] / arch / arm / crypto / chacha20-neon-core.S
diff --git a/arch/arm/crypto/chacha20-neon-core.S b/arch/arm/crypto/chacha20-neon-core.S

index 50e7b98..2335e50 100644 (file)
--- a/arch/arm/crypto/chacha20-neon-core.S
+++ b/arch/arm/crypto/chacha20-neon-core.S
@@ -52,27 +52,16 @@
         .fpu            neon
         .align          5
  
-ENTRY(chacha20_block_xor_neon)
-       // r0: Input state matrix, s
-       // r1: 1 data block output, o
-       // r2: 1 data block input, i
-
-       //
-       // This function encrypts one ChaCha20 block by loading the state matrix
-       // in four NEON registers. It performs matrix operation on four words in
-       // parallel, but requireds shuffling to rearrange the words after each
-       // round.
-       //
-
-       // x0..3 = s0..3
-       add             ip, r0, #0x20
-       vld1.32         {q0-q1}, [r0]
-       vld1.32         {q2-q3}, [ip]
-
-       vmov            q8, q0
-       vmov            q9, q1
-       vmov            q10, q2
-       vmov            q11, q3
+/*
+ * chacha20_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is stored in the four NEON
+ * registers q0-q3.  It performs matrix operations on four words in parallel,
+ * but requires shuffling to rearrange the words after each round.
+ *
+ * Clobbers: r3, ip, q4-q5
+ */
+chacha20_permute:
  
         adr             ip, .Lrol8_table
         mov             r3, #10
@@ -142,6 +131,27 @@ ENTRY(chacha20_block_xor_neon)
         subs            r3, r3, #1
         bne             .Ldoubleround
  
+       bx              lr
+ENDPROC(chacha20_permute)
+
+ENTRY(chacha20_block_xor_neon)
+       // r0: Input state matrix, s
+       // r1: 1 data block output, o
+       // r2: 1 data block input, i
+       push            {lr}
+
+       // x0..3 = s0..3
+       add             ip, r0, #0x20
+       vld1.32         {q0-q1}, [r0]
+       vld1.32         {q2-q3}, [ip]
+
+       vmov            q8, q0
+       vmov            q9, q1
+       vmov            q10, q2
+       vmov            q11, q3
+
+       bl              chacha20_permute
+
         add             ip, r2, #0x20
         vld1.8          {q4-q5}, [r2]
         vld1.8          {q6-q7}, [ip]
@@ -166,9 +176,25 @@ ENTRY(chacha20_block_xor_neon)
         vst1.8          {q0-q1}, [r1]
         vst1.8          {q2-q3}, [ip]
  
-       bx              lr
+       pop             {pc}
  ENDPROC(chacha20_block_xor_neon)
  
+ENTRY(hchacha20_block_neon)
+       // r0: Input state matrix, s (64 bytes, loaded into q0-q3)
+       // r1: output (8 32-bit words): q0 followed by q3 after the permutation
+       push            {lr}                    // bl below overwrites lr
+
+       vld1.32         {q0-q1}, [r0]!          // q0-q1 = first 32 bytes of s; r0 += 32
+       vld1.32         {q2-q3}, [r0]           // q2-q3 = remaining 32 bytes of s
+
+       bl              chacha20_permute        // permutes q0-q3; clobbers r3, ip, q4-q5
+
+       vst1.32         {q0}, [r1]!             // o[0..3] = q0; r1 += 16
+       vst1.32         {q3}, [r1]              // o[4..7] = q3; q1 and q2 are not stored
+
+       pop             {pc}                    // return by popping the saved lr into pc
+ENDPROC(hchacha20_block_neon)
+
         .align          4
  .Lctrinc:      .word   0, 1, 2, 3
  .Lrol8_table:  .byte   3, 0, 1, 2, 7, 4, 5, 6