--- /dev/null
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//+build !noasm,!appengine
+
+#include "textflag.h"
+
+#define HADDPD_SUM_SUM LONG $0xC07C0F66 // @ HADDPD X0, X0
+
+#define X_PTR SI
+#define Y_PTR DI
+#define LEN CX
+#define TAIL BX
+#define IDX AX
+#define SUM X0
+#define P_SUM X1
+
+// func DdotUnitary(x, y []float32) (sum float32)
+TEXT ·DdotUnitary(SB), NOSPLIT, $0
+ MOVQ x_base+0(FP), X_PTR // X_PTR = &x
+ MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
+ MOVQ x_len+8(FP), LEN // LEN = min( len(x), len(y) )
+ CMPQ y_len+32(FP), LEN
+ CMOVQLE y_len+32(FP), LEN
+ PXOR SUM, SUM // psum = 0
+ CMPQ LEN, $0
+ JE dot_end
+
+ XORQ IDX, IDX
+ MOVQ Y_PTR, DX
+ ANDQ $0xF, DX // Align on 16-byte boundary for ADDPS
+ JZ dot_no_trim // if DX == 0 { goto dot_no_trim }
+
+ SUBQ $16, DX
+
+dot_align: // Trim first value(s) in unaligned buffer do {
+ CVTSS2SD (X_PTR)(IDX*4), X2 // X2 = float64(x[i])
+ CVTSS2SD (Y_PTR)(IDX*4), X3 // X3 = float64(y[i])
+ MULSD X3, X2
+ ADDSD X2, SUM // SUM += X2
+ INCQ IDX // IDX++
+ DECQ LEN
+ JZ dot_end // if --TAIL == 0 { return }
+ ADDQ $4, DX
+ JNZ dot_align // } while --LEN > 0
+
+dot_no_trim:
+ PXOR P_SUM, P_SUM // P_SUM = 0 for pipelining
+ MOVQ LEN, TAIL
+ ANDQ $0x7, TAIL // TAIL = LEN % 8
+ SHRQ $3, LEN // LEN = floor( LEN / 8 )
+ JZ dot_tail_start // if LEN == 0 { goto dot_tail_start }
+
+dot_loop: // Loop unrolled 8x do {
+ CVTPS2PD (X_PTR)(IDX*4), X2 // X_i = x[i:i+1]
+ CVTPS2PD 8(X_PTR)(IDX*4), X3
+ CVTPS2PD 16(X_PTR)(IDX*4), X4
+ CVTPS2PD 24(X_PTR)(IDX*4), X5
+
+ CVTPS2PD (Y_PTR)(IDX*4), X6 // X_j = y[i:i+1]
+ CVTPS2PD 8(Y_PTR)(IDX*4), X7
+ CVTPS2PD 16(Y_PTR)(IDX*4), X8
+ CVTPS2PD 24(Y_PTR)(IDX*4), X9
+
+ MULPD X6, X2 // X_i *= X_j
+ MULPD X7, X3
+ MULPD X8, X4
+ MULPD X9, X5
+
+ ADDPD X2, SUM // SUM += X_i
+ ADDPD X3, P_SUM
+ ADDPD X4, SUM
+ ADDPD X5, P_SUM
+
+ ADDQ $8, IDX // IDX += 8
+ DECQ LEN
+ JNZ dot_loop // } while --LEN > 0
+
+ ADDPD P_SUM, SUM // SUM += P_SUM
+ CMPQ TAIL, $0 // if TAIL == 0 { return }
+ JE dot_end
+
+dot_tail_start:
+ MOVQ TAIL, LEN
+ SHRQ $1, LEN
+ JZ dot_tail_one
+
+dot_tail_two:
+ CVTPS2PD (X_PTR)(IDX*4), X2 // X_i = x[i:i+1]
+ CVTPS2PD (Y_PTR)(IDX*4), X6 // X_j = y[i:i+1]
+ MULPD X6, X2 // X_i *= X_j
+ ADDPD X2, SUM // SUM += X_i
+ ADDQ $2, IDX // IDX += 2
+ DECQ LEN
+ JNZ dot_tail_two // } while --LEN > 0
+
+ ANDQ $1, TAIL
+ JZ dot_end
+
+dot_tail_one:
+ CVTSS2SD (X_PTR)(IDX*4), X2 // X2 = float64(x[i])
+ CVTSS2SD (Y_PTR)(IDX*4), X3 // X3 = float64(y[i])
+ MULSD X3, X2 // X2 *= X3
+ ADDSD X2, SUM // SUM += X2
+
+dot_end:
+ HADDPD_SUM_SUM // SUM = \sum{ SUM[i] }
+ MOVSD SUM, sum+48(FP) // return SUM
+ RET