1 // Copyright ©2017 The Gonum Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 //+build !noasm,!appengine
9 #define HADDPD_SUM_SUM LONG $0xC07C0F66 // @ HADDPD X0, X0
19 // func DdotUnitary(x, y []float32) (sum float32)
20 TEXT ·DdotUnitary(SB), NOSPLIT, $0
21 MOVQ x_base+0(FP), X_PTR // X_PTR = &x
22 MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
23 MOVQ x_len+8(FP), LEN // LEN = min( len(x), len(y) )
24 CMPQ y_len+32(FP), LEN
25 CMOVQLE y_len+32(FP), LEN
26 PXOR SUM, SUM // psum = 0
32 ANDQ $0xF, DX // Align on 16-byte boundary for ADDPS
33 JZ dot_no_trim // if DX == 0 { goto dot_no_trim }
37 dot_align: // Trim first value(s) in unaligned buffer do {
38 CVTSS2SD (X_PTR)(IDX*4), X2 // X2 = float64(x[i])
39 CVTSS2SD (Y_PTR)(IDX*4), X3 // X3 = float64(y[i])
41 ADDSD X2, SUM // SUM += X2
44 JZ dot_end // if --TAIL == 0 { return }
46 JNZ dot_align // } while --LEN > 0
49 PXOR P_SUM, P_SUM // P_SUM = 0 for pipelining
51 ANDQ $0x7, TAIL // TAIL = LEN % 8
52 SHRQ $3, LEN // LEN = floor( LEN / 8 )
53 JZ dot_tail_start // if LEN == 0 { goto dot_tail_start }
55 dot_loop: // Loop unrolled 8x do {
56 CVTPS2PD (X_PTR)(IDX*4), X2 // X_i = x[i:i+1]
57 CVTPS2PD 8(X_PTR)(IDX*4), X3
58 CVTPS2PD 16(X_PTR)(IDX*4), X4
59 CVTPS2PD 24(X_PTR)(IDX*4), X5
61 CVTPS2PD (Y_PTR)(IDX*4), X6 // X_j = y[i:i+1]
62 CVTPS2PD 8(Y_PTR)(IDX*4), X7
63 CVTPS2PD 16(Y_PTR)(IDX*4), X8
64 CVTPS2PD 24(Y_PTR)(IDX*4), X9
66 MULPD X6, X2 // X_i *= X_j
71 ADDPD X2, SUM // SUM += X_i
76 ADDQ $8, IDX // IDX += 8
78 JNZ dot_loop // } while --LEN > 0
80 ADDPD P_SUM, SUM // SUM += P_SUM
81 CMPQ TAIL, $0 // if TAIL == 0 { return }
90 CVTPS2PD (X_PTR)(IDX*4), X2 // X_i = x[i:i+1]
91 CVTPS2PD (Y_PTR)(IDX*4), X6 // X_j = y[i:i+1]
92 MULPD X6, X2 // X_i *= X_j
93 ADDPD X2, SUM // SUM += X_i
94 ADDQ $2, IDX // IDX += 2
96 JNZ dot_tail_two // } while --LEN > 0
102 CVTSS2SD (X_PTR)(IDX*4), X2 // X2 = float64(x[i])
103 CVTSS2SD (Y_PTR)(IDX*4), X3 // X3 = float64(y[i])
104 MULSD X3, X2 // X2 *= X3
105 ADDSD X2, SUM // SUM += X2
108 HADDPD_SUM_SUM // SUM = \sum{ SUM[i] }
109 MOVSD SUM, sum+48(FP) // return SUM