1 // Copyright ©2017 The Gonum Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 //+build !noasm,!appengine
9 #define HADDPS_SUM_SUM LONG $0xC07C0FF2 // raw encoding (bytes F2 0F 7C C0) of HADDPS X0, X0; both operands are fixed to X0 by the encoding
19 // func DotUnitary(x, y []float32) (sum float32)
// DotUnitary accumulates x[i]*y[i] over the first min(len(x), len(y))
// elements and stores the float32 total in sum.
// Argument layout used below: x base/len at 0/8(FP), y base/len at 24/32(FP),
// result at 48(FP).
// NOTE(review): X_PTR, Y_PTR, LEN, TAIL, IDX, SUM and P_SUM are macros whose
// definitions are not visible in this excerpt, and the labels dot_no_trim,
// dot_tail and dot_end are referenced but not shown here; confirm against the
// full file before changing any instruction.
20 TEXT ·DotUnitary(SB), NOSPLIT, $0 // $0: no local stack frame
21 MOVQ x_base+0(FP), X_PTR // X_PTR = &x[0]
22 MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y[0]
23 PXOR SUM, SUM // SUM = 0 (register XORed with itself)
24 MOVQ x_len+8(FP), LEN // LEN = len(x)
25 CMPQ y_len+32(FP), LEN // compare len(y) against len(x)
26 CMOVQLE y_len+32(FP), LEN // if len(y) <= len(x) { LEN = len(y) } => LEN = min( len(x), len(y) )
32 ANDQ $0xF, DX // DX &= 15; zero means already on a 16-byte boundary for MULPS
33 JZ dot_no_trim // if DX == 0 { goto dot_no_trim }
36 dot_align: // Trim first value(s) in unaligned buffer, one float32 per pass do {
37 MOVSS (X_PTR)(IDX*4), X2 // X2 = x[i]
38 MULSS (Y_PTR)(IDX*4), X2 // X2 *= y[i]
39 ADDSS X2, SUM // SUM += X2
42 JZ dot_end // if --TAIL == 0 { return }
44 JNZ dot_align // } while --DX > 0
47 PXOR P_SUM, P_SUM // P_SUM = 0; second accumulator for pipelining, folded into SUM after dot_loop
49 ANDQ $0xF, TAIL // TAIL = LEN % 16
50 SHRQ $4, LEN // LEN = floor( LEN / 16 ), i.e. number of 16-element passes
51 JZ dot_tail4_start // if LEN == 0 { goto dot_tail4_start }
53 dot_loop: // Loop unrolled 16x (4 registers x 4 float32 lanes) do {
54 MOVUPS (X_PTR)(IDX*4), X2 // X2 = x[i:i+4] (unaligned 16-byte load)
55 MOVUPS 16(X_PTR)(IDX*4), X3 // X3 = x[i+4:i+8]
56 MOVUPS 32(X_PTR)(IDX*4), X4 // X4 = x[i+8:i+12]
57 MOVUPS 48(X_PTR)(IDX*4), X5 // X5 = x[i+12:i+16]
59 MULPS (Y_PTR)(IDX*4), X2 // X2 *= y[i:i+4] (y is read straight from memory by MULPS)
60 MULPS 16(Y_PTR)(IDX*4), X3 // X3 *= y[i+4:i+8]
61 MULPS 32(Y_PTR)(IDX*4), X4 // X4 *= y[i+8:i+12]
62 MULPS 48(Y_PTR)(IDX*4), X5 // X5 *= y[i+12:i+16]
64 ADDPS X2, SUM // SUM += X2 (per-lane partial sums)
69 ADDQ $16, IDX // IDX += 16
71 JNZ dot_loop // } while --LEN > 0
73 ADDPS P_SUM, SUM // SUM += P_SUM (merge the two accumulators)
74 CMPQ TAIL, $0 // if TAIL == 0 { return }
77 dot_tail4_start: // Reset loop counter for 4-wide tail loop
78 MOVQ TAIL, LEN // LEN = floor( TAIL / 4 )
80 JZ dot_tail_start // if LEN == 0 { goto dot_tail_start }
82 dot_tail4_loop: // Loop unrolled 4x (one 4-lane register per pass) do {
83 MOVUPS (X_PTR)(IDX*4), X2 // X2 = x[i:i+4]
84 MULPS (Y_PTR)(IDX*4), X2 // X2 *= y[i:i+4]
85 ADDPS X2, SUM // SUM += X2 (per-lane)
86 ADDQ $4, IDX // IDX += 4
88 JNZ dot_tail4_loop // } while --LEN > 0
90 dot_tail_start: // Reset loop counter for 1-wide tail loop
91 ANDQ $3, TAIL // TAIL = TAIL % 4
92 JZ dot_end // if TAIL == 0 { return }
95 MOVSS (X_PTR)(IDX*4), X2 // X2 = x[i]
96 MULSS (Y_PTR)(IDX*4), X2 // X2 *= y[i]
97 ADDSS X2, SUM // SUM += X2
100 JNZ dot_tail // } while --TAIL > 0
103 HADDPS_SUM_SUM // SUM = \sum{ SUM[i] }
105 MOVSS SUM, sum+48(FP) // return SUM (low float32 lane written to the result slot)