1 // Copyright ©2017 The Gonum Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 //+build !noasm,!appengine
// func DotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float32)
//
// DotInc returns the strided single-precision dot product
//
//	sum = x[ix]*y[iy] + x[ix+incX]*y[iy+incY] + ... (n terms)
//
// and returns 0 when n == 0 without reading x or y. No bounds checks are
// performed here; the caller must guarantee every addressed element exists.
//
// Register roles:
//	SI    pointer to the current x element
//	DI    pointer to the current y element
//	CX    n, then the number of 4-element groups left
//	BX    n % 4, the number of tail elements left
//	R8    x stride in bytes        R10 = 3 * R8
//	R9    y stride in bytes        R11 = 3 * R9
//	X0    running sum              X1  = second partial sum
//	X2-X5 per-element products
TEXT ·DotInc(SB), NOSPLIT, $0
	MOVQ  x_base+0(FP), SI  // SI = &x[0]
	MOVQ  y_base+24(FP), DI // DI = &y[0]
	PXOR  X0, X0            // sum = 0
	MOVQ  n+48(FP), CX      // CX = n
	TESTQ CX, CX
	JZ    dot_end           // n == 0: nothing to accumulate

	MOVQ ix+72(FP), R8
	MOVQ iy+80(FP), R9
	LEAQ (SI)(R8*4), SI // SI = &x[ix]
	LEAQ (DI)(R9*4), DI // DI = &y[iy]

	// The loads below use byte offsets, so convert the element strides.
	MOVQ incX+56(FP), R8
	SHLQ $2, R8         // R8 = incX * sizeof(float32)
	MOVQ incY+64(FP), R9
	SHLQ $2, R9         // R9 = incY * sizeof(float32)

	MOVQ CX, BX
	ANDQ $3, BX   // BX = n % 4
	SHRQ $2, CX   // CX = n / 4 (sets ZF from the result)
	JZ   dot_tail // fewer than 4 elements; BX != 0 here because n != 0

	PXOR X1, X1         // second accumulator shortens the ADDSS dependency chain
	LEAQ (R8)(R8*2), R10 // R10 = 3 * x stride
	LEAQ (R9)(R9*2), R11 // R11 = 3 * y stride

dot_loop: // 4x unrolled: do {
	MOVSS (SI), X2        // X2..X5 = four consecutive strided x elements
	MOVSS (SI)(R8*1), X3
	MOVSS (SI)(R8*2), X4
	MOVSS (SI)(R10*1), X5

	MULSS (DI), X2        // multiply by the matching y elements
	MULSS (DI)(R9*1), X3
	MULSS (DI)(R9*2), X4
	MULSS (DI)(R11*1), X5

	ADDSS X2, X0 // alternate between the two accumulators
	ADDSS X3, X1
	ADDSS X4, X0
	ADDSS X5, X1

	LEAQ (SI)(R8*4), SI // advance both pointers by four strides
	LEAQ (DI)(R9*4), DI

	DECQ CX       // LEAQ leaves flags untouched, so DECQ must set ZF for the branch
	JNZ  dot_loop // } while --CX > 0

	ADDSS X1, X0  // fold the second partial sum into the result
	TESTQ BX, BX
	JZ    dot_end // no tail elements

dot_tail: // do {
	MOVSS (SI), X2 // X2 = x[i]
	MULSS (DI), X2 // X2 *= y[i]
	ADDSS X2, X0   // sum += X2
	ADDQ  R8, SI   // next x element
	ADDQ  R9, DI   // next y element
	DECQ  BX
	JNZ   dot_tail // } while --BX > 0

dot_end:
	MOVSS X0, sum+88(FP) // return sum
	RET