1 // Copyright ©2016 The Gonum Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 //+build !noasm,!appengine
9 // func AxpyUnitary(alpha float32, x, y []float32)
// AxpyUnitary updates y in place as y[i] += a * x[i], where a is the
// multiplier broadcast to all four lanes of X0.
//
// Register roles, as used by the instructions below:
//   SI = base address of x        DI = base address of y
//   AX = element index i          BX = count of elements still to process
//   CX = counter consumed by the LOOP instructions
//   X0 = { a, a, a, a }           X1..X5 = working values
//
// The work is split into four stages: a scalar loop (axpy_align) run before
// the wide loop when the 16-byte alignment check is non-zero, a loop that
// handles 16 elements per iteration (axpy_loop), a loop that handles 4
// elements per iteration (axpy_tail4), and a final scalar loop.
10 TEXT ·AxpyUnitary(SB), NOSPLIT, $0
11 MOVQ x_base+8(FP), SI // SI = &x
12 MOVQ y_base+32(FP), DI // DI = &y
13 MOVQ x_len+16(FP), BX // BX = min( len(x), len(y) )
15 CMOVQLE y_len+40(FP), BX // BX = len(y) when the LE condition holds
16 CMPQ BX, $0 // if BX == 0 { return }
19 SHUFPS $0, X0, X0 // X0 = { a, a, a, a } (lane 0 copied to every lane)
21 PXOR X2, X2 // 2 NOP instructions (PXOR) to align
22 PXOR X3, X3 // loop to cache line
24 ANDQ $0xF, CX // Align on 16-byte boundary for ADDPS
25 JZ axpy_no_trim // if CX == 0 { goto axpy_no_trim }
27 XORQ $0xF, CX // CX ^= 0xF (CX is non-zero here); original note: CX = 4 - floor( BX % 16 / 4 )
31 axpy_align: // Trim first value(s) in unaligned buffer do {
32 MOVSS (SI)(AX*4), X2 // X2 = x[i]
33 MULSS X0, X2 // X2 *= a
34 ADDSS (DI)(AX*4), X2 // X2 += y[i]
35 MOVSS X2, (DI)(AX*4) // y[i] = X2
38 JZ axpy_end // if --BX == 0 { return }
39 LOOP axpy_align // } while --CX > 0
42 MOVUPS X0, X1 // Copy X0 to X1 for pipelining
44 ANDQ $0xF, BX // BX = len % 16
45 SHRQ $4, CX // CX = int( len / 16 )
46 JZ axpy_tail4_start // if CX == 0 { goto axpy_tail4_start }
48 axpy_loop: // Loop unrolled 16x do {
49 MOVUPS (SI)(AX*4), X2 // X2 = x[i:i+4]
50 MOVUPS 16(SI)(AX*4), X3 // X3 = x[i+4:i+8]
51 MOVUPS 32(SI)(AX*4), X4 // X4 = x[i+8:i+12]
52 MOVUPS 48(SI)(AX*4), X5 // X5 = x[i+12:i+16]
53 MULPS X0, X2 // X2 *= a
57 ADDPS (DI)(AX*4), X2 // X2 += y[i:i+4]
58 ADDPS 16(DI)(AX*4), X3 // X3 += y[i+4:i+8]
59 ADDPS 32(DI)(AX*4), X4 // X4 += y[i+8:i+12]
60 ADDPS 48(DI)(AX*4), X5 // X5 += y[i+12:i+16]
61 MOVUPS X2, (DI)(AX*4) // y[i:i+4] = X2
62 MOVUPS X3, 16(DI)(AX*4) // y[i+4:i+8] = X3
63 MOVUPS X4, 32(DI)(AX*4) // y[i+8:i+12] = X4
64 MOVUPS X5, 48(DI)(AX*4) // y[i+12:i+16] = X5
65 ADDQ $16, AX // i += 16
66 LOOP axpy_loop // } while --CX > 0
67 CMPQ BX, $0 // if BX == 0 { return }
70 axpy_tail4_start: // Reset loop counter for 4-wide tail loop
71 MOVQ BX, CX // CX = floor( BX / 4 )
73 JZ axpy_tail_start // if CX == 0 { goto axpy_tail_start }
75 axpy_tail4: // Loop unrolled 4x do {
76 MOVUPS (SI)(AX*4), X2 // X2 = x[i:i+4]
77 MULPS X0, X2 // X2 *= a
78 ADDPS (DI)(AX*4), X2 // X2 += y[i:i+4]
79 MOVUPS X2, (DI)(AX*4) // y[i:i+4] = X2
81 LOOP axpy_tail4 // } while --CX > 0
83 axpy_tail_start: // Reset loop counter for 1-wide tail loop
84 MOVQ BX, CX // CX = BX % 4
86 JZ axpy_end // if CX == 0 { return }
89 MOVSS (SI)(AX*4), X1 // X1 = x[i]
90 MULSS X0, X1 // X1 *= a
91 ADDSS (DI)(AX*4), X1 // X1 += y[i]
92 MOVSS X1, (DI)(AX*4) // y[i] = X1
94 LOOP axpy_tail // } while --CX > 0