1 // Copyright ©2016 The Gonum Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 //+build !noasm,!appengine
9 // func AxpyUnitaryTo(dst []float32, alpha float32, x, y []float32)
10 TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0
11 MOVQ dst_base+0(FP), DI // DI = &dst
12 MOVQ x_base+32(FP), SI // SI = &x
13 MOVQ y_base+56(FP), DX // DX = &y
14 MOVQ x_len+40(FP), BX // BX = min( len(x), len(y), len(dst) )
16 CMOVQLE y_len+64(FP), BX
17 CMPQ dst_len+8(FP), BX
18 CMOVQLE dst_len+8(FP), BX
19 CMPQ BX, $0 // if BX == 0 { return }
21 MOVSS alpha+24(FP), X0
22 SHUFPS $0, X0, X0 // X0 = { a, a, a, a, }
25 ANDQ $0xF, CX // Align on 16-byte boundary for ADDPS
26 JZ axpy_no_trim // if CX == 0 { goto axpy_no_trim }
28 XORQ $0xF, CX // CX = 4 - floor ( B % 16 / 4 )
32 axpy_align: // Trim first value(s) in unaligned buffer do {
33 MOVSS (SI)(AX*4), X2 // X2 = x[i]
34 MULSS X0, X2 // X2 *= a
35 ADDSS (DX)(AX*4), X2 // X2 += y[i]
36 MOVSS X2, (DI)(AX*4) // y[i] = X2
39 JZ axpy_end // if --BX == 0 { return }
40 LOOP axpy_align // } while --CX > 0
43 MOVUPS X0, X1 // Copy X0 to X1 for pipelining
45 ANDQ $0xF, BX // BX = len % 16
46 SHRQ $4, CX // CX = floor( len / 16 )
47 JZ axpy_tail4_start // if CX == 0 { return }
49 axpy_loop: // Loop unrolled 16x do {
50 MOVUPS (SI)(AX*4), X2 // X2 = x[i:i+4]
51 MOVUPS 16(SI)(AX*4), X3
52 MOVUPS 32(SI)(AX*4), X4
53 MOVUPS 48(SI)(AX*4), X5
54 MULPS X0, X2 // X2 *= a
58 ADDPS (DX)(AX*4), X2 // X2 += y[i:i+4]
59 ADDPS 16(DX)(AX*4), X3
60 ADDPS 32(DX)(AX*4), X4
61 ADDPS 48(DX)(AX*4), X5
62 MOVUPS X2, (DI)(AX*4) // dst[i:i+4] = X2
63 MOVUPS X3, 16(DI)(AX*4)
64 MOVUPS X4, 32(DI)(AX*4)
65 MOVUPS X5, 48(DI)(AX*4)
66 ADDQ $16, AX // i += 16
67 LOOP axpy_loop // while (--CX) > 0
68 CMPQ BX, $0 // if BX == 0 { return }
71 axpy_tail4_start: // Reset loop counter for 4-wide tail loop
72 MOVQ BX, CX // CX = floor( BX / 4 )
74 JZ axpy_tail_start // if CX == 0 { goto axpy_tail_start }
76 axpy_tail4: // Loop unrolled 4x do {
77 MOVUPS (SI)(AX*4), X2 // X2 = x[i]
78 MULPS X0, X2 // X2 *= a
79 ADDPS (DX)(AX*4), X2 // X2 += y[i]
80 MOVUPS X2, (DI)(AX*4) // y[i] = X2
82 LOOP axpy_tail4 // } while --CX > 0
84 axpy_tail_start: // Reset loop counter for 1-wide tail loop
85 MOVQ BX, CX // CX = BX % 4
87 JZ axpy_end // if CX == 0 { return }
90 MOVSS (SI)(AX*4), X1 // X1 = x[i]
91 MULSS X0, X1 // X1 *= a
92 ADDSS (DX)(AX*4), X1 // X1 += y[i]
93 MOVSS X1, (DI)(AX*4) // y[i] = X1
95 LOOP axpy_tail // } while --CX > 0