1 // Copyright ©2016 The Gonum Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 //+build !noasm,!appengine
// func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
//
// AxpyInc performs the strided update
//	y[iy + i*incY] += alpha * x[ix + i*incX]   for i = 0 .. n-1
// on float32 data. Arguments are read from the frame at the offsets used
// below (alpha+0, x_base+8, y_base+32, n+56, incX+64, incY+72, ix+80, iy+88).
//
// Register roles:
//	SI      read pointer into x
//	DX      read pointer into y  (unrolled loop only)
//	DI      write pointer into y (also the read pointer in the tail loop)
//	R8, R9  incX, incY scaled to bytes (element stride * 4)
//	CX      loop counter (n/4 for the unrolled loop, then n%4 for the tail)
//	BX      n % 4, the number of elements left for the tail loop
//	X0, X1  alpha (two copies, alternated between lanes)
//	X2-X5   the four elements processed per unrolled iteration
TEXT ·AxpyInc(SB), NOSPLIT, $0
	MOVQ  n+56(FP), CX      // CX = n
	CMPQ  CX, $0            // if int(n) <= 0 { return }
	JLE   axpyi_end         // must bail out here: LOOP with CX == 0 would wrap around
	MOVQ  x_base+8(FP), SI  // SI = &x
	MOVQ  y_base+32(FP), DI // DI = &y
	MOVQ  ix+80(FP), R8     // R8 = ix
	MOVQ  iy+88(FP), R9     // R9 = iy
	LEAQ  (SI)(R8*4), SI    // SI = &(x[ix])
	LEAQ  (DI)(R9*4), DI    // DI = &(y[iy])
	MOVQ  DI, DX            // DX = DI    Read pointer for y
	MOVQ  incX+64(FP), R8   // R8 = incX
	SHLQ  $2, R8            // R8 *= sizeof(float32)
	MOVQ  incY+72(FP), R9   // R9 = incY
	SHLQ  $2, R9            // R9 *= sizeof(float32)
	MOVSS alpha+0(FP), X0   // X0 = alpha
	MOVSS X0, X1            // X1 = X0    // for pipelining
	MOVQ  CX, BX            // BX = n (copy before CX is shifted down)
	ANDQ  $3, BX            // BX = n % 4
	SHRQ  $2, CX            // CX = floor( n / 4 )
	JZ    axpyi_tail_start  // if CX == 0 { goto axpyi_tail_start }

axpyi_loop: // Loop unrolled 4x   do {
	MOVSS (SI), X2          // X_i = x[i]
	MOVSS (SI)(R8*1), X3
	LEAQ  (SI)(R8*2), SI    // SI = &(SI[incX*2])
	MOVSS (SI), X4
	MOVSS (SI)(R8*1), X5
	MULSS X1, X2            // X_i *= a
	MULSS X0, X3
	MULSS X1, X4
	MULSS X0, X5
	ADDSS (DX), X2          // X_i += y[i]
	ADDSS (DX)(R9*1), X3
	LEAQ  (DX)(R9*2), DX    // DX = &(DX[incY*2])
	ADDSS (DX), X4
	ADDSS (DX)(R9*1), X5
	MOVSS X2, (DI)          // y[i] = X_i
	MOVSS X3, (DI)(R9*1)
	LEAQ  (DI)(R9*2), DI    // DI = &(DI[incY*2])
	MOVSS X4, (DI)
	MOVSS X5, (DI)(R9*1)
	LEAQ  (SI)(R8*2), SI    // SI = &(SI[incX*2])    // Increment addresses
	LEAQ  (DX)(R9*2), DX    // DX = &(DX[incY*2])
	LEAQ  (DI)(R9*2), DI    // DI = &(DI[incY*2])
	LOOP  axpyi_loop        // } while --CX > 0
	CMPQ  BX, $0            // if BX == 0 { return }
	JE    axpyi_end         // no remainder: skip the tail so LOOP never starts at CX == 0

axpyi_tail_start: // Reset loop registers
	MOVQ BX, CX             // Loop counter: CX = BX

axpyi_tail: // do {
	MOVSS (SI), X2          // X2 = x[i]
	MULSS X1, X2            // X2 *= a
	ADDSS (DI), X2          // X2 += y[i]
	MOVSS X2, (DI)          // y[i] = X2
	ADDQ  R8, SI            // SI = &(SI[incX])
	ADDQ  R9, DI            // DI = &(DI[incY])
	LOOP  axpyi_tail        // } while --CX > 0

axpyi_end:
	RET