1 // Copyright ©2016 The Gonum Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 //+build !noasm,!appengine
10 #define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA
12 #define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC
14 #define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE
16 #define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8
19 #define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
21 #define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
23 #define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
25 #define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
27 // func AxpyUnitaryTo(dst []complex128, alpha complex64, x, y []complex128)
28 TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0
29 MOVQ dst_base+0(FP), DI // DI = &dst
30 MOVQ x_base+40(FP), SI // SI = &x
31 MOVQ y_base+64(FP), DX // DX = &y
32 MOVQ x_len+48(FP), CX // CX = min( len(x), len(y), len(dst) )
34 CMOVQLE y_len+72(FP), CX
35 CMPQ dst_len+8(FP), CX
36 CMOVQLE dst_len+8(FP), CX
37 CMPQ CX, $0 // if CX == 0 { return }
39 MOVUPS alpha+24(FP), X0 // X0 = { imag(a), real(a) }
41 SHUFPD $0x1, X1, X1 // X1 = { real(a), imag(a) }
43 MOVAPS X0, X10 // Copy X0 and X1 for pipelining
46 ANDQ $3, CX // CX = n % 4
47 SHRQ $2, BX // BX = floor( n / 4 )
48 JZ caxy_tail // if BX == 0 { goto caxy_tail }
51 MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
52 MOVUPS 16(SI)(AX*8), X4
53 MOVUPS 32(SI)(AX*8), X6
54 MOVUPS 48(SI)(AX*8), X8
56 // X_(i+1) = { real(x[i], real(x[i]) }
57 MOVDDUP_X2_X3 // Load and duplicate imag elements (xi, xi)
62 // X_i = { imag(x[i]), imag(x[i]) }
63 SHUFPD $0x3, X2, X2 // duplicate real elements (xr, xr)
68 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
69 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
80 // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
81 // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
88 // X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
90 ADDPD 16(DX)(AX*8), X5
91 ADDPD 32(DX)(AX*8), X7
92 ADDPD 48(DX)(AX*8), X9
93 MOVUPS X3, (DI)(AX*8) // y[i] = X_(i+1)
94 MOVUPS X5, 16(DI)(AX*8)
95 MOVUPS X7, 32(DI)(AX*8)
96 MOVUPS X9, 48(DI)(AX*8)
99 JNZ caxy_loop // } while --BX > 0
100 CMPQ CX, $0 // if CX == 0 { return }
103 caxy_tail: // Same calculation, but read in values to avoid trampling memory
104 MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
105 MOVDDUP_X2_X3 // X_(i+1) = { real(x[i], real(x[i]) }
106 SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
107 MULPD X1, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
108 MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
111 // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
112 // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
116 // X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
118 MOVUPS X3, (DI)(AX*8) // y[i] = X_(i+1)
119 ADDQ $2, AX // i += 2
120 LOOP caxy_tail // } while --CX > 0