// Copyright ©2017 The Gonum Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. //+build !noasm,!appengine #include "textflag.h" #define HADDPD_SUM_SUM LONG $0xC07C0F66 // @ HADDPD X0, X0 #define X_PTR SI #define Y_PTR DI #define LEN CX #define TAIL BX #define IDX AX #define SUM X0 #define P_SUM X1 // func DdotUnitary(x, y []float32) (sum float32) TEXT ·DdotUnitary(SB), NOSPLIT, $0 MOVQ x_base+0(FP), X_PTR // X_PTR = &x MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y MOVQ x_len+8(FP), LEN // LEN = min( len(x), len(y) ) CMPQ y_len+32(FP), LEN CMOVQLE y_len+32(FP), LEN PXOR SUM, SUM // psum = 0 CMPQ LEN, $0 JE dot_end XORQ IDX, IDX MOVQ Y_PTR, DX ANDQ $0xF, DX // Align on 16-byte boundary for ADDPS JZ dot_no_trim // if DX == 0 { goto dot_no_trim } SUBQ $16, DX dot_align: // Trim first value(s) in unaligned buffer do { CVTSS2SD (X_PTR)(IDX*4), X2 // X2 = float64(x[i]) CVTSS2SD (Y_PTR)(IDX*4), X3 // X3 = float64(y[i]) MULSD X3, X2 ADDSD X2, SUM // SUM += X2 INCQ IDX // IDX++ DECQ LEN JZ dot_end // if --TAIL == 0 { return } ADDQ $4, DX JNZ dot_align // } while --LEN > 0 dot_no_trim: PXOR P_SUM, P_SUM // P_SUM = 0 for pipelining MOVQ LEN, TAIL ANDQ $0x7, TAIL // TAIL = LEN % 8 SHRQ $3, LEN // LEN = floor( LEN / 8 ) JZ dot_tail_start // if LEN == 0 { goto dot_tail_start } dot_loop: // Loop unrolled 8x do { CVTPS2PD (X_PTR)(IDX*4), X2 // X_i = x[i:i+1] CVTPS2PD 8(X_PTR)(IDX*4), X3 CVTPS2PD 16(X_PTR)(IDX*4), X4 CVTPS2PD 24(X_PTR)(IDX*4), X5 CVTPS2PD (Y_PTR)(IDX*4), X6 // X_j = y[i:i+1] CVTPS2PD 8(Y_PTR)(IDX*4), X7 CVTPS2PD 16(Y_PTR)(IDX*4), X8 CVTPS2PD 24(Y_PTR)(IDX*4), X9 MULPD X6, X2 // X_i *= X_j MULPD X7, X3 MULPD X8, X4 MULPD X9, X5 ADDPD X2, SUM // SUM += X_i ADDPD X3, P_SUM ADDPD X4, SUM ADDPD X5, P_SUM ADDQ $8, IDX // IDX += 8 DECQ LEN JNZ dot_loop // } while --LEN > 0 ADDPD P_SUM, SUM // SUM += P_SUM CMPQ TAIL, $0 // if TAIL == 0 { return } JE dot_end dot_tail_start: MOVQ TAIL, LEN SHRQ $1, LEN JZ dot_tail_one dot_tail_two: CVTPS2PD (X_PTR)(IDX*4), X2 // X_i = x[i:i+1] CVTPS2PD (Y_PTR)(IDX*4), X6 // X_j = y[i:i+1] MULPD X6, X2 // X_i *= X_j ADDPD X2, SUM // SUM += X_i ADDQ $2, IDX // IDX += 2 DECQ LEN JNZ dot_tail_two // } while --LEN > 0 ANDQ $1, TAIL JZ dot_end dot_tail_one: CVTSS2SD (X_PTR)(IDX*4), X2 // X2 = float64(x[i]) CVTSS2SD (Y_PTR)(IDX*4), X3 // X3 = float64(y[i]) MULSD X3, X2 // X2 *= X3 ADDSD X2, SUM // SUM += X2 dot_end: HADDPD_SUM_SUM // SUM = \sum{ SUM[i] } MOVSD SUM, sum+48(FP) // return SUM RET