+++ /dev/null
-// Copyright ©2017 The Gonum Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//+build !noasm,!appengine
-
-#include "textflag.h"
-
-// HADDPS_SUM_SUM emits the raw encoding of HADDPS X0, X0 through a LONG
-// directive. The operands are baked into the encoding, so the macro is only
-// valid while SUM is defined as X0.
-#define HADDPS_SUM_SUM LONG $0xC07C0FF2 // @ HADDPS X0, X0
-
-// Register roles used by DotUnitary.
-// X_PTR: base address of x.
-#define X_PTR SI
-// Y_PTR: base address of y.
-#define Y_PTR DI
-// LEN: number of elements still to process; reused as the iteration
-// counter of the 16-wide and 4-wide loops.
-#define LEN CX
-// TAIL: elements left over after the 16-wide loop (LEN % 16), then the
-// elements left over after the 4-wide loop (TAIL % 4).
-#define TAIL BX
-// IDX: element index into x and y; scaled by 4 in the address operands.
-#define IDX AX
-// SUM: main accumulator; its lanes are reduced to the return value.
-#define SUM X0
-// P_SUM: second accumulator used inside the 16-wide loop and folded into
-// SUM when that loop finishes.
-#define P_SUM X1
-
-// func DotUnitary(x, y []float32) (sum float32)
-//
-// DotUnitary returns the sum of x[i]*y[i] for i in [0, min(len(x), len(y))).
-// An empty range returns 0.
-//
-// Processing happens in four phases:
-//   1. scalar steps until &y[IDX] sits on a 16-byte boundary,
-//   2. a loop handling 16 elements per iteration with two accumulators,
-//   3. a loop handling 4 elements per iteration,
-//   4. scalar steps for the last 0-3 elements.
-// Partial sums live in separate SIMD lanes and are only combined at
-// dot_end, so the additions are not performed in simple left-to-right order.
-TEXT ·DotUnitary(SB), NOSPLIT, $0
- MOVQ x_base+0(FP), X_PTR // X_PTR = &x[0]
- MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y[0]
- PXOR SUM, SUM // SUM = 0 (all lanes)
- MOVQ x_len+8(FP), LEN // LEN = min( len(x), len(y) )
- CMPQ y_len+32(FP), LEN
- CMOVQLE y_len+32(FP), LEN
- CMPQ LEN, $0 // if LEN == 0 { return 0 }
- JE dot_end
-
- XORQ IDX, IDX // IDX = 0
- MOVQ Y_PTR, DX
- ANDQ $0xF, DX // DX = Y_PTR % 16; align on 16-byte boundary for MULPS
- JZ dot_no_trim // if DX == 0 { goto dot_no_trim }
- SUBQ $16, DX // DX = -(bytes from Y_PTR to the next 16-byte boundary)
-
-dot_align: // Scalar steps until &y[IDX] is 16-byte aligned: do {
- MOVSS (X_PTR)(IDX*4), X2 // X2 = x[i]
- MULSS (Y_PTR)(IDX*4), X2 // X2 *= y[i]
- ADDSS X2, SUM // SUM[0] += X2
- INCQ IDX // IDX++
- DECQ LEN
- JZ dot_end // if --LEN == 0 { return }
- ADDQ $4, DX // DX += 4 (one float32 closer to the boundary)
- JNZ dot_align // } while DX != 0
- // NOTE(review): DX only reaches 0 if Y_PTR is a multiple of 4, which is
- // assumed to hold for a []float32; otherwise this loop runs until LEN == 0.
-
-dot_no_trim:
- PXOR P_SUM, P_SUM // P_SUM = 0 for pipelining
- MOVQ LEN, TAIL
- ANDQ $0xF, TAIL // TAIL = LEN % 16
- SHRQ $4, LEN // LEN = floor( LEN / 16 )
- JZ dot_tail4_start // if LEN == 0 { goto dot_tail4_start }
-
-dot_loop: // Loop unrolled 16x do {
- MOVUPS (X_PTR)(IDX*4), X2 // X2 = x[i:i+4] (unaligned load; x is not aligned)
- MOVUPS 16(X_PTR)(IDX*4), X3 // X3 = x[i+4:i+8]
- MOVUPS 32(X_PTR)(IDX*4), X4 // X4 = x[i+8:i+12]
- MOVUPS 48(X_PTR)(IDX*4), X5 // X5 = x[i+12:i+16]
-
- MULPS (Y_PTR)(IDX*4), X2 // X2 *= y[i:i+4] (y address aligned by dot_align)
- MULPS 16(Y_PTR)(IDX*4), X3 // X3 *= y[i+4:i+8]
- MULPS 32(Y_PTR)(IDX*4), X4 // X4 *= y[i+8:i+12]
- MULPS 48(Y_PTR)(IDX*4), X5 // X5 *= y[i+12:i+16]
-
- ADDPS X2, SUM // SUM += X2 (alternate accumulators to shorten the add chain)
- ADDPS X3, P_SUM // P_SUM += X3
- ADDPS X4, SUM // SUM += X4
- ADDPS X5, P_SUM // P_SUM += X5
-
- ADDQ $16, IDX // IDX += 16
- DECQ LEN
- JNZ dot_loop // } while --LEN > 0
-
- ADDPS P_SUM, SUM // SUM += P_SUM
- CMPQ TAIL, $0 // if TAIL == 0 { return }
- JE dot_end
-
-dot_tail4_start: // Reset loop counter for 4-wide tail loop
- MOVQ TAIL, LEN // LEN = floor( TAIL / 4 )
- SHRQ $2, LEN
- JZ dot_tail_start // if LEN == 0 { goto dot_tail_start }
-
-dot_tail4_loop: // Loop unrolled 4x do {
- MOVUPS (X_PTR)(IDX*4), X2 // X2 = x[i:i+4]
- MULPS (Y_PTR)(IDX*4), X2 // X2 *= y[i:i+4]
- ADDPS X2, SUM // SUM += X2
- ADDQ $4, IDX // i += 4
- DECQ LEN
- JNZ dot_tail4_loop // } while --LEN > 0
-
-dot_tail_start: // Reset loop counter for 1-wide tail loop
- ANDQ $3, TAIL // TAIL = TAIL % 4
- JZ dot_end // if TAIL == 0 { return }
-
-dot_tail: // do {
- MOVSS (X_PTR)(IDX*4), X2 // X2 = x[i]
- MULSS (Y_PTR)(IDX*4), X2 // X2 *= y[i]
- ADDSS X2, SUM // SUM[0] += X2
- INCQ IDX // IDX++
- DECQ TAIL
- JNZ dot_tail // } while --TAIL > 0
-
-dot_end:
- HADDPS_SUM_SUM // SUM = \sum{ SUM[i] }: first pass adds adjacent lane pairs
- HADDPS_SUM_SUM // second pass leaves the total of all four lanes in lane 0
- MOVSS SUM, sum+48(FP) // return SUM[0]
- RET