X-Git-Url: http://git.osdn.net/view?p=bytom%2Fvapor.git;a=blobdiff_plain;f=vendor%2Fgonum.org%2Fv1%2Fgonum%2Finternal%2Fasm%2Ff32%2Fdotunitary_amd64.s;fp=vendor%2Fgonum.org%2Fv1%2Fgonum%2Finternal%2Fasm%2Ff32%2Fdotunitary_amd64.s;h=0000000000000000000000000000000000000000;hp=581a8f4cd51546193a2a40ef0af530212b1cbd48;hb=54373c1a3efe0e373ec1605840a4363e4b246c46;hpb=ee01d543fdfe1fd0a4d548965c66f7923ea7b062 diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f32/dotunitary_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f32/dotunitary_amd64.s deleted file mode 100644 index 581a8f4c..00000000 --- a/vendor/gonum.org/v1/gonum/internal/asm/f32/dotunitary_amd64.s +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright ©2017 The Gonum Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//+build !noasm,!appengine - -#include "textflag.h" - -#define HADDPS_SUM_SUM LONG $0xC07C0FF2 // @ HADDPS X0, X0 - -#define X_PTR SI -#define Y_PTR DI -#define LEN CX -#define TAIL BX -#define IDX AX -#define SUM X0 -#define P_SUM X1 - -// func DotUnitary(x, y []float32) (sum float32) -TEXT ·DotUnitary(SB), NOSPLIT, $0 - MOVQ x_base+0(FP), X_PTR // X_PTR = &x - MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y - PXOR SUM, SUM // SUM = 0 - MOVQ x_len+8(FP), LEN // LEN = min( len(x), len(y) ) - CMPQ y_len+32(FP), LEN - CMOVQLE y_len+32(FP), LEN - CMPQ LEN, $0 - JE dot_end - - XORQ IDX, IDX - MOVQ Y_PTR, DX - ANDQ $0xF, DX // Align on 16-byte boundary for MULPS - JZ dot_no_trim // if DX == 0 { goto dot_no_trim } - SUBQ $16, DX - -dot_align: // Trim first value(s) in unaligned buffer do { - MOVSS (X_PTR)(IDX*4), X2 // X2 = x[i] - MULSS (Y_PTR)(IDX*4), X2 // X2 *= y[i] - ADDSS X2, SUM // SUM += X2 - INCQ IDX // IDX++ - DECQ LEN - JZ dot_end // if --TAIL == 0 { return } - ADDQ $4, DX - JNZ dot_align // } while --DX > 0 - -dot_no_trim: - PXOR P_SUM, P_SUM // P_SUM = 0 for pipelining - MOVQ LEN, TAIL - ANDQ $0xF, TAIL // TAIL = LEN % 16 - SHRQ $4, LEN // LEN = floor( LEN / 16 ) - JZ dot_tail4_start // if LEN == 0 { goto dot_tail4_start } - -dot_loop: // Loop unrolled 16x do { - MOVUPS (X_PTR)(IDX*4), X2 // X_i = x[i:i+1] - MOVUPS 16(X_PTR)(IDX*4), X3 - MOVUPS 32(X_PTR)(IDX*4), X4 - MOVUPS 48(X_PTR)(IDX*4), X5 - - MULPS (Y_PTR)(IDX*4), X2 // X_i *= y[i:i+1] - MULPS 16(Y_PTR)(IDX*4), X3 - MULPS 32(Y_PTR)(IDX*4), X4 - MULPS 48(Y_PTR)(IDX*4), X5 - - ADDPS X2, SUM // SUM += X_i - ADDPS X3, P_SUM - ADDPS X4, SUM - ADDPS X5, P_SUM - - ADDQ $16, IDX // IDX += 16 - DECQ LEN - JNZ dot_loop // } while --LEN > 0 - - ADDPS P_SUM, SUM // SUM += P_SUM - CMPQ TAIL, $0 // if TAIL == 0 { return } - JE dot_end - -dot_tail4_start: // Reset loop counter for 4-wide tail loop - MOVQ TAIL, LEN // LEN = floor( TAIL / 4 ) - SHRQ $2, LEN - JZ dot_tail_start // if LEN == 0 { goto dot_tail_start } - -dot_tail4_loop: // Loop unrolled 4x do { - MOVUPS (X_PTR)(IDX*4), X2 // X_i = x[i:i+1] - MULPS (Y_PTR)(IDX*4), X2 // X_i *= y[i:i+1] - ADDPS X2, SUM // SUM += X_i - ADDQ $4, IDX // i += 4 - DECQ LEN - JNZ dot_tail4_loop // } while --LEN > 0 - -dot_tail_start: // Reset loop counter for 1-wide tail loop - ANDQ $3, TAIL // TAIL = TAIL % 4 - JZ dot_end // if TAIL == 0 { return } - -dot_tail: // do { - MOVSS (X_PTR)(IDX*4), X2 // X2 = x[i] - MULSS (Y_PTR)(IDX*4), X2 // X2 *= y[i] - ADDSS X2, SUM // psum += X2 - INCQ IDX // IDX++ - DECQ TAIL - JNZ dot_tail // } while --TAIL > 0 - -dot_end: - HADDPS_SUM_SUM // SUM = \sum{ SUM[i] } - HADDPS_SUM_SUM - MOVSS SUM, sum+48(FP) // return SUM - RET