// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Recovered from a deleted-file diff of
// vendor/gonum.org/v1/gonum/internal/asm/f32/ddotinc_amd64.s.

//+build !noasm,!appengine

#include "textflag.h"

// Register roles.
//
// X_CUR / Y_CUR     address of the element of x / y processed next
// COUNT             n on entry, then the number of 4-element groups left
// REMAIN            n % 4, the elements handled by the scalar loop
// STEP_X / STEP_Y   byte distance between consecutive elements of x / y
// STEP3_X / STEP3_Y three times STEP_X / STEP_Y
// ACC_A             main accumulator; holds the returned sum
// ACC_B             second accumulator used only by the unrolled loop
#define X_CUR SI
#define Y_CUR DI
#define COUNT CX
#define REMAIN BX
#define STEP_X R8
#define STEP3_X R10
#define STEP_Y R9
#define STEP3_Y R11
#define ACC_A X0
#define ACC_B X1

// func DdotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float64)
//
// DdotInc returns the sum over i = 0 .. n-1 of
//	float64(x[ix+i*incX]) * float64(y[iy+i*incY])
// Each float32 element is widened to float64 before it is multiplied, and
// the products are accumulated in float64.
//
// Elements are consumed four at a time: products 0 and 2 of each group are
// added to ACC_A, products 1 and 3 to ACC_B. ACC_B is folded into ACC_A
// after the last full group, and the remaining n % 4 products are then
// added to ACC_A one by one.
//
// When n is 0 the result is 0 and neither slice is read. The routine does
// no bounds checking on the indices it is given.
TEXT ·DdotInc(SB), NOSPLIT, $0
	MOVQ  x_base+0(FP), X_CUR               // X_CUR = &x[0]
	MOVQ  y_base+24(FP), Y_CUR              // Y_CUR = &y[0]
	MOVQ  n+48(FP), COUNT                   // COUNT = n
	PXOR  ACC_A, ACC_A                      // ACC_A = 0
	TESTQ COUNT, COUNT
	JZ    done                              // if n == 0 { return 0 }

	// Move both cursors to their starting elements. The step registers are
	// borrowed as scratch here; they get their real values just below.
	MOVQ ix+72(FP), STEP_X
	MOVQ iy+80(FP), STEP_Y
	LEAQ (X_CUR)(STEP_X*4), X_CUR           // X_CUR = &x[ix]
	LEAQ (Y_CUR)(STEP_Y*4), Y_CUR           // Y_CUR = &y[iy]

	// Turn the element increments into byte strides (4 bytes per float32).
	MOVQ incX+56(FP), STEP_X
	SHLQ $2, STEP_X                         // STEP_X = incX * 4
	MOVQ incY+64(FP), STEP_Y
	SHLQ $2, STEP_Y                         // STEP_Y = incY * 4

	// Split n into full groups of four plus a remainder. The JZ consumes
	// the zero flag produced by SHRQ, so nothing may come between them.
	MOVQ COUNT, REMAIN
	ANDQ $3, REMAIN                         // REMAIN = n % 4
	SHRQ $2, COUNT                          // COUNT = n / 4
	JZ   scalar_loop                        // no full group: REMAIN is n, which is non-zero

	PXOR ACC_B, ACC_B                       // ACC_B = 0
	LEAQ (STEP_X)(STEP_X*2), STEP3_X        // STEP3_X = 3 * STEP_X
	LEAQ (STEP_Y)(STEP_Y*2), STEP3_Y        // STEP3_Y = 3 * STEP_Y

quad_loop:
	// Element 0 of the group.
	CVTSS2SD (X_CUR), X2
	CVTSS2SD (Y_CUR), X6
	MULSD    X6, X2                         // X2 = x0 * y0

	// Element 1 of the group.
	CVTSS2SD (X_CUR)(STEP_X*1), X3
	CVTSS2SD (Y_CUR)(STEP_Y*1), X7
	MULSD    X7, X3                         // X3 = x1 * y1

	// Element 2 of the group.
	CVTSS2SD (X_CUR)(STEP_X*2), X4
	CVTSS2SD (Y_CUR)(STEP_Y*2), X8
	MULSD    X8, X4                         // X4 = x2 * y2

	// Element 3 of the group.
	CVTSS2SD (X_CUR)(STEP3_X*1), X5
	CVTSS2SD (Y_CUR)(STEP3_Y*1), X9
	MULSD    X9, X5                         // X5 = x3 * y3

	// Alternate between the two accumulators so that successive additions
	// do not all wait on the same register.
	ADDSD X2, ACC_A
	ADDSD X3, ACC_B
	ADDSD X4, ACC_A
	ADDSD X5, ACC_B

	LEAQ (X_CUR)(STEP_X*4), X_CUR           // advance x by four elements
	LEAQ (Y_CUR)(STEP_Y*4), Y_CUR           // advance y by four elements

	DECQ COUNT
	JNZ  quad_loop                          // repeat while groups remain

	ADDSD ACC_B, ACC_A                      // fold the second accumulator in
	TESTQ REMAIN, REMAIN
	JZ    done                              // n was a multiple of four

scalar_loop:
	CVTSS2SD (X_CUR), X2                    // X2 = float64(x[i])
	CVTSS2SD (Y_CUR), X3                    // X3 = float64(y[i])
	MULSD    X3, X2                         // X2 *= X3
	ADDSD    X2, ACC_A                      // ACC_A += X2
	ADDQ     STEP_X, X_CUR                  // next element of x
	ADDQ     STEP_Y, Y_CUR                  // next element of y
	DECQ     REMAIN
	JNZ      scalar_loop                    // repeat while elements remain

done:
	MOVSD ACC_A, sum+88(FP)                 // return ACC_A
	RET