From 6f5f5d05caed8465ad15ca5728e2a30c7a080d94 Mon Sep 17 00:00:00 2001 From: Maxim Kazantsev Date: Mon, 8 Dec 2014 12:39:16 +0600 Subject: [PATCH] ART: Implement FP packed reduce for x86 This patch implements correct FP vector reduction by index. Previous implementation corresponded to packed add reduction. Change-Id: I02a9bcb8e8945937ba7a511b723f23ec30667d34 --- compiler/dex/quick/x86/target_x86.cc | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/compiler/dex/quick/x86/target_x86.cc b/compiler/dex/quick/x86/target_x86.cc index 5f6cdda0d..230f611de 100755 --- a/compiler/dex/quick/x86/target_x86.cc +++ b/compiler/dex/quick/x86/target_x86.cc @@ -2209,18 +2209,36 @@ void X86Mir2Lir::GenReduceVector(MIR* mir) { // Handle float case. // TODO Add support for fast math (not value safe) and do horizontal add in that case. + int extract_index = mir->dalvikInsn.arg[0]; + rl_result = EvalLoc(rl_dest, kFPReg, true); NewLIR2(kX86PxorRR, rl_result.reg.GetReg(), rl_result.reg.GetReg()); - NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), vector_src.GetReg()); - // Since FP must keep order of operation for value safety, we shift to low - // 32-bits and add to result. - for (int i = 0; i < 3; i++) { - NewLIR3(kX86ShufpsRRI, vector_src.GetReg(), vector_src.GetReg(), 0x39); + if (LIKELY(extract_index != 0)) { + // We know the index of element which we want to extract. We want to extract it and + // keep values in vector register correct for future use. So the way we act is: + // 1. Generate shuffle mask that allows to swap zeroth and required elements; + // 2. Shuffle vector register with this mask; + // 3. Extract zeroth element where required value lies; + // 4. Shuffle with same mask again to restore original values in vector register. + // The mask is generated from equivalence mask 0b11100100 swapping 0th and extracted + // element indices. + int shuffle[4] = {0b00, 0b01, 0b10, 0b11}; + shuffle[0] = extract_index; + shuffle[extract_index] = 0; + int mask = 0; + for (int i = 0; i < 4; i++) { + mask |= (shuffle[i] << (2 * i)); + } + NewLIR3(kX86ShufpsRRI, vector_src.GetReg(), vector_src.GetReg(), mask); + NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), vector_src.GetReg()); + NewLIR3(kX86ShufpsRRI, vector_src.GetReg(), vector_src.GetReg(), mask); + } else { + // We need to extract zeroth element and don't need any complex stuff to do it. NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), vector_src.GetReg()); } - StoreValue(rl_dest, rl_result); + StoreFinalValue(rl_dest, rl_result); } else if (opsize == kDouble) { // TODO Handle double case. LOG(FATAL) << "Unsupported add reduce for double."; -- 2.11.0