#include "codegen_x86.h"
#include "dex/quick/mir_to_lir-inl.h"
+#include "oat.h"
#include "x86_lir.h"
namespace art {
EXT_0F_ENCODING_MAP(Subss, 0xF3, 0x5C, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Divsd, 0xF2, 0x5E, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Divss, 0xF3, 0x5E, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Punpcklbw, 0x66, 0x60, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Punpcklwd, 0x66, 0x61, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Punpckldq, 0x66, 0x62, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Punpcklqdq, 0x66, 0x6C, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Sqrtsd, 0xF2, 0x51, REG_DEF0_USE0),
EXT_0F_ENCODING2_MAP(Pmulld, 0x66, 0x38, 0x40, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Pmullw, 0x66, 0xD5, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Pmuludq, 0x66, 0xF4, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Mulps, 0x00, 0x59, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Mulpd, 0x66, 0x59, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Paddb, 0x66, 0xFC, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Paddw, 0x66, 0xFD, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Paddd, 0x66, 0xFE, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Paddq, 0x66, 0xD4, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Psadbw, 0x66, 0xF6, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Addps, 0x00, 0x58, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Addpd, 0x66, 0x58, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Psubb, 0x66, 0xF8, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Psubw, 0x66, 0xF9, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Psubd, 0x66, 0xFA, REG_DEF0_USE0),
+ EXT_0F_ENCODING_MAP(Psubq, 0x66, 0xFB, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Subps, 0x00, 0x5C, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Subpd, 0x66, 0x5C, REG_DEF0_USE0),
EXT_0F_ENCODING_MAP(Pand, 0x66, 0xDB, REG_DEF0_USE0),
{ kX86PsrlwRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x71, 0, 2, 0, 1, false }, "PsrlwRI", "!0r,!1d" },
{ kX86PsrldRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x72, 0, 2, 0, 1, false }, "PsrldRI", "!0r,!1d" },
{ kX86PsrlqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 2, 0, 1, false }, "PsrlqRI", "!0r,!1d" },
+ { kX86PsrldqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 3, 0, 1, false }, "PsrldqRI", "!0r,!1d" },
{ kX86PsllwRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x71, 0, 6, 0, 1, false }, "PsllwRI", "!0r,!1d" },
{ kX86PslldRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x72, 0, 6, 0, 1, false }, "PslldRI", "!0r,!1d" },
{ kX86PsllqRI, kRegImm, IS_BINARY_OP | REG_DEF0_USE0, { 0x66, 0, 0x0F, 0x73, 0, 6, 0, 1, false }, "PsllqRI", "!0r,!1d" },
{ kX86Fucompp, kNullary, NO_OPERAND | USE_FP_STACK, { 0xDA, 0, 0xE9, 0, 0, 0, 0, 0, false }, "Fucompp", "" },
{ kX86Fstsw16R, kNullary, NO_OPERAND | REG_DEFA | USE_FP_STACK, { 0x9B, 0xDF, 0xE0, 0, 0, 0, 0, 0, false }, "Fstsw16R", "ax" },
- EXT_0F_ENCODING_MAP(Mova128, 0x66, 0x6F, REG_DEF0),
- { kX86Mova128MR, kMemReg, IS_STORE | IS_TERTIARY_OP | REG_USE02, { 0x66, 0, 0x0F, 0x6F, 0, 0, 0, 0, false }, "Mova128MR", "[!0r+!1d],!2r" },
- { kX86Mova128AR, kArrayReg, IS_STORE | IS_QUIN_OP | REG_USE014, { 0x66, 0, 0x0F, 0x6F, 0, 0, 0, 0, false }, "Mova128AR", "[!0r+!1r<<!2d+!3d],!4r" },
+ EXT_0F_ENCODING_MAP(Movdqa, 0x66, 0x6F, REG_DEF0),
+ { kX86MovdqaMR, kMemReg, IS_STORE | IS_TERTIARY_OP | REG_USE02, { 0x66, 0, 0x0F, 0x7F, 0, 0, 0, 0, false }, "MovdqaMR", "[!0r+!1d],!2r" },
+ { kX86MovdqaAR, kArrayReg, IS_STORE | IS_QUIN_OP | REG_USE014, { 0x66, 0, 0x0F, 0x7F, 0, 0, 0, 0, false }, "MovdqaAR", "[!0r+!1r<<!2d+!3d],!4r" },
EXT_0F_ENCODING_MAP(Movups, 0x0, 0x10, REG_DEF0),
int offset = AssignInsnOffsets();
if (const_vectors_ != nullptr) {
- /* assign offsets to vector literals */
-
- // First, get offset to 12 mod 16 to align to 16 byte boundary.
- // This will ensure that the vector is 16 byte aligned, as the procedure is
- // always aligned at at 4 mod 16.
- int align_size = (16-4) - (offset & 0xF);
- if (align_size < 0) {
- align_size += 16;
- }
-
- offset += align_size;
+ // Vector literals must be 16-byte aligned. The OatQuickMethodHeader placed
+ // before the code in the code section throws that alignment off, so we take
+ // its size into account. The method start itself is 16-byte aligned on x86.
+ DCHECK_EQ(GetInstructionSetAlignment(cu_->instruction_set), 16u);
+ uint32_t bytes_to_fill = (0x10 - ((offset + sizeof(OatQuickMethodHeader)) & 0xF)) & 0xF;
+ offset += bytes_to_fill;
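+ // Worked example (header size illustrative): if offset == 0x30 and
+ // sizeof(OatQuickMethodHeader) == 0x14, then (0x30 + 0x14) & 0xF == 0x4 and
+ // bytes_to_fill == 0xC; the padded offset 0x3C puts the literals on a
+ // 16-byte boundary once the header is prepended.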
// Now assign each literal the right offset.
for (LIR *p = const_vectors_; p != nullptr; p = p->next) {
#include "dex/reg_storage_eq.h"
#include "mirror/array.h"
#include "mirror/string.h"
+#include "oat.h"
#include "x86_lir.h"
#include "utils/dwarf_cfi.h"
}
RegStorage X86Mir2Lir::Get128BitRegister(RegStorage reg) {
- return GetRegInfo(reg)->FindMatchingView(RegisterInfo::k128SoloStorageMask)->GetReg();
+ return GetRegInfo(reg)->Master()->GetReg();
}
bool X86Mir2Lir::IsByteRegister(RegStorage reg) {
return 128;
}
-int X86Mir2Lir::NumReservableVectorRegisters(bool fp_used) {
- return fp_used ? 5 : 7;
+int X86Mir2Lir::NumReservableVectorRegisters(bool long_or_fp) {
+ int num_vector_temps = cu_->target64 ? xp_temps_64.size() : xp_temps_32.size();
+
+ // Leave a few temps for use by backend as scratch.
+ return long_or_fp ? num_vector_temps - 2 : num_vector_temps - 1;
}
void X86Mir2Lir::SpillCoreRegs() {
rX86_RET1 = rDX;
rX86_INVOKE_TGT = rAX;
rX86_COUNT = rCX;
-
- // Initialize the number of reserved vector registers
- num_reserved_vector_regs_ = -1;
}
Mir2Lir* X86CodeGenerator(CompilationUnit* const cu, MIRGraph* const mir_graph,
DCHECK(method_literal_list_ == nullptr);
DCHECK(class_literal_list_ == nullptr);
- // Align to 16 byte boundary. We have implicit knowledge that the start of the method is
- // on a 4 byte boundary. How can I check this if it changes (other than aligned loads
- // will fail at runtime)?
- if (const_vectors_ != nullptr) {
- int align_size = (16-4) - (code_buffer_.size() & 0xF);
- if (align_size < 0) {
- align_size += 16;
- }
- while (align_size > 0) {
+ if (const_vectors_ != nullptr) {
+ // Vector literals must be 16-byte aligned. The OatQuickMethodHeader placed
+ // before the code in the code section throws that alignment off, so we take
+ // its size into account. The method start itself is 16-byte aligned on x86.
+ DCHECK_EQ(GetInstructionSetAlignment(cu_->instruction_set), 16u);
+ uint32_t bytes_to_fill = (0x10 - ((code_buffer_.size() + sizeof(OatQuickMethodHeader)) & 0xF)) & 0xF;
+ while (bytes_to_fill > 0) {
code_buffer_.push_back(0);
- align_size--;
+ bytes_to_fill--;
}
+
for (LIR *p = const_vectors_; p != nullptr; p = p->next) {
PushWord(&code_buffer_, p->operands[0]);
PushWord(&code_buffer_, p->operands[1]);
ReserveVectorRegisters(mir);
break;
case kMirOpReturnVectorRegisters:
- ReturnVectorRegisters();
+ ReturnVectorRegisters(mir);
break;
case kMirOpConstVector:
GenConst128(bb, mir);
case kMirOpMemBarrier:
GenMemBarrier(static_cast<MemBarrierKind>(mir->dalvikInsn.vA));
break;
+ case kMirOpPackedArrayGet:
+ GenPackedArrayGet(bb, mir);
+ break;
+ case kMirOpPackedArrayPut:
+ GenPackedArrayPut(bb, mir);
+ break;
default:
break;
}
}
void X86Mir2Lir::ReserveVectorRegisters(MIR* mir) {
- // We should not try to reserve twice without returning the registers
- DCHECK_NE(num_reserved_vector_regs_, -1);
-
- int num_vector_reg = mir->dalvikInsn.vA;
- for (int i = 0; i < num_vector_reg; i++) {
+ for (uint32_t i = mir->dalvikInsn.vA; i <= mir->dalvikInsn.vB; i++) {
RegStorage xp_reg = RegStorage::Solo128(i);
RegisterInfo *xp_reg_info = GetRegInfo(xp_reg);
Clobber(xp_reg);
}
}
}
-
- num_reserved_vector_regs_ = num_vector_reg;
}
-void X86Mir2Lir::ReturnVectorRegisters() {
- // Return all the reserved registers
- for (int i = 0; i < num_reserved_vector_regs_; i++) {
+void X86Mir2Lir::ReturnVectorRegisters(MIR* mir) {
+ for (uint32_t i = mir->dalvikInsn.vA; i <= mir->dalvikInsn.vB; i++) {
RegStorage xp_reg = RegStorage::Solo128(i);
RegisterInfo *xp_reg_info = GetRegInfo(xp_reg);
}
}
}
-
- // We don't have anymore reserved vector registers
- num_reserved_vector_regs_ = -1;
}
void X86Mir2Lir::GenConst128(BasicBlock* bb, MIR* mir) {
- store_method_addr_used_ = true;
- int type_size = mir->dalvikInsn.vB;
- // We support 128 bit vectors.
- DCHECK_EQ(type_size & 0xFFFF, 128);
RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vA);
+ Clobber(rs_dest);
+
uint32_t *args = mir->dalvikInsn.arg;
int reg = rs_dest.GetReg();
// Check for all 0 case.
}
// Append the mov const vector to reg opcode.
- AppendOpcodeWithConst(kX86MovupsRM, reg, mir);
+ AppendOpcodeWithConst(kX86MovdqaRM, reg, mir);
}
void X86Mir2Lir::AppendOpcodeWithConst(X86OpCode opcode, int reg, MIR* mir) {
- // Okay, load it from the constant vector area.
- LIR *data_target = ScanVectorLiteral(mir);
+ // The literal pool needs position independent logic.
+ store_method_addr_used_ = true;
+
+ // To get the correct in-memory ordering, reverse the order of the constants.
+ int32_t constants[4];
+ constants[3] = mir->dalvikInsn.arg[0];
+ constants[2] = mir->dalvikInsn.arg[1];
+ constants[1] = mir->dalvikInsn.arg[2];
+ constants[0] = mir->dalvikInsn.arg[3];
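+ // The literal pool emitter pushes operands[0] first, so arg[3] lands at the
+ // lowest address and thus in the lowest lane of the loaded XMM register.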
+
+ // Check whether the pool already contains a constant with this value.
+ LIR *data_target = ScanVectorLiteral(constants);
if (data_target == nullptr) {
- data_target = AddVectorLiteral(mir);
+ data_target = AddVectorLiteral(constants);
}
// Address the start of the method.
// 4 byte offset. We will fix this up in the assembler later to have the right
// value.
ScopedMemRefType mem_ref_type(this, ResourceMask::kLiteral);
- LIR *load = NewLIR2(opcode, reg, rl_method.reg.GetReg());
+ LIR *load = NewLIR3(opcode, reg, rl_method.reg.GetReg(), 256 /* bogus */);
load->flags.fixup = kFixupLoad;
load->target = data_target;
}
// We only support 128 bit registers.
DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vA);
+ Clobber(rs_dest);
RegStorage rs_src = RegStorage::Solo128(mir->dalvikInsn.vB);
- NewLIR2(kX86Mova128RR, rs_dest.GetReg(), rs_src.GetReg());
+ NewLIR2(kX86MovdqaRR, rs_dest.GetReg(), rs_src.GetReg());
}
-void X86Mir2Lir::GenMultiplyVectorSignedByte(BasicBlock *bb, MIR *mir) {
- const int BYTE_SIZE = 8;
- RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
- RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
- RegStorage rs_src1_high_tmp = Get128BitRegister(AllocTempWide());
-
+void X86Mir2Lir::GenMultiplyVectorSignedByte(RegStorage rs_dest_src1, RegStorage rs_src2) {
/*
 * Emulate a kSignedByte multiply by separating out the 16 byte values in the two XMM
 * registers and multiplying 8 at a time before recombining back into one XMM register.
*/
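+ /*
+ * A sketch of the emitted sequence (temp names illustrative):
+ *   movdqa tmp1, src2      ; copy src2, whose high bytes are needed later
+ *   movdqa tmp2, dest      ; copy dest for the same reason
+ *   pmullw dest, src2      ; 16-bit lane products; only low bytes are valid
+ *   pand   dest, 0x00FF..  ; keep the valid low-byte products
+ *   psrlw  tmp1, 8         ; move src2's high bytes into the low position
+ *   pand   tmp2, 0xFF00..  ; isolate dest's high bytes (still in the high position)
+ *   pmullw tmp1, tmp2      ; products land in the high byte of each lane
+ *   por    dest, tmp1      ; recombine into one XMM register
+ */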
// Copy xmm1.
- NewLIR2(kX86Mova128RR, rs_src1_high_tmp.GetReg(), rs_dest_src1.GetReg());
+ RegStorage rs_src1_high_tmp = Get128BitRegister(AllocTempDouble());
+ RegStorage rs_dest_high_tmp = Get128BitRegister(AllocTempDouble());
+ NewLIR2(kX86MovdqaRR, rs_src1_high_tmp.GetReg(), rs_src2.GetReg());
+ NewLIR2(kX86MovdqaRR, rs_dest_high_tmp.GetReg(), rs_dest_src1.GetReg());
// Multiply low bits.
+ // dest = dest * src on each 16-bit lane.
NewLIR2(kX86PmullwRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
// xmm1 now has low bits.
AndMaskVectorRegister(rs_dest_src1, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF);
// Prepare high bits for multiplication.
- NewLIR2(kX86PsrlwRI, rs_src1_high_tmp.GetReg(), BYTE_SIZE);
- AndMaskVectorRegister(rs_src2, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00);
+ NewLIR2(kX86PsrlwRI, rs_src1_high_tmp.GetReg(), 0x8);
+ AndMaskVectorRegister(rs_dest_high_tmp, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00);
// Multiply high bits and xmm2 now has high bits.
- NewLIR2(kX86PmullwRR, rs_src2.GetReg(), rs_src1_high_tmp.GetReg());
+ NewLIR2(kX86PmullwRR, rs_src1_high_tmp.GetReg(), rs_dest_high_tmp.GetReg());
// Combine back into dest XMM register.
- NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
+ NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_src1_high_tmp.GetReg());
+}
+
+void X86Mir2Lir::GenMultiplyVectorLong(RegStorage rs_dest_src1, RegStorage rs_src2) {
+ /*
+ * We need to emulate the packed long multiply.
+ * For kMirOpPackedMultiply xmm1, xmm0:
+ * - xmm1 is src/dest
+ * - xmm0 is src
+ * - Get xmm2 and xmm3 as temp
+ * - The idea is to multiply the lower 32 bits of each operand with the higher 32 bits of the other.
+ * - Then add the two results.
+ * - Move the sum to the upper 32 bits of the destination.
+ * - Then multiply the lower 32 bits of the operands and add the result to the destination.
+ *
+ * (op dest src )
+ * movdqa %xmm2, %xmm1
+ * movdqa %xmm3, %xmm0
+ * psrlq %xmm3, $0x20
+ * pmuludq %xmm3, %xmm2
+ * psrlq %xmm1, $0x20
+ * pmuludq %xmm1, %xmm0
+ * paddq %xmm1, %xmm3
+ * psllq %xmm1, $0x20
+ * pmuludq %xmm2, %xmm0
+ * paddq %xmm1, %xmm2
+ *
+ * When both operands are the same, the lower-32 * higher-32 product needs to be
+ * computed only once, so the xmm3 temp above is not needed. That sequence becomes:
+ *
+ * (op dest src )
+ * movdqa %xmm2, %xmm1
+ * psrlq %xmm1, $0x20
+ * pmuludq %xmm1, %xmm0
+ * paddq %xmm1, %xmm1
+ * psllq %xmm1, $0x20
+ * pmuludq %xmm2, %xmm0
+ * paddq %xmm1, %xmm2
+ *
+ */
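+ // Note: pmuludq multiplies the low 32 bits of each 64-bit lane into a full
+ // 64-bit product, which is why each cross term is shifted into the low half
+ // before multiplying.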
+
+ bool both_operands_same = (rs_dest_src1.GetReg() == rs_src2.GetReg());
+
+ RegStorage rs_tmp_vector_1 = Get128BitRegister(AllocTempDouble());
+ RegStorage rs_tmp_vector_2;
+ NewLIR2(kX86MovdqaRR, rs_tmp_vector_1.GetReg(), rs_dest_src1.GetReg());
+
+ if (!both_operands_same) {
+ rs_tmp_vector_2 = Get128BitRegister(AllocTempDouble());
+ NewLIR2(kX86MovdqaRR, rs_tmp_vector_2.GetReg(), rs_src2.GetReg());
+ NewLIR2(kX86PsrlqRI, rs_tmp_vector_2.GetReg(), 0x20);
+ NewLIR2(kX86PmuludqRR, rs_tmp_vector_2.GetReg(), rs_tmp_vector_1.GetReg());
+ }
+
+ NewLIR2(kX86PsrlqRI, rs_dest_src1.GetReg(), 0x20);
+ NewLIR2(kX86PmuludqRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
+
+ if (!both_operands_same) {
+ NewLIR2(kX86PaddqRR, rs_dest_src1.GetReg(), rs_tmp_vector_2.GetReg());
+ } else {
+ NewLIR2(kX86PaddqRR, rs_dest_src1.GetReg(), rs_dest_src1.GetReg());
+ }
+
+ NewLIR2(kX86PsllqRI, rs_dest_src1.GetReg(), 0x20);
+ NewLIR2(kX86PmuludqRR, rs_tmp_vector_1.GetReg(), rs_src2.GetReg());
+ NewLIR2(kX86PaddqRR, rs_dest_src1.GetReg(), rs_tmp_vector_1.GetReg());
}
void X86Mir2Lir::GenMultiplyVector(BasicBlock *bb, MIR *mir) {
DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+ Clobber(rs_dest_src1);
RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
int opcode = 0;
switch (opsize) {
break;
case kSignedByte:
// HW doesn't support 16x16 byte multiplication so emulate it.
- GenMultiplyVectorSignedByte(bb, mir);
+ GenMultiplyVectorSignedByte(rs_dest_src1, rs_src2);
+ return;
+ case k64:
+ GenMultiplyVectorLong(rs_dest_src1, rs_src2);
return;
default:
LOG(FATAL) << "Unsupported vector multiply " << opsize;
DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+ Clobber(rs_dest_src1);
RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
int opcode = 0;
switch (opsize) {
case k32:
opcode = kX86PadddRR;
break;
+ case k64:
+ opcode = kX86PaddqRR;
+ break;
case kSignedHalf:
case kUnsignedHalf:
opcode = kX86PaddwRR;
DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+ Clobber(rs_dest_src1);
RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
int opcode = 0;
switch (opsize) {
case k32:
opcode = kX86PsubdRR;
break;
+ case k64:
+ opcode = kX86PsubqRR;
+ break;
case kSignedHalf:
case kUnsignedHalf:
opcode = kX86PsubwRR;
}
void X86Mir2Lir::GenShiftByteVector(BasicBlock *bb, MIR *mir) {
+ // The destination does not need to be clobbered here because that has already
+ // been done by the general packed shift handler (the caller of this method).
RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
- RegStorage rs_tmp = Get128BitRegister(AllocTempWide());
int opcode = 0;
- int imm = mir->dalvikInsn.vB;
-
switch (static_cast<ExtendedMIROpcode>(mir->dalvikInsn.opcode)) {
case kMirOpPackedShiftLeft:
opcode = kX86PsllwRI;
break;
case kMirOpPackedSignedShiftRight:
- opcode = kX86PsrawRI;
- break;
case kMirOpPackedUnsignedShiftRight:
- opcode = kX86PsrlwRI;
- break;
+ // TODO Add support for emulated byte shifts.
default:
LOG(FATAL) << "Unsupported shift operation on byte vector " << opcode;
break;
}
- /*
- * xmm1 will have low bits
- * xmm2 will have high bits
- *
- * xmm2 = xmm1
- * xmm1 = xmm1 .<< N
- * xmm2 = xmm2 && 0xFF00FF00FF00FF00FF00FF00FF00FF00
- * xmm2 = xmm2 .<< N
- * xmm1 = xmm1 | xmm2
- */
-
- // Copy xmm1.
- NewLIR2(kX86Mova128RR, rs_tmp.GetReg(), rs_dest_src1.GetReg());
+ // Clear the xmm register and return if the shift amount is at least the byte width.
+ int imm = mir->dalvikInsn.vB;
+ if (imm >= 8) {
+ NewLIR2(kX86PxorRR, rs_dest_src1.GetReg(), rs_dest_src1.GetReg());
+ return;
+ }
// Shift lower values.
NewLIR2(opcode, rs_dest_src1.GetReg(), imm);
- // Mask bottom bits.
- AndMaskVectorRegister(rs_tmp, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00);
-
- // Shift higher values.
- NewLIR2(opcode, rs_tmp.GetReg(), imm);
+ /*
+ * The shift above operates on whole 16-bit words, so bits spill from each
+ * byte into its upper neighbor. To emulate a byte-level shift, we simply
+ * throw away the low N bits of every byte (the spilled bits), and we
+ * are done.
+ */
+ uint8_t byte_mask = 0xFF << imm;
+ uint32_t int_mask = byte_mask;
+ int_mask = int_mask << 8 | byte_mask;
+ int_mask = int_mask << 8 | byte_mask;
+ int_mask = int_mask << 8 | byte_mask;
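+ // For example, imm == 3 gives byte_mask == 0xF8 and int_mask == 0xF8F8F8F8.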
- // Combine back into dest XMM register.
- NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_tmp.GetReg());
+ // AND the destination with the mask.
+ AndMaskVectorRegister(rs_dest_src1, int_mask, int_mask, int_mask, int_mask);
}
void X86Mir2Lir::GenShiftLeftVector(BasicBlock *bb, MIR *mir) {
DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+ Clobber(rs_dest_src1);
int imm = mir->dalvikInsn.vB;
int opcode = 0;
switch (opsize) {
DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+ Clobber(rs_dest_src1);
int imm = mir->dalvikInsn.vB;
int opcode = 0;
switch (opsize) {
case kUnsignedByte:
GenShiftByteVector(bb, mir);
return;
+ case k64:
+ // TODO Implement emulated shift algorithm.
default:
LOG(FATAL) << "Unsupported vector signed shift right " << opsize;
break;
DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+ Clobber(rs_dest_src1);
int imm = mir->dalvikInsn.vB;
int opcode = 0;
switch (opsize) {
// We only support 128 bit registers.
DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+ Clobber(rs_dest_src1);
RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
NewLIR2(kX86PandRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
}
// We only support 128 bit registers.
DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+ Clobber(rs_dest_src1);
RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
NewLIR2(kX86PorRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
}
// We only support 128 bit registers.
DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
RegStorage rs_dest_src1 = RegStorage::Solo128(mir->dalvikInsn.vA);
+ Clobber(rs_dest_src1);
RegStorage rs_src2 = RegStorage::Solo128(mir->dalvikInsn.vB);
NewLIR2(kX86PxorRR, rs_dest_src1.GetReg(), rs_src2.GetReg());
}
void X86Mir2Lir::GenAddReduceVector(BasicBlock *bb, MIR *mir) {
OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
- RegStorage rs_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
- RegLocation rl_dest = mir_graph_->GetDest(mir);
- RegStorage rs_tmp;
-
- int vec_bytes = (mir->dalvikInsn.vC & 0xFFFF) / 8;
- int vec_unit_size = 0;
- int opcode = 0;
- int extr_opcode = 0;
- RegLocation rl_result;
+ RegStorage vector_src = RegStorage::Solo128(mir->dalvikInsn.vB);
+ bool is_wide = opsize == k64 || opsize == kDouble;
+
+ // Get the location of the virtual register. Since this bytecode is overloaded
+ // for different types (and sizes), we need different logic for each path.
+ // The bytecode is designed to use the same VR for both source and destination.
+ RegLocation rl_src, rl_dest, rl_result;
+ if (is_wide) {
+ rl_src = mir_graph_->GetSrcWide(mir, 0);
+ rl_dest = mir_graph_->GetDestWide(mir);
+ } else {
+ rl_src = mir_graph_->GetSrc(mir, 0);
+ rl_dest = mir_graph_->GetDest(mir);
+ }
- switch (opsize) {
- case k32:
- extr_opcode = kX86PextrdRRI;
- opcode = kX86PhadddRR;
- vec_unit_size = 4;
- break;
- case kSignedByte:
- case kUnsignedByte:
- extr_opcode = kX86PextrbRRI;
- opcode = kX86PhaddwRR;
- vec_unit_size = 2;
- break;
- case kSignedHalf:
- case kUnsignedHalf:
- extr_opcode = kX86PextrwRRI;
- opcode = kX86PhaddwRR;
- vec_unit_size = 2;
- break;
- case kSingle:
- rl_result = EvalLoc(rl_dest, kFPReg, true);
- vec_unit_size = 4;
- for (int i = 0; i < 3; i++) {
- NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), rs_src1.GetReg());
- NewLIR3(kX86ShufpsRRI, rs_src1.GetReg(), rs_src1.GetReg(), 0x39);
- }
- NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), rs_src1.GetReg());
- StoreValue(rl_dest, rl_result);
+ // We need a GP temp for the byte, short, and int cases below.
+ RegStorage temp;
- // For single-precision floats, we are done here
- return;
- default:
- LOG(FATAL) << "Unsupported vector add reduce " << opsize;
- break;
- }
+ // There is a different path depending on type and size.
+ if (opsize == kSingle) {
+ // Handle float case.
+ // TODO Add support for fast math (not value safe) and do horizontal add in that case.
- int elems = vec_bytes / vec_unit_size;
+ rl_src = LoadValue(rl_src, kFPReg);
+ rl_result = EvalLoc(rl_dest, kFPReg, true);
- // Emulate horizontal add instruction by reducing 2 vectors with 8 values before adding them again
- // TODO is overflow handled correctly?
- if (opsize == kSignedByte || opsize == kUnsignedByte) {
- rs_tmp = Get128BitRegister(AllocTempWide());
+ // Since we are doing an add-reduce, we move the reg holding the VR
+ // into the result so that it is included in the sum.
+ OpRegCopy(rl_result.reg, rl_src.reg);
+ NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), vector_src.GetReg());
- // tmp = xmm1 .>> 8.
- NewLIR2(kX86Mova128RR, rs_tmp.GetReg(), rs_src1.GetReg());
- NewLIR2(kX86PsrlwRI, rs_tmp.GetReg(), 8);
+ // Since FP must preserve the order of operations for value safety, we rotate
+ // each element into the low 32 bits and add it to the result.
+ for (int i = 0; i < 3; i++) {
+ NewLIR3(kX86ShufpsRRI, vector_src.GetReg(), vector_src.GetReg(), 0x39);
+ NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), vector_src.GetReg());
+ }
- // Zero extend low bits in xmm1.
- AndMaskVectorRegister(rs_src1, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF);
- }
+ StoreValue(rl_dest, rl_result);
+ } else if (opsize == kDouble) {
+ // Handle double case.
+ rl_src = LoadValueWide(rl_src, kFPReg);
+ rl_result = EvalLocWide(rl_dest, kFPReg, true);
+ LOG(FATAL) << "Unsupported vector add reduce for double.";
+ } else if (opsize == k64) {
+ /*
+ * Handle long case:
+ * 1) Reduce the vector register to lower half (with addition).
+ * 1-1) Get an xmm temp and fill it with vector register.
+ * 1-2) Shift the xmm temp right by 8 bytes.
+ * 1-3) Add the xmm temp to the vector register that is being reduced.
+ * 2) Allocate temp GP / GP pair.
+ * 2-1) In 64-bit case, use movq to move result to a 64-bit GP.
+ * 2-2) In 32-bit case, use movd twice to move to 32-bit GP pair.
+ * 3) Finish the add reduction by doing what add-long/2addr does,
+ * but instead of having a VR as one of the sources, we have our temp GP.
+ */
+ RegStorage rs_tmp_vector = Get128BitRegister(AllocTempDouble());
+ NewLIR2(kX86MovdqaRR, rs_tmp_vector.GetReg(), vector_src.GetReg());
+ NewLIR2(kX86PsrldqRI, rs_tmp_vector.GetReg(), 8);
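+ // PSRLDQ shifts by whole bytes, so an immediate of 8 moves the upper
+ // quadword into the lower half.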
+ NewLIR2(kX86PaddqRR, vector_src.GetReg(), rs_tmp_vector.GetReg());
+ FreeTemp(rs_tmp_vector);
+
+ // We would like to be able to reuse the add-long implementation, so set up a fake
+ // register location to pass to it.
+ RegLocation temp_loc = mir_graph_->GetBadLoc();
+ temp_loc.core = 1;
+ temp_loc.wide = 1;
+ temp_loc.location = kLocPhysReg;
+ temp_loc.reg = AllocTempWide();
+
+ if (cu_->target64) {
+ DCHECK(!temp_loc.reg.IsPair());
+ NewLIR2(kX86MovqrxRR, temp_loc.reg.GetReg(), vector_src.GetReg());
+ } else {
+ NewLIR2(kX86MovdrxRR, temp_loc.reg.GetLowReg(), vector_src.GetReg());
+ NewLIR2(kX86PsrlqRI, vector_src.GetReg(), 0x20);
+ NewLIR2(kX86MovdrxRR, temp_loc.reg.GetHighReg(), vector_src.GetReg());
+ }
- while (elems > 1) {
- if (opsize == kSignedByte || opsize == kUnsignedByte) {
- NewLIR2(opcode, rs_tmp.GetReg(), rs_tmp.GetReg());
+ GenArithOpLong(Instruction::ADD_LONG_2ADDR, rl_dest, temp_loc, temp_loc);
+ } else if (opsize == kSignedByte || opsize == kUnsignedByte) {
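+ // PSADBW against a zeroed register sums the bytes of each 8-byte half into
+ // that half's low 16 bits; PSHUFD then swaps the halves so the final add
+ // combines them (byte semantics only need the sum modulo 256).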
+ RegStorage rs_tmp = Get128BitRegister(AllocTempDouble());
+ NewLIR2(kX86PxorRR, rs_tmp.GetReg(), rs_tmp.GetReg());
+ NewLIR2(kX86PsadbwRR, vector_src.GetReg(), rs_tmp.GetReg());
+ NewLIR3(kX86PshufdRRI, rs_tmp.GetReg(), vector_src.GetReg(), 0x4e);
+ NewLIR2(kX86PaddbRR, vector_src.GetReg(), rs_tmp.GetReg());
+ // Move to a GPR
+ temp = AllocTemp();
+ NewLIR2(kX86MovdrxRR, temp.GetReg(), vector_src.GetReg());
+ } else {
+ // Handle the int and short cases together.
+
+ // Pick the horizontal-add and extract opcodes, plus the unit size, based on
+ // the packed element type.
+ int vec_bytes = (mir->dalvikInsn.vC & 0xFFFF) / 8;
+ int vec_unit_size;
+ int horizontal_add_opcode;
+ int extract_opcode;
+
+ if (opsize == kSignedHalf || opsize == kUnsignedHalf) {
+ extract_opcode = kX86PextrwRRI;
+ horizontal_add_opcode = kX86PhaddwRR;
+ vec_unit_size = 2;
+ } else if (opsize == k32) {
+ vec_unit_size = 4;
+ horizontal_add_opcode = kX86PhadddRR;
+ extract_opcode = kX86PextrdRRI;
+ } else {
+ LOG(FATAL) << "Unsupported vector add reduce " << opsize;
+ return;
}
- NewLIR2(opcode, rs_src1.GetReg(), rs_src1.GetReg());
- elems >>= 1;
- }
- // Combine the results if we separated them.
- if (opsize == kSignedByte || opsize == kUnsignedByte) {
- NewLIR2(kX86PaddbRR, rs_src1.GetReg(), rs_tmp.GetReg());
- }
+ int elems = vec_bytes / vec_unit_size;
- // We need to extract to a GPR.
- RegStorage temp = AllocTemp();
- NewLIR3(extr_opcode, temp.GetReg(), rs_src1.GetReg(), 0);
+ while (elems > 1) {
+ NewLIR2(horizontal_add_opcode, vector_src.GetReg(), vector_src.GetReg());
+ elems >>= 1;
+ }
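+ // Each horizontal add above halves the element count by summing adjacent
+ // pairs; e.g. 8 halfwords become 4 partial sums, then 2, then 1 in the
+ // low lane.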
- // Can we do this directly into memory?
- rl_result = UpdateLocTyped(rl_dest, kCoreReg);
- if (rl_result.location == kLocPhysReg) {
- // Ensure res is in a core reg
- rl_result = EvalLoc(rl_dest, kCoreReg, true);
- OpRegReg(kOpAdd, rl_result.reg, temp);
- StoreFinalValue(rl_dest, rl_result);
- } else {
- OpMemReg(kOpAdd, rl_result, temp.GetReg());
- }
+ // Handle this as an arithmetic unary case.
+ ScopedMemRefType mem_ref_type(this, ResourceMask::kDalvikReg);
- FreeTemp(temp);
+ // Extract to a GP register because this is integral typed.
+ temp = AllocTemp();
+ NewLIR3(extract_opcode, temp.GetReg(), vector_src.GetReg(), 0);
+ }
+
+ if (opsize != k64 && opsize != kSingle && opsize != kDouble) {
+ // The logic below looks very similar to the handling of ADD_INT_2ADDR
+ // except the rhs is not a VR but a physical register allocated above.
+ // The source VR is not loaded because rl_result is assumed to share its
+ // physical register / memory location.
+ rl_result = UpdateLocTyped(rl_dest, kCoreReg);
+ if (rl_result.location == kLocPhysReg) {
+ // Ensure res is in a core reg.
+ rl_result = EvalLoc(rl_dest, kCoreReg, true);
+ OpRegReg(kOpAdd, rl_result.reg, temp);
+ StoreFinalValue(rl_dest, rl_result);
+ } else {
+ // Do the addition directly to memory.
+ OpMemReg(kOpAdd, rl_result, temp.GetReg());
+ }
+ }
}
void X86Mir2Lir::GenReduceVector(BasicBlock *bb, MIR *mir) {
OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
RegLocation rl_dest = mir_graph_->GetDest(mir);
- RegStorage rs_src1 = RegStorage::Solo128(mir->dalvikInsn.vB);
+ RegStorage vector_src = RegStorage::Solo128(mir->dalvikInsn.vB);
int extract_index = mir->dalvikInsn.arg[0];
int extr_opcode = 0;
RegLocation rl_result;
bool is_wide = false;
- switch (opsize) {
- case k32:
- rl_result = UpdateLocTyped(rl_dest, kCoreReg);
- extr_opcode = (rl_result.location == kLocPhysReg) ? kX86PextrdMRI : kX86PextrdRRI;
- break;
- case kSignedHalf:
- case kUnsignedHalf:
- rl_result= UpdateLocTyped(rl_dest, kCoreReg);
- extr_opcode = (rl_result.location == kLocPhysReg) ? kX86PextrwMRI : kX86PextrwRRI;
- break;
- default:
- LOG(FATAL) << "Unsupported vector add reduce " << opsize;
- return;
- break;
- }
+ // There is a different path depending on type and size.
+ if (opsize == kSingle) {
+ // Handle float case.
+ // TODO Add support for fast math (not value safe) and do horizontal add in that case.
- if (rl_result.location == kLocPhysReg) {
- NewLIR3(extr_opcode, rl_result.reg.GetReg(), rs_src1.GetReg(), extract_index);
- if (is_wide == true) {
- StoreFinalValue(rl_dest, rl_result);
+ rl_result = EvalLoc(rl_dest, kFPReg, true);
+ NewLIR2(kX86PxorRR, rl_result.reg.GetReg(), rl_result.reg.GetReg());
+ NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), vector_src.GetReg());
+
+ // Since FP must preserve the order of operations for value safety, we rotate
+ // each element into the low 32 bits and add it to the result.
+ for (int i = 0; i < 3; i++) {
+ NewLIR3(kX86ShufpsRRI, vector_src.GetReg(), vector_src.GetReg(), 0x39);
+ NewLIR2(kX86AddssRR, rl_result.reg.GetReg(), vector_src.GetReg());
+ }
+
+ StoreValue(rl_dest, rl_result);
+ } else if (opsize == kDouble) {
+ // TODO Handle double case.
+ LOG(FATAL) << "Unsupported add reduce for double.";
+ } else if (opsize == k64) {
+ /*
+ * Handle long case:
+ * 1) Reduce the vector register to lower half (with addition).
+ * 1-1) Get an xmm temp and fill it with vector register.
+ * 1-2) Shift the xmm temp right by 8 bytes.
+ * 1-3) Add the xmm temp to the vector register that is being reduced.
+ * 2) Evaluate destination to a GP / GP pair.
+ * 2-1) In 64-bit case, use movq to move result to a 64-bit GP.
+ * 2-2) In 32-bit case, use movd twice to move to 32-bit GP pair.
+ * 3) Store the result to the final destination.
+ */
+ RegStorage rs_tmp_vector = Get128BitRegister(AllocTempDouble());
+ NewLIR2(kX86MovdqaRR, rs_tmp_vector.GetReg(), vector_src.GetReg());
+ NewLIR2(kX86PsrldqRI, rs_tmp_vector.GetReg(), 8);
+ NewLIR2(kX86PaddqRR, vector_src.GetReg(), rs_tmp_vector.GetReg());
+ FreeTemp(rs_tmp_vector);
+
+ rl_result = EvalLocWide(rl_dest, kCoreReg, true);
+ if (cu_->target64) {
+ DCHECK(!rl_result.reg.IsPair());
+ NewLIR2(kX86MovqrxRR, rl_result.reg.GetReg(), vector_src.GetReg());
} else {
- StoreFinalValueWide(rl_dest, rl_result);
+ NewLIR2(kX86MovdrxRR, rl_result.reg.GetLowReg(), vector_src.GetReg());
+ NewLIR2(kX86PsrlqRI, vector_src.GetReg(), 0x20);
+ NewLIR2(kX86MovdrxRR, rl_result.reg.GetHighReg(), vector_src.GetReg());
}
+
+ StoreValueWide(rl_dest, rl_result);
} else {
- int displacement = SRegOffset(rl_result.s_reg_low);
- LIR *l = NewLIR3(extr_opcode, rs_rX86_SP.GetReg(), displacement, rs_src1.GetReg());
- AnnotateDalvikRegAccess(l, displacement >> 2, true /* is_load */, is_wide /* is_64bit */);
- AnnotateDalvikRegAccess(l, displacement >> 2, false /* is_load */, is_wide /* is_64bit */);
+ // Handle the rest of integral types now.
+ switch (opsize) {
+ case k32:
+ rl_result = UpdateLocTyped(rl_dest, kCoreReg);
+ extr_opcode = (rl_result.location == kLocPhysReg) ? kX86PextrdMRI : kX86PextrdRRI;
+ break;
+ case kSignedHalf:
+ case kUnsignedHalf:
+ rl_result = UpdateLocTyped(rl_dest, kCoreReg);
+ extr_opcode = (rl_result.location == kLocPhysReg) ? kX86PextrwMRI : kX86PextrwRRI;
+ break;
+ default:
+ LOG(FATAL) << "Unsupported vector reduce " << opsize;
+ return;
+ }
+
+ if (rl_result.location == kLocPhysReg) {
+ NewLIR3(extr_opcode, rl_result.reg.GetReg(), vector_src.GetReg(), extract_index);
+ if (is_wide) {
+ StoreFinalValueWide(rl_dest, rl_result);
+ } else {
+ StoreFinalValue(rl_dest, rl_result);
+ }
+ } else {
+ int displacement = SRegOffset(rl_result.s_reg_low);
+ LIR *l = NewLIR3(extr_opcode, rs_rX86_SP.GetReg(), displacement, vector_src.GetReg());
+ AnnotateDalvikRegAccess(l, displacement >> 2, true /* is_load */, is_wide /* is_64bit */);
+ AnnotateDalvikRegAccess(l, displacement >> 2, false /* is_load */, is_wide /* is_64bit */);
+ }
}
}
DCHECK_EQ(mir->dalvikInsn.vC & 0xFFFF, 128U);
OpSize opsize = static_cast<OpSize>(mir->dalvikInsn.vC >> 16);
RegStorage rs_dest = RegStorage::Solo128(mir->dalvikInsn.vA);
- int op_low = 0, op_high = 0, imm = 0, op_mov = kX86MovdxrRR;
+ Clobber(rs_dest);
+ int op_shuffle = 0, op_shuffle_high = 0, op_mov = kX86MovdxrRR;
RegisterClass reg_type = kCoreReg;
+ bool is_wide = false;
switch (opsize) {
case k32:
- op_low = kX86PshufdRRI;
+ op_shuffle = kX86PshufdRRI;
break;
case kSingle:
- op_low = kX86PshufdRRI;
- op_mov = kX86Mova128RR;
+ op_shuffle = kX86PshufdRRI;
+ op_mov = kX86MovdqaRR;
reg_type = kFPReg;
break;
case k64:
- op_low = kX86PshufdRRI;
- imm = 0x44;
- break;
- case kDouble:
- op_low = kX86PshufdRRI;
- op_mov = kX86Mova128RR;
- reg_type = kFPReg;
- imm = 0x44;
+ op_shuffle = kX86PunpcklqdqRR;
+ op_mov = kX86MovqxrRR;
+ is_wide = true;
break;
case kSignedByte:
case kUnsignedByte:
- // Shuffle 8 bit value into 16 bit word.
- // We set val = val + (val << 8) below and use 16 bit shuffle.
+ // The source will have been loaded into a
+ // double-word before we use this shuffle.
+ op_shuffle = kX86PshufdRRI;
+ break;
case kSignedHalf:
case kUnsignedHalf:
// Handles low quadword.
- op_low = kX86PshuflwRRI;
+ op_shuffle = kX86PshuflwRRI;
// Handles upper quadword.
- op_high = kX86PshufdRRI;
+ op_shuffle_high = kX86PshufdRRI;
break;
default:
LOG(FATAL) << "Unsupported vector set " << opsize;
break;
}
- RegLocation rl_src = mir_graph_->GetSrc(mir, 0);
-
- // Load the value from the VR into the reg.
- if (rl_src.wide == 0) {
+ // Load the value from the VR into a physical register.
+ RegLocation rl_src;
+ if (!is_wide) {
+ rl_src = mir_graph_->GetSrc(mir, 0);
rl_src = LoadValue(rl_src, reg_type);
} else {
+ rl_src = mir_graph_->GetSrcWide(mir, 0);
rl_src = LoadValueWide(rl_src, reg_type);
}
+ RegStorage reg_to_shuffle = rl_src.reg;
- // If opsize is 8 bits wide then double value and use 16 bit shuffle instead.
- if (opsize == kSignedByte || opsize == kUnsignedByte) {
- RegStorage temp = AllocTemp();
- // val = val + (val << 8).
- NewLIR2(kX86Mov32RR, temp.GetReg(), rl_src.reg.GetReg());
- NewLIR2(kX86Sal32RI, temp.GetReg(), 8);
- NewLIR2(kX86Or32RR, rl_src.reg.GetReg(), temp.GetReg());
- FreeTemp(temp);
+ // Load the value into the XMM register.
+ if (!cu_->target64 && opsize == k64) {
+ // Logic assumes that longs are loaded in GP register pairs.
+ NewLIR2(kX86MovdxrRR, rs_dest.GetReg(), reg_to_shuffle.GetLowReg());
+ RegStorage r_tmp = AllocTempDouble();
+ NewLIR2(kX86MovdxrRR, r_tmp.GetReg(), reg_to_shuffle.GetHighReg());
+ NewLIR2(kX86PunpckldqRR, rs_dest.GetReg(), r_tmp.GetReg());
+ FreeTemp(r_tmp);
+ } else {
+ NewLIR2(op_mov, rs_dest.GetReg(), reg_to_shuffle.GetReg());
}
- // Load the value into the XMM register.
- NewLIR2(op_mov, rs_dest.GetReg(), rl_src.reg.GetReg());
+ if (opsize == kSignedByte || opsize == kUnsignedByte) {
+ // In the byte case, first duplicate it into a word,
+ // then duplicate it into a double-word.
+ NewLIR2(kX86PunpcklbwRR, rs_dest.GetReg(), rs_dest.GetReg());
+ NewLIR2(kX86PunpcklwdRR, rs_dest.GetReg(), rs_dest.GetReg());
+ }
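+ // For example, a source byte 0x42 becomes 0x4242 after punpcklbw and then
+ // 0x42424242 after punpcklwd; the shuffle below broadcasts that double-word
+ // across the whole register.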
// Now shuffle the value across the destination.
- NewLIR3(op_low, rs_dest.GetReg(), rs_dest.GetReg(), imm);
+ if (op_shuffle == kX86PunpcklqdqRR) {
+ NewLIR2(op_shuffle, rs_dest.GetReg(), rs_dest.GetReg());
+ } else {
+ NewLIR3(op_shuffle, rs_dest.GetReg(), rs_dest.GetReg(), 0);
+ }
// And then repeat as needed.
- if (op_high != 0) {
- NewLIR3(op_high, rs_dest.GetReg(), rs_dest.GetReg(), imm);
+ if (op_shuffle_high != 0) {
+ NewLIR3(op_shuffle_high, rs_dest.GetReg(), rs_dest.GetReg(), 0);
}
}
-LIR *X86Mir2Lir::ScanVectorLiteral(MIR *mir) {
- int *args = reinterpret_cast<int*>(mir->dalvikInsn.arg);
+void X86Mir2Lir::GenPackedArrayGet(BasicBlock *bb, MIR *mir) {
+ UNIMPLEMENTED(FATAL) << "Extended opcode kMirOpPackedArrayGet not supported.";
+}
+
+void X86Mir2Lir::GenPackedArrayPut(BasicBlock *bb, MIR *mir) {
+ UNIMPLEMENTED(FATAL) << "Extended opcode kMirOpPackedArrayPut not supported.";
+}
+
+LIR* X86Mir2Lir::ScanVectorLiteral(int32_t* constants) {
for (LIR *p = const_vectors_; p != nullptr; p = p->next) {
- if (args[0] == p->operands[0] && args[1] == p->operands[1] &&
- args[2] == p->operands[2] && args[3] == p->operands[3]) {
+ if (constants[0] == p->operands[0] && constants[1] == p->operands[1] &&
+ constants[2] == p->operands[2] && constants[3] == p->operands[3]) {
return p;
}
}
return nullptr;
}
-LIR *X86Mir2Lir::AddVectorLiteral(MIR *mir) {
+LIR* X86Mir2Lir::AddVectorLiteral(int32_t* constants) {
LIR* new_value = static_cast<LIR*>(arena_->Alloc(sizeof(LIR), kArenaAllocData));
- int *args = reinterpret_cast<int*>(mir->dalvikInsn.arg);
- new_value->operands[0] = args[0];
- new_value->operands[1] = args[1];
- new_value->operands[2] = args[2];
- new_value->operands[3] = args[3];
+ new_value->operands[0] = constants[0];
+ new_value->operands[1] = constants[1];
+ new_value->operands[2] = constants[2];
+ new_value->operands[3] = constants[3];
new_value->next = const_vectors_;
if (const_vectors_ == nullptr) {
- estimated_native_code_size_ += 12; // Amount needed to align to 16 byte boundary.
+ estimated_native_code_size_ += 15; // Maximum needed to align to 16 byte boundary.
}
estimated_native_code_size_ += 16; // Space for one vector.
const_vectors_ = new_value;