From: Guillaume Sanchez
Date: Wed, 15 Apr 2015 18:57:27 +0000 (+0000)
Subject: Revert "[optimizing] Improve x86 parallel moves/swaps"
X-Git-Tag: android-x86-7.1-r1~889^2~1507^2
X-Git-Url: http://git.osdn.net/view?a=commitdiff_plain;h=e14590bdfed24df30e6b7545fc819ba03ff8bba1;p=android-x86%2Fart.git

Revert "[optimizing] Improve x86 parallel moves/swaps"

This reverts commit a5c19ce8d200d68a528f2ce0ebff989106c4a933.

The reverted commit introduces a 30% performance regression on CaffeineLogic.

Change-Id: I917e206e249d44e1748537bc1b2d31054ea4959d
---

diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 183453ded..4d7468350 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -3958,43 +3958,23 @@ X86Assembler* ParallelMoveResolverX86::GetAssembler() const {
 }
 
 void ParallelMoveResolverX86::MoveMemoryToMemory32(int dst, int src) {
-  ScratchRegisterScope possible_scratch(
-      this, kNoRegister, codegen_->GetNumberOfCoreRegisters());
-  int temp = possible_scratch.GetRegister();
-  if (temp == kNoRegister) {
-    // Use the stack.
-    __ pushl(Address(ESP, src));
-    __ popl(Address(ESP, dst));
-  } else {
-    Register temp_reg = static_cast<Register>(temp);
-    __ movl(temp_reg, Address(ESP, src));
-    __ movl(Address(ESP, dst), temp_reg);
-  }
+  ScratchRegisterScope ensure_scratch(
+      this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters());
+  Register temp_reg = static_cast<Register>(ensure_scratch.GetRegister());
+  int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0;
+  __ movl(temp_reg, Address(ESP, src + stack_offset));
+  __ movl(Address(ESP, dst + stack_offset), temp_reg);
 }
 
 void ParallelMoveResolverX86::MoveMemoryToMemory64(int dst, int src) {
-  ScratchRegisterScope possible_scratch(
-      this, kNoRegister, codegen_->GetNumberOfCoreRegisters());
-  int temp = possible_scratch.GetRegister();
-  if (temp == kNoRegister) {
-    // Use the stack instead.
-    // Push src low word.
-    __ pushl(Address(ESP, src));
-    // Push src high word. Stack offset = 4.
-    __ pushl(Address(ESP, src + 4 /* offset */ + kX86WordSize /* high */));
-
-    // Pop into dst high word. Stack offset = 8.
-    // Pop with ESP address uses the 'after increment' value of ESP.
-    __ popl(Address(ESP, dst + 4 /* offset */ + kX86WordSize /* high */));
-    // Finally dst low word. Stack offset = 4.
-    __ popl(Address(ESP, dst));
-  } else {
-    Register temp_reg = static_cast<Register>(temp);
-    __ movl(temp_reg, Address(ESP, src));
-    __ movl(Address(ESP, dst), temp_reg);
-    __ movl(temp_reg, Address(ESP, src + kX86WordSize));
-    __ movl(Address(ESP, dst + kX86WordSize), temp_reg);
-  }
+  ScratchRegisterScope ensure_scratch(
+      this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters());
+  Register temp_reg = static_cast<Register>(ensure_scratch.GetRegister());
+  int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0;
+  __ movl(temp_reg, Address(ESP, src + stack_offset));
+  __ movl(Address(ESP, dst + stack_offset), temp_reg);
+  __ movl(temp_reg, Address(ESP, src + stack_offset + kX86WordSize));
+  __ movl(Address(ESP, dst + stack_offset + kX86WordSize), temp_reg);
 }
 
 void ParallelMoveResolverX86::EmitMove(size_t index) {
@@ -4059,18 +4039,10 @@ void ParallelMoveResolverX86::EmitMove(size_t index) {
         __ xorps(dest, dest);
       } else {
         ScratchRegisterScope ensure_scratch(
-            this, kNoRegister, codegen_->GetNumberOfCoreRegisters());
-        int temp_reg = ensure_scratch.GetRegister();
-        if (temp_reg == kNoRegister) {
-          // Avoid spilling/restoring a scratch register by using the stack.
-          __ pushl(Immediate(value));
-          __ movss(dest, Address(ESP, 0));
-          __ addl(ESP, Immediate(4));
-        } else {
-          Register temp = static_cast<Register>(temp_reg);
-          __ movl(temp, Immediate(value));
-          __ movd(dest, temp);
-        }
+            this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters());
+        Register temp = static_cast<Register>(ensure_scratch.GetRegister());
+        __ movl(temp, Immediate(value));
+        __ movd(dest, temp);
       }
     } else {
       DCHECK(destination.IsStackSlot()) << destination;
@@ -4119,96 +4091,42 @@ void ParallelMoveResolverX86::EmitMove(size_t index) {
   }
 }
 
-void ParallelMoveResolverX86::Exchange(Register reg1, Register reg2) {
-  // Prefer to avoid xchg as it isn't speedy on smaller processors.
-  ScratchRegisterScope possible_scratch(
-      this, reg1, codegen_->GetNumberOfCoreRegisters());
-  int temp_reg = possible_scratch.GetRegister();
-  if (temp_reg == kNoRegister || temp_reg == reg2) {
-    __ pushl(reg1);
-    __ movl(reg1, reg2);
-    __ popl(reg2);
-  } else {
-    Register temp = static_cast<Register>(temp_reg);
-    __ movl(temp, reg1);
-    __ movl(reg1, reg2);
-    __ movl(reg2, temp);
-  }
-}
-
 void ParallelMoveResolverX86::Exchange(Register reg, int mem) {
-  ScratchRegisterScope possible_scratch(
-      this, reg, codegen_->GetNumberOfCoreRegisters());
-  int temp_reg = possible_scratch.GetRegister();
-  if (temp_reg == kNoRegister) {
-    __ pushl(Address(ESP, mem));
-    __ movl(Address(ESP, mem + kX86WordSize), reg);
-    __ popl(reg);
-  } else {
-    Register temp = static_cast<Register>(temp_reg);
-    __ movl(temp, Address(ESP, mem));
-    __ movl(Address(ESP, mem), reg);
-    __ movl(reg, temp);
-  }
+  Register suggested_scratch = reg == EAX ? EBX : EAX;
+  ScratchRegisterScope ensure_scratch(
+      this, reg, suggested_scratch, codegen_->GetNumberOfCoreRegisters());
+
+  int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0;
+  __ movl(static_cast<Register>(ensure_scratch.GetRegister()), Address(ESP, mem + stack_offset));
+  __ movl(Address(ESP, mem + stack_offset), reg);
+  __ movl(reg, static_cast<Register>(ensure_scratch.GetRegister()));
 }
 
 void ParallelMoveResolverX86::Exchange32(XmmRegister reg, int mem) {
-  ScratchRegisterScope possible_scratch(
-      this, kNoRegister, codegen_->GetNumberOfCoreRegisters());
-  int temp_reg = possible_scratch.GetRegister();
-  if (temp_reg == kNoRegister) {
-    __ pushl(Address(ESP, mem));
-    __ movss(Address(ESP, mem + kX86WordSize), reg);
-    __ movss(reg, Address(ESP, 0));
-    __ addl(ESP, Immediate(kX86WordSize));
-  } else {
-    Register temp = static_cast<Register>(temp_reg);
-    __ movl(temp, Address(ESP, mem));
-    __ movss(Address(ESP, mem), reg);
-    __ movd(reg, temp);
-  }
+  ScratchRegisterScope ensure_scratch(
+      this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters());
+
+  Register temp_reg = static_cast<Register>(ensure_scratch.GetRegister());
+  int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0;
+  __ movl(temp_reg, Address(ESP, mem + stack_offset));
+  __ movss(Address(ESP, mem + stack_offset), reg);
+  __ movd(reg, temp_reg);
 }
 
 void ParallelMoveResolverX86::Exchange(int mem1, int mem2) {
-  ScratchRegisterScope possible_scratch1(
-      this, kNoRegister, codegen_->GetNumberOfCoreRegisters());
-  int temp_reg1 = possible_scratch1.GetRegister();
-  if (temp_reg1 == kNoRegister) {
-    // No free registers. Use the stack.
-    __ pushl(Address(ESP, mem1));
-    __ pushl(Address(ESP, mem2 + kX86WordSize));
-    // Pop with ESP address uses the 'after increment' value of ESP.
-    __ popl(Address(ESP, mem1 + kX86WordSize));
-    __ popl(Address(ESP, mem2));
-  } else {
-    // Got the first one. Try for a second.
-    ScratchRegisterScope possible_scratch2(
-        this, temp_reg1, codegen_->GetNumberOfCoreRegisters());
-    int temp_reg2 = possible_scratch2.GetRegister();
-    if (temp_reg2 == kNoRegister) {
-      Register temp = static_cast<Register>(temp_reg1);
-      // Bummer. Only have one free register to use.
-      // Save mem1 on the stack.
-      __ pushl(Address(ESP, mem1));
-
-      // Copy mem2 into mem1.
-      __ movl(temp, Address(ESP, mem2 + kX86WordSize));
-      __ movl(Address(ESP, mem1 + kX86WordSize), temp);
-
-      // Now pop mem1 into mem2.
-      // Pop with ESP address uses the 'after increment' value of ESP.
-      __ popl(Address(ESP, mem2));
-    } else {
-      // Great. We have 2 registers to play with.
-      Register temp1 = static_cast<Register>(temp_reg1);
-      Register temp2 = static_cast<Register>(temp_reg2);
-      DCHECK_NE(temp1, temp2);
-      __ movl(temp1, Address(ESP, mem1));
-      __ movl(temp2, Address(ESP, mem2));
-      __ movl(Address(ESP, mem2), temp1);
-      __ movl(Address(ESP, mem1), temp2);
-    }
-  }
+  ScratchRegisterScope ensure_scratch1(
+      this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters());
+
+  Register suggested_scratch = ensure_scratch1.GetRegister() == EAX ? EBX : EAX;
+  ScratchRegisterScope ensure_scratch2(
+      this, ensure_scratch1.GetRegister(), suggested_scratch, codegen_->GetNumberOfCoreRegisters());
+
+  int stack_offset = ensure_scratch1.IsSpilled() ? kX86WordSize : 0;
+  stack_offset += ensure_scratch2.IsSpilled() ? kX86WordSize : 0;
+  __ movl(static_cast<Register>(ensure_scratch1.GetRegister()), Address(ESP, mem1 + stack_offset));
+  __ movl(static_cast<Register>(ensure_scratch2.GetRegister()), Address(ESP, mem2 + stack_offset));
+  __ movl(Address(ESP, mem2 + stack_offset), static_cast<Register>(ensure_scratch1.GetRegister()));
+  __ movl(Address(ESP, mem1 + stack_offset), static_cast<Register>(ensure_scratch2.GetRegister()));
 }
 
 void ParallelMoveResolverX86::EmitSwap(size_t index) {
@@ -4217,7 +4135,7 @@ void ParallelMoveResolverX86::EmitSwap(size_t index) {
   Location destination = move->GetDestination();
 
   if (source.IsRegister() && destination.IsRegister()) {
-    Exchange(destination.AsRegister<Register>(), source.AsRegister<Register>());
+    __ xchgl(destination.AsRegister<Register>(), source.AsRegister<Register>());
   } else if (source.IsRegister() && destination.IsStackSlot()) {
     Exchange(source.AsRegister<Register>(), destination.GetStackIndex());
   } else if (source.IsStackSlot() && destination.IsRegister()) {
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index b2420e407..e6e7fb7b4 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -106,7 +106,6 @@ class ParallelMoveResolverX86 : public ParallelMoveResolver {
   X86Assembler* GetAssembler() const;
 
  private:
-  void Exchange(Register reg1, Register reg2);
   void Exchange(Register reg, int mem);
   void Exchange(int mem1, int mem2);
   void Exchange32(XmmRegister reg, int mem);
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index f714214e8..5710ec57d 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -3838,27 +3838,15 @@ void ParallelMoveResolverX86_64::Exchange64(CpuRegister reg, int mem) {
 
 void ParallelMoveResolverX86_64::Exchange64(int mem1, int mem2) {
   ScratchRegisterScope ensure_scratch(
-      this, TMP, codegen_->GetNumberOfCoreRegisters());
-
-  int temp_reg = ensure_scratch.GetRegister();
-  if (temp_reg == kNoRegister) {
-    // Use the stack as a temporary.
-    // Save mem1 on the stack.
-    __ pushq(Address(CpuRegister(RSP), mem1));
-
-    // Copy mem2 into mem1.
-    __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem2 + kX86_64WordSize));
-    __ movq(Address(CpuRegister(RSP), mem1 + kX86_64WordSize), CpuRegister(TMP));
+      this, TMP, RAX, codegen_->GetNumberOfCoreRegisters());
 
-    // Now pop mem1 into mem2.
-    __ popq(Address(CpuRegister(RSP), mem2));
-  } else {
-    CpuRegister temp = CpuRegister(temp_reg);
-    __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem1));
-    __ movq(temp, Address(CpuRegister(RSP), mem2));
-    __ movq(Address(CpuRegister(RSP), mem2), CpuRegister(TMP));
-    __ movq(Address(CpuRegister(RSP), mem1), temp);
-  }
+  int stack_offset = ensure_scratch.IsSpilled() ? kX86_64WordSize : 0;
+  __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem1 + stack_offset));
+  __ movq(CpuRegister(ensure_scratch.GetRegister()),
+          Address(CpuRegister(RSP), mem2 + stack_offset));
+  __ movq(Address(CpuRegister(RSP), mem2 + stack_offset), CpuRegister(TMP));
+  __ movq(Address(CpuRegister(RSP), mem1 + stack_offset),
+          CpuRegister(ensure_scratch.GetRegister()));
 }
 
 void ParallelMoveResolverX86_64::Exchange32(XmmRegister reg, int mem) {
@@ -3867,13 +3855,6 @@ void ParallelMoveResolverX86_64::Exchange32(XmmRegister reg, int mem) {
   __ movd(reg, CpuRegister(TMP));
 }
 
-void ParallelMoveResolverX86_64::Exchange64(CpuRegister reg1, CpuRegister reg2) {
-  // Prefer to avoid xchg as it isn't speedy on smaller processors.
-  __ movq(CpuRegister(TMP), reg1);
-  __ movq(reg1, reg2);
-  __ movq(reg2, CpuRegister(TMP));
-}
-
 void ParallelMoveResolverX86_64::Exchange64(XmmRegister reg, int mem) {
   __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem));
   __ movsd(Address(CpuRegister(RSP), mem), reg);
@@ -3886,7 +3867,7 @@ void ParallelMoveResolverX86_64::EmitSwap(size_t index) {
   Location destination = move->GetDestination();
 
   if (source.IsRegister() && destination.IsRegister()) {
-    Exchange64(destination.AsRegister<CpuRegister>(), source.AsRegister<CpuRegister>());
+    __ xchgq(destination.AsRegister<CpuRegister>(), source.AsRegister<CpuRegister>());
   } else if (source.IsRegister() && destination.IsStackSlot()) {
     Exchange32(source.AsRegister<CpuRegister>(), destination.GetStackIndex());
   } else if (source.IsStackSlot() && destination.IsRegister()) {
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index a1bbe4799..aae7de0e4 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -118,7 +118,6 @@ class ParallelMoveResolverX86_64 : public ParallelMoveResolver {
   void Exchange32(CpuRegister reg, int mem);
   void Exchange32(XmmRegister reg, int mem);
   void Exchange32(int mem1, int mem2);
-  void Exchange64(CpuRegister reg1, CpuRegister reg2);
   void Exchange64(CpuRegister reg, int mem);
   void Exchange64(XmmRegister reg, int mem);
   void Exchange64(int mem1, int mem2);
diff --git a/compiler/optimizing/parallel_move_resolver.cc b/compiler/optimizing/parallel_move_resolver.cc
index 493668536..9df8f5640 100644
--- a/compiler/optimizing/parallel_move_resolver.cc
+++ b/compiler/optimizing/parallel_move_resolver.cc
@@ -269,20 +269,6 @@ int ParallelMoveResolver::AllocateScratchRegister(int blocked,
 }
 
 
-int ParallelMoveResolver::AllocateScratchRegister(int blocked,
-                                                  int register_count) {
-  int scratch = -1;
-  for (int reg = 0; reg < register_count; ++reg) {
-    if ((blocked != reg) && IsScratchLocation(Location::RegisterLocation(reg))) {
-      scratch = reg;
-      break;
-    }
-  }
-
-  return scratch;
-}
-
-
 ParallelMoveResolver::ScratchRegisterScope::ScratchRegisterScope(
     ParallelMoveResolver* resolver, int blocked, int if_scratch, int number_of_registers)
     : resolver_(resolver),
@@ -296,16 +282,6 @@ ParallelMoveResolver::ScratchRegisterScope::ScratchRegisterScope(
 }
 
 
-ParallelMoveResolver::ScratchRegisterScope::ScratchRegisterScope(
-    ParallelMoveResolver* resolver, int blocked, int number_of_registers)
-    : resolver_(resolver),
-      reg_(kNoRegister),
-      spilled_(false) {
-  // We don't want to spill a register if none are free.
-  reg_ = resolver_->AllocateScratchRegister(blocked, number_of_registers);
-}
-
-
 ParallelMoveResolver::ScratchRegisterScope::~ScratchRegisterScope() {
   if (spilled_) {
     resolver_->RestoreScratch(reg_);
diff --git a/compiler/optimizing/parallel_move_resolver.h b/compiler/optimizing/parallel_move_resolver.h
index 173cffc71..3fa1b37af 100644
--- a/compiler/optimizing/parallel_move_resolver.h
+++ b/compiler/optimizing/parallel_move_resolver.h
@@ -42,15 +42,10 @@ class ParallelMoveResolver : public ValueObject {
 protected:
  class ScratchRegisterScope : public ValueObject {
   public:
-    // Spill a scratch register if no regs are free.
     ScratchRegisterScope(ParallelMoveResolver* resolver,
                          int blocked,
                          int if_scratch,
                          int number_of_registers);
-    // Grab a scratch register only if available.
-    ScratchRegisterScope(ParallelMoveResolver* resolver,
-                         int blocked,
-                         int number_of_registers);
 
     ~ScratchRegisterScope();
 
     int GetRegister() const { return reg_; }
@@ -67,8 +62,6 @@ class ParallelMoveResolver : public ValueObject {
   // Allocate a scratch register for performing a move. The method will try to use
   // a register that is the destination of a move, but that move has not been emitted yet.
   int AllocateScratchRegister(int blocked, int if_scratch, int register_count, bool* spilled);
-  // As above, but return -1 if no free register.
-  int AllocateScratchRegister(int blocked, int register_count);
 
   // Emit a move.
   virtual void EmitMove(size_t index) = 0;
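
Background for readers of this revert: the reverted commit had replaced the single xchg instruction in EmitSwap with a three-mov sequence through a scratch register, per its comment "Prefer to avoid xchg as it isn't speedy on smaller processors" (register-register xchg decodes into several micro-ops on Atom-class cores). The revert restores the one-instruction xchgl/xchgq form. It also restores the spill-aware ScratchRegisterScope: when no register is free, one is spilled with a push, which moves ESP down one word, so every ESP-relative offset is corrected by kX86WordSize while the scratch register is live (the IsSpilled() checks in the restored code). The snippet below is an illustrative sketch only, not ART code; it expresses the two swap strategies as GCC/Clang extended inline asm, with function names invented for the example.

// Illustrative sketch (not ART code): the two register-swap strategies
// this revert toggles between, for 32-bit values on x86/x86-64.
#include <stdint.h>
#include <stdio.h>

// Strategy restored by the revert: a single xchg instruction.
static void swap_xchg(uint32_t* a, uint32_t* b) {
  __asm__("xchgl %0, %1" : "+r"(*a), "+r"(*b));
}

// Strategy removed by the revert: three movs through a scratch register,
// which the reverted commit preferred on small cores where reg-reg xchg
// is comparatively slow.
static void swap_movs(uint32_t* a, uint32_t* b) {
  uint32_t scratch;
  __asm__("movl %1, %2\n\t"  // scratch = *a
          "movl %0, %1\n\t"  // *a = *b
          "movl %2, %0"      // *b = scratch
          : "+r"(*b), "+r"(*a), "=&r"(scratch));
}

int main(void) {
  uint32_t x = 1, y = 2;
  swap_xchg(&x, &y);  // x == 2, y == 1
  swap_movs(&x, &y);  // back to x == 1, y == 2
  printf("x=%u y=%u\n", x, y);
  return 0;
}

Either sequence is functionally correct; the choice is a micro-op/scheduling trade-off, and the 30% CaffeineLogic regression reported above is what motivated reverting the whole change rather than tuning it.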