From a5c19ce8d200d68a528f2ce0ebff989106c4a933 Mon Sep 17 00:00:00 2001
From: Mark Mendell
Date: Wed, 1 Apr 2015 12:51:05 -0400
Subject: [PATCH] [optimizing] Improve x86 parallel moves/swaps

Add a new constructor to ScratchRegisterScope that will supply a
register if there is a free one, but not spill to force one. Use this
to generate alternate code that doesn't use a temporary, as the
spill/restore of a register generates extra instructions that aren't
necessary on x86.

Here is the benefit for a 32 bit memory-to-memory exchange with no
free registers:
< 50        push eax
< 53        push ebx
< 8B44244C  mov eax, [esp + 76]
< 8B5C246C  mov ebx, [esp + 108]
< 8944246C  mov [esp + 108], eax
< 895C244C  mov [esp + 76], ebx
< 5B        pop ebx
< 58        pop eax
---
> FF742444  push [esp + 68]
> FF742468  push [esp + 104]
> 8F44244C  pop [esp + 72]
> 8F442468  pop [esp + 100]

Avoid using the xchg instruction, as it is slow on smaller processors.

Change-Id: Id29ee3abd998577baaee552d55d23e60ae0c7871
Signed-off-by: Mark Mendell
---
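A note on reading the new sequence: POP with an ESP-relative memory operand
forms its address with the value ESP has after the pop's own increment (the
'after increment' behavior the code comments below refer to). The two slots
being swapped start out at [esp + 68] and [esp + 100]: the second push reads
[esp + 104] because the first push has already moved ESP down by 4, and the
first pop writes through [esp + 72] because ESP has moved back up by 4 by
the time the address is formed. Both pops therefore land exactly on the
slots the pushes read, completing the swap with no temporary register.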
 compiler/optimizing/code_generator_x86.cc     | 178 +++++++++++++++++++-------
 compiler/optimizing/code_generator_x86.h      |   1 +
 compiler/optimizing/code_generator_x86_64.cc  |  37 ++++--
 compiler/optimizing/code_generator_x86_64.h   |   1 +
 compiler/optimizing/parallel_move_resolver.cc |  24 ++++
 compiler/optimizing/parallel_move_resolver.h  |   7 +
 6 files changed, 191 insertions(+), 57 deletions(-)

diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 4d7468350..183453ded 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -3958,23 +3958,43 @@ X86Assembler* ParallelMoveResolverX86::GetAssembler() const {
 }
 
 void ParallelMoveResolverX86::MoveMemoryToMemory32(int dst, int src) {
-  ScratchRegisterScope ensure_scratch(
-      this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters());
-  Register temp_reg = static_cast<Register>(ensure_scratch.GetRegister());
-  int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0;
-  __ movl(temp_reg, Address(ESP, src + stack_offset));
-  __ movl(Address(ESP, dst + stack_offset), temp_reg);
+  ScratchRegisterScope possible_scratch(
+      this, kNoRegister, codegen_->GetNumberOfCoreRegisters());
+  int temp = possible_scratch.GetRegister();
+  if (temp == kNoRegister) {
+    // Use the stack.
+    __ pushl(Address(ESP, src));
+    __ popl(Address(ESP, dst));
+  } else {
+    Register temp_reg = static_cast<Register>(temp);
+    __ movl(temp_reg, Address(ESP, src));
+    __ movl(Address(ESP, dst), temp_reg);
+  }
 }
 
 void ParallelMoveResolverX86::MoveMemoryToMemory64(int dst, int src) {
-  ScratchRegisterScope ensure_scratch(
-      this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters());
-  Register temp_reg = static_cast<Register>(ensure_scratch.GetRegister());
-  int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0;
-  __ movl(temp_reg, Address(ESP, src + stack_offset));
-  __ movl(Address(ESP, dst + stack_offset), temp_reg);
-  __ movl(temp_reg, Address(ESP, src + stack_offset + kX86WordSize));
-  __ movl(Address(ESP, dst + stack_offset + kX86WordSize), temp_reg);
+  ScratchRegisterScope possible_scratch(
+      this, kNoRegister, codegen_->GetNumberOfCoreRegisters());
+  int temp = possible_scratch.GetRegister();
+  if (temp == kNoRegister) {
+    // Use the stack instead.
+    // Push src low word.
+    __ pushl(Address(ESP, src));
+    // Push src high word. Stack offset = 4.
+    __ pushl(Address(ESP, src + 4 /* offset */ + kX86WordSize /* high */));
+
+    // Pop into dst high word. Stack offset = 8.
+    // Pop with ESP address uses the 'after increment' value of ESP.
+    __ popl(Address(ESP, dst + 4 /* offset */ + kX86WordSize /* high */));
+    // Finally dst low word. Stack offset = 4.
+    __ popl(Address(ESP, dst));
+  } else {
+    Register temp_reg = static_cast<Register>(temp);
+    __ movl(temp_reg, Address(ESP, src));
+    __ movl(Address(ESP, dst), temp_reg);
+    __ movl(temp_reg, Address(ESP, src + kX86WordSize));
+    __ movl(Address(ESP, dst + kX86WordSize), temp_reg);
+  }
 }
 
 void ParallelMoveResolverX86::EmitMove(size_t index) {
@@ -4039,10 +4059,18 @@ void ParallelMoveResolverX86::EmitMove(size_t index) {
         __ xorps(dest, dest);
       } else {
         ScratchRegisterScope ensure_scratch(
-            this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters());
-        Register temp = static_cast<Register>(ensure_scratch.GetRegister());
-        __ movl(temp, Immediate(value));
-        __ movd(dest, temp);
+            this, kNoRegister, codegen_->GetNumberOfCoreRegisters());
+        int temp_reg = ensure_scratch.GetRegister();
+        if (temp_reg == kNoRegister) {
+          // Avoid spilling/restoring a scratch register by using the stack.
+          __ pushl(Immediate(value));
+          __ movss(dest, Address(ESP, 0));
+          __ addl(ESP, Immediate(4));
+        } else {
+          Register temp = static_cast<Register>(temp_reg);
+          __ movl(temp, Immediate(value));
+          __ movd(dest, temp);
+        }
       }
     } else {
       DCHECK(destination.IsStackSlot()) << destination;
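A note on the float-constant path above: when no core register is free, the
32-bit constant is materialized through the stack instead of through a
spilled scratch register. push Immediate / movss / addl is three
instructions with no register dependency, versus the four the old code
needed once the unconditional scratch allocation had to spill (push, mov
immediate, movd, pop). movss loads exactly the four bytes just pushed, so
the XMM register receives the same bit pattern movd would have copied from
a core register.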
@@ -4091,42 +4119,96 @@ void ParallelMoveResolverX86::EmitMove(size_t index) {
   }
 }
 
-void ParallelMoveResolverX86::Exchange(Register reg, int mem) {
-  Register suggested_scratch = reg == EAX ? EBX : EAX;
-  ScratchRegisterScope ensure_scratch(
-      this, reg, suggested_scratch, codegen_->GetNumberOfCoreRegisters());
+void ParallelMoveResolverX86::Exchange(Register reg1, Register reg2) {
+  // Prefer to avoid xchg as it isn't speedy on smaller processors.
+  ScratchRegisterScope possible_scratch(
+      this, reg1, codegen_->GetNumberOfCoreRegisters());
+  int temp_reg = possible_scratch.GetRegister();
+  if (temp_reg == kNoRegister || temp_reg == reg2) {
+    __ pushl(reg1);
+    __ movl(reg1, reg2);
+    __ popl(reg2);
+  } else {
+    Register temp = static_cast<Register>(temp_reg);
+    __ movl(temp, reg1);
+    __ movl(reg1, reg2);
+    __ movl(reg2, temp);
+  }
+}
 
-  int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0;
-  __ movl(static_cast<Register>(ensure_scratch.GetRegister()), Address(ESP, mem + stack_offset));
-  __ movl(Address(ESP, mem + stack_offset), reg);
-  __ movl(reg, static_cast<Register>(ensure_scratch.GetRegister()));
+void ParallelMoveResolverX86::Exchange(Register reg, int mem) {
+  ScratchRegisterScope possible_scratch(
+      this, reg, codegen_->GetNumberOfCoreRegisters());
+  int temp_reg = possible_scratch.GetRegister();
+  if (temp_reg == kNoRegister) {
+    __ pushl(Address(ESP, mem));
+    __ movl(Address(ESP, mem + kX86WordSize), reg);
+    __ popl(reg);
+  } else {
+    Register temp = static_cast<Register>(temp_reg);
+    __ movl(temp, Address(ESP, mem));
+    __ movl(Address(ESP, mem), reg);
+    __ movl(reg, temp);
+  }
 }
 
 void ParallelMoveResolverX86::Exchange32(XmmRegister reg, int mem) {
-  ScratchRegisterScope ensure_scratch(
-      this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters());
-
-  Register temp_reg = static_cast<Register>(ensure_scratch.GetRegister());
-  int stack_offset = ensure_scratch.IsSpilled() ? kX86WordSize : 0;
-  __ movl(temp_reg, Address(ESP, mem + stack_offset));
-  __ movss(Address(ESP, mem + stack_offset), reg);
-  __ movd(reg, temp_reg);
+  ScratchRegisterScope possible_scratch(
+      this, kNoRegister, codegen_->GetNumberOfCoreRegisters());
+  int temp_reg = possible_scratch.GetRegister();
+  if (temp_reg == kNoRegister) {
+    __ pushl(Address(ESP, mem));
+    __ movss(Address(ESP, mem + kX86WordSize), reg);
+    __ movss(reg, Address(ESP, 0));
+    __ addl(ESP, Immediate(kX86WordSize));
+  } else {
+    Register temp = static_cast<Register>(temp_reg);
+    __ movl(temp, Address(ESP, mem));
+    __ movss(Address(ESP, mem), reg);
+    __ movd(reg, temp);
+  }
 }
 
 void ParallelMoveResolverX86::Exchange(int mem1, int mem2) {
-  ScratchRegisterScope ensure_scratch1(
-      this, kNoRegister, EAX, codegen_->GetNumberOfCoreRegisters());
-
-  Register suggested_scratch = ensure_scratch1.GetRegister() == EAX ? EBX : EAX;
-  ScratchRegisterScope ensure_scratch2(
-      this, ensure_scratch1.GetRegister(), suggested_scratch, codegen_->GetNumberOfCoreRegisters());
-
-  int stack_offset = ensure_scratch1.IsSpilled() ? kX86WordSize : 0;
-  stack_offset += ensure_scratch2.IsSpilled() ? kX86WordSize : 0;
-  __ movl(static_cast<Register>(ensure_scratch1.GetRegister()), Address(ESP, mem1 + stack_offset));
-  __ movl(static_cast<Register>(ensure_scratch2.GetRegister()), Address(ESP, mem2 + stack_offset));
-  __ movl(Address(ESP, mem2 + stack_offset), static_cast<Register>(ensure_scratch1.GetRegister()));
-  __ movl(Address(ESP, mem1 + stack_offset), static_cast<Register>(ensure_scratch2.GetRegister()));
+  ScratchRegisterScope possible_scratch1(
+      this, kNoRegister, codegen_->GetNumberOfCoreRegisters());
+  int temp_reg1 = possible_scratch1.GetRegister();
+  if (temp_reg1 == kNoRegister) {
+    // No free registers. Use the stack.
+    __ pushl(Address(ESP, mem1));
+    __ pushl(Address(ESP, mem2 + kX86WordSize));
+    // Pop with ESP address uses the 'after increment' value of ESP.
+    __ popl(Address(ESP, mem1 + kX86WordSize));
+    __ popl(Address(ESP, mem2));
+  } else {
+    // Got the first one. Try for a second.
+    ScratchRegisterScope possible_scratch2(
+        this, temp_reg1, codegen_->GetNumberOfCoreRegisters());
+    int temp_reg2 = possible_scratch2.GetRegister();
+    if (temp_reg2 == kNoRegister) {
+      Register temp = static_cast<Register>(temp_reg1);
+      // Bummer. Only have one free register to use.
+      // Save mem1 on the stack.
+      __ pushl(Address(ESP, mem1));
+
+      // Copy mem2 into mem1.
+      __ movl(temp, Address(ESP, mem2 + kX86WordSize));
+      __ movl(Address(ESP, mem1 + kX86WordSize), temp);
+
+      // Now pop mem1 into mem2.
+      // Pop with ESP address uses the 'after increment' value of ESP.
+      __ popl(Address(ESP, mem2));
+    } else {
+      // Great. We have 2 registers to play with.
+      Register temp1 = static_cast<Register>(temp_reg1);
+      Register temp2 = static_cast<Register>(temp_reg2);
+      DCHECK_NE(temp1, temp2);
+      __ movl(temp1, Address(ESP, mem1));
+      __ movl(temp2, Address(ESP, mem2));
+      __ movl(Address(ESP, mem2), temp1);
+      __ movl(Address(ESP, mem1), temp2);
+    }
+  }
 }
 
 void ParallelMoveResolverX86::EmitSwap(size_t index) {
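Two details in the exchange helpers above are worth spelling out. In
Exchange(Register, Register), only reg1 is passed to the scope as blocked,
so the allocator may hand back reg2 itself; using reg2 as the temporary
would clobber one of the values being swapped, which is why the push/mov/pop
fallback also covers the temp_reg == reg2 case. And wherever a pushed word
is live, ESP has moved down one word, so ESP-relative offsets computed
before the push must be biased by kX86WordSize until the matching pop; that
is where the mem + kX86WordSize adjustments in the fallback paths come from.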
@@ -4135,7 +4217,7 @@ void ParallelMoveResolverX86::EmitSwap(size_t index) {
   Location destination = move->GetDestination();
 
   if (source.IsRegister() && destination.IsRegister()) {
-    __ xchgl(destination.AsRegister<Register>(), source.AsRegister<Register>());
+    Exchange(destination.AsRegister<Register>(), source.AsRegister<Register>());
   } else if (source.IsRegister() && destination.IsStackSlot()) {
     Exchange(source.AsRegister<Register>(), destination.GetStackIndex());
   } else if (source.IsStackSlot() && destination.IsRegister()) {
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index e6e7fb7b4..b2420e407 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -106,6 +106,7 @@ class ParallelMoveResolverX86 : public ParallelMoveResolver {
   X86Assembler* GetAssembler() const;
 
  private:
+  void Exchange(Register reg1, Register reg2);
   void Exchange(Register reg, int mem);
   void Exchange(int mem1, int mem2);
   void Exchange32(XmmRegister reg, int mem);
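On the xchg avoidance in both EmitSwap changes: a swap through a temporary
is three movs, and mov is about as cheap as instructions get, while xchg
between registers is reportedly multi-cycle on the small in-order cores this
change targets (xchg with a memory operand would be worse still, since it
carries an implicit lock). The x86-64 backend can always route the three-mov
sequence through its reserved TMP register; the x86 backend has no reserved
scratch, hence the register-or-stack fallback above.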
diff --git a/compiler/optimizing/code_generator_x86_64.cc b/compiler/optimizing/code_generator_x86_64.cc
index 5710ec57d..f714214e8 100644
--- a/compiler/optimizing/code_generator_x86_64.cc
+++ b/compiler/optimizing/code_generator_x86_64.cc
@@ -3838,15 +3838,27 @@ void ParallelMoveResolverX86_64::Exchange64(CpuRegister reg, int mem) {
 
 void ParallelMoveResolverX86_64::Exchange64(int mem1, int mem2) {
   ScratchRegisterScope ensure_scratch(
-      this, TMP, RAX, codegen_->GetNumberOfCoreRegisters());
+      this, TMP, codegen_->GetNumberOfCoreRegisters());
 
-  int stack_offset = ensure_scratch.IsSpilled() ? kX86_64WordSize : 0;
-  __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem1 + stack_offset));
-  __ movq(CpuRegister(ensure_scratch.GetRegister()),
-          Address(CpuRegister(RSP), mem2 + stack_offset));
-  __ movq(Address(CpuRegister(RSP), mem2 + stack_offset), CpuRegister(TMP));
-  __ movq(Address(CpuRegister(RSP), mem1 + stack_offset),
-          CpuRegister(ensure_scratch.GetRegister()));
+  int temp_reg = ensure_scratch.GetRegister();
+  if (temp_reg == kNoRegister) {
+    // Use the stack as a temporary.
+    // Save mem1 on the stack.
+    __ pushq(Address(CpuRegister(RSP), mem1));
+
+    // Copy mem2 into mem1.
+    __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem2 + kX86_64WordSize));
+    __ movq(Address(CpuRegister(RSP), mem1 + kX86_64WordSize), CpuRegister(TMP));
+
+    // Now pop mem1 into mem2.
+    __ popq(Address(CpuRegister(RSP), mem2));
+  } else {
+    CpuRegister temp = CpuRegister(temp_reg);
+    __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem1));
+    __ movq(temp, Address(CpuRegister(RSP), mem2));
+    __ movq(Address(CpuRegister(RSP), mem2), CpuRegister(TMP));
+    __ movq(Address(CpuRegister(RSP), mem1), temp);
+  }
 }
 
 void ParallelMoveResolverX86_64::Exchange32(XmmRegister reg, int mem) {
@@ -3855,6 +3867,13 @@ void ParallelMoveResolverX86_64::Exchange32(XmmRegister reg, int mem) {
   __ movd(reg, CpuRegister(TMP));
 }
 
+void ParallelMoveResolverX86_64::Exchange64(CpuRegister reg1, CpuRegister reg2) {
+  // Prefer to avoid xchg as it isn't speedy on smaller processors.
+  __ movq(CpuRegister(TMP), reg1);
+  __ movq(reg1, reg2);
+  __ movq(reg2, CpuRegister(TMP));
+}
+
 void ParallelMoveResolverX86_64::Exchange64(XmmRegister reg, int mem) {
   __ movq(CpuRegister(TMP), Address(CpuRegister(RSP), mem));
   __ movsd(Address(CpuRegister(RSP), mem), reg);
@@ -3867,7 +3886,7 @@ void ParallelMoveResolverX86_64::EmitSwap(size_t index) {
   Location destination = move->GetDestination();
 
   if (source.IsRegister() && destination.IsRegister()) {
-    __ xchgq(destination.AsRegister<CpuRegister>(), source.AsRegister<CpuRegister>());
+    Exchange64(destination.AsRegister<CpuRegister>(), source.AsRegister<CpuRegister>());
   } else if (source.IsRegister() && destination.IsStackSlot()) {
     Exchange32(source.AsRegister<CpuRegister>(), destination.GetStackIndex());
   } else if (source.IsStackSlot() && destination.IsRegister()) {
diff --git a/compiler/optimizing/code_generator_x86_64.h b/compiler/optimizing/code_generator_x86_64.h
index aae7de0e4..a1bbe4799 100644
--- a/compiler/optimizing/code_generator_x86_64.h
+++ b/compiler/optimizing/code_generator_x86_64.h
@@ -118,6 +118,7 @@ class ParallelMoveResolverX86_64 : public ParallelMoveResolver {
   void Exchange32(CpuRegister reg, int mem);
   void Exchange32(XmmRegister reg, int mem);
   void Exchange32(int mem1, int mem2);
+  void Exchange64(CpuRegister reg1, CpuRegister reg2);
   void Exchange64(CpuRegister reg, int mem);
   void Exchange64(XmmRegister reg, int mem);
   void Exchange64(int mem1, int mem2);
diff --git a/compiler/optimizing/parallel_move_resolver.cc b/compiler/optimizing/parallel_move_resolver.cc
index 9df8f5640..493668536 100644
--- a/compiler/optimizing/parallel_move_resolver.cc
+++ b/compiler/optimizing/parallel_move_resolver.cc
@@ -269,6 +269,20 @@ int ParallelMoveResolver::AllocateScratchRegister(int blocked,
 }
 
 
+int ParallelMoveResolver::AllocateScratchRegister(int blocked,
+                                                  int register_count) {
+  int scratch = -1;
+  for (int reg = 0; reg < register_count; ++reg) {
+    if ((blocked != reg) && IsScratchLocation(Location::RegisterLocation(reg))) {
+      scratch = reg;
+      break;
+    }
+  }
+
+  return scratch;
+}
+
+
 ParallelMoveResolver::ScratchRegisterScope::ScratchRegisterScope(
     ParallelMoveResolver* resolver, int blocked, int if_scratch, int number_of_registers)
     : resolver_(resolver),
@@ -282,6 +296,16 @@ ParallelMoveResolver::ScratchRegisterScope::ScratchRegisterScope(
 }
 
 
+ParallelMoveResolver::ScratchRegisterScope::ScratchRegisterScope(
+    ParallelMoveResolver* resolver, int blocked, int number_of_registers)
+    : resolver_(resolver),
+      reg_(kNoRegister),
+      spilled_(false) {
+  // We don't want to spill a register if none are free.
+  reg_ = resolver_->AllocateScratchRegister(blocked, number_of_registers);
+}
+
+
 ParallelMoveResolver::ScratchRegisterScope::~ScratchRegisterScope() {
   if (spilled_) {
     resolver_->RestoreScratch(reg_);
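Note that the new AllocateScratchRegister overload returns -1 when nothing
is free, and kNoRegister is also -1 on the x86 targets, so the callers'
comparisons against kNoRegister match the allocator's convention. The new
ScratchRegisterScope constructor initializes spilled_ to false and never
changes it, so for scopes built this way the destructor's restore path is
dead and no spill/restore code is ever emitted.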
diff --git a/compiler/optimizing/parallel_move_resolver.h b/compiler/optimizing/parallel_move_resolver.h
index 3fa1b37af..173cffc71 100644
--- a/compiler/optimizing/parallel_move_resolver.h
+++ b/compiler/optimizing/parallel_move_resolver.h
@@ -42,10 +42,15 @@ class ParallelMoveResolver : public ValueObject {
  protected:
  class ScratchRegisterScope : public ValueObject {
    public:
+    // Spill a scratch register if no regs are free.
     ScratchRegisterScope(ParallelMoveResolver* resolver,
                          int blocked,
                          int if_scratch,
                          int number_of_registers);
+    // Grab a scratch register only if available.
+    ScratchRegisterScope(ParallelMoveResolver* resolver,
+                         int blocked,
+                         int number_of_registers);
     ~ScratchRegisterScope();
 
     int GetRegister() const { return reg_; }
@@ -62,6 +67,8 @@ class ParallelMoveResolver : public ValueObject {
   // Allocate a scratch register for performing a move. The method will try to use
   // a register that is the destination of a move, but that move has not been emitted yet.
   int AllocateScratchRegister(int blocked, int if_scratch, int register_count, bool* spilled);
+  // As above, but return -1 if no free register.
+  int AllocateScratchRegister(int blocked, int register_count);
 
   // Emit a move.
   virtual void EmitMove(size_t index) = 0;
-- 
2.11.0
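To make the control flow of the new strategy ladder easy to try outside the
compiler, here is a minimal self-contained sketch of the three cases in
Exchange(int, int) (two free registers, one, none). This is illustrative
code only, not ART code: ScratchPool, EmitSwapMemMem, and printing the
would-be instructions are hypothetical stand-ins for the resolver, the new
ScratchRegisterScope overload, and the assembler.

#include <cstdio>

namespace {

constexpr int kNumRegs = 8;      // Stand-in for GetNumberOfCoreRegisters().
constexpr int kNoRegister = -1;  // Same -1 convention as the patch.
constexpr int kWordSize = 4;     // Stand-in for kX86WordSize.

struct ScratchPool {
  bool in_use[kNumRegs] = {};

  // Mirrors the new overload: first free register other than 'blocked',
  // or kNoRegister; never spills.
  int TryAllocate(int blocked) {
    for (int reg = 0; reg < kNumRegs; ++reg) {
      if (reg != blocked && !in_use[reg]) {
        in_use[reg] = true;
        return reg;
      }
    }
    return kNoRegister;
  }
  void Release(int reg) {
    if (reg != kNoRegister) in_use[reg] = false;
  }
};

// Swap the 32-bit stack slots at [esp + mem1] and [esp + mem2], choosing
// the same strategy ladder as ParallelMoveResolverX86::Exchange(int, int).
void EmitSwapMemMem(ScratchPool* pool, int mem1, int mem2) {
  int t1 = pool->TryAllocate(kNoRegister);
  if (t1 == kNoRegister) {
    // No free registers: the pure push/pop swap from the commit message.
    std::printf("push [esp + %d]\n", mem1);
    std::printf("push [esp + %d]\n", mem2 + kWordSize);  // ESP is down one word.
    std::printf("pop  [esp + %d]\n", mem1 + kWordSize);  // POP uses post-increment ESP.
    std::printf("pop  [esp + %d]\n", mem2);
    return;
  }
  int t2 = pool->TryAllocate(t1);
  if (t2 == kNoRegister) {
    // One free register: stage mem1 on the stack while copying mem2 over.
    std::printf("push [esp + %d]\n", mem1);
    std::printf("mov  r%d, [esp + %d]\n", t1, mem2 + kWordSize);
    std::printf("mov  [esp + %d], r%d\n", mem1 + kWordSize, t1);
    std::printf("pop  [esp + %d]\n", mem2);
  } else {
    // Two free registers: plain load/load/store/store.
    std::printf("mov  r%d, [esp + %d]\n", t1, mem1);
    std::printf("mov  r%d, [esp + %d]\n", t2, mem2);
    std::printf("mov  [esp + %d], r%d\n", mem2, t1);
    std::printf("mov  [esp + %d], r%d\n", mem1, t2);
    pool->Release(t2);
  }
  pool->Release(t1);
}

}  // namespace

int main() {
  ScratchPool pool;
  EmitSwapMemMem(&pool, 68, 100);              // Two registers free.
  for (bool& used : pool.in_use) used = true;  // Exhaust the pool.
  std::printf("--\n");
  EmitSwapMemMem(&pool, 68, 100);              // Forced onto the push/pop path.
  return 0;
}

Running it prints the two-register sequence first and then, with the pool
exhausted, the four-instruction push/pop form shown in the commit message.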