From dc00454f0b9a134f01f79b419200f4044c2af5c6 Mon Sep 17 00:00:00 2001
From: Mark Mendell
Date: Fri, 30 Oct 2015 09:45:03 -0400
Subject: [PATCH] X86: Use the constant area for more operations.

Allow FP HNeg to use the constant area to hold the constant to flip the
sign bit.

Enhance some math intrinsics to allow the use of the constant area:
Abs{Float,Double}, {Min,Max}{FloatFloat,DoubleDouble}.

Allow compares of floats/doubles to constants using the constant area.

These eliminate almost all uses of loading constants from the stack.

Change-Id: Ic4b831565825cbe9f0801b1b53c1013be7c87ae4
Signed-off-by: Mark Mendell
---
 compiler/optimizing/code_generator_x86.cc     |  77 +++++++++++++++--
 compiler/optimizing/code_generator_x86.h      |   2 +
 compiler/optimizing/intrinsics_x86.cc         | 114 ++++++++++++++++++--------
 compiler/optimizing/nodes.h                   |   1 +
 compiler/optimizing/nodes_x86.h               |  19 +++++
 compiler/optimizing/pc_relative_fixups_x86.cc |  66 ++++++++++++++-
 6 files changed, 235 insertions(+), 44 deletions(-)

diff --git a/compiler/optimizing/code_generator_x86.cc b/compiler/optimizing/code_generator_x86.cc
index 50c4ba23c..94bdd5542 100644
--- a/compiler/optimizing/code_generator_x86.cc
+++ b/compiler/optimizing/code_generator_x86.cc
@@ -1386,6 +1386,41 @@ void InstructionCodeGeneratorX86::GenerateLongComparesAndJumps(HCondition* cond,
   __ j(final_condition, true_label);
 }
 
+void InstructionCodeGeneratorX86::GenerateFPCompare(Location lhs,
+                                                    Location rhs,
+                                                    HInstruction* insn,
+                                                    bool is_double) {
+  HX86LoadFromConstantTable* const_area = insn->InputAt(1)->AsX86LoadFromConstantTable();
+  if (is_double) {
+    if (rhs.IsFpuRegister()) {
+      __ ucomisd(lhs.AsFpuRegister<XmmRegister>(), rhs.AsFpuRegister<XmmRegister>());
+    } else if (const_area != nullptr) {
+      DCHECK(!const_area->NeedsMaterialization());
+      __ ucomisd(lhs.AsFpuRegister<XmmRegister>(),
+                 codegen_->LiteralDoubleAddress(
+                   const_area->GetConstant()->AsDoubleConstant()->GetValue(),
+                   const_area->GetLocations()->InAt(0).AsRegister<Register>()));
+    } else {
+      DCHECK(rhs.IsDoubleStackSlot());
+      __ ucomisd(lhs.AsFpuRegister<XmmRegister>(), Address(ESP, rhs.GetStackIndex()));
+    }
+    return;
+  }
+
+  if (rhs.IsFpuRegister()) {
+    __ ucomiss(lhs.AsFpuRegister<XmmRegister>(), rhs.AsFpuRegister<XmmRegister>());
+  } else if (const_area != nullptr) {
+    DCHECK(!const_area->NeedsMaterialization());
+    __ ucomiss(lhs.AsFpuRegister<XmmRegister>(),
+               codegen_->LiteralFloatAddress(
+                 const_area->GetConstant()->AsFloatConstant()->GetValue(),
+                 const_area->GetLocations()->InAt(0).AsRegister<Register>()));
+  } else {
+    DCHECK(rhs.IsStackSlot());
+    __ ucomiss(lhs.AsFpuRegister<XmmRegister>(), Address(ESP, rhs.GetStackIndex()));
+  }
+}
+
 template<class LabelType>
 void InstructionCodeGeneratorX86::GenerateCompareTestAndBranch(HCondition* condition,
                                                                LabelType* true_target_in,
@@ -1406,11 +1441,11 @@ void InstructionCodeGeneratorX86::GenerateCompareTestAndBranch(HCondition* condi
       GenerateLongComparesAndJumps(condition, true_target, false_target);
       break;
     case Primitive::kPrimFloat:
-      __ ucomiss(left.AsFpuRegister<XmmRegister>(), right.AsFpuRegister<XmmRegister>());
+      GenerateFPCompare(left, right, condition, false);
       GenerateFPJumps(condition, true_target, false_target);
       break;
     case Primitive::kPrimDouble:
-      __ ucomisd(left.AsFpuRegister<XmmRegister>(), right.AsFpuRegister<XmmRegister>());
+      GenerateFPCompare(left, right, condition, true);
       GenerateFPJumps(condition, true_target, false_target);
       break;
     default:
@@ -1636,7 +1671,7 @@ void LocationsBuilderX86::HandleCondition(HCondition* cond) {
     case Primitive::kPrimFloat:
     case Primitive::kPrimDouble: {
       locations->SetInAt(0, Location::RequiresFpuRegister());
-      locations->SetInAt(1, Location::RequiresFpuRegister());
+      locations->SetInAt(1, Location::Any());
       if (cond->NeedsMaterialization()) {
         locations->SetOut(Location::RequiresRegister());
       }
@@ -1690,11 +1725,11 @@ void InstructionCodeGeneratorX86::HandleCondition(HCondition* cond) {
       GenerateLongComparesAndJumps(cond, &true_label, &false_label);
       break;
     case Primitive::kPrimFloat:
-      __ ucomiss(lhs.AsFpuRegister<XmmRegister>(), rhs.AsFpuRegister<XmmRegister>());
+      GenerateFPCompare(lhs, rhs, cond, false);
       GenerateFPJumps(cond, &true_label, &false_label);
       break;
     case Primitive::kPrimDouble:
-      __ ucomisd(lhs.AsFpuRegister<XmmRegister>(), rhs.AsFpuRegister<XmmRegister>());
+      GenerateFPCompare(lhs, rhs, cond, true);
       GenerateFPJumps(cond, &true_label, &false_label);
       break;
   }
@@ -2130,6 +2165,32 @@ void InstructionCodeGeneratorX86::VisitNeg(HNeg* neg) {
   }
 }
 
+void LocationsBuilderX86::VisitX86FPNeg(HX86FPNeg* neg) {
+  LocationSummary* locations =
+      new (GetGraph()->GetArena()) LocationSummary(neg, LocationSummary::kNoCall);
+  DCHECK(Primitive::IsFloatingPointType(neg->GetType()));
+  locations->SetInAt(0, Location::RequiresFpuRegister());
+  locations->SetInAt(1, Location::RequiresRegister());
+  locations->SetOut(Location::SameAsFirstInput());
+  locations->AddTemp(Location::RequiresFpuRegister());
+}
+
+void InstructionCodeGeneratorX86::VisitX86FPNeg(HX86FPNeg* neg) {
+  LocationSummary* locations = neg->GetLocations();
+  Location out = locations->Out();
+  DCHECK(locations->InAt(0).Equals(out));
+
+  Register constant_area = locations->InAt(1).AsRegister<Register>();
+  XmmRegister mask = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+  if (neg->GetType() == Primitive::kPrimFloat) {
+    __ movss(mask, codegen_->LiteralInt32Address(INT32_C(0x80000000), constant_area));
+    __ xorps(out.AsFpuRegister<XmmRegister>(), mask);
+  } else {
+    __ movsd(mask, codegen_->LiteralInt64Address(INT64_C(0x8000000000000000), constant_area));
+    __ xorpd(out.AsFpuRegister<XmmRegister>(), mask);
+  }
+}
+
 void LocationsBuilderX86::VisitTypeConversion(HTypeConversion* conversion) {
   Primitive::Type result_type = conversion->GetResultType();
   Primitive::Type input_type = conversion->GetInputType();
@@ -4012,7 +4073,7 @@ void LocationsBuilderX86::VisitCompare(HCompare* compare) {
     case Primitive::kPrimFloat:
     case Primitive::kPrimDouble: {
       locations->SetInAt(0, Location::RequiresFpuRegister());
-      locations->SetInAt(1, Location::RequiresFpuRegister());
+      locations->SetInAt(1, Location::Any());
       locations->SetOut(Location::RequiresRegister());
       break;
     }
@@ -4073,12 +4134,12 @@ void
InstructionCodeGeneratorX86::VisitCompare(HCompare* compare) {
       break;
     }
     case Primitive::kPrimFloat: {
-      __ ucomiss(left.AsFpuRegister<XmmRegister>(), right.AsFpuRegister<XmmRegister>());
+      GenerateFPCompare(left, right, compare, false);
       __ j(kUnordered, compare->IsGtBias() ? &greater : &less);
       break;
     }
     case Primitive::kPrimDouble: {
-      __ ucomisd(left.AsFpuRegister<XmmRegister>(), right.AsFpuRegister<XmmRegister>());
+      GenerateFPCompare(left, right, compare, true);
       __ j(kUnordered, compare->IsGtBias() ? &greater : &less);
       break;
     }
diff --git a/compiler/optimizing/code_generator_x86.h b/compiler/optimizing/code_generator_x86.h
index 0aef47856..d51b96f85 100644
--- a/compiler/optimizing/code_generator_x86.h
+++ b/compiler/optimizing/code_generator_x86.h
@@ -296,6 +296,8 @@ class InstructionCodeGeneratorX86 : public InstructionCodeGenerator {
                                    HBasicBlock* switch_block,
                                    HBasicBlock* default_block);
 
+  void GenerateFPCompare(Location lhs, Location rhs, HInstruction* insn, bool is_double);
+
   X86Assembler* const assembler_;
   CodeGeneratorX86* const codegen_;
 
diff --git a/compiler/optimizing/intrinsics_x86.cc b/compiler/optimizing/intrinsics_x86.cc
index 3f5688b0d..aad937881 100644
--- a/compiler/optimizing/intrinsics_x86.cc
+++ b/compiler/optimizing/intrinsics_x86.cc
@@ -37,10 +37,12 @@ namespace x86 {
 
 static constexpr int kDoubleNaNHigh = 0x7FF80000;
 static constexpr int kDoubleNaNLow = 0x00000000;
-static constexpr int kFloatNaN = 0x7FC00000;
+static constexpr int64_t kDoubleNaN = INT64_C(0x7FF8000000000000);
+static constexpr int32_t kFloatNaN = INT32_C(0x7FC00000);
 
 IntrinsicLocationsBuilderX86::IntrinsicLocationsBuilderX86(CodeGeneratorX86* codegen)
-  : arena_(codegen->GetGraph()->GetArena()), codegen_(codegen) {
+  : arena_(codegen->GetGraph()->GetArena()),
+    codegen_(codegen) {
 }
 
 
@@ -256,15 +258,36 @@ static void CreateFloatToFloat(ArenaAllocator* arena, HInvoke* invoke) {
                                                            LocationSummary::kNoCall,
                                                            kIntrinsified);
   locations->SetInAt(0, Location::RequiresFpuRegister());
-  // TODO: Allow x86 to work with memory.
This requires assembler support, see below.
-  // locations->SetInAt(0, Location::Any());  // X86 can work on memory directly.
   locations->SetOut(Location::SameAsFirstInput());
+  HInvokeStaticOrDirect* static_or_direct = invoke->AsInvokeStaticOrDirect();
+  DCHECK(static_or_direct != nullptr);
+  if (invoke->InputAt(static_or_direct->GetSpecialInputIndex())->IsX86ComputeBaseMethodAddress()) {
+    // We need addressability for the constant area.
+    locations->SetInAt(1, Location::RequiresRegister());
+    // We need a temporary to hold the constant.
+    locations->AddTemp(Location::RequiresFpuRegister());
+  }
 }
 
-static void MathAbsFP(LocationSummary* locations, bool is64bit, X86Assembler* assembler) {
+static void MathAbsFP(LocationSummary* locations,
+                      bool is64bit,
+                      X86Assembler* assembler,
+                      CodeGeneratorX86* codegen) {
   Location output = locations->Out();
 
-  if (output.IsFpuRegister()) {
+  DCHECK(output.IsFpuRegister());
+  if (locations->InAt(1).IsRegister()) {
+    // We also have a constant area pointer.
+    Register constant_area = locations->InAt(1).AsRegister<Register>();
+    XmmRegister temp = locations->GetTemp(0).AsFpuRegister<XmmRegister>();
+    if (is64bit) {
+      __ movsd(temp, codegen->LiteralInt64Address(INT64_C(0x7FFFFFFFFFFFFFFF), constant_area));
+      __ andpd(output.AsFpuRegister<XmmRegister>(), temp);
+    } else {
+      __ movss(temp, codegen->LiteralInt32Address(INT32_C(0x7FFFFFFF), constant_area));
+      __ andps(output.AsFpuRegister<XmmRegister>(), temp);
+    }
+  } else {
     // Create the right constant on an aligned stack.
     if (is64bit) {
       __ subl(ESP, Immediate(8));
@@ -277,19 +300,6 @@ static void MathAbsFP(LocationSummary* locations, bool is64bit, X86Assembler* as
       __ andps(output.AsFpuRegister<XmmRegister>(), Address(ESP, 0));
     }
     __ addl(ESP, Immediate(16));
-  } else {
-    // TODO: update when assember support is available.
-    UNIMPLEMENTED(FATAL) << "Needs assembler support.";
-//  Once assembler support is available, in-memory operations look like this:
-//    if (is64bit) {
-//      DCHECK(output.IsDoubleStackSlot());
-//      __ andl(Address(Register(RSP), output.GetHighStackIndex(kX86WordSize)),
-//              Immediate(0x7FFFFFFF));
-//    } else {
-//      DCHECK(output.IsStackSlot());
-//      // Can use and with a literal directly.
-//      __ andl(Address(Register(RSP), output.GetStackIndex()), Immediate(0x7FFFFFFF));
-//    }
   }
 }
 
@@ -298,7 +308,7 @@ void IntrinsicLocationsBuilderX86::VisitMathAbsDouble(HInvoke* invoke) {
 }
 
 void IntrinsicCodeGeneratorX86::VisitMathAbsDouble(HInvoke* invoke) {
-  MathAbsFP(invoke->GetLocations(), /* is64bit */ true, GetAssembler());
+  MathAbsFP(invoke->GetLocations(), /* is64bit */ true, GetAssembler(), codegen_);
 }
 
 void IntrinsicLocationsBuilderX86::VisitMathAbsFloat(HInvoke* invoke) {
@@ -306,7 +316,7 @@ void IntrinsicLocationsBuilderX86::VisitMathAbsFloat(HInvoke* invoke) {
 }
 
 void IntrinsicCodeGeneratorX86::VisitMathAbsFloat(HInvoke* invoke) {
-  MathAbsFP(invoke->GetLocations(), /* is64bit */ false, GetAssembler());
+  MathAbsFP(invoke->GetLocations(), /* is64bit */ false, GetAssembler(), codegen_);
 }
 
 static void CreateAbsIntLocation(ArenaAllocator* arena, HInvoke* invoke) {
@@ -388,8 +398,11 @@ void IntrinsicCodeGeneratorX86::VisitMathAbsLong(HInvoke* invoke) {
   GenAbsLong(invoke->GetLocations(), GetAssembler());
 }
 
-static void GenMinMaxFP(LocationSummary* locations, bool is_min, bool is_double,
-                        X86Assembler* assembler) {
+static void GenMinMaxFP(LocationSummary* locations,
+                        bool is_min,
+                        bool is_double,
+                        X86Assembler* assembler,
+                        CodeGeneratorX86* codegen) {
   Location op1_loc = locations->InAt(0);
   Location op2_loc = locations->InAt(1);
   Location out_loc = locations->Out();
@@ -450,15 +463,25 @@ static void GenMinMaxFP(LocationSummary* locations, bool is_min, bool is_double,
 
   // NaN handling.
   __ Bind(&nan);
-  if (is_double) {
-    __ pushl(Immediate(kDoubleNaNHigh));
-    __ pushl(Immediate(kDoubleNaNLow));
-    __ movsd(out, Address(ESP, 0));
-    __ addl(ESP, Immediate(8));
+  // Do we have a constant area pointer?
+  if (locations->InAt(2).IsRegister()) {
+    Register constant_area = locations->InAt(2).AsRegister<Register>();
+    if (is_double) {
+      __ movsd(out, codegen->LiteralInt64Address(kDoubleNaN, constant_area));
+    } else {
+      __ movss(out, codegen->LiteralInt32Address(kFloatNaN, constant_area));
+    }
   } else {
-    __ pushl(Immediate(kFloatNaN));
-    __ movss(out, Address(ESP, 0));
-    __ addl(ESP, Immediate(4));
+    if (is_double) {
+      __ pushl(Immediate(kDoubleNaNHigh));
+      __ pushl(Immediate(kDoubleNaNLow));
+      __ movsd(out, Address(ESP, 0));
+      __ addl(ESP, Immediate(8));
+    } else {
+      __ pushl(Immediate(kFloatNaN));
+      __ movss(out, Address(ESP, 0));
+      __ addl(ESP, Immediate(4));
+    }
   }
   __ jmp(&done);
 
@@ -483,6 +506,11 @@ static void CreateFPFPToFPLocations(ArenaAllocator* arena, HInvoke* invoke) {
   // The following is sub-optimal, but all we can do for now. It would be fine to also accept
   // the second input to be the output (we can simply swap inputs).
   locations->SetOut(Location::SameAsFirstInput());
+  HInvokeStaticOrDirect* static_or_direct = invoke->AsInvokeStaticOrDirect();
+  DCHECK(static_or_direct != nullptr);
+  if (invoke->InputAt(static_or_direct->GetSpecialInputIndex())->IsX86ComputeBaseMethodAddress()) {
+    locations->SetInAt(2, Location::RequiresRegister());
+  }
 }
 
 void IntrinsicLocationsBuilderX86::VisitMathMinDoubleDouble(HInvoke* invoke) {
@@ -490,7 +518,11 @@ void IntrinsicLocationsBuilderX86::VisitMathMinDoubleDouble(HInvoke* invoke) {
 }
 
 void IntrinsicCodeGeneratorX86::VisitMathMinDoubleDouble(HInvoke* invoke) {
-  GenMinMaxFP(invoke->GetLocations(), /* is_min */ true, /* is_double */ true, GetAssembler());
+  GenMinMaxFP(invoke->GetLocations(),
+              /* is_min */ true,
+              /* is_double */ true,
+              GetAssembler(),
+              codegen_);
 }
 
 void IntrinsicLocationsBuilderX86::VisitMathMinFloatFloat(HInvoke* invoke) {
@@ -498,7 +530,11 @@ void IntrinsicLocationsBuilderX86::VisitMathMinFloatFloat(HInvoke* invoke) {
 }
 
 void IntrinsicCodeGeneratorX86::VisitMathMinFloatFloat(HInvoke* invoke) {
-  GenMinMaxFP(invoke->GetLocations(), /* is_min */ true, /* is_double */ false, GetAssembler());
+  GenMinMaxFP(invoke->GetLocations(),
+              /* is_min */ true,
+              /* is_double */ false,
+              GetAssembler(),
+              codegen_);
 }
 
 void IntrinsicLocationsBuilderX86::VisitMathMaxDoubleDouble(HInvoke* invoke) {
@@ -506,7 +542,11 @@ void IntrinsicLocationsBuilderX86::VisitMathMaxDoubleDouble(HInvoke* invoke) {
 }
 
 void IntrinsicCodeGeneratorX86::VisitMathMaxDoubleDouble(HInvoke* invoke) {
-  GenMinMaxFP(invoke->GetLocations(), /* is_min */ false, /* is_double */ true, GetAssembler());
+  GenMinMaxFP(invoke->GetLocations(),
+              /* is_min */ false,
+              /* is_double */ true,
+              GetAssembler(),
+              codegen_);
 }
 
 void IntrinsicLocationsBuilderX86::VisitMathMaxFloatFloat(HInvoke* invoke) {
@@ -514,7 +554,11 @@ void IntrinsicLocationsBuilderX86::VisitMathMaxFloatFloat(HInvoke* invoke) {
 }
 
 void IntrinsicCodeGeneratorX86::VisitMathMaxFloatFloat(HInvoke* invoke) {
-
  GenMinMaxFP(invoke->GetLocations(), /* is_min */ false, /* is_double */ false, GetAssembler());
+  GenMinMaxFP(invoke->GetLocations(),
+              /* is_min */ false,
+              /* is_double */ false,
+              GetAssembler(),
+              codegen_);
 }
 
 static void GenMinMax(LocationSummary* locations, bool is_min, bool is_long,
diff --git a/compiler/optimizing/nodes.h b/compiler/optimizing/nodes.h
index 5246fd1f0..2e37f04cc 100644
--- a/compiler/optimizing/nodes.h
+++ b/compiler/optimizing/nodes.h
@@ -1256,6 +1256,7 @@ class HLoopInformationOutwardIterator : public ValueObject {
 #define FOR_EACH_CONCRETE_INSTRUCTION_X86(M)                            \
   M(X86ComputeBaseMethodAddress, Instruction)                           \
   M(X86LoadFromConstantTable, Instruction)                              \
+  M(X86FPNeg, Instruction)                                              \
   M(X86PackedSwitch, Instruction)
 #endif
 
diff --git a/compiler/optimizing/nodes_x86.h b/compiler/optimizing/nodes_x86.h
index 556217bf7..20372467a 100644
--- a/compiler/optimizing/nodes_x86.h
+++ b/compiler/optimizing/nodes_x86.h
@@ -62,6 +62,25 @@ class HX86LoadFromConstantTable : public HExpression<2> {
   DISALLOW_COPY_AND_ASSIGN(HX86LoadFromConstantTable);
 };
 
+// Version of HNeg with access to the constant table for FP types.
+class HX86FPNeg : public HExpression<2> {
+ public:
+  HX86FPNeg(Primitive::Type result_type,
+            HInstruction* input,
+            HX86ComputeBaseMethodAddress* method_base,
+            uint32_t dex_pc)
+      : HExpression(result_type, SideEffects::None(), dex_pc) {
+    DCHECK(Primitive::IsFloatingPointType(result_type));
+    SetRawInputAt(0, input);
+    SetRawInputAt(1, method_base);
+  }
+
+  DECLARE_INSTRUCTION(X86FPNeg);
+
+ private:
+  DISALLOW_COPY_AND_ASSIGN(HX86FPNeg);
+};
+
 // X86 version of HPackedSwitch that holds a pointer to the base method address.
 class HX86PackedSwitch : public HTemplateInstruction<2> {
  public:
diff --git a/compiler/optimizing/pc_relative_fixups_x86.cc b/compiler/optimizing/pc_relative_fixups_x86.cc
index 1394dfaf5..15d2024fa 100644
--- a/compiler/optimizing/pc_relative_fixups_x86.cc
+++ b/compiler/optimizing/pc_relative_fixups_x86.cc
@@ -53,6 +53,10 @@ class PCRelativeHandlerVisitor : public HGraphVisitor {
     BinaryFP(div);
   }
 
+  void VisitCompare(HCompare* compare) OVERRIDE {
+    BinaryFP(compare);
+  }
+
   void VisitReturn(HReturn* ret) OVERRIDE {
     HConstant* value = ret->InputAt(0)->AsConstant();
     if ((value != nullptr && Primitive::IsFloatingPointType(value->GetType()))) {
@@ -74,11 +78,50 @@ class PCRelativeHandlerVisitor : public HGraphVisitor {
 
   void BinaryFP(HBinaryOperation* bin) {
     HConstant* rhs = bin->InputAt(1)->AsConstant();
-    if (rhs != nullptr && Primitive::IsFloatingPointType(bin->GetResultType())) {
+    if (rhs != nullptr && Primitive::IsFloatingPointType(rhs->GetType())) {
       ReplaceInput(bin, rhs, 1, false);
     }
   }
 
+  void VisitEqual(HEqual* cond) OVERRIDE {
+    BinaryFP(cond);
+  }
+
+  void VisitNotEqual(HNotEqual* cond) OVERRIDE {
+    BinaryFP(cond);
+  }
+
+  void VisitLessThan(HLessThan* cond) OVERRIDE {
+    BinaryFP(cond);
+  }
+
+  void VisitLessThanOrEqual(HLessThanOrEqual* cond) OVERRIDE {
+    BinaryFP(cond);
+  }
+
+  void VisitGreaterThan(HGreaterThan* cond) OVERRIDE {
+    BinaryFP(cond);
+  }
+
+  void VisitGreaterThanOrEqual(HGreaterThanOrEqual* cond) OVERRIDE {
+    BinaryFP(cond);
+  }
+
+  void VisitNeg(HNeg* neg) OVERRIDE {
+    if (Primitive::IsFloatingPointType(neg->GetType())) {
+      // We need to replace the HNeg with a HX86FPNeg in order to address the constant area.
+      InitializePCRelativeBasePointer();
+      HGraph* graph = GetGraph();
+      HBasicBlock* block = neg->GetBlock();
+      HX86FPNeg* x86_fp_neg = new (graph->GetArena()) HX86FPNeg(
+          neg->GetType(),
+          neg->InputAt(0),
+          base_,
+          neg->GetDexPc());
+      block->ReplaceAndRemoveInstructionWith(neg, x86_fp_neg);
+    }
+  }
+
   void VisitPackedSwitch(HPackedSwitch* switch_insn) OVERRIDE {
     if (switch_insn->GetNumEntries() <=
         InstructionCodeGeneratorX86::kPackedSwitchJumpTableThreshold) {
@@ -124,11 +167,13 @@ class PCRelativeHandlerVisitor : public HGraphVisitor {
     // If this is an invoke-static/-direct with PC-relative dex cache array
     // addressing, we need the PC-relative address base.
     HInvokeStaticOrDirect* invoke_static_or_direct = invoke->AsInvokeStaticOrDirect();
+    bool base_added = false;
     if (invoke_static_or_direct != nullptr && invoke_static_or_direct->HasPcRelativeDexCache()) {
       InitializePCRelativeBasePointer();
       // Add the extra parameter base_.
       DCHECK(!invoke_static_or_direct->HasCurrentMethodInput());
       invoke_static_or_direct->AddSpecialInput(base_);
+      base_added = true;
     }
     // Ensure that we can load FP arguments from the constant area.
     for (size_t i = 0, e = invoke->InputCount(); i < e; i++) {
@@ -137,6 +182,25 @@ class PCRelativeHandlerVisitor : public HGraphVisitor {
         ReplaceInput(invoke, input, i, true);
       }
     }
+
+    // These intrinsics need the constant area.
+    switch (invoke->GetIntrinsic()) {
+      case Intrinsics::kMathAbsDouble:
+      case Intrinsics::kMathAbsFloat:
+      case Intrinsics::kMathMaxDoubleDouble:
+      case Intrinsics::kMathMaxFloatFloat:
+      case Intrinsics::kMathMinDoubleDouble:
+      case Intrinsics::kMathMinFloatFloat:
+        if (!base_added) {
+          DCHECK(invoke_static_or_direct != nullptr);
+          DCHECK(!invoke_static_or_direct->HasCurrentMethodInput());
+          InitializePCRelativeBasePointer();
+          invoke_static_or_direct->AddSpecialInput(base_);
+        }
+        break;
+      default:
+        break;
+    }
   }
 
   // The generated HX86ComputeBaseMethodAddress in the entry block needed as an
-- 
2.11.0