From 1f36f411e8f51969f0af95fa60b9809656403c0a Mon Sep 17 00:00:00 2001 From: Scott Wakeling Date: Thu, 21 Apr 2016 11:13:45 +0100 Subject: [PATCH] ARM64: Add new String.compareTo intrinsic. Benchmarked on Nexus6P big, little, and all cores. The new intrinsic is faster than pStringCompareTo for compare lengths on [1,512], so the runtime call is no longer needed. Change-Id: If94bfe24d9bf4dddcca648cc0b563709fc407b34 --- compiler/optimizing/intrinsics_arm64.cc | 116 +++++++++++++++++++++++---- runtime/arch/arm64/entrypoints_init_arm64.cc | 3 +- runtime/arch/arm64/quick_entrypoints_arm64.S | 105 ------------------------ runtime/arch/stub_test.cc | 3 +- 4 files changed, 106 insertions(+), 121 deletions(-) diff --git a/compiler/optimizing/intrinsics_arm64.cc b/compiler/optimizing/intrinsics_arm64.cc index 04ae3a673..bf7976782 100644 --- a/compiler/optimizing/intrinsics_arm64.cc +++ b/compiler/optimizing/intrinsics_arm64.cc @@ -47,6 +47,7 @@ using helpers::SRegisterFrom; using helpers::WRegisterFrom; using helpers::XRegisterFrom; using helpers::InputRegisterAt; +using helpers::OutputRegister; namespace { @@ -1173,31 +1174,118 @@ void IntrinsicCodeGeneratorARM64::VisitStringCharAt(HInvoke* invoke) { void IntrinsicLocationsBuilderARM64::VisitStringCompareTo(HInvoke* invoke) { LocationSummary* locations = new (arena_) LocationSummary(invoke, - LocationSummary::kCall, + invoke->InputAt(1)->CanBeNull() + ? LocationSummary::kCallOnSlowPath + : LocationSummary::kNoCall, kIntrinsified); - InvokeRuntimeCallingConvention calling_convention; - locations->SetInAt(0, LocationFrom(calling_convention.GetRegisterAt(0))); - locations->SetInAt(1, LocationFrom(calling_convention.GetRegisterAt(1))); - locations->SetOut(calling_convention.GetReturnLocation(Primitive::kPrimInt)); + locations->SetInAt(0, Location::RequiresRegister()); + locations->SetInAt(1, Location::RequiresRegister()); + locations->AddTemp(Location::RequiresRegister()); + locations->AddTemp(Location::RequiresRegister()); + locations->AddTemp(Location::RequiresRegister()); + locations->SetOut(Location::RequiresRegister(), Location::kOutputOverlap); } void IntrinsicCodeGeneratorARM64::VisitStringCompareTo(HInvoke* invoke) { vixl::MacroAssembler* masm = GetVIXLAssembler(); LocationSummary* locations = invoke->GetLocations(); + Register str = XRegisterFrom(locations->InAt(0)); + Register arg = XRegisterFrom(locations->InAt(1)); + Register out = OutputRegister(invoke); + + Register temp0 = WRegisterFrom(locations->GetTemp(0)); + Register temp1 = WRegisterFrom(locations->GetTemp(1)); + Register temp2 = WRegisterFrom(locations->GetTemp(2)); + + vixl::Label loop; + vixl::Label find_char_diff; + vixl::Label end; + + // Get offsets of count and value fields within a string object. + const int32_t count_offset = mirror::String::CountOffset().Int32Value(); + const int32_t value_offset = mirror::String::ValueOffset().Int32Value(); + // Note that the null check must have been done earlier. DCHECK(!invoke->CanDoImplicitNullCheckOn(invoke->InputAt(0))); - Register argument = WRegisterFrom(locations->InAt(1)); - __ Cmp(argument, 0); - SlowPathCodeARM64* slow_path = new (GetAllocator()) IntrinsicSlowPathARM64(invoke); - codegen_->AddSlowPath(slow_path); - __ B(eq, slow_path->GetEntryLabel()); + // Take slow path and throw if input can be and is null. + SlowPathCodeARM64* slow_path = nullptr; + const bool can_slow_path = invoke->InputAt(1)->CanBeNull(); + if (can_slow_path) { + slow_path = new (GetAllocator()) IntrinsicSlowPathARM64(invoke); + codegen_->AddSlowPath(slow_path); + __ Cbz(arg, slow_path->GetEntryLabel()); + } - __ Ldr( - lr, MemOperand(tr, QUICK_ENTRYPOINT_OFFSET(kArm64WordSize, pStringCompareTo).Int32Value())); - __ Blr(lr); - __ Bind(slow_path->GetExitLabel()); + // Reference equality check, return 0 if same reference. + __ Subs(out, str, arg); + __ B(&end, eq); + // Load lengths of this and argument strings. + __ Ldr(temp0, MemOperand(str.X(), count_offset)); + __ Ldr(temp1, MemOperand(arg.X(), count_offset)); + // Return zero if both strings are empty. + __ Orr(out, temp0, temp1); + __ Cbz(out, &end); + // out = length diff. + __ Subs(out, temp0, temp1); + // temp2 = min(len(str), len(arg)). + __ Csel(temp2, temp1, temp0, ge); + // Shorter string is empty? + __ Cbz(temp2, &end); + + // Store offset of string value in preparation for comparison loop. + __ Mov(temp1, value_offset); + + UseScratchRegisterScope scratch_scope(masm); + Register temp4 = scratch_scope.AcquireX(); + + // Assertions that must hold in order to compare strings 4 characters at a time. + DCHECK_ALIGNED(value_offset, 8); + static_assert(IsAligned<8>(kObjectAlignment), "String of odd length is not zero padded"); + + const size_t char_size = Primitive::ComponentSize(Primitive::kPrimChar); + DCHECK_EQ(char_size, 2u); + + // Promote temp0 to an X reg, ready for LDR. + temp0 = temp0.X(); + + // Loop to compare 4x16-bit characters at a time (ok because of string data alignment). + __ Bind(&loop); + __ Ldr(temp4, MemOperand(str.X(), temp1)); + __ Ldr(temp0, MemOperand(arg.X(), temp1)); + __ Cmp(temp4, temp0); + __ B(ne, &find_char_diff); + __ Add(temp1, temp1, char_size * 4); + __ Subs(temp2, temp2, 4); + __ B(gt, &loop); + __ B(&end); + + // Promote temp1 to an X reg, ready for EOR. + temp1 = temp1.X(); + + // Find the single 16-bit character difference. + __ Bind(&find_char_diff); + // Get the bit position of the first character that differs. + __ Eor(temp1, temp0, temp4); + __ Rbit(temp1, temp1); + __ Clz(temp1, temp1); + __ Bic(temp1, temp1, 0xf); + // If the number of 16-bit chars remaining <= the index where the difference occurs (0-3), then + // the difference occurs outside the remaining string data, so just return length diff (out). + __ Cmp(temp2, Operand(temp1, LSR, 4)); + __ B(le, &end); + // Extract the characters and calculate the difference. + __ Lsr(temp0, temp0, temp1); + __ Lsr(temp4, temp4, temp1); + __ And(temp4, temp4, 0xffff); + __ Sub(out, temp4, Operand(temp0, UXTH)); + + __ Bind(&end); + + if (can_slow_path) { + __ Bind(slow_path->GetExitLabel()); + } } void IntrinsicLocationsBuilderARM64::VisitStringEquals(HInvoke* invoke) { diff --git a/runtime/arch/arm64/entrypoints_init_arm64.cc b/runtime/arch/arm64/entrypoints_init_arm64.cc index 4db941174..f271596f8 100644 --- a/runtime/arch/arm64/entrypoints_init_arm64.cc +++ b/runtime/arch/arm64/entrypoints_init_arm64.cc @@ -134,7 +134,8 @@ void InitEntryPoints(JniEntryPoints* jpoints, QuickEntryPoints* qpoints) { // Intrinsics qpoints->pIndexOf = art_quick_indexof; - qpoints->pStringCompareTo = art_quick_string_compareto; + // The ARM64 StringCompareTo intrinsic does not call the runtime. + qpoints->pStringCompareTo = nullptr; qpoints->pMemcpy = memcpy; // Invocation diff --git a/runtime/arch/arm64/quick_entrypoints_arm64.S b/runtime/arch/arm64/quick_entrypoints_arm64.S index 1cdda2d19..27141fdf2 100644 --- a/runtime/arch/arm64/quick_entrypoints_arm64.S +++ b/runtime/arch/arm64/quick_entrypoints_arm64.S @@ -2187,108 +2187,3 @@ ENTRY art_quick_indexof asr x0, x0, #1 ret END art_quick_indexof - - /* - * String's compareTo. - * - * TODO: Not very optimized. - * - * On entry: - * x0: this object pointer - * x1: comp object pointer - * - */ - .extern __memcmp16 -ENTRY art_quick_string_compareto - mov x2, x0 // x0 is return, use x2 for first input. - sub x0, x2, x1 // Same string object? - cbnz x0,1f - ret -1: // Different string objects. - - ldr w4, [x2, #MIRROR_STRING_COUNT_OFFSET] - ldr w3, [x1, #MIRROR_STRING_COUNT_OFFSET] - add x2, x2, #MIRROR_STRING_VALUE_OFFSET - add x1, x1, #MIRROR_STRING_VALUE_OFFSET - - /* - * Now: Data* Count - * first arg x2 w4 - * second arg x1 w3 - */ - - // x0 := str1.length(w4) - str2.length(w3). ldr zero-extended w3/w4 into x3/x4. - subs x0, x4, x3 - // Min(count1, count2) into w3. - csel x3, x3, x4, ge - - // TODO: Tune this value. - // Check for long string, do memcmp16 for them. - cmp w3, #28 // Constant from arm32. - bgt .Ldo_memcmp16 - - /* - * Now: - * x2: *first string data - * x1: *second string data - * w3: iteration count - * x0: return value if comparison equal - * x4, x5, x6, x7: free - */ - - // Do a simple unrolled loop. -.Lloop: - // At least two more elements? - subs w3, w3, #2 - b.lt .Lremainder_or_done - - ldrh w4, [x2], #2 - ldrh w5, [x1], #2 - - ldrh w6, [x2], #2 - ldrh w7, [x1], #2 - - subs w4, w4, w5 - b.ne .Lw4_result - - subs w6, w6, w7 - b.ne .Lw6_result - - b .Lloop - -.Lremainder_or_done: - adds w3, w3, #1 - b.eq .Lremainder - ret - -.Lremainder: - ldrh w4, [x2], #2 - ldrh w5, [x1], #2 - subs w4, w4, w5 - b.ne .Lw4_result - ret - -// Result is in w4 -.Lw4_result: - sxtw x0, w4 - ret - -// Result is in w6 -.Lw6_result: - sxtw x0, w6 - ret - -.Ldo_memcmp16: - mov x14, x0 // Save x0 and LR. __memcmp16 does not use these temps. - mov x15, xLR // TODO: Codify and check that? - - mov x0, x2 - uxtw x2, w3 - bl __memcmp16 - - mov xLR, x15 // Restore LR. - - cmp x0, #0 // Check the memcmp difference. - csel x0, x0, x14, ne // x0 := x0 != 0 ? x14(prev x0=length diff) : x1. - ret -END art_quick_string_compareto diff --git a/runtime/arch/stub_test.cc b/runtime/arch/stub_test.cc index 75d9073cf..1fc579bed 100644 --- a/runtime/arch/stub_test.cc +++ b/runtime/arch/stub_test.cc @@ -1205,7 +1205,8 @@ TEST_F(StubTest, AllocObjectArray) { TEST_F(StubTest, StringCompareTo) { -#if defined(__i386__) || defined(__arm__) || defined(__aarch64__) || \ + // There is no StringCompareTo runtime entrypoint for __aarch64__. +#if defined(__i386__) || defined(__arm__) || \ defined(__mips__) || (defined(__x86_64__) && !defined(__APPLE__)) // TODO: Check the "Unresolved" allocation stubs -- 2.11.0