From 7fda7854ed734c4cdc786e46fcfdaa71ec600ecf Mon Sep 17 00:00:00 2001
From: Chris Larsen
Date: Thu, 21 Apr 2016 16:00:36 -0700
Subject: [PATCH] MIPS64: Implement bitCount intrinsics.

- int java.lang.Integer.bitCount(int)
- int java.lang.Long.bitCount(long)

Change-Id: If2390beeb5b900e8680ead1927e0455b35f1948a
---
 compiler/optimizing/intrinsics_mips64.cc | 89 ++++++++++++++++++++++++++++++--
 1 file changed, 86 insertions(+), 3 deletions(-)

diff --git a/compiler/optimizing/intrinsics_mips64.cc b/compiler/optimizing/intrinsics_mips64.cc
index cf973aa84..1524e1e01 100644
--- a/compiler/optimizing/intrinsics_mips64.cc
+++ b/compiler/optimizing/intrinsics_mips64.cc
@@ -385,6 +385,92 @@ static void CreateFPToFPLocations(ArenaAllocator* arena, HInvoke* invoke) {
   locations->SetOut(Location::RequiresFpuRegister(), Location::kNoOutputOverlap);
 }
 
+static void GenBitCount(LocationSummary* locations,
+                        const Primitive::Type type,
+                        Mips64Assembler* assembler) {
+  GpuRegister out = locations->Out().AsRegister<GpuRegister>();
+  GpuRegister in = locations->InAt(0).AsRegister<GpuRegister>();
+
+  DCHECK(type == Primitive::kPrimInt || type == Primitive::kPrimLong);
+
+  // https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+  //
+  // A generalization of the best bit counting method to integers of
+  // bit-widths up to 128 (parameterized by type T) is this:
+  //
+  // v = v - ((v >> 1) & (T)~(T)0/3);                           // temp
+  // v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3);      // temp
+  // v = (v + (v >> 4)) & (T)~(T)0/255*15;                      // temp
+  // c = (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * BITS_PER_BYTE; // count
+  //
+  // For comparison, for 32-bit quantities, this algorithm can be executed
+  // using 20 MIPS instructions (the calls to LoadConst32() generate two
+  // machine instructions each for the values being used in this algorithm).
+  // A(n unrolled) loop-based algorithm requires 25 instructions.
+  //
+  // For a 64-bit operand this can be performed in 24 instructions compared
+  // to a(n unrolled) loop-based algorithm which requires 38 instructions.
+  //
+  // There are algorithms which are faster in the cases where very few
+  // bits are set but the algorithm here attempts to minimize the total
+  // number of instructions executed even when a large number of bits
+  // are set.
+
+  if (type == Primitive::kPrimInt) {
+    __ Srl(TMP, in, 1);
+    __ LoadConst32(AT, 0x55555555);
+    __ And(TMP, TMP, AT);
+    __ Subu(TMP, in, TMP);
+    __ LoadConst32(AT, 0x33333333);
+    __ And(out, TMP, AT);
+    __ Srl(TMP, TMP, 2);
+    __ And(TMP, TMP, AT);
+    __ Addu(TMP, out, TMP);
+    __ Srl(out, TMP, 4);
+    __ Addu(out, out, TMP);
+    __ LoadConst32(AT, 0x0F0F0F0F);
+    __ And(out, out, AT);
+    __ LoadConst32(TMP, 0x01010101);
+    __ MulR6(out, out, TMP);
+    __ Srl(out, out, 24);
+  } else if (type == Primitive::kPrimLong) {
+    __ Dsrl(TMP, in, 1);
+    __ LoadConst64(AT, 0x5555555555555555L);
+    __ And(TMP, TMP, AT);
+    __ Dsubu(TMP, in, TMP);
+    __ LoadConst64(AT, 0x3333333333333333L);
+    __ And(out, TMP, AT);
+    __ Dsrl(TMP, TMP, 2);
+    __ And(TMP, TMP, AT);
+    __ Daddu(TMP, out, TMP);
+    __ Dsrl(out, TMP, 4);
+    __ Daddu(out, out, TMP);
+    __ LoadConst64(AT, 0x0F0F0F0F0F0F0F0FL);
+    __ And(out, out, AT);
+    __ LoadConst64(TMP, 0x0101010101010101L);
+    __ Dmul(out, out, TMP);
+    __ Dsrl32(out, out, 24);
+  }
+}
+
+// int java.lang.Integer.bitCount(int)
+void IntrinsicLocationsBuilderMIPS64::VisitIntegerBitCount(HInvoke* invoke) {
+  CreateIntToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorMIPS64::VisitIntegerBitCount(HInvoke* invoke) {
+  GenBitCount(invoke->GetLocations(), Primitive::kPrimInt, GetAssembler());
+}
+
+// int java.lang.Long.bitCount(long)
+void IntrinsicLocationsBuilderMIPS64::VisitLongBitCount(HInvoke* invoke) {
+  CreateIntToIntLocations(arena_, invoke);
+}
+
+void IntrinsicCodeGeneratorMIPS64::VisitLongBitCount(HInvoke* invoke) {
+  GenBitCount(invoke->GetLocations(), Primitive::kPrimLong, GetAssembler());
+}
+
 static void MathAbsFP(LocationSummary* locations, bool is64bit, Mips64Assembler* assembler) {
   FpuRegister in = locations->InAt(0).AsFpuRegister<FpuRegister>();
   FpuRegister out = locations->Out().AsFpuRegister<FpuRegister>();
@@ -1693,9 +1779,6 @@ void IntrinsicCodeGeneratorMIPS64::VisitDoubleIsInfinite(HInvoke* invoke) {
   GenIsInfinite(invoke->GetLocations(), /* is64bit */ true, GetAssembler());
 }
 
-UNIMPLEMENTED_INTRINSIC(MIPS64, IntegerBitCount)
-UNIMPLEMENTED_INTRINSIC(MIPS64, LongBitCount)
-
 UNIMPLEMENTED_INTRINSIC(MIPS64, MathRoundDouble)
 UNIMPLEMENTED_INTRINSIC(MIPS64, MathRoundFloat)
-- 
2.11.0
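
For readers following the patch, the two emitted instruction sequences compute the standard SWAR ("bit hack") population count from the Stanford page cited in the code comment. Below is a minimal, self-contained C++ sketch of the same computation; it is not part of the patch, and the helper names BitCount32/BitCount64 are made up for illustration. Each statement mirrors one group of emitted MIPS64 instructions, and the final 64-bit shift by 56 is what Dsrl32(out, out, 24) performs, since Dsrl32 adds 32 to its shift amount. The 20-instruction count quoted in the comment reflects that each 32-bit mask takes two machine instructions to materialize via LoadConst32().

#include <cstdint>
#include <cstdio>

// Illustrative only -- these helpers are not part of ART. They mirror,
// step for step, the instruction sequences emitted by GenBitCount().
uint32_t BitCount32(uint32_t v) {
  v = v - ((v >> 1) & 0x55555555u);                  // Srl/LoadConst32/And/Subu: 2-bit sums
  v = (v & 0x33333333u) + ((v >> 2) & 0x33333333u);  // LoadConst32/And/Srl/And/Addu: 4-bit sums
  v = (v + (v >> 4)) & 0x0F0F0F0Fu;                  // Srl/Addu/LoadConst32/And: byte sums
  return (v * 0x01010101u) >> 24;                    // LoadConst32/MulR6/Srl: fold into top byte
}

uint64_t BitCount64(uint64_t v) {
  v = v - ((v >> 1) & 0x5555555555555555ull);
  v = (v & 0x3333333333333333ull) + ((v >> 2) & 0x3333333333333333ull);
  v = (v + (v >> 4)) & 0x0F0F0F0F0F0F0F0Full;
  return (v * 0x0101010101010101ull) >> 56;          // Dsrl32(out, out, 24) == shift by 56
}

int main() {
  std::printf("%u\n", BitCount32(0xF0F0F0F0u));                         // prints 16
  std::printf("%llu\n", (unsigned long long)BitCount64(~UINT64_C(0)));  // prints 64
  return 0;
}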