OSDN Git Service

ARM64: Support MultiplyAccumulate for SIMD.
authorArtem Serov <artem.serov@linaro.org>
Mon, 10 Apr 2017 16:41:46 +0000 (17:41 +0100)
committerArtem Serov <artem.serov@linaro.org>
Wed, 19 Apr 2017 23:21:29 +0000 (00:21 +0100)
Test: test-art-host, test-art-target.

Change-Id: I06af8415e15352d09d176cae828163cbe99ae7a7

15 files changed:
compiler/optimizing/code_generator_vector_arm.cc
compiler/optimizing/code_generator_vector_arm64.cc
compiler/optimizing/code_generator_vector_arm_vixl.cc
compiler/optimizing/code_generator_vector_mips.cc
compiler/optimizing/code_generator_vector_mips64.cc
compiler/optimizing/code_generator_vector_x86.cc
compiler/optimizing/code_generator_vector_x86_64.cc
compiler/optimizing/graph_visualizer.cc
compiler/optimizing/instruction_simplifier_arm64.cc
compiler/optimizing/instruction_simplifier_arm64.h
compiler/optimizing/instruction_simplifier_shared.cc
compiler/optimizing/instruction_simplifier_shared.h
compiler/optimizing/nodes.h
compiler/optimizing/nodes_vector.h
test/550-checker-multiply-accumulate/src/Main.java

index 6e82123..f8552dc 100644 (file)
@@ -245,6 +245,14 @@ void InstructionCodeGeneratorARM::VisitVecUShr(HVecUShr* instruction) {
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
+void LocationsBuilderARM::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
+void InstructionCodeGeneratorARM::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
 void LocationsBuilderARM::VisitVecLoad(HVecLoad* instruction) {
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
index 2dfccff..b3eb639 100644 (file)
@@ -681,6 +681,67 @@ void InstructionCodeGeneratorARM64::VisitVecUShr(HVecUShr* instruction) {
   }
 }
 
+void LocationsBuilderARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LocationSummary* locations = new (GetGraph()->GetArena()) LocationSummary(instr);
+  switch (instr->GetPackedType()) {
+    case Primitive::kPrimByte:
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+    case Primitive::kPrimInt:
+      locations->SetInAt(
+          HVecMultiplyAccumulate::kInputAccumulatorIndex, Location::RequiresFpuRegister());
+      locations->SetInAt(
+          HVecMultiplyAccumulate::kInputMulLeftIndex, Location::RequiresFpuRegister());
+      locations->SetInAt(
+          HVecMultiplyAccumulate::kInputMulRightIndex, Location::RequiresFpuRegister());
+      DCHECK_EQ(HVecMultiplyAccumulate::kInputAccumulatorIndex, 0);
+      locations->SetOut(Location::SameAsFirstInput());
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+      UNREACHABLE();
+  }
+}
+
+// Some early revisions of the Cortex-A53 have an erratum (835769) whereby it is possible for a
+// 64-bit scalar multiply-accumulate instruction in AArch64 state to generate an incorrect result.
+// However vector MultiplyAccumulate instruction is not affected.
+void InstructionCodeGeneratorARM64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LocationSummary* locations = instr->GetLocations();
+  VRegister acc = VRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputAccumulatorIndex));
+  VRegister left = VRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulLeftIndex));
+  VRegister right = VRegisterFrom(locations->InAt(HVecMultiplyAccumulate::kInputMulRightIndex));
+  switch (instr->GetPackedType()) {
+    case Primitive::kPrimByte:
+      DCHECK_EQ(16u, instr->GetVectorLength());
+      if (instr->GetOpKind() == HInstruction::kAdd) {
+        __ Mla(acc.V16B(), left.V16B(), right.V16B());
+      } else {
+        __ Mls(acc.V16B(), left.V16B(), right.V16B());
+      }
+      break;
+    case Primitive::kPrimChar:
+    case Primitive::kPrimShort:
+      DCHECK_EQ(8u, instr->GetVectorLength());
+      if (instr->GetOpKind() == HInstruction::kAdd) {
+        __ Mla(acc.V8H(), left.V8H(), right.V8H());
+      } else {
+        __ Mls(acc.V8H(), left.V8H(), right.V8H());
+      }
+      break;
+    case Primitive::kPrimInt:
+      DCHECK_EQ(4u, instr->GetVectorLength());
+      if (instr->GetOpKind() == HInstruction::kAdd) {
+        __ Mla(acc.V4S(), left.V4S(), right.V4S());
+      } else {
+        __ Mls(acc.V4S(), left.V4S(), right.V4S());
+      }
+      break;
+    default:
+      LOG(FATAL) << "Unsupported SIMD type";
+  }
+}
+
 // Helper to set up locations for vector memory operations.
 static void CreateVecMemLocations(ArenaAllocator* arena,
                                   HVecMemoryOperation* instruction,
index 990178b..53f314e 100644 (file)
@@ -245,6 +245,14 @@ void InstructionCodeGeneratorARMVIXL::VisitVecUShr(HVecUShr* instruction) {
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
+void LocationsBuilderARMVIXL::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
+void InstructionCodeGeneratorARMVIXL::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
 void LocationsBuilderARMVIXL::VisitVecLoad(HVecLoad* instruction) {
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
index 8ea1ca7..c4a3225 100644 (file)
@@ -245,6 +245,14 @@ void InstructionCodeGeneratorMIPS::VisitVecUShr(HVecUShr* instruction) {
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
+void LocationsBuilderMIPS::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
+void InstructionCodeGeneratorMIPS::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
 void LocationsBuilderMIPS::VisitVecLoad(HVecLoad* instruction) {
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
index a484bb4..50b95c1 100644 (file)
@@ -245,6 +245,14 @@ void InstructionCodeGeneratorMIPS64::VisitVecUShr(HVecUShr* instruction) {
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
 
+void LocationsBuilderMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
+void InstructionCodeGeneratorMIPS64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
 void LocationsBuilderMIPS64::VisitVecLoad(HVecLoad* instruction) {
   LOG(FATAL) << "No SIMD for " << instruction->GetId();
 }
index a86d060..013b092 100644 (file)
@@ -730,6 +730,14 @@ void InstructionCodeGeneratorX86::VisitVecUShr(HVecUShr* instruction) {
   }
 }
 
+void LocationsBuilderX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
+void InstructionCodeGeneratorX86::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
 // Helper to set up locations for vector memory operations.
 static void CreateVecMemLocations(ArenaAllocator* arena,
                                   HVecMemoryOperation* instruction,
index 6967353..66f19a4 100644 (file)
@@ -719,6 +719,14 @@ void InstructionCodeGeneratorX86_64::VisitVecUShr(HVecUShr* instruction) {
   }
 }
 
+void LocationsBuilderX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
+void InstructionCodeGeneratorX86_64::VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instr) {
+  LOG(FATAL) << "No SIMD for " << instr->GetId();
+}
+
 // Helper to set up locations for vector memory operations.
 static void CreateVecMemLocations(ArenaAllocator* arena,
                                   HVecMemoryOperation* instruction,
index 1b2b9f8..e5d94c3 100644 (file)
@@ -514,6 +514,10 @@ class HGraphVisualizerPrinter : public HGraphDelegateVisitor {
     StartAttributeStream("rounded") << std::boolalpha << hadd->IsRounded() << std::noboolalpha;
   }
 
+  void VisitVecMultiplyAccumulate(HVecMultiplyAccumulate* instruction) OVERRIDE {
+    StartAttributeStream("kind") << instruction->GetOpKind();
+  }
+
 #if defined(ART_ENABLE_CODEGEN_arm) || defined(ART_ENABLE_CODEGEN_arm64)
   void VisitMultiplyAccumulate(HMultiplyAccumulate* instruction) OVERRIDE {
     StartAttributeStream("kind") << instruction->GetOpKind();
index 73b7b2b..f16e372 100644 (file)
@@ -210,5 +210,11 @@ void InstructionSimplifierArm64Visitor::VisitXor(HXor* instruction) {
   }
 }
 
+void InstructionSimplifierArm64Visitor::VisitVecMul(HVecMul* instruction) {
+  if (TryCombineVecMultiplyAccumulate(instruction, kArm64)) {
+    RecordSimplification();
+  }
+}
+
 }  // namespace arm64
 }  // namespace art
index 65654f5..eec4e49 100644 (file)
@@ -74,6 +74,7 @@ class InstructionSimplifierArm64Visitor : public HGraphVisitor {
   void VisitTypeConversion(HTypeConversion* instruction) OVERRIDE;
   void VisitUShr(HUShr* instruction) OVERRIDE;
   void VisitXor(HXor* instruction) OVERRIDE;
+  void VisitVecMul(HVecMul* instruction) OVERRIDE;
 
   OptimizingCompilerStats* stats_;
 };
index c2b1374..7d1f146 100644 (file)
@@ -278,5 +278,71 @@ bool TryExtractArrayAccessAddress(HInstruction* access,
   return true;
 }
 
+bool TryCombineVecMultiplyAccumulate(HVecMul* mul, InstructionSet isa) {
+  Primitive::Type type = mul->GetPackedType();
+  switch (isa) {
+    case kArm64:
+      if (!(type == Primitive::kPrimByte ||
+            type == Primitive::kPrimChar ||
+            type == Primitive::kPrimShort ||
+            type == Primitive::kPrimInt)) {
+        return false;
+      }
+      break;
+    default:
+      return false;
+  }
+
+  ArenaAllocator* arena = mul->GetBlock()->GetGraph()->GetArena();
+
+  if (mul->HasOnlyOneNonEnvironmentUse()) {
+    HInstruction* use = mul->GetUses().front().GetUser();
+    if (use->IsVecAdd() || use->IsVecSub()) {
+      // Replace code looking like
+      //    VECMUL tmp, x, y
+      //    VECADD/SUB dst, acc, tmp
+      // with
+      //    VECMULACC dst, acc, x, y
+      // Note that we do not want to (unconditionally) perform the merge when the
+      // multiplication has multiple uses and it can be merged in all of them.
+      // Multiple uses could happen on the same control-flow path, and we would
+      // then increase the amount of work. In the future we could try to evaluate
+      // whether all uses are on different control-flow paths (using dominance and
+      // reverse-dominance information) and only perform the merge when they are.
+      HInstruction* accumulator = nullptr;
+      HVecBinaryOperation* binop = use->AsVecBinaryOperation();
+      HInstruction* binop_left = binop->GetLeft();
+      HInstruction* binop_right = binop->GetRight();
+      // This is always true since the `HVecMul` has only one use (which is checked above).
+      DCHECK_NE(binop_left, binop_right);
+      if (binop_right == mul) {
+        accumulator = binop_left;
+      } else if (use->IsVecAdd()) {
+        DCHECK_EQ(binop_left, mul);
+        accumulator = binop_right;
+      }
+
+      HInstruction::InstructionKind kind =
+          use->IsVecAdd() ? HInstruction::kAdd : HInstruction::kSub;
+      if (accumulator != nullptr) {
+        HVecMultiplyAccumulate* mulacc =
+            new (arena) HVecMultiplyAccumulate(arena,
+                                               kind,
+                                               accumulator,
+                                               mul->GetLeft(),
+                                               mul->GetRight(),
+                                               binop->GetPackedType(),
+                                               binop->GetVectorLength());
+
+        binop->GetBlock()->ReplaceAndRemoveInstructionWith(binop, mulacc);
+        DCHECK(!mul->HasUses());
+        mul->GetBlock()->RemoveInstruction(mul);
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
 
 }  // namespace art
index 83e3ffc..2ea103a 100644 (file)
@@ -58,6 +58,8 @@ bool TryExtractArrayAccessAddress(HInstruction* access,
                                   HInstruction* index,
                                   size_t data_offset);
 
+bool TryCombineVecMultiplyAccumulate(HVecMul* mul, InstructionSet isa);
+
 }  // namespace art
 
 #endif  // ART_COMPILER_OPTIMIZING_INSTRUCTION_SIMPLIFIER_SHARED_H_
index 6be237e..af953c8 100644 (file)
@@ -1382,6 +1382,7 @@ class HLoopInformationOutwardIterator : public ValueObject {
   M(VecShl, VecBinaryOperation)                                         \
   M(VecShr, VecBinaryOperation)                                         \
   M(VecUShr, VecBinaryOperation)                                        \
+  M(VecMultiplyAccumulate, VecOperation)                                \
   M(VecLoad, VecMemoryOperation)                                        \
   M(VecStore, VecMemoryOperation)                                       \
 
index bff58d0..450691c 100644 (file)
@@ -143,6 +143,10 @@ class HVecBinaryOperation : public HVecOperation {
                       /*number_of_inputs*/ 2,
                       vector_length,
                       dex_pc) { }
+
+  HInstruction* GetLeft() const { return InputAt(0); }
+  HInstruction* GetRight() const { return InputAt(1); }
+
   DECLARE_ABSTRACT_INSTRUCTION(VecBinaryOperation);
  private:
   DISALLOW_COPY_AND_ASSIGN(HVecBinaryOperation);
@@ -627,6 +631,59 @@ class HVecUShr FINAL : public HVecBinaryOperation {
   DISALLOW_COPY_AND_ASSIGN(HVecUShr);
 };
 
+// Multiplies every component in the two vectors, adds the result vector to the accumulator vector.
+// viz. [ acc1, .., accn ] + [ x1, .. , xn ] * [ y1, .. , yn ] =
+//     [ acc1 + x1 * y1, .. , accn + xn * yn ].
+class HVecMultiplyAccumulate FINAL : public HVecOperation {
+ public:
+  HVecMultiplyAccumulate(ArenaAllocator* arena,
+                         InstructionKind op,
+                         HInstruction* accumulator,
+                         HInstruction* mul_left,
+                         HInstruction* mul_right,
+                         Primitive::Type packed_type,
+                         size_t vector_length,
+                         uint32_t dex_pc = kNoDexPc)
+      : HVecOperation(arena,
+                      packed_type,
+                      SideEffects::None(),
+                      /*number_of_inputs*/ 3,
+                      vector_length,
+                      dex_pc),
+        op_kind_(op) {
+    DCHECK(op == InstructionKind::kAdd || op == InstructionKind::kSub);
+    DCHECK(accumulator->IsVecOperation());
+    DCHECK(mul_left->IsVecOperation() && mul_right->IsVecOperation());
+    DCHECK_EQ(accumulator->AsVecOperation()->GetPackedType(), packed_type);
+    DCHECK_EQ(mul_left->AsVecOperation()->GetPackedType(), packed_type);
+    DCHECK_EQ(mul_right->AsVecOperation()->GetPackedType(), packed_type);
+
+    SetRawInputAt(kInputAccumulatorIndex, accumulator);
+    SetRawInputAt(kInputMulLeftIndex, mul_left);
+    SetRawInputAt(kInputMulRightIndex, mul_right);
+  }
+
+  static constexpr int kInputAccumulatorIndex = 0;
+  static constexpr int kInputMulLeftIndex = 1;
+  static constexpr int kInputMulRightIndex = 2;
+
+  bool CanBeMoved() const OVERRIDE { return true; }
+
+  bool InstructionDataEquals(const HInstruction* other) const OVERRIDE {
+    return op_kind_ == other->AsVecMultiplyAccumulate()->op_kind_;
+  }
+
+  InstructionKind GetOpKind() const { return op_kind_; }
+
+  DECLARE_INSTRUCTION(VecMultiplyAccumulate);
+
+ private:
+  // Indicates if this is a MADD or MSUB.
+  const InstructionKind op_kind_;
+
+  DISALLOW_COPY_AND_ASSIGN(HVecMultiplyAccumulate);
+};
+
 // Loads a vector from memory, viz. load(mem, 1)
 // yield the vector [ mem(1), .. , mem(n) ].
 class HVecLoad FINAL : public HVecMemoryOperation {
index 09376a2..810f0fa 100644 (file)
@@ -424,6 +424,88 @@ public class Main {
     return - (left * right);
   }
 
+  /// CHECK-START-ARM64: void Main.SimdMulAdd(int[], int[]) instruction_simplifier_arm64 (before)
+  /// CHECK-DAG:     Phi                            loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG:     VecMul                         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:     VecAdd                         loop:<<Loop>>      outer_loop:none
+
+  /// CHECK-START-ARM64: void Main.SimdMulAdd(int[], int[]) instruction_simplifier_arm64 (after)
+  /// CHECK-DAG:     Phi                            loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG:     VecMultiplyAccumulate kind:Add loop:<<Loop>>      outer_loop:none
+
+  /// CHECK-START-ARM64: void Main.SimdMulAdd(int[], int[]) instruction_simplifier_arm64 (after)
+  /// CHECK-NOT:     VecMull
+  /// CHECK-NOT:     VecAdd
+  public static void SimdMulAdd(int[] array1, int[] array2) {
+    for (int j = 0; j < 100; j++) {
+      array2[j] += 12345 * array1[j];
+    }
+  }
+
+  /// CHECK-START-ARM64: void Main.SimdMulSub(int[], int[]) instruction_simplifier_arm64 (before)
+  /// CHECK-DAG:     Phi                            loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG:     VecMul                         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:     VecSub                         loop:<<Loop>>      outer_loop:none
+
+  /// CHECK-START-ARM64: void Main.SimdMulSub(int[], int[]) instruction_simplifier_arm64 (after)
+  /// CHECK-DAG:     Phi                            loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG:     VecMultiplyAccumulate kind:Sub loop:<<Loop>>      outer_loop:none
+
+  /// CHECK-START-ARM64: void Main.SimdMulSub(int[], int[]) instruction_simplifier_arm64 (after)
+  /// CHECK-NOT:     VecMull
+  /// CHECK-NOT:     VecSub
+  public static void SimdMulSub(int[] array1, int[] array2) {
+    for (int j = 0; j < 100; j++) {
+      array2[j] -= 12345 * array1[j];
+    }
+  }
+
+  /// CHECK-START-ARM64: void Main.SimdMulMultipleUses(int[], int[]) instruction_simplifier_arm64 (before)
+  /// CHECK-DAG:     Phi                            loop:<<Loop:B\d+>> outer_loop:none
+  /// CHECK-DAG:     VecMul                         loop:<<Loop>>      outer_loop:none
+  /// CHECK-DAG:     VecSub                         loop:<<Loop>>      outer_loop:none
+
+  /// CHECK-START-ARM64: void Main.SimdMulMultipleUses(int[], int[]) instruction_simplifier_arm64 (after)
+  /// CHECK-NOT: VecMultiplyAccumulate
+  public static void SimdMulMultipleUses(int[] array1, int[] array2) {
+    for (int j = 0; j < 100; j++) {
+       int temp = 12345 * array1[j];
+       array2[j] -= temp;
+       array1[j] = temp;
+    }
+  }
+
+  public static final int ARRAY_SIZE = 1000;
+
+  public static void initArray(int[] array) {
+    for (int i = 0; i < ARRAY_SIZE; i++) {
+      array[i] = i;
+    }
+  }
+
+  public static int calcArraySum(int[] array) {
+    int sum = 0;
+    for (int i = 0; i < ARRAY_SIZE; i++) {
+      sum += array[i];
+    }
+    return sum;
+  }
+
+  public static void testSimdMultiplyAccumulate() {
+    int[] array1 = new int[ARRAY_SIZE];
+    int[] array2 = new int[ARRAY_SIZE];
+
+    initArray(array1);
+    initArray(array2);
+    SimdMulSub(array1, array2);
+    assertIntEquals(-60608250, calcArraySum(array2));
+
+    initArray(array1);
+    initArray(array2);
+    SimdMulAdd(array1, array2);
+    assertIntEquals(61607250, calcArraySum(array2));
+  }
+
   public static void main(String[] args) {
     assertIntEquals(7, $opt$noinline$mulAdd(1, 2, 3));
     assertLongEquals(-26, $opt$noinline$mulSub(4, 5, 6));
@@ -433,5 +515,7 @@ public class Main {
     assertLongEquals(-225, $opt$noinline$mulMinusOne(15, 16));
     assertIntEquals(-306, $opt$noinline$mulNeg(17, 18));
     assertLongEquals(-380, $opt$noinline$mulNeg(19, 20));
+
+    testSimdMultiplyAccumulate();
   }
 }