From 7c2ad5af0bdd3cc1069038f8e3422d99aeb5f44c Mon Sep 17 00:00:00 2001 From: Vladimir Marko Date: Wed, 24 Sep 2014 12:42:55 +0100 Subject: [PATCH] Implement method calls using relative BL on ARM64. Change-Id: I9e5d0b6c100b6cddd6bbb7ab07cff77ab104ea31 --- compiler/dex/quick/arm64/arm64_lir.h | 2 + compiler/dex/quick/arm64/assemble_arm64.cc | 6 +- compiler/dex/quick/arm64/call_arm64.cc | 115 +++++++++++++++++ compiler/dex/quick/arm64/codegen_arm64.h | 24 ++++ compiler/dex/quick/arm64/target_arm64.cc | 20 ++- compiler/dex/quick/gen_invoke.cc | 6 +- compiler/oat_writer.cc | 198 ++++++++++++++++++++--------- compiler/oat_writer.h | 2 + 8 files changed, 307 insertions(+), 66 deletions(-) diff --git a/compiler/dex/quick/arm64/arm64_lir.h b/compiler/dex/quick/arm64/arm64_lir.h index ab7192143..a87b06aeb 100644 --- a/compiler/dex/quick/arm64/arm64_lir.h +++ b/compiler/dex/quick/arm64/arm64_lir.h @@ -117,6 +117,7 @@ enum A64ResourceEncodingPos { #define IS_SIGNED_IMM14(value) IS_SIGNED_IMM(14, value) #define IS_SIGNED_IMM19(value) IS_SIGNED_IMM(19, value) #define IS_SIGNED_IMM21(value) IS_SIGNED_IMM(21, value) +#define IS_SIGNED_IMM26(value) IS_SIGNED_IMM(26, value) // Quick macro used to define the registers. #define A64_REGISTER_CODE_LIST(R) \ @@ -240,6 +241,7 @@ enum A64Opcode { kA64B2ct, // b.cond [01010100] imm_19[23-5] [0] cond[3-0]. kA64Blr1x, // blr [1101011000111111000000] rn[9-5] [00000]. kA64Br1x, // br [1101011000011111000000] rn[9-5] [00000]. + kA64Bl1t, // bl [100101] imm26[25-0]. kA64Brk1d, // brk [11010100001] imm_16[20-5] [00000]. kA64B1t, // b [00010100] offset_26[25-0]. kA64Cbnz2rt, // cbnz[00110101] imm_19[23-5] rt[4-0]. diff --git a/compiler/dex/quick/arm64/assemble_arm64.cc b/compiler/dex/quick/arm64/assemble_arm64.cc index b1cf27968..7c663a941 100644 --- a/compiler/dex/quick/arm64/assemble_arm64.cc +++ b/compiler/dex/quick/arm64/assemble_arm64.cc @@ -155,6 +155,10 @@ const A64EncodingMap Arm64Mir2Lir::EncodingMap[kA64Last] = { kFmtRegX, 9, 5, kFmtUnused, -1, -1, kFmtUnused, -1, -1, kFmtUnused, -1, -1, IS_UNARY_OP | REG_USE0 | IS_BRANCH, "br", "!0x", kFixupNone), + ENCODING_MAP(kA64Bl1t, NO_VARIANTS(0x94000000), + kFmtBitBlt, 25, 0, kFmtUnused, -1, -1, kFmtUnused, -1, -1, + kFmtUnused, -1, -1, IS_UNARY_OP | IS_BRANCH | REG_DEF_LR | NEEDS_FIXUP, + "bl", "!0T", kFixupLabel), ENCODING_MAP(kA64Brk1d, NO_VARIANTS(0xd4200000), kFmtBitBlt, 20, 5, kFmtUnused, -1, -1, kFmtUnused, -1, -1, kFmtUnused, -1, -1, IS_UNARY_OP | IS_BRANCH, @@ -873,7 +877,7 @@ void Arm64Mir2Lir::AssembleLIR() { ((target_lir->flags.generation == lir->flags.generation) ? 0 : offset_adjustment); int32_t delta = target - pc; DCHECK_EQ(delta & 0x3, 0); - if (!IS_SIGNED_IMM19(delta >> 2)) { + if (!IS_SIGNED_IMM26(delta >> 2)) { LOG(FATAL) << "Invalid jump range in kFixupT1Branch"; } lir->operands[0] = delta >> 2; diff --git a/compiler/dex/quick/arm64/call_arm64.cc b/compiler/dex/quick/arm64/call_arm64.cc index 6081f289e..e8de876d6 100644 --- a/compiler/dex/quick/arm64/call_arm64.cc +++ b/compiler/dex/quick/arm64/call_arm64.cc @@ -21,6 +21,8 @@ #include "dex/quick/mir_to_lir-inl.h" #include "gc/accounting/card_table.h" #include "entrypoints/quick/quick_entrypoints.h" +#include "mirror/art_method.h" +#include "mirror/object_array-inl.h" namespace art { @@ -433,4 +435,117 @@ void Arm64Mir2Lir::GenSpecialExitSequence() { NewLIR0(kA64Ret); } +static bool Arm64UseRelativeCall(CompilationUnit* cu, const MethodReference& target_method) { + // Always emit relative calls. 
+  return true;
+}
+
+/*
+ * Bit of a hack here - in the absence of a real scheduling pass,
+ * emit the next instruction in static & direct invoke sequences.
+ */
+static int Arm64NextSDCallInsn(CompilationUnit* cu, CallInfo* info,
+                               int state, const MethodReference& target_method,
+                               uint32_t unused,
+                               uintptr_t direct_code, uintptr_t direct_method,
+                               InvokeType type) {
+  Mir2Lir* cg = static_cast<Mir2Lir*>(cu->cg.get());
+  if (direct_code != 0 && direct_method != 0) {
+    switch (state) {
+    case 0:  // Get the current Method* [sets kArg0]
+      if (direct_code != static_cast<uintptr_t>(-1)) {
+        cg->LoadConstant(cg->TargetPtrReg(kInvokeTgt), direct_code);
+      } else if (Arm64UseRelativeCall(cu, target_method)) {
+        // Defer to linker patch.
+      } else {
+        cg->LoadCodeAddress(target_method, type, kInvokeTgt);
+      }
+      if (direct_method != static_cast<uintptr_t>(-1)) {
+        cg->LoadConstant(cg->TargetReg(kArg0, kRef), direct_method);
+      } else {
+        cg->LoadMethodAddress(target_method, type, kArg0);
+      }
+      break;
+    default:
+      return -1;
+    }
+  } else {
+    RegStorage arg0_ref = cg->TargetReg(kArg0, kRef);
+    switch (state) {
+    case 0:  // Get the current Method* [sets kArg0]
+      // TUNING: we can save a reg copy if Method* has been promoted.
+      cg->LoadCurrMethodDirect(arg0_ref);
+      break;
+    case 1:  // Get method->dex_cache_resolved_methods_
+      cg->LoadRefDisp(arg0_ref,
+                      mirror::ArtMethod::DexCacheResolvedMethodsOffset().Int32Value(),
+                      arg0_ref,
+                      kNotVolatile);
+      // Set up direct code if known.
+      if (direct_code != 0) {
+        if (direct_code != static_cast<uintptr_t>(-1)) {
+          cg->LoadConstant(cg->TargetPtrReg(kInvokeTgt), direct_code);
+        } else if (Arm64UseRelativeCall(cu, target_method)) {
+          // Defer to linker patch.
+        } else {
+          CHECK_LT(target_method.dex_method_index, target_method.dex_file->NumMethodIds());
+          cg->LoadCodeAddress(target_method, type, kInvokeTgt);
+        }
+      }
+      break;
+    case 2:  // Grab target method*
+      CHECK_EQ(cu->dex_file, target_method.dex_file);
+      cg->LoadRefDisp(arg0_ref,
+                      mirror::ObjectArray<mirror::Object>::OffsetOfElement(
+                          target_method.dex_method_index).Int32Value(),
+                      arg0_ref,
+                      kNotVolatile);
+      break;
+    case 3:  // Grab the code from the method*
+      if (direct_code == 0) {
+        // kInvokeTgt := arg0_ref->entrypoint
+        cg->LoadWordDisp(arg0_ref,
+                         mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset().Int32Value(),
+                         cg->TargetPtrReg(kInvokeTgt));
+      }
+      break;
+    default:
+      return -1;
+    }
+  }
+  return state + 1;
+}
+
+NextCallInsn Arm64Mir2Lir::GetNextSDCallInsn() {
+  return Arm64NextSDCallInsn;
+}
+
+LIR* Arm64Mir2Lir::CallWithLinkerFixup(const MethodReference& target_method, InvokeType type) {
+  // For ARM64, just generate a relative BL instruction that will be filled in at 'link time'.
+  // If the target turns out to be too far, the linker will generate a thunk for dispatch.
+  int target_method_idx = target_method.dex_method_index;
+  const DexFile* target_dex_file = target_method.dex_file;
+
+  // Generate the call instruction and save index, dex_file, and type.
+  // NOTE: Method deduplication takes linker patches into account, so we can just pass 0
+  // as a placeholder for the offset.
+  LIR* call = RawLIR(current_dalvik_offset_, kA64Bl1t, 0,
+                     target_method_idx, WrapPointer(const_cast<DexFile*>(target_dex_file)), type);
+  AppendLIR(call);
+  call_method_insns_.push_back(call);
+  return call;
+}
+
+LIR* Arm64Mir2Lir::GenCallInsn(const MirMethodLoweringInfo& method_info) {
+  LIR* call_insn;
+  if (method_info.FastPath() && Arm64UseRelativeCall(cu_, method_info.GetTargetMethod()) &&
+      (method_info.GetSharpType() == kDirect || method_info.GetSharpType() == kStatic) &&
+      method_info.DirectCode() == static_cast<uintptr_t>(-1)) {
+    call_insn = CallWithLinkerFixup(method_info.GetTargetMethod(), method_info.GetSharpType());
+  } else {
+    call_insn = OpReg(kOpBlx, TargetPtrReg(kInvokeTgt));
+  }
+  return call_insn;
+}
+
 }  // namespace art
diff --git a/compiler/dex/quick/arm64/codegen_arm64.h b/compiler/dex/quick/arm64/codegen_arm64.h
index 55cc93842..93d9b34be 100644
--- a/compiler/dex/quick/arm64/codegen_arm64.h
+++ b/compiler/dex/quick/arm64/codegen_arm64.h
@@ -259,6 +259,28 @@ class Arm64Mir2Lir FINAL : public Mir2Lir {
 
   size_t GetInstructionOffset(LIR* lir) OVERRIDE;
 
+  NextCallInsn GetNextSDCallInsn() OVERRIDE;
+
+  /*
+   * @brief Generate a relative call to the method that will be patched at link time.
+   * @param target_method The MethodReference of the method to be invoked.
+   * @param type How the method will be invoked.
+   * @returns Call instruction
+   */
+  LIR* CallWithLinkerFixup(const MethodReference& target_method, InvokeType type);
+
+  /*
+   * @brief Generate the actual call insn based on the method info.
+   * @param method_info the lowering info for the method call.
+   * @returns Call instruction
+   */
+  virtual LIR* GenCallInsn(const MirMethodLoweringInfo& method_info) OVERRIDE;
+
+  /*
+   * @brief Handle ARM64 specific literals.
+   */
+  void InstallLiteralPools() OVERRIDE;
+
   LIR* InvokeTrampoline(OpKind op, RegStorage r_tgt, QuickEntrypointEnum trampoline) OVERRIDE;
 
  private:
@@ -396,6 +418,8 @@ class Arm64Mir2Lir FINAL : public Mir2Lir {
   InToRegStorageMapping in_to_reg_storage_mapping_;
 
   static const A64EncodingMap EncodingMap[kA64Last];
+
+  ArenaVector<LIR*> call_method_insns_;
 };
 
 }  // namespace art
diff --git a/compiler/dex/quick/arm64/target_arm64.cc b/compiler/dex/quick/arm64/target_arm64.cc
index 0462530a3..ba47883d9 100644
--- a/compiler/dex/quick/arm64/target_arm64.cc
+++ b/compiler/dex/quick/arm64/target_arm64.cc
@@ -585,7 +585,8 @@ RegisterClass Arm64Mir2Lir::RegClassForFieldLoadStore(OpSize size, bool is_volat
 }
 
 Arm64Mir2Lir::Arm64Mir2Lir(CompilationUnit* cu, MIRGraph* mir_graph, ArenaAllocator* arena)
-    : Mir2Lir(cu, mir_graph, arena) {
+    : Mir2Lir(cu, mir_graph, arena),
+      call_method_insns_(arena->Adapter()) {
   // Sanity check - make sure encoding map lines up.
   for (int i = 0; i < kA64Last; i++) {
     if (UNWIDE(Arm64Mir2Lir::EncodingMap[i].opcode) != i) {
@@ -1201,4 +1202,21 @@ int Arm64Mir2Lir::GenDalvikArgsRange(CallInfo* info, int call_state,
   return call_state;
 }
 
+void Arm64Mir2Lir::InstallLiteralPools() {
+  // PC-relative calls to methods.
+  patches_.reserve(call_method_insns_.size());
+  for (LIR* p : call_method_insns_) {
+    DCHECK_EQ(p->opcode, kA64Bl1t);
+    uint32_t target_method_idx = p->operands[1];
+    const DexFile* target_dex_file =
+        reinterpret_cast<const DexFile*>(UnwrapPointer(p->operands[2]));
+
+    patches_.push_back(LinkerPatch::RelativeCodePatch(p->offset,
+                                                      target_dex_file, target_method_idx));
+  }
+
+  // And do the normal processing.
+  Mir2Lir::InstallLiteralPools();
+}
+
 }  // namespace art
diff --git a/compiler/dex/quick/gen_invoke.cc b/compiler/dex/quick/gen_invoke.cc
index c308932bc..174e4e0bb 100755
--- a/compiler/dex/quick/gen_invoke.cc
+++ b/compiler/dex/quick/gen_invoke.cc
@@ -495,7 +495,8 @@ static int NextSDCallInsn(CompilationUnit* cu, CallInfo* info,
                           uintptr_t direct_code, uintptr_t direct_method,
                           InvokeType type) {
   DCHECK(cu->instruction_set != kX86 && cu->instruction_set != kX86_64 &&
-         cu->instruction_set != kThumb2 && cu->instruction_set != kArm);
+         cu->instruction_set != kThumb2 && cu->instruction_set != kArm &&
+         cu->instruction_set != kArm64);
   Mir2Lir* cg = static_cast<Mir2Lir*>(cu->cg.get());
   if (direct_code != 0 && direct_method != 0) {
     switch (state) {
@@ -1751,7 +1752,8 @@ NextCallInsn Mir2Lir::GetNextSDCallInsn() {
 
 LIR* Mir2Lir::GenCallInsn(const MirMethodLoweringInfo& method_info) {
   DCHECK(cu_->instruction_set != kX86 && cu_->instruction_set != kX86_64 &&
-         cu_->instruction_set != kThumb2 && cu_->instruction_set != kArm);
+         cu_->instruction_set != kThumb2 && cu_->instruction_set != kArm &&
+         cu_->instruction_set != kArm64);
   return OpReg(kOpBlx, TargetPtrReg(kInvokeTgt));
 }
 
diff --git a/compiler/oat_writer.cc b/compiler/oat_writer.cc
index dd64368ab..e64d2ab27 100644
--- a/compiler/oat_writer.cc
+++ b/compiler/oat_writer.cc
@@ -38,6 +38,7 @@
 #include "scoped_thread_state_change.h"
 #include "handle_scope-inl.h"
 #include "utils/arm/assembler_thumb2.h"
+#include "utils/arm64/assembler_arm64.h"
 #include "verifier/method_verifier.h"
 
 namespace art {
@@ -117,10 +118,14 @@ class OatWriter::X86RelativeCallPatcher FINAL : public RelativeCallPatcher {
   DISALLOW_COPY_AND_ASSIGN(X86RelativeCallPatcher);
 };
 
-class OatWriter::Thumb2RelativeCallPatcher FINAL : public RelativeCallPatcher {
+class OatWriter::ArmBaseRelativeCallPatcher : public RelativeCallPatcher {
  public:
-  explicit Thumb2RelativeCallPatcher(OatWriter* writer)
-      : writer_(writer), thunk_code_(CompileThunkCode()),
+  ArmBaseRelativeCallPatcher(OatWriter* writer,
+                             InstructionSet instruction_set, std::vector<uint8_t> thunk_code,
+                             uint32_t max_positive_displacement, uint32_t max_negative_displacement)
+      : writer_(writer), instruction_set_(instruction_set), thunk_code_(thunk_code),
+        max_positive_displacement_(max_positive_displacement),
+        max_negative_displacement_(max_negative_displacement),
         thunk_locations_(), current_thunk_to_write_(0u), unprocessed_patches_() {
   }
 
@@ -130,11 +135,11 @@ class OatWriter::Thumb2RelativeCallPatcher FINAL : public RelativeCallPatcher {
     // of code. To avoid any alignment discrepancies for the final chunk, we always align the
     // offset after reserving or writing any chunk.
     if (UNLIKELY(compiled_method == nullptr)) {
-      uint32_t aligned_offset = CompiledMethod::AlignCode(offset, kThumb2);
+      uint32_t aligned_offset = CompiledMethod::AlignCode(offset, instruction_set_);
       bool needs_thunk = ReserveSpaceProcessPatches(aligned_offset);
       if (needs_thunk) {
         thunk_locations_.push_back(aligned_offset);
-        offset = CompiledMethod::AlignCode(aligned_offset + thunk_code_.size(), kThumb2);
+        offset = CompiledMethod::AlignCode(aligned_offset + thunk_code_.size(), instruction_set_);
       }
       return offset;
     }
@@ -143,14 +148,14 @@
     uint32_t quick_code_offset = compiled_method->AlignCode(offset) + sizeof(OatQuickMethodHeader);
     uint32_t next_aligned_offset = compiled_method->AlignCode(quick_code_offset + quick_code_size);
     if (!unprocessed_patches_.empty() &&
-        next_aligned_offset - unprocessed_patches_.front().second > kMaxPositiveDisplacement) {
+        next_aligned_offset - unprocessed_patches_.front().second > max_positive_displacement_) {
       bool needs_thunk = ReserveSpaceProcessPatches(next_aligned_offset);
       if (needs_thunk) {
         // A single thunk will cover all pending patches.
         unprocessed_patches_.clear();
         uint32_t thunk_location = compiled_method->AlignCode(offset);
         thunk_locations_.push_back(thunk_location);
-        offset = CompiledMethod::AlignCode(thunk_location + thunk_code_.size(), kThumb2);
+        offset = CompiledMethod::AlignCode(thunk_location + thunk_code_.size(), instruction_set_);
       }
     }
     for (const LinkerPatch& patch : compiled_method->GetPatches()) {
@@ -166,7 +171,7 @@
     if (current_thunk_to_write_ == thunk_locations_.size()) {
       return offset;
     }
-    uint32_t aligned_offset = CompiledMethod::AlignCode(offset, kThumb2);
+    uint32_t aligned_offset = CompiledMethod::AlignCode(offset, instruction_set_);
     if (UNLIKELY(aligned_offset == thunk_locations_[current_thunk_to_write_])) {
       ++current_thunk_to_write_;
       uint32_t aligned_code_delta = aligned_offset - offset;
@@ -179,7 +184,7 @@
       writer_->size_relative_call_thunks_ += thunk_code_.size();
       uint32_t thunk_end_offset = aligned_offset + thunk_code_.size();
       // Align after writing chunk, see the ReserveSpace() above.
-      offset = CompiledMethod::AlignCode(thunk_end_offset, kThumb2);
+      offset = CompiledMethod::AlignCode(thunk_end_offset, instruction_set_);
       aligned_code_delta = offset - thunk_end_offset;
       if (aligned_code_delta != 0u && !writer_->WriteCodeAlignment(out, aligned_code_delta)) {
         return 0u;
       }
     }
     return offset;
   }
 
@@ -188,52 +193,27 @@
-  void Patch(std::vector<uint8_t>* code, uint32_t literal_offset, uint32_t patch_offset,
-             uint32_t target_offset) OVERRIDE {
-    DCHECK_LE(literal_offset + 4u, code->size());
-    DCHECK_EQ(literal_offset & 1u, 0u);
-    DCHECK_EQ(patch_offset & 1u, 0u);
-    DCHECK_EQ(target_offset & 1u, 1u);  // Thumb2 mode bit.
+ protected:
+  uint32_t CalculateDisplacement(uint32_t patch_offset, uint32_t target_offset) {
     // Unsigned arithmetic with its well-defined overflow behavior is just fine here.
-    uint32_t displacement = target_offset - 1u - patch_offset;
+    uint32_t displacement = target_offset - patch_offset;
     // NOTE: With unsigned arithmetic we do mean to use && rather than || below.
-    if (displacement > kMaxPositiveDisplacement && displacement < -kMaxNegativeDisplacement) {
+    if (displacement > max_positive_displacement_ && displacement < -max_negative_displacement_) {
       // Unwritten thunks have higher offsets, check if it's within range.
       DCHECK(current_thunk_to_write_ == thunk_locations_.size() ||
              thunk_locations_[current_thunk_to_write_] > patch_offset);
       if (current_thunk_to_write_ != thunk_locations_.size() &&
-          thunk_locations_[current_thunk_to_write_] - patch_offset < kMaxPositiveDisplacement) {
+          thunk_locations_[current_thunk_to_write_] - patch_offset < max_positive_displacement_) {
         displacement = thunk_locations_[current_thunk_to_write_] - patch_offset;
       } else {
         // We must have a previous thunk then.
         DCHECK_NE(current_thunk_to_write_, 0u);
         DCHECK_LT(thunk_locations_[current_thunk_to_write_ - 1], patch_offset);
         displacement = thunk_locations_[current_thunk_to_write_ - 1] - patch_offset;
-        DCHECK(displacement >= -kMaxNegativeDisplacement);
+        DCHECK(displacement >= -max_negative_displacement_);
       }
     }
-    displacement -= kPcDisplacement;  // The base PC is at the end of the 4-byte patch.
-    DCHECK_EQ(displacement & 1u, 0u);
-    DCHECK((displacement >> 24) == 0u || (displacement >> 24) == 255u);  // 25-bit signed.
-    uint32_t signbit = (displacement >> 31) & 0x1;
-    uint32_t i1 = (displacement >> 23) & 0x1;
-    uint32_t i2 = (displacement >> 22) & 0x1;
-    uint32_t imm10 = (displacement >> 12) & 0x03ff;
-    uint32_t imm11 = (displacement >> 1) & 0x07ff;
-    uint32_t j1 = i1 ^ (signbit ^ 1);
-    uint32_t j2 = i2 ^ (signbit ^ 1);
-    uint32_t value = (signbit << 26) | (j1 << 13) | (j2 << 11) | (imm10 << 16) | imm11;
-    value |= 0xf000d000;  // BL
-
-    uint8_t* addr = &(*code)[literal_offset];
-    // Check that we're just overwriting an existing BL.
-    DCHECK_EQ(addr[1] & 0xf8, 0xf0);
-    DCHECK_EQ(addr[3] & 0xd0, 0xd0);
-    // Write the new BL.
-    addr[0] = (value >> 16) & 0xff;
-    addr[1] = (value >> 24) & 0xff;
-    addr[2] = (value >> 0) & 0xff;
-    addr[3] = (value >> 8) & 0xff;
+    return displacement;
   }
 
  private:
@@ -246,18 +226,18 @@ class OatWriter::Thumb2RelativeCallPatcher FINAL : public RelativeCallPatcher {
       // If still unresolved, check if we have a thunk within range.
       DCHECK(thunk_locations_.empty() || thunk_locations_.back() <= patch_offset);
       if (thunk_locations_.empty() ||
-          patch_offset - thunk_locations_.back() > kMaxNegativeDisplacement) {
-        return next_aligned_offset - patch_offset > kMaxPositiveDisplacement;
+          patch_offset - thunk_locations_.back() > max_negative_displacement_) {
+        return next_aligned_offset - patch_offset > max_positive_displacement_;
       }
     } else if (it->second >= patch_offset) {
-      DCHECK_LE(it->second - patch_offset, kMaxPositiveDisplacement);
+      DCHECK_LE(it->second - patch_offset, max_positive_displacement_);
     } else {
       // When calling back, check if we have a thunk that's closer than the actual target.
       uint32_t target_offset = (thunk_locations_.empty() || it->second > thunk_locations_.back()) ?
          it->second : thunk_locations_.back();
       DCHECK_GT(patch_offset, target_offset);
-      if (patch_offset - target_offset > kMaxNegativeDisplacement) {
+      if (patch_offset - target_offset > max_negative_displacement_) {
         return true;
       }
     }
@@ -266,6 +246,60 @@
     return false;
   }
 
+  OatWriter* const writer_;
+  const InstructionSet instruction_set_;
+  const std::vector<uint8_t> thunk_code_;
+  const uint32_t max_positive_displacement_;
+  const uint32_t max_negative_displacement_;
+  std::vector<uint32_t> thunk_locations_;
+  size_t current_thunk_to_write_;
+
+  // ReserveSpace() tracks unprocessed patches.
+  typedef std::pair<MethodReference, uint32_t> UnprocessedPatch;
+  std::deque<UnprocessedPatch> unprocessed_patches_;
+
+  DISALLOW_COPY_AND_ASSIGN(ArmBaseRelativeCallPatcher);
+};
+
+class OatWriter::Thumb2RelativeCallPatcher FINAL : public ArmBaseRelativeCallPatcher {
+ public:
+  explicit Thumb2RelativeCallPatcher(OatWriter* writer)
+      : ArmBaseRelativeCallPatcher(writer, kThumb2, CompileThunkCode(),
+                                   kMaxPositiveDisplacement, kMaxNegativeDisplacement) {
+  }
+
+  void Patch(std::vector<uint8_t>* code, uint32_t literal_offset, uint32_t patch_offset,
+             uint32_t target_offset) OVERRIDE {
+    DCHECK_LE(literal_offset + 4u, code->size());
+    DCHECK_EQ(literal_offset & 1u, 0u);
+    DCHECK_EQ(patch_offset & 1u, 0u);
+    DCHECK_EQ(target_offset & 1u, 1u);  // Thumb2 mode bit.
+    uint32_t displacement = CalculateDisplacement(patch_offset, target_offset & ~1u);
+    displacement -= kPcDisplacement;  // The base PC is at the end of the 4-byte patch.
+    DCHECK_EQ(displacement & 1u, 0u);
+    DCHECK((displacement >> 24) == 0u || (displacement >> 24) == 255u);  // 25-bit signed.
+    uint32_t signbit = (displacement >> 31) & 0x1;
+    uint32_t i1 = (displacement >> 23) & 0x1;
+    uint32_t i2 = (displacement >> 22) & 0x1;
+    uint32_t imm10 = (displacement >> 12) & 0x03ff;
+    uint32_t imm11 = (displacement >> 1) & 0x07ff;
+    uint32_t j1 = i1 ^ (signbit ^ 1);
+    uint32_t j2 = i2 ^ (signbit ^ 1);
+    uint32_t value = (signbit << 26) | (j1 << 13) | (j2 << 11) | (imm10 << 16) | imm11;
+    value |= 0xf000d000;  // BL
+
+    uint8_t* addr = &(*code)[literal_offset];
+    // Check that we're just overwriting an existing BL.
+    DCHECK_EQ(addr[1] & 0xf8, 0xf0);
+    DCHECK_EQ(addr[3] & 0xd0, 0xd0);
+    // Write the new BL.
+    addr[0] = (value >> 16) & 0xff;
+    addr[1] = (value >> 24) & 0xff;
+    addr[2] = (value >> 0) & 0xff;
+    addr[3] = (value >> 8) & 0xff;
+  }
+
+ private:
   static std::vector<uint8_t> CompileThunkCode() {
     // The thunk just uses the entry point in the ArtMethod. This works even for calls
     // to the generic JNI and interpreter trampolines.
@@ -289,16 +323,58 @@ class OatWriter::Thumb2RelativeCallPatcher FINAL : public RelativeCallPatcher {
   static constexpr uint32_t kMaxPositiveDisplacement = (1u << 24) - 2 + kPcDisplacement;
   static constexpr uint32_t kMaxNegativeDisplacement = (1u << 24) - kPcDisplacement;
 
-  OatWriter* const writer_;
-  const std::vector<uint8_t> thunk_code_;
-  std::vector<uint32_t> thunk_locations_;
-  size_t current_thunk_to_write_;
+  DISALLOW_COPY_AND_ASSIGN(Thumb2RelativeCallPatcher);
+};
 
-  // ReserveSpace() tracks unprocessed patches.
-  typedef std::pair<MethodReference, uint32_t> UnprocessedPatch;
-  std::deque<UnprocessedPatch> unprocessed_patches_;
+class OatWriter::Arm64RelativeCallPatcher FINAL : public ArmBaseRelativeCallPatcher {
+ public:
+  explicit Arm64RelativeCallPatcher(OatWriter* writer)
+      : ArmBaseRelativeCallPatcher(writer, kArm64, CompileThunkCode(),
+                                   kMaxPositiveDisplacement, kMaxNegativeDisplacement) {
+  }
 
-  DISALLOW_COPY_AND_ASSIGN(Thumb2RelativeCallPatcher);
+  void Patch(std::vector<uint8_t>* code, uint32_t literal_offset, uint32_t patch_offset,
+             uint32_t target_offset) OVERRIDE {
+    DCHECK_LE(literal_offset + 4u, code->size());
+    DCHECK_EQ(literal_offset & 3u, 0u);
+    DCHECK_EQ(patch_offset & 3u, 0u);
+    DCHECK_EQ(target_offset & 3u, 0u);
+    uint32_t displacement = CalculateDisplacement(patch_offset, target_offset & ~1u);
+    DCHECK_EQ(displacement & 3u, 0u);
+    DCHECK((displacement >> 27) == 0u || (displacement >> 27) == 31u);  // 28-bit signed.
+    uint32_t value = (displacement & 0x0fffffffu) >> 2;
+    value |= 0x94000000;  // BL
+
+    uint8_t* addr = &(*code)[literal_offset];
+    // Check that we're just overwriting an existing BL.
+    DCHECK_EQ(addr[3] & 0xfc, 0x94);
+    // Write the new BL.
+    addr[0] = (value >> 0) & 0xff;
+    addr[1] = (value >> 8) & 0xff;
+    addr[2] = (value >> 16) & 0xff;
+    addr[3] = (value >> 24) & 0xff;
+  }
+
+ private:
+  static std::vector<uint8_t> CompileThunkCode() {
+    // The thunk just uses the entry point in the ArtMethod. This works even for calls
+    // to the generic JNI and interpreter trampolines.
+    arm64::Arm64Assembler assembler;
+    Offset offset(mirror::ArtMethod::EntryPointFromQuickCompiledCodeOffset().Int32Value());
+    assembler.JumpTo(ManagedRegister(arm64::X0), offset, ManagedRegister(arm64::IP0));
+    std::vector<uint8_t> thunk_code(assembler.CodeSize());
+    MemoryRegion code(thunk_code.data(), thunk_code.size());
+    assembler.FinalizeInstructions(code);
+    return thunk_code;
+  }
+
+  // Maximum positive and negative displacement measured from the patch location.
+  // (Signed 28 bit displacement with the last bit 0 has range [-2^27, 2^27-4] measured from
+  // the ARM64 PC pointing to the BL.)
+  static constexpr uint32_t kMaxPositiveDisplacement = (1u << 27) - 4u;
+  static constexpr uint32_t kMaxNegativeDisplacement = (1u << 27);
+
+  DISALLOW_COPY_AND_ASSIGN(Arm64RelativeCallPatcher);
 };
 
 #define DCHECK_OFFSET() \
@@ -373,7 +449,8 @@ OatWriter::OatWriter(const std::vector<const DexFile*>& dex_files,
       relative_call_patcher_.reset(new Thumb2RelativeCallPatcher(this));
       break;
     case kArm64:
-      // TODO: Implement relative calls for arm64.
+      relative_call_patcher_.reset(new Arm64RelativeCallPatcher(this));
+      break;
     default:
       relative_call_patcher_.reset(new NoRelativeCallPatcher);
       break;
@@ -868,8 +945,8 @@ class OatWriter::WriteCodeMethodVisitor : public OatDexMethodVisitor {
     : OatDexMethodVisitor(writer, relative_offset),
       out_(out),
       file_offset_(file_offset),
-      self_(Thread::Current()),
-      old_no_thread_suspension_cause_(self_->StartAssertNoThreadSuspension("OatWriter patching")),
+      soa_(Thread::Current()),
+      no_thread_suspension_(soa_.Self(), "OatWriter patching"),
       class_linker_(Runtime::Current()->GetClassLinker()),
       dex_cache_(nullptr) {
     if (writer_->image_writer_ != nullptr) {
@@ -877,12 +954,9 @@ class OatWriter::WriteCodeMethodVisitor : public OatDexMethodVisitor {
       CHECK(writer_->image_writer_->IsImageAddressSpaceReady());
       patched_code_.reserve(16 * KB);
     }
-    self_->TransitionFromSuspendedToRunnable();
   }
 
   ~WriteCodeMethodVisitor() UNLOCK_FUNCTION(Locks::mutator_lock_) {
-    self_->EndAssertNoThreadSuspension(old_no_thread_suspension_cause_);
-    self_->TransitionFromRunnableToSuspended(kNative);
   }
 
   bool StartClass(const DexFile* dex_file, size_t class_def_index)
@@ -997,9 +1071,9 @@ class OatWriter::WriteCodeMethodVisitor : public OatDexMethodVisitor {
 
  private:
   OutputStream* const out_;
-  size_t const file_offset_;
-  Thread* const self_;
-  const char* const old_no_thread_suspension_cause_;  // TODO: Use ScopedAssertNoThreadSuspension.
+  const size_t file_offset_;
+  const ScopedObjectAccess soa_;
+  const ScopedAssertNoThreadSuspension no_thread_suspension_;
   ClassLinker* const class_linker_;
   mirror::DexCache* dex_cache_;
   std::vector<uint8_t> patched_code_;
diff --git a/compiler/oat_writer.h b/compiler/oat_writer.h
index 5545ba817..a1e61b936 100644
--- a/compiler/oat_writer.h
+++ b/compiler/oat_writer.h
@@ -330,7 +330,9 @@ class OatWriter {
   class RelativeCallPatcher;
   class NoRelativeCallPatcher;
   class X86RelativeCallPatcher;
+  class ArmBaseRelativeCallPatcher;
   class Thumb2RelativeCallPatcher;
+  class Arm64RelativeCallPatcher;
 
   std::unique_ptr<RelativeCallPatcher> relative_call_patcher_;
-- 
2.11.0
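
Sanity check for the new A64 BL patching above: the 26-bit immediate written by Arm64RelativeCallPatcher::Patch() is a word (4-byte) offset, which is where the [-2^27, 2^27-4] byte range of kMaxPositiveDisplacement/kMaxNegativeDisplacement comes from. The standalone sketch below round-trips that encoding; EncodeBl/DecodeBl are hypothetical helper names for illustration only, not part of this patch or of ART.

// encode_bl_sketch.cc - round-trips the A64 BL encoding used by
// Arm64RelativeCallPatcher::Patch(). Hypothetical helpers, not ART code.
#include <cassert>
#include <cstdint>

uint32_t EncodeBl(int32_t displacement) {
  assert((displacement & 3) == 0);  // BL targets are 4-byte aligned.
  assert(displacement >= -(1 << 27) && displacement <= (1 << 27) - 4);  // 28-bit signed range.
  uint32_t imm26 = (static_cast<uint32_t>(displacement) & 0x0fffffffu) >> 2;
  return 0x94000000u | imm26;  // BL opcode in bits [31:26].
}

int32_t DecodeBl(uint32_t insn) {
  assert((insn & 0xfc000000u) == 0x94000000u);  // Must be a BL.
  // Sign-extend the 26-bit immediate (two's complement assumed).
  int32_t imm26 = static_cast<int32_t>(insn << 6) >> 6;
  return imm26 * 4;  // Word offset back to byte offset.
}

int main() {
  for (int32_t d : {0, 4, -4, (1 << 27) - 4, -(1 << 27)}) {
    assert(DecodeBl(EncodeBl(d)) == d);
  }
  return 0;
}

This also shows why ArmBaseRelativeCallPatcher reserves thunks: a displacement outside that range simply does not fit in the instruction, so the call must be redirected through a nearby thunk instead.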