From 6999d84e2c55dc4a46a6c311b55bd5811336d9c4 Mon Sep 17 00:00:00 2001
From: Ben Cheng
Date: Tue, 26 Jan 2010 16:46:15 -0800
Subject: [PATCH] Fix performance issues related to chaining and unchaining.

1) Patching requests for predicted chaining cells (used by virtual/interface
methods) are now batched in a queue and processed when the VM is paused for
GC.

2) When the code cache is full, the reset operation is also conducted at the
end of a GC pause, which entirely eliminates the need for the compiler thread
to issue suspend-all requests. This is a very rare event, and when it happens
it takes less than 5 ms to finish.

3) Change the initial value of the branch in a predicted chaining cell from 0
(i.e. lsl r0, r0, #0) to 0xe7fe (i.e. branch to self) so that initializing a
predicted chaining cell doesn't need to suspend all threads. Together with 1),
this yields a 20% speedup on some benchmarks.

4) Add TestCompability.c, where defining "TEST_VM_IN_ECLAIR := true" in
buildspec.mk activates dummy symbols needed to run libdvm.so in older
releases.

Bug: 2397689
Bug: 2396513
Bug: 2331313
---
 vm/Dvm.mk                               |   5 ++
 vm/Globals.h                            |  20 ++++-
 vm/TestCompability.c                    |  26 ++++++
 vm/Thread.c                             |  11 ++-
 vm/alloc/Heap.c                         |  16 +++-
 vm/compiler/Compiler.c                  | 111 ++++++++++++------------
 vm/compiler/Compiler.h                  |  16 +++-
 vm/compiler/codegen/CompilerCodegen.h   |   1 +
 vm/compiler/codegen/arm/ArmLIR.h        |  13 +--
 vm/compiler/codegen/arm/Assemble.c      | 148 ++++++++++++++++++++++----------
 vm/compiler/codegen/arm/CodegenDriver.c |   6 +-
 vm/interp/Jit.c                         |  23 +++--
 vm/mterp/armv5te/footer.S               |   3 +
 vm/mterp/common/asm-constants.h         |   6 +-
 vm/mterp/out/InterpAsm-armv4t.S         |   3 +
 vm/mterp/out/InterpAsm-armv5te-vfp.S    |   3 +
 vm/mterp/out/InterpAsm-armv5te.S        |   3 +
 vm/mterp/out/InterpAsm-armv7-a.S        |   3 +
 18 files changed, 281 insertions(+), 136 deletions(-)
 create mode 100644 vm/TestCompability.c

diff --git a/vm/Dvm.mk b/vm/Dvm.mk
index 3f5c6d532..2a03b0941 100644
--- a/vm/Dvm.mk
+++ b/vm/Dvm.mk
@@ -123,6 +123,7 @@ LOCAL_SRC_FILES := \
 	SignalCatcher.c \
 	StdioConverter.c \
 	Sync.c \
+	TestCompability.c \
 	Thread.c \
 	UtfString.c \
 	alloc/clz.c.arm \
@@ -330,3 +331,7 @@ ifeq ($(MTERP_ARCH_KNOWN),false)
   LOCAL_CFLAGS += -DdvmAsmInstructionStart=0 -DdvmAsmInstructionEnd=0 \
                   -DdvmAsmSisterStart=0 -DdvmAsmSisterEnd=0 -DDVM_NO_ASM_INTERP=1
 endif
+
+ifeq ($(TEST_VM_IN_ECLAIR),true)
+  LOCAL_CFLAGS += -DTEST_VM_IN_ECLAIR
+endif
diff --git a/vm/Globals.h b/vm/Globals.h
index f2dd86a67..5642e6f20 100644
--- a/vm/Globals.h
+++ b/vm/Globals.h
@@ -706,13 +706,14 @@ struct DvmJitGlobals {
     bool blockingMode;
     pthread_t compilerHandle;
     pthread_mutex_t compilerLock;
+    pthread_mutex_t compilerICPatchLock;
     pthread_cond_t compilerQueueActivity;
     pthread_cond_t compilerQueueEmpty;
     int compilerQueueLength;
     int compilerHighWater;
     int compilerWorkEnqueueIndex;
    int compilerWorkDequeueIndex;
-    CompilerWorkOrder compilerWorkQueue[COMPILER_WORK_QUEUE_SIZE];
+    int compilerICPatchIndex;

     /* JIT internal stats */
     int compilerMaxQueued;
@@ -742,12 +743,12 @@ struct DvmJitGlobals {
     /* Flag to indicate that the code cache is full */
     bool codeCacheFull;

-    /* Delay count for the next code cache reset request */
-    int delayCodeCacheReset;
-
     /* Number of times that the code cache has been reset */
     int numCodeCacheReset;

+    /* Number of times that the code cache reset request has been delayed */
+    int numCodeCacheResetDelayed;
+
     /* true/false: compile/reject opcodes specified in the -Xjitop list */
     bool includeSelectedOp;

@@ -778,10 +779,21 @@ struct DvmJitGlobals {
     /* Filter method compilation blacklist with call-graph information */
     bool checkCallGraph;

+    /* New translation chain has been set up */
+    volatile bool hasNewChain;
+
 #if defined(WITH_SELF_VERIFICATION)
     /* Spin when error is detected, volatile so GDB can reset it */
     volatile bool selfVerificationSpin;
 #endif
+
+    /* Place arrays at the end to ease display in gdb sessions */
+
+    /* Work order queue for compilations */
+    CompilerWorkOrder compilerWorkQueue[COMPILER_WORK_QUEUE_SIZE];
+
+    /* Work order queue for predicted chain patching */
+    ICPatchWorkOrder compilerICPatchQueue[COMPILER_IC_PATCH_QUEUE_SIZE];
 };

 extern struct DvmJitGlobals gDvmJit;
diff --git a/vm/TestCompability.c b/vm/TestCompability.c
new file mode 100644
index 000000000..d447db380
--- /dev/null
+++ b/vm/TestCompability.c
@@ -0,0 +1,26 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Dalvik.h"
+
+#if defined(TEST_VM_IN_ECLAIR)
+FILE *open_memstream(char **ptr, size_t *sizeloc)
+{
+    LOGE("Fake open_memstream entered");
+    dvmAbort();
+    return NULL;
+}
+#endif
diff --git a/vm/Thread.c b/vm/Thread.c
index 9e95d884a..79c3e5710 100644
--- a/vm/Thread.c
+++ b/vm/Thread.c
@@ -2581,11 +2581,14 @@ static void waitForThreadSuspend(Thread* self, Thread* thread)
 #if defined (WITH_JIT)
         /*
-         * If we're still waiting after the first timeout,
-         * unchain all translations.
+         * If we're still waiting after the first timeout, unchain all
+         * translations only if:
+         *   1) new chains have been formed since the last unchain, and
+         *   2) the top VM frame of the running thread is running JIT'ed code
          */
-        if (gDvmJit.pJitEntryTable && retryCount > 0) {
-            LOGD("JIT unchain all attempt #%d",retryCount);
+        if (gDvmJit.pJitEntryTable && retryCount > 0 &&
+            gDvmJit.hasNewChain && thread->inJitCodeCache) {
+            LOGD("JIT unchain all for tid %d", thread->threadId);
             dvmJitUnchainAll();
         }
 #endif
diff --git a/vm/alloc/Heap.c b/vm/alloc/Heap.c
index 477481934..dce5a82da 100644
--- a/vm/alloc/Heap.c
+++ b/vm/alloc/Heap.c
@@ -644,7 +644,7 @@ alloc_succeeded:
             dvmAddTrackedAlloc(ptr, NULL);
         }
     } else {
-        /* 
+        /*
          * The allocation failed; throw an OutOfMemoryError.
          */
         throwOOME();
@@ -1032,6 +1032,20 @@ void dvmCollectGarbageInternal(bool collectSoftReferences, enum GcReason reason)
     dvmUnlockMutex(&gDvm.heapWorkerListLock);
     dvmUnlockMutex(&gDvm.heapWorkerLock);

+#if defined(WITH_JIT)
+    extern void dvmCompilerPerformSafePointChecks(void);
+
+    /*
+     * Patching a chaining cell is very cheap, as it only updates 4 words.
+     * It's the overhead of stopping all threads and synchronizing the I/D
+     * cache that makes it expensive.
+     *
+     * Therefore we batch those work orders in a queue and go through them
+     * when threads are suspended for GC.
+     */
+    dvmCompilerPerformSafePointChecks();
+#endif
+
     dvmResumeAllThreads(SUSPEND_FOR_GC);
     if (oldThreadPriority != kInvalidPriority) {
         if (setpriority(PRIO_PROCESS, 0, oldThreadPriority) != 0) {
diff --git a/vm/compiler/Compiler.c b/vm/compiler/Compiler.c
index 9ed3a0567..adcc16e56 100644
--- a/vm/compiler/Compiler.c
+++ b/vm/compiler/Compiler.c
@@ -66,11 +66,10 @@ bool dvmCompilerWorkEnqueue(const u2 *pc, WorkOrderKind kind, void* info)
     }

     /*
-     * Return if queue is full.
-     * If the code cache is full, we will allow the work order to be added and
-     * we use that to trigger code cache reset.
+     * Return if the queue or the code cache is full.
      */
-    if (gDvmJit.compilerQueueLength == COMPILER_WORK_QUEUE_SIZE) {
+    if (gDvmJit.compilerQueueLength == COMPILER_WORK_QUEUE_SIZE ||
+        gDvmJit.codeCacheFull == true) {
         result = false;
         goto unlockAndExit;
     }
@@ -94,8 +93,7 @@ bool dvmCompilerWorkEnqueue(const u2 *pc, WorkOrderKind kind, void* info)
     newOrder->info = info;
     newOrder->result.codeAddress = NULL;
     newOrder->result.discardResult =
-        (kind == kWorkOrderTraceDebug || kind == kWorkOrderICPatch) ?
-        true : false;
+        (kind == kWorkOrderTraceDebug) ? true : false;
     newOrder->result.requestingThread = dvmThreadSelf();

     gDvmJit.compilerWorkEnqueueIndex++;
@@ -136,8 +134,8 @@ bool dvmCompilerSetupCodeCache(void)
         return false;
     }

-    // For debugging only
-    // LOGD("Code cache starts at %p", gDvmJit.codeCache);
+    // STOPSHIP - for debugging only
+    LOGD("Code cache starts at %p", gDvmJit.codeCache);

     /* Copy the template code into the beginning of the code cache */
     int templateSize = (intptr_t) dmvCompilerTemplateEnd -
@@ -201,43 +199,38 @@ static void resetCodeCache(void)
     Thread* thread;
     u8 startTime = dvmGetRelativeTimeUsec();
     int inJit = 0;
-
-    LOGD("Reset the JIT code cache (%d bytes used / %d time(s))",
-         gDvmJit.codeCacheByteUsed, ++gDvmJit.numCodeCacheReset);
-
-    /* Stop the world */
-    dvmSuspendAllThreads(SUSPEND_FOR_CC_RESET);
+    int byteUsed = gDvmJit.codeCacheByteUsed;

     /* If any thread is found stuck in the JIT state, don't reset the cache */
     for (thread = gDvm.threadList; thread != NULL; thread = thread->next) {
+        /*
+         * Crawl the stack to wipe out the returnAddr field so that
+         * 1) the soon-to-be-deleted code in the JIT cache won't be used, and
+         * 2) any thread stuck in JIT land will soon return
+         *    to the interpreter land
+         */
+        crawlDalvikStack(thread, false);
         if (thread->inJitCodeCache) {
             inJit++;
-            /*
-             * STOPSHIP
-             * Change the verbose mode to false after the new code receives
-             * more QA love.
-             */
-            crawlDalvikStack(thread, true);
         }
     }

     if (inJit) {
-        /* Wait a while for the busy threads to rest and try again */
-        gDvmJit.delayCodeCacheReset = 256;
-        goto done;
+        LOGD("JIT code cache reset delayed (%d bytes %d/%d)",
+             gDvmJit.codeCacheByteUsed, gDvmJit.numCodeCacheReset,
+             ++gDvmJit.numCodeCacheResetDelayed);
+        return;
     }

-    /* Drain the work queue to free the work order */
+    /* Lock the mutex to clean up the work queue */
+    dvmLockMutex(&gDvmJit.compilerLock);
+
+    /* Drain the work queue to free the work orders */
     while (workQueueLength()) {
         CompilerWorkOrder work = workDequeue();
         free(work.info);
     }

-    /* Wipe out the returnAddr field that soon will point to stale code */
-    for (thread = gDvm.threadList; thread != NULL; thread = thread->next) {
-        crawlDalvikStack(thread, false);
-    }
-
     /* Reset the JitEntry table contents to the initial unpopulated state */
     dvmJitResetTable();

@@ -261,15 +254,35 @@ static void resetCodeCache(void)
     gDvmJit.compilerWorkEnqueueIndex = gDvmJit.compilerWorkDequeueIndex = 0;
     gDvmJit.compilerQueueLength = 0;

+    /* Reset the IC patch work queue */
+    dvmLockMutex(&gDvmJit.compilerICPatchLock);
+    gDvmJit.compilerICPatchIndex = 0;
+    dvmUnlockMutex(&gDvmJit.compilerICPatchLock);
+
     /* All clear now */
     gDvmJit.codeCacheFull = false;

-    LOGD("Code cache reset takes %lld usec",
-         dvmGetRelativeTimeUsec() - startTime);
+    dvmUnlockMutex(&gDvmJit.compilerLock);

-done:
-    /* Resume all threads */
-    dvmResumeAllThreads(SUSPEND_FOR_CC_RESET);
+    LOGD("JIT code cache reset in %lld ms (%d bytes %d/%d)",
+         (dvmGetRelativeTimeUsec() - startTime) / 1000,
+         byteUsed, ++gDvmJit.numCodeCacheReset,
+         gDvmJit.numCodeCacheResetDelayed);
+}
+
+/*
+ * Perform actions that are only safe when all threads are suspended. Currently
+ * we do:
+ * 1) Check if the code cache is full. If so, reset it and restart populating
+ *    it from scratch.
+ * 2) Patch predicted chaining cells by consuming recorded work orders.
+ */
+void dvmCompilerPerformSafePointChecks(void)
+{
+    if (gDvmJit.codeCacheFull) {
+        resetCodeCache();
+    }
+    dvmCompilerPatchInlineCache();
 }

 bool compilerThreadStartup(void)
@@ -410,7 +423,6 @@ static void *compilerThreadStart(void *arg)
                 continue;
             } else {
                 do {
-                    bool resizeFail = false;
                     CompilerWorkOrder work = workDequeue();
                     dvmUnlockMutex(&gDvmJit.compilerLock);
                     /*
@@ -421,11 +433,17 @@ static void *compilerThreadStart(void *arg)
                     /* Is JitTable filling up? */
                     if (gDvmJit.jitTableEntriesUsed >
                         (gDvmJit.jitTableSize - gDvmJit.jitTableSize/4)) {
-                        resizeFail = dvmJitResizeJitTable(gDvmJit.jitTableSize * 2);
+                        bool resizeFail =
+                            dvmJitResizeJitTable(gDvmJit.jitTableSize * 2);
+                        /*
+                         * If the jit table is full, consider it time to reset
+                         * the code cache too.
+                         */
+                        gDvmJit.codeCacheFull |= resizeFail;
                     }
                     if (gDvmJit.haltCompilerThread) {
                         LOGD("Compiler shutdown in progress - discarding request");
-                    } else if (!resizeFail) {
+                    } else if (!gDvmJit.codeCacheFull) {
                         /* If compilation failed, use interpret-template */
                         if (!dvmCompilerDoWork(&work)) {
                             work.result.codeAddress = gDvmJit.interpretTemplate;
@@ -437,24 +455,6 @@ static void *compilerThreadStart(void *arg)
                     }
                     free(work.info);
                     dvmLockMutex(&gDvmJit.compilerLock);
-
-                    /*
-                     * FIXME - temporarily disable code cache reset until
-                     * stale code stops leaking.
-                     */
-#if 0
-                    if (gDvmJit.codeCacheFull == true || resizeFail) {
-                        if (gDvmJit.delayCodeCacheReset == 0) {
-                            resetCodeCache();
-                            assert(workQueueLength() == 0 ||
-                                   gDvmJit.delayCodeCacheReset != 0);
-                        } else {
-                            LOGD("Delay the next %d tries to reset code cache",
-                                 gDvmJit.delayCodeCacheReset);
-                            gDvmJit.delayCodeCacheReset--;
-                        }
-                    }
-#endif
                 } while (workQueueLength() != 0);
             }
         }
@@ -477,6 +477,7 @@ bool dvmCompilerStartup(void)
 {
     dvmInitMutex(&gDvmJit.compilerLock);
+    dvmInitMutex(&gDvmJit.compilerICPatchLock);
     dvmLockMutex(&gDvmJit.compilerLock);
     pthread_cond_init(&gDvmJit.compilerQueueActivity, NULL);
     pthread_cond_init(&gDvmJit.compilerQueueEmpty, NULL);
diff --git a/vm/compiler/Compiler.h b/vm/compiler/Compiler.h
index 6b4d41424..153e84568 100644
--- a/vm/compiler/Compiler.h
+++ b/vm/compiler/Compiler.h
@@ -22,6 +22,7 @@
 #define CODE_CACHE_SIZE 1024*1024
 #define MAX_JIT_RUN_LEN 64
 #define COMPILER_WORK_QUEUE_SIZE 100
+#define COMPILER_IC_PATCH_QUEUE_SIZE 64

 #define COMPILER_TRACED(X)
 #define COMPILER_TRACEE(X)
@@ -49,7 +50,6 @@ typedef enum WorkOrderKind {
     kWorkOrderMethod = 1,      // Work is to compile a whole method
     kWorkOrderTrace = 2,       // Work is to compile code fragment(s)
     kWorkOrderTraceDebug = 3,  // Work is to compile/debug code fragment(s)
-    kWorkOrderICPatch = 4,     // Work is to patch a polymorphic callsite
 } WorkOrderKind;

 typedef struct CompilerWorkOrder {
@@ -59,6 +59,20 @@ typedef struct CompilerWorkOrder {
     JitTranslationInfo result;
 } CompilerWorkOrder;

+/* Chain cell for predicted method invocation */
+typedef struct PredictedChainingCell {
+    u4 branch;                  /* Branch to chained destination */
+    const ClassObject *clazz;   /* key #1 for prediction */
+    const Method *method;       /* key #2 to lookup native PC from dalvik PC */
+    u4 counter;                 /* counter to patch the chaining cell */
+} PredictedChainingCell;
+
+/* Work order for inline cache patching */
+typedef struct ICPatchWorkOrder {
+    PredictedChainingCell *cellAddr;    /* Address to be patched */
+    PredictedChainingCell cellContent;  /* Content of the new cell */
+} ICPatchWorkOrder;
+
 typedef enum JitState {
     kJitOff = 0,
     kJitNormal = 1,            // Profiling in mterp or running native
diff --git a/vm/compiler/codegen/CompilerCodegen.h b/vm/compiler/codegen/CompilerCodegen.h
index ff39cd408..4a27a670f 100644
--- a/vm/compiler/codegen/CompilerCodegen.h
+++ b/vm/compiler/codegen/CompilerCodegen.h
@@ -41,6 +41,7 @@ void dvmCompilerCodegenDump(CompilationUnit *cUnit);
 void* dvmJitChain(void *tgtAddr, u4* branchAddr);
 u4* dvmJitUnchain(void *codeAddr);
 void dvmJitUnchainAll(void);
+void dvmCompilerPatchInlineCache(void);

 /* Implemented in codegen/<target>/Ralloc.c */
 void dvmCompilerRegAlloc(CompilationUnit *cUnit);
diff --git a/vm/compiler/codegen/arm/ArmLIR.h b/vm/compiler/codegen/arm/ArmLIR.h
index 3254ff71b..21e2a3273 100644
--- a/vm/compiler/codegen/arm/ArmLIR.h
+++ b/vm/compiler/codegen/arm/ArmLIR.h
@@ -730,16 +730,9 @@ typedef struct ArmLIR {
     u8 defMask;         // Resource mask for def
 } ArmLIR;

-/* Chain cell for predicted method invocation */
-typedef struct PredictedChainingCell {
-    u4 branch;                  /* Branch to chained destination */
-    const ClassObject *clazz;   /* key #1 for prediction */
-    const Method *method;       /* key #2 to lookup native PC from dalvik PC */
-    u4 counter;                 /* counter to patch the chaining cell */
-} PredictedChainingCell;
-
 /* Init values when a predicted chain is initially assembled */
-#define PREDICTED_CHAIN_BX_PAIR_INIT 0
+/* E7FE is branch to self */
+#define PREDICTED_CHAIN_BX_PAIR_INIT 0xe7fe
 #define PREDICTED_CHAIN_CLAZZ_INIT 0
 #define PREDICTED_CHAIN_METHOD_INIT 0
 #define PREDICTED_CHAIN_COUNTER_INIT 0
@@ -748,7 +741,7 @@ typedef struct PredictedChainingCell {
 #define PREDICTED_CHAIN_COUNTER_DELAY 512

 /* Rechain after this many mis-predictions have happened */
-#define PREDICTED_CHAIN_COUNTER_RECHAIN 8192
+#define PREDICTED_CHAIN_COUNTER_RECHAIN 1024

 /* Used if the resolved callee is a native method */
 #define PREDICTED_CHAIN_COUNTER_AVOID 0x7fffffff
diff --git a/vm/compiler/codegen/arm/Assemble.c b/vm/compiler/codegen/arm/Assemble.c
index 998c95585..c3ad957ac 100644
--- a/vm/compiler/codegen/arm/Assemble.c
+++ b/vm/compiler/codegen/arm/Assemble.c
@@ -1328,7 +1328,12 @@ void* dvmJitChain(void* tgtAddr, u4* branchAddr)
     u4 newInst;
     bool thumbTarget;

-    if ((gDvmJit.pProfTable != NULL) && gDvm.sumThreadSuspendCount == 0) {
+    /*
+     * Only chain translations when there is no pending request for all
+     * threads to suspend themselves via the interpreter.
+     */
+    if ((gDvmJit.pProfTable != NULL) && (gDvm.sumThreadSuspendCount == 0) &&
+        (gDvmJit.codeCacheFull == false)) {

         assert((branchOffset >= -(1<<22)) && (branchOffset <= ((1<<22)-2)));

         gDvmJit.translationChains++;
@@ -1350,12 +1355,48 @@ void* dvmJitChain(void* tgtAddr, u4* branchAddr)

         *branchAddr = newInst;
         cacheflush((long)branchAddr, (long)branchAddr + 4, 0);
+        gDvmJit.hasNewChain = true;
     }

     return tgtAddr;
 }

 /*
+ * Attempt to enqueue a work order to patch an inline cache for a predicted
+ * chaining cell used by virtual/interface calls.
+ */
+bool inlineCachePatchEnqueue(PredictedChainingCell *cellAddr,
+                             PredictedChainingCell *newContent)
+{
+    bool result = true;
+
+    dvmLockMutex(&gDvmJit.compilerICPatchLock);
+
+    if (cellAddr->clazz == NULL &&
+        cellAddr->branch == PREDICTED_CHAIN_BX_PAIR_INIT) {
+        /*
+         * The update order matters - make sure clazz is updated last since it
+         * will bring the uninitialized chaining cell to life.
+         */
+        cellAddr->method = newContent->method;
+        cellAddr->branch = newContent->branch;
+        cellAddr->counter = newContent->counter;
+        cellAddr->clazz = newContent->clazz;
+        cacheflush((intptr_t) cellAddr, (intptr_t) (cellAddr+1), 0);
+    }
+    else if (gDvmJit.compilerICPatchIndex < COMPILER_IC_PATCH_QUEUE_SIZE) {
+        int index = gDvmJit.compilerICPatchIndex++;
+        gDvmJit.compilerICPatchQueue[index].cellAddr = cellAddr;
+        gDvmJit.compilerICPatchQueue[index].cellContent = *newContent;
+    } else {
+        result = false;
+    }
+
+    dvmUnlockMutex(&gDvmJit.compilerICPatchLock);
+    return result;
+}
+
+/*
  * This method is called from the invoke templates for virtual and interface
  * methods to speculatively setup a chain to the callee. The templates are
  * written in assembly and have setup method, cell, and clazz at r0, r2, and
@@ -1412,41 +1453,29 @@ const Method *dvmJitToPatchPredictedChain(const Method *method,
         goto done;
     }

-    /*
-     * Bump up the counter first just in case other mutator threads are in
-     * nearby territory to also attempt to rechain this cell. This is not
-     * done in a thread-safe way and doesn't need to be since the consequence
-     * of the race condition [rare] is two back-to-back suspend-all attempts,
-     * which will be handled correctly.
- */ - cell->counter = PREDICTED_CHAIN_COUNTER_AVOID; + PredictedChainingCell newCell; - PredictedChainingCell *newCell = - (PredictedChainingCell *) malloc(sizeof(PredictedChainingCell)); + /* Avoid back-to-back orders to the same cell */ + cell->counter = PREDICTED_CHAIN_COUNTER_AVOID; int baseAddr = (int) cell + 4; // PC is cur_addr + 4 int branchOffset = tgtAddr - baseAddr; - newCell->branch = assembleChainingBranch(branchOffset, true); - newCell->clazz = clazz; - newCell->method = method; - - /* - * Reset the counter again in case other mutator threads got invoked - * between the previous rest and dvmSuspendAllThreads call. - */ - newCell->counter = PREDICTED_CHAIN_COUNTER_RECHAIN; + newCell.branch = assembleChainingBranch(branchOffset, true); + newCell.clazz = clazz; + newCell.method = method; + newCell.counter = PREDICTED_CHAIN_COUNTER_RECHAIN; /* - * Enter the work order to the queue for the compiler thread to patch the - * chaining cell. + * Enter the work order to the queue and the chaining cell will be patched + * the next time a safe point is entered. * - * No blocking call is added here because the patched result is not - * intended to be immediately consumed by the requesting thread. Its - * execution is simply resumed by chasing the class pointer to resolve the - * callsite. + * If the enqueuing fails reset the rechain count to a normal value so that + * it won't get indefinitely delayed. */ - dvmCompilerWorkEnqueue((const u2 *) cell, kWorkOrderICPatch, newCell); + if (!inlineCachePatchEnqueue(cell, &newCell)) { + cell->counter = PREDICTED_CHAIN_COUNTER_RECHAIN; + } #endif done: return method; @@ -1456,31 +1485,61 @@ done: * Patch the inline cache content based on the content passed from the work * order. */ -bool dvmJitPatchInlineCache(void *cellPtr, void *contentPtr) +void dvmCompilerPatchInlineCache(void) { - PredictedChainingCell *cellDest = (PredictedChainingCell *) cellPtr; - PredictedChainingCell *newContent = (PredictedChainingCell *) contentPtr; + int i; + PredictedChainingCell *minAddr, *maxAddr; + + /* Nothing to be done */ + if (gDvmJit.compilerICPatchIndex == 0) return; - /* Stop the world */ - dvmSuspendAllThreads(SUSPEND_FOR_IC_PATCH); + /* + * Since all threads are already stopped we don't really need to acquire + * the lock. But race condition can be easily introduced in the future w/o + * paying attention so we still acquire the lock here. + */ + dvmLockMutex(&gDvmJit.compilerICPatchLock); + //LOGD("Number of IC patch work orders: %d", gDvmJit.compilerICPatchIndex); - COMPILER_TRACE_CHAINING( - LOGD("Jit Runtime: predicted chain %p from %s to %s (%s) patched", - cellDest, cellDest->clazz ? 
cellDest->clazz->descriptor : "NULL", - newContent->clazz->descriptor, - newContent->method->name)); + /* Initialize the min/max address range */ + minAddr = (PredictedChainingCell *) + ((char *) gDvmJit.codeCache + CODE_CACHE_SIZE); + maxAddr = (PredictedChainingCell *) gDvmJit.codeCache; - /* Install the new cell content */ - *cellDest = *newContent; + for (i = 0; i < gDvmJit.compilerICPatchIndex; i++) { + PredictedChainingCell *cellAddr = + gDvmJit.compilerICPatchQueue[i].cellAddr; + PredictedChainingCell *cellContent = + &gDvmJit.compilerICPatchQueue[i].cellContent; + + if (cellAddr->clazz == NULL) { + COMPILER_TRACE_CHAINING( + LOGD("Jit Runtime: predicted chain %p to %s (%s) initialized", + cellAddr, + cellContent->clazz->descriptor, + cellContent->method->name)); + } else { + COMPILER_TRACE_CHAINING( + LOGD("Jit Runtime: predicted chain %p from %s to %s (%s) " + "patched", + cellAddr, + cellAddr->clazz->descriptor, + cellContent->clazz->descriptor, + cellContent->method->name)); + } - /* Then synchronize the I/D$ */ - cacheflush((long) cellDest, (long) (cellDest+1), 0); + /* Patch the chaining cell */ + *cellAddr = *cellContent; + minAddr = (cellAddr < minAddr) ? cellAddr : minAddr; + maxAddr = (cellAddr > maxAddr) ? cellAddr : maxAddr; + } - /* All done - resume all other threads */ - dvmResumeAllThreads(SUSPEND_FOR_IC_PATCH); + /* Then synchronize the I/D cache */ + cacheflush((long) minAddr, (long) (maxAddr+1), 0); - return true; + gDvmJit.compilerICPatchIndex = 0; + dvmUnlockMutex(&gDvmJit.compilerICPatchLock); } /* @@ -1617,6 +1676,7 @@ void dvmJitUnchainAll() dvmUnlockMutex(&gDvmJit.tableLock); gDvmJit.translationChains = 0; } + gDvmJit.hasNewChain = false; } typedef struct jitProfileAddrToLine { diff --git a/vm/compiler/codegen/arm/CodegenDriver.c b/vm/compiler/codegen/arm/CodegenDriver.c index 5be07aa3d..b0e16b82f 100644 --- a/vm/compiler/codegen/arm/CodegenDriver.c +++ b/vm/compiler/codegen/arm/CodegenDriver.c @@ -4121,8 +4121,7 @@ bool dvmCompilerDoWork(CompilerWorkOrder *work) { bool res; - if (gDvmJit.codeCacheFull && - (work->kind != kWorkOrderICPatch)) { + if (gDvmJit.codeCacheFull) { return false; } @@ -4142,9 +4141,6 @@ bool dvmCompilerDoWork(CompilerWorkOrder *work) gDvmJit.printMe = oldPrintMe;; break; } - case kWorkOrderICPatch: - res = dvmJitPatchInlineCache((void *) work->pc, work->info); - break; default: res = false; dvmAbort(); diff --git a/vm/interp/Jit.c b/vm/interp/Jit.c index d5360701f..b3fe1bf48 100644 --- a/vm/interp/Jit.c +++ b/vm/interp/Jit.c @@ -765,8 +765,11 @@ int dvmCheckJit(const u2* pc, Thread* self, InterpState* interpState) * Reset "trace in progress" flag whether or not we * successfully entered a work order. */ - setTraceConstruction( - lookupAndAdd(interpState->currTraceHead, false), false); + JitEntry *jitEntry = + lookupAndAdd(interpState->currTraceHead, false); + if (jitEntry) { + setTraceConstruction(jitEntry, false); + } switchInterp = !debugOrProfile; } break; @@ -835,16 +838,18 @@ JitEntry *dvmFindJitEntry(const u2* pc) void* dvmJitGetCodeAddr(const u2* dPC) { int idx = dvmJitHash(dPC); - const u2* npc = (gDvmJit.pProfTable == NULL) ? NULL : - gDvmJit.pJitEntryTable[idx].dPC; - + const u2* npc = gDvmJit.pJitEntryTable[idx].dPC; if (npc != NULL) { + bool hideTranslation = (gDvm.sumThreadSuspendCount != 0) || + (gDvmJit.codeCacheFull == true) || + (gDvmJit.pProfTable == NULL); + if (npc == dPC) { #if defined(EXIT_STATS) gDvmJit.addrLookupsFound++; #endif - return gDvm.sumThreadSuspendCount ? 
NULL : - gDvmJit.pJitEntryTable[idx].codeAddress; + return hideTranslation ? + NULL : gDvmJit.pJitEntryTable[idx].codeAddress; } else { int chainEndMarker = gDvmJit.jitTableSize; while (gDvmJit.pJitEntryTable[idx].u.info.chain != chainEndMarker) { @@ -853,8 +858,8 @@ void* dvmJitGetCodeAddr(const u2* dPC) #if defined(EXIT_STATS) gDvmJit.addrLookupsFound++; #endif - return gDvm.sumThreadSuspendCount ? NULL : - gDvmJit.pJitEntryTable[idx].codeAddress; + return hideTranslation ? + NULL : gDvmJit.pJitEntryTable[idx].codeAddress; } } } diff --git a/vm/mterp/armv5te/footer.S b/vm/mterp/armv5te/footer.S index 3f4321c7c..240b603c5 100644 --- a/vm/mterp/armv5te/footer.S +++ b/vm/mterp/armv5te/footer.S @@ -276,6 +276,9 @@ dvmJitSelfVerificationStart: * before jumping back to the interpreter. */ dvmJitSelfVerificationEnd: + ldr r10, [rGLUE, #offGlue_self] @ callee saved r10 <- glue->self + mov r1, #0 + str r1, [r10, #offThread_inJitCodeCache] @ Back to the interp land mov r1,rFP @ pass ending fp bl dvmSelfVerificationRestoreState @ restore pc and fp values ldr rPC,[r0,#offShadowSpace_startPC] @ restore PC diff --git a/vm/mterp/common/asm-constants.h b/vm/mterp/common/asm-constants.h index ab2479997..4fff3dc4e 100644 --- a/vm/mterp/common/asm-constants.h +++ b/vm/mterp/common/asm-constants.h @@ -118,7 +118,7 @@ MTERP_OFFSET(offGlue_jitState, MterpGlue, jitState, 56) MTERP_OFFSET(offGlue_jitResume, MterpGlue, jitResume, 60) MTERP_OFFSET(offGlue_jitResumePC, MterpGlue, jitResumePC, 64) MTERP_OFFSET(offGlue_jitThreshold, MterpGlue, jitThreshold, 68) -MTERP_OFFSET(offGlue_jitppJitProfTable, MterpGlue, ppJitProfTable, 72) +MTERP_OFFSET(offGlue_ppJitProfTable, MterpGlue, ppJitProfTable, 72) #endif #elif defined(WITH_PROFILER) MTERP_OFFSET(offGlue_pActiveProfilers, MterpGlue, pActiveProfilers, 40) @@ -129,7 +129,7 @@ MTERP_OFFSET(offGlue_jitState, MterpGlue, jitState, 56) MTERP_OFFSET(offGlue_jitResume, MterpGlue, jitResume, 60) MTERP_OFFSET(offGlue_jitResumePC, MterpGlue, jitResumePC, 64) MTERP_OFFSET(offGlue_jitThreshold, MterpGlue, jitThreshold, 68) -MTERP_OFFSET(offGlue_jitppJitProfTable, MterpGlue, ppJitProfTable, 72) +MTERP_OFFSET(offGlue_ppJitProfTable, MterpGlue, ppJitProfTable, 72) #endif #else MTERP_OFFSET(offGlue_entryPoint, MterpGlue, entryPoint, 40) @@ -139,7 +139,7 @@ MTERP_OFFSET(offGlue_jitState, MterpGlue, jitState, 52) MTERP_OFFSET(offGlue_jitResume, MterpGlue, jitResume, 56) MTERP_OFFSET(offGlue_jitResumePC, MterpGlue, jitResumePC, 60) MTERP_OFFSET(offGlue_jitThreshold, MterpGlue, jitThreshold, 64) -MTERP_OFFSET(offGlue_jitppJitProfTable, MterpGlue, ppJitProfTable, 68) +MTERP_OFFSET(offGlue_ppJitProfTable, MterpGlue, ppJitProfTable, 68) #endif #endif /* make sure all JValue union members are stored at the same offset */ diff --git a/vm/mterp/out/InterpAsm-armv4t.S b/vm/mterp/out/InterpAsm-armv4t.S index da3199efa..a7aeaef31 100644 --- a/vm/mterp/out/InterpAsm-armv4t.S +++ b/vm/mterp/out/InterpAsm-armv4t.S @@ -9803,6 +9803,9 @@ dvmJitSelfVerificationStart: * before jumping back to the interpreter. 
*/ dvmJitSelfVerificationEnd: + ldr r10, [rGLUE, #offGlue_self] @ callee saved r10 <- glue->self + mov r1, #0 + str r1, [r10, #offThread_inJitCodeCache] @ Back to the interp land mov r1,rFP @ pass ending fp bl dvmSelfVerificationRestoreState @ restore pc and fp values ldr rPC,[r0,#offShadowSpace_startPC] @ restore PC diff --git a/vm/mterp/out/InterpAsm-armv5te-vfp.S b/vm/mterp/out/InterpAsm-armv5te-vfp.S index 336d9f222..52f426303 100644 --- a/vm/mterp/out/InterpAsm-armv5te-vfp.S +++ b/vm/mterp/out/InterpAsm-armv5te-vfp.S @@ -9321,6 +9321,9 @@ dvmJitSelfVerificationStart: * before jumping back to the interpreter. */ dvmJitSelfVerificationEnd: + ldr r10, [rGLUE, #offGlue_self] @ callee saved r10 <- glue->self + mov r1, #0 + str r1, [r10, #offThread_inJitCodeCache] @ Back to the interp land mov r1,rFP @ pass ending fp bl dvmSelfVerificationRestoreState @ restore pc and fp values ldr rPC,[r0,#offShadowSpace_startPC] @ restore PC diff --git a/vm/mterp/out/InterpAsm-armv5te.S b/vm/mterp/out/InterpAsm-armv5te.S index b947e27d5..822491ab9 100644 --- a/vm/mterp/out/InterpAsm-armv5te.S +++ b/vm/mterp/out/InterpAsm-armv5te.S @@ -9797,6 +9797,9 @@ dvmJitSelfVerificationStart: * before jumping back to the interpreter. */ dvmJitSelfVerificationEnd: + ldr r10, [rGLUE, #offGlue_self] @ callee saved r10 <- glue->self + mov r1, #0 + str r1, [r10, #offThread_inJitCodeCache] @ Back to the interp land mov r1,rFP @ pass ending fp bl dvmSelfVerificationRestoreState @ restore pc and fp values ldr rPC,[r0,#offShadowSpace_startPC] @ restore PC diff --git a/vm/mterp/out/InterpAsm-armv7-a.S b/vm/mterp/out/InterpAsm-armv7-a.S index 0d1bcc87c..c00d316d1 100644 --- a/vm/mterp/out/InterpAsm-armv7-a.S +++ b/vm/mterp/out/InterpAsm-armv7-a.S @@ -9257,6 +9257,9 @@ dvmJitSelfVerificationStart: * before jumping back to the interpreter. */ dvmJitSelfVerificationEnd: + ldr r10, [rGLUE, #offGlue_self] @ callee saved r10 <- glue->self + mov r1, #0 + str r1, [r10, #offThread_inJitCodeCache] @ Back to the interp land mov r1,rFP @ pass ending fp bl dvmSelfVerificationRestoreState @ restore pc and fp values ldr rPC,[r0,#offShadowSpace_startPC] @ restore PC -- 2.11.0
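The core mechanism of this patch is that mutator threads enqueue inline-cache
patch requests under gDvmJit.compilerICPatchLock, and
dvmCompilerPerformSafePointChecks() drains the queue while all threads are
already suspended for GC, so no dedicated suspend-all is needed per patch. The
following minimal standalone C sketch illustrates that producer/consumer shape
in isolation. The names below (Cell, ICPatchEntry, enqueuePatch,
drainPatchQueueAtSafePoint) are illustrative stand-ins, not Dalvik APIs, and
the sketch omits the real code's I/D cache flush over the [minAddr, maxAddr]
range as well as the lock-free fast path that initializes a fresh cell by
publishing clazz last.

    #include <pthread.h>
    #include <stdio.h>

    #define PATCH_QUEUE_SIZE 64

    /* Simplified stand-in for PredictedChainingCell */
    typedef struct {
        unsigned int branch;  /* branch word; 0xe7fe = branch-to-self */
        const void *clazz;    /* prediction key; NULL means uninitialized */
    } Cell;

    /* Simplified stand-in for ICPatchWorkOrder */
    typedef struct {
        Cell *cellAddr;       /* address to be patched */
        Cell cellContent;     /* content of the new cell */
    } ICPatchEntry;

    static pthread_mutex_t patchLock = PTHREAD_MUTEX_INITIALIZER;
    static ICPatchEntry patchQueue[PATCH_QUEUE_SIZE];
    static int patchIndex = 0;

    /*
     * Called by mutator threads at any time; never blocks on compilation.
     * Returns 0 if the queue is full, in which case the caller simply
     * falls back (the patch resets the rechain counter and retries later).
     */
    static int enqueuePatch(Cell *addr, const Cell *content)
    {
        int ok = 1;
        pthread_mutex_lock(&patchLock);
        if (patchIndex < PATCH_QUEUE_SIZE) {
            patchQueue[patchIndex].cellAddr = addr;
            patchQueue[patchIndex].cellContent = *content;
            patchIndex++;
        } else {
            ok = 0;
        }
        pthread_mutex_unlock(&patchLock);
        return ok;
    }

    /*
     * Called while the world is already stopped (e.g. a GC pause), so
     * each cell can be rewritten wholesale. The lock is cheap insurance,
     * mirroring the comment in dvmCompilerPatchInlineCache.
     */
    static void drainPatchQueueAtSafePoint(void)
    {
        int i;
        pthread_mutex_lock(&patchLock);
        for (i = 0; i < patchIndex; i++) {
            *patchQueue[i].cellAddr = patchQueue[i].cellContent;
        }
        patchIndex = 0;
        pthread_mutex_unlock(&patchLock);
    }

    int main(void)
    {
        Cell cell = { 0xe7fe, NULL };       /* starts as branch-to-self */
        Cell content = { 0xd000u, &cell };  /* stand-in branch + class key */

        enqueuePatch(&cell, &content);
        drainPatchQueueAtSafePoint();
        printf("patched branch word: 0x%x\n", cell.branch);
        return 0;
    }

The design point mirrored from the patch is that the slow path never blocks:
a caller either drops a work order in the queue and keeps running, or gives up
and retries later, while the expensive patching work piggybacks on a pause
that GC already pays for.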