From d5a9659f74fc3ac23cecf1db70886b919bf98ff9 Mon Sep 17 00:00:00 2001 From: Andy McFadden Date: Wed, 17 Jun 2009 16:29:30 -0700 Subject: [PATCH] Reduce VM aborts during high CPU stress. (This was cherry-picked from master 2aa43610 for internal bug 1952616.) The VM has some timeouts that are meant to kill the current process if something gets stuck (e.g. a thread grabs a lock and then manages to die while the rest of the process continues on). These were tripping a little too easily during some high-load situations. This changes the order of operations so that we now unlock the "thread suspend" lock before sending a wakeup broadcast to the condition variable that threads sleep on. This should make it less likely for a thread to be running for an extended period while the lock is held. (Relates to internal bug 1664687.) This also wraps a couple of things (pthread_create, dlopen) with a state change to VMWAIT. During high load situations these can take a while to complete, and we would (with the K-Means Visualizer load generator running) very occasionally time out. Augmented the debug output in a couple of minor ways. Updated comments. --- vm/Globals.h | 5 +--- vm/Native.c | 18 +++++++++++--- vm/Thread.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 83 insertions(+), 18 deletions(-) diff --git a/vm/Globals.h b/vm/Globals.h index 79b9e9154..a2de27110 100644 --- a/vm/Globals.h +++ b/vm/Globals.h @@ -321,7 +321,7 @@ struct DvmGlobals { /* * The thread code grabs this before suspending all threads. There - * are four things that can cause a "suspend all": + * are a few things that can cause a "suspend all": * (1) the GC is starting; * (2) the debugger has sent a "suspend all" request; * (3) a thread has hit a breakpoint or exception that the debugger @@ -332,9 +332,6 @@ struct DvmGlobals { * do a blocking "lock" call on this mutex -- if it has been acquired, * somebody is probably trying to put you to sleep. The leading '_' is * intended as a reminder that this lock is special. - * - * This lock is also held while attaching an externally-created thread - * through JNI. That way we can correctly set the initial suspend state. */ pthread_mutex_t _threadSuspendLock; diff --git a/vm/Native.c b/vm/Native.c index 7a153d6b6..dcf68b685 100644 --- a/vm/Native.c +++ b/vm/Native.c @@ -423,9 +423,21 @@ bool dvmLoadNativeCode(const char* pathName, Object* classLoader) * - check to see if the library is valid * - check config/prelink-linux-arm.map to ensure that the library * is listed and is not being overrun by the previous entry (if - * loading suddenly stops working, this is a good one to check) + * loading suddenly stops working on a prelinked library, this is + * a good one to check) + * - write a trivial app that calls sleep() then dlopen(), attach + * to it with "strace -p " while it sleeps, and watch for + * attempts to open nonexistent dependent shared libs + * + * This can execute slowly for a large library on a busy system, so we + * want to switch from RUNNING to VMWAIT while it executes. This allows + * the GC to ignore us. */ + Thread* self = dvmThreadSelf(); + int oldStatus = dvmChangeStatus(self, THREAD_VMWAIT); handle = dlopen(pathName, RTLD_LAZY); + dvmChangeStatus(self, oldStatus); + if (handle == NULL) { LOGI("Unable to dlopen(%s): %s\n", pathName, dlerror()); return false; @@ -461,9 +473,9 @@ bool dvmLoadNativeCode(const char* pathName, Object* classLoader) Object* prevOverride = self->classLoaderOverride; self->classLoaderOverride = classLoader; - dvmChangeStatus(NULL, THREAD_NATIVE); + oldStatus = dvmChangeStatus(self, THREAD_NATIVE); version = (*func)(gDvm.vmList, NULL); - dvmChangeStatus(NULL, THREAD_RUNNING); + dvmChangeStatus(self, oldStatus); self->classLoaderOverride = prevOverride; if (version != JNI_VERSION_1_2 && version != JNI_VERSION_1_4 && diff --git a/vm/Thread.c b/vm/Thread.c index 7116dfd76..ee195eb6c 100644 --- a/vm/Thread.c +++ b/vm/Thread.c @@ -516,9 +516,15 @@ static void lockThreadSuspend(const char* who, SuspendCause why) if (cc != 0) { if (!dvmCheckSuspendPending(NULL)) { /* - * Could be we hit the window as the suspend or resume - * was started (i.e. the lock has been grabbed but the - * other thread hasn't yet set our "please suspend" flag). + * Could be that a resume-all is in progress, and something + * grabbed the CPU when the wakeup was broadcast. The thread + * performing the resume hasn't had a chance to release the + * thread suspend lock. (Should no longer be an issue -- + * we now release before broadcast.) + * + * Could be we hit the window as a suspend was started, + * and the lock has been grabbed but the suspend counts + * haven't been incremented yet. * * Could be an unusual JNI thread-attach thing. * @@ -540,6 +546,7 @@ static void lockThreadSuspend(const char* who, SuspendCause why) LOGE("threadid=%d: couldn't get thread-suspend lock (%s:%s)," " bailing\n", dvmThreadSelf()->threadId, who, getSuspendCauseStr(why)); + /* threads are not suspended, thread dump could crash */ dvmDumpAllThreads(false); dvmAbort(); } @@ -1244,9 +1251,13 @@ bool dvmCreateInterpThread(Object* threadObj, int reqStackSize) */ dvmUnlockThreadList(); - if (pthread_create(&threadHandle, &threadAttr, interpThreadStart, - newThread) != 0) - { + int cc, oldStatus; + oldStatus = dvmChangeStatus(self, THREAD_VMWAIT); + cc = pthread_create(&threadHandle, &threadAttr, interpThreadStart, + newThread); + oldStatus = dvmChangeStatus(self, oldStatus); + + if (cc != 0) { /* * Failure generally indicates that we have exceeded system * resource limits. VirtualMachineError is probably too severe, @@ -1291,7 +1302,8 @@ bool dvmCreateInterpThread(Object* threadObj, int reqStackSize) * * The easiest way to deal with this is to prevent the new thread from * running until the parent says it's okay. This results in the - * following sequence of events for a "badly timed" GC: + * following (correct) sequence of events for a "badly timed" GC + * (where '-' is us, 'o' is the child, and '+' is some other thread): * * - call pthread_create() * - lock thread list @@ -2335,6 +2347,7 @@ static void waitForThreadSuspend(Thread* self, Thread* thread) { const int kMaxRetries = 10; const int kSpinSleepTime = 750*1000; /* 0.75s */ + bool complained = false; int sleepIter = 0; int retryCount = 0; @@ -2349,6 +2362,7 @@ static void waitForThreadSuspend(Thread* self, Thread* thread) self->threadId, (int)self->handle, thread->threadId, (int)thread->handle); dumpWedgedThread(thread); + complained = true; // keep going; could be slow due to valgrind sleepIter = 0; @@ -2361,6 +2375,11 @@ static void waitForThreadSuspend(Thread* self, Thread* thread) } } } + + if (complained) { + LOGW("threadid=%d: spin on suspend resolved\n", self->threadId); + //dvmDumpThread(thread, false); /* suspended, so dump is safe */ + } } /* @@ -2497,7 +2516,9 @@ void dvmResumeAllThreads(SuspendCause why) /* debugger events don't suspend JDWP thread */ if ((why == SUSPEND_FOR_DEBUG || why == SUSPEND_FOR_DEBUG_EVENT) && thread->handle == dvmJdwpGetDebugThread(gDvm.jdwpState)) + { continue; + } if (thread->suspendCount > 0) { thread->suspendCount--; @@ -2512,6 +2533,43 @@ void dvmResumeAllThreads(SuspendCause why) dvmUnlockThreadList(); /* + * In some ways it makes sense to continue to hold the thread-suspend + * lock while we issue the wakeup broadcast. It allows us to complete + * one operation before moving on to the next, which simplifies the + * thread activity debug traces. + * + * This approach caused us some difficulty under Linux, because the + * condition variable broadcast not only made the threads runnable, + * but actually caused them to execute, and it was a while before + * the thread performing the wakeup had an opportunity to release the + * thread-suspend lock. + * + * This is a problem because, when a thread tries to acquire that + * lock, it times out after 3 seconds. If at some point the thread + * is told to suspend, the clock resets; but since the VM is still + * theoretically mid-resume, there's no suspend pending. If, for + * example, the GC was waking threads up while the SIGQUIT handler + * was trying to acquire the lock, we would occasionally time out on + * a busy system and SignalCatcher would abort. + * + * We now perform the unlock before the wakeup broadcast. The next + * suspend can't actually start until the broadcast completes and + * returns, because we're holding the thread-suspend-count lock, but the + * suspending thread is now able to make progress and we avoid the abort. + * + * (Technically there is a narrow window between when we release + * the thread-suspend lock and grab the thread-suspend-count lock. + * This could cause us to send a broadcast to threads with nonzero + * suspend counts, but this is expected and they'll all just fall + * right back to sleep. It's probably safe to grab the suspend-count + * lock before releasing thread-suspend, since we're still following + * the correct order of acquisition, but it feels weird.) + */ + + LOG_THREAD("threadid=%d: ResumeAll waking others\n", self->threadId); + unlockThreadSuspend(); + + /* * Broadcast a notification to all suspended threads, some or all of * which may choose to wake up. No need to wait for them. */ @@ -2520,8 +2578,6 @@ void dvmResumeAllThreads(SuspendCause why) assert(cc == 0); unlockThreadSuspendCount(); - unlockThreadSuspend(); - LOG_THREAD("threadid=%d: ResumeAll complete\n", self->threadId); } @@ -2989,9 +3045,9 @@ void dvmDumpThreadEx(const DebugOutputTarget* target, Thread* thread, threadName, isDaemon ? " daemon" : "", priority, thread->threadId, kStatusNames[thread->status]); dvmPrintDebugMessage(target, - " | group=\"%s\" sCount=%d dsCount=%d s=%c obj=%p\n", + " | group=\"%s\" sCount=%d dsCount=%d s=%c obj=%p self=%p\n", groupName, thread->suspendCount, thread->dbgSuspendCount, - thread->isSuspended ? 'Y' : 'N', thread->threadObj); + thread->isSuspended ? 'Y' : 'N', thread->threadObj, thread); dvmPrintDebugMessage(target, " | sysTid=%d nice=%d sched=%d/%d handle=%d\n", thread->systemTid, getpriority(PRIO_PROCESS, thread->systemTid), -- 2.11.0