From e5bcf5c6572bb5190ee8394619e936ffb61db14d Mon Sep 17 00:00:00 2001 From: John Grossman Date: Tue, 6 Dec 2011 14:05:05 -0800 Subject: [PATCH] Tungsten: Improve RX player error handling. First, clear an issue which was causing an assert to fire. Basically, once a decoder pump had entered the error state and was shutdown, it was not clearing its status, and when a substream attempt to recycle the pump, startup was failing an assert (no thread had been created, meaning that the system was not initialized, yet status indicated an error). This was a small one-liner in aah_decoder_pump.cpp. Second, try to become a little nuanced about how we handle errors in the decoder pump. A comment in the code pretty much says it all, but the summary is that we don't want to completely abort playback because a single chunk of ES failed to decode, but if nothing is decoding and we are making no progress at all, we probably need to put the MediaPlayer instance into the fatal Error state and signal the app level so that further action can be taken (automatic recovery attempts followed by bug reports and signalling the user if those fail). This is to address the fallout of http://b/issue?id=5498460, where something at the OMX decoder level becomes unhappy about not being able to obtain an output buffer which eventually unwinds to this assert which results in a dead mediaserver. After this change, the mediaserver will no longer crash, and may even recover (depending on whether or not the OMX unhappiness is transient or not), but the primary issue (unhappy OMX) is probably still around. It is quite difficult to reproduce, I will probably need to open a different bug to track that issue. Change-Id: I5b65b818378a5ae9c915e91b7db7129f0bda6837 Signed-off-by: John Grossman --- media/libaah_rtp/aah_decoder_pump.cpp | 73 +++++++++++++++++++++++++--- media/libaah_rtp/aah_decoder_pump.h | 1 + media/libaah_rtp/aah_rx_player.h | 5 +- media/libaah_rtp/aah_rx_player_core.cpp | 17 +++++++ media/libaah_rtp/aah_rx_player_substream.cpp | 24 +++++++++ 5 files changed, 113 insertions(+), 7 deletions(-) diff --git a/media/libaah_rtp/aah_decoder_pump.cpp b/media/libaah_rtp/aah_decoder_pump.cpp index b2efbe634c..f841ddb3af 100644 --- a/media/libaah_rtp/aah_decoder_pump.cpp +++ b/media/libaah_rtp/aah_decoder_pump.cpp @@ -29,12 +29,17 @@ #include #include #include +#include #include #include "aah_decoder_pump.h" namespace android { +static const long long kLongDecodeErrorThreshold = 1000000ll; +static const uint32_t kMaxLongErrorsBeforeFatal = 3; +static const uint32_t kMaxErrorsBeforeFatal = 60; + AAH_DecoderPump::AAH_DecoderPump(OMXClient& omx) : omx_(omx) , thread_status_(OK) @@ -290,11 +295,17 @@ void* AAH_DecoderPump::workThread() { return NULL; } + DurationTimer decode_timer; + uint32_t consecutive_long_errors = 0; + uint32_t consecutive_errors = 0; + while (!thread_->exitPending()) { status_t res; MediaBuffer* bufOut = NULL; + decode_timer.start(); res = decoder_->read(&bufOut); + decode_timer.stop(); if (res == INFO_FORMAT_CHANGED) { // Format has changed. Destroy our current renderer so that a new @@ -308,14 +319,58 @@ void* AAH_DecoderPump::workThread() { res = OK; } - // Any error aside from INFO_FORMAT_CHANGED is considered to be fatal - // and will result in shutdown of the decoder pump thread. + // Try to be a little nuanced in our handling of actual decode errors. + // Errors could happen because of minor stream corruption or because of + // transient resource limitations. In these cases, we would rather drop + // a little bit of output and ride out the unpleasantness then throw up + // our hands and abort everything. + // + // OTOH - When things are really bad (like we have a non-transient + // resource or bookkeeping issue, or the stream being fed to us is just + // complete and total garbage) we really want to terminate playback and + // raise an error condition all the way up to the application level so + // they can deal with it. + // + // Unfortunately, the error codes returned by the decoder can be a + // little non-specific. For example, if an OMXCodec times out + // attempting to obtain an output buffer, the error we get back is a + // generic -1. Try to distinguish between this resource timeout error + // and ES corruption error by timing how long the decode operation + // takes. Maintain accounting for both errors and "long errors". If we + // get more than a certain number consecutive errors of either type, + // consider it fatal and shutdown (which will cause the error to + // propagate all of the way up to the application level). The threshold + // for "long errors" is deliberately much lower than that of normal + // decode errors, both because of how long they take to happen and + // because they generally indicate resource limitation errors which are + // unlikely to go away in pathologically bad cases (in contrast to + // stream corruption errors which might happen 20 times in a row and + // then be suddenly OK again) if (res != OK) { - LOGE("%s: Failed to decode data (res = %d)", - __PRETTY_FUNCTION__, res); + consecutive_errors++; + if (decode_timer.durationUsecs() >= kLongDecodeErrorThreshold) + consecutive_long_errors++; + CHECK(NULL == bufOut); - thread_status_ = res; - break; + + LOGW("%s: Failed to decode data (res = %d)", + __PRETTY_FUNCTION__, res); + + if ((consecutive_errors >= kMaxErrorsBeforeFatal) || + (consecutive_long_errors >= kMaxLongErrorsBeforeFatal)) { + LOGE("%s: Maximum decode error threshold has been reached." + " There have been %d consecutive decode errors, and %d" + " consecutive decode operations which resulted in errors" + " and took more than %lld uSec to process. The last" + " decode operation took %lld uSec.", + __PRETTY_FUNCTION__, + consecutive_errors, consecutive_long_errors, + kLongDecodeErrorThreshold, decode_timer.durationUsecs()); + thread_status_ = res; + break; + } + + continue; } if (NULL == bufOut) { @@ -324,6 +379,11 @@ void* AAH_DecoderPump::workThread() { continue; } + // Successful decode (with actual output produced). Clear the error + // counters. + consecutive_errors = 0; + consecutive_long_errors = 0; + queueToRenderer(bufOut); bufOut->release(); } @@ -409,6 +469,7 @@ status_t AAH_DecoderPump::shutdown_l() { last_queued_pts_valid_ = false; last_ts_transform_valid_ = false; last_volume_ = 0xFF; + thread_status_ = OK; decoder_ = NULL; format_ = NULL; diff --git a/media/libaah_rtp/aah_decoder_pump.h b/media/libaah_rtp/aah_decoder_pump.h index c9b35cde85..594d235357 100644 --- a/media/libaah_rtp/aah_decoder_pump.h +++ b/media/libaah_rtp/aah_decoder_pump.h @@ -43,6 +43,7 @@ class AAH_DecoderPump : public MediaSource { void setRenderTSTransform(const LinearTransform& trans); void setRenderVolume(uint8_t volume); bool isAboutToUnderflow(int64_t threshold); + bool getStatus() const { return thread_status_; } // MediaSource methods virtual status_t start(MetaData *params) { return OK; } diff --git a/media/libaah_rtp/aah_rx_player.h b/media/libaah_rtp/aah_rx_player.h index 44aedc3e6a..69c3f4269d 100644 --- a/media/libaah_rtp/aah_rx_player.h +++ b/media/libaah_rtp/aah_rx_player.h @@ -189,12 +189,12 @@ class AAH_RXPlayer : public MediaPlayerInterface { int32_t ts_lower); void processPayloadCont (uint8_t* buf, uint32_t amt); - void processCompletedBuffer(); void processTSTransform(const LinearTransform& trans); bool isAboutToUnderflow(); uint32_t getSSRC() const { return ssrc_; } uint16_t getProgramID() const { return (ssrc_ >> 5) & 0x1F; } + status_t getStatus() const { return status_; } protected: virtual ~Substream() { @@ -203,11 +203,14 @@ class AAH_RXPlayer : public MediaPlayerInterface { private: void cleanupDecoder(); + bool shouldAbort(const char* log_tag); + void processCompletedBuffer(); bool setupSubstreamType(uint8_t substream_type, uint8_t codec_type); uint32_t ssrc_; bool waiting_for_rap_; + status_t status_; bool substream_details_known_; uint8_t substream_type_; diff --git a/media/libaah_rtp/aah_rx_player_core.cpp b/media/libaah_rtp/aah_rx_player_core.cpp index a86ea172b9..7fbf9c8bcb 100644 --- a/media/libaah_rtp/aah_rx_player_core.cpp +++ b/media/libaah_rtp/aah_rx_player_core.cpp @@ -384,6 +384,23 @@ bool AAH_RXPlayer::threadLoop() { if (!thread_wrapper_->exitPending()) { process_more_right_now = processGaps(); } + + // Step 5: Check for fatal errors. If any of our substreams has + // encountered a fatal, unrecoverable, error, then propagate the error + // up to user level and shut down. + for (size_t i = 0; i < substreams_.size(); ++i) { + status_t status; + CHECK(substreams_.valueAt(i) != NULL); + + status = substreams_.valueAt(i)->getStatus(); + if (OK != status) { + LOGE("Substream index %d has encountered an unrecoverable" + " error (%d). Signalling application level and shutting" + " down.", i, status); + sendEvent(MEDIA_ERROR); + goto bailout; + } + } } bailout: diff --git a/media/libaah_rtp/aah_rx_player_substream.cpp b/media/libaah_rtp/aah_rx_player_substream.cpp index 708ea36d46..698b3f92ba 100644 --- a/media/libaah_rtp/aah_rx_player_substream.cpp +++ b/media/libaah_rtp/aah_rx_player_substream.cpp @@ -37,6 +37,7 @@ AAH_RXPlayer::Substream::Substream(uint32_t ssrc, OMXClient& omx) { ssrc_ = ssrc; substream_details_known_ = false; buffer_in_progress_ = NULL; + status_ = OK; decoder_ = new AAH_DecoderPump(omx); if (decoder_ == NULL) { @@ -54,6 +55,7 @@ AAH_RXPlayer::Substream::Substream(uint32_t ssrc, OMXClient& omx) { void AAH_RXPlayer::Substream::shutdown() { substream_meta_ = NULL; + status_ = OK; cleanupBufferInProgress(); cleanupDecoder(); } @@ -75,10 +77,27 @@ void AAH_RXPlayer::Substream::cleanupDecoder() { } } +bool AAH_RXPlayer::Substream::shouldAbort(const char* log_tag) { + // If we have already encountered a fatal error, do nothing. We are just + // waiting for our owner to shut us down now. + if (OK != status_) { + LOGV("Skipping %s, substream has encountered fatal error (%d).", + log_tag, status_); + return true; + } + + return false; +} + void AAH_RXPlayer::Substream::processPayloadStart(uint8_t* buf, uint32_t amt, int32_t ts_lower) { uint32_t min_length = 6; + + if (shouldAbort(__PRETTY_FUNCTION__)) { + return; + } + // Do we have a buffer in progress already? If so, abort the buffer. In // theory, this should never happen. If there were a discontinutity in the // stream, the discon in the seq_nos at the RTP level should have already @@ -289,6 +308,10 @@ void AAH_RXPlayer::Substream::processPayloadStart(uint8_t* buf, void AAH_RXPlayer::Substream::processPayloadCont(uint8_t* buf, uint32_t amt) { + if (shouldAbort(__PRETTY_FUNCTION__)) { + return; + } + if (NULL == buffer_in_progress_) { LOGV("TRTP Receiver skipping payload continuation; no buffer currently" " in progress."); @@ -396,6 +419,7 @@ void AAH_RXPlayer::Substream::processCompletedBuffer() { if (res != OK) { LOGD("Failed to queue payload for decode, resetting decoder pump!" " (res = %d)", res); + status_ = res; cleanupDecoder(); cleanupBufferInProgress(); } -- 2.11.0