From 6bfc37d875b2556b3f1d95b00a785a09dc94ccb0 Mon Sep 17 00:00:00 2001
From: Richard Uhler
Date: Thu, 24 Nov 2016 11:21:41 +0000
Subject: [PATCH] Redesign implementation of RegisterNativeAllocation.

Improve performance and eliminate the issue of GCing apps to death
when they reach the growth_limit_.

This implements the REDESIGN approach described in detail in the
document at go/understanding-register-native-allocation.

Bug: 29156652
Bug: 32576211
Test: m test-art-host
Test: vogar libcore/luni/src/test/java/libcore/util/NativeAllocationRegistryTest.java
Test: Device boots.
Test: adb bugreport and look for "Registered native bytes allocated"

Change-Id: I09176b2b0e4f0c401fe1947830fa5271060f4e61
---
 runtime/atomic.h                         |   9 ++
 runtime/gc/heap.cc                       | 151 +++++++++++--------
 runtime/gc/heap.h                        |  67 +++++++++-----
 test/004-NativeAllocations/src/Main.java |  12 ++-
 4 files changed, 119 insertions(+), 120 deletions(-)

diff --git a/runtime/atomic.h b/runtime/atomic.h
index e2a725978..45c3165b1 100644
--- a/runtime/atomic.h
+++ b/runtime/atomic.h
@@ -235,6 +235,11 @@ class PACKED(sizeof(T)) Atomic : public std::atomic<T> {
     this->store(desired, std::memory_order_seq_cst);
   }
 
+  // Atomically replace the value with desired value.
+  T ExchangeRelaxed(T desired_value) {
+    return this->exchange(desired_value, std::memory_order_relaxed);
+  }
+
   // Atomically replace the value with desired value if it matches the expected value.
   // Participates in total ordering of atomic operations.
   bool CompareExchangeStrongSequentiallyConsistent(T expected_value, T desired_value) {
@@ -283,6 +288,10 @@ class PACKED(sizeof(T)) Atomic : public std::atomic<T> {
     return this->fetch_sub(value, std::memory_order_seq_cst);  // Return old value.
   }
 
+  T FetchAndSubRelaxed(const T value) {
+    return this->fetch_sub(value, std::memory_order_relaxed);  // Return old value.
+  }
+
   T FetchAndOrSequentiallyConsistent(const T value) {
     return this->fetch_or(value, std::memory_order_seq_cst);  // Return old_value.
   }
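The two helpers added to Atomic above are thin wrappers around std::atomic's relaxed read-modify-write operations, and both return the value that was stored before the operation, which is what lets heap.cc drain one counter into another without a lock. A minimal standalone sketch of that behaviour, assuming nothing beyond the C++ standard library (the counter names below are illustrative, not the patch's fields):

    // Standalone illustration of the relaxed exchange / fetch_add / fetch_sub
    // semantics that ExchangeRelaxed and FetchAndSubRelaxed expose.
    #include <atomic>
    #include <cassert>
    #include <cstddef>

    int main() {
      std::atomic<size_t> new_bytes{300};
      std::atomic<size_t> old_bytes{100};

      // exchange(): store 0 and get the previous value back in one atomic step.
      size_t drained = new_bytes.exchange(0, std::memory_order_relaxed);
      assert(drained == 300 && new_bytes.load() == 0);

      // fetch_add()/fetch_sub(): adjust the counter, returning the old value.
      old_bytes.fetch_add(drained, std::memory_order_relaxed);  // old_bytes == 400
      old_bytes.fetch_sub(50, std::memory_order_relaxed);       // old_bytes == 350
      assert(old_bytes.load(std::memory_order_relaxed) == 350);
      return 0;
    }

Relaxed ordering appears to be sufficient for this use because the counters only feed a GC-trigger heuristic and a debug dump; they are not used to publish other memory.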
diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc
index 34d82845d..522e0bfcb 100644
--- a/runtime/gc/heap.cc
+++ b/runtime/gc/heap.cc
@@ -127,8 +127,6 @@ static constexpr uint32_t kAllocSpaceBeginForDeterministicAoT = 0x40000000;
 // Dump the rosalloc stats on SIGQUIT.
 static constexpr bool kDumpRosAllocStatsOnSigQuit = false;
 
-static constexpr size_t kNativeAllocationHistogramBuckets = 16;
-
 // Extra added to the heap growth multiplier. Used to adjust the GC ergonomics for the read barrier
 // config.
 static constexpr double kExtraHeapGrowthMultiplier = kUseReadBarrier ? 1.0 : 0.0;
@@ -194,18 +192,12 @@ Heap::Heap(size_t initial_size,
       capacity_(capacity),
       growth_limit_(growth_limit),
       max_allowed_footprint_(initial_size),
-      native_footprint_gc_watermark_(initial_size),
-      native_need_to_run_finalization_(false),
       concurrent_start_bytes_(std::numeric_limits<size_t>::max()),
       total_bytes_freed_ever_(0),
       total_objects_freed_ever_(0),
       num_bytes_allocated_(0),
-      native_bytes_allocated_(0),
-      native_histogram_lock_("Native allocation lock"),
-      native_allocation_histogram_("Native allocation sizes",
-                                   1U,
-                                   kNativeAllocationHistogramBuckets),
-      native_free_histogram_("Native free sizes", 1U, kNativeAllocationHistogramBuckets),
+      new_native_bytes_allocated_(0),
+      old_native_bytes_allocated_(0),
       num_bytes_freed_revoke_(0),
       verify_missing_card_marks_(false),
       verify_system_weaks_(false),
@@ -1101,19 +1093,9 @@ void Heap::DumpGcPerformanceInfo(std::ostream& os) {
     rosalloc_space_->DumpStats(os);
   }
 
-  {
-    MutexLock mu(Thread::Current(), native_histogram_lock_);
-    if (native_allocation_histogram_.SampleSize() > 0u) {
-      os << "Histogram of native allocation ";
-      native_allocation_histogram_.DumpBins(os);
-      os << " bucket size " << native_allocation_histogram_.BucketWidth() << "\n";
-    }
-    if (native_free_histogram_.SampleSize() > 0u) {
-      os << "Histogram of native free ";
-      native_free_histogram_.DumpBins(os);
-      os << " bucket size " << native_free_histogram_.BucketWidth() << "\n";
-    }
-  }
+  os << "Registered native bytes allocated: "
+     << old_native_bytes_allocated_.LoadRelaxed() + new_native_bytes_allocated_.LoadRelaxed()
+     << "\n";
 
   BaseMutex::DumpAll(os);
 }
@@ -2645,6 +2627,13 @@ collector::GcType Heap::CollectGarbageInternal(collector::GcType gc_type,
   // Approximate heap size.
   ATRACE_INT("Heap size (KB)", bytes_allocated_before_gc / KB);
 
+  if (gc_type == NonStickyGcType()) {
+    // Move all bytes from new_native_bytes_allocated_ to
+    // old_native_bytes_allocated_ now that GC has been triggered, resetting
+    // new_native_bytes_allocated_ to zero in the process.
+    old_native_bytes_allocated_.FetchAndAddRelaxed(new_native_bytes_allocated_.ExchangeRelaxed(0));
+  }
+
   DCHECK_LT(gc_type, collector::kGcTypeMax);
   DCHECK_NE(gc_type, collector::kGcTypeNone);
@@ -3504,18 +3493,6 @@ bool Heap::IsMovableObject(ObjPtr<mirror::Object> obj) const {
   return false;
 }
 
-void Heap::UpdateMaxNativeFootprint() {
-  size_t native_size = native_bytes_allocated_.LoadRelaxed();
-  // TODO: Tune the native heap utilization to be a value other than the java heap utilization.
-  size_t target_size = native_size / GetTargetHeapUtilization();
-  if (target_size > native_size + max_free_) {
-    target_size = native_size + max_free_;
-  } else if (target_size < native_size + min_free_) {
-    target_size = native_size + min_free_;
-  }
-  native_footprint_gc_watermark_ = std::min(growth_limit_, target_size);
-}
-
 collector::GarbageCollector* Heap::FindCollectorByGcType(collector::GcType gc_type) {
   for (const auto& collector : garbage_collectors_) {
     if (collector->GetCollectorType() == collector_type_ &&
@@ -3552,11 +3529,9 @@ void Heap::GrowForUtilization(collector::GarbageCollector* collector_ran,
     target_size = bytes_allocated + delta * multiplier;
     target_size = std::min(target_size, bytes_allocated + adjusted_max_free);
     target_size = std::max(target_size, bytes_allocated + adjusted_min_free);
-    native_need_to_run_finalization_ = true;
     next_gc_type_ = collector::kGcTypeSticky;
   } else {
-    collector::GcType non_sticky_gc_type =
-        HasZygoteSpace() ? collector::kGcTypePartial : collector::kGcTypeFull;
+    collector::GcType non_sticky_gc_type = NonStickyGcType();
     // Find what the next non sticky collector will be.
     collector::GarbageCollector* non_sticky_collector = FindCollectorByGcType(non_sticky_gc_type);
     // If the throughput of the current sticky GC >= throughput of the non sticky collector, then
@@ -3707,7 +3682,7 @@ void Heap::ConcurrentGC(Thread* self, bool force_full) {
     collector::GcType next_gc_type = next_gc_type_;
     // If forcing full and next gc type is sticky, override with a non-sticky type.
     if (force_full && next_gc_type == collector::kGcTypeSticky) {
-      next_gc_type = HasZygoteSpace() ? collector::kGcTypePartial : collector::kGcTypeFull;
+      next_gc_type = NonStickyGcType();
     }
     if (CollectGarbageInternal(next_gc_type, kGcCauseBackground, false) == collector::kGcTypeNone) {
@@ -3863,71 +3838,51 @@ void Heap::RunFinalization(JNIEnv* env, uint64_t timeout) {
                             static_cast<jlong>(timeout));
 }
 
-void Heap::RegisterNativeAllocation(JNIEnv* env, size_t bytes) {
-  Thread* self = ThreadForEnv(env);
-  {
-    MutexLock mu(self, native_histogram_lock_);
-    native_allocation_histogram_.AddValue(bytes);
-  }
-  if (native_need_to_run_finalization_) {
-    RunFinalization(env, kNativeAllocationFinalizeTimeout);
-    UpdateMaxNativeFootprint();
-    native_need_to_run_finalization_ = false;
-  }
-  // Total number of native bytes allocated.
-  size_t new_native_bytes_allocated = native_bytes_allocated_.FetchAndAddSequentiallyConsistent(bytes);
-  new_native_bytes_allocated += bytes;
-  if (new_native_bytes_allocated > native_footprint_gc_watermark_) {
-    collector::GcType gc_type = HasZygoteSpace() ? collector::kGcTypePartial :
-        collector::kGcTypeFull;
-
-    // The second watermark is higher than the gc watermark. If you hit this it means you are
-    // allocating native objects faster than the GC can keep up with.
-    if (new_native_bytes_allocated > growth_limit_) {
-      if (WaitForGcToComplete(kGcCauseForNativeAlloc, self) != collector::kGcTypeNone) {
-        // Just finished a GC, attempt to run finalizers.
-        RunFinalization(env, kNativeAllocationFinalizeTimeout);
-        CHECK(!env->ExceptionCheck());
-        // Native bytes allocated may be updated by finalization, refresh it.
-        new_native_bytes_allocated = native_bytes_allocated_.LoadRelaxed();
-      }
-      // If we still are over the watermark, attempt a GC for alloc and run finalizers.
-      if (new_native_bytes_allocated > growth_limit_) {
-        CollectGarbageInternal(gc_type, kGcCauseForNativeAlloc, false);
-        RunFinalization(env, kNativeAllocationFinalizeTimeout);
-        native_need_to_run_finalization_ = false;
-        CHECK(!env->ExceptionCheck());
-      }
-      // We have just run finalizers, update the native watermark since it is very likely that
-      // finalizers released native managed allocations.
-      UpdateMaxNativeFootprint();
-    } else if (!IsGCRequestPending()) {
-      if (IsGcConcurrent()) {
-        RequestConcurrentGC(self, true);  // Request non-sticky type.
-      } else {
-        CollectGarbageInternal(gc_type, kGcCauseForNativeAlloc, false);
-      }
+void Heap::TriggerGcForNativeAlloc(Thread* self) {
+  if (!IsGCRequestPending()) {
+    if (IsGcConcurrent()) {
+      RequestConcurrentGC(self, /*force_full*/true);
+    } else {
+      CollectGarbageInternal(NonStickyGcType(), kGcCauseForNativeAlloc, false);
     }
   }
 }
 
-void Heap::RegisterNativeFree(JNIEnv* env, size_t bytes) {
-  size_t expected_size;
-  {
-    MutexLock mu(Thread::Current(), native_histogram_lock_);
-    native_free_histogram_.AddValue(bytes);
+void Heap::RegisterNativeAllocation(JNIEnv* env, size_t bytes) {
+  // See the REDESIGN section of go/understanding-register-native-allocation
+  // for an explanation of how RegisterNativeAllocation works.
+  size_t new_value = bytes + new_native_bytes_allocated_.FetchAndAddRelaxed(bytes);
+  if (new_value > NativeAllocationBlockingGcWatermark()) {
+    // Wait for a new GC to finish and finalizers to run, because the
+    // allocation rate is too high.
+    Thread* self = ThreadForEnv(env);
+    WaitForGcToComplete(kGcCauseForNativeAlloc, self);
+    TriggerGcForNativeAlloc(self);
+    WaitForGcToComplete(kGcCauseForNativeAlloc, self);
+    RunFinalization(env, kNativeAllocationFinalizeTimeout);
+    CHECK(!env->ExceptionCheck());
+  } else if (new_value > NativeAllocationGcWatermark() && !IsGCRequestPending()) {
+    // Trigger another GC because there have been enough native bytes
+    // allocated since the last GC.
+    TriggerGcForNativeAlloc(ThreadForEnv(env));
   }
+}
+
+void Heap::RegisterNativeFree(JNIEnv*, size_t bytes) {
+  // Take the bytes freed out of new_native_bytes_allocated_ first. If
+  // new_native_bytes_allocated_ reaches zero, take the remaining bytes freed
+  // out of old_native_bytes_allocated_ to ensure all freed bytes are
+  // accounted for.
+  size_t allocated;
+  size_t new_freed_bytes;
   do {
-    expected_size = native_bytes_allocated_.LoadRelaxed();
-    if (UNLIKELY(bytes > expected_size)) {
-      ScopedObjectAccess soa(env);
-      env->ThrowNew(WellKnownClasses::java_lang_RuntimeException,
-                    StringPrintf("Attempted to free %zd native bytes with only %zd native bytes "
-                                 "registered as allocated", bytes, expected_size).c_str());
-      break;
-    }
-  } while (!native_bytes_allocated_.CompareExchangeWeakRelaxed(expected_size,
-                                                               expected_size - bytes));
+    allocated = new_native_bytes_allocated_.LoadRelaxed();
+    new_freed_bytes = std::min(allocated, bytes);
+  } while (!new_native_bytes_allocated_.CompareExchangeWeakRelaxed(allocated,
+                                                                   allocated - new_freed_bytes));
+  if (new_freed_bytes < bytes) {
+    old_native_bytes_allocated_.FetchAndSubRelaxed(bytes - new_freed_bytes);
+  }
 }
 
 size_t Heap::GetTotalMemory() const {
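Taken together, the rewritten heap.cc logic is a small accounting scheme: RegisterNativeAllocation adds to new_native_bytes_allocated_ and compares the running total against two watermarks, RegisterNativeFree drains the "new" counter first and charges any remainder to old_native_bytes_allocated_, and every non-sticky GC folds "new" into "old" (the CollectGarbageInternal hunk above). The sketch below models that scheme outside of ART; the GC and finalization calls are stubs and the watermark constants are arbitrary placeholders, not the runtime's real values.

    // Simplified model of the new accounting scheme (not ART code).
    #include <algorithm>
    #include <atomic>
    #include <cstddef>

    static std::atomic<size_t> new_native_bytes_allocated{0};
    static std::atomic<size_t> old_native_bytes_allocated{0};

    // Placeholder thresholds standing in for max_free_ and growth_limit_ / 2.
    static constexpr size_t kGcWatermark = 8u * 1024 * 1024;
    static constexpr size_t kBlockingGcWatermark = 128u * 1024 * 1024;

    static void TriggerGc() { /* request a non-sticky GC (stub) */ }
    static void WaitForGcAndFinalizers() { /* block until GC and finalizers finish (stub) */ }

    void RegisterNativeAllocation(size_t bytes) {
      size_t new_value = bytes + new_native_bytes_allocated.fetch_add(bytes);
      if (new_value > kBlockingGcWatermark) {
        // Registration is outpacing collection: block the caller until a GC
        // and the finalizers it queues have run.
        WaitForGcAndFinalizers();
      } else if (new_value > kGcWatermark) {
        // Enough native bytes since the last GC: request one without blocking.
        TriggerGc();
      }
    }

    void RegisterNativeFree(size_t bytes) {
      // Drain the "new" counter first; any remainder comes out of the "old"
      // counter, so every freed byte is accounted against one of the two.
      size_t allocated;
      size_t new_freed_bytes;
      do {
        allocated = new_native_bytes_allocated.load();
        new_freed_bytes = std::min(allocated, bytes);
      } while (!new_native_bytes_allocated.compare_exchange_weak(
                   allocated, allocated - new_freed_bytes));
      if (new_freed_bytes < bytes) {
        old_native_bytes_allocated.fetch_sub(bytes - new_freed_bytes);
      }
    }

    void OnNonStickyGc() {
      // At each non-sticky GC, fold the per-GC counter into the running total
      // in one lock-free step; their sum is what the GC performance dump reports.
      old_native_bytes_allocated.fetch_add(new_native_bytes_allocated.exchange(0));
    }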
diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h
index 3a8e29b08..b2a414293 100644
--- a/runtime/gc/heap.h
+++ b/runtime/gc/heap.h
@@ -260,9 +260,9 @@ class Heap {
       REQUIRES_SHARED(Locks::mutator_lock_);
 
   void RegisterNativeAllocation(JNIEnv* env, size_t bytes)
-      REQUIRES(!*gc_complete_lock_, !*pending_task_lock_, !native_histogram_lock_);
+      REQUIRES(!*gc_complete_lock_, !*pending_task_lock_);
   void RegisterNativeFree(JNIEnv* env, size_t bytes)
-      REQUIRES(!*gc_complete_lock_, !*pending_task_lock_, !native_histogram_lock_);
+      REQUIRES(!*gc_complete_lock_, !*pending_task_lock_);
 
   // Change the allocator, updates entrypoints.
   void ChangeAllocator(AllocatorType allocator)
@@ -562,7 +562,7 @@ class Heap {
   space::Space* FindSpaceFromAddress(const void* ptr) const
       REQUIRES_SHARED(Locks::mutator_lock_);
 
-  void DumpForSigQuit(std::ostream& os) REQUIRES(!*gc_complete_lock_, !native_histogram_lock_);
+  void DumpForSigQuit(std::ostream& os) REQUIRES(!*gc_complete_lock_);
 
   // Do a pending collector transition.
   void DoPendingCollectorTransition() REQUIRES(!*gc_complete_lock_, !*pending_task_lock_);
@@ -679,7 +679,7 @@ class Heap {
   // GC performance measuring
   void DumpGcPerformanceInfo(std::ostream& os)
-      REQUIRES(!*gc_complete_lock_, !native_histogram_lock_);
+      REQUIRES(!*gc_complete_lock_);
   void ResetGcPerformanceInfo() REQUIRES(!*gc_complete_lock_);
 
   // Thread pool.
@@ -979,10 +979,6 @@ class Heap {
   void PostGcVerificationPaused(collector::GarbageCollector* gc)
       REQUIRES(Locks::mutator_lock_, !*gc_complete_lock_);
 
-  // Update the watermark for the native allocated bytes based on the current number of native
-  // bytes allocated and the target utilization ratio.
-  void UpdateMaxNativeFootprint();
-
   // Find a collector based on GC type.
   collector::GarbageCollector* FindCollectorByGcType(collector::GcType gc_type);
 
@@ -1066,6 +1062,36 @@ class Heap {
       REQUIRES_SHARED(Locks::mutator_lock_)
       REQUIRES(!*gc_complete_lock_, !*pending_task_lock_, !*backtrace_lock_);
 
+  collector::GcType NonStickyGcType() const {
+    return HasZygoteSpace() ? collector::kGcTypePartial : collector::kGcTypeFull;
+  }
+
+  // Helper function for RegisterNativeAllocation. Request a non-sticky GC for
+  // NativeAlloc if one isn't currently running. The GC is performed in place
+  // if the garbage collector is not concurrent.
+  void TriggerGcForNativeAlloc(Thread* self) REQUIRES(!*gc_complete_lock_, !*pending_task_lock_);
+
+  // How large new_native_bytes_allocated_ can grow before we trigger a new
+  // GC.
+  ALWAYS_INLINE size_t NativeAllocationGcWatermark() const {
+    // Reuse max_free_ for the native allocation gc watermark, so that the
+    // native heap is treated in the same way as the Java heap in the case
+    // where the gc watermark update would exceed max_free_. Using max_free_
+    // instead of the target utilization means the watermark doesn't depend on
+    // the current number of registered native allocations.
+    return max_free_;
+  }
+
+  // How large new_native_bytes_allocated_ can grow while GC is in progress
+  // before we block the allocating thread to allow GC to catch up.
+  ALWAYS_INLINE size_t NativeAllocationBlockingGcWatermark() const {
+    // Historically the native allocations were bounded by growth_limit_. This
+    // uses that same value, dividing growth_limit_ by 2 to account for
+    // the fact that now the bound is relative to the number of retained
+    // registered native allocations rather than absolute.
+    return growth_limit_ / 2;
+  }
+
   // All-known continuous spaces, where objects lie within fixed bounds.
   std::vector<space::ContinuousSpace*> continuous_spaces_ GUARDED_BY(Locks::mutator_lock_);
 
@@ -1184,12 +1210,6 @@ class Heap {
   // a GC should be triggered.
   size_t max_allowed_footprint_;
 
-  // The watermark at which a concurrent GC is requested by registerNativeAllocation.
-  size_t native_footprint_gc_watermark_;
-
-  // Whether or not we need to run finalizers in the next native allocation.
-  bool native_need_to_run_finalization_;
-
   // When num_bytes_allocated_ exceeds this amount then a concurrent GC should be requested so that
   // it completes ahead of an allocation failing.
   size_t concurrent_start_bytes_;
@@ -1203,13 +1223,18 @@ class Heap {
   // Number of bytes allocated. Adjusted after each allocation and free.
   Atomic<size_t> num_bytes_allocated_;
 
-  // Bytes which are allocated and managed by native code but still need to be accounted for.
-  Atomic<size_t> native_bytes_allocated_;
-
-  // Native allocation stats.
-  Mutex native_histogram_lock_;
-  Histogram<uint64_t> native_allocation_histogram_;
-  Histogram<uint64_t> native_free_histogram_;
+  // Number of registered native bytes allocated since the last time GC was
+  // triggered. Adjusted after each RegisterNativeAllocation and
+  // RegisterNativeFree. Used to determine when to trigger GC for native
+  // allocations.
+  // See the REDESIGN section of go/understanding-register-native-allocation.
+  Atomic<size_t> new_native_bytes_allocated_;
+
+  // Number of registered native bytes allocated prior to the last time GC was
+  // triggered, for debugging purposes. The current number of registered
+  // native bytes is determined by taking the sum of
+  // old_native_bytes_allocated_ and new_native_bytes_allocated_.
+  Atomic<size_t> old_native_bytes_allocated_;
 
   // Number of bytes freed by thread local buffer revokes. This will
   // cancel out the ahead-of-time bulk counting of bytes allocated in
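NativeAllocationGcWatermark() and NativeAllocationBlockingGcWatermark() above are the whole trigger policy: a non-sticky GC is requested once new_native_bytes_allocated_ exceeds max_free_, and the registering thread only blocks and runs finalization once it exceeds growth_limit_ / 2. A worked example with purely hypothetical numbers (real values depend on the device's heap configuration, e.g. the dalvik.vm.heapmaxfree and dalvik.vm.heapgrowthlimit properties):

    // Hypothetical configuration, for illustration only.
    #include <cstddef>
    #include <iostream>

    int main() {
      size_t max_free = 8u * 1024 * 1024;        // stands in for max_free_
      size_t growth_limit = 256u * 1024 * 1024;  // stands in for growth_limit_

      size_t gc_watermark = max_free;                // NativeAllocationGcWatermark()
      size_t blocking_watermark = growth_limit / 2;  // NativeAllocationBlockingGcWatermark()

      // With these numbers, a burst of 10 MiB of registered native allocations
      // requests a background GC (10 MiB > 8 MiB); RegisterNativeAllocation only
      // blocks the caller if 128 MiB accumulate before a GC retires them.
      std::cout << "request GC above " << gc_watermark / (1024 * 1024) << " MiB, "
                << "block above " << blocking_watermark / (1024 * 1024) << " MiB\n";
      return 0;
    }

Unlike the removed UpdateMaxNativeFootprint() path, neither threshold depends on the current number of registered native bytes or on the target heap utilization, which is the "GCing apps to death near growth_limit_" behaviour the commit message says this redesign removes.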
diff --git a/test/004-NativeAllocations/src/Main.java b/test/004-NativeAllocations/src/Main.java
index 92f4e21f4..871275512 100644
--- a/test/004-NativeAllocations/src/Main.java
+++ b/test/004-NativeAllocations/src/Main.java
@@ -16,6 +16,7 @@
 
 import java.lang.reflect.*;
 import java.lang.Runtime;
+import dalvik.system.VMRuntime;
 
 public class Main {
     static Object nativeLock = new Object();
@@ -33,10 +34,19 @@ public class Main {
     NativeAllocation(int bytes, boolean testingDeadlock) throws Exception {
         this.bytes = bytes;
         register_native_allocation.invoke(runtime, bytes);
+
+        // Register native allocation can only provide guarantees bounding
+        // the maximum outstanding allocations if finalizers don't time
+        // out. In case finalizers have timed out, wait longer for them
+        // now to complete so we can test the guarantees.
+        if (!testingDeadlock) {
+            VMRuntime.runFinalization(0);
+        }
+
         synchronized (nativeLock) {
             if (!testingDeadlock) {
                 nativeBytes += bytes;
-                if (nativeBytes > maxMem) {
+                if (nativeBytes > 2 * maxMem) {
                     throw new OutOfMemoryError();
                 }
             }
-- 
2.11.0