From 5ace201d84adb7753680bf4c7877b3b71558da82 Mon Sep 17 00:00:00 2001 From: Mathieu Chartier Date: Wed, 30 Nov 2016 10:15:41 -0800 Subject: [PATCH] Revert "Revert CC related changes." Disable entrypoint switching in ResetQuickAllocEntryPointsForThread instead of callers. Fixes bug where instrumentation would switch to non CC entrypoints for non X86_64 architectures causing aborts. Bug: 31018974 Test: test-art-host Test: test/run-test 099 This reverts commit 96172e0172c5fca6e9a5ad4b857a24d8c7b064e5. Change-Id: If206694ae35ff4446c6a8a97bfbcbf2dac35e3f9 --- runtime/arch/mips/entrypoints_init_mips.cc | 2 +- runtime/arch/quick_alloc_entrypoints.S | 35 +++-- runtime/arch/x86/quick_entrypoints_x86.S | 7 +- runtime/arch/x86_64/quick_entrypoints_x86_64.S | 150 ++++++++++++--------- .../entrypoints/quick/quick_alloc_entrypoints.cc | 10 +- .../entrypoints/quick/quick_alloc_entrypoints.h | 4 +- .../quick/quick_default_init_entrypoints.h | 2 +- runtime/gc/heap-inl.h | 124 +++++++---------- runtime/gc/heap.cc | 69 +++++++++- runtime/gc/heap.h | 17 ++- runtime/instrumentation.cc | 2 +- runtime/thread.cc | 13 +- runtime/thread.h | 2 +- 13 files changed, 267 insertions(+), 170 deletions(-) diff --git a/runtime/arch/mips/entrypoints_init_mips.cc b/runtime/arch/mips/entrypoints_init_mips.cc index 6a442a55b..5c569232a 100644 --- a/runtime/arch/mips/entrypoints_init_mips.cc +++ b/runtime/arch/mips/entrypoints_init_mips.cc @@ -71,7 +71,7 @@ void InitEntryPoints(JniEntryPoints* jpoints, QuickEntryPoints* qpoints) { jpoints->pDlsymLookup = art_jni_dlsym_lookup_stub; // Alloc - ResetQuickAllocEntryPoints(qpoints); + ResetQuickAllocEntryPoints(qpoints, /*is_marking*/ false); // Cast qpoints->pInstanceofNonTrivial = artInstanceOfFromCode; diff --git a/runtime/arch/quick_alloc_entrypoints.S b/runtime/arch/quick_alloc_entrypoints.S index fa86bf408..db2fdcabe 100644 --- a/runtime/arch/quick_alloc_entrypoints.S +++ b/runtime/arch/quick_alloc_entrypoints.S @@ -107,7 +107,28 @@ GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_region_tlab, RegionTLAB) GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_region_tlab, RegionTLAB) .endm +.macro GENERATE_ALLOC_ENTRYPOINTS_FOR_TLAB_ALLOCATOR +// This is to be separately defined for each architecture to allow a hand-written assembly fast path. 
+// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_WITH_ACCESS_CHECK(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY_WITH_ACCESS_CHECK(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_tlab, TLAB) +.endm + .macro GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_REGION_TLAB_ALLOCATORS +GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_TLAB_ALLOCATORS +GENERATE_ALLOC_ENTRYPOINTS_FOR_TLAB_ALLOCATOR +.endm + +.macro GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_TLAB_ALLOCATORS GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_dlmalloc, DlMalloc) GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_dlmalloc, DlMalloc) GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_dlmalloc, DlMalloc) @@ -187,20 +208,6 @@ GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_bump_pointer_instrumented, B GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_bump_pointer_instrumented, BumpPointerInstrumented) GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_bump_pointer_instrumented, BumpPointerInstrumented) -// This is to be separately defined for each architecture to allow a hand-written assembly fast path. -// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB) -GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB) -GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_tlab, TLAB) -GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_tlab, TLAB) -GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY(_tlab, TLAB) -GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_tlab, TLAB) -GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_WITH_ACCESS_CHECK(_tlab, TLAB) -GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY(_tlab, TLAB) -GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY_WITH_ACCESS_CHECK(_tlab, TLAB) -GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_tlab, TLAB) -GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_tlab, TLAB) -GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_tlab, TLAB) - GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab_instrumented, TLABInstrumented) GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab_instrumented, TLABInstrumented) GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_tlab_instrumented, TLABInstrumented) diff --git a/runtime/arch/x86/quick_entrypoints_x86.S b/runtime/arch/x86/quick_entrypoints_x86.S index fb405fac0..6fbc9547e 100644 --- a/runtime/arch/x86/quick_entrypoints_x86.S +++ b/runtime/arch/x86/quick_entrypoints_x86.S @@ -1085,15 +1085,12 @@ MACRO1(ALLOC_OBJECT_TLAB_SLOW_PATH, cxx_name) RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER // return or deliver exception END_MACRO -// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB). +// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB). May be called +// for CC if the GC is not marking. DEFINE_FUNCTION art_quick_alloc_object_tlab // Fast path tlab allocation. // EAX: uint32_t type_idx/return value, ECX: ArtMethod*. // EBX, EDX: free. 
-#if defined(USE_READ_BARRIER) - int3 - int3 -#endif PUSH esi PUSH edi movl ART_METHOD_DEX_CACHE_TYPES_OFFSET_32(%ecx), %edx // Load dex cache resolved types array diff --git a/runtime/arch/x86_64/quick_entrypoints_x86_64.S b/runtime/arch/x86_64/quick_entrypoints_x86_64.S index 860b77efe..f8066e45f 100644 --- a/runtime/arch/x86_64/quick_entrypoints_x86_64.S +++ b/runtime/arch/x86_64/quick_entrypoints_x86_64.S @@ -18,6 +18,13 @@ #include "arch/quick_alloc_entrypoints.S" +MACRO0(ASSERT_USE_READ_BARRIER) +#if !defined(USE_READ_BARRIER) + int3 + int3 +#endif +END_MACRO + MACRO0(SETUP_FP_CALLEE_SAVE_FRAME) // Create space for ART FP callee-saved registers subq MACRO_LITERAL(4 * 8), %rsp @@ -972,8 +979,10 @@ MACRO0(RETURN_OR_DELIVER_PENDING_EXCEPTION) END_MACRO // Generate the allocation entrypoints for each allocator. -GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_REGION_TLAB_ALLOCATORS +GENERATE_ALLOC_ENTRYPOINTS_FOR_NON_TLAB_ALLOCATORS + // Comment out allocators that have x86_64 specific asm. +// Region TLAB: // GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_region_tlab, RegionTLAB) // GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_region_tlab, RegionTLAB) // GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_region_tlab, RegionTLAB) @@ -986,6 +995,19 @@ GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY_WITH_ACCESS_CHECK(_region_tlab, GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_region_tlab, RegionTLAB) GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_region_tlab, RegionTLAB) GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_region_tlab, RegionTLAB) +// Normal TLAB: +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_WITH_ACCESS_CHECK(_tlab, TLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY(_tlab, TLAB) +// GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_WITH_ACCESS_CHECK(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_CHECK_AND_ALLOC_ARRAY_WITH_ACCESS_CHECK(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_BYTES(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_CHARS(_tlab, TLAB) +GENERATE_ALLOC_ENTRYPOINTS_ALLOC_STRING_FROM_STRING(_tlab, TLAB) // A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc). DEFINE_FUNCTION art_quick_alloc_object_rosalloc @@ -1162,16 +1184,11 @@ MACRO1(ALLOC_ARRAY_TLAB_SLOW_PATH, cxx_name) RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER // return or deliver exception END_MACRO -// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB). +// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB). May be +// called with CC if the GC is not active. DEFINE_FUNCTION art_quick_alloc_object_tlab - // Fast path tlab allocation. // RDI: uint32_t type_idx, RSI: ArtMethod* // RDX, RCX, R8, R9: free. RAX: return val. -#if defined(USE_READ_BARRIER) - int3 - int3 -#endif - // Might need a special macro since rsi and edx is 32b/64b mismatched. movq ART_METHOD_DEX_CACHE_TYPES_OFFSET_64(%rsi), %rdx // Load dex cache resolved types array // Might need to break down into multiple instructions to get the base address in a register. 
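                                                            // Roughly: klass = resolved_types[type_idx], a 32-bit compressed reference
                                                            // (hence the COMPRESSED_REFERENCE_SIZE scaling below).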
// Load the class @@ -1181,29 +1198,69 @@ DEFINE_FUNCTION art_quick_alloc_object_tlab ALLOC_OBJECT_TLAB_SLOW_PATH artAllocObjectFromCodeTLAB END_FUNCTION art_quick_alloc_object_tlab +// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_RESOLVED(_tlab, TLAB). May be +// called with CC if the GC is not active. +DEFINE_FUNCTION art_quick_alloc_object_resolved_tlab + // RDI: mirror::Class* klass, RSI: ArtMethod* + // RDX, RCX, R8, R9: free. RAX: return val. + movq %rdi, %rdx + ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lart_quick_alloc_object_resolved_tlab_slow_path +.Lart_quick_alloc_object_resolved_tlab_slow_path: + ALLOC_OBJECT_TLAB_SLOW_PATH artAllocObjectFromCodeResolvedTLAB +END_FUNCTION art_quick_alloc_object_resolved_tlab + +// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT_INITIALIZED(_tlab, TLAB). +// May be called with CC if the GC is not active. +DEFINE_FUNCTION art_quick_alloc_object_initialized_tlab + // RDI: mirror::Class* klass, RSI: ArtMethod* + // RDX, RCX, R8, R9: free. RAX: return val. + movq %rdi, %rdx + ALLOC_OBJECT_INITIALIZED_TLAB_FAST_PATH .Lart_quick_alloc_object_initialized_tlab_slow_path +.Lart_quick_alloc_object_initialized_tlab_slow_path: + ALLOC_OBJECT_TLAB_SLOW_PATH artAllocObjectFromCodeInitializedTLAB +END_FUNCTION art_quick_alloc_object_initialized_tlab + +// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY(_tlab, TLAB). +DEFINE_FUNCTION art_quick_alloc_array_tlab + // RDI: uint32_t type_idx, RSI: int32_t component_count, RDX: ArtMethod* + // RCX: klass, R8, R9: free. RAX: return val. + movq ART_METHOD_DEX_CACHE_TYPES_OFFSET_64(%rdx), %rcx // Load dex cache resolved types array + movl 0(%rcx, %rdi, COMPRESSED_REFERENCE_SIZE), %ecx // Load the class + testl %ecx, %ecx + jz .Lart_quick_alloc_array_tlab_slow_path + ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED .Lart_quick_alloc_array_tlab_slow_path +.Lart_quick_alloc_array_tlab_slow_path: + ALLOC_ARRAY_TLAB_SLOW_PATH artAllocArrayFromCodeTLAB +END_FUNCTION art_quick_alloc_array_tlab + +// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY_RESOLVED(_tlab, TLAB). +DEFINE_FUNCTION art_quick_alloc_array_resolved_tlab + // RDI: mirror::Class* klass, RSI: int32_t component_count, RDX: ArtMethod* + // RCX: mirror::Class* klass, R8, R9: free. RAX: return val. + movq %rdi, %rcx + // Already resolved, no null check. + ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED .Lart_quick_alloc_array_resolved_tlab_slow_path +.Lart_quick_alloc_array_resolved_tlab_slow_path: + ALLOC_ARRAY_TLAB_SLOW_PATH artAllocArrayFromCodeResolvedTLAB +END_FUNCTION art_quick_alloc_array_resolved_tlab + // A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_ARRAY(_region_tlab, RegionTLAB). DEFINE_FUNCTION art_quick_alloc_array_region_tlab // Fast path region tlab allocation. // RDI: uint32_t type_idx, RSI: int32_t component_count, RDX: ArtMethod* // RCX: klass, R8, R9: free. RAX: return val. -#if !defined(USE_READ_BARRIER) - int3 - int3 -#endif + ASSERT_USE_READ_BARRIER movq ART_METHOD_DEX_CACHE_TYPES_OFFSET_64(%rdx), %rcx // Load dex cache resolved types array movl 0(%rcx, %rdi, COMPRESSED_REFERENCE_SIZE), %ecx // Load the class // Null check so that we can load the lock word. testl %ecx, %ecx jz .Lart_quick_alloc_array_region_tlab_slow_path - - cmpl LITERAL(0), %gs:THREAD_IS_GC_MARKING_OFFSET - jne .Lart_quick_alloc_array_region_tlab_class_load_read_barrier_marking + // Since we have allocation entrypoint switching, we know the GC is marking. 
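+    // If the GC were not marking, ResetQuickAllocEntryPoints would have installed the plain
+    // _tlab entrypoints instead, so the old THREAD_IS_GC_MARKING runtime check is not needed here.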
+ // Check the mark bit, if it is 0, do the read barrier mark. + testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%ecx) + jz .Lart_quick_alloc_array_region_tlab_class_load_read_barrier_slow_path .Lart_quick_alloc_array_region_tlab_class_load_read_barrier_slow_path_exit: ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED .Lart_quick_alloc_array_region_tlab_slow_path -.Lart_quick_alloc_array_region_tlab_class_load_read_barrier_marking: - // Check the mark bit, if it is 1 return. - testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%ecx) - jnz .Lart_quick_alloc_array_region_tlab_class_load_read_barrier_slow_path_exit .Lart_quick_alloc_array_region_tlab_class_load_read_barrier_slow_path: // The read barrier slow path. Mark the class. PUSH rdi @@ -1226,33 +1283,11 @@ DEFINE_FUNCTION art_quick_alloc_array_resolved_region_tlab // Fast path region tlab allocation. // RDI: mirror::Class* klass, RSI: int32_t component_count, RDX: ArtMethod* // RCX: mirror::Class* klass, R8, R9: free. RAX: return val. -#if !defined(USE_READ_BARRIER) - int3 - int3 -#endif + ASSERT_USE_READ_BARRIER movq %rdi, %rcx + // Caller is responsible for read barrier. // Already resolved, no null check. - cmpl LITERAL(0), %gs:THREAD_IS_GC_MARKING_OFFSET - jne .Lart_quick_alloc_array_resolved_region_tlab_class_load_read_barrier_marking -.Lart_quick_alloc_array_resolved_region_tlab_class_load_read_barrier_slow_path_exit: ALLOC_ARRAY_TLAB_FAST_PATH_RESOLVED .Lart_quick_alloc_array_resolved_region_tlab_slow_path -.Lart_quick_alloc_array_resolved_region_tlab_class_load_read_barrier_marking: - // Check the mark bit, if it is 1 return. - testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%ecx) - jnz .Lart_quick_alloc_array_region_tlab_class_load_read_barrier_slow_path_exit -.Lart_quick_alloc_array_resolved_region_tlab_class_load_read_barrier_slow_path: - // The read barrier slow path. Mark the class. - PUSH rdi - PUSH rsi - PUSH rdx - // Outgoing argument set up - movq %rcx, %rdi // Pass the class as the first param. - call SYMBOL(artReadBarrierMark) // cxx_name(mirror::Object* obj) - movq %rax, %rcx - POP rdx - POP rsi - POP rdi - jmp .Lart_quick_alloc_array_resolved_region_tlab_class_load_read_barrier_slow_path_exit .Lart_quick_alloc_array_resolved_region_tlab_slow_path: ALLOC_ARRAY_TLAB_SLOW_PATH artAllocArrayFromCodeResolvedRegionTLAB END_FUNCTION art_quick_alloc_array_resolved_region_tlab @@ -1262,24 +1297,19 @@ DEFINE_FUNCTION art_quick_alloc_object_region_tlab // Fast path region tlab allocation. // RDI: uint32_t type_idx, RSI: ArtMethod* // RDX, RCX, R8, R9: free. RAX: return val. -#if !defined(USE_READ_BARRIER) - int3 - int3 -#endif + ASSERT_USE_READ_BARRIER movq ART_METHOD_DEX_CACHE_TYPES_OFFSET_64(%rsi), %rdx // Load dex cache resolved types array movl 0(%rdx, %rdi, COMPRESSED_REFERENCE_SIZE), %edx // Load the class // Null check so that we can load the lock word. testl %edx, %edx jz .Lart_quick_alloc_object_region_tlab_slow_path - // Test if the GC is marking. - cmpl LITERAL(0), %gs:THREAD_IS_GC_MARKING_OFFSET - jne .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_marking -.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit: - ALLOC_OBJECT_TLAB_FAST_PATH .Lart_quick_alloc_object_region_tlab_slow_path -.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_marking: - // Check the mark bit, if it is 1 avoid the read barrier. + // Since we have allocation entrypoint switching, we know the GC is marking. 
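+    // Roughly, in C++ terms (a sketch, not the exact lock word API):
+    //   if ((klass->GetLockWord().GetValue() & LOCK_WORD_MARK_BIT_MASK_SHIFTED) == 0) {
+    //     klass = artReadBarrierMark(klass);  // slow path: mark the class, get the to-space reference
+    //   }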
+ // Check the mark bit, if it is 0, do the read barrier mark. testl LITERAL(LOCK_WORD_MARK_BIT_MASK_SHIFTED), MIRROR_OBJECT_LOCK_WORD_OFFSET(%edx) - jnz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit + jz .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path +.Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path_exit: + // Use resolved one since we already did the null check. + ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lart_quick_alloc_object_region_tlab_slow_path .Lart_quick_alloc_object_region_tlab_class_load_read_barrier_slow_path: // The read barrier slow path. Mark the class. PUSH rdi @@ -1302,10 +1332,7 @@ DEFINE_FUNCTION art_quick_alloc_object_resolved_region_tlab // Fast path region tlab allocation. // RDI: mirror::Class* klass, RSI: ArtMethod* // RDX, RCX, R8, R9: free. RAX: return val. -#if !defined(USE_READ_BARRIER) - int3 - int3 -#endif + ASSERT_USE_READ_BARRIER // No read barrier since the caller is responsible for that. movq %rdi, %rdx ALLOC_OBJECT_RESOLVED_TLAB_FAST_PATH .Lart_quick_alloc_object_resolved_region_tlab_slow_path @@ -1318,10 +1345,7 @@ DEFINE_FUNCTION art_quick_alloc_object_initialized_region_tlab // Fast path region tlab allocation. // RDI: mirror::Class* klass, RSI: ArtMethod* // RDX, RCX, R8, R9: free. RAX: return val. -#if !defined(USE_READ_BARRIER) - int3 - int3 -#endif + ASSERT_USE_READ_BARRIER movq %rdi, %rdx // No read barrier since the caller is responsible for that. ALLOC_OBJECT_INITIALIZED_TLAB_FAST_PATH .Lart_quick_alloc_object_initialized_region_tlab_slow_path diff --git a/runtime/entrypoints/quick/quick_alloc_entrypoints.cc b/runtime/entrypoints/quick/quick_alloc_entrypoints.cc index 397655a89..82bb8e53c 100644 --- a/runtime/entrypoints/quick/quick_alloc_entrypoints.cc +++ b/runtime/entrypoints/quick/quick_alloc_entrypoints.cc @@ -292,7 +292,7 @@ void SetQuickAllocEntryPointsInstrumented(bool instrumented) { entry_points_instrumented = instrumented; } -void ResetQuickAllocEntryPoints(QuickEntryPoints* qpoints) { +void ResetQuickAllocEntryPoints(QuickEntryPoints* qpoints, bool is_marking) { #if !defined(__APPLE__) || !defined(__LP64__) switch (entry_points_allocator) { case gc::kAllocatorTypeDlMalloc: { @@ -320,7 +320,12 @@ void ResetQuickAllocEntryPoints(QuickEntryPoints* qpoints) { } case gc::kAllocatorTypeRegionTLAB: { CHECK(kMovingCollector); - SetQuickAllocEntryPoints_region_tlab(qpoints, entry_points_instrumented); + if (is_marking) { + SetQuickAllocEntryPoints_region_tlab(qpoints, entry_points_instrumented); + } else { + // Not marking means we need no read barriers and can just use the normal TLAB case. + SetQuickAllocEntryPoints_tlab(qpoints, entry_points_instrumented); + } return; } default: @@ -328,6 +333,7 @@ void ResetQuickAllocEntryPoints(QuickEntryPoints* qpoints) { } #else UNUSED(qpoints); + UNUSED(is_marking); #endif UNIMPLEMENTED(FATAL); UNREACHABLE(); diff --git a/runtime/entrypoints/quick/quick_alloc_entrypoints.h b/runtime/entrypoints/quick/quick_alloc_entrypoints.h index 14a8e0428..bd1e295e4 100644 --- a/runtime/entrypoints/quick/quick_alloc_entrypoints.h +++ b/runtime/entrypoints/quick/quick_alloc_entrypoints.h @@ -23,7 +23,9 @@ namespace art { -void ResetQuickAllocEntryPoints(QuickEntryPoints* qpoints); +// is_marking is only used for CC, if the GC is marking the allocation entrypoint is the marking +// one. +void ResetQuickAllocEntryPoints(QuickEntryPoints* qpoints, bool is_marking); // Runtime shutdown lock is necessary to prevent races in thread initialization. 
When the thread is // starting it doesn't hold the mutator lock until after it has been added to the thread list. diff --git a/runtime/entrypoints/quick/quick_default_init_entrypoints.h b/runtime/entrypoints/quick/quick_default_init_entrypoints.h index df23f94a3..78dad94df 100644 --- a/runtime/entrypoints/quick/quick_default_init_entrypoints.h +++ b/runtime/entrypoints/quick/quick_default_init_entrypoints.h @@ -31,7 +31,7 @@ void DefaultInitEntryPoints(JniEntryPoints* jpoints, QuickEntryPoints* qpoints) jpoints->pDlsymLookup = art_jni_dlsym_lookup_stub; // Alloc - ResetQuickAllocEntryPoints(qpoints); + ResetQuickAllocEntryPoints(qpoints, /* is_marking */ true); // DexCache qpoints->pInitializeStaticStorage = art_quick_initialize_static_storage; diff --git a/runtime/gc/heap-inl.h b/runtime/gc/heap-inl.h index 97129e8b1..54f221056 100644 --- a/runtime/gc/heap-inl.h +++ b/runtime/gc/heap-inl.h @@ -247,7 +247,7 @@ inline mirror::Object* Heap::TryToAllocate(Thread* self, if (allocator_type != kAllocatorTypeTLAB && allocator_type != kAllocatorTypeRegionTLAB && allocator_type != kAllocatorTypeRosAlloc && - UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type, alloc_size))) { + UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type, alloc_size, kGrow))) { return nullptr; } mirror::Object* ret; @@ -267,8 +267,9 @@ inline mirror::Object* Heap::TryToAllocate(Thread* self, if (kInstrumented && UNLIKELY(is_running_on_memory_tool_)) { // If running on valgrind or asan, we should be using the instrumented path. size_t max_bytes_tl_bulk_allocated = rosalloc_space_->MaxBytesBulkAllocatedFor(alloc_size); - if (UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type, - max_bytes_tl_bulk_allocated))) { + if (UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type, + max_bytes_tl_bulk_allocated, + kGrow))) { return nullptr; } ret = rosalloc_space_->Alloc(self, alloc_size, bytes_allocated, usable_size, @@ -277,14 +278,18 @@ inline mirror::Object* Heap::TryToAllocate(Thread* self, DCHECK(!is_running_on_memory_tool_); size_t max_bytes_tl_bulk_allocated = rosalloc_space_->MaxBytesBulkAllocatedForNonvirtual(alloc_size); - if (UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type, - max_bytes_tl_bulk_allocated))) { + if (UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type, + max_bytes_tl_bulk_allocated, + kGrow))) { return nullptr; } if (!kInstrumented) { DCHECK(!rosalloc_space_->CanAllocThreadLocal(self, alloc_size)); } - ret = rosalloc_space_->AllocNonvirtual(self, alloc_size, bytes_allocated, usable_size, + ret = rosalloc_space_->AllocNonvirtual(self, + alloc_size, + bytes_allocated, + usable_size, bytes_tl_bulk_allocated); } break; @@ -292,22 +297,34 @@ inline mirror::Object* Heap::TryToAllocate(Thread* self, case kAllocatorTypeDlMalloc: { if (kInstrumented && UNLIKELY(is_running_on_memory_tool_)) { // If running on valgrind, we should be using the instrumented path. 
- ret = dlmalloc_space_->Alloc(self, alloc_size, bytes_allocated, usable_size, + ret = dlmalloc_space_->Alloc(self, + alloc_size, + bytes_allocated, + usable_size, bytes_tl_bulk_allocated); } else { DCHECK(!is_running_on_memory_tool_); - ret = dlmalloc_space_->AllocNonvirtual(self, alloc_size, bytes_allocated, usable_size, + ret = dlmalloc_space_->AllocNonvirtual(self, + alloc_size, + bytes_allocated, + usable_size, bytes_tl_bulk_allocated); } break; } case kAllocatorTypeNonMoving: { - ret = non_moving_space_->Alloc(self, alloc_size, bytes_allocated, usable_size, + ret = non_moving_space_->Alloc(self, + alloc_size, + bytes_allocated, + usable_size, bytes_tl_bulk_allocated); break; } case kAllocatorTypeLOS: { - ret = large_object_space_->Alloc(self, alloc_size, bytes_allocated, usable_size, + ret = large_object_space_->Alloc(self, + alloc_size, + bytes_allocated, + usable_size, bytes_tl_bulk_allocated); // Note that the bump pointer spaces aren't necessarily next to // the other continuous spaces like the non-moving alloc space or @@ -315,80 +332,38 @@ inline mirror::Object* Heap::TryToAllocate(Thread* self, DCHECK(ret == nullptr || large_object_space_->Contains(ret)); break; } - case kAllocatorTypeTLAB: { - DCHECK_ALIGNED(alloc_size, space::BumpPointerSpace::kAlignment); - if (UNLIKELY(self->TlabSize() < alloc_size)) { - const size_t new_tlab_size = alloc_size + kDefaultTLABSize; - if (UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type, new_tlab_size))) { - return nullptr; - } - // Try allocating a new thread local buffer, if the allocaiton fails the space must be - // full so return null. - if (!bump_pointer_space_->AllocNewTlab(self, new_tlab_size)) { - return nullptr; - } - *bytes_tl_bulk_allocated = new_tlab_size; - } else { - *bytes_tl_bulk_allocated = 0; - } - // The allocation can't fail. - ret = self->AllocTlab(alloc_size); - DCHECK(ret != nullptr); - *bytes_allocated = alloc_size; - *usable_size = alloc_size; - break; - } case kAllocatorTypeRegion: { DCHECK(region_space_ != nullptr); alloc_size = RoundUp(alloc_size, space::RegionSpace::kAlignment); - ret = region_space_->AllocNonvirtual(alloc_size, bytes_allocated, usable_size, + ret = region_space_->AllocNonvirtual(alloc_size, + bytes_allocated, + usable_size, bytes_tl_bulk_allocated); break; } + case kAllocatorTypeTLAB: + FALLTHROUGH_INTENDED; case kAllocatorTypeRegionTLAB: { - DCHECK(region_space_ != nullptr); - DCHECK_ALIGNED(alloc_size, space::RegionSpace::kAlignment); + DCHECK_ALIGNED(alloc_size, kObjectAlignment); + static_assert(space::RegionSpace::kAlignment == space::BumpPointerSpace::kAlignment, + "mismatched alignments"); + static_assert(kObjectAlignment == space::BumpPointerSpace::kAlignment, + "mismatched alignments"); if (UNLIKELY(self->TlabSize() < alloc_size)) { - if (space::RegionSpace::kRegionSize >= alloc_size) { - // Non-large. Check OOME for a tlab. - if (LIKELY(!IsOutOfMemoryOnAllocation(allocator_type, space::RegionSpace::kRegionSize))) { - // Try to allocate a tlab. - if (!region_space_->AllocNewTlab(self)) { - // Failed to allocate a tlab. Try non-tlab. - ret = region_space_->AllocNonvirtual(alloc_size, bytes_allocated, usable_size, - bytes_tl_bulk_allocated); - return ret; - } - *bytes_tl_bulk_allocated = space::RegionSpace::kRegionSize; - // Fall-through. - } else { - // Check OOME for a non-tlab allocation. 
- if (!IsOutOfMemoryOnAllocation(allocator_type, alloc_size)) { - ret = region_space_->AllocNonvirtual(alloc_size, bytes_allocated, usable_size, - bytes_tl_bulk_allocated); - return ret; - } else { - // Neither tlab or non-tlab works. Give up. - return nullptr; - } - } - } else { - // Large. Check OOME. - if (LIKELY(!IsOutOfMemoryOnAllocation(allocator_type, alloc_size))) { - ret = region_space_->AllocNonvirtual(alloc_size, bytes_allocated, usable_size, - bytes_tl_bulk_allocated); - return ret; - } else { - return nullptr; - } - } - } else { - *bytes_tl_bulk_allocated = 0; // Allocated in an existing buffer. + // kAllocatorTypeTLAB may be the allocator for region space TLAB if the GC is not marking, + // that is why the allocator is not passed down. + return AllocWithNewTLAB(self, + alloc_size, + kGrow, + bytes_allocated, + usable_size, + bytes_tl_bulk_allocated); } // The allocation can't fail. ret = self->AllocTlab(alloc_size); DCHECK(ret != nullptr); *bytes_allocated = alloc_size; + *bytes_tl_bulk_allocated = 0; // Allocated in an existing buffer. *usable_size = alloc_size; break; } @@ -408,15 +383,16 @@ inline bool Heap::ShouldAllocLargeObject(ObjPtr c, size_t byte_co return byte_count >= large_object_threshold_ && (c->IsPrimitiveArray() || c->IsStringClass()); } -template -inline bool Heap::IsOutOfMemoryOnAllocation(AllocatorType allocator_type, size_t alloc_size) { +inline bool Heap::IsOutOfMemoryOnAllocation(AllocatorType allocator_type, + size_t alloc_size, + bool grow) { size_t new_footprint = num_bytes_allocated_.LoadSequentiallyConsistent() + alloc_size; if (UNLIKELY(new_footprint > max_allowed_footprint_)) { if (UNLIKELY(new_footprint > growth_limit_)) { return true; } if (!AllocatorMayHaveConcurrentGC(allocator_type) || !IsGcConcurrent()) { - if (!kGrow) { + if (!grow) { return true; } // TODO: Grow for allocation is racy, fix it. diff --git a/runtime/gc/heap.cc b/runtime/gc/heap.cc index 6a97edd33..5c219cc87 100644 --- a/runtime/gc/heap.cc +++ b/runtime/gc/heap.cc @@ -1819,7 +1819,7 @@ mirror::Object* Heap::AllocateInternalWithGc(Thread* self, break; } // Try to transition the heap if the allocation failure was due to the space being full. - if (!IsOutOfMemoryOnAllocation(allocator, alloc_size)) { + if (!IsOutOfMemoryOnAllocation(allocator, alloc_size, /*grow*/ false)) { // If we aren't out of memory then the OOM was probably from the non moving space being // full. Attempt to disable compaction and turn the main space into a non moving space. DisableMovingGc(); @@ -4219,5 +4219,72 @@ void Heap::RemoveGcPauseListener() { gc_pause_listener_.StoreRelaxed(nullptr); } +mirror::Object* Heap::AllocWithNewTLAB(Thread* self, + size_t alloc_size, + bool grow, + size_t* bytes_allocated, + size_t* usable_size, + size_t* bytes_tl_bulk_allocated) { + const AllocatorType allocator_type = GetCurrentAllocator(); + if (allocator_type == kAllocatorTypeTLAB) { + DCHECK(bump_pointer_space_ != nullptr); + const size_t new_tlab_size = alloc_size + kDefaultTLABSize; + if (UNLIKELY(IsOutOfMemoryOnAllocation(allocator_type, new_tlab_size, grow))) { + return nullptr; + } + // Try allocating a new thread local buffer, if the allocation fails the space must be + // full so return null. + if (!bump_pointer_space_->AllocNewTlab(self, new_tlab_size)) { + return nullptr; + } + *bytes_tl_bulk_allocated = new_tlab_size; + } else { + DCHECK(allocator_type == kAllocatorTypeRegionTLAB); + DCHECK(region_space_ != nullptr); + if (space::RegionSpace::kRegionSize >= alloc_size) { + // Non-large. 
Check OOME for a tlab. + if (LIKELY(!IsOutOfMemoryOnAllocation(allocator_type, + space::RegionSpace::kRegionSize, + grow))) { + // Try to allocate a tlab. + if (!region_space_->AllocNewTlab(self)) { + // Failed to allocate a tlab. Try non-tlab. + return region_space_->AllocNonvirtual(alloc_size, + bytes_allocated, + usable_size, + bytes_tl_bulk_allocated); + } + *bytes_tl_bulk_allocated = space::RegionSpace::kRegionSize; + // Fall-through to using the TLAB below. + } else { + // Check OOME for a non-tlab allocation. + if (!IsOutOfMemoryOnAllocation(allocator_type, alloc_size, grow)) { + return region_space_->AllocNonvirtual(alloc_size, + bytes_allocated, + usable_size, + bytes_tl_bulk_allocated); + } + // Neither tlab or non-tlab works. Give up. + return nullptr; + } + } else { + // Large. Check OOME. + if (LIKELY(!IsOutOfMemoryOnAllocation(allocator_type, alloc_size, grow))) { + return region_space_->AllocNonvirtual(alloc_size, + bytes_allocated, + usable_size, + bytes_tl_bulk_allocated); + } + return nullptr; + } + } + // Refilled TLAB, return. + mirror::Object* ret = self->AllocTlab(alloc_size); + DCHECK(ret != nullptr); + *bytes_allocated = alloc_size; + *usable_size = alloc_size; + return ret; +} + } // namespace gc } // namespace art diff --git a/runtime/gc/heap.h b/runtime/gc/heap.h index 0c671d269..3a8e29b08 100644 --- a/runtime/gc/heap.h +++ b/runtime/gc/heap.h @@ -854,6 +854,10 @@ class Heap { allocator_type != kAllocatorTypeRegionTLAB; } static ALWAYS_INLINE bool AllocatorMayHaveConcurrentGC(AllocatorType allocator_type) { + if (kUseReadBarrier) { + // Read barrier may have the TLAB allocator but is always concurrent. TODO: clean this up. + return true; + } return allocator_type != kAllocatorTypeBumpPointer && allocator_type != kAllocatorTypeTLAB; @@ -923,11 +927,20 @@ class Heap { size_t* bytes_tl_bulk_allocated) REQUIRES_SHARED(Locks::mutator_lock_); + mirror::Object* AllocWithNewTLAB(Thread* self, + size_t alloc_size, + bool grow, + size_t* bytes_allocated, + size_t* usable_size, + size_t* bytes_tl_bulk_allocated) + REQUIRES_SHARED(Locks::mutator_lock_); + void ThrowOutOfMemoryError(Thread* self, size_t byte_count, AllocatorType allocator_type) REQUIRES_SHARED(Locks::mutator_lock_); - template - ALWAYS_INLINE bool IsOutOfMemoryOnAllocation(AllocatorType allocator_type, size_t alloc_size); + ALWAYS_INLINE bool IsOutOfMemoryOnAllocation(AllocatorType allocator_type, + size_t alloc_size, + bool grow); // Run the finalizers. If timeout is non zero, then we use the VMRuntime version. 
void RunFinalization(JNIEnv* env, uint64_t timeout); diff --git a/runtime/instrumentation.cc b/runtime/instrumentation.cc index d4c322eb8..870d1ae9b 100644 --- a/runtime/instrumentation.cc +++ b/runtime/instrumentation.cc @@ -630,7 +630,7 @@ void Instrumentation::ConfigureStubs(const char* key, InstrumentationLevel desir } static void ResetQuickAllocEntryPointsForThread(Thread* thread, void* arg ATTRIBUTE_UNUSED) { - thread->ResetQuickAllocEntryPointsForThread(); + thread->ResetQuickAllocEntryPointsForThread(kUseReadBarrier && thread->GetIsGcMarking()); } void Instrumentation::SetEntrypointsInstrumented(bool instrumented) { diff --git a/runtime/thread.cc b/runtime/thread.cc index 65c86815b..1283cf0e9 100644 --- a/runtime/thread.cc +++ b/runtime/thread.cc @@ -122,21 +122,26 @@ void Thread::SetIsGcMarkingAndUpdateEntrypoints(bool is_marking) { CHECK(kUseReadBarrier); tls32_.is_gc_marking = is_marking; UpdateReadBarrierEntrypoints(&tlsPtr_.quick_entrypoints, is_marking); + ResetQuickAllocEntryPointsForThread(is_marking); } void Thread::InitTlsEntryPoints() { // Insert a placeholder so we can easily tell if we call an unimplemented entry point. uintptr_t* begin = reinterpret_cast(&tlsPtr_.jni_entrypoints); - uintptr_t* end = reinterpret_cast(reinterpret_cast(&tlsPtr_.quick_entrypoints) + - sizeof(tlsPtr_.quick_entrypoints)); + uintptr_t* end = reinterpret_cast( + reinterpret_cast(&tlsPtr_.quick_entrypoints) + sizeof(tlsPtr_.quick_entrypoints)); for (uintptr_t* it = begin; it != end; ++it) { *it = reinterpret_cast(UnimplementedEntryPoint); } InitEntryPoints(&tlsPtr_.jni_entrypoints, &tlsPtr_.quick_entrypoints); } -void Thread::ResetQuickAllocEntryPointsForThread() { - ResetQuickAllocEntryPoints(&tlsPtr_.quick_entrypoints); +void Thread::ResetQuickAllocEntryPointsForThread(bool is_marking) { + if (kUseReadBarrier && kRuntimeISA != kX86_64) { + // Allocation entrypoint switching is currently only implemented for X86_64. + is_marking = true; + } + ResetQuickAllocEntryPoints(&tlsPtr_.quick_entrypoints, is_marking); } class DeoptimizationContextRecord { diff --git a/runtime/thread.h b/runtime/thread.h index 97093a635..35226f223 100644 --- a/runtime/thread.h +++ b/runtime/thread.h @@ -1007,7 +1007,7 @@ class Thread { tls32_.state_and_flags.as_atomic_int.FetchAndAndSequentiallyConsistent(-1 ^ flag); } - void ResetQuickAllocEntryPointsForThread(); + void ResetQuickAllocEntryPointsForThread(bool is_marking); // Returns the remaining space in the TLAB. size_t TlabSize() const; -- 2.11.0
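For context, a minimal self-contained C++ sketch of the entrypoint-switching policy this patch centralizes. The types are simplified: the string fields stand in for the real entrypoint function pointers, and arch_supports_switching stands in for the kRuntimeISA != kX86_64 check in Thread::ResetQuickAllocEntryPointsForThread; the real code lives in quick_alloc_entrypoints.cc and thread.cc and differs in detail.

#include <cstdio>

// Simplified stand-in for art::QuickEntryPoints: one named "entrypoint"
// instead of the real table of function pointers.
struct QuickEntryPoints {
  const char* alloc_object = "uninitialized";
};

static void SetQuickAllocEntryPoints_tlab(QuickEntryPoints* qpoints) {
  qpoints->alloc_object = "art_quick_alloc_object_tlab";         // no read barrier needed
}

static void SetQuickAllocEntryPoints_region_tlab(QuickEntryPoints* qpoints) {
  qpoints->alloc_object = "art_quick_alloc_object_region_tlab";  // read-barrier fast path
}

// Sketch of the kAllocatorTypeRegionTLAB case of ResetQuickAllocEntryPoints:
// install the read-barrier entrypoints only while the concurrent copying GC is marking.
static void ResetQuickAllocEntryPoints(QuickEntryPoints* qpoints, bool is_marking) {
  if (is_marking) {
    SetQuickAllocEntryPoints_region_tlab(qpoints);
  } else {
    SetQuickAllocEntryPoints_tlab(qpoints);
  }
}

// Mirrors Thread::ResetQuickAllocEntryPointsForThread (simplified): architectures
// without entrypoint switching keep the marking entrypoints unconditionally, so
// instrumentation can never install non-CC fast paths there (the abort this patch fixes).
static void ResetQuickAllocEntryPointsForThread(QuickEntryPoints* qpoints,
                                                bool is_marking,
                                                bool arch_supports_switching) {
  if (!arch_supports_switching) {
    is_marking = true;
  }
  ResetQuickAllocEntryPoints(qpoints, is_marking);
}

int main() {
  QuickEntryPoints qpoints;
  // GC not marking, but the architecture cannot switch: marking entrypoints stay installed.
  ResetQuickAllocEntryPointsForThread(&qpoints, /*is_marking=*/false,
                                      /*arch_supports_switching=*/false);
  std::printf("%s\n", qpoints.alloc_object);  // prints art_quick_alloc_object_region_tlab
  return 0;
}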