Assembly TLAB allocation fast path for arm.
author    Hiroshi Yamauchi <yamauchi@google.com>
          Fri, 11 Dec 2015 23:46:19 +0000 (15:46 -0800)
committer Hiroshi Yamauchi <yamauchi@google.com>
          Thu, 25 Feb 2016 22:00:48 +0000 (14:00 -0800)
Speedup (GSS GC with TLAB on N5):
    BinaryTrees:  1872 ->  796 ms (-57%)
    MemAllocTest: 2522 -> 2219 ms (-12%)

Bug: 9986565
Change-Id: Icb9d1259461f3abe83a4a82c8aff911937eaf57d

runtime/arch/arm/quick_entrypoints_arm.S
runtime/asm_support.h
runtime/entrypoints_order_test.cc
runtime/thread.h

diff --git a/runtime/arch/arm/quick_entrypoints_arm.S b/runtime/arch/arm/quick_entrypoints_arm.S
index c4e314b..cfcef49 100644
@@ -942,7 +942,86 @@ ONE_ARG_DOWNCALL art_quick_resolve_string, artResolveStringFromCode, RETURN_IF_R
 
 // Generate the allocation entrypoints for each allocator.
 GENERATE_ALLOC_ENTRYPOINTS_FOR_EACH_ALLOCATOR
-GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB)
+
+// A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_tlab, TLAB).
+ENTRY art_quick_alloc_object_tlab
+    // Fast path tlab allocation.
+    // r0: type_idx/return value, r1: ArtMethod*, r9: Thread::Current
+    // r2, r3, r12: free.
+#if defined(USE_READ_BARRIER)
+    mvn    r0, #0                                             // Read barrier not supported here.
+    bx     lr                                                 // Return -1.
+#endif
+    ldr    r2, [r1, #ART_METHOD_DEX_CACHE_TYPES_OFFSET_32]    // Load dex cache resolved types array
+                                                              // Load the class (r2)
+    ldr    r2, [r2, r0, lsl #COMPRESSED_REFERENCE_SIZE_SHIFT]
+    cbz    r2, .Lart_quick_alloc_object_tlab_slow_path        // Check null class
+                                                              // Check class status.
+    ldr    r3, [r2, #MIRROR_CLASS_STATUS_OFFSET]
+    cmp    r3, #MIRROR_CLASS_STATUS_INITIALIZED
+    bne    .Lart_quick_alloc_object_tlab_slow_path
+                                                              // Add a fake dependence from the
+                                                              // following access flag and size
+                                                              // loads to the status load.
+                                                              // This is to prevent those loads
+                                                              // from being reordered above the
+                                                              // status load and reading wrong
+                                                              // values (an alternative is to use
+                                                              // a load-acquire for the status).
+    eor    r3, r3, r3
+    add    r2, r2, r3
+                                                              // Check that the access flags do not
+                                                              // have kAccClassIsFinalizable set.
+    ldr    r3, [r2, #MIRROR_CLASS_ACCESS_FLAGS_OFFSET]
+    tst    r3, #ACCESS_FLAGS_CLASS_IS_FINALIZABLE
+    bne    .Lart_quick_alloc_object_tlab_slow_path
+                                                              // Load thread_local_pos (r12) and
+                                                              // thread_local_end (r3) with ldrd.
+                                                              // Check constraints for ldrd.
+#if !((THREAD_LOCAL_POS_OFFSET + 4 == THREAD_LOCAL_END_OFFSET) && (THREAD_LOCAL_POS_OFFSET % 8 == 0))
+#error "Thread::thread_local_pos/end must be consecutive and 8 byte aligned for performance"
+#endif
+    ldrd   r12, r3, [r9, #THREAD_LOCAL_POS_OFFSET]
+    sub    r12, r3, r12                                       // Compute the remaining buf size.
+    ldr    r3, [r2, #MIRROR_CLASS_OBJECT_SIZE_OFFSET]         // Load the object size (r3).
+    cmp    r3, r12                                            // Check if it fits. OK to do this
+                                                              // before rounding up the object size
+                                                              // assuming the buf size alignment.
+    bhi    .Lart_quick_alloc_object_tlab_slow_path
+    // "Point of no slow path". Won't go to the slow path from here on. OK to clobber r0 and r1.
+                                                              // Round up the object size by the
+                                                              // object alignment. (addr + 7) & ~7.
+    add    r3, r3, #OBJECT_ALIGNMENT_MASK
+    and    r3, r3, #OBJECT_ALIGNMENT_MASK_TOGGLED
+                                                              // Reload old thread_local_pos (r0)
+                                                              // for the return value.
+    ldr    r0, [r9, #THREAD_LOCAL_POS_OFFSET]
+    add    r1, r0, r3
+    str    r1, [r9, #THREAD_LOCAL_POS_OFFSET]                 // Store new thread_local_pos.
+    ldr    r1, [r9, #THREAD_LOCAL_OBJECTS_OFFSET]             // Increment thread_local_objects.
+    add    r1, r1, #1
+    str    r1, [r9, #THREAD_LOCAL_OBJECTS_OFFSET]
+    POISON_HEAP_REF r2
+    str    r2, [r0, #MIRROR_OBJECT_CLASS_OFFSET]              // Store the class pointer.
+                                                              // Fence. This is "ish" rather than
+                                                              // "ishst" so that the code after this
+                                                              // allocation site will see the right
+                                                              // values in the fields of the class.
+                                                              // (Alternatively, we could use "ishst"
+                                                              // if we used a load-acquire for the
+                                                              // class status load.)
+    dmb    ish
+    bx     lr
+.Lart_quick_alloc_object_tlab_slow_path:
+    SETUP_REFS_ONLY_CALLEE_SAVE_FRAME  r2, r3                 // Save callee saves in case of GC.
+    mov    r2, r9                                             // Pass Thread::Current.
+    bl     artAllocObjectFromCodeTLAB    // (uint32_t type_idx, Method* method, Thread*)
+    RESTORE_REFS_ONLY_CALLEE_SAVE_FRAME
+    RETURN_IF_RESULT_IS_NON_ZERO_OR_DELIVER
+END art_quick_alloc_object_tlab
+
+
 // A hand-written override for GENERATE_ALLOC_ENTRYPOINTS_ALLOC_OBJECT(_rosalloc, RosAlloc).
 ENTRY art_quick_alloc_object_rosalloc
     // Fast path rosalloc allocation.
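
For orientation, the assembly fast path above corresponds roughly to this C++
sketch (TryAllocObjectTLABFast and the accessors are illustrative stand-ins,
not ART's actual API; the real slow-path call is artAllocObjectFromCodeTLAB):

    // Hedged sketch of the TLAB fast path hand-written in assembly above.
    inline mirror::Object* TryAllocObjectTLABFast(uint32_t type_idx,
                                                  ArtMethod* method,
                                                  Thread* self) {
      // Load the class from the dex cache resolved-types array (may be null).
      mirror::Class* klass = method->GetDexCacheResolvedType(type_idx);  // hypothetical accessor
      if (klass == nullptr || klass->GetStatus() != mirror::Class::kStatusInitialized) {
        return nullptr;  // slow path: resolve and/or initialize the class first
      }
      if ((klass->GetAccessFlags() & kAccClassIsFinalizable) != 0) {
        return nullptr;  // slow path: finalizable objects need extra registration
      }
      size_t size = klass->GetObjectSize();
      size_t remaining = static_cast<size_t>(self->thread_local_end - self->thread_local_pos);
      // The assembly compares the unrounded size against the remaining space;
      // that is safe because the TLAB end is object-aligned, so the rounded
      // size fits whenever the unrounded size does.
      if (size > remaining) {
        return nullptr;  // slow path: refill the TLAB (possibly GC)
      }
      uint8_t* old_pos = self->thread_local_pos;
      self->thread_local_pos = old_pos + RoundUp(size, kObjectAlignment);  // (size + 7) & ~7
      self->thread_local_objects += 1;
      mirror::Object* obj = reinterpret_cast<mirror::Object*>(old_pos);
      obj->SetClass(klass);  // the assembly follows this with a "dmb ish" fence
      return obj;
    }
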
diff --git a/runtime/asm_support.h b/runtime/asm_support.h
index eb3b7f3..879364e 100644
@@ -121,32 +121,32 @@ ADD_TEST_EQ(THREAD_TOP_QUICK_FRAME_OFFSET,
 ADD_TEST_EQ(THREAD_SELF_OFFSET,
             art::Thread::SelfOffset<__SIZEOF_POINTER__>().Int32Value())
 
+// Offset of field Thread::tlsPtr_.thread_local_objects.
+#define THREAD_LOCAL_OBJECTS_OFFSET (THREAD_CARD_TABLE_OFFSET + 168 * __SIZEOF_POINTER__)
+ADD_TEST_EQ(THREAD_LOCAL_OBJECTS_OFFSET,
+            art::Thread::ThreadLocalObjectsOffset<__SIZEOF_POINTER__>().Int32Value())
 // Offset of field Thread::tlsPtr_.thread_local_pos.
-#define THREAD_LOCAL_POS_OFFSET (THREAD_CARD_TABLE_OFFSET + 169 * __SIZEOF_POINTER__)
+#define THREAD_LOCAL_POS_OFFSET (THREAD_LOCAL_OBJECTS_OFFSET + 2 * __SIZEOF_POINTER__)
 ADD_TEST_EQ(THREAD_LOCAL_POS_OFFSET,
             art::Thread::ThreadLocalPosOffset<__SIZEOF_POINTER__>().Int32Value())
 // Offset of field Thread::tlsPtr_.thread_local_end.
 #define THREAD_LOCAL_END_OFFSET (THREAD_LOCAL_POS_OFFSET + __SIZEOF_POINTER__)
 ADD_TEST_EQ(THREAD_LOCAL_END_OFFSET,
             art::Thread::ThreadLocalEndOffset<__SIZEOF_POINTER__>().Int32Value())
-// Offset of field Thread::tlsPtr_.thread_local_objects.
-#define THREAD_LOCAL_OBJECTS_OFFSET (THREAD_LOCAL_POS_OFFSET + 2 * __SIZEOF_POINTER__)
-ADD_TEST_EQ(THREAD_LOCAL_OBJECTS_OFFSET,
-            art::Thread::ThreadLocalObjectsOffset<__SIZEOF_POINTER__>().Int32Value())
 // Offset of field Thread::tlsPtr_.mterp_current_ibase.
-#define THREAD_CURRENT_IBASE_OFFSET (THREAD_LOCAL_POS_OFFSET + 3 * __SIZEOF_POINTER__)
+#define THREAD_CURRENT_IBASE_OFFSET (THREAD_LOCAL_POS_OFFSET + 2 * __SIZEOF_POINTER__)
 ADD_TEST_EQ(THREAD_CURRENT_IBASE_OFFSET,
             art::Thread::MterpCurrentIBaseOffset<__SIZEOF_POINTER__>().Int32Value())
 // Offset of field Thread::tlsPtr_.mterp_default_ibase.
-#define THREAD_DEFAULT_IBASE_OFFSET (THREAD_LOCAL_POS_OFFSET + 4 * __SIZEOF_POINTER__)
+#define THREAD_DEFAULT_IBASE_OFFSET (THREAD_LOCAL_POS_OFFSET + 3 * __SIZEOF_POINTER__)
 ADD_TEST_EQ(THREAD_DEFAULT_IBASE_OFFSET,
             art::Thread::MterpDefaultIBaseOffset<__SIZEOF_POINTER__>().Int32Value())
 // Offset of field Thread::tlsPtr_.mterp_alt_ibase.
-#define THREAD_ALT_IBASE_OFFSET (THREAD_LOCAL_POS_OFFSET + 5 * __SIZEOF_POINTER__)
+#define THREAD_ALT_IBASE_OFFSET (THREAD_LOCAL_POS_OFFSET + 4 * __SIZEOF_POINTER__)
 ADD_TEST_EQ(THREAD_ALT_IBASE_OFFSET,
             art::Thread::MterpAltIBaseOffset<__SIZEOF_POINTER__>().Int32Value())
 // Offset of field Thread::tlsPtr_.rosalloc_runs.
-#define THREAD_ROSALLOC_RUNS_OFFSET (THREAD_LOCAL_POS_OFFSET + 6 * __SIZEOF_POINTER__)
+#define THREAD_ROSALLOC_RUNS_OFFSET (THREAD_LOCAL_POS_OFFSET + 5 * __SIZEOF_POINTER__)
 ADD_TEST_EQ(THREAD_ROSALLOC_RUNS_OFFSET,
             art::Thread::RosAllocRunsOffset<__SIZEOF_POINTER__>().Int32Value())
 // Offset of field Thread::tlsPtr_.thread_local_alloc_stack_top.
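
The reshuffle above keeps thread_local_pos and thread_local_end adjacent and
puts thread_local_pos on an 8-byte boundary, which is what the ldrd in the arm
fast path wants. Expressed as compile-time checks, this is a sketch of the
same constraint the #if/#error guard in quick_entrypoints_arm.S enforces (not
code from this change):

    // Sketch: layout constraints the arm ldrd fast path depends on.
    static_assert(THREAD_LOCAL_END_OFFSET == THREAD_LOCAL_POS_OFFSET + __SIZEOF_POINTER__,
                  "thread_local_pos/end must be consecutive for ldrd");
    static_assert(THREAD_LOCAL_POS_OFFSET % 8 == 0,
                  "thread_local_pos should be 8-byte aligned for a fast ldrd");
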
diff --git a/runtime/entrypoints_order_test.cc b/runtime/entrypoints_order_test.cc
index e72809b..c621672 100644
@@ -119,10 +119,10 @@ class EntrypointsOrderTest : public CommonRuntimeTest {
 
     // Skip across the entrypoints structures.
 
+    EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_objects, thread_local_start, sizeof(void*));
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_start, thread_local_pos, sizeof(void*));
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_pos, thread_local_end, sizeof(void*));
-    EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_end, thread_local_objects, sizeof(void*));
-    EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_objects, mterp_current_ibase, sizeof(void*));
+    EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, thread_local_end, mterp_current_ibase, sizeof(void*));
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, mterp_current_ibase, mterp_default_ibase, sizeof(void*));
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, mterp_default_ibase, mterp_alt_ibase, sizeof(void*));
     EXPECT_OFFSET_DIFFP(Thread, tlsPtr_, mterp_alt_ibase, rosalloc_runs, sizeof(void*));
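
Read as offset arithmetic, each EXPECT_OFFSET_DIFFP above asserts that the
second field starts sizeof(void*) bytes after the first inside Thread::tlsPtr_.
A self-contained illustration of the new order (MiniTls is a stand-in struct,
not ART code; it assumes sizeof(size_t) == sizeof(void*)):

    #include <cstddef>
    #include <cstdint>
    struct MiniTls {  // mirrors the new field order being tested
      size_t   thread_local_objects;
      uint8_t* thread_local_start;
      uint8_t* thread_local_pos;
      uint8_t* thread_local_end;
      void*    mterp_current_ibase;
    };
    static_assert(offsetof(MiniTls, thread_local_start) -
                  offsetof(MiniTls, thread_local_objects) == sizeof(void*), "order");
    static_assert(offsetof(MiniTls, mterp_current_ibase) -
                  offsetof(MiniTls, thread_local_end) == sizeof(void*), "order");
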
diff --git a/runtime/thread.h b/runtime/thread.h
index 97c47e1..234750c 100644
@@ -1324,8 +1324,8 @@ class Thread {
       instrumentation_stack(nullptr), debug_invoke_req(nullptr), single_step_control(nullptr),
       stacked_shadow_frame_record(nullptr), deoptimization_context_stack(nullptr),
       frame_id_to_shadow_frame(nullptr), name(nullptr), pthread_self(0),
-      last_no_thread_suspension_cause(nullptr), thread_local_start(nullptr),
-      thread_local_pos(nullptr), thread_local_end(nullptr), thread_local_objects(0),
+      last_no_thread_suspension_cause(nullptr), thread_local_objects(0),
+      thread_local_start(nullptr), thread_local_pos(nullptr), thread_local_end(nullptr),
       mterp_current_ibase(nullptr), mterp_default_ibase(nullptr), mterp_alt_ibase(nullptr),
       thread_local_alloc_stack_top(nullptr), thread_local_alloc_stack_end(nullptr),
       nested_signal_state(nullptr), flip_function(nullptr), method_verifier(nullptr),
@@ -1440,10 +1440,12 @@ class Thread {
     QuickEntryPoints quick_entrypoints;
 
     // Thread-local allocation pointer.
+    size_t thread_local_objects;
     uint8_t* thread_local_start;
+    // thread_local_pos and thread_local_end must be consecutive for ldrd, and are 8 byte aligned
+    // for potentially better performance.
     uint8_t* thread_local_pos;
     uint8_t* thread_local_end;
-    size_t thread_local_objects;
 
     // Mterp jump table bases.
     void* mterp_current_ibase;
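
On 32-bit arm the reordering is what makes the alignment work out: moving the
size_t counter ahead of the three pointers shifts thread_local_pos onto an
8-byte boundary. A rough sketch of the arithmetic, assuming the block itself
starts 8-byte aligned and 4-byte pointers (the real offsets are pinned down by
the ADD_TEST_EQ checks in asm_support.h):

    // New order: pos lands two 4-byte words in, i.e. at an 8-byte offset.
    //   +0  thread_local_objects   (size_t)
    //   +4  thread_local_start
    //   +8  thread_local_pos       <- 8-byte aligned, ldrd-friendly
    //   +12 thread_local_end       <- consecutive with pos
    // The old order (start, pos, end, objects) left pos at +4, which is
    // only 4-byte aligned and would trip the #if/#error guard in the assembly.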