From 963daed78044724547ff1c7c35b34a4d50a0b867 Mon Sep 17 00:00:00 2001 From: Hans Boehm Date: Wed, 22 Feb 2017 15:34:29 -0800 Subject: [PATCH] Replace cxa_guard fences with acquire loads This seemed to be the only place in bionic where a fence on a performance-critical path could be easily replaced by a stronger load/store order constraint. Do so. On x86 this should generate the same code either way. Based on microbenchmarks of the relevant ARM instructions, this is currently performance-neutral in this kind of context. But in the future, the newly generated acquire loads should give us a performance benefit. Test: Booted AOSP Change-Id: I7823e11d6ae4fd58e0425244c293262e2320fd81 --- libc/bionic/__cxa_guard.cpp | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/libc/bionic/__cxa_guard.cpp b/libc/bionic/__cxa_guard.cpp index 97284d5b0..06926dff3 100644 --- a/libc/bionic/__cxa_guard.cpp +++ b/libc/bionic/__cxa_guard.cpp @@ -79,38 +79,33 @@ union _guard_t { #define CONSTRUCTION_UNDERWAY_WITH_WAITER 0x200 extern "C" int __cxa_guard_acquire(_guard_t* gv) { - int old_value = atomic_load_explicit(&gv->state, memory_order_relaxed); + int old_value = atomic_load_explicit(&gv->state, memory_order_acquire); + // In the common CONSTRUCTION_COMPLETE case we have to ensure that all the stores performed by + // the construction function are observable on this CPU after we exit. A similar constraint may + // apply in the CONSTRUCTION_NOT_YET_STARTED case with a prior abort. while (true) { if (old_value == CONSTRUCTION_COMPLETE) { - // A load_acquire operation is need before exiting with COMPLETE state, as we have to ensure - // that all the stores performed by the construction function are observable on this CPU - // after we exit. - atomic_thread_fence(memory_order_acquire); return 0; } else if (old_value == CONSTRUCTION_NOT_YET_STARTED) { if (!atomic_compare_exchange_weak_explicit(&gv->state, &old_value, CONSTRUCTION_UNDERWAY_WITHOUT_WAITER, - memory_order_relaxed, - memory_order_relaxed)) { + memory_order_acquire /* or relaxed in C++17 */, + memory_order_acquire)) { continue; } - // The acquire fence may not be needed. But as described in section 3.3.2 of - // the Itanium C++ ABI specification, it probably has to behave like the - // acquisition of a mutex, which needs an acquire fence. - atomic_thread_fence(memory_order_acquire); return 1; } else if (old_value == CONSTRUCTION_UNDERWAY_WITHOUT_WAITER) { if (!atomic_compare_exchange_weak_explicit(&gv->state, &old_value, CONSTRUCTION_UNDERWAY_WITH_WAITER, - memory_order_relaxed, - memory_order_relaxed)) { + memory_order_acquire /* or relaxed in C++17 */, + memory_order_acquire)) { continue; } } __futex_wait_ex(&gv->state, false, CONSTRUCTION_UNDERWAY_WITH_WAITER, false, nullptr); - old_value = atomic_load_explicit(&gv->state, memory_order_relaxed); + old_value = atomic_load_explicit(&gv->state, memory_order_acquire); } } -- 2.11.0