From 971d12b7f9d7be3ca8eb98e6c04ed521f83cbd3c Mon Sep 17 00:00:00 2001 From: Ganesh Ajjanagadde Date: Sat, 10 Oct 2015 21:58:47 -0400 Subject: [PATCH] avutil/mathematics: speed up av_gcd by using Stein's binary GCD algorithm This uses Stein's binary GCD algorithm: https://en.wikipedia.org/wiki/Binary_GCD_algorithm to get a roughly 4x speedup over Euclidean GCD on standard architectures with a compiler intrinsic for ctzll, and a roughly 2x speedup otherwise. At the moment, the compiler intrinsic is used on GCC and Clang due to its easy availability. Quick note regarding overflow: yes, subtractions on int64_t can overflow, but the llabs takes care of that. The llabs is also guaranteed to be safe, with no annoying INT64_MIN business since INT64_MIN, being a power of 2, is shifted down before being sent to llabs. The binary GCD needs ff_ctzll, an extension of ff_ctz for long long (int64_t). On GCC, this is provided by a built-in. On Microsoft, there is a BitScanForward64 analog of BitScanForward that should work; but I can't confirm. Apparently it is not available on 32 bit builds; so this may or may not work correctly. On Intel, per the documentation there is only an intrinsic for _bit_scan_forward and people have posted on forums regarding _bit_scan_forward64, but often their documentation is woeful. Again, I don't have it, so I can't test. As such, to be safe, for now only the GCC/Clang intrinsic is added, the rest use a compiled version based on the De-Bruijn method of Leiserson et al: http://supertech.csail.mit.edu/papers/debruijn.pdf. Tested with FATE, sample benchmark (x86-64, GCC 5.2.0, Haswell) with a START_TIMER and STOP_TIMER in libavutil/rational.c, followed by a make fate. 
aac-am00_88.err: builtin: 714 decicycles in av_gcd, 4095 runs, 1 skips de-bruijn: 1440 decicycles in av_gcd, 4096 runs, 0 skips previous: 2889 decicycles in av_gcd, 4096 runs, 0 skips Signed-off-by: Ganesh Ajjanagadde Signed-off-by: Michael Niedermayer --- libavutil/intmath.h | 19 +++++++++++++++++++ libavutil/mathematics.c | 26 +++++++++++++++++++++----- 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/libavutil/intmath.h b/libavutil/intmath.h index 08d54a64ad..b412385e53 100644 --- a/libavutil/intmath.h +++ b/libavutil/intmath.h @@ -114,6 +114,9 @@ static av_always_inline av_const int ff_log2_16bit_c(unsigned int v) #ifndef ff_ctz #define ff_ctz(v) __builtin_ctz(v) #endif +#ifndef ff_ctzll +#define ff_ctzll(v) __builtin_ctzll(v) +#endif #endif #endif @@ -158,6 +161,22 @@ static av_always_inline av_const int ff_ctz_c( int v ) #endif #endif +#ifndef ff_ctzll +#define ff_ctzll ff_ctzll_c +/* We use the De-Bruijn method outlined in: + * http://supertech.csail.mit.edu/papers/debruijn.pdf. */ +static av_always_inline av_const int ff_ctzll_c(long long v) +{ + static const int debruijn_ctz64[64] = { + 0, 1, 2, 53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28, + 62, 5, 39, 46, 44, 42, 22, 9, 24, 35, 59, 56, 49, 18, 29, 11, + 63, 52, 6, 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10, + 51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12 + }; + return debruijn_ctz64[(uint64_t)((v & -v) * 0x022FDD63CC95386D) >> 58]; +} +#endif + /** * Trailing zero bit count. 
* diff --git a/libavutil/mathematics.c b/libavutil/mathematics.c index 252794e460..16e4eba5b9 100644 --- a/libavutil/mathematics.c +++ b/libavutil/mathematics.c @@ -27,16 +27,32 @@ #include #include "mathematics.h" +#include "libavutil/intmath.h" #include "libavutil/common.h" #include "avassert.h" #include "version.h" -int64_t av_gcd(int64_t a, int64_t b) -{ - if (b) - return av_gcd(b, a % b); - else +/* Stein's binary GCD algorithm: + * https://en.wikipedia.org/wiki/Binary_GCD_algorithm */ +int64_t av_gcd(int64_t a, int64_t b) { + int za, zb, k; + int64_t u, v; + if (a == 0) + return b; + if (b == 0) return a; + za = ff_ctzll(a); + zb = ff_ctzll(b); + k = FFMIN(za, zb); + u = llabs(a >> za); + v = llabs(b >> zb); + while (u != v) { + if (u > v) + FFSWAP(int64_t, v, u); + v -= u; + v >>= ff_ctzll(v); + } + return u << k; } int64_t av_rescale_rnd(int64_t a, int64_t b, int64_t c, enum AVRounding rnd) -- 2.11.0