From 8e1be9a34ac8ce6f115eaf2ab0d99b6a0ce37630 Mon Sep 17 00:00:00 2001 From: Roland Scheidegger Date: Thu, 23 Aug 2018 19:07:05 +0200 Subject: [PATCH] gallivm: don't use saturated unsigned add/sub intrinsics for llvm 8.0 These have been removed. Unfortunately auto-upgrade doesn't work for jit. (Worse, it seems we don't get a compilation error anymore when compiling the shader, rather llvm will just do a call to a null function in the jitted shaders making it difficult to detect when intrinsics vanish.) Luckily the signed ones are still there, I helped convincing llvm removing them is a bad idea for now, since while the unsigned ones have sort of agreed-upon simplest patterns to replace them with, this is not the case for the signed ones, and they require _significantly_ more complex patterns - to the point that the recognition is IMHO probably unlikely to ever work reliably in practice (due to other optimizations interfering). (Even for the relatively trivial unsigned patterns, llvm already added test cases where recognition doesn't work, unsaturated add followed by saturated add may produce atrocious code.) Nevertheless, it seems there's a serious quest to squash all cpu-specific intrinsics going on, so I'd expect patches to nuke them as well to resurface. Adapt the existing fallback code to match the simple patterns llvm uses and hope for the best. I've verified with lp_test_blend that it does produce the expected saturated assembly instructions. Though our cmp/select build helpers don't use boolean masks, but it doesn't seem to interfere with llvm's ability to recognize the pattern. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=106231 Reviewed-by: Jose Fonseca --- src/gallium/auxiliary/gallivm/lp_bld_arit.c | 87 ++++++++++++++++++++--------- 1 file changed, 60 insertions(+), 27 deletions(-) diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c b/src/gallium/auxiliary/gallivm/lp_bld_arit.c index e922474ef61..f348833206b 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c @@ -557,23 +557,27 @@ lp_build_add(struct lp_build_context *bld, if (!type.floating && !type.fixed) { if (type.width * type.length == 128) { if (util_cpu_caps.has_sse2) { - if (type.width == 8) - intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b"; - if (type.width == 16) - intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w"; + if (type.width == 8) + intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : + HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.b" : NULL; + if (type.width == 16) + intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : + HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.paddus.w" : NULL; } else if (util_cpu_caps.has_altivec) { - if (type.width == 8) - intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs"; - if (type.width == 16) - intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs"; + if (type.width == 8) + intrinsic = type.sign ? "llvm.ppc.altivec.vaddsbs" : "llvm.ppc.altivec.vaddubs"; + if (type.width == 16) + intrinsic = type.sign ? "llvm.ppc.altivec.vaddshs" : "llvm.ppc.altivec.vadduhs"; } } if (type.width * type.length == 256) { if (util_cpu_caps.has_avx2) { - if (type.width == 8) - intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : "llvm.x86.avx2.paddus.b"; - if (type.width == 16) - intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : "llvm.x86.avx2.paddus.w"; + if (type.width == 8) + intrinsic = type.sign ? "llvm.x86.avx2.padds.b" : + HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.b" : NULL; + if (type.width == 16) + intrinsic = type.sign ? "llvm.x86.avx2.padds.w" : + HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.paddus.w" : NULL; } } } @@ -592,8 +596,6 @@ lp_build_add(struct lp_build_context *bld, LLVMValueRef a_clamp_max = lp_build_min_simple(bld, a, LLVMBuildSub(builder, max_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED); LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildSub(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED); a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_max, a_clamp_min); - } else { - a = lp_build_min_simple(bld, a, lp_build_comp(bld, b), GALLIVM_NAN_BEHAVIOR_UNDEFINED); } } @@ -612,6 +614,24 @@ lp_build_add(struct lp_build_context *bld, if(bld->type.norm && (bld->type.floating || bld->type.fixed)) res = lp_build_min_simple(bld, res, bld->one, GALLIVM_NAN_BEHAVIOR_UNDEFINED); + if (type.norm && !type.floating && !type.fixed) { + if (!type.sign) { + /* + * newer llvm versions no longer support the intrinsics, but recognize + * the pattern. Since auto-upgrade of intrinsics doesn't work for jit + * code, it is important we match the pattern llvm uses (and pray llvm + * doesn't change it - and hope they decide on the same pattern for + * all backends supporting it...). + * NOTE: cmp/select does sext/trunc of the mask. Does not seem to + * interfere with llvm's ability to recognize the pattern but seems + * a bit brittle. + */ + LLVMValueRef overflowed = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, res); + res = lp_build_select(bld, overflowed, + LLVMConstAllOnes(bld->int_vec_type), res); + } + } + /* XXX clamp to floor of -1 or 0??? */ return res; @@ -858,23 +878,27 @@ lp_build_sub(struct lp_build_context *bld, if (!type.floating && !type.fixed) { if (type.width * type.length == 128) { if (util_cpu_caps.has_sse2) { - if (type.width == 8) - intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b"; - if (type.width == 16) - intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w"; + if (type.width == 8) + intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : + HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.b" : NULL; + if (type.width == 16) + intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : + HAVE_LLVM < 0x0800 ? "llvm.x86.sse2.psubus.w" : NULL; } else if (util_cpu_caps.has_altivec) { - if (type.width == 8) - intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs"; - if (type.width == 16) - intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs"; + if (type.width == 8) + intrinsic = type.sign ? "llvm.ppc.altivec.vsubsbs" : "llvm.ppc.altivec.vsububs"; + if (type.width == 16) + intrinsic = type.sign ? "llvm.ppc.altivec.vsubshs" : "llvm.ppc.altivec.vsubuhs"; } } if (type.width * type.length == 256) { if (util_cpu_caps.has_avx2) { - if (type.width == 8) - intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : "llvm.x86.avx2.psubus.b"; - if (type.width == 16) - intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : "llvm.x86.avx2.psubus.w"; + if (type.width == 8) + intrinsic = type.sign ? "llvm.x86.avx2.psubs.b" : + HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.b" : NULL; + if (type.width == 16) + intrinsic = type.sign ? "llvm.x86.avx2.psubs.w" : + HAVE_LLVM < 0x0800 ? "llvm.x86.avx2.psubus.w" : NULL; } } } @@ -894,7 +918,16 @@ lp_build_sub(struct lp_build_context *bld, LLVMValueRef a_clamp_min = lp_build_max_simple(bld, a, LLVMBuildAdd(builder, min_val, b, ""), GALLIVM_NAN_BEHAVIOR_UNDEFINED); a = lp_build_select(bld, lp_build_cmp(bld, PIPE_FUNC_GREATER, b, bld->zero), a_clamp_min, a_clamp_max); } else { - a = lp_build_max_simple(bld, a, b, GALLIVM_NAN_BEHAVIOR_UNDEFINED); + /* + * This must match llvm pattern for saturated unsigned sub. + * (lp_build_max_simple actually does the job with its current + * definition but do it explicitly here.) + * NOTE: cmp/select does sext/trunc of the mask. Does not seem to + * interfere with llvm's ability to recognize the pattern but seems + * a bit brittle. + */ + LLVMValueRef no_ov = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b); + a = lp_build_select(bld, no_ov, a, b); } } -- 2.11.0