1 /**************************************************************************
3 * Copyright 2009 VMware, Inc.
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sub license, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial portions
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26 **************************************************************************/
33 * LLVM IR doesn't support all basic arithmetic operations we care about (most
34 * notably min/max and saturated operations), and it is often necessary to
35 * resort to machine-specific intrinsics directly. The functions here hide all
36 * these implementation details from the other modules.
38 * We also do simple expression simplification here. Reasons are:
39 * - it is very easy given we have all necessary information readily available
40 * - LLVM optimization passes fail to simplify several vector expressions
41 * - We often know value constraints which the optimization passes have no way
42 * of knowing, such as when source arguments are known to be in [0, 1] range.
44 * @author Jose Fonseca <jfonseca@vmware.com>
48 #include "util/u_memory.h"
49 #include "util/u_debug.h"
50 #include "util/u_string.h"
51 #include "util/u_cpu_detect.h"
53 #include "lp_bld_type.h"
54 #include "lp_bld_const.h"
55 #include "lp_bld_intr.h"
56 #include "lp_bld_logic.h"
57 #include "lp_bld_debug.h"
58 #include "lp_bld_arit.h"
63 * No checks for special case values of a or b = 1 or 0 are done.
66 lp_build_min_simple(struct lp_build_context *bld,
70 const struct lp_type type = bld->type;
71 const char *intrinsic = NULL;
74 /* TODO: optimize the constant case */
76 if(type.width * type.length == 128) {
78 if(type.width == 32 && util_cpu_caps.has_sse)
79 intrinsic = "llvm.x86.sse.min.ps";
80 if(type.width == 64 && util_cpu_caps.has_sse2)
81 intrinsic = "llvm.x86.sse2.min.pd";
84 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
85 intrinsic = "llvm.x86.sse2.pminu.b";
86 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
87 intrinsic = "llvm.x86.sse41.pminsb";
88 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
89 intrinsic = "llvm.x86.sse41.pminuw";
90 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
91 intrinsic = "llvm.x86.sse2.pmins.w";
92 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
93 intrinsic = "llvm.x86.sse41.pminud";
94 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
95 intrinsic = "llvm.x86.sse41.pminsd";
100 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
102 cond = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
103 return lp_build_select(bld, cond, a, b);
109 * No checks for special case values of a or b = 1 or 0 are done.
112 lp_build_max_simple(struct lp_build_context *bld,
116 const struct lp_type type = bld->type;
117 const char *intrinsic = NULL;
120 /* TODO: optimize the constant case */
122 if(type.width * type.length == 128) {
124 if(type.width == 32 && util_cpu_caps.has_sse)
125 intrinsic = "llvm.x86.sse.max.ps";
126 if(type.width == 64 && util_cpu_caps.has_sse2)
127 intrinsic = "llvm.x86.sse2.max.pd";
130 if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
131 intrinsic = "llvm.x86.sse2.pmaxu.b";
132 if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
133 intrinsic = "llvm.x86.sse41.pmaxsb";
134 if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
135 intrinsic = "llvm.x86.sse41.pmaxuw";
136 if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
137 intrinsic = "llvm.x86.sse2.pmaxs.w";
138 if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
139 intrinsic = "llvm.x86.sse41.pmaxud";
140 if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
141 intrinsic = "llvm.x86.sse41.pmaxsd";
146 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
148 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, b);
149 return lp_build_select(bld, cond, a, b);
154 * Generate 1 - a, or ~a depending on bld->type.
157 lp_build_comp(struct lp_build_context *bld,
160 const struct lp_type type = bld->type;
167 if(type.norm && !type.floating && !type.fixed && !type.sign) {
168 if(LLVMIsConstant(a))
169 return LLVMConstNot(a);
171 return LLVMBuildNot(bld->builder, a, "");
174 if(LLVMIsConstant(a))
175 return LLVMConstSub(bld->one, a);
177 return LLVMBuildSub(bld->builder, bld->one, a, "");
185 lp_build_add(struct lp_build_context *bld,
189 const struct lp_type type = bld->type;
196 if(a == bld->undef || b == bld->undef)
200 const char *intrinsic = NULL;
202 if(a == bld->one || b == bld->one)
205 if(util_cpu_caps.has_sse2 &&
206 type.width * type.length == 128 &&
207 !type.floating && !type.fixed) {
209 intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
211 intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
215 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
218 if(LLVMIsConstant(a) && LLVMIsConstant(b))
219 res = LLVMConstAdd(a, b);
221 res = LLVMBuildAdd(bld->builder, a, b, "");
223 /* clamp to ceiling of 1.0 */
224 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
225 res = lp_build_min_simple(bld, res, bld->one);
227 /* XXX clamp to floor of -1 or 0??? */
237 lp_build_sub(struct lp_build_context *bld,
241 const struct lp_type type = bld->type;
246 if(a == bld->undef || b == bld->undef)
252 const char *intrinsic = NULL;
257 if(util_cpu_caps.has_sse2 &&
258 type.width * type.length == 128 &&
259 !type.floating && !type.fixed) {
261 intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
263 intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
267 return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
270 if(LLVMIsConstant(a) && LLVMIsConstant(b))
271 res = LLVMConstSub(a, b);
273 res = LLVMBuildSub(bld->builder, a, b, "");
275 if(bld->type.norm && (bld->type.floating || bld->type.fixed))
276 res = lp_build_max_simple(bld, res, bld->zero);
283 * Build shuffle vectors that match PUNPCKLxx and PUNPCKHxx instructions.
286 lp_build_unpack_shuffle(unsigned n, unsigned lo_hi)
288 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
291 assert(n <= LP_MAX_VECTOR_LENGTH);
294 for(i = 0, j = lo_hi*n/2; i < n; i += 2, ++j) {
295 elems[i + 0] = LLVMConstInt(LLVMInt32Type(), 0 + j, 0);
296 elems[i + 1] = LLVMConstInt(LLVMInt32Type(), n + j, 0);
299 return LLVMConstVector(elems, n);
304 * Build constant int vector of length 'n' with every element set to 'c'.
307 lp_build_const_vec(LLVMTypeRef type, unsigned n, long long c)
309 LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
312 assert(n <= LP_MAX_VECTOR_LENGTH);
314 for(i = 0; i < n; ++i)
315 elems[i] = LLVMConstInt(type, c, 0);
317 return LLVMConstVector(elems, n);
322 * Normalized 8bit multiplication.
326 * makes the following approximation to the division (Sree)
328 * a*b/255 ~= (a*(b + 1)) >> 8
330 * which is the fastest method that satisfies the following OpenGL criteria
332 * 0*0 = 0 and 255*255 = 255
336 * takes the geometric series approximation to the division
338 * t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
340 * in this case just the first two terms to fit in 16bit arithmetic
342 * t/255 ~= (t + (t >> 8)) >> 8
344 * note that just by itself it doesn't satisfy the OpenGL criteria, as
345 * 255*255 = 254, so the special case b = 255 must be accounted for or roundoff
348 * - geometric series plus rounding
350 * when using a geometric series division instead of truncating the result
351 * use roundoff in the approximation (Jim Blinn)
353 * t/255 ~= (t + (t >> 8) + 0x80) >> 8
355 * achieving the exact results
357 * @sa Alvy Ray Smith, Image Compositing Fundamentals, Tech Memo 4, Aug 15, 1995,
358 * ftp://ftp.alvyray.com/Acrobat/4_Comp.pdf
359 * @sa Michael Herf, The "double blend trick", May 2000,
360 * http://www.stereopsis.com/doubleblend.html
363 lp_build_mul_u8n(LLVMBuilderRef builder,
364 LLVMValueRef a, LLVMValueRef b)
366 static LLVMValueRef c01 = NULL;
367 static LLVMValueRef c08 = NULL;
368 static LLVMValueRef c80 = NULL;
371 if(!c01) c01 = lp_build_const_vec(LLVMInt16Type(), 8, 0x01);
372 if(!c08) c08 = lp_build_const_vec(LLVMInt16Type(), 8, 0x08);
373 if(!c80) c80 = lp_build_const_vec(LLVMInt16Type(), 8, 0x80);
377 /* a*b/255 ~= (a*(b + 1)) >> 8 */
378 b = LLVMBuildAdd(builder, b, c01, "");
379 ab = LLVMBuildMul(builder, a, b, "");
383 /* t/255 ~= (t + (t >> 8) + 0x80) >> 8 */
384 ab = LLVMBuildMul(builder, a, b, "");
385 ab = LLVMBuildAdd(builder, ab, LLVMBuildLShr(builder, ab, c08, ""), "");
386 ab = LLVMBuildAdd(builder, ab, c80, "");
390 ab = LLVMBuildLShr(builder, ab, c08, "");
400 lp_build_mul(struct lp_build_context *bld,
404 const struct lp_type type = bld->type;
414 if(a == bld->undef || b == bld->undef)
417 if(!type.floating && !type.fixed && type.norm) {
418 if(util_cpu_caps.has_sse2 && type.width == 8 && type.length == 16) {
419 LLVMTypeRef i16x8 = LLVMVectorType(LLVMInt16Type(), 8);
420 LLVMTypeRef i8x16 = LLVMVectorType(LLVMInt8Type(), 16);
421 static LLVMValueRef ml = NULL;
422 static LLVMValueRef mh = NULL;
423 LLVMValueRef al, ah, bl, bh;
424 LLVMValueRef abl, abh;
427 if(!ml) ml = lp_build_unpack_shuffle(16, 0);
428 if(!mh) mh = lp_build_unpack_shuffle(16, 1);
430 /* PUNPCKLBW, PUNPCKHBW */
431 al = LLVMBuildShuffleVector(bld->builder, a, bld->zero, ml, "");
432 bl = LLVMBuildShuffleVector(bld->builder, b, bld->zero, ml, "");
433 ah = LLVMBuildShuffleVector(bld->builder, a, bld->zero, mh, "");
434 bh = LLVMBuildShuffleVector(bld->builder, b, bld->zero, mh, "");
437 al = LLVMBuildBitCast(bld->builder, al, i16x8, "");
438 bl = LLVMBuildBitCast(bld->builder, bl, i16x8, "");
439 ah = LLVMBuildBitCast(bld->builder, ah, i16x8, "");
440 bh = LLVMBuildBitCast(bld->builder, bh, i16x8, "");
442 /* PMULLW, PSRLW, PADDW */
443 abl = lp_build_mul_u8n(bld->builder, al, bl);
444 abh = lp_build_mul_u8n(bld->builder, ah, bh);
447 ab = lp_build_intrinsic_binary(bld->builder, "llvm.x86.sse2.packuswb.128" , i16x8, abl, abh);
450 ab = LLVMBuildBitCast(bld->builder, ab, i8x16, "");
459 if(LLVMIsConstant(a) && LLVMIsConstant(b))
460 return LLVMConstMul(a, b);
462 return LLVMBuildMul(bld->builder, a, b, "");
470 lp_build_div(struct lp_build_context *bld,
474 const struct lp_type type = bld->type;
479 return lp_build_rcp(bld, b);
484 if(a == bld->undef || b == bld->undef)
487 if(LLVMIsConstant(a) && LLVMIsConstant(b))
488 return LLVMConstFDiv(a, b);
490 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
491 return lp_build_mul(bld, a, lp_build_rcp(bld, b));
493 return LLVMBuildFDiv(bld->builder, a, b, "");
498 lp_build_lerp(struct lp_build_context *bld,
503 return lp_build_add(bld, v0, lp_build_mul(bld, x, lp_build_sub(bld, v1, v0)));
508 lp_build_lerp_2d(struct lp_build_context *bld,
516 LLVMValueRef v0 = lp_build_lerp(bld, x, v00, v01);
517 LLVMValueRef v1 = lp_build_lerp(bld, x, v10, v11);
518 return lp_build_lerp(bld, y, v0, v1);
524 * Do checks for special cases.
527 lp_build_min(struct lp_build_context *bld,
531 if(a == bld->undef || b == bld->undef)
538 if(a == bld->zero || b == bld->zero)
546 return lp_build_min_simple(bld, a, b);
552 * Do checks for special cases.
555 lp_build_max(struct lp_build_context *bld,
559 if(a == bld->undef || b == bld->undef)
566 if(a == bld->one || b == bld->one)
574 return lp_build_max_simple(bld, a, b);
582 lp_build_abs(struct lp_build_context *bld,
585 const struct lp_type type = bld->type;
586 LLVMTypeRef vec_type = lp_build_vec_type(type);
592 /* Mask out the sign bit */
593 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
594 LLVMValueRef mask = lp_build_int_const_scalar(type, ((unsigned long long)1 << type.width) - 1);
595 a = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
596 a = LLVMBuildAnd(bld->builder, a, mask, "");
597 a = LLVMBuildBitCast(bld->builder, a, vec_type, "");
601 if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
604 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
606 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.w.128", vec_type, a);
608 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
612 return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
617 lp_build_sgn(struct lp_build_context *bld,
620 const struct lp_type type = bld->type;
621 LLVMTypeRef vec_type = lp_build_vec_type(type);
625 /* Handle non-zero case */
627 /* if not zero then sign must be positive */
630 else if(type.floating) {
631 /* Take the sign bit and add it to 1 constant */
632 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
633 LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
636 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
637 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
638 one = LLVMConstBitCast(bld->one, int_vec_type);
639 res = LLVMBuildOr(bld->builder, sign, one, "");
640 res = LLVMBuildBitCast(bld->builder, res, vec_type, "");
644 LLVMValueRef minus_one = lp_build_const_scalar(type, -1.0);
645 cond = lp_build_cmp(bld, PIPE_FUNC_GREATER, a, bld->zero);
646 res = lp_build_select(bld, cond, bld->one, minus_one);
650 cond = lp_build_cmp(bld, PIPE_FUNC_EQUAL, a, bld->zero);
651 res = lp_build_select(bld, cond, bld->zero, bld->one);
657 enum lp_build_round_sse41_mode
659 LP_BUILD_ROUND_SSE41_NEAREST = 0,
660 LP_BUILD_ROUND_SSE41_FLOOR = 1,
661 LP_BUILD_ROUND_SSE41_CEIL = 2,
662 LP_BUILD_ROUND_SSE41_TRUNCATE = 3
666 static INLINE LLVMValueRef
667 lp_build_round_sse41(struct lp_build_context *bld,
669 enum lp_build_round_sse41_mode mode)
671 const struct lp_type type = bld->type;
672 LLVMTypeRef vec_type = lp_build_vec_type(type);
673 const char *intrinsic;
675 assert(type.floating);
676 assert(type.width*type.length == 128);
677 assert(lp_check_value(type, a));
678 assert(util_cpu_caps.has_sse4_1);
682 intrinsic = "llvm.x86.sse41.round.ps";
685 intrinsic = "llvm.x86.sse41.round.pd";
692 return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
693 LLVMConstInt(LLVMInt32Type(), mode, 0));
698 lp_build_trunc(struct lp_build_context *bld,
701 const struct lp_type type = bld->type;
703 assert(type.floating);
704 assert(lp_check_value(type, a));
706 if(util_cpu_caps.has_sse4_1)
707 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
709 LLVMTypeRef vec_type = lp_build_vec_type(type);
710 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
712 res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
713 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
720 lp_build_round(struct lp_build_context *bld,
723 const struct lp_type type = bld->type;
725 assert(type.floating);
726 assert(lp_check_value(type, a));
728 if(util_cpu_caps.has_sse4_1)
729 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
731 LLVMTypeRef vec_type = lp_build_vec_type(type);
733 res = lp_build_iround(bld, a);
734 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
741 lp_build_floor(struct lp_build_context *bld,
744 const struct lp_type type = bld->type;
746 assert(type.floating);
748 if(util_cpu_caps.has_sse4_1)
749 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
751 LLVMTypeRef vec_type = lp_build_vec_type(type);
753 res = lp_build_ifloor(bld, a);
754 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
761 lp_build_ceil(struct lp_build_context *bld,
764 const struct lp_type type = bld->type;
766 assert(type.floating);
767 assert(lp_check_value(type, a));
769 if(util_cpu_caps.has_sse4_1)
770 return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
772 LLVMTypeRef vec_type = lp_build_vec_type(type);
774 res = lp_build_iceil(bld, a);
775 res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
782 * Convert to integer, through whichever rounding method that's fastest,
783 * typically truncating to zero.
786 lp_build_itrunc(struct lp_build_context *bld,
789 const struct lp_type type = bld->type;
790 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
792 assert(type.floating);
793 assert(lp_check_value(type, a));
795 return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
800 lp_build_iround(struct lp_build_context *bld,
803 const struct lp_type type = bld->type;
804 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
807 assert(type.floating);
808 assert(lp_check_value(type, a));
810 if(util_cpu_caps.has_sse4_1) {
811 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
814 LLVMTypeRef vec_type = lp_build_vec_type(type);
815 LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
820 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
821 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
824 half = lp_build_const_scalar(type, 0.5);
825 half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
826 half = LLVMBuildOr(bld->builder, sign, half, "");
827 half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
829 res = LLVMBuildAdd(bld->builder, a, half, "");
832 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
839 lp_build_ifloor(struct lp_build_context *bld,
842 const struct lp_type type = bld->type;
843 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
846 assert(type.floating);
847 assert(lp_check_value(type, a));
849 if(util_cpu_caps.has_sse4_1) {
850 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
853 /* Take the sign bit and add it to 1 constant */
854 LLVMTypeRef vec_type = lp_build_vec_type(type);
855 unsigned mantissa = lp_mantissa(type);
856 LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
860 /* sign = a < 0 ? ~0 : 0 */
861 sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
862 sign = LLVMBuildAnd(bld->builder, sign, mask, "");
863 sign = LLVMBuildAShr(bld->builder, sign, lp_build_int_const_scalar(type, type.width - 1), "");
865 /* offset = -0.99999(9)f */
866 offset = lp_build_const_scalar(type, -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa));
867 offset = LLVMConstBitCast(offset, int_vec_type);
869 /* offset = a < 0 ? -0.99999(9)f : 0.0f */
870 offset = LLVMBuildAnd(bld->builder, offset, sign, "");
871 offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "");
873 res = LLVMBuildAdd(bld->builder, a, offset, "");
876 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
883 lp_build_iceil(struct lp_build_context *bld,
886 const struct lp_type type = bld->type;
887 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
890 assert(type.floating);
891 assert(lp_check_value(type, a));
893 if(util_cpu_caps.has_sse4_1) {
894 res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
901 res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
908 lp_build_sqrt(struct lp_build_context *bld,
911 const struct lp_type type = bld->type;
912 LLVMTypeRef vec_type = lp_build_vec_type(type);
915 /* TODO: optimize the constant case */
916 /* TODO: optimize the constant case */
918 assert(type.floating);
919 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sqrt.v%uf%u", type.length, type.width);
921 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
926 lp_build_rcp(struct lp_build_context *bld,
929 const struct lp_type type = bld->type;
938 assert(type.floating);
940 if(LLVMIsConstant(a))
941 return LLVMConstFDiv(bld->one, a);
943 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
944 /* FIXME: improve precision */
945 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
947 return LLVMBuildFDiv(bld->builder, bld->one, a, "");
955 lp_build_rsqrt(struct lp_build_context *bld,
958 const struct lp_type type = bld->type;
960 assert(type.floating);
962 if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
963 return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
965 return lp_build_rcp(bld, lp_build_sqrt(bld, a));
973 lp_build_cos(struct lp_build_context *bld,
976 const struct lp_type type = bld->type;
977 LLVMTypeRef vec_type = lp_build_vec_type(type);
980 /* TODO: optimize the constant case */
982 assert(type.floating);
983 util_snprintf(intrinsic, sizeof intrinsic, "llvm.cos.v%uf%u", type.length, type.width);
985 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
993 lp_build_sin(struct lp_build_context *bld,
996 const struct lp_type type = bld->type;
997 LLVMTypeRef vec_type = lp_build_vec_type(type);
1000 /* TODO: optimize the constant case */
1002 assert(type.floating);
1003 util_snprintf(intrinsic, sizeof intrinsic, "llvm.sin.v%uf%u", type.length, type.width);
1005 return lp_build_intrinsic_unary(bld->builder, intrinsic, vec_type, a);
1010 * Generate pow(x, y)
1013 lp_build_pow(struct lp_build_context *bld,
1017 /* TODO: optimize the constant case */
1018 if(LLVMIsConstant(x) && LLVMIsConstant(y))
1019 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1022 return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
1030 lp_build_exp(struct lp_build_context *bld,
1033 /* log2(e) = 1/log(2) */
1034 LLVMValueRef log2e = lp_build_const_scalar(bld->type, 1.4426950408889634);
1036 return lp_build_mul(bld, log2e, lp_build_exp2(bld, x));
1044 lp_build_log(struct lp_build_context *bld,
1048 LLVMValueRef log2 = lp_build_const_scalar(bld->type, 1.4426950408889634);
1050 return lp_build_mul(bld, log2, lp_build_exp2(bld, x));
1054 #define EXP_POLY_DEGREE 3
1055 #define LOG_POLY_DEGREE 5
1059 * Generate polynomial.
1060 * Ex: coeffs[0] + x * coeffs[1] + x^2 * coeffs[2].
1063 lp_build_polynomial(struct lp_build_context *bld,
1065 const double *coeffs,
1066 unsigned num_coeffs)
1068 const struct lp_type type = bld->type;
1069 LLVMValueRef res = NULL;
1072 /* TODO: optimize the constant case */
1073 if(LLVMIsConstant(x))
1074 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1077 for (i = num_coeffs; i--; ) {
1078 LLVMValueRef coeff = lp_build_const_scalar(type, coeffs[i]);
1080 res = lp_build_add(bld, coeff, lp_build_mul(bld, x, res));
1093 * Minimax polynomial fit of 2**x, in range [-0.5, 0.5[
1095 const double lp_build_exp2_polynomial[] = {
1096 #if EXP_POLY_DEGREE == 5
1097 9.9999994e-1, 6.9315308e-1, 2.4015361e-1, 5.5826318e-2, 8.9893397e-3, 1.8775767e-3
1098 #elif EXP_POLY_DEGREE == 4
1099 1.0000026, 6.9300383e-1, 2.4144275e-1, 5.2011464e-2, 1.3534167e-2
1100 #elif EXP_POLY_DEGREE == 3
1101 9.9992520e-1, 6.9583356e-1, 2.2606716e-1, 7.8024521e-2
1102 #elif EXP_POLY_DEGREE == 2
1103 1.0017247, 6.5763628e-1, 3.3718944e-1
1111 lp_build_exp2_approx(struct lp_build_context *bld,
1113 LLVMValueRef *p_exp2_int_part,
1114 LLVMValueRef *p_frac_part,
1115 LLVMValueRef *p_exp2)
1117 const struct lp_type type = bld->type;
1118 LLVMTypeRef vec_type = lp_build_vec_type(type);
1119 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1120 LLVMValueRef ipart = NULL;
1121 LLVMValueRef fpart = NULL;
1122 LLVMValueRef expipart = NULL;
1123 LLVMValueRef expfpart = NULL;
1124 LLVMValueRef res = NULL;
1126 if(p_exp2_int_part || p_frac_part || p_exp2) {
1127 /* TODO: optimize the constant case */
1128 if(LLVMIsConstant(x))
1129 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1132 assert(type.floating && type.width == 32);
1134 x = lp_build_min(bld, x, lp_build_const_scalar(type, 129.0));
1135 x = lp_build_max(bld, x, lp_build_const_scalar(type, -126.99999));
1137 /* ipart = int(x - 0.5) */
1138 ipart = LLVMBuildSub(bld->builder, x, lp_build_const_scalar(type, 0.5f), "");
1139 ipart = LLVMBuildFPToSI(bld->builder, ipart, int_vec_type, "");
1141 /* fpart = x - ipart */
1142 fpart = LLVMBuildSIToFP(bld->builder, ipart, vec_type, "");
1143 fpart = LLVMBuildSub(bld->builder, x, fpart, "");
1146 if(p_exp2_int_part || p_exp2) {
1147 /* expipart = (float) (1 << ipart) */
1148 expipart = LLVMBuildAdd(bld->builder, ipart, lp_build_int_const_scalar(type, 127), "");
1149 expipart = LLVMBuildShl(bld->builder, expipart, lp_build_int_const_scalar(type, 23), "");
1150 expipart = LLVMBuildBitCast(bld->builder, expipart, vec_type, "");
1154 expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
1155 Elements(lp_build_exp2_polynomial));
1157 res = LLVMBuildMul(bld->builder, expipart, expfpart, "");
1161 *p_exp2_int_part = expipart;
1164 *p_frac_part = fpart;
1172 lp_build_exp2(struct lp_build_context *bld,
1176 lp_build_exp2_approx(bld, x, NULL, NULL, &res);
1182 * Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[
1183 * These coefficients can be generated with
1184 * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
1186 const double lp_build_log2_polynomial[] = {
1187 #if LOG_POLY_DEGREE == 6
1188 3.11578814719469302614, -3.32419399085241980044, 2.59883907202499966007, -1.23152682416275988241, 0.318212422185251071475, -0.0344359067839062357313
1189 #elif LOG_POLY_DEGREE == 5
1190 2.8882704548164776201, -2.52074962577807006663, 1.48116647521213171641, -0.465725644288844778798, 0.0596515482674574969533
1191 #elif LOG_POLY_DEGREE == 4
1192 2.61761038894603480148, -1.75647175389045657003, 0.688243882994381274313, -0.107254423828329604454
1193 #elif LOG_POLY_DEGREE == 3
1194 2.28330284476918490682, -1.04913055217340124191, 0.204446009836232697516
1202 * See http://www.devmaster.net/forums/showthread.php?p=43580
1205 lp_build_log2_approx(struct lp_build_context *bld,
1207 LLVMValueRef *p_exp,
1208 LLVMValueRef *p_floor_log2,
1209 LLVMValueRef *p_log2)
1211 const struct lp_type type = bld->type;
1212 LLVMTypeRef vec_type = lp_build_vec_type(type);
1213 LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
1215 LLVMValueRef expmask = lp_build_int_const_scalar(type, 0x7f800000);
1216 LLVMValueRef mantmask = lp_build_int_const_scalar(type, 0x007fffff);
1217 LLVMValueRef one = LLVMConstBitCast(bld->one, int_vec_type);
1219 LLVMValueRef i = NULL;
1220 LLVMValueRef exp = NULL;
1221 LLVMValueRef mant = NULL;
1222 LLVMValueRef logexp = NULL;
1223 LLVMValueRef logmant = NULL;
1224 LLVMValueRef res = NULL;
1226 if(p_exp || p_floor_log2 || p_log2) {
1227 /* TODO: optimize the constant case */
1228 if(LLVMIsConstant(x))
1229 debug_printf("%s: inefficient/imprecise constant arithmetic\n",
1232 assert(type.floating && type.width == 32);
1234 i = LLVMBuildBitCast(bld->builder, x, int_vec_type, "");
1236 /* exp = (float) exponent(x) */
1237 exp = LLVMBuildAnd(bld->builder, i, expmask, "");
1240 if(p_floor_log2 || p_log2) {
1241 logexp = LLVMBuildLShr(bld->builder, exp, lp_build_int_const_scalar(type, 23), "");
1242 logexp = LLVMBuildSub(bld->builder, logexp, lp_build_int_const_scalar(type, 127), "");
1243 logexp = LLVMBuildSIToFP(bld->builder, logexp, vec_type, "");
1247 /* mant = (float) mantissa(x) */
1248 mant = LLVMBuildAnd(bld->builder, i, mantmask, "");
1249 mant = LLVMBuildOr(bld->builder, mant, one, "");
1250 mant = LLVMBuildSIToFP(bld->builder, mant, vec_type, "");
1252 logmant = lp_build_polynomial(bld, mant, lp_build_log2_polynomial,
1253 Elements(lp_build_log2_polynomial));
1255 /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
1256 logmant = LLVMBuildMul(bld->builder, logmant, LLVMBuildMul(bld->builder, mant, bld->one, ""), "");
1258 res = LLVMBuildAdd(bld->builder, logmant, logexp, "");
1265 *p_floor_log2 = logexp;
1273 lp_build_log2(struct lp_build_context *bld,
1277 lp_build_log2_approx(bld, x, NULL, NULL, &res);