2 * Copyright 2014 Advanced Micro Devices, Inc.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the
6 * "Software"), to deal in the Software without restriction, including
7 * without limitation the rights to use, copy, modify, merge, publish,
8 * distribute, sub license, and/or sell copies of the Software, and to
9 * permit persons to whom the Software is furnished to do so, subject to
10 * the following conditions:
12 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
14 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
15 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
16 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
17 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
18 * USE OR OTHER DEALINGS IN THE SOFTWARE.
20 * The above copyright notice and this permission notice (including the
21 * next paragraph) shall be included in all copies or substantial portions
25 /* based on pieces from si_pipe.c and radeon_llvm_emit.c */
26 #include "ac_llvm_build.h"
28 #include <llvm-c/Core.h>
30 #include "c11/threads.h"
35 #include "ac_llvm_util.h"
36 #include "ac_exp_param.h"
37 #include "util/bitscan.h"
38 #include "util/macros.h"
39 #include "util/u_atomic.h"
40 #include "util/u_math.h"
43 #include "shader_enums.h"
45 #define AC_LLVM_INITIAL_CF_DEPTH 4
47 /* Data for if/else/endif and bgnloop/endloop control flow structures.
50 /* Loop exit or next part of if/else/endif. */
51 LLVMBasicBlockRef next_block;
52 LLVMBasicBlockRef loop_entry_block;
55 /* Initialize module-independent parts of the context.
57 * The caller is responsible for initializing ctx::module and ctx::builder.
60 ac_llvm_context_init(struct ac_llvm_context *ctx,
61 enum chip_class chip_class, enum radeon_family family)
65 ctx->context = LLVMContextCreate();
67 ctx->chip_class = chip_class;
72 ctx->voidt = LLVMVoidTypeInContext(ctx->context);
73 ctx->i1 = LLVMInt1TypeInContext(ctx->context);
74 ctx->i8 = LLVMInt8TypeInContext(ctx->context);
75 ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
76 ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
77 ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
78 ctx->intptr = HAVE_32BIT_POINTERS ? ctx->i32 : ctx->i64;
79 ctx->f16 = LLVMHalfTypeInContext(ctx->context);
80 ctx->f32 = LLVMFloatTypeInContext(ctx->context);
81 ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
82 ctx->v2i16 = LLVMVectorType(ctx->i16, 2);
83 ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
84 ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
85 ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
86 ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
87 ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
88 ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
90 ctx->i32_0 = LLVMConstInt(ctx->i32, 0, false);
91 ctx->i32_1 = LLVMConstInt(ctx->i32, 1, false);
92 ctx->i64_0 = LLVMConstInt(ctx->i64, 0, false);
93 ctx->i64_1 = LLVMConstInt(ctx->i64, 1, false);
94 ctx->f32_0 = LLVMConstReal(ctx->f32, 0.0);
95 ctx->f32_1 = LLVMConstReal(ctx->f32, 1.0);
96 ctx->f64_0 = LLVMConstReal(ctx->f64, 0.0);
97 ctx->f64_1 = LLVMConstReal(ctx->f64, 1.0);
99 ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
100 ctx->i1true = LLVMConstInt(ctx->i1, 1, false);
102 ctx->range_md_kind = LLVMGetMDKindIDInContext(ctx->context,
105 ctx->invariant_load_md_kind = LLVMGetMDKindIDInContext(ctx->context,
106 "invariant.load", 14);
108 ctx->fpmath_md_kind = LLVMGetMDKindIDInContext(ctx->context, "fpmath", 6);
110 args[0] = LLVMConstReal(ctx->f32, 2.5);
111 ctx->fpmath_md_2p5_ulp = LLVMMDNodeInContext(ctx->context, args, 1);
113 ctx->uniform_md_kind = LLVMGetMDKindIDInContext(ctx->context,
114 "amdgpu.uniform", 14);
116 ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
120 ac_llvm_context_dispose(struct ac_llvm_context *ctx)
124 ctx->flow_depth_max = 0;
128 ac_get_llvm_num_components(LLVMValueRef value)
130 LLVMTypeRef type = LLVMTypeOf(value);
131 unsigned num_components = LLVMGetTypeKind(type) == LLVMVectorTypeKind
132 ? LLVMGetVectorSize(type)
134 return num_components;
138 ac_llvm_extract_elem(struct ac_llvm_context *ac,
142 if (LLVMGetTypeKind(LLVMTypeOf(value)) != LLVMVectorTypeKind) {
147 return LLVMBuildExtractElement(ac->builder, value,
148 LLVMConstInt(ac->i32, index, false), "");
152 ac_get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
154 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
155 type = LLVMGetElementType(type);
157 if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind)
158 return LLVMGetIntTypeWidth(type);
160 if (type == ctx->f16)
162 if (type == ctx->f32)
164 if (type == ctx->f64)
167 unreachable("Unhandled type kind in get_elem_bits");
171 ac_get_type_size(LLVMTypeRef type)
173 LLVMTypeKind kind = LLVMGetTypeKind(type);
176 case LLVMIntegerTypeKind:
177 return LLVMGetIntTypeWidth(type) / 8;
178 case LLVMHalfTypeKind:
180 case LLVMFloatTypeKind:
182 case LLVMDoubleTypeKind:
184 case LLVMPointerTypeKind:
185 if (LLVMGetPointerAddressSpace(type) == AC_CONST_32BIT_ADDR_SPACE)
188 case LLVMVectorTypeKind:
189 return LLVMGetVectorSize(type) *
190 ac_get_type_size(LLVMGetElementType(type));
191 case LLVMArrayTypeKind:
192 return LLVMGetArrayLength(type) *
193 ac_get_type_size(LLVMGetElementType(type));
200 static LLVMTypeRef to_integer_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
202 if (t == ctx->f16 || t == ctx->i16)
204 else if (t == ctx->f32 || t == ctx->i32)
206 else if (t == ctx->f64 || t == ctx->i64)
209 unreachable("Unhandled integer size");
213 ac_to_integer_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
215 if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
216 LLVMTypeRef elem_type = LLVMGetElementType(t);
217 return LLVMVectorType(to_integer_type_scalar(ctx, elem_type),
218 LLVMGetVectorSize(t));
220 return to_integer_type_scalar(ctx, t);
224 ac_to_integer(struct ac_llvm_context *ctx, LLVMValueRef v)
226 LLVMTypeRef type = LLVMTypeOf(v);
227 return LLVMBuildBitCast(ctx->builder, v, ac_to_integer_type(ctx, type), "");
230 static LLVMTypeRef to_float_type_scalar(struct ac_llvm_context *ctx, LLVMTypeRef t)
232 if (t == ctx->i16 || t == ctx->f16)
234 else if (t == ctx->i32 || t == ctx->f32)
236 else if (t == ctx->i64 || t == ctx->f64)
239 unreachable("Unhandled float size");
243 ac_to_float_type(struct ac_llvm_context *ctx, LLVMTypeRef t)
245 if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
246 LLVMTypeRef elem_type = LLVMGetElementType(t);
247 return LLVMVectorType(to_float_type_scalar(ctx, elem_type),
248 LLVMGetVectorSize(t));
250 return to_float_type_scalar(ctx, t);
254 ac_to_float(struct ac_llvm_context *ctx, LLVMValueRef v)
256 LLVMTypeRef type = LLVMTypeOf(v);
257 return LLVMBuildBitCast(ctx->builder, v, ac_to_float_type(ctx, type), "");
262 ac_build_intrinsic(struct ac_llvm_context *ctx, const char *name,
263 LLVMTypeRef return_type, LLVMValueRef *params,
264 unsigned param_count, unsigned attrib_mask)
266 LLVMValueRef function, call;
267 bool set_callsite_attrs = !(attrib_mask & AC_FUNC_ATTR_LEGACY);
269 function = LLVMGetNamedFunction(ctx->module, name);
271 LLVMTypeRef param_types[32], function_type;
274 assert(param_count <= 32);
276 for (i = 0; i < param_count; ++i) {
278 param_types[i] = LLVMTypeOf(params[i]);
281 LLVMFunctionType(return_type, param_types, param_count, 0);
282 function = LLVMAddFunction(ctx->module, name, function_type);
284 LLVMSetFunctionCallConv(function, LLVMCCallConv);
285 LLVMSetLinkage(function, LLVMExternalLinkage);
287 if (!set_callsite_attrs)
288 ac_add_func_attributes(ctx->context, function, attrib_mask);
291 call = LLVMBuildCall(ctx->builder, function, params, param_count, "");
292 if (set_callsite_attrs)
293 ac_add_func_attributes(ctx->context, call, attrib_mask);
298 * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
301 void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
303 LLVMTypeRef elem_type = type;
305 assert(bufsize >= 8);
307 if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
308 int ret = snprintf(buf, bufsize, "v%u",
309 LLVMGetVectorSize(type));
311 char *type_name = LLVMPrintTypeToString(type);
312 fprintf(stderr, "Error building type name for: %s\n",
316 elem_type = LLVMGetElementType(type);
320 switch (LLVMGetTypeKind(elem_type)) {
322 case LLVMIntegerTypeKind:
323 snprintf(buf, bufsize, "i%d", LLVMGetIntTypeWidth(elem_type));
325 case LLVMHalfTypeKind:
326 snprintf(buf, bufsize, "f16");
328 case LLVMFloatTypeKind:
329 snprintf(buf, bufsize, "f32");
331 case LLVMDoubleTypeKind:
332 snprintf(buf, bufsize, "f64");
338 * Helper function that builds an LLVM IR PHI node and immediately adds
342 ac_build_phi(struct ac_llvm_context *ctx, LLVMTypeRef type,
343 unsigned count_incoming, LLVMValueRef *values,
344 LLVMBasicBlockRef *blocks)
346 LLVMValueRef phi = LLVMBuildPhi(ctx->builder, type, "");
347 LLVMAddIncoming(phi, values, blocks, count_incoming);
351 void ac_build_s_barrier(struct ac_llvm_context *ctx)
353 ac_build_intrinsic(ctx, "llvm.amdgcn.s.barrier", ctx->voidt, NULL,
354 0, AC_FUNC_ATTR_CONVERGENT);
357 /* Prevent optimizations (at least of memory accesses) across the current
358 * point in the program by emitting empty inline assembly that is marked as
359 * having side effects.
361 * Optionally, a value can be passed through the inline assembly to prevent
362 * LLVM from hoisting calls to ReadNone functions.
365 ac_build_optimization_barrier(struct ac_llvm_context *ctx,
368 static int counter = 0;
370 LLVMBuilderRef builder = ctx->builder;
373 snprintf(code, sizeof(code), "; %d", p_atomic_inc_return(&counter));
376 LLVMTypeRef ftype = LLVMFunctionType(ctx->voidt, NULL, 0, false);
377 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "", true, false);
378 LLVMBuildCall(builder, inlineasm, NULL, 0, "");
380 LLVMTypeRef ftype = LLVMFunctionType(ctx->i32, &ctx->i32, 1, false);
381 LLVMValueRef inlineasm = LLVMConstInlineAsm(ftype, code, "=v,0", true, false);
382 LLVMValueRef vgpr = *pvgpr;
383 LLVMTypeRef vgpr_type = LLVMTypeOf(vgpr);
384 unsigned vgpr_size = ac_get_type_size(vgpr_type);
387 assert(vgpr_size % 4 == 0);
389 vgpr = LLVMBuildBitCast(builder, vgpr, LLVMVectorType(ctx->i32, vgpr_size / 4), "");
390 vgpr0 = LLVMBuildExtractElement(builder, vgpr, ctx->i32_0, "");
391 vgpr0 = LLVMBuildCall(builder, inlineasm, &vgpr0, 1, "");
392 vgpr = LLVMBuildInsertElement(builder, vgpr, vgpr0, ctx->i32_0, "");
393 vgpr = LLVMBuildBitCast(builder, vgpr, vgpr_type, "");
400 ac_build_shader_clock(struct ac_llvm_context *ctx)
402 LLVMValueRef tmp = ac_build_intrinsic(ctx, "llvm.readcyclecounter",
403 ctx->i64, NULL, 0, 0);
404 return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2i32, "");
408 ac_build_ballot(struct ac_llvm_context *ctx,
411 LLVMValueRef args[3] = {
414 LLVMConstInt(ctx->i32, LLVMIntNE, 0)
417 /* We currently have no other way to prevent LLVM from lifting the icmp
418 * calls to a dominating basic block.
420 ac_build_optimization_barrier(ctx, &args[0]);
422 args[0] = ac_to_integer(ctx, args[0]);
424 return ac_build_intrinsic(ctx,
425 "llvm.amdgcn.icmp.i32",
427 AC_FUNC_ATTR_NOUNWIND |
428 AC_FUNC_ATTR_READNONE |
429 AC_FUNC_ATTR_CONVERGENT);
433 ac_build_vote_all(struct ac_llvm_context *ctx, LLVMValueRef value)
435 LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
436 LLVMValueRef vote_set = ac_build_ballot(ctx, value);
437 return LLVMBuildICmp(ctx->builder, LLVMIntEQ, vote_set, active_set, "");
441 ac_build_vote_any(struct ac_llvm_context *ctx, LLVMValueRef value)
443 LLVMValueRef vote_set = ac_build_ballot(ctx, value);
444 return LLVMBuildICmp(ctx->builder, LLVMIntNE, vote_set,
445 LLVMConstInt(ctx->i64, 0, 0), "");
449 ac_build_vote_eq(struct ac_llvm_context *ctx, LLVMValueRef value)
451 LLVMValueRef active_set = ac_build_ballot(ctx, ctx->i32_1);
452 LLVMValueRef vote_set = ac_build_ballot(ctx, value);
454 LLVMValueRef all = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
455 vote_set, active_set, "");
456 LLVMValueRef none = LLVMBuildICmp(ctx->builder, LLVMIntEQ,
458 LLVMConstInt(ctx->i64, 0, 0), "");
459 return LLVMBuildOr(ctx->builder, all, none, "");
463 ac_build_varying_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
464 unsigned value_count, unsigned component)
466 LLVMValueRef vec = NULL;
468 if (value_count == 1) {
469 return values[component];
470 } else if (!value_count)
471 unreachable("value_count is 0");
473 for (unsigned i = component; i < value_count + component; i++) {
474 LLVMValueRef value = values[i];
477 vec = LLVMGetUndef( LLVMVectorType(LLVMTypeOf(value), value_count));
478 LLVMValueRef index = LLVMConstInt(ctx->i32, i - component, false);
479 vec = LLVMBuildInsertElement(ctx->builder, vec, value, index, "");
485 ac_build_gather_values_extended(struct ac_llvm_context *ctx,
486 LLVMValueRef *values,
487 unsigned value_count,
488 unsigned value_stride,
492 LLVMBuilderRef builder = ctx->builder;
493 LLVMValueRef vec = NULL;
496 if (value_count == 1 && !always_vector) {
498 return LLVMBuildLoad(builder, values[0], "");
500 } else if (!value_count)
501 unreachable("value_count is 0");
503 for (i = 0; i < value_count; i++) {
504 LLVMValueRef value = values[i * value_stride];
506 value = LLVMBuildLoad(builder, value, "");
509 vec = LLVMGetUndef( LLVMVectorType(LLVMTypeOf(value), value_count));
510 LLVMValueRef index = LLVMConstInt(ctx->i32, i, false);
511 vec = LLVMBuildInsertElement(builder, vec, value, index, "");
517 ac_build_gather_values(struct ac_llvm_context *ctx,
518 LLVMValueRef *values,
519 unsigned value_count)
521 return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false);
524 /* Expand a scalar or vector to <4 x type> by filling the remaining channels
525 * with undef. Extract at most num_channels components from the input.
527 LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx,
529 unsigned num_channels)
531 LLVMTypeRef elemtype;
532 LLVMValueRef chan[4];
534 if (LLVMGetTypeKind(LLVMTypeOf(value)) == LLVMVectorTypeKind) {
535 unsigned vec_size = LLVMGetVectorSize(LLVMTypeOf(value));
536 num_channels = MIN2(num_channels, vec_size);
538 if (num_channels >= 4)
541 for (unsigned i = 0; i < num_channels; i++)
542 chan[i] = ac_llvm_extract_elem(ctx, value, i);
544 elemtype = LLVMGetElementType(LLVMTypeOf(value));
547 assert(num_channels == 1);
550 elemtype = LLVMTypeOf(value);
553 while (num_channels < 4)
554 chan[num_channels++] = LLVMGetUndef(elemtype);
556 return ac_build_gather_values(ctx, chan, 4);
560 ac_build_fdiv(struct ac_llvm_context *ctx,
564 /* If we do (num / den), LLVM >= 7.0 does:
565 * return num * v_rcp_f32(den * (fabs(den) > 0x1.0p+96f ? 0x1.0p-32f : 1.0f));
567 * If we do (num * (1 / den)), LLVM does:
568 * return num * v_rcp_f32(den);
570 LLVMValueRef rcp = LLVMBuildFDiv(ctx->builder, ctx->f32_1, den, "");
571 LLVMValueRef ret = LLVMBuildFMul(ctx->builder, num, rcp, "");
573 /* Use v_rcp_f32 instead of precise division. */
574 if (!LLVMIsConstant(ret))
575 LLVMSetMetadata(ret, ctx->fpmath_md_kind, ctx->fpmath_md_2p5_ulp);
579 /* Coordinates for cube map selection. sc, tc, and ma are as in Table 8.27
580 * of the OpenGL 4.5 (Compatibility Profile) specification, except ma is
581 * already multiplied by two. id is the cube face number.
583 struct cube_selection_coords {
590 build_cube_intrinsic(struct ac_llvm_context *ctx,
592 struct cube_selection_coords *out)
594 LLVMTypeRef f32 = ctx->f32;
596 out->stc[1] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubetc",
597 f32, in, 3, AC_FUNC_ATTR_READNONE);
598 out->stc[0] = ac_build_intrinsic(ctx, "llvm.amdgcn.cubesc",
599 f32, in, 3, AC_FUNC_ATTR_READNONE);
600 out->ma = ac_build_intrinsic(ctx, "llvm.amdgcn.cubema",
601 f32, in, 3, AC_FUNC_ATTR_READNONE);
602 out->id = ac_build_intrinsic(ctx, "llvm.amdgcn.cubeid",
603 f32, in, 3, AC_FUNC_ATTR_READNONE);
607 * Build a manual selection sequence for cube face sc/tc coordinates and
608 * major axis vector (multiplied by 2 for consistency) for the given
609 * vec3 \p coords, for the face implied by \p selcoords.
611 * For the major axis, we always adjust the sign to be in the direction of
612 * selcoords.ma; i.e., a positive out_ma means that coords is pointed towards
613 * the selcoords major axis.
615 static void build_cube_select(struct ac_llvm_context *ctx,
616 const struct cube_selection_coords *selcoords,
617 const LLVMValueRef *coords,
618 LLVMValueRef *out_st,
619 LLVMValueRef *out_ma)
621 LLVMBuilderRef builder = ctx->builder;
622 LLVMTypeRef f32 = LLVMTypeOf(coords[0]);
623 LLVMValueRef is_ma_positive;
625 LLVMValueRef is_ma_z, is_not_ma_z;
626 LLVMValueRef is_ma_y;
627 LLVMValueRef is_ma_x;
631 is_ma_positive = LLVMBuildFCmp(builder, LLVMRealUGE,
632 selcoords->ma, LLVMConstReal(f32, 0.0), "");
633 sgn_ma = LLVMBuildSelect(builder, is_ma_positive,
634 LLVMConstReal(f32, 1.0), LLVMConstReal(f32, -1.0), "");
636 is_ma_z = LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 4.0), "");
637 is_not_ma_z = LLVMBuildNot(builder, is_ma_z, "");
638 is_ma_y = LLVMBuildAnd(builder, is_not_ma_z,
639 LLVMBuildFCmp(builder, LLVMRealUGE, selcoords->id, LLVMConstReal(f32, 2.0), ""), "");
640 is_ma_x = LLVMBuildAnd(builder, is_not_ma_z, LLVMBuildNot(builder, is_ma_y, ""), "");
643 tmp = LLVMBuildSelect(builder, is_ma_x, coords[2], coords[0], "");
644 sgn = LLVMBuildSelect(builder, is_ma_y, LLVMConstReal(f32, 1.0),
645 LLVMBuildSelect(builder, is_ma_z, sgn_ma,
646 LLVMBuildFNeg(builder, sgn_ma, ""), ""), "");
647 out_st[0] = LLVMBuildFMul(builder, tmp, sgn, "");
650 tmp = LLVMBuildSelect(builder, is_ma_y, coords[2], coords[1], "");
651 sgn = LLVMBuildSelect(builder, is_ma_y, sgn_ma,
652 LLVMConstReal(f32, -1.0), "");
653 out_st[1] = LLVMBuildFMul(builder, tmp, sgn, "");
656 tmp = LLVMBuildSelect(builder, is_ma_z, coords[2],
657 LLVMBuildSelect(builder, is_ma_y, coords[1], coords[0], ""), "");
658 tmp = ac_build_intrinsic(ctx, "llvm.fabs.f32",
659 ctx->f32, &tmp, 1, AC_FUNC_ATTR_READNONE);
660 *out_ma = LLVMBuildFMul(builder, tmp, LLVMConstReal(f32, 2.0), "");
664 ac_prepare_cube_coords(struct ac_llvm_context *ctx,
665 bool is_deriv, bool is_array, bool is_lod,
666 LLVMValueRef *coords_arg,
667 LLVMValueRef *derivs_arg)
670 LLVMBuilderRef builder = ctx->builder;
671 struct cube_selection_coords selcoords;
672 LLVMValueRef coords[3];
675 if (is_array && !is_lod) {
676 LLVMValueRef tmp = coords_arg[3];
677 tmp = ac_build_intrinsic(ctx, "llvm.rint.f32", ctx->f32, &tmp, 1, 0);
679 /* Section 8.9 (Texture Functions) of the GLSL 4.50 spec says:
681 * "For Array forms, the array layer used will be
683 * max(0, min(d−1, floor(layer+0.5)))
685 * where d is the depth of the texture array and layer
686 * comes from the component indicated in the tables below.
687 * Workaroudn for an issue where the layer is taken from a
688 * helper invocation which happens to fall on a different
689 * layer due to extrapolation."
691 * VI and earlier attempt to implement this in hardware by
692 * clamping the value of coords[2] = (8 * layer) + face.
693 * Unfortunately, this means that the we end up with the wrong
694 * face when clamping occurs.
696 * Clamp the layer earlier to work around the issue.
698 if (ctx->chip_class <= VI) {
700 ge0 = LLVMBuildFCmp(builder, LLVMRealOGE, tmp, ctx->f32_0, "");
701 tmp = LLVMBuildSelect(builder, ge0, tmp, ctx->f32_0, "");
707 build_cube_intrinsic(ctx, coords_arg, &selcoords);
709 invma = ac_build_intrinsic(ctx, "llvm.fabs.f32",
710 ctx->f32, &selcoords.ma, 1, AC_FUNC_ATTR_READNONE);
711 invma = ac_build_fdiv(ctx, LLVMConstReal(ctx->f32, 1.0), invma);
713 for (int i = 0; i < 2; ++i)
714 coords[i] = LLVMBuildFMul(builder, selcoords.stc[i], invma, "");
716 coords[2] = selcoords.id;
718 if (is_deriv && derivs_arg) {
719 LLVMValueRef derivs[4];
722 /* Convert cube derivatives to 2D derivatives. */
723 for (axis = 0; axis < 2; axis++) {
724 LLVMValueRef deriv_st[2];
725 LLVMValueRef deriv_ma;
727 /* Transform the derivative alongside the texture
728 * coordinate. Mathematically, the correct formula is
729 * as follows. Assume we're projecting onto the +Z face
730 * and denote by dx/dh the derivative of the (original)
731 * X texture coordinate with respect to horizontal
732 * window coordinates. The projection onto the +Z face
737 * Then df/dh = df/dx * dx/dh + df/dz * dz/dh
738 * = 1/z * dx/dh - x/z * 1/z * dz/dh.
740 * This motivatives the implementation below.
742 * Whether this actually gives the expected results for
743 * apps that might feed in derivatives obtained via
744 * finite differences is anyone's guess. The OpenGL spec
745 * seems awfully quiet about how textureGrad for cube
746 * maps should be handled.
748 build_cube_select(ctx, &selcoords, &derivs_arg[axis * 3],
749 deriv_st, &deriv_ma);
751 deriv_ma = LLVMBuildFMul(builder, deriv_ma, invma, "");
753 for (int i = 0; i < 2; ++i)
754 derivs[axis * 2 + i] =
755 LLVMBuildFSub(builder,
756 LLVMBuildFMul(builder, deriv_st[i], invma, ""),
757 LLVMBuildFMul(builder, deriv_ma, coords[i], ""), "");
760 memcpy(derivs_arg, derivs, sizeof(derivs));
763 /* Shift the texture coordinate. This must be applied after the
764 * derivative calculation.
766 for (int i = 0; i < 2; ++i)
767 coords[i] = LLVMBuildFAdd(builder, coords[i], LLVMConstReal(ctx->f32, 1.5), "");
770 /* for cube arrays coord.z = coord.w(array_index) * 8 + face */
771 /* coords_arg.w component - array_index for cube arrays */
772 coords[2] = ac_build_fmad(ctx, coords_arg[3], LLVMConstReal(ctx->f32, 8.0), coords[2]);
775 memcpy(coords_arg, coords, sizeof(coords));
780 ac_build_fs_interp(struct ac_llvm_context *ctx,
781 LLVMValueRef llvm_chan,
782 LLVMValueRef attr_number,
787 LLVMValueRef args[5];
792 args[2] = attr_number;
795 p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1",
796 ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
801 args[3] = attr_number;
804 return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2",
805 ctx->f32, args, 5, AC_FUNC_ATTR_READNONE);
809 ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
810 LLVMValueRef parameter,
811 LLVMValueRef llvm_chan,
812 LLVMValueRef attr_number,
815 LLVMValueRef args[4];
819 args[2] = attr_number;
822 return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.mov",
823 ctx->f32, args, 4, AC_FUNC_ATTR_READNONE);
827 ac_build_gep0(struct ac_llvm_context *ctx,
828 LLVMValueRef base_ptr,
831 LLVMValueRef indices[2] = {
832 LLVMConstInt(ctx->i32, 0, 0),
835 return LLVMBuildGEP(ctx->builder, base_ptr,
840 ac_build_indexed_store(struct ac_llvm_context *ctx,
841 LLVMValueRef base_ptr, LLVMValueRef index,
844 LLVMBuildStore(ctx->builder, value,
845 ac_build_gep0(ctx, base_ptr, index));
849 * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
850 * It's equivalent to doing a load from &base_ptr[index].
852 * \param base_ptr Where the array starts.
853 * \param index The element index into the array.
854 * \param uniform Whether the base_ptr and index can be assumed to be
855 * dynamically uniform (i.e. load to an SGPR)
856 * \param invariant Whether the load is invariant (no other opcodes affect it)
859 ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
860 LLVMValueRef index, bool uniform, bool invariant)
862 LLVMValueRef pointer, result;
864 pointer = ac_build_gep0(ctx, base_ptr, index);
866 LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
867 result = LLVMBuildLoad(ctx->builder, pointer, "");
869 LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
873 LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
876 return ac_build_load_custom(ctx, base_ptr, index, false, false);
879 LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx,
880 LLVMValueRef base_ptr, LLVMValueRef index)
882 return ac_build_load_custom(ctx, base_ptr, index, false, true);
885 LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx,
886 LLVMValueRef base_ptr, LLVMValueRef index)
888 return ac_build_load_custom(ctx, base_ptr, index, true, true);
891 /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
892 * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
893 * or v4i32 (num_channels=3,4).
896 ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
899 unsigned num_channels,
900 LLVMValueRef voffset,
901 LLVMValueRef soffset,
902 unsigned inst_offset,
905 bool writeonly_memory,
906 bool swizzle_enable_hint)
908 /* Split 3 channel stores, becase LLVM doesn't support 3-channel
910 if (num_channels == 3) {
911 LLVMValueRef v[3], v01;
913 for (int i = 0; i < 3; i++) {
914 v[i] = LLVMBuildExtractElement(ctx->builder, vdata,
915 LLVMConstInt(ctx->i32, i, 0), "");
917 v01 = ac_build_gather_values(ctx, v, 2);
919 ac_build_buffer_store_dword(ctx, rsrc, v01, 2, voffset,
920 soffset, inst_offset, glc, slc,
921 writeonly_memory, swizzle_enable_hint);
922 ac_build_buffer_store_dword(ctx, rsrc, v[2], 1, voffset,
923 soffset, inst_offset + 8,
925 writeonly_memory, swizzle_enable_hint);
929 /* SWIZZLE_ENABLE requires that soffset isn't folded into voffset
930 * (voffset is swizzled, but soffset isn't swizzled).
931 * llvm.amdgcn.buffer.store doesn't have a separate soffset parameter.
933 if (!swizzle_enable_hint) {
934 LLVMValueRef offset = soffset;
936 static const char *types[] = {"f32", "v2f32", "v4f32"};
939 offset = LLVMBuildAdd(ctx->builder, offset,
940 LLVMConstInt(ctx->i32, inst_offset, 0), "");
942 offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
944 LLVMValueRef args[] = {
945 ac_to_float(ctx, vdata),
946 LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
947 LLVMConstInt(ctx->i32, 0, 0),
949 LLVMConstInt(ctx->i1, glc, 0),
950 LLVMConstInt(ctx->i1, slc, 0),
954 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.store.%s",
955 types[CLAMP(num_channels, 1, 3) - 1]);
957 ac_build_intrinsic(ctx, name, ctx->voidt,
958 args, ARRAY_SIZE(args),
960 AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
961 AC_FUNC_ATTR_WRITEONLY);
965 static const unsigned dfmt[] = {
966 V_008F0C_BUF_DATA_FORMAT_32,
967 V_008F0C_BUF_DATA_FORMAT_32_32,
968 V_008F0C_BUF_DATA_FORMAT_32_32_32,
969 V_008F0C_BUF_DATA_FORMAT_32_32_32_32
971 static const char *types[] = {"i32", "v2i32", "v4i32"};
972 LLVMValueRef args[] = {
974 LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
975 LLVMConstInt(ctx->i32, 0, 0),
976 voffset ? voffset : LLVMConstInt(ctx->i32, 0, 0),
978 LLVMConstInt(ctx->i32, inst_offset, 0),
979 LLVMConstInt(ctx->i32, dfmt[num_channels - 1], 0),
980 LLVMConstInt(ctx->i32, V_008F0C_BUF_NUM_FORMAT_UINT, 0),
981 LLVMConstInt(ctx->i1, glc, 0),
982 LLVMConstInt(ctx->i1, slc, 0),
985 snprintf(name, sizeof(name), "llvm.amdgcn.tbuffer.store.%s",
986 types[CLAMP(num_channels, 1, 3) - 1]);
988 ac_build_intrinsic(ctx, name, ctx->voidt,
989 args, ARRAY_SIZE(args),
991 AC_FUNC_ATTR_INACCESSIBLE_MEM_ONLY :
992 AC_FUNC_ATTR_WRITEONLY);
996 ac_build_buffer_load_common(struct ac_llvm_context *ctx,
999 LLVMValueRef voffset,
1000 unsigned num_channels,
1006 LLVMValueRef args[] = {
1007 LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, ""),
1008 vindex ? vindex : LLVMConstInt(ctx->i32, 0, 0),
1010 LLVMConstInt(ctx->i1, glc, 0),
1011 LLVMConstInt(ctx->i1, slc, 0)
1013 unsigned func = CLAMP(num_channels, 1, 3) - 1;
1015 LLVMTypeRef types[] = {ctx->f32, ctx->v2f32, ctx->v4f32};
1016 const char *type_names[] = {"f32", "v2f32", "v4f32"};
1020 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.format.%s",
1023 snprintf(name, sizeof(name), "llvm.amdgcn.buffer.load.%s",
1027 return ac_build_intrinsic(ctx, name, types[func], args,
1029 ac_get_load_intr_attribs(can_speculate));
1033 ac_build_buffer_load(struct ac_llvm_context *ctx,
1036 LLVMValueRef vindex,
1037 LLVMValueRef voffset,
1038 LLVMValueRef soffset,
1039 unsigned inst_offset,
1045 LLVMValueRef offset = LLVMConstInt(ctx->i32, inst_offset, 0);
1047 offset = LLVMBuildAdd(ctx->builder, offset, voffset, "");
1049 offset = LLVMBuildAdd(ctx->builder, offset, soffset, "");
1051 /* TODO: VI and later generations can use SMEM with GLC=1.*/
1052 if (allow_smem && !glc && !slc) {
1053 assert(vindex == NULL);
1055 LLVMValueRef result[8];
1057 for (int i = 0; i < num_channels; i++) {
1059 offset = LLVMBuildAdd(ctx->builder, offset,
1060 LLVMConstInt(ctx->i32, 4, 0), "");
1062 LLVMValueRef args[2] = {rsrc, offset};
1063 result[i] = ac_build_intrinsic(ctx, "llvm.SI.load.const.v4i32",
1065 AC_FUNC_ATTR_READNONE |
1066 AC_FUNC_ATTR_LEGACY);
1068 if (num_channels == 1)
1071 if (num_channels == 3)
1072 result[num_channels++] = LLVMGetUndef(ctx->f32);
1073 return ac_build_gather_values(ctx, result, num_channels);
1076 return ac_build_buffer_load_common(ctx, rsrc, vindex, offset,
1077 num_channels, glc, slc,
1078 can_speculate, false);
1081 LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx,
1083 LLVMValueRef vindex,
1084 LLVMValueRef voffset,
1085 unsigned num_channels,
1089 return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset,
1090 num_channels, glc, false,
1091 can_speculate, true);
1094 LLVMValueRef ac_build_buffer_load_format_gfx9_safe(struct ac_llvm_context *ctx,
1096 LLVMValueRef vindex,
1097 LLVMValueRef voffset,
1098 unsigned num_channels,
1102 LLVMValueRef elem_count = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 2, 0), "");
1103 LLVMValueRef stride = LLVMBuildExtractElement(ctx->builder, rsrc, LLVMConstInt(ctx->i32, 1, 0), "");
1104 stride = LLVMBuildLShr(ctx->builder, stride, LLVMConstInt(ctx->i32, 16, 0), "");
1106 LLVMValueRef new_elem_count = LLVMBuildSelect(ctx->builder,
1107 LLVMBuildICmp(ctx->builder, LLVMIntUGT, elem_count, stride, ""),
1108 elem_count, stride, "");
1110 LLVMValueRef new_rsrc = LLVMBuildInsertElement(ctx->builder, rsrc, new_elem_count,
1111 LLVMConstInt(ctx->i32, 2, 0), "");
1113 return ac_build_buffer_load_common(ctx, new_rsrc, vindex, voffset,
1114 num_channels, glc, false,
1115 can_speculate, true);
1119 ac_build_tbuffer_load_short(struct ac_llvm_context *ctx,
1121 LLVMValueRef vindex,
1122 LLVMValueRef voffset,
1123 LLVMValueRef soffset,
1124 LLVMValueRef immoffset)
1126 const char *name = "llvm.amdgcn.tbuffer.load.i32";
1127 LLVMTypeRef type = ctx->i32;
1128 LLVMValueRef params[] = {
1134 LLVMConstInt(ctx->i32, V_008F0C_BUF_DATA_FORMAT_16, false),
1135 LLVMConstInt(ctx->i32, V_008F0C_BUF_NUM_FORMAT_UINT, false),
1139 LLVMValueRef res = ac_build_intrinsic(ctx, name, type, params, 9, 0);
1140 return LLVMBuildTrunc(ctx->builder, res, ctx->i16, "");
1144 * Set range metadata on an instruction. This can only be used on load and
1145 * call instructions. If you know an instruction can only produce the values
1146 * 0, 1, 2, you would do set_range_metadata(value, 0, 3);
1147 * \p lo is the minimum value inclusive.
1148 * \p hi is the maximum value exclusive.
1150 static void set_range_metadata(struct ac_llvm_context *ctx,
1151 LLVMValueRef value, unsigned lo, unsigned hi)
1153 LLVMValueRef range_md, md_args[2];
1154 LLVMTypeRef type = LLVMTypeOf(value);
1155 LLVMContextRef context = LLVMGetTypeContext(type);
1157 md_args[0] = LLVMConstInt(type, lo, false);
1158 md_args[1] = LLVMConstInt(type, hi, false);
1159 range_md = LLVMMDNodeInContext(context, md_args, 2);
1160 LLVMSetMetadata(value, ctx->range_md_kind, range_md);
1164 ac_get_thread_id(struct ac_llvm_context *ctx)
1168 LLVMValueRef tid_args[2];
1169 tid_args[0] = LLVMConstInt(ctx->i32, 0xffffffff, false);
1170 tid_args[1] = LLVMConstInt(ctx->i32, 0, false);
1171 tid_args[1] = ac_build_intrinsic(ctx,
1172 "llvm.amdgcn.mbcnt.lo", ctx->i32,
1173 tid_args, 2, AC_FUNC_ATTR_READNONE);
1175 tid = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi",
1177 2, AC_FUNC_ATTR_READNONE);
1178 set_range_metadata(ctx, tid, 0, 64);
1183 * SI implements derivatives using the local data store (LDS)
1184 * All writes to the LDS happen in all executing threads at
1185 * the same time. TID is the Thread ID for the current
1186 * thread and is a value between 0 and 63, representing
1187 * the thread's position in the wavefront.
1189 * For the pixel shader threads are grouped into quads of four pixels.
1190 * The TIDs of the pixels of a quad are:
1198 * So, masking the TID with 0xfffffffc yields the TID of the top left pixel
1199 * of the quad, masking with 0xfffffffd yields the TID of the top pixel of
1200 * the current pixel's column, and masking with 0xfffffffe yields the TID
1201 * of the left pixel of the current pixel's row.
1203 * Adding 1 yields the TID of the pixel to the right of the left pixel, and
1204 * adding 2 yields the TID of the pixel below the top pixel.
ac_build_ddxy(struct ac_llvm_context *ctx,
	/* Computes a screen-space derivative: the difference between the
	 * value held by a neighboring lane of the pixel quad (selected by
	 * mask/idx, see the comment block above) and the value in the
	 * quad's "top-left" lane.
	 * NOTE(review): the remaining parameters and several statements
	 * (switch bodies, intrinsic argument lists, closing braces) are not
	 * visible in this chunk — confirm against the full file. */
	LLVMValueRef tl, trbl, args[2];
	LLVMValueRef result;

	if (HAVE_LLVM >= 0x0700) {
		/* LLVM 7+: express the lane selection as quad swizzles. */
		unsigned tl_lanes[4], trbl_lanes[4];

		for (unsigned i = 0; i < 4; ++i) {
			tl_lanes[i] = i & mask;
			trbl_lanes[i] = (i & mask) + idx;

		tl = ac_build_quad_swizzle(ctx, val,
					   tl_lanes[0], tl_lanes[1],
					   tl_lanes[2], tl_lanes[3]);
		trbl = ac_build_quad_swizzle(ctx, val,
					     trbl_lanes[0], trbl_lanes[1],
					     trbl_lanes[2], trbl_lanes[3]);
	} else if (ctx->chip_class >= VI) {
		/* VI+: ds_bpermute reads the value of lane (addr / 4). */
		LLVMValueRef thread_id, tl_tid, trbl_tid;
		thread_id = ac_get_thread_id(ctx);

		/* Top-left lane of the quad (or row/column anchor). */
		tl_tid = LLVMBuildAnd(ctx->builder, thread_id,
				      LLVMConstInt(ctx->i32, mask, false), "");

		/* Neighboring lane used for the difference. */
		trbl_tid = LLVMBuildAdd(ctx->builder, tl_tid,
					LLVMConstInt(ctx->i32, idx, false), "");

		/* bpermute addresses are byte-based: lane index * 4. */
		args[0] = LLVMBuildMul(ctx->builder, tl_tid,
				       LLVMConstInt(ctx->i32, 4, false), "");

		tl = ac_build_intrinsic(ctx,
					"llvm.amdgcn.ds.bpermute", ctx->i32,
					AC_FUNC_ATTR_READNONE |
					AC_FUNC_ATTR_CONVERGENT);

		args[0] = LLVMBuildMul(ctx->builder, trbl_tid,
				       LLVMConstInt(ctx->i32, 4, false), "");
		trbl = ac_build_intrinsic(ctx,
					  "llvm.amdgcn.ds.bpermute", ctx->i32,
					  AC_FUNC_ATTR_READNONE |
					  AC_FUNC_ATTR_CONVERGENT);
		/* Pre-VI: fall back to ds_swizzle with per-mode lane masks. */
		uint32_t masks[2] = {};

		case AC_TID_MASK_TOP_LEFT:
		case AC_TID_MASK_TOP:
		case AC_TID_MASK_LEFT:

		args[1] = LLVMConstInt(ctx->i32, masks[0], false);

		tl = ac_build_intrinsic(ctx,
					"llvm.amdgcn.ds.swizzle", ctx->i32,
					AC_FUNC_ATTR_READNONE |
					AC_FUNC_ATTR_CONVERGENT);

		args[1] = LLVMConstInt(ctx->i32, masks[1], false);
		trbl = ac_build_intrinsic(ctx,
					  "llvm.amdgcn.ds.swizzle", ctx->i32,
					  AC_FUNC_ATTR_READNONE |
					  AC_FUNC_ATTR_CONVERGENT);

	/* Derivative = neighbor - top-left, as float. */
	tl = LLVMBuildBitCast(ctx->builder, tl, ctx->f32, "");
	trbl = LLVMBuildBitCast(ctx->builder, trbl, ctx->f32, "");
	result = LLVMBuildFSub(ctx->builder, trbl, tl, "");

	if (HAVE_LLVM >= 0x0700) {
		/* The quad-swizzle path needs whole-quad mode so helper
		 * lanes contribute valid data. */
		result = ac_build_intrinsic(ctx,
					    "llvm.amdgcn.wqm.f32", ctx->f32,
1310 ac_build_sendmsg(struct ac_llvm_context *ctx,
1312 LLVMValueRef wave_id)
1314 LLVMValueRef args[2];
1315 args[0] = LLVMConstInt(ctx->i32, msg, false);
1317 ac_build_intrinsic(ctx, "llvm.amdgcn.s.sendmsg", ctx->voidt, args, 2, 0);
ac_build_imsb(struct ac_llvm_context *ctx,
	      /* Most-significant "interesting" bit of a signed i32 (TGSI
	       * IMSB semantics). Returns -1 for both 0 and -1, which have
	       * no bit differing from the sign bit.
	       * NOTE(review): the 'arg' parameter line and parts of the
	       * intrinsic argument list are missing from this chunk. */
	      LLVMTypeRef dst_type)
	/* s_flbit_i32: first bit differing from the sign bit, from MSB. */
	LLVMValueRef msb = ac_build_intrinsic(ctx, "llvm.amdgcn.sffbh.i32",
					      AC_FUNC_ATTR_READNONE);

	/* The HW returns the last bit index from MSB, but NIR/TGSI wants
	 * the index from LSB. Invert it by doing "31 - msb". */
	msb = LLVMBuildSub(ctx->builder, LLVMConstInt(ctx->i32, 31, false),

	/* Inputs 0 and -1 have no such bit; map both to -1. */
	LLVMValueRef all_ones = LLVMConstInt(ctx->i32, -1, true);
	LLVMValueRef cond = LLVMBuildOr(ctx->builder,
					LLVMBuildICmp(ctx->builder, LLVMIntEQ,
						      arg, LLVMConstInt(ctx->i32, 0, 0), ""),
					LLVMBuildICmp(ctx->builder, LLVMIntEQ,
						      arg, all_ones, ""), "");

	return LLVMBuildSelect(ctx->builder, cond, all_ones, msb, "");
ac_build_umsb(struct ac_llvm_context *ctx,
	      /* Most-significant set bit of an unsigned value (TGSI UMSB).
	       * Works for i32 and i64 inputs; the result is always i32 and
	       * is -1 when the input is zero.
	       * NOTE(review): the 'arg' parameter, 'type'/'zero' setup and
	       * some argument lines are missing from this chunk. */
	      LLVMTypeRef dst_type)
	const char *intrin_name;
	LLVMValueRef highest_bit;

	/* Pick the ctlz variant and the "bit width - 1" constant that
	 * matches the input's width. */
	if (ac_get_elem_bits(ctx, LLVMTypeOf(arg)) == 64) {
		intrin_name = "llvm.ctlz.i64";
		highest_bit = LLVMConstInt(ctx->i64, 63, false);
		intrin_name = "llvm.ctlz.i32";
		highest_bit = LLVMConstInt(ctx->i32, 31, false);

	LLVMValueRef params[2] = {

	LLVMValueRef msb = ac_build_intrinsic(ctx, intrin_name, type,
					      AC_FUNC_ATTR_READNONE);

	/* The HW returns the last bit index from MSB, but TGSI/NIR wants
	 * the index from LSB. Invert it by doing "31 - msb". */
	msb = LLVMBuildSub(ctx->builder, highest_bit, msb, "");
	/* Narrow a 64-bit count down to the i32 result type. */
	msb = LLVMBuildTruncOrBitCast(ctx->builder, msb, ctx->i32, "");

	/* check for zero */
	return LLVMBuildSelect(ctx->builder,
			       LLVMBuildICmp(ctx->builder, LLVMIntEQ, arg, zero, ""),
			       LLVMConstInt(ctx->i32, -1, true), msb, "");
1386 LLVMValueRef ac_build_fmin(struct ac_llvm_context *ctx, LLVMValueRef a,
1389 LLVMValueRef args[2] = {a, b};
1390 return ac_build_intrinsic(ctx, "llvm.minnum.f32", ctx->f32, args, 2,
1391 AC_FUNC_ATTR_READNONE);
1394 LLVMValueRef ac_build_fmax(struct ac_llvm_context *ctx, LLVMValueRef a,
1397 LLVMValueRef args[2] = {a, b};
1398 return ac_build_intrinsic(ctx, "llvm.maxnum.f32", ctx->f32, args, 2,
1399 AC_FUNC_ATTR_READNONE);
1402 LLVMValueRef ac_build_imin(struct ac_llvm_context *ctx, LLVMValueRef a,
1405 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSLE, a, b, "");
1406 return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
1409 LLVMValueRef ac_build_imax(struct ac_llvm_context *ctx, LLVMValueRef a,
1412 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, a, b, "");
1413 return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
1416 LLVMValueRef ac_build_umin(struct ac_llvm_context *ctx, LLVMValueRef a,
1419 LLVMValueRef cmp = LLVMBuildICmp(ctx->builder, LLVMIntULE, a, b, "");
1420 return LLVMBuildSelect(ctx->builder, cmp, a, b, "");
1423 LLVMValueRef ac_build_clamp(struct ac_llvm_context *ctx, LLVMValueRef value)
1425 return ac_build_fmin(ctx, ac_build_fmax(ctx, value, ctx->f32_0),
void ac_build_export(struct ac_llvm_context *ctx, struct ac_export_args *a)
	/* Emit an export instruction from the fields of 'a'. Uses the
	 * compressed (2x f16-pair) form when a->compr is set, otherwise the
	 * plain 4x f32 form.
	 * NOTE(review): the if (a->compr) branch lines and closing braces
	 * are missing from this chunk. */
	LLVMValueRef args[9];

	args[0] = LLVMConstInt(ctx->i32, a->target, 0);
	args[1] = LLVMConstInt(ctx->i32, a->enabled_channels, 0);

		/* Compressed export: two v2i16 operands. */
		LLVMTypeRef i16 = LLVMInt16TypeInContext(ctx->context);
		LLVMTypeRef v2i16 = LLVMVectorType(i16, 2);

		args[2] = LLVMBuildBitCast(ctx->builder, a->out[0],
		args[3] = LLVMBuildBitCast(ctx->builder, a->out[1],
		args[4] = LLVMConstInt(ctx->i1, a->done, 0);
		args[5] = LLVMConstInt(ctx->i1, a->valid_mask, 0);

		ac_build_intrinsic(ctx, "llvm.amdgcn.exp.compr.v2i16",
				   ctx->voidt, args, 6, 0);
		/* Uncompressed export: four f32 operands. */
		args[2] = a->out[0];
		args[3] = a->out[1];
		args[4] = a->out[2];
		args[5] = a->out[3];
		args[6] = LLVMConstInt(ctx->i1, a->done, 0);
		args[7] = LLVMConstInt(ctx->i1, a->valid_mask, 0);

		ac_build_intrinsic(ctx, "llvm.amdgcn.exp.f32",
				   ctx->voidt, args, 8, 0);
1462 void ac_build_export_null(struct ac_llvm_context *ctx)
1464 struct ac_export_args args;
1466 args.enabled_channels = 0x0; /* enabled channels */
1467 args.valid_mask = 1; /* whether the EXEC mask is valid */
1468 args.done = 1; /* DONE bit */
1469 args.target = V_008DFC_SQ_EXP_NULL;
1470 args.compr = 0; /* COMPR flag (0 = 32-bit export) */
1471 args.out[0] = LLVMGetUndef(ctx->f32); /* R */
1472 args.out[1] = LLVMGetUndef(ctx->f32); /* G */
1473 args.out[2] = LLVMGetUndef(ctx->f32); /* B */
1474 args.out[3] = LLVMGetUndef(ctx->f32); /* A */
1476 ac_build_export(ctx, &args);
static unsigned ac_num_coords(enum ac_image_dim dim)
	/* Number of coordinate components an image access needs for the
	 * given dimensionality (array index and sample index count as
	 * coordinates).
	 * NOTE(review): most case labels and the return statements are
	 * missing from this chunk; only a subset of the switch is visible. */
	case ac_image_1darray:
	case ac_image_2darray:
	case ac_image_2dmsaa:
	case ac_image_2darraymsaa:
	unreachable("ac_num_coords: bad dim");
static unsigned ac_num_derivs(enum ac_image_dim dim)
	/* Number of derivative values a sample_d needs for the given
	 * dimensionality (2 per gradient axis; array layers get none).
	 * MSAA images cannot be sampled with derivatives.
	 * NOTE(review): most case labels and the return statements are
	 * missing from this chunk. */
	case ac_image_1darray:
	case ac_image_2darray:
	case ac_image_2dmsaa:
	case ac_image_2darraymsaa:
	unreachable("derivatives not supported");
1518 static const char *get_atomic_name(enum ac_atomic_op op)
1521 case ac_atomic_swap: return "swap";
1522 case ac_atomic_add: return "add";
1523 case ac_atomic_sub: return "sub";
1524 case ac_atomic_smin: return "smin";
1525 case ac_atomic_umin: return "umin";
1526 case ac_atomic_smax: return "smax";
1527 case ac_atomic_umax: return "umax";
1528 case ac_atomic_and: return "and";
1529 case ac_atomic_or: return "or";
1530 case ac_atomic_xor: return "xor";
1532 unreachable("bad atomic op");
/* LLVM 6 and older */
static LLVMValueRef ac_build_image_opcode_llvm6(struct ac_llvm_context *ctx,
						struct ac_image_args *a)
	/* Build an image operation using the legacy (pre-LLVM 7) image
	 * intrinsics, which take all addressing data packed into one vector.
	 * NOTE(review): many statements (address packing for some opcodes,
	 * switch bodies, the final return) are missing from this chunk. */
	LLVMValueRef args[16];
	LLVMTypeRef retty = ctx->v4f32;
	const char *name = NULL;
	const char *atomic_subop = "";
	char intr_name[128], coords_type[64];

	bool sample = a->opcode == ac_image_sample ||
		      a->opcode == ac_image_gather4 ||
		      a->opcode == ac_image_get_lod;
	bool atomic = a->opcode == ac_image_atomic ||
		      a->opcode == ac_image_atomic_cmpswap;
	/* "da" = declare array: set for dims with a layer index (and cube). */
	bool da = a->dim == ac_image_cube ||
		  a->dim == ac_image_1darray ||
		  a->dim == ac_image_2darray ||
		  a->dim == ac_image_2darraymsaa;
	if (a->opcode == ac_image_get_lod)

	unsigned num_coords =
		a->opcode != ac_image_get_resinfo ? ac_num_coords(a->dim) : 0;
	unsigned num_addr = 0;

	if (a->opcode == ac_image_get_lod) {
		case ac_image_1darray:
		case ac_image_2darray:

	/* Pack the address operands in the HW-required order:
	 * offset, bias, compare, derivatives, coordinates, lod. */
	args[num_addr++] = ac_to_integer(ctx, a->offset);
	args[num_addr++] = ac_to_integer(ctx, a->bias);
	args[num_addr++] = ac_to_integer(ctx, a->compare);

	unsigned num_derivs = ac_num_derivs(a->dim);
	for (unsigned i = 0; i < num_derivs; ++i)
		args[num_addr++] = ac_to_integer(ctx, a->derivs[i]);

	for (unsigned i = 0; i < num_coords; ++i)
		args[num_addr++] = ac_to_integer(ctx, a->coords[i]);

	args[num_addr++] = ac_to_integer(ctx, a->lod);

	/* The address vector must have a power-of-two length; pad with
	 * undef. */
	unsigned pad_goal = util_next_power_of_two(num_addr);
	while (num_addr < pad_goal)
		args[num_addr++] = LLVMGetUndef(ctx->i32);

	addr = ac_build_gather_values(ctx, args, num_addr);

	/* Now build the actual intrinsic argument list. */
	unsigned num_args = 0;
	if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
		args[num_args++] = a->data[0];
		if (a->opcode == ac_image_atomic_cmpswap)
			args[num_args++] = a->data[1];

	unsigned coords_arg = num_args;
		args[num_args++] = ac_to_float(ctx, addr);
		args[num_args++] = ac_to_integer(ctx, addr);

	args[num_args++] = a->resource;
		args[num_args++] = a->sampler;
	args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, 0);
		args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, 0);
	args[num_args++] = a->cache_policy & ac_glc ? ctx->i1true : ctx->i1false;
	args[num_args++] = a->cache_policy & ac_slc ? ctx->i1true : ctx->i1false;
	args[num_args++] = ctx->i1false; /* lwe */
	args[num_args++] = LLVMConstInt(ctx->i1, da, 0);
		args[num_args++] = ctx->i1false; /* r128 */
		args[num_args++] = LLVMConstInt(ctx->i1, da, 0);
		args[num_args++] = a->cache_policy & ac_slc ? ctx->i1true : ctx->i1false;

	/* Choose the base intrinsic name for the opcode. */
	switch (a->opcode) {
	case ac_image_sample:
		name = "llvm.amdgcn.image.sample";
	case ac_image_gather4:
		name = "llvm.amdgcn.image.gather4";
		name = "llvm.amdgcn.image.load";
	case ac_image_load_mip:
		name = "llvm.amdgcn.image.load.mip";
	case ac_image_store:
		name = "llvm.amdgcn.image.store";
	case ac_image_store_mip:
		name = "llvm.amdgcn.image.store.mip";
	case ac_image_atomic:
	case ac_image_atomic_cmpswap:
		name = "llvm.amdgcn.image.atomic.";
		if (a->opcode == ac_image_atomic_cmpswap) {
			atomic_subop = "cmpswap";
			atomic_subop = get_atomic_name(a->atomic);
	case ac_image_get_lod:
		name = "llvm.amdgcn.image.getlod";
	case ac_image_get_resinfo:
		name = "llvm.amdgcn.image.getresinfo";
		unreachable("invalid image opcode");

	/* Mangle the address vector type into the intrinsic name. */
	ac_build_type_name_for_intr(LLVMTypeOf(args[coords_arg]), coords_type,
				    sizeof(coords_type));

		snprintf(intr_name, sizeof(intr_name), "llvm.amdgcn.image.atomic.%s.%s",
			 atomic_subop, coords_type);

		a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
	/* Append the sample-variant modifiers (.c/.b/.d/.l/.lz/.o). */
	snprintf(intr_name, sizeof(intr_name), "%s%s%s%s.v4f32.%s.v8i32",
		 a->compare ? ".c" : "",
		 a->derivs[0] ? ".d" :
		 a->level_zero ? ".lz" : "",
		 a->offset ? ".o" : "",

	LLVMValueRef result =
		ac_build_intrinsic(ctx, intr_name, retty, args, num_args,
	if (!sample && retty == ctx->v4f32) {
		/* Non-sample opcodes return integer data; give callers f32. */
		result = LLVMBuildBitCast(ctx->builder, result,
LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx,
				   struct ac_image_args *a)
	/* Build an image operation using the dimension-aware intrinsics
	 * introduced in LLVM 7; dispatches to the legacy builder on older
	 * LLVM.
	 * NOTE(review): a number of lines (assert bodies, branch headers,
	 * the retty selection and the final return) are missing from this
	 * chunk. */
	const char *overload[3] = { "", "", "" };
	unsigned num_overloads = 0;
	LLVMValueRef args[18];
	unsigned num_args = 0;
	enum ac_image_dim dim = a->dim;

	/* Sanity-check mutually exclusive / opcode-specific fields. */
	assert(!a->lod || a->lod == ctx->i32_0 || a->lod == ctx->f32_0 ||
	assert((a->opcode != ac_image_get_resinfo && a->opcode != ac_image_load_mip &&
		a->opcode != ac_image_store_mip) ||
	assert(a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
	       (!a->compare && !a->offset));
	assert((a->opcode == ac_image_sample || a->opcode == ac_image_gather4 ||
		a->opcode == ac_image_get_lod) ||
	/* At most one LOD-ish modifier may be present. */
	assert((a->bias ? 1 : 0) +
	       (a->level_zero ? 1 : 0) +
	       (a->derivs[0] ? 1 : 0) <= 1);

	if (HAVE_LLVM < 0x0700)
		return ac_build_image_opcode_llvm6(ctx, a);

	if (a->opcode == ac_image_get_lod) {
		case ac_image_1darray:
		case ac_image_2darray:

	bool sample = a->opcode == ac_image_sample ||
		      a->opcode == ac_image_gather4 ||
		      a->opcode == ac_image_get_lod;
	bool atomic = a->opcode == ac_image_atomic ||
		      a->opcode == ac_image_atomic_cmpswap;
	/* Sampling ops take float coordinates, the rest integer. */
	LLVMTypeRef coord_type = sample ? ctx->f32 : ctx->i32;

	if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
		args[num_args++] = a->data[0];
		if (a->opcode == ac_image_atomic_cmpswap)
			args[num_args++] = a->data[1];

		args[num_args++] = LLVMConstInt(ctx->i32, a->dmask, false);
		args[num_args++] = ac_to_integer(ctx, a->offset);
		args[num_args++] = ac_to_float(ctx, a->bias);
		overload[num_overloads++] = ".f32";
		args[num_args++] = ac_to_float(ctx, a->compare);
		unsigned count = ac_num_derivs(dim);
		for (unsigned i = 0; i < count; ++i)
			args[num_args++] = ac_to_float(ctx, a->derivs[i]);
		overload[num_overloads++] = ".f32";
	unsigned num_coords =
		a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0;
	for (unsigned i = 0; i < num_coords; ++i)
		args[num_args++] = LLVMBuildBitCast(ctx->builder, a->coords[i], coord_type, "");
		args[num_args++] = LLVMBuildBitCast(ctx->builder, a->lod, coord_type, "");
	overload[num_overloads++] = sample ? ".f32" : ".i32";

	args[num_args++] = a->resource;
		args[num_args++] = a->sampler;
		args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
	args[num_args++] = ctx->i32_0; /* texfailctrl */
	args[num_args++] = LLVMConstInt(ctx->i32, a->cache_policy, false);

	const char *atomic_subop = "";
	switch (a->opcode) {
	case ac_image_sample: name = "sample"; break;
	case ac_image_gather4: name = "gather4"; break;
	case ac_image_load: name = "load"; break;
	case ac_image_load_mip: name = "load.mip"; break;
	case ac_image_store: name = "store"; break;
	case ac_image_store_mip: name = "store.mip"; break;
	case ac_image_atomic:
		atomic_subop = get_atomic_name(a->atomic);
	case ac_image_atomic_cmpswap:
		atomic_subop = "cmpswap";
	case ac_image_get_lod: name = "getlod"; break;
	case ac_image_get_resinfo: name = "getresinfo"; break;
	default: unreachable("invalid image opcode");

	const char *dimname;
	case ac_image_1d: dimname = "1d"; break;
	case ac_image_2d: dimname = "2d"; break;
	case ac_image_3d: dimname = "3d"; break;
	case ac_image_cube: dimname = "cube"; break;
	case ac_image_1darray: dimname = "1darray"; break;
	case ac_image_2darray: dimname = "2darray"; break;
	case ac_image_2dmsaa: dimname = "2dmsaa"; break;
	case ac_image_2darraymsaa: dimname = "2darraymsaa"; break;
	default: unreachable("invalid dim");

		a->lod && (a->opcode == ac_image_sample || a->opcode == ac_image_gather4);
	snprintf(intr_name, sizeof(intr_name),
		 "llvm.amdgcn.image.%s%s" /* base name */
		 "%s%s%s" /* sample/gather modifiers */
		 ".%s.%s%s%s%s", /* dimension and type overloads */
		 a->compare ? ".c" : "",
		 a->derivs[0] ? ".d" :
		 a->level_zero ? ".lz" : "",
		 a->offset ? ".o" : "",
		 atomic ? "i32" : "v4f32",
		 overload[0], overload[1], overload[2]);

	else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)

	LLVMValueRef result =
		ac_build_intrinsic(ctx, intr_name, retty, args, num_args,
	if (!sample && retty == ctx->v4f32) {
		/* Non-sample opcodes return integer data; give callers f32. */
		result = LLVMBuildBitCast(ctx->builder, result,
1859 LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx,
1860 LLVMValueRef args[2])
1863 LLVMVectorType(LLVMHalfTypeInContext(ctx->context), 2);
1865 return ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pkrtz", v2f16,
1866 args, 2, AC_FUNC_ATTR_READNONE);
1869 LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx,
1870 LLVMValueRef args[2])
1873 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.i16",
1874 ctx->v2i16, args, 2,
1875 AC_FUNC_ATTR_READNONE);
1876 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
1879 LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx,
1880 LLVMValueRef args[2])
1883 ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pknorm.u16",
1884 ctx->v2i16, args, 2,
1885 AC_FUNC_ATTR_READNONE);
1886 return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
/* The 8-bit and 10-bit clamping is for HW workarounds. */
LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx,
				 LLVMValueRef args[2], unsigned bits, bool hi)
	/* Clamp two signed i32 values to the signed range of 'bits' bits,
	 * pack them to i16x2 and reinterpret as i32. For 10-bit formats the
	 * alpha channel (hi word) only gets 2 bits [-2, 1].
	 * NOTE(review): the if (bits == 16) early-out and some brace lines
	 * are missing from this chunk. */
	assert(bits == 8 || bits == 10 || bits == 16);

	/* Signed clamp bounds for the requested component width. */
	LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
					    bits == 8 ? 127 : bits == 10 ? 511 : 32767, 0);
	LLVMValueRef min_rgb = LLVMConstInt(ctx->i32,
					    bits == 8 ? -128 : bits == 10 ? -512 : -32768, 0);
	/* 10-bit formats only give alpha 2 bits. */
	LLVMValueRef max_alpha =
		bits != 10 ? max_rgb : ctx->i32_1;
	LLVMValueRef min_alpha =
		bits != 10 ? min_rgb : LLVMConstInt(ctx->i32, -2, 0);

	for (int i = 0; i < 2; i++) {
		bool alpha = hi && i == 1;
		args[i] = ac_build_imin(ctx, args[i],
					alpha ? max_alpha : max_rgb);
		args[i] = ac_build_imax(ctx, args[i],
					alpha ? min_alpha : min_rgb);

	ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.i16",
			   ctx->v2i16, args, 2,
			   AC_FUNC_ATTR_READNONE);
	return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
/* The 8-bit and 10-bit clamping is for HW workarounds. */
LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx,
				 LLVMValueRef args[2], unsigned bits, bool hi)
	/* Clamp two unsigned i32 values to the unsigned range of 'bits'
	 * bits, pack them to u16x2 and reinterpret as i32. For 10-bit
	 * formats the alpha channel (hi word) only gets 2 bits [0, 3].
	 * NOTE(review): the if (bits == 16) early-out and some brace lines
	 * are missing from this chunk. */
	assert(bits == 8 || bits == 10 || bits == 16);

	/* Unsigned clamp bound for the requested component width. */
	LLVMValueRef max_rgb = LLVMConstInt(ctx->i32,
					    bits == 8 ? 255 : bits == 10 ? 1023 : 65535, 0);
	/* 10-bit formats only give alpha 2 bits. */
	LLVMValueRef max_alpha =
		bits != 10 ? max_rgb : LLVMConstInt(ctx->i32, 3, 0);

	for (int i = 0; i < 2; i++) {
		bool alpha = hi && i == 1;
		args[i] = ac_build_umin(ctx, args[i],
					alpha ? max_alpha : max_rgb);

	ac_build_intrinsic(ctx, "llvm.amdgcn.cvt.pk.u16",
			   ctx->v2i16, args, 2,
			   AC_FUNC_ATTR_READNONE);
	return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
1949 LLVMValueRef ac_build_wqm_vote(struct ac_llvm_context *ctx, LLVMValueRef i1)
1951 return ac_build_intrinsic(ctx, "llvm.amdgcn.wqm.vote", ctx->i1,
1952 &i1, 1, AC_FUNC_ATTR_READNONE);
void ac_build_kill_if_false(struct ac_llvm_context *ctx, LLVMValueRef i1)
	/* Kill (discard) the current lane when the condition is false.
	 * NOTE(review): the intrinsic's trailing argument line is missing
	 * from this chunk. */
	ac_build_intrinsic(ctx, "llvm.amdgcn.kill", ctx->voidt,
LLVMValueRef ac_build_bfe(struct ac_llvm_context *ctx, LLVMValueRef input,
			  LLVMValueRef offset, LLVMValueRef width,
	/* Bitfield extract: pull 'width' bits starting at 'offset' out of
	 * 'input', sign- or zero-extending per is_signed.
	 * NOTE(review): the args initializer contents and the trailing
	 * intrinsic arguments are missing from this chunk. */
	LLVMValueRef args[] = {

	return ac_build_intrinsic(ctx,
				  is_signed ? "llvm.amdgcn.sbfe.i32" :
					      "llvm.amdgcn.ubfe.i32",
				  AC_FUNC_ATTR_READNONE);
1978 LLVMValueRef ac_build_imad(struct ac_llvm_context *ctx, LLVMValueRef s0,
1979 LLVMValueRef s1, LLVMValueRef s2)
1981 return LLVMBuildAdd(ctx->builder,
1982 LLVMBuildMul(ctx->builder, s0, s1, ""), s2, "");
1985 LLVMValueRef ac_build_fmad(struct ac_llvm_context *ctx, LLVMValueRef s0,
1986 LLVMValueRef s1, LLVMValueRef s2)
1988 return LLVMBuildFAdd(ctx->builder,
1989 LLVMBuildFMul(ctx->builder, s0, s1, ""), s2, "");
1992 void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned simm16)
1994 LLVMValueRef args[1] = {
1995 LLVMConstInt(ctx->i32, simm16, false),
1997 ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt",
1998 ctx->voidt, args, 1, 0);
LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0,
	/* Fractional part: src0 - floor(src0), for 32- or 64-bit floats.
	 * NOTE(review): the bitsize parameter line, the 'type' assignments
	 * and the params initializer contents are missing from this chunk. */
	if (bitsize == 32) {
		intr = "llvm.floor.f32";
		intr = "llvm.floor.f64";

	LLVMValueRef params[] = {

	LLVMValueRef floor = ac_build_intrinsic(ctx, intr, type, params, 1,
						AC_FUNC_ATTR_READNONE);
	return LLVMBuildFSub(ctx->builder, src0, floor, "");
LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0,
	/* Integer sign: returns 1 for positive, 0 for zero, -1 for negative.
	 * NOTE(review): the bitsize parameter line and the type/zero/one
	 * setup lines are missing from this chunk. */
	LLVMValueRef cmp, val, zero, one;

	if (bitsize == 32) {

	/* val = src0 > 0 ? 1 : src0; then val = val >= 0 ? val : -1. */
	cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, zero, "");
	val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
	cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, zero, "");
	val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstInt(type, -1, true), "");
LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src0,
	/* Float sign: returns 1.0 for positive, 0.0 for zero, -1.0 for
	 * negative (ordered compares, so NaN passes through src0 then -1).
	 * NOTE(review): the bitsize parameter line and the type/zero/one
	 * setup lines are missing from this chunk. */
	LLVMValueRef cmp, val, zero, one;

	if (bitsize == 32) {

	/* val = src0 > 0 ? 1.0 : src0; then val = val >= 0 ? val : -1.0. */
	cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src0, zero, "");
	val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
	cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGE, val, zero, "");
	val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstReal(type, -1.0), "");
2069 #define AC_EXP_TARGET 0
2070 #define AC_EXP_ENABLED_CHANNELS 1
2071 #define AC_EXP_OUT0 2
/* One channel of a recorded VS PARAM export (see ac_optimize_vs_outputs).
 * NOTE(review): some struct fields and closing braces are missing from
 * this chunk. */
struct ac_vs_exp_chan
	enum ac_ir_type type;

/* One recorded export instruction together with its four channels. */
struct ac_vs_exp_inst {
	struct ac_vs_exp_chan chan[4];

/* All PARAM exports seen so far in the shader. */
struct ac_vs_exports {
	struct ac_vs_exp_inst exp[VARYING_SLOT_MAX];
/* Return true if the PARAM export has been eliminated. */
static bool ac_eliminate_const_output(uint8_t *vs_output_param_offset,
				      uint32_t num_outputs,
				      struct ac_vs_exp_inst *exp)
	/* A PARAM export whose four channels are all 0/1/undef can be
	 * replaced by the PS input's DEFAULT_VAL and deleted.
	 * NOTE(review): some brace/assignment lines are missing from this
	 * chunk. */
	unsigned i, default_val; /* SPI_PS_INPUT_CNTL_i.DEFAULT_VAL */
	bool is_zero[4] = {}, is_one[4] = {};

	for (i = 0; i < 4; i++) {
		/* It's a constant expression. Undef outputs are eliminated too. */
		if (exp->chan[i].type == AC_IR_UNDEF) {
		} else if (exp->chan[i].type == AC_IR_CONST) {
			if (exp->chan[i].const_float == 0)
			else if (exp->chan[i].const_float == 1)
				return false; /* other constant */

	/* Only certain combinations of 0 and 1 can be eliminated. */
	if (is_zero[0] && is_zero[1] && is_zero[2])
		default_val = is_zero[3] ? 0 : 1;
	else if (is_one[0] && is_one[1] && is_one[2])
		default_val = is_zero[3] ? 2 : 3;

	/* The PARAM export can be represented as DEFAULT_VAL. Kill it. */
	LLVMInstructionEraseFromParent(exp->inst);

	/* Change OFFSET to DEFAULT_VAL. */
	for (i = 0; i < num_outputs; i++) {
		if (vs_output_param_offset[i] == exp->offset) {
			vs_output_param_offset[i] =
				AC_EXP_PARAM_DEFAULT_VAL_0000 + default_val;
static bool ac_eliminate_duplicated_output(struct ac_llvm_context *ctx,
					   uint8_t *vs_output_param_offset,
					   uint32_t num_outputs,
					   struct ac_vs_exports *processed,
					   struct ac_vs_exp_inst *exp)
	/* If 'exp' duplicates an already-processed PARAM export (channel
	 * by channel, with undef acting as a wildcard), delete it and
	 * redirect its outputs to the earlier export. Returns true when
	 * the export was eliminated.
	 * NOTE(review): several brace/continue/return lines are missing
	 * from this chunk. */
	unsigned p, copy_back_channels = 0;

	/* See if the output is already in the list of processed outputs.
	 * The LLVMValueRef comparison relies on SSA.
	for (p = 0; p < processed->num; p++) {
		bool different = false;

		for (unsigned j = 0; j < 4; j++) {
			struct ac_vs_exp_chan *c1 = &processed->exp[p].chan[j];
			struct ac_vs_exp_chan *c2 = &exp->chan[j];

			/* Treat undef as a match. */
			if (c2->type == AC_IR_UNDEF)

			/* If c1 is undef but c2 isn't, we can copy c2 to c1
			 * and consider the instruction duplicated.
			if (c1->type == AC_IR_UNDEF) {
				copy_back_channels |= 1 << j;

			/* Test whether the channels are not equal. */
			if (c1->type != c2->type ||
			    (c1->type == AC_IR_CONST &&
			     c1->const_float != c2->const_float) ||
			    (c1->type == AC_IR_VALUE &&
			     c1->value != c2->value)) {

		copy_back_channels = 0;
	if (p == processed->num)

	/* If a match was found, but the matching export has undef where the new
	 * one has a normal value, copy the normal value to the undef channel.
	struct ac_vs_exp_inst *match = &processed->exp[p];

	/* Get current enabled channels mask. */
	LLVMValueRef arg = LLVMGetOperand(match->inst, AC_EXP_ENABLED_CHANNELS);
	unsigned enabled_channels = LLVMConstIntGetZExtValue(arg);

	while (copy_back_channels) {
		unsigned chan = u_bit_scan(&copy_back_channels);

		assert(match->chan[chan].type == AC_IR_UNDEF);
		LLVMSetOperand(match->inst, AC_EXP_OUT0 + chan,
			       exp->chan[chan].value);
		match->chan[chan] = exp->chan[chan];

		/* Update number of enabled channels because the original mask
		 * is not always 0xf.
		enabled_channels |= (1 << chan);
		LLVMSetOperand(match->inst, AC_EXP_ENABLED_CHANNELS,
			       LLVMConstInt(ctx->i32, enabled_channels, 0));

	/* The PARAM export is duplicated. Kill it. */
	LLVMInstructionEraseFromParent(exp->inst);

	/* Change OFFSET to the matching export. */
	for (unsigned i = 0; i < num_outputs; i++) {
		if (vs_output_param_offset[i] == exp->offset) {
			vs_output_param_offset[i] = match->offset;
void ac_optimize_vs_outputs(struct ac_llvm_context *ctx,
			    LLVMValueRef main_fn,
			    uint8_t *vs_output_param_offset,
			    uint32_t num_outputs,
			    uint8_t *num_param_exports)
	/* Scan the VS main function for PARAM export intrinsics, eliminate
	 * constant and duplicated exports, then renumber the survivors so
	 * export memory has no holes. Updates vs_output_param_offset[] and
	 * *num_param_exports to match.
	 * NOTE(review): loop headers, some brace/continue lines and the
	 * removed_any bookkeeping are missing from this chunk. */
	LLVMBasicBlockRef bb;
	bool removed_any = false;
	struct ac_vs_exports exports;

	/* Process all LLVM instructions. */
	bb = LLVMGetFirstBasicBlock(main_fn);
		LLVMValueRef inst = LLVMGetFirstInstruction(bb);

			LLVMValueRef cur = inst;
			inst = LLVMGetNextInstruction(inst);
			struct ac_vs_exp_inst exp;

			if (LLVMGetInstructionOpcode(cur) != LLVMCall)

			LLVMValueRef callee = ac_llvm_get_called_value(cur);

			if (!ac_llvm_is_function(callee))

			const char *name = LLVMGetValueName(callee);
			unsigned num_args = LLVMCountParams(callee);

			/* Check if this is an export instruction. */
			if ((num_args != 9 && num_args != 8) ||
			    (strcmp(name, "llvm.SI.export") &&
			     strcmp(name, "llvm.amdgcn.exp.f32")))

			LLVMValueRef arg = LLVMGetOperand(cur, AC_EXP_TARGET);
			unsigned target = LLVMConstIntGetZExtValue(arg);

			/* Only PARAM exports are subject to this pass. */
			if (target < V_008DFC_SQ_EXP_PARAM)

			target -= V_008DFC_SQ_EXP_PARAM;

			/* Parse the instruction. */
			memset(&exp, 0, sizeof(exp));
			exp.offset = target;

			/* Classify each of the four exported channels. */
			for (unsigned i = 0; i < 4; i++) {
				LLVMValueRef v = LLVMGetOperand(cur, AC_EXP_OUT0 + i);

				exp.chan[i].value = v;

				if (LLVMIsUndef(v)) {
					exp.chan[i].type = AC_IR_UNDEF;
				} else if (LLVMIsAConstantFP(v)) {
					LLVMBool loses_info;
					exp.chan[i].type = AC_IR_CONST;
					exp.chan[i].const_float =
						LLVMConstRealGetDouble(v, &loses_info);
					exp.chan[i].type = AC_IR_VALUE;

			/* Eliminate constant and duplicated PARAM exports. */
			if (ac_eliminate_const_output(vs_output_param_offset,
						      num_outputs, &exp) ||
			    ac_eliminate_duplicated_output(ctx,
							   vs_output_param_offset,
							   num_outputs, &exports,
				/* Survivor: remember it for duplicate checks. */
				exports.exp[exports.num++] = exp;

		bb = LLVMGetNextBasicBlock(bb);

	/* Remove holes in export memory due to removed PARAM exports.
	 * This is done by renumbering all PARAM exports.
		uint8_t old_offset[VARYING_SLOT_MAX];

		/* Make a copy of the offsets. We need the old version while
		 * we are modifying some of them. */
		memcpy(old_offset, vs_output_param_offset,
		       sizeof(old_offset));

		for (i = 0; i < exports.num; i++) {
			unsigned offset = exports.exp[i].offset;

			/* Update vs_output_param_offset. Multiple outputs can
			 * have the same offset.
			for (out = 0; out < num_outputs; out++) {
				if (old_offset[out] == offset)
					vs_output_param_offset[out] = i;

			/* Change the PARAM offset in the instruction. */
			LLVMSetOperand(exports.exp[i].inst, AC_EXP_TARGET,
				       LLVMConstInt(ctx->i32,
						    V_008DFC_SQ_EXP_PARAM + i, 0));

	*num_param_exports = exports.num;
2345 void ac_init_exec_full_mask(struct ac_llvm_context *ctx)
2347 LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
2348 ac_build_intrinsic(ctx,
2349 "llvm.amdgcn.init.exec", ctx->voidt,
2350 &full_mask, 1, AC_FUNC_ATTR_CONVERGENT);
void ac_declare_lds_as_pointer(struct ac_llvm_context *ctx)
	/* Materialize ctx->lds as a pointer to the whole LDS block (64 KiB
	 * on CIK+, 32 KiB before) viewed as an i32 array in the local
	 * address space.
	 * NOTE(review): the trailing argument line of the IntToPtr call is
	 * missing from this chunk. */
	unsigned lds_size = ctx->chip_class >= CIK ? 65536 : 32768;
	ctx->lds = LLVMBuildIntToPtr(ctx->builder, ctx->i32_0,
		LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), AC_LOCAL_ADDR_SPACE),
2361 LLVMValueRef ac_lds_load(struct ac_llvm_context *ctx,
2362 LLVMValueRef dw_addr)
2364 return ac_build_load(ctx, ctx->lds, dw_addr);
void ac_lds_store(struct ac_llvm_context *ctx,
		  LLVMValueRef dw_addr,
	/* Store one dword to the LDS array at the given dword index. The
	 * value is bitcast to integer first so any 32-bit type can be
	 * stored.
	 * NOTE(review): the 'value' parameter line and the trailing store
	 * arguments are missing from this chunk. */
	value = ac_to_integer(ctx, value);
	ac_build_indexed_store(ctx, ctx->lds,
LLVMValueRef ac_find_lsb(struct ac_llvm_context *ctx,
			 LLVMTypeRef dst_type,
	/* Find the least-significant set bit (GLSL findLSB): returns the
	 * bit index as i32, or -1 when the input is zero. Works for i32 and
	 * i64 inputs via llvm.cttz.
	 * NOTE(review): the src0 parameter line, the 'type'/'zero' setup
	 * and part of the params initializer are missing from this chunk. */
	unsigned src0_bitsize = ac_get_elem_bits(ctx, LLVMTypeOf(src0));
	const char *intrin_name;

	if (src0_bitsize == 64) {
		intrin_name = "llvm.cttz.i64";
		intrin_name = "llvm.cttz.i32";

	LLVMValueRef params[2] = {
		/* The value of 1 means that ffs(x=0) = undef, so LLVM won't
		 * add special code to check for x=0. The reason is that
		 * the LLVM behavior for x=0 is different from what we
		 * need here. However, LLVM also assumes that ffs(x) is
		 * in [0, 31], but GLSL expects that ffs(0) = -1, so
		 * a conditional assignment to handle 0 is still required.
		 * The hardware already implements the correct behavior.
		LLVMConstInt(ctx->i1, 1, false),

	LLVMValueRef lsb = ac_build_intrinsic(ctx, intrin_name, type,
					      AC_FUNC_ATTR_READNONE);

	if (src0_bitsize == 64) {
		/* The i32 result type requires narrowing the 64-bit count. */
		lsb = LLVMBuildTrunc(ctx->builder, lsb, ctx->i32, "");

	/* TODO: We need an intrinsic to skip this conditional. */
	/* Check for zero: */
	return LLVMBuildSelect(ctx->builder, LLVMBuildICmp(ctx->builder,
			       LLVMConstInt(ctx->i32, -1, 0), lsb, "");
2425 LLVMTypeRef ac_array_in_const_addr_space(LLVMTypeRef elem_type)
2427 return LLVMPointerType(LLVMArrayType(elem_type, 0),
2428 AC_CONST_ADDR_SPACE);
2431 LLVMTypeRef ac_array_in_const32_addr_space(LLVMTypeRef elem_type)
2433 if (!HAVE_32BIT_POINTERS)
2434 return ac_array_in_const_addr_space(elem_type);
2436 return LLVMPointerType(LLVMArrayType(elem_type, 0),
2437 AC_CONST_32BIT_ADDR_SPACE);
2440 static struct ac_llvm_flow *
2441 get_current_flow(struct ac_llvm_context *ctx)
2443 if (ctx->flow_depth > 0)
2444 return &ctx->flow[ctx->flow_depth - 1];
2448 static struct ac_llvm_flow *
2449 get_innermost_loop(struct ac_llvm_context *ctx)
2451 for (unsigned i = ctx->flow_depth; i > 0; --i) {
2452 if (ctx->flow[i - 1].loop_entry_block)
2453 return &ctx->flow[i - 1];
static struct ac_llvm_flow *
push_flow(struct ac_llvm_context *ctx)
	/* Open a new control-flow construct, growing the flow stack as
	 * needed. The returned entry is zero-initialized (no blocks yet).
	 * NOTE(review): realloc failure is not handled here — on OOM the
	 * old array leaks and the subsequent index dereferences NULL.
	 * NOTE(review): the depth increment and return lines are missing
	 * from this chunk. */
	struct ac_llvm_flow *flow;

	if (ctx->flow_depth >= ctx->flow_depth_max) {
		/* Grow geometrically, starting from the initial depth. */
		unsigned new_max = MAX2(ctx->flow_depth << 1,
					AC_LLVM_INITIAL_CF_DEPTH);

		ctx->flow = realloc(ctx->flow, new_max * sizeof(*ctx->flow));
		ctx->flow_depth_max = new_max;

	flow = &ctx->flow[ctx->flow_depth];

	flow->next_block = NULL;
	flow->loop_entry_block = NULL;
2479 static void set_basicblock_name(LLVMBasicBlockRef bb, const char *base,
2483 snprintf(buf, sizeof(buf), "%s%d", base, label_id);
2484 LLVMSetValueName(LLVMBasicBlockAsValue(bb), buf);
2487 /* Append a basic block at the level of the parent flow.
2489 static LLVMBasicBlockRef append_basic_block(struct ac_llvm_context *ctx,
2492 assert(ctx->flow_depth >= 1);
2494 if (ctx->flow_depth >= 2) {
2495 struct ac_llvm_flow *flow = &ctx->flow[ctx->flow_depth - 2];
2497 return LLVMInsertBasicBlockInContext(ctx->context,
2498 flow->next_block, name);
2501 LLVMValueRef main_fn =
2502 LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->builder));
2503 return LLVMAppendBasicBlockInContext(ctx->context, main_fn, name);
2506 /* Emit a branch to the given default target for the current block if
2507 * applicable -- that is, if the current block does not already contain a
2508 * branch from a break or continue.
2510 static void emit_default_branch(LLVMBuilderRef builder,
2511 LLVMBasicBlockRef target)
2513 if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
2514 LLVMBuildBr(builder, target);
2517 void ac_build_bgnloop(struct ac_llvm_context *ctx, int label_id)
2519 struct ac_llvm_flow *flow = push_flow(ctx);
2520 flow->loop_entry_block = append_basic_block(ctx, "LOOP");
2521 flow->next_block = append_basic_block(ctx, "ENDLOOP");
2522 set_basicblock_name(flow->loop_entry_block, "loop", label_id);
2523 LLVMBuildBr(ctx->builder, flow->loop_entry_block);
2524 LLVMPositionBuilderAtEnd(ctx->builder, flow->loop_entry_block);
2527 void ac_build_break(struct ac_llvm_context *ctx)
2529 struct ac_llvm_flow *flow = get_innermost_loop(ctx);
2530 LLVMBuildBr(ctx->builder, flow->next_block);
2533 void ac_build_continue(struct ac_llvm_context *ctx)
2535 struct ac_llvm_flow *flow = get_innermost_loop(ctx);
2536 LLVMBuildBr(ctx->builder, flow->loop_entry_block);
2539 void ac_build_else(struct ac_llvm_context *ctx, int label_id)
2541 struct ac_llvm_flow *current_branch = get_current_flow(ctx);
2542 LLVMBasicBlockRef endif_block;
2544 assert(!current_branch->loop_entry_block);
2546 endif_block = append_basic_block(ctx, "ENDIF");
2547 emit_default_branch(ctx->builder, endif_block);
2549 LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
2550 set_basicblock_name(current_branch->next_block, "else", label_id);
2552 current_branch->next_block = endif_block;
2555 void ac_build_endif(struct ac_llvm_context *ctx, int label_id)
2557 struct ac_llvm_flow *current_branch = get_current_flow(ctx);
2559 assert(!current_branch->loop_entry_block);
2561 emit_default_branch(ctx->builder, current_branch->next_block);
2562 LLVMPositionBuilderAtEnd(ctx->builder, current_branch->next_block);
2563 set_basicblock_name(current_branch->next_block, "endif", label_id);
2568 void ac_build_endloop(struct ac_llvm_context *ctx, int label_id)
2570 struct ac_llvm_flow *current_loop = get_current_flow(ctx);
2572 assert(current_loop->loop_entry_block);
2574 emit_default_branch(ctx->builder, current_loop->loop_entry_block);
2576 LLVMPositionBuilderAtEnd(ctx->builder, current_loop->next_block);
2577 set_basicblock_name(current_loop->next_block, "endloop", label_id);
2581 static void if_cond_emit(struct ac_llvm_context *ctx, LLVMValueRef cond,
2584 struct ac_llvm_flow *flow = push_flow(ctx);
2585 LLVMBasicBlockRef if_block;
2587 if_block = append_basic_block(ctx, "IF");
2588 flow->next_block = append_basic_block(ctx, "ELSE");
2589 set_basicblock_name(if_block, "if", label_id);
2590 LLVMBuildCondBr(ctx->builder, cond, if_block, flow->next_block);
2591 LLVMPositionBuilderAtEnd(ctx->builder, if_block);
2594 void ac_build_if(struct ac_llvm_context *ctx, LLVMValueRef value,
2597 LLVMValueRef cond = LLVMBuildFCmp(ctx->builder, LLVMRealUNE,
2598 value, ctx->f32_0, "");
2599 if_cond_emit(ctx, cond, label_id);
2602 void ac_build_uif(struct ac_llvm_context *ctx, LLVMValueRef value,
2605 LLVMValueRef cond = LLVMBuildICmp(ctx->builder, LLVMIntNE,
2606 ac_to_integer(ctx, value),
2608 if_cond_emit(ctx, cond, label_id);
2611 LLVMValueRef ac_build_alloca(struct ac_llvm_context *ac, LLVMTypeRef type,
2614 LLVMBuilderRef builder = ac->builder;
2615 LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
2616 LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
2617 LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
2618 LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
2619 LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context);
2623 LLVMPositionBuilderBefore(first_builder, first_instr);
2625 LLVMPositionBuilderAtEnd(first_builder, first_block);
2628 res = LLVMBuildAlloca(first_builder, type, name);
2629 LLVMBuildStore(builder, LLVMConstNull(type), res);
2631 LLVMDisposeBuilder(first_builder);
2636 LLVMValueRef ac_build_alloca_undef(struct ac_llvm_context *ac,
2637 LLVMTypeRef type, const char *name)
2639 LLVMValueRef ptr = ac_build_alloca(ac, type, name);
2640 LLVMBuildStore(ac->builder, LLVMGetUndef(type), ptr);
2644 LLVMValueRef ac_cast_ptr(struct ac_llvm_context *ctx, LLVMValueRef ptr,
2647 int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
2648 return LLVMBuildBitCast(ctx->builder, ptr,
2649 LLVMPointerType(type, addr_space), "");
2652 LLVMValueRef ac_trim_vector(struct ac_llvm_context *ctx, LLVMValueRef value,
2655 unsigned num_components = ac_get_llvm_num_components(value);
2656 if (count == num_components)
2659 LLVMValueRef masks[] = {
2660 LLVMConstInt(ctx->i32, 0, false), LLVMConstInt(ctx->i32, 1, false),
2661 LLVMConstInt(ctx->i32, 2, false), LLVMConstInt(ctx->i32, 3, false)};
2664 return LLVMBuildExtractElement(ctx->builder, value, masks[0],
2667 LLVMValueRef swizzle = LLVMConstVector(masks, count);
2668 return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, "");
2671 LLVMValueRef ac_unpack_param(struct ac_llvm_context *ctx, LLVMValueRef param,
2672 unsigned rshift, unsigned bitwidth)
2674 LLVMValueRef value = param;
2676 value = LLVMBuildLShr(ctx->builder, value,
2677 LLVMConstInt(ctx->i32, rshift, false), "");
2679 if (rshift + bitwidth < 32) {
2680 unsigned mask = (1 << bitwidth) - 1;
2681 value = LLVMBuildAnd(ctx->builder, value,
2682 LLVMConstInt(ctx->i32, mask, false), "");
2687 /* Adjust the sample index according to FMASK.
2689 * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
2690 * which is the identity mapping. Each nibble says which physical sample
2691 * should be fetched to get that sample.
2693 * For example, 0x11111100 means there are only 2 samples stored and
2694 * the second sample covers 3/4 of the pixel. When reading samples 0
2695 * and 1, return physical sample 0 (determined by the first two 0s
2696 * in FMASK), otherwise return physical sample 1.
2698 * The sample index should be adjusted as follows:
2699 * addr[sample_index] = (fmask >> (addr[sample_index] * 4)) & 0xF;
2701 void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask,
2702 LLVMValueRef *addr, bool is_array_tex)
2704 struct ac_image_args fmask_load = {};
2705 fmask_load.opcode = ac_image_load;
2706 fmask_load.resource = fmask;
2707 fmask_load.dmask = 0xf;
2708 fmask_load.dim = is_array_tex ? ac_image_2darray : ac_image_2d;
2710 fmask_load.coords[0] = addr[0];
2711 fmask_load.coords[1] = addr[1];
2713 fmask_load.coords[2] = addr[2];
2715 LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load);
2716 fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value,
2719 /* Apply the formula. */
2720 unsigned sample_chan = is_array_tex ? 3 : 2;
2721 LLVMValueRef final_sample;
2722 final_sample = LLVMBuildMul(ac->builder, addr[sample_chan],
2723 LLVMConstInt(ac->i32, 4, 0), "");
2724 final_sample = LLVMBuildLShr(ac->builder, fmask_value, final_sample, "");
2725 /* Mask the sample index by 0x7, because 0x8 means an unknown value
2726 * with EQAA, so those will map to 0. */
2727 final_sample = LLVMBuildAnd(ac->builder, final_sample,
2728 LLVMConstInt(ac->i32, 0x7, 0), "");
2730 /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
2731 * resource descriptor is 0 (invalid).
2734 tmp = LLVMBuildBitCast(ac->builder, fmask, ac->v8i32, "");
2735 tmp = LLVMBuildExtractElement(ac->builder, tmp, ac->i32_1, "");
2736 tmp = LLVMBuildICmp(ac->builder, LLVMIntNE, tmp, ac->i32_0, "");
2738 /* Replace the MSAA sample index. */
2739 addr[sample_chan] = LLVMBuildSelect(ac->builder, tmp, final_sample,
2740 addr[sample_chan], "");
2744 _ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
2746 ac_build_optimization_barrier(ctx, &src);
2747 return ac_build_intrinsic(ctx,
2748 lane == NULL ? "llvm.amdgcn.readfirstlane" : "llvm.amdgcn.readlane",
2749 LLVMTypeOf(src), (LLVMValueRef []) {
2751 lane == NULL ? 1 : 2,
2752 AC_FUNC_ATTR_READNONE |
2753 AC_FUNC_ATTR_CONVERGENT);
2757 * Builds the "llvm.amdgcn.readlane" or "llvm.amdgcn.readfirstlane" intrinsic.
2760 * @param lane - id of the lane or NULL for the first active lane
2761 * @return value of the lane
2764 ac_build_readlane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef lane)
2766 LLVMTypeRef src_type = LLVMTypeOf(src);
2767 src = ac_to_integer(ctx, src);
2768 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
2772 ret = _ac_build_readlane(ctx, src, lane);
2774 assert(bits % 32 == 0);
2775 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
2776 LLVMValueRef src_vector =
2777 LLVMBuildBitCast(ctx->builder, src, vec_type, "");
2778 ret = LLVMGetUndef(vec_type);
2779 for (unsigned i = 0; i < bits / 32; i++) {
2780 src = LLVMBuildExtractElement(ctx->builder, src_vector,
2781 LLVMConstInt(ctx->i32, i, 0), "");
2782 LLVMValueRef ret_comp = _ac_build_readlane(ctx, src, lane);
2783 ret = LLVMBuildInsertElement(ctx->builder, ret, ret_comp,
2784 LLVMConstInt(ctx->i32, i, 0), "");
2787 return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
2791 ac_build_writelane(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef value, LLVMValueRef lane)
2793 /* TODO: Use the actual instruction when LLVM adds an intrinsic for it.
2795 LLVMValueRef pred = LLVMBuildICmp(ctx->builder, LLVMIntEQ, lane,
2796 ac_get_thread_id(ctx), "");
2797 return LLVMBuildSelect(ctx->builder, pred, value, src, "");
2801 ac_build_mbcnt(struct ac_llvm_context *ctx, LLVMValueRef mask)
2803 LLVMValueRef mask_vec = LLVMBuildBitCast(ctx->builder, mask,
2804 LLVMVectorType(ctx->i32, 2),
2806 LLVMValueRef mask_lo = LLVMBuildExtractElement(ctx->builder, mask_vec,
2808 LLVMValueRef mask_hi = LLVMBuildExtractElement(ctx->builder, mask_vec,
2811 ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.lo", ctx->i32,
2812 (LLVMValueRef []) { mask_lo, ctx->i32_0 },
2813 2, AC_FUNC_ATTR_READNONE);
2814 val = ac_build_intrinsic(ctx, "llvm.amdgcn.mbcnt.hi", ctx->i32,
2815 (LLVMValueRef []) { mask_hi, val },
2816 2, AC_FUNC_ATTR_READNONE);
/* DPP_CTRL field encodings. Members with a leading underscore carry an
 * operand in their low bits and must be combined via the helper
 * functions below rather than used directly.
 */
enum dpp_ctrl {
	_dpp_quad_perm = 0x000,
	_dpp_row_sl = 0x100,
	_dpp_row_sr = 0x110,
	_dpp_row_rr = 0x120,
	dpp_wf_sl1 = 0x130,
	dpp_wf_rl1 = 0x134,
	dpp_wf_sr1 = 0x138,
	dpp_wf_rr1 = 0x13C,
	dpp_row_mirror = 0x140,
	dpp_row_half_mirror = 0x141,
	dpp_row_bcast15 = 0x142,
	dpp_row_bcast31 = 0x143
};
2835 static inline enum dpp_ctrl
2836 dpp_quad_perm(unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
2838 assert(lane0 < 4 && lane1 < 4 && lane2 < 4 && lane3 < 4);
2839 return _dpp_quad_perm | lane0 | (lane1 << 2) | (lane2 << 4) | (lane3 << 6);
2842 static inline enum dpp_ctrl
2843 dpp_row_sl(unsigned amount)
2845 assert(amount > 0 && amount < 16);
2846 return _dpp_row_sl | amount;
2849 static inline enum dpp_ctrl
2850 dpp_row_sr(unsigned amount)
2852 assert(amount > 0 && amount < 16);
2853 return _dpp_row_sr | amount;
2857 _ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
2858 enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
2861 return ac_build_intrinsic(ctx, "llvm.amdgcn.update.dpp.i32",
2865 LLVMConstInt(ctx->i32, dpp_ctrl, 0),
2866 LLVMConstInt(ctx->i32, row_mask, 0),
2867 LLVMConstInt(ctx->i32, bank_mask, 0),
2868 LLVMConstInt(ctx->i1, bound_ctrl, 0) },
2869 6, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
2873 ac_build_dpp(struct ac_llvm_context *ctx, LLVMValueRef old, LLVMValueRef src,
2874 enum dpp_ctrl dpp_ctrl, unsigned row_mask, unsigned bank_mask,
2877 LLVMTypeRef src_type = LLVMTypeOf(src);
2878 src = ac_to_integer(ctx, src);
2879 old = ac_to_integer(ctx, old);
2880 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
2883 ret = _ac_build_dpp(ctx, old, src, dpp_ctrl, row_mask,
2884 bank_mask, bound_ctrl);
2886 assert(bits % 32 == 0);
2887 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
2888 LLVMValueRef src_vector =
2889 LLVMBuildBitCast(ctx->builder, src, vec_type, "");
2890 LLVMValueRef old_vector =
2891 LLVMBuildBitCast(ctx->builder, old, vec_type, "");
2892 ret = LLVMGetUndef(vec_type);
2893 for (unsigned i = 0; i < bits / 32; i++) {
2894 src = LLVMBuildExtractElement(ctx->builder, src_vector,
2895 LLVMConstInt(ctx->i32, i,
2897 old = LLVMBuildExtractElement(ctx->builder, old_vector,
2898 LLVMConstInt(ctx->i32, i,
2900 LLVMValueRef ret_comp = _ac_build_dpp(ctx, old, src,
2905 ret = LLVMBuildInsertElement(ctx->builder, ret,
2907 LLVMConstInt(ctx->i32, i,
2911 return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
/* Pack a ds_swizzle "bitmode" pattern word:
 *   offset = and_mask | (or_mask << 5) | (xor_mask << 10)
 * where each mask covers the low 5 lane-index bits.
 */
static inline unsigned
ds_pattern_bitmode(unsigned and_mask, unsigned or_mask, unsigned xor_mask)
{
	assert(and_mask < 32 && or_mask < 32 && xor_mask < 32);

	unsigned pattern = and_mask;

	pattern |= or_mask << 5;
	pattern |= xor_mask << 10;
	return pattern;
}
2922 _ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
2924 return ac_build_intrinsic(ctx, "llvm.amdgcn.ds.swizzle",
2925 LLVMTypeOf(src), (LLVMValueRef []) {
2926 src, LLVMConstInt(ctx->i32, mask, 0) },
2927 2, AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_CONVERGENT);
2931 ac_build_ds_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src, unsigned mask)
2933 LLVMTypeRef src_type = LLVMTypeOf(src);
2934 src = ac_to_integer(ctx, src);
2935 unsigned bits = LLVMGetIntTypeWidth(LLVMTypeOf(src));
2938 ret = _ac_build_ds_swizzle(ctx, src, mask);
2940 assert(bits % 32 == 0);
2941 LLVMTypeRef vec_type = LLVMVectorType(ctx->i32, bits / 32);
2942 LLVMValueRef src_vector =
2943 LLVMBuildBitCast(ctx->builder, src, vec_type, "");
2944 ret = LLVMGetUndef(vec_type);
2945 for (unsigned i = 0; i < bits / 32; i++) {
2946 src = LLVMBuildExtractElement(ctx->builder, src_vector,
2947 LLVMConstInt(ctx->i32, i,
2949 LLVMValueRef ret_comp = _ac_build_ds_swizzle(ctx, src,
2951 ret = LLVMBuildInsertElement(ctx->builder, ret,
2953 LLVMConstInt(ctx->i32, i,
2957 return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
2961 ac_build_wwm(struct ac_llvm_context *ctx, LLVMValueRef src)
2963 char name[32], type[8];
2964 ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
2965 snprintf(name, sizeof(name), "llvm.amdgcn.wwm.%s", type);
2966 return ac_build_intrinsic(ctx, name, LLVMTypeOf(src),
2967 (LLVMValueRef []) { src }, 1,
2968 AC_FUNC_ATTR_READNONE);
2972 ac_build_set_inactive(struct ac_llvm_context *ctx, LLVMValueRef src,
2973 LLVMValueRef inactive)
2975 char name[33], type[8];
2976 LLVMTypeRef src_type = LLVMTypeOf(src);
2977 src = ac_to_integer(ctx, src);
2978 inactive = ac_to_integer(ctx, inactive);
2979 ac_build_type_name_for_intr(LLVMTypeOf(src), type, sizeof(type));
2980 snprintf(name, sizeof(name), "llvm.amdgcn.set.inactive.%s", type);
2982 ac_build_intrinsic(ctx, name,
2983 LLVMTypeOf(src), (LLVMValueRef []) {
2985 AC_FUNC_ATTR_READNONE |
2986 AC_FUNC_ATTR_CONVERGENT);
2987 return LLVMBuildBitCast(ctx->builder, ret, src_type, "");
2991 get_reduction_identity(struct ac_llvm_context *ctx, nir_op op, unsigned type_size)
2993 if (type_size == 4) {
2995 case nir_op_iadd: return ctx->i32_0;
2996 case nir_op_fadd: return ctx->f32_0;
2997 case nir_op_imul: return ctx->i32_1;
2998 case nir_op_fmul: return ctx->f32_1;
2999 case nir_op_imin: return LLVMConstInt(ctx->i32, INT32_MAX, 0);
3000 case nir_op_umin: return LLVMConstInt(ctx->i32, UINT32_MAX, 0);
3001 case nir_op_fmin: return LLVMConstReal(ctx->f32, INFINITY);
3002 case nir_op_imax: return LLVMConstInt(ctx->i32, INT32_MIN, 0);
3003 case nir_op_umax: return ctx->i32_0;
3004 case nir_op_fmax: return LLVMConstReal(ctx->f32, -INFINITY);
3005 case nir_op_iand: return LLVMConstInt(ctx->i32, -1, 0);
3006 case nir_op_ior: return ctx->i32_0;
3007 case nir_op_ixor: return ctx->i32_0;
3009 unreachable("bad reduction intrinsic");
3011 } else { /* type_size == 64bit */
3013 case nir_op_iadd: return ctx->i64_0;
3014 case nir_op_fadd: return ctx->f64_0;
3015 case nir_op_imul: return ctx->i64_1;
3016 case nir_op_fmul: return ctx->f64_1;
3017 case nir_op_imin: return LLVMConstInt(ctx->i64, INT64_MAX, 0);
3018 case nir_op_umin: return LLVMConstInt(ctx->i64, UINT64_MAX, 0);
3019 case nir_op_fmin: return LLVMConstReal(ctx->f64, INFINITY);
3020 case nir_op_imax: return LLVMConstInt(ctx->i64, INT64_MIN, 0);
3021 case nir_op_umax: return ctx->i64_0;
3022 case nir_op_fmax: return LLVMConstReal(ctx->f64, -INFINITY);
3023 case nir_op_iand: return LLVMConstInt(ctx->i64, -1, 0);
3024 case nir_op_ior: return ctx->i64_0;
3025 case nir_op_ixor: return ctx->i64_0;
3027 unreachable("bad reduction intrinsic");
3033 ac_build_alu_op(struct ac_llvm_context *ctx, LLVMValueRef lhs, LLVMValueRef rhs, nir_op op)
3035 bool _64bit = ac_get_type_size(LLVMTypeOf(lhs)) == 8;
3037 case nir_op_iadd: return LLVMBuildAdd(ctx->builder, lhs, rhs, "");
3038 case nir_op_fadd: return LLVMBuildFAdd(ctx->builder, lhs, rhs, "");
3039 case nir_op_imul: return LLVMBuildMul(ctx->builder, lhs, rhs, "");
3040 case nir_op_fmul: return LLVMBuildFMul(ctx->builder, lhs, rhs, "");
3041 case nir_op_imin: return LLVMBuildSelect(ctx->builder,
3042 LLVMBuildICmp(ctx->builder, LLVMIntSLT, lhs, rhs, ""),
3044 case nir_op_umin: return LLVMBuildSelect(ctx->builder,
3045 LLVMBuildICmp(ctx->builder, LLVMIntULT, lhs, rhs, ""),
3047 case nir_op_fmin: return ac_build_intrinsic(ctx,
3048 _64bit ? "llvm.minnum.f64" : "llvm.minnum.f32",
3049 _64bit ? ctx->f64 : ctx->f32,
3050 (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE);
3051 case nir_op_imax: return LLVMBuildSelect(ctx->builder,
3052 LLVMBuildICmp(ctx->builder, LLVMIntSGT, lhs, rhs, ""),
3054 case nir_op_umax: return LLVMBuildSelect(ctx->builder,
3055 LLVMBuildICmp(ctx->builder, LLVMIntUGT, lhs, rhs, ""),
3057 case nir_op_fmax: return ac_build_intrinsic(ctx,
3058 _64bit ? "llvm.maxnum.f64" : "llvm.maxnum.f32",
3059 _64bit ? ctx->f64 : ctx->f32,
3060 (LLVMValueRef[]){lhs, rhs}, 2, AC_FUNC_ATTR_READNONE);
3061 case nir_op_iand: return LLVMBuildAnd(ctx->builder, lhs, rhs, "");
3062 case nir_op_ior: return LLVMBuildOr(ctx->builder, lhs, rhs, "");
3063 case nir_op_ixor: return LLVMBuildXor(ctx->builder, lhs, rhs, "");
3065 unreachable("bad reduction intrinsic");
3069 /* TODO: add inclusive and excluse scan functions for SI chip class. */
3071 ac_build_scan(struct ac_llvm_context *ctx, nir_op op, LLVMValueRef src, LLVMValueRef identity)
3073 LLVMValueRef result, tmp;
3075 tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(1), 0xf, 0xf, false);
3076 result = ac_build_alu_op(ctx, result, tmp, op);
3077 tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(2), 0xf, 0xf, false);
3078 result = ac_build_alu_op(ctx, result, tmp, op);
3079 tmp = ac_build_dpp(ctx, identity, src, dpp_row_sr(3), 0xf, 0xf, false);
3080 result = ac_build_alu_op(ctx, result, tmp, op);
3081 tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(4), 0xf, 0xe, false);
3082 result = ac_build_alu_op(ctx, result, tmp, op);
3083 tmp = ac_build_dpp(ctx, identity, result, dpp_row_sr(8), 0xf, 0xc, false);
3084 result = ac_build_alu_op(ctx, result, tmp, op);
3085 tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
3086 result = ac_build_alu_op(ctx, result, tmp, op);
3087 tmp = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
3088 result = ac_build_alu_op(ctx, result, tmp, op);
3093 ac_build_inclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
3095 ac_build_optimization_barrier(ctx, &src);
3096 LLVMValueRef result;
3097 LLVMValueRef identity = get_reduction_identity(ctx, op,
3098 ac_get_type_size(LLVMTypeOf(src)));
3099 result = LLVMBuildBitCast(ctx->builder,
3100 ac_build_set_inactive(ctx, src, identity),
3101 LLVMTypeOf(identity), "");
3102 result = ac_build_scan(ctx, op, result, identity);
3104 return ac_build_wwm(ctx, result);
3108 ac_build_exclusive_scan(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op)
3110 ac_build_optimization_barrier(ctx, &src);
3111 LLVMValueRef result;
3112 LLVMValueRef identity = get_reduction_identity(ctx, op,
3113 ac_get_type_size(LLVMTypeOf(src)));
3114 result = LLVMBuildBitCast(ctx->builder,
3115 ac_build_set_inactive(ctx, src, identity),
3116 LLVMTypeOf(identity), "");
3117 result = ac_build_dpp(ctx, identity, result, dpp_wf_sr1, 0xf, 0xf, false);
3118 result = ac_build_scan(ctx, op, result, identity);
3120 return ac_build_wwm(ctx, result);
3124 ac_build_reduce(struct ac_llvm_context *ctx, LLVMValueRef src, nir_op op, unsigned cluster_size)
3126 if (cluster_size == 1) return src;
3127 ac_build_optimization_barrier(ctx, &src);
3128 LLVMValueRef result, swap;
3129 LLVMValueRef identity = get_reduction_identity(ctx, op,
3130 ac_get_type_size(LLVMTypeOf(src)));
3131 result = LLVMBuildBitCast(ctx->builder,
3132 ac_build_set_inactive(ctx, src, identity),
3133 LLVMTypeOf(identity), "");
3134 swap = ac_build_quad_swizzle(ctx, result, 1, 0, 3, 2);
3135 result = ac_build_alu_op(ctx, result, swap, op);
3136 if (cluster_size == 2) return ac_build_wwm(ctx, result);
3138 swap = ac_build_quad_swizzle(ctx, result, 2, 3, 0, 1);
3139 result = ac_build_alu_op(ctx, result, swap, op);
3140 if (cluster_size == 4) return ac_build_wwm(ctx, result);
3142 if (ctx->chip_class >= VI)
3143 swap = ac_build_dpp(ctx, identity, result, dpp_row_half_mirror, 0xf, 0xf, false);
3145 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x04));
3146 result = ac_build_alu_op(ctx, result, swap, op);
3147 if (cluster_size == 8) return ac_build_wwm(ctx, result);
3149 if (ctx->chip_class >= VI)
3150 swap = ac_build_dpp(ctx, identity, result, dpp_row_mirror, 0xf, 0xf, false);
3152 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x08));
3153 result = ac_build_alu_op(ctx, result, swap, op);
3154 if (cluster_size == 16) return ac_build_wwm(ctx, result);
3156 if (ctx->chip_class >= VI && cluster_size != 32)
3157 swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast15, 0xa, 0xf, false);
3159 swap = ac_build_ds_swizzle(ctx, result, ds_pattern_bitmode(0x1f, 0, 0x10));
3160 result = ac_build_alu_op(ctx, result, swap, op);
3161 if (cluster_size == 32) return ac_build_wwm(ctx, result);
3163 if (ctx->chip_class >= VI) {
3164 swap = ac_build_dpp(ctx, identity, result, dpp_row_bcast31, 0xc, 0xf, false);
3165 result = ac_build_alu_op(ctx, result, swap, op);
3166 result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 63, 0));
3167 return ac_build_wwm(ctx, result);
3169 swap = ac_build_readlane(ctx, result, ctx->i32_0);
3170 result = ac_build_readlane(ctx, result, LLVMConstInt(ctx->i32, 32, 0));
3171 result = ac_build_alu_op(ctx, result, swap, op);
3172 return ac_build_wwm(ctx, result);
3177 ac_build_quad_swizzle(struct ac_llvm_context *ctx, LLVMValueRef src,
3178 unsigned lane0, unsigned lane1, unsigned lane2, unsigned lane3)
3180 unsigned mask = dpp_quad_perm(lane0, lane1, lane2, lane3);
3181 if (ctx->chip_class >= VI) {
3182 return ac_build_dpp(ctx, src, src, mask, 0xf, 0xf, false);
3184 return ac_build_ds_swizzle(ctx, src, (1 << 15) | mask);
3189 ac_build_shuffle(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMValueRef index)
3191 index = LLVMBuildMul(ctx->builder, index, LLVMConstInt(ctx->i32, 4, 0), "");
3192 return ac_build_intrinsic(ctx,
3193 "llvm.amdgcn.ds.bpermute", ctx->i32,
3194 (LLVMValueRef []) {index, src}, 2,
3195 AC_FUNC_ATTR_READNONE |
3196 AC_FUNC_ATTR_CONVERGENT);