OSDN Git Service

amd: remove support for LLVM 3.9
[android-x86/external-mesa.git] / src / amd / common / ac_nir_to_llvm.c
1 /*
2  * Copyright © 2016 Bas Nieuwenhuizen
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23
24 #include "ac_nir_to_llvm.h"
25 #include "ac_llvm_build.h"
26 #include "ac_llvm_util.h"
27 #include "ac_binary.h"
28 #include "sid.h"
29 #include "nir/nir.h"
30 #include "../vulkan/radv_descriptor_set.h"
31 #include "util/bitscan.h"
32 #include <llvm-c/Transforms/Scalar.h>
33 #include "ac_shader_abi.h"
34 #include "ac_shader_info.h"
35 #include "ac_shader_util.h"
36 #include "ac_exp_param.h"
37
/* Numeric calling-convention IDs handed to LLVMSetFunctionCallConv() for
 * the shader entry point.  The values are not contiguous: 91 and 92 are
 * not used by this file.
 */
enum radeon_llvm_calling_convention {
        RADEON_LLVM_AMDGPU_VS = 87, /* vertex and tess-eval stages */
        RADEON_LLVM_AMDGPU_GS = 88, /* geometry stage */
        RADEON_LLVM_AMDGPU_PS = 89, /* fragment stage */
        RADEON_LLVM_AMDGPU_CS = 90, /* compute stage */
        RADEON_LLVM_AMDGPU_HS = 93, /* tess-control stage (LLVM >= 5.0) */
};
45
/* Number of varying slots tracked for inputs and outputs: every slot up to
 * and including VARYING_SLOT_VAR31.  Arrays sized from these hold four
 * entries per slot (see the "* 4" at their declarations).
 */
#define RADEON_LLVM_MAX_INPUTS (VARYING_SLOT_VAR31 + 1)
#define RADEON_LLVM_MAX_OUTPUTS (VARYING_SLOT_VAR31 + 1)
48
49 struct nir_to_llvm_context;
50
51 struct ac_nir_context {
52         struct ac_llvm_context ac;
53         struct ac_shader_abi *abi;
54
55         gl_shader_stage stage;
56
57         struct hash_table *defs;
58         struct hash_table *phis;
59         struct hash_table *vars;
60
61         LLVMValueRef main_function;
62         LLVMBasicBlockRef continue_block;
63         LLVMBasicBlockRef break_block;
64
65         LLVMValueRef outputs[RADEON_LLVM_MAX_OUTPUTS * 4];
66
67         int num_locals;
68         LLVMValueRef *locals;
69
70         struct nir_to_llvm_context *nctx; /* TODO get rid of this */
71 };
72
73 struct nir_to_llvm_context {
74         struct ac_llvm_context ac;
75         const struct ac_nir_compiler_options *options;
76         struct ac_shader_variant_info *shader_info;
77         struct ac_shader_abi abi;
78         struct ac_nir_context *nir;
79
80         unsigned max_workgroup_size;
81         LLVMContextRef context;
82         LLVMModuleRef module;
83         LLVMBuilderRef builder;
84         LLVMValueRef main_function;
85
86         struct hash_table *defs;
87         struct hash_table *phis;
88
89         LLVMValueRef descriptor_sets[AC_UD_MAX_SETS];
90         LLVMValueRef ring_offsets;
91         LLVMValueRef push_constants;
92         LLVMValueRef view_index;
93         LLVMValueRef num_work_groups;
94         LLVMValueRef workgroup_ids[3];
95         LLVMValueRef local_invocation_ids;
96         LLVMValueRef tg_size;
97
98         LLVMValueRef vertex_buffers;
99         LLVMValueRef rel_auto_id;
100         LLVMValueRef vs_prim_id;
101         LLVMValueRef ls_out_layout;
102         LLVMValueRef es2gs_offset;
103
104         LLVMValueRef tcs_offchip_layout;
105         LLVMValueRef tcs_out_offsets;
106         LLVMValueRef tcs_out_layout;
107         LLVMValueRef tcs_in_layout;
108         LLVMValueRef oc_lds;
109         LLVMValueRef merged_wave_info;
110         LLVMValueRef tess_factor_offset;
111         LLVMValueRef tes_rel_patch_id;
112         LLVMValueRef tes_u;
113         LLVMValueRef tes_v;
114
115         LLVMValueRef gsvs_ring_stride;
116         LLVMValueRef gsvs_num_entries;
117         LLVMValueRef gs2vs_offset;
118         LLVMValueRef gs_wave_id;
119         LLVMValueRef gs_vtx_offset[6];
120
121         LLVMValueRef esgs_ring;
122         LLVMValueRef gsvs_ring;
123         LLVMValueRef hs_ring_tess_offchip;
124         LLVMValueRef hs_ring_tess_factor;
125
126         LLVMValueRef sample_pos_offset;
127         LLVMValueRef persp_sample, persp_center, persp_centroid;
128         LLVMValueRef linear_sample, linear_center, linear_centroid;
129
130         gl_shader_stage stage;
131
132         LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS * 4];
133
134         uint64_t input_mask;
135         uint64_t output_mask;
136         uint8_t num_output_clips;
137         uint8_t num_output_culls;
138
139         bool is_gs_copy_shader;
140         LLVMValueRef gs_next_vertex;
141         unsigned gs_max_out_vertices;
142
143         unsigned tes_primitive_mode;
144         uint64_t tess_outputs_written;
145         uint64_t tess_patch_outputs_written;
146
147         uint32_t tcs_patch_outputs_read;
148         uint64_t tcs_outputs_read;
149 };
150
/* Map a pointer to the "abi" member of a nir_to_llvm_context back to the
 * context that embeds it.
 */
static inline struct nir_to_llvm_context *
nir_to_llvm_context_from_abi(struct ac_shader_abi *abi)
{
        /* Typed NULL sample handed to container_of().
         * NOTE(review): assumed to be used for offset computation only.
         */
        struct nir_to_llvm_context *sample = NULL;

        return container_of(abi, sample, abi);
}
157
158 static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx,
159                                      const nir_deref_var *deref,
160                                      enum ac_descriptor_type desc_type,
161                                      const nir_tex_instr *instr,
162                                      bool image, bool write);
163
/* Flatten a (slot index, channel) pair into a single array index, using
 * four consecutive entries per slot.
 */
static unsigned radeon_llvm_reg_index_soa(unsigned index, unsigned chan)
{
        const unsigned slot_base = index * 4;

        return slot_base + chan;
}
168
169 static unsigned shader_io_get_unique_index(gl_varying_slot slot)
170 {
171         /* handle patch indices separate */
172         if (slot == VARYING_SLOT_TESS_LEVEL_OUTER)
173                 return 0;
174         if (slot == VARYING_SLOT_TESS_LEVEL_INNER)
175                 return 1;
176         if (slot >= VARYING_SLOT_PATCH0 && slot <= VARYING_SLOT_TESS_MAX)
177                 return 2 + (slot - VARYING_SLOT_PATCH0);
178
179         if (slot == VARYING_SLOT_POS)
180                 return 0;
181         if (slot == VARYING_SLOT_PSIZ)
182                 return 1;
183         if (slot == VARYING_SLOT_CLIP_DIST0)
184                 return 2;
185         /* 3 is reserved for clip dist as well */
186         if (slot >= VARYING_SLOT_VAR0 && slot <= VARYING_SLOT_VAR31)
187                 return 4 + (slot - VARYING_SLOT_VAR0);
188         unreachable("illegal slot in get unique index\n");
189 }
190
191 static void set_llvm_calling_convention(LLVMValueRef func,
192                                         gl_shader_stage stage)
193 {
194         enum radeon_llvm_calling_convention calling_conv;
195
196         switch (stage) {
197         case MESA_SHADER_VERTEX:
198         case MESA_SHADER_TESS_EVAL:
199                 calling_conv = RADEON_LLVM_AMDGPU_VS;
200                 break;
201         case MESA_SHADER_GEOMETRY:
202                 calling_conv = RADEON_LLVM_AMDGPU_GS;
203                 break;
204         case MESA_SHADER_TESS_CTRL:
205                 calling_conv = HAVE_LLVM >= 0x0500 ? RADEON_LLVM_AMDGPU_HS : RADEON_LLVM_AMDGPU_VS;
206                 break;
207         case MESA_SHADER_FRAGMENT:
208                 calling_conv = RADEON_LLVM_AMDGPU_PS;
209                 break;
210         case MESA_SHADER_COMPUTE:
211                 calling_conv = RADEON_LLVM_AMDGPU_CS;
212                 break;
213         default:
214                 unreachable("Unhandle shader type");
215         }
216
217         LLVMSetFunctionCallConv(func, calling_conv);
218 }
219
220 #define MAX_ARGS 23
221 struct arg_info {
222         LLVMTypeRef types[MAX_ARGS];
223         LLVMValueRef *assign[MAX_ARGS];
224         unsigned array_params_mask;
225         uint8_t count;
226         uint8_t sgpr_count;
227         uint8_t num_sgprs_used;
228         uint8_t num_vgprs_used;
229 };
230
/* Register file an entry-point argument is passed in; consumed by add_arg(). */
enum ac_arg_regfile {
        ARG_SGPR, /* counted in sgpr_count / num_sgprs_used */
        ARG_VGPR, /* counted in num_vgprs_used */
};
235
236 static void
237 add_arg(struct arg_info *info, enum ac_arg_regfile regfile, LLVMTypeRef type,
238         LLVMValueRef *param_ptr)
239 {
240         assert(info->count < MAX_ARGS);
241
242         info->assign[info->count] = param_ptr;
243         info->types[info->count] = type;
244         info->count++;
245
246         if (regfile == ARG_SGPR) {
247                 info->num_sgprs_used += ac_get_type_size(type) / 4;
248                 info->sgpr_count++;
249         } else {
250                 assert(regfile == ARG_VGPR);
251                 info->num_vgprs_used += ac_get_type_size(type) / 4;
252         }
253 }
254
255 static inline void
256 add_array_arg(struct arg_info *info, LLVMTypeRef type, LLVMValueRef *param_ptr)
257 {
258         info->array_params_mask |= (1 << info->count);
259         add_arg(info, ARG_SGPR, type, param_ptr);
260 }
261
262 static void assign_arguments(LLVMValueRef main_function,
263                              struct arg_info *info)
264 {
265         unsigned i;
266         for (i = 0; i < info->count; i++) {
267                 if (info->assign[i])
268                         *info->assign[i] = LLVMGetParam(main_function, i);
269         }
270 }
271
272 static LLVMValueRef
273 create_llvm_function(LLVMContextRef ctx, LLVMModuleRef module,
274                      LLVMBuilderRef builder, LLVMTypeRef *return_types,
275                      unsigned num_return_elems,
276                      struct arg_info *args,
277                      unsigned max_workgroup_size,
278                      bool unsafe_math)
279 {
280         LLVMTypeRef main_function_type, ret_type;
281         LLVMBasicBlockRef main_function_body;
282
283         if (num_return_elems)
284                 ret_type = LLVMStructTypeInContext(ctx, return_types,
285                                                    num_return_elems, true);
286         else
287                 ret_type = LLVMVoidTypeInContext(ctx);
288
289         /* Setup the function */
290         main_function_type =
291             LLVMFunctionType(ret_type, args->types, args->count, 0);
292         LLVMValueRef main_function =
293             LLVMAddFunction(module, "main", main_function_type);
294         main_function_body =
295             LLVMAppendBasicBlockInContext(ctx, main_function, "main_body");
296         LLVMPositionBuilderAtEnd(builder, main_function_body);
297
298         LLVMSetFunctionCallConv(main_function, RADEON_LLVM_AMDGPU_CS);
299         for (unsigned i = 0; i < args->sgpr_count; ++i) {
300                 ac_add_function_attr(ctx, main_function, i + 1, AC_FUNC_ATTR_INREG);
301
302                 if (args->array_params_mask & (1 << i)) {
303                         LLVMValueRef P = LLVMGetParam(main_function, i);
304                         ac_add_function_attr(ctx, main_function, i + 1, AC_FUNC_ATTR_NOALIAS);
305                         ac_add_attr_dereferenceable(P, UINT64_MAX);
306                 }
307         }
308
309         if (max_workgroup_size) {
310                 ac_llvm_add_target_dep_function_attr(main_function,
311                                                      "amdgpu-max-work-group-size",
312                                                      max_workgroup_size);
313         }
314         if (unsafe_math) {
315                 /* These were copied from some LLVM test. */
316                 LLVMAddTargetDependentFunctionAttr(main_function,
317                                                    "less-precise-fpmad",
318                                                    "true");
319                 LLVMAddTargetDependentFunctionAttr(main_function,
320                                                    "no-infs-fp-math",
321                                                    "true");
322                 LLVMAddTargetDependentFunctionAttr(main_function,
323                                                    "no-nans-fp-math",
324                                                    "true");
325                 LLVMAddTargetDependentFunctionAttr(main_function,
326                                                    "unsafe-fp-math",
327                                                    "true");
328                 LLVMAddTargetDependentFunctionAttr(main_function,
329                                            "no-signed-zeros-fp-math",
330                                            "true");
331         }
332         return main_function;
333 }
334
335 static int get_elem_bits(struct ac_llvm_context *ctx, LLVMTypeRef type)
336 {
337         if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
338                 type = LLVMGetElementType(type);
339
340         if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind)
341                 return LLVMGetIntTypeWidth(type);
342
343         if (type == ctx->f16)
344                 return 16;
345         if (type == ctx->f32)
346                 return 32;
347         if (type == ctx->f64)
348                 return 64;
349
350         unreachable("Unhandled type kind in get_elem_bits");
351 }
352
353 static LLVMValueRef unpack_param(struct ac_llvm_context *ctx,
354                                  LLVMValueRef param, unsigned rshift,
355                                  unsigned bitwidth)
356 {
357         LLVMValueRef value = param;
358         if (rshift)
359                 value = LLVMBuildLShr(ctx->builder, value,
360                                       LLVMConstInt(ctx->i32, rshift, false), "");
361
362         if (rshift + bitwidth < 32) {
363                 unsigned mask = (1 << bitwidth) - 1;
364                 value = LLVMBuildAnd(ctx->builder, value,
365                                      LLVMConstInt(ctx->i32, mask, false), "");
366         }
367         return value;
368 }
369
370 static LLVMValueRef get_rel_patch_id(struct nir_to_llvm_context *ctx)
371 {
372         switch (ctx->stage) {
373         case MESA_SHADER_TESS_CTRL:
374                 return unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 0, 8);
375         case MESA_SHADER_TESS_EVAL:
376                 return ctx->tes_rel_patch_id;
377                 break;
378         default:
379                 unreachable("Illegal stage");
380         }
381 }
382
383 /* Tessellation shaders pass outputs to the next shader using LDS.
384  *
385  * LS outputs = TCS inputs
386  * TCS outputs = TES inputs
387  *
388  * The LDS layout is:
389  * - TCS inputs for patch 0
390  * - TCS inputs for patch 1
391  * - TCS inputs for patch 2             = get_tcs_in_current_patch_offset (if RelPatchID==2)
392  * - ...
393  * - TCS outputs for patch 0            = get_tcs_out_patch0_offset
394  * - Per-patch TCS outputs for patch 0  = get_tcs_out_patch0_patch_data_offset
395  * - TCS outputs for patch 1
396  * - Per-patch TCS outputs for patch 1
397  * - TCS outputs for patch 2            = get_tcs_out_current_patch_offset (if RelPatchID==2)
398  * - Per-patch TCS outputs for patch 2  = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
399  * - ...
400  *
401  * All three shaders VS(LS), TCS, TES share the same LDS space.
402  */
403 static LLVMValueRef
404 get_tcs_in_patch_stride(struct nir_to_llvm_context *ctx)
405 {
406         if (ctx->stage == MESA_SHADER_VERTEX)
407                 return unpack_param(&ctx->ac, ctx->ls_out_layout, 0, 13);
408         else if (ctx->stage == MESA_SHADER_TESS_CTRL)
409                 return unpack_param(&ctx->ac, ctx->tcs_in_layout, 0, 13);
410         else {
411                 assert(0);
412                 return NULL;
413         }
414 }
415
416 static LLVMValueRef
417 get_tcs_out_patch_stride(struct nir_to_llvm_context *ctx)
418 {
419         return unpack_param(&ctx->ac, ctx->tcs_out_layout, 0, 13);
420 }
421
422 static LLVMValueRef
423 get_tcs_out_patch0_offset(struct nir_to_llvm_context *ctx)
424 {
425         return LLVMBuildMul(ctx->builder,
426                             unpack_param(&ctx->ac, ctx->tcs_out_offsets, 0, 16),
427                             LLVMConstInt(ctx->ac.i32, 4, false), "");
428 }
429
430 static LLVMValueRef
431 get_tcs_out_patch0_patch_data_offset(struct nir_to_llvm_context *ctx)
432 {
433         return LLVMBuildMul(ctx->builder,
434                             unpack_param(&ctx->ac, ctx->tcs_out_offsets, 16, 16),
435                             LLVMConstInt(ctx->ac.i32, 4, false), "");
436 }
437
438 static LLVMValueRef
439 get_tcs_in_current_patch_offset(struct nir_to_llvm_context *ctx)
440 {
441         LLVMValueRef patch_stride = get_tcs_in_patch_stride(ctx);
442         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
443
444         return LLVMBuildMul(ctx->builder, patch_stride, rel_patch_id, "");
445 }
446
447 static LLVMValueRef
448 get_tcs_out_current_patch_offset(struct nir_to_llvm_context *ctx)
449 {
450         LLVMValueRef patch0_offset = get_tcs_out_patch0_offset(ctx);
451         LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
452         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
453
454         return LLVMBuildAdd(ctx->builder, patch0_offset,
455                             LLVMBuildMul(ctx->builder, patch_stride,
456                                          rel_patch_id, ""),
457                             "");
458 }
459
460 static LLVMValueRef
461 get_tcs_out_current_patch_data_offset(struct nir_to_llvm_context *ctx)
462 {
463         LLVMValueRef patch0_patch_data_offset =
464                 get_tcs_out_patch0_patch_data_offset(ctx);
465         LLVMValueRef patch_stride = get_tcs_out_patch_stride(ctx);
466         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
467
468         return LLVMBuildAdd(ctx->builder, patch0_patch_data_offset,
469                             LLVMBuildMul(ctx->builder, patch_stride,
470                                          rel_patch_id, ""),
471                             "");
472 }
473
474 static void
475 set_loc(struct ac_userdata_info *ud_info, uint8_t *sgpr_idx, uint8_t num_sgprs,
476         uint32_t indirect_offset)
477 {
478         ud_info->sgpr_idx = *sgpr_idx;
479         ud_info->num_sgprs = num_sgprs;
480         ud_info->indirect = indirect_offset > 0;
481         ud_info->indirect_offset = indirect_offset;
482         *sgpr_idx += num_sgprs;
483 }
484
485 static void
486 set_loc_shader(struct nir_to_llvm_context *ctx, int idx, uint8_t *sgpr_idx,
487                uint8_t num_sgprs)
488 {
489         struct ac_userdata_info *ud_info =
490                 &ctx->shader_info->user_sgprs_locs.shader_data[idx];
491         assert(ud_info);
492
493         set_loc(ud_info, sgpr_idx, num_sgprs, 0);
494 }
495
496 static void
497 set_loc_desc(struct nir_to_llvm_context *ctx, int idx,  uint8_t *sgpr_idx,
498              uint32_t indirect_offset)
499 {
500         struct ac_userdata_info *ud_info =
501                 &ctx->shader_info->user_sgprs_locs.descriptor_sets[idx];
502         assert(ud_info);
503
504         set_loc(ud_info, sgpr_idx, 2, indirect_offset);
505 }
506
/* Result of allocate_user_sgprs(). */
struct user_sgpr_info {
        /* A ring-offsets pointer is required for this shader. */
        bool need_ring_offsets;
        /* Number of user SGPRs counted for this shader. */
        uint8_t sgpr_count;
        /* Descriptor sets did not fit in the remaining SGPRs and are
         * reached through a single pointer instead.
         */
        bool indirect_all_descriptor_sets;
};
512
513 static bool needs_view_index_sgpr(struct nir_to_llvm_context *ctx,
514                                   gl_shader_stage stage)
515 {
516         switch (stage) {
517         case MESA_SHADER_VERTEX:
518                 if (ctx->shader_info->info.needs_multiview_view_index ||
519                     (!ctx->options->key.vs.as_es && !ctx->options->key.vs.as_ls && ctx->options->key.has_multiview_view_index))
520                         return true;
521                 break;
522         case MESA_SHADER_TESS_EVAL:
523                 if (ctx->shader_info->info.needs_multiview_view_index || (!ctx->options->key.tes.as_es && ctx->options->key.has_multiview_view_index))
524                         return true;
525                 break;
526         case MESA_SHADER_GEOMETRY:
527         case MESA_SHADER_TESS_CTRL:
528                 if (ctx->shader_info->info.needs_multiview_view_index)
529                         return true;
530                 break;
531         default:
532                 break;
533         }
534         return false;
535 }
536
537 static void allocate_user_sgprs(struct nir_to_llvm_context *ctx,
538                                 gl_shader_stage stage,
539                                 bool needs_view_index,
540                                 struct user_sgpr_info *user_sgpr_info)
541 {
542         memset(user_sgpr_info, 0, sizeof(struct user_sgpr_info));
543
544         /* until we sort out scratch/global buffers always assign ring offsets for gs/vs/es */
545         if (stage == MESA_SHADER_GEOMETRY ||
546             stage == MESA_SHADER_VERTEX ||
547             stage == MESA_SHADER_TESS_CTRL ||
548             stage == MESA_SHADER_TESS_EVAL ||
549             ctx->is_gs_copy_shader)
550                 user_sgpr_info->need_ring_offsets = true;
551
552         if (stage == MESA_SHADER_FRAGMENT &&
553             ctx->shader_info->info.ps.needs_sample_positions)
554                 user_sgpr_info->need_ring_offsets = true;
555
556         /* 2 user sgprs will nearly always be allocated for scratch/rings */
557         if (ctx->options->supports_spill || user_sgpr_info->need_ring_offsets) {
558                 user_sgpr_info->sgpr_count += 2;
559         }
560
561         /* FIXME: fix the number of user sgprs for merged shaders on GFX9 */
562         switch (stage) {
563         case MESA_SHADER_COMPUTE:
564                 if (ctx->shader_info->info.cs.uses_grid_size)
565                         user_sgpr_info->sgpr_count += 3;
566                 break;
567         case MESA_SHADER_FRAGMENT:
568                 user_sgpr_info->sgpr_count += ctx->shader_info->info.ps.needs_sample_positions;
569                 break;
570         case MESA_SHADER_VERTEX:
571                 if (!ctx->is_gs_copy_shader) {
572                         user_sgpr_info->sgpr_count += ctx->shader_info->info.vs.has_vertex_buffers ? 2 : 0;
573                         if (ctx->shader_info->info.vs.needs_draw_id) {
574                                 user_sgpr_info->sgpr_count += 3;
575                         } else {
576                                 user_sgpr_info->sgpr_count += 2;
577                         }
578                 }
579                 if (ctx->options->key.vs.as_ls)
580                         user_sgpr_info->sgpr_count++;
581                 break;
582         case MESA_SHADER_TESS_CTRL:
583                 user_sgpr_info->sgpr_count += 4;
584                 break;
585         case MESA_SHADER_TESS_EVAL:
586                 user_sgpr_info->sgpr_count += 1;
587                 break;
588         case MESA_SHADER_GEOMETRY:
589                 user_sgpr_info->sgpr_count += 2;
590                 break;
591         default:
592                 break;
593         }
594
595         if (needs_view_index)
596                 user_sgpr_info->sgpr_count++;
597
598         if (ctx->shader_info->info.loads_push_constants)
599                 user_sgpr_info->sgpr_count += 2;
600
601         uint32_t available_sgprs = ctx->options->chip_class >= GFX9 ? 32 : 16;
602         uint32_t remaining_sgprs = available_sgprs - user_sgpr_info->sgpr_count;
603
604         if (remaining_sgprs / 2 < util_bitcount(ctx->shader_info->info.desc_set_used_mask)) {
605                 user_sgpr_info->sgpr_count += 2;
606                 user_sgpr_info->indirect_all_descriptor_sets = true;
607         } else {
608                 user_sgpr_info->sgpr_count += util_bitcount(ctx->shader_info->info.desc_set_used_mask) * 2;
609         }
610 }
611
612 static void
613 declare_global_input_sgprs(struct nir_to_llvm_context *ctx,
614                            gl_shader_stage stage,
615                            bool has_previous_stage,
616                            gl_shader_stage previous_stage,
617                            const struct user_sgpr_info *user_sgpr_info,
618                            struct arg_info *args,
619                            LLVMValueRef *desc_sets)
620 {
621         LLVMTypeRef type = ac_array_in_const_addr_space(ctx->ac.i8);
622         unsigned num_sets = ctx->options->layout ?
623                             ctx->options->layout->num_sets : 0;
624         unsigned stage_mask = 1 << stage;
625
626         if (has_previous_stage)
627                 stage_mask |= 1 << previous_stage;
628
629         /* 1 for each descriptor set */
630         if (!user_sgpr_info->indirect_all_descriptor_sets) {
631                 for (unsigned i = 0; i < num_sets; ++i) {
632                         if (ctx->options->layout->set[i].layout->shader_stages & stage_mask) {
633                                 add_array_arg(args, type,
634                                               &ctx->descriptor_sets[i]);
635                         }
636                 }
637         } else {
638                 add_array_arg(args, ac_array_in_const_addr_space(type), desc_sets);
639         }
640
641         if (ctx->shader_info->info.loads_push_constants) {
642                 /* 1 for push constants and dynamic descriptors */
643                 add_array_arg(args, type, &ctx->push_constants);
644         }
645 }
646
647 static void
648 declare_vs_specific_input_sgprs(struct nir_to_llvm_context *ctx,
649                                 gl_shader_stage stage,
650                                 bool has_previous_stage,
651                                 gl_shader_stage previous_stage,
652                                 struct arg_info *args)
653 {
654         if (!ctx->is_gs_copy_shader &&
655             (stage == MESA_SHADER_VERTEX ||
656              (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) {
657                 if (ctx->shader_info->info.vs.has_vertex_buffers) {
658                         add_arg(args, ARG_SGPR, ac_array_in_const_addr_space(ctx->ac.v4i32),
659                                 &ctx->vertex_buffers);
660                 }
661                 add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.base_vertex);
662                 add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.start_instance);
663                 if (ctx->shader_info->info.vs.needs_draw_id) {
664                         add_arg(args, ARG_SGPR, ctx->ac.i32, &ctx->abi.draw_id);
665                 }
666         }
667 }
668
669 static void
670 declare_vs_input_vgprs(struct nir_to_llvm_context *ctx, struct arg_info *args)
671 {
672         add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.vertex_id);
673         if (!ctx->is_gs_copy_shader) {
674                 if (ctx->options->key.vs.as_ls) {
675                         add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->rel_auto_id);
676                         add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
677                 } else {
678                         add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.instance_id);
679                         add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->vs_prim_id);
680                 }
681                 add_arg(args, ARG_VGPR, ctx->ac.i32, NULL); /* unused */
682         }
683 }
684
685 static void
686 declare_tes_input_vgprs(struct nir_to_llvm_context *ctx, struct arg_info *args)
687 {
688         add_arg(args, ARG_VGPR, ctx->ac.f32, &ctx->tes_u);
689         add_arg(args, ARG_VGPR, ctx->ac.f32, &ctx->tes_v);
690         add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->tes_rel_patch_id);
691         add_arg(args, ARG_VGPR, ctx->ac.i32, &ctx->abi.tes_patch_id);
692 }
693
694 static void
695 set_global_input_locs(struct nir_to_llvm_context *ctx, gl_shader_stage stage,
696                       bool has_previous_stage, gl_shader_stage previous_stage,
697                       const struct user_sgpr_info *user_sgpr_info,
698                       LLVMValueRef desc_sets, uint8_t *user_sgpr_idx)
699 {
700         unsigned num_sets = ctx->options->layout ?
701                             ctx->options->layout->num_sets : 0;
702         unsigned stage_mask = 1 << stage;
703
704         if (has_previous_stage)
705                 stage_mask |= 1 << previous_stage;
706
707         if (!user_sgpr_info->indirect_all_descriptor_sets) {
708                 for (unsigned i = 0; i < num_sets; ++i) {
709                         if (ctx->options->layout->set[i].layout->shader_stages & stage_mask) {
710                                 set_loc_desc(ctx, i, user_sgpr_idx, 0);
711                         } else
712                                 ctx->descriptor_sets[i] = NULL;
713                 }
714         } else {
715                 set_loc_shader(ctx, AC_UD_INDIRECT_DESCRIPTOR_SETS,
716                                user_sgpr_idx, 2);
717
718                 for (unsigned i = 0; i < num_sets; ++i) {
719                         if (ctx->options->layout->set[i].layout->shader_stages & stage_mask) {
720                                 set_loc_desc(ctx, i, user_sgpr_idx, i * 8);
721                                 ctx->descriptor_sets[i] =
722                                         ac_build_load_to_sgpr(&ctx->ac,
723                                                               desc_sets,
724                                                               LLVMConstInt(ctx->ac.i32, i, false));
725
726                         } else
727                                 ctx->descriptor_sets[i] = NULL;
728                 }
729                 ctx->shader_info->need_indirect_descriptor_sets = true;
730         }
731
732         if (ctx->shader_info->info.loads_push_constants) {
733                 set_loc_shader(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx, 2);
734         }
735 }
736
737 static void
738 set_vs_specific_input_locs(struct nir_to_llvm_context *ctx,
739                            gl_shader_stage stage, bool has_previous_stage,
740                            gl_shader_stage previous_stage,
741                            uint8_t *user_sgpr_idx)
742 {
743         if (!ctx->is_gs_copy_shader &&
744             (stage == MESA_SHADER_VERTEX ||
745              (has_previous_stage && previous_stage == MESA_SHADER_VERTEX))) {
746                 if (ctx->shader_info->info.vs.has_vertex_buffers) {
747                         set_loc_shader(ctx, AC_UD_VS_VERTEX_BUFFERS,
748                                        user_sgpr_idx, 2);
749                 }
750
751                 unsigned vs_num = 2;
752                 if (ctx->shader_info->info.vs.needs_draw_id)
753                         vs_num++;
754
755                 set_loc_shader(ctx, AC_UD_VS_BASE_VERTEX_START_INSTANCE,
756                                user_sgpr_idx, vs_num);
757         }
758 }
759
/*
 * Create the main LLVM function of the shader and describe its inputs.
 *
 * The function works in three phases:
 *   1. Build the argument list (args) for the given stage with add_arg()
 *      and the declare_*() helpers (first switch).
 *   2. Create the LLVM function from that list, set its calling convention,
 *      store the input SGPR/VGPR counts in ctx->shader_info and bind the
 *      arguments with assign_arguments().
 *   3. Record the user-data locations with set_loc_shader() and the
 *      set_*_locs() helpers (second switch), and store the final index in
 *      ctx->shader_info->num_user_sgprs.
 *
 * stage               shader stage being compiled.
 * has_previous_stage  selects the alternative argument layouts of the
 *                     TESS_CTRL and GEOMETRY cases and restarts the user
 *                     SGPR index at 0 (see the comment in the body).
 * previous_stage      only consulted when has_previous_stage is set.
 *
 * NOTE(review): the user-data entries of the second switch appear to follow
 * the same order as the corresponding SGPR arguments of the first switch;
 * keep both in sync when changing either -- confirm.
 */
760 static void create_function(struct nir_to_llvm_context *ctx,
761                             gl_shader_stage stage,
762                             bool has_previous_stage,
763                             gl_shader_stage previous_stage)
764 {
765         uint8_t user_sgpr_idx;
766         struct user_sgpr_info user_sgpr_info;
767         struct arg_info args = {};
768         LLVMValueRef desc_sets;
769         bool needs_view_index = needs_view_index_sgpr(ctx, stage);
770         allocate_user_sgprs(ctx, stage, needs_view_index, &user_sgpr_info);
771
            /* Without spill support the ring offsets are an explicit SGPR
             * argument; with spill support they are obtained further down
             * through llvm.amdgcn.implicit.buffer.ptr instead. */
772         if (user_sgpr_info.need_ring_offsets && !ctx->options->supports_spill) {
773                 add_arg(&args, ARG_SGPR, ac_array_in_const_addr_space(ctx->ac.v4i32),
774                         &ctx->ring_offsets);
775         }
776
            /* Phase 1: declare the per-stage arguments. */
777         switch (stage) {
778         case MESA_SHADER_COMPUTE:
779                 declare_global_input_sgprs(ctx, stage, has_previous_stage,
780                                            previous_stage, &user_sgpr_info,
781                                            &args, &desc_sets);
782
783                 if (ctx->shader_info->info.cs.uses_grid_size) {
784                         add_arg(&args, ARG_SGPR, ctx->ac.v3i32,
785                                 &ctx->num_work_groups);
786                 }
787
                    /* Only the workgroup id components that are used get an
                     * argument; the others stay NULL. */
788                 for (int i = 0; i < 3; i++) {
789                         ctx->workgroup_ids[i] = NULL;
790                         if (ctx->shader_info->info.cs.uses_block_id[i]) {
791                                 add_arg(&args, ARG_SGPR, ctx->ac.i32,
792                                         &ctx->workgroup_ids[i]);
793                         }
794                 }
795
796                 if (ctx->shader_info->info.cs.uses_local_invocation_idx)
797                         add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->tg_size);
798                 add_arg(&args, ARG_VGPR, ctx->ac.v3i32,
799                         &ctx->local_invocation_ids);
800                 break;
801         case MESA_SHADER_VERTEX:
802                 declare_global_input_sgprs(ctx, stage, has_previous_stage,
803                                            previous_stage, &user_sgpr_info,
804                                            &args, &desc_sets);
805                 declare_vs_specific_input_sgprs(ctx, stage, has_previous_stage,
806                                                 previous_stage, &args);
807
808                 if (needs_view_index)
809                         add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->view_index);
810                 if (ctx->options->key.vs.as_es)
811                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
812                                 &ctx->es2gs_offset);
813                 else if (ctx->options->key.vs.as_ls)
814                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
815                                 &ctx->ls_out_layout);
816
817                 declare_vs_input_vgprs(ctx, &args);
818                 break;
819         case MESA_SHADER_TESS_CTRL:
820                 if (has_previous_stage) {
821                         // First 6 system regs
822                         add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds);
823                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
824                                 &ctx->merged_wave_info);
825                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
826                                 &ctx->tess_factor_offset);
827
828                         add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // scratch offset
829                         add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown
830                         add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown
831
832                         declare_global_input_sgprs(ctx, stage,
833                                                    has_previous_stage,
834                                                    previous_stage,
835                                                    &user_sgpr_info, &args,
836                                                    &desc_sets);
837                         declare_vs_specific_input_sgprs(ctx, stage,
838                                                         has_previous_stage,
839                                                         previous_stage, &args);
840
841                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
842                                 &ctx->ls_out_layout);
843
844                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
845                                 &ctx->tcs_offchip_layout);
846                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
847                                 &ctx->tcs_out_offsets);
848                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
849                                 &ctx->tcs_out_layout);
850                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
851                                 &ctx->tcs_in_layout);
852                         if (needs_view_index)
853                                 add_arg(&args, ARG_SGPR, ctx->ac.i32,
854                                         &ctx->view_index);
855
856                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
857                                 &ctx->abi.tcs_patch_id);
858                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
859                                 &ctx->abi.tcs_rel_ids);
860
861                         declare_vs_input_vgprs(ctx, &args);
862                 } else {
863                         declare_global_input_sgprs(ctx, stage,
864                                                    has_previous_stage,
865                                                    previous_stage,
866                                                    &user_sgpr_info, &args,
867                                                    &desc_sets);
868
869                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
870                                 &ctx->tcs_offchip_layout);
871                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
872                                 &ctx->tcs_out_offsets);
873                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
874                                 &ctx->tcs_out_layout);
875                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
876                                 &ctx->tcs_in_layout);
877                         if (needs_view_index)
878                                 add_arg(&args, ARG_SGPR, ctx->ac.i32,
879                                         &ctx->view_index);
880
881                         add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds);
882                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
883                                 &ctx->tess_factor_offset);
884                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
885                                 &ctx->abi.tcs_patch_id);
886                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
887                                 &ctx->abi.tcs_rel_ids);
888                 }
889                 break;
890         case MESA_SHADER_TESS_EVAL:
891                 declare_global_input_sgprs(ctx, stage, has_previous_stage,
892                                            previous_stage, &user_sgpr_info,
893                                            &args, &desc_sets);
894
895                 add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->tcs_offchip_layout);
896                 if (needs_view_index)
897                         add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->view_index);
898
                    /* The position of oc_lds among the trailing SGPRs depends
                     * on whether the TES is compiled as ES; the NULL entries
                     * are arguments that are declared but not stored. */
899                 if (ctx->options->key.tes.as_es) {
900                         add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds);
901                         add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL);
902                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
903                                 &ctx->es2gs_offset);
904                 } else {
905                         add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL);
906                         add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds);
907                 }
908                 declare_tes_input_vgprs(ctx, &args);
909                 break;
910         case MESA_SHADER_GEOMETRY:
911                 if (has_previous_stage) {
912                         // First 6 system regs
913                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
914                                 &ctx->gs2vs_offset);
915                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
916                                 &ctx->merged_wave_info);
917                         add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->oc_lds);
918
919                         add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // scratch offset
920                         add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown
921                         add_arg(&args, ARG_SGPR, ctx->ac.i32, NULL); // unknown
922
923                         declare_global_input_sgprs(ctx, stage,
924                                                    has_previous_stage,
925                                                    previous_stage,
926                                                    &user_sgpr_info, &args,
927                                                    &desc_sets);
928
929                         if (previous_stage == MESA_SHADER_TESS_EVAL) {
930                                 add_arg(&args, ARG_SGPR, ctx->ac.i32,
931                                         &ctx->tcs_offchip_layout);
932                         } else {
933                                 declare_vs_specific_input_sgprs(ctx, stage,
934                                                                 has_previous_stage,
935                                                                 previous_stage,
936                                                                 &args);
937                         }
938
939                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
940                                 &ctx->gsvs_ring_stride);
941                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
942                                 &ctx->gsvs_num_entries);
943                         if (needs_view_index)
944                                 add_arg(&args, ARG_SGPR, ctx->ac.i32,
945                                         &ctx->view_index);
946
                            /* Only gs_vtx_offset[0], [2] and [4] receive an
                             * argument in this layout. */
947                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
948                                 &ctx->gs_vtx_offset[0]);
949                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
950                                 &ctx->gs_vtx_offset[2]);
951                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
952                                 &ctx->abi.gs_prim_id);
953                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
954                                 &ctx->abi.gs_invocation_id);
955                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
956                                 &ctx->gs_vtx_offset[4]);
957
958                         if (previous_stage == MESA_SHADER_VERTEX) {
959                                 declare_vs_input_vgprs(ctx, &args);
960                         } else {
961                                 declare_tes_input_vgprs(ctx, &args);
962                         }
963                 } else {
964                         declare_global_input_sgprs(ctx, stage,
965                                                    has_previous_stage,
966                                                    previous_stage,
967                                                    &user_sgpr_info, &args,
968                                                    &desc_sets);
969
970                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
971                                 &ctx->gsvs_ring_stride);
972                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
973                                 &ctx->gsvs_num_entries);
974                         if (needs_view_index)
975                                 add_arg(&args, ARG_SGPR, ctx->ac.i32,
976                                         &ctx->view_index);
977
978                         add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->gs2vs_offset);
979                         add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->gs_wave_id);
980                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
981                                 &ctx->gs_vtx_offset[0]);
982                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
983                                 &ctx->gs_vtx_offset[1]);
984                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
985                                 &ctx->abi.gs_prim_id);
986                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
987                                 &ctx->gs_vtx_offset[2]);
988                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
989                                 &ctx->gs_vtx_offset[3]);
990                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
991                                 &ctx->gs_vtx_offset[4]);
992                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
993                                 &ctx->gs_vtx_offset[5]);
994                         add_arg(&args, ARG_VGPR, ctx->ac.i32,
995                                 &ctx->abi.gs_invocation_id);
996                 }
997                 break;
998         case MESA_SHADER_FRAGMENT:
999                 declare_global_input_sgprs(ctx, stage, has_previous_stage,
1000                                            previous_stage, &user_sgpr_info,
1001                                            &args, &desc_sets);
1002
1003                 if (ctx->shader_info->info.ps.needs_sample_positions)
1004                         add_arg(&args, ARG_SGPR, ctx->ac.i32,
1005                                 &ctx->sample_pos_offset);
1006
1007                 add_arg(&args, ARG_SGPR, ctx->ac.i32, &ctx->abi.prim_mask);
1008                 add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->persp_sample);
1009                 add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->persp_center);
1010                 add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->persp_centroid);
1011                 add_arg(&args, ARG_VGPR, ctx->ac.v3i32, NULL); /* persp pull model */
1012                 add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->linear_sample);
1013                 add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->linear_center);
1014                 add_arg(&args, ARG_VGPR, ctx->ac.v2i32, &ctx->linear_centroid);
1015                 add_arg(&args, ARG_VGPR, ctx->ac.f32, NULL);  /* line stipple tex */
1016                 add_arg(&args, ARG_VGPR, ctx->ac.f32, &ctx->abi.frag_pos[0]);
1017                 add_arg(&args, ARG_VGPR, ctx->ac.f32, &ctx->abi.frag_pos[1]);
1018                 add_arg(&args, ARG_VGPR, ctx->ac.f32, &ctx->abi.frag_pos[2]);
1019                 add_arg(&args, ARG_VGPR, ctx->ac.f32, &ctx->abi.frag_pos[3]);
1020                 add_arg(&args, ARG_VGPR, ctx->ac.i32, &ctx->abi.front_face);
1021                 add_arg(&args, ARG_VGPR, ctx->ac.i32, &ctx->abi.ancillary);
1022                 add_arg(&args, ARG_VGPR, ctx->ac.i32, &ctx->abi.sample_coverage);
1023                 add_arg(&args, ARG_VGPR, ctx->ac.i32, NULL);  /* fixed pt */
1024                 break;
1025         default:
1026                 unreachable("Shader stage not implemented");
1027         }
1028
             /* Phase 2: create the function from the collected arguments. */
1029         ctx->main_function = create_llvm_function(
1030             ctx->context, ctx->module, ctx->builder, NULL, 0, &args,
1031             ctx->max_workgroup_size,
1032             ctx->options->unsafe_math);
1033         set_llvm_calling_convention(ctx->main_function, stage);
1034
1035
             /* NOTE(review): the initial 2 SGPRs with supports_spill presumably
              * account for the implicit buffer pointer, which is not part of
              * args -- confirm. */
1036         ctx->shader_info->num_input_vgprs = 0;
1037         ctx->shader_info->num_input_sgprs = ctx->options->supports_spill ? 2 : 0;
1038
1039         ctx->shader_info->num_input_sgprs += args.num_sgprs_used;
1040
             /* NOTE(review): this tests ctx->stage rather than the 'stage'
              * parameter -- confirm both always agree here. */
1041         if (ctx->stage != MESA_SHADER_FRAGMENT)
1042                 ctx->shader_info->num_input_vgprs = args.num_vgprs_used;
1043
1044         assign_arguments(ctx->main_function, &args);
1045
             /* Phase 3: record the user-data SGPR locations. */
1046         user_sgpr_idx = 0;
1047
1048         if (ctx->options->supports_spill || user_sgpr_info.need_ring_offsets) {
1049                 set_loc_shader(ctx, AC_UD_SCRATCH_RING_OFFSETS,
1050                                &user_sgpr_idx, 2);
1051                 if (ctx->options->supports_spill) {
1052                         ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr",
1053                                                                LLVMPointerType(ctx->ac.i8, AC_CONST_ADDR_SPACE),
1054                                                                NULL, 0, AC_FUNC_ATTR_READNONE);
1055                         ctx->ring_offsets = LLVMBuildBitCast(ctx->builder, ctx->ring_offsets,
1056                                                              ac_array_in_const_addr_space(ctx->ac.v4i32), "");
1057                 }
1058         }
1059         
1060         /* For merged shaders the user SGPRs start at 8, with 8 system SGPRs in front (including
1061          * the rw_buffers at s0/s1). With user SGPR0 = s8, let's restart the count from 0. */
1062         if (has_previous_stage)
1063                 user_sgpr_idx = 0;
1064
1065         set_global_input_locs(ctx, stage, has_previous_stage, previous_stage,
1066                               &user_sgpr_info, desc_sets, &user_sgpr_idx);
1067
1068         switch (stage) {
1069         case MESA_SHADER_COMPUTE:
1070                 if (ctx->shader_info->info.cs.uses_grid_size) {
1071                         set_loc_shader(ctx, AC_UD_CS_GRID_SIZE,
1072                                        &user_sgpr_idx, 3);
1073                 }
1074                 break;
1075         case MESA_SHADER_VERTEX:
1076                 set_vs_specific_input_locs(ctx, stage, has_previous_stage,
1077                                            previous_stage, &user_sgpr_idx);
1078                 if (ctx->view_index)
1079                         set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
1080                 if (ctx->options->key.vs.as_ls) {
1081                         set_loc_shader(ctx, AC_UD_VS_LS_TCS_IN_LAYOUT,
1082                                        &user_sgpr_idx, 1);
1083                 }
1084                 if (ctx->options->key.vs.as_ls)
1085                         ac_declare_lds_as_pointer(&ctx->ac);
1086                 break;
1087         case MESA_SHADER_TESS_CTRL:
1088                 set_vs_specific_input_locs(ctx, stage, has_previous_stage,
1089                                            previous_stage, &user_sgpr_idx);
1090                 if (has_previous_stage)
1091                         set_loc_shader(ctx, AC_UD_VS_LS_TCS_IN_LAYOUT,
1092                                        &user_sgpr_idx, 1);
                     /* A single 4-SGPR entry covers the four consecutive
                      * tcs_offchip_layout / tcs_out_offsets / tcs_out_layout /
                      * tcs_in_layout arguments declared above. */
1093                 set_loc_shader(ctx, AC_UD_TCS_OFFCHIP_LAYOUT, &user_sgpr_idx, 4);
1094                 if (ctx->view_index)
1095                         set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
1096                 ac_declare_lds_as_pointer(&ctx->ac);
1097                 break;
1098         case MESA_SHADER_TESS_EVAL:
1099                 set_loc_shader(ctx, AC_UD_TES_OFFCHIP_LAYOUT, &user_sgpr_idx, 1);
1100                 if (ctx->view_index)
1101                         set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
1102                 break;
1103         case MESA_SHADER_GEOMETRY:
1104                 if (has_previous_stage) {
1105                         if (previous_stage == MESA_SHADER_VERTEX)
1106                                 set_vs_specific_input_locs(ctx, stage,
1107                                                            has_previous_stage,
1108                                                            previous_stage,
1109                                                            &user_sgpr_idx);
1110                         else
1111                                 set_loc_shader(ctx, AC_UD_TES_OFFCHIP_LAYOUT,
1112                                                &user_sgpr_idx, 1);
1113                 }
                     /* One 2-SGPR entry for gsvs_ring_stride + gsvs_num_entries. */
1114                 set_loc_shader(ctx, AC_UD_GS_VS_RING_STRIDE_ENTRIES,
1115                                &user_sgpr_idx, 2);
1116                 if (ctx->view_index)
1117                         set_loc_shader(ctx, AC_UD_VIEW_INDEX, &user_sgpr_idx, 1);
1118                 if (has_previous_stage)
1119                         ac_declare_lds_as_pointer(&ctx->ac);
1120                 break;
1121         case MESA_SHADER_FRAGMENT:
1122                 if (ctx->shader_info->info.ps.needs_sample_positions) {
1123                         set_loc_shader(ctx, AC_UD_PS_SAMPLE_POS_OFFSET,
1124                                        &user_sgpr_idx, 1);
1125                 }
1126                 break;
1127         default:
1128                 unreachable("Shader stage not implemented");
1129         }
1130
1131         ctx->shader_info->num_user_sgprs = user_sgpr_idx;
1132 }
1133
1134 static LLVMValueRef trim_vector(struct ac_llvm_context *ctx,
1135                                 LLVMValueRef value, unsigned count)
1136 {
1137         unsigned num_components = ac_get_llvm_num_components(value);
1138         if (count == num_components)
1139                 return value;
1140
1141         LLVMValueRef masks[] = {
1142             LLVMConstInt(ctx->i32, 0, false), LLVMConstInt(ctx->i32, 1, false),
1143             LLVMConstInt(ctx->i32, 2, false), LLVMConstInt(ctx->i32, 3, false)};
1144
1145         if (count == 1)
1146                 return LLVMBuildExtractElement(ctx->builder, value, masks[0],
1147                                                "");
1148
1149         LLVMValueRef swizzle = LLVMConstVector(masks, count);
1150         return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, "");
1151 }
1152
1153 static void
1154 build_store_values_extended(struct ac_llvm_context *ac,
1155                              LLVMValueRef *values,
1156                              unsigned value_count,
1157                              unsigned value_stride,
1158                              LLVMValueRef vec)
1159 {
1160         LLVMBuilderRef builder = ac->builder;
1161         unsigned i;
1162
1163         for (i = 0; i < value_count; i++) {
1164                 LLVMValueRef ptr = values[i * value_stride];
1165                 LLVMValueRef index = LLVMConstInt(ac->i32, i, false);
1166                 LLVMValueRef value = LLVMBuildExtractElement(builder, vec, index, "");
1167                 LLVMBuildStore(builder, value, ptr);
1168         }
1169 }
1170
1171 static LLVMTypeRef get_def_type(struct ac_nir_context *ctx,
1172                                 const nir_ssa_def *def)
1173 {
1174         LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, def->bit_size);
1175         if (def->num_components > 1) {
1176                 type = LLVMVectorType(type, def->num_components);
1177         }
1178         return type;
1179 }
1180
1181 static LLVMValueRef get_src(struct ac_nir_context *nir, nir_src src)
1182 {
1183         assert(src.is_ssa);
1184         struct hash_entry *entry = _mesa_hash_table_search(nir->defs, src.ssa);
1185         return (LLVMValueRef)entry->data;
1186 }
1187
1188
1189 static LLVMBasicBlockRef get_block(struct ac_nir_context *nir,
1190                                    const struct nir_block *b)
1191 {
1192         struct hash_entry *entry = _mesa_hash_table_search(nir->defs, b);
1193         return (LLVMBasicBlockRef)entry->data;
1194 }
1195
1196 static LLVMValueRef get_alu_src(struct ac_nir_context *ctx,
1197                                 nir_alu_src src,
1198                                 unsigned num_components)
1199 {
1200         LLVMValueRef value = get_src(ctx, src.src);
1201         bool need_swizzle = false;
1202
1203         assert(value);
1204         LLVMTypeRef type = LLVMTypeOf(value);
1205         unsigned src_components = LLVMGetTypeKind(type) == LLVMVectorTypeKind
1206                                       ? LLVMGetVectorSize(type)
1207                                       : 1;
1208
1209         for (unsigned i = 0; i < num_components; ++i) {
1210                 assert(src.swizzle[i] < src_components);
1211                 if (src.swizzle[i] != i)
1212                         need_swizzle = true;
1213         }
1214
1215         if (need_swizzle || num_components != src_components) {
1216                 LLVMValueRef masks[] = {
1217                     LLVMConstInt(ctx->ac.i32, src.swizzle[0], false),
1218                     LLVMConstInt(ctx->ac.i32, src.swizzle[1], false),
1219                     LLVMConstInt(ctx->ac.i32, src.swizzle[2], false),
1220                     LLVMConstInt(ctx->ac.i32, src.swizzle[3], false)};
1221
1222                 if (src_components > 1 && num_components == 1) {
1223                         value = LLVMBuildExtractElement(ctx->ac.builder, value,
1224                                                         masks[0], "");
1225                 } else if (src_components == 1 && num_components > 1) {
1226                         LLVMValueRef values[] = {value, value, value, value};
1227                         value = ac_build_gather_values(&ctx->ac, values, num_components);
1228                 } else {
1229                         LLVMValueRef swizzle = LLVMConstVector(masks, num_components);
1230                         value = LLVMBuildShuffleVector(ctx->ac.builder, value, value,
1231                                                        swizzle, "");
1232                 }
1233         }
1234         assert(!src.negate);
1235         assert(!src.abs);
1236         return value;
1237 }
1238
1239 static LLVMValueRef emit_int_cmp(struct ac_llvm_context *ctx,
1240                                  LLVMIntPredicate pred, LLVMValueRef src0,
1241                                  LLVMValueRef src1)
1242 {
1243         LLVMValueRef result = LLVMBuildICmp(ctx->builder, pred, src0, src1, "");
1244         return LLVMBuildSelect(ctx->builder, result,
1245                                LLVMConstInt(ctx->i32, 0xFFFFFFFF, false),
1246                                ctx->i32_0, "");
1247 }
1248
1249 static LLVMValueRef emit_float_cmp(struct ac_llvm_context *ctx,
1250                                    LLVMRealPredicate pred, LLVMValueRef src0,
1251                                    LLVMValueRef src1)
1252 {
1253         LLVMValueRef result;
1254         src0 = ac_to_float(ctx, src0);
1255         src1 = ac_to_float(ctx, src1);
1256         result = LLVMBuildFCmp(ctx->builder, pred, src0, src1, "");
1257         return LLVMBuildSelect(ctx->builder, result,
1258                                LLVMConstInt(ctx->i32, 0xFFFFFFFF, false),
1259                                ctx->i32_0, "");
1260 }
1261
1262 static LLVMValueRef emit_intrin_1f_param(struct ac_llvm_context *ctx,
1263                                          const char *intrin,
1264                                          LLVMTypeRef result_type,
1265                                          LLVMValueRef src0)
1266 {
1267         char name[64];
1268         LLVMValueRef params[] = {
1269                 ac_to_float(ctx, src0),
1270         };
1271
1272         MAYBE_UNUSED const int length = snprintf(name, sizeof(name), "%s.f%d", intrin,
1273                                                  get_elem_bits(ctx, result_type));
1274         assert(length < sizeof(name));
1275         return ac_build_intrinsic(ctx, name, result_type, params, 1, AC_FUNC_ATTR_READNONE);
1276 }
1277
1278 static LLVMValueRef emit_intrin_2f_param(struct ac_llvm_context *ctx,
1279                                        const char *intrin,
1280                                        LLVMTypeRef result_type,
1281                                        LLVMValueRef src0, LLVMValueRef src1)
1282 {
1283         char name[64];
1284         LLVMValueRef params[] = {
1285                 ac_to_float(ctx, src0),
1286                 ac_to_float(ctx, src1),
1287         };
1288
1289         MAYBE_UNUSED const int length = snprintf(name, sizeof(name), "%s.f%d", intrin,
1290                                                  get_elem_bits(ctx, result_type));
1291         assert(length < sizeof(name));
1292         return ac_build_intrinsic(ctx, name, result_type, params, 2, AC_FUNC_ATTR_READNONE);
1293 }
1294
1295 static LLVMValueRef emit_intrin_3f_param(struct ac_llvm_context *ctx,
1296                                          const char *intrin,
1297                                          LLVMTypeRef result_type,
1298                                          LLVMValueRef src0, LLVMValueRef src1, LLVMValueRef src2)
1299 {
1300         char name[64];
1301         LLVMValueRef params[] = {
1302                 ac_to_float(ctx, src0),
1303                 ac_to_float(ctx, src1),
1304                 ac_to_float(ctx, src2),
1305         };
1306
1307         MAYBE_UNUSED const int length = snprintf(name, sizeof(name), "%s.f%d", intrin,
1308                                                  get_elem_bits(ctx, result_type));
1309         assert(length < sizeof(name));
1310         return ac_build_intrinsic(ctx, name, result_type, params, 3, AC_FUNC_ATTR_READNONE);
1311 }
1312
1313 static LLVMValueRef emit_bcsel(struct ac_llvm_context *ctx,
1314                                LLVMValueRef src0, LLVMValueRef src1, LLVMValueRef src2)
1315 {
1316         LLVMValueRef v = LLVMBuildICmp(ctx->builder, LLVMIntNE, src0,
1317                                        ctx->i32_0, "");
1318         return LLVMBuildSelect(ctx->builder, v, src1, src2, "");
1319 }
1320
1321 static LLVMValueRef emit_minmax_int(struct ac_llvm_context *ctx,
1322                                     LLVMIntPredicate pred,
1323                                     LLVMValueRef src0, LLVMValueRef src1)
1324 {
1325         return LLVMBuildSelect(ctx->builder,
1326                                LLVMBuildICmp(ctx->builder, pred, src0, src1, ""),
1327                                src0,
1328                                src1, "");
1329
1330 }
1331 static LLVMValueRef emit_iabs(struct ac_llvm_context *ctx,
1332                               LLVMValueRef src0)
1333 {
1334         return emit_minmax_int(ctx, LLVMIntSGT, src0,
1335                                LLVMBuildNeg(ctx->builder, src0, ""));
1336 }
1337
1338 static LLVMValueRef emit_fsign(struct ac_llvm_context *ctx,
1339                                LLVMValueRef src0,
1340                                unsigned bitsize)
1341 {
1342         LLVMValueRef cmp, val, zero, one;
1343         LLVMTypeRef type;
1344
1345         if (bitsize == 32) {
1346                 type = ctx->f32;
1347                 zero = ctx->f32_0;
1348                 one = ctx->f32_1;
1349         } else {
1350                 type = ctx->f64;
1351                 zero = ctx->f64_0;
1352                 one = ctx->f64_1;
1353         }
1354
1355         cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src0, zero, "");
1356         val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
1357         cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGE, val, zero, "");
1358         val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstReal(type, -1.0), "");
1359         return val;
1360 }
1361
1362 static LLVMValueRef emit_isign(struct ac_llvm_context *ctx,
1363                                LLVMValueRef src0, unsigned bitsize)
1364 {
1365         LLVMValueRef cmp, val, zero, one;
1366         LLVMTypeRef type;
1367
1368         if (bitsize == 32) {
1369                 type = ctx->i32;
1370                 zero = ctx->i32_0;
1371                 one = ctx->i32_1;
1372         } else {
1373                 type = ctx->i64;
1374                 zero = ctx->i64_0;
1375                 one = ctx->i64_1;
1376         }
1377
1378         cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, zero, "");
1379         val = LLVMBuildSelect(ctx->builder, cmp, one, src0, "");
1380         cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, zero, "");
1381         val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstInt(type, -1, true), "");
1382         return val;
1383 }
1384
1385 static LLVMValueRef emit_ffract(struct ac_llvm_context *ctx,
1386                                 LLVMValueRef src0, unsigned bitsize)
1387 {
1388         LLVMTypeRef type;
1389         char *intr;
1390
1391         if (bitsize == 32) {
1392                 intr = "llvm.floor.f32";
1393                 type = ctx->f32;
1394         } else {
1395                 intr = "llvm.floor.f64";
1396                 type = ctx->f64;
1397         }
1398
1399         LLVMValueRef fsrc0 = ac_to_float(ctx, src0);
1400         LLVMValueRef params[] = {
1401                 fsrc0,
1402         };
1403         LLVMValueRef floor = ac_build_intrinsic(ctx, intr, type, params, 1,
1404                                                 AC_FUNC_ATTR_READNONE);
1405         return LLVMBuildFSub(ctx->builder, fsrc0, floor, "");
1406 }
1407
1408 static LLVMValueRef emit_uint_carry(struct ac_llvm_context *ctx,
1409                                     const char *intrin,
1410                                     LLVMValueRef src0, LLVMValueRef src1)
1411 {
1412         LLVMTypeRef ret_type;
1413         LLVMTypeRef types[] = { ctx->i32, ctx->i1 };
1414         LLVMValueRef res;
1415         LLVMValueRef params[] = { src0, src1 };
1416         ret_type = LLVMStructTypeInContext(ctx->context, types,
1417                                            2, true);
1418
1419         res = ac_build_intrinsic(ctx, intrin, ret_type,
1420                                  params, 2, AC_FUNC_ATTR_READNONE);
1421
1422         res = LLVMBuildExtractValue(ctx->builder, res, 1, "");
1423         res = LLVMBuildZExt(ctx->builder, res, ctx->i32, "");
1424         return res;
1425 }
1426
1427 static LLVMValueRef emit_b2f(struct ac_llvm_context *ctx,
1428                              LLVMValueRef src0)
1429 {
1430         return LLVMBuildAnd(ctx->builder, src0, LLVMBuildBitCast(ctx->builder, LLVMConstReal(ctx->f32, 1.0), ctx->i32, ""), "");
1431 }
1432
1433 static LLVMValueRef emit_f2b(struct ac_llvm_context *ctx,
1434                              LLVMValueRef src0)
1435 {
1436         src0 = ac_to_float(ctx, src0);
1437         return LLVMBuildSExt(ctx->builder,
1438                              LLVMBuildFCmp(ctx->builder, LLVMRealUNE, src0, ctx->f32_0, ""),
1439                              ctx->i32, "");
1440 }
1441
1442 static LLVMValueRef emit_b2i(struct ac_llvm_context *ctx,
1443                              LLVMValueRef src0,
1444                              unsigned bitsize)
1445 {
1446         LLVMValueRef result = LLVMBuildAnd(ctx->builder, src0, ctx->i32_1, "");
1447
1448         if (bitsize == 32)
1449                 return result;
1450
1451         return LLVMBuildZExt(ctx->builder, result, ctx->i64, "");
1452 }
1453
1454 static LLVMValueRef emit_i2b(struct ac_llvm_context *ctx,
1455                              LLVMValueRef src0)
1456 {
1457         return LLVMBuildSExt(ctx->builder,
1458                              LLVMBuildICmp(ctx->builder, LLVMIntNE, src0, ctx->i32_0, ""),
1459                              ctx->i32, "");
1460 }
1461
1462 static LLVMValueRef emit_f2f16(struct nir_to_llvm_context *ctx,
1463                                LLVMValueRef src0)
1464 {
1465         LLVMValueRef result;
1466         LLVMValueRef cond = NULL;
1467
1468         src0 = ac_to_float(&ctx->ac, src0);
1469         result = LLVMBuildFPTrunc(ctx->builder, src0, ctx->ac.f16, "");
1470
1471         if (ctx->options->chip_class >= VI) {
1472                 LLVMValueRef args[2];
1473                 /* Check if the result is a denormal - and flush to 0 if so. */
1474                 args[0] = result;
1475                 args[1] = LLVMConstInt(ctx->ac.i32, N_SUBNORMAL | P_SUBNORMAL, false);
1476                 cond = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.class.f16", ctx->ac.i1, args, 2, AC_FUNC_ATTR_READNONE);
1477         }
1478
1479         /* need to convert back up to f32 */
1480         result = LLVMBuildFPExt(ctx->builder, result, ctx->ac.f32, "");
1481
1482         if (ctx->options->chip_class >= VI)
1483                 result = LLVMBuildSelect(ctx->builder, cond, ctx->ac.f32_0, result, "");
1484         else {
1485                 /* for SI/CIK */
1486                 /* 0x38800000 is smallest half float value (2^-14) in 32-bit float,
1487                  * so compare the result and flush to 0 if it's smaller.
1488                  */
1489                 LLVMValueRef temp, cond2;
1490                 temp = emit_intrin_1f_param(&ctx->ac, "llvm.fabs",
1491                                             ctx->ac.f32, result);
1492                 cond = LLVMBuildFCmp(ctx->builder, LLVMRealUGT,
1493                                      LLVMBuildBitCast(ctx->builder, LLVMConstInt(ctx->ac.i32, 0x38800000, false), ctx->ac.f32, ""),
1494                                      temp, "");
1495                 cond2 = LLVMBuildFCmp(ctx->builder, LLVMRealUNE,
1496                                       temp, ctx->ac.f32_0, "");
1497                 cond = LLVMBuildAnd(ctx->builder, cond, cond2, "");
1498                 result = LLVMBuildSelect(ctx->builder, cond, ctx->ac.f32_0, result, "");
1499         }
1500         return result;
1501 }
1502
1503 static LLVMValueRef emit_umul_high(struct ac_llvm_context *ctx,
1504                                    LLVMValueRef src0, LLVMValueRef src1)
1505 {
1506         LLVMValueRef dst64, result;
1507         src0 = LLVMBuildZExt(ctx->builder, src0, ctx->i64, "");
1508         src1 = LLVMBuildZExt(ctx->builder, src1, ctx->i64, "");
1509
1510         dst64 = LLVMBuildMul(ctx->builder, src0, src1, "");
1511         dst64 = LLVMBuildLShr(ctx->builder, dst64, LLVMConstInt(ctx->i64, 32, false), "");
1512         result = LLVMBuildTrunc(ctx->builder, dst64, ctx->i32, "");
1513         return result;
1514 }
1515
1516 static LLVMValueRef emit_imul_high(struct ac_llvm_context *ctx,
1517                                    LLVMValueRef src0, LLVMValueRef src1)
1518 {
1519         LLVMValueRef dst64, result;
1520         src0 = LLVMBuildSExt(ctx->builder, src0, ctx->i64, "");
1521         src1 = LLVMBuildSExt(ctx->builder, src1, ctx->i64, "");
1522
1523         dst64 = LLVMBuildMul(ctx->builder, src0, src1, "");
1524         dst64 = LLVMBuildAShr(ctx->builder, dst64, LLVMConstInt(ctx->i64, 32, false), "");
1525         result = LLVMBuildTrunc(ctx->builder, dst64, ctx->i32, "");
1526         return result;
1527 }
1528
1529 static LLVMValueRef emit_bitfield_extract(struct ac_llvm_context *ctx,
1530                                           bool is_signed,
1531                                           const LLVMValueRef srcs[3])
1532 {
1533         LLVMValueRef result;
1534         LLVMValueRef icond = LLVMBuildICmp(ctx->builder, LLVMIntEQ, srcs[2], LLVMConstInt(ctx->i32, 32, false), "");
1535
1536         result = ac_build_bfe(ctx, srcs[0], srcs[1], srcs[2], is_signed);
1537         result = LLVMBuildSelect(ctx->builder, icond, srcs[0], result, "");
1538         return result;
1539 }
1540
1541 static LLVMValueRef emit_bitfield_insert(struct ac_llvm_context *ctx,
1542                                          LLVMValueRef src0, LLVMValueRef src1,
1543                                          LLVMValueRef src2, LLVMValueRef src3)
1544 {
1545         LLVMValueRef bfi_args[3], result;
1546
1547         bfi_args[0] = LLVMBuildShl(ctx->builder,
1548                                    LLVMBuildSub(ctx->builder,
1549                                                 LLVMBuildShl(ctx->builder,
1550                                                              ctx->i32_1,
1551                                                              src3, ""),
1552                                                 ctx->i32_1, ""),
1553                                    src2, "");
1554         bfi_args[1] = LLVMBuildShl(ctx->builder, src1, src2, "");
1555         bfi_args[2] = src0;
1556
1557         LLVMValueRef icond = LLVMBuildICmp(ctx->builder, LLVMIntEQ, src3, LLVMConstInt(ctx->i32, 32, false), "");
1558
1559         /* Calculate:
1560          *   (arg0 & arg1) | (~arg0 & arg2) = arg2 ^ (arg0 & (arg1 ^ arg2)
1561          * Use the right-hand side, which the LLVM backend can convert to V_BFI.
1562          */
1563         result = LLVMBuildXor(ctx->builder, bfi_args[2],
1564                               LLVMBuildAnd(ctx->builder, bfi_args[0],
1565                                            LLVMBuildXor(ctx->builder, bfi_args[1], bfi_args[2], ""), ""), "");
1566
1567         result = LLVMBuildSelect(ctx->builder, icond, src1, result, "");
1568         return result;
1569 }
1570
1571 static LLVMValueRef emit_pack_half_2x16(struct ac_llvm_context *ctx,
1572                                         LLVMValueRef src0)
1573 {
1574         LLVMValueRef comp[2];
1575
1576         src0 = ac_to_float(ctx, src0);
1577         comp[0] = LLVMBuildExtractElement(ctx->builder, src0, ctx->i32_0, "");
1578         comp[1] = LLVMBuildExtractElement(ctx->builder, src0, ctx->i32_1, "");
1579
1580         return ac_build_cvt_pkrtz_f16(ctx, comp);
1581 }
1582
1583 static LLVMValueRef emit_unpack_half_2x16(struct ac_llvm_context *ctx,
1584                                           LLVMValueRef src0)
1585 {
1586         LLVMValueRef const16 = LLVMConstInt(ctx->i32, 16, false);
1587         LLVMValueRef temps[2], result, val;
1588         int i;
1589
1590         for (i = 0; i < 2; i++) {
1591                 val = i == 1 ? LLVMBuildLShr(ctx->builder, src0, const16, "") : src0;
1592                 val = LLVMBuildTrunc(ctx->builder, val, ctx->i16, "");
1593                 val = LLVMBuildBitCast(ctx->builder, val, ctx->f16, "");
1594                 temps[i] = LLVMBuildFPExt(ctx->builder, val, ctx->f32, "");
1595         }
1596
1597         result = LLVMBuildInsertElement(ctx->builder, LLVMGetUndef(ctx->v2f32), temps[0],
1598                                         ctx->i32_0, "");
1599         result = LLVMBuildInsertElement(ctx->builder, result, temps[1],
1600                                         ctx->i32_1, "");
1601         return result;
1602 }
1603
1604 static LLVMValueRef emit_ddxy(struct ac_nir_context *ctx,
1605                               nir_op op,
1606                               LLVMValueRef src0)
1607 {
1608         unsigned mask;
1609         int idx;
1610         LLVMValueRef result;
1611
1612         if (op == nir_op_fddx_fine || op == nir_op_fddx)
1613                 mask = AC_TID_MASK_LEFT;
1614         else if (op == nir_op_fddy_fine || op == nir_op_fddy)
1615                 mask = AC_TID_MASK_TOP;
1616         else
1617                 mask = AC_TID_MASK_TOP_LEFT;
1618
1619         /* for DDX we want to next X pixel, DDY next Y pixel. */
1620         if (op == nir_op_fddx_fine ||
1621             op == nir_op_fddx_coarse ||
1622             op == nir_op_fddx)
1623                 idx = 1;
1624         else
1625                 idx = 2;
1626
1627         result = ac_build_ddxy(&ctx->ac, mask, idx, src0);
1628         return result;
1629 }
1630
1631 /*
1632  * this takes an I,J coordinate pair,
1633  * and works out the X and Y derivatives.
1634  * it returns DDX(I), DDX(J), DDY(I), DDY(J).
1635  */
1636 static LLVMValueRef emit_ddxy_interp(
1637         struct ac_nir_context *ctx,
1638         LLVMValueRef interp_ij)
1639 {
1640         LLVMValueRef result[4], a;
1641         unsigned i;
1642
1643         for (i = 0; i < 2; i++) {
1644                 a = LLVMBuildExtractElement(ctx->ac.builder, interp_ij,
1645                                             LLVMConstInt(ctx->ac.i32, i, false), "");
1646                 result[i] = emit_ddxy(ctx, nir_op_fddx, a);
1647                 result[2+i] = emit_ddxy(ctx, nir_op_fddy, a);
1648         }
1649         return ac_build_gather_values(&ctx->ac, result, 4);
1650 }
1651
1652 static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
1653 {
1654         LLVMValueRef src[4], result = NULL;
1655         unsigned num_components = instr->dest.dest.ssa.num_components;
1656         unsigned src_components;
1657         LLVMTypeRef def_type = get_def_type(ctx, &instr->dest.dest.ssa);
1658
1659         assert(nir_op_infos[instr->op].num_inputs <= ARRAY_SIZE(src));
1660         switch (instr->op) {
1661         case nir_op_vec2:
1662         case nir_op_vec3:
1663         case nir_op_vec4:
1664                 src_components = 1;
1665                 break;
1666         case nir_op_pack_half_2x16:
1667                 src_components = 2;
1668                 break;
1669         case nir_op_unpack_half_2x16:
1670                 src_components = 1;
1671                 break;
1672         default:
1673                 src_components = num_components;
1674                 break;
1675         }
1676         for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1677                 src[i] = get_alu_src(ctx, instr->src[i], src_components);
1678
1679         switch (instr->op) {
1680         case nir_op_fmov:
1681         case nir_op_imov:
1682                 result = src[0];
1683                 break;
1684         case nir_op_fneg:
1685                 src[0] = ac_to_float(&ctx->ac, src[0]);
1686                 result = LLVMBuildFNeg(ctx->ac.builder, src[0], "");
1687                 break;
1688         case nir_op_ineg:
1689                 result = LLVMBuildNeg(ctx->ac.builder, src[0], "");
1690                 break;
1691         case nir_op_inot:
1692                 result = LLVMBuildNot(ctx->ac.builder, src[0], "");
1693                 break;
1694         case nir_op_iadd:
1695                 result = LLVMBuildAdd(ctx->ac.builder, src[0], src[1], "");
1696                 break;
1697         case nir_op_fadd:
1698                 src[0] = ac_to_float(&ctx->ac, src[0]);
1699                 src[1] = ac_to_float(&ctx->ac, src[1]);
1700                 result = LLVMBuildFAdd(ctx->ac.builder, src[0], src[1], "");
1701                 break;
1702         case nir_op_fsub:
1703                 src[0] = ac_to_float(&ctx->ac, src[0]);
1704                 src[1] = ac_to_float(&ctx->ac, src[1]);
1705                 result = LLVMBuildFSub(ctx->ac.builder, src[0], src[1], "");
1706                 break;
1707         case nir_op_isub:
1708                 result = LLVMBuildSub(ctx->ac.builder, src[0], src[1], "");
1709                 break;
1710         case nir_op_imul:
1711                 result = LLVMBuildMul(ctx->ac.builder, src[0], src[1], "");
1712                 break;
1713         case nir_op_imod:
1714                 result = LLVMBuildSRem(ctx->ac.builder, src[0], src[1], "");
1715                 break;
1716         case nir_op_umod:
1717                 result = LLVMBuildURem(ctx->ac.builder, src[0], src[1], "");
1718                 break;
1719         case nir_op_fmod:
1720                 src[0] = ac_to_float(&ctx->ac, src[0]);
1721                 src[1] = ac_to_float(&ctx->ac, src[1]);
1722                 result = ac_build_fdiv(&ctx->ac, src[0], src[1]);
1723                 result = emit_intrin_1f_param(&ctx->ac, "llvm.floor",
1724                                               ac_to_float_type(&ctx->ac, def_type), result);
1725                 result = LLVMBuildFMul(ctx->ac.builder, src[1] , result, "");
1726                 result = LLVMBuildFSub(ctx->ac.builder, src[0], result, "");
1727                 break;
1728         case nir_op_frem:
1729                 src[0] = ac_to_float(&ctx->ac, src[0]);
1730                 src[1] = ac_to_float(&ctx->ac, src[1]);
1731                 result = LLVMBuildFRem(ctx->ac.builder, src[0], src[1], "");
1732                 break;
1733         case nir_op_irem:
1734                 result = LLVMBuildSRem(ctx->ac.builder, src[0], src[1], "");
1735                 break;
1736         case nir_op_idiv:
1737                 result = LLVMBuildSDiv(ctx->ac.builder, src[0], src[1], "");
1738                 break;
1739         case nir_op_udiv:
1740                 result = LLVMBuildUDiv(ctx->ac.builder, src[0], src[1], "");
1741                 break;
1742         case nir_op_fmul:
1743                 src[0] = ac_to_float(&ctx->ac, src[0]);
1744                 src[1] = ac_to_float(&ctx->ac, src[1]);
1745                 result = LLVMBuildFMul(ctx->ac.builder, src[0], src[1], "");
1746                 break;
1747         case nir_op_fdiv:
1748                 src[0] = ac_to_float(&ctx->ac, src[0]);
1749                 src[1] = ac_to_float(&ctx->ac, src[1]);
1750                 result = ac_build_fdiv(&ctx->ac, src[0], src[1]);
1751                 break;
1752         case nir_op_frcp:
1753                 src[0] = ac_to_float(&ctx->ac, src[0]);
1754                 result = ac_build_fdiv(&ctx->ac, instr->dest.dest.ssa.bit_size == 32 ? ctx->ac.f32_1 : ctx->ac.f64_1,
1755                                        src[0]);
1756                 break;
1757         case nir_op_iand:
1758                 result = LLVMBuildAnd(ctx->ac.builder, src[0], src[1], "");
1759                 break;
1760         case nir_op_ior:
1761                 result = LLVMBuildOr(ctx->ac.builder, src[0], src[1], "");
1762                 break;
1763         case nir_op_ixor:
1764                 result = LLVMBuildXor(ctx->ac.builder, src[0], src[1], "");
1765                 break;
1766         case nir_op_ishl:
1767                 result = LLVMBuildShl(ctx->ac.builder, src[0],
1768                                       LLVMBuildZExt(ctx->ac.builder, src[1],
1769                                                     LLVMTypeOf(src[0]), ""),
1770                                       "");
1771                 break;
1772         case nir_op_ishr:
1773                 result = LLVMBuildAShr(ctx->ac.builder, src[0],
1774                                        LLVMBuildZExt(ctx->ac.builder, src[1],
1775                                                      LLVMTypeOf(src[0]), ""),
1776                                        "");
1777                 break;
1778         case nir_op_ushr:
1779                 result = LLVMBuildLShr(ctx->ac.builder, src[0],
1780                                        LLVMBuildZExt(ctx->ac.builder, src[1],
1781                                                      LLVMTypeOf(src[0]), ""),
1782                                        "");
1783                 break;
1784         case nir_op_ilt:
1785                 result = emit_int_cmp(&ctx->ac, LLVMIntSLT, src[0], src[1]);
1786                 break;
1787         case nir_op_ine:
1788                 result = emit_int_cmp(&ctx->ac, LLVMIntNE, src[0], src[1]);
1789                 break;
1790         case nir_op_ieq:
1791                 result = emit_int_cmp(&ctx->ac, LLVMIntEQ, src[0], src[1]);
1792                 break;
1793         case nir_op_ige:
1794                 result = emit_int_cmp(&ctx->ac, LLVMIntSGE, src[0], src[1]);
1795                 break;
1796         case nir_op_ult:
1797                 result = emit_int_cmp(&ctx->ac, LLVMIntULT, src[0], src[1]);
1798                 break;
1799         case nir_op_uge:
1800                 result = emit_int_cmp(&ctx->ac, LLVMIntUGE, src[0], src[1]);
1801                 break;
1802         case nir_op_feq:
1803                 result = emit_float_cmp(&ctx->ac, LLVMRealUEQ, src[0], src[1]);
1804                 break;
1805         case nir_op_fne:
1806                 result = emit_float_cmp(&ctx->ac, LLVMRealUNE, src[0], src[1]);
1807                 break;
1808         case nir_op_flt:
1809                 result = emit_float_cmp(&ctx->ac, LLVMRealULT, src[0], src[1]);
1810                 break;
1811         case nir_op_fge:
1812                 result = emit_float_cmp(&ctx->ac, LLVMRealUGE, src[0], src[1]);
1813                 break;
1814         case nir_op_fabs:
1815                 result = emit_intrin_1f_param(&ctx->ac, "llvm.fabs",
1816                                               ac_to_float_type(&ctx->ac, def_type), src[0]);
1817                 break;
1818         case nir_op_iabs:
1819                 result = emit_iabs(&ctx->ac, src[0]);
1820                 break;
1821         case nir_op_imax:
1822                 result = emit_minmax_int(&ctx->ac, LLVMIntSGT, src[0], src[1]);
1823                 break;
1824         case nir_op_imin:
1825                 result = emit_minmax_int(&ctx->ac, LLVMIntSLT, src[0], src[1]);
1826                 break;
1827         case nir_op_umax:
1828                 result = emit_minmax_int(&ctx->ac, LLVMIntUGT, src[0], src[1]);
1829                 break;
1830         case nir_op_umin:
1831                 result = emit_minmax_int(&ctx->ac, LLVMIntULT, src[0], src[1]);
1832                 break;
1833         case nir_op_isign:
1834                 result = emit_isign(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size);
1835                 break;
1836         case nir_op_fsign:
1837                 src[0] = ac_to_float(&ctx->ac, src[0]);
1838                 result = emit_fsign(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size);
1839                 break;
1840         case nir_op_ffloor:
1841                 result = emit_intrin_1f_param(&ctx->ac, "llvm.floor",
1842                                               ac_to_float_type(&ctx->ac, def_type), src[0]);
1843                 break;
1844         case nir_op_ftrunc:
1845                 result = emit_intrin_1f_param(&ctx->ac, "llvm.trunc",
1846                                               ac_to_float_type(&ctx->ac, def_type), src[0]);
1847                 break;
1848         case nir_op_fceil:
1849                 result = emit_intrin_1f_param(&ctx->ac, "llvm.ceil",
1850                                               ac_to_float_type(&ctx->ac, def_type), src[0]);
1851                 break;
1852         case nir_op_fround_even:
1853                 result = emit_intrin_1f_param(&ctx->ac, "llvm.rint",
1854                                               ac_to_float_type(&ctx->ac, def_type),src[0]);
1855                 break;
1856         case nir_op_ffract:
1857                 result = emit_ffract(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size);
1858                 break;
1859         case nir_op_fsin:
1860                 result = emit_intrin_1f_param(&ctx->ac, "llvm.sin",
1861                                               ac_to_float_type(&ctx->ac, def_type), src[0]);
1862                 break;
1863         case nir_op_fcos:
1864                 result = emit_intrin_1f_param(&ctx->ac, "llvm.cos",
1865                                               ac_to_float_type(&ctx->ac, def_type), src[0]);
1866                 break;
1867         case nir_op_fsqrt:
1868                 result = emit_intrin_1f_param(&ctx->ac, "llvm.sqrt",
1869                                               ac_to_float_type(&ctx->ac, def_type), src[0]);
1870                 break;
1871         case nir_op_fexp2:
1872                 result = emit_intrin_1f_param(&ctx->ac, "llvm.exp2",
1873                                               ac_to_float_type(&ctx->ac, def_type), src[0]);
1874                 break;
1875         case nir_op_flog2:
1876                 result = emit_intrin_1f_param(&ctx->ac, "llvm.log2",
1877                                               ac_to_float_type(&ctx->ac, def_type), src[0]);
1878                 break;
1879         case nir_op_frsq:
1880                 result = emit_intrin_1f_param(&ctx->ac, "llvm.sqrt",
1881                                               ac_to_float_type(&ctx->ac, def_type), src[0]);
1882                 result = ac_build_fdiv(&ctx->ac, instr->dest.dest.ssa.bit_size == 32 ? ctx->ac.f32_1 : ctx->ac.f64_1,
1883                                        result);
1884                 break;
1885         case nir_op_fpow:
1886                 result = emit_intrin_2f_param(&ctx->ac, "llvm.pow",
1887                                               ac_to_float_type(&ctx->ac, def_type), src[0], src[1]);
1888                 break;
1889         case nir_op_fmax:
1890                 result = emit_intrin_2f_param(&ctx->ac, "llvm.maxnum",
1891                                               ac_to_float_type(&ctx->ac, def_type), src[0], src[1]);
1892                 if (ctx->ac.chip_class < GFX9 &&
1893                     instr->dest.dest.ssa.bit_size == 32) {
1894                         /* Only pre-GFX9 chips do not flush denorms. */
1895                         result = emit_intrin_1f_param(&ctx->ac, "llvm.canonicalize",
1896                                                       ac_to_float_type(&ctx->ac, def_type),
1897                                                       result);
1898                 }
1899                 break;
1900         case nir_op_fmin:
1901                 result = emit_intrin_2f_param(&ctx->ac, "llvm.minnum",
1902                                               ac_to_float_type(&ctx->ac, def_type), src[0], src[1]);
1903                 if (ctx->ac.chip_class < GFX9 &&
1904                     instr->dest.dest.ssa.bit_size == 32) {
1905                         /* Only pre-GFX9 chips do not flush denorms. */
1906                         result = emit_intrin_1f_param(&ctx->ac, "llvm.canonicalize",
1907                                                       ac_to_float_type(&ctx->ac, def_type),
1908                                                       result);
1909                 }
1910                 break;
1911         case nir_op_ffma:
1912                 result = emit_intrin_3f_param(&ctx->ac, "llvm.fmuladd",
1913                                               ac_to_float_type(&ctx->ac, def_type), src[0], src[1], src[2]);
1914                 break;
1915         case nir_op_ibitfield_extract:
1916                 result = emit_bitfield_extract(&ctx->ac, true, src);
1917                 break;
1918         case nir_op_ubitfield_extract:
1919                 result = emit_bitfield_extract(&ctx->ac, false, src);
1920                 break;
1921         case nir_op_bitfield_insert:
1922                 result = emit_bitfield_insert(&ctx->ac, src[0], src[1], src[2], src[3]);
1923                 break;
1924         case nir_op_bitfield_reverse:
1925                 result = ac_build_intrinsic(&ctx->ac, "llvm.bitreverse.i32", ctx->ac.i32, src, 1, AC_FUNC_ATTR_READNONE);
1926                 break;
1927         case nir_op_bit_count:
1928                 result = ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i32", ctx->ac.i32, src, 1, AC_FUNC_ATTR_READNONE);
1929                 break;
1930         case nir_op_vec2:
1931         case nir_op_vec3:
1932         case nir_op_vec4:
1933                 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1934                         src[i] = ac_to_integer(&ctx->ac, src[i]);
1935                 result = ac_build_gather_values(&ctx->ac, src, num_components);
1936                 break;
1937         case nir_op_f2i32:
1938         case nir_op_f2i64:
1939                 src[0] = ac_to_float(&ctx->ac, src[0]);
1940                 result = LLVMBuildFPToSI(ctx->ac.builder, src[0], def_type, "");
1941                 break;
1942         case nir_op_f2u32:
1943         case nir_op_f2u64:
1944                 src[0] = ac_to_float(&ctx->ac, src[0]);
1945                 result = LLVMBuildFPToUI(ctx->ac.builder, src[0], def_type, "");
1946                 break;
1947         case nir_op_i2f32:
1948         case nir_op_i2f64:
1949                 src[0] = ac_to_integer(&ctx->ac, src[0]);
1950                 result = LLVMBuildSIToFP(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
1951                 break;
1952         case nir_op_u2f32:
1953         case nir_op_u2f64:
1954                 src[0] = ac_to_integer(&ctx->ac, src[0]);
1955                 result = LLVMBuildUIToFP(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
1956                 break;
1957         case nir_op_f2f64:
1958                 src[0] = ac_to_float(&ctx->ac, src[0]);
1959                 result = LLVMBuildFPExt(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
1960                 break;
1961         case nir_op_f2f32:
1962                 result = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), "");
1963                 break;
1964         case nir_op_u2u32:
1965         case nir_op_u2u64:
1966                 src[0] = ac_to_integer(&ctx->ac, src[0]);
1967                 if (get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < get_elem_bits(&ctx->ac, def_type))
1968                         result = LLVMBuildZExt(ctx->ac.builder, src[0], def_type, "");
1969                 else
1970                         result = LLVMBuildTrunc(ctx->ac.builder, src[0], def_type, "");
1971                 break;
1972         case nir_op_i2i32:
1973         case nir_op_i2i64:
1974                 src[0] = ac_to_integer(&ctx->ac, src[0]);
1975                 if (get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < get_elem_bits(&ctx->ac, def_type))
1976                         result = LLVMBuildSExt(ctx->ac.builder, src[0], def_type, "");
1977                 else
1978                         result = LLVMBuildTrunc(ctx->ac.builder, src[0], def_type, "");
1979                 break;
1980         case nir_op_bcsel:
1981                 result = emit_bcsel(&ctx->ac, src[0], src[1], src[2]);
1982                 break;
1983         case nir_op_find_lsb:
1984                 src[0] = ac_to_integer(&ctx->ac, src[0]);
1985                 result = ac_find_lsb(&ctx->ac, ctx->ac.i32, src[0]);
1986                 break;
1987         case nir_op_ufind_msb:
1988                 src[0] = ac_to_integer(&ctx->ac, src[0]);
1989                 result = ac_build_umsb(&ctx->ac, src[0], ctx->ac.i32);
1990                 break;
1991         case nir_op_ifind_msb:
1992                 src[0] = ac_to_integer(&ctx->ac, src[0]);
1993                 result = ac_build_imsb(&ctx->ac, src[0], ctx->ac.i32);
1994                 break;
1995         case nir_op_uadd_carry:
1996                 src[0] = ac_to_integer(&ctx->ac, src[0]);
1997                 src[1] = ac_to_integer(&ctx->ac, src[1]);
1998                 result = emit_uint_carry(&ctx->ac, "llvm.uadd.with.overflow.i32", src[0], src[1]);
1999                 break;
2000         case nir_op_usub_borrow:
2001                 src[0] = ac_to_integer(&ctx->ac, src[0]);
2002                 src[1] = ac_to_integer(&ctx->ac, src[1]);
2003                 result = emit_uint_carry(&ctx->ac, "llvm.usub.with.overflow.i32", src[0], src[1]);
2004                 break;
2005         case nir_op_b2f:
2006                 result = emit_b2f(&ctx->ac, src[0]);
2007                 break;
2008         case nir_op_f2b:
2009                 result = emit_f2b(&ctx->ac, src[0]);
2010                 break;
2011         case nir_op_b2i:
2012                 result = emit_b2i(&ctx->ac, src[0], instr->dest.dest.ssa.bit_size);
2013                 break;
2014         case nir_op_i2b:
2015                 src[0] = ac_to_integer(&ctx->ac, src[0]);
2016                 result = emit_i2b(&ctx->ac, src[0]);
2017                 break;
2018         case nir_op_fquantize2f16:
2019                 result = emit_f2f16(ctx->nctx, src[0]);
2020                 break;
2021         case nir_op_umul_high:
2022                 src[0] = ac_to_integer(&ctx->ac, src[0]);
2023                 src[1] = ac_to_integer(&ctx->ac, src[1]);
2024                 result = emit_umul_high(&ctx->ac, src[0], src[1]);
2025                 break;
2026         case nir_op_imul_high:
2027                 src[0] = ac_to_integer(&ctx->ac, src[0]);
2028                 src[1] = ac_to_integer(&ctx->ac, src[1]);
2029                 result = emit_imul_high(&ctx->ac, src[0], src[1]);
2030                 break;
2031         case nir_op_pack_half_2x16:
2032                 result = emit_pack_half_2x16(&ctx->ac, src[0]);
2033                 break;
2034         case nir_op_unpack_half_2x16:
2035                 result = emit_unpack_half_2x16(&ctx->ac, src[0]);
2036                 break;
2037         case nir_op_fddx:
2038         case nir_op_fddy:
2039         case nir_op_fddx_fine:
2040         case nir_op_fddy_fine:
2041         case nir_op_fddx_coarse:
2042         case nir_op_fddy_coarse:
2043                 result = emit_ddxy(ctx, instr->op, src[0]);
2044                 break;
2045
2046         case nir_op_unpack_64_2x32_split_x: {
2047                 assert(instr->src[0].src.ssa->num_components == 1);
2048                 LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0],
2049                                                     ctx->ac.v2i32,
2050                                                     "");
2051                 result = LLVMBuildExtractElement(ctx->ac.builder, tmp,
2052                                                  ctx->ac.i32_0, "");
2053                 break;
2054         }
2055
2056         case nir_op_unpack_64_2x32_split_y: {
2057                 assert(instr->src[0].src.ssa->num_components == 1);
2058                 LLVMValueRef tmp = LLVMBuildBitCast(ctx->ac.builder, src[0],
2059                                                     ctx->ac.v2i32,
2060                                                     "");
2061                 result = LLVMBuildExtractElement(ctx->ac.builder, tmp,
2062                                                  ctx->ac.i32_1, "");
2063                 break;
2064         }
2065
2066         case nir_op_pack_64_2x32_split: {
2067                 LLVMValueRef tmp = LLVMGetUndef(ctx->ac.v2i32);
2068                 tmp = LLVMBuildInsertElement(ctx->ac.builder, tmp,
2069                                              src[0], ctx->ac.i32_0, "");
2070                 tmp = LLVMBuildInsertElement(ctx->ac.builder, tmp,
2071                                              src[1], ctx->ac.i32_1, "");
2072                 result = LLVMBuildBitCast(ctx->ac.builder, tmp, ctx->ac.i64, "");
2073                 break;
2074         }
2075
2076         default:
2077                 fprintf(stderr, "Unknown NIR alu instr: ");
2078                 nir_print_instr(&instr->instr, stderr);
2079                 fprintf(stderr, "\n");
2080                 abort();
2081         }
2082
2083         if (result) {
2084                 assert(instr->dest.dest.is_ssa);
2085                 result = ac_to_integer(&ctx->ac, result);
2086                 _mesa_hash_table_insert(ctx->defs, &instr->dest.dest.ssa,
2087                                         result);
2088         }
2089 }
2090
2091 static void visit_load_const(struct ac_nir_context *ctx,
2092                              const nir_load_const_instr *instr)
2093 {
2094         LLVMValueRef values[4], value = NULL;
2095         LLVMTypeRef element_type =
2096             LLVMIntTypeInContext(ctx->ac.context, instr->def.bit_size);
2097
2098         for (unsigned i = 0; i < instr->def.num_components; ++i) {
2099                 switch (instr->def.bit_size) {
2100                 case 32:
2101                         values[i] = LLVMConstInt(element_type,
2102                                                  instr->value.u32[i], false);
2103                         break;
2104                 case 64:
2105                         values[i] = LLVMConstInt(element_type,
2106                                                  instr->value.u64[i], false);
2107                         break;
2108                 default:
2109                         fprintf(stderr,
2110                                 "unsupported nir load_const bit_size: %d\n",
2111                                 instr->def.bit_size);
2112                         abort();
2113                 }
2114         }
2115         if (instr->def.num_components > 1) {
2116                 value = LLVMConstVector(values, instr->def.num_components);
2117         } else
2118                 value = values[0];
2119
2120         _mesa_hash_table_insert(ctx->defs, &instr->def, value);
2121 }
2122
2123 static LLVMValueRef cast_ptr(struct nir_to_llvm_context *ctx, LLVMValueRef ptr,
2124                              LLVMTypeRef type)
2125 {
2126         int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
2127         return LLVMBuildBitCast(ctx->builder, ptr,
2128                                 LLVMPointerType(type, addr_space), "");
2129 }
2130
2131 static LLVMValueRef
2132 get_buffer_size(struct ac_nir_context *ctx, LLVMValueRef descriptor, bool in_elements)
2133 {
2134         LLVMValueRef size =
2135                 LLVMBuildExtractElement(ctx->ac.builder, descriptor,
2136                                         LLVMConstInt(ctx->ac.i32, 2, false), "");
2137
2138         /* VI only */
2139         if (ctx->ac.chip_class == VI && in_elements) {
2140                 /* On VI, the descriptor contains the size in bytes,
2141                  * but TXQ must return the size in elements.
2142                  * The stride is always non-zero for resources using TXQ.
2143                  */
2144                 LLVMValueRef stride =
2145                         LLVMBuildExtractElement(ctx->ac.builder, descriptor,
2146                                                 ctx->ac.i32_1, "");
2147                 stride = LLVMBuildLShr(ctx->ac.builder, stride,
2148                                        LLVMConstInt(ctx->ac.i32, 16, false), "");
2149                 stride = LLVMBuildAnd(ctx->ac.builder, stride,
2150                                       LLVMConstInt(ctx->ac.i32, 0x3fff, false), "");
2151
2152                 size = LLVMBuildUDiv(ctx->ac.builder, size, stride, "");
2153         }
2154         return size;
2155 }
2156
2157 /**
2158  * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
2159  * intrinsic names).
2160  */
2161 static void build_int_type_name(
2162         LLVMTypeRef type,
2163         char *buf, unsigned bufsize)
2164 {
2165         assert(bufsize >= 6);
2166
2167         if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
2168                 snprintf(buf, bufsize, "v%ui32",
2169                          LLVMGetVectorSize(type));
2170         else
2171                 strcpy(buf, "i32");
2172 }
2173
2174 static LLVMValueRef radv_lower_gather4_integer(struct ac_llvm_context *ctx,
2175                                                struct ac_image_args *args,
2176                                                const nir_tex_instr *instr)
2177 {
2178         enum glsl_base_type stype = glsl_get_sampler_result_type(instr->texture->var->type);
2179         LLVMValueRef coord = args->addr;
2180         LLVMValueRef half_texel[2];
2181         LLVMValueRef compare_cube_wa = NULL;
2182         LLVMValueRef result;
2183         int c;
2184         unsigned coord_vgpr_index = (unsigned)args->offset + (unsigned)args->compare;
2185
2186         //TODO Rect
2187         {
2188                 struct ac_image_args txq_args = { 0 };
2189
2190                 txq_args.da = instr->is_array || instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
2191                 txq_args.opcode = ac_image_get_resinfo;
2192                 txq_args.dmask = 0xf;
2193                 txq_args.addr = ctx->i32_0;
2194                 txq_args.resource = args->resource;
2195                 LLVMValueRef size = ac_build_image_opcode(ctx, &txq_args);
2196
2197                 for (c = 0; c < 2; c++) {
2198                         half_texel[c] = LLVMBuildExtractElement(ctx->builder, size,
2199                                                                 LLVMConstInt(ctx->i32, c, false), "");
2200                         half_texel[c] = LLVMBuildUIToFP(ctx->builder, half_texel[c], ctx->f32, "");
2201                         half_texel[c] = ac_build_fdiv(ctx, ctx->f32_1, half_texel[c]);
2202                         half_texel[c] = LLVMBuildFMul(ctx->builder, half_texel[c],
2203                                                       LLVMConstReal(ctx->f32, -0.5), "");
2204                 }
2205         }
2206
2207         LLVMValueRef orig_coords = args->addr;
2208
2209         for (c = 0; c < 2; c++) {
2210                 LLVMValueRef tmp;
2211                 LLVMValueRef index = LLVMConstInt(ctx->i32, coord_vgpr_index + c, 0);
2212                 tmp = LLVMBuildExtractElement(ctx->builder, coord, index, "");
2213                 tmp = LLVMBuildBitCast(ctx->builder, tmp, ctx->f32, "");
2214                 tmp = LLVMBuildFAdd(ctx->builder, tmp, half_texel[c], "");
2215                 tmp = LLVMBuildBitCast(ctx->builder, tmp, ctx->i32, "");
2216                 coord = LLVMBuildInsertElement(ctx->builder, coord, tmp, index, "");
2217         }
2218
2219
2220         /*
2221          * Apparantly cube has issue with integer types that the workaround doesn't solve,
2222          * so this tests if the format is 8_8_8_8 and an integer type do an alternate
2223          * workaround by sampling using a scaled type and converting.
2224          * This is taken from amdgpu-pro shaders.
2225          */
2226         /* NOTE this produces some ugly code compared to amdgpu-pro,
2227          * LLVM ends up dumping SGPRs into VGPRs to deal with the compare/select,
2228          * and then reads them back. -pro generates two selects,
2229          * one s_cmp for the descriptor rewriting
2230          * one v_cmp for the coordinate and result changes.
2231          */
2232         if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
2233                 LLVMValueRef tmp, tmp2;
2234
2235                 /* workaround 8/8/8/8 uint/sint cube gather bug */
2236                 /* first detect it then change to a scaled read and f2i */
2237                 tmp = LLVMBuildExtractElement(ctx->builder, args->resource, ctx->i32_1, "");
2238                 tmp2 = tmp;
2239
2240                 /* extract the DATA_FORMAT */
2241                 tmp = ac_build_bfe(ctx, tmp, LLVMConstInt(ctx->i32, 20, false),
2242                                    LLVMConstInt(ctx->i32, 6, false), false);
2243
2244                 /* is the DATA_FORMAT == 8_8_8_8 */
2245                 compare_cube_wa = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tmp, LLVMConstInt(ctx->i32, V_008F14_IMG_DATA_FORMAT_8_8_8_8, false), "");
2246
2247                 if (stype == GLSL_TYPE_UINT)
2248                         /* Create a NUM FORMAT - 0x2 or 0x4 - USCALED or UINT */
2249                         tmp = LLVMBuildSelect(ctx->builder, compare_cube_wa, LLVMConstInt(ctx->i32, 0x8000000, false),
2250                                               LLVMConstInt(ctx->i32, 0x10000000, false), "");
2251                 else
2252                         /* Create a NUM FORMAT - 0x3 or 0x5 - SSCALED or SINT */
2253                         tmp = LLVMBuildSelect(ctx->builder, compare_cube_wa, LLVMConstInt(ctx->i32, 0xc000000, false),
2254                                               LLVMConstInt(ctx->i32, 0x14000000, false), "");
2255
2256                 /* replace the NUM FORMAT in the descriptor */
2257                 tmp2 = LLVMBuildAnd(ctx->builder, tmp2, LLVMConstInt(ctx->i32, C_008F14_NUM_FORMAT_GFX6, false), "");
2258                 tmp2 = LLVMBuildOr(ctx->builder, tmp2, tmp, "");
2259
2260                 args->resource = LLVMBuildInsertElement(ctx->builder, args->resource, tmp2, ctx->i32_1, "");
2261
2262                 /* don't modify the coordinates for this case */
2263                 coord = LLVMBuildSelect(ctx->builder, compare_cube_wa, orig_coords, coord, "");
2264         }
2265         args->addr = coord;
2266         result = ac_build_image_opcode(ctx, args);
2267
2268         if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
2269                 LLVMValueRef tmp, tmp2;
2270
2271                 /* if the cube workaround is in place, f2i the result. */
2272                 for (c = 0; c < 4; c++) {
2273                         tmp = LLVMBuildExtractElement(ctx->builder, result, LLVMConstInt(ctx->i32, c, false), "");
2274                         if (stype == GLSL_TYPE_UINT)
2275                                 tmp2 = LLVMBuildFPToUI(ctx->builder, tmp, ctx->i32, "");
2276                         else
2277                                 tmp2 = LLVMBuildFPToSI(ctx->builder, tmp, ctx->i32, "");
2278                         tmp = LLVMBuildBitCast(ctx->builder, tmp, ctx->i32, "");
2279                         tmp2 = LLVMBuildBitCast(ctx->builder, tmp2, ctx->i32, "");
2280                         tmp = LLVMBuildSelect(ctx->builder, compare_cube_wa, tmp2, tmp, "");
2281                         tmp = LLVMBuildBitCast(ctx->builder, tmp, ctx->f32, "");
2282                         result = LLVMBuildInsertElement(ctx->builder, result, tmp, LLVMConstInt(ctx->i32, c, false), "");
2283                 }
2284         }
2285         return result;
2286 }
2287
2288 static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx,
2289                                         const nir_tex_instr *instr,
2290                                         bool lod_is_zero,
2291                                         struct ac_image_args *args)
2292 {
2293         if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
2294                 unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
2295
2296                 return ac_build_buffer_load_format(&ctx->ac,
2297                                                    args->resource,
2298                                                    args->addr,
2299                                                    ctx->ac.i32_0,
2300                                                    util_last_bit(mask),
2301                                                    false, true);
2302         }
2303
2304         args->opcode = ac_image_sample;
2305         args->compare = instr->is_shadow;
2306
2307         switch (instr->op) {
2308         case nir_texop_txf:
2309         case nir_texop_txf_ms:
2310         case nir_texop_samples_identical:
2311                 args->opcode = lod_is_zero ||
2312                                instr->sampler_dim == GLSL_SAMPLER_DIM_MS ?
2313                                         ac_image_load : ac_image_load_mip;
2314                 args->compare = false;
2315                 args->offset = false;
2316                 break;
2317         case nir_texop_txb:
2318                 args->bias = true;
2319                 break;
2320         case nir_texop_txl:
2321                 if (lod_is_zero)
2322                         args->level_zero = true;
2323                 else
2324                         args->lod = true;
2325                 break;
2326         case nir_texop_txs:
2327         case nir_texop_query_levels:
2328                 args->opcode = ac_image_get_resinfo;
2329                 break;
2330         case nir_texop_tex:
2331                 if (ctx->stage != MESA_SHADER_FRAGMENT)
2332                         args->level_zero = true;
2333                 break;
2334         case nir_texop_txd:
2335                 args->deriv = true;
2336                 break;
2337         case nir_texop_tg4:
2338                 args->opcode = ac_image_gather4;
2339                 args->level_zero = true;
2340                 break;
2341         case nir_texop_lod:
2342                 args->opcode = ac_image_get_lod;
2343                 args->compare = false;
2344                 args->offset = false;
2345                 break;
2346         default:
2347                 break;
2348         }
2349
2350         if (instr->op == nir_texop_tg4 && ctx->ac.chip_class <= VI) {
2351                 enum glsl_base_type stype = glsl_get_sampler_result_type(instr->texture->var->type);
2352                 if (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT) {
2353                         return radv_lower_gather4_integer(&ctx->ac, args, instr);
2354                 }
2355         }
2356         return ac_build_image_opcode(&ctx->ac, args);
2357 }
2358
2359 static LLVMValueRef visit_vulkan_resource_index(struct nir_to_llvm_context *ctx,
2360                                                 nir_intrinsic_instr *instr)
2361 {
2362         LLVMValueRef index = get_src(ctx->nir, instr->src[0]);
2363         unsigned desc_set = nir_intrinsic_desc_set(instr);
2364         unsigned binding = nir_intrinsic_binding(instr);
2365         LLVMValueRef desc_ptr = ctx->descriptor_sets[desc_set];
2366         struct radv_pipeline_layout *pipeline_layout = ctx->options->layout;
2367         struct radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
2368         unsigned base_offset = layout->binding[binding].offset;
2369         LLVMValueRef offset, stride;
2370
2371         if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
2372             layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
2373                 unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start +
2374                         layout->binding[binding].dynamic_offset_offset;
2375                 desc_ptr = ctx->push_constants;
2376                 base_offset = pipeline_layout->push_constant_size + 16 * idx;
2377                 stride = LLVMConstInt(ctx->ac.i32, 16, false);
2378         } else
2379                 stride = LLVMConstInt(ctx->ac.i32, layout->binding[binding].size, false);
2380
2381         offset = LLVMConstInt(ctx->ac.i32, base_offset, false);
2382         index = LLVMBuildMul(ctx->builder, index, stride, "");
2383         offset = LLVMBuildAdd(ctx->builder, offset, index, "");
2384         
2385         desc_ptr = ac_build_gep0(&ctx->ac, desc_ptr, offset);
2386         desc_ptr = cast_ptr(ctx, desc_ptr, ctx->ac.v4i32);
2387         LLVMSetMetadata(desc_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);
2388
2389         return desc_ptr;
2390 }
2391
2392 static LLVMValueRef visit_vulkan_resource_reindex(struct nir_to_llvm_context *ctx,
2393                                                   nir_intrinsic_instr *instr)
2394 {
2395         LLVMValueRef ptr = get_src(ctx->nir, instr->src[0]);
2396         LLVMValueRef index = get_src(ctx->nir, instr->src[1]);
2397
2398         LLVMValueRef result = LLVMBuildGEP(ctx->builder, ptr, &index, 1, "");
2399         LLVMSetMetadata(result, ctx->ac.uniform_md_kind, ctx->ac.empty_md);
2400         return result;
2401 }
2402
2403 static LLVMValueRef visit_load_push_constant(struct nir_to_llvm_context *ctx,
2404                                              nir_intrinsic_instr *instr)
2405 {
2406         LLVMValueRef ptr, addr;
2407
2408         addr = LLVMConstInt(ctx->ac.i32, nir_intrinsic_base(instr), 0);
2409         addr = LLVMBuildAdd(ctx->builder, addr, get_src(ctx->nir, instr->src[0]), "");
2410
2411         ptr = ac_build_gep0(&ctx->ac, ctx->push_constants, addr);
2412         ptr = cast_ptr(ctx, ptr, get_def_type(ctx->nir, &instr->dest.ssa));
2413
2414         return LLVMBuildLoad(ctx->builder, ptr, "");
2415 }
2416
2417 static LLVMValueRef visit_get_buffer_size(struct ac_nir_context *ctx,
2418                                           const nir_intrinsic_instr *instr)
2419 {
2420         LLVMValueRef index = get_src(ctx, instr->src[0]);
2421
2422         return get_buffer_size(ctx, ctx->abi->load_ssbo(ctx->abi, index, false), false);
2423 }
2424
/* Expand every set bit i of \p mask into a run of \p multiplier set bits
 * starting at bit i * multiplier (e.g. mask 0x5 with multiplier 2 gives
 * 0x33).
 *
 * Bits that would land at position 32 or above are dropped, and a
 * multiplier of 0 yields 0.  All shift amounts are kept below the width of
 * uint32_t, so no input triggers undefined behaviour.
 */
static uint32_t widen_mask(uint32_t mask, unsigned multiplier)
{
        uint32_t new_mask = 0;

        /* An empty run contributes nothing (and would divide by zero below). */
        if (multiplier == 0)
                return 0;

        /* Run of `multiplier` ones; saturate instead of shifting by >= 32. */
        const uint32_t run = multiplier >= 32 ? ~0u : (1u << multiplier) - 1u;

        for (unsigned i = 0; i < 32 && (1u << i) <= mask; ++i) {
                if (!(mask & (1u << i)))
                        continue;

                /* i * multiplier >= 32: this and all later runs fall off
                 * the top of the result. */
                if (i > 31u / multiplier)
                        break;

                new_mask |= run << (i * multiplier);
        }
        return new_mask;
}
2433
2434 static LLVMValueRef extract_vector_range(struct ac_llvm_context *ctx, LLVMValueRef src,
2435                                          unsigned start, unsigned count)
2436 {
2437         LLVMTypeRef type = LLVMTypeOf(src);
2438
2439         if (LLVMGetTypeKind(type) != LLVMVectorTypeKind) {
2440                 assert(start == 0);
2441                 assert(count == 1);
2442                 return src;
2443         }
2444
2445         unsigned src_elements = LLVMGetVectorSize(type);
2446         assert(start < src_elements);
2447         assert(start + count <= src_elements);
2448
2449         if (start == 0 && count == src_elements)
2450                 return src;
2451
2452         if (count == 1)
2453                 return LLVMBuildExtractElement(ctx->builder, src, LLVMConstInt(ctx->i32, start, false), "");
2454
2455         assert(count <= 8);
2456         LLVMValueRef indices[8];
2457         for (unsigned i = 0; i < count; ++i)
2458                 indices[i] = LLVMConstInt(ctx->i32, start + i, false);
2459
2460         LLVMValueRef swizzle = LLVMConstVector(indices, count);
2461         return LLVMBuildShuffleVector(ctx->builder, src, src, swizzle, "");
2462 }
2463
2464 static void visit_store_ssbo(struct ac_nir_context *ctx,
2465                              nir_intrinsic_instr *instr)
2466 {
2467         const char *store_name;
2468         LLVMValueRef src_data = get_src(ctx, instr->src[0]);
2469         LLVMTypeRef data_type = ctx->ac.f32;
2470         int elem_size_mult = get_elem_bits(&ctx->ac, LLVMTypeOf(src_data)) / 32;
2471         int components_32bit = elem_size_mult * instr->num_components;
2472         unsigned writemask = nir_intrinsic_write_mask(instr);
2473         LLVMValueRef base_data, base_offset;
2474         LLVMValueRef params[6];
2475
2476         params[1] = ctx->abi->load_ssbo(ctx->abi,
2477                                         get_src(ctx, instr->src[1]), true);
2478         params[2] = ctx->ac.i32_0; /* vindex */
2479         params[4] = ctx->ac.i1false;  /* glc */
2480         params[5] = ctx->ac.i1false;  /* slc */
2481
2482         if (components_32bit > 1)
2483                 data_type = LLVMVectorType(ctx->ac.f32, components_32bit);
2484
2485         writemask = widen_mask(writemask, elem_size_mult);
2486
2487         base_data = ac_to_float(&ctx->ac, src_data);
2488         base_data = trim_vector(&ctx->ac, base_data, instr->num_components);
2489         base_data = LLVMBuildBitCast(ctx->ac.builder, base_data,
2490                                      data_type, "");
2491         base_offset = get_src(ctx, instr->src[2]);      /* voffset */
2492         while (writemask) {
2493                 int start, count;
2494                 LLVMValueRef data;
2495                 LLVMValueRef offset;
2496
2497                 u_bit_scan_consecutive_range(&writemask, &start, &count);
2498
2499                 /* Due to an LLVM limitation, split 3-element writes
2500                  * into a 2-element and a 1-element write. */
2501                 if (count == 3) {
2502                         writemask |= 1 << (start + 2);
2503                         count = 2;
2504                 }
2505
2506                 if (count > 4) {
2507                         writemask |= ((1u << (count - 4)) - 1u) << (start + 4);
2508                         count = 4;
2509                 }
2510
2511                 if (count == 4) {
2512                         store_name = "llvm.amdgcn.buffer.store.v4f32";
2513                 } else if (count == 2) {
2514                         store_name = "llvm.amdgcn.buffer.store.v2f32";
2515
2516                 } else {
2517                         assert(count == 1);
2518                         store_name = "llvm.amdgcn.buffer.store.f32";
2519                 }
2520                 data = extract_vector_range(&ctx->ac, base_data, start, count);
2521
2522                 offset = base_offset;
2523                 if (start != 0) {
2524                         offset = LLVMBuildAdd(ctx->ac.builder, offset, LLVMConstInt(ctx->ac.i32, start * 4, false), "");
2525                 }
2526                 params[0] = data;
2527                 params[3] = offset;
2528                 ac_build_intrinsic(&ctx->ac, store_name,
2529                                    ctx->ac.voidt, params, 6, 0);
2530         }
2531 }
2532
2533 static LLVMValueRef visit_atomic_ssbo(struct ac_nir_context *ctx,
2534                                       const nir_intrinsic_instr *instr)
2535 {
2536         const char *name;
2537         LLVMValueRef params[6];
2538         int arg_count = 0;
2539
2540         if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap) {
2541                 params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[3]), 0);
2542         }
2543         params[arg_count++] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0);
2544         params[arg_count++] = ctx->abi->load_ssbo(ctx->abi,
2545                                                  get_src(ctx, instr->src[0]),
2546                                                  true);
2547         params[arg_count++] = ctx->ac.i32_0; /* vindex */
2548         params[arg_count++] = get_src(ctx, instr->src[1]);      /* voffset */
2549         params[arg_count++] = LLVMConstInt(ctx->ac.i1, 0, false);  /* slc */
2550
2551         switch (instr->intrinsic) {
2552         case nir_intrinsic_ssbo_atomic_add:
2553                 name = "llvm.amdgcn.buffer.atomic.add";
2554                 break;
2555         case nir_intrinsic_ssbo_atomic_imin:
2556                 name = "llvm.amdgcn.buffer.atomic.smin";
2557                 break;
2558         case nir_intrinsic_ssbo_atomic_umin:
2559                 name = "llvm.amdgcn.buffer.atomic.umin";
2560                 break;
2561         case nir_intrinsic_ssbo_atomic_imax:
2562                 name = "llvm.amdgcn.buffer.atomic.smax";
2563                 break;
2564         case nir_intrinsic_ssbo_atomic_umax:
2565                 name = "llvm.amdgcn.buffer.atomic.umax";
2566                 break;
2567         case nir_intrinsic_ssbo_atomic_and:
2568                 name = "llvm.amdgcn.buffer.atomic.and";
2569                 break;
2570         case nir_intrinsic_ssbo_atomic_or:
2571                 name = "llvm.amdgcn.buffer.atomic.or";
2572                 break;
2573         case nir_intrinsic_ssbo_atomic_xor:
2574                 name = "llvm.amdgcn.buffer.atomic.xor";
2575                 break;
2576         case nir_intrinsic_ssbo_atomic_exchange:
2577                 name = "llvm.amdgcn.buffer.atomic.swap";
2578                 break;
2579         case nir_intrinsic_ssbo_atomic_comp_swap:
2580                 name = "llvm.amdgcn.buffer.atomic.cmpswap";
2581                 break;
2582         default:
2583                 abort();
2584         }
2585
2586         return ac_build_intrinsic(&ctx->ac, name, ctx->ac.i32, params, arg_count, 0);
2587 }
2588
2589 static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx,
2590                                       const nir_intrinsic_instr *instr)
2591 {
2592         LLVMValueRef results[2];
2593         int load_components;
2594         int num_components = instr->num_components;
2595         if (instr->dest.ssa.bit_size == 64)
2596                 num_components *= 2;
2597
2598         for (int i = 0; i < num_components; i += load_components) {
2599                 load_components = MIN2(num_components - i, 4);
2600                 const char *load_name;
2601                 LLVMTypeRef data_type = ctx->ac.f32;
2602                 LLVMValueRef offset = LLVMConstInt(ctx->ac.i32, i * 4, false);
2603                 offset = LLVMBuildAdd(ctx->ac.builder, get_src(ctx, instr->src[1]), offset, "");
2604
2605                 if (load_components == 3)
2606                         data_type = LLVMVectorType(ctx->ac.f32, 4);
2607                 else if (load_components > 1)
2608                         data_type = LLVMVectorType(ctx->ac.f32, load_components);
2609
2610                 if (load_components >= 3)
2611                         load_name = "llvm.amdgcn.buffer.load.v4f32";
2612                 else if (load_components == 2)
2613                         load_name = "llvm.amdgcn.buffer.load.v2f32";
2614                 else if (load_components == 1)
2615                         load_name = "llvm.amdgcn.buffer.load.f32";
2616                 else
2617                         unreachable("unhandled number of components");
2618
2619                 LLVMValueRef params[] = {
2620                         ctx->abi->load_ssbo(ctx->abi,
2621                                             get_src(ctx, instr->src[0]),
2622                                             false),
2623                         ctx->ac.i32_0,
2624                         offset,
2625                         ctx->ac.i1false,
2626                         ctx->ac.i1false,
2627                 };
2628
2629                 results[i > 0 ? 1 : 0] = ac_build_intrinsic(&ctx->ac, load_name, data_type, params, 5, 0);
2630         }
2631
2632         assume(results[0]);
2633         LLVMValueRef ret = results[0];
2634         if (num_components > 4 || num_components == 3) {
2635                 LLVMValueRef masks[] = {
2636                         LLVMConstInt(ctx->ac.i32, 0, false), LLVMConstInt(ctx->ac.i32, 1, false),
2637                         LLVMConstInt(ctx->ac.i32, 2, false), LLVMConstInt(ctx->ac.i32, 3, false),
2638                         LLVMConstInt(ctx->ac.i32, 4, false), LLVMConstInt(ctx->ac.i32, 5, false),
2639                         LLVMConstInt(ctx->ac.i32, 6, false), LLVMConstInt(ctx->ac.i32, 7, false)
2640                 };
2641
2642                 LLVMValueRef swizzle = LLVMConstVector(masks, num_components);
2643                 ret = LLVMBuildShuffleVector(ctx->ac.builder, results[0],
2644                                              results[num_components > 4 ? 1 : 0], swizzle, "");
2645         }
2646
2647         return LLVMBuildBitCast(ctx->ac.builder, ret,
2648                                 get_def_type(ctx, &instr->dest.ssa), "");
2649 }
2650
2651 static LLVMValueRef visit_load_ubo_buffer(struct ac_nir_context *ctx,
2652                                           const nir_intrinsic_instr *instr)
2653 {
2654         LLVMValueRef ret;
2655         LLVMValueRef rsrc = get_src(ctx, instr->src[0]);
2656         LLVMValueRef offset = get_src(ctx, instr->src[1]);
2657         int num_components = instr->num_components;
2658
2659         if (ctx->abi->load_ubo)
2660                 rsrc = ctx->abi->load_ubo(ctx->abi, rsrc);
2661
2662         if (instr->dest.ssa.bit_size == 64)
2663                 num_components *= 2;
2664
2665         ret = ac_build_buffer_load(&ctx->ac, rsrc, num_components, NULL, offset,
2666                                    NULL, 0, false, false, true, true);
2667         ret = trim_vector(&ctx->ac, ret, num_components);
2668         return LLVMBuildBitCast(ctx->ac.builder, ret,
2669                                 get_def_type(ctx, &instr->dest.ssa), "");
2670 }
2671
2672 static void
2673 get_deref_offset(struct ac_nir_context *ctx, nir_deref_var *deref,
2674                  bool vs_in, unsigned *vertex_index_out,
2675                  LLVMValueRef *vertex_index_ref,
2676                  unsigned *const_out, LLVMValueRef *indir_out)
2677 {
2678         unsigned const_offset = 0;
2679         nir_deref *tail = &deref->deref;
2680         LLVMValueRef offset = NULL;
2681
2682         if (vertex_index_out != NULL || vertex_index_ref != NULL) {
2683                 tail = tail->child;
2684                 nir_deref_array *deref_array = nir_deref_as_array(tail);
2685                 if (vertex_index_out)
2686                         *vertex_index_out = deref_array->base_offset;
2687
2688                 if (vertex_index_ref) {
2689                         LLVMValueRef vtx = LLVMConstInt(ctx->ac.i32, deref_array->base_offset, false);
2690                         if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
2691                                 vtx = LLVMBuildAdd(ctx->ac.builder, vtx, get_src(ctx, deref_array->indirect), "");
2692                         }
2693                         *vertex_index_ref = vtx;
2694                 }
2695         }
2696
2697         if (deref->var->data.compact) {
2698                 assert(tail->child->deref_type == nir_deref_type_array);
2699                 assert(glsl_type_is_scalar(glsl_without_array(deref->var->type)));
2700                 nir_deref_array *deref_array = nir_deref_as_array(tail->child);
2701                 /* We always lower indirect dereferences for "compact" array vars. */
2702                 assert(deref_array->deref_array_type == nir_deref_array_type_direct);
2703
2704                 const_offset = deref_array->base_offset;
2705                 goto out;
2706         }
2707
2708         while (tail->child != NULL) {
2709                 const struct glsl_type *parent_type = tail->type;
2710                 tail = tail->child;
2711
2712                 if (tail->deref_type == nir_deref_type_array) {
2713                         nir_deref_array *deref_array = nir_deref_as_array(tail);
2714                         LLVMValueRef index, stride, local_offset;
2715                         unsigned size = glsl_count_attribute_slots(tail->type, vs_in);
2716
2717                         const_offset += size * deref_array->base_offset;
2718                         if (deref_array->deref_array_type == nir_deref_array_type_direct)
2719                                 continue;
2720
2721                         assert(deref_array->deref_array_type == nir_deref_array_type_indirect);
2722                         index = get_src(ctx, deref_array->indirect);
2723                         stride = LLVMConstInt(ctx->ac.i32, size, 0);
2724                         local_offset = LLVMBuildMul(ctx->ac.builder, stride, index, "");
2725
2726                         if (offset)
2727                                 offset = LLVMBuildAdd(ctx->ac.builder, offset, local_offset, "");
2728                         else
2729                                 offset = local_offset;
2730                 } else if (tail->deref_type == nir_deref_type_struct) {
2731                         nir_deref_struct *deref_struct = nir_deref_as_struct(tail);
2732
2733                         for (unsigned i = 0; i < deref_struct->index; i++) {
2734                                 const struct glsl_type *ft = glsl_get_struct_field(parent_type, i);
2735                                 const_offset += glsl_count_attribute_slots(ft, vs_in);
2736                         }
2737                 } else
2738                         unreachable("unsupported deref type");
2739
2740         }
2741 out:
2742         if (const_offset && offset)
2743                 offset = LLVMBuildAdd(ctx->ac.builder, offset,
2744                                       LLVMConstInt(ctx->ac.i32, const_offset, 0),
2745                                       "");
2746
2747         *const_out = const_offset;
2748         *indir_out = offset;
2749 }
2750
2751
2752 /* The offchip buffer layout for TCS->TES is
2753  *
2754  * - attribute 0 of patch 0 vertex 0
2755  * - attribute 0 of patch 0 vertex 1
2756  * - attribute 0 of patch 0 vertex 2
2757  *   ...
2758  * - attribute 0 of patch 1 vertex 0
2759  * - attribute 0 of patch 1 vertex 1
2760  *   ...
2761  * - attribute 1 of patch 0 vertex 0
2762  * - attribute 1 of patch 0 vertex 1
2763  *   ...
2764  * - per patch attribute 0 of patch 0
2765  * - per patch attribute 0 of patch 1
2766  *   ...
2767  *
2768  * Note that every attribute has 4 components.
2769  */
2770 static LLVMValueRef get_tcs_tes_buffer_address(struct nir_to_llvm_context *ctx,
2771                                                LLVMValueRef vertex_index,
2772                                                LLVMValueRef param_index)
2773 {
2774         LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices;
2775         LLVMValueRef param_stride, constant16;
2776         LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
2777
2778         vertices_per_patch = unpack_param(&ctx->ac, ctx->tcs_offchip_layout, 9, 6);
2779         num_patches = unpack_param(&ctx->ac, ctx->tcs_offchip_layout, 0, 9);
2780         total_vertices = LLVMBuildMul(ctx->builder, vertices_per_patch,
2781                                       num_patches, "");
2782
2783         constant16 = LLVMConstInt(ctx->ac.i32, 16, false);
2784         if (vertex_index) {
2785                 base_addr = LLVMBuildMul(ctx->builder, rel_patch_id,
2786                                          vertices_per_patch, "");
2787
2788                 base_addr = LLVMBuildAdd(ctx->builder, base_addr,
2789                                          vertex_index, "");
2790
2791                 param_stride = total_vertices;
2792         } else {
2793                 base_addr = rel_patch_id;
2794                 param_stride = num_patches;
2795         }
2796
2797         base_addr = LLVMBuildAdd(ctx->builder, base_addr,
2798                                  LLVMBuildMul(ctx->builder, param_index,
2799                                               param_stride, ""), "");
2800
2801         base_addr = LLVMBuildMul(ctx->builder, base_addr, constant16, "");
2802
2803         if (!vertex_index) {
2804                 LLVMValueRef patch_data_offset =
2805                            unpack_param(&ctx->ac, ctx->tcs_offchip_layout, 16, 16);
2806
2807                 base_addr = LLVMBuildAdd(ctx->builder, base_addr,
2808                                          patch_data_offset, "");
2809         }
2810         return base_addr;
2811 }
2812
2813 static LLVMValueRef get_tcs_tes_buffer_address_params(struct nir_to_llvm_context *ctx,
2814                                                       unsigned param,
2815                                                       unsigned const_index,
2816                                                       bool is_compact,
2817                                                       LLVMValueRef vertex_index,
2818                                                       LLVMValueRef indir_index)
2819 {
2820         LLVMValueRef param_index;
2821
2822         if (indir_index)
2823                 param_index = LLVMBuildAdd(ctx->builder, LLVMConstInt(ctx->ac.i32, param, false),
2824                                            indir_index, "");
2825         else {
2826                 if (const_index && !is_compact)
2827                         param += const_index;
2828                 param_index = LLVMConstInt(ctx->ac.i32, param, false);
2829         }
2830         return get_tcs_tes_buffer_address(ctx, vertex_index, param_index);
2831 }
2832
2833 static void
2834 mark_tess_output(struct nir_to_llvm_context *ctx,
2835                  bool is_patch, uint32_t param)
2836
2837 {
2838         if (is_patch) {
2839                 ctx->tess_patch_outputs_written |= (1ull << param);
2840         } else
2841                 ctx->tess_outputs_written |= (1ull << param);
2842 }
2843
2844 static LLVMValueRef
2845 get_dw_address(struct nir_to_llvm_context *ctx,
2846                LLVMValueRef dw_addr,
2847                unsigned param,
2848                unsigned const_index,
2849                bool compact_const_index,
2850                LLVMValueRef vertex_index,
2851                LLVMValueRef stride,
2852                LLVMValueRef indir_index)
2853
2854 {
2855
2856         if (vertex_index) {
2857                 dw_addr = LLVMBuildAdd(ctx->builder, dw_addr,
2858                                        LLVMBuildMul(ctx->builder,
2859                                                     vertex_index,
2860                                                     stride, ""), "");
2861         }
2862
2863         if (indir_index)
2864                 dw_addr = LLVMBuildAdd(ctx->builder, dw_addr,
2865                                        LLVMBuildMul(ctx->builder, indir_index,
2866                                                     LLVMConstInt(ctx->ac.i32, 4, false), ""), "");
2867         else if (const_index && !compact_const_index)
2868                 dw_addr = LLVMBuildAdd(ctx->builder, dw_addr,
2869                                        LLVMConstInt(ctx->ac.i32, const_index, false), "");
2870
2871         dw_addr = LLVMBuildAdd(ctx->builder, dw_addr,
2872                                LLVMConstInt(ctx->ac.i32, param * 4, false), "");
2873
2874         if (const_index && compact_const_index)
2875                 dw_addr = LLVMBuildAdd(ctx->builder, dw_addr,
2876                                        LLVMConstInt(ctx->ac.i32, const_index, false), "");
2877         return dw_addr;
2878 }
2879
2880 static LLVMValueRef
2881 load_tcs_varyings(struct ac_shader_abi *abi,
2882                   LLVMValueRef vertex_index,
2883                   LLVMValueRef indir_index,
2884                   unsigned const_index,
2885                   unsigned location,
2886                   unsigned driver_location,
2887                   unsigned component,
2888                   unsigned num_components,
2889                   bool is_patch,
2890                   bool is_compact,
2891                   bool load_input)
2892 {
2893         struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi);
2894         LLVMValueRef dw_addr, stride;
2895         LLVMValueRef value[4], result;
2896         unsigned param = shader_io_get_unique_index(location);
2897
2898         if (load_input) {
2899                 stride = unpack_param(&ctx->ac, ctx->tcs_in_layout, 13, 8);
2900                 dw_addr = get_tcs_in_current_patch_offset(ctx);
2901         } else {
2902                 if (!is_patch) {
2903                         stride = unpack_param(&ctx->ac, ctx->tcs_out_layout, 13, 8);
2904                         dw_addr = get_tcs_out_current_patch_offset(ctx);
2905                 } else {
2906                         dw_addr = get_tcs_out_current_patch_data_offset(ctx);
2907                         stride = NULL;
2908                 }
2909         }
2910
2911         dw_addr = get_dw_address(ctx, dw_addr, param, const_index, is_compact, vertex_index, stride,
2912                                  indir_index);
2913
2914         for (unsigned i = 0; i < num_components + component; i++) {
2915                 value[i] = ac_lds_load(&ctx->ac, dw_addr);
2916                 dw_addr = LLVMBuildAdd(ctx->builder, dw_addr,
2917                                        ctx->ac.i32_1, "");
2918         }
2919         result = ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
2920         return result;
2921 }
2922
2923 static void
2924 store_tcs_output(struct ac_shader_abi *abi,
2925                  LLVMValueRef vertex_index,
2926                  LLVMValueRef param_index,
2927                  unsigned const_index,
2928                  unsigned location,
2929                  unsigned driver_location,
2930                  LLVMValueRef src,
2931                  unsigned component,
2932                  bool is_patch,
2933                  bool is_compact,
2934                  unsigned writemask)
2935 {
2936         struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi);
2937         LLVMValueRef dw_addr;
2938         LLVMValueRef stride = NULL;
2939         LLVMValueRef buf_addr = NULL;
2940         unsigned param;
2941         bool store_lds = true;
2942
2943         if (is_patch) {
2944                 if (!(ctx->tcs_patch_outputs_read & (1U << (location - VARYING_SLOT_PATCH0))))
2945                         store_lds = false;
2946         } else {
2947                 if (!(ctx->tcs_outputs_read & (1ULL << location)))
2948                         store_lds = false;
2949         }
2950
2951         param = shader_io_get_unique_index(location);
2952         if (location == VARYING_SLOT_CLIP_DIST0 &&
2953             is_compact && const_index > 3) {
2954                 const_index -= 3;
2955                 param++;
2956         }
2957
2958         if (!is_patch) {
2959                 stride = unpack_param(&ctx->ac, ctx->tcs_out_layout, 13, 8);
2960                 dw_addr = get_tcs_out_current_patch_offset(ctx);
2961         } else {
2962                 dw_addr = get_tcs_out_current_patch_data_offset(ctx);
2963         }
2964
2965         mark_tess_output(ctx, is_patch, param);
2966
2967         dw_addr = get_dw_address(ctx, dw_addr, param, const_index, is_compact, vertex_index, stride,
2968                                  param_index);
2969         buf_addr = get_tcs_tes_buffer_address_params(ctx, param, const_index, is_compact,
2970                                                      vertex_index, param_index);
2971
2972         bool is_tess_factor = false;
2973         if (location == VARYING_SLOT_TESS_LEVEL_INNER ||
2974             location == VARYING_SLOT_TESS_LEVEL_OUTER)
2975                 is_tess_factor = true;
2976
2977         unsigned base = is_compact ? const_index : 0;
2978         for (unsigned chan = 0; chan < 8; chan++) {
2979                 if (!(writemask & (1 << chan)))
2980                         continue;
2981                 LLVMValueRef value = ac_llvm_extract_elem(&ctx->ac, src, chan - component);
2982
2983                 if (store_lds || is_tess_factor) {
2984                         LLVMValueRef dw_addr_chan =
2985                                 LLVMBuildAdd(ctx->builder, dw_addr,
2986                                                            LLVMConstInt(ctx->ac.i32, chan, false), "");
2987                         ac_lds_store(&ctx->ac, dw_addr_chan, value);
2988                 }
2989
2990                 if (!is_tess_factor && writemask != 0xF)
2991                         ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, value, 1,
2992                                                     buf_addr, ctx->oc_lds,
2993                                                     4 * (base + chan), 1, 0, true, false);
2994         }
2995
2996         if (writemask == 0xF) {
2997                 ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, src, 4,
2998                                             buf_addr, ctx->oc_lds,
2999                                             (base * 4), 1, 0, true, false);
3000         }
3001 }
3002
3003 static LLVMValueRef
3004 load_tes_input(struct ac_shader_abi *abi,
3005                LLVMValueRef vertex_index,
3006                LLVMValueRef param_index,
3007                unsigned const_index,
3008                unsigned location,
3009                unsigned driver_location,
3010                unsigned component,
3011                unsigned num_components,
3012                bool is_patch,
3013                bool is_compact,
3014                bool load_input)
3015 {
3016         struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi);
3017         LLVMValueRef buf_addr;
3018         LLVMValueRef result;
3019         unsigned param = shader_io_get_unique_index(location);
3020
3021         if (location == VARYING_SLOT_CLIP_DIST0 && is_compact && const_index > 3) {
3022                 const_index -= 3;
3023                 param++;
3024         }
3025
3026         buf_addr = get_tcs_tes_buffer_address_params(ctx, param, const_index,
3027                                                      is_compact, vertex_index, param_index);
3028
3029         LLVMValueRef comp_offset = LLVMConstInt(ctx->ac.i32, component * 4, false);
3030         buf_addr = LLVMBuildAdd(ctx->builder, buf_addr, comp_offset, "");
3031
3032         result = ac_build_buffer_load(&ctx->ac, ctx->hs_ring_tess_offchip, num_components, NULL,
3033                                       buf_addr, ctx->oc_lds, is_compact ? (4 * const_index) : 0, 1, 0, true, false);
3034         result = trim_vector(&ctx->ac, result, num_components);
3035         return result;
3036 }
3037
3038 static LLVMValueRef
3039 load_gs_input(struct ac_shader_abi *abi,
3040               unsigned location,
3041               unsigned driver_location,
3042               unsigned component,
3043               unsigned num_components,
3044               unsigned vertex_index,
3045               unsigned const_index,
3046               LLVMTypeRef type)
3047 {
3048         struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi);
3049         LLVMValueRef vtx_offset;
3050         unsigned param, vtx_offset_param;
3051         LLVMValueRef value[4], result;
3052
3053         vtx_offset_param = vertex_index;
3054         assert(vtx_offset_param < 6);
3055         vtx_offset = LLVMBuildMul(ctx->builder, ctx->gs_vtx_offset[vtx_offset_param],
3056                                   LLVMConstInt(ctx->ac.i32, 4, false), "");
3057
3058         param = shader_io_get_unique_index(location);
3059
3060         for (unsigned i = component; i < num_components + component; i++) {
3061                 if (ctx->ac.chip_class >= GFX9) {
3062                         LLVMValueRef dw_addr = ctx->gs_vtx_offset[vtx_offset_param];
3063                         dw_addr = LLVMBuildAdd(ctx->ac.builder, dw_addr,
3064                                                LLVMConstInt(ctx->ac.i32, param * 4 + i + const_index, 0), "");
3065                         value[i] = ac_lds_load(&ctx->ac, dw_addr);
3066                 } else {
3067                         LLVMValueRef soffset =
3068                                 LLVMConstInt(ctx->ac.i32,
3069                                              (param * 4 + i + const_index) * 256,
3070                                              false);
3071
3072                         value[i] = ac_build_buffer_load(&ctx->ac,
3073                                                         ctx->esgs_ring, 1,
3074                                                         ctx->ac.i32_0,
3075                                                         vtx_offset, soffset,
3076                                                         0, 1, 0, true, false);
3077                 }
3078         }
3079         result = ac_build_varying_gather_values(&ctx->ac, value, num_components, component);
3080
3081         return result;
3082 }
3083
3084 static LLVMValueRef
3085 build_gep_for_deref(struct ac_nir_context *ctx,
3086                     nir_deref_var *deref)
3087 {
3088         struct hash_entry *entry = _mesa_hash_table_search(ctx->vars, deref->var);
3089         assert(entry->data);
3090         LLVMValueRef val = entry->data;
3091         nir_deref *tail = deref->deref.child;
3092         while (tail != NULL) {
3093                 LLVMValueRef offset;
3094                 switch (tail->deref_type) {
3095                 case nir_deref_type_array: {
3096                         nir_deref_array *array = nir_deref_as_array(tail);
3097                         offset = LLVMConstInt(ctx->ac.i32, array->base_offset, 0);
3098                         if (array->deref_array_type ==
3099                             nir_deref_array_type_indirect) {
3100                                 offset = LLVMBuildAdd(ctx->ac.builder, offset,
3101                                                       get_src(ctx,
3102                                                               array->indirect),
3103                                                       "");
3104                         }
3105                         break;
3106                 }
3107                 case nir_deref_type_struct: {
3108                         nir_deref_struct *deref_struct =
3109                                 nir_deref_as_struct(tail);
3110                         offset = LLVMConstInt(ctx->ac.i32,
3111                                               deref_struct->index, 0);
3112                         break;
3113                 }
3114                 default:
3115                         unreachable("bad deref type");
3116                 }
3117                 val = ac_build_gep0(&ctx->ac, val, offset);
3118                 tail = tail->child;
3119         }
3120         return val;
3121 }
3122
3123 static LLVMValueRef load_tess_varyings(struct ac_nir_context *ctx,
3124                                        nir_intrinsic_instr *instr,
3125                                        bool load_inputs)
3126 {
3127         LLVMValueRef result;
3128         LLVMValueRef vertex_index = NULL;
3129         LLVMValueRef indir_index = NULL;
3130         unsigned const_index = 0;
3131         unsigned location = instr->variables[0]->var->data.location;
3132         unsigned driver_location = instr->variables[0]->var->data.driver_location;
3133         const bool is_patch =  instr->variables[0]->var->data.patch;
3134         const bool is_compact = instr->variables[0]->var->data.compact;
3135
3136         get_deref_offset(ctx, instr->variables[0],
3137                          false, NULL, is_patch ? NULL : &vertex_index,
3138                          &const_index, &indir_index);
3139
3140         result = ctx->abi->load_tess_varyings(ctx->abi, vertex_index, indir_index,
3141                                               const_index, location, driver_location,
3142                                               instr->variables[0]->var->data.location_frac,
3143                                               instr->num_components,
3144                                               is_patch, is_compact, load_inputs);
3145         return LLVMBuildBitCast(ctx->ac.builder, result, get_def_type(ctx, &instr->dest.ssa), "");
3146 }
3147
3148 static LLVMValueRef visit_load_var(struct ac_nir_context *ctx,
3149                                    nir_intrinsic_instr *instr)
3150 {
3151         LLVMValueRef values[8];
3152         int idx = instr->variables[0]->var->data.driver_location;
3153         int ve = instr->dest.ssa.num_components;
3154         unsigned comp = instr->variables[0]->var->data.location_frac;
3155         LLVMValueRef indir_index;
3156         LLVMValueRef ret;
3157         unsigned const_index;
3158         unsigned stride = instr->variables[0]->var->data.compact ? 1 : 4;
3159         bool vs_in = ctx->stage == MESA_SHADER_VERTEX &&
3160                      instr->variables[0]->var->data.mode == nir_var_shader_in;
3161         get_deref_offset(ctx, instr->variables[0], vs_in, NULL, NULL,
3162                                       &const_index, &indir_index);
3163
3164         if (instr->dest.ssa.bit_size == 64)
3165                 ve *= 2;
3166
3167         switch (instr->variables[0]->var->data.mode) {
3168         case nir_var_shader_in:
3169                 if (ctx->stage == MESA_SHADER_TESS_CTRL ||
3170                     ctx->stage == MESA_SHADER_TESS_EVAL) {
3171                         return load_tess_varyings(ctx, instr, true);
3172                 }
3173
3174                 if (ctx->stage == MESA_SHADER_GEOMETRY) {
3175                         LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, instr->dest.ssa.bit_size);
3176                         LLVMValueRef indir_index;
3177                         unsigned const_index, vertex_index;
3178                         get_deref_offset(ctx, instr->variables[0],
3179                                          false, &vertex_index, NULL,
3180                                          &const_index, &indir_index);
3181
3182                         return ctx->abi->load_inputs(ctx->abi, instr->variables[0]->var->data.location,
3183                                                      instr->variables[0]->var->data.driver_location,
3184                                                      instr->variables[0]->var->data.location_frac, ve,
3185                                                      vertex_index, const_index, type);
3186                 }
3187
3188                 for (unsigned chan = comp; chan < ve + comp; chan++) {
3189                         if (indir_index) {
3190                                 unsigned count = glsl_count_attribute_slots(
3191                                                 instr->variables[0]->var->type,
3192                                                 ctx->stage == MESA_SHADER_VERTEX);
3193                                 count -= chan / 4;
3194                                 LLVMValueRef tmp_vec = ac_build_gather_values_extended(
3195                                                 &ctx->ac, ctx->abi->inputs + idx + chan, count,
3196                                                 stride, false, true);
3197
3198                                 values[chan] = LLVMBuildExtractElement(ctx->ac.builder,
3199                                                                        tmp_vec,
3200                                                                        indir_index, "");
3201                         } else
3202                                 values[chan] = ctx->abi->inputs[idx + chan + const_index * stride];
3203                 }
3204                 break;
3205         case nir_var_local:
3206                 for (unsigned chan = 0; chan < ve; chan++) {
3207                         if (indir_index) {
3208                                 unsigned count = glsl_count_attribute_slots(
3209                                         instr->variables[0]->var->type, false);
3210                                 count -= chan / 4;
3211                                 LLVMValueRef tmp_vec = ac_build_gather_values_extended(
3212                                                 &ctx->ac, ctx->locals + idx + chan, count,
3213                                                 stride, true, true);
3214
3215                                 values[chan] = LLVMBuildExtractElement(ctx->ac.builder,
3216                                                                        tmp_vec,
3217                                                                        indir_index, "");
3218                         } else {
3219                                 values[chan] = LLVMBuildLoad(ctx->ac.builder, ctx->locals[idx + chan + const_index * stride], "");
3220                         }
3221                 }
3222                 break;
3223         case nir_var_shared: {
3224                 LLVMValueRef address = build_gep_for_deref(ctx,
3225                                                            instr->variables[0]);
3226                 LLVMValueRef val = LLVMBuildLoad(ctx->ac.builder, address, "");
3227                 return LLVMBuildBitCast(ctx->ac.builder, val,
3228                                         get_def_type(ctx, &instr->dest.ssa),
3229                                         "");
3230         }
3231         case nir_var_shader_out:
3232                 if (ctx->stage == MESA_SHADER_TESS_CTRL) {
3233                         return load_tess_varyings(ctx, instr, false);
3234                 }
3235
3236                 for (unsigned chan = comp; chan < ve + comp; chan++) {
3237                         if (indir_index) {
3238                                 unsigned count = glsl_count_attribute_slots(
3239                                                 instr->variables[0]->var->type, false);
3240                                 count -= chan / 4;
3241                                 LLVMValueRef tmp_vec = ac_build_gather_values_extended(
3242                                                 &ctx->ac, ctx->outputs + idx + chan, count,
3243                                                 stride, true, true);
3244
3245                                 values[chan] = LLVMBuildExtractElement(ctx->ac.builder,
3246                                                                        tmp_vec,
3247                                                                        indir_index, "");
3248                         } else {
3249                                 values[chan] = LLVMBuildLoad(ctx->ac.builder,
3250                                                      ctx->outputs[idx + chan + const_index * stride],
3251                                                      "");
3252                         }
3253                 }
3254                 break;
3255         default:
3256                 unreachable("unhandle variable mode");
3257         }
3258         ret = ac_build_varying_gather_values(&ctx->ac, values, ve, comp);
3259         return LLVMBuildBitCast(ctx->ac.builder, ret, get_def_type(ctx, &instr->dest.ssa), "");
3260 }
3261
3262 static void
3263 visit_store_var(struct ac_nir_context *ctx,
3264                 nir_intrinsic_instr *instr)
3265 {
3266         LLVMValueRef temp_ptr, value;
3267         int idx = instr->variables[0]->var->data.driver_location;
3268         unsigned comp = instr->variables[0]->var->data.location_frac;
3269         LLVMValueRef src = ac_to_float(&ctx->ac, get_src(ctx, instr->src[0]));
3270         int writemask = instr->const_index[0] << comp;
3271         LLVMValueRef indir_index;
3272         unsigned const_index;
3273         get_deref_offset(ctx, instr->variables[0], false,
3274                          NULL, NULL, &const_index, &indir_index);
3275
3276         if (get_elem_bits(&ctx->ac, LLVMTypeOf(src)) == 64) {
3277
3278                 src = LLVMBuildBitCast(ctx->ac.builder, src,
3279                                        LLVMVectorType(ctx->ac.f32, ac_get_llvm_num_components(src) * 2),
3280                                        "");
3281
3282                 writemask = widen_mask(writemask, 2);
3283         }
3284
3285         switch (instr->variables[0]->var->data.mode) {
3286         case nir_var_shader_out:
3287
3288                 if (ctx->stage == MESA_SHADER_TESS_CTRL) {
3289                         LLVMValueRef vertex_index = NULL;
3290                         LLVMValueRef indir_index = NULL;
3291                         unsigned const_index = 0;
3292                         const unsigned location = instr->variables[0]->var->data.location;
3293                         const unsigned driver_location = instr->variables[0]->var->data.driver_location;
3294                         const unsigned comp = instr->variables[0]->var->data.location_frac;
3295                         const bool is_patch = instr->variables[0]->var->data.patch;
3296                         const bool is_compact = instr->variables[0]->var->data.compact;
3297
3298                         get_deref_offset(ctx, instr->variables[0],
3299                                          false, NULL, is_patch ? NULL : &vertex_index,
3300                                          &const_index, &indir_index);
3301
3302                         ctx->abi->store_tcs_outputs(ctx->abi, vertex_index, indir_index,
3303                                                     const_index, location, driver_location,
3304                                                     src, comp, is_patch, is_compact, writemask);
3305                         return;
3306                 }
3307
3308                 for (unsigned chan = 0; chan < 8; chan++) {
3309                         int stride = 4;
3310                         if (!(writemask & (1 << chan)))
3311                                 continue;
3312
3313                         value = ac_llvm_extract_elem(&ctx->ac, src, chan - comp);
3314
3315                         if (instr->variables[0]->var->data.compact)
3316                                 stride = 1;
3317                         if (indir_index) {
3318                                 unsigned count = glsl_count_attribute_slots(
3319                                                 instr->variables[0]->var->type, false);
3320                                 count -= chan / 4;
3321                                 LLVMValueRef tmp_vec = ac_build_gather_values_extended(
3322                                                 &ctx->ac, ctx->outputs + idx + chan, count,
3323                                                 stride, true, true);
3324
3325                                 tmp_vec = LLVMBuildInsertElement(ctx->ac.builder, tmp_vec,
3326                                                                  value, indir_index, "");
3327                                 build_store_values_extended(&ctx->ac, ctx->outputs + idx + chan,
3328                                                             count, stride, tmp_vec);
3329
3330                         } else {
3331                                 temp_ptr = ctx->outputs[idx + chan + const_index * stride];
3332
3333                                 LLVMBuildStore(ctx->ac.builder, value, temp_ptr);
3334                         }
3335                 }
3336                 break;
3337         case nir_var_local:
3338                 for (unsigned chan = 0; chan < 8; chan++) {
3339                         if (!(writemask & (1 << chan)))
3340                                 continue;
3341
3342                         value = ac_llvm_extract_elem(&ctx->ac, src, chan);
3343                         if (indir_index) {
3344                                 unsigned count = glsl_count_attribute_slots(
3345                                         instr->variables[0]->var->type, false);
3346                                 count -= chan / 4;
3347                                 LLVMValueRef tmp_vec = ac_build_gather_values_extended(
3348                                         &ctx->ac, ctx->locals + idx + chan, count,
3349                                         4, true, true);
3350
3351                                 tmp_vec = LLVMBuildInsertElement(ctx->ac.builder, tmp_vec,
3352                                                                  value, indir_index, "");
3353                                 build_store_values_extended(&ctx->ac, ctx->locals + idx + chan,
3354                                                             count, 4, tmp_vec);
3355                         } else {
3356                                 temp_ptr = ctx->locals[idx + chan + const_index * 4];
3357
3358                                 LLVMBuildStore(ctx->ac.builder, value, temp_ptr);
3359                         }
3360                 }
3361                 break;
3362         case nir_var_shared: {
3363                 int writemask = instr->const_index[0];
3364                 LLVMValueRef address = build_gep_for_deref(ctx,
3365                                                            instr->variables[0]);
3366                 LLVMValueRef val = get_src(ctx, instr->src[0]);
3367                 unsigned components =
3368                         glsl_get_vector_elements(
3369                            nir_deref_tail(&instr->variables[0]->deref)->type);
3370                 if (writemask == (1 << components) - 1) {
3371                         val = LLVMBuildBitCast(
3372                            ctx->ac.builder, val,
3373                            LLVMGetElementType(LLVMTypeOf(address)), "");
3374                         LLVMBuildStore(ctx->ac.builder, val, address);
3375                 } else {
3376                         for (unsigned chan = 0; chan < 4; chan++) {
3377                                 if (!(writemask & (1 << chan)))
3378                                         continue;
3379                                 LLVMValueRef ptr =
3380                                         LLVMBuildStructGEP(ctx->ac.builder,
3381                                                            address, chan, "");
3382                                 LLVMValueRef src = ac_llvm_extract_elem(&ctx->ac, val,
3383                                                                         chan);
3384                                 src = LLVMBuildBitCast(
3385                                    ctx->ac.builder, src,
3386                                    LLVMGetElementType(LLVMTypeOf(ptr)), "");
3387                                 LLVMBuildStore(ctx->ac.builder, src, ptr);
3388                         }
3389                 }
3390                 break;
3391         }
3392         default:
3393                 break;
3394         }
3395 }
3396
3397 static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
3398 {
3399         switch (dim) {
3400         case GLSL_SAMPLER_DIM_BUF:
3401                 return 1;
3402         case GLSL_SAMPLER_DIM_1D:
3403                 return array ? 2 : 1;
3404         case GLSL_SAMPLER_DIM_2D:
3405                 return array ? 3 : 2;
3406         case GLSL_SAMPLER_DIM_MS:
3407                 return array ? 4 : 3;
3408         case GLSL_SAMPLER_DIM_3D:
3409         case GLSL_SAMPLER_DIM_CUBE:
3410                 return 3;
3411         case GLSL_SAMPLER_DIM_RECT:
3412         case GLSL_SAMPLER_DIM_SUBPASS:
3413                 return 2;
3414         case GLSL_SAMPLER_DIM_SUBPASS_MS:
3415                 return 3;
3416         default:
3417                 break;
3418         }
3419         return 0;
3420 }
3421
3422
3423
3424 /* Adjust the sample index according to FMASK.
3425  *
3426  * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
3427  * which is the identity mapping. Each nibble says which physical sample
3428  * should be fetched to get that sample.
3429  *
3430  * For example, 0x11111100 means there are only 2 samples stored and
3431  * the second sample covers 3/4 of the pixel. When reading samples 0
3432  * and 1, return physical sample 0 (determined by the first two 0s
3433  * in FMASK), otherwise return physical sample 1.
3434  *
3435  * The sample index should be adjusted as follows:
3436  *   sample_index = (fmask >> (sample_index * 4)) & 0xF;
3437  */
3438 static LLVMValueRef adjust_sample_index_using_fmask(struct ac_llvm_context *ctx,
3439                                                     LLVMValueRef coord_x, LLVMValueRef coord_y,
3440                                                     LLVMValueRef coord_z,
3441                                                     LLVMValueRef sample_index,
3442                                                     LLVMValueRef fmask_desc_ptr)
3443 {
3444         LLVMValueRef fmask_load_address[4];
3445         LLVMValueRef res;
3446
3447         fmask_load_address[0] = coord_x;
3448         fmask_load_address[1] = coord_y;
3449         if (coord_z) {
3450                 fmask_load_address[2] = coord_z;
3451                 fmask_load_address[3] = LLVMGetUndef(ctx->i32);
3452         }
3453
3454         struct ac_image_args args = {0};
3455
3456         args.opcode = ac_image_load;
3457         args.da = coord_z ? true : false;
3458         args.resource = fmask_desc_ptr;
3459         args.dmask = 0xf;
3460         args.addr = ac_build_gather_values(ctx, fmask_load_address, coord_z ? 4 : 2);
3461
3462         res = ac_build_image_opcode(ctx, &args);
3463
3464         res = ac_to_integer(ctx, res);
3465         LLVMValueRef four = LLVMConstInt(ctx->i32, 4, false);
3466         LLVMValueRef F = LLVMConstInt(ctx->i32, 0xf, false);
3467
3468         LLVMValueRef fmask = LLVMBuildExtractElement(ctx->builder,
3469                                                      res,
3470                                                      ctx->i32_0, "");
3471
3472         LLVMValueRef sample_index4 =
3473                 LLVMBuildMul(ctx->builder, sample_index, four, "");
3474         LLVMValueRef shifted_fmask =
3475                 LLVMBuildLShr(ctx->builder, fmask, sample_index4, "");
3476         LLVMValueRef final_sample =
3477                 LLVMBuildAnd(ctx->builder, shifted_fmask, F, "");
3478
3479         /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
3480          * resource descriptor is 0 (invalid),
3481          */
3482         LLVMValueRef fmask_desc =
3483                 LLVMBuildBitCast(ctx->builder, fmask_desc_ptr,
3484                                  ctx->v8i32, "");
3485
3486         LLVMValueRef fmask_word1 =
3487                 LLVMBuildExtractElement(ctx->builder, fmask_desc,
3488                                         ctx->i32_1, "");
3489
3490         LLVMValueRef word1_is_nonzero =
3491                 LLVMBuildICmp(ctx->builder, LLVMIntNE,
3492                               fmask_word1, ctx->i32_0, "");
3493
3494         /* Replace the MSAA sample index. */
3495         sample_index =
3496                 LLVMBuildSelect(ctx->builder, word1_is_nonzero,
3497                                 final_sample, sample_index, "");
3498         return sample_index;
3499 }
3500
3501 static LLVMValueRef get_image_coords(struct ac_nir_context *ctx,
3502                                      const nir_intrinsic_instr *instr)
3503 {
3504         const struct glsl_type *type = glsl_without_array(instr->variables[0]->var->type);
3505
3506         LLVMValueRef src0 = get_src(ctx, instr->src[0]);
3507         LLVMValueRef coords[4];
3508         LLVMValueRef masks[] = {
3509                 LLVMConstInt(ctx->ac.i32, 0, false), LLVMConstInt(ctx->ac.i32, 1, false),
3510                 LLVMConstInt(ctx->ac.i32, 2, false), LLVMConstInt(ctx->ac.i32, 3, false),
3511         };
3512         LLVMValueRef res;
3513         LLVMValueRef sample_index = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[1]), 0);
3514
3515         int count;
3516         enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
3517         bool is_array = glsl_sampler_type_is_array(type);
3518         bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS ||
3519                              dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
3520         bool is_ms = (dim == GLSL_SAMPLER_DIM_MS ||
3521                       dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
3522         bool gfx9_1d = ctx->ac.chip_class >= GFX9 && dim == GLSL_SAMPLER_DIM_1D;
3523         count = image_type_to_components_count(dim, is_array);
3524
3525         if (is_ms) {
3526                 LLVMValueRef fmask_load_address[3];
3527                 int chan;
3528
3529                 fmask_load_address[0] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[0], "");
3530                 fmask_load_address[1] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[1], "");
3531                 if (is_array)
3532                         fmask_load_address[2] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[2], "");
3533                 else
3534                         fmask_load_address[2] = NULL;
3535                 if (add_frag_pos) {
3536                         for (chan = 0; chan < 2; ++chan)
3537                                 fmask_load_address[chan] =
3538                                         LLVMBuildAdd(ctx->ac.builder, fmask_load_address[chan],
3539                                                 LLVMBuildFPToUI(ctx->ac.builder, ctx->abi->frag_pos[chan],
3540                                                                 ctx->ac.i32, ""), "");
3541                         fmask_load_address[2] = ac_to_integer(&ctx->ac, ctx->abi->inputs[radeon_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)]);
3542                 }
3543                 sample_index = adjust_sample_index_using_fmask(&ctx->ac,
3544                                                                fmask_load_address[0],
3545                                                                fmask_load_address[1],
3546                                                                fmask_load_address[2],
3547                                                                sample_index,
3548                                                                get_sampler_desc(ctx, instr->variables[0], AC_DESC_FMASK, NULL, true, false));
3549         }
3550         if (count == 1 && !gfx9_1d) {
3551                 if (instr->src[0].ssa->num_components)
3552                         res = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[0], "");
3553                 else
3554                         res = src0;
3555         } else {
3556                 int chan;
3557                 if (is_ms)
3558                         count--;
3559                 for (chan = 0; chan < count; ++chan) {
3560                         coords[chan] = ac_llvm_extract_elem(&ctx->ac, src0, chan);
3561                 }
3562                 if (add_frag_pos) {
3563                         for (chan = 0; chan < 2; ++chan)
3564                                 coords[chan] = LLVMBuildAdd(ctx->ac.builder, coords[chan], LLVMBuildFPToUI(ctx->ac.builder, ctx->abi->frag_pos[chan],
3565                                                 ctx->ac.i32, ""), "");
3566                         coords[2] = ac_to_integer(&ctx->ac, ctx->abi->inputs[radeon_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)]);
3567                         count++;
3568                 }
3569
3570                 if (gfx9_1d) {
3571                         if (is_array) {
3572                                 coords[2] = coords[1];
3573                                 coords[1] = ctx->ac.i32_0;
3574                         } else
3575                                 coords[1] = ctx->ac.i32_0;
3576                         count++;
3577                 }
3578
3579                 if (is_ms) {
3580                         coords[count] = sample_index;
3581                         count++;
3582                 }
3583
3584                 if (count == 3) {
3585                         coords[3] = LLVMGetUndef(ctx->ac.i32);
3586                         count = 4;
3587                 }
3588                 res = ac_build_gather_values(&ctx->ac, coords, count);
3589         }
3590         return res;
3591 }
3592
3593 static LLVMValueRef visit_image_load(struct ac_nir_context *ctx,
3594                                      const nir_intrinsic_instr *instr)
3595 {
3596         LLVMValueRef params[7];
3597         LLVMValueRef res;
3598         char intrinsic_name[64];
3599         const nir_variable *var = instr->variables[0]->var;
3600         const struct glsl_type *type = var->type;
3601
3602         if(instr->variables[0]->deref.child)
3603                 type = instr->variables[0]->deref.child->type;
3604
3605         type = glsl_without_array(type);
3606
3607         const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
3608         if (dim == GLSL_SAMPLER_DIM_BUF) {
3609                 unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
3610                 unsigned num_channels = util_last_bit(mask);
3611                 LLVMValueRef rsrc, vindex;
3612
3613                 rsrc = get_sampler_desc(ctx, instr->variables[0], AC_DESC_BUFFER, NULL, true, false);
3614                 vindex = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[0]),
3615                                                  ctx->ac.i32_0, "");
3616
3617                 /* TODO: set "glc" and "can_speculate" when OpenGL needs it. */
3618                 res = ac_build_buffer_load_format(&ctx->ac, rsrc, vindex,
3619                                                   ctx->ac.i32_0, num_channels,
3620                                                   false, false);
3621                 res = ac_build_expand_to_vec4(&ctx->ac, res, num_channels);
3622
3623                 res = trim_vector(&ctx->ac, res, instr->dest.ssa.num_components);
3624                 res = ac_to_integer(&ctx->ac, res);
3625         } else {
3626                 bool is_da = glsl_sampler_type_is_array(type) ||
3627                              dim == GLSL_SAMPLER_DIM_CUBE ||
3628                              dim == GLSL_SAMPLER_DIM_3D ||
3629                              dim == GLSL_SAMPLER_DIM_SUBPASS ||
3630                              dim == GLSL_SAMPLER_DIM_SUBPASS_MS;
3631                 LLVMValueRef da = is_da ? ctx->ac.i1true : ctx->ac.i1false;
3632                 LLVMValueRef glc = ctx->ac.i1false;
3633                 LLVMValueRef slc = ctx->ac.i1false;
3634
3635                 params[0] = get_image_coords(ctx, instr);
3636                 params[1] = get_sampler_desc(ctx, instr->variables[0], AC_DESC_IMAGE, NULL, true, false);
3637                 params[2] = LLVMConstInt(ctx->ac.i32, 15, false); /* dmask */
3638                 params[3] = glc;
3639                 params[4] = slc;
3640                 params[5] = ctx->ac.i1false;
3641                 params[6] = da;
3642
3643                 ac_get_image_intr_name("llvm.amdgcn.image.load",
3644                                        ctx->ac.v4f32, /* vdata */
3645                                        LLVMTypeOf(params[0]), /* coords */
3646                                        LLVMTypeOf(params[1]), /* rsrc */
3647                                        intrinsic_name, sizeof(intrinsic_name));
3648
3649                 res = ac_build_intrinsic(&ctx->ac, intrinsic_name, ctx->ac.v4f32,
3650                                          params, 7, AC_FUNC_ATTR_READONLY);
3651         }
3652         return ac_to_integer(&ctx->ac, res);
3653 }
3654
3655 static void visit_image_store(struct ac_nir_context *ctx,
3656                               nir_intrinsic_instr *instr)
3657 {
3658         LLVMValueRef params[8];
3659         char intrinsic_name[64];
3660         const nir_variable *var = instr->variables[0]->var;
3661         const struct glsl_type *type = glsl_without_array(var->type);
3662         const enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
3663         LLVMValueRef glc = ctx->ac.i1false;
3664         bool force_glc = ctx->ac.chip_class == SI;
3665         if (force_glc)
3666                 glc = ctx->ac.i1true;
3667
3668         if (dim == GLSL_SAMPLER_DIM_BUF) {
3669                 params[0] = ac_to_float(&ctx->ac, get_src(ctx, instr->src[2])); /* data */
3670                 params[1] = get_sampler_desc(ctx, instr->variables[0], AC_DESC_BUFFER, NULL, true, true);
3671                 params[2] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[0]),
3672                                                     ctx->ac.i32_0, ""); /* vindex */
3673                 params[3] = ctx->ac.i32_0; /* voffset */
3674                 params[4] = glc;  /* glc */
3675                 params[5] = ctx->ac.i1false;  /* slc */
3676                 ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.buffer.store.format.v4f32", ctx->ac.voidt,
3677                                    params, 6, 0);
3678         } else {
3679                 bool is_da = glsl_sampler_type_is_array(type) ||
3680                              dim == GLSL_SAMPLER_DIM_CUBE ||
3681                              dim == GLSL_SAMPLER_DIM_3D;
3682                 LLVMValueRef da = is_da ? ctx->ac.i1true : ctx->ac.i1false;
3683                 LLVMValueRef slc = ctx->ac.i1false;
3684
3685                 params[0] = ac_to_float(&ctx->ac, get_src(ctx, instr->src[2]));
3686                 params[1] = get_image_coords(ctx, instr); /* coords */
3687                 params[2] = get_sampler_desc(ctx, instr->variables[0], AC_DESC_IMAGE, NULL, true, true);
3688                 params[3] = LLVMConstInt(ctx->ac.i32, 15, false); /* dmask */
3689                 params[4] = glc;
3690                 params[5] = slc;
3691                 params[6] = ctx->ac.i1false;
3692                 params[7] = da;
3693
3694                 ac_get_image_intr_name("llvm.amdgcn.image.store",
3695                                        LLVMTypeOf(params[0]), /* vdata */
3696                                        LLVMTypeOf(params[1]), /* coords */
3697                                        LLVMTypeOf(params[2]), /* rsrc */
3698                                        intrinsic_name, sizeof(intrinsic_name));
3699
3700                 ac_build_intrinsic(&ctx->ac, intrinsic_name, ctx->ac.voidt,
3701                                    params, 8, 0);
3702         }
3703
3704 }
3705
3706 static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx,
3707                                        const nir_intrinsic_instr *instr)
3708 {
3709         LLVMValueRef params[7];
3710         int param_count = 0;
3711         const nir_variable *var = instr->variables[0]->var;
3712
3713         const char *atomic_name;
3714         char intrinsic_name[41];
3715         const struct glsl_type *type = glsl_without_array(var->type);
3716         MAYBE_UNUSED int length;
3717
3718         bool is_unsigned = glsl_get_sampler_result_type(type) == GLSL_TYPE_UINT;
3719
3720         switch (instr->intrinsic) {
3721         case nir_intrinsic_image_atomic_add:
3722                 atomic_name = "add";
3723                 break;
3724         case nir_intrinsic_image_atomic_min:
3725                 atomic_name = is_unsigned ? "umin" : "smin";
3726                 break;
3727         case nir_intrinsic_image_atomic_max:
3728                 atomic_name = is_unsigned ? "umax" : "smax";
3729                 break;
3730         case nir_intrinsic_image_atomic_and:
3731                 atomic_name = "and";
3732                 break;
3733         case nir_intrinsic_image_atomic_or:
3734                 atomic_name = "or";
3735                 break;
3736         case nir_intrinsic_image_atomic_xor:
3737                 atomic_name = "xor";
3738                 break;
3739         case nir_intrinsic_image_atomic_exchange:
3740                 atomic_name = "swap";
3741                 break;
3742         case nir_intrinsic_image_atomic_comp_swap:
3743                 atomic_name = "cmpswap";
3744                 break;
3745         default:
3746                 abort();
3747         }
3748
3749         if (instr->intrinsic == nir_intrinsic_image_atomic_comp_swap)
3750                 params[param_count++] = get_src(ctx, instr->src[3]);
3751         params[param_count++] = get_src(ctx, instr->src[2]);
3752
3753         if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) {
3754                 params[param_count++] = get_sampler_desc(ctx, instr->variables[0], AC_DESC_BUFFER,
3755                                                          NULL, true, true);
3756                 params[param_count++] = LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[0]),
3757                                                                 ctx->ac.i32_0, ""); /* vindex */
3758                 params[param_count++] = ctx->ac.i32_0; /* voffset */
3759                 params[param_count++] = ctx->ac.i1false;  /* slc */
3760
3761                 length = snprintf(intrinsic_name, sizeof(intrinsic_name),
3762                                   "llvm.amdgcn.buffer.atomic.%s", atomic_name);
3763         } else {
3764                 char coords_type[8];
3765
3766                 bool da = glsl_sampler_type_is_array(type) ||
3767                           glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE;
3768
3769                 LLVMValueRef coords = params[param_count++] = get_image_coords(ctx, instr);
3770                 params[param_count++] = get_sampler_desc(ctx, instr->variables[0], AC_DESC_IMAGE,
3771                                                          NULL, true, true);
3772                 params[param_count++] = ctx->ac.i1false; /* r128 */
3773                 params[param_count++] = da ? ctx->ac.i1true : ctx->ac.i1false;      /* da */
3774                 params[param_count++] = ctx->ac.i1false;  /* slc */
3775
3776                 build_int_type_name(LLVMTypeOf(coords),
3777                                     coords_type, sizeof(coords_type));
3778
3779                 length = snprintf(intrinsic_name, sizeof(intrinsic_name),
3780                                   "llvm.amdgcn.image.atomic.%s.%s", atomic_name, coords_type);
3781         }
3782
3783         assert(length < sizeof(intrinsic_name));
3784         return ac_build_intrinsic(&ctx->ac, intrinsic_name, ctx->ac.i32, params, param_count, 0);
3785 }
3786
3787 static LLVMValueRef visit_image_size(struct ac_nir_context *ctx,
3788                                      const nir_intrinsic_instr *instr)
3789 {
3790         LLVMValueRef res;
3791         const nir_variable *var = instr->variables[0]->var;
3792         const struct glsl_type *type = instr->variables[0]->var->type;
3793         bool da = glsl_sampler_type_is_array(var->type) ||
3794                   glsl_get_sampler_dim(var->type) == GLSL_SAMPLER_DIM_CUBE ||
3795                   glsl_get_sampler_dim(var->type) == GLSL_SAMPLER_DIM_3D;
3796         if(instr->variables[0]->deref.child)
3797                 type = instr->variables[0]->deref.child->type;
3798
3799         if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF)
3800                 return get_buffer_size(ctx,
3801                         get_sampler_desc(ctx, instr->variables[0],
3802                                          AC_DESC_BUFFER, NULL, true, false), true);
3803
3804         struct ac_image_args args = { 0 };
3805
3806         args.da = da;
3807         args.dmask = 0xf;
3808         args.resource = get_sampler_desc(ctx, instr->variables[0], AC_DESC_IMAGE, NULL, true, false);
3809         args.opcode = ac_image_get_resinfo;
3810         args.addr = ctx->ac.i32_0;
3811
3812         res = ac_build_image_opcode(&ctx->ac, &args);
3813
3814         LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false);
3815
3816         if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE &&
3817             glsl_sampler_type_is_array(type)) {
3818                 LLVMValueRef six = LLVMConstInt(ctx->ac.i32, 6, false);
3819                 LLVMValueRef z = LLVMBuildExtractElement(ctx->ac.builder, res, two, "");
3820                 z = LLVMBuildSDiv(ctx->ac.builder, z, six, "");
3821                 res = LLVMBuildInsertElement(ctx->ac.builder, res, z, two, "");
3822         }
3823         if (ctx->ac.chip_class >= GFX9 &&
3824             glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_1D &&
3825             glsl_sampler_type_is_array(type)) {
3826                 LLVMValueRef layers = LLVMBuildExtractElement(ctx->ac.builder, res, two, "");
3827                 res = LLVMBuildInsertElement(ctx->ac.builder, res, layers,
3828                                                 ctx->ac.i32_1, "");
3829
3830         }
3831         return res;
3832 }
3833
3834 #define NOOP_WAITCNT 0xf7f
3835 #define LGKM_CNT 0x07f
3836 #define VM_CNT 0xf70
3837
3838 static void emit_membar(struct nir_to_llvm_context *ctx,
3839                         const nir_intrinsic_instr *instr)
3840 {
3841         unsigned waitcnt = NOOP_WAITCNT;
3842
3843         switch (instr->intrinsic) {
3844         case nir_intrinsic_memory_barrier:
3845         case nir_intrinsic_group_memory_barrier:
3846                 waitcnt &= VM_CNT & LGKM_CNT;
3847                 break;
3848         case nir_intrinsic_memory_barrier_atomic_counter:
3849         case nir_intrinsic_memory_barrier_buffer:
3850         case nir_intrinsic_memory_barrier_image:
3851                 waitcnt &= VM_CNT;
3852                 break;
3853         case nir_intrinsic_memory_barrier_shared:
3854                 waitcnt &= LGKM_CNT;
3855                 break;
3856         default:
3857                 break;
3858         }
3859         if (waitcnt != NOOP_WAITCNT)
3860                 ac_build_waitcnt(&ctx->ac, waitcnt);
3861 }
3862
3863 static void emit_barrier(struct ac_llvm_context *ac, gl_shader_stage stage)
3864 {
3865         /* SI only (thanks to a hw bug workaround):
3866          * The real barrier instruction isn’t needed, because an entire patch
3867          * always fits into a single wave.
3868          */
3869         if (ac->chip_class == SI && stage == MESA_SHADER_TESS_CTRL) {
3870                 ac_build_waitcnt(ac, LGKM_CNT & VM_CNT);
3871                 return;
3872         }
3873         ac_build_intrinsic(ac, "llvm.amdgcn.s.barrier",
3874                            ac->voidt, NULL, 0, AC_FUNC_ATTR_CONVERGENT);
3875 }
3876
3877 static void emit_discard(struct ac_nir_context *ctx,
3878                          const nir_intrinsic_instr *instr)
3879 {
3880         LLVMValueRef cond;
3881
3882         if (instr->intrinsic == nir_intrinsic_discard_if) {
3883                 cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ,
3884                                      get_src(ctx, instr->src[0]),
3885                                      ctx->ac.i32_0, "");
3886         } else {
3887                 assert(instr->intrinsic == nir_intrinsic_discard);
3888                 cond = LLVMConstInt(ctx->ac.i1, false, 0);
3889         }
3890
3891         ac_build_kill_if_false(&ctx->ac, cond);
3892 }
3893
3894 static LLVMValueRef
3895 visit_load_helper_invocation(struct ac_nir_context *ctx)
3896 {
3897         LLVMValueRef result = ac_build_intrinsic(&ctx->ac,
3898                                                  "llvm.amdgcn.ps.live",
3899                                                  ctx->ac.i1, NULL, 0,
3900                                                  AC_FUNC_ATTR_READNONE);
3901         result = LLVMBuildNot(ctx->ac.builder, result, "");
3902         return LLVMBuildSExt(ctx->ac.builder, result, ctx->ac.i32, "");
3903 }
3904
3905 static LLVMValueRef
3906 visit_load_local_invocation_index(struct nir_to_llvm_context *ctx)
3907 {
3908         LLVMValueRef result;
3909         LLVMValueRef thread_id = ac_get_thread_id(&ctx->ac);
3910         result = LLVMBuildAnd(ctx->builder, ctx->tg_size,
3911                               LLVMConstInt(ctx->ac.i32, 0xfc0, false), "");
3912
3913         return LLVMBuildAdd(ctx->builder, result, thread_id, "");
3914 }
3915
3916 static LLVMValueRef visit_var_atomic(struct nir_to_llvm_context *ctx,
3917                                      const nir_intrinsic_instr *instr)
3918 {
3919         LLVMValueRef ptr, result;
3920         LLVMValueRef src = get_src(ctx->nir, instr->src[0]);
3921         ptr = build_gep_for_deref(ctx->nir, instr->variables[0]);
3922
3923         if (instr->intrinsic == nir_intrinsic_var_atomic_comp_swap) {
3924                 LLVMValueRef src1 = get_src(ctx->nir, instr->src[1]);
3925                 result = LLVMBuildAtomicCmpXchg(ctx->builder,
3926                                                 ptr, src, src1,
3927                                                 LLVMAtomicOrderingSequentiallyConsistent,
3928                                                 LLVMAtomicOrderingSequentiallyConsistent,
3929                                                 false);
3930         } else {
3931                 LLVMAtomicRMWBinOp op;
3932                 switch (instr->intrinsic) {
3933                 case nir_intrinsic_var_atomic_add:
3934                         op = LLVMAtomicRMWBinOpAdd;
3935                         break;
3936                 case nir_intrinsic_var_atomic_umin:
3937                         op = LLVMAtomicRMWBinOpUMin;
3938                         break;
3939                 case nir_intrinsic_var_atomic_umax:
3940                         op = LLVMAtomicRMWBinOpUMax;
3941                         break;
3942                 case nir_intrinsic_var_atomic_imin:
3943                         op = LLVMAtomicRMWBinOpMin;
3944                         break;
3945                 case nir_intrinsic_var_atomic_imax:
3946                         op = LLVMAtomicRMWBinOpMax;
3947                         break;
3948                 case nir_intrinsic_var_atomic_and:
3949                         op = LLVMAtomicRMWBinOpAnd;
3950                         break;
3951                 case nir_intrinsic_var_atomic_or:
3952                         op = LLVMAtomicRMWBinOpOr;
3953                         break;
3954                 case nir_intrinsic_var_atomic_xor:
3955                         op = LLVMAtomicRMWBinOpXor;
3956                         break;
3957                 case nir_intrinsic_var_atomic_exchange:
3958                         op = LLVMAtomicRMWBinOpXchg;
3959                         break;
3960                 default:
3961                         return NULL;
3962                 }
3963
3964                 result = LLVMBuildAtomicRMW(ctx->builder, op, ptr, ac_to_integer(&ctx->ac, src),
3965                                             LLVMAtomicOrderingSequentiallyConsistent,
3966                                             false);
3967         }
3968         return result;
3969 }
3970
3971 static LLVMValueRef lookup_interp_param(struct ac_shader_abi *abi,
3972                                         enum glsl_interp_mode interp, unsigned location)
3973 {
3974         struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi);
3975
3976         switch (interp) {
3977         case INTERP_MODE_FLAT:
3978         default:
3979                 return NULL;
3980         case INTERP_MODE_SMOOTH:
3981         case INTERP_MODE_NONE:
3982                 if (location == INTERP_CENTER)
3983                         return ctx->persp_center;
3984                 else if (location == INTERP_CENTROID)
3985                         return ctx->persp_centroid;
3986                 else if (location == INTERP_SAMPLE)
3987                         return ctx->persp_sample;
3988                 break;
3989         case INTERP_MODE_NOPERSPECTIVE:
3990                 if (location == INTERP_CENTER)
3991                         return ctx->linear_center;
3992                 else if (location == INTERP_CENTROID)
3993                         return ctx->linear_centroid;
3994                 else if (location == INTERP_SAMPLE)
3995                         return ctx->linear_sample;
3996                 break;
3997         }
3998         return NULL;
3999 }
4000
4001 static LLVMValueRef load_sample_position(struct ac_shader_abi *abi,
4002                                          LLVMValueRef sample_id)
4003 {
4004         struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi);
4005
4006         LLVMValueRef result;
4007         LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_PS_SAMPLE_POSITIONS, false));
4008
4009         ptr = LLVMBuildBitCast(ctx->builder, ptr,
4010                                ac_array_in_const_addr_space(ctx->ac.v2f32), "");
4011
4012         sample_id = LLVMBuildAdd(ctx->builder, sample_id, ctx->sample_pos_offset, "");
4013         result = ac_build_load_invariant(&ctx->ac, ptr, sample_id);
4014
4015         return result;
4016 }
4017
4018 static LLVMValueRef load_sample_pos(struct ac_nir_context *ctx)
4019 {
4020         LLVMValueRef values[2];
4021
4022         values[0] = emit_ffract(&ctx->ac, ctx->abi->frag_pos[0], 32);
4023         values[1] = emit_ffract(&ctx->ac, ctx->abi->frag_pos[1], 32);
4024         return ac_build_gather_values(&ctx->ac, values, 2);
4025 }
4026
4027 static LLVMValueRef load_sample_mask_in(struct ac_nir_context *ctx)
4028 {
4029         uint8_t log2_ps_iter_samples = ctx->nctx->shader_info->info.ps.force_persample ? ctx->nctx->options->key.fs.log2_num_samples : ctx->nctx->options->key.fs.log2_ps_iter_samples;
4030
4031         /* The bit pattern matches that used by fixed function fragment
4032          * processing. */
4033         static const uint16_t ps_iter_masks[] = {
4034                 0xffff, /* not used */
4035                 0x5555,
4036                 0x1111,
4037                 0x0101,
4038                 0x0001,
4039         };
4040         assert(log2_ps_iter_samples < ARRAY_SIZE(ps_iter_masks));
4041
4042         uint32_t ps_iter_mask = ps_iter_masks[log2_ps_iter_samples];
4043
4044         LLVMValueRef result, sample_id;
4045         sample_id = unpack_param(&ctx->ac, ctx->abi->ancillary, 8, 4);
4046         sample_id = LLVMBuildShl(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, ps_iter_mask, false), sample_id, "");
4047         result = LLVMBuildAnd(ctx->ac.builder, sample_id, ctx->abi->sample_coverage, "");
4048         return result;
4049 }
4050
4051 static LLVMValueRef visit_interp(struct ac_nir_context *ctx,
4052                                  const nir_intrinsic_instr *instr)
4053 {
4054         LLVMValueRef result[4];
4055         LLVMValueRef interp_param, attr_number;
4056         unsigned location;
4057         unsigned chan;
4058         LLVMValueRef src_c0 = NULL;
4059         LLVMValueRef src_c1 = NULL;
4060         LLVMValueRef src0 = NULL;
4061         int input_index = instr->variables[0]->var->data.location - VARYING_SLOT_VAR0;
4062         switch (instr->intrinsic) {
4063         case nir_intrinsic_interp_var_at_centroid:
4064                 location = INTERP_CENTROID;
4065                 break;
4066         case nir_intrinsic_interp_var_at_sample:
4067         case nir_intrinsic_interp_var_at_offset:
4068                 location = INTERP_CENTER;
4069                 src0 = get_src(ctx, instr->src[0]);
4070                 break;
4071         default:
4072                 break;
4073         }
4074
4075         if (instr->intrinsic == nir_intrinsic_interp_var_at_offset) {
4076                 src_c0 = ac_to_float(&ctx->ac, LLVMBuildExtractElement(ctx->ac.builder, src0, ctx->ac.i32_0, ""));
4077                 src_c1 = ac_to_float(&ctx->ac, LLVMBuildExtractElement(ctx->ac.builder, src0, ctx->ac.i32_1, ""));
4078         } else if (instr->intrinsic == nir_intrinsic_interp_var_at_sample) {
4079                 LLVMValueRef sample_position;
4080                 LLVMValueRef halfval = LLVMConstReal(ctx->ac.f32, 0.5f);
4081
4082                 /* fetch sample ID */
4083                 sample_position = ctx->abi->load_sample_position(ctx->abi, src0);
4084
4085                 src_c0 = LLVMBuildExtractElement(ctx->ac.builder, sample_position, ctx->ac.i32_0, "");
4086                 src_c0 = LLVMBuildFSub(ctx->ac.builder, src_c0, halfval, "");
4087                 src_c1 = LLVMBuildExtractElement(ctx->ac.builder, sample_position, ctx->ac.i32_1, "");
4088                 src_c1 = LLVMBuildFSub(ctx->ac.builder, src_c1, halfval, "");
4089         }
4090         interp_param = ctx->abi->lookup_interp_param(ctx->abi, instr->variables[0]->var->data.interpolation, location);
4091         attr_number = LLVMConstInt(ctx->ac.i32, input_index, false);
4092
4093         if (location == INTERP_CENTER) {
4094                 LLVMValueRef ij_out[2];
4095                 LLVMValueRef ddxy_out = emit_ddxy_interp(ctx, interp_param);
4096
4097                 /*
4098                  * take the I then J parameters, and the DDX/Y for it, and
4099                  * calculate the IJ inputs for the interpolator.
4100                  * temp1 = ddx * offset/sample.x + I;
4101                  * interp_param.I = ddy * offset/sample.y + temp1;
4102                  * temp1 = ddx * offset/sample.x + J;
4103                  * interp_param.J = ddy * offset/sample.y + temp1;
4104                  */
4105                 for (unsigned i = 0; i < 2; i++) {
4106                         LLVMValueRef ix_ll = LLVMConstInt(ctx->ac.i32, i, false);
4107                         LLVMValueRef iy_ll = LLVMConstInt(ctx->ac.i32, i + 2, false);
4108                         LLVMValueRef ddx_el = LLVMBuildExtractElement(ctx->ac.builder,
4109                                                                       ddxy_out, ix_ll, "");
4110                         LLVMValueRef ddy_el = LLVMBuildExtractElement(ctx->ac.builder,
4111                                                                       ddxy_out, iy_ll, "");
4112                         LLVMValueRef interp_el = LLVMBuildExtractElement(ctx->ac.builder,
4113                                                                          interp_param, ix_ll, "");
4114                         LLVMValueRef temp1, temp2;
4115
4116                         interp_el = LLVMBuildBitCast(ctx->ac.builder, interp_el,
4117                                                      ctx->ac.f32, "");
4118
4119                         temp1 = LLVMBuildFMul(ctx->ac.builder, ddx_el, src_c0, "");
4120                         temp1 = LLVMBuildFAdd(ctx->ac.builder, temp1, interp_el, "");
4121
4122                         temp2 = LLVMBuildFMul(ctx->ac.builder, ddy_el, src_c1, "");
4123                         temp2 = LLVMBuildFAdd(ctx->ac.builder, temp2, temp1, "");
4124
4125                         ij_out[i] = LLVMBuildBitCast(ctx->ac.builder,
4126                                                      temp2, ctx->ac.i32, "");
4127                 }
4128                 interp_param = ac_build_gather_values(&ctx->ac, ij_out, 2);
4129
4130         }
4131
4132         for (chan = 0; chan < 4; chan++) {
4133                 LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false);
4134
4135                 if (interp_param) {
4136                         interp_param = LLVMBuildBitCast(ctx->ac.builder,
4137                                                         interp_param, ctx->ac.v2f32, "");
4138                         LLVMValueRef i = LLVMBuildExtractElement(
4139                                 ctx->ac.builder, interp_param, ctx->ac.i32_0, "");
4140                         LLVMValueRef j = LLVMBuildExtractElement(
4141                                 ctx->ac.builder, interp_param, ctx->ac.i32_1, "");
4142
4143                         result[chan] = ac_build_fs_interp(&ctx->ac,
4144                                                           llvm_chan, attr_number,
4145                                                           ctx->abi->prim_mask, i, j);
4146                 } else {
4147                         result[chan] = ac_build_fs_interp_mov(&ctx->ac,
4148                                                               LLVMConstInt(ctx->ac.i32, 2, false),
4149                                                               llvm_chan, attr_number,
4150                                                               ctx->abi->prim_mask);
4151                 }
4152         }
4153         return ac_build_varying_gather_values(&ctx->ac, result, instr->num_components,
4154                                               instr->variables[0]->var->data.location_frac);
4155 }
4156
4157 static void
4158 visit_emit_vertex(struct ac_shader_abi *abi, unsigned stream, LLVMValueRef *addrs)
4159 {
4160         LLVMValueRef gs_next_vertex;
4161         LLVMValueRef can_emit;
4162         int idx;
4163         struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi);
4164
4165         assert(stream == 0);
4166
4167         /* Write vertex attribute values to GSVS ring */
4168         gs_next_vertex = LLVMBuildLoad(ctx->builder,
4169                                        ctx->gs_next_vertex,
4170                                        "");
4171
4172         /* If this thread has already emitted the declared maximum number of
4173          * vertices, kill it: excessive vertex emissions are not supposed to
4174          * have any effect, and GS threads have no externally observable
4175          * effects other than emitting vertices.
4176          */
4177         can_emit = LLVMBuildICmp(ctx->builder, LLVMIntULT, gs_next_vertex,
4178                                  LLVMConstInt(ctx->ac.i32, ctx->gs_max_out_vertices, false), "");
4179         ac_build_kill_if_false(&ctx->ac, can_emit);
4180
4181         /* loop num outputs */
4182         idx = 0;
4183         for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) {
4184                 LLVMValueRef *out_ptr = &addrs[i * 4];
4185                 int length = 4;
4186                 int slot = idx;
4187                 int slot_inc = 1;
4188
4189                 if (!(ctx->output_mask & (1ull << i)))
4190                         continue;
4191
4192                 if (i == VARYING_SLOT_CLIP_DIST0) {
4193                         /* pack clip and cull into a single set of slots */
4194                         length = ctx->num_output_clips + ctx->num_output_culls;
4195                         if (length > 4)
4196                                 slot_inc = 2;
4197                 }
4198                 for (unsigned j = 0; j < length; j++) {
4199                         LLVMValueRef out_val = LLVMBuildLoad(ctx->builder,
4200                                                              out_ptr[j], "");
4201                         LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, (slot * 4 + j) * ctx->gs_max_out_vertices, false);
4202                         voffset = LLVMBuildAdd(ctx->builder, voffset, gs_next_vertex, "");
4203                         voffset = LLVMBuildMul(ctx->builder, voffset, LLVMConstInt(ctx->ac.i32, 4, false), "");
4204
4205                         out_val = LLVMBuildBitCast(ctx->builder, out_val, ctx->ac.i32, "");
4206
4207                         ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring,
4208                                                     out_val, 1,
4209                                                     voffset, ctx->gs2vs_offset, 0,
4210                                                     1, 1, true, true);
4211                 }
4212                 idx += slot_inc;
4213         }
4214
4215         gs_next_vertex = LLVMBuildAdd(ctx->builder, gs_next_vertex,
4216                                       ctx->ac.i32_1, "");
4217         LLVMBuildStore(ctx->builder, gs_next_vertex, ctx->gs_next_vertex);
4218
4219         ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (0 << 8), ctx->gs_wave_id);
4220 }
4221
4222 static void
4223 visit_end_primitive(struct ac_shader_abi *abi, unsigned stream)
4224 {
4225         struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi);
4226         ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (stream << 8), ctx->gs_wave_id);
4227 }
4228
4229 static LLVMValueRef
4230 load_tess_coord(struct ac_shader_abi *abi, LLVMTypeRef type,
4231                 unsigned num_components)
4232 {
4233         struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi);
4234
4235         LLVMValueRef coord[4] = {
4236                 ctx->tes_u,
4237                 ctx->tes_v,
4238                 ctx->ac.f32_0,
4239                 ctx->ac.f32_0,
4240         };
4241
4242         if (ctx->tes_primitive_mode == GL_TRIANGLES)
4243                 coord[2] = LLVMBuildFSub(ctx->builder, ctx->ac.f32_1,
4244                                         LLVMBuildFAdd(ctx->builder, coord[0], coord[1], ""), "");
4245
4246         LLVMValueRef result = ac_build_gather_values(&ctx->ac, coord, num_components);
4247         return LLVMBuildBitCast(ctx->builder, result, type, "");
4248 }
4249
4250 static LLVMValueRef
4251 load_patch_vertices_in(struct ac_shader_abi *abi)
4252 {
4253         struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi);
4254         return LLVMConstInt(ctx->ac.i32, ctx->options->key.tcs.input_vertices, false);
4255 }
4256
4257 static void visit_intrinsic(struct ac_nir_context *ctx,
4258                             nir_intrinsic_instr *instr)
4259 {
4260         LLVMValueRef result = NULL;
4261
4262         switch (instr->intrinsic) {
4263         case nir_intrinsic_ballot:
4264                 result = ac_build_ballot(&ctx->ac, get_src(ctx, instr->src[0]));
4265                 break;
4266         case nir_intrinsic_read_invocation:
4267         case nir_intrinsic_read_first_invocation: {
4268                 LLVMValueRef args[2];
4269
4270                 /* Value */
4271                 args[0] = get_src(ctx, instr->src[0]);
4272
4273                 unsigned num_args;
4274                 const char *intr_name;
4275                 if (instr->intrinsic == nir_intrinsic_read_invocation) {
4276                         num_args = 2;
4277                         intr_name = "llvm.amdgcn.readlane";
4278
4279                         /* Invocation */
4280                         args[1] = get_src(ctx, instr->src[1]);
4281                 } else {
4282                         num_args = 1;
4283                         intr_name = "llvm.amdgcn.readfirstlane";
4284                 }
4285
4286                 /* We currently have no other way to prevent LLVM from lifting the icmp
4287                  * calls to a dominating basic block.
4288                  */
4289                 ac_build_optimization_barrier(&ctx->ac, &args[0]);
4290
4291                 result = ac_build_intrinsic(&ctx->ac, intr_name,
4292                                             ctx->ac.i32, args, num_args,
4293                                             AC_FUNC_ATTR_READNONE |
4294                                             AC_FUNC_ATTR_CONVERGENT);
4295                 break;
4296         }
4297         case nir_intrinsic_load_subgroup_invocation:
4298                 result = ac_get_thread_id(&ctx->ac);
4299                 break;
4300         case nir_intrinsic_load_work_group_id: {
4301                 LLVMValueRef values[3];
4302
4303                 for (int i = 0; i < 3; i++) {
4304                         values[i] = ctx->nctx->workgroup_ids[i] ?
4305                                     ctx->nctx->workgroup_ids[i] : ctx->ac.i32_0;
4306                 }
4307
4308                 result = ac_build_gather_values(&ctx->ac, values, 3);
4309                 break;
4310         }
4311         case nir_intrinsic_load_base_vertex: {
4312                 result = ctx->abi->base_vertex;
4313                 break;
4314         }
4315         case nir_intrinsic_load_vertex_id_zero_base: {
4316                 result = ctx->abi->vertex_id;
4317                 break;
4318         }
4319         case nir_intrinsic_load_local_invocation_id: {
4320                 result = ctx->nctx->local_invocation_ids;
4321                 break;
4322         }
4323         case nir_intrinsic_load_base_instance:
4324                 result = ctx->abi->start_instance;
4325                 break;
4326         case nir_intrinsic_load_draw_id:
4327                 result = ctx->abi->draw_id;
4328                 break;
4329         case nir_intrinsic_load_view_index:
4330                 result = ctx->nctx->view_index ? ctx->nctx->view_index : ctx->ac.i32_0;
4331                 break;
4332         case nir_intrinsic_load_invocation_id:
4333                 if (ctx->stage == MESA_SHADER_TESS_CTRL)
4334                         result = unpack_param(&ctx->ac, ctx->abi->tcs_rel_ids, 8, 5);
4335                 else
4336                         result = ctx->abi->gs_invocation_id;
4337                 break;
4338         case nir_intrinsic_load_primitive_id:
4339                 if (ctx->stage == MESA_SHADER_GEOMETRY) {
4340                         result = ctx->abi->gs_prim_id;
4341                 } else if (ctx->stage == MESA_SHADER_TESS_CTRL) {
4342                         result = ctx->abi->tcs_patch_id;
4343                 } else if (ctx->stage == MESA_SHADER_TESS_EVAL) {
4344                         result = ctx->abi->tes_patch_id;
4345                 } else
4346                         fprintf(stderr, "Unknown primitive id intrinsic: %d", ctx->stage);
4347                 break;
4348         case nir_intrinsic_load_sample_id:
4349                 result = unpack_param(&ctx->ac, ctx->abi->ancillary, 8, 4);
4350                 break;
4351         case nir_intrinsic_load_sample_pos:
4352                 result = load_sample_pos(ctx);
4353                 break;
4354         case nir_intrinsic_load_sample_mask_in:
4355                 if (ctx->nctx)
4356                         result = load_sample_mask_in(ctx);
4357                 else
4358                         result = ctx->abi->sample_coverage;
4359                 break;
4360         case nir_intrinsic_load_frag_coord: {
4361                 LLVMValueRef values[4] = {
4362                         ctx->abi->frag_pos[0],
4363                         ctx->abi->frag_pos[1],
4364                         ctx->abi->frag_pos[2],
4365                         ac_build_fdiv(&ctx->ac, ctx->ac.f32_1, ctx->abi->frag_pos[3])
4366                 };
4367                 result = ac_build_gather_values(&ctx->ac, values, 4);
4368                 break;
4369         }
4370         case nir_intrinsic_load_front_face:
4371                 result = ctx->abi->front_face;
4372                 break;
4373         case nir_intrinsic_load_helper_invocation:
4374                 result = visit_load_helper_invocation(ctx);
4375                 break;
4376         case nir_intrinsic_load_instance_id:
4377                 result = ctx->abi->instance_id;
4378                 break;
4379         case nir_intrinsic_load_num_work_groups:
4380                 result = ctx->nctx->num_work_groups;
4381                 break;
4382         case nir_intrinsic_load_local_invocation_index:
4383                 result = visit_load_local_invocation_index(ctx->nctx);
4384                 break;
4385         case nir_intrinsic_load_push_constant:
4386                 result = visit_load_push_constant(ctx->nctx, instr);
4387                 break;
4388         case nir_intrinsic_vulkan_resource_index:
4389                 result = visit_vulkan_resource_index(ctx->nctx, instr);
4390                 break;
4391         case nir_intrinsic_vulkan_resource_reindex:
4392                 result = visit_vulkan_resource_reindex(ctx->nctx, instr);
4393                 break;
4394         case nir_intrinsic_store_ssbo:
4395                 visit_store_ssbo(ctx, instr);
4396                 break;
4397         case nir_intrinsic_load_ssbo:
4398                 result = visit_load_buffer(ctx, instr);
4399                 break;
4400         case nir_intrinsic_ssbo_atomic_add:
4401         case nir_intrinsic_ssbo_atomic_imin:
4402         case nir_intrinsic_ssbo_atomic_umin:
4403         case nir_intrinsic_ssbo_atomic_imax:
4404         case nir_intrinsic_ssbo_atomic_umax:
4405         case nir_intrinsic_ssbo_atomic_and:
4406         case nir_intrinsic_ssbo_atomic_or:
4407         case nir_intrinsic_ssbo_atomic_xor:
4408         case nir_intrinsic_ssbo_atomic_exchange:
4409         case nir_intrinsic_ssbo_atomic_comp_swap:
4410                 result = visit_atomic_ssbo(ctx, instr);
4411                 break;
4412         case nir_intrinsic_load_ubo:
4413                 result = visit_load_ubo_buffer(ctx, instr);
4414                 break;
4415         case nir_intrinsic_get_buffer_size:
4416                 result = visit_get_buffer_size(ctx, instr);
4417                 break;
4418         case nir_intrinsic_load_var:
4419                 result = visit_load_var(ctx, instr);
4420                 break;
4421         case nir_intrinsic_store_var:
4422                 visit_store_var(ctx, instr);
4423                 break;
4424         case nir_intrinsic_image_load:
4425                 result = visit_image_load(ctx, instr);
4426                 break;
4427         case nir_intrinsic_image_store:
4428                 visit_image_store(ctx, instr);
4429                 break;
4430         case nir_intrinsic_image_atomic_add:
4431         case nir_intrinsic_image_atomic_min:
4432         case nir_intrinsic_image_atomic_max:
4433         case nir_intrinsic_image_atomic_and:
4434         case nir_intrinsic_image_atomic_or:
4435         case nir_intrinsic_image_atomic_xor:
4436         case nir_intrinsic_image_atomic_exchange:
4437         case nir_intrinsic_image_atomic_comp_swap:
4438                 result = visit_image_atomic(ctx, instr);
4439                 break;
4440         case nir_intrinsic_image_size:
4441                 result = visit_image_size(ctx, instr);
4442                 break;
4443         case nir_intrinsic_discard:
4444         case nir_intrinsic_discard_if:
4445                 emit_discard(ctx, instr);
4446                 break;
4447         case nir_intrinsic_memory_barrier:
4448         case nir_intrinsic_group_memory_barrier:
4449         case nir_intrinsic_memory_barrier_atomic_counter:
4450         case nir_intrinsic_memory_barrier_buffer:
4451         case nir_intrinsic_memory_barrier_image:
4452         case nir_intrinsic_memory_barrier_shared:
4453                 emit_membar(ctx->nctx, instr);
4454                 break;
4455         case nir_intrinsic_barrier:
4456                 emit_barrier(&ctx->ac, ctx->stage);
4457                 break;
4458         case nir_intrinsic_var_atomic_add:
4459         case nir_intrinsic_var_atomic_imin:
4460         case nir_intrinsic_var_atomic_umin:
4461         case nir_intrinsic_var_atomic_imax:
4462         case nir_intrinsic_var_atomic_umax:
4463         case nir_intrinsic_var_atomic_and:
4464         case nir_intrinsic_var_atomic_or:
4465         case nir_intrinsic_var_atomic_xor:
4466         case nir_intrinsic_var_atomic_exchange:
4467         case nir_intrinsic_var_atomic_comp_swap:
4468                 result = visit_var_atomic(ctx->nctx, instr);
4469                 break;
4470         case nir_intrinsic_interp_var_at_centroid:
4471         case nir_intrinsic_interp_var_at_sample:
4472         case nir_intrinsic_interp_var_at_offset:
4473                 result = visit_interp(ctx, instr);
4474                 break;
4475         case nir_intrinsic_emit_vertex:
4476                 ctx->abi->emit_vertex(ctx->abi, nir_intrinsic_stream_id(instr), ctx->outputs);
4477                 break;
4478         case nir_intrinsic_end_primitive:
4479                 ctx->abi->emit_primitive(ctx->abi, nir_intrinsic_stream_id(instr));
4480                 break;
4481         case nir_intrinsic_load_tess_coord: {
4482                 LLVMTypeRef type = ctx->nctx ?
4483                         get_def_type(ctx->nctx->nir, &instr->dest.ssa) :
4484                         NULL;
4485                 result = ctx->abi->load_tess_coord(ctx->abi, type, instr->num_components);
4486                 break;
4487         }
4488         case nir_intrinsic_load_tess_level_outer:
4489                 result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_OUTER);
4490                 break;
4491         case nir_intrinsic_load_tess_level_inner:
4492                 result = ctx->abi->load_tess_level(ctx->abi, VARYING_SLOT_TESS_LEVEL_INNER);
4493                 break;
4494         case nir_intrinsic_load_patch_vertices_in:
4495                 result = ctx->abi->load_patch_vertices_in(ctx->abi);
4496                 break;
4497         case nir_intrinsic_vote_all: {
4498                 LLVMValueRef tmp = ac_build_vote_all(&ctx->ac, get_src(ctx, instr->src[0]));
4499                 result = LLVMBuildSExt(ctx->ac.builder, tmp, ctx->ac.i32, "");
4500                 break;
4501         }
4502         case nir_intrinsic_vote_any: {
4503                 LLVMValueRef tmp = ac_build_vote_any(&ctx->ac, get_src(ctx, instr->src[0]));
4504                 result = LLVMBuildSExt(ctx->ac.builder, tmp, ctx->ac.i32, "");
4505                 break;
4506         }
4507         case nir_intrinsic_vote_eq: {
4508                 LLVMValueRef tmp = ac_build_vote_eq(&ctx->ac, get_src(ctx, instr->src[0]));
4509                 result = LLVMBuildSExt(ctx->ac.builder, tmp, ctx->ac.i32, "");
4510                 break;
4511         }
4512         default:
4513                 fprintf(stderr, "Unknown intrinsic: ");
4514                 nir_print_instr(&instr->instr, stderr);
4515                 fprintf(stderr, "\n");
4516                 break;
4517         }
4518         if (result) {
4519                 _mesa_hash_table_insert(ctx->defs, &instr->dest.ssa, result);
4520         }
4521 }
4522
4523 static LLVMValueRef radv_load_ssbo(struct ac_shader_abi *abi,
4524                                    LLVMValueRef buffer_ptr, bool write)
4525 {
4526         struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi);
4527         LLVMValueRef result;
4528
4529         LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);
4530
4531         result = LLVMBuildLoad(ctx->builder, buffer_ptr, "");
4532         LLVMSetMetadata(result, ctx->ac.invariant_load_md_kind, ctx->ac.empty_md);
4533
4534         return result;
4535 }
4536
4537 static LLVMValueRef radv_load_ubo(struct ac_shader_abi *abi, LLVMValueRef buffer_ptr)
4538 {
4539         struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi);
4540         LLVMValueRef result;
4541
4542         LLVMSetMetadata(buffer_ptr, ctx->ac.uniform_md_kind, ctx->ac.empty_md);
4543
4544         result = LLVMBuildLoad(ctx->builder, buffer_ptr, "");
4545         LLVMSetMetadata(result, ctx->ac.invariant_load_md_kind, ctx->ac.empty_md);
4546
4547         return result;
4548 }
4549
4550 static LLVMValueRef radv_get_sampler_desc(struct ac_shader_abi *abi,
4551                                           unsigned descriptor_set,
4552                                           unsigned base_index,
4553                                           unsigned constant_index,
4554                                           LLVMValueRef index,
4555                                           enum ac_descriptor_type desc_type,
4556                                           bool image, bool write)
4557 {
4558         struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi);
4559         LLVMValueRef list = ctx->descriptor_sets[descriptor_set];
4560         struct radv_descriptor_set_layout *layout = ctx->options->layout->set[descriptor_set].layout;
4561         struct radv_descriptor_set_binding_layout *binding = layout->binding + base_index;
4562         unsigned offset = binding->offset;
4563         unsigned stride = binding->size;
4564         unsigned type_size;
4565         LLVMBuilderRef builder = ctx->builder;
4566         LLVMTypeRef type;
4567
4568         assert(base_index < layout->binding_count);
4569
4570         switch (desc_type) {
4571         case AC_DESC_IMAGE:
4572                 type = ctx->ac.v8i32;
4573                 type_size = 32;
4574                 break;
4575         case AC_DESC_FMASK:
4576                 type = ctx->ac.v8i32;
4577                 offset += 32;
4578                 type_size = 32;
4579                 break;
4580         case AC_DESC_SAMPLER:
4581                 type = ctx->ac.v4i32;
4582                 if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
4583                         offset += 64;
4584
4585                 type_size = 16;
4586                 break;
4587         case AC_DESC_BUFFER:
4588                 type = ctx->ac.v4i32;
4589                 type_size = 16;
4590                 break;
4591         default:
4592                 unreachable("invalid desc_type\n");
4593         }
4594
4595         offset += constant_index * stride;
4596
4597         if (desc_type == AC_DESC_SAMPLER && binding->immutable_samplers_offset &&
4598             (!index || binding->immutable_samplers_equal)) {
4599                 if (binding->immutable_samplers_equal)
4600                         constant_index = 0;
4601
4602                 const uint32_t *samplers = radv_immutable_samplers(layout, binding);
4603
4604                 LLVMValueRef constants[] = {
4605                         LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 0], 0),
4606                         LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 1], 0),
4607                         LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 2], 0),
4608                         LLVMConstInt(ctx->ac.i32, samplers[constant_index * 4 + 3], 0),
4609                 };
4610                 return ac_build_gather_values(&ctx->ac, constants, 4);
4611         }
4612
4613         assert(stride % type_size == 0);
4614
4615         if (!index)
4616                 index = ctx->ac.i32_0;
4617
4618         index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->ac.i32, stride / type_size, 0), "");
4619
4620         list = ac_build_gep0(&ctx->ac, list, LLVMConstInt(ctx->ac.i32, offset, 0));
4621         list = LLVMBuildPointerCast(builder, list, ac_array_in_const_addr_space(type), "");
4622
4623         return ac_build_load_to_sgpr(&ctx->ac, list, index);
4624 }
4625
4626 static LLVMValueRef get_sampler_desc(struct ac_nir_context *ctx,
4627                                      const nir_deref_var *deref,
4628                                      enum ac_descriptor_type desc_type,
4629                                      const nir_tex_instr *tex_instr,
4630                                      bool image, bool write)
4631 {
4632         LLVMValueRef index = NULL;
4633         unsigned constant_index = 0;
4634         unsigned descriptor_set;
4635         unsigned base_index;
4636
4637         if (!deref) {
4638                 assert(tex_instr && !image);
4639                 descriptor_set = 0;
4640                 base_index = tex_instr->sampler_index;
4641         } else {
4642                 const nir_deref *tail = &deref->deref;
4643                 while (tail->child) {
4644                         const nir_deref_array *child = nir_deref_as_array(tail->child);
4645                         unsigned array_size = glsl_get_aoa_size(tail->child->type);
4646
4647                         if (!array_size)
4648                                 array_size = 1;
4649
4650                         assert(child->deref_array_type != nir_deref_array_type_wildcard);
4651
4652                         if (child->deref_array_type == nir_deref_array_type_indirect) {
4653                                 LLVMValueRef indirect = get_src(ctx, child->indirect);
4654
4655                                 indirect = LLVMBuildMul(ctx->ac.builder, indirect,
4656                                         LLVMConstInt(ctx->ac.i32, array_size, false), "");
4657
4658                                 if (!index)
4659                                         index = indirect;
4660                                 else
4661                                         index = LLVMBuildAdd(ctx->ac.builder, index, indirect, "");
4662                         }
4663
4664                         constant_index += child->base_offset * array_size;
4665
4666                         tail = &child->deref;
4667                 }
4668                 descriptor_set = deref->var->data.descriptor_set;
4669                 base_index = deref->var->data.binding;
4670         }
4671
4672         return ctx->abi->load_sampler_desc(ctx->abi,
4673                                           descriptor_set,
4674                                           base_index,
4675                                           constant_index, index,
4676                                           desc_type, image, write);
4677 }
4678
4679 static void set_tex_fetch_args(struct ac_llvm_context *ctx,
4680                                struct ac_image_args *args,
4681                                const nir_tex_instr *instr,
4682                                nir_texop op,
4683                                LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
4684                                LLVMValueRef *param, unsigned count,
4685                                unsigned dmask)
4686 {
4687         unsigned is_rect = 0;
4688         bool da = instr->is_array || instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
4689
4690         if (op == nir_texop_lod)
4691                 da = false;
4692         /* Pad to power of two vector */
4693         while (count < util_next_power_of_two(count))
4694                 param[count++] = LLVMGetUndef(ctx->i32);
4695
4696         if (count > 1)
4697                 args->addr = ac_build_gather_values(ctx, param, count);
4698         else
4699                 args->addr = param[0];
4700
4701         args->resource = res_ptr;
4702         args->sampler = samp_ptr;
4703
4704         if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF && op == nir_texop_txf) {
4705                 args->addr = param[0];
4706                 return;
4707         }
4708
4709         args->dmask = dmask;
4710         args->unorm = is_rect;
4711         args->da = da;
4712 }
4713
4714 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
4715  *
4716  * SI-CI:
4717  *   If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
4718  *   filtering manually. The driver sets img7 to a mask clearing
4719  *   MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
4720  *     s_and_b32 samp0, samp0, img7
4721  *
4722  * VI:
4723  *   The ANISO_OVERRIDE sampler field enables this fix in TA.
4724  */
4725 static LLVMValueRef sici_fix_sampler_aniso(struct ac_nir_context *ctx,
4726                                            LLVMValueRef res, LLVMValueRef samp)
4727 {
4728         LLVMBuilderRef builder = ctx->ac.builder;
4729         LLVMValueRef img7, samp0;
4730
4731         if (ctx->ac.chip_class >= VI)
4732                 return samp;
4733
4734         img7 = LLVMBuildExtractElement(builder, res,
4735                                        LLVMConstInt(ctx->ac.i32, 7, 0), "");
4736         samp0 = LLVMBuildExtractElement(builder, samp,
4737                                         LLVMConstInt(ctx->ac.i32, 0, 0), "");
4738         samp0 = LLVMBuildAnd(builder, samp0, img7, "");
4739         return LLVMBuildInsertElement(builder, samp, samp0,
4740                                       LLVMConstInt(ctx->ac.i32, 0, 0), "");
4741 }
4742
4743 static void tex_fetch_ptrs(struct ac_nir_context *ctx,
4744                            nir_tex_instr *instr,
4745                            LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr,
4746                            LLVMValueRef *fmask_ptr)
4747 {
4748         if (instr->sampler_dim  == GLSL_SAMPLER_DIM_BUF)
4749                 *res_ptr = get_sampler_desc(ctx, instr->texture, AC_DESC_BUFFER, instr, false, false);
4750         else
4751                 *res_ptr = get_sampler_desc(ctx, instr->texture, AC_DESC_IMAGE, instr, false, false);
4752         if (samp_ptr) {
4753                 if (instr->sampler)
4754                         *samp_ptr = get_sampler_desc(ctx, instr->sampler, AC_DESC_SAMPLER, instr, false, false);
4755                 else
4756                         *samp_ptr = get_sampler_desc(ctx, instr->texture, AC_DESC_SAMPLER, instr, false, false);
4757                 if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT)
4758                         *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
4759         }
4760         if (fmask_ptr && !instr->sampler && (instr->op == nir_texop_txf_ms ||
4761                                              instr->op == nir_texop_samples_identical))
4762                 *fmask_ptr = get_sampler_desc(ctx, instr->texture, AC_DESC_FMASK, instr, false, false);
4763 }
4764
4765 static LLVMValueRef apply_round_slice(struct ac_llvm_context *ctx,
4766                                       LLVMValueRef coord)
4767 {
4768         coord = ac_to_float(ctx, coord);
4769         coord = ac_build_intrinsic(ctx, "llvm.rint.f32", ctx->f32, &coord, 1, 0);
4770         coord = ac_to_integer(ctx, coord);
4771         return coord;
4772 }
4773
/*
 * Lower one NIR texture instruction to LLVM IR and record its result in
 * ctx->defs under the instruction's SSA destination.
 *
 * The buffer size query (txs on a buffer) and texture_samples are answered
 * directly from the resource descriptor. Every other operation packs its
 * address operands into address[] in this order: packed offsets, bias,
 * depth comparator, derivatives, coordinates, then LOD or sample index.
 * The packed operands are emitted through build_tex_intrinsic() and the
 * returned vector is adjusted to what the NIR destination expects.
 */
4774 static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
4775 {
4776         LLVMValueRef result = NULL;
4777         struct ac_image_args args = { 0 };
4778         unsigned dmask = 0xf;
4779         LLVMValueRef address[16];
4780         LLVMValueRef coords[5];
4781         LLVMValueRef coord = NULL, lod = NULL, comparator = NULL;
4782         LLVMValueRef bias = NULL, offsets = NULL;
4783         LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL, sample_index = NULL;
4784         LLVMValueRef ddx = NULL, ddy = NULL;
4785         LLVMValueRef derivs[6];
4786         unsigned chan, count = 0;
4787         unsigned const_src = 0, num_deriv_comp = 0;
4788         bool lod_is_zero = false;
4789
4790         tex_fetch_ptrs(ctx, instr, &res_ptr, &samp_ptr, &fmask_ptr);
4791
             /* Collect the sources used below; the remaining source types are
              * ignored here. */
4792         for (unsigned i = 0; i < instr->num_srcs; i++) {
4793                 switch (instr->src[i].src_type) {
4794                 case nir_tex_src_coord:
4795                         coord = get_src(ctx, instr->src[i].src);
4796                         break;
4797                 case nir_tex_src_projector:
4798                         break;
4799                 case nir_tex_src_comparator:
4800                         comparator = get_src(ctx, instr->src[i].src);
4801                         break;
4802                 case nir_tex_src_offset:
4803                         offsets = get_src(ctx, instr->src[i].src);
                             /* Remembered so the txf path can re-read the
                              * offset as a constant. */
4804                         const_src = i;
4805                         break;
4806                 case nir_tex_src_bias:
4807                         bias = get_src(ctx, instr->src[i].src);
4808                         break;
4809                 case nir_tex_src_lod: {
4810                         nir_const_value *val = nir_src_as_const_value(instr->src[i].src);
4811
                             /* A constant LOD of 0 is not packed for txl and
                              * is reported to build_tex_intrinsic(). */
4812                         if (val && val->i32[0] == 0)
4813                                 lod_is_zero = true;
4814                         lod = get_src(ctx, instr->src[i].src);
4815                         break;
4816                 }
4817                 case nir_tex_src_ms_index:
4818                         sample_index = get_src(ctx, instr->src[i].src);
4819                         break;
4820                 case nir_tex_src_ms_mcs:
4821                         break;
4822                 case nir_tex_src_ddx:
4823                         ddx = get_src(ctx, instr->src[i].src);
4824                         num_deriv_comp = instr->src[i].src.ssa->num_components;
4825                         break;
4826                 case nir_tex_src_ddy:
4827                         ddy = get_src(ctx, instr->src[i].src);
4828                         break;
4829                 case nir_tex_src_texture_offset:
4830                 case nir_tex_src_sampler_offset:
4831                 case nir_tex_src_plane:
4832                 default:
4833                         break;
4834                 }
4835         }
4836
             /* Size query on a buffer: answered by get_buffer_size(). */
4837         if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
4838                 result = get_buffer_size(ctx, res_ptr, true);
4839                 goto write_result;
4840         }
4841
             /* Sample count query: computed from dword 3 of the resource
              * descriptor. When (dword3 >> 28) & 0xe equals 0xe the result is
              * 1 << ((dword3 >> 16) & 0xf), otherwise it is 1. */
4842         if (instr->op == nir_texop_texture_samples) {
4843                 LLVMValueRef res, samples, is_msaa;
4844                 res = LLVMBuildBitCast(ctx->ac.builder, res_ptr, ctx->ac.v8i32, "");
4845                 samples = LLVMBuildExtractElement(ctx->ac.builder, res,
4846                                                   LLVMConstInt(ctx->ac.i32, 3, false), "");
4847                 is_msaa = LLVMBuildLShr(ctx->ac.builder, samples,
4848                                         LLVMConstInt(ctx->ac.i32, 28, false), "");
4849                 is_msaa = LLVMBuildAnd(ctx->ac.builder, is_msaa,
4850                                        LLVMConstInt(ctx->ac.i32, 0xe, false), "");
4851                 is_msaa = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, is_msaa,
4852                                         LLVMConstInt(ctx->ac.i32, 0xe, false), "");
4853
4854                 samples = LLVMBuildLShr(ctx->ac.builder, samples,
4855                                         LLVMConstInt(ctx->ac.i32, 16, false), "");
4856                 samples = LLVMBuildAnd(ctx->ac.builder, samples,
4857                                        LLVMConstInt(ctx->ac.i32, 0xf, false), "");
4858                 samples = LLVMBuildShl(ctx->ac.builder, ctx->ac.i32_1,
4859                                        samples, "");
4860                 samples = LLVMBuildSelect(ctx->ac.builder, is_msaa, samples,
4861                                           ctx->ac.i32_1, "");
4862                 result = samples;
4863                 goto write_result;
4864         }
4865
             /* Split the coordinate vector into scalar channels. */
4866         if (coord)
4867                 for (chan = 0; chan < instr->coord_components; chan++)
4868                         coords[chan] = ac_llvm_extract_elem(&ctx->ac, coord, chan);
4869
             /* Pack texel offsets (not for txf, which adds them to the
              * coordinates further down): each component is masked to 6 bits
              * and placed in its own byte of a single dword. */
4870         if (offsets && instr->op != nir_texop_txf) {
4871                 LLVMValueRef offset[3], pack;
4872                 for (chan = 0; chan < 3; ++chan)
4873                         offset[chan] = ctx->ac.i32_0;
4874
4875                 args.offset = true;
4876                 for (chan = 0; chan < ac_get_llvm_num_components(offsets); chan++) {
4877                         offset[chan] = ac_llvm_extract_elem(&ctx->ac, offsets, chan);
4878                         offset[chan] = LLVMBuildAnd(ctx->ac.builder, offset[chan],
4879                                                     LLVMConstInt(ctx->ac.i32, 0x3f, false), "");
4880                         if (chan)
4881                                 offset[chan] = LLVMBuildShl(ctx->ac.builder, offset[chan],
4882                                                             LLVMConstInt(ctx->ac.i32, chan * 8, false), "");
4883                 }
4884                 pack = LLVMBuildOr(ctx->ac.builder, offset[0], offset[1], "");
4885                 pack = LLVMBuildOr(ctx->ac.builder, pack, offset[2], "");
4886                 address[count++] = pack;
4887
4888         }
4889         /* pack LOD bias value */
4890         if (instr->op == nir_texop_txb && bias) {
4891                 address[count++] = bias;
4892         }
4893
4894         /* Pack depth comparison value */
4895         if (instr->is_shadow && comparator) {
4896                 LLVMValueRef z = ac_to_float(&ctx->ac,
4897                                              ac_llvm_extract_elem(&ctx->ac, comparator, 0));
4898
4899                 /* TC-compatible HTILE on radeonsi promotes Z16 and Z24 to Z32_FLOAT,
4900                  * so the depth comparison value isn't clamped for Z16 and
4901                  * Z24 anymore. Do it manually here.
4902                  *
4903                  * It's unnecessary if the original texture format was
4904                  * Z32_FLOAT, but we don't know that here.
4905                  */
4906                 if (ctx->ac.chip_class == VI && ctx->abi->clamp_shadow_reference)
4907                         z = ac_build_clamp(&ctx->ac, z);
4908
4909                 address[count++] = z;
4910         }
4911
4912         /* pack derivatives */
             /* derivs[] holds the ddx channels first and the ddy channels
              * starting at index num_dest_deriv_channels. Destination
              * channels beyond the source channel count are zero-filled
              * (only the GFX9 1D case has such channels). */
4913         if (ddx || ddy) {
4914                 int num_src_deriv_channels, num_dest_deriv_channels;
4915                 switch (instr->sampler_dim) {
4916                 case GLSL_SAMPLER_DIM_3D:
4917                 case GLSL_SAMPLER_DIM_CUBE:
4918                         num_deriv_comp = 3;
4919                         num_src_deriv_channels = 3;
4920                         num_dest_deriv_channels = 3;
4921                         break;
4922                 case GLSL_SAMPLER_DIM_2D:
4923                 default:
4924                         num_src_deriv_channels = 2;
4925                         num_dest_deriv_channels = 2;
4926                         num_deriv_comp = 2;
4927                         break;
4928                 case GLSL_SAMPLER_DIM_1D:
4929                         num_src_deriv_channels = 1;
4930                         if (ctx->ac.chip_class >= GFX9) {
4931                                 num_dest_deriv_channels = 2;
4932                                 num_deriv_comp = 2;
4933                         } else {
4934                                 num_dest_deriv_channels = 1;
4935                                 num_deriv_comp = 1;
4936                         }
4937                         break;
4938                 }
4939
4940                 for (unsigned i = 0; i < num_src_deriv_channels; i++) {
4941                         derivs[i] = ac_to_float(&ctx->ac, ac_llvm_extract_elem(&ctx->ac, ddx, i));
4942                         derivs[num_dest_deriv_channels + i] = ac_to_float(&ctx->ac, ac_llvm_extract_elem(&ctx->ac, ddy, i));
4943                 }
4944                 for (unsigned i = num_src_deriv_channels; i < num_dest_deriv_channels; i++) {
4945                         derivs[i] = ctx->ac.f32_0;
4946                         derivs[num_dest_deriv_channels + i] = ctx->ac.f32_0;
4947                 }
4948         }
4949
             /* Cube maps: coords (and derivs) are handed to
              * ac_prepare_cube_coords() as floats. Afterwards one fewer
              * derivative component per direction is packed. */
4950         if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && coord) {
4951                 for (chan = 0; chan < instr->coord_components; chan++)
4952                         coords[chan] = ac_to_float(&ctx->ac, coords[chan]);
4953                 if (instr->coord_components == 3)
4954                         coords[3] = LLVMGetUndef(ctx->ac.f32);
4955                 ac_prepare_cube_coords(&ctx->ac,
4956                         instr->op == nir_texop_txd, instr->is_array,
4957                         instr->op == nir_texop_lod, coords, derivs);
4958                 if (num_deriv_comp)
4959                         num_deriv_comp--;
4960         }
4961
4962         if (ddx || ddy) {
4963                 for (unsigned i = 0; i < num_deriv_comp * 2; i++)
4964                         address[count++] = derivs[i];
4965         }
4966
4967         /* Pack texture coordinates */
4968         if (coord) {
4969                 address[count++] = coords[0];
4970                 if (instr->coord_components > 1) {
                             /* The layer of a 1D array is rounded, except for
                              * txf. */
4971                         if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && instr->is_array && instr->op != nir_texop_txf) {
4972                                 coords[1] = apply_round_slice(&ctx->ac, coords[1]);
4973                         }
4974                         address[count++] = coords[1];
4975                 }
4976                 if (instr->coord_components > 2) {
4977                         /* This seems like a bit of a hack - but it passes Vulkan CTS with it */
4978                         if (instr->sampler_dim != GLSL_SAMPLER_DIM_3D &&
4979                             instr->sampler_dim != GLSL_SAMPLER_DIM_CUBE &&
4980                             instr->op != nir_texop_txf) {
4981                                 coords[2] = apply_round_slice(&ctx->ac, coords[2]);
4982                         }
4983                         address[count++] = coords[2];
4984                 }
4985
                     /* On GFX9 a 1D lookup gets one extra filler coordinate
                      * (integer 0 for txf, 0.5 otherwise). For arrays it is
                      * inserted in front of the last packed operand,
                      * otherwise it is appended. */
4986                 if (ctx->ac.chip_class >= GFX9) {
4987                         LLVMValueRef filler;
4988                         if (instr->op == nir_texop_txf)
4989                                 filler = ctx->ac.i32_0;
4990                         else
4991                                 filler = LLVMConstReal(ctx->ac.f32, 0.5);
4992
4993                         if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D) {
4994                                 /* No nir_texop_lod, because it does not take a slice
4995                                  * even with array textures. */
4996                                 if (instr->is_array && instr->op != nir_texop_lod ) {
4997                                         address[count] = address[count - 1];
4998                                         address[count - 1] = filler;
4999                                         count++;
5000                                 } else
5001                                         address[count++] = filler;
5002                         }
5003                 }
5004         }
5005
5006         /* Pack LOD */
             /* txs discards everything packed so far: its only operand is the
              * LOD, or 0 when no LOD source was given. */
5007         if (lod && ((instr->op == nir_texop_txl && !lod_is_zero) ||
5008                     instr->op == nir_texop_txf)) {
5009                 address[count++] = lod;
5010         } else if (instr->op == nir_texop_txf_ms && sample_index) {
5011                 address[count++] = sample_index;
5012         } else if(instr->op == nir_texop_txs) {
5013                 count = 0;
5014                 if (lod)
5015                         address[count++] = lod;
5016                 else
5017                         address[count++] = ctx->ac.i32_0;
5018         }
5019
             /* All address operands are passed on as i32. */
5020         for (chan = 0; chan < count; chan++) {
5021                 address[chan] = LLVMBuildBitCast(ctx->ac.builder,
5022                                                  address[chan], ctx->ac.i32, "");
5023         }
5024
             /* samples_identical: do a txf-style fetch from the FMASK
              * descriptor and compare component 0 of the result with 0. */
5025         if (instr->op == nir_texop_samples_identical) {
5026                 LLVMValueRef txf_address[4];
5027                 struct ac_image_args txf_args = { 0 };
5028                 unsigned txf_count = count;
5029                 memcpy(txf_address, address, sizeof(txf_address));
5030
5031                 if (!instr->is_array)
5032                         txf_address[2] = ctx->ac.i32_0;
5033                 txf_address[3] = ctx->ac.i32_0;
5034
5035                 set_tex_fetch_args(&ctx->ac, &txf_args, instr, nir_texop_txf,
5036                                    fmask_ptr, NULL,
5037                                    txf_address, txf_count, 0xf);
5038
5039                 result = build_tex_intrinsic(ctx, instr, false, &txf_args);
5040
5041                 result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, "");
5042                 result = emit_int_cmp(&ctx->ac, LLVMIntEQ, result, ctx->ac.i32_0);
5043                 goto write_result;
5044         }
5045
             /* Multisampled fetch: the sample index operand (slot 2, or 3 for
              * arrays) is replaced by the value returned from
              * adjust_sample_index_using_fmask(). */
5046         if (instr->sampler_dim == GLSL_SAMPLER_DIM_MS &&
5047             instr->op != nir_texop_txs) {
5048                 unsigned sample_chan = instr->is_array ? 3 : 2;
5049                 address[sample_chan] = adjust_sample_index_using_fmask(&ctx->ac,
5050                                                                        address[0],
5051                                                                        address[1],
5052                                                                        instr->is_array ? address[2] : NULL,
5053                                                                        address[sample_chan],
5054                                                                        fmask_ptr);
5055         }
5056
             /* txf: the offset source is asserted to be constant and is added
              * directly to the first min(offset components, coord components)
              * address operands. */
5057         if (offsets && instr->op == nir_texop_txf) {
5058                 nir_const_value *const_offset =
5059                         nir_src_as_const_value(instr->src[const_src].src);
5060                 int num_offsets = instr->src[const_src].src.ssa->num_components;
5061                 assert(const_offset);
5062                 num_offsets = MIN2(num_offsets, instr->coord_components);
5063                 if (num_offsets > 2)
5064                         address[2] = LLVMBuildAdd(ctx->ac.builder,
5065                                                   address[2], LLVMConstInt(ctx->ac.i32, const_offset->i32[2], false), "");
5066                 if (num_offsets > 1)
5067                         address[1] = LLVMBuildAdd(ctx->ac.builder,
5068                                                   address[1], LLVMConstInt(ctx->ac.i32, const_offset->i32[1], false), "");
5069                 address[0] = LLVMBuildAdd(ctx->ac.builder,
5070                                           address[0], LLVMConstInt(ctx->ac.i32, const_offset->i32[0], false), "");
5071
5072         }
5073
5074         /* TODO TG4 support */
             /* For tg4 the dmask selects a single component: bit 0 for shadow
              * lookups, otherwise the bit for instr->component. */
5075         if (instr->op == nir_texop_tg4) {
5076                 if (instr->is_shadow)
5077                         dmask = 1;
5078                 else
5079                         dmask = 1 << instr->component;
5080         }
5081         set_tex_fetch_args(&ctx->ac, &args, instr, instr->op,
5082                            res_ptr, samp_ptr, address, count, dmask);
5083
5084         result = build_tex_intrinsic(ctx, instr, lod_is_zero, &args);
5085
             /* Adjust the returned vector to the NIR destination:
              *  - query_levels takes element 3;
              *  - new-style shadow lookups (except txs/lod/tg4) take element 0;
              *  - txs on a cube array divides element 2 by 6;
              *  - txs on a GFX9 1D array copies element 2 into element 1;
              *  - everything else is trimmed to the destination's component
              *    count when that is not 4. */
5086         if (instr->op == nir_texop_query_levels)
5087                 result = LLVMBuildExtractElement(ctx->ac.builder, result, LLVMConstInt(ctx->ac.i32, 3, false), "");
5088         else if (instr->is_shadow && instr->is_new_style_shadow &&
5089                  instr->op != nir_texop_txs && instr->op != nir_texop_lod &&
5090                  instr->op != nir_texop_tg4)
5091                 result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, "");
5092         else if (instr->op == nir_texop_txs &&
5093                  instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
5094                  instr->is_array) {
5095                 LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false);
5096                 LLVMValueRef six = LLVMConstInt(ctx->ac.i32, 6, false);
5097                 LLVMValueRef z = LLVMBuildExtractElement(ctx->ac.builder, result, two, "");
5098                 z = LLVMBuildSDiv(ctx->ac.builder, z, six, "");
5099                 result = LLVMBuildInsertElement(ctx->ac.builder, result, z, two, "");
5100         } else if (ctx->ac.chip_class >= GFX9 &&
5101                    instr->op == nir_texop_txs &&
5102                    instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
5103                    instr->is_array) {
5104                 LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false);
5105                 LLVMValueRef layers = LLVMBuildExtractElement(ctx->ac.builder, result, two, "");
5106                 result = LLVMBuildInsertElement(ctx->ac.builder, result, layers,
5107                                                 ctx->ac.i32_1, "");
5108         } else if (instr->dest.ssa.num_components != 4)
5109                 result = trim_vector(&ctx->ac, result, instr->dest.ssa.num_components);
5110
5111 write_result:
             /* Store the value, converted with ac_to_integer(), as the
              * definition of the instruction's SSA destination. */
5112         if (result) {
5113                 assert(instr->dest.is_ssa);
5114                 result = ac_to_integer(&ctx->ac, result);
5115                 _mesa_hash_table_insert(ctx->defs, &instr->dest.ssa, result);
5116         }
5117 }
5118
5119
5120 static void visit_phi(struct ac_nir_context *ctx, nir_phi_instr *instr)
5121 {
5122         LLVMTypeRef type = get_def_type(ctx, &instr->dest.ssa);
5123         LLVMValueRef result = LLVMBuildPhi(ctx->ac.builder, type, "");
5124
5125         _mesa_hash_table_insert(ctx->defs, &instr->dest.ssa, result);
5126         _mesa_hash_table_insert(ctx->phis, instr, result);
5127 }
5128
5129 static void visit_post_phi(struct ac_nir_context *ctx,
5130                            nir_phi_instr *instr,
5131                            LLVMValueRef llvm_phi)
5132 {
5133         nir_foreach_phi_src(src, instr) {
5134                 LLVMBasicBlockRef block = get_block(ctx, src->pred);
5135                 LLVMValueRef llvm_src = get_src(ctx, src->src);
5136
5137                 LLVMAddIncoming(llvm_phi, &llvm_src, &block, 1);
5138         }
5139 }
5140
5141 static void phi_post_pass(struct ac_nir_context *ctx)
5142 {
5143         struct hash_entry *entry;
5144         hash_table_foreach(ctx->phis, entry) {
5145                 visit_post_phi(ctx, (nir_phi_instr*)entry->key,
5146                                (LLVMValueRef)entry->data);
5147         }
5148 }
5149
5150
5151 static void visit_ssa_undef(struct ac_nir_context *ctx,
5152                             const nir_ssa_undef_instr *instr)
5153 {
5154         unsigned num_components = instr->def.num_components;
5155         LLVMTypeRef type = LLVMIntTypeInContext(ctx->ac.context, instr->def.bit_size);
5156         LLVMValueRef undef;
5157
5158         if (num_components == 1)
5159                 undef = LLVMGetUndef(type);
5160         else {
5161                 undef = LLVMGetUndef(LLVMVectorType(type, num_components));
5162         }
5163         _mesa_hash_table_insert(ctx->defs, &instr->def, undef);
5164 }
5165
5166 static void visit_jump(struct ac_nir_context *ctx,
5167                        const nir_jump_instr *instr)
5168 {
5169         switch (instr->type) {
5170         case nir_jump_break:
5171                 LLVMBuildBr(ctx->ac.builder, ctx->break_block);
5172                 LLVMClearInsertionPosition(ctx->ac.builder);
5173                 break;
5174         case nir_jump_continue:
5175                 LLVMBuildBr(ctx->ac.builder, ctx->continue_block);
5176                 LLVMClearInsertionPosition(ctx->ac.builder);
5177                 break;
5178         default:
5179                 fprintf(stderr, "Unknown NIR jump instr: ");
5180                 nir_print_instr(&instr->instr, stderr);
5181                 fprintf(stderr, "\n");
5182                 abort();
5183         }
5184 }
5185
5186 static void visit_cf_list(struct ac_nir_context *ctx,
5187                           struct exec_list *list);
5188
5189 static void visit_block(struct ac_nir_context *ctx, nir_block *block)
5190 {
5191         LLVMBasicBlockRef llvm_block = LLVMGetInsertBlock(ctx->ac.builder);
5192         nir_foreach_instr(instr, block)
5193         {
5194                 switch (instr->type) {
5195                 case nir_instr_type_alu:
5196                         visit_alu(ctx, nir_instr_as_alu(instr));
5197                         break;
5198                 case nir_instr_type_load_const:
5199                         visit_load_const(ctx, nir_instr_as_load_const(instr));
5200                         break;
5201                 case nir_instr_type_intrinsic:
5202                         visit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
5203                         break;
5204                 case nir_instr_type_tex:
5205                         visit_tex(ctx, nir_instr_as_tex(instr));
5206                         break;
5207                 case nir_instr_type_phi:
5208                         visit_phi(ctx, nir_instr_as_phi(instr));
5209                         break;
5210                 case nir_instr_type_ssa_undef:
5211                         visit_ssa_undef(ctx, nir_instr_as_ssa_undef(instr));
5212                         break;
5213                 case nir_instr_type_jump:
5214                         visit_jump(ctx, nir_instr_as_jump(instr));
5215                         break;
5216                 default:
5217                         fprintf(stderr, "Unknown NIR instr type: ");
5218                         nir_print_instr(instr, stderr);
5219                         fprintf(stderr, "\n");
5220                         abort();
5221                 }
5222         }
5223
5224         _mesa_hash_table_insert(ctx->defs, block, llvm_block);
5225 }
5226
5227 static void visit_if(struct ac_nir_context *ctx, nir_if *if_stmt)
5228 {
5229         LLVMValueRef value = get_src(ctx, if_stmt->condition);
5230
5231         LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->ac.builder));
5232         LLVMBasicBlockRef merge_block =
5233             LLVMAppendBasicBlockInContext(ctx->ac.context, fn, "");
5234         LLVMBasicBlockRef if_block =
5235             LLVMAppendBasicBlockInContext(ctx->ac.context, fn, "");
5236         LLVMBasicBlockRef else_block = merge_block;
5237         if (!exec_list_is_empty(&if_stmt->else_list))
5238                 else_block = LLVMAppendBasicBlockInContext(
5239                     ctx->ac.context, fn, "");
5240
5241         LLVMValueRef cond = LLVMBuildICmp(ctx->ac.builder, LLVMIntNE, value,
5242                                           ctx->ac.i32_0, "");
5243         LLVMBuildCondBr(ctx->ac.builder, cond, if_block, else_block);
5244
5245         LLVMPositionBuilderAtEnd(ctx->ac.builder, if_block);
5246         visit_cf_list(ctx, &if_stmt->then_list);
5247         if (LLVMGetInsertBlock(ctx->ac.builder))
5248                 LLVMBuildBr(ctx->ac.builder, merge_block);
5249
5250         if (!exec_list_is_empty(&if_stmt->else_list)) {
5251                 LLVMPositionBuilderAtEnd(ctx->ac.builder, else_block);
5252                 visit_cf_list(ctx, &if_stmt->else_list);
5253                 if (LLVMGetInsertBlock(ctx->ac.builder))
5254                         LLVMBuildBr(ctx->ac.builder, merge_block);
5255         }
5256
5257         LLVMPositionBuilderAtEnd(ctx->ac.builder, merge_block);
5258 }
5259
5260 static void visit_loop(struct ac_nir_context *ctx, nir_loop *loop)
5261 {
5262         LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx->ac.builder));
5263         LLVMBasicBlockRef continue_parent = ctx->continue_block;
5264         LLVMBasicBlockRef break_parent = ctx->break_block;
5265
5266         ctx->continue_block =
5267             LLVMAppendBasicBlockInContext(ctx->ac.context, fn, "");
5268         ctx->break_block =
5269             LLVMAppendBasicBlockInContext(ctx->ac.context, fn, "");
5270
5271         LLVMBuildBr(ctx->ac.builder, ctx->continue_block);
5272         LLVMPositionBuilderAtEnd(ctx->ac.builder, ctx->continue_block);
5273         visit_cf_list(ctx, &loop->body);
5274
5275         if (LLVMGetInsertBlock(ctx->ac.builder))
5276                 LLVMBuildBr(ctx->ac.builder, ctx->continue_block);
5277         LLVMPositionBuilderAtEnd(ctx->ac.builder, ctx->break_block);
5278
5279         ctx->continue_block = continue_parent;
5280         ctx->break_block = break_parent;
5281 }
5282
5283 static void visit_cf_list(struct ac_nir_context *ctx,
5284                           struct exec_list *list)
5285 {
5286         foreach_list_typed(nir_cf_node, node, node, list)
5287         {
5288                 switch (node->type) {
5289                 case nir_cf_node_block:
5290                         visit_block(ctx, nir_cf_node_as_block(node));
5291                         break;
5292
5293                 case nir_cf_node_if:
5294                         visit_if(ctx, nir_cf_node_as_if(node));
5295                         break;
5296
5297                 case nir_cf_node_loop:
5298                         visit_loop(ctx, nir_cf_node_as_loop(node));
5299                         break;
5300
5301                 default:
5302                         assert(0);
5303                 }
5304         }
5305 }
5306
5307 static void
5308 handle_vs_input_decl(struct nir_to_llvm_context *ctx,
5309                      struct nir_variable *variable)
5310 {
5311         LLVMValueRef t_list_ptr = ctx->vertex_buffers;
5312         LLVMValueRef t_offset;
5313         LLVMValueRef t_list;
5314         LLVMValueRef input;
5315         LLVMValueRef buffer_index;
5316         int index = variable->data.location - VERT_ATTRIB_GENERIC0;
5317         int idx = variable->data.location;
5318         unsigned attrib_count = glsl_count_attribute_slots(variable->type, true);
5319
5320         variable->data.driver_location = idx * 4;
5321
5322         for (unsigned i = 0; i < attrib_count; ++i, ++idx) {
5323                 if (ctx->options->key.vs.instance_rate_inputs & (1u << (index + i))) {
5324                         buffer_index = LLVMBuildAdd(ctx->builder, ctx->abi.instance_id,
5325                                                     ctx->abi.start_instance, "");
5326                         if (ctx->options->key.vs.as_ls) {
5327                                 ctx->shader_info->vs.vgpr_comp_cnt =
5328                                         MAX2(2, ctx->shader_info->vs.vgpr_comp_cnt);
5329                         } else {
5330                                 ctx->shader_info->vs.vgpr_comp_cnt =
5331                                         MAX2(1, ctx->shader_info->vs.vgpr_comp_cnt);
5332                         }
5333                 } else
5334                         buffer_index = LLVMBuildAdd(ctx->builder, ctx->abi.vertex_id,
5335                                                     ctx->abi.base_vertex, "");
5336                 t_offset = LLVMConstInt(ctx->ac.i32, index + i, false);
5337
5338                 t_list = ac_build_load_to_sgpr(&ctx->ac, t_list_ptr, t_offset);
5339
5340                 input = ac_build_buffer_load_format(&ctx->ac, t_list,
5341                                                     buffer_index,
5342                                                     ctx->ac.i32_0,
5343                                                     4, false, true);
5344
5345                 for (unsigned chan = 0; chan < 4; chan++) {
5346                         LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false);
5347                         ctx->inputs[radeon_llvm_reg_index_soa(idx, chan)] =
5348                                 ac_to_integer(&ctx->ac, LLVMBuildExtractElement(ctx->builder,
5349                                                         input, llvm_chan, ""));
5350                 }
5351         }
5352 }
5353
5354 static void interp_fs_input(struct nir_to_llvm_context *ctx,
5355                             unsigned attr,
5356                             LLVMValueRef interp_param,
5357                             LLVMValueRef prim_mask,
5358                             LLVMValueRef result[4])
5359 {
5360         LLVMValueRef attr_number;
5361         unsigned chan;
5362         LLVMValueRef i, j;
5363         bool interp = interp_param != NULL;
5364
5365         attr_number = LLVMConstInt(ctx->ac.i32, attr, false);
5366
5367         /* fs.constant returns the param from the middle vertex, so it's not
5368          * really useful for flat shading. It's meant to be used for custom
5369          * interpolation (but the intrinsic can't fetch from the other two
5370          * vertices).
5371          *
5372          * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
5373          * to do the right thing. The only reason we use fs.constant is that
5374          * fs.interp cannot be used on integers, because they can be equal
5375          * to NaN.
5376          */
5377         if (interp) {
5378                 interp_param = LLVMBuildBitCast(ctx->builder, interp_param,
5379                                                 ctx->ac.v2f32, "");
5380
5381                 i = LLVMBuildExtractElement(ctx->builder, interp_param,
5382                                                 ctx->ac.i32_0, "");
5383                 j = LLVMBuildExtractElement(ctx->builder, interp_param,
5384                                                 ctx->ac.i32_1, "");
5385         }
5386
5387         for (chan = 0; chan < 4; chan++) {
5388                 LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false);
5389
5390                 if (interp) {
5391                         result[chan] = ac_build_fs_interp(&ctx->ac,
5392                                                           llvm_chan,
5393                                                           attr_number,
5394                                                           prim_mask, i, j);
5395                 } else {
5396                         result[chan] = ac_build_fs_interp_mov(&ctx->ac,
5397                                                               LLVMConstInt(ctx->ac.i32, 2, false),
5398                                                               llvm_chan,
5399                                                               attr_number,
5400                                                               prim_mask);
5401                 }
5402         }
5403 }
5404
5405 static void
5406 handle_fs_input_decl(struct nir_to_llvm_context *ctx,
5407                      struct nir_variable *variable)
5408 {
5409         int idx = variable->data.location;
5410         unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
5411         LLVMValueRef interp;
5412
5413         variable->data.driver_location = idx * 4;
5414         ctx->input_mask |= ((1ull << attrib_count) - 1) << variable->data.location;
5415
5416         if (glsl_get_base_type(glsl_without_array(variable->type)) == GLSL_TYPE_FLOAT) {
5417                 unsigned interp_type;
5418                 if (variable->data.sample) {
5419                         interp_type = INTERP_SAMPLE;
5420                         ctx->shader_info->info.ps.force_persample = true;
5421                 } else if (variable->data.centroid)
5422                         interp_type = INTERP_CENTROID;
5423                 else
5424                         interp_type = INTERP_CENTER;
5425
5426                 interp = lookup_interp_param(&ctx->abi, variable->data.interpolation, interp_type);
5427         } else
5428                 interp = NULL;
5429
5430         for (unsigned i = 0; i < attrib_count; ++i)
5431                 ctx->inputs[radeon_llvm_reg_index_soa(idx + i, 0)] = interp;
5432
5433 }
5434
5435 static void
5436 handle_vs_inputs(struct nir_to_llvm_context *ctx,
5437                  struct nir_shader *nir) {
5438         nir_foreach_variable(variable, &nir->inputs)
5439                 handle_vs_input_decl(ctx, variable);
5440 }
5441
5442 static void
5443 prepare_interp_optimize(struct nir_to_llvm_context *ctx,
5444                         struct nir_shader *nir)
5445 {
5446         if (!ctx->options->key.fs.multisample)
5447                 return;
5448
5449         bool uses_center = false;
5450         bool uses_centroid = false;
5451         nir_foreach_variable(variable, &nir->inputs) {
5452                 if (glsl_get_base_type(glsl_without_array(variable->type)) != GLSL_TYPE_FLOAT ||
5453                     variable->data.sample)
5454                         continue;
5455
5456                 if (variable->data.centroid)
5457                         uses_centroid = true;
5458                 else
5459                         uses_center = true;
5460         }
5461
5462         if (uses_center && uses_centroid) {
5463                 LLVMValueRef sel = LLVMBuildICmp(ctx->builder, LLVMIntSLT, ctx->abi.prim_mask, ctx->ac.i32_0, "");
5464                 ctx->persp_centroid = LLVMBuildSelect(ctx->builder, sel, ctx->persp_center, ctx->persp_centroid, "");
5465                 ctx->linear_centroid = LLVMBuildSelect(ctx->builder, sel, ctx->linear_center, ctx->linear_centroid, "");
5466         }
5467 }
5468
5469 static void
5470 handle_fs_inputs(struct nir_to_llvm_context *ctx,
5471                  struct nir_shader *nir)
5472 {
5473         prepare_interp_optimize(ctx, nir);
5474
5475         nir_foreach_variable(variable, &nir->inputs)
5476                 handle_fs_input_decl(ctx, variable);
5477
5478         unsigned index = 0;
5479
5480         if (ctx->shader_info->info.ps.uses_input_attachments ||
5481             ctx->shader_info->info.needs_multiview_view_index)
5482                 ctx->input_mask |= 1ull << VARYING_SLOT_LAYER;
5483
5484         for (unsigned i = 0; i < RADEON_LLVM_MAX_INPUTS; ++i) {
5485                 LLVMValueRef interp_param;
5486                 LLVMValueRef *inputs = ctx->inputs +radeon_llvm_reg_index_soa(i, 0);
5487
5488                 if (!(ctx->input_mask & (1ull << i)))
5489                         continue;
5490
5491                 if (i >= VARYING_SLOT_VAR0 || i == VARYING_SLOT_PNTC ||
5492                     i == VARYING_SLOT_PRIMITIVE_ID || i == VARYING_SLOT_LAYER) {
5493                         interp_param = *inputs;
5494                         interp_fs_input(ctx, index, interp_param, ctx->abi.prim_mask,
5495                                         inputs);
5496
5497                         if (!interp_param)
5498                                 ctx->shader_info->fs.flat_shaded_mask |= 1u << index;
5499                         ++index;
5500                 } else if (i == VARYING_SLOT_POS) {
5501                         for(int i = 0; i < 3; ++i)
5502                                 inputs[i] = ctx->abi.frag_pos[i];
5503
5504                         inputs[3] = ac_build_fdiv(&ctx->ac, ctx->ac.f32_1,
5505                                                   ctx->abi.frag_pos[3]);
5506                 }
5507         }
5508         ctx->shader_info->fs.num_interp = index;
5509         if (ctx->input_mask & (1 << VARYING_SLOT_PNTC))
5510                 ctx->shader_info->fs.has_pcoord = true;
5511         if (ctx->input_mask & (1 << VARYING_SLOT_PRIMITIVE_ID))
5512                 ctx->shader_info->fs.prim_id_input = true;
5513         if (ctx->input_mask & (1 << VARYING_SLOT_LAYER))
5514                 ctx->shader_info->fs.layer_input = true;
5515         ctx->shader_info->fs.input_mask = ctx->input_mask >> VARYING_SLOT_VAR0;
5516
5517         if (ctx->shader_info->info.needs_multiview_view_index)
5518                 ctx->view_index = ctx->inputs[radeon_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)];
5519 }
5520
5521 static LLVMValueRef
5522 ac_build_alloca(struct ac_llvm_context *ac,
5523                 LLVMTypeRef type,
5524                 const char *name)
5525 {
5526         LLVMBuilderRef builder = ac->builder;
5527         LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
5528         LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
5529         LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
5530         LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
5531         LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ac->context);
5532         LLVMValueRef res;
5533
5534         if (first_instr) {
5535                 LLVMPositionBuilderBefore(first_builder, first_instr);
5536         } else {
5537                 LLVMPositionBuilderAtEnd(first_builder, first_block);
5538         }
5539
5540         res = LLVMBuildAlloca(first_builder, type, name);
5541         LLVMBuildStore(builder, LLVMConstNull(type), res);
5542
5543         LLVMDisposeBuilder(first_builder);
5544
5545         return res;
5546 }
5547
5548 static LLVMValueRef si_build_alloca_undef(struct ac_llvm_context *ac,
5549                                           LLVMTypeRef type,
5550                                           const char *name)
5551 {
5552         LLVMValueRef ptr = ac_build_alloca(ac, type, name);
5553         LLVMBuildStore(ac->builder, LLVMGetUndef(type), ptr);
5554         return ptr;
5555 }
5556
5557 static void
5558 scan_shader_output_decl(struct nir_to_llvm_context *ctx,
5559                         struct nir_variable *variable,
5560                         struct nir_shader *shader,
5561                         gl_shader_stage stage)
5562 {
5563         int idx = variable->data.location + variable->data.index;
5564         unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
5565         uint64_t mask_attribs;
5566
5567         variable->data.driver_location = idx * 4;
5568
5569         /* tess ctrl has it's own load/store paths for outputs */
5570         if (stage == MESA_SHADER_TESS_CTRL)
5571                 return;
5572
5573         mask_attribs = ((1ull << attrib_count) - 1) << idx;
5574         if (stage == MESA_SHADER_VERTEX ||
5575             stage == MESA_SHADER_TESS_EVAL ||
5576             stage == MESA_SHADER_GEOMETRY) {
5577                 if (idx == VARYING_SLOT_CLIP_DIST0) {
5578                         int length = shader->info.clip_distance_array_size +
5579                                      shader->info.cull_distance_array_size;
5580                         if (stage == MESA_SHADER_VERTEX) {
5581                                 ctx->shader_info->vs.outinfo.clip_dist_mask = (1 << shader->info.clip_distance_array_size) - 1;
5582                                 ctx->shader_info->vs.outinfo.cull_dist_mask = (1 << shader->info.cull_distance_array_size) - 1;
5583                         }
5584                         if (stage == MESA_SHADER_TESS_EVAL) {
5585                                 ctx->shader_info->tes.outinfo.clip_dist_mask = (1 << shader->info.clip_distance_array_size) - 1;
5586                                 ctx->shader_info->tes.outinfo.cull_dist_mask = (1 << shader->info.cull_distance_array_size) - 1;
5587                         }
5588
5589                         if (length > 4)
5590                                 attrib_count = 2;
5591                         else
5592                                 attrib_count = 1;
5593                         mask_attribs = 1ull << idx;
5594                 }
5595         }
5596
5597         ctx->output_mask |= mask_attribs;
5598 }
5599
5600 static void
5601 handle_shader_output_decl(struct ac_nir_context *ctx,
5602                           struct nir_shader *nir,
5603                           struct nir_variable *variable)
5604 {
5605         unsigned output_loc = variable->data.driver_location / 4;
5606         unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
5607
5608         /* tess ctrl has it's own load/store paths for outputs */
5609         if (ctx->stage == MESA_SHADER_TESS_CTRL)
5610                 return;
5611
5612         if (ctx->stage == MESA_SHADER_VERTEX ||
5613             ctx->stage == MESA_SHADER_TESS_EVAL ||
5614             ctx->stage == MESA_SHADER_GEOMETRY) {
5615                 int idx = variable->data.location + variable->data.index;
5616                 if (idx == VARYING_SLOT_CLIP_DIST0) {
5617                         int length = nir->info.clip_distance_array_size +
5618                                      nir->info.cull_distance_array_size;
5619
5620                         if (length > 4)
5621                                 attrib_count = 2;
5622                         else
5623                                 attrib_count = 1;
5624                 }
5625         }
5626
5627         for (unsigned i = 0; i < attrib_count; ++i) {
5628                 for (unsigned chan = 0; chan < 4; chan++) {
5629                         ctx->outputs[radeon_llvm_reg_index_soa(output_loc + i, chan)] =
5630                                        si_build_alloca_undef(&ctx->ac, ctx->ac.f32, "");
5631                 }
5632         }
5633 }
5634
5635 static LLVMTypeRef
5636 glsl_base_to_llvm_type(struct nir_to_llvm_context *ctx,
5637                        enum glsl_base_type type)
5638 {
5639         switch (type) {
5640         case GLSL_TYPE_INT:
5641         case GLSL_TYPE_UINT:
5642         case GLSL_TYPE_BOOL:
5643         case GLSL_TYPE_SUBROUTINE:
5644                 return ctx->ac.i32;
5645         case GLSL_TYPE_FLOAT: /* TODO handle mediump */
5646                 return ctx->ac.f32;
5647         case GLSL_TYPE_INT64:
5648         case GLSL_TYPE_UINT64:
5649                 return ctx->ac.i64;
5650         case GLSL_TYPE_DOUBLE:
5651                 return ctx->ac.f64;
5652         default:
5653                 unreachable("unknown GLSL type");
5654         }
5655 }
5656
5657 static LLVMTypeRef
5658 glsl_to_llvm_type(struct nir_to_llvm_context *ctx,
5659                   const struct glsl_type *type)
5660 {
5661         if (glsl_type_is_scalar(type)) {
5662                 return glsl_base_to_llvm_type(ctx, glsl_get_base_type(type));
5663         }
5664
5665         if (glsl_type_is_vector(type)) {
5666                 return LLVMVectorType(
5667                    glsl_base_to_llvm_type(ctx, glsl_get_base_type(type)),
5668                    glsl_get_vector_elements(type));
5669         }
5670
5671         if (glsl_type_is_matrix(type)) {
5672                 return LLVMArrayType(
5673                    glsl_to_llvm_type(ctx, glsl_get_column_type(type)),
5674                    glsl_get_matrix_columns(type));
5675         }
5676
5677         if (glsl_type_is_array(type)) {
5678                 return LLVMArrayType(
5679                    glsl_to_llvm_type(ctx, glsl_get_array_element(type)),
5680                    glsl_get_length(type));
5681         }
5682
5683         assert(glsl_type_is_struct(type));
5684
5685         LLVMTypeRef member_types[glsl_get_length(type)];
5686
5687         for (unsigned i = 0; i < glsl_get_length(type); i++) {
5688                 member_types[i] =
5689                         glsl_to_llvm_type(ctx,
5690                                           glsl_get_struct_field(type, i));
5691         }
5692
5693         return LLVMStructTypeInContext(ctx->context, member_types,
5694                                        glsl_get_length(type), false);
5695 }
5696
5697 static void
5698 setup_locals(struct ac_nir_context *ctx,
5699              struct nir_function *func)
5700 {
5701         int i, j;
5702         ctx->num_locals = 0;
5703         nir_foreach_variable(variable, &func->impl->locals) {
5704                 unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
5705                 variable->data.driver_location = ctx->num_locals * 4;
5706                 variable->data.location_frac = 0;
5707                 ctx->num_locals += attrib_count;
5708         }
5709         ctx->locals = malloc(4 * ctx->num_locals * sizeof(LLVMValueRef));
5710         if (!ctx->locals)
5711             return;
5712
5713         for (i = 0; i < ctx->num_locals; i++) {
5714                 for (j = 0; j < 4; j++) {
5715                         ctx->locals[i * 4 + j] =
5716                                 si_build_alloca_undef(&ctx->ac, ctx->ac.f32, "temp");
5717                 }
5718         }
5719 }
5720
5721 static void
5722 setup_shared(struct ac_nir_context *ctx,
5723              struct nir_shader *nir)
5724 {
5725         nir_foreach_variable(variable, &nir->shared) {
5726                 LLVMValueRef shared =
5727                         LLVMAddGlobalInAddressSpace(
5728                            ctx->ac.module, glsl_to_llvm_type(ctx->nctx, variable->type),
5729                            variable->name ? variable->name : "",
5730                            AC_LOCAL_ADDR_SPACE);
5731                 _mesa_hash_table_insert(ctx->vars, variable, shared);
5732         }
5733 }
5734
5735 static LLVMValueRef
5736 emit_float_saturate(struct ac_llvm_context *ctx, LLVMValueRef v, float lo, float hi)
5737 {
5738         v = ac_to_float(ctx, v);
5739         v = emit_intrin_2f_param(ctx, "llvm.maxnum", ctx->f32, v, LLVMConstReal(ctx->f32, lo));
5740         return emit_intrin_2f_param(ctx, "llvm.minnum", ctx->f32, v, LLVMConstReal(ctx->f32, hi));
5741 }
5742
5743
5744 static LLVMValueRef emit_pack_int16(struct nir_to_llvm_context *ctx,
5745                                         LLVMValueRef src0, LLVMValueRef src1)
5746 {
5747         LLVMValueRef const16 = LLVMConstInt(ctx->ac.i32, 16, false);
5748         LLVMValueRef comp[2];
5749
5750         comp[0] = LLVMBuildAnd(ctx->builder, src0, LLVMConstInt(ctx->ac.i32, 65535, 0), "");
5751         comp[1] = LLVMBuildAnd(ctx->builder, src1, LLVMConstInt(ctx->ac.i32, 65535, 0), "");
5752         comp[1] = LLVMBuildShl(ctx->builder, comp[1], const16, "");
5753         return LLVMBuildOr(ctx->builder, comp[0], comp[1], "");
5754 }
5755
5756 /* Initialize arguments for the shader export intrinsic */
5757 static void
5758 si_llvm_init_export_args(struct nir_to_llvm_context *ctx,
5759                          LLVMValueRef *values,
5760                          unsigned target,
5761                          struct ac_export_args *args)
5762 {
5763         /* Default is 0xf. Adjusted below depending on the format. */
5764         args->enabled_channels = 0xf;
5765
5766         /* Specify whether the EXEC mask represents the valid mask */
5767         args->valid_mask = 0;
5768
5769         /* Specify whether this is the last export */
5770         args->done = 0;
5771
5772         /* Specify the target we are exporting */
5773         args->target = target;
5774
5775         args->compr = false;
5776         args->out[0] = LLVMGetUndef(ctx->ac.f32);
5777         args->out[1] = LLVMGetUndef(ctx->ac.f32);
5778         args->out[2] = LLVMGetUndef(ctx->ac.f32);
5779         args->out[3] = LLVMGetUndef(ctx->ac.f32);
5780
5781         if (!values)
5782                 return;
5783
5784         if (ctx->stage == MESA_SHADER_FRAGMENT && target >= V_008DFC_SQ_EXP_MRT) {
5785                 LLVMValueRef val[4];
5786                 unsigned index = target - V_008DFC_SQ_EXP_MRT;
5787                 unsigned col_format = (ctx->options->key.fs.col_format >> (4 * index)) & 0xf;
5788                 bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1;
5789                 bool is_int10 = (ctx->options->key.fs.is_int10 >> index) & 1;
5790
5791                 switch(col_format) {
5792                 case V_028714_SPI_SHADER_ZERO:
5793                         args->enabled_channels = 0; /* writemask */
5794                         args->target = V_008DFC_SQ_EXP_NULL;
5795                         break;
5796
5797                 case V_028714_SPI_SHADER_32_R:
5798                         args->enabled_channels = 1;
5799                         args->out[0] = values[0];
5800                         break;
5801
5802                 case V_028714_SPI_SHADER_32_GR:
5803                         args->enabled_channels = 0x3;
5804                         args->out[0] = values[0];
5805                         args->out[1] = values[1];
5806                         break;
5807
5808                 case V_028714_SPI_SHADER_32_AR:
5809                         args->enabled_channels = 0x9;
5810                         args->out[0] = values[0];
5811                         args->out[3] = values[3];
5812                         break;
5813
5814                 case V_028714_SPI_SHADER_FP16_ABGR:
5815                         args->compr = 1;
5816
5817                         for (unsigned chan = 0; chan < 2; chan++) {
5818                                 LLVMValueRef pack_args[2] = {
5819                                         values[2 * chan],
5820                                         values[2 * chan + 1]
5821                                 };
5822                                 LLVMValueRef packed;
5823
5824                                 packed = ac_build_cvt_pkrtz_f16(&ctx->ac, pack_args);
5825                                 args->out[chan] = packed;
5826                         }
5827                         break;
5828
5829                 case V_028714_SPI_SHADER_UNORM16_ABGR:
5830                         for (unsigned chan = 0; chan < 4; chan++) {
5831                                 val[chan] = ac_build_clamp(&ctx->ac, values[chan]);
5832                                 val[chan] = LLVMBuildFMul(ctx->builder, val[chan],
5833                                                         LLVMConstReal(ctx->ac.f32, 65535), "");
5834                                 val[chan] = LLVMBuildFAdd(ctx->builder, val[chan],
5835                                                         LLVMConstReal(ctx->ac.f32, 0.5), "");
5836                                 val[chan] = LLVMBuildFPToUI(ctx->builder, val[chan],
5837                                                         ctx->ac.i32, "");
5838                         }
5839
5840                         args->compr = 1;
5841                         args->out[0] = emit_pack_int16(ctx, val[0], val[1]);
5842                         args->out[1] = emit_pack_int16(ctx, val[2], val[3]);
5843                         break;
5844
5845                 case V_028714_SPI_SHADER_SNORM16_ABGR:
5846                         for (unsigned chan = 0; chan < 4; chan++) {
5847                                 val[chan] = emit_float_saturate(&ctx->ac, values[chan], -1, 1);
5848                                 val[chan] = LLVMBuildFMul(ctx->builder, val[chan],
5849                                                         LLVMConstReal(ctx->ac.f32, 32767), "");
5850
5851                                 /* If positive, add 0.5, else add -0.5. */
5852                                 val[chan] = LLVMBuildFAdd(ctx->builder, val[chan],
5853                                                 LLVMBuildSelect(ctx->builder,
5854                                                         LLVMBuildFCmp(ctx->builder, LLVMRealOGE,
5855                                                                 val[chan], ctx->ac.f32_0, ""),
5856                                                         LLVMConstReal(ctx->ac.f32, 0.5),
5857                                                         LLVMConstReal(ctx->ac.f32, -0.5), ""), "");
5858                                 val[chan] = LLVMBuildFPToSI(ctx->builder, val[chan], ctx->ac.i32, "");
5859                         }
5860
5861                         args->compr = 1;
5862                         args->out[0] = emit_pack_int16(ctx, val[0], val[1]);
5863                         args->out[1] = emit_pack_int16(ctx, val[2], val[3]);
5864                         break;
5865
5866                 case V_028714_SPI_SHADER_UINT16_ABGR: {
5867                         LLVMValueRef max_rgb = LLVMConstInt(ctx->ac.i32,
5868                                                             is_int8 ? 255 : is_int10 ? 1023 : 65535, 0);
5869                         LLVMValueRef max_alpha = !is_int10 ? max_rgb : LLVMConstInt(ctx->ac.i32, 3, 0);
5870
5871                         for (unsigned chan = 0; chan < 4; chan++) {
5872                                 val[chan] = ac_to_integer(&ctx->ac, values[chan]);
5873                                 val[chan] = emit_minmax_int(&ctx->ac, LLVMIntULT, val[chan], chan == 3 ? max_alpha : max_rgb);
5874                         }
5875
5876                         args->compr = 1;
5877                         args->out[0] = emit_pack_int16(ctx, val[0], val[1]);
5878                         args->out[1] = emit_pack_int16(ctx, val[2], val[3]);
5879                         break;
5880                 }
5881
5882                 case V_028714_SPI_SHADER_SINT16_ABGR: {
5883                         LLVMValueRef max_rgb = LLVMConstInt(ctx->ac.i32,
5884                                                             is_int8 ? 127 : is_int10 ? 511 : 32767, 0);
5885                         LLVMValueRef min_rgb = LLVMConstInt(ctx->ac.i32,
5886                                                             is_int8 ? -128 : is_int10 ? -512 : -32768, 0);
5887                         LLVMValueRef max_alpha = !is_int10 ? max_rgb : ctx->ac.i32_1;
5888                         LLVMValueRef min_alpha = !is_int10 ? min_rgb : LLVMConstInt(ctx->ac.i32, -2, 0);
5889
5890                         /* Clamp. */
5891                         for (unsigned chan = 0; chan < 4; chan++) {
5892                                 val[chan] = ac_to_integer(&ctx->ac, values[chan]);
5893                                 val[chan] = emit_minmax_int(&ctx->ac, LLVMIntSLT, val[chan], chan == 3 ? max_alpha : max_rgb);
5894                                 val[chan] = emit_minmax_int(&ctx->ac, LLVMIntSGT, val[chan], chan == 3 ? min_alpha : min_rgb);
5895                         }
5896
5897                         args->compr = 1;
5898                         args->out[0] = emit_pack_int16(ctx, val[0], val[1]);
5899                         args->out[1] = emit_pack_int16(ctx, val[2], val[3]);
5900                         break;
5901                 }
5902
5903                 default:
5904                 case V_028714_SPI_SHADER_32_ABGR:
5905                         memcpy(&args->out[0], values, sizeof(values[0]) * 4);
5906                         break;
5907                 }
5908         } else
5909                 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
5910
5911         for (unsigned i = 0; i < 4; ++i)
5912                 args->out[i] = ac_to_float(&ctx->ac, args->out[i]);
5913 }
5914
5915 static void
5916 handle_vs_outputs_post(struct nir_to_llvm_context *ctx,
5917                        bool export_prim_id,
5918                        struct ac_vs_output_info *outinfo)
5919 {
5920         uint32_t param_count = 0;
5921         unsigned target;
5922         unsigned pos_idx, num_pos_exports = 0;
5923         struct ac_export_args args, pos_args[4] = {};
5924         LLVMValueRef psize_value = NULL, layer_value = NULL, viewport_index_value = NULL;
5925         int i;
5926
5927         if (ctx->options->key.has_multiview_view_index) {
5928                 LLVMValueRef* tmp_out = &ctx->nir->outputs[radeon_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)];
5929                 if(!*tmp_out) {
5930                         for(unsigned i = 0; i < 4; ++i)
5931                                 ctx->nir->outputs[radeon_llvm_reg_index_soa(VARYING_SLOT_LAYER, i)] =
5932                                             si_build_alloca_undef(&ctx->ac, ctx->ac.f32, "");
5933                 }
5934
5935                 LLVMBuildStore(ctx->builder, ac_to_float(&ctx->ac, ctx->view_index),  *tmp_out);
5936                 ctx->output_mask |= 1ull << VARYING_SLOT_LAYER;
5937         }
5938
5939         memset(outinfo->vs_output_param_offset, AC_EXP_PARAM_UNDEFINED,
5940                sizeof(outinfo->vs_output_param_offset));
5941
5942         if (ctx->output_mask & (1ull << VARYING_SLOT_CLIP_DIST0)) {
5943                 LLVMValueRef slots[8];
5944                 unsigned j;
5945
5946                 if (outinfo->cull_dist_mask)
5947                         outinfo->cull_dist_mask <<= ctx->num_output_clips;
5948
5949                 i = VARYING_SLOT_CLIP_DIST0;
5950                 for (j = 0; j < ctx->num_output_clips + ctx->num_output_culls; j++)
5951                         slots[j] = ac_to_float(&ctx->ac, LLVMBuildLoad(ctx->builder,
5952                                                                ctx->nir->outputs[radeon_llvm_reg_index_soa(i, j)], ""));
5953
5954                 for (i = ctx->num_output_clips + ctx->num_output_culls; i < 8; i++)
5955                         slots[i] = LLVMGetUndef(ctx->ac.f32);
5956
5957                 if (ctx->num_output_clips + ctx->num_output_culls > 4) {
5958                         target = V_008DFC_SQ_EXP_POS + 3;
5959                         si_llvm_init_export_args(ctx, &slots[4], target, &args);
5960                         memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS],
5961                                &args, sizeof(args));
5962                 }
5963
5964                 target = V_008DFC_SQ_EXP_POS + 2;
5965                 si_llvm_init_export_args(ctx, &slots[0], target, &args);
5966                 memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS],
5967                        &args, sizeof(args));
5968
5969         }
5970
5971         LLVMValueRef pos_values[4] = {ctx->ac.f32_0, ctx->ac.f32_0, ctx->ac.f32_0, ctx->ac.f32_1};
5972         if (ctx->output_mask & (1ull << VARYING_SLOT_POS)) {
5973                 for (unsigned j = 0; j < 4; j++)
5974                         pos_values[j] = LLVMBuildLoad(ctx->builder,
5975                                                  ctx->nir->outputs[radeon_llvm_reg_index_soa(VARYING_SLOT_POS, j)], "");
5976         }
5977         si_llvm_init_export_args(ctx, pos_values, V_008DFC_SQ_EXP_POS, &pos_args[0]);
5978
5979         if (ctx->output_mask & (1ull << VARYING_SLOT_PSIZ)) {
5980                 outinfo->writes_pointsize = true;
5981                 psize_value = LLVMBuildLoad(ctx->builder,
5982                                             ctx->nir->outputs[radeon_llvm_reg_index_soa(VARYING_SLOT_PSIZ, 0)], "");
5983         }
5984
5985         if (ctx->output_mask & (1ull << VARYING_SLOT_LAYER)) {
5986                 outinfo->writes_layer = true;
5987                 layer_value = LLVMBuildLoad(ctx->builder,
5988                                             ctx->nir->outputs[radeon_llvm_reg_index_soa(VARYING_SLOT_LAYER, 0)], "");
5989         }
5990
5991         if (ctx->output_mask & (1ull << VARYING_SLOT_VIEWPORT)) {
5992                 outinfo->writes_viewport_index = true;
5993                 viewport_index_value = LLVMBuildLoad(ctx->builder,
5994                                                      ctx->nir->outputs[radeon_llvm_reg_index_soa(VARYING_SLOT_VIEWPORT, 0)], "");
5995         }
5996
5997         if (outinfo->writes_pointsize ||
5998             outinfo->writes_layer ||
5999             outinfo->writes_viewport_index) {
6000                 pos_args[1].enabled_channels = ((outinfo->writes_pointsize == true ? 1 : 0) |
6001                                                 (outinfo->writes_layer == true ? 4 : 0));
6002                 pos_args[1].valid_mask = 0;
6003                 pos_args[1].done = 0;
6004                 pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
6005                 pos_args[1].compr = 0;
6006                 pos_args[1].out[0] = ctx->ac.f32_0; /* X */
6007                 pos_args[1].out[1] = ctx->ac.f32_0; /* Y */
6008                 pos_args[1].out[2] = ctx->ac.f32_0; /* Z */
6009                 pos_args[1].out[3] = ctx->ac.f32_0;  /* W */
6010
6011                 if (outinfo->writes_pointsize == true)
6012                         pos_args[1].out[0] = psize_value;
6013                 if (outinfo->writes_layer == true)
6014                         pos_args[1].out[2] = layer_value;
6015                 if (outinfo->writes_viewport_index == true) {
6016                         if (ctx->options->chip_class >= GFX9) {
6017                                 /* GFX9 has the layer in out.z[10:0] and the viewport
6018                                  * index in out.z[19:16].
6019                                  */
6020                                 LLVMValueRef v = viewport_index_value;
6021                                 v = ac_to_integer(&ctx->ac, v);
6022                                 v = LLVMBuildShl(ctx->builder, v,
6023                                                  LLVMConstInt(ctx->ac.i32, 16, false),
6024                                                  "");
6025                                 v = LLVMBuildOr(ctx->builder, v,
6026                                                 ac_to_integer(&ctx->ac, pos_args[1].out[2]), "");
6027
6028                                 pos_args[1].out[2] = ac_to_float(&ctx->ac, v);
6029                                 pos_args[1].enabled_channels |= 1 << 2;
6030                         } else {
6031                                 pos_args[1].out[3] = viewport_index_value;
6032                                 pos_args[1].enabled_channels |= 1 << 3;
6033                         }
6034                 }
6035         }
6036         for (i = 0; i < 4; i++) {
6037                 if (pos_args[i].out[0])
6038                         num_pos_exports++;
6039         }
6040
6041         pos_idx = 0;
6042         for (i = 0; i < 4; i++) {
6043                 if (!pos_args[i].out[0])
6044                         continue;
6045
6046                 /* Specify the target we are exporting */
6047                 pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
6048                 if (pos_idx == num_pos_exports)
6049                         pos_args[i].done = 1;
6050                 ac_build_export(&ctx->ac, &pos_args[i]);
6051         }
6052
6053         for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) {
6054                 LLVMValueRef values[4];
6055                 if (!(ctx->output_mask & (1ull << i)))
6056                         continue;
6057
6058                 for (unsigned j = 0; j < 4; j++)
6059                         values[j] = ac_to_float(&ctx->ac, LLVMBuildLoad(ctx->builder,
6060                                                 ctx->nir->outputs[radeon_llvm_reg_index_soa(i, j)], ""));
6061
6062                 if (i == VARYING_SLOT_LAYER) {
6063                         target = V_008DFC_SQ_EXP_PARAM + param_count;
6064                         outinfo->vs_output_param_offset[VARYING_SLOT_LAYER] = param_count;
6065                         param_count++;
6066                 } else if (i == VARYING_SLOT_PRIMITIVE_ID) {
6067                         target = V_008DFC_SQ_EXP_PARAM + param_count;
6068                         outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = param_count;
6069                         param_count++;
6070                 } else if (i >= VARYING_SLOT_VAR0) {
6071                         outinfo->export_mask |= 1u << (i - VARYING_SLOT_VAR0);
6072                         target = V_008DFC_SQ_EXP_PARAM + param_count;
6073                         outinfo->vs_output_param_offset[i] = param_count;
6074                         param_count++;
6075                 } else
6076                         continue;
6077
6078                 si_llvm_init_export_args(ctx, values, target, &args);
6079
6080                 if (target >= V_008DFC_SQ_EXP_POS &&
6081                     target <= (V_008DFC_SQ_EXP_POS + 3)) {
6082                         memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS],
6083                                &args, sizeof(args));
6084                 } else {
6085                         ac_build_export(&ctx->ac, &args);
6086                 }
6087         }
6088
6089         if (export_prim_id) {
6090                 LLVMValueRef values[4];
6091                 target = V_008DFC_SQ_EXP_PARAM + param_count;
6092                 outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID] = param_count;
6093                 param_count++;
6094
6095                 values[0] = ctx->vs_prim_id;
6096                 ctx->shader_info->vs.vgpr_comp_cnt = MAX2(2,
6097                                                           ctx->shader_info->vs.vgpr_comp_cnt);
6098                 for (unsigned j = 1; j < 4; j++)
6099                         values[j] = ctx->ac.f32_0;
6100                 si_llvm_init_export_args(ctx, values, target, &args);
6101                 ac_build_export(&ctx->ac, &args);
6102                 outinfo->export_prim_id = true;
6103         }
6104
6105         outinfo->pos_exports = num_pos_exports;
6106         outinfo->param_exports = param_count;
6107 }
6108
6109 static void
6110 handle_es_outputs_post(struct nir_to_llvm_context *ctx,
6111                        struct ac_es_output_info *outinfo)
6112 {
6113         int j;
6114         uint64_t max_output_written = 0;
6115         LLVMValueRef lds_base = NULL;
6116
6117         for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) {
6118                 int param_index;
6119                 int length = 4;
6120
6121                 if (!(ctx->output_mask & (1ull << i)))
6122                         continue;
6123
6124                 if (i == VARYING_SLOT_CLIP_DIST0)
6125                         length = ctx->num_output_clips + ctx->num_output_culls;
6126
6127                 param_index = shader_io_get_unique_index(i);
6128
6129                 max_output_written = MAX2(param_index + (length > 4), max_output_written);
6130         }
6131
6132         outinfo->esgs_itemsize = (max_output_written + 1) * 16;
6133
6134         if (ctx->ac.chip_class  >= GFX9) {
6135                 unsigned itemsize_dw = outinfo->esgs_itemsize / 4;
6136                 LLVMValueRef vertex_idx = ac_get_thread_id(&ctx->ac);
6137                 LLVMValueRef wave_idx = ac_build_bfe(&ctx->ac, ctx->merged_wave_info,
6138                                                      LLVMConstInt(ctx->ac.i32, 24, false),
6139                                                      LLVMConstInt(ctx->ac.i32, 4, false), false);
6140                 vertex_idx = LLVMBuildOr(ctx->ac.builder, vertex_idx,
6141                                          LLVMBuildMul(ctx->ac.builder, wave_idx,
6142                                                       LLVMConstInt(ctx->ac.i32, 64, false), ""), "");
6143                 lds_base = LLVMBuildMul(ctx->ac.builder, vertex_idx,
6144                                         LLVMConstInt(ctx->ac.i32, itemsize_dw, 0), "");
6145         }
6146
6147         for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) {
6148                 LLVMValueRef dw_addr;
6149                 LLVMValueRef *out_ptr = &ctx->nir->outputs[i * 4];
6150                 int param_index;
6151                 int length = 4;
6152
6153                 if (!(ctx->output_mask & (1ull << i)))
6154                         continue;
6155
6156                 if (i == VARYING_SLOT_CLIP_DIST0)
6157                         length = ctx->num_output_clips + ctx->num_output_culls;
6158
6159                 param_index = shader_io_get_unique_index(i);
6160
6161                 if (lds_base) {
6162                         dw_addr = LLVMBuildAdd(ctx->builder, lds_base,
6163                                                LLVMConstInt(ctx->ac.i32, param_index * 4, false),
6164                                                "");
6165                 }
6166                 for (j = 0; j < length; j++) {
6167                         LLVMValueRef out_val = LLVMBuildLoad(ctx->builder, out_ptr[j], "");
6168                         out_val = LLVMBuildBitCast(ctx->builder, out_val, ctx->ac.i32, "");
6169
6170                         if (ctx->ac.chip_class  >= GFX9) {
6171                                 ac_lds_store(&ctx->ac, dw_addr,
6172                                              LLVMBuildLoad(ctx->builder, out_ptr[j], ""));
6173                                 dw_addr = LLVMBuildAdd(ctx->builder, dw_addr, ctx->ac.i32_1, "");
6174                         } else {
6175                                 ac_build_buffer_store_dword(&ctx->ac,
6176                                                             ctx->esgs_ring,
6177                                                             out_val, 1,
6178                                                             NULL, ctx->es2gs_offset,
6179                                                             (4 * param_index + j) * 4,
6180                                                             1, 1, true, true);
6181                         }
6182                 }
6183         }
6184 }
6185
6186 static void
6187 handle_ls_outputs_post(struct nir_to_llvm_context *ctx)
6188 {
6189         LLVMValueRef vertex_id = ctx->rel_auto_id;
6190         LLVMValueRef vertex_dw_stride = unpack_param(&ctx->ac, ctx->ls_out_layout, 13, 8);
6191         LLVMValueRef base_dw_addr = LLVMBuildMul(ctx->builder, vertex_id,
6192                                                  vertex_dw_stride, "");
6193
6194         for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) {
6195                 LLVMValueRef *out_ptr = &ctx->nir->outputs[i * 4];
6196                 int length = 4;
6197
6198                 if (!(ctx->output_mask & (1ull << i)))
6199                         continue;
6200
6201                 if (i == VARYING_SLOT_CLIP_DIST0)
6202                         length = ctx->num_output_clips + ctx->num_output_culls;
6203                 int param = shader_io_get_unique_index(i);
6204                 mark_tess_output(ctx, false, param);
6205                 if (length > 4)
6206                         mark_tess_output(ctx, false, param + 1);
6207                 LLVMValueRef dw_addr = LLVMBuildAdd(ctx->builder, base_dw_addr,
6208                                                     LLVMConstInt(ctx->ac.i32, param * 4, false),
6209                                                     "");
6210                 for (unsigned j = 0; j < length; j++) {
6211                         ac_lds_store(&ctx->ac, dw_addr,
6212                                      LLVMBuildLoad(ctx->builder, out_ptr[j], ""));
6213                         dw_addr = LLVMBuildAdd(ctx->builder, dw_addr, ctx->ac.i32_1, "");
6214                 }
6215         }
6216 }
6217
6218 struct ac_build_if_state
6219 {
6220         struct nir_to_llvm_context *ctx;
6221         LLVMValueRef condition;
6222         LLVMBasicBlockRef entry_block;
6223         LLVMBasicBlockRef true_block;
6224         LLVMBasicBlockRef false_block;
6225         LLVMBasicBlockRef merge_block;
6226 };
6227
6228 static LLVMBasicBlockRef
6229 ac_build_insert_new_block(struct nir_to_llvm_context *ctx, const char *name)
6230 {
6231         LLVMBasicBlockRef current_block;
6232         LLVMBasicBlockRef next_block;
6233         LLVMBasicBlockRef new_block;
6234
6235         /* get current basic block */
6236         current_block = LLVMGetInsertBlock(ctx->builder);
6237
6238         /* chqeck if there's another block after this one */
6239         next_block = LLVMGetNextBasicBlock(current_block);
6240         if (next_block) {
6241                 /* insert the new block before the next block */
6242                 new_block = LLVMInsertBasicBlockInContext(ctx->context, next_block, name);
6243         }
6244         else {
6245                 /* append new block after current block */
6246                 LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
6247                 new_block = LLVMAppendBasicBlockInContext(ctx->context, function, name);
6248         }
6249         return new_block;
6250 }
6251
6252 static void
6253 ac_nir_build_if(struct ac_build_if_state *ifthen,
6254                 struct nir_to_llvm_context *ctx,
6255                 LLVMValueRef condition)
6256 {
6257         LLVMBasicBlockRef block = LLVMGetInsertBlock(ctx->builder);
6258
6259         memset(ifthen, 0, sizeof *ifthen);
6260         ifthen->ctx = ctx;
6261         ifthen->condition = condition;
6262         ifthen->entry_block = block;
6263
6264         /* create endif/merge basic block for the phi functions */
6265         ifthen->merge_block = ac_build_insert_new_block(ctx, "endif-block");
6266
6267         /* create/insert true_block before merge_block */
6268         ifthen->true_block =
6269                 LLVMInsertBasicBlockInContext(ctx->context,
6270                                               ifthen->merge_block,
6271                                               "if-true-block");
6272
6273         /* successive code goes into the true block */
6274         LLVMPositionBuilderAtEnd(ctx->builder, ifthen->true_block);
6275 }
6276
6277 /**
6278  * End a conditional.
6279  */
6280 static void
6281 ac_nir_build_endif(struct ac_build_if_state *ifthen)
6282 {
6283         LLVMBuilderRef builder = ifthen->ctx->builder;
6284
6285         /* Insert branch to the merge block from current block */
6286         LLVMBuildBr(builder, ifthen->merge_block);
6287
6288         /*
6289          * Now patch in the various branch instructions.
6290          */
6291
6292         /* Insert the conditional branch instruction at the end of entry_block */
6293         LLVMPositionBuilderAtEnd(builder, ifthen->entry_block);
6294         if (ifthen->false_block) {
6295                 /* we have an else clause */
6296                 LLVMBuildCondBr(builder, ifthen->condition,
6297                                 ifthen->true_block, ifthen->false_block);
6298         }
6299         else {
6300                 /* no else clause */
6301                 LLVMBuildCondBr(builder, ifthen->condition,
6302                                 ifthen->true_block, ifthen->merge_block);
6303         }
6304
6305         /* Resume building code at end of the ifthen->merge_block */
6306         LLVMPositionBuilderAtEnd(builder, ifthen->merge_block);
6307 }
6308
6309 static void
6310 write_tess_factors(struct nir_to_llvm_context *ctx)
6311 {
6312         unsigned stride, outer_comps, inner_comps;
6313         struct ac_build_if_state if_ctx, inner_if_ctx;
6314         LLVMValueRef invocation_id = unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 8, 5);
6315         LLVMValueRef rel_patch_id = unpack_param(&ctx->ac, ctx->abi.tcs_rel_ids, 0, 8);
6316         unsigned tess_inner_index, tess_outer_index;
6317         LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer;
6318         LLVMValueRef out[6], vec0, vec1, tf_base, inner[4], outer[4];
6319         int i;
6320         emit_barrier(&ctx->ac, ctx->stage);
6321
6322         switch (ctx->options->key.tcs.primitive_mode) {
6323         case GL_ISOLINES:
6324                 stride = 2;
6325                 outer_comps = 2;
6326                 inner_comps = 0;
6327                 break;
6328         case GL_TRIANGLES:
6329                 stride = 4;
6330                 outer_comps = 3;
6331                 inner_comps = 1;
6332                 break;
6333         case GL_QUADS:
6334                 stride = 6;
6335                 outer_comps = 4;
6336                 inner_comps = 2;
6337                 break;
6338         default:
6339                 return;
6340         }
6341
6342         ac_nir_build_if(&if_ctx, ctx,
6343                         LLVMBuildICmp(ctx->builder, LLVMIntEQ,
6344                                       invocation_id, ctx->ac.i32_0, ""));
6345
6346         tess_inner_index = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_INNER);
6347         tess_outer_index = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_OUTER);
6348
6349         mark_tess_output(ctx, true, tess_inner_index);
6350         mark_tess_output(ctx, true, tess_outer_index);
6351         lds_base = get_tcs_out_current_patch_data_offset(ctx);
6352         lds_inner = LLVMBuildAdd(ctx->builder, lds_base,
6353                                  LLVMConstInt(ctx->ac.i32, tess_inner_index * 4, false), "");
6354         lds_outer = LLVMBuildAdd(ctx->builder, lds_base,
6355                                  LLVMConstInt(ctx->ac.i32, tess_outer_index * 4, false), "");
6356
6357         for (i = 0; i < 4; i++) {
6358                 inner[i] = LLVMGetUndef(ctx->ac.i32);
6359                 outer[i] = LLVMGetUndef(ctx->ac.i32);
6360         }
6361
6362         // LINES reverseal
6363         if (ctx->options->key.tcs.primitive_mode == GL_ISOLINES) {
6364                 outer[0] = out[1] = ac_lds_load(&ctx->ac, lds_outer);
6365                 lds_outer = LLVMBuildAdd(ctx->builder, lds_outer,
6366                                          ctx->ac.i32_1, "");
6367                 outer[1] = out[0] = ac_lds_load(&ctx->ac, lds_outer);
6368         } else {
6369                 for (i = 0; i < outer_comps; i++) {
6370                         outer[i] = out[i] =
6371                                 ac_lds_load(&ctx->ac, lds_outer);
6372                         lds_outer = LLVMBuildAdd(ctx->builder, lds_outer,
6373                                                  ctx->ac.i32_1, "");
6374                 }
6375                 for (i = 0; i < inner_comps; i++) {
6376                         inner[i] = out[outer_comps+i] =
6377                                 ac_lds_load(&ctx->ac, lds_inner);
6378                         lds_inner = LLVMBuildAdd(ctx->builder, lds_inner,
6379                                                  ctx->ac.i32_1, "");
6380                 }
6381         }
6382
6383         /* Convert the outputs to vectors for stores. */
6384         vec0 = ac_build_gather_values(&ctx->ac, out, MIN2(stride, 4));
6385         vec1 = NULL;
6386
6387         if (stride > 4)
6388                 vec1 = ac_build_gather_values(&ctx->ac, out + 4, stride - 4);
6389
6390
6391         buffer = ctx->hs_ring_tess_factor;
6392         tf_base = ctx->tess_factor_offset;
6393         byteoffset = LLVMBuildMul(ctx->builder, rel_patch_id,
6394                                   LLVMConstInt(ctx->ac.i32, 4 * stride, false), "");
6395         unsigned tf_offset = 0;
6396
6397         if (ctx->options->chip_class <= VI) {
6398                 ac_nir_build_if(&inner_if_ctx, ctx,
6399                                 LLVMBuildICmp(ctx->builder, LLVMIntEQ,
6400                                               rel_patch_id, ctx->ac.i32_0, ""));
6401
6402                 /* Store the dynamic HS control word. */
6403                 ac_build_buffer_store_dword(&ctx->ac, buffer,
6404                                             LLVMConstInt(ctx->ac.i32, 0x80000000, false),
6405                                             1, ctx->ac.i32_0, tf_base,
6406                                             0, 1, 0, true, false);
6407                 tf_offset += 4;
6408
6409                 ac_nir_build_endif(&inner_if_ctx);
6410         }
6411
6412         /* Store the tessellation factors. */
6413         ac_build_buffer_store_dword(&ctx->ac, buffer, vec0,
6414                                     MIN2(stride, 4), byteoffset, tf_base,
6415                                     tf_offset, 1, 0, true, false);
6416         if (vec1)
6417                 ac_build_buffer_store_dword(&ctx->ac, buffer, vec1,
6418                                             stride - 4, byteoffset, tf_base,
6419                                             16 + tf_offset, 1, 0, true, false);
6420
6421         //store to offchip for TES to read - only if TES reads them
6422         if (ctx->options->key.tcs.tes_reads_tess_factors) {
6423                 LLVMValueRef inner_vec, outer_vec, tf_outer_offset;
6424                 LLVMValueRef tf_inner_offset;
6425                 unsigned param_outer, param_inner;
6426
6427                 param_outer = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_OUTER);
6428                 tf_outer_offset = get_tcs_tes_buffer_address(ctx, NULL,
6429                                                              LLVMConstInt(ctx->ac.i32, param_outer, 0));
6430
6431                 outer_vec = ac_build_gather_values(&ctx->ac, outer,
6432                                                    util_next_power_of_two(outer_comps));
6433
6434                 ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, outer_vec,
6435                                             outer_comps, tf_outer_offset,
6436                                             ctx->oc_lds, 0, 1, 0, true, false);
6437                 if (inner_comps) {
6438                         param_inner = shader_io_get_unique_index(VARYING_SLOT_TESS_LEVEL_INNER);
6439                         tf_inner_offset = get_tcs_tes_buffer_address(ctx, NULL,
6440                                                                      LLVMConstInt(ctx->ac.i32, param_inner, 0));
6441
6442                         inner_vec = inner_comps == 1 ? inner[0] :
6443                                 ac_build_gather_values(&ctx->ac, inner, inner_comps);
6444                         ac_build_buffer_store_dword(&ctx->ac, ctx->hs_ring_tess_offchip, inner_vec,
6445                                                     inner_comps, tf_inner_offset,
6446                                                     ctx->oc_lds, 0, 1, 0, true, false);
6447                 }
6448         }
6449         ac_nir_build_endif(&if_ctx);
6450 }
6451
/* Output handling for a TCS: only the tessellation factors are written here. */
static void handle_tcs_outputs_post(struct nir_to_llvm_context *ctx)
{
        write_tess_factors(ctx);
}
6457
6458 static bool
6459 si_export_mrt_color(struct nir_to_llvm_context *ctx,
6460                     LLVMValueRef *color, unsigned param, bool is_last,
6461                     struct ac_export_args *args)
6462 {
6463         /* Export */
6464         si_llvm_init_export_args(ctx, color, param,
6465                                  args);
6466
6467         if (is_last) {
6468                 args->valid_mask = 1; /* whether the EXEC mask is valid */
6469                 args->done = 1; /* DONE bit */
6470         } else if (!args->enabled_channels)
6471                 return false; /* unnecessary NULL export */
6472
6473         return true;
6474 }
6475
6476 static void
6477 radv_export_mrt_z(struct nir_to_llvm_context *ctx,
6478                   LLVMValueRef depth, LLVMValueRef stencil,
6479                   LLVMValueRef samplemask)
6480 {
6481         struct ac_export_args args;
6482
6483         ac_export_mrt_z(&ctx->ac, depth, stencil, samplemask, &args);
6484
6485         ac_build_export(&ctx->ac, &args);
6486 }
6487
6488 static void
6489 handle_fs_outputs_post(struct nir_to_llvm_context *ctx)
6490 {
6491         unsigned index = 0;
6492         LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
6493         struct ac_export_args color_args[8];
6494
6495         for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) {
6496                 LLVMValueRef values[4];
6497
6498                 if (!(ctx->output_mask & (1ull << i)))
6499                         continue;
6500
6501                 if (i == FRAG_RESULT_DEPTH) {
6502                         ctx->shader_info->fs.writes_z = true;
6503                         depth = ac_to_float(&ctx->ac, LLVMBuildLoad(ctx->builder,
6504                                                             ctx->nir->outputs[radeon_llvm_reg_index_soa(i, 0)], ""));
6505                 } else if (i == FRAG_RESULT_STENCIL) {
6506                         ctx->shader_info->fs.writes_stencil = true;
6507                         stencil = ac_to_float(&ctx->ac, LLVMBuildLoad(ctx->builder,
6508                                                               ctx->nir->outputs[radeon_llvm_reg_index_soa(i, 0)], ""));
6509                 } else if (i == FRAG_RESULT_SAMPLE_MASK) {
6510                         ctx->shader_info->fs.writes_sample_mask = true;
6511                         samplemask = ac_to_float(&ctx->ac, LLVMBuildLoad(ctx->builder,
6512                                                                   ctx->nir->outputs[radeon_llvm_reg_index_soa(i, 0)], ""));
6513                 } else {
6514                         bool last = false;
6515                         for (unsigned j = 0; j < 4; j++)
6516                                 values[j] = ac_to_float(&ctx->ac, LLVMBuildLoad(ctx->builder,
6517                                                                         ctx->nir->outputs[radeon_llvm_reg_index_soa(i, j)], ""));
6518
6519                         if (!ctx->shader_info->fs.writes_z && !ctx->shader_info->fs.writes_stencil && !ctx->shader_info->fs.writes_sample_mask)
6520                                 last = ctx->output_mask <= ((1ull << (i + 1)) - 1);
6521
6522                         bool ret = si_export_mrt_color(ctx, values, V_008DFC_SQ_EXP_MRT + (i - FRAG_RESULT_DATA0), last, &color_args[index]);
6523                         if (ret)
6524                                 index++;
6525                 }
6526         }
6527
6528         for (unsigned i = 0; i < index; i++)
6529                 ac_build_export(&ctx->ac, &color_args[i]);
6530         if (depth || stencil || samplemask)
6531                 radv_export_mrt_z(ctx, depth, stencil, samplemask);
6532         else if (!index) {
6533                 si_export_mrt_color(ctx, NULL, V_008DFC_SQ_EXP_NULL, true, &color_args[0]);
6534                 ac_build_export(&ctx->ac, &color_args[0]);
6535         }
6536 }
6537
6538 static void
6539 emit_gs_epilogue(struct nir_to_llvm_context *ctx)
6540 {
6541         ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, ctx->gs_wave_id);
6542 }
6543
6544 static void
6545 handle_shader_outputs_post(struct ac_shader_abi *abi, unsigned max_outputs,
6546                            LLVMValueRef *addrs)
6547 {
6548         struct nir_to_llvm_context *ctx = nir_to_llvm_context_from_abi(abi);
6549
6550         switch (ctx->stage) {
6551         case MESA_SHADER_VERTEX:
6552                 if (ctx->options->key.vs.as_ls)
6553                         handle_ls_outputs_post(ctx);
6554                 else if (ctx->options->key.vs.as_es)
6555                         handle_es_outputs_post(ctx, &ctx->shader_info->vs.es_info);
6556                 else
6557                         handle_vs_outputs_post(ctx, ctx->options->key.vs.export_prim_id,
6558                                                &ctx->shader_info->vs.outinfo);
6559                 break;
6560         case MESA_SHADER_FRAGMENT:
6561                 handle_fs_outputs_post(ctx);
6562                 break;
6563         case MESA_SHADER_GEOMETRY:
6564                 emit_gs_epilogue(ctx);
6565                 break;
6566         case MESA_SHADER_TESS_CTRL:
6567                 handle_tcs_outputs_post(ctx);
6568                 break;
6569         case MESA_SHADER_TESS_EVAL:
6570                 if (ctx->options->key.tes.as_es)
6571                         handle_es_outputs_post(ctx, &ctx->shader_info->tes.es_info);
6572                 else
6573                         handle_vs_outputs_post(ctx, ctx->options->key.tes.export_prim_id,
6574                                                &ctx->shader_info->tes.outinfo);
6575                 break;
6576         default:
6577                 break;
6578         }
6579 }
6580
6581 static void ac_llvm_finalize_module(struct nir_to_llvm_context * ctx)
6582 {
6583         LLVMPassManagerRef passmgr;
6584         /* Create the pass manager */
6585         passmgr = LLVMCreateFunctionPassManagerForModule(
6586                                                         ctx->module);
6587
6588         /* This pass should eliminate all the load and store instructions */
6589         LLVMAddPromoteMemoryToRegisterPass(passmgr);
6590
6591         /* Add some optimization passes */
6592         LLVMAddScalarReplAggregatesPass(passmgr);
6593         LLVMAddLICMPass(passmgr);
6594         LLVMAddAggressiveDCEPass(passmgr);
6595         LLVMAddCFGSimplificationPass(passmgr);
6596         LLVMAddInstructionCombiningPass(passmgr);
6597
6598         /* Run the pass */
6599         LLVMInitializeFunctionPassManager(passmgr);
6600         LLVMRunFunctionPassManager(passmgr, ctx->main_function);
6601         LLVMFinalizeFunctionPassManager(passmgr);
6602
6603         LLVMDisposeBuilder(ctx->builder);
6604         LLVMDisposePassManager(passmgr);
6605 }
6606
6607 static void
6608 ac_nir_eliminate_const_vs_outputs(struct nir_to_llvm_context *ctx)
6609 {
6610         struct ac_vs_output_info *outinfo;
6611
6612         switch (ctx->stage) {
6613         case MESA_SHADER_FRAGMENT:
6614         case MESA_SHADER_COMPUTE:
6615         case MESA_SHADER_TESS_CTRL:
6616         case MESA_SHADER_GEOMETRY:
6617                 return;
6618         case MESA_SHADER_VERTEX:
6619                 if (ctx->options->key.vs.as_ls ||
6620                     ctx->options->key.vs.as_es)
6621                         return;
6622                 outinfo = &ctx->shader_info->vs.outinfo;
6623                 break;
6624         case MESA_SHADER_TESS_EVAL:
6625                 if (ctx->options->key.vs.as_es)
6626                         return;
6627                 outinfo = &ctx->shader_info->tes.outinfo;
6628                 break;
6629         default:
6630                 unreachable("Unhandled shader type");
6631         }
6632
6633         ac_optimize_vs_outputs(&ctx->ac,
6634                                ctx->main_function,
6635                                outinfo->vs_output_param_offset,
6636                                VARYING_SLOT_MAX,
6637                                &outinfo->param_exports);
6638 }
6639
6640 static void
6641 ac_setup_rings(struct nir_to_llvm_context *ctx)
6642 {
6643         if ((ctx->stage == MESA_SHADER_VERTEX && ctx->options->key.vs.as_es) ||
6644             (ctx->stage == MESA_SHADER_TESS_EVAL && ctx->options->key.tes.as_es)) {
6645                 ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_ESGS_VS, false));
6646         }
6647
6648         if (ctx->is_gs_copy_shader) {
6649                 ctx->gsvs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_GSVS_VS, false));
6650         }
6651         if (ctx->stage == MESA_SHADER_GEOMETRY) {
6652                 LLVMValueRef tmp;
6653                 ctx->esgs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_ESGS_GS, false));
6654                 ctx->gsvs_ring = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_GSVS_GS, false));
6655
6656                 ctx->gsvs_ring = LLVMBuildBitCast(ctx->builder, ctx->gsvs_ring, ctx->ac.v4i32, "");
6657
6658                 ctx->gsvs_ring = LLVMBuildInsertElement(ctx->builder, ctx->gsvs_ring, ctx->gsvs_num_entries, LLVMConstInt(ctx->ac.i32, 2, false), "");
6659                 tmp = LLVMBuildExtractElement(ctx->builder, ctx->gsvs_ring, ctx->ac.i32_1, "");
6660                 tmp = LLVMBuildOr(ctx->builder, tmp, ctx->gsvs_ring_stride, "");
6661                 ctx->gsvs_ring = LLVMBuildInsertElement(ctx->builder, ctx->gsvs_ring, tmp, ctx->ac.i32_1, "");
6662         }
6663
6664         if (ctx->stage == MESA_SHADER_TESS_CTRL ||
6665             ctx->stage == MESA_SHADER_TESS_EVAL) {
6666                 ctx->hs_ring_tess_offchip = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_HS_TESS_OFFCHIP, false));
6667                 ctx->hs_ring_tess_factor = ac_build_load_to_sgpr(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->ac.i32, RING_HS_TESS_FACTOR, false));
6668         }
6669 }
6670
6671 static unsigned
6672 ac_nir_get_max_workgroup_size(enum chip_class chip_class,
6673                               const struct nir_shader *nir)
6674 {
6675         switch (nir->info.stage) {
6676         case MESA_SHADER_TESS_CTRL:
6677                 return chip_class >= CIK ? 128 : 64;
6678         case MESA_SHADER_GEOMETRY:
6679                 return chip_class >= GFX9 ? 128 : 64;
6680         case MESA_SHADER_COMPUTE:
6681                 break;
6682         default:
6683                 return 0;
6684         }
6685
6686         unsigned max_workgroup_size = nir->info.cs.local_size[0] *
6687                 nir->info.cs.local_size[1] *
6688                 nir->info.cs.local_size[2];
6689         return max_workgroup_size;
6690 }
6691
6692 /* Fixup the HW not emitting the TCS regs if there are no HS threads. */
6693 static void ac_nir_fixup_ls_hs_input_vgprs(struct nir_to_llvm_context *ctx)
6694 {
6695         LLVMValueRef count = ac_build_bfe(&ctx->ac, ctx->merged_wave_info,
6696                                           LLVMConstInt(ctx->ac.i32, 8, false),
6697                                           LLVMConstInt(ctx->ac.i32, 8, false), false);
6698         LLVMValueRef hs_empty = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, count,
6699                                               ctx->ac.i32_0, "");
6700         ctx->abi.instance_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->rel_auto_id, ctx->abi.instance_id, "");
6701         ctx->vs_prim_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->abi.vertex_id, ctx->vs_prim_id, "");
6702         ctx->rel_auto_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->abi.tcs_rel_ids, ctx->rel_auto_id, "");
6703         ctx->abi.vertex_id = LLVMBuildSelect(ctx->ac.builder, hs_empty, ctx->abi.tcs_patch_id, ctx->abi.vertex_id, "");
6704 }
6705
6706 static void prepare_gs_input_vgprs(struct nir_to_llvm_context *ctx)
6707 {
6708         for(int i = 5; i >= 0; --i) {
6709                 ctx->gs_vtx_offset[i] = ac_build_bfe(&ctx->ac, ctx->gs_vtx_offset[i & ~1],
6710                                                      LLVMConstInt(ctx->ac.i32, (i & 1) * 16, false),
6711                                                      LLVMConstInt(ctx->ac.i32, 16, false), false);
6712         }
6713
6714         ctx->gs_wave_id = ac_build_bfe(&ctx->ac, ctx->merged_wave_info,
6715                                        LLVMConstInt(ctx->ac.i32, 16, false),
6716                                        LLVMConstInt(ctx->ac.i32, 8, false), false);
6717 }
6718
6719 void ac_nir_translate(struct ac_llvm_context *ac, struct ac_shader_abi *abi,
6720                       struct nir_shader *nir, struct nir_to_llvm_context *nctx)
6721 {
6722         struct ac_nir_context ctx = {};
6723         struct nir_function *func;
6724
6725         ctx.ac = *ac;
6726         ctx.abi = abi;
6727
6728         ctx.nctx = nctx;
6729         if (nctx)
6730                 nctx->nir = &ctx;
6731
6732         ctx.stage = nir->info.stage;
6733
6734         ctx.main_function = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));
6735
6736         nir_foreach_variable(variable, &nir->outputs)
6737                 handle_shader_output_decl(&ctx, nir, variable);
6738
6739         ctx.defs = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
6740                                            _mesa_key_pointer_equal);
6741         ctx.phis = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
6742                                            _mesa_key_pointer_equal);
6743         ctx.vars = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
6744                                            _mesa_key_pointer_equal);
6745
6746         func = (struct nir_function *)exec_list_get_head(&nir->functions);
6747
6748         setup_locals(&ctx, func);
6749
6750         if (nir->info.stage == MESA_SHADER_COMPUTE)
6751                 setup_shared(&ctx, nir);
6752
6753         visit_cf_list(&ctx, &func->impl->body);
6754         phi_post_pass(&ctx);
6755
6756         ctx.abi->emit_outputs(ctx.abi, RADEON_LLVM_MAX_OUTPUTS,
6757                               ctx.outputs);
6758
6759         free(ctx.locals);
6760         ralloc_free(ctx.defs);
6761         ralloc_free(ctx.phis);
6762         ralloc_free(ctx.vars);
6763
6764         if (nctx)
6765                 nctx->nir = NULL;
6766 }
6767
6768 static
6769 LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
6770                                        struct nir_shader *const *shaders,
6771                                        int shader_count,
6772                                        struct ac_shader_variant_info *shader_info,
6773                                        const struct ac_nir_compiler_options *options)
6774 {
6775         struct nir_to_llvm_context ctx = {0};
6776         unsigned i;
6777         ctx.options = options;
6778         ctx.shader_info = shader_info;
6779         ctx.context = LLVMContextCreate();
6780         ctx.module = LLVMModuleCreateWithNameInContext("shader", ctx.context);
6781
6782         ac_llvm_context_init(&ctx.ac, ctx.context, options->chip_class,
6783                              options->family);
6784         ctx.ac.module = ctx.module;
6785         LLVMSetTarget(ctx.module, options->supports_spill ? "amdgcn-mesa-mesa3d" : "amdgcn--");
6786
6787         LLVMTargetDataRef data_layout = LLVMCreateTargetDataLayout(tm);
6788         char *data_layout_str = LLVMCopyStringRepOfTargetData(data_layout);
6789         LLVMSetDataLayout(ctx.module, data_layout_str);
6790         LLVMDisposeTargetData(data_layout);
6791         LLVMDisposeMessage(data_layout_str);
6792
6793         enum ac_float_mode float_mode =
6794                 options->unsafe_math ? AC_FLOAT_MODE_UNSAFE_FP_MATH :
6795                                        AC_FLOAT_MODE_DEFAULT;
6796
6797         ctx.builder = ac_create_builder(ctx.context, float_mode);
6798         ctx.ac.builder = ctx.builder;
6799
6800         memset(shader_info, 0, sizeof(*shader_info));
6801
6802         for(int i = 0; i < shader_count; ++i)
6803                 ac_nir_shader_info_pass(shaders[i], options, &shader_info->info);
6804
6805         for (i = 0; i < AC_UD_MAX_SETS; i++)
6806                 shader_info->user_sgprs_locs.descriptor_sets[i].sgpr_idx = -1;
6807         for (i = 0; i < AC_UD_MAX_UD; i++)
6808                 shader_info->user_sgprs_locs.shader_data[i].sgpr_idx = -1;
6809
6810         ctx.max_workgroup_size = 0;
6811         for (int i = 0; i < shader_count; ++i) {
6812                 ctx.max_workgroup_size = MAX2(ctx.max_workgroup_size,
6813                                               ac_nir_get_max_workgroup_size(ctx.options->chip_class,
6814                                                                             shaders[i]));
6815         }
6816
6817         create_function(&ctx, shaders[shader_count - 1]->info.stage, shader_count >= 2,
6818                         shader_count >= 2 ? shaders[shader_count - 2]->info.stage  : MESA_SHADER_VERTEX);
6819
6820         ctx.abi.inputs = &ctx.inputs[0];
6821         ctx.abi.emit_outputs = handle_shader_outputs_post;
6822         ctx.abi.emit_vertex = visit_emit_vertex;
6823         ctx.abi.load_ubo = radv_load_ubo;
6824         ctx.abi.load_ssbo = radv_load_ssbo;
6825         ctx.abi.load_sampler_desc = radv_get_sampler_desc;
6826         ctx.abi.clamp_shadow_reference = false;
6827
6828         if (shader_count >= 2)
6829                 ac_init_exec_full_mask(&ctx.ac);
6830
6831         if (ctx.ac.chip_class == GFX9 &&
6832             shaders[shader_count - 1]->info.stage == MESA_SHADER_TESS_CTRL)
6833                 ac_nir_fixup_ls_hs_input_vgprs(&ctx);
6834
6835         for(int i = 0; i < shader_count; ++i) {
6836                 ctx.stage = shaders[i]->info.stage;
6837                 ctx.output_mask = 0;
6838                 ctx.tess_outputs_written = 0;
6839                 ctx.num_output_clips = shaders[i]->info.clip_distance_array_size;
6840                 ctx.num_output_culls = shaders[i]->info.cull_distance_array_size;
6841
6842                 if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY) {
6843                         ctx.gs_next_vertex = ac_build_alloca(&ctx.ac, ctx.ac.i32, "gs_next_vertex");
6844                         ctx.gs_max_out_vertices = shaders[i]->info.gs.vertices_out;
6845                         ctx.abi.load_inputs = load_gs_input;
6846                         ctx.abi.emit_primitive = visit_end_primitive;
6847                 } else if (shaders[i]->info.stage == MESA_SHADER_TESS_CTRL) {
6848                         ctx.tcs_outputs_read = shaders[i]->info.outputs_read;
6849                         ctx.tcs_patch_outputs_read = shaders[i]->info.patch_outputs_read;
6850                         ctx.abi.load_tess_varyings = load_tcs_varyings;
6851                         ctx.abi.load_patch_vertices_in = load_patch_vertices_in;
6852                         ctx.abi.store_tcs_outputs = store_tcs_output;
6853                 } else if (shaders[i]->info.stage == MESA_SHADER_TESS_EVAL) {
6854                         ctx.tes_primitive_mode = shaders[i]->info.tess.primitive_mode;
6855                         ctx.abi.load_tess_varyings = load_tes_input;
6856                         ctx.abi.load_tess_coord = load_tess_coord;
6857                         ctx.abi.load_patch_vertices_in = load_patch_vertices_in;
6858                 } else if (shaders[i]->info.stage == MESA_SHADER_VERTEX) {
6859                         if (shader_info->info.vs.needs_instance_id) {
6860                                 if (ctx.options->key.vs.as_ls) {
6861                                         ctx.shader_info->vs.vgpr_comp_cnt =
6862                                                 MAX2(2, ctx.shader_info->vs.vgpr_comp_cnt);
6863                                 } else {
6864                                         ctx.shader_info->vs.vgpr_comp_cnt =
6865                                                 MAX2(1, ctx.shader_info->vs.vgpr_comp_cnt);
6866                                 }
6867                         }
6868                 } else if (shaders[i]->info.stage == MESA_SHADER_FRAGMENT) {
6869                         shader_info->fs.can_discard = shaders[i]->info.fs.uses_discard;
6870                         ctx.abi.lookup_interp_param = lookup_interp_param;
6871                         ctx.abi.load_sample_position = load_sample_position;
6872                 }
6873
6874                 if (i)
6875                         emit_barrier(&ctx.ac, ctx.stage);
6876
6877                 ac_setup_rings(&ctx);
6878
6879                 LLVMBasicBlockRef merge_block;
6880                 if (shader_count >= 2) {
6881                         LLVMValueRef fn = LLVMGetBasicBlockParent(LLVMGetInsertBlock(ctx.ac.builder));
6882                         LLVMBasicBlockRef then_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
6883                         merge_block = LLVMAppendBasicBlockInContext(ctx.ac.context, fn, "");
6884
6885                         LLVMValueRef count = ac_build_bfe(&ctx.ac, ctx.merged_wave_info,
6886                                                           LLVMConstInt(ctx.ac.i32, 8 * i, false),
6887                                                           LLVMConstInt(ctx.ac.i32, 8, false), false);
6888                         LLVMValueRef thread_id = ac_get_thread_id(&ctx.ac);
6889                         LLVMValueRef cond = LLVMBuildICmp(ctx.ac.builder, LLVMIntULT,
6890                                                           thread_id, count, "");
6891                         LLVMBuildCondBr(ctx.ac.builder, cond, then_block, merge_block);
6892
6893                         LLVMPositionBuilderAtEnd(ctx.ac.builder, then_block);
6894                 }
6895
6896                 if (shaders[i]->info.stage == MESA_SHADER_FRAGMENT)
6897                         handle_fs_inputs(&ctx, shaders[i]);
6898                 else if(shaders[i]->info.stage == MESA_SHADER_VERTEX)
6899                         handle_vs_inputs(&ctx, shaders[i]);
6900                 else if(shader_count >= 2 && shaders[i]->info.stage == MESA_SHADER_GEOMETRY)
6901                         prepare_gs_input_vgprs(&ctx);
6902
6903                 nir_foreach_variable(variable, &shaders[i]->outputs)
6904                         scan_shader_output_decl(&ctx, variable, shaders[i], shaders[i]->info.stage);
6905
6906                 ac_nir_translate(&ctx.ac, &ctx.abi, shaders[i], &ctx);
6907
6908                 if (shader_count >= 2) {
6909                         LLVMBuildBr(ctx.ac.builder, merge_block);
6910                         LLVMPositionBuilderAtEnd(ctx.ac.builder, merge_block);
6911                 }
6912
6913                 if (shaders[i]->info.stage == MESA_SHADER_GEOMETRY) {
6914                         unsigned addclip = shaders[i]->info.clip_distance_array_size +
6915                                         shaders[i]->info.cull_distance_array_size > 4;
6916                         shader_info->gs.gsvs_vertex_size = (util_bitcount64(ctx.output_mask) + addclip) * 16;
6917                         shader_info->gs.max_gsvs_emit_size = shader_info->gs.gsvs_vertex_size *
6918                                 shaders[i]->info.gs.vertices_out;
6919                 } else if (shaders[i]->info.stage == MESA_SHADER_TESS_CTRL) {
6920                         shader_info->tcs.outputs_written = ctx.tess_outputs_written;
6921                         shader_info->tcs.patch_outputs_written = ctx.tess_patch_outputs_written;
6922                 } else if (shaders[i]->info.stage == MESA_SHADER_VERTEX && ctx.options->key.vs.as_ls) {
6923                         shader_info->vs.outputs_written = ctx.tess_outputs_written;
6924                 }
6925         }
6926
6927         LLVMBuildRetVoid(ctx.builder);
6928
6929         if (options->dump_preoptir)
6930                 ac_dump_module(ctx.module);
6931
6932         ac_llvm_finalize_module(&ctx);
6933
6934         if (shader_count == 1)
6935                 ac_nir_eliminate_const_vs_outputs(&ctx);
6936
6937         return ctx.module;
6938 }
6939
6940 static void ac_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context)
6941 {
6942         unsigned *retval = (unsigned *)context;
6943         LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di);
6944         char *description = LLVMGetDiagInfoDescription(di);
6945
6946         if (severity == LLVMDSError) {
6947                 *retval = 1;
6948                 fprintf(stderr, "LLVM triggered Diagnostic Handler: %s\n",
6949                         description);
6950         }
6951
6952         LLVMDisposeMessage(description);
6953 }
6954
6955 static unsigned ac_llvm_compile(LLVMModuleRef M,
6956                                 struct ac_shader_binary *binary,
6957                                 LLVMTargetMachineRef tm)
6958 {
6959         unsigned retval = 0;
6960         char *err;
6961         LLVMContextRef llvm_ctx;
6962         LLVMMemoryBufferRef out_buffer;
6963         unsigned buffer_size;
6964         const char *buffer_data;
6965         LLVMBool mem_err;
6966
6967         /* Setup Diagnostic Handler*/
6968         llvm_ctx = LLVMGetModuleContext(M);
6969
6970         LLVMContextSetDiagnosticHandler(llvm_ctx, ac_diagnostic_handler,
6971                                         &retval);
6972
6973         /* Compile IR*/
6974         mem_err = LLVMTargetMachineEmitToMemoryBuffer(tm, M, LLVMObjectFile,
6975                                                       &err, &out_buffer);
6976
6977         /* Process Errors/Warnings */
6978         if (mem_err) {
6979                 fprintf(stderr, "%s: %s", __FUNCTION__, err);
6980                 free(err);
6981                 retval = 1;
6982                 goto out;
6983         }
6984
6985         /* Extract Shader Code*/
6986         buffer_size = LLVMGetBufferSize(out_buffer);
6987         buffer_data = LLVMGetBufferStart(out_buffer);
6988
6989         ac_elf_read(buffer_data, buffer_size, binary);
6990
6991         /* Clean up */
6992         LLVMDisposeMemoryBuffer(out_buffer);
6993
6994 out:
6995         return retval;
6996 }
6997
6998 static void ac_compile_llvm_module(LLVMTargetMachineRef tm,
6999                                    LLVMModuleRef llvm_module,
7000                                    struct ac_shader_binary *binary,
7001                                    struct ac_shader_config *config,
7002                                    struct ac_shader_variant_info *shader_info,
7003                                    gl_shader_stage stage,
7004                                    bool dump_shader, bool supports_spill)
7005 {
7006         if (dump_shader)
7007                 ac_dump_module(llvm_module);
7008
7009         memset(binary, 0, sizeof(*binary));
7010         int v = ac_llvm_compile(llvm_module, binary, tm);
7011         if (v) {
7012                 fprintf(stderr, "compile failed\n");
7013         }
7014
7015         if (dump_shader)
7016                 fprintf(stderr, "disasm:\n%s\n", binary->disasm_string);
7017
7018         ac_shader_binary_read_config(binary, config, 0, supports_spill);
7019
7020         LLVMContextRef ctx = LLVMGetModuleContext(llvm_module);
7021         LLVMDisposeModule(llvm_module);
7022         LLVMContextDispose(ctx);
7023
7024         if (stage == MESA_SHADER_FRAGMENT) {
7025                 shader_info->num_input_vgprs = 0;
7026                 if (G_0286CC_PERSP_SAMPLE_ENA(config->spi_ps_input_addr))
7027                         shader_info->num_input_vgprs += 2;
7028                 if (G_0286CC_PERSP_CENTER_ENA(config->spi_ps_input_addr))
7029                         shader_info->num_input_vgprs += 2;
7030                 if (G_0286CC_PERSP_CENTROID_ENA(config->spi_ps_input_addr))
7031                         shader_info->num_input_vgprs += 2;
7032                 if (G_0286CC_PERSP_PULL_MODEL_ENA(config->spi_ps_input_addr))
7033                         shader_info->num_input_vgprs += 3;
7034                 if (G_0286CC_LINEAR_SAMPLE_ENA(config->spi_ps_input_addr))
7035                         shader_info->num_input_vgprs += 2;
7036                 if (G_0286CC_LINEAR_CENTER_ENA(config->spi_ps_input_addr))
7037                         shader_info->num_input_vgprs += 2;
7038                 if (G_0286CC_LINEAR_CENTROID_ENA(config->spi_ps_input_addr))
7039                         shader_info->num_input_vgprs += 2;
7040                 if (G_0286CC_LINE_STIPPLE_TEX_ENA(config->spi_ps_input_addr))
7041                         shader_info->num_input_vgprs += 1;
7042                 if (G_0286CC_POS_X_FLOAT_ENA(config->spi_ps_input_addr))
7043                         shader_info->num_input_vgprs += 1;
7044                 if (G_0286CC_POS_Y_FLOAT_ENA(config->spi_ps_input_addr))
7045                         shader_info->num_input_vgprs += 1;
7046                 if (G_0286CC_POS_Z_FLOAT_ENA(config->spi_ps_input_addr))
7047                         shader_info->num_input_vgprs += 1;
7048                 if (G_0286CC_POS_W_FLOAT_ENA(config->spi_ps_input_addr))
7049                         shader_info->num_input_vgprs += 1;
7050                 if (G_0286CC_FRONT_FACE_ENA(config->spi_ps_input_addr))
7051                         shader_info->num_input_vgprs += 1;
7052                 if (G_0286CC_ANCILLARY_ENA(config->spi_ps_input_addr))
7053                         shader_info->num_input_vgprs += 1;
7054                 if (G_0286CC_SAMPLE_COVERAGE_ENA(config->spi_ps_input_addr))
7055                         shader_info->num_input_vgprs += 1;
7056                 if (G_0286CC_POS_FIXED_PT_ENA(config->spi_ps_input_addr))
7057                         shader_info->num_input_vgprs += 1;
7058         }
7059         config->num_vgprs = MAX2(config->num_vgprs, shader_info->num_input_vgprs);
7060
7061         /* +3 for scratch wave offset and VCC */
7062         config->num_sgprs = MAX2(config->num_sgprs,
7063                                  shader_info->num_input_sgprs + 3);
7064
7065         /* Enable 64-bit and 16-bit denormals, because there is no performance
7066          * cost.
7067          *
7068          * If denormals are enabled, all floating-point output modifiers are
7069          * ignored.
7070          *
7071          * Don't enable denormals for 32-bit floats, because:
7072          * - Floating-point output modifiers would be ignored by the hw.
7073          * - Some opcodes don't support denormals, such as v_mad_f32. We would
7074          *   have to stop using those.
7075          * - SI & CI would be very slow.
7076          */
7077         config->float_mode |= V_00B028_FP_64_DENORMS;
7078 }
7079
7080 static void
7081 ac_fill_shader_info(struct ac_shader_variant_info *shader_info, struct nir_shader *nir, const struct ac_nir_compiler_options *options)
7082 {
7083         switch (nir->info.stage) {
7084         case MESA_SHADER_COMPUTE:
7085                 for (int i = 0; i < 3; ++i)
7086                         shader_info->cs.block_size[i] = nir->info.cs.local_size[i];
7087                 break;
7088         case MESA_SHADER_FRAGMENT:
7089                 shader_info->fs.early_fragment_test = nir->info.fs.early_fragment_tests;
7090                 break;
7091         case MESA_SHADER_GEOMETRY:
7092                 shader_info->gs.vertices_in = nir->info.gs.vertices_in;
7093                 shader_info->gs.vertices_out = nir->info.gs.vertices_out;
7094                 shader_info->gs.output_prim = nir->info.gs.output_primitive;
7095                 shader_info->gs.invocations = nir->info.gs.invocations;
7096                 break;
7097         case MESA_SHADER_TESS_EVAL:
7098                 shader_info->tes.primitive_mode = nir->info.tess.primitive_mode;
7099                 shader_info->tes.spacing = nir->info.tess.spacing;
7100                 shader_info->tes.ccw = nir->info.tess.ccw;
7101                 shader_info->tes.point_mode = nir->info.tess.point_mode;
7102                 shader_info->tes.as_es = options->key.tes.as_es;
7103                 break;
7104         case MESA_SHADER_TESS_CTRL:
7105                 shader_info->tcs.tcs_vertices_out = nir->info.tess.tcs_vertices_out;
7106                 break;
7107         case MESA_SHADER_VERTEX:
7108                 shader_info->vs.as_es = options->key.vs.as_es;
7109                 shader_info->vs.as_ls = options->key.vs.as_ls;
7110                 /* in LS mode we need at least 1, invocation id needs 2, handled elsewhere */
7111                 if (options->key.vs.as_ls)
7112                         shader_info->vs.vgpr_comp_cnt = MAX2(1, shader_info->vs.vgpr_comp_cnt);
7113                 break;
7114         default:
7115                 break;
7116         }
7117 }
7118
7119 void ac_compile_nir_shader(LLVMTargetMachineRef tm,
7120                            struct ac_shader_binary *binary,
7121                            struct ac_shader_config *config,
7122                            struct ac_shader_variant_info *shader_info,
7123                            struct nir_shader *const *nir,
7124                            int nir_count,
7125                            const struct ac_nir_compiler_options *options,
7126                            bool dump_shader)
7127 {
7128
7129         LLVMModuleRef llvm_module = ac_translate_nir_to_llvm(tm, nir, nir_count, shader_info,
7130                                                              options);
7131
7132         ac_compile_llvm_module(tm, llvm_module, binary, config, shader_info, nir[0]->info.stage, dump_shader, options->supports_spill);
7133         for (int i = 0; i < nir_count; ++i)
7134                 ac_fill_shader_info(shader_info, nir[i], options);
7135
7136         /* Determine the ES type (VS or TES) for the GS on GFX9. */
7137         if (options->chip_class == GFX9) {
7138                 if (nir_count == 2 &&
7139                     nir[1]->info.stage == MESA_SHADER_GEOMETRY) {
7140                         shader_info->gs.es_type = nir[0]->info.stage;
7141                 }
7142         }
7143 }
7144
7145 static void
7146 ac_gs_copy_shader_emit(struct nir_to_llvm_context *ctx)
7147 {
7148         LLVMValueRef vtx_offset =
7149                 LLVMBuildMul(ctx->builder, ctx->abi.vertex_id,
7150                              LLVMConstInt(ctx->ac.i32, 4, false), "");
7151         int idx = 0;
7152
7153         for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) {
7154                 int length = 4;
7155                 int slot = idx;
7156                 int slot_inc = 1;
7157                 if (!(ctx->output_mask & (1ull << i)))
7158                         continue;
7159
7160                 if (i == VARYING_SLOT_CLIP_DIST0) {
7161                         /* unpack clip and cull from a single set of slots */
7162                         length = ctx->num_output_clips + ctx->num_output_culls;
7163                         if (length > 4)
7164                                 slot_inc = 2;
7165                 }
7166
7167                 for (unsigned j = 0; j < length; j++) {
7168                         LLVMValueRef value, soffset;
7169
7170                         soffset = LLVMConstInt(ctx->ac.i32,
7171                                                (slot * 4 + j) *
7172                                                ctx->gs_max_out_vertices * 16 * 4, false);
7173
7174                         value = ac_build_buffer_load(&ctx->ac, ctx->gsvs_ring,
7175                                                      1, ctx->ac.i32_0,
7176                                                      vtx_offset, soffset,
7177                                                      0, 1, 1, true, false);
7178
7179                         LLVMBuildStore(ctx->builder,
7180                                        ac_to_float(&ctx->ac, value), ctx->nir->outputs[radeon_llvm_reg_index_soa(i, j)]);
7181                 }
7182                 idx += slot_inc;
7183         }
7184         handle_vs_outputs_post(ctx, false, &ctx->shader_info->vs.outinfo);
7185 }
7186
7187 void ac_create_gs_copy_shader(LLVMTargetMachineRef tm,
7188                               struct nir_shader *geom_shader,
7189                               struct ac_shader_binary *binary,
7190                               struct ac_shader_config *config,
7191                               struct ac_shader_variant_info *shader_info,
7192                               const struct ac_nir_compiler_options *options,
7193                               bool dump_shader)
7194 {
7195         struct nir_to_llvm_context ctx = {0};
7196         ctx.context = LLVMContextCreate();
7197         ctx.module = LLVMModuleCreateWithNameInContext("shader", ctx.context);
7198         ctx.options = options;
7199         ctx.shader_info = shader_info;
7200
7201         ac_llvm_context_init(&ctx.ac, ctx.context, options->chip_class,
7202                              options->family);
7203         ctx.ac.module = ctx.module;
7204
7205         ctx.is_gs_copy_shader = true;
7206         LLVMSetTarget(ctx.module, "amdgcn--");
7207
7208         enum ac_float_mode float_mode =
7209                 options->unsafe_math ? AC_FLOAT_MODE_UNSAFE_FP_MATH :
7210                                        AC_FLOAT_MODE_DEFAULT;
7211
7212         ctx.builder = ac_create_builder(ctx.context, float_mode);
7213         ctx.ac.builder = ctx.builder;
7214         ctx.stage = MESA_SHADER_VERTEX;
7215
7216         create_function(&ctx, MESA_SHADER_VERTEX, false, MESA_SHADER_VERTEX);
7217
7218         ctx.gs_max_out_vertices = geom_shader->info.gs.vertices_out;
7219         ac_setup_rings(&ctx);
7220
7221         ctx.num_output_clips = geom_shader->info.clip_distance_array_size;
7222         ctx.num_output_culls = geom_shader->info.cull_distance_array_size;
7223
7224         struct ac_nir_context nir_ctx = {};
7225         nir_ctx.ac = ctx.ac;
7226         nir_ctx.abi = &ctx.abi;
7227
7228         nir_ctx.nctx = &ctx;
7229         ctx.nir = &nir_ctx;
7230
7231         nir_foreach_variable(variable, &geom_shader->outputs) {
7232                 scan_shader_output_decl(&ctx, variable, geom_shader, MESA_SHADER_VERTEX);
7233                 handle_shader_output_decl(&nir_ctx, geom_shader, variable);
7234         }
7235
7236         ac_gs_copy_shader_emit(&ctx);
7237
7238         ctx.nir = NULL;
7239
7240         LLVMBuildRetVoid(ctx.builder);
7241
7242         ac_llvm_finalize_module(&ctx);
7243
7244         ac_compile_llvm_module(tm, ctx.module, binary, config, shader_info,
7245                                MESA_SHADER_VERTEX,
7246                                dump_shader, options->supports_spill);
7247 }