OSDN Git Service

radv: rework vertex/export shader output handling
[android-x86/external-mesa.git] / src / amd / common / ac_nir_to_llvm.c
1 /*
2  * Copyright © 2016 Bas Nieuwenhuizen
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23
24 #include "ac_nir_to_llvm.h"
25 #include "ac_llvm_build.h"
26 #include "ac_llvm_util.h"
27 #include "ac_binary.h"
28 #include "sid.h"
29 #include "nir/nir.h"
30 #include "../vulkan/radv_descriptor_set.h"
31 #include "util/bitscan.h"
32 #include <llvm-c/Transforms/Scalar.h>
33
/*
 * Numeric calling-convention IDs passed to LLVMSetFunctionCallConv() for
 * the shader entry point.  set_llvm_calling_convention() picks one of these
 * from the Mesa shader stage.
 */
enum radeon_llvm_calling_convention {
        RADEON_LLVM_AMDGPU_VS = 87, /* vertex, tess-ctrl and tess-eval stages */
        RADEON_LLVM_AMDGPU_GS = 88, /* geometry stage */
        RADEON_LLVM_AMDGPU_PS = 89, /* fragment stage */
        RADEON_LLVM_AMDGPU_CS = 90, /* compute stage */
};
40
/* Address-space numbers used when this file builds LLVM pointer types. */
#define CONST_ADDR_SPACE 2
#define LOCAL_ADDR_SPACE 3

/*
 * Number of input/output slots tracked per shader: every varying slot up to
 * and including VARYING_SLOT_VAR31.
 */
#define RADEON_LLVM_MAX_INPUTS (VARYING_SLOT_VAR31 + 1)
#define RADEON_LLVM_MAX_OUTPUTS (VARYING_SLOT_VAR31 + 1)
46
/* Kind of descriptor requested from get_sampler_desc(). */
enum desc_type {
        DESC_IMAGE = 0,
        DESC_FMASK = 1,
        DESC_SAMPLER = 2,
        DESC_BUFFER = 3,
};
53
54 struct nir_to_llvm_context {
55         struct ac_llvm_context ac;
56         const struct ac_nir_compiler_options *options;
57         struct ac_shader_variant_info *shader_info;
58
59         LLVMContextRef context;
60         LLVMModuleRef module;
61         LLVMBuilderRef builder;
62         LLVMValueRef main_function;
63
64         struct hash_table *defs;
65         struct hash_table *phis;
66
67         LLVMValueRef descriptor_sets[AC_UD_MAX_SETS];
68         LLVMValueRef ring_offsets;
69         LLVMValueRef push_constants;
70         LLVMValueRef num_work_groups;
71         LLVMValueRef workgroup_ids;
72         LLVMValueRef local_invocation_ids;
73         LLVMValueRef tg_size;
74
75         LLVMValueRef vertex_buffers;
76         LLVMValueRef base_vertex;
77         LLVMValueRef start_instance;
78         LLVMValueRef draw_index;
79         LLVMValueRef vertex_id;
80         LLVMValueRef rel_auto_id;
81         LLVMValueRef vs_prim_id;
82         LLVMValueRef instance_id;
83
84         LLVMValueRef es2gs_offset;
85
86         LLVMValueRef gsvs_ring_stride;
87         LLVMValueRef gsvs_num_entries;
88         LLVMValueRef gs2vs_offset;
89         LLVMValueRef gs_wave_id;
90         LLVMValueRef gs_vtx_offset[6];
91         LLVMValueRef gs_prim_id, gs_invocation_id;
92
93         LLVMValueRef esgs_ring;
94         LLVMValueRef gsvs_ring;
95
96         LLVMValueRef prim_mask;
97         LLVMValueRef sample_positions;
98         LLVMValueRef persp_sample, persp_center, persp_centroid;
99         LLVMValueRef linear_sample, linear_center, linear_centroid;
100         LLVMValueRef front_face;
101         LLVMValueRef ancillary;
102         LLVMValueRef sample_coverage;
103         LLVMValueRef frag_pos[4];
104
105         LLVMBasicBlockRef continue_block;
106         LLVMBasicBlockRef break_block;
107
108         LLVMTypeRef i1;
109         LLVMTypeRef i8;
110         LLVMTypeRef i16;
111         LLVMTypeRef i32;
112         LLVMTypeRef i64;
113         LLVMTypeRef v2i32;
114         LLVMTypeRef v3i32;
115         LLVMTypeRef v4i32;
116         LLVMTypeRef v8i32;
117         LLVMTypeRef f64;
118         LLVMTypeRef f32;
119         LLVMTypeRef f16;
120         LLVMTypeRef v2f32;
121         LLVMTypeRef v4f32;
122         LLVMTypeRef v16i8;
123         LLVMTypeRef voidt;
124
125         LLVMValueRef i1true;
126         LLVMValueRef i1false;
127         LLVMValueRef i32zero;
128         LLVMValueRef i32one;
129         LLVMValueRef f32zero;
130         LLVMValueRef f32one;
131         LLVMValueRef v4f32empty;
132
133         unsigned uniform_md_kind;
134         LLVMValueRef empty_md;
135         gl_shader_stage stage;
136
137         LLVMValueRef lds;
138         LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS * 4];
139         LLVMValueRef outputs[RADEON_LLVM_MAX_OUTPUTS * 4];
140
141         LLVMValueRef shared_memory;
142         uint64_t input_mask;
143         uint64_t output_mask;
144         int num_locals;
145         LLVMValueRef *locals;
146         bool has_ddxy;
147         uint8_t num_input_clips;
148         uint8_t num_input_culls;
149         uint8_t num_output_clips;
150         uint8_t num_output_culls;
151
152         bool has_ds_bpermute;
153
154         bool is_gs_copy_shader;
155         LLVMValueRef gs_next_vertex;
156         unsigned gs_max_out_vertices;
157 };
158
159 static LLVMValueRef get_sampler_desc(struct nir_to_llvm_context *ctx,
160                                      nir_deref_var *deref,
161                                      enum desc_type desc_type);
/*
 * Flatten a (slot index, channel) pair into an index into a table that
 * stores four consecutive entries per slot.
 */
static unsigned radeon_llvm_reg_index_soa(unsigned index, unsigned chan)
{
        const unsigned slot_base = index * 4;

        return slot_base + chan;
}
166
167 static unsigned shader_io_get_unique_index(gl_varying_slot slot)
168 {
169         if (slot == VARYING_SLOT_POS)
170                 return 0;
171         if (slot == VARYING_SLOT_PSIZ)
172                 return 1;
173         if (slot == VARYING_SLOT_CLIP_DIST0 ||
174             slot == VARYING_SLOT_CULL_DIST0)
175                 return 2;
176         if (slot == VARYING_SLOT_CLIP_DIST1 ||
177             slot == VARYING_SLOT_CULL_DIST1)
178                 return 3;
179         if (slot >= VARYING_SLOT_VAR0 && slot <= VARYING_SLOT_VAR31)
180                 return 4 + (slot - VARYING_SLOT_VAR0);
181         unreachable("illegal slot in get unique index\n");
182 }
183
184 static unsigned llvm_get_type_size(LLVMTypeRef type)
185 {
186         LLVMTypeKind kind = LLVMGetTypeKind(type);
187
188         switch (kind) {
189         case LLVMIntegerTypeKind:
190                 return LLVMGetIntTypeWidth(type) / 8;
191         case LLVMFloatTypeKind:
192                 return 4;
193         case LLVMPointerTypeKind:
194                 return 8;
195         case LLVMVectorTypeKind:
196                 return LLVMGetVectorSize(type) *
197                        llvm_get_type_size(LLVMGetElementType(type));
198         default:
199                 assert(0);
200                 return 0;
201         }
202 }
203
204 static void set_llvm_calling_convention(LLVMValueRef func,
205                                         gl_shader_stage stage)
206 {
207         enum radeon_llvm_calling_convention calling_conv;
208
209         switch (stage) {
210         case MESA_SHADER_VERTEX:
211         case MESA_SHADER_TESS_CTRL:
212         case MESA_SHADER_TESS_EVAL:
213                 calling_conv = RADEON_LLVM_AMDGPU_VS;
214                 break;
215         case MESA_SHADER_GEOMETRY:
216                 calling_conv = RADEON_LLVM_AMDGPU_GS;
217                 break;
218         case MESA_SHADER_FRAGMENT:
219                 calling_conv = RADEON_LLVM_AMDGPU_PS;
220                 break;
221         case MESA_SHADER_COMPUTE:
222                 calling_conv = RADEON_LLVM_AMDGPU_CS;
223                 break;
224         default:
225                 unreachable("Unhandle shader type");
226         }
227
228         LLVMSetFunctionCallConv(func, calling_conv);
229 }
230
231 static LLVMValueRef
232 create_llvm_function(LLVMContextRef ctx, LLVMModuleRef module,
233                      LLVMBuilderRef builder, LLVMTypeRef *return_types,
234                      unsigned num_return_elems, LLVMTypeRef *param_types,
235                      unsigned param_count, unsigned array_params_mask,
236                      unsigned sgpr_params, bool unsafe_math)
237 {
238         LLVMTypeRef main_function_type, ret_type;
239         LLVMBasicBlockRef main_function_body;
240
241         if (num_return_elems)
242                 ret_type = LLVMStructTypeInContext(ctx, return_types,
243                                                    num_return_elems, true);
244         else
245                 ret_type = LLVMVoidTypeInContext(ctx);
246
247         /* Setup the function */
248         main_function_type =
249             LLVMFunctionType(ret_type, param_types, param_count, 0);
250         LLVMValueRef main_function =
251             LLVMAddFunction(module, "main", main_function_type);
252         main_function_body =
253             LLVMAppendBasicBlockInContext(ctx, main_function, "main_body");
254         LLVMPositionBuilderAtEnd(builder, main_function_body);
255
256         LLVMSetFunctionCallConv(main_function, RADEON_LLVM_AMDGPU_CS);
257         for (unsigned i = 0; i < sgpr_params; ++i) {
258                 if (array_params_mask & (1 << i)) {
259                         LLVMValueRef P = LLVMGetParam(main_function, i);
260                         ac_add_function_attr(ctx, main_function, i + 1, AC_FUNC_ATTR_BYVAL);
261                         ac_add_attr_dereferenceable(P, UINT64_MAX);
262                 }
263                 else {
264                         ac_add_function_attr(ctx, main_function, i + 1, AC_FUNC_ATTR_INREG);
265                 }
266         }
267
268         if (unsafe_math) {
269                 /* These were copied from some LLVM test. */
270                 LLVMAddTargetDependentFunctionAttr(main_function,
271                                                    "less-precise-fpmad",
272                                                    "true");
273                 LLVMAddTargetDependentFunctionAttr(main_function,
274                                                    "no-infs-fp-math",
275                                                    "true");
276                 LLVMAddTargetDependentFunctionAttr(main_function,
277                                                    "no-nans-fp-math",
278                                                    "true");
279                 LLVMAddTargetDependentFunctionAttr(main_function,
280                                                    "unsafe-fp-math",
281                                                    "true");
282         }
283         return main_function;
284 }
285
286 static LLVMTypeRef const_array(LLVMTypeRef elem_type, int num_elements)
287 {
288         return LLVMPointerType(LLVMArrayType(elem_type, num_elements),
289                                CONST_ADDR_SPACE);
290 }
291
292 static LLVMValueRef get_shared_memory_ptr(struct nir_to_llvm_context *ctx,
293                                           int idx,
294                                           LLVMTypeRef type)
295 {
296         LLVMValueRef offset;
297         LLVMValueRef ptr;
298         int addr_space;
299
300         offset = LLVMConstInt(ctx->i32, idx * 16, false);
301
302         ptr = ctx->shared_memory;
303         ptr = LLVMBuildGEP(ctx->builder, ptr, &offset, 1, "");
304         addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
305         ptr = LLVMBuildBitCast(ctx->builder, ptr, LLVMPointerType(type, addr_space), "");
306         return ptr;
307 }
308
309 static LLVMTypeRef to_integer_type_scalar(struct nir_to_llvm_context *ctx, LLVMTypeRef t)
310 {
311         if (t == ctx->f16 || t == ctx->i16)
312                 return ctx->i16;
313         else if (t == ctx->f32 || t == ctx->i32)
314                 return ctx->i32;
315         else if (t == ctx->f64 || t == ctx->i64)
316                 return ctx->i64;
317         else
318                 unreachable("Unhandled integer size");
319 }
320
321 static LLVMTypeRef to_integer_type(struct nir_to_llvm_context *ctx, LLVMTypeRef t)
322 {
323         if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
324                 LLVMTypeRef elem_type = LLVMGetElementType(t);
325                 return LLVMVectorType(to_integer_type_scalar(ctx, elem_type),
326                                       LLVMGetVectorSize(t));
327         }
328         return to_integer_type_scalar(ctx, t);
329 }
330
331 static LLVMValueRef to_integer(struct nir_to_llvm_context *ctx, LLVMValueRef v)
332 {
333         LLVMTypeRef type = LLVMTypeOf(v);
334         return LLVMBuildBitCast(ctx->builder, v, to_integer_type(ctx, type), "");
335 }
336
337 static LLVMTypeRef to_float_type_scalar(struct nir_to_llvm_context *ctx, LLVMTypeRef t)
338 {
339         if (t == ctx->i16 || t == ctx->f16)
340                 return ctx->f16;
341         else if (t == ctx->i32 || t == ctx->f32)
342                 return ctx->f32;
343         else if (t == ctx->i64 || t == ctx->f64)
344                 return ctx->f64;
345         else
346                 unreachable("Unhandled float size");
347 }
348
349 static LLVMTypeRef to_float_type(struct nir_to_llvm_context *ctx, LLVMTypeRef t)
350 {
351         if (LLVMGetTypeKind(t) == LLVMVectorTypeKind) {
352                 LLVMTypeRef elem_type = LLVMGetElementType(t);
353                 return LLVMVectorType(to_float_type_scalar(ctx, elem_type),
354                                       LLVMGetVectorSize(t));
355         }
356         return to_float_type_scalar(ctx, t);
357 }
358
359 static LLVMValueRef to_float(struct nir_to_llvm_context *ctx, LLVMValueRef v)
360 {
361         LLVMTypeRef type = LLVMTypeOf(v);
362         return LLVMBuildBitCast(ctx->builder, v, to_float_type(ctx, type), "");
363 }
364
365 static int get_elem_bits(struct nir_to_llvm_context *ctx, LLVMTypeRef type)
366 {
367         if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
368                 type = LLVMGetElementType(type);
369
370         if (LLVMGetTypeKind(type) == LLVMIntegerTypeKind)
371                 return LLVMGetIntTypeWidth(type);
372
373         if (type == ctx->f16)
374                 return 16;
375         if (type == ctx->f32)
376                 return 32;
377         if (type == ctx->f64)
378                 return 64;
379
380         unreachable("Unhandled type kind in get_elem_bits");
381 }
382
383 static LLVMValueRef unpack_param(struct nir_to_llvm_context *ctx,
384                                  LLVMValueRef param, unsigned rshift,
385                                  unsigned bitwidth)
386 {
387         LLVMValueRef value = param;
388         if (rshift)
389                 value = LLVMBuildLShr(ctx->builder, value,
390                                       LLVMConstInt(ctx->i32, rshift, false), "");
391
392         if (rshift + bitwidth < 32) {
393                 unsigned mask = (1 << bitwidth) - 1;
394                 value = LLVMBuildAnd(ctx->builder, value,
395                                      LLVMConstInt(ctx->i32, mask, false), "");
396         }
397         return value;
398 }
399
400 static void set_userdata_location(struct ac_userdata_info *ud_info, uint8_t sgpr_idx, uint8_t num_sgprs)
401 {
402         ud_info->sgpr_idx = sgpr_idx;
403         ud_info->num_sgprs = num_sgprs;
404         ud_info->indirect = false;
405         ud_info->indirect_offset = 0;
406 }
407
408 static void set_userdata_location_shader(struct nir_to_llvm_context *ctx,
409                                          int idx, uint8_t sgpr_idx, uint8_t num_sgprs)
410 {
411         set_userdata_location(&ctx->shader_info->user_sgprs_locs.shader_data[idx], sgpr_idx, num_sgprs);
412 }
413
#if 0
/*
 * Compiled out.  Indirect variant of set_userdata_location(): marks the
 * entry as indirect and stores the given offset.
 */
static void set_userdata_location_indirect(struct ac_userdata_info *ud_info, uint8_t sgpr_idx, uint8_t num_sgprs,
                                           uint32_t indirect_offset)
{
        ud_info->indirect = true;
        ud_info->indirect_offset = indirect_offset;

        ud_info->sgpr_idx = sgpr_idx;
        ud_info->num_sgprs = num_sgprs;
}
#endif
424
425 static void create_function(struct nir_to_llvm_context *ctx)
426 {
427         LLVMTypeRef arg_types[23];
428         unsigned arg_idx = 0;
429         unsigned array_params_mask = 0;
430         unsigned sgpr_count = 0, user_sgpr_count;
431         unsigned i;
432         unsigned num_sets = ctx->options->layout ? ctx->options->layout->num_sets : 0;
433         unsigned user_sgpr_idx;
434         bool need_push_constants;
435         bool need_ring_offsets = false;
436
437         /* until we sort out scratch/global buffers always assign ring offsets for gs/vs/es */
438         if (ctx->stage == MESA_SHADER_GEOMETRY ||
439             ctx->stage == MESA_SHADER_VERTEX ||
440             ctx->is_gs_copy_shader)
441                 need_ring_offsets = true;
442
443         need_push_constants = true;
444         if (!ctx->options->layout)
445                 need_push_constants = false;
446         else if (!ctx->options->layout->push_constant_size &&
447                  !ctx->options->layout->dynamic_offset_count)
448                 need_push_constants = false;
449
450         if (need_ring_offsets && !ctx->options->supports_spill) {
451                 arg_types[arg_idx++] = const_array(ctx->v16i8, 8); /* address of rings */
452         }
453
454         /* 1 for each descriptor set */
455         for (unsigned i = 0; i < num_sets; ++i) {
456                 if (ctx->options->layout->set[i].layout->shader_stages & (1 << ctx->stage)) {
457                         array_params_mask |= (1 << arg_idx);
458                         arg_types[arg_idx++] = const_array(ctx->i8, 1024 * 1024);
459                 }
460         }
461
462         if (need_push_constants) {
463                 /* 1 for push constants and dynamic descriptors */
464                 array_params_mask |= (1 << arg_idx);
465                 arg_types[arg_idx++] = const_array(ctx->i8, 1024 * 1024);
466         }
467
468         switch (ctx->stage) {
469         case MESA_SHADER_COMPUTE:
470                 arg_types[arg_idx++] = LLVMVectorType(ctx->i32, 3); /* grid size */
471                 user_sgpr_count = arg_idx;
472                 arg_types[arg_idx++] = LLVMVectorType(ctx->i32, 3);
473                 arg_types[arg_idx++] = ctx->i32;
474                 sgpr_count = arg_idx;
475
476                 arg_types[arg_idx++] = LLVMVectorType(ctx->i32, 3);
477                 break;
478         case MESA_SHADER_VERTEX:
479                 if (!ctx->is_gs_copy_shader) {
480                         arg_types[arg_idx++] = const_array(ctx->v16i8, 16); /* vertex buffers */
481                         arg_types[arg_idx++] = ctx->i32; // base vertex
482                         arg_types[arg_idx++] = ctx->i32; // start instance
483                         arg_types[arg_idx++] = ctx->i32; // draw index
484                 }
485                 user_sgpr_count = arg_idx;
486                 if (ctx->options->key.vs.as_es)
487                         arg_types[arg_idx++] = ctx->i32; //es2gs offset
488                 sgpr_count = arg_idx;
489                 arg_types[arg_idx++] = ctx->i32; // vertex id
490                 if (!ctx->is_gs_copy_shader) {
491                         arg_types[arg_idx++] = ctx->i32; // rel auto id
492                         arg_types[arg_idx++] = ctx->i32; // vs prim id
493                         arg_types[arg_idx++] = ctx->i32; // instance id
494                 }
495                 break;
496         case MESA_SHADER_GEOMETRY:
497                 arg_types[arg_idx++] = ctx->i32; // gsvs stride
498                 arg_types[arg_idx++] = ctx->i32; // gsvs num entires
499                 user_sgpr_count = arg_idx;
500                 arg_types[arg_idx++] = ctx->i32; // gs2vs offset
501                 arg_types[arg_idx++] = ctx->i32; // wave id
502                 sgpr_count = arg_idx;
503                 arg_types[arg_idx++] = ctx->i32; // vtx0
504                 arg_types[arg_idx++] = ctx->i32; // vtx1
505                 arg_types[arg_idx++] = ctx->i32; // prim id
506                 arg_types[arg_idx++] = ctx->i32; // vtx2
507                 arg_types[arg_idx++] = ctx->i32; // vtx3
508                 arg_types[arg_idx++] = ctx->i32; // vtx4
509                 arg_types[arg_idx++] = ctx->i32; // vtx5
510                 arg_types[arg_idx++] = ctx->i32; // GS instance id
511                 break;
512         case MESA_SHADER_FRAGMENT:
513                 arg_types[arg_idx++] = const_array(ctx->f32, 32); /* sample positions */
514                 user_sgpr_count = arg_idx;
515                 arg_types[arg_idx++] = ctx->i32; /* prim mask */
516                 sgpr_count = arg_idx;
517                 arg_types[arg_idx++] = ctx->v2i32; /* persp sample */
518                 arg_types[arg_idx++] = ctx->v2i32; /* persp center */
519                 arg_types[arg_idx++] = ctx->v2i32; /* persp centroid */
520                 arg_types[arg_idx++] = ctx->v3i32; /* persp pull model */
521                 arg_types[arg_idx++] = ctx->v2i32; /* linear sample */
522                 arg_types[arg_idx++] = ctx->v2i32; /* linear center */
523                 arg_types[arg_idx++] = ctx->v2i32; /* linear centroid */
524                 arg_types[arg_idx++] = ctx->f32;  /* line stipple tex */
525                 arg_types[arg_idx++] = ctx->f32;  /* pos x float */
526                 arg_types[arg_idx++] = ctx->f32;  /* pos y float */
527                 arg_types[arg_idx++] = ctx->f32;  /* pos z float */
528                 arg_types[arg_idx++] = ctx->f32;  /* pos w float */
529                 arg_types[arg_idx++] = ctx->i32;  /* front face */
530                 arg_types[arg_idx++] = ctx->i32;  /* ancillary */
531                 arg_types[arg_idx++] = ctx->i32;  /* sample coverage */
532                 arg_types[arg_idx++] = ctx->i32;  /* fixed pt */
533                 break;
534         default:
535                 unreachable("Shader stage not implemented");
536         }
537
538         ctx->main_function = create_llvm_function(
539             ctx->context, ctx->module, ctx->builder, NULL, 0, arg_types,
540             arg_idx, array_params_mask, sgpr_count, ctx->options->unsafe_math);
541         set_llvm_calling_convention(ctx->main_function, ctx->stage);
542
543         ctx->shader_info->num_input_sgprs = 0;
544         ctx->shader_info->num_input_vgprs = 0;
545
546         ctx->shader_info->num_user_sgprs = ctx->options->supports_spill ? 2 : 0;
547         for (i = 0; i < user_sgpr_count; i++)
548                 ctx->shader_info->num_user_sgprs += llvm_get_type_size(arg_types[i]) / 4;
549
550         ctx->shader_info->num_input_sgprs = ctx->shader_info->num_user_sgprs;
551         for (; i < sgpr_count; i++)
552                 ctx->shader_info->num_input_sgprs += llvm_get_type_size(arg_types[i]) / 4;
553
554         if (ctx->stage != MESA_SHADER_FRAGMENT)
555                 for (; i < arg_idx; ++i)
556                         ctx->shader_info->num_input_vgprs += llvm_get_type_size(arg_types[i]) / 4;
557
558         arg_idx = 0;
559         user_sgpr_idx = 0;
560
561         if (ctx->options->supports_spill || need_ring_offsets) {
562                 set_userdata_location_shader(ctx, AC_UD_SCRATCH_RING_OFFSETS, user_sgpr_idx, 2);
563                 user_sgpr_idx += 2;
564                 if (ctx->options->supports_spill) {
565                         ctx->ring_offsets = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.implicit.buffer.ptr",
566                                                                LLVMPointerType(ctx->i8, CONST_ADDR_SPACE),
567                                                                NULL, 0, AC_FUNC_ATTR_READNONE);
568                         ctx->ring_offsets = LLVMBuildBitCast(ctx->builder, ctx->ring_offsets,
569                                                              const_array(ctx->v16i8, 8), "");
570                 } else
571                         ctx->ring_offsets = LLVMGetParam(ctx->main_function, arg_idx++);
572         }
573
574         for (unsigned i = 0; i < num_sets; ++i) {
575                 if (ctx->options->layout->set[i].layout->shader_stages & (1 << ctx->stage)) {
576                         set_userdata_location(&ctx->shader_info->user_sgprs_locs.descriptor_sets[i], user_sgpr_idx, 2);
577                         user_sgpr_idx += 2;
578                         ctx->descriptor_sets[i] =
579                                 LLVMGetParam(ctx->main_function, arg_idx++);
580                 } else
581                         ctx->descriptor_sets[i] = NULL;
582         }
583
584         if (need_push_constants) {
585                 ctx->push_constants = LLVMGetParam(ctx->main_function, arg_idx++);
586                 set_userdata_location_shader(ctx, AC_UD_PUSH_CONSTANTS, user_sgpr_idx, 2);
587                 user_sgpr_idx += 2;
588         }
589
590         switch (ctx->stage) {
591         case MESA_SHADER_COMPUTE:
592                 set_userdata_location_shader(ctx, AC_UD_CS_GRID_SIZE, user_sgpr_idx, 3);
593                 user_sgpr_idx += 3;
594                 ctx->num_work_groups =
595                     LLVMGetParam(ctx->main_function, arg_idx++);
596                 ctx->workgroup_ids =
597                     LLVMGetParam(ctx->main_function, arg_idx++);
598                 ctx->tg_size =
599                     LLVMGetParam(ctx->main_function, arg_idx++);
600                 ctx->local_invocation_ids =
601                     LLVMGetParam(ctx->main_function, arg_idx++);
602                 break;
603         case MESA_SHADER_VERTEX:
604                 if (!ctx->is_gs_copy_shader) {
605                         set_userdata_location_shader(ctx, AC_UD_VS_VERTEX_BUFFERS, user_sgpr_idx, 2);
606                         user_sgpr_idx += 2;
607                         ctx->vertex_buffers = LLVMGetParam(ctx->main_function, arg_idx++);
608                         set_userdata_location_shader(ctx, AC_UD_VS_BASE_VERTEX_START_INSTANCE, user_sgpr_idx, 3);
609                         user_sgpr_idx += 3;
610                         ctx->base_vertex = LLVMGetParam(ctx->main_function, arg_idx++);
611                         ctx->start_instance = LLVMGetParam(ctx->main_function, arg_idx++);
612                         ctx->draw_index = LLVMGetParam(ctx->main_function, arg_idx++);
613                 }
614                 if (ctx->options->key.vs.as_es)
615                         ctx->es2gs_offset = LLVMGetParam(ctx->main_function, arg_idx++);
616                 ctx->vertex_id = LLVMGetParam(ctx->main_function, arg_idx++);
617                 if (!ctx->is_gs_copy_shader) {
618                         ctx->rel_auto_id = LLVMGetParam(ctx->main_function, arg_idx++);
619                         ctx->vs_prim_id = LLVMGetParam(ctx->main_function, arg_idx++);
620                         ctx->instance_id = LLVMGetParam(ctx->main_function, arg_idx++);
621                 }
622                 break;
623         case MESA_SHADER_GEOMETRY:
624                 set_userdata_location_shader(ctx, AC_UD_GS_VS_RING_STRIDE_ENTRIES, user_sgpr_idx, 2);
625                 user_sgpr_idx += 2;
626                 ctx->gsvs_ring_stride = LLVMGetParam(ctx->main_function, arg_idx++);
627                 ctx->gsvs_num_entries = LLVMGetParam(ctx->main_function, arg_idx++);
628                 ctx->gs2vs_offset = LLVMGetParam(ctx->main_function, arg_idx++);
629                 ctx->gs_wave_id = LLVMGetParam(ctx->main_function, arg_idx++);
630                 ctx->gs_vtx_offset[0] = LLVMGetParam(ctx->main_function, arg_idx++);
631                 ctx->gs_vtx_offset[1] = LLVMGetParam(ctx->main_function, arg_idx++);
632                 ctx->gs_prim_id = LLVMGetParam(ctx->main_function, arg_idx++);
633                 ctx->gs_vtx_offset[2] = LLVMGetParam(ctx->main_function, arg_idx++);
634                 ctx->gs_vtx_offset[3] = LLVMGetParam(ctx->main_function, arg_idx++);
635                 ctx->gs_vtx_offset[4] = LLVMGetParam(ctx->main_function, arg_idx++);
636                 ctx->gs_vtx_offset[5] = LLVMGetParam(ctx->main_function, arg_idx++);
637                 ctx->gs_invocation_id = LLVMGetParam(ctx->main_function, arg_idx++);
638                 break;
639         case MESA_SHADER_FRAGMENT:
640                 set_userdata_location_shader(ctx, AC_UD_PS_SAMPLE_POS, user_sgpr_idx, 2);
641                 user_sgpr_idx += 2;
642                 ctx->sample_positions = LLVMGetParam(ctx->main_function, arg_idx++);
643                 ctx->prim_mask = LLVMGetParam(ctx->main_function, arg_idx++);
644                 ctx->persp_sample = LLVMGetParam(ctx->main_function, arg_idx++);
645                 ctx->persp_center = LLVMGetParam(ctx->main_function, arg_idx++);
646                 ctx->persp_centroid = LLVMGetParam(ctx->main_function, arg_idx++);
647                 arg_idx++;
648                 ctx->linear_sample = LLVMGetParam(ctx->main_function, arg_idx++);
649                 ctx->linear_center = LLVMGetParam(ctx->main_function, arg_idx++);
650                 ctx->linear_centroid = LLVMGetParam(ctx->main_function, arg_idx++);
651                 arg_idx++; /* line stipple */
652                 ctx->frag_pos[0] = LLVMGetParam(ctx->main_function, arg_idx++);
653                 ctx->frag_pos[1] = LLVMGetParam(ctx->main_function, arg_idx++);
654                 ctx->frag_pos[2] = LLVMGetParam(ctx->main_function, arg_idx++);
655                 ctx->frag_pos[3] = LLVMGetParam(ctx->main_function, arg_idx++);
656                 ctx->front_face = LLVMGetParam(ctx->main_function, arg_idx++);
657                 ctx->ancillary = LLVMGetParam(ctx->main_function, arg_idx++);
658                 ctx->sample_coverage = LLVMGetParam(ctx->main_function, arg_idx++);
659                 break;
660         default:
661                 unreachable("Shader stage not implemented");
662         }
663 }
664
665 static void setup_types(struct nir_to_llvm_context *ctx)
666 {
667         LLVMValueRef args[4];
668
669         ctx->voidt = LLVMVoidTypeInContext(ctx->context);
670         ctx->i1 = LLVMIntTypeInContext(ctx->context, 1);
671         ctx->i8 = LLVMIntTypeInContext(ctx->context, 8);
672         ctx->i16 = LLVMIntTypeInContext(ctx->context, 16);
673         ctx->i32 = LLVMIntTypeInContext(ctx->context, 32);
674         ctx->i64 = LLVMIntTypeInContext(ctx->context, 64);
675         ctx->v2i32 = LLVMVectorType(ctx->i32, 2);
676         ctx->v3i32 = LLVMVectorType(ctx->i32, 3);
677         ctx->v4i32 = LLVMVectorType(ctx->i32, 4);
678         ctx->v8i32 = LLVMVectorType(ctx->i32, 8);
679         ctx->f32 = LLVMFloatTypeInContext(ctx->context);
680         ctx->f16 = LLVMHalfTypeInContext(ctx->context);
681         ctx->f64 = LLVMDoubleTypeInContext(ctx->context);
682         ctx->v2f32 = LLVMVectorType(ctx->f32, 2);
683         ctx->v4f32 = LLVMVectorType(ctx->f32, 4);
684         ctx->v16i8 = LLVMVectorType(ctx->i8, 16);
685
686         ctx->i1false = LLVMConstInt(ctx->i1, 0, false);
687         ctx->i1true = LLVMConstInt(ctx->i1, 1, false);
688         ctx->i32zero = LLVMConstInt(ctx->i32, 0, false);
689         ctx->i32one = LLVMConstInt(ctx->i32, 1, false);
690         ctx->f32zero = LLVMConstReal(ctx->f32, 0.0);
691         ctx->f32one = LLVMConstReal(ctx->f32, 1.0);
692
693         args[0] = ctx->f32zero;
694         args[1] = ctx->f32zero;
695         args[2] = ctx->f32zero;
696         args[3] = ctx->f32one;
697         ctx->v4f32empty = LLVMConstVector(args, 4);
698
699         ctx->uniform_md_kind =
700             LLVMGetMDKindIDInContext(ctx->context, "amdgpu.uniform", 14);
701         ctx->empty_md = LLVMMDNodeInContext(ctx->context, NULL, 0);
702
703         args[0] = LLVMConstReal(ctx->f32, 2.5);
704 }
705
706 static int get_llvm_num_components(LLVMValueRef value)
707 {
708         LLVMTypeRef type = LLVMTypeOf(value);
709         unsigned num_components = LLVMGetTypeKind(type) == LLVMVectorTypeKind
710                                       ? LLVMGetVectorSize(type)
711                                       : 1;
712         return num_components;
713 }
714
715 static LLVMValueRef llvm_extract_elem(struct nir_to_llvm_context *ctx,
716                                       LLVMValueRef value,
717                                       int index)
718 {
719         int count = get_llvm_num_components(value);
720
721         assert(index < count);
722         if (count == 1)
723                 return value;
724
725         return LLVMBuildExtractElement(ctx->builder, value,
726                                        LLVMConstInt(ctx->i32, index, false), "");
727 }
728
729 static LLVMValueRef trim_vector(struct nir_to_llvm_context *ctx,
730                                 LLVMValueRef value, unsigned count)
731 {
732         unsigned num_components = get_llvm_num_components(value);
733         if (count == num_components)
734                 return value;
735
736         LLVMValueRef masks[] = {
737             LLVMConstInt(ctx->i32, 0, false), LLVMConstInt(ctx->i32, 1, false),
738             LLVMConstInt(ctx->i32, 2, false), LLVMConstInt(ctx->i32, 3, false)};
739
740         if (count == 1)
741                 return LLVMBuildExtractElement(ctx->builder, value, masks[0],
742                                                "");
743
744         LLVMValueRef swizzle = LLVMConstVector(masks, count);
745         return LLVMBuildShuffleVector(ctx->builder, value, value, swizzle, "");
746 }
747
748 static void
749 build_store_values_extended(struct nir_to_llvm_context *ctx,
750                              LLVMValueRef *values,
751                              unsigned value_count,
752                              unsigned value_stride,
753                              LLVMValueRef vec)
754 {
755         LLVMBuilderRef builder = ctx->builder;
756         unsigned i;
757
758         if (value_count == 1) {
759                 LLVMBuildStore(builder, vec, values[0]);
760                 return;
761         }
762
763         for (i = 0; i < value_count; i++) {
764                 LLVMValueRef ptr = values[i * value_stride];
765                 LLVMValueRef index = LLVMConstInt(ctx->i32, i, false);
766                 LLVMValueRef value = LLVMBuildExtractElement(builder, vec, index, "");
767                 LLVMBuildStore(builder, value, ptr);
768         }
769 }
770
771 static LLVMTypeRef get_def_type(struct nir_to_llvm_context *ctx,
772                                 nir_ssa_def *def)
773 {
774         LLVMTypeRef type = LLVMIntTypeInContext(ctx->context, def->bit_size);
775         if (def->num_components > 1) {
776                 type = LLVMVectorType(type, def->num_components);
777         }
778         return type;
779 }
780
781 static LLVMValueRef get_src(struct nir_to_llvm_context *ctx, nir_src src)
782 {
783         assert(src.is_ssa);
784         struct hash_entry *entry = _mesa_hash_table_search(ctx->defs, src.ssa);
785         return (LLVMValueRef)entry->data;
786 }
787
788
789 static LLVMBasicBlockRef get_block(struct nir_to_llvm_context *ctx,
790                                    struct nir_block *b)
791 {
792         struct hash_entry *entry = _mesa_hash_table_search(ctx->defs, b);
793         return (LLVMBasicBlockRef)entry->data;
794 }
795
796 static LLVMValueRef get_alu_src(struct nir_to_llvm_context *ctx,
797                                 nir_alu_src src,
798                                 unsigned num_components)
799 {
800         LLVMValueRef value = get_src(ctx, src.src);
801         bool need_swizzle = false;
802
803         assert(value);
804         LLVMTypeRef type = LLVMTypeOf(value);
805         unsigned src_components = LLVMGetTypeKind(type) == LLVMVectorTypeKind
806                                       ? LLVMGetVectorSize(type)
807                                       : 1;
808
809         for (unsigned i = 0; i < num_components; ++i) {
810                 assert(src.swizzle[i] < src_components);
811                 if (src.swizzle[i] != i)
812                         need_swizzle = true;
813         }
814
815         if (need_swizzle || num_components != src_components) {
816                 LLVMValueRef masks[] = {
817                     LLVMConstInt(ctx->i32, src.swizzle[0], false),
818                     LLVMConstInt(ctx->i32, src.swizzle[1], false),
819                     LLVMConstInt(ctx->i32, src.swizzle[2], false),
820                     LLVMConstInt(ctx->i32, src.swizzle[3], false)};
821
822                 if (src_components > 1 && num_components == 1) {
823                         value = LLVMBuildExtractElement(ctx->builder, value,
824                                                         masks[0], "");
825                 } else if (src_components == 1 && num_components > 1) {
826                         LLVMValueRef values[] = {value, value, value, value};
827                         value = ac_build_gather_values(&ctx->ac, values, num_components);
828                 } else {
829                         LLVMValueRef swizzle = LLVMConstVector(masks, num_components);
830                         value = LLVMBuildShuffleVector(ctx->builder, value, value,
831                                                        swizzle, "");
832                 }
833         }
834         assert(!src.negate);
835         assert(!src.abs);
836         return value;
837 }
838
839 static LLVMValueRef emit_int_cmp(struct nir_to_llvm_context *ctx,
840                                  LLVMIntPredicate pred, LLVMValueRef src0,
841                                  LLVMValueRef src1)
842 {
843         LLVMValueRef result = LLVMBuildICmp(ctx->builder, pred, src0, src1, "");
844         return LLVMBuildSelect(ctx->builder, result,
845                                LLVMConstInt(ctx->i32, 0xFFFFFFFF, false),
846                                LLVMConstInt(ctx->i32, 0, false), "");
847 }
848
849 static LLVMValueRef emit_float_cmp(struct nir_to_llvm_context *ctx,
850                                    LLVMRealPredicate pred, LLVMValueRef src0,
851                                    LLVMValueRef src1)
852 {
853         LLVMValueRef result;
854         src0 = to_float(ctx, src0);
855         src1 = to_float(ctx, src1);
856         result = LLVMBuildFCmp(ctx->builder, pred, src0, src1, "");
857         return LLVMBuildSelect(ctx->builder, result,
858                                LLVMConstInt(ctx->i32, 0xFFFFFFFF, false),
859                                LLVMConstInt(ctx->i32, 0, false), "");
860 }
861
862 static LLVMValueRef emit_intrin_1f_param(struct nir_to_llvm_context *ctx,
863                                          const char *intrin,
864                                          LLVMTypeRef result_type,
865                                          LLVMValueRef src0)
866 {
867         char name[64];
868         LLVMValueRef params[] = {
869                 to_float(ctx, src0),
870         };
871
872         sprintf(name, "%s.f%d", intrin, get_elem_bits(ctx, result_type));
873         return ac_build_intrinsic(&ctx->ac, name, result_type, params, 1, AC_FUNC_ATTR_READNONE);
874 }
875
876 static LLVMValueRef emit_intrin_2f_param(struct nir_to_llvm_context *ctx,
877                                        const char *intrin,
878                                        LLVMTypeRef result_type,
879                                        LLVMValueRef src0, LLVMValueRef src1)
880 {
881         char name[64];
882         LLVMValueRef params[] = {
883                 to_float(ctx, src0),
884                 to_float(ctx, src1),
885         };
886
887         sprintf(name, "%s.f%d", intrin, get_elem_bits(ctx, result_type));
888         return ac_build_intrinsic(&ctx->ac, name, result_type, params, 2, AC_FUNC_ATTR_READNONE);
889 }
890
891 static LLVMValueRef emit_intrin_3f_param(struct nir_to_llvm_context *ctx,
892                                          const char *intrin,
893                                          LLVMTypeRef result_type,
894                                          LLVMValueRef src0, LLVMValueRef src1, LLVMValueRef src2)
895 {
896         char name[64];
897         LLVMValueRef params[] = {
898                 to_float(ctx, src0),
899                 to_float(ctx, src1),
900                 to_float(ctx, src2),
901         };
902
903         sprintf(name, "%s.f%d", intrin, get_elem_bits(ctx, result_type));
904         return ac_build_intrinsic(&ctx->ac, name, result_type, params, 3, AC_FUNC_ATTR_READNONE);
905 }
906
907 static LLVMValueRef emit_bcsel(struct nir_to_llvm_context *ctx,
908                                LLVMValueRef src0, LLVMValueRef src1, LLVMValueRef src2)
909 {
910         LLVMValueRef v = LLVMBuildICmp(ctx->builder, LLVMIntNE, src0,
911                                        ctx->i32zero, "");
912         return LLVMBuildSelect(ctx->builder, v, src1, src2, "");
913 }
914
915 static LLVMValueRef emit_find_lsb(struct nir_to_llvm_context *ctx,
916                                   LLVMValueRef src0)
917 {
918         LLVMValueRef params[2] = {
919                 src0,
920
921                 /* The value of 1 means that ffs(x=0) = undef, so LLVM won't
922                  * add special code to check for x=0. The reason is that
923                  * the LLVM behavior for x=0 is different from what we
924                  * need here.
925                  *
926                  * The hardware already implements the correct behavior.
927                  */
928                 LLVMConstInt(ctx->i32, 1, false),
929         };
930         return ac_build_intrinsic(&ctx->ac, "llvm.cttz.i32", ctx->i32, params, 2, AC_FUNC_ATTR_READNONE);
931 }
932
933 static LLVMValueRef emit_ifind_msb(struct nir_to_llvm_context *ctx,
934                                    LLVMValueRef src0)
935 {
936         return ac_build_imsb(&ctx->ac, src0, ctx->i32);
937 }
938
939 static LLVMValueRef emit_ufind_msb(struct nir_to_llvm_context *ctx,
940                                    LLVMValueRef src0)
941 {
942         return ac_build_umsb(&ctx->ac, src0, ctx->i32);
943 }
944
945 static LLVMValueRef emit_minmax_int(struct nir_to_llvm_context *ctx,
946                                     LLVMIntPredicate pred,
947                                     LLVMValueRef src0, LLVMValueRef src1)
948 {
949         return LLVMBuildSelect(ctx->builder,
950                                LLVMBuildICmp(ctx->builder, pred, src0, src1, ""),
951                                src0,
952                                src1, "");
953
954 }
955 static LLVMValueRef emit_iabs(struct nir_to_llvm_context *ctx,
956                               LLVMValueRef src0)
957 {
958         return emit_minmax_int(ctx, LLVMIntSGT, src0,
959                                LLVMBuildNeg(ctx->builder, src0, ""));
960 }
961
962 static LLVMValueRef emit_fsign(struct nir_to_llvm_context *ctx,
963                                LLVMValueRef src0)
964 {
965         LLVMValueRef cmp, val;
966
967         cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGT, src0, ctx->f32zero, "");
968         val = LLVMBuildSelect(ctx->builder, cmp, ctx->f32one, src0, "");
969         cmp = LLVMBuildFCmp(ctx->builder, LLVMRealOGE, val, ctx->f32zero, "");
970         val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstReal(ctx->f32, -1.0), "");
971         return val;
972 }
973
974 static LLVMValueRef emit_isign(struct nir_to_llvm_context *ctx,
975                                LLVMValueRef src0)
976 {
977         LLVMValueRef cmp, val;
978
979         cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGT, src0, ctx->i32zero, "");
980         val = LLVMBuildSelect(ctx->builder, cmp, ctx->i32one, src0, "");
981         cmp = LLVMBuildICmp(ctx->builder, LLVMIntSGE, val, ctx->i32zero, "");
982         val = LLVMBuildSelect(ctx->builder, cmp, val, LLVMConstInt(ctx->i32, -1, true), "");
983         return val;
984 }
985
986 static LLVMValueRef emit_ffract(struct nir_to_llvm_context *ctx,
987                                 LLVMValueRef src0)
988 {
989         const char *intr = "llvm.floor.f32";
990         LLVMValueRef fsrc0 = to_float(ctx, src0);
991         LLVMValueRef params[] = {
992                 fsrc0,
993         };
994         LLVMValueRef floor = ac_build_intrinsic(&ctx->ac, intr,
995                                                 ctx->f32, params, 1,
996                                                 AC_FUNC_ATTR_READNONE);
997         return LLVMBuildFSub(ctx->builder, fsrc0, floor, "");
998 }
999
1000 static LLVMValueRef emit_uint_carry(struct nir_to_llvm_context *ctx,
1001                                     const char *intrin,
1002                                     LLVMValueRef src0, LLVMValueRef src1)
1003 {
1004         LLVMTypeRef ret_type;
1005         LLVMTypeRef types[] = { ctx->i32, ctx->i1 };
1006         LLVMValueRef res;
1007         LLVMValueRef params[] = { src0, src1 };
1008         ret_type = LLVMStructTypeInContext(ctx->context, types,
1009                                            2, true);
1010
1011         res = ac_build_intrinsic(&ctx->ac, intrin, ret_type,
1012                                  params, 2, AC_FUNC_ATTR_READNONE);
1013
1014         res = LLVMBuildExtractValue(ctx->builder, res, 1, "");
1015         res = LLVMBuildZExt(ctx->builder, res, ctx->i32, "");
1016         return res;
1017 }
1018
1019 static LLVMValueRef emit_b2f(struct nir_to_llvm_context *ctx,
1020                              LLVMValueRef src0)
1021 {
1022         return LLVMBuildAnd(ctx->builder, src0, LLVMBuildBitCast(ctx->builder, LLVMConstReal(ctx->f32, 1.0), ctx->i32, ""), "");
1023 }
1024
1025 static LLVMValueRef emit_umul_high(struct nir_to_llvm_context *ctx,
1026                                    LLVMValueRef src0, LLVMValueRef src1)
1027 {
1028         LLVMValueRef dst64, result;
1029         src0 = LLVMBuildZExt(ctx->builder, src0, ctx->i64, "");
1030         src1 = LLVMBuildZExt(ctx->builder, src1, ctx->i64, "");
1031
1032         dst64 = LLVMBuildMul(ctx->builder, src0, src1, "");
1033         dst64 = LLVMBuildLShr(ctx->builder, dst64, LLVMConstInt(ctx->i64, 32, false), "");
1034         result = LLVMBuildTrunc(ctx->builder, dst64, ctx->i32, "");
1035         return result;
1036 }
1037
1038 static LLVMValueRef emit_imul_high(struct nir_to_llvm_context *ctx,
1039                                    LLVMValueRef src0, LLVMValueRef src1)
1040 {
1041         LLVMValueRef dst64, result;
1042         src0 = LLVMBuildSExt(ctx->builder, src0, ctx->i64, "");
1043         src1 = LLVMBuildSExt(ctx->builder, src1, ctx->i64, "");
1044
1045         dst64 = LLVMBuildMul(ctx->builder, src0, src1, "");
1046         dst64 = LLVMBuildAShr(ctx->builder, dst64, LLVMConstInt(ctx->i64, 32, false), "");
1047         result = LLVMBuildTrunc(ctx->builder, dst64, ctx->i32, "");
1048         return result;
1049 }
1050
1051 static LLVMValueRef emit_bitfield_extract(struct nir_to_llvm_context *ctx,
1052                                           bool is_signed,
1053                                           LLVMValueRef srcs[3])
1054 {
1055         LLVMValueRef result;
1056         LLVMValueRef icond = LLVMBuildICmp(ctx->builder, LLVMIntEQ, srcs[2], LLVMConstInt(ctx->i32, 32, false), "");
1057
1058         result = ac_build_bfe(&ctx->ac, srcs[0], srcs[1], srcs[2], is_signed);
1059         result = LLVMBuildSelect(ctx->builder, icond, srcs[0], result, "");
1060         return result;
1061 }
1062
1063 static LLVMValueRef emit_bitfield_insert(struct nir_to_llvm_context *ctx,
1064                                          LLVMValueRef src0, LLVMValueRef src1,
1065                                          LLVMValueRef src2, LLVMValueRef src3)
1066 {
1067         LLVMValueRef bfi_args[3], result;
1068
1069         bfi_args[0] = LLVMBuildShl(ctx->builder,
1070                                    LLVMBuildSub(ctx->builder,
1071                                                 LLVMBuildShl(ctx->builder,
1072                                                              ctx->i32one,
1073                                                              src3, ""),
1074                                                 ctx->i32one, ""),
1075                                    src2, "");
1076         bfi_args[1] = LLVMBuildShl(ctx->builder, src1, src2, "");
1077         bfi_args[2] = src0;
1078
1079         LLVMValueRef icond = LLVMBuildICmp(ctx->builder, LLVMIntEQ, src3, LLVMConstInt(ctx->i32, 32, false), "");
1080
1081         /* Calculate:
1082          *   (arg0 & arg1) | (~arg0 & arg2) = arg2 ^ (arg0 & (arg1 ^ arg2)
1083          * Use the right-hand side, which the LLVM backend can convert to V_BFI.
1084          */
1085         result = LLVMBuildXor(ctx->builder, bfi_args[2],
1086                               LLVMBuildAnd(ctx->builder, bfi_args[0],
1087                                            LLVMBuildXor(ctx->builder, bfi_args[1], bfi_args[2], ""), ""), "");
1088
1089         result = LLVMBuildSelect(ctx->builder, icond, src1, result, "");
1090         return result;
1091 }
1092
1093 static LLVMValueRef emit_pack_half_2x16(struct nir_to_llvm_context *ctx,
1094                                         LLVMValueRef src0)
1095 {
1096         LLVMValueRef const16 = LLVMConstInt(ctx->i32, 16, false);
1097         int i;
1098         LLVMValueRef comp[2];
1099
1100         src0 = to_float(ctx, src0);
1101         comp[0] = LLVMBuildExtractElement(ctx->builder, src0, ctx->i32zero, "");
1102         comp[1] = LLVMBuildExtractElement(ctx->builder, src0, ctx->i32one, "");
1103         for (i = 0; i < 2; i++) {
1104                 comp[i] = LLVMBuildFPTrunc(ctx->builder, comp[i], ctx->f16, "");
1105                 comp[i] = LLVMBuildBitCast(ctx->builder, comp[i], ctx->i16, "");
1106                 comp[i] = LLVMBuildZExt(ctx->builder, comp[i], ctx->i32, "");
1107         }
1108
1109         comp[1] = LLVMBuildShl(ctx->builder, comp[1], const16, "");
1110         comp[0] = LLVMBuildOr(ctx->builder, comp[0], comp[1], "");
1111
1112         return comp[0];
1113 }
1114
1115 static LLVMValueRef emit_unpack_half_2x16(struct nir_to_llvm_context *ctx,
1116                                           LLVMValueRef src0)
1117 {
1118         LLVMValueRef const16 = LLVMConstInt(ctx->i32, 16, false);
1119         LLVMValueRef temps[2], result, val;
1120         int i;
1121
1122         for (i = 0; i < 2; i++) {
1123                 val = i == 1 ? LLVMBuildLShr(ctx->builder, src0, const16, "") : src0;
1124                 val = LLVMBuildTrunc(ctx->builder, val, ctx->i16, "");
1125                 val = LLVMBuildBitCast(ctx->builder, val, ctx->f16, "");
1126                 temps[i] = LLVMBuildFPExt(ctx->builder, val, ctx->f32, "");
1127         }
1128
1129         result = LLVMBuildInsertElement(ctx->builder, LLVMGetUndef(ctx->v2f32), temps[0],
1130                                         ctx->i32zero, "");
1131         result = LLVMBuildInsertElement(ctx->builder, result, temps[1],
1132                                         ctx->i32one, "");
1133         return result;
1134 }
1135
1136 static LLVMValueRef emit_ddxy(struct nir_to_llvm_context *ctx,
1137                               nir_op op,
1138                               LLVMValueRef src0)
1139 {
1140         unsigned mask;
1141         int idx;
1142         LLVMValueRef result;
1143         ctx->has_ddxy = true;
1144
1145         if (!ctx->lds && !ctx->has_ds_bpermute)
1146                 ctx->lds = LLVMAddGlobalInAddressSpace(ctx->module,
1147                                                        LLVMArrayType(ctx->i32, 64),
1148                                                        "ddxy_lds", LOCAL_ADDR_SPACE);
1149
1150         if (op == nir_op_fddx_fine || op == nir_op_fddx)
1151                 mask = AC_TID_MASK_LEFT;
1152         else if (op == nir_op_fddy_fine || op == nir_op_fddy)
1153                 mask = AC_TID_MASK_TOP;
1154         else
1155                 mask = AC_TID_MASK_TOP_LEFT;
1156
1157         /* for DDX we want to next X pixel, DDY next Y pixel. */
1158         if (op == nir_op_fddx_fine ||
1159             op == nir_op_fddx_coarse ||
1160             op == nir_op_fddx)
1161                 idx = 1;
1162         else
1163                 idx = 2;
1164
1165         result = ac_build_ddxy(&ctx->ac, ctx->has_ds_bpermute,
1166                               mask, idx, ctx->lds,
1167                               src0);
1168         return result;
1169 }
1170
1171 /*
1172  * this takes an I,J coordinate pair,
1173  * and works out the X and Y derivatives.
1174  * it returns DDX(I), DDX(J), DDY(I), DDY(J).
1175  */
1176 static LLVMValueRef emit_ddxy_interp(
1177         struct nir_to_llvm_context *ctx,
1178         LLVMValueRef interp_ij)
1179 {
1180         LLVMValueRef result[4], a;
1181         unsigned i;
1182
1183         for (i = 0; i < 2; i++) {
1184                 a = LLVMBuildExtractElement(ctx->builder, interp_ij,
1185                                             LLVMConstInt(ctx->i32, i, false), "");
1186                 result[i] = emit_ddxy(ctx, nir_op_fddx, a);
1187                 result[2+i] = emit_ddxy(ctx, nir_op_fddy, a);
1188         }
1189         return ac_build_gather_values(&ctx->ac, result, 4);
1190 }
1191
1192 static void visit_alu(struct nir_to_llvm_context *ctx, nir_alu_instr *instr)
1193 {
1194         LLVMValueRef src[4], result = NULL;
1195         unsigned num_components = instr->dest.dest.ssa.num_components;
1196         unsigned src_components;
1197         LLVMTypeRef def_type = get_def_type(ctx, &instr->dest.dest.ssa);
1198
1199         assert(nir_op_infos[instr->op].num_inputs <= ARRAY_SIZE(src));
1200         switch (instr->op) {
1201         case nir_op_vec2:
1202         case nir_op_vec3:
1203         case nir_op_vec4:
1204                 src_components = 1;
1205                 break;
1206         case nir_op_pack_half_2x16:
1207                 src_components = 2;
1208                 break;
1209         case nir_op_unpack_half_2x16:
1210                 src_components = 1;
1211                 break;
1212         default:
1213                 src_components = num_components;
1214                 break;
1215         }
1216         for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1217                 src[i] = get_alu_src(ctx, instr->src[i], src_components);
1218
1219         switch (instr->op) {
1220         case nir_op_fmov:
1221         case nir_op_imov:
1222                 result = src[0];
1223                 break;
1224         case nir_op_fneg:
1225                 src[0] = to_float(ctx, src[0]);
1226                 result = LLVMBuildFNeg(ctx->builder, src[0], "");
1227                 break;
1228         case nir_op_ineg:
1229                 result = LLVMBuildNeg(ctx->builder, src[0], "");
1230                 break;
1231         case nir_op_inot:
1232                 result = LLVMBuildNot(ctx->builder, src[0], "");
1233                 break;
1234         case nir_op_iadd:
1235                 result = LLVMBuildAdd(ctx->builder, src[0], src[1], "");
1236                 break;
1237         case nir_op_fadd:
1238                 src[0] = to_float(ctx, src[0]);
1239                 src[1] = to_float(ctx, src[1]);
1240                 result = LLVMBuildFAdd(ctx->builder, src[0], src[1], "");
1241                 break;
1242         case nir_op_fsub:
1243                 src[0] = to_float(ctx, src[0]);
1244                 src[1] = to_float(ctx, src[1]);
1245                 result = LLVMBuildFSub(ctx->builder, src[0], src[1], "");
1246                 break;
1247         case nir_op_isub:
1248                 result = LLVMBuildSub(ctx->builder, src[0], src[1], "");
1249                 break;
1250         case nir_op_imul:
1251                 result = LLVMBuildMul(ctx->builder, src[0], src[1], "");
1252                 break;
1253         case nir_op_imod:
1254                 result = LLVMBuildSRem(ctx->builder, src[0], src[1], "");
1255                 break;
1256         case nir_op_umod:
1257                 result = LLVMBuildURem(ctx->builder, src[0], src[1], "");
1258                 break;
1259         case nir_op_fmod:
1260                 src[0] = to_float(ctx, src[0]);
1261                 src[1] = to_float(ctx, src[1]);
1262                 result = ac_build_fdiv(&ctx->ac, src[0], src[1]);
1263                 result = emit_intrin_1f_param(ctx, "llvm.floor",
1264                                               to_float_type(ctx, def_type), result);
1265                 result = LLVMBuildFMul(ctx->builder, src[1] , result, "");
1266                 result = LLVMBuildFSub(ctx->builder, src[0], result, "");
1267                 break;
1268         case nir_op_frem:
1269                 src[0] = to_float(ctx, src[0]);
1270                 src[1] = to_float(ctx, src[1]);
1271                 result = LLVMBuildFRem(ctx->builder, src[0], src[1], "");
1272                 break;
1273         case nir_op_irem:
1274                 result = LLVMBuildSRem(ctx->builder, src[0], src[1], "");
1275                 break;
1276         case nir_op_idiv:
1277                 result = LLVMBuildSDiv(ctx->builder, src[0], src[1], "");
1278                 break;
1279         case nir_op_udiv:
1280                 result = LLVMBuildUDiv(ctx->builder, src[0], src[1], "");
1281                 break;
1282         case nir_op_fmul:
1283                 src[0] = to_float(ctx, src[0]);
1284                 src[1] = to_float(ctx, src[1]);
1285                 result = LLVMBuildFMul(ctx->builder, src[0], src[1], "");
1286                 break;
1287         case nir_op_fdiv:
1288                 src[0] = to_float(ctx, src[0]);
1289                 src[1] = to_float(ctx, src[1]);
1290                 result = ac_build_fdiv(&ctx->ac, src[0], src[1]);
1291                 break;
1292         case nir_op_frcp:
1293                 src[0] = to_float(ctx, src[0]);
1294                 result = ac_build_fdiv(&ctx->ac, ctx->f32one, src[0]);
1295                 break;
1296         case nir_op_iand:
1297                 result = LLVMBuildAnd(ctx->builder, src[0], src[1], "");
1298                 break;
1299         case nir_op_ior:
1300                 result = LLVMBuildOr(ctx->builder, src[0], src[1], "");
1301                 break;
1302         case nir_op_ixor:
1303                 result = LLVMBuildXor(ctx->builder, src[0], src[1], "");
1304                 break;
1305         case nir_op_ishl:
1306                 result = LLVMBuildShl(ctx->builder, src[0], src[1], "");
1307                 break;
1308         case nir_op_ishr:
1309                 result = LLVMBuildAShr(ctx->builder, src[0], src[1], "");
1310                 break;
1311         case nir_op_ushr:
1312                 result = LLVMBuildLShr(ctx->builder, src[0], src[1], "");
1313                 break;
1314         case nir_op_ilt:
1315                 result = emit_int_cmp(ctx, LLVMIntSLT, src[0], src[1]);
1316                 break;
1317         case nir_op_ine:
1318                 result = emit_int_cmp(ctx, LLVMIntNE, src[0], src[1]);
1319                 break;
1320         case nir_op_ieq:
1321                 result = emit_int_cmp(ctx, LLVMIntEQ, src[0], src[1]);
1322                 break;
1323         case nir_op_ige:
1324                 result = emit_int_cmp(ctx, LLVMIntSGE, src[0], src[1]);
1325                 break;
1326         case nir_op_ult:
1327                 result = emit_int_cmp(ctx, LLVMIntULT, src[0], src[1]);
1328                 break;
1329         case nir_op_uge:
1330                 result = emit_int_cmp(ctx, LLVMIntUGE, src[0], src[1]);
1331                 break;
1332         case nir_op_feq:
1333                 result = emit_float_cmp(ctx, LLVMRealUEQ, src[0], src[1]);
1334                 break;
1335         case nir_op_fne:
1336                 result = emit_float_cmp(ctx, LLVMRealUNE, src[0], src[1]);
1337                 break;
1338         case nir_op_flt:
1339                 result = emit_float_cmp(ctx, LLVMRealULT, src[0], src[1]);
1340                 break;
1341         case nir_op_fge:
1342                 result = emit_float_cmp(ctx, LLVMRealUGE, src[0], src[1]);
1343                 break;
1344         case nir_op_fabs:
1345                 result = emit_intrin_1f_param(ctx, "llvm.fabs",
1346                                               to_float_type(ctx, def_type), src[0]);
1347                 break;
1348         case nir_op_iabs:
1349                 result = emit_iabs(ctx, src[0]);
1350                 break;
1351         case nir_op_imax:
1352                 result = emit_minmax_int(ctx, LLVMIntSGT, src[0], src[1]);
1353                 break;
1354         case nir_op_imin:
1355                 result = emit_minmax_int(ctx, LLVMIntSLT, src[0], src[1]);
1356                 break;
1357         case nir_op_umax:
1358                 result = emit_minmax_int(ctx, LLVMIntUGT, src[0], src[1]);
1359                 break;
1360         case nir_op_umin:
1361                 result = emit_minmax_int(ctx, LLVMIntULT, src[0], src[1]);
1362                 break;
1363         case nir_op_isign:
1364                 result = emit_isign(ctx, src[0]);
1365                 break;
1366         case nir_op_fsign:
1367                 src[0] = to_float(ctx, src[0]);
1368                 result = emit_fsign(ctx, src[0]);
1369                 break;
1370         case nir_op_ffloor:
1371                 result = emit_intrin_1f_param(ctx, "llvm.floor",
1372                                               to_float_type(ctx, def_type), src[0]);
1373                 break;
1374         case nir_op_ftrunc:
1375                 result = emit_intrin_1f_param(ctx, "llvm.trunc",
1376                                               to_float_type(ctx, def_type), src[0]);
1377                 break;
1378         case nir_op_fceil:
1379                 result = emit_intrin_1f_param(ctx, "llvm.ceil",
1380                                               to_float_type(ctx, def_type), src[0]);
1381                 break;
1382         case nir_op_fround_even:
1383                 result = emit_intrin_1f_param(ctx, "llvm.rint",
1384                                               to_float_type(ctx, def_type),src[0]);
1385                 break;
1386         case nir_op_ffract:
1387                 result = emit_ffract(ctx, src[0]);
1388                 break;
1389         case nir_op_fsin:
1390                 result = emit_intrin_1f_param(ctx, "llvm.sin",
1391                                               to_float_type(ctx, def_type), src[0]);
1392                 break;
1393         case nir_op_fcos:
1394                 result = emit_intrin_1f_param(ctx, "llvm.cos",
1395                                               to_float_type(ctx, def_type), src[0]);
1396                 break;
1397         case nir_op_fsqrt:
1398                 result = emit_intrin_1f_param(ctx, "llvm.sqrt",
1399                                               to_float_type(ctx, def_type), src[0]);
1400                 break;
1401         case nir_op_fexp2:
1402                 result = emit_intrin_1f_param(ctx, "llvm.exp2",
1403                                               to_float_type(ctx, def_type), src[0]);
1404                 break;
1405         case nir_op_flog2:
1406                 result = emit_intrin_1f_param(ctx, "llvm.log2",
1407                                               to_float_type(ctx, def_type), src[0]);
1408                 break;
1409         case nir_op_frsq:
1410                 result = emit_intrin_1f_param(ctx, "llvm.sqrt",
1411                                               to_float_type(ctx, def_type), src[0]);
1412                 result = ac_build_fdiv(&ctx->ac, ctx->f32one, result);
1413                 break;
1414         case nir_op_fpow:
1415                 result = emit_intrin_2f_param(ctx, "llvm.pow",
1416                                               to_float_type(ctx, def_type), src[0], src[1]);
1417                 break;
1418         case nir_op_fmax:
1419                 result = emit_intrin_2f_param(ctx, "llvm.maxnum",
1420                                               to_float_type(ctx, def_type), src[0], src[1]);
1421                 break;
1422         case nir_op_fmin:
1423                 result = emit_intrin_2f_param(ctx, "llvm.minnum",
1424                                               to_float_type(ctx, def_type), src[0], src[1]);
1425                 break;
1426         case nir_op_ffma:
1427                 result = emit_intrin_3f_param(ctx, "llvm.fma",
1428                                               to_float_type(ctx, def_type), src[0], src[1], src[2]);
1429                 break;
1430         case nir_op_ibitfield_extract:
1431                 result = emit_bitfield_extract(ctx, true, src);
1432                 break;
1433         case nir_op_ubitfield_extract:
1434                 result = emit_bitfield_extract(ctx, false, src);
1435                 break;
1436         case nir_op_bitfield_insert:
1437                 result = emit_bitfield_insert(ctx, src[0], src[1], src[2], src[3]);
1438                 break;
1439         case nir_op_bitfield_reverse:
1440                 result = ac_build_intrinsic(&ctx->ac, "llvm.bitreverse.i32", ctx->i32, src, 1, AC_FUNC_ATTR_READNONE);
1441                 break;
1442         case nir_op_bit_count:
1443                 result = ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i32", ctx->i32, src, 1, AC_FUNC_ATTR_READNONE);
1444                 break;
1445         case nir_op_vec2:
1446         case nir_op_vec3:
1447         case nir_op_vec4:
1448                 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
1449                         src[i] = to_integer(ctx, src[i]);
1450                 result = ac_build_gather_values(&ctx->ac, src, num_components);
1451                 break;
1452         case nir_op_f2i32:
1453         case nir_op_f2i64:
1454                 src[0] = to_float(ctx, src[0]);
1455                 result = LLVMBuildFPToSI(ctx->builder, src[0], def_type, "");
1456                 break;
1457         case nir_op_f2u32:
1458         case nir_op_f2u64:
1459                 src[0] = to_float(ctx, src[0]);
1460                 result = LLVMBuildFPToUI(ctx->builder, src[0], def_type, "");
1461                 break;
1462         case nir_op_i2f32:
1463         case nir_op_i2f64:
1464                 result = LLVMBuildSIToFP(ctx->builder, src[0], to_float_type(ctx, def_type), "");
1465                 break;
1466         case nir_op_u2f32:
1467         case nir_op_u2f64:
1468                 result = LLVMBuildUIToFP(ctx->builder, src[0], to_float_type(ctx, def_type), "");
1469                 break;
1470         case nir_op_f2f64:
1471                 result = LLVMBuildFPExt(ctx->builder, src[0], to_float_type(ctx, def_type), "");
1472                 break;
1473         case nir_op_f2f32:
1474                 result = LLVMBuildFPTrunc(ctx->builder, src[0], to_float_type(ctx, def_type), "");
1475                 break;
1476         case nir_op_u2u32:
1477         case nir_op_u2u64:
1478                 if (get_elem_bits(ctx, LLVMTypeOf(src[0])) < get_elem_bits(ctx, def_type))
1479                         result = LLVMBuildZExt(ctx->builder, src[0], def_type, "");
1480                 else
1481                         result = LLVMBuildTrunc(ctx->builder, src[0], def_type, "");
1482                 break;
1483         case nir_op_i2i32:
1484         case nir_op_i2i64:
1485                 if (get_elem_bits(ctx, LLVMTypeOf(src[0])) < get_elem_bits(ctx, def_type))
1486                         result = LLVMBuildSExt(ctx->builder, src[0], def_type, "");
1487                 else
1488                         result = LLVMBuildTrunc(ctx->builder, src[0], def_type, "");
1489                 break;
1490         case nir_op_bcsel:
1491                 result = emit_bcsel(ctx, src[0], src[1], src[2]);
1492                 break;
1493         case nir_op_find_lsb:
1494                 result = emit_find_lsb(ctx, src[0]);
1495                 break;
1496         case nir_op_ufind_msb:
1497                 result = emit_ufind_msb(ctx, src[0]);
1498                 break;
1499         case nir_op_ifind_msb:
1500                 result = emit_ifind_msb(ctx, src[0]);
1501                 break;
1502         case nir_op_uadd_carry:
1503                 result = emit_uint_carry(ctx, "llvm.uadd.with.overflow.i32", src[0], src[1]);
1504                 break;
1505         case nir_op_usub_borrow:
1506                 result = emit_uint_carry(ctx, "llvm.usub.with.overflow.i32", src[0], src[1]);
1507                 break;
1508         case nir_op_b2f:
1509                 result = emit_b2f(ctx, src[0]);
1510                 break;
1511         case nir_op_fquantize2f16:
1512                 src[0] = to_float(ctx, src[0]);
1513                 result = LLVMBuildFPTrunc(ctx->builder, src[0], ctx->f16, "");
1514                 /* need to convert back up to f32 */
1515                 result = LLVMBuildFPExt(ctx->builder, result, ctx->f32, "");
1516                 break;
1517         case nir_op_umul_high:
1518                 result = emit_umul_high(ctx, src[0], src[1]);
1519                 break;
1520         case nir_op_imul_high:
1521                 result = emit_imul_high(ctx, src[0], src[1]);
1522                 break;
1523         case nir_op_pack_half_2x16:
1524                 result = emit_pack_half_2x16(ctx, src[0]);
1525                 break;
1526         case nir_op_unpack_half_2x16:
1527                 result = emit_unpack_half_2x16(ctx, src[0]);
1528                 break;
1529         case nir_op_fddx:
1530         case nir_op_fddy:
1531         case nir_op_fddx_fine:
1532         case nir_op_fddy_fine:
1533         case nir_op_fddx_coarse:
1534         case nir_op_fddy_coarse:
1535                 result = emit_ddxy(ctx, instr->op, src[0]);
1536                 break;
1537         default:
1538                 fprintf(stderr, "Unknown NIR alu instr: ");
1539                 nir_print_instr(&instr->instr, stderr);
1540                 fprintf(stderr, "\n");
1541                 abort();
1542         }
1543
1544         if (result) {
1545                 assert(instr->dest.dest.is_ssa);
1546                 result = to_integer(ctx, result);
1547                 _mesa_hash_table_insert(ctx->defs, &instr->dest.dest.ssa,
1548                                         result);
1549         }
1550 }
1551
1552 static void visit_load_const(struct nir_to_llvm_context *ctx,
1553                              nir_load_const_instr *instr)
1554 {
1555         LLVMValueRef values[4], value = NULL;
1556         LLVMTypeRef element_type =
1557             LLVMIntTypeInContext(ctx->context, instr->def.bit_size);
1558
1559         for (unsigned i = 0; i < instr->def.num_components; ++i) {
1560                 switch (instr->def.bit_size) {
1561                 case 32:
1562                         values[i] = LLVMConstInt(element_type,
1563                                                  instr->value.u32[i], false);
1564                         break;
1565                 case 64:
1566                         values[i] = LLVMConstInt(element_type,
1567                                                  instr->value.u64[i], false);
1568                         break;
1569                 default:
1570                         fprintf(stderr,
1571                                 "unsupported nir load_const bit_size: %d\n",
1572                                 instr->def.bit_size);
1573                         abort();
1574                 }
1575         }
1576         if (instr->def.num_components > 1) {
1577                 value = LLVMConstVector(values, instr->def.num_components);
1578         } else
1579                 value = values[0];
1580
1581         _mesa_hash_table_insert(ctx->defs, &instr->def, value);
1582 }
1583
1584 static LLVMValueRef cast_ptr(struct nir_to_llvm_context *ctx, LLVMValueRef ptr,
1585                              LLVMTypeRef type)
1586 {
1587         int addr_space = LLVMGetPointerAddressSpace(LLVMTypeOf(ptr));
1588         return LLVMBuildBitCast(ctx->builder, ptr,
1589                                 LLVMPointerType(type, addr_space), "");
1590 }
1591
1592 static LLVMValueRef
1593 get_buffer_size(struct nir_to_llvm_context *ctx, LLVMValueRef descriptor, bool in_elements)
1594 {
1595         LLVMValueRef size =
1596                 LLVMBuildExtractElement(ctx->builder, descriptor,
1597                                         LLVMConstInt(ctx->i32, 2, false), "");
1598
1599         /* VI only */
1600         if (ctx->options->chip_class >= VI && in_elements) {
1601                 /* On VI, the descriptor contains the size in bytes,
1602                  * but TXQ must return the size in elements.
1603                  * The stride is always non-zero for resources using TXQ.
1604                  */
1605                 LLVMValueRef stride =
1606                         LLVMBuildExtractElement(ctx->builder, descriptor,
1607                                                 LLVMConstInt(ctx->i32, 1, false), "");
1608                 stride = LLVMBuildLShr(ctx->builder, stride,
1609                                        LLVMConstInt(ctx->i32, 16, false), "");
1610                 stride = LLVMBuildAnd(ctx->builder, stride,
1611                                       LLVMConstInt(ctx->i32, 0x3fff, false), "");
1612
1613                 size = LLVMBuildUDiv(ctx->builder, size, stride, "");
1614         }
1615         return size;
1616 }
1617
1618 /**
1619  * Given the i32 or vNi32 \p type, generate the textual name (e.g. for use with
1620  * intrinsic names).
1621  */
1622 static void build_int_type_name(
1623         LLVMTypeRef type,
1624         char *buf, unsigned bufsize)
1625 {
1626         assert(bufsize >= 6);
1627
1628         if (LLVMGetTypeKind(type) == LLVMVectorTypeKind)
1629                 snprintf(buf, bufsize, "v%ui32",
1630                          LLVMGetVectorSize(type));
1631         else
1632                 strcpy(buf, "i32");
1633 }
1634
1635 static LLVMValueRef radv_lower_gather4_integer(struct nir_to_llvm_context *ctx,
1636                                                struct ac_image_args *args,
1637                                                nir_tex_instr *instr)
1638 {
1639         enum glsl_base_type stype = glsl_get_sampler_result_type(instr->texture->var->type);
1640         LLVMValueRef coord = args->addr;
1641         LLVMValueRef half_texel[2];
1642         LLVMValueRef compare_cube_wa;
1643         LLVMValueRef result;
1644         int c;
1645         unsigned coord_vgpr_index = (unsigned)args->offset + (unsigned)args->compare;
1646
1647         //TODO Rect
1648         {
1649                 struct ac_image_args txq_args = { 0 };
1650
1651                 txq_args.da = instr->is_array || instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
1652                 txq_args.opcode = ac_image_get_resinfo;
1653                 txq_args.dmask = 0xf;
1654                 txq_args.addr = ctx->i32zero;
1655                 txq_args.resource = args->resource;
1656                 LLVMValueRef size = ac_build_image_opcode(&ctx->ac, &txq_args);
1657
1658                 for (c = 0; c < 2; c++) {
1659                         half_texel[c] = LLVMBuildExtractElement(ctx->builder, size,
1660                                                                 LLVMConstInt(ctx->i32, c, false), "");
1661                         half_texel[c] = LLVMBuildUIToFP(ctx->builder, half_texel[c], ctx->f32, "");
1662                         half_texel[c] = ac_build_fdiv(&ctx->ac, ctx->f32one, half_texel[c]);
1663                         half_texel[c] = LLVMBuildFMul(ctx->builder, half_texel[c],
1664                                                       LLVMConstReal(ctx->f32, -0.5), "");
1665                 }
1666         }
1667
1668         LLVMValueRef orig_coords = args->addr;
1669
1670         for (c = 0; c < 2; c++) {
1671                 LLVMValueRef tmp;
1672                 LLVMValueRef index = LLVMConstInt(ctx->i32, coord_vgpr_index + c, 0);
1673                 tmp = LLVMBuildExtractElement(ctx->builder, coord, index, "");
1674                 tmp = LLVMBuildBitCast(ctx->builder, tmp, ctx->f32, "");
1675                 tmp = LLVMBuildFAdd(ctx->builder, tmp, half_texel[c], "");
1676                 tmp = LLVMBuildBitCast(ctx->builder, tmp, ctx->i32, "");
1677                 coord = LLVMBuildInsertElement(ctx->builder, coord, tmp, index, "");
1678         }
1679
1680
1681         /*
1682          * Apparantly cube has issue with integer types that the workaround doesn't solve,
1683          * so this tests if the format is 8_8_8_8 and an integer type do an alternate
1684          * workaround by sampling using a scaled type and converting.
1685          * This is taken from amdgpu-pro shaders.
1686          */
1687         /* NOTE this produces some ugly code compared to amdgpu-pro,
1688          * LLVM ends up dumping SGPRs into VGPRs to deal with the compare/select,
1689          * and then reads them back. -pro generates two selects,
1690          * one s_cmp for the descriptor rewriting
1691          * one v_cmp for the coordinate and result changes.
1692          */
1693         if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
1694                 LLVMValueRef tmp, tmp2;
1695
1696                 /* workaround 8/8/8/8 uint/sint cube gather bug */
1697                 /* first detect it then change to a scaled read and f2i */
1698                 tmp = LLVMBuildExtractElement(ctx->builder, args->resource, ctx->i32one, "");
1699                 tmp2 = tmp;
1700
1701                 /* extract the DATA_FORMAT */
1702                 tmp = ac_build_bfe(&ctx->ac, tmp, LLVMConstInt(ctx->i32, 20, false),
1703                                    LLVMConstInt(ctx->i32, 6, false), false);
1704
1705                 /* is the DATA_FORMAT == 8_8_8_8 */
1706                 compare_cube_wa = LLVMBuildICmp(ctx->builder, LLVMIntEQ, tmp, LLVMConstInt(ctx->i32, V_008F14_IMG_DATA_FORMAT_8_8_8_8, false), "");
1707
1708                 if (stype == GLSL_TYPE_UINT)
1709                         /* Create a NUM FORMAT - 0x2 or 0x4 - USCALED or UINT */
1710                         tmp = LLVMBuildSelect(ctx->builder, compare_cube_wa, LLVMConstInt(ctx->i32, 0x8000000, false),
1711                                               LLVMConstInt(ctx->i32, 0x10000000, false), "");
1712                 else
1713                         /* Create a NUM FORMAT - 0x3 or 0x5 - SSCALED or SINT */
1714                         tmp = LLVMBuildSelect(ctx->builder, compare_cube_wa, LLVMConstInt(ctx->i32, 0xc000000, false),
1715                                               LLVMConstInt(ctx->i32, 0x14000000, false), "");
1716
1717                 /* replace the NUM FORMAT in the descriptor */
1718                 tmp2 = LLVMBuildAnd(ctx->builder, tmp2, LLVMConstInt(ctx->i32, C_008F14_NUM_FORMAT, false), "");
1719                 tmp2 = LLVMBuildOr(ctx->builder, tmp2, tmp, "");
1720
1721                 args->resource = LLVMBuildInsertElement(ctx->builder, args->resource, tmp2, ctx->i32one, "");
1722
1723                 /* don't modify the coordinates for this case */
1724                 coord = LLVMBuildSelect(ctx->builder, compare_cube_wa, orig_coords, coord, "");
1725         }
1726         args->addr = coord;
1727         result = ac_build_image_opcode(&ctx->ac, args);
1728
1729         if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) {
1730                 LLVMValueRef tmp, tmp2;
1731
1732                 /* if the cube workaround is in place, f2i the result. */
1733                 for (c = 0; c < 4; c++) {
1734                         tmp = LLVMBuildExtractElement(ctx->builder, result, LLVMConstInt(ctx->i32, c, false), "");
1735                         if (stype == GLSL_TYPE_UINT)
1736                                 tmp2 = LLVMBuildFPToUI(ctx->builder, tmp, ctx->i32, "");
1737                         else
1738                                 tmp2 = LLVMBuildFPToSI(ctx->builder, tmp, ctx->i32, "");
1739                         tmp = LLVMBuildBitCast(ctx->builder, tmp, ctx->i32, "");
1740                         tmp2 = LLVMBuildBitCast(ctx->builder, tmp2, ctx->i32, "");
1741                         tmp = LLVMBuildSelect(ctx->builder, compare_cube_wa, tmp2, tmp, "");
1742                         tmp = LLVMBuildBitCast(ctx->builder, tmp, ctx->f32, "");
1743                         result = LLVMBuildInsertElement(ctx->builder, result, tmp, LLVMConstInt(ctx->i32, c, false), "");
1744                 }
1745         }
1746         return result;
1747 }
1748
1749 static LLVMValueRef build_tex_intrinsic(struct nir_to_llvm_context *ctx,
1750                                         nir_tex_instr *instr,
1751                                         struct ac_image_args *args)
1752 {
1753         if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
1754                 return ac_build_buffer_load_format(&ctx->ac,
1755                                                    args->resource,
1756                                                    args->addr,
1757                                                    LLVMConstInt(ctx->i32, 0, false),
1758                                                    true);
1759         }
1760
1761         args->opcode = ac_image_sample;
1762         args->compare = instr->is_shadow;
1763
1764         switch (instr->op) {
1765         case nir_texop_txf:
1766         case nir_texop_txf_ms:
1767         case nir_texop_samples_identical:
1768                 args->opcode = instr->sampler_dim == GLSL_SAMPLER_DIM_MS ? ac_image_load : ac_image_load_mip;
1769                 args->compare = false;
1770                 args->offset = false;
1771                 break;
1772         case nir_texop_txb:
1773                 args->bias = true;
1774                 break;
1775         case nir_texop_txl:
1776                 args->lod = true;
1777                 break;
1778         case nir_texop_txs:
1779         case nir_texop_query_levels:
1780                 args->opcode = ac_image_get_resinfo;
1781                 break;
1782         case nir_texop_tex:
1783                 if (ctx->stage != MESA_SHADER_FRAGMENT)
1784                         args->level_zero = true;
1785                 break;
1786         case nir_texop_txd:
1787                 args->deriv = true;
1788                 break;
1789         case nir_texop_tg4:
1790                 args->opcode = ac_image_gather4;
1791                 args->level_zero = true;
1792                 break;
1793         case nir_texop_lod:
1794                 args->opcode = ac_image_get_lod;
1795                 args->compare = false;
1796                 args->offset = false;
1797                 break;
1798         default:
1799                 break;
1800         }
1801
1802         if (instr->op == nir_texop_tg4) {
1803                 enum glsl_base_type stype = glsl_get_sampler_result_type(instr->texture->var->type);
1804                 if (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT) {
1805                         return radv_lower_gather4_integer(ctx, args, instr);
1806                 }
1807         }
1808         return ac_build_image_opcode(&ctx->ac, args);
1809 }
1810
1811 static LLVMValueRef visit_vulkan_resource_index(struct nir_to_llvm_context *ctx,
1812                                                 nir_intrinsic_instr *instr)
1813 {
1814         LLVMValueRef index = get_src(ctx, instr->src[0]);
1815         unsigned desc_set = nir_intrinsic_desc_set(instr);
1816         unsigned binding = nir_intrinsic_binding(instr);
1817         LLVMValueRef desc_ptr = ctx->descriptor_sets[desc_set];
1818         struct radv_pipeline_layout *pipeline_layout = ctx->options->layout;
1819         struct radv_descriptor_set_layout *layout = pipeline_layout->set[desc_set].layout;
1820         unsigned base_offset = layout->binding[binding].offset;
1821         LLVMValueRef offset, stride;
1822
1823         if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
1824             layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
1825                 unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start +
1826                         layout->binding[binding].dynamic_offset_offset;
1827                 desc_ptr = ctx->push_constants;
1828                 base_offset = pipeline_layout->push_constant_size + 16 * idx;
1829                 stride = LLVMConstInt(ctx->i32, 16, false);
1830         } else
1831                 stride = LLVMConstInt(ctx->i32, layout->binding[binding].size, false);
1832
1833         offset = LLVMConstInt(ctx->i32, base_offset, false);
1834         index = LLVMBuildMul(ctx->builder, index, stride, "");
1835         offset = LLVMBuildAdd(ctx->builder, offset, index, "");
1836         
1837         desc_ptr = ac_build_gep0(&ctx->ac, desc_ptr, offset);
1838         desc_ptr = cast_ptr(ctx, desc_ptr, ctx->v4i32);
1839         LLVMSetMetadata(desc_ptr, ctx->uniform_md_kind, ctx->empty_md);
1840
1841         return LLVMBuildLoad(ctx->builder, desc_ptr, "");
1842 }
1843
1844 static LLVMValueRef visit_load_push_constant(struct nir_to_llvm_context *ctx,
1845                                              nir_intrinsic_instr *instr)
1846 {
1847         LLVMValueRef ptr, addr;
1848
1849         addr = LLVMConstInt(ctx->i32, nir_intrinsic_base(instr), 0);
1850         addr = LLVMBuildAdd(ctx->builder, addr, get_src(ctx, instr->src[0]), "");
1851
1852         ptr = ac_build_gep0(&ctx->ac, ctx->push_constants, addr);
1853         ptr = cast_ptr(ctx, ptr, get_def_type(ctx, &instr->dest.ssa));
1854
1855         return LLVMBuildLoad(ctx->builder, ptr, "");
1856 }
1857
1858 static LLVMValueRef visit_get_buffer_size(struct nir_to_llvm_context *ctx,
1859                                           nir_intrinsic_instr *instr)
1860 {
1861         LLVMValueRef desc = get_src(ctx, instr->src[0]);
1862
1863         return get_buffer_size(ctx, desc, false);
1864 }
1865 static void visit_store_ssbo(struct nir_to_llvm_context *ctx,
1866                              nir_intrinsic_instr *instr)
1867 {
1868         const char *store_name;
1869         LLVMValueRef src_data = get_src(ctx, instr->src[0]);
1870         LLVMTypeRef data_type = ctx->f32;
1871         int elem_size_mult = get_elem_bits(ctx, LLVMTypeOf(src_data)) / 32;
1872         int components_32bit = elem_size_mult * instr->num_components;
1873         unsigned writemask = nir_intrinsic_write_mask(instr);
1874         LLVMValueRef base_data, base_offset;
1875         LLVMValueRef params[6];
1876
1877         if (ctx->stage == MESA_SHADER_FRAGMENT)
1878                 ctx->shader_info->fs.writes_memory = true;
1879
1880         params[1] = get_src(ctx, instr->src[1]);
1881         params[2] = LLVMConstInt(ctx->i32, 0, false); /* vindex */
1882         params[4] = ctx->i1false;  /* glc */
1883         params[5] = ctx->i1false;  /* slc */
1884
1885         if (components_32bit > 1)
1886                 data_type = LLVMVectorType(ctx->f32, components_32bit);
1887
1888         base_data = to_float(ctx, src_data);
1889         base_data = trim_vector(ctx, base_data, instr->num_components);
1890         base_data = LLVMBuildBitCast(ctx->builder, base_data,
1891                                      data_type, "");
1892         base_offset = get_src(ctx, instr->src[2]);      /* voffset */
1893         while (writemask) {
1894                 int start, count;
1895                 LLVMValueRef data;
1896                 LLVMValueRef offset;
1897                 LLVMValueRef tmp;
1898                 u_bit_scan_consecutive_range(&writemask, &start, &count);
1899
1900                 /* Due to an LLVM limitation, split 3-element writes
1901                  * into a 2-element and a 1-element write. */
1902                 if (count == 3) {
1903                         writemask |= 1 << (start + 2);
1904                         count = 2;
1905                 }
1906
1907                 start *= elem_size_mult;
1908                 count *= elem_size_mult;
1909
1910                 if (count > 4) {
1911                         writemask |= ((1u << (count - 4)) - 1u) << (start + 4);
1912                         count = 4;
1913                 }
1914
1915                 if (count == 4) {
1916                         store_name = "llvm.amdgcn.buffer.store.v4f32";
1917                         data = base_data;
1918                 } else if (count == 2) {
1919                         tmp = LLVMBuildExtractElement(ctx->builder,
1920                                                       base_data, LLVMConstInt(ctx->i32, start, false), "");
1921                         data = LLVMBuildInsertElement(ctx->builder, LLVMGetUndef(ctx->v2f32), tmp,
1922                                                       ctx->i32zero, "");
1923
1924                         tmp = LLVMBuildExtractElement(ctx->builder,
1925                                                       base_data, LLVMConstInt(ctx->i32, start + 1, false), "");
1926                         data = LLVMBuildInsertElement(ctx->builder, data, tmp,
1927                                                       ctx->i32one, "");
1928                         store_name = "llvm.amdgcn.buffer.store.v2f32";
1929
1930                 } else {
1931                         assert(count == 1);
1932                         if (get_llvm_num_components(base_data) > 1)
1933                                 data = LLVMBuildExtractElement(ctx->builder, base_data,
1934                                                                LLVMConstInt(ctx->i32, start, false), "");
1935                         else
1936                                 data = base_data;
1937                         store_name = "llvm.amdgcn.buffer.store.f32";
1938                 }
1939
1940                 offset = base_offset;
1941                 if (start != 0) {
1942                         offset = LLVMBuildAdd(ctx->builder, offset, LLVMConstInt(ctx->i32, start * 4, false), "");
1943                 }
1944                 params[0] = data;
1945                 params[3] = offset;
1946                 ac_build_intrinsic(&ctx->ac, store_name,
1947                                    ctx->voidt, params, 6, 0);
1948         }
1949 }
1950
1951 static LLVMValueRef visit_atomic_ssbo(struct nir_to_llvm_context *ctx,
1952                                       nir_intrinsic_instr *instr)
1953 {
1954         const char *name;
1955         LLVMValueRef params[6];
1956         int arg_count = 0;
1957         if (ctx->stage == MESA_SHADER_FRAGMENT)
1958                 ctx->shader_info->fs.writes_memory = true;
1959
1960         if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap) {
1961                 params[arg_count++] = llvm_extract_elem(ctx, get_src(ctx, instr->src[3]), 0);
1962         }
1963         params[arg_count++] = llvm_extract_elem(ctx, get_src(ctx, instr->src[2]), 0);
1964         params[arg_count++] = get_src(ctx, instr->src[0]);
1965         params[arg_count++] = LLVMConstInt(ctx->i32, 0, false); /* vindex */
1966         params[arg_count++] = get_src(ctx, instr->src[1]);      /* voffset */
1967         params[arg_count++] = ctx->i1false;  /* slc */
1968
1969         switch (instr->intrinsic) {
1970         case nir_intrinsic_ssbo_atomic_add:
1971                 name = "llvm.amdgcn.buffer.atomic.add";
1972                 break;
1973         case nir_intrinsic_ssbo_atomic_imin:
1974                 name = "llvm.amdgcn.buffer.atomic.smin";
1975                 break;
1976         case nir_intrinsic_ssbo_atomic_umin:
1977                 name = "llvm.amdgcn.buffer.atomic.umin";
1978                 break;
1979         case nir_intrinsic_ssbo_atomic_imax:
1980                 name = "llvm.amdgcn.buffer.atomic.smax";
1981                 break;
1982         case nir_intrinsic_ssbo_atomic_umax:
1983                 name = "llvm.amdgcn.buffer.atomic.umax";
1984                 break;
1985         case nir_intrinsic_ssbo_atomic_and:
1986                 name = "llvm.amdgcn.buffer.atomic.and";
1987                 break;
1988         case nir_intrinsic_ssbo_atomic_or:
1989                 name = "llvm.amdgcn.buffer.atomic.or";
1990                 break;
1991         case nir_intrinsic_ssbo_atomic_xor:
1992                 name = "llvm.amdgcn.buffer.atomic.xor";
1993                 break;
1994         case nir_intrinsic_ssbo_atomic_exchange:
1995                 name = "llvm.amdgcn.buffer.atomic.swap";
1996                 break;
1997         case nir_intrinsic_ssbo_atomic_comp_swap:
1998                 name = "llvm.amdgcn.buffer.atomic.cmpswap";
1999                 break;
2000         default:
2001                 abort();
2002         }
2003
2004         return ac_build_intrinsic(&ctx->ac, name, ctx->i32, params, arg_count, 0);
2005 }
2006
2007 static LLVMValueRef visit_load_buffer(struct nir_to_llvm_context *ctx,
2008                                       nir_intrinsic_instr *instr)
2009 {
2010         LLVMValueRef results[2];
2011         int load_components;
2012         int num_components = instr->num_components;
2013         if (instr->dest.ssa.bit_size == 64)
2014                 num_components *= 2;
2015
2016         for (int i = 0; i < num_components; i += load_components) {
2017                 load_components = MIN2(num_components - i, 4);
2018                 const char *load_name;
2019                 LLVMTypeRef data_type = ctx->f32;
2020                 LLVMValueRef offset = LLVMConstInt(ctx->i32, i * 4, false);
2021                 offset = LLVMBuildAdd(ctx->builder, get_src(ctx, instr->src[1]), offset, "");
2022
2023                 if (load_components == 3)
2024                         data_type = LLVMVectorType(ctx->f32, 4);
2025                 else if (load_components > 1)
2026                         data_type = LLVMVectorType(ctx->f32, load_components);
2027
2028                 if (load_components >= 3)
2029                         load_name = "llvm.amdgcn.buffer.load.v4f32";
2030                 else if (load_components == 2)
2031                         load_name = "llvm.amdgcn.buffer.load.v2f32";
2032                 else if (load_components == 1)
2033                         load_name = "llvm.amdgcn.buffer.load.f32";
2034                 else
2035                         unreachable("unhandled number of components");
2036
2037                 LLVMValueRef params[] = {
2038                         get_src(ctx, instr->src[0]),
2039                         LLVMConstInt(ctx->i32, 0, false),
2040                         offset,
2041                         ctx->i1false,
2042                         ctx->i1false,
2043                 };
2044
2045                 results[i] = ac_build_intrinsic(&ctx->ac, load_name, data_type, params, 5, 0);
2046
2047         }
2048
2049         LLVMValueRef ret = results[0];
2050         if (num_components > 4 || num_components == 3) {
2051                 LLVMValueRef masks[] = {
2052                         LLVMConstInt(ctx->i32, 0, false), LLVMConstInt(ctx->i32, 1, false),
2053                         LLVMConstInt(ctx->i32, 2, false), LLVMConstInt(ctx->i32, 3, false),
2054                         LLVMConstInt(ctx->i32, 4, false), LLVMConstInt(ctx->i32, 5, false),
2055                         LLVMConstInt(ctx->i32, 6, false), LLVMConstInt(ctx->i32, 7, false)
2056                 };
2057
2058                 LLVMValueRef swizzle = LLVMConstVector(masks, num_components);
2059                 ret = LLVMBuildShuffleVector(ctx->builder, results[0],
2060                                              results[num_components > 4 ? 1 : 0], swizzle, "");
2061         }
2062
2063         return LLVMBuildBitCast(ctx->builder, ret,
2064                                 get_def_type(ctx, &instr->dest.ssa), "");
2065 }
2066
2067 static LLVMValueRef visit_load_ubo_buffer(struct nir_to_llvm_context *ctx,
2068                                           nir_intrinsic_instr *instr)
2069 {
2070         LLVMValueRef results[8], ret;
2071         LLVMValueRef rsrc = get_src(ctx, instr->src[0]);
2072         LLVMValueRef offset = get_src(ctx, instr->src[1]);
2073         int num_components = instr->num_components;
2074
2075         rsrc = LLVMBuildBitCast(ctx->builder, rsrc, LLVMVectorType(ctx->i8, 16), "");
2076
2077         if (instr->dest.ssa.bit_size == 64)
2078                 num_components *= 2;
2079
2080         for (unsigned i = 0; i < num_components; ++i) {
2081                 LLVMValueRef params[] = {
2082                         rsrc,
2083                         LLVMBuildAdd(ctx->builder, LLVMConstInt(ctx->i32, 4 * i, 0),
2084                                      offset, "")
2085                 };
2086                 results[i] = ac_build_intrinsic(&ctx->ac, "llvm.SI.load.const", ctx->f32,
2087                                                 params, 2,
2088                                                 AC_FUNC_ATTR_READNONE |
2089                                                 AC_FUNC_ATTR_LEGACY);
2090         }
2091
2092
2093         ret = ac_build_gather_values(&ctx->ac, results, instr->num_components);
2094         return LLVMBuildBitCast(ctx->builder, ret,
2095                                 get_def_type(ctx, &instr->dest.ssa), "");
2096 }
2097
2098 static void
2099 radv_get_deref_offset(struct nir_to_llvm_context *ctx, nir_deref *tail,
2100                       bool vs_in, unsigned *vertex_index_out,
2101                       unsigned *const_out, LLVMValueRef *indir_out)
2102 {
2103         unsigned const_offset = 0;
2104         LLVMValueRef offset = NULL;
2105
2106         if (vertex_index_out != NULL) {
2107                 tail = tail->child;
2108                 nir_deref_array *deref_array = nir_deref_as_array(tail);
2109                 *vertex_index_out = deref_array->base_offset;
2110         }
2111
2112         while (tail->child != NULL) {
2113                 const struct glsl_type *parent_type = tail->type;
2114                 tail = tail->child;
2115
2116                 if (tail->deref_type == nir_deref_type_array) {
2117                         nir_deref_array *deref_array = nir_deref_as_array(tail);
2118                         LLVMValueRef index, stride, local_offset;
2119                         unsigned size = glsl_count_attribute_slots(tail->type, vs_in);
2120
2121                         const_offset += size * deref_array->base_offset;
2122                         if (deref_array->deref_array_type == nir_deref_array_type_direct)
2123                                 continue;
2124
2125                         assert(deref_array->deref_array_type == nir_deref_array_type_indirect);
2126                         index = get_src(ctx, deref_array->indirect);
2127                         stride = LLVMConstInt(ctx->i32, size, 0);
2128                         local_offset = LLVMBuildMul(ctx->builder, stride, index, "");
2129
2130                         if (offset)
2131                                 offset = LLVMBuildAdd(ctx->builder, offset, local_offset, "");
2132                         else
2133                                 offset = local_offset;
2134                 } else if (tail->deref_type == nir_deref_type_struct) {
2135                         nir_deref_struct *deref_struct = nir_deref_as_struct(tail);
2136
2137                         for (unsigned i = 0; i < deref_struct->index; i++) {
2138                                 const struct glsl_type *ft = glsl_get_struct_field(parent_type, i);
2139                                 const_offset += glsl_count_attribute_slots(ft, vs_in);
2140                         }
2141                 } else
2142                         unreachable("unsupported deref type");
2143
2144         }
2145
2146         if (const_offset && offset)
2147                 offset = LLVMBuildAdd(ctx->builder, offset,
2148                                       LLVMConstInt(ctx->i32, const_offset, 0),
2149                                       "");
2150
2151         *const_out = const_offset;
2152         *indir_out = offset;
2153 }
2154
2155 static LLVMValueRef
2156 load_gs_input(struct nir_to_llvm_context *ctx,
2157               nir_intrinsic_instr *instr)
2158 {
2159         LLVMValueRef indir_index, vtx_offset;
2160         unsigned const_index;
2161         LLVMValueRef args[9];
2162         unsigned param, vtx_offset_param;
2163         LLVMValueRef value[4], result;
2164         unsigned vertex_index;
2165         unsigned cull_offset = 0;
2166         radv_get_deref_offset(ctx, &instr->variables[0]->deref,
2167                               false, &vertex_index,
2168                               &const_index, &indir_index);
2169         vtx_offset_param = vertex_index;
2170         assert(vtx_offset_param < 6);
2171         vtx_offset = LLVMBuildMul(ctx->builder, ctx->gs_vtx_offset[vtx_offset_param],
2172                                   LLVMConstInt(ctx->i32, 4, false), "");
2173
2174         param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
2175         if (instr->variables[0]->var->data.location == VARYING_SLOT_CULL_DIST0)
2176                 cull_offset += ctx->num_input_clips;
2177         for (unsigned i = 0; i < instr->num_components; i++) {
2178
2179                 args[0] = ctx->esgs_ring;
2180                 args[1] = vtx_offset;
2181                 args[2] = LLVMConstInt(ctx->i32, (param * 4 + i + const_index + cull_offset) * 256, false);
2182                 args[3] = ctx->i32zero;
2183                 args[4] = ctx->i32one; /* OFFEN */
2184                 args[5] = ctx->i32zero; /* IDXEN */
2185                 args[6] = ctx->i32one; /* GLC */
2186                 args[7] = ctx->i32zero; /* SLC */
2187                 args[8] = ctx->i32zero; /* TFE */
2188
2189                 value[i] = ac_build_intrinsic(&ctx->ac, "llvm.SI.buffer.load.dword.i32.i32",
2190                                               ctx->i32, args, 9,
2191                                               AC_FUNC_ATTR_READONLY |
2192                                               AC_FUNC_ATTR_LEGACY);
2193         }
2194         result = ac_build_gather_values(&ctx->ac, value, instr->num_components);
2195
2196         return result;
2197 }
2198
2199 static LLVMValueRef visit_load_var(struct nir_to_llvm_context *ctx,
2200                                    nir_intrinsic_instr *instr)
2201 {
2202         LLVMValueRef values[8];
2203         int idx = instr->variables[0]->var->data.driver_location;
2204         int ve = instr->dest.ssa.num_components;
2205         LLVMValueRef indir_index;
2206         LLVMValueRef ret;
2207         unsigned const_index;
2208         bool vs_in = ctx->stage == MESA_SHADER_VERTEX &&
2209                      instr->variables[0]->var->data.mode == nir_var_shader_in;
2210         radv_get_deref_offset(ctx, &instr->variables[0]->deref, vs_in, NULL,
2211                                       &const_index, &indir_index);
2212
2213         if (instr->dest.ssa.bit_size == 64)
2214                 ve *= 2;
2215
2216         switch (instr->variables[0]->var->data.mode) {
2217         case nir_var_shader_in:
2218                 if (ctx->stage == MESA_SHADER_GEOMETRY) {
2219                         return load_gs_input(ctx, instr);
2220                 }
2221                 for (unsigned chan = 0; chan < ve; chan++) {
2222                         if (indir_index) {
2223                                 unsigned count = glsl_count_attribute_slots(
2224                                                 instr->variables[0]->var->type,
2225                                                 ctx->stage == MESA_SHADER_VERTEX);
2226                                 count -= chan / 4;
2227                                 LLVMValueRef tmp_vec = ac_build_gather_values_extended(
2228                                                 &ctx->ac, ctx->inputs + idx + chan, count,
2229                                                 4, false);
2230
2231                                 values[chan] = LLVMBuildExtractElement(ctx->builder,
2232                                                                        tmp_vec,
2233                                                                        indir_index, "");
2234                         } else
2235                                 values[chan] = ctx->inputs[idx + chan + const_index * 4];
2236                 }
2237                 break;
2238         case nir_var_local:
2239                 for (unsigned chan = 0; chan < ve; chan++) {
2240                         if (indir_index) {
2241                                 unsigned count = glsl_count_attribute_slots(
2242                                         instr->variables[0]->var->type, false);
2243                                 count -= chan / 4;
2244                                 LLVMValueRef tmp_vec = ac_build_gather_values_extended(
2245                                                 &ctx->ac, ctx->locals + idx + chan, count,
2246                                                 4, true);
2247
2248                                 values[chan] = LLVMBuildExtractElement(ctx->builder,
2249                                                                        tmp_vec,
2250                                                                        indir_index, "");
2251                         } else {
2252                                 values[chan] = LLVMBuildLoad(ctx->builder, ctx->locals[idx + chan + const_index * 4], "");
2253                         }
2254                 }
2255                 break;
2256         case nir_var_shader_out:
2257                 for (unsigned chan = 0; chan < ve; chan++) {
2258                         if (indir_index) {
2259                                 unsigned count = glsl_count_attribute_slots(
2260                                                 instr->variables[0]->var->type, false);
2261                                 count -= chan / 4;
2262                                 LLVMValueRef tmp_vec = ac_build_gather_values_extended(
2263                                                 &ctx->ac, ctx->outputs + idx + chan, count,
2264                                                 4, true);
2265
2266                                 values[chan] = LLVMBuildExtractElement(ctx->builder,
2267                                                                        tmp_vec,
2268                                                                        indir_index, "");
2269                         } else {
2270                         values[chan] = LLVMBuildLoad(ctx->builder,
2271                                                      ctx->outputs[idx + chan + const_index * 4],
2272                                                      "");
2273                         }
2274                 }
2275                 break;
2276         case nir_var_shared: {
2277                 LLVMValueRef ptr = get_shared_memory_ptr(ctx, idx, ctx->i32);
2278                 LLVMValueRef derived_ptr;
2279
2280                 if (indir_index)
2281                         indir_index = LLVMBuildMul(ctx->builder, indir_index, LLVMConstInt(ctx->i32, 4, false), "");
2282
2283                 for (unsigned chan = 0; chan < ve; chan++) {
2284                         LLVMValueRef index = LLVMConstInt(ctx->i32, chan, false);
2285                         if (indir_index)
2286                                 index = LLVMBuildAdd(ctx->builder, index, indir_index, "");
2287                         derived_ptr = LLVMBuildGEP(ctx->builder, ptr, &index, 1, "");
2288
2289                         values[chan] = LLVMBuildLoad(ctx->builder, derived_ptr, "");
2290                 }
2291                 break;
2292         }
2293         default:
2294                 unreachable("unhandle variable mode");
2295         }
2296         ret = ac_build_gather_values(&ctx->ac, values, ve);
2297         return LLVMBuildBitCast(ctx->builder, ret, get_def_type(ctx, &instr->dest.ssa), "");
2298 }
2299
2300 static void
2301 visit_store_var(struct nir_to_llvm_context *ctx,
2302                                    nir_intrinsic_instr *instr)
2303 {
2304         LLVMValueRef temp_ptr, value;
2305         int idx = instr->variables[0]->var->data.driver_location;
2306         LLVMValueRef src = to_float(ctx, get_src(ctx, instr->src[0]));
2307         int writemask = instr->const_index[0];
2308         LLVMValueRef indir_index;
2309         unsigned const_index;
2310         radv_get_deref_offset(ctx, &instr->variables[0]->deref, false,
2311                               NULL, &const_index, &indir_index);
2312
2313         if (get_elem_bits(ctx, LLVMTypeOf(src)) == 64) {
2314                 int old_writemask = writemask;
2315
2316                 src = LLVMBuildBitCast(ctx->builder, src,
2317                                        LLVMVectorType(ctx->f32, get_llvm_num_components(src) * 2),
2318                                        "");
2319
2320                 writemask = 0;
2321                 for (unsigned chan = 0; chan < 4; chan++) {
2322                         if (old_writemask & (1 << chan))
2323                                 writemask |= 3u << (2 * chan);
2324                 }
2325         }
2326
2327         switch (instr->variables[0]->var->data.mode) {
2328         case nir_var_shader_out:
2329                 for (unsigned chan = 0; chan < 8; chan++) {
2330                         int stride = 4;
2331                         if (!(writemask & (1 << chan)))
2332                                 continue;
2333
2334                         value = llvm_extract_elem(ctx, src, chan);
2335
2336                         if (instr->variables[0]->var->data.location == VARYING_SLOT_CLIP_DIST0 ||
2337                             instr->variables[0]->var->data.location == VARYING_SLOT_CULL_DIST0)
2338                                 stride = 1;
2339                         if (indir_index) {
2340                                 unsigned count = glsl_count_attribute_slots(
2341                                                 instr->variables[0]->var->type, false);
2342                                 count -= chan / 4;
2343                                 LLVMValueRef tmp_vec = ac_build_gather_values_extended(
2344                                                 &ctx->ac, ctx->outputs + idx + chan, count,
2345                                                 stride, true);
2346
2347                                 if (get_llvm_num_components(tmp_vec) > 1) {
2348                                         tmp_vec = LLVMBuildInsertElement(ctx->builder, tmp_vec,
2349                                                                          value, indir_index, "");
2350                                 } else
2351                                         tmp_vec = value;
2352                                 build_store_values_extended(ctx, ctx->outputs + idx + chan,
2353                                                             count, stride, tmp_vec);
2354
2355                         } else {
2356                                 temp_ptr = ctx->outputs[idx + chan + const_index * stride];
2357
2358                                 LLVMBuildStore(ctx->builder, value, temp_ptr);
2359                         }
2360                 }
2361                 break;
2362         case nir_var_local:
2363                 for (unsigned chan = 0; chan < 8; chan++) {
2364                         if (!(writemask & (1 << chan)))
2365                                 continue;
2366
2367                         value = llvm_extract_elem(ctx, src, chan);
2368                         if (indir_index) {
2369                                 unsigned count = glsl_count_attribute_slots(
2370                                         instr->variables[0]->var->type, false);
2371                                 count -= chan / 4;
2372                                 LLVMValueRef tmp_vec = ac_build_gather_values_extended(
2373                                         &ctx->ac, ctx->locals + idx + chan, count,
2374                                         4, true);
2375
2376                                 tmp_vec = LLVMBuildInsertElement(ctx->builder, tmp_vec,
2377                                                                  value, indir_index, "");
2378                                 build_store_values_extended(ctx, ctx->locals + idx + chan,
2379                                                             count, 4, tmp_vec);
2380                         } else {
2381                                 temp_ptr = ctx->locals[idx + chan + const_index * 4];
2382
2383                                 LLVMBuildStore(ctx->builder, value, temp_ptr);
2384                         }
2385                 }
2386                 break;
2387         case nir_var_shared: {
2388                 LLVMValueRef ptr = get_shared_memory_ptr(ctx, idx, ctx->i32);
2389
2390                 if (indir_index)
2391                         indir_index = LLVMBuildMul(ctx->builder, indir_index, LLVMConstInt(ctx->i32, 4, false), "");
2392
2393                 for (unsigned chan = 0; chan < 8; chan++) {
2394                         if (!(writemask & (1 << chan)))
2395                                 continue;
2396                         LLVMValueRef index = LLVMConstInt(ctx->i32, chan, false);
2397                         LLVMValueRef derived_ptr;
2398
2399                         if (indir_index)
2400                                 index = LLVMBuildAdd(ctx->builder, index, indir_index, "");
2401
2402                         value = llvm_extract_elem(ctx, src, chan);
2403                         derived_ptr = LLVMBuildGEP(ctx->builder, ptr, &index, 1, "");
2404                         LLVMBuildStore(ctx->builder,
2405                                        to_integer(ctx, value), derived_ptr);
2406                 }
2407                 break;
2408         }
2409         default:
2410                 break;
2411         }
2412 }
2413
2414 static int image_type_to_components_count(enum glsl_sampler_dim dim, bool array)
2415 {
2416         switch (dim) {
2417         case GLSL_SAMPLER_DIM_BUF:
2418                 return 1;
2419         case GLSL_SAMPLER_DIM_1D:
2420                 return array ? 2 : 1;
2421         case GLSL_SAMPLER_DIM_2D:
2422                 return array ? 3 : 2;
2423         case GLSL_SAMPLER_DIM_MS:
2424                 return array ? 4 : 3;
2425         case GLSL_SAMPLER_DIM_3D:
2426         case GLSL_SAMPLER_DIM_CUBE:
2427                 return 3;
2428         case GLSL_SAMPLER_DIM_RECT:
2429         case GLSL_SAMPLER_DIM_SUBPASS:
2430                 return 2;
2431         case GLSL_SAMPLER_DIM_SUBPASS_MS:
2432                 return 3;
2433         default:
2434                 break;
2435         }
2436         return 0;
2437 }
2438
2439
2440
2441 /* Adjust the sample index according to FMASK.
2442  *
2443  * For uncompressed MSAA surfaces, FMASK should return 0x76543210,
2444  * which is the identity mapping. Each nibble says which physical sample
2445  * should be fetched to get that sample.
2446  *
2447  * For example, 0x11111100 means there are only 2 samples stored and
2448  * the second sample covers 3/4 of the pixel. When reading samples 0
2449  * and 1, return physical sample 0 (determined by the first two 0s
2450  * in FMASK), otherwise return physical sample 1.
2451  *
2452  * The sample index should be adjusted as follows:
2453  *   sample_index = (fmask >> (sample_index * 4)) & 0xF;
2454  */
2455 static LLVMValueRef adjust_sample_index_using_fmask(struct nir_to_llvm_context *ctx,
2456                                                     LLVMValueRef coord_x, LLVMValueRef coord_y,
2457                                                     LLVMValueRef coord_z,
2458                                                     LLVMValueRef sample_index,
2459                                                     LLVMValueRef fmask_desc_ptr)
2460 {
2461         LLVMValueRef fmask_load_address[4];
2462         LLVMValueRef res;
2463
2464         fmask_load_address[0] = coord_x;
2465         fmask_load_address[1] = coord_y;
2466         if (coord_z) {
2467                 fmask_load_address[2] = coord_z;
2468                 fmask_load_address[3] = LLVMGetUndef(ctx->i32);
2469         }
2470
2471         struct ac_image_args args = {0};
2472
2473         args.opcode = ac_image_load;
2474         args.da = coord_z ? true : false;
2475         args.resource = fmask_desc_ptr;
2476         args.dmask = 0xf;
2477         args.addr = ac_build_gather_values(&ctx->ac, fmask_load_address, coord_z ? 4 : 2);
2478
2479         res = ac_build_image_opcode(&ctx->ac, &args);
2480
2481         res = to_integer(ctx, res);
2482         LLVMValueRef four = LLVMConstInt(ctx->i32, 4, false);
2483         LLVMValueRef F = LLVMConstInt(ctx->i32, 0xf, false);
2484
2485         LLVMValueRef fmask = LLVMBuildExtractElement(ctx->builder,
2486                                                      res,
2487                                                      ctx->i32zero, "");
2488
2489         LLVMValueRef sample_index4 =
2490                 LLVMBuildMul(ctx->builder, sample_index, four, "");
2491         LLVMValueRef shifted_fmask =
2492                 LLVMBuildLShr(ctx->builder, fmask, sample_index4, "");
2493         LLVMValueRef final_sample =
2494                 LLVMBuildAnd(ctx->builder, shifted_fmask, F, "");
2495
2496         /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK
2497          * resource descriptor is 0 (invalid),
2498          */
2499         LLVMValueRef fmask_desc =
2500                 LLVMBuildBitCast(ctx->builder, fmask_desc_ptr,
2501                                  ctx->v8i32, "");
2502
2503         LLVMValueRef fmask_word1 =
2504                 LLVMBuildExtractElement(ctx->builder, fmask_desc,
2505                                         ctx->i32one, "");
2506
2507         LLVMValueRef word1_is_nonzero =
2508                 LLVMBuildICmp(ctx->builder, LLVMIntNE,
2509                               fmask_word1, ctx->i32zero, "");
2510
2511         /* Replace the MSAA sample index. */
2512         sample_index =
2513                 LLVMBuildSelect(ctx->builder, word1_is_nonzero,
2514                                 final_sample, sample_index, "");
2515         return sample_index;
2516 }
2517
2518 static LLVMValueRef get_image_coords(struct nir_to_llvm_context *ctx,
2519                                      nir_intrinsic_instr *instr)
2520 {
2521         const struct glsl_type *type = instr->variables[0]->var->type;
2522         if(instr->variables[0]->deref.child)
2523                 type = instr->variables[0]->deref.child->type;
2524
2525         LLVMValueRef src0 = get_src(ctx, instr->src[0]);
2526         LLVMValueRef coords[4];
2527         LLVMValueRef masks[] = {
2528                 LLVMConstInt(ctx->i32, 0, false), LLVMConstInt(ctx->i32, 1, false),
2529                 LLVMConstInt(ctx->i32, 2, false), LLVMConstInt(ctx->i32, 3, false),
2530         };
2531         LLVMValueRef res;
2532         LLVMValueRef sample_index = llvm_extract_elem(ctx, get_src(ctx, instr->src[1]), 0);
2533
2534         int count;
2535         enum glsl_sampler_dim dim = glsl_get_sampler_dim(type);
2536         bool add_frag_pos = (dim == GLSL_SAMPLER_DIM_SUBPASS ||
2537                              dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
2538         bool is_ms = (dim == GLSL_SAMPLER_DIM_MS ||
2539                       dim == GLSL_SAMPLER_DIM_SUBPASS_MS);
2540
2541         count = image_type_to_components_count(dim,
2542                                                glsl_sampler_type_is_array(type));
2543
2544         if (is_ms) {
2545                 LLVMValueRef fmask_load_address[3];
2546                 int chan;
2547
2548                 fmask_load_address[0] = LLVMBuildExtractElement(ctx->builder, src0, masks[0], "");
2549                 fmask_load_address[1] = LLVMBuildExtractElement(ctx->builder, src0, masks[1], "");
2550                 if (glsl_sampler_type_is_array(type))
2551                         fmask_load_address[2] = LLVMBuildExtractElement(ctx->builder, src0, masks[2], "");
2552                 else
2553                         fmask_load_address[2] = NULL;
2554                 if (add_frag_pos) {
2555                         for (chan = 0; chan < 2; ++chan)
2556                                 fmask_load_address[chan] = LLVMBuildAdd(ctx->builder, fmask_load_address[chan], LLVMBuildFPToUI(ctx->builder, ctx->frag_pos[chan], ctx->i32, ""), "");
2557                 }
2558                 sample_index = adjust_sample_index_using_fmask(ctx,
2559                                                                fmask_load_address[0],
2560                                                                fmask_load_address[1],
2561                                                                fmask_load_address[2],
2562                                                                sample_index,
2563                                                                get_sampler_desc(ctx, instr->variables[0], DESC_FMASK));
2564         }
2565         if (count == 1) {
2566                 if (instr->src[0].ssa->num_components)
2567                         res = LLVMBuildExtractElement(ctx->builder, src0, masks[0], "");
2568                 else
2569                         res = src0;
2570         } else {
2571                 int chan;
2572                 if (is_ms)
2573                         count--;
2574                 for (chan = 0; chan < count; ++chan) {
2575                         coords[chan] = LLVMBuildExtractElement(ctx->builder, src0, masks[chan], "");
2576                 }
2577
2578                 if (add_frag_pos) {
2579                         for (chan = 0; chan < count; ++chan)
2580                                 coords[chan] = LLVMBuildAdd(ctx->builder, coords[chan], LLVMBuildFPToUI(ctx->builder, ctx->frag_pos[chan], ctx->i32, ""), "");
2581                 }
2582                 if (is_ms) {
2583                         coords[count] = sample_index;
2584                         count++;
2585                 }
2586
2587                 if (count == 3) {
2588                         coords[3] = LLVMGetUndef(ctx->i32);
2589                         count = 4;
2590                 }
2591                 res = ac_build_gather_values(&ctx->ac, coords, count);
2592         }
2593         return res;
2594 }
2595
2596 static LLVMValueRef visit_image_load(struct nir_to_llvm_context *ctx,
2597                                      nir_intrinsic_instr *instr)
2598 {
2599         LLVMValueRef params[7];
2600         LLVMValueRef res;
2601         char intrinsic_name[64];
2602         const nir_variable *var = instr->variables[0]->var;
2603         const struct glsl_type *type = var->type;
2604         if(instr->variables[0]->deref.child)
2605                 type = instr->variables[0]->deref.child->type;
2606
2607         type = glsl_without_array(type);
2608         if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) {
2609                 params[0] = get_sampler_desc(ctx, instr->variables[0], DESC_BUFFER);
2610                 params[1] = LLVMBuildExtractElement(ctx->builder, get_src(ctx, instr->src[0]),
2611                                                     LLVMConstInt(ctx->i32, 0, false), ""); /* vindex */
2612                 params[2] = LLVMConstInt(ctx->i32, 0, false); /* voffset */
2613                 params[3] = ctx->i1false;  /* glc */
2614                 params[4] = ctx->i1false;  /* slc */
2615                 res = ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.buffer.load.format.v4f32", ctx->v4f32,
2616                                          params, 5, 0);
2617
2618                 res = trim_vector(ctx, res, instr->dest.ssa.num_components);
2619                 res = to_integer(ctx, res);
2620         } else {
2621                 bool is_da = glsl_sampler_type_is_array(type) ||
2622                              glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE;
2623                 LLVMValueRef da = is_da ? ctx->i1true : ctx->i1false;
2624                 LLVMValueRef glc = ctx->i1false;
2625                 LLVMValueRef slc = ctx->i1false;
2626
2627                 params[0] = get_image_coords(ctx, instr);
2628                 params[1] = get_sampler_desc(ctx, instr->variables[0], DESC_IMAGE);
2629                 params[2] = LLVMConstInt(ctx->i32, 15, false); /* dmask */
2630                 if (HAVE_LLVM <= 0x0309) {
2631                         params[3] = ctx->i1false;  /* r128 */
2632                         params[4] = da;
2633                         params[5] = glc;
2634                         params[6] = slc;
2635                 } else {
2636                         LLVMValueRef lwe = ctx->i1false;
2637                         params[3] = glc;
2638                         params[4] = slc;
2639                         params[5] = lwe;
2640                         params[6] = da;
2641                 }
2642
2643                 ac_get_image_intr_name("llvm.amdgcn.image.load",
2644                                        ctx->v4f32, /* vdata */
2645                                        LLVMTypeOf(params[0]), /* coords */
2646                                        LLVMTypeOf(params[1]), /* rsrc */
2647                                        intrinsic_name, sizeof(intrinsic_name));
2648
2649                 res = ac_build_intrinsic(&ctx->ac, intrinsic_name, ctx->v4f32,
2650                                          params, 7, AC_FUNC_ATTR_READONLY);
2651         }
2652         return to_integer(ctx, res);
2653 }
2654
2655 static void visit_image_store(struct nir_to_llvm_context *ctx,
2656                               nir_intrinsic_instr *instr)
2657 {
2658         LLVMValueRef params[8];
2659         char intrinsic_name[64];
2660         const nir_variable *var = instr->variables[0]->var;
2661         const struct glsl_type *type = glsl_without_array(var->type);
2662
2663         if (ctx->stage == MESA_SHADER_FRAGMENT)
2664                 ctx->shader_info->fs.writes_memory = true;
2665
2666         if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) {
2667                 params[0] = to_float(ctx, get_src(ctx, instr->src[2])); /* data */
2668                 params[1] = get_sampler_desc(ctx, instr->variables[0], DESC_BUFFER);
2669                 params[2] = LLVMBuildExtractElement(ctx->builder, get_src(ctx, instr->src[0]),
2670                                                     LLVMConstInt(ctx->i32, 0, false), ""); /* vindex */
2671                 params[3] = LLVMConstInt(ctx->i32, 0, false); /* voffset */
2672                 params[4] = ctx->i1false;  /* glc */
2673                 params[5] = ctx->i1false;  /* slc */
2674                 ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.buffer.store.format.v4f32", ctx->voidt,
2675                                    params, 6, 0);
2676         } else {
2677                 bool is_da = glsl_sampler_type_is_array(type) ||
2678                              glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE;
2679                 LLVMValueRef da = is_da ? ctx->i1true : ctx->i1false;
2680                 LLVMValueRef glc = ctx->i1false;
2681                 LLVMValueRef slc = ctx->i1false;
2682
2683                 params[0] = to_float(ctx, get_src(ctx, instr->src[2]));
2684                 params[1] = get_image_coords(ctx, instr); /* coords */
2685                 params[2] = get_sampler_desc(ctx, instr->variables[0], DESC_IMAGE);
2686                 params[3] = LLVMConstInt(ctx->i32, 15, false); /* dmask */
2687                 if (HAVE_LLVM <= 0x0309) {
2688                         params[4] = ctx->i1false;  /* r128 */
2689                         params[5] = da;
2690                         params[6] = glc;
2691                         params[7] = slc;
2692                 } else {
2693                         LLVMValueRef lwe = ctx->i1false;
2694                         params[4] = glc;
2695                         params[5] = slc;
2696                         params[6] = lwe;
2697                         params[7] = da;
2698                 }
2699
2700                 ac_get_image_intr_name("llvm.amdgcn.image.store",
2701                                        LLVMTypeOf(params[0]), /* vdata */
2702                                        LLVMTypeOf(params[1]), /* coords */
2703                                        LLVMTypeOf(params[2]), /* rsrc */
2704                                        intrinsic_name, sizeof(intrinsic_name));
2705
2706                 ac_build_intrinsic(&ctx->ac, intrinsic_name, ctx->voidt,
2707                                    params, 8, 0);
2708         }
2709
2710 }
2711
2712 static LLVMValueRef visit_image_atomic(struct nir_to_llvm_context *ctx,
2713                                        nir_intrinsic_instr *instr)
2714 {
2715         LLVMValueRef params[6];
2716         int param_count = 0;
2717         const nir_variable *var = instr->variables[0]->var;
2718
2719         const char *base_name = "llvm.amdgcn.image.atomic";
2720         const char *atomic_name;
2721         LLVMValueRef coords;
2722         char intrinsic_name[32], coords_type[8];
2723         const struct glsl_type *type = glsl_without_array(var->type);
2724
2725         if (ctx->stage == MESA_SHADER_FRAGMENT)
2726                 ctx->shader_info->fs.writes_memory = true;
2727
2728         params[param_count++] = get_src(ctx, instr->src[2]);
2729         if (instr->intrinsic == nir_intrinsic_image_atomic_comp_swap)
2730                 params[param_count++] = get_src(ctx, instr->src[3]);
2731
2732         if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF) {
2733                 params[param_count++] = get_sampler_desc(ctx, instr->variables[0], DESC_BUFFER);
2734                 coords = params[param_count++] = LLVMBuildExtractElement(ctx->builder, get_src(ctx, instr->src[0]),
2735                                                                         LLVMConstInt(ctx->i32, 0, false), ""); /* vindex */
2736                 params[param_count++] = ctx->i32zero; /* voffset */
2737                 params[param_count++] = ctx->i1false;  /* glc */
2738                 params[param_count++] = ctx->i1false;  /* slc */
2739         } else {
2740                 bool da = glsl_sampler_type_is_array(type) ||
2741                           glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE;
2742
2743                 coords = params[param_count++] = get_image_coords(ctx, instr);
2744                 params[param_count++] = get_sampler_desc(ctx, instr->variables[0], DESC_IMAGE);
2745                 params[param_count++] = ctx->i1false; /* r128 */
2746                 params[param_count++] = da ? ctx->i1true : ctx->i1false;      /* da */
2747                 params[param_count++] = ctx->i1false;  /* slc */
2748         }
2749
2750         switch (instr->intrinsic) {
2751         case nir_intrinsic_image_atomic_add:
2752                 atomic_name = "add";
2753                 break;
2754         case nir_intrinsic_image_atomic_min:
2755                 atomic_name = "smin";
2756                 break;
2757         case nir_intrinsic_image_atomic_max:
2758                 atomic_name = "smax";
2759                 break;
2760         case nir_intrinsic_image_atomic_and:
2761                 atomic_name = "and";
2762                 break;
2763         case nir_intrinsic_image_atomic_or:
2764                 atomic_name = "or";
2765                 break;
2766         case nir_intrinsic_image_atomic_xor:
2767                 atomic_name = "xor";
2768                 break;
2769         case nir_intrinsic_image_atomic_exchange:
2770                 atomic_name = "swap";
2771                 break;
2772         case nir_intrinsic_image_atomic_comp_swap:
2773                 atomic_name = "cmpswap";
2774                 break;
2775         default:
2776                 abort();
2777         }
2778         build_int_type_name(LLVMTypeOf(coords),
2779                             coords_type, sizeof(coords_type));
2780
2781         snprintf(intrinsic_name, sizeof(intrinsic_name),
2782                          "%s.%s.%s", base_name, atomic_name, coords_type);
2783         return ac_build_intrinsic(&ctx->ac, intrinsic_name, ctx->i32, params, param_count, 0);
2784 }
2785
2786 static LLVMValueRef visit_image_size(struct nir_to_llvm_context *ctx,
2787                                      nir_intrinsic_instr *instr)
2788 {
2789         LLVMValueRef res;
2790         const nir_variable *var = instr->variables[0]->var;
2791         const struct glsl_type *type = instr->variables[0]->var->type;
2792         bool da = glsl_sampler_type_is_array(var->type) ||
2793                   glsl_get_sampler_dim(var->type) == GLSL_SAMPLER_DIM_CUBE;
2794         if(instr->variables[0]->deref.child)
2795                 type = instr->variables[0]->deref.child->type;
2796
2797         if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_BUF)
2798                 return get_buffer_size(ctx, get_sampler_desc(ctx, instr->variables[0], DESC_BUFFER), true);
2799
2800         struct ac_image_args args = { 0 };
2801
2802         args.da = da;
2803         args.dmask = 0xf;
2804         args.resource = get_sampler_desc(ctx, instr->variables[0], DESC_IMAGE);
2805         args.opcode = ac_image_get_resinfo;
2806         args.addr = ctx->i32zero;
2807
2808         res = ac_build_image_opcode(&ctx->ac, &args);
2809
2810         if (glsl_get_sampler_dim(type) == GLSL_SAMPLER_DIM_CUBE &&
2811             glsl_sampler_type_is_array(type)) {
2812                 LLVMValueRef two = LLVMConstInt(ctx->i32, 2, false);
2813                 LLVMValueRef six = LLVMConstInt(ctx->i32, 6, false);
2814                 LLVMValueRef z = LLVMBuildExtractElement(ctx->builder, res, two, "");
2815                 z = LLVMBuildSDiv(ctx->builder, z, six, "");
2816                 res = LLVMBuildInsertElement(ctx->builder, res, z, two, "");
2817         }
2818         return res;
2819 }
2820
2821 static void emit_waitcnt(struct nir_to_llvm_context *ctx)
2822 {
2823         LLVMValueRef args[1] = {
2824                 LLVMConstInt(ctx->i32, 0xf70, false),
2825         };
2826         ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.s.waitcnt",
2827                            ctx->voidt, args, 1, 0);
2828 }
2829
2830 static void emit_barrier(struct nir_to_llvm_context *ctx)
2831 {
2832         // TODO tess
2833         ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.s.barrier",
2834                            ctx->voidt, NULL, 0, 0);
2835 }
2836
2837 static void emit_discard_if(struct nir_to_llvm_context *ctx,
2838                             nir_intrinsic_instr *instr)
2839 {
2840         LLVMValueRef cond;
2841         ctx->shader_info->fs.can_discard = true;
2842
2843         cond = LLVMBuildICmp(ctx->builder, LLVMIntNE,
2844                              get_src(ctx, instr->src[0]),
2845                              ctx->i32zero, "");
2846
2847         cond = LLVMBuildSelect(ctx->builder, cond,
2848                                LLVMConstReal(ctx->f32, -1.0f),
2849                                ctx->f32zero, "");
2850         ac_build_kill(&ctx->ac, cond);
2851 }
2852
2853 static LLVMValueRef
2854 visit_load_local_invocation_index(struct nir_to_llvm_context *ctx)
2855 {
2856         LLVMValueRef result;
2857         LLVMValueRef thread_id = ac_get_thread_id(&ctx->ac);
2858         result = LLVMBuildAnd(ctx->builder, ctx->tg_size,
2859                               LLVMConstInt(ctx->i32, 0xfc0, false), "");
2860
2861         return LLVMBuildAdd(ctx->builder, result, thread_id, "");
2862 }
2863
2864 static LLVMValueRef visit_var_atomic(struct nir_to_llvm_context *ctx,
2865                                      nir_intrinsic_instr *instr)
2866 {
2867         LLVMValueRef ptr, result;
2868         int idx = instr->variables[0]->var->data.driver_location;
2869         LLVMValueRef src = get_src(ctx, instr->src[0]);
2870         ptr = get_shared_memory_ptr(ctx, idx, ctx->i32);
2871
2872         if (instr->intrinsic == nir_intrinsic_var_atomic_comp_swap) {
2873                 LLVMValueRef src1 = get_src(ctx, instr->src[1]);
2874                 result = LLVMBuildAtomicCmpXchg(ctx->builder,
2875                                                 ptr, src, src1,
2876                                                 LLVMAtomicOrderingSequentiallyConsistent,
2877                                                 LLVMAtomicOrderingSequentiallyConsistent,
2878                                                 false);
2879         } else {
2880                 LLVMAtomicRMWBinOp op;
2881                 switch (instr->intrinsic) {
2882                 case nir_intrinsic_var_atomic_add:
2883                         op = LLVMAtomicRMWBinOpAdd;
2884                         break;
2885                 case nir_intrinsic_var_atomic_umin:
2886                         op = LLVMAtomicRMWBinOpUMin;
2887                         break;
2888                 case nir_intrinsic_var_atomic_umax:
2889                         op = LLVMAtomicRMWBinOpUMax;
2890                         break;
2891                 case nir_intrinsic_var_atomic_imin:
2892                         op = LLVMAtomicRMWBinOpMin;
2893                         break;
2894                 case nir_intrinsic_var_atomic_imax:
2895                         op = LLVMAtomicRMWBinOpMax;
2896                         break;
2897                 case nir_intrinsic_var_atomic_and:
2898                         op = LLVMAtomicRMWBinOpAnd;
2899                         break;
2900                 case nir_intrinsic_var_atomic_or:
2901                         op = LLVMAtomicRMWBinOpOr;
2902                         break;
2903                 case nir_intrinsic_var_atomic_xor:
2904                         op = LLVMAtomicRMWBinOpXor;
2905                         break;
2906                 case nir_intrinsic_var_atomic_exchange:
2907                         op = LLVMAtomicRMWBinOpXchg;
2908                         break;
2909                 default:
2910                         return NULL;
2911                 }
2912
2913                 result = LLVMBuildAtomicRMW(ctx->builder, op, ptr, to_integer(ctx, src),
2914                                             LLVMAtomicOrderingSequentiallyConsistent,
2915                                             false);
2916         }
2917         return result;
2918 }
2919
2920 #define INTERP_CENTER 0
2921 #define INTERP_CENTROID 1
2922 #define INTERP_SAMPLE 2
2923
2924 static LLVMValueRef lookup_interp_param(struct nir_to_llvm_context *ctx,
2925                                         enum glsl_interp_mode interp, unsigned location)
2926 {
2927         switch (interp) {
2928         case INTERP_MODE_FLAT:
2929         default:
2930                 return NULL;
2931         case INTERP_MODE_SMOOTH:
2932         case INTERP_MODE_NONE:
2933                 if (location == INTERP_CENTER)
2934                         return ctx->persp_center;
2935                 else if (location == INTERP_CENTROID)
2936                         return ctx->persp_centroid;
2937                 else if (location == INTERP_SAMPLE)
2938                         return ctx->persp_sample;
2939                 break;
2940         case INTERP_MODE_NOPERSPECTIVE:
2941                 if (location == INTERP_CENTER)
2942                         return ctx->linear_center;
2943                 else if (location == INTERP_CENTROID)
2944                         return ctx->linear_centroid;
2945                 else if (location == INTERP_SAMPLE)
2946                         return ctx->linear_sample;
2947                 break;
2948         }
2949         return NULL;
2950 }
2951
2952 static LLVMValueRef load_sample_position(struct nir_to_llvm_context *ctx,
2953                                          LLVMValueRef sample_id)
2954 {
2955         /* offset = sample_id * 8  (8 = 2 floats containing samplepos.xy) */
2956         LLVMValueRef offset0 = LLVMBuildMul(ctx->builder, sample_id, LLVMConstInt(ctx->i32, 8, false), "");
2957         LLVMValueRef offset1 = LLVMBuildAdd(ctx->builder, offset0, LLVMConstInt(ctx->i32, 4, false), "");
2958         LLVMValueRef result[2];
2959
2960         result[0] = ac_build_indexed_load_const(&ctx->ac, ctx->sample_positions, offset0);
2961         result[1] = ac_build_indexed_load_const(&ctx->ac, ctx->sample_positions, offset1);
2962
2963         return ac_build_gather_values(&ctx->ac, result, 2);
2964 }
2965
2966 static LLVMValueRef load_sample_pos(struct nir_to_llvm_context *ctx)
2967 {
2968         LLVMValueRef values[2];
2969
2970         values[0] = emit_ffract(ctx, ctx->frag_pos[0]);
2971         values[1] = emit_ffract(ctx, ctx->frag_pos[1]);
2972         return ac_build_gather_values(&ctx->ac, values, 2);
2973 }
2974
2975 static LLVMValueRef visit_interp(struct nir_to_llvm_context *ctx,
2976                                  nir_intrinsic_instr *instr)
2977 {
2978         LLVMValueRef result[2];
2979         LLVMValueRef interp_param, attr_number;
2980         unsigned location;
2981         unsigned chan;
2982         LLVMValueRef src_c0, src_c1;
2983         LLVMValueRef src0;
2984         int input_index = instr->variables[0]->var->data.location - VARYING_SLOT_VAR0;
2985         switch (instr->intrinsic) {
2986         case nir_intrinsic_interp_var_at_centroid:
2987                 location = INTERP_CENTROID;
2988                 break;
2989         case nir_intrinsic_interp_var_at_sample:
2990                 location = INTERP_SAMPLE;
2991                 src0 = get_src(ctx, instr->src[0]);
2992                 break;
2993         case nir_intrinsic_interp_var_at_offset:
2994                 location = INTERP_CENTER;
2995                 src0 = get_src(ctx, instr->src[0]);
2996         default:
2997                 break;
2998         }
2999
3000         if (instr->intrinsic == nir_intrinsic_interp_var_at_offset) {
3001                 src_c0 = to_float(ctx, LLVMBuildExtractElement(ctx->builder, src0, ctx->i32zero, ""));
3002                 src_c1 = to_float(ctx, LLVMBuildExtractElement(ctx->builder, src0, ctx->i32one, ""));
3003         } else if (instr->intrinsic == nir_intrinsic_interp_var_at_sample) {
3004                 LLVMValueRef sample_position;
3005                 LLVMValueRef halfval = LLVMConstReal(ctx->f32, 0.5f);
3006
3007                 /* fetch sample ID */
3008                 sample_position = load_sample_position(ctx, src0);
3009
3010                 src_c0 = LLVMBuildExtractElement(ctx->builder, sample_position, ctx->i32zero, "");
3011                 src_c0 = LLVMBuildFSub(ctx->builder, src_c0, halfval, "");
3012                 src_c1 = LLVMBuildExtractElement(ctx->builder, sample_position, ctx->i32one, "");
3013                 src_c1 = LLVMBuildFSub(ctx->builder, src_c1, halfval, "");
3014         }
3015         interp_param = lookup_interp_param(ctx, instr->variables[0]->var->data.interpolation, location);
3016         attr_number = LLVMConstInt(ctx->i32, input_index, false);
3017
3018         if (location == INTERP_SAMPLE || location == INTERP_CENTER) {
3019                 LLVMValueRef ij_out[2];
3020                 LLVMValueRef ddxy_out = emit_ddxy_interp(ctx, interp_param);
3021
3022                 /*
3023                  * take the I then J parameters, and the DDX/Y for it, and
3024                  * calculate the IJ inputs for the interpolator.
3025                  * temp1 = ddx * offset/sample.x + I;
3026                  * interp_param.I = ddy * offset/sample.y + temp1;
3027                  * temp1 = ddx * offset/sample.x + J;
3028                  * interp_param.J = ddy * offset/sample.y + temp1;
3029                  */
3030                 for (unsigned i = 0; i < 2; i++) {
3031                         LLVMValueRef ix_ll = LLVMConstInt(ctx->i32, i, false);
3032                         LLVMValueRef iy_ll = LLVMConstInt(ctx->i32, i + 2, false);
3033                         LLVMValueRef ddx_el = LLVMBuildExtractElement(ctx->builder,
3034                                                                       ddxy_out, ix_ll, "");
3035                         LLVMValueRef ddy_el = LLVMBuildExtractElement(ctx->builder,
3036                                                                       ddxy_out, iy_ll, "");
3037                         LLVMValueRef interp_el = LLVMBuildExtractElement(ctx->builder,
3038                                                                          interp_param, ix_ll, "");
3039                         LLVMValueRef temp1, temp2;
3040
3041                         interp_el = LLVMBuildBitCast(ctx->builder, interp_el,
3042                                                      ctx->f32, "");
3043
3044                         temp1 = LLVMBuildFMul(ctx->builder, ddx_el, src_c0, "");
3045                         temp1 = LLVMBuildFAdd(ctx->builder, temp1, interp_el, "");
3046
3047                         temp2 = LLVMBuildFMul(ctx->builder, ddy_el, src_c1, "");
3048                         temp2 = LLVMBuildFAdd(ctx->builder, temp2, temp1, "");
3049
3050                         ij_out[i] = LLVMBuildBitCast(ctx->builder,
3051                                                      temp2, ctx->i32, "");
3052                 }
3053                 interp_param = ac_build_gather_values(&ctx->ac, ij_out, 2);
3054
3055         }
3056
3057         for (chan = 0; chan < 2; chan++) {
3058                 LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, false);
3059
3060                 if (interp_param) {
3061                         interp_param = LLVMBuildBitCast(ctx->builder,
3062                                                         interp_param, LLVMVectorType(ctx->f32, 2), "");
3063                         LLVMValueRef i = LLVMBuildExtractElement(
3064                                 ctx->builder, interp_param, ctx->i32zero, "");
3065                         LLVMValueRef j = LLVMBuildExtractElement(
3066                                 ctx->builder, interp_param, ctx->i32one, "");
3067
3068                         result[chan] = ac_build_fs_interp(&ctx->ac,
3069                                                           llvm_chan, attr_number,
3070                                                           ctx->prim_mask, i, j);
3071                 } else {
3072                         result[chan] = ac_build_fs_interp_mov(&ctx->ac,
3073                                                               LLVMConstInt(ctx->i32, 2, false),
3074                                                               llvm_chan, attr_number,
3075                                                               ctx->prim_mask);
3076                 }
3077         }
3078         return ac_build_gather_values(&ctx->ac, result, 2);
3079 }
3080
3081 static void
3082 visit_emit_vertex(struct nir_to_llvm_context *ctx,
3083                   nir_intrinsic_instr *instr)
3084 {
3085         LLVMValueRef gs_next_vertex;
3086         LLVMValueRef can_emit, kill;
3087         int idx;
3088         int clip_cull_slot = -1;
3089         assert(instr->const_index[0] == 0);
3090         /* Write vertex attribute values to GSVS ring */
3091         gs_next_vertex = LLVMBuildLoad(ctx->builder,
3092                                        ctx->gs_next_vertex,
3093                                        "");
3094
3095         /* If this thread has already emitted the declared maximum number of
3096          * vertices, kill it: excessive vertex emissions are not supposed to
3097          * have any effect, and GS threads have no externally observable
3098          * effects other than emitting vertices.
3099          */
3100         can_emit = LLVMBuildICmp(ctx->builder, LLVMIntULT, gs_next_vertex,
3101                                  LLVMConstInt(ctx->i32, ctx->gs_max_out_vertices, false), "");
3102
3103         kill = LLVMBuildSelect(ctx->builder, can_emit,
3104                                LLVMConstReal(ctx->f32, 1.0f),
3105                                LLVMConstReal(ctx->f32, -1.0f), "");
3106         ac_build_kill(&ctx->ac, kill);
3107
3108         /* loop num outputs */
3109         idx = 0;
3110         for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) {
3111                 LLVMValueRef *out_ptr = &ctx->outputs[i * 4];
3112                 int length = 4;
3113                 int start = 0;
3114                 int slot = idx;
3115                 int slot_inc = 1;
3116
3117                 if (!(ctx->output_mask & (1ull << i)))
3118                         continue;
3119
3120                 if (i == VARYING_SLOT_CLIP_DIST1 ||
3121                     i == VARYING_SLOT_CULL_DIST1)
3122                         continue;
3123
3124                 if (i == VARYING_SLOT_CLIP_DIST0 ||
3125                     i == VARYING_SLOT_CULL_DIST0) {
3126                         /* pack clip and cull into a single set of slots */
3127                         if (clip_cull_slot == -1) {
3128                                 clip_cull_slot = idx;
3129                                 if (ctx->num_output_clips + ctx->num_output_culls > 4)
3130                                         slot_inc = 2;
3131                         } else {
3132                                 slot = clip_cull_slot;
3133                                 slot_inc = 0;
3134                         }
3135                         if (i == VARYING_SLOT_CLIP_DIST0)
3136                                 length = ctx->num_output_clips;
3137                         if (i == VARYING_SLOT_CULL_DIST0) {
3138                                 start = ctx->num_output_clips;
3139                                 length = ctx->num_output_culls;
3140                         }
3141                 }
3142                 for (unsigned j = 0; j < length; j++) {
3143                         LLVMValueRef out_val = LLVMBuildLoad(ctx->builder,
3144                                                              out_ptr[j], "");
3145                         LLVMValueRef voffset = LLVMConstInt(ctx->i32, (slot * 4 + j + start) * ctx->gs_max_out_vertices, false);
3146                         voffset = LLVMBuildAdd(ctx->builder, voffset, gs_next_vertex, "");
3147                         voffset = LLVMBuildMul(ctx->builder, voffset, LLVMConstInt(ctx->i32, 4, false), "");
3148
3149                         out_val = LLVMBuildBitCast(ctx->builder, out_val, ctx->i32, "");
3150
3151                         ac_build_buffer_store_dword(&ctx->ac, ctx->gsvs_ring,
3152                                                     out_val, 1,
3153                                                     voffset, ctx->gs2vs_offset, 0,
3154                                                     1, 1, true, true);
3155                 }
3156                 idx += slot_inc;
3157         }
3158
3159         gs_next_vertex = LLVMBuildAdd(ctx->builder, gs_next_vertex,
3160                                       ctx->i32one, "");
3161         LLVMBuildStore(ctx->builder, gs_next_vertex, ctx->gs_next_vertex);
3162
3163         ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_EMIT | AC_SENDMSG_GS | (0 << 8), ctx->gs_wave_id);
3164 }
3165
3166 static void
3167 visit_end_primitive(struct nir_to_llvm_context *ctx,
3168                     nir_intrinsic_instr *instr)
3169 {
3170         ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_CUT | AC_SENDMSG_GS | (0 << 8), ctx->gs_wave_id);
3171 }
3172
3173 static void visit_intrinsic(struct nir_to_llvm_context *ctx,
3174                             nir_intrinsic_instr *instr)
3175 {
3176         LLVMValueRef result = NULL;
3177
3178         switch (instr->intrinsic) {
3179         case nir_intrinsic_load_work_group_id: {
3180                 result = ctx->workgroup_ids;
3181                 break;
3182         }
3183         case nir_intrinsic_load_base_vertex: {
3184                 result = ctx->base_vertex;
3185                 break;
3186         }
3187         case nir_intrinsic_load_vertex_id_zero_base: {
3188                 result = ctx->vertex_id;
3189                 break;
3190         }
3191         case nir_intrinsic_load_local_invocation_id: {
3192                 result = ctx->local_invocation_ids;
3193                 break;
3194         }
3195         case nir_intrinsic_load_base_instance:
3196                 result = ctx->start_instance;
3197                 break;
3198         case nir_intrinsic_load_draw_id:
3199                 result = ctx->draw_index;
3200                 break;
3201         case nir_intrinsic_load_invocation_id:
3202                 result = ctx->gs_invocation_id;
3203                 break;
3204         case nir_intrinsic_load_primitive_id:
3205                 if (ctx->stage == MESA_SHADER_GEOMETRY)
3206                         result = ctx->gs_prim_id;
3207                 else
3208                         fprintf(stderr, "Unknown primitive id intrinsic: %d", ctx->stage);
3209                 break;
3210         case nir_intrinsic_load_sample_id:
3211                 ctx->shader_info->fs.force_persample = true;
3212                 result = unpack_param(ctx, ctx->ancillary, 8, 4);
3213                 break;
3214         case nir_intrinsic_load_sample_pos:
3215                 ctx->shader_info->fs.force_persample = true;
3216                 result = load_sample_pos(ctx);
3217                 break;
3218         case nir_intrinsic_load_sample_mask_in:
3219                 result = ctx->sample_coverage;
3220                 break;
3221         case nir_intrinsic_load_front_face:
3222                 result = ctx->front_face;
3223                 break;
3224         case nir_intrinsic_load_instance_id:
3225                 result = ctx->instance_id;
3226                 ctx->shader_info->vs.vgpr_comp_cnt = MAX2(3,
3227                                             ctx->shader_info->vs.vgpr_comp_cnt);
3228                 break;
3229         case nir_intrinsic_load_num_work_groups:
3230                 result = ctx->num_work_groups;
3231                 break;
3232         case nir_intrinsic_load_local_invocation_index:
3233                 result = visit_load_local_invocation_index(ctx);
3234                 break;
3235         case nir_intrinsic_load_push_constant:
3236                 result = visit_load_push_constant(ctx, instr);
3237                 break;
3238         case nir_intrinsic_vulkan_resource_index:
3239                 result = visit_vulkan_resource_index(ctx, instr);
3240                 break;
3241         case nir_intrinsic_store_ssbo:
3242                 visit_store_ssbo(ctx, instr);
3243                 break;
3244         case nir_intrinsic_load_ssbo:
3245                 result = visit_load_buffer(ctx, instr);
3246                 break;
3247         case nir_intrinsic_ssbo_atomic_add:
3248         case nir_intrinsic_ssbo_atomic_imin:
3249         case nir_intrinsic_ssbo_atomic_umin:
3250         case nir_intrinsic_ssbo_atomic_imax:
3251         case nir_intrinsic_ssbo_atomic_umax:
3252         case nir_intrinsic_ssbo_atomic_and:
3253         case nir_intrinsic_ssbo_atomic_or:
3254         case nir_intrinsic_ssbo_atomic_xor:
3255         case nir_intrinsic_ssbo_atomic_exchange:
3256         case nir_intrinsic_ssbo_atomic_comp_swap:
3257                 result = visit_atomic_ssbo(ctx, instr);
3258                 break;
3259         case nir_intrinsic_load_ubo:
3260                 result = visit_load_ubo_buffer(ctx, instr);
3261                 break;
3262         case nir_intrinsic_get_buffer_size:
3263                 result = visit_get_buffer_size(ctx, instr);
3264                 break;
3265         case nir_intrinsic_load_var:
3266                 result = visit_load_var(ctx, instr);
3267                 break;
3268         case nir_intrinsic_store_var:
3269                 visit_store_var(ctx, instr);
3270                 break;
3271         case nir_intrinsic_image_load:
3272                 result = visit_image_load(ctx, instr);
3273                 break;
3274         case nir_intrinsic_image_store:
3275                 visit_image_store(ctx, instr);
3276                 break;
3277         case nir_intrinsic_image_atomic_add:
3278         case nir_intrinsic_image_atomic_min:
3279         case nir_intrinsic_image_atomic_max:
3280         case nir_intrinsic_image_atomic_and:
3281         case nir_intrinsic_image_atomic_or:
3282         case nir_intrinsic_image_atomic_xor:
3283         case nir_intrinsic_image_atomic_exchange:
3284         case nir_intrinsic_image_atomic_comp_swap:
3285                 result = visit_image_atomic(ctx, instr);
3286                 break;
3287         case nir_intrinsic_image_size:
3288                 result = visit_image_size(ctx, instr);
3289                 break;
3290         case nir_intrinsic_discard:
3291                 ctx->shader_info->fs.can_discard = true;
3292                 ac_build_intrinsic(&ctx->ac, "llvm.AMDGPU.kilp",
3293                                    ctx->voidt,
3294                                    NULL, 0, AC_FUNC_ATTR_LEGACY);
3295                 break;
3296         case nir_intrinsic_discard_if:
3297                 emit_discard_if(ctx, instr);
3298                 break;
3299         case nir_intrinsic_memory_barrier:
3300                 emit_waitcnt(ctx);
3301                 break;
3302         case nir_intrinsic_barrier:
3303                 emit_barrier(ctx);
3304                 break;
3305         case nir_intrinsic_var_atomic_add:
3306         case nir_intrinsic_var_atomic_imin:
3307         case nir_intrinsic_var_atomic_umin:
3308         case nir_intrinsic_var_atomic_imax:
3309         case nir_intrinsic_var_atomic_umax:
3310         case nir_intrinsic_var_atomic_and:
3311         case nir_intrinsic_var_atomic_or:
3312         case nir_intrinsic_var_atomic_xor:
3313         case nir_intrinsic_var_atomic_exchange:
3314         case nir_intrinsic_var_atomic_comp_swap:
3315                 result = visit_var_atomic(ctx, instr);
3316                 break;
3317         case nir_intrinsic_interp_var_at_centroid:
3318         case nir_intrinsic_interp_var_at_sample:
3319         case nir_intrinsic_interp_var_at_offset:
3320                 result = visit_interp(ctx, instr);
3321                 break;
3322         case nir_intrinsic_emit_vertex:
3323                 visit_emit_vertex(ctx, instr);
3324                 break;
3325         case nir_intrinsic_end_primitive:
3326                 visit_end_primitive(ctx, instr);
3327                 break;
3328         default:
3329                 fprintf(stderr, "Unknown intrinsic: ");
3330                 nir_print_instr(&instr->instr, stderr);
3331                 fprintf(stderr, "\n");
3332                 break;
3333         }
3334         if (result) {
3335                 _mesa_hash_table_insert(ctx->defs, &instr->dest.ssa, result);
3336         }
3337 }
3338
3339 static LLVMValueRef get_sampler_desc(struct nir_to_llvm_context *ctx,
3340                                           nir_deref_var *deref,
3341                                           enum desc_type desc_type)
3342 {
3343         unsigned desc_set = deref->var->data.descriptor_set;
3344         LLVMValueRef list = ctx->descriptor_sets[desc_set];
3345         struct radv_descriptor_set_layout *layout = ctx->options->layout->set[desc_set].layout;
3346         struct radv_descriptor_set_binding_layout *binding = layout->binding + deref->var->data.binding;
3347         unsigned offset = binding->offset;
3348         unsigned stride = binding->size;
3349         unsigned type_size;
3350         LLVMBuilderRef builder = ctx->builder;
3351         LLVMTypeRef type;
3352         LLVMValueRef index = NULL;
3353         unsigned constant_index = 0;
3354
3355         assert(deref->var->data.binding < layout->binding_count);
3356
3357         switch (desc_type) {
3358         case DESC_IMAGE:
3359                 type = ctx->v8i32;
3360                 type_size = 32;
3361                 break;
3362         case DESC_FMASK:
3363                 type = ctx->v8i32;
3364                 offset += 32;
3365                 type_size = 32;
3366                 break;
3367         case DESC_SAMPLER:
3368                 type = ctx->v4i32;
3369                 if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
3370                         offset += 64;
3371
3372                 type_size = 16;
3373                 break;
3374         case DESC_BUFFER:
3375                 type = ctx->v4i32;
3376                 type_size = 16;
3377                 break;
3378         default:
3379                 unreachable("invalid desc_type\n");
3380         }
3381
3382         if (deref->deref.child) {
3383                 nir_deref_array *child = (nir_deref_array*)deref->deref.child;
3384
3385                 assert(child->deref_array_type != nir_deref_array_type_wildcard);
3386                 offset += child->base_offset * stride;
3387                 if (child->deref_array_type == nir_deref_array_type_indirect) {
3388                         index = get_src(ctx, child->indirect);
3389                 }
3390
3391                 constant_index = child->base_offset;
3392         }
3393         if (desc_type == DESC_SAMPLER && binding->immutable_samplers &&
3394             (!index || binding->immutable_samplers_equal)) {
3395                 if (binding->immutable_samplers_equal)
3396                         constant_index = 0;
3397
3398                 LLVMValueRef constants[] = {
3399                         LLVMConstInt(ctx->i32, binding->immutable_samplers[constant_index * 4 + 0], 0),
3400                         LLVMConstInt(ctx->i32, binding->immutable_samplers[constant_index * 4 + 1], 0),
3401                         LLVMConstInt(ctx->i32, binding->immutable_samplers[constant_index * 4 + 2], 0),
3402                         LLVMConstInt(ctx->i32, binding->immutable_samplers[constant_index * 4 + 3], 0),
3403                 };
3404                 return ac_build_gather_values(&ctx->ac, constants, 4);
3405         }
3406
3407         assert(stride % type_size == 0);
3408
3409         if (!index)
3410                 index = ctx->i32zero;
3411
3412         index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->i32, stride / type_size, 0), "");
3413
3414         list = ac_build_gep0(&ctx->ac, list, LLVMConstInt(ctx->i32, offset, 0));
3415         list = LLVMBuildPointerCast(builder, list, const_array(type, 0), "");
3416
3417         return ac_build_indexed_load_const(&ctx->ac, list, index);
3418 }
3419
3420 static void set_tex_fetch_args(struct nir_to_llvm_context *ctx,
3421                                struct ac_image_args *args,
3422                                nir_tex_instr *instr,
3423                                nir_texop op,
3424                                LLVMValueRef res_ptr, LLVMValueRef samp_ptr,
3425                                LLVMValueRef *param, unsigned count,
3426                                unsigned dmask)
3427 {
3428         unsigned is_rect = 0;
3429         bool da = instr->is_array || instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE;
3430
3431         if (op == nir_texop_lod)
3432                 da = false;
3433         /* Pad to power of two vector */
3434         while (count < util_next_power_of_two(count))
3435                 param[count++] = LLVMGetUndef(ctx->i32);
3436
3437         if (count > 1)
3438                 args->addr = ac_build_gather_values(&ctx->ac, param, count);
3439         else
3440                 args->addr = param[0];
3441
3442         args->resource = res_ptr;
3443         args->sampler = samp_ptr;
3444
3445         if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF && op == nir_texop_txf) {
3446                 args->addr = param[0];
3447                 return;
3448         }
3449
3450         args->dmask = dmask;
3451         args->unorm = is_rect;
3452         args->da = da;
3453 }
3454
3455 /* Disable anisotropic filtering if BASE_LEVEL == LAST_LEVEL.
3456  *
3457  * SI-CI:
3458  *   If BASE_LEVEL == LAST_LEVEL, the shader must disable anisotropic
3459  *   filtering manually. The driver sets img7 to a mask clearing
3460  *   MAX_ANISO_RATIO if BASE_LEVEL == LAST_LEVEL. The shader must do:
3461  *     s_and_b32 samp0, samp0, img7
3462  *
3463  * VI:
3464  *   The ANISO_OVERRIDE sampler field enables this fix in TA.
3465  */
3466 static LLVMValueRef sici_fix_sampler_aniso(struct nir_to_llvm_context *ctx,
3467                                            LLVMValueRef res, LLVMValueRef samp)
3468 {
3469         LLVMBuilderRef builder = ctx->builder;
3470         LLVMValueRef img7, samp0;
3471
3472         if (ctx->options->chip_class >= VI)
3473                 return samp;
3474
3475         img7 = LLVMBuildExtractElement(builder, res,
3476                                        LLVMConstInt(ctx->i32, 7, 0), "");
3477         samp0 = LLVMBuildExtractElement(builder, samp,
3478                                         LLVMConstInt(ctx->i32, 0, 0), "");
3479         samp0 = LLVMBuildAnd(builder, samp0, img7, "");
3480         return LLVMBuildInsertElement(builder, samp, samp0,
3481                                       LLVMConstInt(ctx->i32, 0, 0), "");
3482 }
3483
3484 static void tex_fetch_ptrs(struct nir_to_llvm_context *ctx,
3485                            nir_tex_instr *instr,
3486                            LLVMValueRef *res_ptr, LLVMValueRef *samp_ptr,
3487                            LLVMValueRef *fmask_ptr)
3488 {
3489         if (instr->sampler_dim  == GLSL_SAMPLER_DIM_BUF)
3490                 *res_ptr = get_sampler_desc(ctx, instr->texture, DESC_BUFFER);
3491         else
3492                 *res_ptr = get_sampler_desc(ctx, instr->texture, DESC_IMAGE);
3493         if (samp_ptr) {
3494                 if (instr->sampler)
3495                         *samp_ptr = get_sampler_desc(ctx, instr->sampler, DESC_SAMPLER);
3496                 else
3497                         *samp_ptr = get_sampler_desc(ctx, instr->texture, DESC_SAMPLER);
3498                 if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT)
3499                         *samp_ptr = sici_fix_sampler_aniso(ctx, *res_ptr, *samp_ptr);
3500         }
3501         if (fmask_ptr && !instr->sampler && (instr->op == nir_texop_txf_ms ||
3502                                              instr->op == nir_texop_samples_identical))
3503                 *fmask_ptr = get_sampler_desc(ctx, instr->texture, DESC_FMASK);
3504 }
3505
3506 static LLVMValueRef apply_round_slice(struct nir_to_llvm_context *ctx,
3507                                       LLVMValueRef coord)
3508 {
3509         coord = to_float(ctx, coord);
3510         coord = ac_build_intrinsic(&ctx->ac, "llvm.rint.f32", ctx->f32, &coord, 1, 0);
3511         coord = to_integer(ctx, coord);
3512         return coord;
3513 }
3514
3515 static void visit_tex(struct nir_to_llvm_context *ctx, nir_tex_instr *instr)
3516 {
3517         LLVMValueRef result = NULL;
3518         struct ac_image_args args = { 0 };
3519         unsigned dmask = 0xf;
3520         LLVMValueRef address[16];
3521         LLVMValueRef coords[5];
3522         LLVMValueRef coord = NULL, lod = NULL, comparator = NULL;
3523         LLVMValueRef bias = NULL, offsets = NULL;
3524         LLVMValueRef res_ptr, samp_ptr, fmask_ptr = NULL, sample_index = NULL;
3525         LLVMValueRef ddx = NULL, ddy = NULL;
3526         LLVMValueRef derivs[6];
3527         unsigned chan, count = 0;
3528         unsigned const_src = 0, num_deriv_comp = 0;
3529
3530         tex_fetch_ptrs(ctx, instr, &res_ptr, &samp_ptr, &fmask_ptr);
3531
3532         for (unsigned i = 0; i < instr->num_srcs; i++) {
3533                 switch (instr->src[i].src_type) {
3534                 case nir_tex_src_coord:
3535                         coord = get_src(ctx, instr->src[i].src);
3536                         break;
3537                 case nir_tex_src_projector:
3538                         break;
3539                 case nir_tex_src_comparator:
3540                         comparator = get_src(ctx, instr->src[i].src);
3541                         break;
3542                 case nir_tex_src_offset:
3543                         offsets = get_src(ctx, instr->src[i].src);
3544                         const_src = i;
3545                         break;
3546                 case nir_tex_src_bias:
3547                         bias = get_src(ctx, instr->src[i].src);
3548                         break;
3549                 case nir_tex_src_lod:
3550                         lod = get_src(ctx, instr->src[i].src);
3551                         break;
3552                 case nir_tex_src_ms_index:
3553                         sample_index = get_src(ctx, instr->src[i].src);
3554                         break;
3555                 case nir_tex_src_ms_mcs:
3556                         break;
3557                 case nir_tex_src_ddx:
3558                         ddx = get_src(ctx, instr->src[i].src);
3559                         num_deriv_comp = instr->src[i].src.ssa->num_components;
3560                         break;
3561                 case nir_tex_src_ddy:
3562                         ddy = get_src(ctx, instr->src[i].src);
3563                         break;
3564                 case nir_tex_src_texture_offset:
3565                 case nir_tex_src_sampler_offset:
3566                 case nir_tex_src_plane:
3567                 default:
3568                         break;
3569                 }
3570         }
3571
3572         if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
3573                 result = get_buffer_size(ctx, res_ptr, true);
3574                 goto write_result;
3575         }
3576
3577         if (instr->op == nir_texop_texture_samples) {
3578                 LLVMValueRef res, samples, is_msaa;
3579                 res = LLVMBuildBitCast(ctx->builder, res_ptr, ctx->v8i32, "");
3580                 samples = LLVMBuildExtractElement(ctx->builder, res,
3581                                                   LLVMConstInt(ctx->i32, 3, false), "");
3582                 is_msaa = LLVMBuildLShr(ctx->builder, samples,
3583                                         LLVMConstInt(ctx->i32, 28, false), "");
3584                 is_msaa = LLVMBuildAnd(ctx->builder, is_msaa,
3585                                        LLVMConstInt(ctx->i32, 0xe, false), "");
3586                 is_msaa = LLVMBuildICmp(ctx->builder, LLVMIntEQ, is_msaa,
3587                                         LLVMConstInt(ctx->i32, 0xe, false), "");
3588
3589                 samples = LLVMBuildLShr(ctx->builder, samples,
3590                                         LLVMConstInt(ctx->i32, 16, false), "");
3591                 samples = LLVMBuildAnd(ctx->builder, samples,
3592                                        LLVMConstInt(ctx->i32, 0xf, false), "");
3593                 samples = LLVMBuildShl(ctx->builder, ctx->i32one,
3594                                        samples, "");
3595                 samples = LLVMBuildSelect(ctx->builder, is_msaa, samples,
3596                                           ctx->i32one, "");
3597                 result = samples;
3598                 goto write_result;
3599         }
3600
3601         if (coord)
3602                 for (chan = 0; chan < instr->coord_components; chan++)
3603                         coords[chan] = llvm_extract_elem(ctx, coord, chan);
3604
3605         if (offsets && instr->op != nir_texop_txf) {
3606                 LLVMValueRef offset[3], pack;
3607                 for (chan = 0; chan < 3; ++chan)
3608                         offset[chan] = ctx->i32zero;
3609
3610                 args.offset = true;
3611                 for (chan = 0; chan < get_llvm_num_components(offsets); chan++) {
3612                         offset[chan] = llvm_extract_elem(ctx, offsets, chan);
3613                         offset[chan] = LLVMBuildAnd(ctx->builder, offset[chan],
3614                                                     LLVMConstInt(ctx->i32, 0x3f, false), "");
3615                         if (chan)
3616                                 offset[chan] = LLVMBuildShl(ctx->builder, offset[chan],
3617                                                             LLVMConstInt(ctx->i32, chan * 8, false), "");
3618                 }
3619                 pack = LLVMBuildOr(ctx->builder, offset[0], offset[1], "");
3620                 pack = LLVMBuildOr(ctx->builder, pack, offset[2], "");
3621                 address[count++] = pack;
3622
3623         }
3624         /* pack LOD bias value */
3625         if (instr->op == nir_texop_txb && bias) {
3626                 address[count++] = bias;
3627         }
3628
3629         /* Pack depth comparison value */
3630         if (instr->is_shadow && comparator) {
3631                 address[count++] = llvm_extract_elem(ctx, comparator, 0);
3632         }
3633
3634         /* pack derivatives */
3635         if (ddx || ddy) {
3636                 switch (instr->sampler_dim) {
3637                 case GLSL_SAMPLER_DIM_3D:
3638                 case GLSL_SAMPLER_DIM_CUBE:
3639                         num_deriv_comp = 3;
3640                         break;
3641                 case GLSL_SAMPLER_DIM_2D:
3642                 default:
3643                         num_deriv_comp = 2;
3644                         break;
3645                 case GLSL_SAMPLER_DIM_1D:
3646                         num_deriv_comp = 1;
3647                         break;
3648                 }
3649
3650                 for (unsigned i = 0; i < num_deriv_comp; i++) {
3651                         derivs[i * 2] = to_float(ctx, llvm_extract_elem(ctx, ddx, i));
3652                         derivs[i * 2 + 1] = to_float(ctx, llvm_extract_elem(ctx, ddy, i));
3653                 }
3654         }
3655
3656         if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && coord) {
3657                 for (chan = 0; chan < instr->coord_components; chan++)
3658                         coords[chan] = to_float(ctx, coords[chan]);
3659                 if (instr->coord_components == 3)
3660                         coords[3] = LLVMGetUndef(ctx->f32);
3661                 ac_prepare_cube_coords(&ctx->ac,
3662                         instr->op == nir_texop_txd, instr->is_array,
3663                         coords, derivs);
3664                 if (num_deriv_comp)
3665                         num_deriv_comp--;
3666         }
3667
3668         if (ddx || ddy) {
3669                 for (unsigned i = 0; i < num_deriv_comp * 2; i++)
3670                         address[count++] = derivs[i];
3671         }
3672
3673         /* Pack texture coordinates */
3674         if (coord) {
3675                 address[count++] = coords[0];
3676                 if (instr->coord_components > 1) {
3677                         if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && instr->is_array && instr->op != nir_texop_txf) {
3678                                 coords[1] = apply_round_slice(ctx, coords[1]);
3679                         }
3680                         address[count++] = coords[1];
3681                 }
3682                 if (instr->coord_components > 2) {
3683                         /* This seems like a bit of a hack - but it passes Vulkan CTS with it */
3684                         if (instr->sampler_dim != GLSL_SAMPLER_DIM_3D && instr->op != nir_texop_txf) {
3685                                 coords[2] = apply_round_slice(ctx, coords[2]);
3686                         }
3687                         address[count++] = coords[2];
3688                 }
3689         }
3690
3691         /* Pack LOD */
3692         if ((instr->op == nir_texop_txl || instr->op == nir_texop_txf) && lod) {
3693                 address[count++] = lod;
3694         } else if (instr->op == nir_texop_txf_ms && sample_index) {
3695                 address[count++] = sample_index;
3696         } else if(instr->op == nir_texop_txs) {
3697                 count = 0;
3698                 if (lod)
3699                         address[count++] = lod;
3700                 else
3701                         address[count++] = ctx->i32zero;
3702         }
3703
3704         for (chan = 0; chan < count; chan++) {
3705                 address[chan] = LLVMBuildBitCast(ctx->builder,
3706                                                  address[chan], ctx->i32, "");
3707         }
3708
3709         if (instr->op == nir_texop_samples_identical) {
3710                 LLVMValueRef txf_address[4];
3711                 struct ac_image_args txf_args = { 0 };
3712                 unsigned txf_count = count;
3713                 memcpy(txf_address, address, sizeof(txf_address));
3714
3715                 if (!instr->is_array)
3716                         txf_address[2] = ctx->i32zero;
3717                 txf_address[3] = ctx->i32zero;
3718
3719                 set_tex_fetch_args(ctx, &txf_args, instr, nir_texop_txf,
3720                                    fmask_ptr, NULL,
3721                                    txf_address, txf_count, 0xf);
3722
3723                 result = build_tex_intrinsic(ctx, instr, &txf_args);
3724
3725                 result = LLVMBuildExtractElement(ctx->builder, result, ctx->i32zero, "");
3726                 result = emit_int_cmp(ctx, LLVMIntEQ, result, ctx->i32zero);
3727                 goto write_result;
3728         }
3729
3730         if (instr->sampler_dim == GLSL_SAMPLER_DIM_MS &&
3731             instr->op != nir_texop_txs) {
3732                 unsigned sample_chan = instr->is_array ? 3 : 2;
3733                 address[sample_chan] = adjust_sample_index_using_fmask(ctx,
3734                                                                        address[0],
3735                                                                        address[1],
3736                                                                        instr->is_array ? address[2] : NULL,
3737                                                                        address[sample_chan],
3738                                                                        fmask_ptr);
3739         }
3740
3741         if (offsets && instr->op == nir_texop_txf) {
3742                 nir_const_value *const_offset =
3743                         nir_src_as_const_value(instr->src[const_src].src);
3744                 int num_offsets = instr->src[const_src].src.ssa->num_components;
3745                 assert(const_offset);
3746                 num_offsets = MIN2(num_offsets, instr->coord_components);
3747                 if (num_offsets > 2)
3748                         address[2] = LLVMBuildAdd(ctx->builder,
3749                                                   address[2], LLVMConstInt(ctx->i32, const_offset->i32[2], false), "");
3750                 if (num_offsets > 1)
3751                         address[1] = LLVMBuildAdd(ctx->builder,
3752                                                   address[1], LLVMConstInt(ctx->i32, const_offset->i32[1], false), "");
3753                 address[0] = LLVMBuildAdd(ctx->builder,
3754                                           address[0], LLVMConstInt(ctx->i32, const_offset->i32[0], false), "");
3755
3756         }
3757
3758         /* TODO TG4 support */
3759         if (instr->op == nir_texop_tg4) {
3760                 if (instr->is_shadow)
3761                         dmask = 1;
3762                 else
3763                         dmask = 1 << instr->component;
3764         }
3765         set_tex_fetch_args(ctx, &args, instr, instr->op,
3766                            res_ptr, samp_ptr, address, count, dmask);
3767
3768         result = build_tex_intrinsic(ctx, instr, &args);
3769
3770         if (instr->op == nir_texop_query_levels)
3771                 result = LLVMBuildExtractElement(ctx->builder, result, LLVMConstInt(ctx->i32, 3, false), "");
3772         else if (instr->is_shadow && instr->op != nir_texop_txs && instr->op != nir_texop_lod && instr->op != nir_texop_tg4)
3773                 result = LLVMBuildExtractElement(ctx->builder, result, ctx->i32zero, "");
3774         else if (instr->op == nir_texop_txs &&
3775                  instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
3776                  instr->is_array) {
3777                 LLVMValueRef two = LLVMConstInt(ctx->i32, 2, false);
3778                 LLVMValueRef six = LLVMConstInt(ctx->i32, 6, false);
3779                 LLVMValueRef z = LLVMBuildExtractElement(ctx->builder, result, two, "");
3780                 z = LLVMBuildSDiv(ctx->builder, z, six, "");
3781                 result = LLVMBuildInsertElement(ctx->builder, result, z, two, "");
3782         } else if (instr->dest.ssa.num_components != 4)
3783                 result = trim_vector(ctx, result, instr->dest.ssa.num_components);
3784
3785 write_result:
3786         if (result) {
3787                 assert(instr->dest.is_ssa);
3788                 result = to_integer(ctx, result);
3789                 _mesa_hash_table_insert(ctx->defs, &instr->dest.ssa, result);
3790         }
3791 }
3792
3793
3794 static void visit_phi(struct nir_to_llvm_context *ctx, nir_phi_instr *instr)
3795 {
3796         LLVMTypeRef type = get_def_type(ctx, &instr->dest.ssa);
3797         LLVMValueRef result = LLVMBuildPhi(ctx->builder, type, "");
3798
3799         _mesa_hash_table_insert(ctx->defs, &instr->dest.ssa, result);
3800         _mesa_hash_table_insert(ctx->phis, instr, result);
3801 }
3802
3803 static void visit_post_phi(struct nir_to_llvm_context *ctx,
3804                            nir_phi_instr *instr,
3805                            LLVMValueRef llvm_phi)
3806 {
3807         nir_foreach_phi_src(src, instr) {
3808                 LLVMBasicBlockRef block = get_block(ctx, src->pred);
3809                 LLVMValueRef llvm_src = get_src(ctx, src->src);
3810
3811                 LLVMAddIncoming(llvm_phi, &llvm_src, &block, 1);
3812         }
3813 }
3814
3815 static void phi_post_pass(struct nir_to_llvm_context *ctx)
3816 {
3817         struct hash_entry *entry;
3818         hash_table_foreach(ctx->phis, entry) {
3819                 visit_post_phi(ctx, (nir_phi_instr*)entry->key,
3820                                (LLVMValueRef)entry->data);
3821         }
3822 }
3823
3824
3825 static void visit_ssa_undef(struct nir_to_llvm_context *ctx,
3826                             nir_ssa_undef_instr *instr)
3827 {
3828         unsigned num_components = instr->def.num_components;
3829         LLVMValueRef undef;
3830
3831         if (num_components == 1)
3832                 undef = LLVMGetUndef(ctx->i32);
3833         else {
3834                 undef = LLVMGetUndef(LLVMVectorType(ctx->i32, num_components));
3835         }
3836         _mesa_hash_table_insert(ctx->defs, &instr->def, undef);
3837 }
3838
3839 static void visit_jump(struct nir_to_llvm_context *ctx,
3840                        nir_jump_instr *instr)
3841 {
3842         switch (instr->type) {
3843         case nir_jump_break:
3844                 LLVMBuildBr(ctx->builder, ctx->break_block);
3845                 LLVMClearInsertionPosition(ctx->builder);
3846                 break;
3847         case nir_jump_continue:
3848                 LLVMBuildBr(ctx->builder, ctx->continue_block);
3849                 LLVMClearInsertionPosition(ctx->builder);
3850                 break;
3851         default:
3852                 fprintf(stderr, "Unknown NIR jump instr: ");
3853                 nir_print_instr(&instr->instr, stderr);
3854                 fprintf(stderr, "\n");
3855                 abort();
3856         }
3857 }
3858
3859 static void visit_cf_list(struct nir_to_llvm_context *ctx,
3860                           struct exec_list *list);
3861
3862 static void visit_block(struct nir_to_llvm_context *ctx, nir_block *block)
3863 {
3864         LLVMBasicBlockRef llvm_block = LLVMGetInsertBlock(ctx->builder);
3865         nir_foreach_instr(instr, block)
3866         {
3867                 switch (instr->type) {
3868                 case nir_instr_type_alu:
3869                         visit_alu(ctx, nir_instr_as_alu(instr));
3870                         break;
3871                 case nir_instr_type_load_const:
3872                         visit_load_const(ctx, nir_instr_as_load_const(instr));
3873                         break;
3874                 case nir_instr_type_intrinsic:
3875                         visit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
3876                         break;
3877                 case nir_instr_type_tex:
3878                         visit_tex(ctx, nir_instr_as_tex(instr));
3879                         break;
3880                 case nir_instr_type_phi:
3881                         visit_phi(ctx, nir_instr_as_phi(instr));
3882                         break;
3883                 case nir_instr_type_ssa_undef:
3884                         visit_ssa_undef(ctx, nir_instr_as_ssa_undef(instr));
3885                         break;
3886                 case nir_instr_type_jump:
3887                         visit_jump(ctx, nir_instr_as_jump(instr));
3888                         break;
3889                 default:
3890                         fprintf(stderr, "Unknown NIR instr type: ");
3891                         nir_print_instr(instr, stderr);
3892                         fprintf(stderr, "\n");
3893                         abort();
3894                 }
3895         }
3896
3897         _mesa_hash_table_insert(ctx->defs, block, llvm_block);
3898 }
3899
3900 static void visit_if(struct nir_to_llvm_context *ctx, nir_if *if_stmt)
3901 {
3902         LLVMValueRef value = get_src(ctx, if_stmt->condition);
3903
3904         LLVMBasicBlockRef merge_block =
3905             LLVMAppendBasicBlockInContext(ctx->context, ctx->main_function, "");
3906         LLVMBasicBlockRef if_block =
3907             LLVMAppendBasicBlockInContext(ctx->context, ctx->main_function, "");
3908         LLVMBasicBlockRef else_block = merge_block;
3909         if (!exec_list_is_empty(&if_stmt->else_list))
3910                 else_block = LLVMAppendBasicBlockInContext(
3911                     ctx->context, ctx->main_function, "");
3912
3913         LLVMValueRef cond = LLVMBuildICmp(ctx->builder, LLVMIntNE, value,
3914                                           LLVMConstInt(ctx->i32, 0, false), "");
3915         LLVMBuildCondBr(ctx->builder, cond, if_block, else_block);
3916
3917         LLVMPositionBuilderAtEnd(ctx->builder, if_block);
3918         visit_cf_list(ctx, &if_stmt->then_list);
3919         if (LLVMGetInsertBlock(ctx->builder))
3920                 LLVMBuildBr(ctx->builder, merge_block);
3921
3922         if (!exec_list_is_empty(&if_stmt->else_list)) {
3923                 LLVMPositionBuilderAtEnd(ctx->builder, else_block);
3924                 visit_cf_list(ctx, &if_stmt->else_list);
3925                 if (LLVMGetInsertBlock(ctx->builder))
3926                         LLVMBuildBr(ctx->builder, merge_block);
3927         }
3928
3929         LLVMPositionBuilderAtEnd(ctx->builder, merge_block);
3930 }
3931
3932 static void visit_loop(struct nir_to_llvm_context *ctx, nir_loop *loop)
3933 {
3934         LLVMBasicBlockRef continue_parent = ctx->continue_block;
3935         LLVMBasicBlockRef break_parent = ctx->break_block;
3936
3937         ctx->continue_block =
3938             LLVMAppendBasicBlockInContext(ctx->context, ctx->main_function, "");
3939         ctx->break_block =
3940             LLVMAppendBasicBlockInContext(ctx->context, ctx->main_function, "");
3941
3942         LLVMBuildBr(ctx->builder, ctx->continue_block);
3943         LLVMPositionBuilderAtEnd(ctx->builder, ctx->continue_block);
3944         visit_cf_list(ctx, &loop->body);
3945
3946         if (LLVMGetInsertBlock(ctx->builder))
3947                 LLVMBuildBr(ctx->builder, ctx->continue_block);
3948         LLVMPositionBuilderAtEnd(ctx->builder, ctx->break_block);
3949
3950         ctx->continue_block = continue_parent;
3951         ctx->break_block = break_parent;
3952 }
3953
3954 static void visit_cf_list(struct nir_to_llvm_context *ctx,
3955                           struct exec_list *list)
3956 {
3957         foreach_list_typed(nir_cf_node, node, node, list)
3958         {
3959                 switch (node->type) {
3960                 case nir_cf_node_block:
3961                         visit_block(ctx, nir_cf_node_as_block(node));
3962                         break;
3963
3964                 case nir_cf_node_if:
3965                         visit_if(ctx, nir_cf_node_as_if(node));
3966                         break;
3967
3968                 case nir_cf_node_loop:
3969                         visit_loop(ctx, nir_cf_node_as_loop(node));
3970                         break;
3971
3972                 default:
3973                         assert(0);
3974                 }
3975         }
3976 }
3977
3978 static void
3979 handle_vs_input_decl(struct nir_to_llvm_context *ctx,
3980                      struct nir_variable *variable)
3981 {
3982         LLVMValueRef t_list_ptr = ctx->vertex_buffers;
3983         LLVMValueRef t_offset;
3984         LLVMValueRef t_list;
3985         LLVMValueRef args[3];
3986         LLVMValueRef input;
3987         LLVMValueRef buffer_index;
3988         int index = variable->data.location - VERT_ATTRIB_GENERIC0;
3989         int idx = variable->data.location;
3990         unsigned attrib_count = glsl_count_attribute_slots(variable->type, true);
3991
3992         variable->data.driver_location = idx * 4;
3993
3994         if (ctx->options->key.vs.instance_rate_inputs & (1u << index)) {
3995                 buffer_index = LLVMBuildAdd(ctx->builder, ctx->instance_id,
3996                                             ctx->start_instance, "");
3997                 ctx->shader_info->vs.vgpr_comp_cnt = MAX2(3,
3998                                             ctx->shader_info->vs.vgpr_comp_cnt);
3999         } else
4000                 buffer_index = LLVMBuildAdd(ctx->builder, ctx->vertex_id,
4001                                             ctx->base_vertex, "");
4002
4003         for (unsigned i = 0; i < attrib_count; ++i, ++idx) {
4004                 t_offset = LLVMConstInt(ctx->i32, index + i, false);
4005
4006                 t_list = ac_build_indexed_load_const(&ctx->ac, t_list_ptr, t_offset);
4007                 args[0] = t_list;
4008                 args[1] = LLVMConstInt(ctx->i32, 0, false);
4009                 args[2] = buffer_index;
4010                 input = ac_build_intrinsic(&ctx->ac,
4011                         "llvm.SI.vs.load.input", ctx->v4f32, args, 3,
4012                         AC_FUNC_ATTR_READNONE | AC_FUNC_ATTR_NOUNWIND |
4013                         AC_FUNC_ATTR_LEGACY);
4014
4015                 for (unsigned chan = 0; chan < 4; chan++) {
4016                         LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, false);
4017                         ctx->inputs[radeon_llvm_reg_index_soa(idx, chan)] =
4018                                 to_integer(ctx, LLVMBuildExtractElement(ctx->builder,
4019                                                         input, llvm_chan, ""));
4020                 }
4021         }
4022 }
4023
4024 static void
4025 handle_gs_input_decl(struct nir_to_llvm_context *ctx,
4026                      struct nir_variable *variable)
4027 {
4028         int idx = variable->data.location;
4029
4030         if (idx == VARYING_SLOT_CLIP_DIST0 ||
4031             idx == VARYING_SLOT_CULL_DIST0) {
4032                 int length = glsl_get_length(glsl_get_array_element(variable->type));
4033                 if (idx == VARYING_SLOT_CLIP_DIST0)
4034                         ctx->num_input_clips = length;
4035                 else
4036                         ctx->num_input_culls = length;
4037         }
4038 }
4039
4040 static void interp_fs_input(struct nir_to_llvm_context *ctx,
4041                             unsigned attr,
4042                             LLVMValueRef interp_param,
4043                             LLVMValueRef prim_mask,
4044                             LLVMValueRef result[4])
4045 {
4046         LLVMValueRef attr_number;
4047         unsigned chan;
4048         LLVMValueRef i, j;
4049         bool interp = interp_param != NULL;
4050
4051         attr_number = LLVMConstInt(ctx->i32, attr, false);
4052
4053         /* fs.constant returns the param from the middle vertex, so it's not
4054          * really useful for flat shading. It's meant to be used for custom
4055          * interpolation (but the intrinsic can't fetch from the other two
4056          * vertices).
4057          *
4058          * Luckily, it doesn't matter, because we rely on the FLAT_SHADE state
4059          * to do the right thing. The only reason we use fs.constant is that
4060          * fs.interp cannot be used on integers, because they can be equal
4061          * to NaN.
4062          */
4063         if (interp) {
4064                 interp_param = LLVMBuildBitCast(ctx->builder, interp_param,
4065                                                 LLVMVectorType(ctx->f32, 2), "");
4066
4067                 i = LLVMBuildExtractElement(ctx->builder, interp_param,
4068                                                 ctx->i32zero, "");
4069                 j = LLVMBuildExtractElement(ctx->builder, interp_param,
4070                                                 ctx->i32one, "");
4071         }
4072
4073         for (chan = 0; chan < 4; chan++) {
4074                 LLVMValueRef llvm_chan = LLVMConstInt(ctx->i32, chan, false);
4075
4076                 if (interp) {
4077                         result[chan] = ac_build_fs_interp(&ctx->ac,
4078                                                           llvm_chan,
4079                                                           attr_number,
4080                                                           prim_mask, i, j);
4081                 } else {
4082                         result[chan] = ac_build_fs_interp_mov(&ctx->ac,
4083                                                               LLVMConstInt(ctx->i32, 2, false),
4084                                                               llvm_chan,
4085                                                               attr_number,
4086                                                               prim_mask);
4087                 }
4088         }
4089 }
4090
4091 static void
4092 handle_fs_input_decl(struct nir_to_llvm_context *ctx,
4093                      struct nir_variable *variable)
4094 {
4095         int idx = variable->data.location;
4096         unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
4097         LLVMValueRef interp;
4098
4099         variable->data.driver_location = idx * 4;
4100         ctx->input_mask |= ((1ull << attrib_count) - 1) << variable->data.location;
4101
4102         if (glsl_get_base_type(glsl_without_array(variable->type)) == GLSL_TYPE_FLOAT) {
4103                 unsigned interp_type;
4104                 if (variable->data.sample) {
4105                         interp_type = INTERP_SAMPLE;
4106                         ctx->shader_info->fs.force_persample = true;
4107                 } else if (variable->data.centroid)
4108                         interp_type = INTERP_CENTROID;
4109                 else
4110                         interp_type = INTERP_CENTER;
4111
4112                 interp = lookup_interp_param(ctx, variable->data.interpolation, interp_type);
4113         } else
4114                 interp = NULL;
4115
4116         for (unsigned i = 0; i < attrib_count; ++i)
4117                 ctx->inputs[radeon_llvm_reg_index_soa(idx + i, 0)] = interp;
4118
4119 }
4120
4121 static void
4122 handle_shader_input_decl(struct nir_to_llvm_context *ctx,
4123                          struct nir_variable *variable)
4124 {
4125         switch (ctx->stage) {
4126         case MESA_SHADER_VERTEX:
4127                 handle_vs_input_decl(ctx, variable);
4128                 break;
4129         case MESA_SHADER_FRAGMENT:
4130                 handle_fs_input_decl(ctx, variable);
4131                 break;
4132         case MESA_SHADER_GEOMETRY:
4133                 handle_gs_input_decl(ctx, variable);
4134                 break;
4135         default:
4136                 break;
4137         }
4138
4139 }
4140
4141 static void
4142 handle_fs_inputs_pre(struct nir_to_llvm_context *ctx,
4143                      struct nir_shader *nir)
4144 {
4145         unsigned index = 0;
4146         for (unsigned i = 0; i < RADEON_LLVM_MAX_INPUTS; ++i) {
4147                 LLVMValueRef interp_param;
4148                 LLVMValueRef *inputs = ctx->inputs +radeon_llvm_reg_index_soa(i, 0);
4149
4150                 if (!(ctx->input_mask & (1ull << i)))
4151                         continue;
4152
4153                 if (i >= VARYING_SLOT_VAR0 || i == VARYING_SLOT_PNTC ||
4154                     i == VARYING_SLOT_PRIMITIVE_ID || i == VARYING_SLOT_LAYER) {
4155                         interp_param = *inputs;
4156                         interp_fs_input(ctx, index, interp_param, ctx->prim_mask,
4157                                         inputs);
4158
4159                         if (!interp_param)
4160                                 ctx->shader_info->fs.flat_shaded_mask |= 1u << index;
4161                         ++index;
4162                 } else if (i == VARYING_SLOT_POS) {
4163                         for(int i = 0; i < 3; ++i)
4164                                 inputs[i] = ctx->frag_pos[i];
4165
4166                         inputs[3] = ac_build_fdiv(&ctx->ac, ctx->f32one, ctx->frag_pos[3]);
4167                 }
4168         }
4169         ctx->shader_info->fs.num_interp = index;
4170         if (ctx->input_mask & (1 << VARYING_SLOT_PNTC))
4171                 ctx->shader_info->fs.has_pcoord = true;
4172         if (ctx->input_mask & (1 << VARYING_SLOT_PRIMITIVE_ID))
4173                 ctx->shader_info->fs.prim_id_input = true;
4174         if (ctx->input_mask & (1 << VARYING_SLOT_LAYER))
4175                 ctx->shader_info->fs.layer_input = true;
4176         ctx->shader_info->fs.input_mask = ctx->input_mask >> VARYING_SLOT_VAR0;
4177 }
4178
4179 static LLVMValueRef
4180 ac_build_alloca(struct nir_to_llvm_context *ctx,
4181                 LLVMTypeRef type,
4182                 const char *name)
4183 {
4184         LLVMBuilderRef builder = ctx->builder;
4185         LLVMBasicBlockRef current_block = LLVMGetInsertBlock(builder);
4186         LLVMValueRef function = LLVMGetBasicBlockParent(current_block);
4187         LLVMBasicBlockRef first_block = LLVMGetEntryBasicBlock(function);
4188         LLVMValueRef first_instr = LLVMGetFirstInstruction(first_block);
4189         LLVMBuilderRef first_builder = LLVMCreateBuilderInContext(ctx->context);
4190         LLVMValueRef res;
4191
4192         if (first_instr) {
4193                 LLVMPositionBuilderBefore(first_builder, first_instr);
4194         } else {
4195                 LLVMPositionBuilderAtEnd(first_builder, first_block);
4196         }
4197
4198         res = LLVMBuildAlloca(first_builder, type, name);
4199         LLVMBuildStore(builder, LLVMConstNull(type), res);
4200
4201         LLVMDisposeBuilder(first_builder);
4202
4203         return res;
4204 }
4205
4206 static LLVMValueRef si_build_alloca_undef(struct nir_to_llvm_context *ctx,
4207                                           LLVMTypeRef type,
4208                                           const char *name)
4209 {
4210         LLVMValueRef ptr = ac_build_alloca(ctx, type, name);
4211         LLVMBuildStore(ctx->builder, LLVMGetUndef(type), ptr);
4212         return ptr;
4213 }
4214
4215 static void
4216 handle_shader_output_decl(struct nir_to_llvm_context *ctx,
4217                           struct nir_variable *variable)
4218 {
4219         int idx = variable->data.location + variable->data.index;
4220         unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
4221
4222         variable->data.driver_location = idx * 4;
4223
4224         if (ctx->stage == MESA_SHADER_VERTEX ||
4225             ctx->stage == MESA_SHADER_GEOMETRY) {
4226                 if (idx == VARYING_SLOT_CLIP_DIST0 ||
4227                     idx == VARYING_SLOT_CULL_DIST0) {
4228                         int length = glsl_get_length(variable->type);
4229                         if (idx == VARYING_SLOT_CLIP_DIST0) {
4230                                 if (ctx->stage == MESA_SHADER_VERTEX)
4231                                         ctx->shader_info->vs.outinfo.clip_dist_mask = (1 << length) - 1;
4232                                 ctx->num_output_clips = length;
4233                         } else if (idx == VARYING_SLOT_CULL_DIST0) {
4234                                 if (ctx->stage == MESA_SHADER_VERTEX)
4235                                         ctx->shader_info->vs.outinfo.cull_dist_mask = (1 << length) - 1;
4236                                 ctx->num_output_culls = length;
4237                         }
4238                         if (length > 4)
4239                                 attrib_count = 2;
4240                         else
4241                                 attrib_count = 1;
4242                 }
4243         }
4244
4245         for (unsigned i = 0; i < attrib_count; ++i) {
4246                 for (unsigned chan = 0; chan < 4; chan++) {
4247                         ctx->outputs[radeon_llvm_reg_index_soa(idx + i, chan)] =
4248                                        si_build_alloca_undef(ctx, ctx->f32, "");
4249                 }
4250         }
4251         ctx->output_mask |= ((1ull << attrib_count) - 1) << idx;
4252 }
4253
4254 static void
4255 setup_locals(struct nir_to_llvm_context *ctx,
4256              struct nir_function *func)
4257 {
4258         int i, j;
4259         ctx->num_locals = 0;
4260         nir_foreach_variable(variable, &func->impl->locals) {
4261                 unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
4262                 variable->data.driver_location = ctx->num_locals * 4;
4263                 ctx->num_locals += attrib_count;
4264         }
4265         ctx->locals = malloc(4 * ctx->num_locals * sizeof(LLVMValueRef));
4266         if (!ctx->locals)
4267             return;
4268
4269         for (i = 0; i < ctx->num_locals; i++) {
4270                 for (j = 0; j < 4; j++) {
4271                         ctx->locals[i * 4 + j] =
4272                                 si_build_alloca_undef(ctx, ctx->f32, "temp");
4273                 }
4274         }
4275 }
4276
4277 static LLVMValueRef
4278 emit_float_saturate(struct nir_to_llvm_context *ctx, LLVMValueRef v, float lo, float hi)
4279 {
4280         v = to_float(ctx, v);
4281         v = emit_intrin_2f_param(ctx, "llvm.maxnum.f32", ctx->f32, v, LLVMConstReal(ctx->f32, lo));
4282         return emit_intrin_2f_param(ctx, "llvm.minnum.f32", ctx->f32, v, LLVMConstReal(ctx->f32, hi));
4283 }
4284
4285
4286 static LLVMValueRef emit_pack_int16(struct nir_to_llvm_context *ctx,
4287                                         LLVMValueRef src0, LLVMValueRef src1)
4288 {
4289         LLVMValueRef const16 = LLVMConstInt(ctx->i32, 16, false);
4290         LLVMValueRef comp[2];
4291
4292         comp[0] = LLVMBuildAnd(ctx->builder, src0, LLVMConstInt(ctx-> i32, 65535, 0), "");
4293         comp[1] = LLVMBuildAnd(ctx->builder, src1, LLVMConstInt(ctx-> i32, 65535, 0), "");
4294         comp[1] = LLVMBuildShl(ctx->builder, comp[1], const16, "");
4295         return LLVMBuildOr(ctx->builder, comp[0], comp[1], "");
4296 }
4297
4298 /* Initialize arguments for the shader export intrinsic */
4299 static void
4300 si_llvm_init_export_args(struct nir_to_llvm_context *ctx,
4301                          LLVMValueRef *values,
4302                          unsigned target,
4303                          struct ac_export_args *args)
4304 {
4305         /* Default is 0xf. Adjusted below depending on the format. */
4306         args->enabled_channels = 0xf;
4307
4308         /* Specify whether the EXEC mask represents the valid mask */
4309         args->valid_mask = 0;
4310
4311         /* Specify whether this is the last export */
4312         args->done = 0;
4313
4314         /* Specify the target we are exporting */
4315         args->target = target;
4316
4317         args->compr = false;
4318         args->out[0] = LLVMGetUndef(ctx->f32);
4319         args->out[1] = LLVMGetUndef(ctx->f32);
4320         args->out[2] = LLVMGetUndef(ctx->f32);
4321         args->out[3] = LLVMGetUndef(ctx->f32);
4322
4323         if (!values)
4324                 return;
4325
4326         if (ctx->stage == MESA_SHADER_FRAGMENT && target >= V_008DFC_SQ_EXP_MRT) {
4327                 LLVMValueRef val[4];
4328                 unsigned index = target - V_008DFC_SQ_EXP_MRT;
4329                 unsigned col_format = (ctx->options->key.fs.col_format >> (4 * index)) & 0xf;
4330                 bool is_int8 = (ctx->options->key.fs.is_int8 >> index) & 1;
4331
4332                 switch(col_format) {
4333                 case V_028714_SPI_SHADER_ZERO:
4334                         args->enabled_channels = 0; /* writemask */
4335                         args->target = V_008DFC_SQ_EXP_NULL;
4336                         break;
4337
4338                 case V_028714_SPI_SHADER_32_R:
4339                         args->enabled_channels = 1;
4340                         args->out[0] = values[0];
4341                         break;
4342
4343                 case V_028714_SPI_SHADER_32_GR:
4344                         args->enabled_channels = 0x3;
4345                         args->out[0] = values[0];
4346                         args->out[1] = values[1];
4347                         break;
4348
4349                 case V_028714_SPI_SHADER_32_AR:
4350                         args->enabled_channels = 0x9;
4351                         args->out[0] = values[0];
4352                         args->out[3] = values[3];
4353                         break;
4354
4355                 case V_028714_SPI_SHADER_FP16_ABGR:
4356                         args->compr = 1;
4357
4358                         for (unsigned chan = 0; chan < 2; chan++) {
4359                                 LLVMValueRef pack_args[2] = {
4360                                         values[2 * chan],
4361                                         values[2 * chan + 1]
4362                                 };
4363                                 LLVMValueRef packed;
4364
4365                                 packed = ac_build_cvt_pkrtz_f16(&ctx->ac, pack_args);
4366                                 args->out[chan] = packed;
4367                         }
4368                         break;
4369
4370                 case V_028714_SPI_SHADER_UNORM16_ABGR:
4371                         for (unsigned chan = 0; chan < 4; chan++) {
4372                                 val[chan] = ac_build_clamp(&ctx->ac, values[chan]);
4373                                 val[chan] = LLVMBuildFMul(ctx->builder, val[chan],
4374                                                         LLVMConstReal(ctx->f32, 65535), "");
4375                                 val[chan] = LLVMBuildFAdd(ctx->builder, val[chan],
4376                                                         LLVMConstReal(ctx->f32, 0.5), "");
4377                                 val[chan] = LLVMBuildFPToUI(ctx->builder, val[chan],
4378                                                         ctx->i32, "");
4379                         }
4380
4381                         args->compr = 1;
4382                         args->out[0] = emit_pack_int16(ctx, val[0], val[1]);
4383                         args->out[1] = emit_pack_int16(ctx, val[2], val[3]);
4384                         break;
4385
4386                 case V_028714_SPI_SHADER_SNORM16_ABGR:
4387                         for (unsigned chan = 0; chan < 4; chan++) {
4388                                 val[chan] = emit_float_saturate(ctx, values[chan], -1, 1);
4389                                 val[chan] = LLVMBuildFMul(ctx->builder, val[chan],
4390                                                         LLVMConstReal(ctx->f32, 32767), "");
4391
4392                                 /* If positive, add 0.5, else add -0.5. */
4393                                 val[chan] = LLVMBuildFAdd(ctx->builder, val[chan],
4394                                                 LLVMBuildSelect(ctx->builder,
4395                                                         LLVMBuildFCmp(ctx->builder, LLVMRealOGE,
4396                                                                 val[chan], ctx->f32zero, ""),
4397                                                         LLVMConstReal(ctx->f32, 0.5),
4398                                                         LLVMConstReal(ctx->f32, -0.5), ""), "");
4399                                 val[chan] = LLVMBuildFPToSI(ctx->builder, val[chan], ctx->i32, "");
4400                         }
4401
4402                         args->compr = 1;
4403                         args->out[0] = emit_pack_int16(ctx, val[0], val[1]);
4404                         args->out[1] = emit_pack_int16(ctx, val[2], val[3]);
4405                         break;
4406
4407                 case V_028714_SPI_SHADER_UINT16_ABGR: {
4408                         LLVMValueRef max = LLVMConstInt(ctx->i32, is_int8 ? 255 : 65535, 0);
4409
4410                         for (unsigned chan = 0; chan < 4; chan++) {
4411                                 val[chan] = to_integer(ctx, values[chan]);
4412                                 val[chan] = emit_minmax_int(ctx, LLVMIntULT, val[chan], max);
4413                         }
4414
4415                         args->compr = 1;
4416                         args->out[0] = emit_pack_int16(ctx, val[0], val[1]);
4417                         args->out[1] = emit_pack_int16(ctx, val[2], val[3]);
4418                         break;
4419                 }
4420
4421                 case V_028714_SPI_SHADER_SINT16_ABGR: {
4422                         LLVMValueRef max = LLVMConstInt(ctx->i32, is_int8 ? 127 : 32767, 0);
4423                         LLVMValueRef min = LLVMConstInt(ctx->i32, is_int8 ? -128 : -32768, 0);
4424
4425                         /* Clamp. */
4426                         for (unsigned chan = 0; chan < 4; chan++) {
4427                                 val[chan] = to_integer(ctx, values[chan]);
4428                                 val[chan] = emit_minmax_int(ctx, LLVMIntSLT, val[chan], max);
4429                                 val[chan] = emit_minmax_int(ctx, LLVMIntSGT, val[chan], min);
4430                         }
4431
4432                         args->compr = 1;
4433                         args->out[0] = emit_pack_int16(ctx, val[0], val[1]);
4434                         args->out[1] = emit_pack_int16(ctx, val[2], val[3]);
4435                         break;
4436                 }
4437
4438                 default:
4439                 case V_028714_SPI_SHADER_32_ABGR:
4440                         memcpy(&args->out[0], values, sizeof(values[0]) * 4);
4441                         break;
4442                 }
4443         } else
4444                 memcpy(&args->out[0], values, sizeof(values[0]) * 4);
4445
4446         for (unsigned i = 0; i < 4; ++i)
4447                 args->out[i] = to_float(ctx, args->out[i]);
4448 }
4449
4450 static void
4451 handle_vs_outputs_post(struct nir_to_llvm_context *ctx,
4452                        struct ac_vs_output_info *outinfo)
4453 {
4454         uint32_t param_count = 0;
4455         unsigned target;
4456         unsigned pos_idx, num_pos_exports = 0;
4457         struct ac_export_args args, pos_args[4] = {};
4458         LLVMValueRef psize_value = NULL, layer_value = NULL, viewport_index_value = NULL;
4459         int i;
4460         const uint64_t clip_mask = ctx->output_mask & ((1ull << VARYING_SLOT_CLIP_DIST0) |
4461                                                        (1ull << VARYING_SLOT_CLIP_DIST1) |
4462                                                        (1ull << VARYING_SLOT_CULL_DIST0) |
4463                                                        (1ull << VARYING_SLOT_CULL_DIST1));
4464
4465         outinfo->prim_id_output = 0xffffffff;
4466         outinfo->layer_output = 0xffffffff;
4467         if (clip_mask) {
4468                 LLVMValueRef slots[8];
4469                 unsigned j;
4470
4471                 if (outinfo->cull_dist_mask)
4472                         outinfo->cull_dist_mask <<= ctx->num_output_clips;
4473
4474                 i = VARYING_SLOT_CLIP_DIST0;
4475                 for (j = 0; j < ctx->num_output_clips; j++)
4476                         slots[j] = to_float(ctx, LLVMBuildLoad(ctx->builder,
4477                                                                ctx->outputs[radeon_llvm_reg_index_soa(i, j)], ""));
4478                 i = VARYING_SLOT_CULL_DIST0;
4479                 for (j = 0; j < ctx->num_output_culls; j++)
4480                         slots[ctx->num_output_clips + j] = to_float(ctx, LLVMBuildLoad(ctx->builder,
4481                                                                            ctx->outputs[radeon_llvm_reg_index_soa(i, j)], ""));
4482
4483                 for (i = ctx->num_output_clips + ctx->num_output_culls; i < 8; i++)
4484                         slots[i] = LLVMGetUndef(ctx->f32);
4485
4486                 if (ctx->num_output_clips + ctx->num_output_culls > 4) {
4487                         target = V_008DFC_SQ_EXP_POS + 3;
4488                         si_llvm_init_export_args(ctx, &slots[4], target, &args);
4489                         memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS],
4490                                &args, sizeof(args));
4491                 }
4492
4493                 target = V_008DFC_SQ_EXP_POS + 2;
4494                 si_llvm_init_export_args(ctx, &slots[0], target, &args);
4495                 memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS],
4496                        &args, sizeof(args));
4497
4498         }
4499
4500         for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) {
4501                 LLVMValueRef values[4];
4502                 if (!(ctx->output_mask & (1ull << i)))
4503                         continue;
4504
4505                 for (unsigned j = 0; j < 4; j++)
4506                         values[j] = to_float(ctx, LLVMBuildLoad(ctx->builder,
4507                                               ctx->outputs[radeon_llvm_reg_index_soa(i, j)], ""));
4508
4509                 if (i == VARYING_SLOT_POS) {
4510                         target = V_008DFC_SQ_EXP_POS;
4511                 } else if (i == VARYING_SLOT_CLIP_DIST0 ||
4512                            i == VARYING_SLOT_CLIP_DIST1 ||
4513                            i == VARYING_SLOT_CULL_DIST0 ||
4514                            i == VARYING_SLOT_CULL_DIST1) {
4515                         continue;
4516                 } else if (i == VARYING_SLOT_PSIZ) {
4517                         outinfo->writes_pointsize = true;
4518                         psize_value = values[0];
4519                         continue;
4520                 } else if (i == VARYING_SLOT_LAYER) {
4521                         outinfo->writes_layer = true;
4522                         layer_value = values[0];
4523                         outinfo->layer_output = param_count;
4524                         target = V_008DFC_SQ_EXP_PARAM + param_count;
4525                         param_count++;
4526                 } else if (i == VARYING_SLOT_VIEWPORT) {
4527                         outinfo->writes_viewport_index = true;
4528                         viewport_index_value = values[0];
4529                         continue;
4530                 } else if (i == VARYING_SLOT_PRIMITIVE_ID) {
4531                         outinfo->prim_id_output = param_count;
4532                         target = V_008DFC_SQ_EXP_PARAM + param_count;
4533                         param_count++;
4534                 } else if (i >= VARYING_SLOT_VAR0) {
4535                         outinfo->export_mask |= 1u << (i - VARYING_SLOT_VAR0);
4536                         target = V_008DFC_SQ_EXP_PARAM + param_count;
4537                         param_count++;
4538                 }
4539
4540                 si_llvm_init_export_args(ctx, values, target, &args);
4541
4542                 if (target >= V_008DFC_SQ_EXP_POS &&
4543                     target <= (V_008DFC_SQ_EXP_POS + 3)) {
4544                         memcpy(&pos_args[target - V_008DFC_SQ_EXP_POS],
4545                                &args, sizeof(args));
4546                 } else {
4547                         ac_build_export(&ctx->ac, &args);
4548                 }
4549         }
4550
4551         /* We need to add the position output manually if it's missing. */
4552         if (!pos_args[0].out[0]) {
4553                 pos_args[0].enabled_channels = 0xf;
4554                 pos_args[0].valid_mask = 0;
4555                 pos_args[0].done = 0;
4556                 pos_args[0].target = V_008DFC_SQ_EXP_POS;
4557                 pos_args[0].compr = 0;
4558                 pos_args[0].out[0] = ctx->f32zero; /* X */
4559                 pos_args[0].out[1] = ctx->f32zero; /* Y */
4560                 pos_args[0].out[2] = ctx->f32zero; /* Z */
4561                 pos_args[0].out[3] = ctx->f32one;  /* W */
4562         }
4563
4564         uint32_t mask = ((outinfo->writes_pointsize == true ? 1 : 0) |
4565                          (outinfo->writes_layer == true ? 4 : 0) |
4566                          (outinfo->writes_viewport_index == true ? 8 : 0));
4567         if (mask) {
4568                 pos_args[1].enabled_channels = mask;
4569                 pos_args[1].valid_mask = 0;
4570                 pos_args[1].done = 0;
4571                 pos_args[1].target = V_008DFC_SQ_EXP_POS + 1;
4572                 pos_args[1].compr = 0;
4573                 pos_args[1].out[0] = ctx->f32zero; /* X */
4574                 pos_args[1].out[1] = ctx->f32zero; /* Y */
4575                 pos_args[1].out[2] = ctx->f32zero; /* Z */
4576                 pos_args[1].out[3] = ctx->f32zero;  /* W */
4577
4578                 if (outinfo->writes_pointsize == true)
4579                         pos_args[1].out[0] = psize_value;
4580                 if (outinfo->writes_layer == true)
4581                         pos_args[1].out[2] = layer_value;
4582                 if (outinfo->writes_viewport_index == true)
4583                         pos_args[1].out[3] = viewport_index_value;
4584         }
4585         for (i = 0; i < 4; i++) {
4586                 if (pos_args[i].out[0])
4587                         num_pos_exports++;
4588         }
4589
4590         pos_idx = 0;
4591         for (i = 0; i < 4; i++) {
4592                 if (!pos_args[i].out[0])
4593                         continue;
4594
4595                 /* Specify the target we are exporting */
4596                 pos_args[i].target = V_008DFC_SQ_EXP_POS + pos_idx++;
4597                 if (pos_idx == num_pos_exports)
4598                         pos_args[i].done = 1;
4599                 ac_build_export(&ctx->ac, &pos_args[i]);
4600         }
4601
4602         outinfo->pos_exports = num_pos_exports;
4603         outinfo->param_exports = param_count;
4604 }
4605
4606 static void
4607 handle_es_outputs_post(struct nir_to_llvm_context *ctx,
4608                        struct ac_es_output_info *outinfo)
4609 {
4610         int j;
4611         uint64_t max_output_written = 0;
4612         for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) {
4613                 LLVMValueRef *out_ptr = &ctx->outputs[i * 4];
4614                 int param_index;
4615                 int length = 4;
4616                 int start = 0;
4617                 if (!(ctx->output_mask & (1ull << i)))
4618                         continue;
4619
4620                 if (i == VARYING_SLOT_CLIP_DIST0) {
4621                         length = ctx->num_output_clips;
4622                 } else if (i == VARYING_SLOT_CULL_DIST0) {
4623                         start = ctx->num_output_clips;
4624                         length = ctx->num_output_culls;
4625                 }
4626                 param_index = shader_io_get_unique_index(i);
4627
4628                 if (param_index > max_output_written)
4629                         max_output_written = param_index;
4630
4631                 for (j = 0; j < length; j++) {
4632                         LLVMValueRef out_val = LLVMBuildLoad(ctx->builder, out_ptr[j], "");
4633                         out_val = LLVMBuildBitCast(ctx->builder, out_val, ctx->i32, "");
4634
4635                         ac_build_buffer_store_dword(&ctx->ac,
4636                                                ctx->esgs_ring,
4637                                                out_val, 1,
4638                                                NULL, ctx->es2gs_offset,
4639                                                (4 * param_index + j + start) * 4,
4640                                                1, 1, true, true);
4641                 }
4642         }
4643         outinfo->esgs_itemsize = (max_output_written + 1) * 16;
4644 }
4645
4646 static void
4647 si_export_mrt_color(struct nir_to_llvm_context *ctx,
4648                     LLVMValueRef *color, unsigned param, bool is_last)
4649 {
4650
4651         struct ac_export_args args;
4652
4653         /* Export */
4654         si_llvm_init_export_args(ctx, color, param,
4655                                  &args);
4656
4657         if (is_last) {
4658                 args.valid_mask = 1; /* whether the EXEC mask is valid */
4659                 args.done = 1; /* DONE bit */
4660         } else if (!args.enabled_channels)
4661                 return; /* unnecessary NULL export */
4662
4663         ac_build_export(&ctx->ac, &args);
4664 }
4665
4666 static void
4667 si_export_mrt_z(struct nir_to_llvm_context *ctx,
4668                 LLVMValueRef depth, LLVMValueRef stencil,
4669                 LLVMValueRef samplemask)
4670 {
4671         struct ac_export_args args;
4672
4673         args.enabled_channels = 0;
4674         args.valid_mask = 1;
4675         args.done = 1;
4676         args.target = V_008DFC_SQ_EXP_MRTZ;
4677         args.compr = false;
4678
4679         args.out[0] = LLVMGetUndef(ctx->f32); /* R, depth */
4680         args.out[1] = LLVMGetUndef(ctx->f32); /* G, stencil test val[0:7], stencil op val[8:15] */
4681         args.out[2] = LLVMGetUndef(ctx->f32); /* B, sample mask */
4682         args.out[3] = LLVMGetUndef(ctx->f32); /* A, alpha to mask */
4683
4684         if (depth) {
4685                 args.out[0] = depth;
4686                 args.enabled_channels |= 0x1;
4687         }
4688
4689         if (stencil) {
4690                 args.out[1] = stencil;
4691                 args.enabled_channels |= 0x2;
4692         }
4693
4694         if (samplemask) {
4695                 args.out[2] = samplemask;
4696                 args.enabled_channels |= 0x4;
4697         }
4698
4699         /* SI (except OLAND) has a bug that it only looks
4700          * at the X writemask component. */
4701         if (ctx->options->chip_class == SI &&
4702             ctx->options->family != CHIP_OLAND)
4703                 args.enabled_channels |= 0x1;
4704
4705         ac_build_export(&ctx->ac, &args);
4706 }
4707
4708 static void
4709 handle_fs_outputs_post(struct nir_to_llvm_context *ctx)
4710 {
4711         unsigned index = 0;
4712         LLVMValueRef depth = NULL, stencil = NULL, samplemask = NULL;
4713
4714         for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) {
4715                 LLVMValueRef values[4];
4716
4717                 if (!(ctx->output_mask & (1ull << i)))
4718                         continue;
4719
4720                 if (i == FRAG_RESULT_DEPTH) {
4721                         ctx->shader_info->fs.writes_z = true;
4722                         depth = to_float(ctx, LLVMBuildLoad(ctx->builder,
4723                                                             ctx->outputs[radeon_llvm_reg_index_soa(i, 0)], ""));
4724                 } else if (i == FRAG_RESULT_STENCIL) {
4725                         ctx->shader_info->fs.writes_stencil = true;
4726                         stencil = to_float(ctx, LLVMBuildLoad(ctx->builder,
4727                                                               ctx->outputs[radeon_llvm_reg_index_soa(i, 0)], ""));
4728                 } else if (i == FRAG_RESULT_SAMPLE_MASK) {
4729                         ctx->shader_info->fs.writes_sample_mask = true;
4730                         samplemask = to_float(ctx, LLVMBuildLoad(ctx->builder,
4731                                                                   ctx->outputs[radeon_llvm_reg_index_soa(i, 0)], ""));
4732                 } else {
4733                         bool last = false;
4734                         for (unsigned j = 0; j < 4; j++)
4735                                 values[j] = to_float(ctx, LLVMBuildLoad(ctx->builder,
4736                                                                         ctx->outputs[radeon_llvm_reg_index_soa(i, j)], ""));
4737
4738                         if (!ctx->shader_info->fs.writes_z && !ctx->shader_info->fs.writes_stencil && !ctx->shader_info->fs.writes_sample_mask)
4739                                 last = ctx->output_mask <= ((1ull << (i + 1)) - 1);
4740
4741                         si_export_mrt_color(ctx, values, V_008DFC_SQ_EXP_MRT + index, last);
4742                         index++;
4743                 }
4744         }
4745
4746         if (depth || stencil || samplemask)
4747                 si_export_mrt_z(ctx, depth, stencil, samplemask);
4748         else if (!index)
4749                 si_export_mrt_color(ctx, NULL, V_008DFC_SQ_EXP_NULL, true);
4750
4751         ctx->shader_info->fs.output_mask = index ? ((1ull << index) - 1) : 0;
4752 }
4753
4754 static void
4755 emit_gs_epilogue(struct nir_to_llvm_context *ctx)
4756 {
4757         ac_build_sendmsg(&ctx->ac, AC_SENDMSG_GS_OP_NOP | AC_SENDMSG_GS_DONE, ctx->gs_wave_id);
4758 }
4759
4760 static void
4761 handle_shader_outputs_post(struct nir_to_llvm_context *ctx)
4762 {
4763         switch (ctx->stage) {
4764         case MESA_SHADER_VERTEX:
4765                 if (ctx->options->key.vs.as_es)
4766                         handle_es_outputs_post(ctx, &ctx->shader_info->vs.es_info);
4767                 else
4768                         handle_vs_outputs_post(ctx, &ctx->shader_info->vs.outinfo);
4769                 break;
4770         case MESA_SHADER_FRAGMENT:
4771                 handle_fs_outputs_post(ctx);
4772                 break;
4773         case MESA_SHADER_GEOMETRY:
4774                 emit_gs_epilogue(ctx);
4775                 break;
4776         default:
4777                 break;
4778         }
4779 }
4780
4781 static void
4782 handle_shared_compute_var(struct nir_to_llvm_context *ctx,
4783                           struct nir_variable *variable, uint32_t *offset, int idx)
4784 {
4785         unsigned size = glsl_count_attribute_slots(variable->type, false);
4786         variable->data.driver_location = *offset;
4787         *offset += size;
4788 }
4789
4790 static void ac_llvm_finalize_module(struct nir_to_llvm_context * ctx)
4791 {
4792         LLVMPassManagerRef passmgr;
4793         /* Create the pass manager */
4794         passmgr = LLVMCreateFunctionPassManagerForModule(
4795                                                         ctx->module);
4796
4797         /* This pass should eliminate all the load and store instructions */
4798         LLVMAddPromoteMemoryToRegisterPass(passmgr);
4799
4800         /* Add some optimization passes */
4801         LLVMAddScalarReplAggregatesPass(passmgr);
4802         LLVMAddLICMPass(passmgr);
4803         LLVMAddAggressiveDCEPass(passmgr);
4804         LLVMAddCFGSimplificationPass(passmgr);
4805         LLVMAddInstructionCombiningPass(passmgr);
4806
4807         /* Run the pass */
4808         LLVMInitializeFunctionPassManager(passmgr);
4809         LLVMRunFunctionPassManager(passmgr, ctx->main_function);
4810         LLVMFinalizeFunctionPassManager(passmgr);
4811
4812         LLVMDisposeBuilder(ctx->builder);
4813         LLVMDisposePassManager(passmgr);
4814 }
4815
4816 static void
4817 ac_setup_rings(struct nir_to_llvm_context *ctx)
4818 {
4819         if (ctx->stage == MESA_SHADER_VERTEX && ctx->options->key.vs.as_es) {
4820                 ctx->esgs_ring = ac_build_indexed_load_const(&ctx->ac, ctx->ring_offsets, ctx->i32one);
4821         }
4822
4823         if (ctx->is_gs_copy_shader) {
4824                 ctx->gsvs_ring = ac_build_indexed_load_const(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->i32, 3, false));
4825         }
4826         if (ctx->stage == MESA_SHADER_GEOMETRY) {
4827                 LLVMValueRef tmp;
4828                 ctx->esgs_ring = ac_build_indexed_load_const(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->i32, 2, false));
4829                 ctx->gsvs_ring = ac_build_indexed_load_const(&ctx->ac, ctx->ring_offsets, LLVMConstInt(ctx->i32, 4, false));
4830
4831                 ctx->gsvs_ring = LLVMBuildBitCast(ctx->builder, ctx->gsvs_ring, ctx->v4i32, "");
4832
4833                 ctx->gsvs_ring = LLVMBuildInsertElement(ctx->builder, ctx->gsvs_ring, ctx->gsvs_num_entries, LLVMConstInt(ctx->i32, 2, false), "");
4834                 tmp = LLVMBuildExtractElement(ctx->builder, ctx->gsvs_ring, ctx->i32one, "");
4835                 tmp = LLVMBuildOr(ctx->builder, tmp, ctx->gsvs_ring_stride, "");
4836                 ctx->gsvs_ring = LLVMBuildInsertElement(ctx->builder, ctx->gsvs_ring, tmp, ctx->i32one, "");
4837
4838                 ctx->gsvs_ring = LLVMBuildBitCast(ctx->builder, ctx->gsvs_ring, ctx->v16i8, "");
4839         }
4840 }
4841
4842 static
4843 LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
4844                                        struct nir_shader *nir,
4845                                        struct ac_shader_variant_info *shader_info,
4846                                        const struct ac_nir_compiler_options *options)
4847 {
4848         struct nir_to_llvm_context ctx = {0};
4849         struct nir_function *func;
4850         unsigned i;
4851         ctx.options = options;
4852         ctx.shader_info = shader_info;
4853         ctx.context = LLVMContextCreate();
4854         ctx.module = LLVMModuleCreateWithNameInContext("shader", ctx.context);
4855
4856         ac_llvm_context_init(&ctx.ac, ctx.context);
4857         ctx.ac.module = ctx.module;
4858
4859         ctx.has_ds_bpermute = ctx.options->chip_class >= VI;
4860
4861         memset(shader_info, 0, sizeof(*shader_info));
4862
4863         LLVMSetTarget(ctx.module, options->supports_spill ? "amdgcn-mesa-mesa3d" : "amdgcn--");
4864
4865         LLVMTargetDataRef data_layout = LLVMCreateTargetDataLayout(tm);
4866         char *data_layout_str = LLVMCopyStringRepOfTargetData(data_layout);
4867         LLVMSetDataLayout(ctx.module, data_layout_str);
4868         LLVMDisposeTargetData(data_layout);
4869         LLVMDisposeMessage(data_layout_str);
4870
4871         setup_types(&ctx);
4872
4873         ctx.builder = LLVMCreateBuilderInContext(ctx.context);
4874         ctx.ac.builder = ctx.builder;
4875         ctx.stage = nir->stage;
4876
4877         for (i = 0; i < AC_UD_MAX_SETS; i++)
4878                 shader_info->user_sgprs_locs.descriptor_sets[i].sgpr_idx = -1;
4879         for (i = 0; i < AC_UD_MAX_UD; i++)
4880                 shader_info->user_sgprs_locs.shader_data[i].sgpr_idx = -1;
4881
4882         create_function(&ctx);
4883
4884         if (nir->stage == MESA_SHADER_COMPUTE) {
4885                 int num_shared = 0;
4886                 nir_foreach_variable(variable, &nir->shared)
4887                         num_shared++;
4888                 if (num_shared) {
4889                         int idx = 0;
4890                         uint32_t shared_size = 0;
4891                         LLVMValueRef var;
4892                         LLVMTypeRef i8p = LLVMPointerType(ctx.i8, LOCAL_ADDR_SPACE);
4893                         nir_foreach_variable(variable, &nir->shared) {
4894                                 handle_shared_compute_var(&ctx, variable, &shared_size, idx);
4895                                 idx++;
4896                         }
4897
4898                         shared_size *= 16;
4899                         var = LLVMAddGlobalInAddressSpace(ctx.module,
4900                                                           LLVMArrayType(ctx.i8, shared_size),
4901                                                           "compute_lds",
4902                                                           LOCAL_ADDR_SPACE);
4903                         LLVMSetAlignment(var, 4);
4904                         ctx.shared_memory = LLVMBuildBitCast(ctx.builder, var, i8p, "");
4905                 }
4906         } else if (nir->stage == MESA_SHADER_GEOMETRY) {
4907                 ctx.gs_next_vertex = ac_build_alloca(&ctx, ctx.i32, "gs_next_vertex");
4908
4909                 ctx.gs_max_out_vertices = nir->info->gs.vertices_out;
4910         }
4911
4912         ac_setup_rings(&ctx);
4913
4914         nir_foreach_variable(variable, &nir->inputs)
4915                 handle_shader_input_decl(&ctx, variable);
4916
4917         if (nir->stage == MESA_SHADER_FRAGMENT)
4918                 handle_fs_inputs_pre(&ctx, nir);
4919
4920         nir_foreach_variable(variable, &nir->outputs)
4921                 handle_shader_output_decl(&ctx, variable);
4922
4923         ctx.defs = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
4924                                            _mesa_key_pointer_equal);
4925         ctx.phis = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
4926                                            _mesa_key_pointer_equal);
4927
4928         func = (struct nir_function *)exec_list_get_head(&nir->functions);
4929
4930         setup_locals(&ctx, func);
4931
4932         visit_cf_list(&ctx, &func->impl->body);
4933         phi_post_pass(&ctx);
4934
4935         handle_shader_outputs_post(&ctx);
4936         LLVMBuildRetVoid(ctx.builder);
4937
4938         ac_llvm_finalize_module(&ctx);
4939         free(ctx.locals);
4940         ralloc_free(ctx.defs);
4941         ralloc_free(ctx.phis);
4942
4943         if (nir->stage == MESA_SHADER_GEOMETRY) {
4944                 shader_info->gs.gsvs_vertex_size = util_bitcount64(ctx.output_mask) * 16;
4945                 shader_info->gs.max_gsvs_emit_size = shader_info->gs.gsvs_vertex_size *
4946                         nir->info->gs.vertices_out;
4947         }
4948         return ctx.module;
4949 }
4950
4951 static void ac_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context)
4952 {
4953         unsigned *retval = (unsigned *)context;
4954         LLVMDiagnosticSeverity severity = LLVMGetDiagInfoSeverity(di);
4955         char *description = LLVMGetDiagInfoDescription(di);
4956
4957         if (severity == LLVMDSError) {
4958                 *retval = 1;
4959                 fprintf(stderr, "LLVM triggered Diagnostic Handler: %s\n",
4960                         description);
4961         }
4962
4963         LLVMDisposeMessage(description);
4964 }
4965
4966 static unsigned ac_llvm_compile(LLVMModuleRef M,
4967                                 struct ac_shader_binary *binary,
4968                                 LLVMTargetMachineRef tm)
4969 {
4970         unsigned retval = 0;
4971         char *err;
4972         LLVMContextRef llvm_ctx;
4973         LLVMMemoryBufferRef out_buffer;
4974         unsigned buffer_size;
4975         const char *buffer_data;
4976         LLVMBool mem_err;
4977
4978         /* Setup Diagnostic Handler*/
4979         llvm_ctx = LLVMGetModuleContext(M);
4980
4981         LLVMContextSetDiagnosticHandler(llvm_ctx, ac_diagnostic_handler,
4982                                         &retval);
4983
4984         /* Compile IR*/
4985         mem_err = LLVMTargetMachineEmitToMemoryBuffer(tm, M, LLVMObjectFile,
4986                                                       &err, &out_buffer);
4987
4988         /* Process Errors/Warnings */
4989         if (mem_err) {
4990                 fprintf(stderr, "%s: %s", __FUNCTION__, err);
4991                 free(err);
4992                 retval = 1;
4993                 goto out;
4994         }
4995
4996         /* Extract Shader Code*/
4997         buffer_size = LLVMGetBufferSize(out_buffer);
4998         buffer_data = LLVMGetBufferStart(out_buffer);
4999
5000         ac_elf_read(buffer_data, buffer_size, binary);
5001
5002         /* Clean up */
5003         LLVMDisposeMemoryBuffer(out_buffer);
5004
5005 out:
5006         return retval;
5007 }
5008
5009 static void ac_compile_llvm_module(LLVMTargetMachineRef tm,
5010                                    LLVMModuleRef llvm_module,
5011                                    struct ac_shader_binary *binary,
5012                                    struct ac_shader_config *config,
5013                                    struct ac_shader_variant_info *shader_info,
5014                                    gl_shader_stage stage,
5015                                    bool dump_shader, bool supports_spill)
5016 {
5017         if (dump_shader)
5018                 ac_dump_module(llvm_module);
5019
5020         memset(binary, 0, sizeof(*binary));
5021         int v = ac_llvm_compile(llvm_module, binary, tm);
5022         if (v) {
5023                 fprintf(stderr, "compile failed\n");
5024         }
5025
5026         if (dump_shader)
5027                 fprintf(stderr, "disasm:\n%s\n", binary->disasm_string);
5028
5029         ac_shader_binary_read_config(binary, config, 0, supports_spill);
5030
5031         LLVMContextRef ctx = LLVMGetModuleContext(llvm_module);
5032         LLVMDisposeModule(llvm_module);
5033         LLVMContextDispose(ctx);
5034
5035         if (stage == MESA_SHADER_FRAGMENT) {
5036                 shader_info->num_input_vgprs = 0;
5037                 if (G_0286CC_PERSP_SAMPLE_ENA(config->spi_ps_input_addr))
5038                         shader_info->num_input_vgprs += 2;
5039                 if (G_0286CC_PERSP_CENTER_ENA(config->spi_ps_input_addr))
5040                         shader_info->num_input_vgprs += 2;
5041                 if (G_0286CC_PERSP_CENTROID_ENA(config->spi_ps_input_addr))
5042                         shader_info->num_input_vgprs += 2;
5043                 if (G_0286CC_PERSP_PULL_MODEL_ENA(config->spi_ps_input_addr))
5044                         shader_info->num_input_vgprs += 3;
5045                 if (G_0286CC_LINEAR_SAMPLE_ENA(config->spi_ps_input_addr))
5046                         shader_info->num_input_vgprs += 2;
5047                 if (G_0286CC_LINEAR_CENTER_ENA(config->spi_ps_input_addr))
5048                         shader_info->num_input_vgprs += 2;
5049                 if (G_0286CC_LINEAR_CENTROID_ENA(config->spi_ps_input_addr))
5050                         shader_info->num_input_vgprs += 2;
5051                 if (G_0286CC_LINE_STIPPLE_TEX_ENA(config->spi_ps_input_addr))
5052                         shader_info->num_input_vgprs += 1;
5053                 if (G_0286CC_POS_X_FLOAT_ENA(config->spi_ps_input_addr))
5054                         shader_info->num_input_vgprs += 1;
5055                 if (G_0286CC_POS_Y_FLOAT_ENA(config->spi_ps_input_addr))
5056                         shader_info->num_input_vgprs += 1;
5057                 if (G_0286CC_POS_Z_FLOAT_ENA(config->spi_ps_input_addr))
5058                         shader_info->num_input_vgprs += 1;
5059                 if (G_0286CC_POS_W_FLOAT_ENA(config->spi_ps_input_addr))
5060                         shader_info->num_input_vgprs += 1;
5061                 if (G_0286CC_FRONT_FACE_ENA(config->spi_ps_input_addr))
5062                         shader_info->num_input_vgprs += 1;
5063                 if (G_0286CC_ANCILLARY_ENA(config->spi_ps_input_addr))
5064                         shader_info->num_input_vgprs += 1;
5065                 if (G_0286CC_SAMPLE_COVERAGE_ENA(config->spi_ps_input_addr))
5066                         shader_info->num_input_vgprs += 1;
5067                 if (G_0286CC_POS_FIXED_PT_ENA(config->spi_ps_input_addr))
5068                         shader_info->num_input_vgprs += 1;
5069         }
5070         config->num_vgprs = MAX2(config->num_vgprs, shader_info->num_input_vgprs);
5071
5072         /* +3 for scratch wave offset and VCC */
5073         config->num_sgprs = MAX2(config->num_sgprs,
5074                                  shader_info->num_input_sgprs + 3);
5075 }
5076
5077 void ac_compile_nir_shader(LLVMTargetMachineRef tm,
5078                            struct ac_shader_binary *binary,
5079                            struct ac_shader_config *config,
5080                            struct ac_shader_variant_info *shader_info,
5081                            struct nir_shader *nir,
5082                            const struct ac_nir_compiler_options *options,
5083                            bool dump_shader)
5084 {
5085
5086         LLVMModuleRef llvm_module = ac_translate_nir_to_llvm(tm, nir, shader_info,
5087                                                              options);
5088
5089         ac_compile_llvm_module(tm, llvm_module, binary, config, shader_info, nir->stage, dump_shader, options->supports_spill);
5090         switch (nir->stage) {
5091         case MESA_SHADER_COMPUTE:
5092                 for (int i = 0; i < 3; ++i)
5093                         shader_info->cs.block_size[i] = nir->info->cs.local_size[i];
5094                 break;
5095         case MESA_SHADER_FRAGMENT:
5096                 shader_info->fs.early_fragment_test = nir->info->fs.early_fragment_tests;
5097                 break;
5098         case MESA_SHADER_GEOMETRY:
5099                 shader_info->gs.vertices_in = nir->info->gs.vertices_in;
5100                 shader_info->gs.vertices_out = nir->info->gs.vertices_out;
5101                 shader_info->gs.output_prim = nir->info->gs.output_primitive;
5102                 shader_info->gs.invocations = nir->info->gs.invocations;
5103                 break;
5104         case MESA_SHADER_VERTEX:
5105                 shader_info->vs.as_es = options->key.vs.as_es;
5106                 break;
5107         default:
5108                 break;
5109         }
5110 }
5111
5112 static void
5113 ac_gs_copy_shader_emit(struct nir_to_llvm_context *ctx)
5114 {
5115         LLVMValueRef args[9];
5116         args[0] = ctx->gsvs_ring;
5117         args[1] = LLVMBuildMul(ctx->builder, ctx->vertex_id, LLVMConstInt(ctx->i32, 4, false), "");
5118         args[3] = ctx->i32zero;
5119         args[4] = ctx->i32one;  /* OFFEN */
5120         args[5] = ctx->i32zero; /* IDXEN */
5121         args[6] = ctx->i32one;  /* GLC */
5122         args[7] = ctx->i32one;  /* SLC */
5123         args[8] = ctx->i32zero; /* TFE */
5124
5125         int idx = 0;
5126         int clip_cull_slot = -1;
5127         for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) {
5128                 int length = 4;
5129                 int start = 0;
5130                 int slot = idx;
5131                 int slot_inc = 1;
5132                 if (!(ctx->output_mask & (1ull << i)))
5133                         continue;
5134
5135                 if (i == VARYING_SLOT_CLIP_DIST1 ||
5136                     i == VARYING_SLOT_CULL_DIST1)
5137                         continue;
5138
5139                 if (i == VARYING_SLOT_CLIP_DIST0 ||
5140                     i == VARYING_SLOT_CULL_DIST0) {
5141                         /* unpack clip and cull from a single set of slots */
5142                         if (clip_cull_slot == -1) {
5143                                 clip_cull_slot = idx;
5144                                 if (ctx->num_output_clips + ctx->num_output_culls > 4)
5145                                         slot_inc = 2;
5146                         } else {
5147                                 slot = clip_cull_slot;
5148                                 slot_inc = 0;
5149                         }
5150                         if (i == VARYING_SLOT_CLIP_DIST0)
5151                                 length = ctx->num_output_clips;
5152                         if (i == VARYING_SLOT_CULL_DIST0) {
5153                                 start = ctx->num_output_clips;
5154                                 length = ctx->num_output_culls;
5155                         }
5156                 }
5157
5158                 for (unsigned j = 0; j < length; j++) {
5159                         LLVMValueRef value;
5160                         args[2] = LLVMConstInt(ctx->i32,
5161                                                (slot * 4 + j + start) *
5162                                                ctx->gs_max_out_vertices * 16 * 4, false);
5163
5164                         value = ac_build_intrinsic(&ctx->ac,
5165                                                    "llvm.SI.buffer.load.dword.i32.i32",
5166                                                    ctx->i32, args, 9,
5167                                                    AC_FUNC_ATTR_READONLY |
5168                                                    AC_FUNC_ATTR_LEGACY);
5169
5170                         LLVMBuildStore(ctx->builder,
5171                                        to_float(ctx, value), ctx->outputs[radeon_llvm_reg_index_soa(i, j)]);
5172                 }
5173                 idx += slot_inc;
5174         }
5175         handle_vs_outputs_post(ctx, &ctx->shader_info->vs.outinfo);
5176 }
5177
5178 void ac_create_gs_copy_shader(LLVMTargetMachineRef tm,
5179                               struct nir_shader *geom_shader,
5180                               struct ac_shader_binary *binary,
5181                               struct ac_shader_config *config,
5182                               struct ac_shader_variant_info *shader_info,
5183                               const struct ac_nir_compiler_options *options,
5184                               bool dump_shader)
5185 {
5186         struct nir_to_llvm_context ctx = {0};
5187         ctx.context = LLVMContextCreate();
5188         ctx.module = LLVMModuleCreateWithNameInContext("shader", ctx.context);
5189         ctx.options = options;
5190         ctx.shader_info = shader_info;
5191
5192         ac_llvm_context_init(&ctx.ac, ctx.context);
5193         ctx.ac.module = ctx.module;
5194
5195         ctx.is_gs_copy_shader = true;
5196         LLVMSetTarget(ctx.module, "amdgcn--");
5197         setup_types(&ctx);
5198
5199         ctx.builder = LLVMCreateBuilderInContext(ctx.context);
5200         ctx.ac.builder = ctx.builder;
5201         ctx.stage = MESA_SHADER_VERTEX;
5202
5203         create_function(&ctx);
5204
5205         ctx.gs_max_out_vertices = geom_shader->info->gs.vertices_out;
5206         ac_setup_rings(&ctx);
5207
5208         nir_foreach_variable(variable, &geom_shader->outputs)
5209                 handle_shader_output_decl(&ctx, variable);
5210
5211         ac_gs_copy_shader_emit(&ctx);
5212
5213         LLVMBuildRetVoid(ctx.builder);
5214
5215         ac_llvm_finalize_module(&ctx);
5216
5217         ac_compile_llvm_module(tm, ctx.module, binary, config, shader_info,
5218                                MESA_SHADER_VERTEX,
5219                                dump_shader, options->supports_spill);
5220 }