OSDN Git Service

gallivm: optimize lp_build_minify for sse
[android-x86/external-mesa.git] / src / gallium / auxiliary / gallivm / lp_bld_sample_soa.c
1 /**************************************************************************
2  *
3  * Copyright 2009 VMware, Inc.
4  * All Rights Reserved.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the
8  * "Software"), to deal in the Software without restriction, including
9  * without limitation the rights to use, copy, modify, merge, publish,
10  * distribute, sub license, and/or sell copies of the Software, and to
11  * permit persons to whom the Software is furnished to do so, subject to
12  * the following conditions:
13  *
14  * The above copyright notice and this permission notice (including the
15  * next paragraph) shall be included in all copies or substantial portions
16  * of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
21  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
22  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
23  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
24  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25  *
26  **************************************************************************/
27
28 /**
29  * @file
30  * Texture sampling -- SoA.
31  *
32  * @author Jose Fonseca <jfonseca@vmware.com>
33  * @author Brian Paul <brianp@vmware.com>
34  */
35
36 #include "pipe/p_defines.h"
37 #include "pipe/p_state.h"
38 #include "pipe/p_shader_tokens.h"
39 #include "util/u_debug.h"
40 #include "util/u_dump.h"
41 #include "util/u_memory.h"
42 #include "util/u_math.h"
43 #include "util/u_format.h"
44 #include "util/u_cpu_detect.h"
45 #include "util/u_format_rgb9e5.h"
46 #include "lp_bld_debug.h"
47 #include "lp_bld_type.h"
48 #include "lp_bld_const.h"
49 #include "lp_bld_conv.h"
50 #include "lp_bld_arit.h"
51 #include "lp_bld_bitarit.h"
52 #include "lp_bld_logic.h"
53 #include "lp_bld_printf.h"
54 #include "lp_bld_swizzle.h"
55 #include "lp_bld_flow.h"
56 #include "lp_bld_gather.h"
57 #include "lp_bld_format.h"
58 #include "lp_bld_sample.h"
59 #include "lp_bld_sample_aos.h"
60 #include "lp_bld_struct.h"
61 #include "lp_bld_quad.h"
62 #include "lp_bld_pack.h"
63
64
65 /**
66  * Generate code to fetch a texel from a texture at int coords (x, y, z).
67  * The computation depends on whether the texture is 1D, 2D or 3D.
68  * The result, texel, will be float vectors:
69  *   texel[0] = red values
70  *   texel[1] = green values
71  *   texel[2] = blue values
72  *   texel[3] = alpha values
    *
    * For every axis whose wrap mode can select the border color (as reported
    * by lp_sampler_wrap_mode_uses_border_color) an out-of-range test
    * (coord < 0 || coord >= size) is ORed into a use_border mask.  Where that
    * mask is set the fetch offset is forced to zero and the fetched channels
    * are replaced with bld->border_color_clamped.
    *
    * \param width, height, depth  sizes the coords are compared against for
    *                              the border test (int_coord_bld values)
    * \param x, y, z     integer texel coords; y is only tested when
    *                    dims >= 2 and z only when dims == 3
    * \param y_stride, z_stride  strides handed to lp_build_sample_offset
    * \param data_ptr    base pointer handed to lp_build_fetch_rgba_soa
    * \param mipoffsets  optional (may be NULL); when non-NULL it is added to
    *                    the offset computed from the coords
    * \param texel_out   receives the four per-channel results
73  */
74 static void
75 lp_build_sample_texel_soa(struct lp_build_sample_context *bld,
76                           LLVMValueRef width,
77                           LLVMValueRef height,
78                           LLVMValueRef depth,
79                           LLVMValueRef x,
80                           LLVMValueRef y,
81                           LLVMValueRef z,
82                           LLVMValueRef y_stride,
83                           LLVMValueRef z_stride,
84                           LLVMValueRef data_ptr,
85                           LLVMValueRef mipoffsets,
86                           LLVMValueRef texel_out[4])
87 {
88    const struct lp_static_sampler_state *static_state = bld->static_sampler_state;
89    const unsigned dims = bld->dims;
90    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
91    LLVMBuilderRef builder = bld->gallivm->builder;
92    LLVMValueRef offset;
93    LLVMValueRef i, j;
94    LLVMValueRef use_border = NULL; /* stays NULL if no axis can hit the border */
95
96    /* use_border = x < 0 || x >= width || y < 0 || y >= height */
97    if (lp_sampler_wrap_mode_uses_border_color(static_state->wrap_s,
98                                               static_state->min_img_filter,
99                                               static_state->mag_img_filter)) {
100       LLVMValueRef b1, b2;
101       b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
102       b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
103       use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
104    }
105
      /* same test for y, merged into any mask already built for x */
106    if (dims >= 2 &&
107        lp_sampler_wrap_mode_uses_border_color(static_state->wrap_t,
108                                               static_state->min_img_filter,
109                                               static_state->mag_img_filter)) {
110       LLVMValueRef b1, b2;
111       b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
112       b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
113       if (use_border) {
114          use_border = LLVMBuildOr(builder, use_border, b1, "ub_or_b1");
115          use_border = LLVMBuildOr(builder, use_border, b2, "ub_or_b2");
116       }
117       else {
118          use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
119       }
120    }
121
      /* same test for z, only for 3D */
122    if (dims == 3 &&
123        lp_sampler_wrap_mode_uses_border_color(static_state->wrap_r,
124                                               static_state->min_img_filter,
125                                               static_state->mag_img_filter)) {
126       LLVMValueRef b1, b2;
127       b1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
128       b2 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
129       if (use_border) {
130          use_border = LLVMBuildOr(builder, use_border, b1, "ub_or_b1");
131          use_border = LLVMBuildOr(builder, use_border, b2, "ub_or_b2");
132       }
133       else {
134          use_border = LLVMBuildOr(builder, b1, b2, "b1_or_b2");
135       }
136    }
137
138    /* convert x,y,z coords to linear offset from start of texture, in bytes */
139    lp_build_sample_offset(&bld->int_coord_bld,
140                           bld->format_desc,
141                           x, y, z, y_stride, z_stride,
142                           &offset, &i, &j);
143    if (mipoffsets) {
144       offset = lp_build_add(&bld->int_coord_bld, offset, mipoffsets);
145    }
146
147    if (use_border) {
148       /* If we can sample the border color, it means that texcoords may
149        * lie outside the bounds of the texture image.  We need to do
150        * something to prevent reading out of bounds and causing a segfault.
151        *
152        * Simply AND the computed offset with !use_border.  This will cause
153        * the offset of out-of-bounds coords to become zero.  Zero's guaranteed
154        * to be inside the texture image.
155        */
156       offset = lp_build_andnot(&bld->int_coord_bld, offset, use_border);
157    }
158
159    lp_build_fetch_rgba_soa(bld->gallivm,
160                            bld->format_desc,
161                            bld->texel_type,
162                            data_ptr, offset,
163                            i, j,
164                            texel_out);
165
166    /*
167     * Note: if we find an app which frequently samples the texture border
168     * we might want to implement a true conditional here to avoid sampling
169     * the texture whenever possible (since that's quite a bit of code).
170     * Ex:
171     *   if (use_border) {
172     *      texel = border_color;
173     *   }
174     *   else {
175     *      texel = sample_texture(coord);
176     *   }
177     * As it is now, we always sample the texture, then selectively replace
178     * the texel color results with the border color.
179     */
180
181    if (use_border) {
182       /* select texel color or border color depending on use_border. */
183       const struct util_format_description *format_desc = bld->format_desc;
184       int chan;
185       struct lp_type border_type = bld->texel_type;
         /* source type given to lp_build_extract_broadcast for
          * bld->border_color_clamped: texel type with length 4 */
186       border_type.length = 4;
187       /*
188        * Only replace channels which are actually present. The others should
189        * get optimized away eventually by sampler_view swizzle anyway, but
190        * skipping them here is easier too.
191        */
192       for (chan = 0; chan < 4; chan++) {
193          unsigned chan_s;
194          /* reverse-map channel: look for swizzle[chan] among 0..3;
             * chan_s is left at 4 when there is no match */
195          for (chan_s = 0; chan_s < 4; chan_s++) {
196             if (chan_s == format_desc->swizzle[chan]) {
197                break;
198             }
199          }
200          if (chan_s <= 3) {
201             /* use the already clamped color */
202             LLVMValueRef idx = lp_build_const_int32(bld->gallivm, chan);
203             LLVMValueRef border_chan;
204
205             border_chan = lp_build_extract_broadcast(bld->gallivm,
206                                                      border_type,
207                                                      bld->texel_type,
208                                                      bld->border_color_clamped,
209                                                      idx);
210             texel_out[chan] = lp_build_select(&bld->texel_bld, use_border,
211                                               border_chan, texel_out[chan]);
212          }
213       }
214    }
215 }
216
217
218 /**
219  * Helper to compute the mirror function for the PIPE_WRAP_MIRROR modes.
     *
     * The coord is split into its integer floor and its fraction; the low bit
     * of the floor (isOdd) is used both to set the sign of the fraction and,
     * converted to float, as an addend.  So the result is the fraction itself
     * when the floor is even, and presumably 1 - fraction when it is odd
     * (this relies on lp_build_set_sign negating for a non-zero isOdd --
     * confirm against that helper).
     *
     * \param coord  float texcoord vector (coord_bld type)
     * \return the mirrored float coord
220  */
221 static LLVMValueRef
222 lp_build_coord_mirror(struct lp_build_sample_context *bld,
223                       LLVMValueRef coord)
224 {
225    struct lp_build_context *coord_bld = &bld->coord_bld;
226    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
227    LLVMValueRef fract, flr, isOdd;
228
      /* flr = integer floor of coord, fract = coord - floor(coord) */
229    lp_build_ifloor_fract(coord_bld, coord, &flr, &fract);
230
231    /* isOdd = flr & 1 */
232    isOdd = LLVMBuildAnd(bld->gallivm->builder, flr, int_coord_bld->one, "");
233
234    /* make coord positive or negative depending on isOdd */
235    coord = lp_build_set_sign(coord_bld, fract, isOdd);
236
237    /* convert isOdd to float */
238    isOdd = lp_build_int_to_float(coord_bld, isOdd);
239
240    /* add isOdd to coord */
241    coord = lp_build_add(coord_bld, coord, isOdd);
242
243    return coord;
244 }
245
246
247 /**
248  * Helper to compute the first coord and the weight for
249  * linear wrap repeat npot textures
     *
     * Computes fract(coord_f) * length_f - 0.5, splits that into integer
     * floor and fraction, and replaces a negative integer coord with
     * length_i - 1.
     *
     * \param coord_f   float texcoord (treated as normalized: only its
     *                  fraction is used)
     * \param length_i  texture size along this dimension, as int vector
     * \param length_f  texture size along this dimension, as float vector
     * \param coord0_i  returns the first integer texcoord
     * \param weight_f  returns the linear interpolation weight
250  */
251 void
252 lp_build_coord_repeat_npot_linear(struct lp_build_sample_context *bld,
253                                   LLVMValueRef coord_f,
254                                   LLVMValueRef length_i,
255                                   LLVMValueRef length_f,
256                                   LLVMValueRef *coord0_i,
257                                   LLVMValueRef *weight_f)
258 {
259    struct lp_build_context *coord_bld = &bld->coord_bld;
260    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
261    LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
262    LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length_i,
263                                                 int_coord_bld->one);
264    LLVMValueRef mask;
265    /* wrap with normalized floats is just fract */
266    coord_f = lp_build_fract(coord_bld, coord_f);
267    /* mul by size and subtract 0.5 */
268    coord_f = lp_build_mul(coord_bld, coord_f, length_f);
269    coord_f = lp_build_sub(coord_bld, coord_f, half);
270    /*
271     * we avoided the 0.5/length division before the repeat wrap,
272     * now need to fix up edge cases with selects
273     */
274    /* convert to int, compute lerp weight */
275    lp_build_ifloor_fract(coord_bld, coord_f, coord0_i, weight_f);
      /* a negative coord0 (scaled fraction below 0.5) wraps to the last texel */
276    mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
277                            PIPE_FUNC_LESS, *coord0_i, int_coord_bld->zero);
278    *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i);
279 }
280
281
282 /**
283  * Build LLVM code for texture wrap mode for linear filtering.
     * \param coord  the incoming texcoord, as float vector
     * \param length  the texture size along one dimension, as int vector
     * \param length_f  the texture size along one dimension, as float vector
     * \param offset  optional texel offset (int vector, may be NULL); it is
     *                converted to float before being added to the coord
     * \param is_pot  only consulted for PIPE_TEX_WRAP_REPEAT: selects the
     *                AND-with-(length-1) wrap instead of the npot helper
     * \param wrap_mode  one of PIPE_TEX_WRAP_x
284  * \param x0_out  returns first integer texcoord
285  * \param x1_out  returns second integer texcoord
286  * \param weight_out  returns linear interpolation weight
     *
     * For an unknown wrap_mode the function asserts and all three outputs
     * are set to NULL.
287  */
288 static void
289 lp_build_sample_wrap_linear(struct lp_build_sample_context *bld,
290                             LLVMValueRef coord,
291                             LLVMValueRef length,
292                             LLVMValueRef length_f,
293                             LLVMValueRef offset,
294                             boolean is_pot,
295                             unsigned wrap_mode,
296                             LLVMValueRef *x0_out,
297                             LLVMValueRef *x1_out,
298                             LLVMValueRef *weight_out)
299 {
300    struct lp_build_context *coord_bld = &bld->coord_bld;
301    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
302    LLVMBuilderRef builder = bld->gallivm->builder;
303    LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
304    LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
305    LLVMValueRef coord0, coord1, weight;
306
307    switch(wrap_mode) {
308    case PIPE_TEX_WRAP_REPEAT:
309       if (is_pot) {
310          /* mul by size and subtract 0.5 */
311          coord = lp_build_mul(coord_bld, coord, length_f);
312          coord = lp_build_sub(coord_bld, coord, half);
313          if (offset) {
314             offset = lp_build_int_to_float(coord_bld, offset);
315             coord = lp_build_add(coord_bld, coord, offset);
316          }
317          /* convert to int, compute lerp weight */
318          lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
319          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
320          /* repeat wrap */
321          coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
322          coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
323       }
324       else {
325          LLVMValueRef mask;
326          if (offset) {
               /* the npot helper works on the unscaled coord, so the texel
                * offset is divided by the size before it is added */
327             offset = lp_build_int_to_float(coord_bld, offset);
328             offset = lp_build_div(coord_bld, offset, length_f);
329             coord = lp_build_add(coord_bld, coord, offset);
330          }
331          lp_build_coord_repeat_npot_linear(bld, coord,
332                                            length, length_f,
333                                            &coord0, &weight);
            /* coord1 = coord0 + 1, ANDed with a (coord0 != length - 1) mask
             * so that it becomes 0 where coord0 is the last texel
             * (presumably the mask is 0 / ~0 per lane) */
334          mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
335                                  PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
336          coord1 = LLVMBuildAnd(builder,
337                                lp_build_add(int_coord_bld, coord0, int_coord_bld->one),
338                                mask, "");
339       }
340       break;
341
342    case PIPE_TEX_WRAP_CLAMP:
343       if (bld->static_sampler_state->normalized_coords) {
344          /* scale coord to length */
345          coord = lp_build_mul(coord_bld, coord, length_f);
346       }
347       if (offset) {
348          offset = lp_build_int_to_float(coord_bld, offset);
349          coord = lp_build_add(coord_bld, coord, offset);
350       }
351
352       /* clamp to [0, length] */
353       coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, length_f);
354
355       coord = lp_build_sub(coord_bld, coord, half);
356
357       /* convert to int, compute lerp weight */
358       lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
359       coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
360       break;
361
362    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
363       {
            /* copy of the coord context flagged as unsigned; used for
             * ifloor_fract below, where coord has been clamped to >= 0 */
364          struct lp_build_context abs_coord_bld = bld->coord_bld;
365          abs_coord_bld.type.sign = FALSE;
366
367          if (bld->static_sampler_state->normalized_coords) {
368             /* mul by tex size */
369             coord = lp_build_mul(coord_bld, coord, length_f);
370          }
371          if (offset) {
372             offset = lp_build_int_to_float(coord_bld, offset);
373             coord = lp_build_add(coord_bld, coord, offset);
374          }
375
376          /* clamp to length max */
377          coord = lp_build_min(coord_bld, coord, length_f);
378          /* subtract 0.5 */
379          coord = lp_build_sub(coord_bld, coord, half);
380          /* clamp to [0, length - 0.5] */
381          coord = lp_build_max(coord_bld, coord, coord_bld->zero);
382          /* convert to int, compute lerp weight */
383          lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
384          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
385          /* coord1 = min(coord1, length-1) */
386          coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
387          break;
388       }
389
390    case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
391       if (bld->static_sampler_state->normalized_coords) {
392          /* scale coord to length */
393          coord = lp_build_mul(coord_bld, coord, length_f);
394       }
395       if (offset) {
396          offset = lp_build_int_to_float(coord_bld, offset);
397          coord = lp_build_add(coord_bld, coord, offset);
398       }
399       /* was: clamp to [-0.5, length + 0.5], then sub 0.5 */
400       /* can skip clamp (though might not work for very large coord values) */
401       coord = lp_build_sub(coord_bld, coord, half);
402       /* convert to int, compute lerp weight */
403       lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
404       coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
405       break;
406
407    case PIPE_TEX_WRAP_MIRROR_REPEAT:
408       /* compute mirror function */
409       coord = lp_build_coord_mirror(bld, coord);
410
411       /* scale coord to length */
412       coord = lp_build_mul(coord_bld, coord, length_f);
413       coord = lp_build_sub(coord_bld, coord, half);
414       if (offset) {
415          offset = lp_build_int_to_float(coord_bld, offset);
416          coord = lp_build_add(coord_bld, coord, offset);
417       }
418
419       /* convert to int, compute lerp weight */
420       lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
421       coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
422
423       /* coord0 = max(coord0, 0) */
424       coord0 = lp_build_max(int_coord_bld, coord0, int_coord_bld->zero);
425       /* coord1 = min(coord1, length-1) */
426       coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
427       break;
428
429    case PIPE_TEX_WRAP_MIRROR_CLAMP:
430       if (bld->static_sampler_state->normalized_coords) {
431          /* scale coord to length */
432          coord = lp_build_mul(coord_bld, coord, length_f);
433       }
434       if (offset) {
435          offset = lp_build_int_to_float(coord_bld, offset);
436          coord = lp_build_add(coord_bld, coord, offset);
437       }
         /* mirror around zero */
438       coord = lp_build_abs(coord_bld, coord);
439
440       /* clamp to [0, length] */
441       coord = lp_build_min(coord_bld, coord, length_f);
442
443       coord = lp_build_sub(coord_bld, coord, half);
444
445       /* convert to int, compute lerp weight */
446       lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
447       coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
448       break;
449
450    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
451       {
            /* unsigned-flagged copy of the coord context, used for
             * ifloor_fract once coord has been clamped to >= 0 */
452          struct lp_build_context abs_coord_bld = bld->coord_bld;
453          abs_coord_bld.type.sign = FALSE;
454
455          if (bld->static_sampler_state->normalized_coords) {
456             /* scale coord to length */
457             coord = lp_build_mul(coord_bld, coord, length_f);
458          }
459          if (offset) {
460             offset = lp_build_int_to_float(coord_bld, offset);
461             coord = lp_build_add(coord_bld, coord, offset);
462          }
            /* mirror around zero */
463          coord = lp_build_abs(coord_bld, coord);
464
465          /* clamp to length max */
466          coord = lp_build_min(coord_bld, coord, length_f);
467          /* subtract 0.5 */
468          coord = lp_build_sub(coord_bld, coord, half);
469          /* clamp to [0, length - 0.5] */
470          coord = lp_build_max(coord_bld, coord, coord_bld->zero);
471
472          /* convert to int, compute lerp weight */
473          lp_build_ifloor_fract(&abs_coord_bld, coord, &coord0, &weight);
474          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
475          /* coord1 = min(coord1, length-1) */
476          coord1 = lp_build_min(int_coord_bld, coord1, length_minus_one);
477       }
478       break;
479
480    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
481       {
482          if (bld->static_sampler_state->normalized_coords) {
483             /* scale coord to length */
484             coord = lp_build_mul(coord_bld, coord, length_f);
485          }
486          if (offset) {
487             offset = lp_build_int_to_float(coord_bld, offset);
488             coord = lp_build_add(coord_bld, coord, offset);
489          }
            /* mirror around zero */
490          coord = lp_build_abs(coord_bld, coord);
491
492          /* was: clamp to [-0.5, length + 0.5] then sub 0.5 */
493          /* skip clamp - always positive, and other side
494             only potentially matters for very large coords */
495          coord = lp_build_sub(coord_bld, coord, half);
496
497          /* convert to int, compute lerp weight */
498          lp_build_ifloor_fract(coord_bld, coord, &coord0, &weight);
499          coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
500       }
501       break;
502
503    default:
         /* unknown wrap mode: assert and hand back NULL outputs */
504       assert(0);
505       coord0 = NULL;
506       coord1 = NULL;
507       weight = NULL;
508    }
509
510    *x0_out = coord0;
511    *x1_out = coord1;
512    *weight_out = weight;
513 }
514
515
516 /**
517  * Build LLVM code for texture wrap mode for nearest filtering.
518  * \param coord  the incoming texcoord (nominally in [0,1])
519  * \param length  the texture size along one dimension, as int vector
520  * \param length_f  the texture size along one dimension, as float vector
521  * \param offset  texel offset along one dimension (as int vector);
     *                may be NULL, in which case no offset is applied
522  * \param is_pot  if TRUE, length is a power of two (only consulted for
     *                PIPE_TEX_WRAP_REPEAT)
523  * \param wrap_mode  one of PIPE_TEX_WRAP_x
     * \return the wrapped integer texcoord, or NULL (after an assert) for an
     *         unknown wrap_mode
524  */
525 static LLVMValueRef
526 lp_build_sample_wrap_nearest(struct lp_build_sample_context *bld,
527                              LLVMValueRef coord,
528                              LLVMValueRef length,
529                              LLVMValueRef length_f,
530                              LLVMValueRef offset,
531                              boolean is_pot,
532                              unsigned wrap_mode)
533 {
534    struct lp_build_context *coord_bld = &bld->coord_bld;
535    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
536    LLVMBuilderRef builder = bld->gallivm->builder;
537    LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);
538    LLVMValueRef icoord;
539    
540    switch(wrap_mode) {
541    case PIPE_TEX_WRAP_REPEAT:
542       if (is_pot) {
            /* scale, floor, add the integer offset, then wrap by ANDing
             * with length - 1 */
543          coord = lp_build_mul(coord_bld, coord, length_f);
544          icoord = lp_build_ifloor(coord_bld, coord);
545          if (offset) {
546             icoord = lp_build_add(int_coord_bld, icoord, offset);
547          }
548          icoord = LLVMBuildAnd(builder, icoord, length_minus_one, "");
549       }
550       else {
551           if (offset) {
                /* offset is applied to the unscaled coord, hence the
                 * division by the size */
552              offset = lp_build_int_to_float(coord_bld, offset);
553              offset = lp_build_div(coord_bld, offset, length_f);
554              coord = lp_build_add(coord_bld, coord, offset);
555           }
556           /* take fraction, unnormalize */
557           coord = lp_build_fract_safe(coord_bld, coord);
558           coord = lp_build_mul(coord_bld, coord, length_f);
559           icoord = lp_build_itrunc(coord_bld, coord);
560       }
561       break;
562
563    case PIPE_TEX_WRAP_CLAMP:
564    case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
565       if (bld->static_sampler_state->normalized_coords) {
566          /* scale coord to length */
567          coord = lp_build_mul(coord_bld, coord, length_f);
568       }
569
570       /* floor */
571       /* use itrunc instead since we clamp to 0 anyway */
572       icoord = lp_build_itrunc(coord_bld, coord);
573       if (offset) {
574          icoord = lp_build_add(int_coord_bld, icoord, offset);
575       }
576
577       /* clamp to [0, length - 1]. */
578       icoord = lp_build_clamp(int_coord_bld, icoord, int_coord_bld->zero,
579                               length_minus_one);
580       break;
581
582    case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
583       if (bld->static_sampler_state->normalized_coords) {
584          /* scale coord to length */
585          coord = lp_build_mul(coord_bld, coord, length_f);
586       }
587       /* no clamp necessary, border masking will handle this */
588       icoord = lp_build_ifloor(coord_bld, coord);
589       if (offset) {
590          icoord = lp_build_add(int_coord_bld, icoord, offset);
591       }
592       break;
593
594    case PIPE_TEX_WRAP_MIRROR_REPEAT:
595       if (offset) {
            /* offset is applied before mirroring, on the unscaled coord */
596          offset = lp_build_int_to_float(coord_bld, offset);
597          offset = lp_build_div(coord_bld, offset, length_f);
598          coord = lp_build_add(coord_bld, coord, offset);
599       }
600       /* compute mirror function */
601       coord = lp_build_coord_mirror(bld, coord);
602
603       /* scale coord to length */
604       assert(bld->static_sampler_state->normalized_coords);
605       coord = lp_build_mul(coord_bld, coord, length_f);
606
607       /* itrunc == ifloor here */
608       icoord = lp_build_itrunc(coord_bld, coord);
609
610       /* clamp to [0, length - 1]; only the upper bound is applied here */
611       icoord = lp_build_min(int_coord_bld, icoord, length_minus_one);
612       break;
613
614    case PIPE_TEX_WRAP_MIRROR_CLAMP:
615    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
616       if (bld->static_sampler_state->normalized_coords) {
617          /* scale coord to length */
618          coord = lp_build_mul(coord_bld, coord, length_f);
619       }
620       if (offset) {
621          offset = lp_build_int_to_float(coord_bld, offset);
622          coord = lp_build_add(coord_bld, coord, offset);
623       }
         /* mirror around zero */
624       coord = lp_build_abs(coord_bld, coord);
625
626       /* itrunc == ifloor here */
627       icoord = lp_build_itrunc(coord_bld, coord);
628
629       /* clamp to [0, length - 1]; coord is non-negative after the abs,
          * so only the upper bound is applied */
630       icoord = lp_build_min(int_coord_bld, icoord, length_minus_one);
631       break;
632
633    case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
634       if (bld->static_sampler_state->normalized_coords) {
635          /* scale coord to length */
636          coord = lp_build_mul(coord_bld, coord, length_f);
637       }
638       if (offset) {
639          offset = lp_build_int_to_float(coord_bld, offset);
640          coord = lp_build_add(coord_bld, coord, offset);
641       }
         /* mirror around zero; no upper clamp is applied in this mode */
642       coord = lp_build_abs(coord_bld, coord);
643
644       /* itrunc == ifloor here */
645       icoord = lp_build_itrunc(coord_bld, coord);
646       break;
647
648    default:
         /* unknown wrap mode */
649       assert(0);
650       icoord = NULL;
651    }
652
653    return icoord;
654 }
655
656
657 /**
658  * Do shadow test/comparison.
     * The comparison function is taken from
     * bld->static_sampler_state->compare_func and is evaluated as
     * (p FUNC texel).
659  * \param p shadow ref value
660  * \param texel  the texel to compare against
     * \return the comparison result built by lp_build_cmp_ordered /
     *         lp_build_cmp; the caller in this file uses it as the mask
     *         argument of lp_build_select
661  */
662 static LLVMValueRef
663 lp_build_sample_comparefunc(struct lp_build_sample_context *bld,
664                             LLVMValueRef p,
665                             LLVMValueRef texel)
666 {
667    struct lp_build_context *texel_bld = &bld->texel_bld;
668    LLVMValueRef res;
669
      /* debugging aid, compiled in but disabled */
670    if (0) {
671       //lp_build_print_value(bld->gallivm, "shadow cmp coord", p);
672       lp_build_print_value(bld->gallivm, "shadow cmp texel", texel);
673    }
674
675    /* result = (p FUNC texel) ? 1 : 0 */
676    /*
677     * honor d3d10 floating point rules here, which state that comparisons
678     * are ordered except NOT_EQUAL which is unordered.
679     */
680    if (bld->static_sampler_state->compare_func != PIPE_FUNC_NOTEQUAL) {
681       res = lp_build_cmp_ordered(texel_bld, bld->static_sampler_state->compare_func,
682                                  p, texel);
683    }
684    else {
685       res = lp_build_cmp(texel_bld, bld->static_sampler_state->compare_func,
686                          p, texel);
687    }
688    return res;
689 }
690
691
692 /**
693  * Generate code to sample a mipmap level with nearest filtering.
694  * If sampling a cube texture, r = cube face in [0,5].
     *
     * \param size  texture size vector; split into width/height/depth by
     *              lp_build_extract_image_sizes, once as int and once as
     *              float
     * \param row_stride_vec, img_stride_vec  passed on as the y and z
     *              strides of the texel fetch
     * \param data_ptr    passed on to the texel fetch
     * \param mipoffsets  passed on to the texel fetch (NULL is accepted
     *              there)
     * \param coords   coords[0..2] are the texcoords (coords[1] is read when
     *              dims >= 2, coords[2] when dims == 3); for cube, 1D-array
     *              and 2D-array targets coords[2] is used directly as the
     *              z/layer value; coords[4] is the shadow reference value
     *              when compare_mode is not PIPE_TEX_COMPARE_NONE
     * \param offsets  per-dimension texel offsets, indexed like coords;
     *              individual entries may be NULL (the wrap helper checks)
     * \param colors_out  receives the four channel results
695  */
696 static void
697 lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
698                               LLVMValueRef size,
699                               LLVMValueRef row_stride_vec,
700                               LLVMValueRef img_stride_vec,
701                               LLVMValueRef data_ptr,
702                               LLVMValueRef mipoffsets,
703                               LLVMValueRef *coords,
704                               const LLVMValueRef *offsets,
705                               LLVMValueRef colors_out[4])
706 {
707    const unsigned dims = bld->dims;
708    LLVMValueRef width_vec;
709    LLVMValueRef height_vec;
710    LLVMValueRef depth_vec;
711    LLVMValueRef flt_size;
712    LLVMValueRef flt_width_vec;
713    LLVMValueRef flt_height_vec;
714    LLVMValueRef flt_depth_vec;
715    LLVMValueRef x, y = NULL, z = NULL;
716
      /* integer sizes, one vector per dimension */
717    lp_build_extract_image_sizes(bld,
718                                 &bld->int_size_bld,
719                                 bld->int_coord_type,
720                                 size,
721                                 &width_vec, &height_vec, &depth_vec);
722
723    flt_size = lp_build_int_to_float(&bld->float_size_bld, size);
724
      /* the same sizes as float vectors, used to scale the coords */
725    lp_build_extract_image_sizes(bld,
726                                 &bld->float_size_bld,
727                                 bld->coord_type,
728                                 flt_size,
729                                 &flt_width_vec, &flt_height_vec, &flt_depth_vec);
730
731    /*
732     * Compute integer texcoords.
733     */
734    x = lp_build_sample_wrap_nearest(bld, coords[0], width_vec,
735                                     flt_width_vec, offsets[0],
736                                     bld->static_texture_state->pot_width,
737                                     bld->static_sampler_state->wrap_s);
738    lp_build_name(x, "tex.x.wrapped");
739
740    if (dims >= 2) {
741       y = lp_build_sample_wrap_nearest(bld, coords[1], height_vec,
742                                        flt_height_vec, offsets[1],
743                                        bld->static_texture_state->pot_height,
744                                        bld->static_sampler_state->wrap_t);
745       lp_build_name(y, "tex.y.wrapped");
746
747       if (dims == 3) {
748          z = lp_build_sample_wrap_nearest(bld, coords[2], depth_vec,
749                                           flt_depth_vec, offsets[2],
750                                           bld->static_texture_state->pot_depth,
751                                           bld->static_sampler_state->wrap_r);
752          lp_build_name(z, "tex.z.wrapped");
753       }
754    }
      /* cube faces and array layers are taken from coords[2] unwrapped */
755    if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
756        bld->static_texture_state->target == PIPE_TEXTURE_1D_ARRAY ||
757        bld->static_texture_state->target == PIPE_TEXTURE_2D_ARRAY) {
758       z = coords[2];
759       lp_build_name(z, "tex.z.layer");
760    }
761
762    /*
763     * Get texture colors.
764     */
765    lp_build_sample_texel_soa(bld,
766                              width_vec, height_vec, depth_vec,
767                              x, y, z,
768                              row_stride_vec, img_stride_vec,
769                              data_ptr, mipoffsets, colors_out);
770
771    if (bld->static_sampler_state->compare_mode != PIPE_TEX_COMPARE_NONE) {
         /* shadow compare: channel 0 is tested against coords[4] and the
          * 1.0 / 0.0 result is replicated to all four channels */
772       LLVMValueRef cmpval;
773       cmpval = lp_build_sample_comparefunc(bld, coords[4], colors_out[0]);
774       /* this is really just a AND 1.0, cmpval but llvm is clever enough */
775       colors_out[0] = lp_build_select(&bld->texel_bld, cmpval,
776                                       bld->texel_bld.one, bld->texel_bld.zero);
777       colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
778    }
779
780 }
781
782
783 /**
784  * Like a lerp, but inputs are 0/~0 masks, so can simplify slightly.
785  */
786 static LLVMValueRef
787 lp_build_masklerp(struct lp_build_context *bld,
788                  LLVMValueRef weight,
789                  LLVMValueRef mask0,
790                  LLVMValueRef mask1)
791 {
792    struct gallivm_state *gallivm = bld->gallivm;
793    LLVMBuilderRef builder = gallivm->builder;
794    LLVMValueRef weight2;
795
796    weight2 = lp_build_sub(bld, bld->one, weight);
797    weight = LLVMBuildBitCast(builder, weight,
798                               lp_build_int_vec_type(gallivm, bld->type), "");
799    weight2 = LLVMBuildBitCast(builder, weight2,
800                               lp_build_int_vec_type(gallivm, bld->type), "");
801    weight = LLVMBuildAnd(builder, weight, mask1, "");
802    weight2 = LLVMBuildAnd(builder, weight2, mask0, "");
803    weight = LLVMBuildBitCast(builder, weight, bld->vec_type, "");
804    weight2 = LLVMBuildBitCast(builder, weight2, bld->vec_type, "");
805    return lp_build_add(bld, weight, weight2);
806 }
807
808 /**
809  * Like a 2d lerp, but inputs are 0/~0 masks, so can simplify slightly.
810  */
811 static LLVMValueRef
812 lp_build_masklerp2d(struct lp_build_context *bld,
813                     LLVMValueRef weight0,
814                     LLVMValueRef weight1,
815                     LLVMValueRef mask00,
816                     LLVMValueRef mask01,
817                     LLVMValueRef mask10,
818                     LLVMValueRef mask11)
819 {
820    LLVMValueRef val0 = lp_build_masklerp(bld, weight0, mask00, mask01);
821    LLVMValueRef val1 = lp_build_masklerp(bld, weight0, mask10, mask11);
822    return lp_build_lerp(bld, weight1, val0, val1, 0);
823 }
824
825 /*
826  * this is a bit excessive code for something OpenGL just recommends
827  * but does not require.
828  */
829 #define ACCURATE_CUBE_CORNERS 1
830
831 /**
832  * Generate code to sample a mipmap level with linear filtering.
833  * If sampling a cube texture, r = cube face in [0,5].
834  * If linear_mask is present, only pixels having their mask set
835  * will receive linear filtering, the rest will use nearest.
836  */
837 static void
838 lp_build_sample_image_linear(struct lp_build_sample_context *bld,
839                              LLVMValueRef size,
840                              LLVMValueRef linear_mask,
841                              LLVMValueRef row_stride_vec,
842                              LLVMValueRef img_stride_vec,
843                              LLVMValueRef data_ptr,
844                              LLVMValueRef mipoffsets,
845                              LLVMValueRef *coords,
846                              const LLVMValueRef *offsets,
847                              LLVMValueRef colors_out[4])
848 {
849    LLVMBuilderRef builder = bld->gallivm->builder;
850    struct lp_build_context *ivec_bld = &bld->int_coord_bld;
851    struct lp_build_context *coord_bld = &bld->coord_bld;
852    const unsigned dims = bld->dims;
853    LLVMValueRef width_vec;
854    LLVMValueRef height_vec;
855    LLVMValueRef depth_vec;
856    LLVMValueRef flt_size;
857    LLVMValueRef flt_width_vec;
858    LLVMValueRef flt_height_vec;
859    LLVMValueRef flt_depth_vec;
860    LLVMValueRef fall_off[4], have_corners;
861    LLVMValueRef z1 = NULL;
862    LLVMValueRef z00 = NULL, z01 = NULL, z10 = NULL, z11 = NULL;
863    LLVMValueRef x00 = NULL, x01 = NULL, x10 = NULL, x11 = NULL;
864    LLVMValueRef y00 = NULL, y01 = NULL, y10 = NULL, y11 = NULL;
865    LLVMValueRef s_fpart, t_fpart = NULL, r_fpart = NULL;
866    LLVMValueRef xs[4], ys[4], zs[4];
867    LLVMValueRef neighbors[2][2][4];
868    int chan, texel_index;
869    boolean seamless_cube_filter, accurate_cube_corners;
870
871    seamless_cube_filter = bld->static_texture_state->target == PIPE_TEXTURE_CUBE &&
872                           bld->static_sampler_state->seamless_cube_map;
873    accurate_cube_corners = ACCURATE_CUBE_CORNERS && seamless_cube_filter;
874
875    lp_build_extract_image_sizes(bld,
876                                 &bld->int_size_bld,
877                                 bld->int_coord_type,
878                                 size,
879                                 &width_vec, &height_vec, &depth_vec);
880
881    flt_size = lp_build_int_to_float(&bld->float_size_bld, size);
882
883    lp_build_extract_image_sizes(bld,
884                                 &bld->float_size_bld,
885                                 bld->coord_type,
886                                 flt_size,
887                                 &flt_width_vec, &flt_height_vec, &flt_depth_vec);
888
889    /*
890     * Compute integer texcoords.
891     */
892
893    if (!seamless_cube_filter) {
894       lp_build_sample_wrap_linear(bld, coords[0], width_vec,
895                                   flt_width_vec, offsets[0],
896                                   bld->static_texture_state->pot_width,
897                                   bld->static_sampler_state->wrap_s,
898                                   &x00, &x01, &s_fpart);
899       lp_build_name(x00, "tex.x0.wrapped");
900       lp_build_name(x01, "tex.x1.wrapped");
901       x10 = x00;
902       x11 = x01;
903
904       if (dims >= 2) {
905          lp_build_sample_wrap_linear(bld, coords[1], height_vec,
906                                      flt_height_vec, offsets[1],
907                                      bld->static_texture_state->pot_height,
908                                      bld->static_sampler_state->wrap_t,
909                                      &y00, &y10, &t_fpart);
910          lp_build_name(y00, "tex.y0.wrapped");
911          lp_build_name(y10, "tex.y1.wrapped");
912          y01 = y00;
913          y11 = y10;
914
915          if (dims == 3) {
916             lp_build_sample_wrap_linear(bld, coords[2], depth_vec,
917                                         flt_depth_vec, offsets[2],
918                                         bld->static_texture_state->pot_depth,
919                                         bld->static_sampler_state->wrap_r,
920                                         &z00, &z1, &r_fpart);
921             z01 = z10 = z11 = z00;
922             lp_build_name(z00, "tex.z0.wrapped");
923             lp_build_name(z1, "tex.z1.wrapped");
924          }
925       }
926       if (bld->static_texture_state->target == PIPE_TEXTURE_CUBE ||
927           bld->static_texture_state->target == PIPE_TEXTURE_1D_ARRAY ||
928           bld->static_texture_state->target == PIPE_TEXTURE_2D_ARRAY) {
929          z00 = z01 = z10 = z11 = z1 = coords[2];  /* cube face or layer */
930          lp_build_name(z00, "tex.z0.layer");
931          lp_build_name(z1, "tex.z1.layer");
932       }
933    }
934    else {
935       struct lp_build_if_state edge_if;
936       LLVMTypeRef int1t;
937       LLVMValueRef new_faces[4], new_xcoords[4][2], new_ycoords[4][2];
938       LLVMValueRef coord, have_edge, have_corner;
939       LLVMValueRef fall_off_ym_notxm, fall_off_ym_notxp, fall_off_x, fall_off_y;
940       LLVMValueRef fall_off_yp_notxm, fall_off_yp_notxp;
941       LLVMValueRef x0, x1, y0, y1, y0_clamped, y1_clamped;
942       LLVMValueRef face = coords[2];
943       LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5f);
944       LLVMValueRef length_minus_one = lp_build_sub(ivec_bld, width_vec, ivec_bld->one);
945       /* XXX drop height calcs. Could (should) do this without seamless filtering too */
946       height_vec = width_vec;
947       flt_height_vec = flt_width_vec;
948
949       /* XXX the overflow logic is actually sort of duplicated with trilinear,
950        * since an overflow in one mip should also have a corresponding overflow
951        * in another.
952        */
953       /* should always have normalized coords, and offsets are undefined */
954       assert(bld->static_sampler_state->normalized_coords);
955       coord = lp_build_mul(coord_bld, coords[0], flt_width_vec);
956       /* instead of clamp, build mask if overflowed */
957       coord = lp_build_sub(coord_bld, coord, half);
958       /* convert to int, compute lerp weight */
959       /* not ideal with AVX (and no AVX2) */
960       lp_build_ifloor_fract(coord_bld, coord, &x0, &s_fpart);
961       x1 = lp_build_add(ivec_bld, x0, ivec_bld->one);
962       coord = lp_build_mul(coord_bld, coords[1], flt_height_vec);
963       coord = lp_build_sub(coord_bld, coord, half);
964       lp_build_ifloor_fract(coord_bld, coord, &y0, &t_fpart);
965       y1 = lp_build_add(ivec_bld, y0, ivec_bld->one);
966
967       fall_off[0] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, x0, ivec_bld->zero);
968       fall_off[1] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, x1, length_minus_one);
969       fall_off[2] = lp_build_cmp(ivec_bld, PIPE_FUNC_LESS, y0, ivec_bld->zero);
970       fall_off[3] = lp_build_cmp(ivec_bld, PIPE_FUNC_GREATER, y1, length_minus_one);
971
972       fall_off_x = lp_build_or(ivec_bld, fall_off[0], fall_off[1]);
973       fall_off_y = lp_build_or(ivec_bld, fall_off[2], fall_off[3]);
974       have_edge = lp_build_or(ivec_bld, fall_off_x, fall_off_y);
975       have_edge = lp_build_any_true_range(ivec_bld, ivec_bld->type.length, have_edge);
976
977       /* needed for accurate corner filtering branch later, rely on 0 init */
978       int1t = LLVMInt1TypeInContext(bld->gallivm->context);
979       have_corners = lp_build_alloca(bld->gallivm, int1t, "have_corner");
980
981       for (texel_index = 0; texel_index < 4; texel_index++) {
982          xs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "xs");
983          ys[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "ys");
984          zs[texel_index] = lp_build_alloca(bld->gallivm, ivec_bld->vec_type, "zs");
985       }
986
987       lp_build_if(&edge_if, bld->gallivm, have_edge);
988
989       have_corner = lp_build_and(ivec_bld, fall_off_x, fall_off_y);
990       have_corner = lp_build_any_true_range(ivec_bld, ivec_bld->type.length, have_corner);
991       LLVMBuildStore(builder, have_corner, have_corners);
992
993       /*
994        * Need to feed clamped values here for cheap corner handling,
995        * but only for y coord (as when falling off both edges we only
996        * fall off the x one) - this should be sufficient.
997        */
998       y0_clamped = lp_build_max(ivec_bld, y0, ivec_bld->zero);
999       y1_clamped = lp_build_min(ivec_bld, y1, length_minus_one);
1000
1001       /*
1002        * Get all possible new coords.
1003        */
1004       lp_build_cube_new_coords(ivec_bld, face,
1005                                x0, x1, y0_clamped, y1_clamped,
1006                                length_minus_one,
1007                                new_faces, new_xcoords, new_ycoords);
1008
1009       /* handle fall off x-, x+ direction */
1010       /* determine new coords, face (not both fall_off vars can be true at same time) */
1011       x00 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][0], x0);
1012       y00 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][0], y0_clamped);
1013       x10 = lp_build_select(ivec_bld, fall_off[0], new_xcoords[0][1], x0);
1014       y10 = lp_build_select(ivec_bld, fall_off[0], new_ycoords[0][1], y1_clamped);
1015       x01 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][0], x1);
1016       y01 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][0], y0_clamped);
1017       x11 = lp_build_select(ivec_bld, fall_off[1], new_xcoords[1][1], x1);
1018       y11 = lp_build_select(ivec_bld, fall_off[1], new_ycoords[1][1], y1_clamped);
1019
1020       z00 = z10 = lp_build_select(ivec_bld, fall_off[0], new_faces[0], face);
1021       z01 = z11 = lp_build_select(ivec_bld, fall_off[1], new_faces[1], face);
1022
1023       /* handle fall off y-, y+ direction */
1024       /*
1025        * Cheap corner logic: just hack up things so a texel doesn't fall
1026        * off both sides (which means filter weights will be wrong but we'll only
1027        * use valid texels in the filter).
1028        * This means however (y) coords must additionally be clamped (see above).
1029        * This corner handling should be fully OpenGL (but not d3d10) compliant.
1030        */
1031       fall_off_ym_notxm = lp_build_andnot(ivec_bld, fall_off[2], fall_off[0]);
1032       fall_off_ym_notxp = lp_build_andnot(ivec_bld, fall_off[2], fall_off[1]);
1033       fall_off_yp_notxm = lp_build_andnot(ivec_bld, fall_off[3], fall_off[0]);
1034       fall_off_yp_notxp = lp_build_andnot(ivec_bld, fall_off[3], fall_off[1]);
1035
1036       x00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_xcoords[2][0], x00);
1037       y00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_ycoords[2][0], y00);
1038       x01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_xcoords[2][1], x01);
1039       y01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_ycoords[2][1], y01);
1040       x10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_xcoords[3][0], x10);
1041       y10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_ycoords[3][0], y10);
1042       x11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_xcoords[3][1], x11);
1043       y11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_ycoords[3][1], y11);
1044
1045       z00 = lp_build_select(ivec_bld, fall_off_ym_notxm, new_faces[2], z00);
1046       z01 = lp_build_select(ivec_bld, fall_off_ym_notxp, new_faces[2], z01);
1047       z10 = lp_build_select(ivec_bld, fall_off_yp_notxm, new_faces[3], z10);
1048       z11 = lp_build_select(ivec_bld, fall_off_yp_notxp, new_faces[3], z11);
1049
1050       LLVMBuildStore(builder, x00, xs[0]);
1051       LLVMBuildStore(builder, x01, xs[1]);
1052       LLVMBuildStore(builder, x10, xs[2]);
1053       LLVMBuildStore(builder, x11, xs[3]);
1054       LLVMBuildStore(builder, y00, ys[0]);
1055       LLVMBuildStore(builder, y01, ys[1]);
1056       LLVMBuildStore(builder, y10, ys[2]);
1057       LLVMBuildStore(builder, y11, ys[3]);
1058       LLVMBuildStore(builder, z00, zs[0]);
1059       LLVMBuildStore(builder, z01, zs[1]);
1060       LLVMBuildStore(builder, z10, zs[2]);
1061       LLVMBuildStore(builder, z11, zs[3]);
1062
1063       lp_build_else(&edge_if);
1064
1065       LLVMBuildStore(builder, x0, xs[0]);
1066       LLVMBuildStore(builder, x1, xs[1]);
1067       LLVMBuildStore(builder, x0, xs[2]);
1068       LLVMBuildStore(builder, x1, xs[3]);
1069       LLVMBuildStore(builder, y0, ys[0]);
1070       LLVMBuildStore(builder, y0, ys[1]);
1071       LLVMBuildStore(builder, y1, ys[2]);
1072       LLVMBuildStore(builder, y1, ys[3]);
1073       LLVMBuildStore(builder, face, zs[0]);
1074       LLVMBuildStore(builder, face, zs[1]);
1075       LLVMBuildStore(builder, face, zs[2]);
1076       LLVMBuildStore(builder, face, zs[3]);
1077
1078       lp_build_endif(&edge_if);
1079
1080       x00 = LLVMBuildLoad(builder, xs[0], "");
1081       x01 = LLVMBuildLoad(builder, xs[1], "");
1082       x10 = LLVMBuildLoad(builder, xs[2], "");
1083       x11 = LLVMBuildLoad(builder, xs[3], "");
1084       y00 = LLVMBuildLoad(builder, ys[0], "");
1085       y01 = LLVMBuildLoad(builder, ys[1], "");
1086       y10 = LLVMBuildLoad(builder, ys[2], "");
1087       y11 = LLVMBuildLoad(builder, ys[3], "");
1088       z00 = LLVMBuildLoad(builder, zs[0], "");
1089       z01 = LLVMBuildLoad(builder, zs[1], "");
1090       z10 = LLVMBuildLoad(builder, zs[2], "");
1091       z11 = LLVMBuildLoad(builder, zs[3], "");
1092    }
1093
1094    if (linear_mask) {
1095       /*
1096        * Whack filter weights into place. Whatever texel had more weight is
1097        * the one which should have been selected by nearest filtering hence
1098        * just use 100% weight for it.
1099        */
1100       struct lp_build_context *c_bld = &bld->coord_bld;
1101       LLVMValueRef w1_mask, w1_weight;
1102       LLVMValueRef half = lp_build_const_vec(bld->gallivm, c_bld->type, 0.5f);
1103
1104       w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, s_fpart, half);
1105       /* this select is really just a "and" */
1106       w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1107       s_fpart = lp_build_select(c_bld, linear_mask, s_fpart, w1_weight);
1108       if (dims >= 2) {
1109          w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, t_fpart, half);
1110          w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1111          t_fpart = lp_build_select(c_bld, linear_mask, t_fpart, w1_weight);
1112          if (dims == 3) {
1113             w1_mask = lp_build_cmp(c_bld, PIPE_FUNC_GREATER, r_fpart, half);
1114             w1_weight = lp_build_select(c_bld, w1_mask, c_bld->one, c_bld->zero);
1115             r_fpart = lp_build_select(c_bld, linear_mask, r_fpart, w1_weight);
1116          }
1117       }
1118    }
1119
1120    /*
1121     * Get texture colors.
1122     */
1123    /* get x0/x1 texels */
1124    lp_build_sample_texel_soa(bld,
1125                              width_vec, height_vec, depth_vec,
1126                              x00, y00, z00,
1127                              row_stride_vec, img_stride_vec,
1128                              data_ptr, mipoffsets, neighbors[0][0]);
1129    lp_build_sample_texel_soa(bld,
1130                              width_vec, height_vec, depth_vec,
1131                              x01, y01, z01,
1132                              row_stride_vec, img_stride_vec,
1133                              data_ptr, mipoffsets, neighbors[0][1]);
1134
1135    if (dims == 1) {
1136       if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1137          /* Interpolate two samples from 1D image to produce one color */
1138          for (chan = 0; chan < 4; chan++) {
1139             colors_out[chan] = lp_build_lerp(&bld->texel_bld, s_fpart,
1140                                              neighbors[0][0][chan],
1141                                              neighbors[0][1][chan],
1142                                              0);
1143          }
1144       }
1145       else {
1146          LLVMValueRef cmpval0, cmpval1;
1147          cmpval0 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1148          cmpval1 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1149          /* simplified lerp, AND mask with weight and add */
1150          colors_out[0] = lp_build_masklerp(&bld->texel_bld, s_fpart,
1151                                            cmpval0, cmpval1);
1152          colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
1153       }
1154    }
1155    else {
1156       /* 2D/3D texture */
1157       struct lp_build_if_state corner_if;
1158       LLVMValueRef colors0[4], colorss[4];
1159
1160       /* get x0/x1 texels at y1 */
1161       lp_build_sample_texel_soa(bld,
1162                                 width_vec, height_vec, depth_vec,
1163                                 x10, y10, z10,
1164                                 row_stride_vec, img_stride_vec,
1165                                 data_ptr, mipoffsets, neighbors[1][0]);
1166       lp_build_sample_texel_soa(bld,
1167                                 width_vec, height_vec, depth_vec,
1168                                 x11, y11, z11,
1169                                 row_stride_vec, img_stride_vec,
1170                                 data_ptr, mipoffsets, neighbors[1][1]);
1171
1172       /*
1173        * To avoid having to duplicate linear_mask / fetch code use
1174        * another branch (with corner condition though edge would work
1175        * as well) here.
1176        */
1177       if (accurate_cube_corners) {
1178          LLVMValueRef w00, w01, w10, w11, wx0, wy0;
1179          LLVMValueRef c_weight, c00, c01, c10, c11;
1180          LLVMValueRef have_corner, one_third, tmp;
1181
1182          colorss[0] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs");
1183          colorss[1] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs");
1184          colorss[2] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs");
1185          colorss[3] = lp_build_alloca(bld->gallivm, coord_bld->vec_type, "cs");
1186
1187          have_corner = LLVMBuildLoad(builder, have_corners, "");
1188
1189          lp_build_if(&corner_if, bld->gallivm, have_corner);
1190
1191          /*
1192           * we can't use standard 2d lerp as we need per-element weight
1193           * in case of corners, so just calculate bilinear result as
1194           * w00*s00 + w01*s01 + w10*s10 + w11*s11.
1195           * (This is actually less work than using 2d lerp, 7 vs. 9 instructions,
1196           * however calculating the weights needs another 6, so actually probably
1197           * not slower than 2d lerp only for 4 channels as weights only need
1198           * to be calculated once - of course fixing the weights has additional cost.)
1199           */
1200          wx0 = lp_build_sub(coord_bld, coord_bld->one, s_fpart);
1201          wy0 = lp_build_sub(coord_bld, coord_bld->one, t_fpart);
1202          w00 = lp_build_mul(coord_bld, wx0, wy0);
1203          w01 = lp_build_mul(coord_bld, s_fpart, wy0);
1204          w10 = lp_build_mul(coord_bld, wx0, t_fpart);
1205          w11 = lp_build_mul(coord_bld, s_fpart, t_fpart);
1206
1207          /* find corner weight */
1208          c00 = lp_build_and(ivec_bld, fall_off[0], fall_off[2]);
1209          c_weight = lp_build_select(coord_bld, c00, w00, coord_bld->zero);
1210          c01 = lp_build_and(ivec_bld, fall_off[1], fall_off[2]);
1211          c_weight = lp_build_select(coord_bld, c01, w01, c_weight);
1212          c10 = lp_build_and(ivec_bld, fall_off[0], fall_off[3]);
1213          c_weight = lp_build_select(coord_bld, c10, w10, c_weight);
1214          c11 = lp_build_and(ivec_bld, fall_off[1], fall_off[3]);
1215          c_weight = lp_build_select(coord_bld, c11, w11, c_weight);
1216
1217          /*
1218           * add 1/3 of the corner weight to each of the 3 other samples
1219           * and null out corner weight
1220           */
1221          one_third = lp_build_const_vec(bld->gallivm, coord_bld->type, 1.0f/3.0f);
1222          c_weight = lp_build_mul(coord_bld, c_weight, one_third);
1223          w00 = lp_build_add(coord_bld, w00, c_weight);
1224          c00 = LLVMBuildBitCast(builder, c00, coord_bld->vec_type, "");
1225          w00 = lp_build_andnot(coord_bld, w00, c00);
1226          w01 = lp_build_add(coord_bld, w01, c_weight);
1227          c01 = LLVMBuildBitCast(builder, c01, coord_bld->vec_type, "");
1228          w01 = lp_build_andnot(coord_bld, w01, c01);
1229          w10 = lp_build_add(coord_bld, w10, c_weight);
1230          c10 = LLVMBuildBitCast(builder, c10, coord_bld->vec_type, "");
1231          w10 = lp_build_andnot(coord_bld, w10, c10);
1232          w11 = lp_build_add(coord_bld, w11, c_weight);
1233          c11 = LLVMBuildBitCast(builder, c11, coord_bld->vec_type, "");
1234          w11 = lp_build_andnot(coord_bld, w11, c11);
1235
1236          if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1237             for (chan = 0; chan < 4; chan++) {
1238                colors0[chan] = lp_build_mul(coord_bld, w00, neighbors[0][0][chan]);
1239                tmp = lp_build_mul(coord_bld, w01, neighbors[0][1][chan]);
1240                colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1241                tmp = lp_build_mul(coord_bld, w10, neighbors[1][0][chan]);
1242                colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1243                tmp = lp_build_mul(coord_bld, w11, neighbors[1][1][chan]);
1244                colors0[chan] = lp_build_add(coord_bld, tmp, colors0[chan]);
1245             }
1246          }
1247          else {
1248             LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1249             cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1250             cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1251             cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][0][0]);
1252             cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][1][0]);
1253             /* inputs to interpolation are just masks so just add masked weights together */
1254             cmpval00 = LLVMBuildBitCast(builder, cmpval00, coord_bld->vec_type, "");
1255             cmpval01 = LLVMBuildBitCast(builder, cmpval01, coord_bld->vec_type, "");
1256             cmpval10 = LLVMBuildBitCast(builder, cmpval10, coord_bld->vec_type, "");
1257             cmpval11 = LLVMBuildBitCast(builder, cmpval11, coord_bld->vec_type, "");
1258             colors0[0] = lp_build_and(coord_bld, w00, cmpval00);
1259             tmp = lp_build_and(coord_bld, w01, cmpval01);
1260             colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1261             tmp = lp_build_and(coord_bld, w10, cmpval10);
1262             colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1263             tmp = lp_build_and(coord_bld, w11, cmpval11);
1264             colors0[0] = lp_build_add(coord_bld, tmp, colors0[0]);
1265             colors0[1] = colors0[2] = colors0[3] = colors0[0];
1266          }
1267
1268          LLVMBuildStore(builder, colors0[0], colorss[0]);
1269          LLVMBuildStore(builder, colors0[1], colorss[1]);
1270          LLVMBuildStore(builder, colors0[2], colorss[2]);
1271          LLVMBuildStore(builder, colors0[3], colorss[3]);
1272
1273          lp_build_else(&corner_if);
1274       }
1275
1276       if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1277          /* Bilinear interpolate the four samples from the 2D image / 3D slice */
1278          for (chan = 0; chan < 4; chan++) {
1279             colors0[chan] = lp_build_lerp_2d(&bld->texel_bld,
1280                                              s_fpart, t_fpart,
1281                                              neighbors[0][0][chan],
1282                                              neighbors[0][1][chan],
1283                                              neighbors[1][0][chan],
1284                                              neighbors[1][1][chan],
1285                                              0);
1286          }
1287       }
1288       else {
1289          LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1290          cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1291          cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1292          cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][0][0]);
1293          cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][1][0]);
1294          colors0[0] = lp_build_masklerp2d(&bld->texel_bld, s_fpart, t_fpart,
1295                                           cmpval00, cmpval01, cmpval10, cmpval11);
1296          colors0[1] = colors0[2] = colors0[3] = colors0[0];
1297       }
1298
1299       if (accurate_cube_corners) {
1300          LLVMBuildStore(builder, colors0[0], colorss[0]);
1301          LLVMBuildStore(builder, colors0[1], colorss[1]);
1302          LLVMBuildStore(builder, colors0[2], colorss[2]);
1303          LLVMBuildStore(builder, colors0[3], colorss[3]);
1304
1305          lp_build_endif(&corner_if);
1306
1307          colors0[0] = LLVMBuildLoad(builder, colorss[0], "");
1308          colors0[1] = LLVMBuildLoad(builder, colorss[1], "");
1309          colors0[2] = LLVMBuildLoad(builder, colorss[2], "");
1310          colors0[3] = LLVMBuildLoad(builder, colorss[3], "");
1311       }
1312
1313       if (dims == 3) {
1314          LLVMValueRef neighbors1[2][2][4];
1315          LLVMValueRef colors1[4];
1316
1317          /* get x0/x1/y0/y1 texels at z1 */
1318          lp_build_sample_texel_soa(bld,
1319                                    width_vec, height_vec, depth_vec,
1320                                    x00, y00, z1,
1321                                    row_stride_vec, img_stride_vec,
1322                                    data_ptr, mipoffsets, neighbors1[0][0]);
1323          lp_build_sample_texel_soa(bld,
1324                                    width_vec, height_vec, depth_vec,
1325                                    x01, y01, z1,
1326                                    row_stride_vec, img_stride_vec,
1327                                    data_ptr, mipoffsets, neighbors1[0][1]);
1328          lp_build_sample_texel_soa(bld,
1329                                    width_vec, height_vec, depth_vec,
1330                                    x10, y10, z1,
1331                                    row_stride_vec, img_stride_vec,
1332                                    data_ptr, mipoffsets, neighbors1[1][0]);
1333          lp_build_sample_texel_soa(bld,
1334                                    width_vec, height_vec, depth_vec,
1335                                    x11, y11, z1,
1336                                    row_stride_vec, img_stride_vec,
1337                                    data_ptr, mipoffsets, neighbors1[1][1]);
1338
1339          if (bld->static_sampler_state->compare_mode == PIPE_TEX_COMPARE_NONE) {
1340             /* Bilinear interpolate the four samples from the second Z slice */
1341             for (chan = 0; chan < 4; chan++) {
1342                colors1[chan] = lp_build_lerp_2d(&bld->texel_bld,
1343                                                 s_fpart, t_fpart,
1344                                                 neighbors1[0][0][chan],
1345                                                 neighbors1[0][1][chan],
1346                                                 neighbors1[1][0][chan],
1347                                                 neighbors1[1][1][chan],
1348                                                 0);
1349             }
1350             /* Linearly interpolate the two samples from the two 3D slices */
1351             for (chan = 0; chan < 4; chan++) {
1352                colors_out[chan] = lp_build_lerp(&bld->texel_bld,
1353                                                 r_fpart,
1354                                                 colors0[chan], colors1[chan],
1355                                                 0);
1356             }
1357          }
1358          else {
1359             LLVMValueRef cmpval00, cmpval01, cmpval10, cmpval11;
1360             cmpval00 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][0][0]);
1361             cmpval01 = lp_build_sample_comparefunc(bld, coords[4], neighbors[0][1][0]);
1362             cmpval10 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][0][0]);
1363             cmpval11 = lp_build_sample_comparefunc(bld, coords[4], neighbors[1][1][0]);
1364             colors1[0] = lp_build_masklerp2d(&bld->texel_bld, s_fpart, t_fpart,
1365                                              cmpval00, cmpval01, cmpval10, cmpval11);
1366             /* Linearly interpolate the two samples from the two 3D slices */
1367             colors_out[0] = lp_build_lerp(&bld->texel_bld,
1368                                              r_fpart,
1369                                              colors0[0], colors1[0],
1370                                              0);
1371             colors_out[1] = colors_out[2] = colors_out[3] = colors_out[0];
1372          }
1373       }
1374       else {
1375          /* 2D tex */
1376          for (chan = 0; chan < 4; chan++) {
1377             colors_out[chan] = colors0[chan];
1378          }
1379       }
1380    }
1381 }
1382
1383
1384 /**
1385  * Sample the texture/mipmap using given image filter and mip filter.
1386  * ilevel0 and ilevel1 indicate the two mipmap levels to sample
1387  * from (vectors or scalars).
1388  * If we're using nearest miplevel sampling the '1' values will be null/unused.
1389  */
1390 static void
1391 lp_build_sample_mipmap(struct lp_build_sample_context *bld,
1392                        unsigned img_filter,
1393                        unsigned mip_filter,
1394                        LLVMValueRef *coords,
1395                        const LLVMValueRef *offsets,
1396                        LLVMValueRef ilevel0,
1397                        LLVMValueRef ilevel1,
1398                        LLVMValueRef lod_fpart,
1399                        LLVMValueRef *colors_out)
1400 {
1401    LLVMBuilderRef builder = bld->gallivm->builder;
1402    LLVMValueRef size0 = NULL;
1403    LLVMValueRef size1 = NULL;
1404    LLVMValueRef row_stride0_vec = NULL;
1405    LLVMValueRef row_stride1_vec = NULL;
1406    LLVMValueRef img_stride0_vec = NULL;
1407    LLVMValueRef img_stride1_vec = NULL;
1408    LLVMValueRef data_ptr0 = NULL;
1409    LLVMValueRef data_ptr1 = NULL;
1410    LLVMValueRef mipoff0 = NULL;
1411    LLVMValueRef mipoff1 = NULL;
1412    LLVMValueRef colors0[4], colors1[4];
1413    unsigned chan;
1414
1415    /* sample the first mipmap level */
1416    lp_build_mipmap_level_sizes(bld, ilevel0,
1417                                &size0,
1418                                &row_stride0_vec, &img_stride0_vec);
1419    if (bld->num_mips == 1) {
1420       data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
1421    }
1422    else {
1423       /* This path should work for num_lods 1 too but slightly less efficient */
1424       data_ptr0 = bld->base_ptr;
1425       mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
1426    }
1427    if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1428       lp_build_sample_image_nearest(bld, size0,
1429                                     row_stride0_vec, img_stride0_vec,
1430                                     data_ptr0, mipoff0, coords, offsets,
1431                                     colors0);
1432    }
1433    else {
1434       assert(img_filter == PIPE_TEX_FILTER_LINEAR);
1435       lp_build_sample_image_linear(bld, size0, NULL,
1436                                    row_stride0_vec, img_stride0_vec,
1437                                    data_ptr0, mipoff0, coords, offsets,
1438                                    colors0);
1439    }
1440
1441    /* Store the first level's colors in the output variables */
1442    for (chan = 0; chan < 4; chan++) {
1443        LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1444    }
1445
1446    if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
1447       struct lp_build_if_state if_ctx;
1448       LLVMValueRef need_lerp;
1449
1450       /* need_lerp = lod_fpart > 0 */
1451       if (bld->num_lods == 1) {
1452          need_lerp = LLVMBuildFCmp(builder, LLVMRealUGT,
1453                                    lod_fpart, bld->lodf_bld.zero,
1454                                    "need_lerp");
1455       }
1456       else {
1457          /*
1458           * We'll do mip filtering if any of the quads (or individual
1459           * pixel in case of per-pixel lod) need it.
1460           * It might be better to split the vectors here and only fetch/filter
1461           * quads which need it (if there's one lod per quad).
1462           */
1463          need_lerp = lp_build_compare(bld->gallivm, bld->lodf_bld.type,
1464                                       PIPE_FUNC_GREATER,
1465                                       lod_fpart, bld->lodf_bld.zero);
1466          need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, need_lerp);
1467       }
1468
1469       lp_build_if(&if_ctx, bld->gallivm, need_lerp);
1470       {
1471          /*
1472           * We unfortunately need to clamp lod_fpart here since we can get
1473           * negative values which would screw up filtering if not all
1474           * lod_fpart values have same sign.
1475           */
1476          lod_fpart = lp_build_max(&bld->lodf_bld, lod_fpart,
1477                                   bld->lodf_bld.zero);
1478          /* sample the second mipmap level */
1479          lp_build_mipmap_level_sizes(bld, ilevel1,
1480                                      &size1,
1481                                      &row_stride1_vec, &img_stride1_vec);
1482          if (bld->num_mips == 1) {
1483             data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
1484          }
1485          else {
1486             data_ptr1 = bld->base_ptr;
1487             mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
1488          }
1489          if (img_filter == PIPE_TEX_FILTER_NEAREST) {
1490             lp_build_sample_image_nearest(bld, size1,
1491                                           row_stride1_vec, img_stride1_vec,
1492                                           data_ptr1, mipoff1, coords, offsets,
1493                                           colors1);
1494          }
1495          else {
1496             lp_build_sample_image_linear(bld, size1, NULL,
1497                                          row_stride1_vec, img_stride1_vec,
1498                                          data_ptr1, mipoff1, coords, offsets,
1499                                          colors1);
1500          }
1501
1502          /* interpolate samples from the two mipmap levels */
1503
1504          if (bld->num_lods != bld->coord_type.length)
1505             lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
1506                                                               bld->lodf_bld.type,
1507                                                               bld->texel_bld.type,
1508                                                               lod_fpart);
1509
1510          for (chan = 0; chan < 4; chan++) {
1511             colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
1512                                           colors0[chan], colors1[chan],
1513                                           0);
1514             LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1515          }
1516       }
1517       lp_build_endif(&if_ctx);
1518    }
1519 }
1520
1521
1522 /**
1523  * Sample the texture/mipmap using given mip filter, and using
1524  * both nearest and linear filtering at the same time depending
1525  * on linear_mask.
1526  * lod can be per quad but linear_mask is always per pixel.
1527  * ilevel0 and ilevel1 indicate the two mipmap levels to sample
1528  * from (vectors or scalars).
1529  * If we're using nearest miplevel sampling the '1' values will be null/unused.
1530  */
1531 static void
1532 lp_build_sample_mipmap_both(struct lp_build_sample_context *bld,
1533                             LLVMValueRef linear_mask,
1534                             unsigned mip_filter,
1535                             LLVMValueRef *coords,
1536                             const LLVMValueRef *offsets,
1537                             LLVMValueRef ilevel0,
1538                             LLVMValueRef ilevel1,
1539                             LLVMValueRef lod_fpart,
1540                             LLVMValueRef lod_positive,
1541                             LLVMValueRef *colors_out)
1542 {
1543    LLVMBuilderRef builder = bld->gallivm->builder;
1544    LLVMValueRef size0 = NULL;
1545    LLVMValueRef size1 = NULL;
1546    LLVMValueRef row_stride0_vec = NULL;
1547    LLVMValueRef row_stride1_vec = NULL;
1548    LLVMValueRef img_stride0_vec = NULL;
1549    LLVMValueRef img_stride1_vec = NULL;
1550    LLVMValueRef data_ptr0 = NULL;
1551    LLVMValueRef data_ptr1 = NULL;
1552    LLVMValueRef mipoff0 = NULL;
1553    LLVMValueRef mipoff1 = NULL;
1554    LLVMValueRef colors0[4], colors1[4];
1555    unsigned chan;
1556
1557    /* sample the first mipmap level */
1558    lp_build_mipmap_level_sizes(bld, ilevel0,
1559                                &size0,
1560                                &row_stride0_vec, &img_stride0_vec);
1561    if (bld->num_mips == 1) {
1562       data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
1563    }
1564    else {
1565       /* This path should work for num_lods 1 too but slightly less efficient */
1566       data_ptr0 = bld->base_ptr;
1567       mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
1568    }
1569
1570    lp_build_sample_image_linear(bld, size0, linear_mask,
1571                                 row_stride0_vec, img_stride0_vec,
1572                                 data_ptr0, mipoff0, coords, offsets,
1573                                 colors0);
1574
1575    /* Store the first level's colors in the output variables */
1576    for (chan = 0; chan < 4; chan++) {
1577        LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1578    }
1579
1580    if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
1581       struct lp_build_if_state if_ctx;
1582       LLVMValueRef need_lerp;
1583
1584       /*
1585        * We'll do mip filtering if any of the quads (or individual
1586        * pixel in case of per-pixel lod) need it.
1587        * Note using lod_positive here not lod_fpart since it may be the same
1588        * condition as that used in the outer "if" in the caller hence llvm
1589        * should be able to merge the branches in this case.
1590        */
1591       need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, lod_positive);
1592
1593       lp_build_if(&if_ctx, bld->gallivm, need_lerp);
1594       {
1595          /*
1596           * We unfortunately need to clamp lod_fpart here since we can get
1597           * negative values which would screw up filtering if not all
1598           * lod_fpart values have same sign.
1599           */
1600          lod_fpart = lp_build_max(&bld->lodf_bld, lod_fpart,
1601                                   bld->lodf_bld.zero);
1602          /* sample the second mipmap level */
1603          lp_build_mipmap_level_sizes(bld, ilevel1,
1604                                      &size1,
1605                                      &row_stride1_vec, &img_stride1_vec);
1606          if (bld->num_mips == 1) {
1607             data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
1608          }
1609          else {
1610             data_ptr1 = bld->base_ptr;
1611             mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
1612          }
1613
1614          lp_build_sample_image_linear(bld, size1, linear_mask,
1615                                       row_stride1_vec, img_stride1_vec,
1616                                       data_ptr1, mipoff1, coords, offsets,
1617                                       colors1);
1618
1619          /* interpolate samples from the two mipmap levels */
1620
1621          if (bld->num_lods != bld->coord_type.length)
1622             lod_fpart = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
1623                                                               bld->lodf_bld.type,
1624                                                               bld->texel_bld.type,
1625                                                               lod_fpart);
1626
1627          for (chan = 0; chan < 4; chan++) {
1628             colors0[chan] = lp_build_lerp(&bld->texel_bld, lod_fpart,
1629                                           colors0[chan], colors1[chan],
1630                                           0);
1631             LLVMBuildStore(builder, colors0[chan], colors_out[chan]);
1632          }
1633       }
1634       lp_build_endif(&if_ctx);
1635    }
1636 }
1637
1638
1639 /**
1640  * Build (per-coord) layer value.
1641  * Either clamp layer to valid values or fill in optional out_of_bounds
1642  * value and just return value unclamped.
1643  */
1644 static LLVMValueRef
1645 lp_build_layer_coord(struct lp_build_sample_context *bld,
1646                      unsigned texture_unit,
1647                      LLVMValueRef layer,
1648                      LLVMValueRef *out_of_bounds)
1649 {
1650    LLVMValueRef num_layers;
1651    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
1652
1653    num_layers = bld->dynamic_state->depth(bld->dynamic_state,
1654                                           bld->gallivm, texture_unit);
1655
1656    if (out_of_bounds) {
1657       LLVMValueRef out1, out;
1658       num_layers = lp_build_broadcast_scalar(int_coord_bld, num_layers);
1659       out = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, layer, int_coord_bld->zero);
1660       out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, layer, num_layers);
1661       *out_of_bounds = lp_build_or(int_coord_bld, out, out1);
1662       return layer;
1663    }
1664    else {
1665       LLVMValueRef maxlayer;
1666       maxlayer = lp_build_sub(&bld->int_bld, num_layers, bld->int_bld.one);
1667       maxlayer = lp_build_broadcast_scalar(int_coord_bld, maxlayer);
1668       return lp_build_clamp(int_coord_bld, layer, int_coord_bld->zero, maxlayer);
1669    }
1670 }
1671
1672
1673 /**
1674  * Calculate cube face, lod, mip levels.
1675  */
1676 static void
1677 lp_build_sample_common(struct lp_build_sample_context *bld,
1678                        unsigned texture_index,
1679                        unsigned sampler_index,
1680                        LLVMValueRef *coords,
1681                        const struct lp_derivatives *derivs, /* optional */
1682                        LLVMValueRef lod_bias, /* optional */
1683                        LLVMValueRef explicit_lod, /* optional */
1684                        LLVMValueRef *lod_pos_or_zero,
1685                        LLVMValueRef *lod_fpart,
1686                        LLVMValueRef *ilevel0,
1687                        LLVMValueRef *ilevel1)
1688 {
1689    const unsigned mip_filter = bld->static_sampler_state->min_mip_filter;
1690    const unsigned min_filter = bld->static_sampler_state->min_img_filter;
1691    const unsigned mag_filter = bld->static_sampler_state->mag_img_filter;
1692    const unsigned target = bld->static_texture_state->target;
1693    LLVMValueRef first_level, cube_rho = NULL;
1694    LLVMValueRef lod_ipart = NULL;
1695    struct lp_derivatives cube_derivs;
1696
1697    /*
1698    printf("%s mip %d  min %d  mag %d\n", __FUNCTION__,
1699           mip_filter, min_filter, mag_filter);
1700    */
1701
1702    /*
1703     * Choose cube face, recompute texcoords for the chosen face and
1704     * compute rho here too (as it requires transform of derivatives).
1705     */
1706    if (target == PIPE_TEXTURE_CUBE) {
1707       boolean need_derivs;
1708       need_derivs = ((min_filter != mag_filter ||
1709                       mip_filter != PIPE_TEX_MIPFILTER_NONE) &&
1710                       !bld->static_sampler_state->min_max_lod_equal &&
1711                       !explicit_lod);
1712       lp_build_cube_lookup(bld, coords, derivs, &cube_rho, &cube_derivs, need_derivs);
1713       derivs = &cube_derivs;
1714    }
1715    else if (target == PIPE_TEXTURE_1D_ARRAY ||
1716             target == PIPE_TEXTURE_2D_ARRAY) {
1717       coords[2] = lp_build_iround(&bld->coord_bld, coords[2]);
1718       coords[2] = lp_build_layer_coord(bld, texture_index, coords[2], NULL);
1719    }
1720
1721    if (bld->static_sampler_state->compare_mode != PIPE_TEX_COMPARE_NONE) {
1722       /*
1723        * Clamp p coords to [0,1] for fixed function depth texture format here.
1724        * Technically this is not entirely correct for unorm depth as the ref value
1725        * should be converted to the depth format (quantization!) and comparison
1726        * then done in texture format. This would actually help performance (since
1727        * only need to do it once and could save the per-sample conversion of texels
1728        * to floats instead), but it would need more messy code (would need to push
1729        * at least some bits down to actual fetch so conversion could be skipped,
1730        * and would have ugly interaction with border color, would need to convert
1731        * border color to that format too or do some other tricks to make it work).
1732        */
1733       const struct util_format_description *format_desc = bld->format_desc;
1734       unsigned chan_type;
1735       /* not entirely sure we couldn't end up with non-valid swizzle here */
1736       chan_type = format_desc->swizzle[0] <= UTIL_FORMAT_SWIZZLE_W ?
1737                      format_desc->channel[format_desc->swizzle[0]].type :
1738                      UTIL_FORMAT_TYPE_FLOAT;
1739       if (chan_type != UTIL_FORMAT_TYPE_FLOAT) {
1740          coords[4] = lp_build_clamp(&bld->coord_bld, coords[4],
1741                                     bld->coord_bld.zero, bld->coord_bld.one);
1742       }
1743    }
1744
1745    /*
1746     * Compute the level of detail (float).
1747     */
1748    if (min_filter != mag_filter ||
1749        mip_filter != PIPE_TEX_MIPFILTER_NONE) {
1750       /* Need to compute lod either to choose mipmap levels or to
1751        * distinguish between minification/magnification with one mipmap level.
1752        */
1753       lp_build_lod_selector(bld, texture_index, sampler_index,
1754                             coords[0], coords[1], coords[2], cube_rho,
1755                             derivs, lod_bias, explicit_lod,
1756                             mip_filter,
1757                             &lod_ipart, lod_fpart, lod_pos_or_zero);
1758    } else {
1759       lod_ipart = bld->lodi_bld.zero;
1760       *lod_pos_or_zero = bld->lodi_bld.zero;
1761    }
1762
1763    if (bld->num_lods != bld->num_mips) {
1764       /* only makes sense if there's just a single mip level */
1765       assert(bld->num_mips == 1);
1766       lod_ipart = lp_build_extract_range(bld->gallivm, lod_ipart, 0, 1);
1767    }
1768
1769    /*
1770     * Compute integer mipmap level(s) to fetch texels from: ilevel0, ilevel1
1771     */
1772    switch (mip_filter) {
1773    default:
1774       assert(0 && "bad mip_filter value in lp_build_sample_soa()");
1775       /* fall-through */
1776    case PIPE_TEX_MIPFILTER_NONE:
1777       /* always use mip level 0 */
1778       if (HAVE_LLVM == 0x0207 && target == PIPE_TEXTURE_CUBE) {
1779          /* XXX this is a work-around for an apparent bug in LLVM 2.7.
1780           * We should be able to set ilevel0 = const(0) but that causes
1781           * bad x86 code to be emitted.
1782           */
1783          assert(lod_ipart);
1784          lp_build_nearest_mip_level(bld, texture_index, lod_ipart, ilevel0, NULL);
1785       }
1786       else {
1787          first_level = bld->dynamic_state->first_level(bld->dynamic_state,
1788                                                        bld->gallivm, texture_index);
1789          first_level = lp_build_broadcast_scalar(&bld->leveli_bld, first_level);
1790          *ilevel0 = first_level;
1791       }
1792       break;
1793    case PIPE_TEX_MIPFILTER_NEAREST:
1794       assert(lod_ipart);
1795       lp_build_nearest_mip_level(bld, texture_index, lod_ipart, ilevel0, NULL);
1796       break;
1797    case PIPE_TEX_MIPFILTER_LINEAR:
1798       assert(lod_ipart);
1799       assert(*lod_fpart);
1800       lp_build_linear_mip_levels(bld, texture_index,
1801                                  lod_ipart, lod_fpart,
1802                                  ilevel0, ilevel1);
1803       break;
1804    }
1805 }
1806
1807 static void
1808 lp_build_clamp_border_color(struct lp_build_sample_context *bld,
1809                             unsigned sampler_unit)
1810 {
1811    struct gallivm_state *gallivm = bld->gallivm;
1812    LLVMBuilderRef builder = gallivm->builder;
1813    LLVMValueRef border_color_ptr =
1814       bld->dynamic_state->border_color(bld->dynamic_state,
1815                                         gallivm, sampler_unit);
1816    LLVMValueRef border_color;
1817    const struct util_format_description *format_desc = bld->format_desc;
1818    struct lp_type vec4_type = bld->texel_type;
1819    struct lp_build_context vec4_bld;
1820    LLVMValueRef min_clamp = NULL;
1821    LLVMValueRef max_clamp = NULL;
1822
1823    /*
1824     * For normalized format need to clamp border color (technically
1825     * probably should also quantize the data). Really sucks doing this
1826     * here but can't avoid at least for now since this is part of
1827     * sampler state and texture format is part of sampler_view state.
1828     * GL expects also expects clamping for uint/sint formats too so
1829     * do that as well (d3d10 can't end up here with uint/sint since it
1830     * only supports them with ld).
1831     */
1832    vec4_type.length = 4;
1833    lp_build_context_init(&vec4_bld, gallivm, vec4_type);
1834
1835    /*
1836     * Vectorized clamping of border color. Loading is a bit of a hack since
1837     * we just cast the pointer to float array to pointer to vec4
1838     * (int or float).
1839     */
1840    border_color_ptr = lp_build_array_get_ptr(gallivm, border_color_ptr,
1841                                              lp_build_const_int32(gallivm, 0));
1842    border_color_ptr = LLVMBuildBitCast(builder, border_color_ptr,
1843                                        LLVMPointerType(vec4_bld.vec_type, 0), "");
1844    border_color = LLVMBuildLoad(builder, border_color_ptr, "");
1845    /* we don't have aligned type in the dynamic state unfortunately */
1846    lp_set_load_alignment(border_color, 4);
1847
1848    /*
1849     * Instead of having some incredibly complex logic which will try to figure out
1850     * clamping necessary for each channel, simply use the first channel, and treat
1851     * mixed signed/unsigned normalized formats specially.
1852     * (Mixed non-normalized, which wouldn't work at all here, do not exist for a
1853     * good reason.)
1854     */
1855    if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN) {
1856       int chan;
1857       /* d/s needs special handling because both present means just sampling depth */
1858       if (util_format_is_depth_and_stencil(format_desc->format)) {
1859          chan = format_desc->swizzle[0];
1860       }
1861       else {
1862          chan = util_format_get_first_non_void_channel(format_desc->format);
1863       }
1864       if (chan >= 0 && chan <= UTIL_FORMAT_SWIZZLE_W) {
1865          unsigned chan_type = format_desc->channel[chan].type;
1866          unsigned chan_norm = format_desc->channel[chan].normalized;
1867          unsigned chan_pure = format_desc->channel[chan].pure_integer;
1868          if (chan_type == UTIL_FORMAT_TYPE_SIGNED) {
1869             if (chan_norm) {
1870                min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
1871                max_clamp = vec4_bld.one;
1872             }
1873             else if (chan_pure) {
1874                /*
1875                 * Border color was stored as int, hence need min/max clamp
1876                 * only if chan has less than 32 bits..
1877                 */
1878                unsigned chan_size = format_desc->channel[chan].size;
1879                if (chan_size < 32) {
1880                   min_clamp = lp_build_const_int_vec(gallivm, vec4_type,
1881                                                      0 - (1 << (chan_size - 1)));
1882                   max_clamp = lp_build_const_int_vec(gallivm, vec4_type,
1883                                                      (1 << (chan_size - 1)) - 1);
1884                }
1885             }
1886             /* TODO: no idea about non-pure, non-normalized! */
1887          }
1888          else if (chan_type == UTIL_FORMAT_TYPE_UNSIGNED) {
1889             if (chan_norm) {
1890                min_clamp = vec4_bld.zero;
1891                max_clamp = vec4_bld.one;
1892             }
1893             /*
1894              * Need a ugly hack here, because we don't have Z32_FLOAT_X8X24
1895              * we use Z32_FLOAT_S8X24 to imply sampling depth component
1896              * and ignoring stencil, which will blow up here if we try to
1897              * do a uint clamp in a float texel build...
1898              * And even if we had that format, mesa st also thinks using z24s8
1899              * means depth sampling ignoring stencil.
1900              */
1901             else if (chan_pure) {
1902                /*
1903                 * Border color was stored as uint, hence never need min
1904                 * clamp, and only need max clamp if chan has less than 32 bits.
1905                 */
1906                unsigned chan_size = format_desc->channel[chan].size;
1907                if (chan_size < 32) {
1908                   max_clamp = lp_build_const_int_vec(gallivm, vec4_type,
1909                                                      (1 << chan_size) - 1);
1910                }
1911                /* TODO: no idea about non-pure, non-normalized! */
1912             }
1913          }
1914          else if (chan_type == UTIL_FORMAT_TYPE_FIXED) {
1915             /* TODO: I have no idea what clamp this would need if any! */
1916          }
1917       }
1918       /* mixed plain formats (or different pure size) */
1919       switch (format_desc->format) {
1920       case PIPE_FORMAT_B10G10R10A2_UINT:
1921       case PIPE_FORMAT_R10G10B10A2_UINT:
1922       {
1923          unsigned max10 = (1 << 10) - 1;
1924          max_clamp = lp_build_const_aos(gallivm, vec4_type, max10, max10,
1925                                         max10, (1 << 2) - 1, NULL);
1926       }
1927          break;
1928       case PIPE_FORMAT_R10SG10SB10SA2U_NORM:
1929          min_clamp = lp_build_const_aos(gallivm, vec4_type, -1.0F, -1.0F,
1930                                         -1.0F, 0.0F, NULL);
1931          max_clamp = vec4_bld.one;
1932          break;
1933       case PIPE_FORMAT_R8SG8SB8UX8U_NORM:
1934       case PIPE_FORMAT_R5SG5SB6U_NORM:
1935          min_clamp = lp_build_const_aos(gallivm, vec4_type, -1.0F, -1.0F,
1936                                         0.0F, 0.0F, NULL);
1937          max_clamp = vec4_bld.one;
1938          break;
1939       default:
1940          break;
1941       }
1942    }
1943    else {
1944       /* cannot figure this out from format description */
1945       if (format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) {
1946          /* s3tc formats are always unorm */
1947          min_clamp = vec4_bld.zero;
1948          max_clamp = vec4_bld.one;
1949       }
1950       else if (format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC ||
1951                format_desc->layout == UTIL_FORMAT_LAYOUT_ETC) {
1952          switch (format_desc->format) {
1953          case PIPE_FORMAT_RGTC1_UNORM:
1954          case PIPE_FORMAT_RGTC2_UNORM:
1955          case PIPE_FORMAT_LATC1_UNORM:
1956          case PIPE_FORMAT_LATC2_UNORM:
1957          case PIPE_FORMAT_ETC1_RGB8:
1958             min_clamp = vec4_bld.zero;
1959             max_clamp = vec4_bld.one;
1960             break;
1961          case PIPE_FORMAT_RGTC1_SNORM:
1962          case PIPE_FORMAT_RGTC2_SNORM:
1963          case PIPE_FORMAT_LATC1_SNORM:
1964          case PIPE_FORMAT_LATC2_SNORM:
1965             min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
1966             max_clamp = vec4_bld.one;
1967             break;
1968          default:
1969             assert(0);
1970             break;
1971          }
1972       }
1973       /*
1974        * all others from subsampled/other group, though we don't care
1975        * about yuv (and should not have any from zs here)
1976        */
1977       else if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_YUV){
1978          switch (format_desc->format) {
1979          case PIPE_FORMAT_R8G8_B8G8_UNORM:
1980          case PIPE_FORMAT_G8R8_G8B8_UNORM:
1981          case PIPE_FORMAT_G8R8_B8R8_UNORM:
1982          case PIPE_FORMAT_R8G8_R8B8_UNORM:
1983          case PIPE_FORMAT_R1_UNORM: /* doesn't make sense but ah well */
1984             min_clamp = vec4_bld.zero;
1985             max_clamp = vec4_bld.one;
1986             break;
1987          case PIPE_FORMAT_R8G8Bx_SNORM:
1988             min_clamp = lp_build_const_vec(gallivm, vec4_type, -1.0F);
1989             max_clamp = vec4_bld.one;
1990             break;
1991             /*
1992              * Note smallfloat formats usually don't need clamping
1993              * (they still have infinite range) however this is not
1994              * true for r11g11b10 and r9g9b9e5, which can't represent
1995              * negative numbers (and additionally r9g9b9e5 can't represent
1996              * very large numbers). d3d10 seems happy without clamping in
1997              * this case, but gl spec is pretty clear: "for floating
1998              * point and integer formats, border values are clamped to
1999              * the representable range of the format" so do that here.
2000              */
2001          case PIPE_FORMAT_R11G11B10_FLOAT:
2002             min_clamp = vec4_bld.zero;
2003             break;
2004          case PIPE_FORMAT_R9G9B9E5_FLOAT:
2005             min_clamp = vec4_bld.zero;
2006             max_clamp = lp_build_const_vec(gallivm, vec4_type, MAX_RGB9E5);
2007             break;
2008          default:
2009             assert(0);
2010             break;
2011          }
2012       }
2013    }
2014
2015    if (min_clamp) {
2016       border_color = lp_build_max(&vec4_bld, border_color, min_clamp);
2017    }
2018    if (max_clamp) {
2019       border_color = lp_build_min(&vec4_bld, border_color, max_clamp);
2020    }
2021
2022    bld->border_color_clamped = border_color;
2023 }
2024
2025
2026 /**
2027  * General texture sampling codegen.
2028  * This function handles texture sampling for all texture targets (1D,
2029  * 2D, 3D, cube) and all filtering modes.
2030  */
2031 static void
2032 lp_build_sample_general(struct lp_build_sample_context *bld,
2033                         unsigned sampler_unit,
2034                         LLVMValueRef *coords,
2035                         const LLVMValueRef *offsets,
2036                         LLVMValueRef lod_positive,
2037                         LLVMValueRef lod_fpart,
2038                         LLVMValueRef ilevel0,
2039                         LLVMValueRef ilevel1,
2040                         LLVMValueRef *colors_out)
2041 {
2042    LLVMBuilderRef builder = bld->gallivm->builder;
2043    const struct lp_static_sampler_state *sampler_state = bld->static_sampler_state;
2044    const unsigned mip_filter = sampler_state->min_mip_filter;
2045    const unsigned min_filter = sampler_state->min_img_filter;
2046    const unsigned mag_filter = sampler_state->mag_img_filter;
2047    LLVMValueRef texels[4];
2048    unsigned chan;
2049
2050    /* if we need border color, (potentially) clamp it now */
2051    if (lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_s,
2052                                               min_filter,
2053                                               mag_filter) ||
2054        (bld->dims > 1 &&
2055            lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_t,
2056                                                   min_filter,
2057                                                   mag_filter)) ||
2058        (bld->dims > 2 &&
2059            lp_sampler_wrap_mode_uses_border_color(sampler_state->wrap_r,
2060                                                   min_filter,
2061                                                   mag_filter))) {
2062       lp_build_clamp_border_color(bld, sampler_unit);
2063    }
2064
2065
2066    /*
2067     * Get/interpolate texture colors.
2068     */
2069
2070    for (chan = 0; chan < 4; ++chan) {
2071      texels[chan] = lp_build_alloca(bld->gallivm, bld->texel_bld.vec_type, "");
2072      lp_build_name(texels[chan], "sampler%u_texel_%c_var", sampler_unit, "xyzw"[chan]);
2073    }
2074
2075    if (min_filter == mag_filter) {
2076       /* no need to distinguish between minification and magnification */
2077       lp_build_sample_mipmap(bld, min_filter, mip_filter,
2078                              coords, offsets,
2079                              ilevel0, ilevel1, lod_fpart,
2080                              texels);
2081    }
2082    else {
2083       /*
2084        * Could also get rid of the if-logic and always use mipmap_both, both
2085        * for the single lod and multi-lod case if nothing really uses this.
2086        */
2087       if (bld->num_lods == 1) {
2088          /* Emit conditional to choose min image filter or mag image filter
2089           * depending on the lod being > 0 or <= 0, respectively.
2090           */
2091          struct lp_build_if_state if_ctx;
2092
2093          lod_positive = LLVMBuildTrunc(builder, lod_positive,
2094                                        LLVMInt1TypeInContext(bld->gallivm->context), "");
2095
2096          lp_build_if(&if_ctx, bld->gallivm, lod_positive);
2097          {
2098             /* Use the minification filter */
2099             lp_build_sample_mipmap(bld, min_filter, mip_filter,
2100                                    coords, offsets,
2101                                    ilevel0, ilevel1, lod_fpart,
2102                                    texels);
2103          }
2104          lp_build_else(&if_ctx);
2105          {
2106             /* Use the magnification filter */
2107             lp_build_sample_mipmap(bld, mag_filter, PIPE_TEX_MIPFILTER_NONE,
2108                                    coords, offsets,
2109                                    ilevel0, NULL, NULL,
2110                                    texels);
2111          }
2112          lp_build_endif(&if_ctx);
2113       }
2114       else {
2115          LLVMValueRef need_linear, linear_mask;
2116          unsigned mip_filter_for_nearest;
2117          struct lp_build_if_state if_ctx;
2118
2119          if (min_filter == PIPE_TEX_FILTER_LINEAR) {
2120             linear_mask = lod_positive;
2121             mip_filter_for_nearest = PIPE_TEX_MIPFILTER_NONE;
2122          }
2123          else {
2124             linear_mask = lp_build_not(&bld->lodi_bld, lod_positive);
2125             mip_filter_for_nearest = mip_filter;
2126          }
2127          need_linear = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods,
2128                                                linear_mask);
2129
2130          if (bld->num_lods != bld->coord_type.length) {
2131             linear_mask = lp_build_unpack_broadcast_aos_scalars(bld->gallivm,
2132                                                                 bld->lodi_type,
2133                                                                 bld->int_coord_type,
2134                                                                 linear_mask);
2135          }
2136
2137          lp_build_if(&if_ctx, bld->gallivm, need_linear);
2138          {
2139             /*
2140              * Do sampling with both filters simultaneously. This means using
2141              * a linear filter and doing some tricks (with weights) for the pixels
2142              * which need nearest filter.
2143              * Note that it's probably rare some pixels need nearest and some
2144              * linear filter but the fixups required for the nearest pixels
2145              * aren't all that complicated so just always run a combined path
2146              * if at least some pixels require linear.
2147              */
2148             lp_build_sample_mipmap_both(bld, linear_mask, mip_filter,
2149                                         coords, offsets,
2150                                         ilevel0, ilevel1,
2151                                         lod_fpart, lod_positive,
2152                                         texels);
2153          }
2154          lp_build_else(&if_ctx);
2155          {
2156             /*
2157              * All pixels require just nearest filtering, which is way
2158              * cheaper than linear, hence do a separate path for that.
2159              */
2160             lp_build_sample_mipmap(bld, PIPE_TEX_FILTER_NEAREST,
2161                                    mip_filter_for_nearest,
2162                                    coords, offsets,
2163                                    ilevel0, ilevel1, lod_fpart,
2164                                    texels);
2165          }
2166          lp_build_endif(&if_ctx);
2167       }
2168    }
2169
2170    for (chan = 0; chan < 4; ++chan) {
2171      colors_out[chan] = LLVMBuildLoad(builder, texels[chan], "");
2172      lp_build_name(colors_out[chan], "sampler%u_texel_%c", sampler_unit, "xyzw"[chan]);
2173    }
2174 }
2175
2176
2177 /**
2178  * Texel fetch function.
2179  * In contrast to general sampling there is no filtering, no coord minification,
2180  * lod (if any) is always explicit uint, coords are uints (in terms of texel units)
2181  * directly to be applied to the selected mip level (after adding texel offsets).
2182  * This function handles texel fetch for all targets where texel fetch is supported
2183  * (no cube maps, but 1d, 2d, 3d are supported, arrays and buffers should be too).
2184  */
2185 static void
2186 lp_build_fetch_texel(struct lp_build_sample_context *bld,
2187                      unsigned texture_unit,
2188                      const LLVMValueRef *coords,
2189                      LLVMValueRef explicit_lod,
2190                      const LLVMValueRef *offsets,
2191                      LLVMValueRef *colors_out)
2192 {
2193    struct lp_build_context *perquadi_bld = &bld->lodi_bld;
2194    struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
2195    unsigned dims = bld->dims, chan;
2196    unsigned target = bld->static_texture_state->target;
2197    boolean out_of_bound_ret_zero = TRUE;
2198    LLVMValueRef size, ilevel;
2199    LLVMValueRef row_stride_vec = NULL, img_stride_vec = NULL;
2200    LLVMValueRef x = coords[0], y = coords[1], z = coords[2];
2201    LLVMValueRef width, height, depth, i, j;
2202    LLVMValueRef offset, out_of_bounds, out1;
2203
2204    out_of_bounds = int_coord_bld->zero;
2205
2206    if (explicit_lod && bld->static_texture_state->target != PIPE_BUFFER) {
2207       if (bld->num_mips != int_coord_bld->type.length) {
2208          ilevel = lp_build_pack_aos_scalars(bld->gallivm, int_coord_bld->type,
2209                                             perquadi_bld->type, explicit_lod, 0);
2210       }
2211       else {
2212          ilevel = explicit_lod;
2213       }
2214       lp_build_nearest_mip_level(bld, texture_unit, ilevel, &ilevel,
2215                                  out_of_bound_ret_zero ? &out_of_bounds : NULL);
2216    }
2217    else {
2218       assert(bld->num_mips == 1);
2219       if (bld->static_texture_state->target != PIPE_BUFFER) {
2220          ilevel = bld->dynamic_state->first_level(bld->dynamic_state,
2221                                                   bld->gallivm, texture_unit);
2222       }
2223       else {
2224          ilevel = lp_build_const_int32(bld->gallivm, 0);
2225       }
2226    }
2227    lp_build_mipmap_level_sizes(bld, ilevel,
2228                                &size,
2229                                &row_stride_vec, &img_stride_vec);
2230    lp_build_extract_image_sizes(bld, &bld->int_size_bld, int_coord_bld->type,
2231                                 size, &width, &height, &depth);
2232
2233    if (target == PIPE_TEXTURE_1D_ARRAY ||
2234        target == PIPE_TEXTURE_2D_ARRAY) {
2235       if (out_of_bound_ret_zero) {
2236          z = lp_build_layer_coord(bld, texture_unit, z, &out1);
2237          out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2238       }
2239       else {
2240          z = lp_build_layer_coord(bld, texture_unit, z, NULL);
2241       }
2242    }
2243
2244    /* This is a lot like border sampling */
2245    if (offsets[0]) {
2246       /*
2247        * coords are really unsigned, offsets are signed, but I don't think
2248        * exceeding 31 bits is possible
2249        */
2250       x = lp_build_add(int_coord_bld, x, offsets[0]);
2251    }
2252    out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, x, int_coord_bld->zero);
2253    out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2254    out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, x, width);
2255    out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2256
2257    if (dims >= 2) {
2258       if (offsets[1]) {
2259          y = lp_build_add(int_coord_bld, y, offsets[1]);
2260       }
2261       out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, y, int_coord_bld->zero);
2262       out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2263       out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, y, height);
2264       out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2265
2266       if (dims >= 3) {
2267          if (offsets[2]) {
2268             z = lp_build_add(int_coord_bld, z, offsets[2]);
2269          }
2270          out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_LESS, z, int_coord_bld->zero);
2271          out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2272          out1 = lp_build_cmp(int_coord_bld, PIPE_FUNC_GEQUAL, z, depth);
2273          out_of_bounds = lp_build_or(int_coord_bld, out_of_bounds, out1);
2274       }
2275    }
2276
2277    lp_build_sample_offset(int_coord_bld,
2278                           bld->format_desc,
2279                           x, y, z, row_stride_vec, img_stride_vec,
2280                           &offset, &i, &j);
2281
2282    if (bld->static_texture_state->target != PIPE_BUFFER) {
2283       offset = lp_build_add(int_coord_bld, offset,
2284                             lp_build_get_mip_offsets(bld, ilevel));
2285    }
2286
2287    offset = lp_build_andnot(int_coord_bld, offset, out_of_bounds);
2288
2289    lp_build_fetch_rgba_soa(bld->gallivm,
2290                            bld->format_desc,
2291                            bld->texel_type,
2292                            bld->base_ptr, offset,
2293                            i, j,
2294                            colors_out);
2295
2296    if (out_of_bound_ret_zero) {
2297       /*
2298        * Only needed for ARB_robust_buffer_access_behavior and d3d10.
2299        * Could use min/max above instead of out-of-bounds comparisons
2300        * if we don't care about the result returned for out-of-bounds.
2301        */
2302       for (chan = 0; chan < 4; chan++) {
2303          colors_out[chan] = lp_build_select(&bld->texel_bld, out_of_bounds,
2304                                             bld->texel_bld.zero, colors_out[chan]);
2305       }
2306    }
2307 }
2308
2309
2310 /**
2311  * Just set texels to white instead of actually sampling the texture.
2312  * For debugging.
2313  */
2314 void
2315 lp_build_sample_nop(struct gallivm_state *gallivm,
2316                     struct lp_type type,
2317                     const LLVMValueRef *coords,
2318                     LLVMValueRef texel_out[4])
2319 {
2320    LLVMValueRef one = lp_build_one(gallivm, type);
2321    unsigned chan;
2322
2323    for (chan = 0; chan < 4; chan++) {
2324       texel_out[chan] = one;
2325    }  
2326 }
2327
2328
2329 /**
2330  * Build texture sampling code.
2331  * 'texel' will return a vector of four LLVMValueRefs corresponding to
2332  * R, G, B, A.
2333  * \param type  vector float type to use for coords, etc.
2334  * \param is_fetch  if this is a texel fetch instruction.
2335  * \param derivs  partial derivatives of (s,t,r,q) with respect to x and y
2336  */
2337 void
2338 lp_build_sample_soa(struct gallivm_state *gallivm,
2339                     const struct lp_static_texture_state *static_texture_state,
2340                     const struct lp_static_sampler_state *static_sampler_state,
2341                     struct lp_sampler_dynamic_state *dynamic_state,
2342                     struct lp_type type,
2343                     boolean is_fetch,
2344                     unsigned texture_index,
2345                     unsigned sampler_index,
2346                     const LLVMValueRef *coords,
2347                     const LLVMValueRef *offsets,
2348                     const struct lp_derivatives *derivs, /* optional */
2349                     LLVMValueRef lod_bias, /* optional */
2350                     LLVMValueRef explicit_lod, /* optional */
2351                     enum lp_sampler_lod_property lod_property,
2352                     LLVMValueRef texel_out[4])
2353 {
2354    unsigned target = static_texture_state->target;
2355    unsigned dims = texture_dims(target);
2356    unsigned num_quads = type.length / 4;
2357    unsigned mip_filter, min_img_filter, mag_img_filter, i;
2358    struct lp_build_sample_context bld;
2359    struct lp_static_sampler_state derived_sampler_state = *static_sampler_state;
2360    LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
2361    LLVMBuilderRef builder = gallivm->builder;
2362    LLVMValueRef tex_width, newcoords[5];
2363
2364    if (0) {
2365       enum pipe_format fmt = static_texture_state->format;
2366       debug_printf("Sample from %s\n", util_format_name(fmt));
2367    }
2368
2369    if (static_texture_state->format == PIPE_FORMAT_NONE) {
2370       /*
2371        * If there's nothing bound, format is NONE, and we must return
2372        * all zero as mandated by d3d10 in this case.
2373        */
2374       unsigned chan;
2375       LLVMValueRef zero = lp_build_const_vec(gallivm, type, 0.0F);
2376       for (chan = 0; chan < 4; chan++) {
2377          texel_out[chan] = zero;
2378       }
2379       return;
2380    }
2381
2382    assert(type.floating);
2383
2384    /* Setup our build context */
2385    memset(&bld, 0, sizeof bld);
2386    bld.gallivm = gallivm;
2387    bld.static_sampler_state = &derived_sampler_state;
2388    bld.static_texture_state = static_texture_state;
2389    bld.dynamic_state = dynamic_state;
2390    bld.format_desc = util_format_description(static_texture_state->format);
2391    bld.dims = dims;
2392
2393    bld.vector_width = lp_type_width(type);
2394
2395    bld.float_type = lp_type_float(32);
2396    bld.int_type = lp_type_int(32);
2397    bld.coord_type = type;
2398    bld.int_coord_type = lp_int_type(type);
2399    bld.float_size_in_type = lp_type_float(32);
2400    bld.float_size_in_type.length = dims > 1 ? 4 : 1;
2401    bld.int_size_in_type = lp_int_type(bld.float_size_in_type);
2402    bld.texel_type = type;
2403
2404    /* always using the first channel hopefully should be safe,
2405     * if not things WILL break in other places anyway.
2406     */
2407    if (bld.format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB &&
2408        bld.format_desc->channel[0].pure_integer) {
2409       if (bld.format_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) {
2410          bld.texel_type = lp_type_int_vec(type.width, type.width * type.length);
2411       }
2412       else if (bld.format_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) {
2413          bld.texel_type = lp_type_uint_vec(type.width, type.width * type.length);
2414       }
2415    }
2416    else if (util_format_has_stencil(bld.format_desc) &&
2417        !util_format_has_depth(bld.format_desc)) {
2418       /* for stencil only formats, sample stencil (uint) */
2419       bld.texel_type = lp_type_int_vec(type.width, type.width * type.length);
2420    }
2421
2422    if (!static_texture_state->level_zero_only) {
2423       derived_sampler_state.min_mip_filter = static_sampler_state->min_mip_filter;
2424    } else {
2425       derived_sampler_state.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
2426    }
2427    mip_filter = derived_sampler_state.min_mip_filter;
2428
2429    if (0) {
2430       debug_printf("  .min_mip_filter = %u\n", derived_sampler_state.min_mip_filter);
2431    }
2432
2433    if (static_texture_state->target == PIPE_TEXTURE_CUBE ||
2434        static_texture_state->target == PIPE_TEXTURE_CUBE_ARRAY)
2435    {
2436       /*
2437        * Seamless filtering ignores wrap modes.
2438        * Setting to CLAMP_TO_EDGE is correct for nearest filtering, for
2439        * bilinear it's not correct but way better than using for instance repeat.
2440        * Note we even set this for non-seamless. Technically GL allows any wrap
2441        * mode, which made sense when supporting true borders (can get seamless
2442        * effect with border and CLAMP_TO_BORDER), but gallium doesn't support
2443        * borders and d3d9 requires wrap modes to be ignored and it's a pain to fix
2444        * up the sampler state (as it makes it texture dependent).
2445        */
2446       derived_sampler_state.wrap_s = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
2447       derived_sampler_state.wrap_t = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
2448    }
2449
2450    min_img_filter = derived_sampler_state.min_img_filter;
2451    mag_img_filter = derived_sampler_state.mag_img_filter;
2452
2453
2454    /*
2455     * This is all a bit complicated; different paths are chosen for performance
2456     * reasons.
2457     * Essentially, there can be 1 lod per element, 1 lod per quad or 1 lod for
2458     * everything (the last two options are equivalent for 4-wide case).
2459     * If there's per-quad lod but we split to 4-wide so we can use AoS, per-quad
2460     * lod is calculated then the lod value extracted afterwards so making this
2461     * case basically the same as far as lod handling is concerned for the
2462     * further sample/filter code as the 1 lod for everything case.
2463     * Different lod handling mostly shows up when building mipmap sizes
2464     * (lp_build_mipmap_level_sizes() and friends) and also in filtering
2465     * (getting the fractional part of the lod to the right texels).
2466     */
2467
2468    /*
2469     * There are other situations where at least the multiple int lods could be
2470     * avoided like min and max lod being equal.
2471     */
2472    bld.num_mips = bld.num_lods = 1;
2473
2474    if ((gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD) &&
2475        (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) &&
2476        (static_texture_state->target == PIPE_TEXTURE_CUBE) &&
2477        (!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
2478       /*
2479        * special case for using per-pixel lod even for implicit lod,
2480        * which is generally never required (ok by APIs) except to please
2481        * some (somewhat broken imho) tests (because per-pixel face selection
2482        * can cause derivatives to be different for pixels outside the primitive
2483        * due to the major axis division even if pre-project derivatives are
2484        * looking normal).
2485        */
2486       bld.num_mips = type.length;
2487       bld.num_lods = type.length;
2488    }
2489    else if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT ||
2490        (explicit_lod || lod_bias || derivs)) {
2491       if ((is_fetch && target != PIPE_BUFFER) ||
2492           (!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
2493          bld.num_mips = type.length;
2494          bld.num_lods = type.length;
2495       }
2496       else if (!is_fetch && min_img_filter != mag_img_filter) {
2497          bld.num_mips = 1;
2498          bld.num_lods = type.length;
2499       }
2500    }
2501    /* TODO: for true scalar_lod should only use 1 lod value */
2502    else if ((is_fetch && explicit_lod && target != PIPE_BUFFER) ||
2503             (!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
2504       bld.num_mips = num_quads;
2505       bld.num_lods = num_quads;
2506    }
2507    else if (!is_fetch && min_img_filter != mag_img_filter) {
2508       bld.num_mips = 1;
2509       bld.num_lods = num_quads;
2510    }
2511
2512
2513    bld.lodf_type = type;
2514    /* we want native vector size to be able to use our intrinsics */
2515    if (bld.num_lods != type.length) {
2516       /* TODO: this currently always has to be per-quad or per-element */
2517       bld.lodf_type.length = type.length > 4 ? ((type.length + 15) / 16) * 4 : 1;
2518    }
2519    bld.lodi_type = lp_int_type(bld.lodf_type);
2520    bld.levelf_type = bld.lodf_type;
2521    if (bld.num_mips == 1) {
2522       bld.levelf_type.length = 1;
2523    }
2524    bld.leveli_type = lp_int_type(bld.levelf_type);
2525    bld.float_size_type = bld.float_size_in_type;
2526    /* Note: size vectors may not be native. They contain minified w/h/d/_ values,
2527     * with per-element lod that is w0/h0/d0/_/w1/h1/d1/_/... so up to 8x4f32 */
2528    if (bld.num_mips > 1) {
2529       bld.float_size_type.length = bld.num_mips == type.length ?
2530                                       bld.num_mips * bld.float_size_in_type.length :
2531                                       type.length;
2532    }
2533    bld.int_size_type = lp_int_type(bld.float_size_type);
2534
2535    lp_build_context_init(&bld.float_bld, gallivm, bld.float_type);
2536    lp_build_context_init(&bld.float_vec_bld, gallivm, type);
2537    lp_build_context_init(&bld.int_bld, gallivm, bld.int_type);
2538    lp_build_context_init(&bld.coord_bld, gallivm, bld.coord_type);
2539    lp_build_context_init(&bld.int_coord_bld, gallivm, bld.int_coord_type);
2540    lp_build_context_init(&bld.int_size_in_bld, gallivm, bld.int_size_in_type);
2541    lp_build_context_init(&bld.float_size_in_bld, gallivm, bld.float_size_in_type);
2542    lp_build_context_init(&bld.int_size_bld, gallivm, bld.int_size_type);
2543    lp_build_context_init(&bld.float_size_bld, gallivm, bld.float_size_type);
2544    lp_build_context_init(&bld.texel_bld, gallivm, bld.texel_type);
2545    lp_build_context_init(&bld.levelf_bld, gallivm, bld.levelf_type);
2546    lp_build_context_init(&bld.leveli_bld, gallivm, bld.leveli_type);
2547    lp_build_context_init(&bld.lodf_bld, gallivm, bld.lodf_type);
2548    lp_build_context_init(&bld.lodi_bld, gallivm, bld.lodi_type);
2549
2550    /* Get the dynamic state */
2551    tex_width = dynamic_state->width(dynamic_state, gallivm, texture_index);
2552    bld.row_stride_array = dynamic_state->row_stride(dynamic_state, gallivm, texture_index);
2553    bld.img_stride_array = dynamic_state->img_stride(dynamic_state, gallivm, texture_index);
2554    bld.base_ptr = dynamic_state->base_ptr(dynamic_state, gallivm, texture_index);
2555    bld.mip_offsets = dynamic_state->mip_offsets(dynamic_state, gallivm, texture_index);
2556    /* Note that mip_offsets is an array[level] of offsets to texture images */
2557
2558    /* width, height, depth as single int vector */
2559    if (dims <= 1) {
2560       bld.int_size = tex_width;
2561    }
2562    else {
2563       bld.int_size = LLVMBuildInsertElement(builder, bld.int_size_in_bld.undef,
2564                                             tex_width, LLVMConstInt(i32t, 0, 0), "");
2565       if (dims >= 2) {
2566          LLVMValueRef tex_height =
2567             dynamic_state->height(dynamic_state, gallivm, texture_index);
2568          bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
2569                                                tex_height, LLVMConstInt(i32t, 1, 0), "");
2570          if (dims >= 3) {
2571             LLVMValueRef tex_depth =
2572                dynamic_state->depth(dynamic_state, gallivm, texture_index);
2573             bld.int_size = LLVMBuildInsertElement(builder, bld.int_size,
2574                                                   tex_depth, LLVMConstInt(i32t, 2, 0), "");
2575          }
2576       }
2577    }
2578
2579    for (i = 0; i < 5; i++) {
2580       newcoords[i] = coords[i];
2581    }
2582
2583    if (0) {
2584       /* For debug: no-op texture sampling */
2585       lp_build_sample_nop(gallivm,
2586                           bld.texel_type,
2587                           newcoords,
2588                           texel_out);
2589    }
2590
2591    else if (is_fetch) {
2592       lp_build_fetch_texel(&bld, texture_index, newcoords,
2593                            explicit_lod, offsets,
2594                            texel_out);
2595    }
2596
2597    else {
2598       LLVMValueRef lod_fpart = NULL, lod_positive = NULL;
2599       LLVMValueRef ilevel0 = NULL, ilevel1 = NULL;
2600       boolean use_aos = util_format_fits_8unorm(bld.format_desc) &&
2601                         /* not sure this is strictly needed or simply impossible */
2602                         derived_sampler_state.compare_mode == PIPE_TEX_COMPARE_NONE &&
2603                         lp_is_simple_wrap_mode(derived_sampler_state.wrap_s);
2604
2605       use_aos &= bld.num_lods <= num_quads ||
2606                  derived_sampler_state.min_img_filter ==
2607                     derived_sampler_state.mag_img_filter;
2608       if (dims > 1) {
2609          use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_t);
2610          if (dims > 2) {
2611             use_aos &= lp_is_simple_wrap_mode(derived_sampler_state.wrap_r);
2612          }
2613       }
2614       if (static_texture_state->target == PIPE_TEXTURE_CUBE &&
2615           derived_sampler_state.seamless_cube_map &&
2616           (derived_sampler_state.min_img_filter == PIPE_TEX_FILTER_LINEAR ||
2617            derived_sampler_state.mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
2618          /* theoretically possible with AoS filtering but not implemented (complex!) */
2619          use_aos = 0;
2620       }
2621
2622       if ((gallivm_debug & GALLIVM_DEBUG_PERF) &&
2623           !use_aos && util_format_fits_8unorm(bld.format_desc)) {
2624          debug_printf("%s: using floating point linear filtering for %s\n",
2625                       __FUNCTION__, bld.format_desc->short_name);
2626          debug_printf("  min_img %d  mag_img %d  mip %d  target %d  seamless %d"
2627                       "  wraps %d  wrapt %d  wrapr %d\n",
2628                       derived_sampler_state.min_img_filter,
2629                       derived_sampler_state.mag_img_filter,
2630                       derived_sampler_state.min_mip_filter,
2631                       static_texture_state->target,
2632                       derived_sampler_state.seamless_cube_map,
2633                       derived_sampler_state.wrap_s,
2634                       derived_sampler_state.wrap_t,
2635                       derived_sampler_state.wrap_r);
2636       }
2637
2638       lp_build_sample_common(&bld, texture_index, sampler_index,
2639                              newcoords,
2640                              derivs, lod_bias, explicit_lod,
2641                              &lod_positive, &lod_fpart,
2642                              &ilevel0, &ilevel1);
2643
2644       /*
2645        * we only try 8-wide sampling with soa as it appears to
2646        * be a loss with aos with AVX (but it should work, except
2647        * for conformance if min_filter != mag_filter if num_lods > 1).
2648        * (It should be faster if we'd support avx2)
2649        */
2650       if (num_quads == 1 || !use_aos) {
2651          if (use_aos) {
2652             /* do sampling/filtering with fixed pt arithmetic */
2653             lp_build_sample_aos(&bld, sampler_index,
2654                                 newcoords[0], newcoords[1],
2655                                 newcoords[2],
2656                                 offsets, lod_positive, lod_fpart,
2657                                 ilevel0, ilevel1,
2658                                 texel_out);
2659          }
2660
2661          else {
2662             lp_build_sample_general(&bld, sampler_index,
2663                                     newcoords, offsets,
2664                                     lod_positive, lod_fpart,
2665                                     ilevel0, ilevel1,
2666                                     texel_out);
2667          }
2668       }
2669       else {
2670          unsigned j;
2671          struct lp_build_sample_context bld4;
2672          struct lp_type type4 = type;
2673          unsigned i;
2674          LLVMValueRef texelout4[4];
2675          LLVMValueRef texelouttmp[4][LP_MAX_VECTOR_LENGTH/16];
2676
2677          type4.length = 4;
2678
2679          /* Setup our build context */
2680          memset(&bld4, 0, sizeof bld4);
2681          bld4.gallivm = bld.gallivm;
2682          bld4.static_texture_state = bld.static_texture_state;
2683          bld4.static_sampler_state = bld.static_sampler_state;
2684          bld4.dynamic_state = bld.dynamic_state;
2685          bld4.format_desc = bld.format_desc;
2686          bld4.dims = bld.dims;
2687          bld4.row_stride_array = bld.row_stride_array;
2688          bld4.img_stride_array = bld.img_stride_array;
2689          bld4.base_ptr = bld.base_ptr;
2690          bld4.mip_offsets = bld.mip_offsets;
2691          bld4.int_size = bld.int_size;
2692
2693          bld4.vector_width = lp_type_width(type4);
2694
2695          bld4.float_type = lp_type_float(32);
2696          bld4.int_type = lp_type_int(32);
2697          bld4.coord_type = type4;
2698          bld4.int_coord_type = lp_int_type(type4);
2699          bld4.float_size_in_type = lp_type_float(32);
2700          bld4.float_size_in_type.length = dims > 1 ? 4 : 1;
2701          bld4.int_size_in_type = lp_int_type(bld4.float_size_in_type);
2702          bld4.texel_type = bld.texel_type;
2703          bld4.texel_type.length = 4;
2704
2705          bld4.num_mips = bld4.num_lods = 1;
2706          if ((gallivm_debug & GALLIVM_DEBUG_NO_QUAD_LOD) &&
2707              (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) &&
2708              (static_texture_state->target == PIPE_TEXTURE_CUBE) &&
2709              (!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
2710             bld4.num_mips = type4.length;
2711             bld4.num_lods = type4.length;
2712          }
2713          if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT &&
2714              (explicit_lod || lod_bias || derivs)) {
2715             if ((is_fetch && target != PIPE_BUFFER) ||
2716                 (!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE)) {
2717                bld4.num_mips = type4.length;
2718                bld4.num_lods = type4.length;
2719             }
2720             else if (!is_fetch && min_img_filter != mag_img_filter) {
2721                bld4.num_mips = 1;
2722                bld4.num_lods = type4.length;
2723             }
2724          }
2725
2726          /* we want native vector size to be able to use our intrinsics */
2727          bld4.lodf_type = type4;
2728          if (bld4.num_lods != type4.length) {
2729             bld4.lodf_type.length = 1;
2730          }
2731          bld4.lodi_type = lp_int_type(bld4.lodf_type);
2732          bld4.levelf_type = type4;
2733          if (bld4.num_mips != type4.length) {
2734             bld4.levelf_type.length = 1;
2735          }
2736          bld4.leveli_type = lp_int_type(bld4.levelf_type);
2737          bld4.float_size_type = bld4.float_size_in_type;
2738          if (bld4.num_mips > 1) {
2739             bld4.float_size_type.length = bld4.num_mips == type4.length ?
2740                                             bld4.num_mips * bld4.float_size_in_type.length :
2741                                             type4.length;
2742          }
2743          bld4.int_size_type = lp_int_type(bld4.float_size_type);
2744
2745          lp_build_context_init(&bld4.float_bld, gallivm, bld4.float_type);
2746          lp_build_context_init(&bld4.float_vec_bld, gallivm, type4);
2747          lp_build_context_init(&bld4.int_bld, gallivm, bld4.int_type);
2748          lp_build_context_init(&bld4.coord_bld, gallivm, bld4.coord_type);
2749          lp_build_context_init(&bld4.int_coord_bld, gallivm, bld4.int_coord_type);
2750          lp_build_context_init(&bld4.int_size_in_bld, gallivm, bld4.int_size_in_type);
2751          lp_build_context_init(&bld4.float_size_in_bld, gallivm, bld4.float_size_in_type);
2752          lp_build_context_init(&bld4.int_size_bld, gallivm, bld4.int_size_type);
2753          lp_build_context_init(&bld4.float_size_bld, gallivm, bld4.float_size_type);
2754          lp_build_context_init(&bld4.texel_bld, gallivm, bld4.texel_type);
2755          lp_build_context_init(&bld4.levelf_bld, gallivm, bld4.levelf_type);
2756          lp_build_context_init(&bld4.leveli_bld, gallivm, bld4.leveli_type);
2757          lp_build_context_init(&bld4.lodf_bld, gallivm, bld4.lodf_type);
2758          lp_build_context_init(&bld4.lodi_bld, gallivm, bld4.lodi_type);
2759
2760          for (i = 0; i < num_quads; i++) {
2761             LLVMValueRef s4, t4, r4;
2762             LLVMValueRef lod_positive4, lod_fpart4 = NULL;
2763             LLVMValueRef ilevel04, ilevel14 = NULL;
2764             LLVMValueRef offsets4[4] = { NULL };
2765             unsigned num_lods = bld4.num_lods;
2766
2767             s4 = lp_build_extract_range(gallivm, newcoords[0], 4*i, 4);
2768             t4 = lp_build_extract_range(gallivm, newcoords[1], 4*i, 4);
2769             r4 = lp_build_extract_range(gallivm, newcoords[2], 4*i, 4);
2770
2771             if (offsets[0]) {
2772                offsets4[0] = lp_build_extract_range(gallivm, offsets[0], 4*i, 4);
2773                if (dims > 1) {
2774                   offsets4[1] = lp_build_extract_range(gallivm, offsets[1], 4*i, 4);
2775                   if (dims > 2) {
2776                      offsets4[2] = lp_build_extract_range(gallivm, offsets[2], 4*i, 4);
2777                   }
2778                }
2779             }
2780             lod_positive4 = lp_build_extract_range(gallivm, lod_positive, num_lods * i, num_lods);
2781             ilevel04 = bld.num_mips == 1 ? ilevel0 :
2782                           lp_build_extract_range(gallivm, ilevel0, num_lods * i, num_lods);
2783             if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
2784                ilevel14 = lp_build_extract_range(gallivm, ilevel1, num_lods * i, num_lods);
2785                lod_fpart4 = lp_build_extract_range(gallivm, lod_fpart, num_lods * i, num_lods);
2786             }
2787
2788             if (use_aos) {
2789                /* do sampling/filtering with fixed pt arithmetic */
2790                lp_build_sample_aos(&bld4, sampler_index,
2791                                    s4, t4, r4, offsets4,
2792                                    lod_positive4, lod_fpart4,
2793                                    ilevel04, ilevel14,
2794                                    texelout4);
2795             }
2796
2797             else {
2798                /* this path is currently unreachable and hence might break easily... */
2799                LLVMValueRef newcoords4[5];
2800                newcoords4[0] = s4;
2801                newcoords4[1] = t4;
2802                newcoords4[2] = r4;
2803                newcoords4[3] = lp_build_extract_range(gallivm, newcoords[3], 4*i, 4);
2804                newcoords4[4] = lp_build_extract_range(gallivm, newcoords[4], 4*i, 4);
2805
2806                lp_build_sample_general(&bld4, sampler_index,
2807                                        newcoords4, offsets4,
2808                                        lod_positive4, lod_fpart4,
2809                                        ilevel04, ilevel14,
2810                                        texelout4);
2811             }
2812             for (j = 0; j < 4; j++) {
2813                texelouttmp[j][i] = texelout4[j];
2814             }
2815          }
2816
2817          for (j = 0; j < 4; j++) {
2818             texel_out[j] = lp_build_concat(gallivm, texelouttmp[j], type4, num_quads);
2819          }
2820       }
2821    }
2822
2823    if (target != PIPE_BUFFER) {
2824       apply_sampler_swizzle(&bld, texel_out);
2825    }
2826
2827    /*
2828     * texel type can be a (32bit) int/uint (for pure int formats only),
2829     * however we are expected to always return floats (storage is untyped).
2830     */
2831    if (!bld.texel_type.floating) {
2832       unsigned chan;
2833       for (chan = 0; chan < 4; chan++) {
2834          texel_out[chan] = LLVMBuildBitCast(builder, texel_out[chan],
2835                                             lp_build_vec_type(gallivm, type), "");
2836       }
2837    }
2838 }
2839
2840 void
2841 lp_build_size_query_soa(struct gallivm_state *gallivm,
2842                         const struct lp_static_texture_state *static_state,
2843                         struct lp_sampler_dynamic_state *dynamic_state,
2844                         struct lp_type int_type,
2845                         unsigned texture_unit,
2846                         unsigned target,
2847                         boolean is_sviewinfo,
2848                         enum lp_sampler_lod_property lod_property,
2849                         LLVMValueRef explicit_lod,
2850                         LLVMValueRef *sizes_out)
2851 {
2852    LLVMValueRef lod, level, size;
2853    LLVMValueRef first_level = NULL;
2854    int dims, i;
2855    boolean has_array;
2856    unsigned num_lods = 1;
2857    struct lp_build_context bld_int_vec4;
2858
2859    if (static_state->format == PIPE_FORMAT_NONE) {
2860       /*
2861        * If there's nothing bound, format is NONE, and we must return
2862        * all zero as mandated by d3d10 in this case.
2863        */
2864       unsigned chan;
2865       LLVMValueRef zero = lp_build_const_vec(gallivm, int_type, 0.0F);
2866       for (chan = 0; chan < 4; chan++) {
2867          sizes_out[chan] = zero;
2868       }
2869       return;
2870    }
2871
2872    /*
2873     * Do some sanity verification about bound texture and shader dcl target.
2874     * Not entirely sure what's possible but assume array/non-array
2875     * always compatible (probably not ok for OpenGL but d3d10 has no
2876     * distinction of arrays at the resource level).
2877     * Everything else looks bogus (though not entirely sure about rect/2d).
2878     * Currently disabled because it causes assertion failures if there's
2879     * nothing bound (or rather a dummy texture, not that this case would
2880     * return the right values).
2881     */
2882    if (0 && static_state->target != target) {
2883       if (static_state->target == PIPE_TEXTURE_1D)
2884          assert(target == PIPE_TEXTURE_1D_ARRAY);
2885       else if (static_state->target == PIPE_TEXTURE_1D_ARRAY)
2886          assert(target == PIPE_TEXTURE_1D);
2887       else if (static_state->target == PIPE_TEXTURE_2D)
2888          assert(target == PIPE_TEXTURE_2D_ARRAY);
2889       else if (static_state->target == PIPE_TEXTURE_2D_ARRAY)
2890          assert(target == PIPE_TEXTURE_2D);
2891       else if (static_state->target == PIPE_TEXTURE_CUBE)
2892          assert(target == PIPE_TEXTURE_CUBE_ARRAY);
2893       else if (static_state->target == PIPE_TEXTURE_CUBE_ARRAY)
2894          assert(target == PIPE_TEXTURE_CUBE);
2895       else
2896          assert(0);
2897    }
2898
2899    dims = texture_dims(target);
2900
2901    switch (target) {
2902    case PIPE_TEXTURE_1D_ARRAY:
2903    case PIPE_TEXTURE_2D_ARRAY:
2904       has_array = TRUE;
2905       break;
2906    default:
2907       has_array = FALSE;
2908       break;
2909    }
2910
2911    assert(!int_type.floating);
2912
2913    lp_build_context_init(&bld_int_vec4, gallivm, lp_type_int_vec(32, 128));
2914
2915    if (explicit_lod) {
2916       /* FIXME: this needs to honor per-element lod */
2917       lod = LLVMBuildExtractElement(gallivm->builder, explicit_lod, lp_build_const_int32(gallivm, 0), "");
2918       first_level = dynamic_state->first_level(dynamic_state, gallivm, texture_unit);
2919       level = LLVMBuildAdd(gallivm->builder, lod, first_level, "level");
2920       lod = lp_build_broadcast_scalar(&bld_int_vec4, level);
2921    } else {
2922       lod = bld_int_vec4.zero;
2923    }
2924
2925    size = bld_int_vec4.undef;
2926
2927    size = LLVMBuildInsertElement(gallivm->builder, size,
2928                                  dynamic_state->width(dynamic_state, gallivm, texture_unit),
2929                                  lp_build_const_int32(gallivm, 0), "");
2930
2931    if (dims >= 2) {
2932       size = LLVMBuildInsertElement(gallivm->builder, size,
2933                                     dynamic_state->height(dynamic_state, gallivm, texture_unit),
2934                                     lp_build_const_int32(gallivm, 1), "");
2935    }
2936
2937    if (dims >= 3) {
2938       size = LLVMBuildInsertElement(gallivm->builder, size,
2939                                     dynamic_state->depth(dynamic_state, gallivm, texture_unit),
2940                                     lp_build_const_int32(gallivm, 2), "");
2941    }
2942
2943    size = lp_build_minify(&bld_int_vec4, size, lod, TRUE);
2944
2945    if (has_array)
2946       size = LLVMBuildInsertElement(gallivm->builder, size,
2947                                     dynamic_state->depth(dynamic_state, gallivm, texture_unit),
2948                                     lp_build_const_int32(gallivm, dims), "");
2949
2950    /*
2951     * d3d10 requires zero for x/y/z values (but not w, i.e. mip levels)
2952     * if level is out of bounds (note this can't cover unbound texture
2953     * here, which also requires returning zero).
2954     */
2955    if (explicit_lod && is_sviewinfo) {
2956       LLVMValueRef last_level, out, out1;
2957       struct lp_build_context leveli_bld;
2958
2959       /* everything is scalar for now */
2960       lp_build_context_init(&leveli_bld, gallivm, lp_type_int_vec(32, 32));
2961       last_level = dynamic_state->last_level(dynamic_state, gallivm, texture_unit);
2962
2963       out = lp_build_cmp(&leveli_bld, PIPE_FUNC_LESS, level, first_level);
2964       out1 = lp_build_cmp(&leveli_bld, PIPE_FUNC_GREATER, level, last_level);
2965       out = lp_build_or(&leveli_bld, out, out1);
2966       if (num_lods == 1) {
2967          out = lp_build_broadcast_scalar(&bld_int_vec4, out);
2968       }
2969       else {
2970          /* TODO */
2971          assert(0);
2972       }
2973       size = lp_build_andnot(&bld_int_vec4, size, out);
2974    }
2975    for (i = 0; i < dims + (has_array ? 1 : 0); i++) {
2976       sizes_out[i] = lp_build_extract_broadcast(gallivm, bld_int_vec4.type, int_type,
2977                                                 size,
2978                                                 lp_build_const_int32(gallivm, i));
2979    }
2980    if (is_sviewinfo) {
2981       for (; i < 4; i++) {
2982          sizes_out[i] = lp_build_const_vec(gallivm, int_type, 0.0);
2983       }
2984    }
2985
2986    /*
2987     * if there's no explicit_lod (buffers, rects) queries requiring nr of
2988     * mips would be illegal.
2989     */
2990    if (is_sviewinfo && explicit_lod) {
2991       struct lp_build_context bld_int_scalar;
2992       LLVMValueRef num_levels;
2993       lp_build_context_init(&bld_int_scalar, gallivm, lp_type_int(32));
2994
2995       if (static_state->level_zero_only) {
2996          num_levels = bld_int_scalar.one;
2997       }
2998       else {
2999          LLVMValueRef last_level;
3000
3001          last_level = dynamic_state->last_level(dynamic_state, gallivm, texture_unit);
3002          num_levels = lp_build_sub(&bld_int_scalar, last_level, first_level);
3003          num_levels = lp_build_add(&bld_int_scalar, num_levels, bld_int_scalar.one);
3004       }
3005       sizes_out[3] = lp_build_broadcast(gallivm, lp_build_vec_type(gallivm, int_type),
3006                                         num_levels);
3007    }
3008 }