OSDN Git Service

26d8db0fd095817395e042ca2a76ccfa53db327b
[android-x86/external-s2tc.git] / s2tc_algorithm.cpp
1 /*
2  * Copyright (C) 2011  Rudolf Polzer   All Rights Reserved.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included
12  * in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * RUDOLF POLZER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
18  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
19  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20  */
21 #define S2TC_LICENSE_IDENTIFIER s2tc_algorithm_license
22 #include "s2tc_license.h"
23
24 #include <math.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <stdio.h>
28 #include <stdint.h>
29 #include <algorithm>
30 #include <iostream>
31
32 #include "s2tc_algorithm.h"
33 #include "s2tc_common.h"
34
35 namespace
36 {
// Per-type value limits. The primary template is deliberately empty: only
// the explicit specializations provide min_value and max_value.
template<class T> struct color_type_info {};

// Limits of a single 8-bit value.
template<> struct color_type_info<unsigned char>
{
    static const unsigned char max_value = 255;
    static const unsigned char min_value = 0;
};
45
// RGB triple stored as three signed chars.
struct color_t
{
    signed char r, g, b;
};

// Returns the all-zero color.
inline color_t make_color_t()
{
    color_t c = {0, 0, 0};
    return c;
}

// Returns the color with the given channel values.
inline color_t make_color_t(signed char r_, signed char g_, signed char b_)
{
    color_t c = {r_, g_, b_};
    return c;
}

// Builds a color from a single int: r and b receive i >> 3, g receives i >> 2.
inline color_t make_color_t(int i)
{
    // The casts are explicit because an implicit int -> signed char
    // conversion inside a braced initializer is a narrowing conversion.
    color_t c = {
        static_cast<signed char>(i >> 3),
        static_cast<signed char>(i >> 2),
        static_cast<signed char>(i >> 3)
    };
    return c;
}
62         inline bool operator==(const color_t &a, const color_t &b)
63         {
64                 return a.r == b.r && a.g == b.g && a.b == b.b;
65         }
66         inline bool operator<(const color_t &a, const color_t &b)
67         {
68                 signed char d;
69                 d = a.r - b.r;
70                 if(d)
71                         return d < 0;
72                 d = a.g - b.g;
73                 if(d)
74                         return d < 0;
75                 d = a.b - b.b;
76                 return d < 0;
77         }
78         inline color_t &operator--(color_t &c)
79         {
80                 if(c.b > 0)
81                 {
82                         --c.b;
83                 }
84                 else if(c.g > 0)
85                 {
86                         c.b = 31;
87                         --c.g;
88                 }
89                 else if(c.r > 0)
90                 {
91                         c.b = 31;
92                         c.g = 63;
93                         --c.r;
94                 }
95                 else
96                 {
97                         c.b = 31;
98                         c.g = 63;
99                         c.r = 31;
100                 }
101                 return c;
102         }
103         inline color_t &operator++(color_t &c)
104         {
105                 if(c.b < 31)
106                 {
107                         ++c.b;
108                 }
109                 else if(c.g < 63)
110                 {
111                         c.b = 0;
112                         ++c.g;
113                 }
114                 else if(c.r < 31)
115                 {
116                         c.b = 0;
117                         c.g = 0;
118                         ++c.r;
119                 }
120                 else
121                 {
122                         c.b = 0;
123                         c.g = 0;
124                         c.r = 0;
125                 }
126                 return c;
127         }
128         template<> struct color_type_info<color_t>
129         {
130                 static const color_t min_value;
131                 static const color_t max_value;
132         };
133         const color_t color_type_info<color_t>::min_value = { 0, 0, 0 };
134         const color_t color_type_info<color_t>::max_value = { 31, 63, 31 };
135
136         struct bigcolor_t
137         {
138                 int r, g, b;
139
140                 inline bigcolor_t(): r(0), g(0), b(0)
141                 {
142                 }
143
144                 inline bigcolor_t &operator+=(const color_t &c)
145                 {
146                         r += c.r;
147                         g += c.g;
148                         b += c.b;
149                         return *this;
150                 }
151
152                 inline bigcolor_t &operator+=(int v)
153                 {
154                         r += v;
155                         g += v;
156                         b += v;
157                         return *this;
158                 }
159
160                 inline bigcolor_t operator+(int v)
161                 {
162                         bigcolor_t out = *this;
163                         out += v;
164                         return out;
165                 }
166
167                 inline bigcolor_t &operator/=(int v)
168                 {
169                         r /= v;
170                         g /= v;
171                         b /= v;
172                         return *this;
173                 }
174
175                 inline bigcolor_t operator/(int v)
176                 {
177                         bigcolor_t out = *this;
178                         out /= v;
179                         return out;
180                 }
181
182                 inline bigcolor_t &operator<<=(int v)
183                 {
184                         r <<= v;
185                         g <<= v;
186                         b <<= v;
187                         return *this;
188                 }
189
190                 inline bigcolor_t operator<<(int v)
191                 {
192                         bigcolor_t out = *this;
193                         out <<= v;
194                         return out;
195                 }
196
197                 inline operator color_t()
198                 {
199                         color_t out;
200                         out.r = r & 31;
201                         out.g = g & 63;
202                         out.b = b & 31;
203                         return out;
204                 }
205         };
206
207         std::ostream &operator<<(std::ostream &ost, const color_t &c)
208         {
209                 return ost << "make_color_t(" << int(c.r) << ", " << int(c.g) << ", " << int(c.b) << ")";
210         }
211
212         std::ostream &operator<<(std::ostream &ost, const bigcolor_t &c)
213         {
214                 return ost << "bigcolor_t(" << c.r << ", " << c.g << ", " << c.b << ")";
215         }
216
217         // 16 differences must fit in int
218         // i.e. a difference must be lower than 2^27
219
// SHRR(a, n): shift a right by n bits with rounding, i.e. half of 2^n is
// added before the shift. n must be at least 1.
#define SHRR(a, n) (((a) + (1 << ((n) - 1))) >> (n))
222
223         inline int color_dist_avg(const color_t &a, const color_t &b)
224         {
225                 int dr = a.r - b.r; // multiplier: 31 (-1..1)
226                 int dg = a.g - b.g; // multiplier: 63 (-1..1)
227                 int db = a.b - b.b; // multiplier: 31 (-1..1)
228                 return ((dr*dr) << 2) + dg*dg + ((db*db) << 2);
229         }
230
231         inline int color_dist_wavg(const color_t &a, const color_t &b)
232         {
233                 int dr = a.r - b.r; // multiplier: 31 (-1..1)
234                 int dg = a.g - b.g; // multiplier: 63 (-1..1)
235                 int db = a.b - b.b; // multiplier: 31 (-1..1)
236                 return ((dr*dr) << 2) + ((dg*dg) << 2) + (db*db);
237                 // weighted 4:16:1
238         }
239
240         inline int color_dist_yuv(const color_t &a, const color_t &b)
241         {
242                 int dr = a.r - b.r; // multiplier: 31 (-1..1)
243                 int dg = a.g - b.g; // multiplier: 63 (-1..1)
244                 int db = a.b - b.b; // multiplier: 31 (-1..1)
245                 int y = dr * 30*2 + dg * 59 + db * 11*2; // multiplier: 6259
246                 int u = dr * 202 - y; // * 0.5 / (1 - 0.30)
247                 int v = db * 202 - y; // * 0.5 / (1 - 0.11)
248                 return ((y*y) << 1) + SHRR(u*u, 3) + SHRR(v*v, 4);
249                 // weight for u: sqrt(2^-4) / (0.5 / (1 - 0.30)) = 0.350
250                 // weight for v: sqrt(2^-5) / (0.5 / (1 - 0.11)) = 0.315
251         }
252
253         inline int color_dist_rgb(const color_t &a, const color_t &b)
254         {
255                 int dr = a.r - b.r; // multiplier: 31 (-1..1)
256                 int dg = a.g - b.g; // multiplier: 63 (-1..1)
257                 int db = a.b - b.b; // multiplier: 31 (-1..1)
258                 int y = dr * 21*2 + dg * 72 + db * 7*2; // multiplier: 6272
259                 int u = dr * 202 - y; // * 0.5 / (1 - 0.21)
260                 int v = db * 202 - y; // * 0.5 / (1 - 0.07)
261                 return ((y*y) << 1) + SHRR(u*u, 3) + SHRR(v*v, 4);
262                 // weight for u: sqrt(2^-4) / (0.5 / (1 - 0.21)) = 0.395
263                 // weight for v: sqrt(2^-5) / (0.5 / (1 - 0.07)) = 0.328
264         }
265
266         inline int color_dist_srgb(const color_t &a, const color_t &b)
267         {
268                 int dr = a.r * (int) a.r - b.r * (int) b.r; // multiplier: 31*31
269                 int dg = a.g * (int) a.g - b.g * (int) b.g; // multiplier: 63*63
270                 int db = a.b * (int) a.b - b.b * (int) b.b; // multiplier: 31*31
271                 int y = dr * 21*2*2 + dg * 72 + db * 7*2*2; // multiplier: 393400
272                 int u = dr * 409 - y; // * 0.5 / (1 - 0.30)
273                 int v = db * 409 - y; // * 0.5 / (1 - 0.11)
274                 int sy = SHRR(y, 3) * SHRR(y, 4);
275                 int su = SHRR(u, 3) * SHRR(u, 4);
276                 int sv = SHRR(v, 3) * SHRR(v, 4);
277                 return SHRR(sy, 4) + SHRR(su, 8) + SHRR(sv, 9);
278                 // weight for u: sqrt(2^-4) / (0.5 / (1 - 0.30)) = 0.350
279                 // weight for v: sqrt(2^-5) / (0.5 / (1 - 0.11)) = 0.315
280         }
281
282         inline int srgb_get_y(const color_t &a)
283         {
284                 // convert to linear
285                 int r = a.r * (int) a.r;
286                 int g = a.g * (int) a.g;
287                 int b = a.b * (int) a.b;
288                 // find luminance
289                 int y = 37 * (r * 21*2*2 + g * 72 + b * 7*2*2); // multiplier: 14555800
290                 // square root it (!)
291                 y = sqrtf(y) + 0.5f; // now in range 0 to 3815
292                 return y;
293         }
294
295         inline int color_dist_srgb_mixed(const color_t &a, const color_t &b)
296         {
297                 // get Y
298                 int ay = srgb_get_y(a);
299                 int by = srgb_get_y(b);
300                 // get UV
301                 int au = a.r * 191 - ay;
302                 int av = a.b * 191 - ay;
303                 int bu = b.r * 191 - by;
304                 int bv = b.b * 191 - by;
305                 // get differences
306                 int y = ay - by;
307                 int u = au - bu;
308                 int v = av - bv;
309                 return ((y*y) << 3) + SHRR(u*u, 1) + SHRR(v*v, 2);
310                 // weight for u: ???
311                 // weight for v: ???
312         }
313
314         inline int color_dist_normalmap(const color_t &a, const color_t &b)
315         {
316                 float ca[3], cb[3], n;
317                 ca[0] = a.r / 31.0f * 2 - 1;
318                 ca[1] = a.g / 63.0f * 2 - 1;
319                 ca[2] = a.b / 31.0f * 2 - 1;
320                 cb[0] = b.r / 31.0f * 2 - 1;
321                 cb[1] = b.g / 63.0f * 2 - 1;
322                 cb[2] = b.b / 31.0f * 2 - 1;
323                 n = ca[0] * ca[0] + ca[1] * ca[1] + ca[2] * ca[2];
324                 if(n > 0)
325                 {
326                         n = 1.0f / sqrtf(n);
327                         ca[0] *= n;
328                         ca[1] *= n;
329                         ca[2] *= n;
330                 }
331                 n = cb[0] * cb[0] + cb[1] * cb[1] + cb[2] * cb[2];
332                 if(n > 0)
333                 {
334                         n = 1.0f / sqrtf(n);
335                         cb[0] *= n;
336                         cb[1] *= n;
337                         cb[2] *= n;
338                 }
339
340                 return
341                         100000 *
342                         (
343                                 (cb[0] - ca[0]) * (cb[0] - ca[0])
344                                 +
345                                 (cb[1] - ca[1]) * (cb[1] - ca[1])
346                                 +
347                                 (cb[2] - ca[2]) * (cb[2] - ca[2])
348                         )
349                         ;
350                 // max value: 1000 * (4 + 4 + 4) = 6000
351         }
352
353         typedef int ColorDistFunc(const color_t &a, const color_t &b);
354
// Squared difference of two 8-bit values, computed in int.
inline int alpha_dist(unsigned char a, unsigned char b)
{
    const int delta = static_cast<int>(a) - static_cast<int>(b);
    return delta * delta;
}
359
template <class T, class F>
// Picks the two entries of c[0..m) that best represent c[0..n) and moves
// them to c[0] and c[1].
//
// n: input count
// m: total color count (including non-counted inputs)
// m >= n
// dist: callable as dist(T, T), returning an int distance
//
// Every pair (i, j) with i < j < m is scored by summing, over the n inputs,
// the distance to the nearer one of c[i] and c[j]; the first pair with the
// lowest sum wins. The distance table is a variable-length array, which is
// a compiler extension in C++.
inline void reduce_colors_inplace(T *c, int n, int m, F dist)
{
    // With fewer than two candidates there is no pair to choose, and with no
    // inputs the table below would have zero size, which is not allowed.
    if(n <= 0 || m < 2)
        return;

    int i, j, k;
    int dists[m][n];

    // first the square: input against input, symmetric with a zero diagonal
    for(i = 0; i < n; ++i)
    {
        dists[i][i] = 0;
        for(j = i + 1; j < n; ++j)
        {
            const int d = dist(c[i], c[j]);
            dists[i][j] = d;
            dists[j][i] = d;
        }
    }
    // then the box: the remaining candidates against every input
    for(; i < m; ++i)
        for(j = 0; j < n; ++j)
            dists[i][j] = dist(c[i], c[j]);

    int bestsum = -1;
    int besti = 0;
    int bestj = 1;
    for(i = 0; i < m; ++i)
        for(j = i + 1; j < m; ++j)
        {
            int sum = 0;
            for(k = 0; k < n; ++k)
            {
                const int di = dists[i][k];
                const int dj = dists[j][k];
                sum += (dj < di) ? dj : di;
            }
            if(bestsum < 0 || sum < bestsum)
            {
                bestsum = sum;
                besti = i;
                bestj = j;
            }
        }

    // besti < bestj, so writing c[0] first cannot clobber c[bestj]
    if(besti != 0)
        c[0] = c[besti];
    if(bestj != 1)
        c[1] = c[bestj];
}
// Variant of reduce_colors_inplace() in which two extra colors, fix0 and
// fix1, are always available: the score of a pair (i, j) sums, over the n
// inputs, the distance to the nearest of c[i], c[j], fix0 and fix1. The
// first pair with the lowest sum is moved to c[0] and c[1].
//
// n: input count; m: total color count (m >= n);
// dist: callable as dist(T, T), returning an int distance.
// The distance table is a variable-length array, which is a compiler
// extension in C++.
template <class T, class F>
inline void reduce_colors_inplace_2fixpoints(T *c, int n, int m, F dist, const T &fix0, const T &fix1)
{
    // TODO fix this for ramp encoding!

    // With fewer than two candidates there is no pair to choose, and with no
    // inputs the table below would have zero size, which is not allowed.
    if(n <= 0 || m < 2)
        return;

    int i, j, k;
    int dists[m + 2][n];

    // first the square: input against input, symmetric with a zero diagonal
    for(i = 0; i < n; ++i)
    {
        dists[i][i] = 0;
        for(j = i + 1; j < n; ++j)
        {
            const int d = dist(c[i], c[j]);
            dists[i][j] = d;
            dists[j][i] = d;
        }
    }
    // then the box: the remaining candidates against every input
    for(; i < m; ++i)
        for(j = 0; j < n; ++j)
            dists[i][j] = dist(c[i], c[j]);
    // then the two extra rows: fix0 in row m, fix1 in row m + 1
    for(j = 0; j < n; ++j)
        dists[m][j] = dist(fix0, c[j]);
    for(j = 0; j < n; ++j)
        dists[m + 1][j] = dist(fix1, c[j]);

    int bestsum = -1;
    int besti = 0;
    int bestj = 1;
    for(i = 0; i < m; ++i)
        for(j = i + 1; j < m; ++j)
        {
            int sum = 0;
            for(k = 0; k < n; ++k)
            {
                int nearest = dists[i][k];
                if(dists[j][k] < nearest)
                    nearest = dists[j][k];
                if(dists[m][k] < nearest)
                    nearest = dists[m][k];
                if(dists[m + 1][k] < nearest)
                    nearest = dists[m + 1][k];
                sum += nearest;
            }
            if(bestsum < 0 || sum < bestsum)
            {
                bestsum = sum;
                besti = i;
                bestj = j;
            }
        }

    // besti < bestj, so writing c[0] first cannot clobber c[bestj]
    if(besti != 0)
        c[0] = c[besti];
    if(bestj != 1)
        c[1] = c[bestj];
}
477
// Compression mode selector with the two values NORMAL and FAST.
enum CompressionMode
{
    MODE_NORMAL = 0,
    MODE_FAST = 1
};
483
484         template<ColorDistFunc ColorDist> inline int refine_component_encode(int comp)
485         {
486                 return comp;
487         }
488         template<> inline int refine_component_encode<color_dist_srgb>(int comp)
489         {
490                 return comp * comp;
491         }
492         template<> inline int refine_component_encode<color_dist_srgb_mixed>(int comp)
493         {
494                 return comp * comp;
495         }
496
497         template<ColorDistFunc ColorDist> inline int refine_component_decode(int comp)
498         {
499                 return comp;
500         }
501         template<> inline int refine_component_decode<color_dist_srgb>(int comp)
502         {
503                 return sqrtf(comp) + 0.5f;
504         }
505         template<> inline int refine_component_decode<color_dist_srgb_mixed>(int comp)
506         {
507                 return sqrtf(comp) + 0.5f;
508         }
509
template <class T, class Big, int scale_l>
struct s2tc_evaluate_colors_result_t;

// scale_l == 1: collects the values assigned to each of two slots and
// replaces the two reference values by the rounded means of their slots.
//
// Operations required:
//   Big << int
//   Big / int
//   Big + int
//   Big += T
//   conversion from Big to T
template <class T, class Big>
struct s2tc_evaluate_colors_result_t<T, Big, 1>
{
    int n0, n1; // number of values added to slot 0 / slot 1
    Big S0, S1; // sum of the values added to slot 0 / slot 1

    inline s2tc_evaluate_colors_result_t():
        n0(0), n1(0), S0(), S1()
    {
    }

    // Records value a for slot 0 if l is zero, for slot 1 otherwise.
    inline void add(int l, T a)
    {
        if(l == 0)
        {
            S0 += a;
            ++n0;
            return;
        }
        S1 += a;
        ++n1;
    }

    // Writes the rounded mean of slot 0 to a and of slot 1 to b; a slot
    // without values leaves its output untouched. Returns false (changing
    // nothing) if no value was recorded at all.
    inline bool evaluate(T &a, T &b)
    {
        if(n0 == 0 && n1 == 0)
            return false;
        // (2*S + n) / (2*n) is S/n rounded to nearest
        if(n0 != 0)
        {
            const int twice_n0 = n0 << 1;
            a = ((S0 << 1) + n0) / twice_n0;
        }
        if(n1 != 0)
        {
            const int twice_n1 = n1 << 1;
            b = ((S1 << 1) + n1) / twice_n1;
        }
        return true;
    }
};
551
// Generic case, used for any scale_l that has no specialization of its own:
// an empty placeholder.
template <class T, class Big, int scale_l>
struct s2tc_evaluate_colors_result_t
{
    // a possible implementation of inferred color/alpha values
    // refining would go here
};
558
// Collector that discards everything it is given; it offers the same add()
// call as the real collector but keeps no state.
template <class T>
struct s2tc_evaluate_colors_result_null_t
{
    inline void add(int l, T a)
    {
        (void) l;
        (void) a;
    }
};
566
// Reads a value of type T from a pixel buffer. The generic version copies
// bytes 0, 1 and 2 into the r, g and b members of T.
template<class T> T get(const unsigned char *buf)
{
    T pixel;
    pixel.r = buf[0];
    pixel.g = buf[1];
    pixel.b = buf[2];
    return pixel;
}

// For unsigned char the value read is byte 3, the alpha byte.
template<> unsigned char get<unsigned char>(const unsigned char *buf)
{
    const unsigned char alpha = buf[3];
    return alpha;
}
579
580         template<class T, class Big, int bpp, bool have_trans, bool have_0_255, int n_input, class Dist, class Eval, class Arr>
581         inline unsigned int s2tc_try_encode_block(
582                         Arr &out,
583                         Eval &res,
584                         Dist ColorDist,
585                         const unsigned char *in, int iw, int w, int h,
586                         const T colors_ref[])
587         {
588                 unsigned int score = 0;
589                 for(int x = 0; x < w; ++x) for(int y = 0; y < h; ++y)
590                 {
591                         int i = y * 4 + x;
592                         const unsigned char *pix = &in[(y * iw + x) * 4];
593
594                         if(have_trans)
595                         {
596                                 if(pix[3] == 0)
597                                 {
598                                         out.do_or(i, (1 << bpp) - 1);
599                                         continue;
600                                 }
601                         }
602
603                         T color(get<T>(pix));
604                         int best = 0;
605                         int bestdist = ColorDist(color, colors_ref[0]);
606                         for(int k = 1; k < n_input; ++k)
607                         {
608                                 int dist = ColorDist(color, colors_ref[k]);
609                                 if(dist < bestdist)
610                                 {
611                                         bestdist = dist;
612                                         best = k;
613                                 }
614                         }
615                         if(have_0_255)
616                         {
617                                 int dist_0 = ColorDist(color, color_type_info<T>::min_value);
618                                 if(dist_0 <= bestdist)
619                                 {
620                                         bestdist = dist_0;
621                                         out.do_or(i, (1 << bpp) - 2);
622                                         score += bestdist;
623                                         continue;
624                                 }
625                                 int dist_255 = ColorDist(color, color_type_info<T>::max_value);
626                                 if(dist_255 <= bestdist)
627                                 {
628                                         bestdist = dist_255;
629                                         out.do_or(i, (1 << bpp) - 1);
630                                         score += bestdist;
631                                         continue;
632                                 }
633                         }
634
635                         // record
636                         res.add(best, color);
637                         out.do_or(i, best);
638                         score += bestdist;
639                 }
640                 return score;
641         }
642
643         // REFINE_LOOP: refine, take result over only if score improved, loop until it did not
644         inline void s2tc_dxt5_encode_alpha_refine_loop(bitarray<uint64_t, 16, 3> &out, const unsigned char *in, int iw, int w, int h, unsigned char &a0, unsigned char &a1)
645         {
646                 bitarray<uint64_t, 16, 3> out2;
647                 unsigned char a0next = a0, a1next = a1;
648                 unsigned int s = 0x7FFFFFFF;
649                 for(;;)
650                 {
651                         unsigned char ramp[2] = {
652                                 a0next,
653                                 a1next
654                         };
655                         s2tc_evaluate_colors_result_t<unsigned char, int, 1> r2;
656                         unsigned int s2 = s2tc_try_encode_block<unsigned char, int, 3, false, true, 2>(out2, r2, alpha_dist, in, iw, w, h, ramp);
657                         if(s2 < s)
658                         {
659                                 out = out2;
660                                 s = s2;
661                                 a0 = a0next;
662                                 a1 = a1next;
663                                 if(!r2.evaluate(a0next, a1next))
664                                         break;
665                         }
666                         else
667                                 break;
668                         out2.clear();
669                 }
670
671                 if(a1 == a0)
672                 {
673                         if(a0 == 255)
674                                 --a1;
675                         else
676                                 ++a1;
677                         for(int i = 0; i < 16; ++i) switch(out.get(i))
678                         {
679                                 case 1:
680                                         out.set(i, 0);
681                                         break;
682                         }
683                 }
684
685                 if(a1 < a0)
686                 {
687                         std::swap(a0, a1);
688                         for(int i = 0; i < 16; ++i) switch(out.get(i))
689                         {
690                                 case 0:
691                                         out.set(i, 1);
692                                         break;
693                                 case 1:
694                                         out.set(i, 0);
695                                         break;
696                                 case 6:
697                                 case 7:
698                                         break;
699                                 default:
700                                         out.set(i, 7 - out.get(i));
701                                         break;
702                         }
703                 }
704         }
705
706         // REFINE_ALWAYS: refine, do not check
707         inline void s2tc_dxt5_encode_alpha_refine_always(bitarray<uint64_t, 16, 3> &out, const unsigned char *in, int iw, int w, int h, unsigned char &a0, unsigned char &a1)
708         {
709                 unsigned char ramp[2] = {
710                         a0,
711                         a1
712                 };
713                 s2tc_evaluate_colors_result_t<unsigned char, int, 1> r2;
714                 s2tc_try_encode_block<unsigned char, int, 3, false, true, 6>(out, r2, alpha_dist, in, iw, w, h, ramp);
715                 r2.evaluate(a0, a1);
716
717                 if(a1 == a0)
718                 {
719                         if(a0 == 255)
720                                 --a1;
721                         else
722                                 ++a1;
723                         for(int i = 0; i < 16; ++i) switch(out.get(i))
724                         {
725                                 case 1:
726                                         out.set(i, 0);
727                                         break;
728                         }
729                 }
730
731                 if(a1 < a0)
732                 {
733                         std::swap(a0, a1);
734                         for(int i = 0; i < 16; ++i) switch(out.get(i))
735                         {
736                                 case 0:
737                                         out.set(i, 1);
738                                         break;
739                                 case 1:
740                                         out.set(i, 0);
741                                         break;
742                                 case 6:
743                                 case 7:
744                                         break;
745                                 default:
746                                         out.set(i, 7 - out.get(i));
747                                         break;
748                         }
749                 }
750         }
751
752         // REFINE_NEVER: do not refine
753         inline void s2tc_dxt5_encode_alpha_refine_never(bitarray<uint64_t, 16, 3> &out, const unsigned char *in, int iw, int w, int h, unsigned char &a0, unsigned char &a1)
754         {
755                 if(a1 < a0)
756                         std::swap(a0, a1);
757                 unsigned char ramp[6] = {
758                         a0,
759                         a1
760                 };
761                 s2tc_evaluate_colors_result_null_t<unsigned char> r2;
762                 s2tc_try_encode_block<unsigned char, int, 3, false, true, 6>(out, r2, alpha_dist, in, iw, w, h, ramp);
763         }
764
765         // REFINE_LOOP: refine, take result over only if score improved, loop until it did not
766         template<ColorDistFunc ColorDist, bool have_trans>
767         inline void s2tc_dxt1_encode_color_refine_loop(bitarray<uint32_t, 16, 2> &out, const unsigned char *in, int iw, int w, int h, color_t &c0, color_t &c1)
768         {
769                 bitarray<uint32_t, 16, 2> out2;
770                 color_t c0next = c0, c1next = c1;
771                 unsigned int s = 0x7FFFFFFF;
772                 for(;;)
773                 {
774                         color_t ramp[2] = {
775                                 c0next,
776                                 c1next
777                         };
778                         s2tc_evaluate_colors_result_t<color_t, bigcolor_t, 1> r2;
779                         unsigned int s2 = s2tc_try_encode_block<color_t, bigcolor_t, 2, have_trans, false, 2>(out2, r2, ColorDist, in, iw, w, h, ramp);
780                         if(s2 < s)
781                         {
782                                 out = out2;
783                                 s = s2;
784                                 c0 = c0next;
785                                 c1 = c1next;
786                                 if(!r2.evaluate(c0next, c1next))
787                                         break;
788                         }
789                         else
790                                 break;
791                         out2.clear();
792                 }
793
794                 if(c0 == c1)
795                 {
796                         if(c0 == color_type_info<color_t>::max_value)
797                                 --c1;
798                         else
799                                 ++c1;
800                         for(int i = 0; i < 16; ++i)
801                                 if(!(out.get(i) == 1))
802                                         out.set(i, 0);
803                 }
804
805                 if(have_trans ? c1 < c0 : c0 < c1)
806                 {
807                         std::swap(c0, c1);
808                         for(int i = 0; i < 16; ++i)
809                                 if(!(out.get(i) & 2))
810                                         out.do_xor(i, 1);
811                 }
812         }
813
814         // REFINE_ALWAYS: refine, do not check
815         template<ColorDistFunc ColorDist, bool have_trans>
816         inline void s2tc_dxt1_encode_color_refine_always(bitarray<uint32_t, 16, 2> &out, const unsigned char *in, int iw, int w, int h, color_t &c0, color_t &c1)
817         {
818                 color_t ramp[2] = {
819                         c0,
820                         c1
821                 };
822                 s2tc_evaluate_colors_result_t<color_t, bigcolor_t, 1> r2;
823                 s2tc_try_encode_block<color_t, bigcolor_t, 2, have_trans, false, 2>(out, r2, ColorDist, in, iw, w, h, ramp);
824                 r2.evaluate(c0, c1);
825
826                 if(c0 == c1)
827                 {
828                         if(c0 == color_type_info<color_t>::max_value)
829                                 --c1;
830                         else
831                                 ++c1;
832                         for(int i = 0; i < 16; ++i)
833                                 if(!(out.get(i) == 1))
834                                         out.set(i, 0);
835                 }
836
837                 if(have_trans ? c1 < c0 : c0 < c1)
838                 {
839                         std::swap(c0, c1);
840                         for(int i = 0; i < 16; ++i)
841                                 if(!(out.get(i) & 2))
842                                         out.do_xor(i, 1);
843                 }
844         }
845
846         // REFINE_NEVER: do not refine
847         template<ColorDistFunc ColorDist, bool have_trans>
848         inline void s2tc_dxt1_encode_color_refine_never(bitarray<uint32_t, 16, 2> &out, const unsigned char *in, int iw, int w, int h, color_t &c0, color_t &c1)
849         {
850                 if(have_trans ? c1 < c0 : c0 < c1)
851                         std::swap(c0, c1);
852                 color_t ramp[2] = {
853                         c0,
854                         c1
855                 };
856                 s2tc_evaluate_colors_result_null_t<color_t> r2;
857                 s2tc_try_encode_block<color_t, bigcolor_t, 2, have_trans, false, 2>(out, r2, ColorDist, in, iw, w, h, ramp);
858         }
859
860         inline void s2tc_dxt3_encode_alpha(bitarray<uint64_t, 16, 4> &out, const unsigned char *in, int iw, int w, int h)
861         {
862                 for(int x = 0; x < w; ++x) for(int y = 0; y < h; ++y)
863                 {
864                         int i = y * 4 + x;
865                         const unsigned char *pix = &in[(y * iw + x) * 4];
866                         out.do_or(i, pix[3]);
867                 }
868         }
869
870         template<DxtMode dxt, ColorDistFunc ColorDist, CompressionMode mode, RefinementMode refine>
871         inline void s2tc_encode_block(unsigned char *out, const unsigned char *rgba, int iw, int w, int h, int nrandom)
872         {
873                 color_t c[16 + (nrandom >= 0 ? nrandom : 0)];
874                 unsigned char ca[16 + (nrandom >= 0 ? nrandom : 0)];
875                 int x, y;
876
877                 if(mode == MODE_FAST)
878                 {
879                         // FAST: trick from libtxc_dxtn: just get brightest and darkest colors, and encode using these
880
881                         color_t c0 = make_color_t(0, 0, 0);
882
883                         // dummy values because we don't know whether the first pixel will write
884                         c[0].r = 31;
885                         c[0].g = 63;
886                         c[0].b = 31;
887                         c[1].r = 0;
888                         c[1].g = 0;
889                         c[1].b = 0;
890                         int dmin = 0x7FFFFFFF;
891                         int dmax = 0;
892                         if(dxt == DXT5)
893                         {
894                                 ca[0] = rgba[3];
895                                 ca[1] = ca[0];
896                         }
897
898                         for(x = 0; x < w; ++x)
899                                 for(y = 0; y < h; ++y)
900                                 {
901                                         c[2].r = rgba[(x + y * iw) * 4 + 0];
902                                         c[2].g = rgba[(x + y * iw) * 4 + 1];
903                                         c[2].b = rgba[(x + y * iw) * 4 + 2];
904                                         ca[2]  = rgba[(x + y * iw) * 4 + 3];
905                                         // MODE_FAST doesn't work for normalmaps, so this works
906
907                                         int d = ColorDist(c[2], c0);
908                                         if(d > dmax)
909                                         {
910                                                 dmax = d;
911                                                 c[1] = c[2];
912                                         }
913                                         if(d < dmin)
914                                         {
915                                                 dmin = d;
916                                                 c[0] = c[2];
917                                         }
918
919                                         if(dxt == DXT5)
920                                         {
921                                                 if(ca[2] != 255)
922                                                 {
923                                                         if(ca[2] > ca[1])
924                                                                 ca[1] = ca[2];
925                                                         if(ca[2] < ca[0])
926                                                                 ca[0] = ca[2];
927                                                 }
928                                         }
929                                 }
930                 }
931                 else
932                 {
933                         int n = 0, m = 0;
934
935                         for(x = 0; x < w; ++x)
936                                 for(y = 0; y < h; ++y)
937                                 {
938                                         c[n].r = rgba[(x + y * iw) * 4 + 0];
939                                         c[n].g = rgba[(x + y * iw) * 4 + 1];
940                                         c[n].b = rgba[(x + y * iw) * 4 + 2];
941                                         ca[n]  = rgba[(x + y * iw) * 4 + 3];
942                                         ++n;
943                                 }
944                         if(n == 0)
945                         {
946                                 n = 1;
947                                 c[0].r = 0;
948                                 c[0].g = 0;
949                                 c[0].b = 0;
950                                 ca[0] = 0;
951                         }
952                         m = n;
953
954                         if(nrandom > 0)
955                         {
956                                 color_t mins = c[0];
957                                 color_t maxs = c[0];
958                                 unsigned char mina = (dxt == DXT5) ? ca[0] : 0;
959                                 unsigned char maxa = (dxt == DXT5) ? ca[0] : 0;
960                                 for(x = 1; x < n; ++x)
961                                 {
962                                         mins.r = min(mins.r, c[x].r);
963                                         mins.g = min(mins.g, c[x].g);
964                                         mins.b = min(mins.b, c[x].b);
965                                         maxs.r = max(maxs.r, c[x].r);
966                                         maxs.g = max(maxs.g, c[x].g);
967                                         maxs.b = max(maxs.b, c[x].b);
968                                         if(dxt == DXT5)
969                                         {
970                                                 mina = min(mina, ca[x]);
971                                                 maxa = max(maxa, ca[x]);
972                                         }
973                                 }
974                                 color_t len = make_color_t(maxs.r - mins.r + 1, maxs.g - mins.g + 1, maxs.b - mins.b + 1);
975                                 int lena = (dxt == DXT5) ? (maxa - (int) mina + 1) : 0;
976                                 for(x = 0; x < nrandom; ++x)
977                                 {
978                                         c[m].r = mins.r + rand() % len.r;
979                                         c[m].g = mins.g + rand() % len.g;
980                                         c[m].b = mins.b + rand() % len.b;
981                                         if(dxt == DXT5)
982                                                 ca[m] = mina + rand() % lena;
983                                         ++m;
984                                 }
985                         }
986                         else
987                         {
988                                 // hack for last miplevel
989                                 if(n == 1)
990                                 {
991                                         c[1] = c[0];
992                                         m = n = 2;
993                                 }
994                         }
995
996                         reduce_colors_inplace(c, n, m, ColorDist);
997                         if(dxt == DXT5)
998                                 reduce_colors_inplace_2fixpoints(ca, n, m, alpha_dist, (unsigned char) 0, (unsigned char) 255);
999                 }
1000
1001                 // equal colors are BAD
1002                 if(c[0] == c[1])
1003                 {
1004                         if(c[0] == color_type_info<color_t>::max_value)
1005                                 --c[1];
1006                         else
1007                                 ++c[1];
1008                 }
1009
1010                 if(dxt == DXT5)
1011                 {
1012                         if(ca[0] == ca[1])
1013                         {
1014                                 if(ca[0] == 255)
1015                                         --ca[1];
1016                                 else
1017                                         ++ca[1];
1018                         }
1019                 }
1020
1021                 switch(dxt)
1022                 {
1023                         case DXT1:
1024                                 {
1025                                         bitarray<uint32_t, 16, 2> colorblock;
1026                                         switch(refine)
1027                                         {
1028                                                 case REFINE_NEVER:
1029                                                         s2tc_dxt1_encode_color_refine_never<ColorDist, true>(colorblock, rgba, iw, w, h, c[0], c[1]);
1030                                                         break;
1031                                                 case REFINE_ALWAYS:
1032                                                         s2tc_dxt1_encode_color_refine_always<ColorDist, true>(colorblock, rgba, iw, w, h, c[0], c[1]);
1033                                                         break;
1034                                                 case REFINE_LOOP:
1035                                                         s2tc_dxt1_encode_color_refine_loop<ColorDist, true>(colorblock, rgba, iw, w, h, c[0], c[1]);
1036                                                         break;
1037                                         }
1038                                         out[0] = ((c[0].g & 0x07) << 5) | c[0].b;
1039                                         out[1] = (c[0].r << 3) | (c[0].g >> 3);
1040                                         out[2] = ((c[1].g & 0x07) << 5) | c[1].b;
1041                                         out[3] = (c[1].r << 3) | (c[1].g >> 3);
1042                                         colorblock.tobytes(&out[4]);
1043                                 }
1044                                 break;
1045                         case DXT3:
1046                                 {
1047                                         bitarray<uint32_t, 16, 2> colorblock;
1048                                         bitarray<uint64_t, 16, 4> alphablock;
1049                                         switch(refine)
1050                                         {
1051                                                 case REFINE_NEVER:
1052                                                         s2tc_dxt1_encode_color_refine_never<ColorDist, false>(colorblock, rgba, iw, w, h, c[0], c[1]);
1053                                                         break;
1054                                                 case REFINE_ALWAYS:
1055                                                         s2tc_dxt1_encode_color_refine_always<ColorDist, false>(colorblock, rgba, iw, w, h, c[0], c[1]);
1056                                                         break;
1057                                                 case REFINE_LOOP:
1058                                                         s2tc_dxt1_encode_color_refine_loop<ColorDist, false>(colorblock, rgba, iw, w, h, c[0], c[1]);
1059                                                         break;
1060                                         }
1061                                         s2tc_dxt3_encode_alpha(alphablock, rgba, iw, w, h);
1062                                         alphablock.tobytes(&out[0]);
1063                                         out[8] = ((c[0].g & 0x07) << 5) | c[0].b;
1064                                         out[9] = (c[0].r << 3) | (c[0].g >> 3);
1065                                         out[10] = ((c[1].g & 0x07) << 5) | c[1].b;
1066                                         out[11] = (c[1].r << 3) | (c[1].g >> 3);
1067                                         colorblock.tobytes(&out[12]);
1068                                 }
1069                                 break;
1070                         case DXT5:
1071                                 {
1072                                         bitarray<uint32_t, 16, 2> colorblock;
1073                                         bitarray<uint64_t, 16, 3> alphablock;
1074                                         switch(refine)
1075                                         {
1076                                                 case REFINE_NEVER:
1077                                                         s2tc_dxt1_encode_color_refine_never<ColorDist, false>(colorblock, rgba, iw, w, h, c[0], c[1]);
1078                                                         s2tc_dxt5_encode_alpha_refine_never(alphablock, rgba, iw, w, h, ca[0], ca[1]);
1079                                                         break;
1080                                                 case REFINE_ALWAYS:
1081                                                         s2tc_dxt1_encode_color_refine_always<ColorDist, false>(colorblock, rgba, iw, w, h, c[0], c[1]);
1082                                                         s2tc_dxt5_encode_alpha_refine_always(alphablock, rgba, iw, w, h, ca[0], ca[1]);
1083                                                         break;
1084                                                 case REFINE_LOOP:
1085                                                         s2tc_dxt1_encode_color_refine_loop<ColorDist, false>(colorblock, rgba, iw, w, h, c[0], c[1]);
1086                                                         s2tc_dxt5_encode_alpha_refine_loop(alphablock, rgba, iw, w, h, ca[0], ca[1]);
1087                                                         break;
1088                                         }
1089                                         out[0] = ca[0];
1090                                         out[1] = ca[1];
1091                                         alphablock.tobytes(&out[2]);
1092                                         out[8] = ((c[0].g & 0x07) << 5) | c[0].b;
1093                                         out[9] = (c[0].r << 3) | (c[0].g >> 3);
1094                                         out[10] = ((c[1].g & 0x07) << 5) | c[1].b;
1095                                         out[11] = (c[1].r << 3) | (c[1].g >> 3);
1096                                         colorblock.tobytes(&out[12]);
1097                                 }
1098                                 break;
1099                 }
1100         }
1101
1102         // compile time dispatch magic
1103         template<DxtMode dxt, ColorDistFunc ColorDist, CompressionMode mode>
1104         inline s2tc_encode_block_func_t s2tc_encode_block_func(RefinementMode refine)
1105         {
1106                 switch(refine)
1107                 {
1108                         case REFINE_NEVER:
1109                                 return s2tc_encode_block<dxt, ColorDist, mode, REFINE_NEVER>;
1110                         case REFINE_LOOP:
1111                                 return s2tc_encode_block<dxt, ColorDist, mode, REFINE_LOOP>;
1112                         default:
1113                         case REFINE_ALWAYS:
1114                                 return s2tc_encode_block<dxt, ColorDist, mode, REFINE_ALWAYS>;
1115                 }
1116         }
1117
1118         // these color dist functions do not need the refinement check, as they always improve the situation
1119         template<ColorDistFunc ColorDist> struct supports_fast
1120         {
1121                 static const bool value = true;
1122         };
1123         template<> struct supports_fast<color_dist_normalmap>
1124         {
1125                 static const bool value = false;
1126         };
1127
1128         template<DxtMode dxt, ColorDistFunc ColorDist>
1129         inline s2tc_encode_block_func_t s2tc_encode_block_func(int nrandom, RefinementMode refine)
1130         {
1131                 if(!supports_fast<ColorDist>::value || nrandom >= 0)
1132                         return s2tc_encode_block_func<dxt, ColorDist, MODE_NORMAL>(refine);
1133                 else
1134                         return s2tc_encode_block_func<dxt, ColorDist, MODE_FAST>(refine);
1135         }
1136
1137         template<ColorDistFunc ColorDist>
1138         inline s2tc_encode_block_func_t s2tc_encode_block_func(DxtMode dxt, int nrandom, RefinementMode refine)
1139         {
1140                 switch(dxt)
1141                 {
1142                         case DXT1:
1143                                 return s2tc_encode_block_func<DXT1, ColorDist>(nrandom, refine);
1144                                 break;
1145                         case DXT3:
1146                                 return s2tc_encode_block_func<DXT3, ColorDist>(nrandom, refine);
1147                                 break;
1148                         default:
1149                         case DXT5:
1150                                 return s2tc_encode_block_func<DXT5, ColorDist>(nrandom, refine);
1151                                 break;
1152                 }
1153         }
1154 };
1155
1156 s2tc_encode_block_func_t s2tc_encode_block_func(DxtMode dxt, ColorDistMode cd, int nrandom, RefinementMode refine)
1157 {
1158         switch(cd)
1159         {
1160                 case RGB:
1161                         return s2tc_encode_block_func<color_dist_rgb>(dxt, nrandom, refine);
1162                         break;
1163                 case YUV:
1164                         return s2tc_encode_block_func<color_dist_yuv>(dxt, nrandom, refine);
1165                         break;
1166                 case SRGB:
1167                         return s2tc_encode_block_func<color_dist_srgb>(dxt, nrandom, refine);
1168                         break;
1169                 case SRGB_MIXED:
1170                         return s2tc_encode_block_func<color_dist_srgb_mixed>(dxt, nrandom, refine);
1171                         break;
1172                 case AVG:
1173                         return s2tc_encode_block_func<color_dist_avg>(dxt, nrandom, refine);
1174                         break;
1175                 default:
1176                 case WAVG:
1177                         return s2tc_encode_block_func<color_dist_wavg>(dxt, nrandom, refine);
1178                         break;
1179                 case NORMALMAP:
1180                         return s2tc_encode_block_func<color_dist_normalmap>(dxt, nrandom, refine);
1181                         break;
1182         }
1183 }
1184
1185 namespace
1186 {
1187         inline int diffuse(int *diff, int src, int shift)
1188         {
1189                 const int maxval = (1 << (8 - shift)) - 1;
1190                 src += *diff;
1191                 int ret = max(0, min(src >> shift, maxval));
1192                 // simulate decoding ("loop filter")
1193                 int loop = (ret << shift) | (ret >> (8 - 2 * shift));
1194                 *diff = src - loop;
1195                 return ret;
1196         }
1197         inline int diffuse1(int *diff, int src)
1198         {
1199                 src += *diff;
1200                 int ret = (src >= 128);
1201                 // simulate decoding ("loop filter")
1202                 int loop = ret ? 255 : 0;
1203                 *diff = src - loop;
1204                 return ret;
1205         }
1206
1207         inline int floyd(int *thisrow, int *downrow, int src, int shift)
1208         {
1209                 const int maxval = (1 << (8 - shift)) - 1;
1210                 src = (src << 4) | (src >> 4);
1211                 src += thisrow[1];
1212                 int ret = max(0, min(src >> (shift + 4), maxval));
1213                 // simulate decoding ("loop filter")
1214                 int loop = (ret * 4095 / maxval);
1215                 int err = src - loop;
1216                 int e7 = (err * 7 + 8) / 16;
1217                 err -= e7;
1218                 int e3 = (err * 3 + 4) / 9;
1219                 err -= e3;
1220                 int e5 = (err * 5 + 3) / 6;
1221                 err -= e5;
1222                 int e1 = err;
1223                 thisrow[2] += e7;
1224                 downrow[0] += e3;
1225                 downrow[1] += e5;
1226                 downrow[2] += e1;
1227                 return ret;
1228         }
1229
1230         inline int floyd1(int *thisrow, int *downrow, int src)
1231         {
1232                 src = (src << 4) | (src >> 4);
1233                 src += thisrow[1];
1234                 int ret = (src >= 2048);
1235                 // simulate decoding ("loop filter")
1236                 int loop = ret ? 4095 : 0;
1237                 int err = src - loop;
1238                 int e7 = (err * 7 + 8) / 16;
1239                 err -= e7;
1240                 int e3 = (err * 3 + 4) / 9;
1241                 err -= e3;
1242                 int e5 = (err * 5 + 3) / 6;
1243                 err -= e5;
1244                 int e1 = err;
1245                 thisrow[2] += e7;
1246                 downrow[0] += e3;
1247                 downrow[1] += e5;
1248                 downrow[2] += e1;
1249                 return ret;
1250         }
1251
1252         template<int srccomps, int alphabits, DitherMode dither>
1253         inline void rgb565_image(unsigned char *out, const unsigned char *rgba, int w, int h)
1254         {
1255                 int x, y;
1256                 switch(dither)
1257                 {
1258                         case DITHER_NONE:
1259                                 {
1260                                         for(y = 0; y < h; ++y)
1261                                                 for(x = 0; x < w; ++x)
1262                                                 {
1263                                                         out[(x + y * w) * 4 + 0] = rgba[(x + y * w) * srccomps + 0] >> 3;
1264                                                         out[(x + y * w) * 4 + 1] = rgba[(x + y * w) * srccomps + 1] >> 2;
1265                                                         out[(x + y * w) * 4 + 2] = rgba[(x + y * w) * srccomps + 2] >> 3;
1266                                                 }
1267                                         if(srccomps == 4)
1268                                         {
1269                                                 if(alphabits == 1)
1270                                                 {
1271                                                         for(y = 0; y < h; ++y)
1272                                                                 for(x = 0; x < w; ++x)
1273                                                                         out[(x + y * w) * 4 + 3] = rgba[(x + y * w) * srccomps + 3] >> 7;
1274                                                 }
1275                                                 else if(alphabits == 8)
1276                                                 {
1277                                                         for(y = 0; y < h; ++y)
1278                                                                 for(x = 0; x < w; ++x)
1279                                                                         out[(x + y * w) * 4 + 3] = rgba[(x + y * w) * srccomps + 3]; // no conversion
1280                                                 }
1281                                                 else
1282                                                 {
1283                                                         int alphadiffuse = 8 - alphabits;
1284                                                         for(y = 0; y < h; ++y)
1285                                                                 for(x = 0; x < w; ++x)
1286                                                                         out[(x + y * w) * 4 + 3] = rgba[(x + y * w) * srccomps + 3] >> (8 - alphabits);
1287                                                 }
1288                                         }
1289                                         else
1290                                         {
1291                                                 for(y = 0; y < h; ++y)
1292                                                         for(x = 0; x < w; ++x)
1293                                                                 out[(x + y * w) * 4 + 3] = (1 << alphabits) - 1;
1294                                         }
1295                                 }
1296                                 break;
1297                         case DITHER_SIMPLE:
1298                                 {
1299                                         int x, y;
1300                                         int diffuse_r = 0;
1301                                         int diffuse_g = 0;
1302                                         int diffuse_b = 0;
1303                                         int diffuse_a = 0;
1304                                         for(y = 0; y < h; ++y)
1305                                                 for(x = 0; x < w; ++x)
1306                                                 {
1307                                                         out[(x + y * w) * 4 + 0] = diffuse(&diffuse_r, rgba[(x + y * w) * srccomps + 0], 3);
1308                                                         out[(x + y * w) * 4 + 1] = diffuse(&diffuse_g, rgba[(x + y * w) * srccomps + 1], 2);
1309                                                         out[(x + y * w) * 4 + 2] = diffuse(&diffuse_b, rgba[(x + y * w) * srccomps + 2], 3);
1310                                                 }
1311                                         if(srccomps == 4)
1312                                         {
1313                                                 if(alphabits == 1)
1314                                                 {
1315                                                         for(y = 0; y < h; ++y)
1316                                                                 for(x = 0; x < w; ++x)
1317                                                                         out[(x + y * w) * 4 + 3] = diffuse1(&diffuse_a, rgba[(x + y * w) * srccomps + 3]);
1318                                                 }
1319                                                 else if(alphabits == 8)
1320                                                 {
1321                                                         for(y = 0; y < h; ++y)
1322                                                                 for(x = 0; x < w; ++x)
1323                                                                         out[(x + y * w) * 4 + 3] = rgba[(x + y * w) * srccomps + 3]; // no conversion
1324                                                 }
1325                                                 else
1326                                                 {
1327                                                         for(y = 0; y < h; ++y)
1328                                                                 for(x = 0; x < w; ++x)
1329                                                                         out[(x + y * w) * 4 + 3] = diffuse(&diffuse_a, rgba[(x + y * w) * srccomps + 3], 8 - alphabits);
1330                                                 }
1331                                         }
1332                                         else
1333                                         {
1334                                                 for(y = 0; y < h; ++y)
1335                                                         for(x = 0; x < w; ++x)
1336                                                                 out[(x + y * w) * 4 + 3] = (1 << alphabits) - 1;
1337                                         }
1338                                 }
1339                                 break;
1340                         case DITHER_FLOYDSTEINBERG:
1341                                 {
1342                                         int x, y;
1343                                         int pw = w+2;
1344                                         int downrow[6*pw];
1345                                         memset(downrow, 0, sizeof(downrow));
1346                                         int *thisrow_r, *thisrow_g, *thisrow_b, *thisrow_a;
1347                                         int *downrow_r, *downrow_g, *downrow_b, *downrow_a;
1348                                         for(y = 0; y < h; ++y)
1349                                         {
1350                                                 thisrow_r = downrow + ((y&1)?3:0) * pw;
1351                                                 downrow_r = downrow + ((y&1)?0:3) * pw;
1352                                                 memset(downrow_r, 0, sizeof(*downrow_r) * (3*pw));
1353                                                 thisrow_g = thisrow_r + pw;
1354                                                 thisrow_b = thisrow_g + pw;
1355                                                 downrow_g = downrow_r + pw;
1356                                                 downrow_b = downrow_g + pw;
1357                                                 for(x = 0; x < w; ++x)
1358                                                 {
1359                                                         out[(x + y * w) * 4 + 0] = floyd(&thisrow_r[x], &downrow_r[x], rgba[(x + y * w) * srccomps + 0], 3);
1360                                                         out[(x + y * w) * 4 + 1] = floyd(&thisrow_g[x], &downrow_g[x], rgba[(x + y * w) * srccomps + 1], 2);
1361                                                         out[(x + y * w) * 4 + 2] = floyd(&thisrow_b[x], &downrow_b[x], rgba[(x + y * w) * srccomps + 2], 3);
1362                                                 }
1363                                         }
1364                                         if(srccomps == 4)
1365                                         {
1366                                                 if(alphabits == 1)
1367                                                 {
1368                                                         for(y = 0; y < h; ++y)
1369                                                         {
1370                                                                 thisrow_a = downrow + (y&1) * pw;
1371                                                                 downrow_a = downrow + !(y&1) * pw;
1372                                                                 memset(downrow_a, 0, sizeof(*downrow_a) * pw);
1373                                                                 for(x = 0; x < w; ++x)
1374                                                                         out[(x + y * w) * 4 + 3] = floyd1(&thisrow_a[x], &downrow_a[x], rgba[(x + y * w) * srccomps + 3]);
1375                                                         }
1376                                                 }
1377                                                 else if(alphabits == 8)
1378                                                 {
1379                                                         for(y = 0; y < h; ++y)
1380                                                                 for(x = 0; x < w; ++x)
1381                                                                         out[(x + y * w) * 4 + 3] = rgba[(x + y * w) * srccomps + 3]; // no conversion
1382                                                 }
1383                                                 else
1384                                                 {
1385                                                         for(y = 0; y < h; ++y)
1386                                                         {
1387                                                                 thisrow_a = downrow + (y&1) * pw;
1388                                                                 downrow_a = downrow + !(y&1) * pw;
1389                                                                 memset(downrow_a, 0, sizeof(*downrow_a) * pw);
1390                                                                 for(x = 0; x < w; ++x)
1391                                                                         out[(x + y * w) * 4 + 3] = floyd(&thisrow_a[x], &downrow_a[x], rgba[(x + y * w) * srccomps + 3], 8 - alphabits);
1392                                                         }
1393                                                 }
1394                                         }
1395                                         else
1396                                         {
1397                                                 for(y = 0; y < h; ++y)
1398                                                         for(x = 0; x < w; ++x)
1399                                                                 out[(x + y * w) * 4 + 3] = (1 << alphabits) - 1;
1400                                         }
1401                                 }
1402                                 break;
1403                 }
1404         }
1405
1406         template<int srccomps, int alphabits>
1407         inline void rgb565_image(unsigned char *out, const unsigned char *rgba, int w, int h, DitherMode dither)
1408         {
1409                 switch(dither)
1410                 {
1411                         case DITHER_NONE:
1412                                 rgb565_image<srccomps, alphabits, DITHER_NONE>(out, rgba, w, h);
1413                                 break;
1414                         default:
1415                         case DITHER_SIMPLE:
1416                                 rgb565_image<srccomps, alphabits, DITHER_SIMPLE>(out, rgba, w, h);
1417                                 break;
1418                         case DITHER_FLOYDSTEINBERG:
1419                                 rgb565_image<srccomps, alphabits, DITHER_FLOYDSTEINBERG>(out, rgba, w, h);
1420                                 break;
1421                 }
1422         }
1423
1424         template<int srccomps>
1425         inline void rgb565_image(unsigned char *out, const unsigned char *rgba, int w, int h, int alphabits, DitherMode dither)
1426         {
1427                 switch(alphabits)
1428                 {
1429                         case 1:
1430                                 rgb565_image<srccomps, 1>(out, rgba, w, h, dither);
1431                                 break;
1432                         case 4:
1433                                 rgb565_image<srccomps, 4>(out, rgba, w, h, dither);
1434                                 break;
1435                         default:
1436                         case 8:
1437                                 rgb565_image<srccomps, 8>(out, rgba, w, h, dither);
1438                                 break;
1439                 }
1440         }
1441 };
1442
1443 void rgb565_image(unsigned char *out, const unsigned char *rgba, int w, int h, int srccomps, int alphabits, DitherMode dither)
1444 {
1445         switch(srccomps)
1446         {
1447                 case 3:
1448                         rgb565_image<3>(out, rgba, w, h, alphabits, dither);
1449                         break;
1450                 case 4:
1451                 default:
1452                         rgb565_image<4>(out, rgba, w, h, alphabits, dither);
1453                         break;
1454         }
1455 }