OSDN Git Service

add Android.mk to build for android
[android-x86/external-s2tc.git] / s2tc_algorithm.cpp
index c432486..42238f2 100644 (file)
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
+#include <stdint.h>
 
 #include "s2tc_algorithm.h"
 #include "s2tc_common.h"
 
 namespace
 {
-       typedef struct
+       template<class T> void swap(T& a, T& b)
        {
-               signed char r, g, b;
+               T h = a;
+               a = b;
+               b = h;
        }
-       color_t;
+       template<class T> struct color_type_info
+       {
+       };
+       template<> struct color_type_info<unsigned char>
+       {
+               static const unsigned char min_value = 0;
+               static const unsigned char max_value = 255;
+       };
 
+       struct color_t
+       {
+               signed char r, g, b;
+       };
+       inline color_t make_color_t()
+       {
+               return (color_t) {0, 0, 0};
+       }
+       inline color_t make_color_t(signed char r_, signed char g_, signed char b_)
+       {
+               return (color_t) {r_, g_, b_};
+       }
+       inline color_t make_color_t(int i)
+       {
+               return (color_t) {(signed char)(i >> 3), (signed char)(i >> 2), (signed char)(i >> 3)};
+       }
+       inline bool operator==(const color_t &a, const color_t &b)
+       {
+               return a.r == b.r && a.g == b.g && a.b == b.b;
+       }
        inline bool operator<(const color_t &a, const color_t &b)
        {
                signed char d;
@@ -49,6 +79,135 @@ namespace
                d = a.b - b.b;
                return d < 0;
        }
+       inline color_t &operator--(color_t &c)
+       {
+               if(c.b > 0)
+               {
+                       --c.b;
+               }
+               else if(c.g > 0)
+               {
+                       c.b = 31;
+                       --c.g;
+               }
+               else if(c.r > 0)
+               {
+                       c.b = 31;
+                       c.g = 63;
+                       --c.r;
+               }
+               else
+               {
+                       c.b = 31;
+                       c.g = 63;
+                       c.r = 31;
+               }
+               return c;
+       }
+       inline color_t &operator++(color_t &c)
+       {
+               if(c.b < 31)
+               {
+                       ++c.b;
+               }
+               else if(c.g < 63)
+               {
+                       c.b = 0;
+                       ++c.g;
+               }
+               else if(c.r < 31)
+               {
+                       c.b = 0;
+                       c.g = 0;
+                       ++c.r;
+               }
+               else
+               {
+                       c.b = 0;
+                       c.g = 0;
+                       c.r = 0;
+               }
+               return c;
+       }
+       template<> struct color_type_info<color_t>
+       {
+               static const color_t min_value;
+               static const color_t max_value;
+       };
+       const color_t color_type_info<color_t>::min_value = { 0, 0, 0 };
+       const color_t color_type_info<color_t>::max_value = { 31, 63, 31 };
+
+       struct bigcolor_t
+       {
+               int r, g, b;
+
+               inline bigcolor_t(): r(0), g(0), b(0)
+               {
+               }
+
+               inline bigcolor_t &operator+=(const color_t &c)
+               {
+                       r += c.r;
+                       g += c.g;
+                       b += c.b;
+                       return *this;
+               }
+
+               inline bigcolor_t &operator+=(int v)
+               {
+                       r += v;
+                       g += v;
+                       b += v;
+                       return *this;
+               }
+
+               inline bigcolor_t operator+(int v)
+               {
+                       bigcolor_t out = *this;
+                       out += v;
+                       return out;
+               }
+
+               inline bigcolor_t &operator/=(int v)
+               {
+                       r /= v;
+                       g /= v;
+                       b /= v;
+                       return *this;
+               }
+
+               inline bigcolor_t operator/(int v)
+               {
+                       bigcolor_t out = *this;
+                       out /= v;
+                       return out;
+               }
+
+               inline bigcolor_t &operator<<=(int v)
+               {
+                       r <<= v;
+                       g <<= v;
+                       b <<= v;
+                       return *this;
+               }
+
+               inline bigcolor_t operator<<(int v)
+               {
+                       bigcolor_t out = *this;
+                       out <<= v;
+                       return out;
+               }
+
+               inline operator color_t()
+               {
+                       color_t out;
+                       out.r = r & 31;
+                       out.g = g & 63;
+                       out.b = b & 31;
+                       return out;
+               }
+       };
+
        // 16 differences must fit in int
        // i.e. a difference must be lower than 2^27
 
@@ -63,6 +222,15 @@ namespace
                return ((dr*dr) << 2) + dg*dg + ((db*db) << 2);
        }
 
+       inline int color_dist_w0avg(const color_t &a, const color_t &b)
+       {
+               int dr = a.r - b.r; // multiplier: 31 (-1..1)
+               int dg = a.g - b.g; // multiplier: 63 (-1..1)
+               int db = a.b - b.b; // multiplier: 31 (-1..1)
+               return dr*dr + dg*dg + db*db;
+               // weighted 1:4:1
+       }
+
        inline int color_dist_wavg(const color_t &a, const color_t &b)
        {
                int dr = a.r - b.r; // multiplier: 31 (-1..1)
@@ -146,37 +314,6 @@ namespace
                // weight for v: ???
        }
 
-       // FIXME this is likely broken
-       inline int color_dist_lab_srgb(const color_t &a, const color_t &b)
-       {
-               // undo sRGB
-               float ar = powf(a.r / 31.0f, 2.4f);
-               float ag = powf(a.g / 63.0f, 2.4f);
-               float ab = powf(a.b / 31.0f, 2.4f);
-               float br = powf(b.r / 31.0f, 2.4f);
-               float bg = powf(b.g / 63.0f, 2.4f);
-               float bb = powf(b.b / 31.0f, 2.4f);
-               // convert to CIE XYZ
-               float aX = 0.4124f * ar + 0.3576f * ag + 0.1805f * ab;
-               float aY = 0.2126f * ar + 0.7152f * ag + 0.0722f * ab;
-               float aZ = 0.0193f * ar + 0.1192f * ag + 0.9505f * ab;
-               float bX = 0.4124f * br + 0.3576f * bg + 0.1805f * bb;
-               float bY = 0.2126f * br + 0.7152f * bg + 0.0722f * bb;
-               float bZ = 0.0193f * br + 0.1192f * bg + 0.9505f * bb;
-               // convert to CIE Lab
-               float Xn = 0.3127f;
-               float Yn = 0.3290f;
-               float Zn = 0.3583f;
-               float aL = 116 * cbrtf(aY / Yn) - 16;
-               float aA = 500 * (cbrtf(aX / Xn) - cbrtf(aY / Yn));
-               float aB = 200 * (cbrtf(aY / Yn) - cbrtf(aZ / Zn));
-               float bL = 116 * cbrtf(bY / Yn) - 16;
-               float bA = 500 * (cbrtf(bX / Xn) - cbrtf(bY / Yn));
-               float bB = 200 * (cbrtf(bY / Yn) - cbrtf(bZ / Zn));
-               // euclidean distance, but moving weight away from A and B
-               return 1000 * ((aL - bL) * (aL - bL) + (aA - bA) * (aA - bA) + (aB - bB) * (aB - bB));
-       }
-
        inline int color_dist_normalmap(const color_t &a, const color_t &b)
        {
                float ca[3], cb[3], n;
@@ -271,14 +408,14 @@ namespace
                                        bestj = j;
                                }
                        }
-               if(besti != 0)
-                       c[0] = c[besti];
-               if(bestj != 1)
-                       c[1] = c[bestj];
+               T c0 = c[besti];
+               c[1] = c[bestj];
+               c[0] = c0;
        }
        template <class T, class F>
        inline void reduce_colors_inplace_2fixpoints(T *c, int n, int m, F dist, const T &fix0, const T &fix1)
        {
+               // TODO fix this for ramp encoding!
                int i, j, k;
                int bestsum = -1;
                int besti = 0;
@@ -343,7 +480,6 @@ namespace
        enum CompressionMode
        {
                MODE_NORMAL,
-               MODE_RANDOM,
                MODE_FAST
        };
 
@@ -359,10 +495,6 @@ namespace
        {
                return comp * comp;
        }
-       template<> inline int refine_component_encode<color_dist_lab_srgb>(int comp)
-       {
-               return comp * comp;
-       }
 
        template<ColorDistFunc ColorDist> inline int refine_component_decode(int comp)
        {
@@ -376,36 +508,381 @@ namespace
        {
                return sqrtf(comp) + 0.5f;
        }
-       template<> inline int refine_component_decode<color_dist_lab_srgb>(int comp)
+
+       template <class T, class Big, int scale_l>
+       struct s2tc_evaluate_colors_result_t;
+
+       template <class T, class Big>
+       struct s2tc_evaluate_colors_result_t<T, Big, 1>
        {
-               return sqrtf(comp) + 0.5f;
-       }
+               // uses:
+               //   Big << int
+               //   Big / int
+               //   Big + int
+               //   Big += T
+               int n0, n1;
+               Big S0, S1;
+               inline s2tc_evaluate_colors_result_t():
+                       n0(), n1(), S0(), S1()
+               {
+               }
+               inline void add(int l, T a)
+               {
+                       if(l)
+                       {
+                               ++n1;
+                               S1 += a;
+                       }
+                       else
+                       {
+                               ++n0;
+                               S0 += a;
+                       }
+               }
+               inline bool evaluate(T &a, T &b)
+               {
+                       if(!n0 && !n1)
+                               return false;
+                       if(n0)
+                               a = ((S0 << 1) + n0) / (n0 << 1);
+                       if(n1)
+                               b = ((S1 << 1) + n1) / (n1 << 1);
+                       return true;
+               }
+       };
 
-       // these color dist functions ignore color values at alpha 0
-       template<ColorDistFunc ColorDist> struct alpha_0_is_unimportant
+       template <class T, class Big, int scale_l>
+       struct s2tc_evaluate_colors_result_t
        {
-               static bool const value = true;
+               // a possible implementation of inferred color/alpha values
+               // refining would go here
        };
-       template<> struct alpha_0_is_unimportant<color_dist_normalmap>
+
+       template <class T>
+       struct s2tc_evaluate_colors_result_null_t
        {
-               static bool const value = false;
+               inline void add(int l, T a)
+               {
+               }
        };
 
+       template<class T> T get(const unsigned char *buf)
+       {
+               T c;
+               c.r = buf[0];
+               c.g = buf[1];
+               c.b = buf[2];
+               return c;
+       }
+       template<> unsigned char get<unsigned char>(const unsigned char *buf)
+       {
+               return buf[3]; // extract alpha
+       }
+
+       template<class T, class Big, int bpp, bool have_trans, bool have_0_255, int n_input, class Dist, class Eval, class Arr>
+       inline unsigned int s2tc_try_encode_block(
+                       Arr &out,
+                       Eval &res,
+                       Dist ColorDist,
+                       const unsigned char *in, int iw, int w, int h,
+                       const T colors_ref[])
+       {
+               unsigned int score = 0;
+               for(int x = 0; x < w; ++x) for(int y = 0; y < h; ++y)
+               {
+                       int i = y * 4 + x;
+                       const unsigned char *pix = &in[(y * iw + x) * 4];
+
+                       if(have_trans)
+                       {
+                               if(pix[3] == 0)
+                               {
+                                       out.do_or(i, (1 << bpp) - 1);
+                                       continue;
+                               }
+                       }
+
+                       T color(get<T>(pix));
+                       int best = 0;
+                       int bestdist = ColorDist(color, colors_ref[0]);
+                       for(int k = 1; k < n_input; ++k)
+                       {
+                               int dist = ColorDist(color, colors_ref[k]);
+                               if(dist < bestdist)
+                               {
+                                       bestdist = dist;
+                                       best = k;
+                               }
+                       }
+                       if(have_0_255)
+                       {
+                               int dist_0 = ColorDist(color, color_type_info<T>::min_value);
+                               if(dist_0 <= bestdist)
+                               {
+                                       bestdist = dist_0;
+                                       out.do_or(i, (1 << bpp) - 2);
+                                       score += bestdist;
+                                       continue;
+                               }
+                               int dist_255 = ColorDist(color, color_type_info<T>::max_value);
+                               if(dist_255 <= bestdist)
+                               {
+                                       bestdist = dist_255;
+                                       out.do_or(i, (1 << bpp) - 1);
+                                       score += bestdist;
+                                       continue;
+                               }
+                       }
+
+                       // record
+                       res.add(best, color);
+                       out.do_or(i, best);
+                       score += bestdist;
+               }
+               return score;
+       }
+
+       // REFINE_LOOP: refine, adopt the result only if the score improved, and repeat until it no longer improves
+       inline void s2tc_dxt5_encode_alpha_refine_loop(bitarray<uint64_t, 16, 3> &out, const unsigned char *in, int iw, int w, int h, unsigned char &a0, unsigned char &a1)
+       {
+               bitarray<uint64_t, 16, 3> out2;
+               unsigned char a0next = a0, a1next = a1;
+               unsigned int s = 0x7FFFFFFF;
+               for(;;)
+               {
+                       unsigned char ramp[2] = {
+                               a0next,
+                               a1next
+                       };
+                       s2tc_evaluate_colors_result_t<unsigned char, int, 1> r2;
+                       unsigned int s2 = s2tc_try_encode_block<unsigned char, int, 3, false, true, 2>(out2, r2, alpha_dist, in, iw, w, h, ramp);
+                       if(s2 < s)
+                       {
+                               out = out2;
+                               s = s2;
+                               a0 = a0next;
+                               a1 = a1next;
+                               if(!r2.evaluate(a0next, a1next))
+                                       break;
+                       }
+                       else
+                               break;
+                       out2.clear();
+               }
+
+               if(a1 == a0)
+               {
+                       if(a0 == 255)
+                               --a1;
+                       else
+                               ++a1;
+                       for(int i = 0; i < 16; ++i) switch(out.get(i))
+                       {
+                               case 1:
+                                       out.set(i, 0);
+                                       break;
+                       }
+               }
+
+               if(a1 < a0)
+               {
+                       swap(a0, a1);
+                       for(int i = 0; i < 16; ++i) switch(out.get(i))
+                       {
+                               case 0:
+                                       out.set(i, 1);
+                                       break;
+                               case 1:
+                                       out.set(i, 0);
+                                       break;
+                               case 6:
+                               case 7:
+                                       break;
+                               default:
+                                       out.set(i, 7 - out.get(i));
+                                       break;
+                       }
+               }
+       }
+
+       // REFINE_ALWAYS: refine once and adopt the result without checking whether the score improved
+       inline void s2tc_dxt5_encode_alpha_refine_always(bitarray<uint64_t, 16, 3> &out, const unsigned char *in, int iw, int w, int h, unsigned char &a0, unsigned char &a1)
+       {
+               unsigned char ramp[2] = {
+                       a0,
+                       a1
+               };
+               s2tc_evaluate_colors_result_t<unsigned char, int, 1> r2;
+               s2tc_try_encode_block<unsigned char, int, 3, false, true, 2>(out, r2, alpha_dist, in, iw, w, h, ramp);
+               r2.evaluate(a0, a1);
+
+               if(a1 == a0)
+               {
+                       if(a0 == 255)
+                               --a1;
+                       else
+                               ++a1;
+                       for(int i = 0; i < 16; ++i) switch(out.get(i))
+                       {
+                               case 1:
+                                       out.set(i, 0);
+                                       break;
+                       }
+               }
+
+               if(a1 < a0)
+               {
+                       swap(a0, a1);
+                       for(int i = 0; i < 16; ++i) switch(out.get(i))
+                       {
+                               case 0:
+                                       out.set(i, 1);
+                                       break;
+                               case 1:
+                                       out.set(i, 0);
+                                       break;
+                               case 6:
+                               case 7:
+                                       break;
+                               default:
+                                       out.set(i, 7 - out.get(i));
+                                       break;
+                       }
+               }
+       }
+
+       // REFINE_NEVER: do not refine
+       inline void s2tc_dxt5_encode_alpha_refine_never(bitarray<uint64_t, 16, 3> &out, const unsigned char *in, int iw, int w, int h, unsigned char &a0, unsigned char &a1)
+       {
+               if(a1 < a0)
+                       swap(a0, a1);
+               unsigned char ramp[6] = {
+                       a0,
+                       a1
+               };
+               s2tc_evaluate_colors_result_null_t<unsigned char> r2;
+               s2tc_try_encode_block<unsigned char, int, 3, false, true, 2>(out, r2, alpha_dist, in, iw, w, h, ramp);
+       }
+
+       // REFINE_LOOP: refine, adopt the result only if the score improved, and repeat until it no longer improves
+       template<ColorDistFunc ColorDist, bool have_trans>
+       inline void s2tc_dxt1_encode_color_refine_loop(bitarray<uint32_t, 16, 2> &out, const unsigned char *in, int iw, int w, int h, color_t &c0, color_t &c1)
+       {
+               bitarray<uint32_t, 16, 2> out2;
+               color_t c0next = c0, c1next = c1;
+               unsigned int s = 0x7FFFFFFF;
+               for(;;)
+               {
+                       color_t ramp[2] = {
+                               c0next,
+                               c1next
+                       };
+                       s2tc_evaluate_colors_result_t<color_t, bigcolor_t, 1> r2;
+                       unsigned int s2 = s2tc_try_encode_block<color_t, bigcolor_t, 2, have_trans, false, 2>(out2, r2, ColorDist, in, iw, w, h, ramp);
+                       if(s2 < s)
+                       {
+                               out = out2;
+                               s = s2;
+                               c0 = c0next;
+                               c1 = c1next;
+                               if(!r2.evaluate(c0next, c1next))
+                                       break;
+                       }
+                       else
+                               break;
+                       out2.clear();
+               }
+
+               if(c0 == c1)
+               {
+                       if(c0 == color_type_info<color_t>::max_value)
+                               --c1;
+                       else
+                               ++c1;
+                       for(int i = 0; i < 16; ++i)
+                               if(!(out.get(i) == 1))
+                                       out.set(i, 0);
+               }
+
+               if(have_trans ? c1 < c0 : c0 < c1)
+               {
+                       swap(c0, c1);
+                       for(int i = 0; i < 16; ++i)
+                               if(!(out.get(i) & 2))
+                                       out.do_xor(i, 1);
+               }
+       }
+
+       // REFINE_ALWAYS: refine once and adopt the result without checking whether the score improved
+       template<ColorDistFunc ColorDist, bool have_trans>
+       inline void s2tc_dxt1_encode_color_refine_always(bitarray<uint32_t, 16, 2> &out, const unsigned char *in, int iw, int w, int h, color_t &c0, color_t &c1)
+       {
+               color_t ramp[2] = {
+                       c0,
+                       c1
+               };
+               s2tc_evaluate_colors_result_t<color_t, bigcolor_t, 1> r2;
+               s2tc_try_encode_block<color_t, bigcolor_t, 2, have_trans, false, 2>(out, r2, ColorDist, in, iw, w, h, ramp);
+               r2.evaluate(c0, c1);
+
+               if(c0 == c1)
+               {
+                       if(c0 == color_type_info<color_t>::max_value)
+                               --c1;
+                       else
+                               ++c1;
+                       for(int i = 0; i < 16; ++i)
+                               if(!(out.get(i) == 1))
+                                       out.set(i, 0);
+               }
+
+               if(have_trans ? c1 < c0 : c0 < c1)
+               {
+                       swap(c0, c1);
+                       for(int i = 0; i < 16; ++i)
+                               if(!(out.get(i) & 2))
+                                       out.do_xor(i, 1);
+               }
+       }
+
+       // REFINE_NEVER: do not refine
+       template<ColorDistFunc ColorDist, bool have_trans>
+       inline void s2tc_dxt1_encode_color_refine_never(bitarray<uint32_t, 16, 2> &out, const unsigned char *in, int iw, int w, int h, color_t &c0, color_t &c1)
+       {
+               if(have_trans ? c1 < c0 : c0 < c1)
+                       swap(c0, c1);
+               color_t ramp[2] = {
+                       c0,
+                       c1
+               };
+               s2tc_evaluate_colors_result_null_t<color_t> r2;
+               s2tc_try_encode_block<color_t, bigcolor_t, 2, have_trans, false, 2>(out, r2, ColorDist, in, iw, w, h, ramp);
+       }
+
+       inline void s2tc_dxt3_encode_alpha(bitarray<uint64_t, 16, 4> &out, const unsigned char *in, int iw, int w, int h)
+       {
+               for(int x = 0; x < w; ++x) for(int y = 0; y < h; ++y)
+               {
+                       int i = y * 4 + x;
+                       const unsigned char *pix = &in[(y * iw + x) * 4];
+                       out.do_or(i, pix[3]);
+               }
+       }
+
        template<DxtMode dxt, ColorDistFunc ColorDist, CompressionMode mode, RefinementMode refine>
        inline void s2tc_encode_block(unsigned char *out, const unsigned char *rgba, int iw, int w, int h, int nrandom)
        {
-               color_t c[16 + (mode == MODE_RANDOM ? nrandom : 0)];
-               unsigned char ca[16 + (mode == MODE_RANDOM ? nrandom : 0)];
-               int n = 0, m = 0;
+               color_t c[16 + (nrandom >= 0 ? nrandom : 0)];
+               unsigned char ca[16 + (nrandom >= 0 ? nrandom : 0)];
                int x, y;
 
                if(mode == MODE_FAST)
                {
                        // FAST: trick from libtxc_dxtn: just get brightest and darkest colors, and encode using these
 
-                       color_t c0 = {0, 0, 0};
+                       color_t c0 = make_color_t(0, 0, 0);
 
-                       // dummy values because we don't know whether the first pixel willrite
+                       // dummy values because we don't know whether the first pixel will write
                        c[0].r = 31;
                        c[0].g = 63;
                        c[0].b = 31;
@@ -423,13 +900,14 @@ namespace
                        for(x = 0; x < w; ++x)
                                for(y = 0; y < h; ++y)
                                {
-                                       c[2].r = rgba[(x + y * iw) * 4 + 2];
+                                       c[2].r = rgba[(x + y * iw) * 4 + 0];
                                        c[2].g = rgba[(x + y * iw) * 4 + 1];
-                                       c[2].b = rgba[(x + y * iw) * 4 + 0];
+                                       c[2].b = rgba[(x + y * iw) * 4 + 2];
                                        ca[2]  = rgba[(x + y * iw) * 4 + 3];
+                                       if (dxt == DXT1)
+                                               if(ca[2] == 0)
+                                                       continue;
                                        // MODE_FAST doesn't work for normalmaps, so this works
-                                       if(!ca[2])
-                                               continue;
 
                                        int d = ColorDist(c[2], c0);
                                        if(d > dmax)
@@ -454,23 +932,21 @@ namespace
                                                }
                                        }
                                }
-
-                       // if ALL pixels were transparent, this won't stop us
-
-                       m = n = 2;
                }
                else
                {
+                       int n = 0, m = 0;
+
                        for(x = 0; x < w; ++x)
                                for(y = 0; y < h; ++y)
                                {
+                                       c[n].r = rgba[(x + y * iw) * 4 + 0];
+                                       c[n].g = rgba[(x + y * iw) * 4 + 1];
+                                       c[n].b = rgba[(x + y * iw) * 4 + 2];
                                        ca[n]  = rgba[(x + y * iw) * 4 + 3];
-                                       if(alpha_0_is_unimportant<ColorDist>::value)
-                                               if(!ca[n])
+                                       if (dxt == DXT1)
+                                               if(ca[n] == 0)
                                                        continue;
-                                       c[n].r = rgba[(x + y * iw) * 4 + 2];
-                                       c[n].g = rgba[(x + y * iw) * 4 + 1];
-                                       c[n].b = rgba[(x + y * iw) * 4 + 0];
                                        ++n;
                                }
                        if(n == 0)
@@ -483,7 +959,7 @@ namespace
                        }
                        m = n;
 
-                       if(mode == MODE_RANDOM)
+                       if(nrandom > 0)
                        {
                                color_t mins = c[0];
                                color_t maxs = c[0];
@@ -503,7 +979,7 @@ namespace
                                                maxa = max(maxa, ca[x]);
                                        }
                                }
-                               color_t len = { maxs.r - mins.r + 1, maxs.g - mins.g + 1, maxs.b - mins.b + 1 };
+                               color_t len = make_color_t(maxs.r - mins.r + 1, maxs.g - mins.g + 1, maxs.b - mins.b + 1);
                                int lena = (dxt == DXT5) ? (maxa - (int) mina + 1) : 0;
                                for(x = 0; x < nrandom; ++x)
                                {
@@ -530,380 +1006,107 @@ namespace
                                reduce_colors_inplace_2fixpoints(ca, n, m, alpha_dist, (unsigned char) 0, (unsigned char) 255);
                }
 
-               if(refine == REFINE_NEVER)
+               // equal colors are BAD
+               if(c[0] == c[1])
                {
-                       if(dxt == DXT5)
-                       {
-                               if(ca[1] < ca[0])
-                               {
-                                       // select mode with 6 = 0, 7 = 255
-                                       ca[2] = ca[0];
-                                       ca[0] = ca[1];
-                                       ca[1] = ca[2];
-                               }
-                       }
-                       if((dxt == DXT1) ? (c[1] < c[0]) : (c[0] < c[1]))
-                       // DXT1: select mode with 3 = transparent
-                       // other: don't select this mode
+                       if(c[0] == color_type_info<color_t>::max_value)
+                               --c[1];
+                       else
+                               ++c[1];
+               }
+
+               if(dxt == DXT5)
+               {
+                       if(ca[0] == ca[1])
                        {
-                               c[2] = c[0];
-                               c[0] = c[1];
-                               c[1] = c[2];
+                               if(ca[0] == 255)
+                                       --ca[1];
+                               else
+                                       ++ca[1];
                        }
                }
 
-               bool refined;
-               do
+               switch(dxt)
                {
-                       int nc0 = 0, na0 = 0, sc0r = 0, sc0g = 0, sc0b = 0, sa0 = 0;
-                       int nc1 = 0, na1 = 0, sc1r = 0, sc1g = 0, sc1b = 0, sa1 = 0;
-                       if(refine == REFINE_LOOP)
-                               refined = false;
-
-                       memset(out, 0, (dxt == DXT1) ? 8 : 16);
-                       for(x = 0; x < w; ++x)
-                               for(y = 0; y < h; ++y)
+                       case DXT1:
                                {
-                                       int pindex = (x+y*4);
-                                       c[2].r = rgba[(x + y * iw) * 4 + 2];
-                                       c[2].g = rgba[(x + y * iw) * 4 + 1];
-                                       c[2].b = rgba[(x + y * iw) * 4 + 0];
-                                       ca[2]  = rgba[(x + y * iw) * 4 + 3];
-                                       switch(dxt)
+                                       bitarray<uint32_t, 16, 2> colorblock;
+                                       switch(refine)
                                        {
-                                               case DXT5:
-                                                       {
-                                                               bool visible = true;
-                                                               int da[4];
-                                                               int bitindex = pindex * 3;
-                                                               da[0] = alpha_dist(ca[0], ca[2]);
-                                                               da[1] = alpha_dist(ca[1], ca[2]);
-                                                               da[2] = alpha_dist(0, ca[2]);
-                                                               da[3] = alpha_dist(255, ca[2]);
-                                                               if(da[2] <= da[0] && da[2] <= da[1] && da[2] <= da[3])
-                                                               {
-                                                                       // 6
-                                                                       ++bitindex;
-                                                                       setbit(&out[2], bitindex);
-                                                                       ++bitindex;
-                                                                       setbit(&out[2], bitindex);
-                                                                       if(alpha_0_is_unimportant<ColorDist>::value)
-                                                                               visible = false;
-                                                               }
-                                                               else if(da[3] <= da[0] && da[3] <= da[1])
-                                                               {
-                                                                       // 7
-                                                                       setbit(&out[2], bitindex);
-                                                                       ++bitindex;
-                                                                       setbit(&out[2], bitindex);
-                                                                       ++bitindex;
-                                                                       setbit(&out[2], bitindex);
-                                                               }
-                                                               else if(da[0] <= da[1])
-                                                               {
-                                                                       // 0
-                                                                       if(refine != REFINE_NEVER)
-                                                                       {
-                                                                               ++na0;
-                                                                               sa0 += ca[2];
-                                                                       }
-                                                               }
-                                                               else
-                                                               {
-                                                                       // 1
-                                                                       setbit(&out[2], bitindex);
-                                                                       if(refine != REFINE_NEVER)
-                                                                       {
-                                                                               ++na1;
-                                                                               sa1 += ca[2];
-                                                                       }
-                                                               }
-                                                               if(ColorDist(c[0], c[2]) > ColorDist(c[1], c[2]))
-                                                               {
-                                                                       int bitindex = pindex * 2;
-                                                                       setbit(&out[12], bitindex);
-                                                                       if(refine != REFINE_NEVER)
-                                                                       {
-                                                                               if(!alpha_0_is_unimportant<ColorDist>::value || visible)
-                                                                               {
-                                                                                       ++nc1;
-                                                                                       sc1r += refine_component_encode<ColorDist>(c[2].r);
-                                                                                       sc1g += refine_component_encode<ColorDist>(c[2].g);
-                                                                                       sc1b += refine_component_encode<ColorDist>(c[2].b);
-                                                                               }
-                                                                       }
-                                                               }
-                                                               else
-                                                               {
-                                                                       if(refine != REFINE_NEVER)
-                                                                       {
-                                                                               if(!alpha_0_is_unimportant<ColorDist>::value || visible)
-                                                                               {
-                                                                                       ++nc0;
-                                                                                       sc0r += refine_component_encode<ColorDist>(c[2].r);
-                                                                                       sc0g += refine_component_encode<ColorDist>(c[2].g);
-                                                                                       sc0b += refine_component_encode<ColorDist>(c[2].b);
-                                                                               }
-                                                                       }
-                                                               }
-                                                       }
+                                               case REFINE_NEVER:
+                                                       s2tc_dxt1_encode_color_refine_never<ColorDist, true>(colorblock, rgba, iw, w, h, c[0], c[1]);
                                                        break;
-                                               case DXT3:
-                                                       {
-                                                               int bitindex = pindex * 4;
-                                                               setbit(&out[0], bitindex, ca[2]);
-                                                       }
-                                                       if(ColorDist(c[0], c[2]) > ColorDist(c[1], c[2]))
-                                                       {
-                                                               int bitindex = pindex * 2;
-                                                               setbit(&out[12], bitindex);
-                                                               if(refine != REFINE_NEVER)
-                                                               {
-                                                                       if(!alpha_0_is_unimportant<ColorDist>::value || ca[2])
-                                                                       {
-                                                                               ++nc1;
-                                                                               sc1r += refine_component_encode<ColorDist>(c[2].r);
-                                                                               sc1g += refine_component_encode<ColorDist>(c[2].g);
-                                                                               sc1b += refine_component_encode<ColorDist>(c[2].b);
-                                                                       }
-                                                               }
-                                                       }
-                                                       else
-                                                       {
-                                                               if(refine != REFINE_NEVER)
-                                                               {
-                                                                       if(!alpha_0_is_unimportant<ColorDist>::value || ca[2])
-                                                                       {
-                                                                               ++nc0;
-                                                                               sc0r += refine_component_encode<ColorDist>(c[2].r);
-                                                                               sc0g += refine_component_encode<ColorDist>(c[2].g);
-                                                                               sc0b += refine_component_encode<ColorDist>(c[2].b);
-                                                                       }
-                                                               }
-                                                       }
+                                               case REFINE_ALWAYS:
+                                                       s2tc_dxt1_encode_color_refine_always<ColorDist, true>(colorblock, rgba, iw, w, h, c[0], c[1]);
                                                        break;
-                                               case DXT1:
-                                                       {
-                                                               // the normalmap-uses-alpha-0 hack cannot be used here
-                                                               int bitindex = pindex * 2;
-                                                               if(!ca[2])
-                                                                       setbit(&out[4], bitindex, 3);
-                                                               else if(ColorDist(c[0], c[2]) > ColorDist(c[1], c[2]))
-                                                               {
-                                                                       setbit(&out[4], bitindex);
-                                                                       if(refine != REFINE_NEVER)
-                                                                       {
-                                                                               ++nc1;
-                                                                               sc1r += refine_component_encode<ColorDist>(c[2].r);
-                                                                               sc1g += refine_component_encode<ColorDist>(c[2].g);
-                                                                               sc1b += refine_component_encode<ColorDist>(c[2].b);
-                                                                       }
-                                                               }
-                                                               else
-                                                               {
-                                                                       if(refine != REFINE_NEVER)
-                                                                       {
-                                                                               ++nc0;
-                                                                               sc0r += refine_component_encode<ColorDist>(c[2].r);
-                                                                               sc0g += refine_component_encode<ColorDist>(c[2].g);
-                                                                               sc0b += refine_component_encode<ColorDist>(c[2].b);
-                                                                       }
-                                                               }
-                                                       }
+                                               case REFINE_LOOP:
+                                                       s2tc_dxt1_encode_color_refine_loop<ColorDist, true>(colorblock, rgba, iw, w, h, c[0], c[1]);
                                                        break;
                                        }
+                                       out[0] = ((c[0].g & 0x07) << 5) | c[0].b;
+                                       out[1] = (c[0].r << 3) | (c[0].g >> 3);
+                                       out[2] = ((c[1].g & 0x07) << 5) | c[1].b;
+                                       out[3] = (c[1].r << 3) | (c[1].g >> 3);
+                                       colorblock.tobytes(&out[4]);
                                }
-                       if(refine != REFINE_NEVER)
-                       {
-                               // REFINEMENT: trick from libtxc_dxtn: reassign the colors to an average of the colors encoded with that value
-
-                               if(dxt == DXT5)
-                               {
-                                       if(na0)
-                                               ca[0] = (2 * sa0 + na0) / (2 * na0);
-                                       if(na1)
-                                               ca[1] = (2 * sa1 + na1) / (2 * na1);
-                               }
-                               if(refine == REFINE_CHECK || refine == REFINE_LOOP)
-                               {
-                                       c[2] = c[0];
-                                       c[3] = c[1];
-                               }
-                               if(nc0)
-                               {
-                                       c[0].r = refine_component_decode<ColorDist>((2 * sc0r + nc0) / (2 * nc0));
-                                       c[0].g = refine_component_decode<ColorDist>((2 * sc0g + nc0) / (2 * nc0));
-                                       c[0].b = refine_component_decode<ColorDist>((2 * sc0b + nc0) / (2 * nc0));
-                               }
-                               if(nc1)
-                               {
-                                       c[1].r = refine_component_decode<ColorDist>((2 * sc1r + nc1) / (2 * nc1));
-                                       c[1].g = refine_component_decode<ColorDist>((2 * sc1g + nc1) / (2 * nc1));
-                                       c[1].b = refine_component_decode<ColorDist>((2 * sc1b + nc1) / (2 * nc1));
-                               }
-
-                               if(refine == REFINE_CHECK || refine == REFINE_LOOP)
+                               break;
+                       case DXT3:
                                {
-                                       int score_01 = 0;
-                                       int score_23 = 0;
-                                       for(x = 0; x < w; ++x)
-                                               for(y = 0; y < h; ++y)
-                                               {
-                                                       int pindex = (x+y*4);
-                                                       c[4].r = rgba[(x + y * iw) * 4 + 2];
-                                                       c[4].g = rgba[(x + y * iw) * 4 + 1];
-                                                       c[4].b = rgba[(x + y * iw) * 4 + 0];
-                                                       if(alpha_0_is_unimportant<ColorDist>::value || dxt == DXT1) // in DXT1, alpha 0 pixels are always skipped!
-                                                       {
-                                                               if(dxt == DXT5)
-                                                               {
-                                                                       // check ENCODED alpha
-                                                                       int bitindex_0 = pindex * 3;
-                                                                       int bitindex_1 = bitindex_0 + 2;
-                                                                       if(!testbit(&out[2], bitindex_0))
-                                                                               if(testbit(&out[2], bitindex_1))
-                                                                                       continue;
-                                                               }
-                                                               else
-                                                               {
-                                                                       // check ORIGINAL alpha (DXT1 and DXT3 preserve it)
-                                                                       ca[4] = rgba[(x + y * iw) * 4 + 3];
-                                                                       if(!ca[4])
-                                                                               continue;
-                                                               }
-                                                       }
-                                                       int bitindex = pindex * 2;
-                                                       if(refine == REFINE_CHECK)
-                                                       {
-                                                               if(testbit(&out[(dxt == DXT1 ? 4 : 12)], bitindex))
-                                                               {
-                                                                       // we picked an 1
-                                                                       score_01 += ColorDist(c[1], c[4]);
-                                                                       score_23 += ColorDist(c[3], c[4]);
-                                                               }
-                                                               else
-                                                               {
-                                                                       // we picked a 0
-                                                                       score_01 += ColorDist(c[0], c[4]);
-                                                                       score_23 += ColorDist(c[2], c[4]);
-                                                               }
-                                                       }
-                                                       else if(refine == REFINE_LOOP)
-                                                       {
-                                                               if(testbit(&out[(dxt == DXT1 ? 4 : 12)], bitindex))
-                                                               {
-                                                                       // we picked an 1
-                                                                       score_23 += ColorDist(c[3], c[4]);
-                                                               }
-                                                               else
-                                                               {
-                                                                       // we picked a 0
-                                                                       score_23 += ColorDist(c[2], c[4]);
-                                                               }
-                                                               // we WILL run another loop iteration, if score_01 wins
-                                                               score_01 += min(ColorDist(c[0], c[4]), ColorDist(c[1], c[4]));
-                                                       }
-                                               }
-
-                                       if(score_23 <= score_01)
+                                       bitarray<uint32_t, 16, 2> colorblock;
+                                       bitarray<uint64_t, 16, 4> alphablock;
+                                       switch(refine)
                                        {
-                                               // refinement was BAD
-                                               c[0] = c[2];
-                                               c[1] = c[3];
+                                               case REFINE_NEVER:
+                                                       s2tc_dxt1_encode_color_refine_never<ColorDist, false>(colorblock, rgba, iw, w, h, c[0], c[1]);
+                                                       break;
+                                               case REFINE_ALWAYS:
+                                                       s2tc_dxt1_encode_color_refine_always<ColorDist, false>(colorblock, rgba, iw, w, h, c[0], c[1]);
+                                                       break;
+                                               case REFINE_LOOP:
+                                                       s2tc_dxt1_encode_color_refine_loop<ColorDist, false>(colorblock, rgba, iw, w, h, c[0], c[1]);
+                                                       break;
                                        }
-                                       else if(refine == REFINE_LOOP)
-                                               refined = true;
-
-                                       // alpha refinement is always good and doesn't
-                                       // need to be checked because alpha is linear
-
-                                       // when looping, though, checking the
-                                       // alpha COULD help, but we usually
-                                       // loop twice anyway as refinement
-                                       // usually helps
+                                       s2tc_dxt3_encode_alpha(alphablock, rgba, iw, w, h);
+                                       alphablock.tobytes(&out[0]);
+                                       out[8] = ((c[0].g & 0x07) << 5) | c[0].b;
+                                       out[9] = (c[0].r << 3) | (c[0].g >> 3);
+                                       out[10] = ((c[1].g & 0x07) << 5) | c[1].b;
+                                       out[11] = (c[1].r << 3) | (c[1].g >> 3);
+                                       colorblock.tobytes(&out[12]);
                                }
-                       }
-               }
-               while(refine == REFINE_LOOP && refined);
-
-               if(refine != REFINE_NEVER)
-               {
-                       if(dxt == DXT5)
-                       {
-                               if(ca[1] < ca[0])
+                               break;
+                       case DXT5:
                                {
-                                       ca[2] = ca[0];
-                                       ca[0] = ca[1];
-                                       ca[1] = ca[2];
-                                       // swap the alphas
-                                       for(int pindex = 0; pindex < 16; ++pindex)
+                                       bitarray<uint32_t, 16, 2> colorblock;
+                                       bitarray<uint64_t, 16, 3> alphablock;
+                                       switch(refine)
                                        {
-                                               int bitindex_set = pindex * 3;
-                                               int bitindex_test = bitindex_set + 2;
-                                               if(!testbit(&out[2], bitindex_test))
-                                                       xorbit(&out[2], bitindex_set);
+                                               case REFINE_NEVER:
+                                                       s2tc_dxt1_encode_color_refine_never<ColorDist, false>(colorblock, rgba, iw, w, h, c[0], c[1]);
+                                                       s2tc_dxt5_encode_alpha_refine_never(alphablock, rgba, iw, w, h, ca[0], ca[1]);
+                                                       break;
+                                               case REFINE_ALWAYS:
+                                                       s2tc_dxt1_encode_color_refine_always<ColorDist, false>(colorblock, rgba, iw, w, h, c[0], c[1]);
+                                                       s2tc_dxt5_encode_alpha_refine_always(alphablock, rgba, iw, w, h, ca[0], ca[1]);
+                                                       break;
+                                               case REFINE_LOOP:
+                                                       s2tc_dxt1_encode_color_refine_loop<ColorDist, false>(colorblock, rgba, iw, w, h, c[0], c[1]);
+                                                       s2tc_dxt5_encode_alpha_refine_loop(alphablock, rgba, iw, w, h, ca[0], ca[1]);
+                                                       break;
                                        }
+                                       out[0] = ca[0];
+                                       out[1] = ca[1];
+                                       alphablock.tobytes(&out[2]);
+                                       out[8] = ((c[0].g & 0x07) << 5) | c[0].b;
+                                       out[9] = (c[0].r << 3) | (c[0].g >> 3);
+                                       out[10] = ((c[1].g & 0x07) << 5) | c[1].b;
+                                       out[11] = (c[1].r << 3) | (c[1].g >> 3);
+                                       colorblock.tobytes(&out[12]);
                                }
-                       }
-                       if((dxt == DXT1) ? (c[1] < c[0]) : (c[0] < c[1]))
-                       // DXT1: select mode with 3 = transparent
-                       // other: don't select this mode
-                       {
-                               c[2] = c[0];
-                               c[0] = c[1];
-                               c[1] = c[2];
-                               // swap the colors
-                               if(dxt == DXT1)
-                               {
-                                       out[4] ^= 0x55 & ~(out[4] >> 1);
-                                       out[5] ^= 0x55 & ~(out[5] >> 1);
-                                       out[6] ^= 0x55 & ~(out[6] >> 1);
-                                       out[7] ^= 0x55 & ~(out[7] >> 1);
-                               }
-                               else
-                               {
-                                       out[12] ^= 0x55 & ~(out[12] >> 1);
-                                       out[13] ^= 0x55 & ~(out[13] >> 1);
-                                       out[14] ^= 0x55 & ~(out[14] >> 1);
-                                       out[15] ^= 0x55 & ~(out[15] >> 1);
-                               }
-                       }
-               }
-
-               switch(dxt)
-               {
-                       case DXT5:
-                               out[0] = ca[0];
-                               out[1] = ca[1];
-                       case DXT3:
-                               out[8] = ((c[0].g & 0x07) << 5) | c[0].b;
-                               out[9] = (c[0].r << 3) | (c[0].g >> 3);
-                               out[10] = ((c[1].g & 0x07) << 5) | c[1].b;
-                               out[11] = (c[1].r << 3) | (c[1].g >> 3);
-                               break;
-                       case DXT1:
-                               out[0] = ((c[0].g & 0x07) << 5) | c[0].b;
-                               out[1] = (c[0].r << 3) | (c[0].g >> 3);
-                               out[2] = ((c[1].g & 0x07) << 5) | c[1].b;
-                               out[3] = (c[1].r << 3) | (c[1].g >> 3);
                                break;
                }
        }
 
-       // these color dist functions do not need the refinement check, as they always improve the situation
-       template<ColorDistFunc ColorDist> struct need_refine_check
-       {
-               static const bool value = true;
-       };
-       template<> struct need_refine_check<color_dist_avg>
-       {
-               static const bool value = false;
-       };
-       template<> struct need_refine_check<color_dist_wavg>
-       {
-               static const bool value = false;
-       };
-
        // compile time dispatch magic
        template<DxtMode dxt, ColorDistFunc ColorDist, CompressionMode mode>
        inline s2tc_encode_block_func_t s2tc_encode_block_func(RefinementMode refine)
@@ -914,9 +1117,6 @@ namespace
                                return s2tc_encode_block<dxt, ColorDist, mode, REFINE_NEVER>;
                        case REFINE_LOOP:
                                return s2tc_encode_block<dxt, ColorDist, mode, REFINE_LOOP>;
-                       case REFINE_CHECK:
-                               if(need_refine_check<ColorDist>::value)
-                                       return s2tc_encode_block<dxt, ColorDist, mode, REFINE_CHECK>;
                        default:
                        case REFINE_ALWAYS:
                                return s2tc_encode_block<dxt, ColorDist, mode, REFINE_ALWAYS>;
@@ -928,7 +1128,7 @@ namespace
        {
                static const bool value = true;
        };
-       template<> struct need_refine_check<color_dist_normalmap>
+       template<> struct supports_fast<color_dist_normalmap> // opt-out for the normalmap metric: code that tests supports_fast<ColorDist>::value sees false here and stays on the MODE_NORMAL encoder
        {
                static const bool value = false;
        };
@@ -936,9 +1136,7 @@ namespace
        template<DxtMode dxt, ColorDistFunc ColorDist>
        inline s2tc_encode_block_func_t s2tc_encode_block_func(int nrandom, RefinementMode refine)
        {
-               if(nrandom > 0)
-                       return s2tc_encode_block_func<dxt, ColorDist, MODE_RANDOM>(refine);
-               else if(!supports_fast<ColorDist>::value || nrandom == 0) // MODE_FAST not supported for normalmaps, sorry
+               if(!supports_fast<ColorDist>::value || nrandom >= 0)
                        return s2tc_encode_block_func<dxt, ColorDist, MODE_NORMAL>(refine);
                else
                        return s2tc_encode_block_func<dxt, ColorDist, MODE_FAST>(refine);
@@ -979,9 +1177,6 @@ s2tc_encode_block_func_t s2tc_encode_block_func(DxtMode dxt, ColorDistMode cd, i
                case SRGB_MIXED:
                        return s2tc_encode_block_func<color_dist_srgb_mixed>(dxt, nrandom, refine);
                        break;
-               case LAB:
-                       return s2tc_encode_block_func<color_dist_lab_srgb>(dxt, nrandom, refine);
-                       break;
                case AVG:
                        return s2tc_encode_block_func<color_dist_avg>(dxt, nrandom, refine);
                        break;
@@ -989,6 +1184,9 @@ s2tc_encode_block_func_t s2tc_encode_block_func(DxtMode dxt, ColorDistMode cd, i
                case WAVG:
                        return s2tc_encode_block_func<color_dist_wavg>(dxt, nrandom, refine);
                        break;
+               case W0AVG:
+                       return s2tc_encode_block_func<color_dist_w0avg>(dxt, nrandom, refine);
+                       break;
                case NORMALMAP:
                        return s2tc_encode_block_func<color_dist_normalmap>(dxt, nrandom, refine);
                        break;
@@ -999,7 +1197,7 @@ namespace
 {
        inline int diffuse(int *diff, int src, int shift)
        {
-               int maxval = (1 << (8 - shift)) - 1;
+               const int maxval = (1 << (8 - shift)) - 1;
                src += *diff;
                int ret = max(0, min(src >> shift, maxval));
                // simulate decoding ("loop filter")
@@ -1011,67 +1209,257 @@ namespace
        {
                src += *diff;
                int ret = (src >= 128);
+               // simulate decoding ("loop filter")
                int loop = ret ? 255 : 0;
                *diff = src - loop;
                return ret;
        }
-};
 
-void rgb565_image(unsigned char *out, const unsigned char *rgba, int w, int h, int srccomps, int bgr, int alphabits)
-{
-       int x, y;
-       int diffuse_r = 0;
-       int diffuse_g = 0;
-       int diffuse_b = 0;
-       int diffuse_a = 0;
-       if(bgr)
+       inline int floyd(int *thisrow, int *downrow, int src, int shift) // quantizes one sample to (8 - shift) bits and spreads the rounding error over four neighbouring slots; returns the quantized value
        {
-               for(y = 0; y < h; ++y)
-                       for(x = 0; x < w; ++x)
-                       {
-                               out[(x + y * w) * 4 + 2] = diffuse(&diffuse_r, rgba[(x + y * w) * srccomps + 2], 3);
-                               out[(x + y * w) * 4 + 1] = diffuse(&diffuse_g, rgba[(x + y * w) * srccomps + 1], 2);
-                               out[(x + y * w) * 4 + 0] = diffuse(&diffuse_b, rgba[(x + y * w) * srccomps + 0], 3);
-                       }
+               const int maxval = (1 << (8 - shift)) - 1; // largest quantized value
+               src = (src << 4) | (src >> 4); // for an 8-bit src this widens 0..255 to 0..4095, giving the error 4 extra bits
+               src += thisrow[1]; // slot 1 is the current sample (callers pass &row[x] into a row padded by one slot)
+               int ret = max(0, min(src >> (shift + 4), maxval)); // back to the target bit depth, clamped to [0, maxval]
+               // simulate decoding ("loop filter")
+               int loop = (ret * 4095 / maxval); // the quantized value expressed on the 0..4095 scale
+               int err = src - loop; // error still to be distributed
+               int e7 = (err * 7 + 8) / 16; // 7/16 of the error
+               err -= e7;
+               int e3 = (err * 3 + 4) / 9; // 3/9 of the remaining 9/16
+               err -= e3;
+               int e5 = (err * 5 + 3) / 6; // 5/6 of the remaining 6/16
+               err -= e5;
+               int e1 = err; // the rest, so the four parts add up to the full error
+               thisrow[2] += e7; // next sample, same row
+               downrow[0] += e3; // previous column, row below
+               downrow[1] += e5; // same column, row below
+               downrow[2] += e1; // next column, row below
+               return ret;
        }
-       else
+
+       inline int floyd1(int *thisrow, int *downrow, int src) // one-bit variant of floyd(): returns 0 or 1 and distributes the error the same way
        {
-               for(y = 0; y < h; ++y)
-                       for(x = 0; x < w; ++x)
-                       {
-                               out[(x + y * w) * 4 + 2] = diffuse(&diffuse_r, rgba[(x + y * w) * srccomps + 0], 3);
-                               out[(x + y * w) * 4 + 1] = diffuse(&diffuse_g, rgba[(x + y * w) * srccomps + 1], 2);
-                               out[(x + y * w) * 4 + 0] = diffuse(&diffuse_b, rgba[(x + y * w) * srccomps + 2], 3);
-                       }
+               src = (src << 4) | (src >> 4); // for an 8-bit src this widens 0..255 to 0..4095
+               src += thisrow[1]; // slot 1 is the current sample (callers pass &row[x] into a row padded by one slot)
+               int ret = (src >= 2048); // threshold on the widened scale
+               // simulate decoding ("loop filter")
+               int loop = ret ? 4095 : 0; // the one-bit result expressed on the 0..4095 scale
+               int err = src - loop; // error still to be distributed
+               int e7 = (err * 7 + 8) / 16; // 7/16 of the error
+               err -= e7;
+               int e3 = (err * 3 + 4) / 9; // 3/9 of the remaining 9/16
+               err -= e3;
+               int e5 = (err * 5 + 3) / 6; // 5/6 of the remaining 6/16
+               err -= e5;
+               int e1 = err; // the rest, so the four parts add up to the full error
+               thisrow[2] += e7; // next sample, same row
+               downrow[0] += e3; // previous column, row below
+               downrow[1] += e5; // same column, row below
+               downrow[2] += e1; // next column, row below
+               return ret;
        }
-       if(srccomps == 4)
+
+       template<int srccomps, int alphabits, DitherMode dither> // converts a w*h image with srccomps bytes per pixel into 4 bytes per pixel: channels 0/1/2 reduced to 5/6/5 bits, channel 3 reduced to alphabits bits
+       inline void rgb565_image(unsigned char *out, const unsigned char *rgba, int w, int h) // out receives w*h*4 bytes, rgba supplies w*h*srccomps bytes; dither is a template parameter, so the switch below is over a compile-time constant
        {
-               if(alphabits == 1)
+               int x, y;
+               switch(dither)
                {
-                       for(y = 0; y < h; ++y)
-                               for(x = 0; x < w; ++x)
-                                       out[(x + y * w) * 4 + 3] = diffuse1(&diffuse_a, rgba[(x + y * w) * srccomps + 3]);
+                       case DITHER_NONE: // plain truncation, no error is carried between pixels
+                               {
+                                       for(y = 0; y < h; ++y)
+                                               for(x = 0; x < w; ++x)
+                                               {
+                                                       out[(x + y * w) * 4 + 0] = rgba[(x + y * w) * srccomps + 0] >> 3; // 8 -> 5 bits
+                                                       out[(x + y * w) * 4 + 1] = rgba[(x + y * w) * srccomps + 1] >> 2; // 8 -> 6 bits
+                                                       out[(x + y * w) * 4 + 2] = rgba[(x + y * w) * srccomps + 2] >> 3; // 8 -> 5 bits
+                                               }
+                                       if(srccomps == 4) // the source has a fourth channel to convert
+                                       {
+                                               if(alphabits == 1)
+                                               {
+                                                       for(y = 0; y < h; ++y)
+                                                               for(x = 0; x < w; ++x)
+                                                                       out[(x + y * w) * 4 + 3] = rgba[(x + y * w) * srccomps + 3] >> 7; // keep only the top bit
+                                               }
+                                               else if(alphabits == 8)
+                                               {
+                                                       for(y = 0; y < h; ++y)
+                                                               for(x = 0; x < w; ++x)
+                                                                       out[(x + y * w) * 4 + 3] = rgba[(x + y * w) * srccomps + 3]; // no conversion
+                                               }
+                                               else
+                                               {
+                                                       for(y = 0; y < h; ++y)
+                                                               for(x = 0; x < w; ++x)
+                                                                       out[(x + y * w) * 4 + 3] = rgba[(x + y * w) * srccomps + 3] >> (8 - alphabits); // keep the top alphabits bits
+                                               }
+                                       }
+                                       else // no fourth channel in the source: fill with the largest alphabits-wide value
+                                       {
+                                               for(y = 0; y < h; ++y)
+                                                       for(x = 0; x < w; ++x)
+                                                               out[(x + y * w) * 4 + 3] = (1 << alphabits) - 1;
+                                       }
+                               }
+                               break;
+                       case DITHER_SIMPLE: // each channel keeps one running error value that is handed from pixel to pixel in scan order
+                               {
+                                       int x, y; // NOTE(review): shadows the x, y declared at function scope; looks redundant
+                                       int diffuse_r = 0; // running error for channel 0
+                                       int diffuse_g = 0; // running error for channel 1
+                                       int diffuse_b = 0; // running error for channel 2
+                                       int diffuse_a = 0; // running error for channel 3
+                                       for(y = 0; y < h; ++y)
+                                               for(x = 0; x < w; ++x)
+                                               {
+                                                       out[(x + y * w) * 4 + 0] = diffuse(&diffuse_r, rgba[(x + y * w) * srccomps + 0], 3);
+                                                       out[(x + y * w) * 4 + 1] = diffuse(&diffuse_g, rgba[(x + y * w) * srccomps + 1], 2);
+                                                       out[(x + y * w) * 4 + 2] = diffuse(&diffuse_b, rgba[(x + y * w) * srccomps + 2], 3);
+                                               }
+                                       if(srccomps == 4) // the source has a fourth channel to convert
+                                       {
+                                               if(alphabits == 1)
+                                               {
+                                                       for(y = 0; y < h; ++y)
+                                                               for(x = 0; x < w; ++x)
+                                                                       out[(x + y * w) * 4 + 3] = diffuse1(&diffuse_a, rgba[(x + y * w) * srccomps + 3]);
+                                               }
+                                               else if(alphabits == 8)
+                                               {
+                                                       for(y = 0; y < h; ++y)
+                                                               for(x = 0; x < w; ++x)
+                                                                       out[(x + y * w) * 4 + 3] = rgba[(x + y * w) * srccomps + 3]; // no conversion
+                                               }
+                                               else
+                                               {
+                                                       for(y = 0; y < h; ++y)
+                                                               for(x = 0; x < w; ++x)
+                                                                       out[(x + y * w) * 4 + 3] = diffuse(&diffuse_a, rgba[(x + y * w) * srccomps + 3], 8 - alphabits);
+                                               }
+                                       }
+                                       else // no fourth channel in the source: fill with the largest alphabits-wide value
+                                       {
+                                               for(y = 0; y < h; ++y)
+                                                       for(x = 0; x < w; ++x)
+                                                               out[(x + y * w) * 4 + 3] = (1 << alphabits) - 1;
+                                       }
+                               }
+                               break;
+                       case DITHER_FLOYDSTEINBERG: // error is spread to the next sample and to three samples of the following scanline via floyd()/floyd1()
+                               {
+                                       int x, y; // NOTE(review): shadows the x, y declared at function scope; looks redundant
+                                       int pw = w+2; // row width padded by one slot on each side, so the helpers can write slots x .. x+2 for every x in [0, w)
+                                       int downrow[6*pw]; // two banks of three pw-wide rows. NOTE(review): a runtime-sized array is a compiler extension rather than standard C++, and its stack use grows with w
+                                       memset(downrow, 0, sizeof(downrow));
+                                       int *thisrow_r, *thisrow_g, *thisrow_b, *thisrow_a;
+                                       int *downrow_r, *downrow_g, *downrow_b, *downrow_a;
+                                       for(y = 0; y < h; ++y)
+                                       {
+                                               thisrow_r = downrow + ((y&1)?3:0) * pw; // bank holding the error for the current scanline; the two banks alternate with the parity of y
+                                               downrow_r = downrow + ((y&1)?0:3) * pw; // the other bank collects error for the next scanline
+                                               memset(downrow_r, 0, sizeof(*downrow_r) * (3*pw)); // clear all three rows of that bank before accumulating into it
+                                               thisrow_g = thisrow_r + pw;
+                                               thisrow_b = thisrow_g + pw;
+                                               downrow_g = downrow_r + pw;
+                                               downrow_b = downrow_g + pw;
+                                               for(x = 0; x < w; ++x)
+                                               {
+                                                       out[(x + y * w) * 4 + 0] = floyd(&thisrow_r[x], &downrow_r[x], rgba[(x + y * w) * srccomps + 0], 3);
+                                                       out[(x + y * w) * 4 + 1] = floyd(&thisrow_g[x], &downrow_g[x], rgba[(x + y * w) * srccomps + 1], 2);
+                                                       out[(x + y * w) * 4 + 2] = floyd(&thisrow_b[x], &downrow_b[x], rgba[(x + y * w) * srccomps + 2], 3);
+                                               }
+                                       }
+                                       if(srccomps == 4) // the alpha pass reuses the first two pw-wide rows of downrow. NOTE(review): nothing clears the buffer after the colour pass, so the row read as thisrow_a for y == 0 appears to still hold colour-pass error - confirm
+                                       {
+                                               if(alphabits == 1)
+                                               {
+                                                       for(y = 0; y < h; ++y)
+                                                       {
+                                                               thisrow_a = downrow + (y&1) * pw; // the two rows alternate with the parity of y
+                                                               downrow_a = downrow + !(y&1) * pw;
+                                                               memset(downrow_a, 0, sizeof(*downrow_a) * pw); // only the row for the next scanline is cleared here
+                                                               for(x = 0; x < w; ++x)
+                                                                       out[(x + y * w) * 4 + 3] = floyd1(&thisrow_a[x], &downrow_a[x], rgba[(x + y * w) * srccomps + 3]);
+                                                       }
+                                               }
+                                               else if(alphabits == 8)
+                                               {
+                                                       for(y = 0; y < h; ++y)
+                                                               for(x = 0; x < w; ++x)
+                                                                       out[(x + y * w) * 4 + 3] = rgba[(x + y * w) * srccomps + 3]; // no conversion
+                                               }
+                                               else
+                                               {
+                                                       for(y = 0; y < h; ++y)
+                                                       {
+                                                               thisrow_a = downrow + (y&1) * pw; // the two rows alternate with the parity of y
+                                                               downrow_a = downrow + !(y&1) * pw;
+                                                               memset(downrow_a, 0, sizeof(*downrow_a) * pw); // only the row for the next scanline is cleared here
+                                                               for(x = 0; x < w; ++x)
+                                                                       out[(x + y * w) * 4 + 3] = floyd(&thisrow_a[x], &downrow_a[x], rgba[(x + y * w) * srccomps + 3], 8 - alphabits);
+                                                       }
+                                               }
+                                       }
+                                       else // no fourth channel in the source: fill with the largest alphabits-wide value
+                                       {
+                                               for(y = 0; y < h; ++y)
+                                                       for(x = 0; x < w; ++x)
+                                                               out[(x + y * w) * 4 + 3] = (1 << alphabits) - 1;
+                                       }
+                               }
+                               break;
                }
-               else if(alphabits == 8)
+       }
+
+       template<int srccomps, int alphabits> // second dispatch stage: turns the runtime dither mode into a template argument
+       inline void rgb565_image(unsigned char *out, const unsigned char *rgba, int w, int h, DitherMode dither)
+       {
+               switch(dither)
                {
-                       for(y = 0; y < h; ++y)
-                               for(x = 0; x < w; ++x)
-                                       out[(x + y * w) * 4 + 3] = rgba[(x + y * w) * srccomps + 3]; // no conversion
+                       case DITHER_NONE:
+                               rgb565_image<srccomps, alphabits, DITHER_NONE>(out, rgba, w, h);
+                               break;
+                       default: // any mode not listed here is handled as DITHER_SIMPLE
+                       case DITHER_SIMPLE:
+                               rgb565_image<srccomps, alphabits, DITHER_SIMPLE>(out, rgba, w, h);
+                               break;
+                       case DITHER_FLOYDSTEINBERG:
+                               rgb565_image<srccomps, alphabits, DITHER_FLOYDSTEINBERG>(out, rgba, w, h);
+                               break;
                }
-               else
+       }
+
+       template<int srccomps> // first dispatch stage: turns the runtime alpha bit depth into a template argument
+       inline void rgb565_image(unsigned char *out, const unsigned char *rgba, int w, int h, int alphabits, DitherMode dither)
+       {
+               switch(alphabits)
                {
-                       int alphadiffuse = 8 - alphabits;
-                       for(y = 0; y < h; ++y)
-                               for(x = 0; x < w; ++x)
-                                       out[(x + y * w) * 4 + 3] = diffuse(&diffuse_a, rgba[(x + y * w) * srccomps + 3], alphadiffuse);
+                       case 1:
+                               rgb565_image<srccomps, 1>(out, rgba, w, h, dither);
+                               break;
+                       case 4:
+                               rgb565_image<srccomps, 4>(out, rgba, w, h, dither);
+                               break;
+                       default: // only 1, 4 and 8 are instantiated; any other alphabits value is handled as 8
+                       case 8:
+                               rgb565_image<srccomps, 8>(out, rgba, w, h, dither);
+                               break;
                }
        }
-       else
+};
+
+void rgb565_image(unsigned char *out, const unsigned char *rgba, int w, int h, int srccomps, int alphabits, DitherMode dither) // runtime entry point: selects the template instantiation by srccomps here; the callee then selects by alphabits and dither
+{
+       switch(srccomps)
        {
-               int alpharange = (1 << alphabits) - 1;
-               for(y = 0; y < h; ++y)
-                       for(x = 0; x < w; ++x)
-                               out[(x + y * w) * 4 + 3] = alpharange;
+               case 3:
+                       rgb565_image<3>(out, rgba, w, h, alphabits, dither);
+                       break;
+               case 4:
+               default: // any srccomps other than 3 is read as 4 bytes per pixel
+                       rgb565_image<4>(out, rgba, w, h, alphabits, dither);
+                       break;
        }
 }
-