libswscale/swscale.c

   1 /*
   2  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 /*
  22   supported Input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR32_1, BGR24, BGR16, BGR15, RGB32, RGB32_1, RGB24, Y8/Y800, YVU9/IF09, PAL8
  23   supported output formats: YV12, I420/IYUV, YUY2, UYVY, {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09
  24   {BGR,RGB}{1,4,8,15,16} support dithering
  25
  26   unscaled special converters (YV12=I420=IYUV, Y800=Y8)
  27   YV12 -> {BGR,RGB}{1,4,8,12,15,16,24,32}
  28   x -> x
  29   YUV9 -> YV12
  30   YUV9/YV12 -> Y800
  31   Y800 -> YUV9/YV12
  32   BGR24 -> BGR32 & RGB24 -> RGB32
  33   BGR32 -> BGR24 & RGB32 -> RGB24
  34   BGR15 -> BGR16
  35 */
  36
  37 /*
  38 tested special converters (most are tested actually, but I did not write it down ...)
  39  YV12 -> BGR12/BGR16
  40  YV12 -> YV12
  41  BGR15 -> BGR16
  42  BGR16 -> BGR16
  43  YVU9 -> YV12
  44
  45 untested special converters
  46   YV12/I420 -> BGR15/BGR24/BGR32 (it is the yuv2rgb stuff, so it should be OK)
  47   YV12/I420 -> YV12/I420
  48   YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
  49   BGR24 -> BGR32 & RGB24 -> RGB32
  50   BGR32 -> BGR24 & RGB32 -> RGB24
  51   BGR24 -> YV12
  52 */
  53
  54 #include <inttypes.h>
  55 #include <string.h>
  56 #include <math.h>
  57 #include <stdio.h>
  58 #include "config.h"
  59 #include <assert.h>
  60 #include "swscale.h"
  61 #include "swscale_internal.h"
  62 #include "rgb2rgb.h"
  63 #include "libavutil/avassert.h"
  64 #include "libavutil/intreadwrite.h"
  65 #include "libavutil/cpu.h"
  66 #include "libavutil/avutil.h"
  67 #include "libavutil/mathematics.h"
  68 #include "libavutil/bswap.h"
  69 #include "libavutil/pixdesc.h"
  70
  71
  72 #define RGB2YUV_SHIFT 15
  73 #define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  74 #define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  75 #define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  76 #define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  77 #define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  78 #define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  79 #define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  80 #define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  81 #define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  82
  83 /*
  84 NOTES
  85 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
  86
  87 TODO
  88 more intelligent misalignment avoidance for the horizontal scaler
  89 write special vertical cubic upscale version
  90 optimize C code (YV12 / minmax)
  91 add support for packed pixel YUV input & output
  92 add support for Y8 output
  93 optimize BGR24 & BGR32
  94 add BGR4 output support
  95 write special BGR->BGR scaler
  96 */
  97
  98 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4)[2][8]={
  99 {  1,   3,   1,   3,   1,   3,   1,   3, },
 100 {  2,   0,   2,   0,   2,   0,   2,   0, },
 101 };
 102
 103 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_8)[2][8]={
 104 {  6,   2,   6,   2,   6,   2,   6,   2, },
 105 {  0,   4,   0,   4,   0,   4,   0,   4, },
 106 };
 107
 108 DECLARE_ALIGNED(8, const uint8_t, dither_4x4_16)[4][8]={
 109 {  8,   4,  11,   7,   8,   4,  11,   7, },
 110 {  2,  14,   1,  13,   2,  14,   1,  13, },
 111 { 10,   6,   9,   5,  10,   6,   9,   5, },
 112 {  0,  12,   3,  15,   0,  12,   3,  15, },
 113 };
 114
 115 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[8][8]={
 116 { 17,   9,  23,  15,  16,   8,  22,  14, },
 117 {  5,  29,   3,  27,   4,  28,   2,  26, },
 118 { 21,  13,  19,  11,  20,  12,  18,  10, },
 119 {  0,  24,   6,  30,   1,  25,   7,  31, },
 120 { 16,   8,  22,  14,  17,   9,  23,  15, },
 121 {  4,  28,   2,  26,   5,  29,   3,  27, },
 122 { 20,  12,  18,  10,  21,  13,  19,  11, },
 123 {  1,  25,   7,  31,   0,  24,   6,  30, },
 124 };
 125
 126 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[8][8]={
 127 {  0,  55,  14,  68,   3,  58,  17,  72, },
 128 { 37,  18,  50,  32,  40,  22,  54,  35, },
 129 {  9,  64,   5,  59,  13,  67,   8,  63, },
 130 { 46,  27,  41,  23,  49,  31,  44,  26, },
 131 {  2,  57,  16,  71,   1,  56,  15,  70, },
 132 { 39,  21,  52,  34,  38,  19,  51,  33, },
 133 { 11,  66,   7,  62,  10,  65,   6,  60, },
 134 { 48,  30,  43,  25,  47,  29,  42,  24, },
 135 };
 136
 137 #if 1
 138 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 139 {117,  62, 158, 103, 113,  58, 155, 100, },
 140 { 34, 199,  21, 186,  31, 196,  17, 182, },
 141 {144,  89, 131,  76, 141,  86, 127,  72, },
 142 {  0, 165,  41, 206,  10, 175,  52, 217, },
 143 {110,  55, 151,  96, 120,  65, 162, 107, },
 144 { 28, 193,  14, 179,  38, 203,  24, 189, },
 145 {138,  83, 124,  69, 148,  93, 134,  79, },
 146 {  7, 172,  48, 213,   3, 168,  45, 210, },
 147 };
 148 #elif 1
 149 // tries to correct a gamma of 1.5
 150 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 151 {  0, 143,  18, 200,   2, 156,  25, 215, },
 152 { 78,  28, 125,  64,  89,  36, 138,  74, },
 153 { 10, 180,   3, 161,  16, 195,   8, 175, },
 154 {109,  51,  93,  38, 121,  60, 105,  47, },
 155 {  1, 152,  23, 210,   0, 147,  20, 205, },
 156 { 85,  33, 134,  71,  81,  30, 130,  67, },
 157 { 14, 190,   6, 171,  12, 185,   5, 166, },
 158 {117,  57, 101,  44, 113,  54,  97,  41, },
 159 };
 160 #elif 1
 161 // tries to correct a gamma of 2.0
 162 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 163 {  0, 124,   8, 193,   0, 140,  12, 213, },
 164 { 55,  14, 104,  42,  66,  19, 119,  52, },
 165 {  3, 168,   1, 145,   6, 187,   3, 162, },
 166 { 86,  31,  70,  21,  99,  39,  82,  28, },
 167 {  0, 134,  11, 206,   0, 129,   9, 200, },
 168 { 62,  17, 114,  48,  58,  16, 109,  45, },
 169 {  5, 181,   2, 157,   4, 175,   1, 151, },
 170 { 95,  36,  78,  26,  90,  34,  74,  24, },
 171 };
 172 #else
 173 // tries to correct a gamma of 2.5
 174 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 175 {  0, 107,   3, 187,   0, 125,   6, 212, },
 176 { 39,   7,  86,  28,  49,  11, 102,  36, },
 177 {  1, 158,   0, 131,   3, 180,   1, 151, },
 178 { 68,  19,  52,  12,  81,  25,  64,  17, },
 179 {  0, 119,   5, 203,   0, 113,   4, 195, },
 180 { 45,   9,  96,  33,  42,   8,  91,  30, },
 181 {  2, 172,   1, 144,   2, 165,   0, 137, },
 182 { 77,  23,  60,  15,  72,  21,  56,  14, },
 183 };
 184 #endif
 185
 186 DECLARE_ALIGNED(8, const uint8_t, dithers)[8][8][8]={
 187 {
 188   {   0,  1,  0,  1,  0,  1,  0,  1,},
 189   {   1,  0,  1,  0,  1,  0,  1,  0,},
 190   {   0,  1,  0,  1,  0,  1,  0,  1,},
 191   {   1,  0,  1,  0,  1,  0,  1,  0,},
 192   {   0,  1,  0,  1,  0,  1,  0,  1,},
 193   {   1,  0,  1,  0,  1,  0,  1,  0,},
 194   {   0,  1,  0,  1,  0,  1,  0,  1,},
 195   {   1,  0,  1,  0,  1,  0,  1,  0,},
 196 },{
 197   {   1,  2,  1,  2,  1,  2,  1,  2,},
 198   {   3,  0,  3,  0,  3,  0,  3,  0,},
 199   {   1,  2,  1,  2,  1,  2,  1,  2,},
 200   {   3,  0,  3,  0,  3,  0,  3,  0,},
 201   {   1,  2,  1,  2,  1,  2,  1,  2,},
 202   {   3,  0,  3,  0,  3,  0,  3,  0,},
 203   {   1,  2,  1,  2,  1,  2,  1,  2,},
 204   {   3,  0,  3,  0,  3,  0,  3,  0,},
 205 },{
 206   {   2,  4,  3,  5,  2,  4,  3,  5,},
 207   {   6,  0,  7,  1,  6,  0,  7,  1,},
 208   {   3,  5,  2,  4,  3,  5,  2,  4,},
 209   {   7,  1,  6,  0,  7,  1,  6,  0,},
 210   {   2,  4,  3,  5,  2,  4,  3,  5,},
 211   {   6,  0,  7,  1,  6,  0,  7,  1,},
 212   {   3,  5,  2,  4,  3,  5,  2,  4,},
 213   {   7,  1,  6,  0,  7,  1,  6,  0,},
 214 },{
 215   {   4,  8,  7, 11,  4,  8,  7, 11,},
 216   {  12,  0, 15,  3, 12,  0, 15,  3,},
 217   {   6, 10,  5,  9,  6, 10,  5,  9,},
 218   {  14,  2, 13,  1, 14,  2, 13,  1,},
 219   {   4,  8,  7, 11,  4,  8,  7, 11,},
 220   {  12,  0, 15,  3, 12,  0, 15,  3,},
 221   {   6, 10,  5,  9,  6, 10,  5,  9,},
 222   {  14,  2, 13,  1, 14,  2, 13,  1,},
 223 },{
 224   {   9, 17, 15, 23,  8, 16, 14, 22,},
 225   {  25,  1, 31,  7, 24,  0, 30,  6,},
 226   {  13, 21, 11, 19, 12, 20, 10, 18,},
 227   {  29,  5, 27,  3, 28,  4, 26,  2,},
 228   {   8, 16, 14, 22,  9, 17, 15, 23,},
 229   {  24,  0, 30,  6, 25,  1, 31,  7,},
 230   {  12, 20, 10, 18, 13, 21, 11, 19,},
 231   {  28,  4, 26,  2, 29,  5, 27,  3,},
 232 },{
 233   {  18, 34, 30, 46, 17, 33, 29, 45,},
 234   {  50,  2, 62, 14, 49,  1, 61, 13,},
 235   {  26, 42, 22, 38, 25, 41, 21, 37,},
 236   {  58, 10, 54,  6, 57,  9, 53,  5,},
 237   {  16, 32, 28, 44, 19, 35, 31, 47,},
 238   {  48,  0, 60, 12, 51,  3, 63, 15,},
 239   {  24, 40, 20, 36, 27, 43, 23, 39,},
 240   {  56,  8, 52,  4, 59, 11, 55,  7,},
 241 },{
 242   {  18, 34, 30, 46, 17, 33, 29, 45,},
 243   {  50,  2, 62, 14, 49,  1, 61, 13,},
 244   {  26, 42, 22, 38, 25, 41, 21, 37,},
 245   {  58, 10, 54,  6, 57,  9, 53,  5,},
 246   {  16, 32, 28, 44, 19, 35, 31, 47,},
 247   {  48,  0, 60, 12, 51,  3, 63, 15,},
 248   {  24, 40, 20, 36, 27, 43, 23, 39,},
 249   {  56,  8, 52,  4, 59, 11, 55,  7,},
 250 },{
 251   {  36, 68, 60, 92, 34, 66, 58, 90,},
 252   { 100,  4,124, 28, 98,  2,122, 26,},
 253   {  52, 84, 44, 76, 50, 82, 42, 74,},
 254   { 116, 20,108, 12,114, 18,106, 10,},
 255   {  32, 64, 56, 88, 38, 70, 62, 94,},
 256   {  96,  0,120, 24,102,  6,126, 30,},
 257   {  48, 80, 40, 72, 54, 86, 46, 78,},
 258   { 112, 16,104,  8,118, 22,110, 14,},
 259 }};
 260
 261 static const uint8_t flat64[8]={64,64,64,64,64,64,64,64};
 262
 263 const uint16_t dither_scale[15][16]={
 264 {    2,    3,    3,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,},
 265 {    2,    3,    7,    7,   13,   13,   25,   25,   25,   25,   25,   25,   25,   25,   25,   25,},
 266 {    3,    3,    4,   15,   15,   29,   57,   57,   57,  113,  113,  113,  113,  113,  113,  113,},
 267 {    3,    4,    4,    5,   31,   31,   61,  121,  241,  241,  241,  241,  481,  481,  481,  481,},
 268 {    3,    4,    5,    5,    6,   63,   63,  125,  249,  497,  993,  993,  993,  993,  993, 1985,},
 269 {    3,    5,    6,    6,    6,    7,  127,  127,  253,  505, 1009, 2017, 4033, 4033, 4033, 4033,},
 270 {    3,    5,    6,    7,    7,    7,    8,  255,  255,  509, 1017, 2033, 4065, 8129,16257,16257,},
 271 {    3,    5,    6,    8,    8,    8,    8,    9,  511,  511, 1021, 2041, 4081, 8161,16321,32641,},
 272 {    3,    5,    7,    8,    9,    9,    9,    9,   10, 1023, 1023, 2045, 4089, 8177,16353,32705,},
 273 {    3,    5,    7,    8,   10,   10,   10,   10,   10,   11, 2047, 2047, 4093, 8185,16369,32737,},
 274 {    3,    5,    7,    8,   10,   11,   11,   11,   11,   11,   12, 4095, 4095, 8189,16377,32753,},
 275 {    3,    5,    7,    9,   10,   12,   12,   12,   12,   12,   12,   13, 8191, 8191,16381,32761,},
 276 {    3,    5,    7,    9,   10,   12,   13,   13,   13,   13,   13,   13,   14,16383,16383,32765,},
 277 {    3,    5,    7,    9,   10,   12,   14,   14,   14,   14,   14,   14,   14,   15,32767,32767,},
 278 {    3,    5,    7,    9,   11,   12,   14,   15,   15,   15,   15,   15,   15,   15,   16,65535,},
 279 };
 280
 281 static av_always_inline void
 282 yuv2yuvX16_c_template(const int16_t *lumFilter, const int16_t **lumSrc,
 283                       int lumFilterSize, const int16_t *chrFilter,
 284                       const int16_t **chrUSrc, const int16_t **chrVSrc,
 285                       int chrFilterSize, const int16_t **alpSrc,
 286                       uint16_t *dest[4], int dstW, int chrDstW,
 287                       int big_endian, int output_bits)
 288 {
 289     //FIXME Optimize (just quickly written not optimized..)
 290     int i;
 291     int shift = 11 + 16 - output_bits;
 292     uint16_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 293              *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 294
 295 #define output_pixel(pos, val) \
 296     if (big_endian) { \
 297         if (output_bits == 16) { \
 298             AV_WB16(pos, av_clip_uint16(val >> shift)); \
 299         } else { \
 300             AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 301         } \
 302     } else { \
 303         if (output_bits == 16) { \
 304             AV_WL16(pos, av_clip_uint16(val >> shift)); \
 305         } else { \
 306             AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 307         } \
 308     }
 309     for (i = 0; i < dstW; i++) {
 310         int val = 1 << (26-output_bits);
 311         int j;
 312
 313         for (j = 0; j < lumFilterSize; j++)
 314             val += lumSrc[j][i] * lumFilter[j];
 315
 316         output_pixel(&yDest[i], val);
 317     }
 318
 319     if (uDest) {
 320         for (i = 0; i < chrDstW; i++) {
 321             int u = 1 << (26-output_bits);
 322             int v = 1 << (26-output_bits);
 323             int j;
 324
 325             for (j = 0; j < chrFilterSize; j++) {
 326                 u += chrUSrc[j][i] * chrFilter[j];
 327                 v += chrVSrc[j][i] * chrFilter[j];
 328             }
 329
 330             output_pixel(&uDest[i], u);
 331             output_pixel(&vDest[i], v);
 332         }
 333     }
 334
 335     if (CONFIG_SWSCALE_ALPHA && aDest) {
 336         for (i = 0; i < dstW; i++) {
 337             int val = 1 << (26-output_bits);
 338             int j;
 339
 340             for (j = 0; j < lumFilterSize; j++)
 341                 val += alpSrc[j][i] * lumFilter[j];
 342
 343             output_pixel(&aDest[i], val);
 344         }
 345     }
 346 #undef output_pixel
 347 }
 348
 349 #define yuv2NBPS(bits, BE_LE, is_be) \
 350 static void yuv2yuvX ## bits ## BE_LE ## _c(SwsContext *c, const int16_t *lumFilter, \
 351                               const int16_t **lumSrc, int lumFilterSize, \
 352                               const int16_t *chrFilter, const int16_t **chrUSrc, \
 353                               const int16_t **chrVSrc, \
 354                               int chrFilterSize, const int16_t **alpSrc, \
 355                               uint8_t *_dest[4], int dstW, int chrDstW) \
 356 { \
 357     yuv2yuvX16_c_template(lumFilter, lumSrc, lumFilterSize, \
 358                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 359                           alpSrc, (uint16_t **) _dest, \
 360                           dstW, chrDstW, is_be, bits); \
 361 }
 362 yuv2NBPS( 9, BE, 1);
 363 yuv2NBPS( 9, LE, 0);
 364 yuv2NBPS(10, BE, 1);
 365 yuv2NBPS(10, LE, 0);
 366 yuv2NBPS(16, BE, 1);
 367 yuv2NBPS(16, LE, 0);
 368
 369 static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
 370                        const int16_t **lumSrc, int lumFilterSize,
 371                        const int16_t *chrFilter, const int16_t **chrUSrc,
 372                        const int16_t **chrVSrc,
 373                        int chrFilterSize, const int16_t **alpSrc,
 374                        uint8_t *dest[4], int dstW, int chrDstW,
 375                        const uint8_t *lumDither, const uint8_t *chrDither)
 376 {
 377     uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 378             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 379     int i;
 380
 381     //FIXME Optimize (just quickly written not optimized..)
 382     for (i=0; i<dstW; i++) {
 383         int val = lumDither[i&7] << 12;
 384         int j;
 385         for (j=0; j<lumFilterSize; j++)
 386             val += lumSrc[j][i] * lumFilter[j];
 387
 388         yDest[i]= av_clip_uint8(val>>19);
 389     }
 390
 391     if (uDest)
 392         for (i=0; i<chrDstW; i++) {
 393             int u = chrDither[i&7] << 12;
 394             int v = chrDither[(i+3)&7] << 12;
 395             int j;
 396             for (j=0; j<chrFilterSize; j++) {
 397                 u += chrUSrc[j][i] * chrFilter[j];
 398                 v += chrVSrc[j][i] * chrFilter[j];
 399             }
 400
 401             uDest[i]= av_clip_uint8(u>>19);
 402             vDest[i]= av_clip_uint8(v>>19);
 403         }
 404
 405     if (CONFIG_SWSCALE_ALPHA && aDest)
 406         for (i=0; i<dstW; i++) {
 407             int val = lumDither[i&7] << 12;
 408             int j;
 409             for (j=0; j<lumFilterSize; j++)
 410                 val += alpSrc[j][i] * lumFilter[j];
 411
 412             aDest[i]= av_clip_uint8(val>>19);
 413         }
 414 }
 415
 416 static void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc,
 417                        const int16_t *chrUSrc, const int16_t *chrVSrc,
 418                        const int16_t *alpSrc,
 419                        uint8_t *dest[4], int dstW, int chrDstW,
 420                        const uint8_t *lumDither, const uint8_t *chrDither)
 421 {
 422     uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 423             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 424     int i;
 425
 426     for (i=0; i<dstW; i++) {
 427         int val= (lumSrc[i]+lumDither[i&7])>>7;
 428         yDest[i]= av_clip_uint8(val);
 429     }
 430
 431     if (uDest)
 432         for (i=0; i<chrDstW; i++) {
 433             int u=(chrUSrc[i]+chrDither[i&7])>>7;
 434             int v=(chrVSrc[i]+chrDither[(i+3)&7])>>7;
 435             uDest[i]= av_clip_uint8(u);
 436             vDest[i]= av_clip_uint8(v);
 437         }
 438
 439     if (CONFIG_SWSCALE_ALPHA && aDest)
 440         for (i=0; i<dstW; i++) {
 441             int val= (alpSrc[i]+lumDither[i&7])>>7;
 442             aDest[i]= av_clip_uint8(val);
 443         }
 444 }
 445
 446 static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
 447                         const int16_t **lumSrc, int lumFilterSize,
 448                         const int16_t *chrFilter, const int16_t **chrUSrc,
 449                         const int16_t **chrVSrc, int chrFilterSize,
 450                         const int16_t **alpSrc, uint8_t *dest[4],
 451                         int dstW, int chrDstW,
 452                         const uint8_t *lumDither, const uint8_t *chrDither)
 453 {
 454     uint8_t *yDest = dest[0], *uDest = dest[1];
 455     enum PixelFormat dstFormat = c->dstFormat;
 456
 457     //FIXME Optimize (just quickly written not optimized..)
 458     int i;
 459     for (i=0; i<dstW; i++) {
 460         int val = lumDither[i&7]<<12;
 461         int j;
 462         for (j=0; j<lumFilterSize; j++)
 463             val += lumSrc[j][i] * lumFilter[j];
 464
 465         yDest[i]= av_clip_uint8(val>>19);
 466     }
 467
 468     if (!uDest)
 469         return;
 470
 471     if (dstFormat == PIX_FMT_NV12)
 472         for (i=0; i<chrDstW; i++) {
 473             int u = chrDither[i&7]<<12;
 474             int v = chrDither[(i+3)&7]<<12;
 475             int j;
 476             for (j=0; j<chrFilterSize; j++) {
 477                 u += chrUSrc[j][i] * chrFilter[j];
 478                 v += chrVSrc[j][i] * chrFilter[j];
 479             }
 480
 481             uDest[2*i]= av_clip_uint8(u>>19);
 482             uDest[2*i+1]= av_clip_uint8(v>>19);
 483         }
 484     else
 485         for (i=0; i<chrDstW; i++) {
 486             int u = chrDither[i&7]<<12;
 487             int v = chrDither[(i+3)&7]<<12;
 488             int j;
 489             for (j=0; j<chrFilterSize; j++) {
 490                 u += chrUSrc[j][i] * chrFilter[j];
 491                 v += chrVSrc[j][i] * chrFilter[j];
 492             }
 493
 494             uDest[2*i]= av_clip_uint8(v>>19);
 495             uDest[2*i+1]= av_clip_uint8(u>>19);
 496         }
 497 }
 498
 499 #define output_pixel(pos, val) \
 500         if (target == PIX_FMT_GRAY16BE) { \
 501             AV_WB16(pos, val); \
 502         } else { \
 503             AV_WL16(pos, val); \
 504         }
 505
 506 static av_always_inline void
 507 yuv2gray16_X_c_template(SwsContext *c, const int16_t *lumFilter,
 508                         const int16_t **lumSrc, int lumFilterSize,
 509                         const int16_t *chrFilter, const int16_t **chrUSrc,
 510                         const int16_t **chrVSrc, int chrFilterSize,
 511                         const int16_t **alpSrc, uint8_t *dest, int dstW,
 512                         int y, enum PixelFormat target)
 513 {
 514     int i;
 515
 516     for (i = 0; i < (dstW >> 1); i++) {
 517         int j;
 518         int Y1 = 1 << 18;
 519         int Y2 = 1 << 18;
 520         const int i2 = 2 * i;
 521
 522         for (j = 0; j < lumFilterSize; j++) {
 523             Y1 += lumSrc[j][i2]   * lumFilter[j];
 524             Y2 += lumSrc[j][i2+1] * lumFilter[j];
 525         }
 526         Y1 >>= 11;
 527         Y2 >>= 11;
 528         if ((Y1 | Y2) & 0x10000) {
 529             Y1 = av_clip_uint16(Y1);
 530             Y2 = av_clip_uint16(Y2);
 531         }
 532         output_pixel(&dest[2 * i2 + 0], Y1);
 533         output_pixel(&dest[2 * i2 + 2], Y2);
 534     }
 535 }
 536
 537 static av_always_inline void
 538 yuv2gray16_2_c_template(SwsContext *c, const int16_t *buf[2],
 539                         const int16_t *ubuf[2], const int16_t *vbuf[2],
 540                         const int16_t *abuf[2], uint8_t *dest, int dstW,
 541                         int yalpha, int uvalpha, int y,
 542                         enum PixelFormat target)
 543 {
 544     int  yalpha1 = 4095 - yalpha;
 545     int i;
 546     const int16_t *buf0 = buf[0], *buf1 = buf[1];
 547
 548     for (i = 0; i < (dstW >> 1); i++) {
 549         const int i2 = 2 * i;
 550         int Y1 = (buf0[i2  ] * yalpha1 + buf1[i2  ] * yalpha) >> 11;
 551         int Y2 = (buf0[i2+1] * yalpha1 + buf1[i2+1] * yalpha) >> 11;
 552
 553         output_pixel(&dest[2 * i2 + 0], Y1);
 554         output_pixel(&dest[2 * i2 + 2], Y2);
 555     }
 556 }
 557
 558 static av_always_inline void
 559 yuv2gray16_1_c_template(SwsContext *c, const int16_t *buf0,
 560                         const int16_t *ubuf[2], const int16_t *vbuf[2],
 561                         const int16_t *abuf0, uint8_t *dest, int dstW,
 562                         int uvalpha, int y, enum PixelFormat target)
 563 {
 564     int i;
 565
 566     for (i = 0; i < (dstW >> 1); i++) {
 567         const int i2 = 2 * i;
 568         int Y1 = buf0[i2  ] << 1;
 569         int Y2 = buf0[i2+1] << 1;
 570
 571         output_pixel(&dest[2 * i2 + 0], Y1);
 572         output_pixel(&dest[2 * i2 + 2], Y2);
 573     }
 574 }
 575
 576 #undef output_pixel
 577
 578 #define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
 579 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
 580                         const int16_t **lumSrc, int lumFilterSize, \
 581                         const int16_t *chrFilter, const int16_t **chrUSrc, \
 582                         const int16_t **chrVSrc, int chrFilterSize, \
 583                         const int16_t **alpSrc, uint8_t *dest, int dstW, \
 584                         int y) \
 585 { \
 586     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
 587                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 588                           alpSrc, dest, dstW, y, fmt); \
 589 } \
 590  \
 591 static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
 592                         const int16_t *ubuf[2], const int16_t *vbuf[2], \
 593                         const int16_t *abuf[2], uint8_t *dest, int dstW, \
 594                         int yalpha, int uvalpha, int y) \
 595 { \
 596     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
 597                           dest, dstW, yalpha, uvalpha, y, fmt); \
 598 } \
 599  \
 600 static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
 601                         const int16_t *ubuf[2], const int16_t *vbuf[2], \
 602                         const int16_t *abuf0, uint8_t *dest, int dstW, \
 603                         int uvalpha, int y) \
 604 { \
 605     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
 606                                   dstW, uvalpha, y, fmt); \
 607 }
 608
 609 YUV2PACKEDWRAPPER(yuv2gray16,, LE, PIX_FMT_GRAY16LE);
 610 YUV2PACKEDWRAPPER(yuv2gray16,, BE, PIX_FMT_GRAY16BE);
 611
 612 #define output_pixel(pos, acc) \
 613     if (target == PIX_FMT_MONOBLACK) { \
 614         pos = acc; \
 615     } else { \
 616         pos = ~acc; \
 617     }
 618
 619 static av_always_inline void
 620 yuv2mono_X_c_template(SwsContext *c, const int16_t *lumFilter,
 621                       const int16_t **lumSrc, int lumFilterSize,
 622                       const int16_t *chrFilter, const int16_t **chrUSrc,
 623                       const int16_t **chrVSrc, int chrFilterSize,
 624                       const int16_t **alpSrc, uint8_t *dest, int dstW,
 625                       int y, enum PixelFormat target)
 626 {
 627     const uint8_t * const d128=dither_8x8_220[y&7];
 628     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 629     int i;
 630     int acc = 0;
 631
 632     for (i = 0; i < dstW - 1; i += 2) {
 633         int j;
 634         int Y1 = 1 << 18;
 635         int Y2 = 1 << 18;
 636
 637         for (j = 0; j < lumFilterSize; j++) {
 638             Y1 += lumSrc[j][i]   * lumFilter[j];
 639             Y2 += lumSrc[j][i+1] * lumFilter[j];
 640         }
 641         Y1 >>= 19;
 642         Y2 >>= 19;
 643         if ((Y1 | Y2) & 0x100) {
 644             Y1 = av_clip_uint8(Y1);
 645             Y2 = av_clip_uint8(Y2);
 646         }
 647         acc += acc + g[Y1 + d128[(i + 0) & 7]];
 648         acc += acc + g[Y2 + d128[(i + 1) & 7]];
 649         if ((i & 7) == 6) {
 650             output_pixel(*dest++, acc);
 651         }
 652     }
 653 }
 654
 655 static av_always_inline void
 656 yuv2mono_2_c_template(SwsContext *c, const int16_t *buf[2],
 657                       const int16_t *ubuf[2], const int16_t *vbuf[2],
 658                       const int16_t *abuf[2], uint8_t *dest, int dstW,
 659                       int yalpha, int uvalpha, int y,
 660                       enum PixelFormat target)
 661 {
 662     const int16_t *buf0  = buf[0],  *buf1  = buf[1];
 663     const uint8_t * const d128 = dither_8x8_220[y & 7];
 664     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 665     int  yalpha1 = 4095 - yalpha;
 666     int i;
 667
 668     for (i = 0; i < dstW - 7; i += 8) {
 669         int acc =    g[((buf0[i    ] * yalpha1 + buf1[i    ] * yalpha) >> 19) + d128[0]];
 670         acc += acc + g[((buf0[i + 1] * yalpha1 + buf1[i + 1] * yalpha) >> 19) + d128[1]];
 671         acc += acc + g[((buf0[i + 2] * yalpha1 + buf1[i + 2] * yalpha) >> 19) + d128[2]];
 672         acc += acc + g[((buf0[i + 3] * yalpha1 + buf1[i + 3] * yalpha) >> 19) + d128[3]];
 673         acc += acc + g[((buf0[i + 4] * yalpha1 + buf1[i + 4] * yalpha) >> 19) + d128[4]];
 674         acc += acc + g[((buf0[i + 5] * yalpha1 + buf1[i + 5] * yalpha) >> 19) + d128[5]];
 675         acc += acc + g[((buf0[i + 6] * yalpha1 + buf1[i + 6] * yalpha) >> 19) + d128[6]];
 676         acc += acc + g[((buf0[i + 7] * yalpha1 + buf1[i + 7] * yalpha) >> 19) + d128[7]];
 677         output_pixel(*dest++, acc);
 678     }
 679 }
 680
 681 static av_always_inline void
 682 yuv2mono_1_c_template(SwsContext *c, const int16_t *buf0,
 683                       const int16_t *ubuf[2], const int16_t *vbuf[2],
 684                       const int16_t *abuf0, uint8_t *dest, int dstW,
 685                       int uvalpha, int y, enum PixelFormat target)
 686 {
 687     const uint8_t * const d128 = dither_8x8_220[y & 7];
 688     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 689     int i;
 690
 691     for (i = 0; i < dstW - 7; i += 8) {
 692         int acc =    g[(buf0[i    ] >> 7) + d128[0]];
 693         acc += acc + g[(buf0[i + 1] >> 7) + d128[1]];
 694         acc += acc + g[(buf0[i + 2] >> 7) + d128[2]];
 695         acc += acc + g[(buf0[i + 3] >> 7) + d128[3]];
 696         acc += acc + g[(buf0[i + 4] >> 7) + d128[4]];
 697         acc += acc + g[(buf0[i + 5] >> 7) + d128[5]];
 698         acc += acc + g[(buf0[i + 6] >> 7) + d128[6]];
 699         acc += acc + g[(buf0[i + 7] >> 7) + d128[7]];
 700         output_pixel(*dest++, acc);
 701     }
 702 }
 703
 704 #undef output_pixel
 705
 706 YUV2PACKEDWRAPPER(yuv2mono,, white, PIX_FMT_MONOWHITE);
 707 YUV2PACKEDWRAPPER(yuv2mono,, black, PIX_FMT_MONOBLACK);
 708
 709 #define output_pixels(pos, Y1, U, Y2, V) \
 710     if (target == PIX_FMT_YUYV422) { \
 711         dest[pos + 0] = Y1; \
 712         dest[pos + 1] = U;  \
 713         dest[pos + 2] = Y2; \
 714         dest[pos + 3] = V;  \
 715     } else { \
 716         dest[pos + 0] = U;  \
 717         dest[pos + 1] = Y1; \
 718         dest[pos + 2] = V;  \
 719         dest[pos + 3] = Y2; \
 720     }
 721
 722 static av_always_inline void
 723 yuv2422_X_c_template(SwsContext *c, const int16_t *lumFilter,
 724                      const int16_t **lumSrc, int lumFilterSize,
 725                      const int16_t *chrFilter, const int16_t **chrUSrc,
 726                      const int16_t **chrVSrc, int chrFilterSize,
 727                      const int16_t **alpSrc, uint8_t *dest, int dstW,
 728                      int y, enum PixelFormat target)
 729 {
 730     int i;
 731
 732     for (i = 0; i < (dstW >> 1); i++) {
 733         int j;
 734         int Y1 = 1 << 18;
 735         int Y2 = 1 << 18;
 736         int U  = 1 << 18;
 737         int V  = 1 << 18;
 738
 739         for (j = 0; j < lumFilterSize; j++) {
 740             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 741             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 742         }
 743         for (j = 0; j < chrFilterSize; j++) {
 744             U += chrUSrc[j][i] * chrFilter[j];
 745             V += chrVSrc[j][i] * chrFilter[j];
 746         }
 747         Y1 >>= 19;
 748         Y2 >>= 19;
 749         U  >>= 19;
 750         V  >>= 19;
 751         if ((Y1 | Y2 | U | V) & 0x100) {
 752             Y1 = av_clip_uint8(Y1);
 753             Y2 = av_clip_uint8(Y2);
 754             U  = av_clip_uint8(U);
 755             V  = av_clip_uint8(V);
 756         }
 757         output_pixels(4*i, Y1, U, Y2, V);
 758     }
 759 }
 760
 761 static av_always_inline void
 762 yuv2422_2_c_template(SwsContext *c, const int16_t *buf[2],
 763                      const int16_t *ubuf[2], const int16_t *vbuf[2],
 764                      const int16_t *abuf[2], uint8_t *dest, int dstW,
 765                      int yalpha, int uvalpha, int y,
 766                      enum PixelFormat target)
 767 {
 768     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
 769                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 770                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 771     int  yalpha1 = 4095 - yalpha;
 772     int uvalpha1 = 4095 - uvalpha;
 773     int i;
 774
 775     for (i = 0; i < (dstW >> 1); i++) {
 776         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
 777         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
 778         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
 779         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
 780
 781         output_pixels(i * 4, Y1, U, Y2, V);
 782     }
 783 }
 784
 785 static av_always_inline void
 786 yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
 787                      const int16_t *ubuf[2], const int16_t *vbuf[2],
 788                      const int16_t *abuf0, uint8_t *dest, int dstW,
 789                      int uvalpha, int y, enum PixelFormat target)
 790 {
 791     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 792                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 793     int i;
 794
 795     if (uvalpha < 2048) {
 796         for (i = 0; i < (dstW >> 1); i++) {
 797             int Y1 = buf0[i * 2]     >> 7;
 798             int Y2 = buf0[i * 2 + 1] >> 7;
 799             int U  = ubuf1[i]        >> 7;
 800             int V  = vbuf1[i]        >> 7;
 801
 802             output_pixels(i * 4, Y1, U, Y2, V);
 803         }
 804     } else {
 805         for (i = 0; i < (dstW >> 1); i++) {
 806             int Y1 =  buf0[i * 2]          >> 7;
 807             int Y2 =  buf0[i * 2 + 1]      >> 7;
 808             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
 809             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
 810
 811             output_pixels(i * 4, Y1, U, Y2, V);
 812         }
 813     }
 814 }
 815
/* output_pixels() is only meant for the yuv2422 templates above. */
#undef output_pixels

/*
 * Instantiate the yuv2422 templates once per packed 4:2:2 target format
 * (YUYV and UYVY). YUV2PACKEDWRAPPER is defined earlier in this file.
 */
YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, PIX_FMT_YUYV422);
YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, PIX_FMT_UYVY422);
 820
/*
 * Channel selectors used by the rgb48 templates below. They expand to the
 * local lookup pointers named r and b: for RGB48 targets r_b is r and b_r
 * is b; for every other target the two are swapped.
 */
#define r_b ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? r : b)
#define b_r ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? b : r)
 823
 824 static av_always_inline void
 825 yuv2rgb48_X_c_template(SwsContext *c, const int16_t *lumFilter,
 826                        const int16_t **lumSrc, int lumFilterSize,
 827                        const int16_t *chrFilter, const int16_t **chrUSrc,
 828                        const int16_t **chrVSrc, int chrFilterSize,
 829                        const int16_t **alpSrc, uint8_t *dest, int dstW,
 830                        int y, enum PixelFormat target)
 831 {
 832     int i;
 833
 834     for (i = 0; i < (dstW >> 1); i++) {
 835         int j;
 836         int Y1 = 1 << 18;
 837         int Y2 = 1 << 18;
 838         int U  = 1 << 18;
 839         int V  = 1 << 18;
 840         const uint8_t *r, *g, *b;
 841
 842         for (j = 0; j < lumFilterSize; j++) {
 843             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 844             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 845         }
 846         for (j = 0; j < chrFilterSize; j++) {
 847             U += chrUSrc[j][i] * chrFilter[j];
 848             V += chrVSrc[j][i] * chrFilter[j];
 849         }
 850         Y1 >>= 19;
 851         Y2 >>= 19;
 852         U  >>= 19;
 853         V  >>= 19;
 854         if ((Y1 | Y2 | U | V) & 0x100) {
 855             Y1 = av_clip_uint8(Y1);
 856             Y2 = av_clip_uint8(Y2);
 857             U  = av_clip_uint8(U);
 858             V  = av_clip_uint8(V);
 859         }
 860
 861         /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/
 862         r = (const uint8_t *) c->table_rV[V];
 863         g = (const uint8_t *)(c->table_gU[U] + c->table_gV[V]);
 864         b = (const uint8_t *) c->table_bU[U];
 865
 866         dest[ 0] = dest[ 1] = r_b[Y1];
 867         dest[ 2] = dest[ 3] =   g[Y1];
 868         dest[ 4] = dest[ 5] = b_r[Y1];
 869         dest[ 6] = dest[ 7] = r_b[Y2];
 870         dest[ 8] = dest[ 9] =   g[Y2];
 871         dest[10] = dest[11] = b_r[Y2];
 872         dest += 12;
 873     }
 874 }
 875
 876 static av_always_inline void
 877 yuv2rgb48_2_c_template(SwsContext *c, const int16_t *buf[2],
 878                        const int16_t *ubuf[2], const int16_t *vbuf[2],
 879                        const int16_t *abuf[2], uint8_t *dest, int dstW,
 880                        int yalpha, int uvalpha, int y,
 881                        enum PixelFormat target)
 882 {
 883     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
 884                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 885                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 886     int  yalpha1 = 4095 - yalpha;
 887     int uvalpha1 = 4095 - uvalpha;
 888     int i;
 889
 890     for (i = 0; i < (dstW >> 1); i++) {
 891         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
 892         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
 893         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
 894         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
 895         const uint8_t *r = (const uint8_t *) c->table_rV[V],
 896                       *g = (const uint8_t *)(c->table_gU[U] + c->table_gV[V]),
 897                       *b = (const uint8_t *) c->table_bU[U];
 898
 899         dest[ 0] = dest[ 1] = r_b[Y1];
 900         dest[ 2] = dest[ 3] =   g[Y1];
 901         dest[ 4] = dest[ 5] = b_r[Y1];
 902         dest[ 6] = dest[ 7] = r_b[Y2];
 903         dest[ 8] = dest[ 9] =   g[Y2];
 904         dest[10] = dest[11] = b_r[Y2];
 905         dest += 12;
 906     }
 907 }
 908
 909 static av_always_inline void
 910 yuv2rgb48_1_c_template(SwsContext *c, const int16_t *buf0,
 911                        const int16_t *ubuf[2], const int16_t *vbuf[2],
 912                        const int16_t *abuf0, uint8_t *dest, int dstW,
 913                        int uvalpha, int y, enum PixelFormat target)
 914 {
 915     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 916                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 917     int i;
 918
 919     if (uvalpha < 2048) {
 920         for (i = 0; i < (dstW >> 1); i++) {
 921             int Y1 = buf0[i * 2]     >> 7;
 922             int Y2 = buf0[i * 2 + 1] >> 7;
 923             int U  = ubuf1[i]        >> 7;
 924             int V  = vbuf1[i]        >> 7;
 925             const uint8_t *r = (const uint8_t *) c->table_rV[V],
 926                           *g = (const uint8_t *)(c->table_gU[U] + c->table_gV[V]),
 927                           *b = (const uint8_t *) c->table_bU[U];
 928
 929             dest[ 0] = dest[ 1] = r_b[Y1];
 930             dest[ 2] = dest[ 3] =   g[Y1];
 931             dest[ 4] = dest[ 5] = b_r[Y1];
 932             dest[ 6] = dest[ 7] = r_b[Y2];
 933             dest[ 8] = dest[ 9] =   g[Y2];
 934             dest[10] = dest[11] = b_r[Y2];
 935             dest += 12;
 936         }
 937     } else {
 938         for (i = 0; i < (dstW >> 1); i++) {
 939             int Y1 =  buf0[i * 2]          >> 7;
 940             int Y2 =  buf0[i * 2 + 1]      >> 7;
 941             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
 942             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
 943             const uint8_t *r = (const uint8_t *) c->table_rV[V],
 944                           *g = (const uint8_t *)(c->table_gU[U] + c->table_gV[V]),
 945                           *b = (const uint8_t *) c->table_bU[U];
 946
 947             dest[ 0] = dest[ 1] = r_b[Y1];
 948             dest[ 2] = dest[ 3] =   g[Y1];
 949             dest[ 4] = dest[ 5] = b_r[Y1];
 950             dest[ 6] = dest[ 7] = r_b[Y2];
 951             dest[ 8] = dest[ 9] =   g[Y2];
 952             dest[10] = dest[11] = b_r[Y2];
 953             dest += 12;
 954         }
 955     }
 956 }
 957
/* The channel selectors are redefined locally by later code. */
#undef r_b
#undef b_r

/*
 * Instantiate the rgb48 templates. Only the big-endian RGB48/BGR48 variants
 * are generated; the little-endian instantiations are commented out.
 */
YUV2PACKEDWRAPPER(yuv2, rgb48, rgb48be, PIX_FMT_RGB48BE);
//YUV2PACKEDWRAPPER(yuv2, rgb48, rgb48le, PIX_FMT_RGB48LE);
YUV2PACKEDWRAPPER(yuv2, rgb48, bgr48be, PIX_FMT_BGR48BE);
//YUV2PACKEDWRAPPER(yuv2, rgb48, bgr48le, PIX_FMT_BGR48LE);
 965
 966 static av_always_inline void
 967 yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2,
 968               int U, int V, int A1, int A2,
 969               const void *_r, const void *_g, const void *_b, int y,
 970               enum PixelFormat target, int hasAlpha)
 971 {
 972     if (target == PIX_FMT_ARGB || target == PIX_FMT_RGBA ||
 973         target == PIX_FMT_ABGR || target == PIX_FMT_BGRA) {
 974         uint32_t *dest = (uint32_t *) _dest;
 975         const uint32_t *r = (const uint32_t *) _r;
 976         const uint32_t *g = (const uint32_t *) _g;
 977         const uint32_t *b = (const uint32_t *) _b;
 978
 979 #if CONFIG_SMALL
 980         int sh = hasAlpha ? ((target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24) : 0;
 981
 982         dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (hasAlpha ? A1 << sh : 0);
 983         dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (hasAlpha ? A2 << sh : 0);
 984 #else
 985         if (hasAlpha) {
 986             int sh = (target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24;
 987
 988             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (A1 << sh);
 989             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (A2 << sh);
 990         } else {
 991             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
 992             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
 993         }
 994 #endif
 995     } else if (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) {
 996         uint8_t *dest = (uint8_t *) _dest;
 997         const uint8_t *r = (const uint8_t *) _r;
 998         const uint8_t *g = (const uint8_t *) _g;
 999         const uint8_t *b = (const uint8_t *) _b;
1000
1001 #define r_b ((target == PIX_FMT_RGB24) ? r : b)
1002 #define b_r ((target == PIX_FMT_RGB24) ? b : r)
1003         dest[i * 6 + 0] = r_b[Y1];
1004         dest[i * 6 + 1] =   g[Y1];
1005         dest[i * 6 + 2] = b_r[Y1];
1006         dest[i * 6 + 3] = r_b[Y2];
1007         dest[i * 6 + 4] =   g[Y2];
1008         dest[i * 6 + 5] = b_r[Y2];
1009 #undef r_b
1010 #undef b_r
1011     } else if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565 ||
1012                target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555 ||
1013                target == PIX_FMT_RGB444 || target == PIX_FMT_BGR444) {
1014         uint16_t *dest = (uint16_t *) _dest;
1015         const uint16_t *r = (const uint16_t *) _r;
1016         const uint16_t *g = (const uint16_t *) _g;
1017         const uint16_t *b = (const uint16_t *) _b;
1018         int dr1, dg1, db1, dr2, dg2, db2;
1019
1020         if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565) {
1021             dr1 = dither_2x2_8[ y & 1     ][0];
1022             dg1 = dither_2x2_4[ y & 1     ][0];
1023             db1 = dither_2x2_8[(y & 1) ^ 1][0];
1024             dr2 = dither_2x2_8[ y & 1     ][1];
1025             dg2 = dither_2x2_4[ y & 1     ][1];
1026             db2 = dither_2x2_8[(y & 1) ^ 1][1];
1027         } else if (target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555) {
1028             dr1 = dither_2x2_8[ y & 1     ][0];
1029             dg1 = dither_2x2_8[ y & 1     ][1];
1030             db1 = dither_2x2_8[(y & 1) ^ 1][0];
1031             dr2 = dither_2x2_8[ y & 1     ][1];
1032             dg2 = dither_2x2_8[ y & 1     ][0];
1033             db2 = dither_2x2_8[(y & 1) ^ 1][1];
1034         } else {
1035             dr1 = dither_4x4_16[ y & 3     ][0];
1036             dg1 = dither_4x4_16[ y & 3     ][1];
1037             db1 = dither_4x4_16[(y & 3) ^ 3][0];
1038             dr2 = dither_4x4_16[ y & 3     ][1];
1039             dg2 = dither_4x4_16[ y & 3     ][0];
1040             db2 = dither_4x4_16[(y & 3) ^ 3][1];
1041         }
1042
1043         dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
1044         dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
1045     } else /* 8/4-bit */ {
1046         uint8_t *dest = (uint8_t *) _dest;
1047         const uint8_t *r = (const uint8_t *) _r;
1048         const uint8_t *g = (const uint8_t *) _g;
1049         const uint8_t *b = (const uint8_t *) _b;
1050         int dr1, dg1, db1, dr2, dg2, db2;
1051
1052         if (target == PIX_FMT_RGB8 || target == PIX_FMT_BGR8) {
1053             const uint8_t * const d64 = dither_8x8_73[y & 7];
1054             const uint8_t * const d32 = dither_8x8_32[y & 7];
1055             dr1 = dg1 = d32[(i * 2 + 0) & 7];
1056             db1 =       d64[(i * 2 + 0) & 7];
1057             dr2 = dg2 = d32[(i * 2 + 1) & 7];
1058             db2 =       d64[(i * 2 + 1) & 7];
1059         } else {
1060             const uint8_t * const d64  = dither_8x8_73 [y & 7];
1061             const uint8_t * const d128 = dither_8x8_220[y & 7];
1062             dr1 = db1 = d128[(i * 2 + 0) & 7];
1063             dg1 =        d64[(i * 2 + 0) & 7];
1064             dr2 = db2 = d128[(i * 2 + 1) & 7];
1065             dg2 =        d64[(i * 2 + 1) & 7];
1066         }
1067
1068         if (target == PIX_FMT_RGB4 || target == PIX_FMT_BGR4) {
1069             dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
1070                     ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
1071         } else {
1072             dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
1073             dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
1074         }
1075     }
1076 }
1077
1078 static av_always_inline void
1079 yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter,
1080                      const int16_t **lumSrc, int lumFilterSize,
1081                      const int16_t *chrFilter, const int16_t **chrUSrc,
1082                      const int16_t **chrVSrc, int chrFilterSize,
1083                      const int16_t **alpSrc, uint8_t *dest, int dstW,
1084                      int y, enum PixelFormat target, int hasAlpha)
1085 {
1086     int i;
1087
1088     for (i = 0; i < (dstW >> 1); i++) {
1089         int j;
1090         int Y1 = 1 << 18;
1091         int Y2 = 1 << 18;
1092         int U  = 1 << 18;
1093         int V  = 1 << 18;
1094         int av_unused A1, A2;
1095         const void *r, *g, *b;
1096
1097         for (j = 0; j < lumFilterSize; j++) {
1098             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
1099             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
1100         }
1101         for (j = 0; j < chrFilterSize; j++) {
1102             U += chrUSrc[j][i] * chrFilter[j];
1103             V += chrVSrc[j][i] * chrFilter[j];
1104         }
1105         Y1 >>= 19;
1106         Y2 >>= 19;
1107         U  >>= 19;
1108         V  >>= 19;
1109         if ((Y1 | Y2 | U | V) & 0x100) {
1110             Y1 = av_clip_uint8(Y1);
1111             Y2 = av_clip_uint8(Y2);
1112             U  = av_clip_uint8(U);
1113             V  = av_clip_uint8(V);
1114         }
1115         if (hasAlpha) {\
1116             A1 = 1 << 18;
1117             A2 = 1 << 18;
1118             for (j = 0; j < lumFilterSize; j++) {
1119                 A1 += alpSrc[j][i * 2    ] * lumFilter[j];
1120                 A2 += alpSrc[j][i * 2 + 1] * lumFilter[j];
1121             }
1122             A1 >>= 19;
1123             A2 >>= 19;
1124             if ((A1 | A2) & 0x100) {
1125                 A1 = av_clip_uint8(A1);
1126                 A2 = av_clip_uint8(A2);
1127             }
1128         }
1129
1130         /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/
1131         r =  c->table_rV[V];
1132         g = (c->table_gU[U] + c->table_gV[V]);
1133         b =  c->table_bU[U];
1134
1135         yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1136                       r, g, b, y, target, hasAlpha);
1137     }
1138 }
1139
1140 static av_always_inline void
1141 yuv2rgb_2_c_template(SwsContext *c, const int16_t *buf[2],
1142                      const int16_t *ubuf[2], const int16_t *vbuf[2],
1143                      const int16_t *abuf[2], uint8_t *dest, int dstW,
1144                      int yalpha, int uvalpha, int y,
1145                      enum PixelFormat target, int hasAlpha)
1146 {
1147     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
1148                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1149                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
1150                   *abuf0 = abuf[0], *abuf1 = abuf[1];
1151     int  yalpha1 = 4095 - yalpha;
1152     int uvalpha1 = 4095 - uvalpha;
1153     int i;
1154
1155     for (i = 0; i < (dstW >> 1); i++) {
1156         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
1157         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
1158         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
1159         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
1160         int A1, A2;
1161         const void *r =  c->table_rV[V],
1162                    *g = (c->table_gU[U] + c->table_gV[V]),
1163                    *b =  c->table_bU[U];
1164
1165         if (hasAlpha) {
1166             A1 = (abuf0[i * 2    ] * yalpha1 + abuf1[i * 2    ] * yalpha) >> 19;
1167             A2 = (abuf0[i * 2 + 1] * yalpha1 + abuf1[i * 2 + 1] * yalpha) >> 19;
1168         }
1169
1170         yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1171                       r, g, b, y, target, hasAlpha);
1172     }
1173 }
1174
1175 static av_always_inline void
1176 yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0,
1177                      const int16_t *ubuf[2], const int16_t *vbuf[2],
1178                      const int16_t *abuf0, uint8_t *dest, int dstW,
1179                      int uvalpha, int y, enum PixelFormat target,
1180                      int hasAlpha)
1181 {
1182     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1183                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
1184     int i;
1185
1186     if (uvalpha < 2048) {
1187         for (i = 0; i < (dstW >> 1); i++) {
1188             int Y1 = buf0[i * 2]     >> 7;
1189             int Y2 = buf0[i * 2 + 1] >> 7;
1190             int U  = ubuf1[i]        >> 7;
1191             int V  = vbuf1[i]        >> 7;
1192             int A1, A2;
1193             const void *r =  c->table_rV[V],
1194                        *g = (c->table_gU[U] + c->table_gV[V]),
1195                        *b =  c->table_bU[U];
1196
1197             if (hasAlpha) {
1198                 A1 = abuf0[i * 2    ] >> 7;
1199                 A2 = abuf0[i * 2 + 1] >> 7;
1200             }
1201
1202             yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1203                           r, g, b, y, target, hasAlpha);
1204         }
1205     } else {
1206         for (i = 0; i < (dstW >> 1); i++) {
1207             int Y1 =  buf0[i * 2]          >> 7;
1208             int Y2 =  buf0[i * 2 + 1]      >> 7;
1209             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
1210             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
1211             int A1, A2;
1212             const void *r =  c->table_rV[V],
1213                        *g = (c->table_gU[U] + c->table_gV[V]),
1214                        *b =  c->table_bU[U];
1215
1216             if (hasAlpha) {
1217                 A1 = abuf0[i * 2    ] >> 7;
1218                 A2 = abuf0[i * 2 + 1] >> 7;
1219             }
1220
1221             yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1222                           r, g, b, y, target, hasAlpha);
1223         }
1224     }
1225 }
1226
/*
 * YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha)
 *
 * Defines the static function <name><ext>_X_c, which forwards all of its
 * arguments to <name><base>_X_c_template and appends fmt and hasAlpha as
 * the trailing target/alpha arguments. hasAlpha is pasted as an expression
 * into the wrapper body, so it may refer to the wrapper's parameter c.
 */
#define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
                                const int16_t **lumSrc, int lumFilterSize, \
                                const int16_t *chrFilter, const int16_t **chrUSrc, \
                                const int16_t **chrVSrc, int chrFilterSize, \
                                const int16_t **alpSrc, uint8_t *dest, int dstW, \
                                int y) \
{ \
    name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
                                  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
                                  alpSrc, dest, dstW, y, fmt, hasAlpha); \
}
/*
 * YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha)
 *
 * Defines the _X_c wrapper via YUV2RGBWRAPPERX and additionally the
 * <name><ext>_2_c and <name><ext>_1_c wrappers, which forward to the
 * matching <name><base>_2_c_template / _1_c_template with fmt and hasAlpha
 * appended.
 */
#define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
                                const int16_t *ubuf[2], const int16_t *vbuf[2], \
                                const int16_t *abuf[2], uint8_t *dest, int dstW, \
                                int yalpha, int uvalpha, int y) \
{ \
    name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
                                  dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha); \
} \
 \
static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
                                const int16_t *ubuf[2], const int16_t *vbuf[2], \
                                const int16_t *abuf0, uint8_t *dest, int dstW, \
                                int uvalpha, int y) \
{ \
    name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
                                  dstW, uvalpha, y, fmt, hasAlpha); \
}
1258
/*
 * Instantiations of the table-based RGB writers (_X_c, _2_c and _1_c for
 * each line below).
 */
#if CONFIG_SMALL
/* one 32-bit variant each; alpha presence is tested at run time via c->alpPixBuf */
YUV2RGBWRAPPER(yuv2rgb,,  32_1,  PIX_FMT_RGB32_1,   CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
YUV2RGBWRAPPER(yuv2rgb,,  32,    PIX_FMT_RGB32,     CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
#else
#if CONFIG_SWSCALE_ALPHA
/* "a" variants: hasAlpha fixed to 1 */
YUV2RGBWRAPPER(yuv2rgb,, a32_1,  PIX_FMT_RGB32_1,   1);
YUV2RGBWRAPPER(yuv2rgb,, a32,    PIX_FMT_RGB32,     1);
#endif
/* "x" variants: hasAlpha fixed to 0 */
YUV2RGBWRAPPER(yuv2rgb,, x32_1,  PIX_FMT_RGB32_1,   0);
YUV2RGBWRAPPER(yuv2rgb,, x32,    PIX_FMT_RGB32,     0);
#endif
/* formats without an alpha channel */
YUV2RGBWRAPPER(yuv2, rgb, rgb24, PIX_FMT_RGB24,   0);
YUV2RGBWRAPPER(yuv2, rgb, bgr24, PIX_FMT_BGR24,   0);
YUV2RGBWRAPPER(yuv2rgb,,  16,    PIX_FMT_RGB565,    0);
YUV2RGBWRAPPER(yuv2rgb,,  15,    PIX_FMT_RGB555,    0);
YUV2RGBWRAPPER(yuv2rgb,,  12,    PIX_FMT_RGB444,    0);
YUV2RGBWRAPPER(yuv2rgb,,   8,    PIX_FMT_RGB8,      0);
YUV2RGBWRAPPER(yuv2rgb,,   4,    PIX_FMT_RGB4,      0);
YUV2RGBWRAPPER(yuv2rgb,,   4b,   PIX_FMT_RGB4_BYTE, 0);
1278
1279 static av_always_inline void
1280 yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
1281                           const int16_t **lumSrc, int lumFilterSize,
1282                           const int16_t *chrFilter, const int16_t **chrUSrc,
1283                           const int16_t **chrVSrc, int chrFilterSize,
1284                           const int16_t **alpSrc, uint8_t *dest,
1285                           int dstW, int y, enum PixelFormat target, int hasAlpha)
1286 {
1287     int i;
1288     int step = (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) ? 3 : 4;
1289
1290     for (i = 0; i < dstW; i++) {
1291         int j;
1292         int Y = 1<<9;
1293         int U = (1<<9)-(128 << 19);
1294         int V = (1<<9)-(128 << 19);
1295         int av_unused A;
1296         int R, G, B;
1297
1298         for (j = 0; j < lumFilterSize; j++) {
1299             Y += lumSrc[j][i] * lumFilter[j];
1300         }
1301         for (j = 0; j < chrFilterSize; j++) {
1302             U += chrUSrc[j][i] * chrFilter[j];
1303             V += chrVSrc[j][i] * chrFilter[j];
1304         }
1305         Y >>= 10;
1306         U >>= 10;
1307         V >>= 10;
1308         if (hasAlpha) {
1309             A = 1 << 18;
1310             for (j = 0; j < lumFilterSize; j++) {
1311                 A += alpSrc[j][i] * lumFilter[j];
1312             }
1313             A >>= 19;
1314             if (A & 0x100)
1315                 A = av_clip_uint8(A);
1316         }
1317         Y -= c->yuv2rgb_y_offset;
1318         Y *= c->yuv2rgb_y_coeff;
1319         Y += 1 << 21;
1320         R = Y + V*c->yuv2rgb_v2r_coeff;
1321         G = Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;
1322         B = Y +                          U*c->yuv2rgb_u2b_coeff;
1323         if ((R | G | B) & 0xC0000000) {
1324             R = av_clip_uintp2(R, 30);
1325             G = av_clip_uintp2(G, 30);
1326             B = av_clip_uintp2(B, 30);
1327         }
1328
1329         switch(target) {
1330         case PIX_FMT_ARGB:
1331             dest[0] = hasAlpha ? A : 255;
1332             dest[1] = R >> 22;
1333             dest[2] = G >> 22;
1334             dest[3] = B >> 22;
1335             break;
1336         case PIX_FMT_RGB24:
1337             dest[0] = R >> 22;
1338             dest[1] = G >> 22;
1339             dest[2] = B >> 22;
1340             break;
1341         case PIX_FMT_RGBA:
1342             dest[0] = R >> 22;
1343             dest[1] = G >> 22;
1344             dest[2] = B >> 22;
1345             dest[3] = hasAlpha ? A : 255;
1346             break;
1347         case PIX_FMT_ABGR:
1348             dest[0] = hasAlpha ? A : 255;
1349             dest[1] = B >> 22;
1350             dest[2] = G >> 22;
1351             dest[3] = R >> 22;
1352             break;
1353         case PIX_FMT_BGR24:
1354             dest[0] = B >> 22;
1355             dest[1] = G >> 22;
1356             dest[2] = R >> 22;
1357             break;
1358         case PIX_FMT_BGRA:
1359             dest[0] = B >> 22;
1360             dest[1] = G >> 22;
1361             dest[2] = R >> 22;
1362             dest[3] = hasAlpha ? A : 255;
1363             break;
1364         }
1365         dest += step;
1366     }
1367 }
1368
/*
 * Instantiations of yuv2rgb_full_X_c_template; only the _X_c entry point
 * exists for the full-chroma writers.
 */
#if CONFIG_SMALL
/* one function per byte order; alpha presence is tested at run time via c->alpPixBuf */
YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
#else
#if CONFIG_SWSCALE_ALPHA
/* variants with hasAlpha fixed to 1 */
YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  1);
YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  1);
YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  1);
YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  1);
#endif
/* variants with hasAlpha fixed to 0; the alpha byte is written as 255 */
YUV2RGBWRAPPERX(yuv2, rgb_full, bgrx32_full, PIX_FMT_BGRA,  0);
YUV2RGBWRAPPERX(yuv2, rgb_full, xbgr32_full, PIX_FMT_ABGR,  0);
YUV2RGBWRAPPERX(yuv2, rgb_full, rgbx32_full, PIX_FMT_RGBA,  0);
YUV2RGBWRAPPERX(yuv2, rgb_full, xrgb32_full, PIX_FMT_ARGB,  0);
#endif
/* 24-bit formats have no alpha byte */
YUV2RGBWRAPPERX(yuv2, rgb_full, bgr24_full,  PIX_FMT_BGR24, 0);
YUV2RGBWRAPPERX(yuv2, rgb_full, rgb24_full,  PIX_FMT_RGB24, 0);
1388
1389 static av_always_inline void fillPlane(uint8_t* plane, int stride,
1390                                        int width, int height,
1391                                        int y, uint8_t val)
1392 {
1393     int i;
1394     uint8_t *ptr = plane + stride*y;
1395     for (i=0; i<height; i++) {
1396         memset(ptr, val, width);
1397         ptr += stride;
1398     }
1399 }
1400
/* Read one 16-bit component at pos, using the byte order of origin. */
#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))

/*
 * In the rgb48 readers below, r and b are not variables: they select
 * between the locals r_b (first component in memory) and b_r (third
 * component), swapped when origin is one of the BGR48 formats.
 */
#define r ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? b_r : r_b)
#define b ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? r_b : b_r)
1405
1406 static av_always_inline void
1407 rgb48ToY_c_template(int16_t *dst, const uint16_t *src, int width,
1408                     enum PixelFormat origin)
1409 {
1410     int i;
1411     for (i = 0; i < width; i++) {
1412         int r_b = input_pixel(&src[i*3+0]);
1413         int   g = input_pixel(&src[i*3+1]);
1414         int b_r = input_pixel(&src[i*3+2]);
1415
1416         dst[i] = (RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1+8)) + (1<<(RGB2YUV_SHIFT-7+8))) >> (RGB2YUV_SHIFT-6+8);
1417     }
1418 }
1419
1420 static av_always_inline void
1421 rgb48ToUV_c_template(int16_t *dstU, int16_t *dstV,
1422                     const uint16_t *src1, const uint16_t *src2,
1423                     int width, enum PixelFormat origin)
1424 {
1425     int i;
1426     assert(src1==src2);
1427     for (i = 0; i < width; i++) {
1428         int r_b = input_pixel(&src1[i*3+0]);
1429         int   g = input_pixel(&src1[i*3+1]);
1430         int b_r = input_pixel(&src1[i*3+2]);
1431
1432         dstU[i] = (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1+8)) + (1<<(RGB2YUV_SHIFT-7+8))) >> (RGB2YUV_SHIFT-6+8);
1433         dstV[i] = (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1+8)) + (1<<(RGB2YUV_SHIFT-7+8))) >> (RGB2YUV_SHIFT-6+8);
1434     }
1435 }
1436
1437 static av_always_inline void
1438 rgb48ToUV_half_c_template(int16_t *dstU, int16_t *dstV,
1439                           const uint16_t *src1, const uint16_t *src2,
1440                           int width, enum PixelFormat origin)
1441 {
1442     int i;
1443     assert(src1==src2);
1444     for (i = 0; i < width; i++) {
1445         int r_b = (input_pixel(&src1[6*i + 0])) + (input_pixel(&src1[6*i + 3]));
1446         int   g = (input_pixel(&src1[6*i + 1])) + (input_pixel(&src1[6*i + 4]));
1447         int b_r = (input_pixel(&src1[6*i + 2])) + (input_pixel(&src1[6*i + 5]));
1448
1449         dstU[i]= (RU*r + GU*g + BU*b + (256U<<(RGB2YUV_SHIFT+8)) + (1<<(RGB2YUV_SHIFT-6+8))) >> (RGB2YUV_SHIFT-5+8);
1450         dstV[i]= (RV*r + GV*g + BV*b + (256U<<(RGB2YUV_SHIFT+8)) + (1<<(RGB2YUV_SHIFT-6+8))) >> (RGB2YUV_SHIFT-5+8);
1451     }
1452 }
1453
/* End of the rgb48 readers: drop their helper macros so r and b are plain identifiers again. */
#undef r
#undef b
#undef input_pixel
1457
/*
 * rgb48funcs(pattern, BE_LE, origin)
 *
 * Defines <pattern>48<BE_LE>ToY_c, ...ToUV_c and ...ToUV_half_c, which
 * forward to the rgb48To*_c_template functions with origin as the pixel
 * format. The wrappers take byte pointers while the templates work on
 * 16-bit samples, so the pointers are converted with explicit casts;
 * passing them unconverted is an incompatible-pointer-type constraint
 * violation that newer compilers reject. The unused parameter is ignored.
 */
#define rgb48funcs(pattern, BE_LE, origin) \
static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *dst, const uint8_t *src, \
                                    int width, uint32_t *unused) \
{ \
    rgb48ToY_c_template((int16_t *) dst, (const uint16_t *) src, \
                        width, origin); \
} \
 \
static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
                                    const uint8_t *src1, const uint8_t *src2, \
                                    int width, uint32_t *unused) \
{ \
    rgb48ToUV_c_template((int16_t *) dstU, (int16_t *) dstV, \
                         (const uint16_t *) src1, (const uint16_t *) src2, \
                         width, origin); \
} \
 \
static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
                                    const uint8_t *src1, const uint8_t *src2, \
                                    int width, uint32_t *unused) \
{ \
    rgb48ToUV_half_c_template((int16_t *) dstU, (int16_t *) dstV, \
                              (const uint16_t *) src1, (const uint16_t *) src2, \
                              width, origin); \
}
1478
/* Generate the ToY/ToUV/ToUV_half readers for each 48-bit component order and byte order. */
rgb48funcs(rgb, LE, PIX_FMT_RGB48LE);
rgb48funcs(rgb, BE, PIX_FMT_RGB48BE);
rgb48funcs(bgr, LE, PIX_FMT_BGR48LE);
rgb48funcs(bgr, BE, PIX_FMT_BGR48BE);
1483
/*
 * Read pixel number i from src: a native-endian aligned 32-bit load for the
 * four 32-bit RGBA/BGRA/ARGB/ABGR origins, otherwise a 16-bit load using
 * the byte order of origin. Relies on src and origin being in scope.
 */
#define input_pixel(i) ((origin == PIX_FMT_RGBA || origin == PIX_FMT_BGRA || \
                         origin == PIX_FMT_ARGB || origin == PIX_FMT_ABGR) ? AV_RN32A(&src[(i)*4]) : \
                        (isBE(origin) ? AV_RB16(&src[(i)*2]) : AV_RL16(&src[(i)*2])))
1487
1488 static av_always_inline void
1489 rgb16_32ToY_c_template(int16_t *dst, const uint8_t *src,
1490                        int width, enum PixelFormat origin,
1491                        int shr,   int shg,   int shb, int shp,
1492                        int maskr, int maskg, int maskb,
1493                        int rsh,   int gsh,   int bsh, int S)
1494 {
1495     const int ry = RY << rsh, gy = GY << gsh, by = BY << bsh,
1496               rnd = (32<<((S)-1)) + (1<<(S-7));
1497     int i;
1498
1499     for (i = 0; i < width; i++) {
1500         int px = input_pixel(i) >> shp;
1501         int b = (px & maskb) >> shb;
1502         int g = (px & maskg) >> shg;
1503         int r = (px & maskr) >> shr;
1504
1505         dst[i] = (ry * r + gy * g + by * b + rnd) >> ((S)-6);
1506     }
1507 }
1508
1509 static av_always_inline void
1510 rgb16_32ToUV_c_template(int16_t *dstU, int16_t *dstV,
1511                         const uint8_t *src, int width,
1512                         enum PixelFormat origin,
1513                         int shr,   int shg,   int shb, int shp,
1514                         int maskr, int maskg, int maskb,
1515                         int rsh,   int gsh,   int bsh, int S)
1516 {
1517     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1518               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
1519               rnd = (256<<((S)-1)) + (1<<(S-7));
1520     int i;
1521
1522     for (i = 0; i < width; i++) {
1523         int px = input_pixel(i) >> shp;
1524         int b = (px & maskb) >> shb;
1525         int g = (px & maskg) >> shg;
1526         int r = (px & maskr) >> shr;
1527
1528         dstU[i] = (ru * r + gu * g + bu * b + rnd) >> ((S)-6);
1529         dstV[i] = (rv * r + gv * g + bv * b + rnd) >> ((S)-6);
1530     }
1531 }
1532
1533 static av_always_inline void
1534 rgb16_32ToUV_half_c_template(int16_t *dstU, int16_t *dstV,
1535                              const uint8_t *src, int width,
1536                              enum PixelFormat origin,
1537                              int shr,   int shg,   int shb, int shp,
1538                              int maskr, int maskg, int maskb,
1539                              int rsh,   int gsh,   int bsh, int S)
1540 {
1541     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1542               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
1543               rnd = (256U<<(S)) + (1<<(S-6)), maskgx = ~(maskr | maskb);
1544     int i;
1545
1546     maskr |= maskr << 1; maskb |= maskb << 1; maskg |= maskg << 1;
1547     for (i = 0; i < width; i++) {
1548         int px0 = input_pixel(2 * i + 0) >> shp;
1549         int px1 = input_pixel(2 * i + 1) >> shp;
1550         int b, r, g = (px0 & maskgx) + (px1 & maskgx);
1551         int rb = px0 + px1 - g;
1552
1553         b = (rb & maskb) >> shb;
1554         if (shp || origin == PIX_FMT_BGR565LE || origin == PIX_FMT_BGR565BE ||
1555             origin == PIX_FMT_RGB565LE || origin == PIX_FMT_RGB565BE) {
1556             g >>= shg;
1557         } else {
1558             g = (g  & maskg) >> shg;
1559         }
1560         r = (rb & maskr) >> shr;
1561
1562         dstU[i] = (ru * r + gu * g + bu * b + (unsigned)rnd) >> ((S)-6+1);
1563         dstV[i] = (rv * r + gv * g + bv * b + (unsigned)rnd) >> ((S)-6+1);
1564     }
1565 }
1566
1567 #undef input_pixel
1568
/*
 * Generate the three input readers for one packed RGB layout:
 * name##ToY_c, name##ToUV_c and name##ToUV_half_c. Each of them forwards to
 * the matching rgb16_32To*_c_template together with the layout constants
 * given to the macro (format, component shifts and masks, coefficient
 * shifts and scale S).
 *
 * The generated functions take uint8_t destination pointers, whereas the
 * templates store int16_t samples. The pointers are therefore converted
 * with an explicit cast; passing them unconverted is an implicit conversion
 * between incompatible pointer types, which compilers diagnose (and newer
 * ones reject).
 */
#define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr, \
                         maskg, maskb, rsh, gsh, bsh, S) \
static void name ## ToY_c(uint8_t *dst, const uint8_t *src, \
                          int width, uint32_t *unused) \
{ \
    rgb16_32ToY_c_template((int16_t *)dst, src, width, fmt, shr, shg, shb, shp, \
                           maskr, maskg, maskb, rsh, gsh, bsh, S); \
} \
 \
static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
                           const uint8_t *src, const uint8_t *dummy, \
                           int width, uint32_t *unused) \
{ \
    rgb16_32ToUV_c_template((int16_t *)dstU, (int16_t *)dstV, src, width, fmt, \
                            shr, shg, shb, shp, \
                            maskr, maskg, maskb, rsh, gsh, bsh, S); \
} \
 \
static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
                                const uint8_t *src, const uint8_t *dummy, \
                                int width, uint32_t *unused) \
{ \
    rgb16_32ToUV_half_c_template((int16_t *)dstU, (int16_t *)dstV, src, width, fmt, \
                                 shr, shg, shb, shp, \
                                 maskr, maskg, maskb, rsh, gsh, bsh, S); \
}
1593
1594 rgb16_32_wrapper(PIX_FMT_BGR32,    bgr32,  16, 0,  0, 0, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8);
1595 rgb16_32_wrapper(PIX_FMT_BGR32_1,  bgr321, 16, 0,  0, 8, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8);
1596 rgb16_32_wrapper(PIX_FMT_RGB32,    rgb32,   0, 0, 16, 0,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8);
1597 rgb16_32_wrapper(PIX_FMT_RGB32_1,  rgb321,  0, 0, 16, 8,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8);
1598 rgb16_32_wrapper(PIX_FMT_BGR565LE, bgr16le, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8);
1599 rgb16_32_wrapper(PIX_FMT_BGR555LE, bgr15le, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7);
1600 rgb16_32_wrapper(PIX_FMT_RGB565LE, rgb16le, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8);
1601 rgb16_32_wrapper(PIX_FMT_RGB555LE, rgb15le, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7);
1602 rgb16_32_wrapper(PIX_FMT_BGR565BE, bgr16be, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8);
1603 rgb16_32_wrapper(PIX_FMT_BGR555BE, bgr15be, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7);
1604 rgb16_32_wrapper(PIX_FMT_RGB565BE, rgb16be, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8);
1605 rgb16_32_wrapper(PIX_FMT_RGB555BE, rgb15be, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7);
1606
/**
 * Extract the alpha samples of a row of 4-byte pixels whose alpha is the
 * first byte of each pixel.
 *
 * @param dst    output row; receives the alpha byte shifted left by 6
 * @param src    input row, 4 bytes per pixel
 * @param width  number of pixels
 * @param unused ignored
 */
static void abgrToA_c(int16_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    int x;

    for (x = 0; x < width; x++) {
        const uint8_t *pixel = src + 4 * x;
        dst[x] = pixel[0] << 6;
    }
}
1614
/**
 * Extract the alpha samples of a row of 4-byte pixels whose alpha is the
 * last byte of each pixel.
 *
 * @param dst    output row; receives the alpha byte shifted left by 6
 * @param src    input row, 4 bytes per pixel
 * @param width  number of pixels
 * @param unused ignored
 */
static void rgbaToA_c(int16_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    int x;

    for (x = 0; x < width; x++) {
        const uint8_t *pixel = src + 4 * x;
        dst[x] = pixel[3] << 6;
    }
}
1622
/**
 * Look up alpha for a row of palette indices.
 *
 * @param dst   output row; receives bits 24-31 of the palette entry,
 *              shifted left by 6
 * @param src   input row, one palette index per pixel
 * @param width number of pixels
 * @param pal   palette, indexed by the bytes of src
 */
static void palToA_c(int16_t *dst, const uint8_t *src, int width, uint32_t *pal)
{
    int x;

    for (x = 0; x < width; x++) {
        const uint32_t entry = pal[src[x]];
        dst[x] = (entry >> 24) << 6;
    }
}
1632
/**
 * Look up luma for a row of palette indices.
 *
 * The width parameter is an int, like in every other reader in this file,
 * so that the function matches the int-width function pointer type it is
 * called through; calling a function taking `long` through such a pointer
 * is undefined behaviour.
 *
 * @param dst   output row; receives bits 0-7 of the palette entry,
 *              shifted left by 6
 * @param src   input row, one palette index per pixel
 * @param width number of pixels
 * @param pal   palette, indexed by the bytes of src
 */
static void palToY_c(int16_t *dst, const uint8_t *src, int width, uint32_t *pal)
{
    int i;
    for (i = 0; i < width; i++) {
        int d = src[i];

        dst[i] = (pal[d] & 0xFF) << 6;
    }
}
1642
/**
 * Look up chroma for a row of palette indices.
 *
 * Both outputs are int16_t rows, like in the other chroma readers of this
 * file. The palette entry is kept unsigned so that the right shifts never
 * operate on a negative value.
 *
 * @param dstU  output row; receives bits 8-15 of the palette entry,
 *              shifted left by 6
 * @param dstV  output row; receives bits 16-23 of the palette entry,
 *              shifted left by 6
 * @param src1  input row, one palette index per pixel
 * @param src2  must be equal to src1 (checked with assert)
 * @param width number of pixels
 * @param pal   palette, indexed by the bytes of src1
 */
static void palToUV_c(int16_t *dstU, int16_t *dstV,
                      const uint8_t *src1, const uint8_t *src2,
                      int width, uint32_t *pal)
{
    int i;
    assert(src1 == src2);
    for (i = 0; i < width; i++) {
        const uint32_t p = pal[src1[i]];

        dstU[i] = (uint8_t)(p >>  8) << 6;
        dstV[i] = (uint8_t)(p >> 16) << 6;
    }
}
1656
/**
 * Expand a row of 1-bit pixels, most significant bit first, to luma
 * samples. The source bits are inverted first: a 0 bit yields 16383 and a
 * 1 bit yields 0.
 *
 * @param dst    output row, one int16_t per pixel
 * @param src    input row, 8 pixels per byte
 * @param width  number of pixels; a trailing partial byte is handled
 * @param unused ignored
 */
static void monowhite2Y_c(int16_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    const int whole_bytes = width / 8;
    const int tail_bits   = width & 7;
    int byte, bit;

    for (byte = 0; byte < whole_bytes; byte++) {
        const int inverted = ~src[byte];
        for (bit = 0; bit < 8; bit++)
            dst[8 * byte + bit] = ((inverted >> (7 - bit)) & 1) * 16383;
    }
    if (tail_bits) {
        const int inverted = ~src[byte];
        for (bit = 0; bit < tail_bits; bit++)
            dst[8 * byte + bit] = ((inverted >> (7 - bit)) & 1) * 16383;
    }
}
1671
/**
 * Expand a row of 1-bit pixels, most significant bit first, to luma
 * samples: a 1 bit yields 16383 and a 0 bit yields 0.
 *
 * @param dst    output row, one int16_t per pixel
 * @param src    input row, 8 pixels per byte
 * @param width  number of pixels; a trailing partial byte is handled
 * @param unused ignored
 */
static void monoblack2Y_c(int16_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    const int whole_bytes = width / 8;
    const int tail_bits   = width & 7;
    int byte, bit;

    for (byte = 0; byte < whole_bytes; byte++) {
        const int bits = src[byte];
        for (bit = 0; bit < 8; bit++)
            dst[8 * byte + bit] = ((bits >> (7 - bit)) & 1) * 16383;
    }
    if (tail_bits) {
        const int bits = src[byte];
        for (bit = 0; bit < tail_bits; bit++)
            dst[8 * byte + bit] = ((bits >> (7 - bit)) & 1) * 16383;
    }
}
1686
1687 //FIXME yuy2* can read up to 7 samples too much
1688
/**
 * Copy every second byte of src, starting at offset 0, to dst.
 *
 * @param dst    output row, `width` bytes
 * @param src    input row, 2 bytes per output byte
 * @param width  number of output bytes
 * @param unused ignored
 */
static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    int x;

    for (x = 0; x < width; x++) {
        const int even = 2 * x;
        dst[x] = src[even];
    }
}
1696
/**
 * Split the chroma bytes out of a row of 4-byte groups: byte 1 of each
 * group goes to dstU and byte 3 to dstV.
 *
 * @param dstU,dstV output rows, `width` bytes each
 * @param src1      input row, 4 bytes per output pair
 * @param src2      must be equal to src1 (checked with assert after the copy)
 * @param width     number of output pairs
 * @param unused    ignored
 */
static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    int x;

    for (x = 0; x < width; x++) {
        const uint8_t *group = src1 + 4 * x;
        dstU[x] = group[1];
        dstV[x] = group[3];
    }
    assert(src1 == src2);
}
1707
/**
 * Take the second byte of every 2-byte sample from two separate rows:
 * src1 feeds dstU and src2 feeds dstV.
 *
 * @param dstU,dstV output rows, `width` bytes each
 * @param src1,src2 input rows, 2 bytes per sample
 * @param width     number of samples
 * @param unused    ignored
 */
static void LEToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                     const uint8_t *src2, int width, uint32_t *unused)
{
    int x;

    for (x = 0; x < width; x++) {
        const int odd = 2 * x + 1;
        dstU[x] = src1[odd];
        dstV[x] = src2[odd];
    }
}
1717
/* This is almost identical to the previous, and exists only because
 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
/**
 * Copy every second byte of src, starting at offset 1, to dst.
 *
 * @param dst    output row, `width` bytes
 * @param src    input row, 2 bytes per output byte
 * @param width  number of output bytes
 * @param unused ignored
 */
static void uyvyToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    int x;

    for (x = 0; x < width; x++) {
        const int odd = 2 * x + 1;
        dst[x] = src[odd];
    }
}
1727
/**
 * Split the chroma bytes out of a row of 4-byte groups: byte 0 of each
 * group goes to dstU and byte 2 to dstV.
 *
 * @param dstU,dstV output rows, `width` bytes each
 * @param src1      input row, 4 bytes per output pair
 * @param src2      must be equal to src1 (checked with assert after the copy)
 * @param width     number of output pairs
 * @param unused    ignored
 */
static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    int x;

    for (x = 0; x < width; x++) {
        const uint8_t *group = src1 + 4 * x;
        dstU[x] = group[0];
        dstV[x] = group[2];
    }
    assert(src1 == src2);
}
1738
/**
 * Take the first byte of every 2-byte sample from two separate rows:
 * src1 feeds dstU and src2 feeds dstV.
 *
 * @param dstU,dstV output rows, `width` bytes each
 * @param src1,src2 input rows, 2 bytes per sample
 * @param width     number of samples
 * @param unused    ignored
 */
static void BEToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                     const uint8_t *src2, int width, uint32_t *unused)
{
    int x;

    for (x = 0; x < width; x++) {
        const int even = 2 * x;
        dstU[x] = src1[even];
        dstV[x] = src2[even];
    }
}
1748
1749 static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
1750                                         const uint8_t *src, int width)
1751 {
1752     int i;
1753     for (i = 0; i < width; i++) {
1754         dst1[i] = src[2*i+0];
1755         dst2[i] = src[2*i+1];
1756     }
1757 }
1758
/**
 * De-interleave the byte pairs of src1 with nvXXtoUV_c(): first byte of
 * each pair to dstU, second byte to dstV. src2 and unused are ignored.
 */
static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
                       const uint8_t *src1, const uint8_t *src2,
                       int width, uint32_t *unused)
{
    uint8_t *const first_of_pair  = dstU;
    uint8_t *const second_of_pair = dstV;

    nvXXtoUV_c(first_of_pair, second_of_pair, src1, width);
}
1765
/**
 * De-interleave the byte pairs of src1 with nvXXtoUV_c(), with the outputs
 * swapped: first byte of each pair to dstV, second byte to dstU.
 * src2 and unused are ignored.
 */
static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
                       const uint8_t *src1, const uint8_t *src2,
                       int width, uint32_t *unused)
{
    uint8_t *const first_of_pair  = dstV;
    uint8_t *const second_of_pair = dstU;

    nvXXtoUV_c(first_of_pair, second_of_pair, src1, width);
}
1772
/*
 * Read a 16-bit value at `pos`: AV_RB16 when isBE(origin) is true,
 * AV_RL16 otherwise. Needs `origin` in scope where it is used.
 * NOTE(review): the functions directly below do not use this macro --
 * confirm whether it is still needed.
 */
#define input_pixel(pos) \
    (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1774
1775 static void bgr24ToY_c(int16_t *dst, const uint8_t *src,
1776                        int width, uint32_t *unused)
1777 {
1778     int i;
1779     for (i=0; i<width; i++) {
1780         int b= src[i*3+0];
1781         int g= src[i*3+1];
1782         int r= src[i*3+2];
1783
1784         dst[i]= ((RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
1785     }
1786 }
1787
1788 static void bgr24ToUV_c(int16_t *dstU, int16_t *dstV, const uint8_t *src1,
1789                         const uint8_t *src2, int width, uint32_t *unused)
1790 {
1791     int i;
1792     for (i=0; i<width; i++) {
1793         int b= src1[3*i + 0];
1794         int g= src1[3*i + 1];
1795         int r= src1[3*i + 2];
1796
1797         dstU[i]= (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1798         dstV[i]= (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1799     }
1800     assert(src1 == src2);
1801 }
1802
1803 static void bgr24ToUV_half_c(int16_t *dstU, int16_t *dstV, const uint8_t *src1,
1804                              const uint8_t *src2, int width, uint32_t *unused)
1805 {
1806     int i;
1807     for (i=0; i<width; i++) {
1808         int b= src1[6*i + 0] + src1[6*i + 3];
1809         int g= src1[6*i + 1] + src1[6*i + 4];
1810         int r= src1[6*i + 2] + src1[6*i + 5];
1811
1812         dstU[i]= (RU*r + GU*g + BU*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1813         dstV[i]= (RV*r + GV*g + BV*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1814     }
1815     assert(src1 == src2);
1816 }
1817
1818 static void rgb24ToY_c(int16_t *dst, const uint8_t *src, int width,
1819                        uint32_t *unused)
1820 {
1821     int i;
1822     for (i=0; i<width; i++) {
1823         int r= src[i*3+0];
1824         int g= src[i*3+1];
1825         int b= src[i*3+2];
1826
1827         dst[i]= ((RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
1828     }
1829 }
1830
1831 static void rgb24ToUV_c(int16_t *dstU, int16_t *dstV, const uint8_t *src1,
1832                         const uint8_t *src2, int width, uint32_t *unused)
1833 {
1834     int i;
1835     assert(src1==src2);
1836     for (i=0; i<width; i++) {
1837         int r= src1[3*i + 0];
1838         int g= src1[3*i + 1];
1839         int b= src1[3*i + 2];
1840
1841         dstU[i]= (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1842         dstV[i]= (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1843     }
1844 }
1845
1846 static void rgb24ToUV_half_c(int16_t *dstU, int16_t *dstV, const uint8_t *src1,
1847                                     const uint8_t *src2, int width, uint32_t *unused)
1848 {
1849     int i;
1850     assert(src1==src2);
1851     for (i=0; i<width; i++) {
1852         int r= src1[6*i + 0] + src1[6*i + 3];
1853         int g= src1[6*i + 1] + src1[6*i + 4];
1854         int b= src1[6*i + 2] + src1[6*i + 5];
1855
1856         dstU[i]= (RU*r + GU*g + BU*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1857         dstV[i]= (RV*r + GV*g + BV*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1858     }
1859 }
1860
1861 // bilinear / bicubic scaling
/**
 * Horizontally filter a row of 8-bit samples.
 *
 * For every output sample i, filterSize input samples starting at
 * filterPos[i] are multiplied with the coefficients
 * filter[filterSize*i .. filterSize*i + filterSize-1] and summed. The sum
 * is shifted right by 7 and limited to at most (1<<15)-1, since the
 * filter can overshoot.
 *
 * @param dst        output row, dstW samples
 * @param dstW       number of output samples
 * @param src        input row
 * @param filter     coefficient table, filterSize entries per output sample
 * @param filterPos  first input index for each output sample
 * @param filterSize number of taps
 */
static void hScale_c(int16_t *dst, int dstW, const uint8_t *src,
                     const int16_t *filter, const int16_t *filterPos,
                     int filterSize)
{
    int out;

    for (out = 0; out < dstW; out++) {
        const int first_in   = filterPos[out];
        const int first_coef = filterSize * out;
        int sum = 0;
        int tap;

        for (tap = 0; tap < filterSize; tap++)
            sum += ((int)src[first_in + tap]) * filter[first_coef + tap];

        dst[out] = FFMIN(sum >> 7, (1<<15)-1);
    }
}
1879
/**
 * Horizontally filter a row of 16-bit samples.
 *
 * Same accumulation as the 8-bit scaler, but the sum is shifted right by
 * the caller-supplied `shift` before being limited to at most (1<<15)-1.
 * srcW and xInc are not used by this implementation.
 *
 * @param dst        output row, dstW samples
 * @param dstW       number of output samples
 * @param src        input row of 16-bit samples
 * @param filter     coefficient table, filterSize entries per output sample
 * @param filterPos  first input index for each output sample
 * @param filterSize number of taps
 * @param shift      right shift applied to the accumulated sum
 */
static inline void hScale16_c(int16_t *dst, int dstW, const uint16_t *src, int srcW, int xInc,
                              const int16_t *filter, const int16_t *filterPos, long filterSize, int shift)
{
    int out;

    for (out = 0; out < dstW; out++) {
        const int  first_in   = filterPos[out];
        const long first_coef = filterSize * out;
        int sum = 0;
        int tap;

        for (tap = 0; tap < filterSize; tap++)
            sum += ((int)src[first_in + tap]) * filter[first_coef + tap];

        dst[out] = FFMIN(sum >> shift, (1<<15)-1);
    }
}
1894
/**
 * Horizontally filter a row of 16-bit samples, passing every input sample
 * through av_bswap16() before it is multiplied.
 *
 * The sum is shifted right by `shift` and limited to at most (1<<15)-1.
 * srcW and xInc are not used by this implementation.
 *
 * @param dst        output row, dstW samples
 * @param dstW       number of output samples
 * @param src        input row of 16-bit samples
 * @param filter     coefficient table, filterSize entries per output sample
 * @param filterPos  first input index for each output sample
 * @param filterSize number of taps
 * @param shift      right shift applied to the accumulated sum
 */
static inline void hScale16X_c(int16_t *dst, int dstW, const uint16_t *src, int srcW, int xInc,
                               const int16_t *filter, const int16_t *filterPos, long filterSize, int shift)
{
    int out;

    for (out = 0; out < dstW; out++) {
        const int  first_in   = filterPos[out];
        const long first_coef = filterSize * out;
        int sum = 0;
        int tap;

        for (tap = 0; tap < filterSize; tap++)
            sum += ((int)av_bswap16(src[first_in + tap])) * filter[first_coef + tap];

        dst[out] = FFMIN(sum >> shift, (1<<15)-1);
    }
}
1908
//FIXME all pal and rgb srcFormats could do this conversion as well
1910 //FIXME all scalers more complex than bilinear could do half of this transform
/**
 * Rescale both chroma rows in place: each sample is first limited to at
 * most 30775, then mapped to (v*4663 - 9289992) >> 12.
 *
 * @param dstU,dstV rows to convert in place
 * @param width     number of samples per row
 */
static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int x;

    for (x = 0; x < width; x++) {
        const int u = FFMIN(dstU[x],30775);
        const int v = FFMIN(dstV[x],30775);

        dstU[x] = (u*4663 - 9289992)>>12; //-264
        dstV[x] = (v*4663 - 9289992)>>12; //-264
    }
}
/**
 * Rescale both chroma rows in place: each sample v becomes
 * (v*1799 + 4081085) >> 11.
 *
 * @param dstU,dstV rows to convert in place
 * @param width     number of samples per row
 */
static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int x;

    for (x = 0; x < width; x++) {
        const int u = dstU[x];
        const int v = dstV[x];

        dstU[x] = (u*1799 + 4081085)>>11; //1469
        dstV[x] = (v*1799 + 4081085)>>11; //1469
    }
}
/**
 * Rescale a luma row in place: each sample is first limited to at most
 * 30189, then mapped to (v*19077 - 39057361) >> 14.
 *
 * @param dst   row to convert in place
 * @param width number of samples
 */
static void lumRangeToJpeg_c(int16_t *dst, int width)
{
    int x;

    for (x = 0; x < width; x++) {
        const int y = FFMIN(dst[x],30189);
        dst[x] = (y*19077 - 39057361)>>14;
    }
}
/**
 * Rescale a luma row in place: each sample v becomes
 * (v*14071 + 33561947) >> 14.
 *
 * @param dst   row to convert in place
 * @param width number of samples
 */
static void lumRangeFromJpeg_c(int16_t *dst, int width)
{
    int x;

    for (x = 0; x < width; x++) {
        const int y = dst[x];
        dst[x] = (y*14071 + 33561947)>>14;
    }
}
1939
1940 static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
1941                            const uint8_t *src, int srcW, int xInc)
1942 {
1943     int i;
1944     unsigned int xpos=0;
1945     for (i=0;i<dstWidth;i++) {
1946         register unsigned int xx=xpos>>16;
1947         register unsigned int xalpha=(xpos&0xFFFF)>>9;
1948         dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
1949         xpos+=xInc;
1950     }
1951     for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
1952         dst[i] = src[srcW-1]*128;
1953 }
1954
1955 // *** horizontal scale Y line to temp buffer
1956 static av_always_inline void hyscale(SwsContext *c, uint16_t *dst, int dstWidth,
1957                                      const uint8_t *src, int srcW, int xInc,
1958                                      const int16_t *hLumFilter,
1959                                      const int16_t *hLumFilterPos, int hLumFilterSize,
1960                                      uint8_t *formatConvBuffer,
1961                                      uint32_t *pal, int isAlpha)
1962 {
1963     void (*toYV12)(uint8_t *, const uint8_t *, int, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
1964     void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
1965
1966     if (toYV12) {
1967         toYV12(formatConvBuffer, src, srcW, pal);
1968         src= formatConvBuffer;
1969     }
1970
1971     if (c->hScale16) {
1972         int shift= isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8 ? 13 : av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
1973         c->hScale16(dst, dstWidth, (const uint16_t*)src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize, shift);
1974     } else if (!c->hyscale_fast) {
1975         c->hScale(dst, dstWidth, src, hLumFilter, hLumFilterPos, hLumFilterSize);
1976     } else { // fast bilinear upscale / crap downscale
1977         c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
1978     }
1979
1980     if (convertRange)
1981         convertRange(dst, dstWidth);
1982 }
1983
1984 static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
1985                            int dstWidth, const uint8_t *src1,
1986                            const uint8_t *src2, int srcW, int xInc)
1987 {
1988     int i;
1989     unsigned int xpos=0;
1990     for (i=0;i<dstWidth;i++) {
1991         register unsigned int xx=xpos>>16;
1992         register unsigned int xalpha=(xpos&0xFFFF)>>9;
1993         dst1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
1994         dst2[i]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
1995         xpos+=xInc;
1996     }
1997     for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
1998         dst1[i] = src1[srcW-1]*128;
1999         dst2[i] = src2[srcW-1]*128;
2000     }
2001 }
2002
2003 static av_always_inline void hcscale(SwsContext *c, uint16_t *dst1, uint16_t *dst2, int dstWidth,
2004                                      const uint8_t *src1, const uint8_t *src2,
2005                                      int srcW, int xInc, const int16_t *hChrFilter,
2006                                      const int16_t *hChrFilterPos, int hChrFilterSize,
2007                                      uint8_t *formatConvBuffer, uint32_t *pal)
2008 {
2009     if (c->chrToYV12) {
2010         uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW*2+78, 16);
2011         c->chrToYV12(formatConvBuffer, buf2, src1, src2, srcW, pal);
2012         src1= formatConvBuffer;
2013         src2= buf2;
2014     }
2015
2016     if (c->hScale16) {
2017         int shift= isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8 ? 13 : av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2018         c->hScale16(dst1, dstWidth, (const uint16_t*)src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize, shift);
2019         c->hScale16(dst2, dstWidth, (const uint16_t*)src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize, shift);
2020     } else if (!c->hcscale_fast) {
2021         c->hScale(dst1, dstWidth, src1, hChrFilter, hChrFilterPos, hChrFilterSize);
2022         c->hScale(dst2, dstWidth, src2, hChrFilter, hChrFilterPos, hChrFilterSize);
2023     } else { // fast bilinear upscale / crap downscale
2024         c->hcscale_fast(c, dst1, dst2, dstWidth, src1, src2, srcW, xInc);
2025     }
2026
2027     if (c->chrConvertRange)
2028         c->chrConvertRange(dst1, dst2, dstWidth);
2029 }
2030
2031 static av_always_inline void
2032 find_c_packed_planar_out_funcs(SwsContext *c,
2033                                yuv2planar1_fn *yuv2yuv1,    yuv2planarX_fn *yuv2yuvX,
2034                                yuv2packed1_fn *yuv2packed1, yuv2packed2_fn *yuv2packed2,
2035                                yuv2packedX_fn *yuv2packedX)
2036 {
2037     enum PixelFormat dstFormat = c->dstFormat;
2038
2039     if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2040         *yuv2yuvX     = yuv2nv12X_c;
2041     } else if (is16BPS(dstFormat)) {
2042         *yuv2yuvX     = isBE(dstFormat) ? yuv2yuvX16BE_c  : yuv2yuvX16LE_c;
2043     } else if (is9_OR_10BPS(dstFormat)) {
2044         if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) {
2045             *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX9BE_c :  yuv2yuvX9LE_c;
2046         } else {
2047             *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX10BE_c : yuv2yuvX10LE_c;
2048         }
2049     } else {
2050         *yuv2yuv1     = yuv2yuv1_c;
2051         *yuv2yuvX     = yuv2yuvX_c;
2052     }
2053     if(c->flags & SWS_FULL_CHR_H_INT) {
2054         switch (dstFormat) {
2055             case PIX_FMT_RGBA:
2056 #if CONFIG_SMALL
2057                 *yuv2packedX = yuv2rgba32_full_X_c;
2058 #else
2059 #if CONFIG_SWSCALE_ALPHA
2060                 if (c->alpPixBuf) {
2061                     *yuv2packedX = yuv2rgba32_full_X_c;
2062                 } else
2063 #endif /* CONFIG_SWSCALE_ALPHA */
2064                 {
2065                     *yuv2packedX = yuv2rgbx32_full_X_c;
2066                 }
2067 #endif /* !CONFIG_SMALL */
2068                 break;
2069             case PIX_FMT_ARGB:
2070 #if CONFIG_SMALL
2071                 *yuv2packedX = yuv2argb32_full_X_c;
2072 #else
2073 #if CONFIG_SWSCALE_ALPHA
2074                 if (c->alpPixBuf) {
2075                     *yuv2packedX = yuv2argb32_full_X_c;
2076                 } else
2077 #endif /* CONFIG_SWSCALE_ALPHA */
2078                 {
2079                     *yuv2packedX = yuv2xrgb32_full_X_c;
2080                 }
2081 #endif /* !CONFIG_SMALL */
2082                 break;
2083             case PIX_FMT_BGRA:
2084 #if CONFIG_SMALL
2085                 *yuv2packedX = yuv2bgra32_full_X_c;
2086 #else
2087 #if CONFIG_SWSCALE_ALPHA
2088                 if (c->alpPixBuf) {
2089                     *yuv2packedX = yuv2bgra32_full_X_c;
2090                 } else
2091 #endif /* CONFIG_SWSCALE_ALPHA */
2092                 {
2093                     *yuv2packedX = yuv2bgrx32_full_X_c;
2094                 }
2095 #endif /* !CONFIG_SMALL */
2096                 break;
2097             case PIX_FMT_ABGR:
2098 #if CONFIG_SMALL
2099                 *yuv2packedX = yuv2abgr32_full_X_c;
2100 #else
2101 #if CONFIG_SWSCALE_ALPHA
2102                 if (c->alpPixBuf) {
2103                     *yuv2packedX = yuv2abgr32_full_X_c;
2104                 } else
2105 #endif /* CONFIG_SWSCALE_ALPHA */
2106                 {
2107                     *yuv2packedX = yuv2xbgr32_full_X_c;
2108                 }
2109 #endif /* !CONFIG_SMALL */
2110                 break;
2111             case PIX_FMT_RGB24:
2112             *yuv2packedX = yuv2rgb24_full_X_c;
2113             break;
2114         case PIX_FMT_BGR24:
2115             *yuv2packedX = yuv2bgr24_full_X_c;
2116             break;
2117         }
2118         if(!*yuv2packedX)
2119             goto YUV_PACKED;
2120     } else {
2121         YUV_PACKED:
2122         switch (dstFormat) {
2123         case PIX_FMT_GRAY16BE:
2124             *yuv2packed1 = yuv2gray16BE_1_c;
2125             *yuv2packed2 = yuv2gray16BE_2_c;
2126             *yuv2packedX = yuv2gray16BE_X_c;
2127             break;
2128         case PIX_FMT_GRAY16LE:
2129             *yuv2packed1 = yuv2gray16LE_1_c;
2130             *yuv2packed2 = yuv2gray16LE_2_c;
2131             *yuv2packedX = yuv2gray16LE_X_c;
2132             break;
2133         case PIX_FMT_MONOWHITE:
2134             *yuv2packed1 = yuv2monowhite_1_c;
2135             *yuv2packed2 = yuv2monowhite_2_c;
2136             *yuv2packedX = yuv2monowhite_X_c;
2137             break;
2138         case PIX_FMT_MONOBLACK:
2139             *yuv2packed1 = yuv2monoblack_1_c;
2140             *yuv2packed2 = yuv2monoblack_2_c;
2141             *yuv2packedX = yuv2monoblack_X_c;
2142             break;
2143         case PIX_FMT_YUYV422:
2144             *yuv2packed1 = yuv2yuyv422_1_c;
2145             *yuv2packed2 = yuv2yuyv422_2_c;
2146             *yuv2packedX = yuv2yuyv422_X_c;
2147             break;
2148         case PIX_FMT_UYVY422:
2149             *yuv2packed1 = yuv2uyvy422_1_c;
2150             *yuv2packed2 = yuv2uyvy422_2_c;
2151             *yuv2packedX = yuv2uyvy422_X_c;
2152             break;
2153         case PIX_FMT_RGB48LE:
2154             //*yuv2packed1 = yuv2rgb48le_1_c;
2155             //*yuv2packed2 = yuv2rgb48le_2_c;
2156             //*yuv2packedX = yuv2rgb48le_X_c;
2157             //break;
2158         case PIX_FMT_RGB48BE:
2159             *yuv2packed1 = yuv2rgb48be_1_c;
2160             *yuv2packed2 = yuv2rgb48be_2_c;
2161             *yuv2packedX = yuv2rgb48be_X_c;
2162             break;
2163         case PIX_FMT_BGR48LE:
2164             //*yuv2packed1 = yuv2bgr48le_1_c;
2165             //*yuv2packed2 = yuv2bgr48le_2_c;
2166             //*yuv2packedX = yuv2bgr48le_X_c;
2167             //break;
2168         case PIX_FMT_BGR48BE:
2169             *yuv2packed1 = yuv2bgr48be_1_c;
2170             *yuv2packed2 = yuv2bgr48be_2_c;
2171             *yuv2packedX = yuv2bgr48be_X_c;
2172             break;
2173         case PIX_FMT_RGB32:
2174         case PIX_FMT_BGR32:
2175 #if CONFIG_SMALL
2176             *yuv2packed1 = yuv2rgb32_1_c;
2177             *yuv2packed2 = yuv2rgb32_2_c;
2178             *yuv2packedX = yuv2rgb32_X_c;
2179 #else
2180 #if CONFIG_SWSCALE_ALPHA
2181                 if (c->alpPixBuf) {
2182                     *yuv2packed1 = yuv2rgba32_1_c;
2183                     *yuv2packed2 = yuv2rgba32_2_c;
2184                     *yuv2packedX = yuv2rgba32_X_c;
2185                 } else
2186 #endif /* CONFIG_SWSCALE_ALPHA */
2187                 {
2188                     *yuv2packed1 = yuv2rgbx32_1_c;
2189                     *yuv2packed2 = yuv2rgbx32_2_c;
2190                     *yuv2packedX = yuv2rgbx32_X_c;
2191                 }
2192 #endif /* !CONFIG_SMALL */
2193             break;
2194         case PIX_FMT_RGB32_1:
2195         case PIX_FMT_BGR32_1:
2196 #if CONFIG_SMALL
2197                 *yuv2packed1 = yuv2rgb32_1_1_c;
2198                 *yuv2packed2 = yuv2rgb32_1_2_c;
2199                 *yuv2packedX = yuv2rgb32_1_X_c;
2200 #else
2201 #if CONFIG_SWSCALE_ALPHA
2202                 if (c->alpPixBuf) {
2203                     *yuv2packed1 = yuv2rgba32_1_1_c;
2204                     *yuv2packed2 = yuv2rgba32_1_2_c;
2205                     *yuv2packedX = yuv2rgba32_1_X_c;
2206                 } else
2207 #endif /* CONFIG_SWSCALE_ALPHA */
2208                 {
2209                     *yuv2packed1 = yuv2rgbx32_1_1_c;
2210                     *yuv2packed2 = yuv2rgbx32_1_2_c;
2211                     *yuv2packedX = yuv2rgbx32_1_X_c;
2212                 }
2213 #endif /* !CONFIG_SMALL */
2214                 break;
2215         case PIX_FMT_RGB24:
2216             *yuv2packed1 = yuv2rgb24_1_c;
2217             *yuv2packed2 = yuv2rgb24_2_c;
2218             *yuv2packedX = yuv2rgb24_X_c;
2219             break;
2220         case PIX_FMT_BGR24:
2221             *yuv2packed1 = yuv2bgr24_1_c;
2222             *yuv2packed2 = yuv2bgr24_2_c;
2223             *yuv2packedX = yuv2bgr24_X_c;
2224             break;
2225         case PIX_FMT_RGB565LE:
2226         case PIX_FMT_RGB565BE:
2227         case PIX_FMT_BGR565LE:
2228         case PIX_FMT_BGR565BE:
2229             *yuv2packed1 = yuv2rgb16_1_c;
2230             *yuv2packed2 = yuv2rgb16_2_c;
2231             *yuv2packedX = yuv2rgb16_X_c;
2232             break;
2233         case PIX_FMT_RGB555LE:
2234         case PIX_FMT_RGB555BE:
2235         case PIX_FMT_BGR555LE:
2236         case PIX_FMT_BGR555BE:
2237             *yuv2packed1 = yuv2rgb15_1_c;
2238             *yuv2packed2 = yuv2rgb15_2_c;
2239             *yuv2packedX = yuv2rgb15_X_c;
2240             break;
2241         case PIX_FMT_RGB444LE:
2242         case PIX_FMT_RGB444BE:
2243         case PIX_FMT_BGR444LE:
2244         case PIX_FMT_BGR444BE:
2245             *yuv2packed1 = yuv2rgb12_1_c;
2246             *yuv2packed2 = yuv2rgb12_2_c;
2247             *yuv2packedX = yuv2rgb12_X_c;
2248             break;
2249         case PIX_FMT_RGB8:
2250         case PIX_FMT_BGR8:
2251             *yuv2packed1 = yuv2rgb8_1_c;
2252             *yuv2packed2 = yuv2rgb8_2_c;
2253             *yuv2packedX = yuv2rgb8_X_c;
2254             break;
2255         case PIX_FMT_RGB4:
2256         case PIX_FMT_BGR4:
2257             *yuv2packed1 = yuv2rgb4_1_c;
2258             *yuv2packed2 = yuv2rgb4_2_c;
2259             *yuv2packedX = yuv2rgb4_X_c;
2260             break;
2261         case PIX_FMT_RGB4_BYTE:
2262         case PIX_FMT_BGR4_BYTE:
2263             *yuv2packed1 = yuv2rgb4b_1_c;
2264             *yuv2packed2 = yuv2rgb4b_2_c;
2265             *yuv2packedX = yuv2rgb4b_X_c;
2266             break;
2267         }
2268     }
2269 }
2270
/* Set to 1 to log buffer addresses and slice bookkeeping from the scaler. */
#define DEBUG_SWSCALE_BUFFERS 0
/*
 * Log with av_log() at AV_LOG_DEBUG level when DEBUG_SWSCALE_BUFFERS is
 * non-zero. Expects a variable `c` in scope as logging context. Wrapped in
 * do { } while (0) so that the macro is a single statement and cannot
 * capture an `else` that follows it.
 */
#define DEBUG_BUFFERS(...) \
    do { \
        if (DEBUG_SWSCALE_BUFFERS) \
            av_log(c, AV_LOG_DEBUG, __VA_ARGS__); \
    } while (0)
2273
2274 static int swScale(SwsContext *c, const uint8_t* src[],
2275                    int srcStride[], int srcSliceY,
2276                    int srcSliceH, uint8_t* dst[], int dstStride[])
2277 {
2278     /* load a few things into local vars to make the code more readable? and faster */
2279     const int srcW= c->srcW;
2280     const int dstW= c->dstW;
2281     const int dstH= c->dstH;
2282     const int chrDstW= c->chrDstW;
2283     const int chrSrcW= c->chrSrcW;
2284     const int lumXInc= c->lumXInc;
2285     const int chrXInc= c->chrXInc;
2286     const enum PixelFormat dstFormat= c->dstFormat;
2287     const int flags= c->flags;
2288     int16_t *vLumFilterPos= c->vLumFilterPos;
2289     int16_t *vChrFilterPos= c->vChrFilterPos;
2290     int16_t *hLumFilterPos= c->hLumFilterPos;
2291     int16_t *hChrFilterPos= c->hChrFilterPos;
2292     int16_t *vLumFilter= c->vLumFilter;
2293     int16_t *vChrFilter= c->vChrFilter;
2294     int16_t *hLumFilter= c->hLumFilter;
2295     int16_t *hChrFilter= c->hChrFilter;
2296     int32_t *lumMmxFilter= c->lumMmxFilter;
2297     int32_t *chrMmxFilter= c->chrMmxFilter;
2298     int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
2299     const int vLumFilterSize= c->vLumFilterSize;
2300     const int vChrFilterSize= c->vChrFilterSize;
2301     const int hLumFilterSize= c->hLumFilterSize;
2302     const int hChrFilterSize= c->hChrFilterSize;
2303     int16_t **lumPixBuf= c->lumPixBuf;
2304     int16_t **chrUPixBuf= c->chrUPixBuf;
2305     int16_t **chrVPixBuf= c->chrVPixBuf;
2306     int16_t **alpPixBuf= c->alpPixBuf;
2307     const int vLumBufSize= c->vLumBufSize;
2308     const int vChrBufSize= c->vChrBufSize;
2309     uint8_t *formatConvBuffer= c->formatConvBuffer;
2310     const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
2311     const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
2312     int lastDstY;
2313     uint32_t *pal=c->pal_yuv;
2314     int should_dither= isNBPS(c->srcFormat) || is16BPS(c->srcFormat);
2315     yuv2planar1_fn yuv2yuv1 = c->yuv2yuv1;
2316     yuv2planarX_fn yuv2yuvX = c->yuv2yuvX;
2317     yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
2318     yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
2319     yuv2packedX_fn yuv2packedX = c->yuv2packedX;
2320
2321     /* vars which will change and which we need to store back in the context */
2322     int dstY= c->dstY;
2323     int lumBufIndex= c->lumBufIndex;
2324     int chrBufIndex= c->chrBufIndex;
2325     int lastInLumBuf= c->lastInLumBuf;
2326     int lastInChrBuf= c->lastInChrBuf;
2327
2328     if (isPacked(c->srcFormat)) {
2329         src[0]=
2330         src[1]=
2331         src[2]=
2332         src[3]= src[0];
2333         srcStride[0]=
2334         srcStride[1]=
2335         srcStride[2]=
2336         srcStride[3]= srcStride[0];
2337     }
2338     srcStride[1]<<= c->vChrDrop;
2339     srcStride[2]<<= c->vChrDrop;
2340
2341     DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
2342                   src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
2343                   dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
2344     DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
2345                    srcSliceY,    srcSliceH,    dstY,    dstH);
2346     DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
2347                    vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);
2348
2349     if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
2350         static int warnedAlready=0; //FIXME move this into the context perhaps
2351         if (flags & SWS_PRINT_INFO && !warnedAlready) {
2352             av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
2353                    "         ->cannot do aligned memory accesses anymore\n");
2354             warnedAlready=1;
2355         }
2356     }
2357
2358     /* Note the user might start scaling the picture in the middle so this
2359        will not get executed. This is not really intended but works
2360        currently, so people might do it. */
2361     if (srcSliceY ==0) {
2362         lumBufIndex=-1;
2363         chrBufIndex=-1;
2364         dstY=0;
2365         lastInLumBuf= -1;
2366         lastInChrBuf= -1;
2367     }
2368
2369     lastDstY= dstY;
2370
2371     for (;dstY < dstH; dstY++) {
2372         const int chrDstY= dstY>>c->chrDstVSubSample;
2373         uint8_t *dest[4] = {
2374             dst[0] + dstStride[0] * dstY,
2375             dst[1] + dstStride[1] * chrDstY,
2376             dst[2] + dstStride[2] * chrDstY,
2377             (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3] + dstStride[3] * dstY : NULL,
2378         };
2379         const uint8_t *lumDither= should_dither ? dithers[7][dstY   &7] : flat64;
2380         const uint8_t *chrDither= should_dither ? dithers[7][chrDstY&7] : flat64;
2381
2382         const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
2383         const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
2384         const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
2385         int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
2386         int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
2387         int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
2388         int enough_lines;
2389
2390         //handle holes (FAST_BILINEAR & weird filters)
2391         if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
2392         if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
2393         assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
2394         assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
2395
2396         DEBUG_BUFFERS("dstY: %d\n", dstY);
2397         DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
2398                          firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
2399         DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
2400                          firstChrSrcY,    lastChrSrcY,    lastInChrBuf);
2401
2402         // Do we have enough lines in this slice to output the dstY line
2403         enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);
2404
2405         if (!enough_lines) {
2406             lastLumSrcY = srcSliceY + srcSliceH - 1;
2407             lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
2408             DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
2409                                             lastLumSrcY, lastChrSrcY);
2410         }
2411
2412         //Do horizontal scaling
2413         while(lastInLumBuf < lastLumSrcY) {
2414             const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
2415             const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
2416             lumBufIndex++;
2417             assert(lumBufIndex < 2*vLumBufSize);
2418             assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
2419             assert(lastInLumBuf + 1 - srcSliceY >= 0);
2420             hyscale(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
2421                     hLumFilter, hLumFilterPos, hLumFilterSize,
2422                     formatConvBuffer,
2423                     pal, 0);
2424             if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
2425                 hyscale(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW,
2426                         lumXInc, hLumFilter, hLumFilterPos, hLumFilterSize,
2427                         formatConvBuffer,
2428                         pal, 1);
2429             lastInLumBuf++;
2430             DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
2431                                lumBufIndex,    lastInLumBuf);
2432         }
2433         while(lastInChrBuf < lastChrSrcY) {
2434             const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
2435             const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
2436             chrBufIndex++;
2437             assert(chrBufIndex < 2*vChrBufSize);
2438             assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
2439             assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
2440             //FIXME replace parameters through context struct (some at least)
2441
2442             if (c->needs_hcscale)
2443                 hcscale(c, chrUPixBuf[chrBufIndex], chrVPixBuf[chrBufIndex],
2444                           chrDstW, src1, src2, chrSrcW, chrXInc,
2445                           hChrFilter, hChrFilterPos, hChrFilterSize,
2446                           formatConvBuffer, pal);
2447             lastInChrBuf++;
2448             DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
2449                                chrBufIndex,    lastInChrBuf);
2450         }
2451         //wrap buf index around to stay inside the ring buffer
2452         if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
2453         if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
2454         if (!enough_lines)
2455             break; //we can't output a dstY line so let's try with the next slice
2456
2457 #if HAVE_MMX
2458         updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
2459 #endif
2460         if (dstY >= dstH-2) {
2461             // hmm looks like we can't use MMX here without overwriting this array's tail
2462             find_c_packed_planar_out_funcs(c, &yuv2yuv1, &yuv2yuvX,
2463                                            &yuv2packed1, &yuv2packed2,
2464                                            &yuv2packedX);
2465         }
2466
2467         {
2468             const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
2469             const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2470             const int16_t **chrVSrcPtr= (const int16_t **) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
2471             const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
2472
2473             if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
2474                 const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
2475                 if ((dstY&chrSkipMask) || isGray(dstFormat))
2476                     dest[1] = dest[2] = NULL; //FIXME split functions in lumi / chromi
2477                 if (c->yuv2yuv1 && vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
2478                     const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
2479                     yuv2yuv1(c, lumSrcPtr[0], chrUSrcPtr[0], chrVSrcPtr[0], alpBuf,
2480                              dest, dstW, chrDstW, lumDither, chrDither);
2481                 } else { //General YV12
2482                     yuv2yuvX(c, vLumFilter + dstY * vLumFilterSize,
2483                              lumSrcPtr, vLumFilterSize,
2484                              vChrFilter + chrDstY * vChrFilterSize,
2485                              chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
2486                              alpSrcPtr, dest, dstW, chrDstW, lumDither, chrDither);
2487                 }
2488             } else {
2489                 assert(lumSrcPtr  + vLumFilterSize - 1 < lumPixBuf  + vLumBufSize*2);
2490                 assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
2491                 if (c->yuv2packed1 && vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
2492                     int chrAlpha = vChrFilter[2 * dstY + 1];
2493                     yuv2packed1(c, *lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
2494                                 alpPixBuf ? *alpSrcPtr : NULL,
2495                                 dest[0], dstW, chrAlpha, dstY);
2496                 } else if (c->yuv2packed2 && vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
2497                     int lumAlpha = vLumFilter[2 * dstY + 1];
2498                     int chrAlpha = vChrFilter[2 * dstY + 1];
2499                     lumMmxFilter[2] =
2500                     lumMmxFilter[3] = vLumFilter[2 * dstY   ] * 0x10001;
2501                     chrMmxFilter[2] =
2502                     chrMmxFilter[3] = vChrFilter[2 * chrDstY] * 0x10001;
2503                     yuv2packed2(c, lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
2504                                 alpPixBuf ? alpSrcPtr : NULL,
2505                                 dest[0], dstW, lumAlpha, chrAlpha, dstY);
2506                 } else { //general RGB
2507                     yuv2packedX(c, vLumFilter + dstY * vLumFilterSize,
2508                                 lumSrcPtr, vLumFilterSize,
2509                                 vChrFilter + dstY * vChrFilterSize,
2510                                 chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
2511                                 alpSrcPtr, dest[0], dstW, dstY);
2512                 }
2513             }
2514         }
2515     }
2516
2517     if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
2518         fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);
2519
2520 #if HAVE_MMX2
2521     if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
2522         __asm__ volatile("sfence":::"memory");
2523 #endif
2524     emms_c();
2525
2526     /* store changed local vars back in the context */
2527     c->dstY= dstY;
2528     c->lumBufIndex= lumBufIndex;
2529     c->chrBufIndex= chrBufIndex;
2530     c->lastInLumBuf= lastInLumBuf;
2531     c->lastInChrBuf= lastInChrBuf;
2532
2533     return dstY - lastDstY;
2534 }
2535
2536 static av_cold void sws_init_swScale_c(SwsContext *c)
2537 {
2538     enum PixelFormat srcFormat = c->srcFormat;
2539
2540     find_c_packed_planar_out_funcs(c, &c->yuv2yuv1, &c->yuv2yuvX,
2541                                    &c->yuv2packed1, &c->yuv2packed2,
2542                                    &c->yuv2packedX);
2543
2544     c->hScale       = hScale_c;
2545
2546     if (c->flags & SWS_FAST_BILINEAR) {
2547         c->hyscale_fast = hyscale_fast_c;
2548         c->hcscale_fast = hcscale_fast_c;
2549     }
2550
2551     c->chrToYV12 = NULL;
2552     switch(srcFormat) {
2553         case PIX_FMT_YUYV422  : c->chrToYV12 = yuy2ToUV_c; break;
2554         case PIX_FMT_UYVY422  : c->chrToYV12 = uyvyToUV_c; break;
2555         case PIX_FMT_NV12     : c->chrToYV12 = nv12ToUV_c; break;
2556         case PIX_FMT_NV21     : c->chrToYV12 = nv21ToUV_c; break;
2557         case PIX_FMT_RGB8     :
2558         case PIX_FMT_BGR8     :
2559         case PIX_FMT_PAL8     :
2560         case PIX_FMT_BGR4_BYTE:
2561         case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
2562         case PIX_FMT_GRAY16BE :
2563         case PIX_FMT_YUV444P9BE:
2564         case PIX_FMT_YUV420P9BE:
2565         case PIX_FMT_YUV444P10BE:
2566         case PIX_FMT_YUV422P10BE:
2567         case PIX_FMT_YUV420P10BE:
2568         case PIX_FMT_YUV420P16BE:
2569         case PIX_FMT_YUV422P16BE:
2570         case PIX_FMT_YUV444P16BE: c->hScale16= HAVE_BIGENDIAN ? hScale16_c : hScale16X_c; break;
2571         case PIX_FMT_GRAY16LE :
2572         case PIX_FMT_YUV444P9LE:
2573         case PIX_FMT_YUV420P9LE:
2574         case PIX_FMT_YUV422P10LE:
2575         case PIX_FMT_YUV420P10LE:
2576         case PIX_FMT_YUV444P10LE:
2577         case PIX_FMT_YUV420P16LE:
2578         case PIX_FMT_YUV422P16LE:
2579         case PIX_FMT_YUV444P16LE: c->hScale16= HAVE_BIGENDIAN ? hScale16X_c : hScale16_c; break;
2580     }
2581     if (c->chrSrcHSubSample) {
2582         switch(srcFormat) {
2583         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_half_c; break;
2584         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_half_c; break;
2585         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_half_c; break;
2586         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_half_c; break;
2587         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_half_c;   break;
2588         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_half_c;  break;
2589         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_half_c;   break;
2590         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_half_c; break;
2591         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_half_c; break;
2592         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_half_c; break;
2593         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_half_c; break;
2594         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_half_c;   break;
2595         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_half_c;  break;
2596         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_half_c;   break;
2597         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_half_c; break;
2598         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_half_c; break;
2599         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_half_c; break;
2600         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_half_c; break;
2601         }
2602     } else {
2603         switch(srcFormat) {
2604         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_c; break;
2605         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_c; break;
2606         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_c; break;
2607         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_c; break;
2608         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_c;   break;
2609         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_c;  break;
2610         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_c;   break;
2611         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_c; break;
2612         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_c; break;
2613         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_c; break;
2614         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_c; break;
2615         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_c;   break;
2616         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_c;  break;
2617         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_c;   break;
2618         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_c; break;
2619         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_c; break;
2620         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_c; break;
2621         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_c; break;
2622         }
2623     }
2624
2625     c->lumToYV12 = NULL;
2626     c->alpToYV12 = NULL;
2627     switch (srcFormat) {
2628     case PIX_FMT_YUYV422  :
2629     case PIX_FMT_GRAY8A   :
2630                             c->lumToYV12 = yuy2ToY_c; break;
2631     case PIX_FMT_UYVY422  :
2632                             c->lumToYV12 = uyvyToY_c;    break;
2633     case PIX_FMT_BGR24    : c->lumToYV12 = bgr24ToY_c;   break;
2634     case PIX_FMT_BGR565LE : c->lumToYV12 = bgr16leToY_c; break;
2635     case PIX_FMT_BGR565BE : c->lumToYV12 = bgr16beToY_c; break;
2636     case PIX_FMT_BGR555LE : c->lumToYV12 = bgr15leToY_c; break;
2637     case PIX_FMT_BGR555BE : c->lumToYV12 = bgr15beToY_c; break;
2638     case PIX_FMT_RGB24    : c->lumToYV12 = rgb24ToY_c;   break;
2639     case PIX_FMT_RGB565LE : c->lumToYV12 = rgb16leToY_c; break;
2640     case PIX_FMT_RGB565BE : c->lumToYV12 = rgb16beToY_c; break;
2641     case PIX_FMT_RGB555LE : c->lumToYV12 = rgb15leToY_c; break;
2642     case PIX_FMT_RGB555BE : c->lumToYV12 = rgb15beToY_c; break;
2643     case PIX_FMT_RGB8     :
2644     case PIX_FMT_BGR8     :
2645     case PIX_FMT_PAL8     :
2646     case PIX_FMT_BGR4_BYTE:
2647     case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY_c; break;
2648     case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y_c; break;
2649     case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y_c; break;
2650     case PIX_FMT_RGB32  : c->lumToYV12 = bgr32ToY_c;  break;
2651     case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY_c; break;
2652     case PIX_FMT_BGR32  : c->lumToYV12 = rgb32ToY_c;  break;
2653     case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY_c; break;
2654     case PIX_FMT_RGB48BE: c->lumToYV12 = rgb48BEToY_c; break;
2655     case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48LEToY_c; break;
2656     case PIX_FMT_BGR48BE: c->lumToYV12 = bgr48BEToY_c; break;
2657     case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48LEToY_c; break;
2658     }
2659     if (c->alpPixBuf) {
2660         switch (srcFormat) {
2661         case PIX_FMT_BGRA:
2662         case PIX_FMT_RGBA:  c->alpToYV12 = rgbaToA_c; break;
2663         case PIX_FMT_ABGR:
2664         case PIX_FMT_ARGB:  c->alpToYV12 = abgrToA_c; break;
2665         case PIX_FMT_Y400A: c->alpToYV12 = uyvyToY_c; break;
2666         case PIX_FMT_PAL8 : c->alpToYV12 = palToA_c; break;
2667         }
2668     }
2669
2670     if(isAnyRGB(c->srcFormat) || c->srcFormat == PIX_FMT_PAL8)
2671         c->hScale16= hScale16_c;
2672
2673     if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
2674         if (c->srcRange) {
2675             c->lumConvertRange = lumRangeFromJpeg_c;
2676             c->chrConvertRange = chrRangeFromJpeg_c;
2677         } else {
2678             c->lumConvertRange = lumRangeToJpeg_c;
2679             c->chrConvertRange = chrRangeToJpeg_c;
2680         }
2681     }
2682
2683     if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
2684           srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
2685         c->needs_hcscale = 1;
2686 }
2687
2688 SwsFunc ff_getSwsFunc(SwsContext *c)
2689 {
2690     sws_init_swScale_c(c);
2691
2692     if (HAVE_MMX)
2693         ff_sws_init_swScale_mmx(c);
2694     if (HAVE_ALTIVEC)
2695         ff_sws_init_swScale_altivec(c);
2696
2697     return swScale;
2698 }