libswscale/swscale.c

   1 /*
   2  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 /*
  22   supported Input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR32_1, BGR24, BGR16, BGR15, RGB32, RGB32_1, RGB24, Y8/Y800, YVU9/IF09, PAL8
  23   supported output formats: YV12, I420/IYUV, YUY2, UYVY, {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09
  24   {BGR,RGB}{1,4,8,15,16} support dithering
  25
  26   unscaled special converters (YV12=I420=IYUV, Y800=Y8)
  27   YV12 -> {BGR,RGB}{1,4,8,12,15,16,24,32}
  28   x -> x
  29   YUV9 -> YV12
  30   YUV9/YV12 -> Y800
  31   Y800 -> YUV9/YV12
  32   BGR24 -> BGR32 & RGB24 -> RGB32
  33   BGR32 -> BGR24 & RGB32 -> RGB24
  34   BGR15 -> BGR16
  35 */
  36
  37 /*
  38 tested special converters (most are tested actually, but I did not write it down ...)
  39  YV12 -> BGR12/BGR16
  40  YV12 -> YV12
  41  BGR15 -> BGR16
  42  BGR16 -> BGR16
  43  YVU9 -> YV12
  44
  45 untested special converters
  46   YV12/I420 -> BGR15/BGR24/BGR32 (it is the yuv2rgb stuff, so it should be OK)
  47   YV12/I420 -> YV12/I420
  48   YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
  49   BGR24 -> BGR32 & RGB24 -> RGB32
  50   BGR32 -> BGR24 & RGB32 -> RGB24
  51   BGR24 -> YV12
  52 */
  53
  54 #include <inttypes.h>
  55 #include <string.h>
  56 #include <math.h>
  57 #include <stdio.h>
  58 #include "config.h"
  59 #include <assert.h>
  60 #include "swscale.h"
  61 #include "swscale_internal.h"
  62 #include "rgb2rgb.h"
  63 #include "libavutil/avassert.h"
  64 #include "libavutil/intreadwrite.h"
  65 #include "libavutil/cpu.h"
  66 #include "libavutil/avutil.h"
  67 #include "libavutil/mathematics.h"
  68 #include "libavutil/bswap.h"
  69 #include "libavutil/pixdesc.h"
  70
  71
  72 #define RGB2YUV_SHIFT 15
  73 #define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  74 #define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  75 #define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  76 #define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  77 #define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  78 #define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  79 #define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  80 #define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  81 #define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  82
  83 /*
  84 NOTES
  85 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
  86
  87 TODO
  88 more intelligent misalignment avoidance for the horizontal scaler
  89 write special vertical cubic upscale version
  90 optimize C code (YV12 / minmax)
  91 add support for packed pixel YUV input & output
  92 add support for Y8 output
  93 optimize BGR24 & BGR32
  94 add BGR4 output support
  95 write special BGR->BGR scaler
  96 */
  97
  98 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4)[2][8]={
  99 {  1,   3,   1,   3,   1,   3,   1,   3, },
 100 {  2,   0,   2,   0,   2,   0,   2,   0, },
 101 };
 102
 103 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_8)[2][8]={
 104 {  6,   2,   6,   2,   6,   2,   6,   2, },
 105 {  0,   4,   0,   4,   0,   4,   0,   4, },
 106 };
 107
 108 DECLARE_ALIGNED(8, const uint8_t, dither_4x4_16)[4][8]={
 109 {  8,   4,  11,   7,   8,   4,  11,   7, },
 110 {  2,  14,   1,  13,   2,  14,   1,  13, },
 111 { 10,   6,   9,   5,  10,   6,   9,   5, },
 112 {  0,  12,   3,  15,   0,  12,   3,  15, },
 113 };
 114
 115 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[8][8]={
 116 { 17,   9,  23,  15,  16,   8,  22,  14, },
 117 {  5,  29,   3,  27,   4,  28,   2,  26, },
 118 { 21,  13,  19,  11,  20,  12,  18,  10, },
 119 {  0,  24,   6,  30,   1,  25,   7,  31, },
 120 { 16,   8,  22,  14,  17,   9,  23,  15, },
 121 {  4,  28,   2,  26,   5,  29,   3,  27, },
 122 { 20,  12,  18,  10,  21,  13,  19,  11, },
 123 {  1,  25,   7,  31,   0,  24,   6,  30, },
 124 };
 125
 126 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[8][8]={
 127 {  0,  55,  14,  68,   3,  58,  17,  72, },
 128 { 37,  18,  50,  32,  40,  22,  54,  35, },
 129 {  9,  64,   5,  59,  13,  67,   8,  63, },
 130 { 46,  27,  41,  23,  49,  31,  44,  26, },
 131 {  2,  57,  16,  71,   1,  56,  15,  70, },
 132 { 39,  21,  52,  34,  38,  19,  51,  33, },
 133 { 11,  66,   7,  62,  10,  65,   6,  60, },
 134 { 48,  30,  43,  25,  47,  29,  42,  24, },
 135 };
 136
 137 #if 1
 138 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 139 {117,  62, 158, 103, 113,  58, 155, 100, },
 140 { 34, 199,  21, 186,  31, 196,  17, 182, },
 141 {144,  89, 131,  76, 141,  86, 127,  72, },
 142 {  0, 165,  41, 206,  10, 175,  52, 217, },
 143 {110,  55, 151,  96, 120,  65, 162, 107, },
 144 { 28, 193,  14, 179,  38, 203,  24, 189, },
 145 {138,  83, 124,  69, 148,  93, 134,  79, },
 146 {  7, 172,  48, 213,   3, 168,  45, 210, },
 147 };
 148 #elif 1
 149 // tries to correct a gamma of 1.5
 150 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 151 {  0, 143,  18, 200,   2, 156,  25, 215, },
 152 { 78,  28, 125,  64,  89,  36, 138,  74, },
 153 { 10, 180,   3, 161,  16, 195,   8, 175, },
 154 {109,  51,  93,  38, 121,  60, 105,  47, },
 155 {  1, 152,  23, 210,   0, 147,  20, 205, },
 156 { 85,  33, 134,  71,  81,  30, 130,  67, },
 157 { 14, 190,   6, 171,  12, 185,   5, 166, },
 158 {117,  57, 101,  44, 113,  54,  97,  41, },
 159 };
 160 #elif 1
 161 // tries to correct a gamma of 2.0
 162 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 163 {  0, 124,   8, 193,   0, 140,  12, 213, },
 164 { 55,  14, 104,  42,  66,  19, 119,  52, },
 165 {  3, 168,   1, 145,   6, 187,   3, 162, },
 166 { 86,  31,  70,  21,  99,  39,  82,  28, },
 167 {  0, 134,  11, 206,   0, 129,   9, 200, },
 168 { 62,  17, 114,  48,  58,  16, 109,  45, },
 169 {  5, 181,   2, 157,   4, 175,   1, 151, },
 170 { 95,  36,  78,  26,  90,  34,  74,  24, },
 171 };
 172 #else
 173 // tries to correct a gamma of 2.5
 174 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 175 {  0, 107,   3, 187,   0, 125,   6, 212, },
 176 { 39,   7,  86,  28,  49,  11, 102,  36, },
 177 {  1, 158,   0, 131,   3, 180,   1, 151, },
 178 { 68,  19,  52,  12,  81,  25,  64,  17, },
 179 {  0, 119,   5, 203,   0, 113,   4, 195, },
 180 { 45,   9,  96,  33,  42,   8,  91,  30, },
 181 {  2, 172,   1, 144,   2, 165,   0, 137, },
 182 { 77,  23,  60,  15,  72,  21,  56,  14, },
 183 };
 184 #endif
 185 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_128)[8][8] = {
 186 {  36, 68, 60, 92, 34, 66, 58, 90,},
 187 { 100,  4,124, 28, 98,  2,122, 26,},
 188 {  52, 84, 44, 76, 50, 82, 42, 74,},
 189 { 116, 20,108, 12,114, 18,106, 10,},
 190 {  32, 64, 56, 88, 38, 70, 62, 94,},
 191 {  96,  0,120, 24,102,  6,126, 30,},
 192 {  48, 80, 40, 72, 54, 86, 46, 78,},
 193 { 112, 16,104,  8,118, 22,110, 14,},
 194 };
 195 DECLARE_ALIGNED(8, const uint8_t, ff_sws_pb_64)[8] =
 196 {  64, 64, 64, 64, 64, 64, 64, 64 };
 197
 198 DECLARE_ALIGNED(8, const uint8_t, dithers)[8][8][8]={
 199 {
 200   {   0,  1,  0,  1,  0,  1,  0,  1,},
 201   {   1,  0,  1,  0,  1,  0,  1,  0,},
 202   {   0,  1,  0,  1,  0,  1,  0,  1,},
 203   {   1,  0,  1,  0,  1,  0,  1,  0,},
 204   {   0,  1,  0,  1,  0,  1,  0,  1,},
 205   {   1,  0,  1,  0,  1,  0,  1,  0,},
 206   {   0,  1,  0,  1,  0,  1,  0,  1,},
 207   {   1,  0,  1,  0,  1,  0,  1,  0,},
 208 },{
 209   {   1,  2,  1,  2,  1,  2,  1,  2,},
 210   {   3,  0,  3,  0,  3,  0,  3,  0,},
 211   {   1,  2,  1,  2,  1,  2,  1,  2,},
 212   {   3,  0,  3,  0,  3,  0,  3,  0,},
 213   {   1,  2,  1,  2,  1,  2,  1,  2,},
 214   {   3,  0,  3,  0,  3,  0,  3,  0,},
 215   {   1,  2,  1,  2,  1,  2,  1,  2,},
 216   {   3,  0,  3,  0,  3,  0,  3,  0,},
 217 },{
 218   {   2,  4,  3,  5,  2,  4,  3,  5,},
 219   {   6,  0,  7,  1,  6,  0,  7,  1,},
 220   {   3,  5,  2,  4,  3,  5,  2,  4,},
 221   {   7,  1,  6,  0,  7,  1,  6,  0,},
 222   {   2,  4,  3,  5,  2,  4,  3,  5,},
 223   {   6,  0,  7,  1,  6,  0,  7,  1,},
 224   {   3,  5,  2,  4,  3,  5,  2,  4,},
 225   {   7,  1,  6,  0,  7,  1,  6,  0,},
 226 },{
 227   {   4,  8,  7, 11,  4,  8,  7, 11,},
 228   {  12,  0, 15,  3, 12,  0, 15,  3,},
 229   {   6, 10,  5,  9,  6, 10,  5,  9,},
 230   {  14,  2, 13,  1, 14,  2, 13,  1,},
 231   {   4,  8,  7, 11,  4,  8,  7, 11,},
 232   {  12,  0, 15,  3, 12,  0, 15,  3,},
 233   {   6, 10,  5,  9,  6, 10,  5,  9,},
 234   {  14,  2, 13,  1, 14,  2, 13,  1,},
 235 },{
 236   {   9, 17, 15, 23,  8, 16, 14, 22,},
 237   {  25,  1, 31,  7, 24,  0, 30,  6,},
 238   {  13, 21, 11, 19, 12, 20, 10, 18,},
 239   {  29,  5, 27,  3, 28,  4, 26,  2,},
 240   {   8, 16, 14, 22,  9, 17, 15, 23,},
 241   {  24,  0, 30,  6, 25,  1, 31,  7,},
 242   {  12, 20, 10, 18, 13, 21, 11, 19,},
 243   {  28,  4, 26,  2, 29,  5, 27,  3,},
 244 },{
 245   {  18, 34, 30, 46, 17, 33, 29, 45,},
 246   {  50,  2, 62, 14, 49,  1, 61, 13,},
 247   {  26, 42, 22, 38, 25, 41, 21, 37,},
 248   {  58, 10, 54,  6, 57,  9, 53,  5,},
 249   {  16, 32, 28, 44, 19, 35, 31, 47,},
 250   {  48,  0, 60, 12, 51,  3, 63, 15,},
 251   {  24, 40, 20, 36, 27, 43, 23, 39,},
 252   {  56,  8, 52,  4, 59, 11, 55,  7,},
 253 },{
 254   {  18, 34, 30, 46, 17, 33, 29, 45,},
 255   {  50,  2, 62, 14, 49,  1, 61, 13,},
 256   {  26, 42, 22, 38, 25, 41, 21, 37,},
 257   {  58, 10, 54,  6, 57,  9, 53,  5,},
 258   {  16, 32, 28, 44, 19, 35, 31, 47,},
 259   {  48,  0, 60, 12, 51,  3, 63, 15,},
 260   {  24, 40, 20, 36, 27, 43, 23, 39,},
 261   {  56,  8, 52,  4, 59, 11, 55,  7,},
 262 },{
 263   {  36, 68, 60, 92, 34, 66, 58, 90,},
 264   { 100,  4,124, 28, 98,  2,122, 26,},
 265   {  52, 84, 44, 76, 50, 82, 42, 74,},
 266   { 116, 20,108, 12,114, 18,106, 10,},
 267   {  32, 64, 56, 88, 38, 70, 62, 94,},
 268   {  96,  0,120, 24,102,  6,126, 30,},
 269   {  48, 80, 40, 72, 54, 86, 46, 78,},
 270   { 112, 16,104,  8,118, 22,110, 14,},
 271 }};
 272
 273 static const uint8_t flat64[8]={64,64,64,64,64,64,64,64};
 274
 275 const uint16_t dither_scale[15][16]={
 276 {    2,    3,    3,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,},
 277 {    2,    3,    7,    7,   13,   13,   25,   25,   25,   25,   25,   25,   25,   25,   25,   25,},
 278 {    3,    3,    4,   15,   15,   29,   57,   57,   57,  113,  113,  113,  113,  113,  113,  113,},
 279 {    3,    4,    4,    5,   31,   31,   61,  121,  241,  241,  241,  241,  481,  481,  481,  481,},
 280 {    3,    4,    5,    5,    6,   63,   63,  125,  249,  497,  993,  993,  993,  993,  993, 1985,},
 281 {    3,    5,    6,    6,    6,    7,  127,  127,  253,  505, 1009, 2017, 4033, 4033, 4033, 4033,},
 282 {    3,    5,    6,    7,    7,    7,    8,  255,  255,  509, 1017, 2033, 4065, 8129,16257,16257,},
 283 {    3,    5,    6,    8,    8,    8,    8,    9,  511,  511, 1021, 2041, 4081, 8161,16321,32641,},
 284 {    3,    5,    7,    8,    9,    9,    9,    9,   10, 1023, 1023, 2045, 4089, 8177,16353,32705,},
 285 {    3,    5,    7,    8,   10,   10,   10,   10,   10,   11, 2047, 2047, 4093, 8185,16369,32737,},
 286 {    3,    5,    7,    8,   10,   11,   11,   11,   11,   11,   12, 4095, 4095, 8189,16377,32753,},
 287 {    3,    5,    7,    9,   10,   12,   12,   12,   12,   12,   12,   13, 8191, 8191,16381,32761,},
 288 {    3,    5,    7,    9,   10,   12,   13,   13,   13,   13,   13,   13,   14,16383,16383,32765,},
 289 {    3,    5,    7,    9,   10,   12,   14,   14,   14,   14,   14,   14,   14,   15,32767,32767,},
 290 {    3,    5,    7,    9,   11,   12,   14,   15,   15,   15,   15,   15,   15,   15,   16,65535,},
 291 };
 292
 293 static av_always_inline void
 294 yuv2yuvX16_c_template(const int16_t *lumFilter, const int32_t **lumSrc,
 295                       int lumFilterSize, const int16_t *chrFilter,
 296                       const int32_t **chrUSrc, const int32_t **chrVSrc,
 297                       int chrFilterSize, const int32_t **alpSrc,
 298                       uint16_t *dest[4], int dstW, int chrDstW,
 299                       int big_endian, int output_bits)
 300 {
 301     //FIXME Optimize (just quickly written not optimized..)
 302     int i;
 303     int dword= output_bits == 16;
 304     uint16_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 305              *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 306     int shift = 11 + 4*dword + 16 - output_bits - 1;
 307
 308 #define output_pixel(pos, val) \
 309     if (big_endian) { \
 310         if (output_bits == 16) { \
 311             AV_WB16(pos, av_clip_uint16(val >> shift)); \
 312         } else { \
 313             AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 314         } \
 315     } else { \
 316         if (output_bits == 16) { \
 317             AV_WL16(pos, av_clip_uint16(val >> shift)); \
 318         } else { \
 319             AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 320         } \
 321     }
 322     for (i = 0; i < dstW; i++) {
 323         int val = 1 << (26-output_bits + 4*dword - 1);
 324         int j;
 325
 326         for (j = 0; j < lumFilterSize; j++)
 327             val += ((dword ? lumSrc[j][i] : ((int16_t**)lumSrc)[j][i]) * lumFilter[j])>>1;
 328
 329         output_pixel(&yDest[i], val);
 330     }
 331
 332     if (uDest) {
 333         for (i = 0; i < chrDstW; i++) {
 334             int u = 1 << (26-output_bits + 4*dword - 1);
 335             int v = 1 << (26-output_bits + 4*dword - 1);
 336             int j;
 337
 338             for (j = 0; j < chrFilterSize; j++) {
 339                 u += ((dword ? chrUSrc[j][i] : ((int16_t**)chrUSrc)[j][i]) * chrFilter[j]) >> 1;
 340                 v += ((dword ? chrVSrc[j][i] : ((int16_t**)chrVSrc)[j][i]) * chrFilter[j]) >> 1;
 341             }
 342
 343             output_pixel(&uDest[i], u);
 344             output_pixel(&vDest[i], v);
 345         }
 346     }
 347
 348     if (CONFIG_SWSCALE_ALPHA && aDest) {
 349         for (i = 0; i < dstW; i++) {
 350             int val = 1 << (26-output_bits + 4*dword - 1);
 351             int j;
 352
 353             for (j = 0; j < lumFilterSize; j++)
 354                 val += ((dword ? alpSrc[j][i] : ((int16_t**)alpSrc)[j][i]) * lumFilter[j]) >> 1;
 355
 356             output_pixel(&aDest[i], val);
 357         }
 358     }
 359 #undef output_pixel
 360 }
 361
 362 #define yuv2NBPS(bits, BE_LE, is_be) \
 363 static void yuv2yuvX ## bits ## BE_LE ## _c(SwsContext *c, const int16_t *lumFilter, \
 364                               const int16_t **_lumSrc, int lumFilterSize, \
 365                               const int16_t *chrFilter, const int16_t **_chrUSrc, \
 366                               const int16_t **_chrVSrc, \
 367                               int chrFilterSize, const int16_t **_alpSrc, \
 368                               uint8_t *_dest[4], int dstW, int chrDstW) \
 369 { \
 370     const int32_t **lumSrc  = (const int32_t **) _lumSrc, \
 371                   **chrUSrc = (const int32_t **) _chrUSrc, \
 372                   **chrVSrc = (const int32_t **) _chrVSrc, \
 373                   **alpSrc  = (const int32_t **) _alpSrc; \
 374     yuv2yuvX16_c_template(lumFilter, lumSrc, lumFilterSize, \
 375                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 376                           alpSrc, (uint16_t **) _dest, \
 377                           dstW, chrDstW, is_be, bits); \
 378 }
 379 yuv2NBPS( 9, BE, 1);
 380 yuv2NBPS( 9, LE, 0);
 381 yuv2NBPS(10, BE, 1);
 382 yuv2NBPS(10, LE, 0);
 383 yuv2NBPS(16, BE, 1);
 384 yuv2NBPS(16, LE, 0);
 385
 386 static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
 387                        const int16_t **lumSrc, int lumFilterSize,
 388                        const int16_t *chrFilter, const int16_t **chrUSrc,
 389                        const int16_t **chrVSrc,
 390                        int chrFilterSize, const int16_t **alpSrc,
 391                        uint8_t *dest[4], int dstW, int chrDstW)
 392 {
 393     uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 394             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 395     int i;
 396     const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
 397
 398     //FIXME Optimize (just quickly written not optimized..)
 399     for (i=0; i<dstW; i++) {
 400         int val = lumDither[i & 7] << 12;
 401         int j;
 402         for (j=0; j<lumFilterSize; j++)
 403             val += lumSrc[j][i] * lumFilter[j];
 404
 405         yDest[i]= av_clip_uint8(val>>19);
 406     }
 407
 408     if (uDest)
 409         for (i=0; i<chrDstW; i++) {
 410             int u = chrDither[i & 7] << 12;
 411             int v = chrDither[(i + 3) & 7] << 12;
 412             int j;
 413             for (j=0; j<chrFilterSize; j++) {
 414                 u += chrUSrc[j][i] * chrFilter[j];
 415                 v += chrVSrc[j][i] * chrFilter[j];
 416             }
 417
 418             uDest[i]= av_clip_uint8(u>>19);
 419             vDest[i]= av_clip_uint8(v>>19);
 420         }
 421
 422     if (CONFIG_SWSCALE_ALPHA && aDest)
 423         for (i=0; i<dstW; i++) {
 424             int val = lumDither[i & 7] << 12;
 425             int j;
 426             for (j=0; j<lumFilterSize; j++)
 427                 val += alpSrc[j][i] * lumFilter[j];
 428
 429             aDest[i]= av_clip_uint8(val>>19);
 430         }
 431 }
 432
 433 static void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc,
 434                        const int16_t *chrUSrc, const int16_t *chrVSrc,
 435                        const int16_t *alpSrc,
 436                        uint8_t *dest[4], int dstW, int chrDstW)
 437 {
 438     uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 439             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 440     int i;
 441     const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
 442
 443     for (i=0; i<dstW; i++) {
 444         int val = (lumSrc[i]+  lumDither[i & 7]) >> 7;
 445         yDest[i]= av_clip_uint8(val);
 446     }
 447
 448     if (uDest)
 449         for (i=0; i<chrDstW; i++) {
 450             int u = (chrUSrc[i] + chrDither[i & 7])       >> 7;
 451             int v = (chrVSrc[i] + chrDither[(i + 3) & 7]) >> 7;
 452             uDest[i]= av_clip_uint8(u);
 453             vDest[i]= av_clip_uint8(v);
 454         }
 455
 456     if (CONFIG_SWSCALE_ALPHA && aDest)
 457         for (i=0; i<dstW; i++) {
 458             int val = (alpSrc[i] + lumDither[i & 7]) >> 7;
 459             aDest[i]= av_clip_uint8(val);
 460         }
 461 }
 462
 463 static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
 464                         const int16_t **lumSrc, int lumFilterSize,
 465                         const int16_t *chrFilter, const int16_t **chrUSrc,
 466                         const int16_t **chrVSrc, int chrFilterSize,
 467                         const int16_t **alpSrc, uint8_t *dest[4],
 468                         int dstW, int chrDstW)
 469 {
 470     uint8_t *yDest = dest[0], *uDest = dest[1];
 471     enum PixelFormat dstFormat = c->dstFormat;
 472     const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
 473
 474     //FIXME Optimize (just quickly written not optimized..)
 475     int i;
 476     for (i=0; i<dstW; i++) {
 477         int val = lumDither[i & 7] << 12;
 478         int j;
 479         for (j=0; j<lumFilterSize; j++)
 480             val += lumSrc[j][i] * lumFilter[j];
 481
 482         yDest[i]= av_clip_uint8(val>>19);
 483     }
 484
 485     if (!uDest)
 486         return;
 487
 488     if (dstFormat == PIX_FMT_NV12)
 489         for (i=0; i<chrDstW; i++) {
 490             int u = chrDither[i & 7] << 12;
 491             int v = chrDither[(i + 3) & 7] << 12;
 492             int j;
 493             for (j=0; j<chrFilterSize; j++) {
 494                 u += chrUSrc[j][i] * chrFilter[j];
 495                 v += chrVSrc[j][i] * chrFilter[j];
 496             }
 497
 498             uDest[2*i]= av_clip_uint8(u>>19);
 499             uDest[2*i+1]= av_clip_uint8(v>>19);
 500         }
 501     else
 502         for (i=0; i<chrDstW; i++) {
 503             int u = chrDither[i & 7] << 12;
 504             int v = chrDither[(i + 3) & 7] << 12;
 505             int j;
 506             for (j=0; j<chrFilterSize; j++) {
 507                 u += chrUSrc[j][i] * chrFilter[j];
 508                 v += chrVSrc[j][i] * chrFilter[j];
 509             }
 510
 511             uDest[2*i]= av_clip_uint8(v>>19);
 512             uDest[2*i+1]= av_clip_uint8(u>>19);
 513         }
 514 }
 515
 516 #define output_pixel(pos, val) \
 517         if (target == PIX_FMT_GRAY16BE) { \
 518             AV_WB16(pos, val); \
 519         } else { \
 520             AV_WL16(pos, val); \
 521         }
 522
 523 static av_always_inline void
 524 yuv2gray16_X_c_template(SwsContext *c, const int16_t *lumFilter,
 525                         const int32_t **lumSrc, int lumFilterSize,
 526                         const int16_t *chrFilter, const int32_t **chrUSrc,
 527                         const int32_t **chrVSrc, int chrFilterSize,
 528                         const int32_t **alpSrc, uint16_t *dest, int dstW,
 529                         int y, enum PixelFormat target)
 530 {
 531     int i;
 532
 533     for (i = 0; i < (dstW >> 1); i++) {
 534         int j;
 535         int Y1 = 1 << 14;
 536         int Y2 = 1 << 14;
 537
 538         for (j = 0; j < lumFilterSize; j++) {
 539             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 540             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 541         }
 542         Y1 >>= 15;
 543         Y2 >>= 15;
 544         if ((Y1 | Y2) & 0x10000) {
 545             Y1 = av_clip_uint16(Y1);
 546             Y2 = av_clip_uint16(Y2);
 547         }
 548         output_pixel(&dest[i * 2 + 0], Y1);
 549         output_pixel(&dest[i * 2 + 1], Y2);
 550     }
 551 }
 552
 553 static av_always_inline void
 554 yuv2gray16_2_c_template(SwsContext *c, const int32_t *buf[2],
 555                         const int32_t *ubuf[2], const int32_t *vbuf[2],
 556                         const int32_t *abuf[2], uint16_t *dest, int dstW,
 557                         int yalpha, int uvalpha, int y,
 558                         enum PixelFormat target)
 559 {
 560     int  yalpha1 = 4095 - yalpha;
 561     int i;
 562     const int32_t *buf0 = buf[0], *buf1 = buf[1];
 563
 564     for (i = 0; i < (dstW >> 1); i++) {
 565         int Y1 = (buf0[i * 2    ] * yalpha1 + buf1[i * 2    ] * yalpha) >> 15;
 566         int Y2 = (buf0[i * 2 + 1] * yalpha1 + buf1[i * 2 + 1] * yalpha) >> 15;
 567
 568         output_pixel(&dest[i * 2 + 0], Y1);
 569         output_pixel(&dest[i * 2 + 1], Y2);
 570     }
 571 }
 572
 573 static av_always_inline void
 574 yuv2gray16_1_c_template(SwsContext *c, const int32_t *buf0,
 575                         const int32_t *ubuf[2], const int32_t *vbuf[2],
 576                         const int32_t *abuf0, uint16_t *dest, int dstW,
 577                         int uvalpha, int y, enum PixelFormat target)
 578 {
 579     int i;
 580
 581     for (i = 0; i < (dstW >> 1); i++) {
 582         int Y1 = (buf0[i * 2    ]+4)>>3;
 583         int Y2 = (buf0[i * 2 + 1]+4)>>3;
 584
 585         output_pixel(&dest[i * 2 + 0], Y1);
 586         output_pixel(&dest[i * 2 + 1], Y2);
 587     }
 588 }
 589
 590 #undef output_pixel
 591
 592 #define YUV2PACKED16WRAPPER(name, base, ext, fmt) \
 593 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
 594                         const int16_t **_lumSrc, int lumFilterSize, \
 595                         const int16_t *chrFilter, const int16_t **_chrUSrc, \
 596                         const int16_t **_chrVSrc, int chrFilterSize, \
 597                         const int16_t **_alpSrc, uint8_t *_dest, int dstW, \
 598                         int y) \
 599 { \
 600     const int32_t **lumSrc  = (const int32_t **) _lumSrc, \
 601                   **chrUSrc = (const int32_t **) _chrUSrc, \
 602                   **chrVSrc = (const int32_t **) _chrVSrc, \
 603                   **alpSrc  = (const int32_t **) _alpSrc; \
 604     uint16_t *dest = (uint16_t *) _dest; \
 605     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
 606                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 607                           alpSrc, dest, dstW, y, fmt); \
 608 } \
 609  \
 610 static void name ## ext ## _2_c(SwsContext *c, const int16_t *_buf[2], \
 611                         const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
 612                         const int16_t *_abuf[2], uint8_t *_dest, int dstW, \
 613                         int yalpha, int uvalpha, int y) \
 614 { \
 615     const int32_t **buf  = (const int32_t **) _buf, \
 616                   **ubuf = (const int32_t **) _ubuf, \
 617                   **vbuf = (const int32_t **) _vbuf, \
 618                   **abuf = (const int32_t **) _abuf; \
 619     uint16_t *dest = (uint16_t *) _dest; \
 620     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
 621                           dest, dstW, yalpha, uvalpha, y, fmt); \
 622 } \
 623  \
 624 static void name ## ext ## _1_c(SwsContext *c, const int16_t *_buf0, \
 625                         const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
 626                         const int16_t *_abuf0, uint8_t *_dest, int dstW, \
 627                         int uvalpha, int y) \
 628 { \
 629     const int32_t *buf0  = (const int32_t *)  _buf0, \
 630                  **ubuf  = (const int32_t **) _ubuf, \
 631                  **vbuf  = (const int32_t **) _vbuf, \
 632                   *abuf0 = (const int32_t *)  _abuf0; \
 633     uint16_t *dest = (uint16_t *) _dest; \
 634     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
 635                                   dstW, uvalpha, y, fmt); \
 636 }
 637
 638 YUV2PACKED16WRAPPER(yuv2gray16,, LE, PIX_FMT_GRAY16LE);
 639 YUV2PACKED16WRAPPER(yuv2gray16,, BE, PIX_FMT_GRAY16BE);
 640
 641 #define output_pixel(pos, acc) \
 642     if (target == PIX_FMT_MONOBLACK) { \
 643         pos = acc; \
 644     } else { \
 645         pos = ~acc; \
 646     }
 647
 648 static av_always_inline void
 649 yuv2mono_X_c_template(SwsContext *c, const int16_t *lumFilter,
 650                       const int16_t **lumSrc, int lumFilterSize,
 651                       const int16_t *chrFilter, const int16_t **chrUSrc,
 652                       const int16_t **chrVSrc, int chrFilterSize,
 653                       const int16_t **alpSrc, uint8_t *dest, int dstW,
 654                       int y, enum PixelFormat target)
 655 {
 656     const uint8_t * const d128=dither_8x8_220[y&7];
 657     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 658     int i;
 659     int acc = 0;
 660
 661     for (i = 0; i < dstW - 1; i += 2) {
 662         int j;
 663         int Y1 = 1 << 18;
 664         int Y2 = 1 << 18;
 665
 666         for (j = 0; j < lumFilterSize; j++) {
 667             Y1 += lumSrc[j][i]   * lumFilter[j];
 668             Y2 += lumSrc[j][i+1] * lumFilter[j];
 669         }
 670         Y1 >>= 19;
 671         Y2 >>= 19;
 672         if ((Y1 | Y2) & 0x100) {
 673             Y1 = av_clip_uint8(Y1);
 674             Y2 = av_clip_uint8(Y2);
 675         }
 676         acc += acc + g[Y1 + d128[(i + 0) & 7]];
 677         acc += acc + g[Y2 + d128[(i + 1) & 7]];
 678         if ((i & 7) == 6) {
 679             output_pixel(*dest++, acc);
 680         }
 681     }
 682 }
 683
 684 static av_always_inline void
 685 yuv2mono_2_c_template(SwsContext *c, const int16_t *buf[2],
 686                       const int16_t *ubuf[2], const int16_t *vbuf[2],
 687                       const int16_t *abuf[2], uint8_t *dest, int dstW,
 688                       int yalpha, int uvalpha, int y,
 689                       enum PixelFormat target)
 690 {
 691     const int16_t *buf0  = buf[0],  *buf1  = buf[1];
 692     const uint8_t * const d128 = dither_8x8_220[y & 7];
 693     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 694     int  yalpha1 = 4095 - yalpha;
 695     int i;
 696
 697     for (i = 0; i < dstW - 7; i += 8) {
 698         int acc =    g[((buf0[i    ] * yalpha1 + buf1[i    ] * yalpha) >> 19) + d128[0]];
 699         acc += acc + g[((buf0[i + 1] * yalpha1 + buf1[i + 1] * yalpha) >> 19) + d128[1]];
 700         acc += acc + g[((buf0[i + 2] * yalpha1 + buf1[i + 2] * yalpha) >> 19) + d128[2]];
 701         acc += acc + g[((buf0[i + 3] * yalpha1 + buf1[i + 3] * yalpha) >> 19) + d128[3]];
 702         acc += acc + g[((buf0[i + 4] * yalpha1 + buf1[i + 4] * yalpha) >> 19) + d128[4]];
 703         acc += acc + g[((buf0[i + 5] * yalpha1 + buf1[i + 5] * yalpha) >> 19) + d128[5]];
 704         acc += acc + g[((buf0[i + 6] * yalpha1 + buf1[i + 6] * yalpha) >> 19) + d128[6]];
 705         acc += acc + g[((buf0[i + 7] * yalpha1 + buf1[i + 7] * yalpha) >> 19) + d128[7]];
 706         output_pixel(*dest++, acc);
 707     }
 708 }
 709
 710 static av_always_inline void
 711 yuv2mono_1_c_template(SwsContext *c, const int16_t *buf0,
 712                       const int16_t *ubuf[2], const int16_t *vbuf[2],
 713                       const int16_t *abuf0, uint8_t *dest, int dstW,
 714                       int uvalpha, int y, enum PixelFormat target)
 715 {
 716     const uint8_t * const d128 = dither_8x8_220[y & 7];
 717     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 718     int i;
 719
 720     for (i = 0; i < dstW - 7; i += 8) {
 721         int acc =    g[(buf0[i    ] >> 7) + d128[0]];
 722         acc += acc + g[(buf0[i + 1] >> 7) + d128[1]];
 723         acc += acc + g[(buf0[i + 2] >> 7) + d128[2]];
 724         acc += acc + g[(buf0[i + 3] >> 7) + d128[3]];
 725         acc += acc + g[(buf0[i + 4] >> 7) + d128[4]];
 726         acc += acc + g[(buf0[i + 5] >> 7) + d128[5]];
 727         acc += acc + g[(buf0[i + 6] >> 7) + d128[6]];
 728         acc += acc + g[(buf0[i + 7] >> 7) + d128[7]];
 729         output_pixel(*dest++, acc);
 730     }
 731 }
 732
#undef output_pixel

/*
 * YUV2PACKEDWRAPPER(name, base, ext, fmt) defines three static functions:
 *   <name><ext>_X_c, <name><ext>_2_c and <name><ext>_1_c.
 * Each of them forwards all of its arguments unchanged to the matching
 * <name><base>_X_c_template / _2_c_template / _1_c_template function and
 * appends the constant pixel format 'fmt' as the final argument.
 */
#define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
                                const int16_t **lumSrc, int lumFilterSize, \
                                const int16_t *chrFilter, const int16_t **chrUSrc, \
                                const int16_t **chrVSrc, int chrFilterSize, \
                                const int16_t **alpSrc, uint8_t *dest, int dstW, \
                                int y) \
{ \
    name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
                                  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
                                  alpSrc, dest, dstW, y, fmt); \
} \
 \
static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
                                const int16_t *ubuf[2], const int16_t *vbuf[2], \
                                const int16_t *abuf[2], uint8_t *dest, int dstW, \
                                int yalpha, int uvalpha, int y) \
{ \
    name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
                                  dest, dstW, yalpha, uvalpha, y, fmt); \
} \
 \
static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
                                const int16_t *ubuf[2], const int16_t *vbuf[2], \
                                const int16_t *abuf0, uint8_t *dest, int dstW, \
                                int uvalpha, int y) \
{ \
    name ## base ## _1_c_template(c, buf0, ubuf, vbuf, \
                                  abuf0, dest, dstW, uvalpha, \
                                  y, fmt); \
}

/* Instantiates yuv2monowhite_{X,2,1}_c and yuv2monoblack_{X,2,1}_c on top of
 * the yuv2mono_{X,2,1}_c_template functions. */
YUV2PACKEDWRAPPER(yuv2mono,, white, PIX_FMT_MONOWHITE);
YUV2PACKEDWRAPPER(yuv2mono,, black, PIX_FMT_MONOBLACK);
 769
/*
 * Stores one horizontally adjacent pixel pair as four bytes at
 * dest[pos] .. dest[pos + 3]. The byte order is Y1 U Y2 V when 'target' is
 * PIX_FMT_YUYV422 and U Y1 V Y2 for any other target.
 * Expands to a bare if/else statement and relies on 'dest' and 'target'
 * being visible in the scope where it is used.
 */
#define output_pixels(pos, Y1, U, Y2, V) \
    if (target == PIX_FMT_YUYV422) { \
        dest[pos + 0] = Y1; \
        dest[pos + 1] = U;  \
        dest[pos + 2] = Y2; \
        dest[pos + 3] = V;  \
    } else { \
        dest[pos + 0] = U;  \
        dest[pos + 1] = Y1; \
        dest[pos + 2] = V;  \
        dest[pos + 3] = Y2; \
    }
 782
 783 static av_always_inline void
 784 yuv2422_X_c_template(SwsContext *c, const int16_t *lumFilter,
 785                      const int16_t **lumSrc, int lumFilterSize,
 786                      const int16_t *chrFilter, const int16_t **chrUSrc,
 787                      const int16_t **chrVSrc, int chrFilterSize,
 788                      const int16_t **alpSrc, uint8_t *dest, int dstW,
 789                      int y, enum PixelFormat target)
 790 {
 791     int i;
 792
 793     for (i = 0; i < (dstW >> 1); i++) {
 794         int j;
 795         int Y1 = 1 << 18;
 796         int Y2 = 1 << 18;
 797         int U  = 1 << 18;
 798         int V  = 1 << 18;
 799
 800         for (j = 0; j < lumFilterSize; j++) {
 801             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 802             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 803         }
 804         for (j = 0; j < chrFilterSize; j++) {
 805             U += chrUSrc[j][i] * chrFilter[j];
 806             V += chrVSrc[j][i] * chrFilter[j];
 807         }
 808         Y1 >>= 19;
 809         Y2 >>= 19;
 810         U  >>= 19;
 811         V  >>= 19;
 812         if ((Y1 | Y2 | U | V) & 0x100) {
 813             Y1 = av_clip_uint8(Y1);
 814             Y2 = av_clip_uint8(Y2);
 815             U  = av_clip_uint8(U);
 816             V  = av_clip_uint8(V);
 817         }
 818         output_pixels(4*i, Y1, U, Y2, V);
 819     }
 820 }
 821
 822 static av_always_inline void
 823 yuv2422_2_c_template(SwsContext *c, const int16_t *buf[2],
 824                      const int16_t *ubuf[2], const int16_t *vbuf[2],
 825                      const int16_t *abuf[2], uint8_t *dest, int dstW,
 826                      int yalpha, int uvalpha, int y,
 827                      enum PixelFormat target)
 828 {
 829     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
 830                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 831                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 832     int  yalpha1 = 4095 - yalpha;
 833     int uvalpha1 = 4095 - uvalpha;
 834     int i;
 835
 836     for (i = 0; i < (dstW >> 1); i++) {
 837         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
 838         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
 839         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
 840         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
 841
 842         output_pixels(i * 4, Y1, U, Y2, V);
 843     }
 844 }
 845
 846 static av_always_inline void
 847 yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
 848                      const int16_t *ubuf[2], const int16_t *vbuf[2],
 849                      const int16_t *abuf0, uint8_t *dest, int dstW,
 850                      int uvalpha, int y, enum PixelFormat target)
 851 {
 852     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 853                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 854     int i;
 855
 856     if (uvalpha < 2048) {
 857         for (i = 0; i < (dstW >> 1); i++) {
 858             int Y1 = buf0[i * 2]     >> 7;
 859             int Y2 = buf0[i * 2 + 1] >> 7;
 860             int U  = ubuf1[i]        >> 7;
 861             int V  = vbuf1[i]        >> 7;
 862
 863             output_pixels(i * 4, Y1, U, Y2, V);
 864         }
 865     } else {
 866         for (i = 0; i < (dstW >> 1); i++) {
 867             int Y1 =  buf0[i * 2]          >> 7;
 868             int Y2 =  buf0[i * 2 + 1]      >> 7;
 869             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
 870             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
 871
 872             output_pixels(i * 4, Y1, U, Y2, V);
 873         }
 874     }
 875 }
 876
#undef output_pixels

/* Instantiates yuv2yuyv422_{X,2,1}_c and yuv2uyvy422_{X,2,1}_c; both sets
 * call the yuv2422_{X,2,1}_c_template functions with their pixel format. */
YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, PIX_FMT_YUYV422);
YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, PIX_FMT_UYVY422);
 881
/*
 * Channel selectors for the 48-bit RGB writers: R_B evaluates to R for the
 * RGB48 targets and to B for any other target, B_R is the opposite. Both
 * rely on locals named R, B and 'target' at the point of use.
 */
#define R_B ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? R : B)
#define B_R ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? B : R)
/*
 * Stores 'val' at 'pos' through AV_WB16 when isBE(target) is true and through
 * AV_WL16 otherwise (big-/little-endian 16-bit stores, judging by the names).
 * Expands to a bare if/else statement.
 */
#define output_pixel(pos, val) \
    if (isBE(target)) { \
        AV_WB16(pos, val); \
    } else { \
        AV_WL16(pos, val); \
    }
 890
 891 static av_always_inline void
 892 yuv2rgb48_X_c_template(SwsContext *c, const int16_t *lumFilter,
 893                        const int32_t **lumSrc, int lumFilterSize,
 894                        const int16_t *chrFilter, const int32_t **chrUSrc,
 895                        const int32_t **chrVSrc, int chrFilterSize,
 896                        const int32_t **alpSrc, uint16_t *dest, int dstW,
 897                        int y, enum PixelFormat target)
 898 {
 899     int i;
 900
 901     for (i = 0; i < (dstW >> 1); i++) {
 902         int j;
 903         int Y1 = 0;
 904         int Y2 = 0;
 905         int U  = -128 << 23; // 19
 906         int V  = -128 << 23;
 907         int R, G, B;
 908
 909         for (j = 0; j < lumFilterSize; j++) {
 910             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 911             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 912         }
 913         for (j = 0; j < chrFilterSize; j++) {
 914             U += chrUSrc[j][i] * chrFilter[j];
 915             V += chrVSrc[j][i] * chrFilter[j];
 916         }
 917
 918         // 8bit: 12+15=27; 16-bit: 12+19=31
 919         Y1 >>= 14; // 10
 920         Y2 >>= 14;
 921         U  >>= 14;
 922         V  >>= 14;
 923
 924         // 8bit: 27 -> 17bit, 16bit: 31 - 14 = 17bit
 925         Y1 -= c->yuv2rgb_y_offset;
 926         Y2 -= c->yuv2rgb_y_offset;
 927         Y1 *= c->yuv2rgb_y_coeff;
 928         Y2 *= c->yuv2rgb_y_coeff;
 929         Y1 += 1 << 13; // 21
 930         Y2 += 1 << 13;
 931         // 8bit: 17 + 13bit = 30bit, 16bit: 17 + 13bit = 30bit
 932
 933         R = V * c->yuv2rgb_v2r_coeff;
 934         G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 935         B =                            U * c->yuv2rgb_u2b_coeff;
 936
 937         // 8bit: 30 - 22 = 8bit, 16bit: 30bit - 14 = 16bit
 938         output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 939         output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 940         output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 941         output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 942         output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 943         output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 944         dest += 6;
 945     }
 946 }
 947
 948 static av_always_inline void
 949 yuv2rgb48_2_c_template(SwsContext *c, const int32_t *buf[2],
 950                        const int32_t *ubuf[2], const int32_t *vbuf[2],
 951                        const int32_t *abuf[2], uint16_t *dest, int dstW,
 952                        int yalpha, int uvalpha, int y,
 953                        enum PixelFormat target)
 954 {
 955     const int32_t *buf0  = buf[0],  *buf1  = buf[1],
 956                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 957                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 958     int  yalpha1 = 4095 - yalpha;
 959     int uvalpha1 = 4095 - uvalpha;
 960     int i;
 961
 962     for (i = 0; i < (dstW >> 1); i++) {
 963         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha) >> 14;
 964         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha) >> 14;
 965         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha + (-128 << 23)) >> 14;
 966         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha + (-128 << 23)) >> 14;
 967         int R, G, B;
 968
 969         Y1 -= c->yuv2rgb_y_offset;
 970         Y2 -= c->yuv2rgb_y_offset;
 971         Y1 *= c->yuv2rgb_y_coeff;
 972         Y2 *= c->yuv2rgb_y_coeff;
 973         Y1 += 1 << 13;
 974         Y2 += 1 << 13;
 975
 976         R = V * c->yuv2rgb_v2r_coeff;
 977         G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 978         B =                            U * c->yuv2rgb_u2b_coeff;
 979
 980         output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 981         output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 982         output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 983         output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 984         output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 985         output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 986         dest += 6;
 987     }
 988 }
 989
 990 static av_always_inline void
 991 yuv2rgb48_1_c_template(SwsContext *c, const int32_t *buf0,
 992                        const int32_t *ubuf[2], const int32_t *vbuf[2],
 993                        const int32_t *abuf0, uint16_t *dest, int dstW,
 994                        int uvalpha, int y, enum PixelFormat target)
 995 {
 996     const int32_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 997                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 998     int i;
 999
1000     if (uvalpha < 2048) {
1001         for (i = 0; i < (dstW >> 1); i++) {
1002             int Y1 = (buf0[i * 2]    ) >> 2;
1003             int Y2 = (buf0[i * 2 + 1]) >> 2;
1004             int U  = (ubuf0[i] + (-128 << 11)) >> 2;
1005             int V  = (vbuf0[i] + (-128 << 11)) >> 2;
1006             int R, G, B;
1007
1008             Y1 -= c->yuv2rgb_y_offset;
1009             Y2 -= c->yuv2rgb_y_offset;
1010             Y1 *= c->yuv2rgb_y_coeff;
1011             Y2 *= c->yuv2rgb_y_coeff;
1012             Y1 += 1 << 13;
1013             Y2 += 1 << 13;
1014
1015             R = V * c->yuv2rgb_v2r_coeff;
1016             G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
1017             B =                            U * c->yuv2rgb_u2b_coeff;
1018
1019             output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
1020             output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
1021             output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
1022             output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
1023             output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
1024             output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
1025             dest += 6;
1026         }
1027     } else {
1028         for (i = 0; i < (dstW >> 1); i++) {
1029             int Y1 = (buf0[i * 2]    ) >> 2;
1030             int Y2 = (buf0[i * 2 + 1]) >> 2;
1031             int U  = (ubuf0[i] + ubuf1[i] + (-128 << 12)) >> 3;
1032             int V  = (vbuf0[i] + vbuf1[i] + (-128 << 12)) >> 3;
1033             int R, G, B;
1034
1035             Y1 -= c->yuv2rgb_y_offset;
1036             Y2 -= c->yuv2rgb_y_offset;
1037             Y1 *= c->yuv2rgb_y_coeff;
1038             Y2 *= c->yuv2rgb_y_coeff;
1039             Y1 += 1 << 13;
1040             Y2 += 1 << 13;
1041
1042             R = V * c->yuv2rgb_v2r_coeff;
1043             G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
1044             B =                            U * c->yuv2rgb_u2b_coeff;
1045
1046             output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
1047             output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
1048             output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
1049             output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
1050             output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
1051             output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
1052             dest += 6;
1053         }
1054     }
1055 }
1056
/* Drop the helper macros of the 48-bit RGB writers. The channel selectors
 * are spelled in upper case (R_B / B_R), so that is what must be undefined. */
#undef output_pixel
#undef R_B
#undef B_R
1060
/* 48-bit RGB/BGR output wrappers built on the yuv2rgb48_*_c_template
 * functions, one set per byte order and component order.
 * NOTE(review): YUV2PACKED16WRAPPER is defined outside this part of the
 * file; presumably it mirrors YUV2PACKEDWRAPPER for the 16-bit-per-component
 * templates - confirm against its definition. */
YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48be, PIX_FMT_RGB48BE);
YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48le, PIX_FMT_RGB48LE);
YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48be, PIX_FMT_BGR48BE);
YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48le, PIX_FMT_BGR48LE);
1065
1066 static av_always_inline void
1067 yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2,
1068               int U, int V, int A1, int A2,
1069               const void *_r, const void *_g, const void *_b, int y,
1070               enum PixelFormat target, int hasAlpha)
1071 {
1072     if (target == PIX_FMT_ARGB || target == PIX_FMT_RGBA ||
1073         target == PIX_FMT_ABGR || target == PIX_FMT_BGRA) {
1074         uint32_t *dest = (uint32_t *) _dest;
1075         const uint32_t *r = (const uint32_t *) _r;
1076         const uint32_t *g = (const uint32_t *) _g;
1077         const uint32_t *b = (const uint32_t *) _b;
1078
1079 #if CONFIG_SMALL
1080         int sh = hasAlpha ? ((target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24) : 0;
1081
1082         dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (hasAlpha ? A1 << sh : 0);
1083         dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (hasAlpha ? A2 << sh : 0);
1084 #else
1085         if (hasAlpha) {
1086             int sh = (target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24;
1087
1088             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (A1 << sh);
1089             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (A2 << sh);
1090         } else {
1091             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
1092             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
1093         }
1094 #endif
1095     } else if (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) {
1096         uint8_t *dest = (uint8_t *) _dest;
1097         const uint8_t *r = (const uint8_t *) _r;
1098         const uint8_t *g = (const uint8_t *) _g;
1099         const uint8_t *b = (const uint8_t *) _b;
1100
1101 #define r_b ((target == PIX_FMT_RGB24) ? r : b)
1102 #define b_r ((target == PIX_FMT_RGB24) ? b : r)
1103
1104         dest[i * 6 + 0] = r_b[Y1];
1105         dest[i * 6 + 1] =   g[Y1];
1106         dest[i * 6 + 2] = b_r[Y1];
1107         dest[i * 6 + 3] = r_b[Y2];
1108         dest[i * 6 + 4] =   g[Y2];
1109         dest[i * 6 + 5] = b_r[Y2];
1110 #undef r_b
1111 #undef b_r
1112     } else if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565 ||
1113                target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555 ||
1114                target == PIX_FMT_RGB444 || target == PIX_FMT_BGR444) {
1115         uint16_t *dest = (uint16_t *) _dest;
1116         const uint16_t *r = (const uint16_t *) _r;
1117         const uint16_t *g = (const uint16_t *) _g;
1118         const uint16_t *b = (const uint16_t *) _b;
1119         int dr1, dg1, db1, dr2, dg2, db2;
1120
1121         if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565) {
1122             dr1 = dither_2x2_8[ y & 1     ][0];
1123             dg1 = dither_2x2_4[ y & 1     ][0];
1124             db1 = dither_2x2_8[(y & 1) ^ 1][0];
1125             dr2 = dither_2x2_8[ y & 1     ][1];
1126             dg2 = dither_2x2_4[ y & 1     ][1];
1127             db2 = dither_2x2_8[(y & 1) ^ 1][1];
1128         } else if (target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555) {
1129             dr1 = dither_2x2_8[ y & 1     ][0];
1130             dg1 = dither_2x2_8[ y & 1     ][1];
1131             db1 = dither_2x2_8[(y & 1) ^ 1][0];
1132             dr2 = dither_2x2_8[ y & 1     ][1];
1133             dg2 = dither_2x2_8[ y & 1     ][0];
1134             db2 = dither_2x2_8[(y & 1) ^ 1][1];
1135         } else {
1136             dr1 = dither_4x4_16[ y & 3     ][0];
1137             dg1 = dither_4x4_16[ y & 3     ][1];
1138             db1 = dither_4x4_16[(y & 3) ^ 3][0];
1139             dr2 = dither_4x4_16[ y & 3     ][1];
1140             dg2 = dither_4x4_16[ y & 3     ][0];
1141             db2 = dither_4x4_16[(y & 3) ^ 3][1];
1142         }
1143
1144         dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
1145         dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
1146     } else /* 8/4-bit */ {
1147         uint8_t *dest = (uint8_t *) _dest;
1148         const uint8_t *r = (const uint8_t *) _r;
1149         const uint8_t *g = (const uint8_t *) _g;
1150         const uint8_t *b = (const uint8_t *) _b;
1151         int dr1, dg1, db1, dr2, dg2, db2;
1152
1153         if (target == PIX_FMT_RGB8 || target == PIX_FMT_BGR8) {
1154             const uint8_t * const d64 = dither_8x8_73[y & 7];
1155             const uint8_t * const d32 = dither_8x8_32[y & 7];
1156             dr1 = dg1 = d32[(i * 2 + 0) & 7];
1157             db1 =       d64[(i * 2 + 0) & 7];
1158             dr2 = dg2 = d32[(i * 2 + 1) & 7];
1159             db2 =       d64[(i * 2 + 1) & 7];
1160         } else {
1161             const uint8_t * const d64  = dither_8x8_73 [y & 7];
1162             const uint8_t * const d128 = dither_8x8_220[y & 7];
1163             dr1 = db1 = d128[(i * 2 + 0) & 7];
1164             dg1 =        d64[(i * 2 + 0) & 7];
1165             dr2 = db2 = d128[(i * 2 + 1) & 7];
1166             dg2 =        d64[(i * 2 + 1) & 7];
1167         }
1168
1169         if (target == PIX_FMT_RGB4 || target == PIX_FMT_BGR4) {
1170             dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
1171                     ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
1172         } else {
1173             dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
1174             dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
1175         }
1176     }
1177 }
1178
1179 static av_always_inline void
1180 yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter,
1181                      const int16_t **lumSrc, int lumFilterSize,
1182                      const int16_t *chrFilter, const int16_t **chrUSrc,
1183                      const int16_t **chrVSrc, int chrFilterSize,
1184                      const int16_t **alpSrc, uint8_t *dest, int dstW,
1185                      int y, enum PixelFormat target, int hasAlpha)
1186 {
1187     int i;
1188
1189     for (i = 0; i < (dstW >> 1); i++) {
1190         int j;
1191         int Y1 = 1 << 18;
1192         int Y2 = 1 << 18;
1193         int U  = 1 << 18;
1194         int V  = 1 << 18;
1195         int av_unused A1, A2;
1196         const void *r, *g, *b;
1197
1198         for (j = 0; j < lumFilterSize; j++) {
1199             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
1200             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
1201         }
1202         for (j = 0; j < chrFilterSize; j++) {
1203             U += chrUSrc[j][i] * chrFilter[j];
1204             V += chrVSrc[j][i] * chrFilter[j];
1205         }
1206         Y1 >>= 19;
1207         Y2 >>= 19;
1208         U  >>= 19;
1209         V  >>= 19;
1210         if ((Y1 | Y2 | U | V) & 0x100) {
1211             Y1 = av_clip_uint8(Y1);
1212             Y2 = av_clip_uint8(Y2);
1213             U  = av_clip_uint8(U);
1214             V  = av_clip_uint8(V);
1215         }
1216         if (hasAlpha) {
1217             A1 = 1 << 18;
1218             A2 = 1 << 18;
1219             for (j = 0; j < lumFilterSize; j++) {
1220                 A1 += alpSrc[j][i * 2    ] * lumFilter[j];
1221                 A2 += alpSrc[j][i * 2 + 1] * lumFilter[j];
1222             }
1223             A1 >>= 19;
1224             A2 >>= 19;
1225             if ((A1 | A2) & 0x100) {
1226                 A1 = av_clip_uint8(A1);
1227                 A2 = av_clip_uint8(A2);
1228             }
1229         }
1230
1231         /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/
1232         r =  c->table_rV[V];
1233         g = (c->table_gU[U] + c->table_gV[V]);
1234         b =  c->table_bU[U];
1235
1236         yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1237                       r, g, b, y, target, hasAlpha);
1238     }
1239 }
1240
1241 static av_always_inline void
1242 yuv2rgb_2_c_template(SwsContext *c, const int16_t *buf[2],
1243                      const int16_t *ubuf[2], const int16_t *vbuf[2],
1244                      const int16_t *abuf[2], uint8_t *dest, int dstW,
1245                      int yalpha, int uvalpha, int y,
1246                      enum PixelFormat target, int hasAlpha)
1247 {
1248     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
1249                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1250                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
1251                   *abuf0 = hasAlpha ? abuf[0] : NULL,
1252                   *abuf1 = hasAlpha ? abuf[1] : NULL;
1253     int  yalpha1 = 4095 - yalpha;
1254     int uvalpha1 = 4095 - uvalpha;
1255     int i;
1256
1257     for (i = 0; i < (dstW >> 1); i++) {
1258         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
1259         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
1260         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
1261         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
1262         int A1, A2;
1263         const void *r =  c->table_rV[V],
1264                    *g = (c->table_gU[U] + c->table_gV[V]),
1265                    *b =  c->table_bU[U];
1266
1267         if (hasAlpha) {
1268             A1 = (abuf0[i * 2    ] * yalpha1 + abuf1[i * 2    ] * yalpha) >> 19;
1269             A2 = (abuf0[i * 2 + 1] * yalpha1 + abuf1[i * 2 + 1] * yalpha) >> 19;
1270         }
1271
1272         yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1273                       r, g, b, y, target, hasAlpha);
1274     }
1275 }
1276
1277 static av_always_inline void
1278 yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0,
1279                      const int16_t *ubuf[2], const int16_t *vbuf[2],
1280                      const int16_t *abuf0, uint8_t *dest, int dstW,
1281                      int uvalpha, int y, enum PixelFormat target,
1282                      int hasAlpha)
1283 {
1284     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1285                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
1286     int i;
1287
1288     if (uvalpha < 2048) {
1289         for (i = 0; i < (dstW >> 1); i++) {
1290             int Y1 = buf0[i * 2]     >> 7;
1291             int Y2 = buf0[i * 2 + 1] >> 7;
1292             int U  = ubuf1[i]        >> 7;
1293             int V  = vbuf1[i]        >> 7;
1294             int A1, A2;
1295             const void *r =  c->table_rV[V],
1296                        *g = (c->table_gU[U] + c->table_gV[V]),
1297                        *b =  c->table_bU[U];
1298
1299             if (hasAlpha) {
1300                 A1 = abuf0[i * 2    ] >> 7;
1301                 A2 = abuf0[i * 2 + 1] >> 7;
1302             }
1303
1304             yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1305                           r, g, b, y, target, hasAlpha);
1306         }
1307     } else {
1308         for (i = 0; i < (dstW >> 1); i++) {
1309             int Y1 =  buf0[i * 2]          >> 7;
1310             int Y2 =  buf0[i * 2 + 1]      >> 7;
1311             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
1312             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
1313             int A1, A2;
1314             const void *r =  c->table_rV[V],
1315                        *g = (c->table_gU[U] + c->table_gV[V]),
1316                        *b =  c->table_bU[U];
1317
1318             if (hasAlpha) {
1319                 A1 = abuf0[i * 2    ] >> 7;
1320                 A2 = abuf0[i * 2 + 1] >> 7;
1321             }
1322
1323             yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1324                           r, g, b, y, target, hasAlpha);
1325         }
1326     }
1327 }
1328
/*
 * YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) defines the static
 * function <name><ext>_X_c, which forwards its arguments to
 * <name><base>_X_c_template and appends 'fmt' and 'hasAlpha'.
 * 'hasAlpha' is pasted into the wrapper body as an expression, so it may
 * refer to the wrapper's parameter 'c'.
 */
#define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
                                const int16_t **lumSrc, int lumFilterSize, \
                                const int16_t *chrFilter, const int16_t **chrUSrc, \
                                const int16_t **chrVSrc, int chrFilterSize, \
                                const int16_t **alpSrc, uint8_t *dest, int dstW, \
                                int y) \
{ \
    name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
                                  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
                                  alpSrc, dest, dstW, y, fmt, hasAlpha); \
}
/*
 * YUV2RGBWRAPPER(...) defines the _X_c wrapper via YUV2RGBWRAPPERX and adds
 * <name><ext>_2_c and <name><ext>_1_c, which forward to the matching
 * <name><base>_2_c_template / _1_c_template in the same way.
 */
#define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
                                const int16_t *ubuf[2], const int16_t *vbuf[2], \
                                const int16_t *abuf[2], uint8_t *dest, int dstW, \
                                int yalpha, int uvalpha, int y) \
{ \
    name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
                                  dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha); \
} \
 \
static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
                                const int16_t *ubuf[2], const int16_t *vbuf[2], \
                                const int16_t *abuf0, uint8_t *dest, int dstW, \
                                int uvalpha, int y) \
{ \
    name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
                                  dstW, uvalpha, y, fmt, hasAlpha); \
}
1360
/*
 * Table-based RGB output wrappers, all built on yuv2rgb_{X,2,1}_c_template.
 * With CONFIG_SMALL a single 32-bit variant per format is generated and
 * hasAlpha is evaluated inside the wrapper (CONFIG_SWSCALE_ALPHA &&
 * c->alpPixBuf). Otherwise separate variants with constant hasAlpha are
 * generated: a32/a32_1 (alpha, only if CONFIG_SWSCALE_ALPHA) and x32/x32_1
 * (no alpha).
 */
#if CONFIG_SMALL
YUV2RGBWRAPPER(yuv2rgb,,  32_1,  PIX_FMT_RGB32_1,   CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
YUV2RGBWRAPPER(yuv2rgb,,  32,    PIX_FMT_RGB32,     CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
#else
#if CONFIG_SWSCALE_ALPHA
YUV2RGBWRAPPER(yuv2rgb,, a32_1,  PIX_FMT_RGB32_1,   1);
YUV2RGBWRAPPER(yuv2rgb,, a32,    PIX_FMT_RGB32,     1);
#endif
YUV2RGBWRAPPER(yuv2rgb,, x32_1,  PIX_FMT_RGB32_1,   0);
YUV2RGBWRAPPER(yuv2rgb,, x32,    PIX_FMT_RGB32,     0);
#endif
/* Formats without an alpha channel always pass hasAlpha = 0. */
YUV2RGBWRAPPER(yuv2, rgb, rgb24, PIX_FMT_RGB24,   0);
YUV2RGBWRAPPER(yuv2, rgb, bgr24, PIX_FMT_BGR24,   0);
YUV2RGBWRAPPER(yuv2rgb,,  16,    PIX_FMT_RGB565,    0);
YUV2RGBWRAPPER(yuv2rgb,,  15,    PIX_FMT_RGB555,    0);
YUV2RGBWRAPPER(yuv2rgb,,  12,    PIX_FMT_RGB444,    0);
YUV2RGBWRAPPER(yuv2rgb,,   8,    PIX_FMT_RGB8,      0);
YUV2RGBWRAPPER(yuv2rgb,,   4,    PIX_FMT_RGB4,      0);
YUV2RGBWRAPPER(yuv2rgb,,   4b,   PIX_FMT_RGB4_BYTE, 0);
1380
1381 static av_always_inline void
1382 yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
1383                           const int16_t **lumSrc, int lumFilterSize,
1384                           const int16_t *chrFilter, const int16_t **chrUSrc,
1385                           const int16_t **chrVSrc, int chrFilterSize,
1386                           const int16_t **alpSrc, uint8_t *dest,
1387                           int dstW, int y, enum PixelFormat target, int hasAlpha)
1388 {
1389     int i;
1390     int step = (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) ? 3 : 4;
1391
1392     for (i = 0; i < dstW; i++) {
1393         int j;
1394         int Y = 1<<9;
1395         int U = (1<<9)-(128 << 19);
1396         int V = (1<<9)-(128 << 19);
1397         int av_unused A;
1398         int R, G, B;
1399
1400         for (j = 0; j < lumFilterSize; j++) {
1401             Y += lumSrc[j][i] * lumFilter[j];
1402         }
1403         for (j = 0; j < chrFilterSize; j++) {
1404             U += chrUSrc[j][i] * chrFilter[j];
1405             V += chrVSrc[j][i] * chrFilter[j];
1406         }
1407         Y >>= 10;
1408         U >>= 10;
1409         V >>= 10;
1410         if (hasAlpha) {
1411             A = 1 << 18;
1412             for (j = 0; j < lumFilterSize; j++) {
1413                 A += alpSrc[j][i] * lumFilter[j];
1414             }
1415             A >>= 19;
1416             if (A & 0x100)
1417                 A = av_clip_uint8(A);
1418         }
1419         Y -= c->yuv2rgb_y_offset;
1420         Y *= c->yuv2rgb_y_coeff;
1421         Y += 1 << 21;
1422         R = Y + V*c->yuv2rgb_v2r_coeff;
1423         G = Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;
1424         B = Y +                          U*c->yuv2rgb_u2b_coeff;
1425         if ((R | G | B) & 0xC0000000) {
1426             R = av_clip_uintp2(R, 30);
1427             G = av_clip_uintp2(G, 30);
1428             B = av_clip_uintp2(B, 30);
1429         }
1430
1431         switch(target) {
1432         case PIX_FMT_ARGB:
1433             dest[0] = hasAlpha ? A : 255;
1434             dest[1] = R >> 22;
1435             dest[2] = G >> 22;
1436             dest[3] = B >> 22;
1437             break;
1438         case PIX_FMT_RGB24:
1439             dest[0] = R >> 22;
1440             dest[1] = G >> 22;
1441             dest[2] = B >> 22;
1442             break;
1443         case PIX_FMT_RGBA:
1444             dest[0] = R >> 22;
1445             dest[1] = G >> 22;
1446             dest[2] = B >> 22;
1447             dest[3] = hasAlpha ? A : 255;
1448             break;
1449         case PIX_FMT_ABGR:
1450             dest[0] = hasAlpha ? A : 255;
1451             dest[1] = B >> 22;
1452             dest[2] = G >> 22;
1453             dest[3] = R >> 22;
1454             break;
1455         case PIX_FMT_BGR24:
1456             dest[0] = B >> 22;
1457             dest[1] = G >> 22;
1458             dest[2] = R >> 22;
1459             break;
1460         case PIX_FMT_BGRA:
1461             dest[0] = B >> 22;
1462             dest[1] = G >> 22;
1463             dest[2] = R >> 22;
1464             dest[3] = hasAlpha ? A : 255;
1465             break;
1466         }
1467         dest += step;
1468     }
1469 }
1470
/*
 * Full-chroma RGB output wrappers; only the _X_c variant exists for these
 * and each one calls yuv2rgb_full_X_c_template.
 * With CONFIG_SMALL one wrapper per 32-bit format is generated and hasAlpha
 * is evaluated inside the wrapper (CONFIG_SWSCALE_ALPHA && c->alpPixBuf).
 * Otherwise the alpha variants (bgra/abgr/rgba/argb, only if
 * CONFIG_SWSCALE_ALPHA) and the no-alpha variants (bgrx/xbgr/rgbx/xrgb) are
 * generated with constant hasAlpha.
 */
#if CONFIG_SMALL
YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
#else
#if CONFIG_SWSCALE_ALPHA
YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  1);
YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  1);
YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  1);
YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  1);
#endif
YUV2RGBWRAPPERX(yuv2, rgb_full, bgrx32_full, PIX_FMT_BGRA,  0);
YUV2RGBWRAPPERX(yuv2, rgb_full, xbgr32_full, PIX_FMT_ABGR,  0);
YUV2RGBWRAPPERX(yuv2, rgb_full, rgbx32_full, PIX_FMT_RGBA,  0);
YUV2RGBWRAPPERX(yuv2, rgb_full, xrgb32_full, PIX_FMT_ARGB,  0);
#endif
/* 24-bit formats have no alpha byte. */
YUV2RGBWRAPPERX(yuv2, rgb_full, bgr24_full,  PIX_FMT_BGR24, 0);
YUV2RGBWRAPPERX(yuv2, rgb_full, rgb24_full,  PIX_FMT_RGB24, 0);
1490
1491 static av_always_inline void fillPlane(uint8_t* plane, int stride,
1492                                        int width, int height,
1493                                        int y, uint8_t val)
1494 {
1495     int i;
1496     uint8_t *ptr = plane + stride*y;
1497     for (i=0; i<height; i++) {
1498         memset(ptr, val, width);
1499         ptr += stride;
1500     }
1501 }
1502
1503 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1504
1505 #define r ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? b_r : r_b)
1506 #define b ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? r_b : b_r)
1507
1508 static av_always_inline void
1509 rgb48ToY_c_template(uint16_t *dst, const uint16_t *src, int width,
1510                     enum PixelFormat origin)
1511 {
1512     int i;
1513     for (i = 0; i < width; i++) {
1514         unsigned int r_b = input_pixel(&src[i*3+0]);
1515         unsigned int   g = input_pixel(&src[i*3+1]);
1516         unsigned int b_r = input_pixel(&src[i*3+2]);
1517
1518         dst[i] = (RY*r + GY*g + BY*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1519     }
1520 }
1521
1522 static av_always_inline void
1523 rgb48ToUV_c_template(uint16_t *dstU, uint16_t *dstV,
1524                     const uint16_t *src1, const uint16_t *src2,
1525                     int width, enum PixelFormat origin)
1526 {
1527     int i;
1528     assert(src1==src2);
1529     for (i = 0; i < width; i++) {
1530         int r_b = input_pixel(&src1[i*3+0]);
1531         int   g = input_pixel(&src1[i*3+1]);
1532         int b_r = input_pixel(&src1[i*3+2]);
1533
1534         dstU[i] = (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1535         dstV[i] = (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1536     }
1537 }
1538
1539 static av_always_inline void
1540 rgb48ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
1541                           const uint16_t *src1, const uint16_t *src2,
1542                           int width, enum PixelFormat origin)
1543 {
1544     int i;
1545     assert(src1==src2);
1546     for (i = 0; i < width; i++) {
1547         int r_b = (input_pixel(&src1[6 * i + 0]) + input_pixel(&src1[6 * i + 3]) + 1) >> 1;
1548         int   g = (input_pixel(&src1[6 * i + 1]) + input_pixel(&src1[6 * i + 4]) + 1) >> 1;
1549         int b_r = (input_pixel(&src1[6 * i + 2]) + input_pixel(&src1[6 * i + 5]) + 1) >> 1;
1550
1551         dstU[i]= (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1552         dstV[i]= (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1553     }
1554 }
1555
1556 #undef r
1557 #undef b
1558 #undef input_pixel
1559
1560 #define rgb48funcs(pattern, BE_LE, origin) \
1561 static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src, \
1562                                     int width, uint32_t *unused) \
1563 { \
1564     const uint16_t *src = (const uint16_t *) _src; \
1565     uint16_t *dst = (uint16_t *) _dst; \
1566     rgb48ToY_c_template(dst, src, width, origin); \
1567 } \
1568  \
1569 static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
1570                                     const uint8_t *_src1, const uint8_t *_src2, \
1571                                     int width, uint32_t *unused) \
1572 { \
1573     const uint16_t *src1 = (const uint16_t *) _src1, \
1574                    *src2 = (const uint16_t *) _src2; \
1575     uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
1576     rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin); \
1577 } \
1578  \
1579 static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, \
1580                                     const uint8_t *_src1, const uint8_t *_src2, \
1581                                     int width, uint32_t *unused) \
1582 { \
1583     const uint16_t *src1 = (const uint16_t *) _src1, \
1584                    *src2 = (const uint16_t *) _src2; \
1585     uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
1586     rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin); \
1587 }
1588
1589 rgb48funcs(rgb, LE, PIX_FMT_RGB48LE);
1590 rgb48funcs(rgb, BE, PIX_FMT_RGB48BE);
1591 rgb48funcs(bgr, LE, PIX_FMT_BGR48LE);
1592 rgb48funcs(bgr, BE, PIX_FMT_BGR48BE);
1593
1594 #define input_pixel(i) ((origin == PIX_FMT_RGBA || origin == PIX_FMT_BGRA || \
1595                          origin == PIX_FMT_ARGB || origin == PIX_FMT_ABGR) ? AV_RN32A(&src[(i)*4]) : \
1596                         (isBE(origin) ? AV_RB16(&src[(i)*2]) : AV_RL16(&src[(i)*2])))
1597
1598 static av_always_inline void
1599 rgb16_32ToY_c_template(int16_t *dst, const uint8_t *src,
1600                        int width, enum PixelFormat origin,
1601                        int shr,   int shg,   int shb, int shp,
1602                        int maskr, int maskg, int maskb,
1603                        int rsh,   int gsh,   int bsh, int S)
1604 {
1605     const int ry = RY << rsh, gy = GY << gsh, by = BY << bsh,
1606               rnd = (32<<((S)-1)) + (1<<(S-7));
1607     int i;
1608
1609     for (i = 0; i < width; i++) {
1610         int px = input_pixel(i) >> shp;
1611         int b = (px & maskb) >> shb;
1612         int g = (px & maskg) >> shg;
1613         int r = (px & maskr) >> shr;
1614
1615         dst[i] = (ry * r + gy * g + by * b + rnd) >> ((S)-6);
1616     }
1617 }
1618
1619 static av_always_inline void
1620 rgb16_32ToUV_c_template(int16_t *dstU, int16_t *dstV,
1621                         const uint8_t *src, int width,
1622                         enum PixelFormat origin,
1623                         int shr,   int shg,   int shb, int shp,
1624                         int maskr, int maskg, int maskb,
1625                         int rsh,   int gsh,   int bsh, int S)
1626 {
1627     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1628               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
1629               rnd = (256<<((S)-1)) + (1<<(S-7));
1630     int i;
1631
1632     for (i = 0; i < width; i++) {
1633         int px = input_pixel(i) >> shp;
1634         int b = (px & maskb) >> shb;
1635         int g = (px & maskg) >> shg;
1636         int r = (px & maskr) >> shr;
1637
1638         dstU[i] = (ru * r + gu * g + bu * b + rnd) >> ((S)-6);
1639         dstV[i] = (rv * r + gv * g + bv * b + rnd) >> ((S)-6);
1640     }
1641 }
1642
1643 static av_always_inline void
1644 rgb16_32ToUV_half_c_template(int16_t *dstU, int16_t *dstV,
1645                              const uint8_t *src, int width,
1646                              enum PixelFormat origin,
1647                              int shr,   int shg,   int shb, int shp,
1648                              int maskr, int maskg, int maskb,
1649                              int rsh,   int gsh,   int bsh, int S)
1650 {
1651     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1652               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
1653               rnd = (256U<<(S)) + (1<<(S-6)), maskgx = ~(maskr | maskb);
1654     int i;
1655
1656     maskr |= maskr << 1; maskb |= maskb << 1; maskg |= maskg << 1;
1657     for (i = 0; i < width; i++) {
1658         int px0 = input_pixel(2 * i + 0) >> shp;
1659         int px1 = input_pixel(2 * i + 1) >> shp;
1660         int b, r, g = (px0 & maskgx) + (px1 & maskgx);
1661         int rb = px0 + px1 - g;
1662
1663         b = (rb & maskb) >> shb;
1664         if (shp || origin == PIX_FMT_BGR565LE || origin == PIX_FMT_BGR565BE ||
1665             origin == PIX_FMT_RGB565LE || origin == PIX_FMT_RGB565BE) {
1666             g >>= shg;
1667         } else {
1668             g = (g  & maskg) >> shg;
1669         }
1670         r = (rb & maskr) >> shr;
1671
1672         dstU[i] = (ru * r + gu * g + bu * b + (unsigned)rnd) >> ((S)-6+1);
1673         dstV[i] = (rv * r + gv * g + bv * b + (unsigned)rnd) >> ((S)-6+1);
1674     }
1675 }
1676
1677 #undef input_pixel
1678
1679 #define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr, \
1680                          maskg, maskb, rsh, gsh, bsh, S) \
1681 static void name ## ToY_c(uint8_t *dst, const uint8_t *src, \
1682                           int width, uint32_t *unused) \
1683 { \
1684     rgb16_32ToY_c_template(dst, src, width, fmt, shr, shg, shb, shp, \
1685                            maskr, maskg, maskb, rsh, gsh, bsh, S); \
1686 } \
1687  \
1688 static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
1689                            const uint8_t *src, const uint8_t *dummy, \
1690                            int width, uint32_t *unused) \
1691 { \
1692     rgb16_32ToUV_c_template(dstU, dstV, src, width, fmt, shr, shg, shb, shp, \
1693                             maskr, maskg, maskb, rsh, gsh, bsh, S); \
1694 } \
1695  \
1696 static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
1697                                 const uint8_t *src, const uint8_t *dummy, \
1698                                 int width, uint32_t *unused) \
1699 { \
1700     rgb16_32ToUV_half_c_template(dstU, dstV, src, width, fmt, shr, shg, shb, shp, \
1701                                  maskr, maskg, maskb, rsh, gsh, bsh, S); \
1702 }
1703
1704 rgb16_32_wrapper(PIX_FMT_BGR32,    bgr32,  16, 0,  0, 0, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8);
1705 rgb16_32_wrapper(PIX_FMT_BGR32_1,  bgr321, 16, 0,  0, 8, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8);
1706 rgb16_32_wrapper(PIX_FMT_RGB32,    rgb32,   0, 0, 16, 0,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8);
1707 rgb16_32_wrapper(PIX_FMT_RGB32_1,  rgb321,  0, 0, 16, 8,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8);
1708 rgb16_32_wrapper(PIX_FMT_BGR565LE, bgr16le, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8);
1709 rgb16_32_wrapper(PIX_FMT_BGR555LE, bgr15le, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7);
1710 rgb16_32_wrapper(PIX_FMT_RGB565LE, rgb16le, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8);
1711 rgb16_32_wrapper(PIX_FMT_RGB555LE, rgb15le, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7);
1712 rgb16_32_wrapper(PIX_FMT_BGR565BE, bgr16be, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8);
1713 rgb16_32_wrapper(PIX_FMT_BGR555BE, bgr15be, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7);
1714 rgb16_32_wrapper(PIX_FMT_RGB565BE, rgb16be, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8);
1715 rgb16_32_wrapper(PIX_FMT_RGB555BE, rgb15be, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7);
1716
/**
 * Extract alpha from packed 4-byte pixels whose alpha is the first byte.
 * Each output sample is the alpha byte shifted left by 6.
 *
 * @param dst    output, one int16_t per pixel
 * @param src    input, 4 bytes per pixel
 * @param width  number of pixels
 * @param unused ignored
 */
static void abgrToA_c(int16_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        const uint8_t *px = src + 4 * x;
        dst[x] = px[0] << 6;
        x++;
    }
}
1724
/**
 * Extract alpha from packed 4-byte pixels whose alpha is the last byte.
 * Each output sample is the alpha byte shifted left by 6.
 *
 * @param dst    output, one int16_t per pixel
 * @param src    input, 4 bytes per pixel
 * @param width  number of pixels
 * @param unused ignored
 */
static void rgbaToA_c(int16_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        const uint8_t *px = src + 4 * x;
        dst[x] = px[3] << 6;
        x++;
    }
}
1732
/**
 * Look up each palette index and output bits 24..31 of the palette entry,
 * shifted left by 6.
 *
 * @param dst   output, one int16_t per pixel
 * @param src   input palette indices, one byte per pixel
 * @param width number of pixels
 * @param pal   palette of 32-bit entries indexed by the source byte
 */
static void palToA_c(int16_t *dst, const uint8_t *src, int width, uint32_t *pal)
{
    int x;

    for (x = 0; x < width; x++) {
        const uint32_t entry = pal[src[x]];
        dst[x] = (entry >> 24) << 6;
    }
}
1742
/**
 * Look up each palette index and output the low 8 bits of the palette entry,
 * shifted left by 6.
 *
 * The width parameter is an int, like in palToA_c() and in the function
 * pointer type through which hyscale() invokes these converters
 * (void (*)(uint8_t *, const uint8_t *, int, uint32_t *)).  Declaring it as
 * long made the call through that pointer type mismatch the definition, so
 * on platforms where long is wider than int the loop bound could be read
 * with undefined upper bits.
 *
 * @param dst   output, one int16_t per pixel
 * @param src   input palette indices, one byte per pixel
 * @param width number of pixels
 * @param pal   palette of 32-bit entries indexed by the source byte
 */
static void palToY_c(int16_t *dst, const uint8_t *src, int width, uint32_t *pal)
{
    int i;
    for (i=0; i<width; i++) {
        int d= src[i];

        dst[i]= (pal[d] & 0xFF)<<6;
    }
}
1752
/**
 * Look up each palette index and output bits 8..15 of the entry as U and
 * bits 16..23 as V, each shifted left by 6.
 *
 * @param dstU  output U samples
 * @param dstV  output V samples
 * @param src1  input palette indices, one byte per pixel
 * @param src2  must be equal to src1 (asserted)
 * @param width number of pixels
 * @param pal   palette of 32-bit entries indexed by the source byte
 */
static void palToUV_c(uint16_t *dstU, int16_t *dstV,
                           const uint8_t *src1, const uint8_t *src2,
                           int width, uint32_t *pal)
{
    int x = 0;

    assert(src1 == src2);
    while (x < width) {
        int entry = pal[src1[x]];
        const uint8_t u = (uint8_t)(entry >> 8);
        const uint8_t v = (uint8_t)(entry >> 16);

        dstU[x] = u << 6;
        dstV[x] = v << 6;
        x++;
    }
}
1766
/**
 * Expand 1-bit-per-pixel input, most significant bit first, to int16_t
 * samples.  Every source byte is inverted first, so a set bit yields 0 and a
 * clear bit yields 16383.
 *
 * @param dst    output, one int16_t per pixel
 * @param src    packed bits, 8 pixels per byte
 * @param width  number of pixels
 * @param unused ignored
 */
static void monowhite2Y_c(int16_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    const int full_bytes = width / 8;
    const int tail_bits  = width & 7;
    int byte, bit;

    for (byte = 0; byte < full_bytes; byte++) {
        const int inv = ~src[byte];
        int16_t *out  = dst + 8 * byte;

        for (bit = 0; bit < 8; bit++)
            out[bit] = ((inv >> (7 - bit)) & 1) * 16383;
    }
    /* leftover pixels of a partially used last byte */
    if (tail_bits) {
        const int inv = ~src[byte];
        int16_t *out  = dst + 8 * byte;

        for (bit = 0; bit < tail_bits; bit++)
            out[bit] = ((inv >> (7 - bit)) & 1) * 16383;
    }
}
1781
/**
 * Expand 1-bit-per-pixel input, most significant bit first, to int16_t
 * samples: a set bit yields 16383 and a clear bit yields 0.
 *
 * @param dst    output, one int16_t per pixel
 * @param src    packed bits, 8 pixels per byte
 * @param width  number of pixels
 * @param unused ignored
 */
static void monoblack2Y_c(int16_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    const int full_bytes = width / 8;
    const int tail_bits  = width & 7;
    int byte, bit;

    for (byte = 0; byte < full_bytes; byte++) {
        const int bits = src[byte];
        int16_t *out   = dst + 8 * byte;

        for (bit = 0; bit < 8; bit++)
            out[bit] = ((bits >> (7 - bit)) & 1) * 16383;
    }
    /* leftover pixels of a partially used last byte */
    if (tail_bits) {
        const int bits = src[byte];
        int16_t *out   = dst + 8 * byte;

        for (bit = 0; bit < tail_bits; bit++)
            out[bit] = ((bits >> (7 - bit)) & 1) * 16383;
    }
}
1796
//FIXME yuy2* can read up to 7 samples too much

/**
 * Copy every even-indexed source byte (offsets 0, 2, 4, ...) to dst.
 *
 * @param dst    output, one byte per pixel
 * @param src    packed input, 2 bytes per pixel
 * @param width  number of pixels
 * @param unused ignored
 */
static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        dst[x] = src[2 * x];
        x++;
    }
}
1806
/**
 * From each 4-byte source group copy byte 1 to dstU and byte 3 to dstV.
 *
 * @param dstU   output U samples
 * @param dstV   output V samples
 * @param src1   packed input, 4 bytes per output sample pair
 * @param src2   must be equal to src1 (asserted after the copy)
 * @param width  number of output samples per plane
 * @param unused ignored
 */
static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        const uint8_t *group = src1 + 4 * x;
        dstU[x] = group[1];
        dstV[x] = group[3];
        x++;
    }
    assert(src1 == src2);
}
1817
/**
 * Copy `width` 16-bit samples from _src to _dst, byte-swapping each one.
 *
 * @param _dst   output buffer, accessed as uint16_t
 * @param _src   input buffer, accessed as uint16_t
 * @param width  number of 16-bit samples
 * @param unused ignored
 */
static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, int width, uint32_t *unused)
{
    const uint16_t *in = (const uint16_t *) _src;
    uint16_t *out      = (uint16_t *) _dst;
    int x = 0;

    while (x < width) {
        out[x] = av_bswap16(in[x]);
        x++;
    }
}
1827
/**
 * Copy `width` 16-bit samples from each of the two source planes to the
 * matching destination plane, byte-swapping each sample.
 *
 * @param _dstU,_dstV output buffers, accessed as uint16_t
 * @param _src1       input for _dstU, accessed as uint16_t
 * @param _src2       input for _dstV, accessed as uint16_t
 * @param width       number of 16-bit samples per plane
 * @param unused      ignored
 */
static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src1,
                        const uint8_t *_src2, int width, uint32_t *unused)
{
    const uint16_t *inU = (const uint16_t *) _src1;
    const uint16_t *inV = (const uint16_t *) _src2;
    uint16_t *outU      = (uint16_t *) _dstU;
    uint16_t *outV      = (uint16_t *) _dstV;
    int x = 0;

    while (x < width) {
        outU[x] = av_bswap16(inU[x]);
        outV[x] = av_bswap16(inV[x]);
        x++;
    }
}
1840
/* This is almost identical to the previous, and exists only because
 * yuy2ToY/UV(dst, src+1, ...) would have 100% unaligned accesses. */
/**
 * Copy every odd-indexed source byte (offsets 1, 3, 5, ...) to dst.
 *
 * @param dst    output, one byte per pixel
 * @param src    packed input, 2 bytes per pixel
 * @param width  number of pixels
 * @param unused ignored
 */
static void uyvyToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        dst[x] = src[2 * x + 1];
        x++;
    }
}
1850
/**
 * From each 4-byte source group copy byte 0 to dstU and byte 2 to dstV.
 *
 * @param dstU   output U samples
 * @param dstV   output V samples
 * @param src1   packed input, 4 bytes per output sample pair
 * @param src2   must be equal to src1 (asserted after the copy)
 * @param width  number of output samples per plane
 * @param unused ignored
 */
static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        const uint8_t *group = src1 + 4 * x;
        dstU[x] = group[0];
        dstV[x] = group[2];
        x++;
    }
    assert(src1 == src2);
}
1861
1862 static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
1863                                         const uint8_t *src, int width)
1864 {
1865     int i;
1866     for (i = 0; i < width; i++) {
1867         dst1[i] = src[2*i+0];
1868         dst2[i] = src[2*i+1];
1869     }
1870 }
1871
/**
 * Split interleaved chroma from src1: first byte of each pair to dstU,
 * second byte to dstV.  src2 and unused are not read.
 */
static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
                       const uint8_t *src1, const uint8_t *src2,
                       int width, uint32_t *unused)
{
    uint8_t *const first  = dstU;
    uint8_t *const second = dstV;

    nvXXtoUV_c(first, second, src1, width);
}
1878
/**
 * Split interleaved chroma from src1: first byte of each pair to dstV,
 * second byte to dstU (the reverse of nv12ToUV_c).  src2 and unused are not
 * read.
 */
static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
                       const uint8_t *src1, const uint8_t *src2,
                       int width, uint32_t *unused)
{
    uint8_t *const first  = dstV;
    uint8_t *const second = dstU;

    nvXXtoUV_c(first, second, src1, width);
}
1885
1886 #define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1887
1888 static void bgr24ToY_c(int16_t *dst, const uint8_t *src,
1889                        int width, uint32_t *unused)
1890 {
1891     int i;
1892     for (i=0; i<width; i++) {
1893         int b= src[i*3+0];
1894         int g= src[i*3+1];
1895         int r= src[i*3+2];
1896
1897         dst[i]= ((RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
1898     }
1899 }
1900
1901 static void bgr24ToUV_c(int16_t *dstU, int16_t *dstV, const uint8_t *src1,
1902                         const uint8_t *src2, int width, uint32_t *unused)
1903 {
1904     int i;
1905     for (i=0; i<width; i++) {
1906         int b= src1[3*i + 0];
1907         int g= src1[3*i + 1];
1908         int r= src1[3*i + 2];
1909
1910         dstU[i]= (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1911         dstV[i]= (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1912     }
1913     assert(src1 == src2);
1914 }
1915
1916 static void bgr24ToUV_half_c(int16_t *dstU, int16_t *dstV, const uint8_t *src1,
1917                              const uint8_t *src2, int width, uint32_t *unused)
1918 {
1919     int i;
1920     for (i=0; i<width; i++) {
1921         int b= src1[6*i + 0] + src1[6*i + 3];
1922         int g= src1[6*i + 1] + src1[6*i + 4];
1923         int r= src1[6*i + 2] + src1[6*i + 5];
1924
1925         dstU[i]= (RU*r + GU*g + BU*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1926         dstV[i]= (RV*r + GV*g + BV*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1927     }
1928     assert(src1 == src2);
1929 }
1930
1931 static void rgb24ToY_c(int16_t *dst, const uint8_t *src, int width,
1932                        uint32_t *unused)
1933 {
1934     int i;
1935     for (i=0; i<width; i++) {
1936         int r= src[i*3+0];
1937         int g= src[i*3+1];
1938         int b= src[i*3+2];
1939
1940         dst[i]= ((RY*r + GY*g + BY*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
1941     }
1942 }
1943
1944 static void rgb24ToUV_c(int16_t *dstU, int16_t *dstV, const uint8_t *src1,
1945                         const uint8_t *src2, int width, uint32_t *unused)
1946 {
1947     int i;
1948     assert(src1==src2);
1949     for (i=0; i<width; i++) {
1950         int r= src1[3*i + 0];
1951         int g= src1[3*i + 1];
1952         int b= src1[3*i + 2];
1953
1954         dstU[i]= (RU*r + GU*g + BU*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1955         dstV[i]= (RV*r + GV*g + BV*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
1956     }
1957 }
1958
1959 static void rgb24ToUV_half_c(int16_t *dstU, int16_t *dstV, const uint8_t *src1,
1960                                     const uint8_t *src2, int width, uint32_t *unused)
1961 {
1962     int i;
1963     assert(src1==src2);
1964     for (i=0; i<width; i++) {
1965         int r= src1[6*i + 0] + src1[6*i + 3];
1966         int g= src1[6*i + 1] + src1[6*i + 4];
1967         int b= src1[6*i + 2] + src1[6*i + 5];
1968
1969         dstU[i]= (RU*r + GU*g + BU*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1970         dstV[i]= (RV*r + GV*g + BV*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
1971     }
1972 }
1973
1974 static void hScale16_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *_src,
1975                        const int16_t *filter,
1976                        const int16_t *filterPos, int filterSize)
1977 {
1978     int i;
1979     int32_t *dst = (int32_t *) _dst;
1980     const uint16_t *src = (const uint16_t *) _src;
1981     int bits = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
1982     int sh = (bits <= 7) ? 11 : (bits - 4);
1983
1984     if((isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15)
1985         sh= 9;
1986
1987     for (i = 0; i < dstW; i++) {
1988         int j;
1989         int srcPos = filterPos[i];
1990         int val = 0;
1991
1992         for (j = 0; j < filterSize; j++) {
1993             val += src[srcPos + j] * filter[filterSize * i + j];
1994         }
1995         // filter=14 bit, input=16 bit, output=30 bit, >> 11 makes 19 bit
1996         dst[i] = FFMIN(val >> sh, (1 << 19) - 1);
1997     }
1998 }
1999
2000 // bilinear / bicubic scaling
2001 static void hScale_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *src,
2002                      const int16_t *filter, const int16_t *filterPos,
2003                      int filterSize)
2004 {
2005     int i;
2006     for (i=0; i<dstW; i++) {
2007         int j;
2008         int srcPos= filterPos[i];
2009         int val=0;
2010         for (j=0; j<filterSize; j++) {
2011             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
2012         }
2013         //filter += hFilterSize;
2014         dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
2015         //dst[i] = val>>7;
2016     }
2017 }
2018
/**
 * Horizontal FIR scaling of native-endian 16-bit input samples into int16_t
 * outputs.  Each output is the tap sum shifted right by `shift` and clipped
 * from above to (1 << 15) - 1.  srcW and xInc are not used.
 *
 * @param dst        output samples
 * @param dstW       number of output samples
 * @param src        input samples
 * @param filter     coefficients, filterSize per output sample
 * @param filterPos  first input index for each output sample
 * @param filterSize number of taps per output sample
 * @param shift      right shift applied to each sum
 */
static inline void hScale16N_c(int16_t *dst, int dstW, const uint16_t *src, int srcW, int xInc,
                                    const int16_t *filter, const int16_t *filterPos, long filterSize, int shift)
{
    int x;

    for (x = 0; x < dstW; x++) {
        const int first = filterPos[x];
        int acc = 0;
        int tap;

        for (tap = 0; tap < filterSize; tap++)
            acc += ((int)src[first + tap])*filter[filterSize*x + tap];

        dst[x] = FFMIN(acc>>shift, (1<<15)-1); // the cubic equation does overflow ...
    }
}
2033
/**
 * Same as hScale16N_c, but every input sample is byte-swapped with
 * av_bswap16() before it is multiplied.  srcW and xInc are not used.
 *
 * @param dst        output samples
 * @param dstW       number of output samples
 * @param src        input samples in the opposite byte order
 * @param filter     coefficients, filterSize per output sample
 * @param filterPos  first input index for each output sample
 * @param filterSize number of taps per output sample
 * @param shift      right shift applied to each sum
 */
static inline void hScale16NX_c(int16_t *dst, int dstW, const uint16_t *src, int srcW, int xInc,
                                    const int16_t *filter, const int16_t *filterPos, long filterSize, int shift)
{
    int x;

    for (x = 0; x < dstW; x++) {
        const int first = filterPos[x];
        int acc = 0;
        int tap;

        for (tap = 0; tap < filterSize; tap++)
            acc += ((int)av_bswap16(src[first + tap]))*filter[filterSize*x + tap];

        dst[x] = FFMIN(acc>>shift, (1<<15)-1); // the cubic equation does overflow ...
    }
}
2047
//FIXME all pal and rgb srcFormats could do this conversion as well
//FIXME all scalers more complex than bilinear could do half of this transform
/**
 * In-place range conversion of 15-bit chroma samples: each sample is clipped
 * from above to 30775 and then mapped with (x*4663 - 9289992) >> 12.
 *
 * @param dstU,dstV chroma planes, modified in place
 * @param width     number of samples per plane
 */
static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int x = 0;

    while (x < width) {
        const int u = FFMIN(dstU[x],30775);
        dstU[x] = (u*4663 - 9289992)>>12; //-264
        {
            const int v = FFMIN(dstV[x],30775);
            dstV[x] = (v*4663 - 9289992)>>12; //-264
        }
        x++;
    }
}
/**
 * In-place range conversion of 15-bit chroma samples (inverse direction of
 * chrRangeToJpeg_c): each sample is mapped with (x*1799 + 4081085) >> 11.
 *
 * @param dstU,dstV chroma planes, modified in place
 * @param width     number of samples per plane
 */
static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int x = 0;

    while (x < width) {
        const int u = dstU[x];
        dstU[x] = (u*1799 + 4081085)>>11; //1469
        {
            const int v = dstV[x];
            dstV[x] = (v*1799 + 4081085)>>11; //1469
        }
        x++;
    }
}
/**
 * In-place range conversion of 15-bit luma samples: each sample is clipped
 * from above to 30189 and then mapped with (x*19077 - 39057361) >> 14.
 *
 * @param dst   luma samples, modified in place
 * @param width number of samples
 */
static void lumRangeToJpeg_c(int16_t *dst, int width)
{
    int x = 0;

    while (x < width) {
        const int clipped = FFMIN(dst[x],30189);
        dst[x] = (clipped*19077 - 39057361)>>14;
        x++;
    }
}
/**
 * In-place range conversion of 15-bit luma samples (inverse direction of
 * lumRangeToJpeg_c): each sample is mapped with (x*14071 + 33561947) >> 14.
 *
 * @param dst   luma samples, modified in place
 * @param width number of samples
 */
static void lumRangeFromJpeg_c(int16_t *dst, int width)
{
    int x = 0;

    while (x < width) {
        const int y = dst[x];
        dst[x] = (y*14071 + 33561947)>>14;
        x++;
    }
}
2078
/**
 * In-place range conversion of chroma samples held as int32_t (the 16-bit
 * pointers are reinterpreted).  Each sample is clipped from above to
 * 30775 << 4 and mapped with (x*4663 - (9289992 << 4)) >> 12.
 *
 * At the clip limit the product (30775 << 4) * 4663 = 2296061200 does not
 * fit in a signed 32-bit int, which is undefined behaviour.  The multiply
 * and subtract are therefore done in unsigned arithmetic (well-defined
 * wrap-around); after the subtraction the value is back in int range for all
 * clipped inputs, so converting to int before the shift yields the intended
 * result.
 *
 * @param _dstU,_dstV chroma planes, accessed as int32_t, modified in place
 * @param width       number of samples per plane
 */
static void chrRangeToJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
    int i;
    int32_t *dstU = (int32_t *) _dstU;
    int32_t *dstV = (int32_t *) _dstV;
    for (i = 0; i < width; i++) {
        /* clip from above, same as FFMIN(x, 30775<<4) */
        const int32_t u = dstU[i] < (30775<<4) ? dstU[i] : (30775<<4);
        const int32_t v = dstV[i] < (30775<<4) ? dstV[i] : (30775<<4);

        dstU[i] = (int)(u*4663U - (9289992U<<4))>>12; //-264
        dstV[i] = (int)(v*4663U - (9289992U<<4))>>12; //-264
    }
}
/**
 * In-place range conversion of chroma samples held as int32_t (the 16-bit
 * pointers are reinterpreted): each sample is mapped with
 * (x*1799 + (4081085 << 4)) >> 11.
 *
 * @param _dstU,_dstV chroma planes, accessed as int32_t, modified in place
 * @param width       number of samples per plane
 */
static void chrRangeFromJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
    int32_t *const planeU = (int32_t *) _dstU;
    int32_t *const planeV = (int32_t *) _dstV;
    int x = 0;

    while (x < width) {
        planeU[x] = (planeU[x]*1799 + (4081085<<4))>>11; //1469
        planeV[x] = (planeV[x]*1799 + (4081085<<4))>>11; //1469
        x++;
    }
}
/**
 * In-place range conversion of luma samples held as int32_t (the 16-bit
 * pointer is reinterpreted).  Each sample is clipped from above to
 * 30189 << 4 and mapped with (x*4769 - (39057361 << 2)) >> 12.
 *
 * At the clip limit the product (30189 << 4) * 4769 = 2303541456 does not
 * fit in a signed 32-bit int, which is undefined behaviour.  The multiply
 * and subtract are therefore done in unsigned arithmetic (well-defined
 * wrap-around); after the subtraction the value is back in int range for all
 * clipped inputs, so converting to int before the shift yields the intended
 * result.
 *
 * @param _dst  luma samples, accessed as int32_t, modified in place
 * @param width number of samples
 */
static void lumRangeToJpeg16_c(int16_t *_dst, int width)
{
    int i;
    int32_t *dst = (int32_t *) _dst;
    for (i = 0; i < width; i++) {
        /* clip from above, same as FFMIN(x, 30189<<4) */
        const int32_t y = dst[i] < (30189<<4) ? dst[i] : (30189<<4);

        dst[i] = (int)(y*4769U - (39057361U<<2))>>12;
    }
}
/**
 * In-place range conversion of luma samples held as int32_t (the 16-bit
 * pointer is reinterpreted): each sample is mapped with
 * (x*(14071/4) + (33561947 << 4)/4) >> 12.
 *
 * @param _dst  luma samples, accessed as int32_t, modified in place
 * @param width number of samples
 */
static void lumRangeFromJpeg16_c(int16_t *_dst, int width)
{
    int32_t *const plane = (int32_t *) _dst;
    int x = 0;

    while (x < width) {
        plane[x] = (plane[x]*(14071/4) + (33561947<<4)/4)>>12;
        x++;
    }
}
2113
2114 static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
2115                            const uint8_t *src, int srcW, int xInc)
2116 {
2117     int i;
2118     unsigned int xpos=0;
2119     for (i=0;i<dstWidth;i++) {
2120         register unsigned int xx=xpos>>16;
2121         register unsigned int xalpha=(xpos&0xFFFF)>>9;
2122         dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2123         xpos+=xInc;
2124     }
2125     for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
2126         dst[i] = src[srcW-1]*128;
2127 }
2128
/**
 * Expand 8-bit samples to 16 bits by writing each source byte into both
 * bytes of the destination sample.  Samples are processed from the last to
 * the first.
 * NOTE(review): the reverse order presumably permits src and _dst to start
 * at the same address -- confirm against the callers.
 *
 * @param _dst output, `len` 16-bit samples (written bytewise)
 * @param src  input, `len` bytes
 * @param len  number of samples
 */
static void scale8To16Rv_c(uint16_t *_dst, const uint8_t *src, int len)
{
    uint8_t *bytes = (uint8_t *) _dst;
    int idx = len;

    while (--idx >= 0) {
        const uint8_t v = src[idx];

        bytes[idx * 2 + 1] = v;
        bytes[idx * 2]     = v;
    }
}
2137
/**
 * Reduce 32-bit samples to int16_t by shifting each one right by 4.
 * Samples are processed from the first to the last.
 *
 * @param dst output, `len` int16_t samples
 * @param src input, `len` int32_t samples
 * @param len number of samples
 */
static void scale19To15Fw_c(int16_t *dst, const int32_t *src, int len)
{
    int idx = 0;

    while (idx < len) {
        dst[idx] = src[idx] >> 4;
        idx++;
    }
}
2145
2146 // *** horizontal scale Y line to temp buffer
2147 static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth,
2148                                      const uint8_t *src, int srcW, int xInc,
2149                                      const int16_t *hLumFilter,
2150                                      const int16_t *hLumFilterPos, int hLumFilterSize,
2151                                      uint8_t *formatConvBuffer,
2152                                      uint32_t *pal, int isAlpha)
2153 {
2154     void (*toYV12)(uint8_t *, const uint8_t *, int, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
2155     void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
2156
2157     if (toYV12) {
2158         toYV12(formatConvBuffer, src, srcW, pal);
2159         src= formatConvBuffer;
2160     }
2161
2162     if (av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1 < 8 && c->scalingBpp == 16 && !isAnyRGB(c->srcFormat)) {
2163         c->scale8To16Rv((uint16_t *) formatConvBuffer, src, srcW);
2164         src = formatConvBuffer;
2165     }
2166
2167     if (c->hScale16) {
2168         int shift= isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8 ? 13 : av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2169         c->hScale16(dst, dstWidth, (const uint16_t*)src, srcW, xInc, hLumFilter, hLumFilterPos, hLumFilterSize, shift);
2170     } else if (!c->hyscale_fast) {
2171         c->hScale(c, dst, dstWidth, src, hLumFilter, hLumFilterPos, hLumFilterSize);
2172     } else { // fast bilinear upscale / crap downscale
2173         c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2174     }
2175
2176     if (convertRange)
2177         convertRange(dst, dstWidth);
2178
2179     if (av_pix_fmt_descriptors[c->dstFormat].comp[0].depth_minus1 < 15 && c->scalingBpp == 16) {
2180         c->scale19To15Fw(dst, (int32_t *) dst, dstWidth);
2181     }
2182 }
2183
2184 static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
2185                            int dstWidth, const uint8_t *src1,
2186                            const uint8_t *src2, int srcW, int xInc)
2187 {
2188     int i;
2189     unsigned int xpos=0;
2190     for (i=0;i<dstWidth;i++) {
2191         register unsigned int xx=xpos>>16;
2192         register unsigned int xalpha=(xpos&0xFFFF)>>9;
2193         dst1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2194         dst2[i]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2195         xpos+=xInc;
2196     }
2197     for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
2198         dst1[i] = src1[srcW-1]*128;
2199         dst2[i] = src2[srcW-1]*128;
2200     }
2201 }
2202
2203 static av_always_inline void hcscale(SwsContext *c, int16_t *dst1, int16_t *dst2, int dstWidth,
2204                                      const uint8_t *src1, const uint8_t *src2,
2205                                      int srcW, int xInc, const int16_t *hChrFilter,
2206                                      const int16_t *hChrFilterPos, int hChrFilterSize,
2207                                      uint8_t *formatConvBuffer, uint32_t *pal)
2208 {
2209     if (c->chrToYV12) {
2210         uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW*2+78, 16);
2211         c->chrToYV12(formatConvBuffer, buf2, src1, src2, srcW, pal);
2212         src1= formatConvBuffer;
2213         src2= buf2;
2214     }
2215
2216     if (av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1 < 8 && c->scalingBpp == 16 && !isAnyRGB(c->srcFormat)) {
2217         uint8_t *buf2 = (formatConvBuffer + FFALIGN(srcW * 2+78, 16));
2218         c->scale8To16Rv((uint16_t *) formatConvBuffer, src1, srcW);
2219         c->scale8To16Rv((uint16_t *) buf2,             src2, srcW);
2220         src1 = formatConvBuffer;
2221         src2 = buf2;
2222     }
2223
2224     if (c->hScale16) {
2225         int shift= isAnyRGB(c->srcFormat) || c->srcFormat==PIX_FMT_PAL8 ? 13 : av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
2226         c->hScale16(dst1, dstWidth, (const uint16_t*)src1, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize, shift);
2227         c->hScale16(dst2, dstWidth, (const uint16_t*)src2, srcW, xInc, hChrFilter, hChrFilterPos, hChrFilterSize, shift);
2228     } else if (!c->hcscale_fast) {
2229         c->hScale(c, dst1, dstWidth, src1, hChrFilter, hChrFilterPos, hChrFilterSize);
2230         c->hScale(c, dst2, dstWidth, src2, hChrFilter, hChrFilterPos, hChrFilterSize);
2231     } else { // fast bilinear upscale / crap downscale
2232         c->hcscale_fast(c, dst1, dst2, dstWidth, src1, src2, srcW, xInc);
2233     }
2234
2235     if (c->chrConvertRange)
2236         c->chrConvertRange(dst1, dst2, dstWidth);
2237
2238     if (av_pix_fmt_descriptors[c->dstFormat].comp[0].depth_minus1 < 15 && c->scalingBpp == 16) {
2239         c->scale19To15Fw(dst1, (int32_t *) dst1, dstWidth);
2240         c->scale19To15Fw(dst2, (int32_t *) dst2, dstWidth);
2241     }
2242 }
2243
2244 static av_always_inline void
2245 find_c_packed_planar_out_funcs(SwsContext *c,
2246                                yuv2planar1_fn *yuv2yuv1,    yuv2planarX_fn *yuv2yuvX,
2247                                yuv2packed1_fn *yuv2packed1, yuv2packed2_fn *yuv2packed2,
2248                                yuv2packedX_fn *yuv2packedX)
2249 {
2250     enum PixelFormat dstFormat = c->dstFormat;
2251
2252     if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
2253         *yuv2yuvX     = yuv2nv12X_c;
2254     } else if (is16BPS(dstFormat)) {
2255         *yuv2yuvX     = isBE(dstFormat) ? yuv2yuvX16BE_c  : yuv2yuvX16LE_c;
2256     } else if (is9_OR_10BPS(dstFormat)) {
2257         if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) {
2258             *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX9BE_c :  yuv2yuvX9LE_c;
2259         } else {
2260             *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX10BE_c : yuv2yuvX10LE_c;
2261         }
2262     } else {
2263         *yuv2yuv1     = yuv2yuv1_c;
2264         *yuv2yuvX     = yuv2yuvX_c;
2265     }
2266     if(c->flags & SWS_FULL_CHR_H_INT) {
2267         switch (dstFormat) {
2268             case PIX_FMT_RGBA:
2269 #if CONFIG_SMALL
2270                 *yuv2packedX = yuv2rgba32_full_X_c;
2271 #else
2272 #if CONFIG_SWSCALE_ALPHA
2273                 if (c->alpPixBuf) {
2274                     *yuv2packedX = yuv2rgba32_full_X_c;
2275                 } else
2276 #endif /* CONFIG_SWSCALE_ALPHA */
2277                 {
2278                     *yuv2packedX = yuv2rgbx32_full_X_c;
2279                 }
2280 #endif /* !CONFIG_SMALL */
2281                 break;
2282             case PIX_FMT_ARGB:
2283 #if CONFIG_SMALL
2284                 *yuv2packedX = yuv2argb32_full_X_c;
2285 #else
2286 #if CONFIG_SWSCALE_ALPHA
2287                 if (c->alpPixBuf) {
2288                     *yuv2packedX = yuv2argb32_full_X_c;
2289                 } else
2290 #endif /* CONFIG_SWSCALE_ALPHA */
2291                 {
2292                     *yuv2packedX = yuv2xrgb32_full_X_c;
2293                 }
2294 #endif /* !CONFIG_SMALL */
2295                 break;
2296             case PIX_FMT_BGRA:
2297 #if CONFIG_SMALL
2298                 *yuv2packedX = yuv2bgra32_full_X_c;
2299 #else
2300 #if CONFIG_SWSCALE_ALPHA
2301                 if (c->alpPixBuf) {
2302                     *yuv2packedX = yuv2bgra32_full_X_c;
2303                 } else
2304 #endif /* CONFIG_SWSCALE_ALPHA */
2305                 {
2306                     *yuv2packedX = yuv2bgrx32_full_X_c;
2307                 }
2308 #endif /* !CONFIG_SMALL */
2309                 break;
2310             case PIX_FMT_ABGR:
2311 #if CONFIG_SMALL
2312                 *yuv2packedX = yuv2abgr32_full_X_c;
2313 #else
2314 #if CONFIG_SWSCALE_ALPHA
2315                 if (c->alpPixBuf) {
2316                     *yuv2packedX = yuv2abgr32_full_X_c;
2317                 } else
2318 #endif /* CONFIG_SWSCALE_ALPHA */
2319                 {
2320                     *yuv2packedX = yuv2xbgr32_full_X_c;
2321                 }
2322 #endif /* !CONFIG_SMALL */
2323                 break;
2324             case PIX_FMT_RGB24:
2325             *yuv2packedX = yuv2rgb24_full_X_c;
2326             break;
2327         case PIX_FMT_BGR24:
2328             *yuv2packedX = yuv2bgr24_full_X_c;
2329             break;
2330         }
2331         if(!*yuv2packedX)
2332             goto YUV_PACKED;
2333     } else {
2334         YUV_PACKED:
2335         switch (dstFormat) {
2336         case PIX_FMT_GRAY16BE:
2337             *yuv2packed1 = yuv2gray16BE_1_c;
2338             *yuv2packed2 = yuv2gray16BE_2_c;
2339             *yuv2packedX = yuv2gray16BE_X_c;
2340             break;
2341         case PIX_FMT_GRAY16LE:
2342             *yuv2packed1 = yuv2gray16LE_1_c;
2343             *yuv2packed2 = yuv2gray16LE_2_c;
2344             *yuv2packedX = yuv2gray16LE_X_c;
2345             break;
2346         case PIX_FMT_MONOWHITE:
2347             *yuv2packed1 = yuv2monowhite_1_c;
2348             *yuv2packed2 = yuv2monowhite_2_c;
2349             *yuv2packedX = yuv2monowhite_X_c;
2350             break;
2351         case PIX_FMT_MONOBLACK:
2352             *yuv2packed1 = yuv2monoblack_1_c;
2353             *yuv2packed2 = yuv2monoblack_2_c;
2354             *yuv2packedX = yuv2monoblack_X_c;
2355             break;
2356         case PIX_FMT_YUYV422:
2357             *yuv2packed1 = yuv2yuyv422_1_c;
2358             *yuv2packed2 = yuv2yuyv422_2_c;
2359             *yuv2packedX = yuv2yuyv422_X_c;
2360             break;
2361         case PIX_FMT_UYVY422:
2362             *yuv2packed1 = yuv2uyvy422_1_c;
2363             *yuv2packed2 = yuv2uyvy422_2_c;
2364             *yuv2packedX = yuv2uyvy422_X_c;
2365             break;
2366         case PIX_FMT_RGB48LE:
2367             *yuv2packed1 = yuv2rgb48le_1_c;
2368             *yuv2packed2 = yuv2rgb48le_2_c;
2369             *yuv2packedX = yuv2rgb48le_X_c;
2370             break;
2371         case PIX_FMT_RGB48BE:
2372             *yuv2packed1 = yuv2rgb48be_1_c;
2373             *yuv2packed2 = yuv2rgb48be_2_c;
2374             *yuv2packedX = yuv2rgb48be_X_c;
2375             break;
2376         case PIX_FMT_BGR48LE:
2377             *yuv2packed1 = yuv2bgr48le_1_c;
2378             *yuv2packed2 = yuv2bgr48le_2_c;
2379             *yuv2packedX = yuv2bgr48le_X_c;
2380             break;
2381         case PIX_FMT_BGR48BE:
2382             *yuv2packed1 = yuv2bgr48be_1_c;
2383             *yuv2packed2 = yuv2bgr48be_2_c;
2384             *yuv2packedX = yuv2bgr48be_X_c;
2385             break;
2386         case PIX_FMT_RGB32:
2387         case PIX_FMT_BGR32:
2388 #if CONFIG_SMALL
2389             *yuv2packed1 = yuv2rgb32_1_c;
2390             *yuv2packed2 = yuv2rgb32_2_c;
2391             *yuv2packedX = yuv2rgb32_X_c;
2392 #else
2393 #if CONFIG_SWSCALE_ALPHA
2394                 if (c->alpPixBuf) {
2395                     *yuv2packed1 = yuv2rgba32_1_c;
2396                     *yuv2packed2 = yuv2rgba32_2_c;
2397                     *yuv2packedX = yuv2rgba32_X_c;
2398                 } else
2399 #endif /* CONFIG_SWSCALE_ALPHA */
2400                 {
2401                     *yuv2packed1 = yuv2rgbx32_1_c;
2402                     *yuv2packed2 = yuv2rgbx32_2_c;
2403                     *yuv2packedX = yuv2rgbx32_X_c;
2404                 }
2405 #endif /* !CONFIG_SMALL */
2406             break;
2407         case PIX_FMT_RGB32_1:
2408         case PIX_FMT_BGR32_1:
2409 #if CONFIG_SMALL
2410                 *yuv2packed1 = yuv2rgb32_1_1_c;
2411                 *yuv2packed2 = yuv2rgb32_1_2_c;
2412                 *yuv2packedX = yuv2rgb32_1_X_c;
2413 #else
2414 #if CONFIG_SWSCALE_ALPHA
2415                 if (c->alpPixBuf) {
2416                     *yuv2packed1 = yuv2rgba32_1_1_c;
2417                     *yuv2packed2 = yuv2rgba32_1_2_c;
2418                     *yuv2packedX = yuv2rgba32_1_X_c;
2419                 } else
2420 #endif /* CONFIG_SWSCALE_ALPHA */
2421                 {
2422                     *yuv2packed1 = yuv2rgbx32_1_1_c;
2423                     *yuv2packed2 = yuv2rgbx32_1_2_c;
2424                     *yuv2packedX = yuv2rgbx32_1_X_c;
2425                 }
2426 #endif /* !CONFIG_SMALL */
2427                 break;
2428         case PIX_FMT_RGB24:
2429             *yuv2packed1 = yuv2rgb24_1_c;
2430             *yuv2packed2 = yuv2rgb24_2_c;
2431             *yuv2packedX = yuv2rgb24_X_c;
2432             break;
2433         case PIX_FMT_BGR24:
2434             *yuv2packed1 = yuv2bgr24_1_c;
2435             *yuv2packed2 = yuv2bgr24_2_c;
2436             *yuv2packedX = yuv2bgr24_X_c;
2437             break;
2438         case PIX_FMT_RGB565LE:
2439         case PIX_FMT_RGB565BE:
2440         case PIX_FMT_BGR565LE:
2441         case PIX_FMT_BGR565BE:
2442             *yuv2packed1 = yuv2rgb16_1_c;
2443             *yuv2packed2 = yuv2rgb16_2_c;
2444             *yuv2packedX = yuv2rgb16_X_c;
2445             break;
2446         case PIX_FMT_RGB555LE:
2447         case PIX_FMT_RGB555BE:
2448         case PIX_FMT_BGR555LE:
2449         case PIX_FMT_BGR555BE:
2450             *yuv2packed1 = yuv2rgb15_1_c;
2451             *yuv2packed2 = yuv2rgb15_2_c;
2452             *yuv2packedX = yuv2rgb15_X_c;
2453             break;
2454         case PIX_FMT_RGB444LE:
2455         case PIX_FMT_RGB444BE:
2456         case PIX_FMT_BGR444LE:
2457         case PIX_FMT_BGR444BE:
2458             *yuv2packed1 = yuv2rgb12_1_c;
2459             *yuv2packed2 = yuv2rgb12_2_c;
2460             *yuv2packedX = yuv2rgb12_X_c;
2461             break;
2462         case PIX_FMT_RGB8:
2463         case PIX_FMT_BGR8:
2464             *yuv2packed1 = yuv2rgb8_1_c;
2465             *yuv2packed2 = yuv2rgb8_2_c;
2466             *yuv2packedX = yuv2rgb8_X_c;
2467             break;
2468         case PIX_FMT_RGB4:
2469         case PIX_FMT_BGR4:
2470             *yuv2packed1 = yuv2rgb4_1_c;
2471             *yuv2packed2 = yuv2rgb4_2_c;
2472             *yuv2packedX = yuv2rgb4_X_c;
2473             break;
2474         case PIX_FMT_RGB4_BYTE:
2475         case PIX_FMT_BGR4_BYTE:
2476             *yuv2packed1 = yuv2rgb4b_1_c;
2477             *yuv2packed2 = yuv2rgb4b_2_c;
2478             *yuv2packedX = yuv2rgb4b_X_c;
2479             break;
2480         }
2481     }
2482 }
2483
/* Set to 1 to make DEBUG_BUFFERS() emit its messages through av_log(). */
#define DEBUG_SWSCALE_BUFFERS 0
/*
 * printf-style trace helper; expects a variable named `c` to be in scope at
 * the call site (it is passed as the av_log() context).
 * Wrapped in do { } while (0) so that the expansion is a single statement and
 * cannot capture an `else` that follows the call site.
 */
#define DEBUG_BUFFERS(...)                              \
    do {                                                \
        if (DEBUG_SWSCALE_BUFFERS)                      \
            av_log(c, AV_LOG_DEBUG, __VA_ARGS__);       \
    } while (0)
2486
/**
 * Scale one slice of the source picture and write every destination line
 * that can be completed with the source lines seen so far.
 *
 * Source lines are first scaled horizontally into the luma/chroma (and
 * optionally alpha) line buffers of the context; once all lines needed by the
 * vertical filter of a destination line are buffered, that line is produced
 * through the yuv2* output functions. Progress (dstY, buffer indices, last
 * buffered lines) is stored back into the context so that the next slice
 * continues where this one stopped.
 *
 * Note that the src[] and srcStride[] arrays passed in are modified: for
 * packed source formats all four entries are aliased to entry 0, and the
 * chroma strides are shifted left by c->vChrDrop.
 *
 * @param c         scaler context
 * @param src       source plane pointers for this slice
 * @param srcStride source strides, one per plane
 * @param srcSliceY index of the first source line contained in the slice
 * @param srcSliceH number of source lines in the slice
 * @param dst       destination plane pointers
 * @param dstStride destination strides, one per plane
 * @return number of destination lines advanced by this call
 *         (dstY at exit minus dstY at entry)
 */
static int swScale(SwsContext *c, const uint8_t* src[],
                   int srcStride[], int srcSliceY,
                   int srcSliceH, uint8_t* dst[], int dstStride[])
{
    /* load a few things into local vars to make the code more readable? and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const enum PixelFormat dstFormat= c->dstFormat;
    const int flags= c->flags;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrUPixBuf= c->chrUPixBuf;
    int16_t **chrVPixBuf= c->chrVPixBuf;
    int16_t **alpPixBuf= c->alpPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    /* slice position/height expressed in chroma lines; the negate-shift-negate
       form rounds the height up instead of down (assuming an arithmetic
       right shift) */
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
    int lastDstY; // value of dstY when this call started producing lines
    uint32_t *pal=c->pal_yuv;

    /* per-line dither tables are only selected when isNBPS() or is16BPS()
       holds for the source format; otherwise ff_sws_pb_64 is used below */
    int should_dither= isNBPS(c->srcFormat) || is16BPS(c->srcFormat);
    /* local copies of the output functions; they may be replaced by the C
       versions for the last destination lines (see the dstH-2 check below) */
    yuv2planar1_fn yuv2yuv1 = c->yuv2yuv1;
    yuv2planarX_fn yuv2yuvX = c->yuv2yuvX;
    yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
    yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
    yuv2packedX_fn yuv2packedX = c->yuv2packedX;

    /* vars which will change and which we need to store back in the context */
    int dstY= c->dstY;
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;

    /* packed input: every "plane" is read from the single packed plane 0 */
    if (isPacked(c->srcFormat)) {
        src[0]=
        src[1]=
        src[2]=
        src[3]= src[0];
        srcStride[0]=
        srcStride[1]=
        srcStride[2]=
        srcStride[3]= srcStride[0];
    }
    /* scale the chroma strides by 2^vChrDrop (this writes to the caller's
       srcStride array) */
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;

    DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
                  src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
                  dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
    DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
                   srcSliceY,    srcSliceH,    dstY,    dstH);
    DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
                   vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);

    /* warn (once per process, and only with SWS_PRINT_INFO) if any
       destination stride is not a multiple of 8 */
    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
        static int warnedAlready=0; //FIXME move this into the context perhaps
        if (flags & SWS_PRINT_INFO && !warnedAlready) {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   "         ->cannot do aligned memory accesses anymore\n");
            warnedAlready=1;
        }
    }

    /* Note the user might start scaling the picture in the middle so this
       will not get executed. This is not really intended but works
       currently, so people might do it. */
    if (srcSliceY ==0) {
        /* first slice of a picture: restart the line buffers and output position */
        lumBufIndex=-1;
        chrBufIndex=-1;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    if (!should_dither) {
        c->chrDither8 = c->lumDither8 = ff_sws_pb_64;
    }
    lastDstY= dstY;

    for (;dstY < dstH; dstY++) {
        const int chrDstY= dstY>>c->chrDstVSubSample;
        /* output pointers for this destination line; the alpha pointer is
           NULL unless alpha support is compiled in and an alpha buffer exists */
        uint8_t *dest[4] = {
            dst[0] + dstStride[0] * dstY,
            dst[1] + dstStride[1] * chrDstY,
            dst[2] + dstStride[2] * chrDstY,
            (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3] + dstStride[3] * dstY : NULL,
        };

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        /* same lookup, but for the last destination line that shares chrDstY
           with dstY (clamped to dstH-1) */
        const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
        int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
        int enough_lines;

        //handle holes (FAST_BILINEAR & weird filters)
        /* if the first needed line lies beyond what is buffered, pretend the
           skipped lines were already consumed */
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);

        DEBUG_BUFFERS("dstY: %d\n", dstY);
        DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
                         firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
        DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
                         firstChrSrcY,    lastChrSrcY,    lastInChrBuf);

        // Do we have enough lines in this slice to output the dstY line
        enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);

        if (!enough_lines) {
            /* not enough input yet: buffer everything this slice offers and
               leave the loop further down */
            lastLumSrcY = srcSliceY + srcSliceH - 1;
            lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
            DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
                                            lastLumSrcY, lastChrSrcY);
        }

        //Do horizontal scaling
        /* luma (and alpha, read from plane 3) lines up to lastLumSrcY */
        while(lastInLumBuf < lastLumSrcY) {
            const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
            const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
            lumBufIndex++;
            assert(lumBufIndex < 2*vLumBufSize);
            assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
            assert(lastInLumBuf + 1 - srcSliceY >= 0);
            hyscale(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
                    hLumFilter, hLumFilterPos, hLumFilterSize,
                    formatConvBuffer,
                    pal, 0);
            if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
                hyscale(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW,
                        lumXInc, hLumFilter, hLumFilterPos, hLumFilterSize,
                        formatConvBuffer,
                        pal, 1);
            lastInLumBuf++;
            DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
                               lumBufIndex,    lastInLumBuf);
        }
        /* chroma lines up to lastChrSrcY; the buffer index advances even when
           needs_hcscale is unset and no chroma is actually scaled */
        while(lastInChrBuf < lastChrSrcY) {
            const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
            const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
            chrBufIndex++;
            assert(chrBufIndex < 2*vChrBufSize);
            assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
            assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
            //FIXME replace parameters through context struct (some at least)

            if (c->needs_hcscale)
                hcscale(c, chrUPixBuf[chrBufIndex], chrVPixBuf[chrBufIndex],
                          chrDstW, src1, src2, chrSrcW, chrXInc,
                          hChrFilter, hChrFilterPos, hChrFilterSize,
                          formatConvBuffer, pal);
            lastInChrBuf++;
            DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
                               chrBufIndex,    lastInChrBuf);
        }
        //wrap buf index around to stay inside the ring buffer
        if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
        if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
        if (!enough_lines)
            break; //we can't output a dstY line so let's try with the next slice

#if HAVE_MMX
        updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
#endif
        if (should_dither) {
            c->chrDither8 = dither_8x8_128[chrDstY & 7];
            c->lumDither8 = dither_8x8_128[dstY & 7];
        }
        if (dstY >= dstH-2) {
            // hmm looks like we can't use MMX here without overwriting this array's tail
            /* for the last two destination lines, re-select the C output
               functions into the local function pointers */
            find_c_packed_planar_out_funcs(c, &yuv2yuv1, &yuv2yuvX,
                                           &yuv2packed1, &yuv2packed2,
                                           &yuv2packedX);
        }

        {
            /* pointers to the first buffered line the vertical filter needs.
               NOTE(review): the "+ v*BufSize" offset together with the
               2*v*BufSize asserts suggests the pointer tables hold every line
               twice so that a filter window never wraps - confirm against the
               buffer allocation code */
            const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **chrVSrcPtr= (const int16_t **) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;

            if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                /* no chroma output on lines that are not chroma-aligned, nor
                   for gray destinations */
                if ((dstY&chrSkipMask) || isGray(dstFormat))
                    dest[1] = dest[2] = NULL; //FIXME split functions in lumi / chromi
                if (c->yuv2yuv1 && vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
                    const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
                    yuv2yuv1(c, lumSrcPtr[0], chrUSrcPtr[0], chrVSrcPtr[0], alpBuf,
                             dest, dstW, chrDstW);
                } else { //General YV12
                    yuv2yuvX(c, vLumFilter + dstY * vLumFilterSize,
                             lumSrcPtr, vLumFilterSize,
                             vChrFilter + chrDstY * vChrFilterSize,
                             chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
                             alpSrcPtr, dest, dstW, chrDstW);
                }
            } else {
                assert(lumSrcPtr  + vLumFilterSize - 1 < lumPixBuf  + vLumBufSize*2);
                assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
                if (c->yuv2packed1 && vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
                    int chrAlpha = vChrFilter[2 * dstY + 1];
                    yuv2packed1(c, *lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
                                alpPixBuf ? *alpSrcPtr : NULL,
                                dest[0], dstW, chrAlpha, dstY);
                } else if (c->yuv2packed2 && vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
                    int lumAlpha = vLumFilter[2 * dstY + 1];
                    int chrAlpha = vChrFilter[2 * dstY + 1];
                    /* the first filter coefficient, duplicated into both
                       16-bit halves, is stored in the MMX filter tables
                       (presumably for SIMD implementations of yuv2packed2) */
                    lumMmxFilter[2] =
                    lumMmxFilter[3] = vLumFilter[2 * dstY   ] * 0x10001;
                    chrMmxFilter[2] =
                    chrMmxFilter[3] = vChrFilter[2 * chrDstY] * 0x10001;
                    yuv2packed2(c, lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
                                alpPixBuf ? alpSrcPtr : NULL,
                                dest[0], dstW, lumAlpha, chrAlpha, dstY);
                } else { //general RGB
                    yuv2packedX(c, vLumFilter + dstY * vLumFilterSize,
                                lumSrcPtr, vLumFilterSize,
                                vChrFilter + dstY * vChrFilterSize,
                                chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest[0], dstW, dstY);
                }
            }
        }
    }

    /* YUVA420P output without an alpha buffer: fill the alpha rows written
       by this call with 255 */
    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);

#if HAVE_MMX2
    if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
        __asm__ volatile("sfence":::"memory");
#endif
    emms_c();

    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    return dstY - lastDstY;
}
2754
2755 static av_cold void sws_init_swScale_c(SwsContext *c)
2756 {
2757     enum PixelFormat srcFormat = c->srcFormat;
2758
2759     find_c_packed_planar_out_funcs(c, &c->yuv2yuv1, &c->yuv2yuvX,
2760                                    &c->yuv2packed1, &c->yuv2packed2,
2761                                    &c->yuv2packedX);
2762
2763     c->chrToYV12 = NULL;
2764     switch(srcFormat) {
2765         case PIX_FMT_YUYV422  : c->chrToYV12 = yuy2ToUV_c; break;
2766         case PIX_FMT_UYVY422  : c->chrToYV12 = uyvyToUV_c; break;
2767         case PIX_FMT_NV12     : c->chrToYV12 = nv12ToUV_c; break;
2768         case PIX_FMT_NV21     : c->chrToYV12 = nv21ToUV_c; break;
2769         case PIX_FMT_RGB8     :
2770         case PIX_FMT_BGR8     :
2771         case PIX_FMT_PAL8     :
2772         case PIX_FMT_BGR4_BYTE:
2773         case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
2774         case PIX_FMT_YUV444P9BE:
2775         case PIX_FMT_YUV420P9BE:
2776         case PIX_FMT_YUV444P10BE:
2777         case PIX_FMT_YUV422P10BE:
2778         case PIX_FMT_YUV420P10BE: c->hScale16= HAVE_BIGENDIAN ? hScale16N_c : hScale16NX_c; break;
2779         case PIX_FMT_YUV444P9LE:
2780         case PIX_FMT_YUV420P9LE:
2781         case PIX_FMT_YUV422P10LE:
2782         case PIX_FMT_YUV420P10LE:
2783         case PIX_FMT_YUV444P10LE: c->hScale16= HAVE_BIGENDIAN ? hScale16NX_c : hScale16N_c; break;
2784 #if HAVE_BIGENDIAN
2785         case PIX_FMT_YUV420P16LE:
2786         case PIX_FMT_YUV422P16LE:
2787         case PIX_FMT_YUV444P16LE: c->chrToYV12 = bswap16UV_c; break;
2788 #else
2789         case PIX_FMT_YUV420P16BE:
2790         case PIX_FMT_YUV422P16BE:
2791         case PIX_FMT_YUV444P16BE: c->chrToYV12 = bswap16UV_c; break;
2792 #endif
2793     }
2794     if (c->chrSrcHSubSample) {
2795         switch(srcFormat) {
2796         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_half_c; break;
2797         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_half_c; break;
2798         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_half_c; break;
2799         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_half_c; break;
2800         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_half_c;   break;
2801         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_half_c;  break;
2802         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_half_c;   break;
2803         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_half_c; break;
2804         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_half_c; break;
2805         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_half_c; break;
2806         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_half_c; break;
2807         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_half_c;   break;
2808         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_half_c;  break;
2809         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_half_c;   break;
2810         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_half_c; break;
2811         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_half_c; break;
2812         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_half_c; break;
2813         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_half_c; break;
2814         }
2815     } else {
2816         switch(srcFormat) {
2817         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_c; break;
2818         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_c; break;
2819         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_c; break;
2820         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_c; break;
2821         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_c;   break;
2822         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_c;  break;
2823         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_c;   break;
2824         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_c; break;
2825         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_c; break;
2826         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_c; break;
2827         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_c; break;
2828         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_c;   break;
2829         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_c;  break;
2830         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_c;   break;
2831         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_c; break;
2832         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_c; break;
2833         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_c; break;
2834         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_c; break;
2835         }
2836     }
2837
2838     c->lumToYV12 = NULL;
2839     c->alpToYV12 = NULL;
2840     switch (srcFormat) {
2841 #if HAVE_BIGENDIAN
2842     case PIX_FMT_YUV420P16LE:
2843     case PIX_FMT_YUV422P16LE:
2844     case PIX_FMT_YUV444P16LE:
2845     case PIX_FMT_GRAY16LE: c->lumToYV12 = bswap16Y_c; break;
2846 #else
2847     case PIX_FMT_YUV420P16BE:
2848     case PIX_FMT_YUV422P16BE:
2849     case PIX_FMT_YUV444P16BE:
2850     case PIX_FMT_GRAY16BE: c->lumToYV12 = bswap16Y_c; break;
2851 #endif
2852     case PIX_FMT_YUYV422  :
2853     case PIX_FMT_Y400A    : c->lumToYV12 = yuy2ToY_c; break;
2854     case PIX_FMT_UYVY422  : c->lumToYV12 = uyvyToY_c;    break;
2855     case PIX_FMT_BGR24    : c->lumToYV12 = bgr24ToY_c;   break;
2856     case PIX_FMT_BGR565LE : c->lumToYV12 = bgr16leToY_c; break;
2857     case PIX_FMT_BGR565BE : c->lumToYV12 = bgr16beToY_c; break;
2858     case PIX_FMT_BGR555LE : c->lumToYV12 = bgr15leToY_c; break;
2859     case PIX_FMT_BGR555BE : c->lumToYV12 = bgr15beToY_c; break;
2860     case PIX_FMT_RGB24    : c->lumToYV12 = rgb24ToY_c;   break;
2861     case PIX_FMT_RGB565LE : c->lumToYV12 = rgb16leToY_c; break;
2862     case PIX_FMT_RGB565BE : c->lumToYV12 = rgb16beToY_c; break;
2863     case PIX_FMT_RGB555LE : c->lumToYV12 = rgb15leToY_c; break;
2864     case PIX_FMT_RGB555BE : c->lumToYV12 = rgb15beToY_c; break;
2865     case PIX_FMT_RGB8     :
2866     case PIX_FMT_BGR8     :
2867     case PIX_FMT_PAL8     :
2868     case PIX_FMT_BGR4_BYTE:
2869     case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY_c; break;
2870     case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y_c; break;
2871     case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y_c; break;
2872     case PIX_FMT_RGB32  : c->lumToYV12 = bgr32ToY_c;  break;
2873     case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY_c; break;
2874     case PIX_FMT_BGR32  : c->lumToYV12 = rgb32ToY_c;  break;
2875     case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY_c; break;
2876     case PIX_FMT_RGB48BE: c->lumToYV12 = rgb48BEToY_c; break;
2877     case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48LEToY_c; break;
2878     case PIX_FMT_BGR48BE: c->lumToYV12 = bgr48BEToY_c; break;
2879     case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48LEToY_c; break;
2880     }
2881     if (c->alpPixBuf) {
2882         switch (srcFormat) {
2883         case PIX_FMT_BGRA:
2884         case PIX_FMT_RGBA:  c->alpToYV12 = rgbaToA_c; break;
2885         case PIX_FMT_ABGR:
2886         case PIX_FMT_ARGB:  c->alpToYV12 = abgrToA_c; break;
2887         case PIX_FMT_Y400A: c->alpToYV12 = uyvyToY_c; break;
2888         case PIX_FMT_PAL8 : c->alpToYV12 = palToA_c; break;
2889         }
2890     }
2891
2892     if((isAnyRGB(c->srcFormat) && av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1<15)
2893        || c->srcFormat == PIX_FMT_PAL8)
2894         c->hScale16= hScale16N_c;
2895
2896     if (c->scalingBpp == 8) {
2897     c->hScale       = hScale_c;
2898     if (c->flags & SWS_FAST_BILINEAR) {
2899         c->hyscale_fast = hyscale_fast_c;
2900         c->hcscale_fast = hcscale_fast_c;
2901     }
2902
2903     if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
2904         if (c->srcRange) {
2905             c->lumConvertRange = lumRangeFromJpeg_c;
2906             c->chrConvertRange = chrRangeFromJpeg_c;
2907         } else {
2908             c->lumConvertRange = lumRangeToJpeg_c;
2909             c->chrConvertRange = chrRangeToJpeg_c;
2910         }
2911     }
2912     } else {
2913         if(c->hScale16 == hScale16NX_c && !isAnyRGB(c->srcFormat)){
2914             c->chrToYV12 = bswap16UV_c;
2915             c->lumToYV12 = bswap16Y_c;
2916         }
2917         c->hScale16 = NULL;
2918         c->hScale = hScale16_c;
2919         c->scale19To15Fw = scale19To15Fw_c;
2920         c->scale8To16Rv  = scale8To16Rv_c;
2921
2922         if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
2923             if (c->srcRange) {
2924                 c->lumConvertRange = lumRangeFromJpeg16_c;
2925                 c->chrConvertRange = chrRangeFromJpeg16_c;
2926             } else {
2927                 c->lumConvertRange = lumRangeToJpeg16_c;
2928                 c->chrConvertRange = chrRangeToJpeg16_c;
2929             }
2930         }
2931     }
2932
2933     if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
2934           srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
2935         c->needs_hcscale = 1;
2936 }
2937
2938 SwsFunc ff_getSwsFunc(SwsContext *c)
2939 {
2940     sws_init_swScale_c(c);
2941
2942     if (HAVE_MMX)
2943         ff_sws_init_swScale_mmx(c);
2944     if (HAVE_ALTIVEC)
2945         ff_sws_init_swScale_altivec(c);
2946
2947     return swScale;
2948 }