libswscale/swscale.c

   1 /*
   2  * Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This file is part of Libav.
   5  *
   6  * Libav is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * Libav is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with Libav; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 /*
  22   supported input formats: YV12, I420/IYUV, YUY2, UYVY, BGR32, BGR32_1, BGR24, BGR16, BGR15, RGB32, RGB32_1, RGB24, Y8/Y800, YVU9/IF09, PAL8
  23   supported output formats: YV12, I420/IYUV, YUY2, UYVY, {BGR,RGB}{1,4,8,15,16,24,32}, Y8/Y800, YVU9/IF09
  24   {BGR,RGB}{1,4,8,15,16} support dithering
  25
  26   unscaled special converters (YV12=I420=IYUV, Y800=Y8)
  27   YV12 -> {BGR,RGB}{1,4,8,12,15,16,24,32}
  28   x -> x
  29   YUV9 -> YV12
  30   YUV9/YV12 -> Y800
  31   Y800 -> YUV9/YV12
  32   BGR24 -> BGR32 & RGB24 -> RGB32
  33   BGR32 -> BGR24 & RGB32 -> RGB24
  34   BGR15 -> BGR16
  35 */
  36
  37 /*
  38 tested special converters (most are tested actually, but I did not write it down ...)
  39  YV12 -> BGR12/BGR16
  40  YV12 -> YV12
  41  BGR15 -> BGR16
  42  BGR16 -> BGR16
  43  YVU9 -> YV12
  44
  45 untested special converters
  46   YV12/I420 -> BGR15/BGR24/BGR32 (it is the yuv2rgb stuff, so it should be OK)
  47   YV12/I420 -> YV12/I420
  48   YUY2/BGR15/BGR24/BGR32/RGB24/RGB32 -> same format
  49   BGR24 -> BGR32 & RGB24 -> RGB32
  50   BGR32 -> BGR24 & RGB32 -> RGB24
  51   BGR24 -> YV12
  52 */
  53
  54 #include <inttypes.h>
  55 #include <string.h>
  56 #include <math.h>
  57 #include <stdio.h>
  58 #include "config.h"
  59 #include <assert.h>
  60 #include "swscale.h"
  61 #include "swscale_internal.h"
  62 #include "rgb2rgb.h"
  63 #include "libavutil/intreadwrite.h"
  64 #include "libavutil/cpu.h"
  65 #include "libavutil/avutil.h"
  66 #include "libavutil/mathematics.h"
  67 #include "libavutil/bswap.h"
  68 #include "libavutil/pixdesc.h"
  69
  70 #define DITHER1XBPP
  71
  72 #define RGB2YUV_SHIFT 15
  73 #define BY ( (int)(0.114*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  74 #define BV (-(int)(0.081*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  75 #define BU ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  76 #define GY ( (int)(0.587*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  77 #define GV (-(int)(0.419*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  78 #define GU (-(int)(0.331*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  79 #define RY ( (int)(0.299*219/255*(1<<RGB2YUV_SHIFT)+0.5))
  80 #define RV ( (int)(0.500*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  81 #define RU (-(int)(0.169*224/255*(1<<RGB2YUV_SHIFT)+0.5))
  82
  83 /*
  84 NOTES
  85 Special versions: fast Y 1:1 scaling (no interpolation in y direction)
  86
  87 TODO
  88 more intelligent misalignment avoidance for the horizontal scaler
  89 write special vertical cubic upscale version
  90 optimize C code (YV12 / minmax)
  91 add support for packed pixel YUV input & output
  92 add support for Y8 output
  93 optimize BGR24 & BGR32
  94 add BGR4 output support
  95 write special BGR->BGR scaler
  96 */
  97
  98 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_4)[2][8]={
  99 {  1,   3,   1,   3,   1,   3,   1,   3, },
 100 {  2,   0,   2,   0,   2,   0,   2,   0, },
 101 };
 102
 103 DECLARE_ALIGNED(8, static const uint8_t, dither_2x2_8)[2][8]={
 104 {  6,   2,   6,   2,   6,   2,   6,   2, },
 105 {  0,   4,   0,   4,   0,   4,   0,   4, },
 106 };
 107
 108 DECLARE_ALIGNED(8, const uint8_t, dither_4x4_16)[4][8]={
 109 {  8,   4,  11,   7,   8,   4,  11,   7, },
 110 {  2,  14,   1,  13,   2,  14,   1,  13, },
 111 { 10,   6,   9,   5,  10,   6,   9,   5, },
 112 {  0,  12,   3,  15,   0,  12,   3,  15, },
 113 };
 114
 115 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_32)[8][8]={
 116 { 17,   9,  23,  15,  16,   8,  22,  14, },
 117 {  5,  29,   3,  27,   4,  28,   2,  26, },
 118 { 21,  13,  19,  11,  20,  12,  18,  10, },
 119 {  0,  24,   6,  30,   1,  25,   7,  31, },
 120 { 16,   8,  22,  14,  17,   9,  23,  15, },
 121 {  4,  28,   2,  26,   5,  29,   3,  27, },
 122 { 20,  12,  18,  10,  21,  13,  19,  11, },
 123 {  1,  25,   7,  31,   0,  24,   6,  30, },
 124 };
 125
 126 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_73)[8][8]={
 127 {  0,  55,  14,  68,   3,  58,  17,  72, },
 128 { 37,  18,  50,  32,  40,  22,  54,  35, },
 129 {  9,  64,   5,  59,  13,  67,   8,  63, },
 130 { 46,  27,  41,  23,  49,  31,  44,  26, },
 131 {  2,  57,  16,  71,   1,  56,  15,  70, },
 132 { 39,  21,  52,  34,  38,  19,  51,  33, },
 133 { 11,  66,   7,  62,  10,  65,   6,  60, },
 134 { 48,  30,  43,  25,  47,  29,  42,  24, },
 135 };
 136
 137 #if 1
 138 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 139 {117,  62, 158, 103, 113,  58, 155, 100, },
 140 { 34, 199,  21, 186,  31, 196,  17, 182, },
 141 {144,  89, 131,  76, 141,  86, 127,  72, },
 142 {  0, 165,  41, 206,  10, 175,  52, 217, },
 143 {110,  55, 151,  96, 120,  65, 162, 107, },
 144 { 28, 193,  14, 179,  38, 203,  24, 189, },
 145 {138,  83, 124,  69, 148,  93, 134,  79, },
 146 {  7, 172,  48, 213,   3, 168,  45, 210, },
 147 };
 148 #elif 1
 149 // tries to correct a gamma of 1.5
 150 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 151 {  0, 143,  18, 200,   2, 156,  25, 215, },
 152 { 78,  28, 125,  64,  89,  36, 138,  74, },
 153 { 10, 180,   3, 161,  16, 195,   8, 175, },
 154 {109,  51,  93,  38, 121,  60, 105,  47, },
 155 {  1, 152,  23, 210,   0, 147,  20, 205, },
 156 { 85,  33, 134,  71,  81,  30, 130,  67, },
 157 { 14, 190,   6, 171,  12, 185,   5, 166, },
 158 {117,  57, 101,  44, 113,  54,  97,  41, },
 159 };
 160 #elif 1
 161 // tries to correct a gamma of 2.0
 162 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 163 {  0, 124,   8, 193,   0, 140,  12, 213, },
 164 { 55,  14, 104,  42,  66,  19, 119,  52, },
 165 {  3, 168,   1, 145,   6, 187,   3, 162, },
 166 { 86,  31,  70,  21,  99,  39,  82,  28, },
 167 {  0, 134,  11, 206,   0, 129,   9, 200, },
 168 { 62,  17, 114,  48,  58,  16, 109,  45, },
 169 {  5, 181,   2, 157,   4, 175,   1, 151, },
 170 { 95,  36,  78,  26,  90,  34,  74,  24, },
 171 };
 172 #else
 173 // tries to correct a gamma of 2.5
 174 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_220)[8][8]={
 175 {  0, 107,   3, 187,   0, 125,   6, 212, },
 176 { 39,   7,  86,  28,  49,  11, 102,  36, },
 177 {  1, 158,   0, 131,   3, 180,   1, 151, },
 178 { 68,  19,  52,  12,  81,  25,  64,  17, },
 179 {  0, 119,   5, 203,   0, 113,   4, 195, },
 180 { 45,   9,  96,  33,  42,   8,  91,  30, },
 181 {  2, 172,   1, 144,   2, 165,   0, 137, },
 182 { 77,  23,  60,  15,  72,  21,  56,  14, },
 183 };
 184 #endif
 185 DECLARE_ALIGNED(8, const uint8_t, dither_8x8_128)[8][8] = {
 186 {  36, 68, 60, 92, 34, 66, 58, 90,},
 187 { 100,  4,124, 28, 98,  2,122, 26,},
 188 {  52, 84, 44, 76, 50, 82, 42, 74,},
 189 { 116, 20,108, 12,114, 18,106, 10,},
 190 {  32, 64, 56, 88, 38, 70, 62, 94,},
 191 {  96,  0,120, 24,102,  6,126, 30,},
 192 {  48, 80, 40, 72, 54, 86, 46, 78,},
 193 { 112, 16,104,  8,118, 22,110, 14,},
 194 };
 195 DECLARE_ALIGNED(8, const uint8_t, ff_sws_pb_64)[8] =
 196 {  64, 64, 64, 64, 64, 64, 64, 64 };
 197
 198 static av_always_inline void
 199 yuv2yuvX16_c_template(const int16_t *lumFilter, const int32_t **lumSrc,
 200                       int lumFilterSize, const int16_t *chrFilter,
 201                       const int32_t **chrUSrc, const int32_t **chrVSrc,
 202                       int chrFilterSize, const int32_t **alpSrc,
 203                       uint16_t *dest[4], int dstW, int chrDstW,
 204                       int big_endian, int output_bits)
 205 {
 206     //FIXME Optimize (just quickly written not optimized..)
 207     int i;
 208     uint16_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 209              *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 210     int shift = 15 + 16 - output_bits - 1;
 211
 212 #define output_pixel(pos, val) \
 213     if (big_endian) { \
 214         AV_WB16(pos, av_clip_uint16(val >> shift)); \
 215     } else { \
 216         AV_WL16(pos, av_clip_uint16(val >> shift)); \
 217     }
 218     for (i = 0; i < dstW; i++) {
 219         int val = 1 << (30-output_bits - 1);
 220         int j;
 221
 222         for (j = 0; j < lumFilterSize; j++)
 223             val += (lumSrc[j][i] * lumFilter[j]) >> 1;
 224
 225         output_pixel(&yDest[i], val);
 226     }
 227
 228     if (uDest) {
 229         for (i = 0; i < chrDstW; i++) {
 230             int u = 1 << (30-output_bits - 1);
 231             int v = 1 << (30-output_bits - 1);
 232             int j;
 233
 234             for (j = 0; j < chrFilterSize; j++) {
 235                 u += (chrUSrc[j][i] * chrFilter[j]) >> 1;
 236                 v += (chrVSrc[j][i] * chrFilter[j]) >> 1;
 237             }
 238
 239             output_pixel(&uDest[i], u);
 240             output_pixel(&vDest[i], v);
 241         }
 242     }
 243
 244     if (CONFIG_SWSCALE_ALPHA && aDest) {
 245         for (i = 0; i < dstW; i++) {
 246             int val = 1 << (30-output_bits - 1);
 247             int j;
 248
 249             for (j = 0; j < lumFilterSize; j++)
 250                 val += (alpSrc[j][i] * lumFilter[j]) >> 1;
 251
 252             output_pixel(&aDest[i], val);
 253         }
 254     }
 255 #undef output_pixel
 256 }
 257
 258 static av_always_inline void
 259 yuv2yuvX10_c_template(const int16_t *lumFilter, const int16_t **lumSrc,
 260                       int lumFilterSize, const int16_t *chrFilter,
 261                       const int16_t **chrUSrc, const int16_t **chrVSrc,
 262                       int chrFilterSize, const int16_t **alpSrc,
 263                       uint16_t *dest[4], int dstW, int chrDstW,
 264                       int big_endian, int output_bits)
 265 {
 266     //FIXME Optimize (just quickly written not optimized..)
 267     int i;
 268     uint16_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 269              *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 270     int shift = 11 + 16 - output_bits - 1;
 271
 272 #define output_pixel(pos, val) \
 273     if (big_endian) { \
 274         AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 275     } else { \
 276         AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
 277     }
 278     for (i = 0; i < dstW; i++) {
 279         int val = 1 << (26-output_bits - 1);
 280         int j;
 281
 282         for (j = 0; j < lumFilterSize; j++)
 283             val += (lumSrc[j][i] * lumFilter[j]) >> 1;
 284
 285         output_pixel(&yDest[i], val);
 286     }
 287
 288     if (uDest) {
 289         for (i = 0; i < chrDstW; i++) {
 290             int u = 1 << (26-output_bits - 1);
 291             int v = 1 << (26-output_bits - 1);
 292             int j;
 293
 294             for (j = 0; j < chrFilterSize; j++) {
 295                 u += (chrUSrc[j][i] * chrFilter[j]) >> 1;
 296                 v += (chrVSrc[j][i] * chrFilter[j]) >> 1;
 297             }
 298
 299             output_pixel(&uDest[i], u);
 300             output_pixel(&vDest[i], v);
 301         }
 302     }
 303
 304     if (CONFIG_SWSCALE_ALPHA && aDest) {
 305         for (i = 0; i < dstW; i++) {
 306             int val = 1 << (26-output_bits - 1);
 307             int j;
 308
 309             for (j = 0; j < lumFilterSize; j++)
 310                 val += (alpSrc[j][i] * lumFilter[j]) >> 1;
 311
 312             output_pixel(&aDest[i], val);
 313         }
 314     }
 315 #undef output_pixel
 316 }
 317
 318 #define yuv2NBPS(bits, BE_LE, is_be, yuv2yuvX_template_fn, typeX_t) \
 319 static void yuv2yuvX ## bits ## BE_LE ## _c(SwsContext *c, const int16_t *lumFilter, \
 320                               const int16_t **_lumSrc, int lumFilterSize, \
 321                               const int16_t *chrFilter, const int16_t **_chrUSrc, \
 322                               const int16_t **_chrVSrc, \
 323                               int chrFilterSize, const int16_t **_alpSrc, \
 324                               uint8_t *_dest[4], int dstW, int chrDstW) \
 325 { \
 326     const typeX_t **lumSrc  = (const typeX_t **) _lumSrc, \
 327                   **chrUSrc = (const typeX_t **) _chrUSrc, \
 328                   **chrVSrc = (const typeX_t **) _chrVSrc, \
 329                   **alpSrc  = (const typeX_t **) _alpSrc; \
 330     yuv2yuvX_template_fn(lumFilter, lumSrc, lumFilterSize, \
 331                          chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 332                          alpSrc, (uint16_t **) _dest, \
 333                          dstW, chrDstW, is_be, bits); \
 334 }
 335 yuv2NBPS( 9, BE, 1, yuv2yuvX10_c_template, int16_t);
 336 yuv2NBPS( 9, LE, 0, yuv2yuvX10_c_template, int16_t);
 337 yuv2NBPS(10, BE, 1, yuv2yuvX10_c_template, int16_t);
 338 yuv2NBPS(10, LE, 0, yuv2yuvX10_c_template, int16_t);
 339 yuv2NBPS(16, BE, 1, yuv2yuvX16_c_template, int32_t);
 340 yuv2NBPS(16, LE, 0, yuv2yuvX16_c_template, int32_t);
 341
 342 static void yuv2yuvX_c(SwsContext *c, const int16_t *lumFilter,
 343                        const int16_t **lumSrc, int lumFilterSize,
 344                        const int16_t *chrFilter, const int16_t **chrUSrc,
 345                        const int16_t **chrVSrc,
 346                        int chrFilterSize, const int16_t **alpSrc,
 347                        uint8_t *dest[4], int dstW, int chrDstW)
 348 {
 349     uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 350             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 351     int i;
 352     const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
 353
 354     //FIXME Optimize (just quickly written not optimized..)
 355     for (i=0; i<dstW; i++) {
 356         int val = lumDither[i & 7] << 12;
 357         int j;
 358         for (j=0; j<lumFilterSize; j++)
 359             val += lumSrc[j][i] * lumFilter[j];
 360
 361         yDest[i]= av_clip_uint8(val>>19);
 362     }
 363
 364     if (uDest)
 365         for (i=0; i<chrDstW; i++) {
 366             int u = chrDither[i & 7] << 12;
 367             int v = chrDither[(i + 3) & 7] << 12;
 368             int j;
 369             for (j=0; j<chrFilterSize; j++) {
 370                 u += chrUSrc[j][i] * chrFilter[j];
 371                 v += chrVSrc[j][i] * chrFilter[j];
 372             }
 373
 374             uDest[i]= av_clip_uint8(u>>19);
 375             vDest[i]= av_clip_uint8(v>>19);
 376         }
 377
 378     if (CONFIG_SWSCALE_ALPHA && aDest)
 379         for (i=0; i<dstW; i++) {
 380             int val = lumDither[i & 7] << 12;
 381             int j;
 382             for (j=0; j<lumFilterSize; j++)
 383                 val += alpSrc[j][i] * lumFilter[j];
 384
 385             aDest[i]= av_clip_uint8(val>>19);
 386         }
 387 }
 388
 389 static void yuv2yuv1_c(SwsContext *c, const int16_t *lumSrc,
 390                        const int16_t *chrUSrc, const int16_t *chrVSrc,
 391                        const int16_t *alpSrc,
 392                        uint8_t *dest[4], int dstW, int chrDstW)
 393 {
 394     uint8_t *yDest = dest[0], *uDest = dest[1], *vDest = dest[2],
 395             *aDest = CONFIG_SWSCALE_ALPHA ? dest[3] : NULL;
 396     int i;
 397     const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
 398
 399     for (i=0; i<dstW; i++) {
 400         int val = (lumSrc[i]+  lumDither[i & 7]) >> 7;
 401         yDest[i]= av_clip_uint8(val);
 402     }
 403
 404     if (uDest)
 405         for (i=0; i<chrDstW; i++) {
 406             int u = (chrUSrc[i] + chrDither[i & 7])       >> 7;
 407             int v = (chrVSrc[i] + chrDither[(i + 3) & 7]) >> 7;
 408             uDest[i]= av_clip_uint8(u);
 409             vDest[i]= av_clip_uint8(v);
 410         }
 411
 412     if (CONFIG_SWSCALE_ALPHA && aDest)
 413         for (i=0; i<dstW; i++) {
 414             int val = (alpSrc[i] + lumDither[i & 7]) >> 7;
 415             aDest[i]= av_clip_uint8(val);
 416         }
 417 }
 418
 419 static void yuv2nv12X_c(SwsContext *c, const int16_t *lumFilter,
 420                         const int16_t **lumSrc, int lumFilterSize,
 421                         const int16_t *chrFilter, const int16_t **chrUSrc,
 422                         const int16_t **chrVSrc, int chrFilterSize,
 423                         const int16_t **alpSrc, uint8_t *dest[4],
 424                         int dstW, int chrDstW)
 425 {
 426     uint8_t *yDest = dest[0], *uDest = dest[1];
 427     enum PixelFormat dstFormat = c->dstFormat;
 428     const uint8_t *lumDither = c->lumDither8, *chrDither = c->chrDither8;
 429
 430     //FIXME Optimize (just quickly written not optimized..)
 431     int i;
 432     for (i=0; i<dstW; i++) {
 433         int val = lumDither[i & 7] << 12;
 434         int j;
 435         for (j=0; j<lumFilterSize; j++)
 436             val += lumSrc[j][i] * lumFilter[j];
 437
 438         yDest[i]= av_clip_uint8(val>>19);
 439     }
 440
 441     if (!uDest)
 442         return;
 443
 444     if (dstFormat == PIX_FMT_NV12)
 445         for (i=0; i<chrDstW; i++) {
 446             int u = chrDither[i & 7] << 12;
 447             int v = chrDither[(i + 3) & 7] << 12;
 448             int j;
 449             for (j=0; j<chrFilterSize; j++) {
 450                 u += chrUSrc[j][i] * chrFilter[j];
 451                 v += chrVSrc[j][i] * chrFilter[j];
 452             }
 453
 454             uDest[2*i]= av_clip_uint8(u>>19);
 455             uDest[2*i+1]= av_clip_uint8(v>>19);
 456         }
 457     else
 458         for (i=0; i<chrDstW; i++) {
 459             int u = chrDither[i & 7] << 12;
 460             int v = chrDither[(i + 3) & 7] << 12;
 461             int j;
 462             for (j=0; j<chrFilterSize; j++) {
 463                 u += chrUSrc[j][i] * chrFilter[j];
 464                 v += chrVSrc[j][i] * chrFilter[j];
 465             }
 466
 467             uDest[2*i]= av_clip_uint8(v>>19);
 468             uDest[2*i+1]= av_clip_uint8(u>>19);
 469         }
 470 }
 471
 472 #define output_pixel(pos, val) \
 473         if (target == PIX_FMT_GRAY16BE) { \
 474             AV_WB16(pos, val); \
 475         } else { \
 476             AV_WL16(pos, val); \
 477         }
 478
 479 static av_always_inline void
 480 yuv2gray16_X_c_template(SwsContext *c, const int16_t *lumFilter,
 481                         const int32_t **lumSrc, int lumFilterSize,
 482                         const int16_t *chrFilter, const int32_t **chrUSrc,
 483                         const int32_t **chrVSrc, int chrFilterSize,
 484                         const int32_t **alpSrc, uint16_t *dest, int dstW,
 485                         int y, enum PixelFormat target)
 486 {
 487     int i;
 488
 489     for (i = 0; i < (dstW >> 1); i++) {
 490         int j;
 491         int Y1 = 1 << 14;
 492         int Y2 = 1 << 14;
 493
 494         for (j = 0; j < lumFilterSize; j++) {
 495             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 496             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 497         }
 498         Y1 >>= 15;
 499         Y2 >>= 15;
 500         if ((Y1 | Y2) & 0x10000) {
 501             Y1 = av_clip_uint16(Y1);
 502             Y2 = av_clip_uint16(Y2);
 503         }
 504         output_pixel(&dest[i * 2 + 0], Y1);
 505         output_pixel(&dest[i * 2 + 1], Y2);
 506     }
 507 }
 508
 509 static av_always_inline void
 510 yuv2gray16_2_c_template(SwsContext *c, const int32_t *buf[2],
 511                         const int32_t *ubuf[2], const int32_t *vbuf[2],
 512                         const int32_t *abuf[2], uint16_t *dest, int dstW,
 513                         int yalpha, int uvalpha, int y,
 514                         enum PixelFormat target)
 515 {
 516     int  yalpha1 = 4095 - yalpha;
 517     int i;
 518     const int32_t *buf0 = buf[0], *buf1 = buf[1];
 519
 520     for (i = 0; i < (dstW >> 1); i++) {
 521         int Y1 = (buf0[i * 2    ] * yalpha1 + buf1[i * 2    ] * yalpha) >> 15;
 522         int Y2 = (buf0[i * 2 + 1] * yalpha1 + buf1[i * 2 + 1] * yalpha) >> 15;
 523
 524         output_pixel(&dest[i * 2 + 0], Y1);
 525         output_pixel(&dest[i * 2 + 1], Y2);
 526     }
 527 }
 528
 529 static av_always_inline void
 530 yuv2gray16_1_c_template(SwsContext *c, const int32_t *buf0,
 531                         const int32_t *ubuf[2], const int32_t *vbuf[2],
 532                         const int32_t *abuf0, uint16_t *dest, int dstW,
 533                         int uvalpha, int y, enum PixelFormat target)
 534 {
 535     int i;
 536
 537     for (i = 0; i < (dstW >> 1); i++) {
 538         int Y1 = buf0[i * 2    ] << 1;
 539         int Y2 = buf0[i * 2 + 1] << 1;
 540
 541         output_pixel(&dest[i * 2 + 0], Y1);
 542         output_pixel(&dest[i * 2 + 1], Y2);
 543     }
 544 }
 545
 546 #undef output_pixel
 547
 548 #define YUV2PACKED16WRAPPER(name, base, ext, fmt) \
 549 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
 550                         const int16_t **_lumSrc, int lumFilterSize, \
 551                         const int16_t *chrFilter, const int16_t **_chrUSrc, \
 552                         const int16_t **_chrVSrc, int chrFilterSize, \
 553                         const int16_t **_alpSrc, uint8_t *_dest, int dstW, \
 554                         int y) \
 555 { \
 556     const int32_t **lumSrc  = (const int32_t **) _lumSrc, \
 557                   **chrUSrc = (const int32_t **) _chrUSrc, \
 558                   **chrVSrc = (const int32_t **) _chrVSrc, \
 559                   **alpSrc  = (const int32_t **) _alpSrc; \
 560     uint16_t *dest = (uint16_t *) _dest; \
 561     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
 562                           chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 563                           alpSrc, dest, dstW, y, fmt); \
 564 } \
 565  \
 566 static void name ## ext ## _2_c(SwsContext *c, const int16_t *_buf[2], \
 567                         const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
 568                         const int16_t *_abuf[2], uint8_t *_dest, int dstW, \
 569                         int yalpha, int uvalpha, int y) \
 570 { \
 571     const int32_t **buf  = (const int32_t **) _buf, \
 572                   **ubuf = (const int32_t **) _ubuf, \
 573                   **vbuf = (const int32_t **) _vbuf, \
 574                   **abuf = (const int32_t **) _abuf; \
 575     uint16_t *dest = (uint16_t *) _dest; \
 576     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
 577                           dest, dstW, yalpha, uvalpha, y, fmt); \
 578 } \
 579  \
 580 static void name ## ext ## _1_c(SwsContext *c, const int16_t *_buf0, \
 581                         const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
 582                         const int16_t *_abuf0, uint8_t *_dest, int dstW, \
 583                         int uvalpha, int y) \
 584 { \
 585     const int32_t *buf0  = (const int32_t *)  _buf0, \
 586                  **ubuf  = (const int32_t **) _ubuf, \
 587                  **vbuf  = (const int32_t **) _vbuf, \
 588                   *abuf0 = (const int32_t *)  _abuf0; \
 589     uint16_t *dest = (uint16_t *) _dest; \
 590     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
 591                                   dstW, uvalpha, y, fmt); \
 592 }
 593
 594 YUV2PACKED16WRAPPER(yuv2gray16,, LE, PIX_FMT_GRAY16LE);
 595 YUV2PACKED16WRAPPER(yuv2gray16,, BE, PIX_FMT_GRAY16BE);
 596
 597 #define output_pixel(pos, acc) \
 598     if (target == PIX_FMT_MONOBLACK) { \
 599         pos = acc; \
 600     } else { \
 601         pos = ~acc; \
 602     }
 603
 604 static av_always_inline void
 605 yuv2mono_X_c_template(SwsContext *c, const int16_t *lumFilter,
 606                       const int16_t **lumSrc, int lumFilterSize,
 607                       const int16_t *chrFilter, const int16_t **chrUSrc,
 608                       const int16_t **chrVSrc, int chrFilterSize,
 609                       const int16_t **alpSrc, uint8_t *dest, int dstW,
 610                       int y, enum PixelFormat target)
 611 {
 612     const uint8_t * const d128=dither_8x8_220[y&7];
 613     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 614     int i;
 615     int acc = 0;
 616
 617     for (i = 0; i < dstW - 1; i += 2) {
 618         int j;
 619         int Y1 = 1 << 18;
 620         int Y2 = 1 << 18;
 621
 622         for (j = 0; j < lumFilterSize; j++) {
 623             Y1 += lumSrc[j][i]   * lumFilter[j];
 624             Y2 += lumSrc[j][i+1] * lumFilter[j];
 625         }
 626         Y1 >>= 19;
 627         Y2 >>= 19;
 628         if ((Y1 | Y2) & 0x100) {
 629             Y1 = av_clip_uint8(Y1);
 630             Y2 = av_clip_uint8(Y2);
 631         }
 632         acc += acc + g[Y1 + d128[(i + 0) & 7]];
 633         acc += acc + g[Y2 + d128[(i + 1) & 7]];
 634         if ((i & 7) == 6) {
 635             output_pixel(*dest++, acc);
 636         }
 637     }
 638 }
 639
 640 static av_always_inline void
 641 yuv2mono_2_c_template(SwsContext *c, const int16_t *buf[2],
 642                       const int16_t *ubuf[2], const int16_t *vbuf[2],
 643                       const int16_t *abuf[2], uint8_t *dest, int dstW,
 644                       int yalpha, int uvalpha, int y,
 645                       enum PixelFormat target)
 646 {
 647     const int16_t *buf0  = buf[0],  *buf1  = buf[1];
 648     const uint8_t * const d128 = dither_8x8_220[y & 7];
 649     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 650     int  yalpha1 = 4095 - yalpha;
 651     int i;
 652
 653     for (i = 0; i < dstW - 7; i += 8) {
 654         int acc =    g[((buf0[i    ] * yalpha1 + buf1[i    ] * yalpha) >> 19) + d128[0]];
 655         acc += acc + g[((buf0[i + 1] * yalpha1 + buf1[i + 1] * yalpha) >> 19) + d128[1]];
 656         acc += acc + g[((buf0[i + 2] * yalpha1 + buf1[i + 2] * yalpha) >> 19) + d128[2]];
 657         acc += acc + g[((buf0[i + 3] * yalpha1 + buf1[i + 3] * yalpha) >> 19) + d128[3]];
 658         acc += acc + g[((buf0[i + 4] * yalpha1 + buf1[i + 4] * yalpha) >> 19) + d128[4]];
 659         acc += acc + g[((buf0[i + 5] * yalpha1 + buf1[i + 5] * yalpha) >> 19) + d128[5]];
 660         acc += acc + g[((buf0[i + 6] * yalpha1 + buf1[i + 6] * yalpha) >> 19) + d128[6]];
 661         acc += acc + g[((buf0[i + 7] * yalpha1 + buf1[i + 7] * yalpha) >> 19) + d128[7]];
 662         output_pixel(*dest++, acc);
 663     }
 664 }
 665
 666 static av_always_inline void
 667 yuv2mono_1_c_template(SwsContext *c, const int16_t *buf0,
 668                       const int16_t *ubuf[2], const int16_t *vbuf[2],
 669                       const int16_t *abuf0, uint8_t *dest, int dstW,
 670                       int uvalpha, int y, enum PixelFormat target)
 671 {
 672     const uint8_t * const d128 = dither_8x8_220[y & 7];
 673     uint8_t *g = c->table_gU[128] + c->table_gV[128];
 674     int i;
 675
 676     for (i = 0; i < dstW - 7; i += 8) {
 677         int acc =    g[(buf0[i    ] >> 7) + d128[0]];
 678         acc += acc + g[(buf0[i + 1] >> 7) + d128[1]];
 679         acc += acc + g[(buf0[i + 2] >> 7) + d128[2]];
 680         acc += acc + g[(buf0[i + 3] >> 7) + d128[3]];
 681         acc += acc + g[(buf0[i + 4] >> 7) + d128[4]];
 682         acc += acc + g[(buf0[i + 5] >> 7) + d128[5]];
 683         acc += acc + g[(buf0[i + 6] >> 7) + d128[6]];
 684         acc += acc + g[(buf0[i + 7] >> 7) + d128[7]];
 685         output_pixel(*dest++, acc);
 686     }
 687 }
 688
 689 #undef output_pixel
 690
 691 #define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
 692 static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
 693                                 const int16_t **lumSrc, int lumFilterSize, \
 694                                 const int16_t *chrFilter, const int16_t **chrUSrc, \
 695                                 const int16_t **chrVSrc, int chrFilterSize, \
 696                                 const int16_t **alpSrc, uint8_t *dest, int dstW, \
 697                                 int y) \
 698 { \
 699     name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
 700                                   chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
 701                                   alpSrc, dest, dstW, y, fmt); \
 702 } \
 703  \
 704 static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
 705                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
 706                                 const int16_t *abuf[2], uint8_t *dest, int dstW, \
 707                                 int yalpha, int uvalpha, int y) \
 708 { \
 709     name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
 710                                   dest, dstW, yalpha, uvalpha, y, fmt); \
 711 } \
 712  \
 713 static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
 714                                 const int16_t *ubuf[2], const int16_t *vbuf[2], \
 715                                 const int16_t *abuf0, uint8_t *dest, int dstW, \
 716                                 int uvalpha, int y) \
 717 { \
 718     name ## base ## _1_c_template(c, buf0, ubuf, vbuf, \
 719                                   abuf0, dest, dstW, uvalpha, \
 720                                   y, fmt); \
 721 }
 722
 723 YUV2PACKEDWRAPPER(yuv2mono,, white, PIX_FMT_MONOWHITE);
 724 YUV2PACKEDWRAPPER(yuv2mono,, black, PIX_FMT_MONOBLACK);
 725
 726 #define output_pixels(pos, Y1, U, Y2, V) \
 727     if (target == PIX_FMT_YUYV422) { \
 728         dest[pos + 0] = Y1; \
 729         dest[pos + 1] = U;  \
 730         dest[pos + 2] = Y2; \
 731         dest[pos + 3] = V;  \
 732     } else { \
 733         dest[pos + 0] = U;  \
 734         dest[pos + 1] = Y1; \
 735         dest[pos + 2] = V;  \
 736         dest[pos + 3] = Y2; \
 737     }
 738
 739 static av_always_inline void
 740 yuv2422_X_c_template(SwsContext *c, const int16_t *lumFilter,
 741                      const int16_t **lumSrc, int lumFilterSize,
 742                      const int16_t *chrFilter, const int16_t **chrUSrc,
 743                      const int16_t **chrVSrc, int chrFilterSize,
 744                      const int16_t **alpSrc, uint8_t *dest, int dstW,
 745                      int y, enum PixelFormat target)
 746 {
 747     int i;
 748
 749     for (i = 0; i < (dstW >> 1); i++) {
 750         int j;
 751         int Y1 = 1 << 18;
 752         int Y2 = 1 << 18;
 753         int U  = 1 << 18;
 754         int V  = 1 << 18;
 755
 756         for (j = 0; j < lumFilterSize; j++) {
 757             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 758             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 759         }
 760         for (j = 0; j < chrFilterSize; j++) {
 761             U += chrUSrc[j][i] * chrFilter[j];
 762             V += chrVSrc[j][i] * chrFilter[j];
 763         }
 764         Y1 >>= 19;
 765         Y2 >>= 19;
 766         U  >>= 19;
 767         V  >>= 19;
 768         if ((Y1 | Y2 | U | V) & 0x100) {
 769             Y1 = av_clip_uint8(Y1);
 770             Y2 = av_clip_uint8(Y2);
 771             U  = av_clip_uint8(U);
 772             V  = av_clip_uint8(V);
 773         }
 774         output_pixels(4*i, Y1, U, Y2, V);
 775     }
 776 }
 777
 778 static av_always_inline void
 779 yuv2422_2_c_template(SwsContext *c, const int16_t *buf[2],
 780                      const int16_t *ubuf[2], const int16_t *vbuf[2],
 781                      const int16_t *abuf[2], uint8_t *dest, int dstW,
 782                      int yalpha, int uvalpha, int y,
 783                      enum PixelFormat target)
 784 {
 785     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
 786                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 787                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 788     int  yalpha1 = 4095 - yalpha;
 789     int uvalpha1 = 4095 - uvalpha;
 790     int i;
 791
 792     for (i = 0; i < (dstW >> 1); i++) {
 793         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
 794         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
 795         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
 796         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
 797
 798         output_pixels(i * 4, Y1, U, Y2, V);
 799     }
 800 }
 801
 802 static av_always_inline void
 803 yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
 804                      const int16_t *ubuf[2], const int16_t *vbuf[2],
 805                      const int16_t *abuf0, uint8_t *dest, int dstW,
 806                      int uvalpha, int y, enum PixelFormat target)
 807 {
 808     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 809                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 810     int i;
 811
 812     if (uvalpha < 2048) {
 813         for (i = 0; i < (dstW >> 1); i++) {
 814             int Y1 = buf0[i * 2]     >> 7;
 815             int Y2 = buf0[i * 2 + 1] >> 7;
 816             int U  = ubuf1[i]        >> 7;
 817             int V  = vbuf1[i]        >> 7;
 818
 819             output_pixels(i * 4, Y1, U, Y2, V);
 820         }
 821     } else {
 822         for (i = 0; i < (dstW >> 1); i++) {
 823             int Y1 =  buf0[i * 2]          >> 7;
 824             int Y2 =  buf0[i * 2 + 1]      >> 7;
 825             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
 826             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
 827
 828             output_pixels(i * 4, Y1, U, Y2, V);
 829         }
 830     }
 831 }
 832
/* output_pixels() is private to the yuv2422_*_c_template() functions above */
#undef output_pixels

/*
 * Instantiate the packed 4:2:2 output functions for both byte orders.
 * NOTE(review): YUV2PACKEDWRAPPER is defined earlier in the file; presumably
 * it generates _X/_2/_1 entry points that forward to the yuv2422 templates
 * with the given pixel format as `target` - confirm against its definition.
 */
YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, PIX_FMT_YUYV422);
YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, PIX_FMT_UYVY422);
 837
 838 #define R_B ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? R : B)
 839 #define B_R ((target == PIX_FMT_RGB48LE || target == PIX_FMT_RGB48BE) ? B : R)
 840 #define output_pixel(pos, val) \
 841     if (isBE(target)) { \
 842         AV_WB16(pos, val); \
 843     } else { \
 844         AV_WL16(pos, val); \
 845     }
 846
 847 static av_always_inline void
 848 yuv2rgb48_X_c_template(SwsContext *c, const int16_t *lumFilter,
 849                        const int32_t **lumSrc, int lumFilterSize,
 850                        const int16_t *chrFilter, const int32_t **chrUSrc,
 851                        const int32_t **chrVSrc, int chrFilterSize,
 852                        const int32_t **alpSrc, uint16_t *dest, int dstW,
 853                        int y, enum PixelFormat target)
 854 {
 855     int i;
 856
 857     for (i = 0; i < (dstW >> 1); i++) {
 858         int j;
 859         int Y1 = 0;
 860         int Y2 = 0;
 861         int U  = -128 << 23; // 19
 862         int V  = -128 << 23;
 863         int R, G, B;
 864
 865         for (j = 0; j < lumFilterSize; j++) {
 866             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
 867             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
 868         }
 869         for (j = 0; j < chrFilterSize; j++) {
 870             U += chrUSrc[j][i] * chrFilter[j];
 871             V += chrVSrc[j][i] * chrFilter[j];
 872         }
 873
 874         // 8bit: 12+15=27; 16-bit: 12+19=31
 875         Y1 >>= 14; // 10
 876         Y2 >>= 14;
 877         U  >>= 14;
 878         V  >>= 14;
 879
 880         // 8bit: 27 -> 17bit, 16bit: 31 - 14 = 17bit
 881         Y1 -= c->yuv2rgb_y_offset;
 882         Y2 -= c->yuv2rgb_y_offset;
 883         Y1 *= c->yuv2rgb_y_coeff;
 884         Y2 *= c->yuv2rgb_y_coeff;
 885         Y1 += 1 << 13; // 21
 886         Y2 += 1 << 13;
 887         // 8bit: 17 + 13bit = 30bit, 16bit: 17 + 13bit = 30bit
 888
 889         R = V * c->yuv2rgb_v2r_coeff;
 890         G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 891         B =                            U * c->yuv2rgb_u2b_coeff;
 892
 893         // 8bit: 30 - 22 = 8bit, 16bit: 30bit - 14 = 16bit
 894         output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 895         output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 896         output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 897         output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 898         output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 899         output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 900         dest += 6;
 901     }
 902 }
 903
 904 static av_always_inline void
 905 yuv2rgb48_2_c_template(SwsContext *c, const int32_t *buf[2],
 906                        const int32_t *ubuf[2], const int32_t *vbuf[2],
 907                        const int32_t *abuf[2], uint16_t *dest, int dstW,
 908                        int yalpha, int uvalpha, int y,
 909                        enum PixelFormat target)
 910 {
 911     const int32_t *buf0  = buf[0],  *buf1  = buf[1],
 912                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 913                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 914     int  yalpha1 = 4095 - yalpha;
 915     int uvalpha1 = 4095 - uvalpha;
 916     int i;
 917
 918     for (i = 0; i < (dstW >> 1); i++) {
 919         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha) >> 14;
 920         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha) >> 14;
 921         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha + (-128 << 23)) >> 14;
 922         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha + (-128 << 23)) >> 14;
 923         int R, G, B;
 924
 925         Y1 -= c->yuv2rgb_y_offset;
 926         Y2 -= c->yuv2rgb_y_offset;
 927         Y1 *= c->yuv2rgb_y_coeff;
 928         Y2 *= c->yuv2rgb_y_coeff;
 929         Y1 += 1 << 13;
 930         Y2 += 1 << 13;
 931
 932         R = V * c->yuv2rgb_v2r_coeff;
 933         G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 934         B =                            U * c->yuv2rgb_u2b_coeff;
 935
 936         output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 937         output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 938         output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 939         output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 940         output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 941         output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 942         dest += 6;
 943     }
 944 }
 945
 946 static av_always_inline void
 947 yuv2rgb48_1_c_template(SwsContext *c, const int32_t *buf0,
 948                        const int32_t *ubuf[2], const int32_t *vbuf[2],
 949                        const int32_t *abuf0, uint16_t *dest, int dstW,
 950                        int uvalpha, int y, enum PixelFormat target)
 951 {
 952     const int32_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
 953                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 954     int i;
 955
 956     if (uvalpha < 2048) {
 957         for (i = 0; i < (dstW >> 1); i++) {
 958             int Y1 = (buf0[i * 2]    ) >> 2;
 959             int Y2 = (buf0[i * 2 + 1]) >> 2;
 960             int U  = (ubuf0[i] + (-128 << 11)) >> 2;
 961             int V  = (vbuf0[i] + (-128 << 11)) >> 2;
 962             int R, G, B;
 963
 964             Y1 -= c->yuv2rgb_y_offset;
 965             Y2 -= c->yuv2rgb_y_offset;
 966             Y1 *= c->yuv2rgb_y_coeff;
 967             Y2 *= c->yuv2rgb_y_coeff;
 968             Y1 += 1 << 13;
 969             Y2 += 1 << 13;
 970
 971             R = V * c->yuv2rgb_v2r_coeff;
 972             G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
 973             B =                            U * c->yuv2rgb_u2b_coeff;
 974
 975             output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
 976             output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
 977             output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
 978             output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
 979             output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
 980             output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
 981             dest += 6;
 982         }
 983     } else {
 984         for (i = 0; i < (dstW >> 1); i++) {
 985             int Y1 = (buf0[i * 2]    ) >> 2;
 986             int Y2 = (buf0[i * 2 + 1]) >> 2;
 987             int U  = (ubuf0[i] + ubuf1[i] + (-128 << 11)) >> 3;
 988             int V  = (vbuf0[i] + vbuf1[i] + (-128 << 11)) >> 3;
 989             int R, G, B;
 990
 991             Y1 -= c->yuv2rgb_y_offset;
 992             Y2 -= c->yuv2rgb_y_offset;
 993             Y1 *= c->yuv2rgb_y_coeff;
 994             Y2 *= c->yuv2rgb_y_coeff;
 995             Y1 += 1 << 13;
 996             Y2 += 1 << 13;
 997
 998             R = V * c->yuv2rgb_v2r_coeff;
 999             G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
1000             B =                            U * c->yuv2rgb_u2b_coeff;
1001
1002             output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
1003             output_pixel(&dest[1], av_clip_uintp2(  G + Y1, 30) >> 14);
1004             output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
1005             output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
1006             output_pixel(&dest[4], av_clip_uintp2(  G + Y2, 30) >> 14);
1007             output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
1008             dest += 6;
1009         }
1010     }
1011 }
1012
/*
 * Drop the helper macros of the 48-bit writers. The component selectors are
 * defined in upper case (R_B / B_R), so they must be undefined under those
 * names; undefining the lower-case spellings would leave them defined for
 * the remainder of the file.
 */
#undef output_pixel
#undef R_B
#undef B_R
1016
/*
 * Instantiate the 48-bit packed output functions for both component orders
 * (RGB / BGR) and both byte orders (BE / LE).
 * NOTE(review): YUV2PACKED16WRAPPER is defined earlier in the file;
 * presumably it generates _X/_2/_1 entry points forwarding to the
 * yuv2rgb48 templates with the given pixel format as `target` - confirm
 * against its definition.
 */
YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48be, PIX_FMT_RGB48BE);
YUV2PACKED16WRAPPER(yuv2, rgb48, rgb48le, PIX_FMT_RGB48LE);
YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48be, PIX_FMT_BGR48BE);
YUV2PACKED16WRAPPER(yuv2, rgb48, bgr48le, PIX_FMT_BGR48LE);
1021
1022 static av_always_inline void
1023 yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2,
1024               int U, int V, int A1, int A2,
1025               const void *_r, const void *_g, const void *_b, int y,
1026               enum PixelFormat target, int hasAlpha)
1027 {
1028     if (target == PIX_FMT_ARGB || target == PIX_FMT_RGBA ||
1029         target == PIX_FMT_ABGR || target == PIX_FMT_BGRA) {
1030         uint32_t *dest = (uint32_t *) _dest;
1031         const uint32_t *r = (const uint32_t *) _r;
1032         const uint32_t *g = (const uint32_t *) _g;
1033         const uint32_t *b = (const uint32_t *) _b;
1034
1035 #if CONFIG_SMALL
1036         int sh = hasAlpha ? ((target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24) : 0;
1037
1038         dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (hasAlpha ? A1 << sh : 0);
1039         dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (hasAlpha ? A2 << sh : 0);
1040 #else
1041         if (hasAlpha) {
1042             int sh = (target == PIX_FMT_RGB32_1 || target == PIX_FMT_BGR32_1) ? 0 : 24;
1043
1044             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (A1 << sh);
1045             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (A2 << sh);
1046         } else {
1047             dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
1048             dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
1049         }
1050 #endif
1051     } else if (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) {
1052         uint8_t *dest = (uint8_t *) _dest;
1053         const uint8_t *r = (const uint8_t *) _r;
1054         const uint8_t *g = (const uint8_t *) _g;
1055         const uint8_t *b = (const uint8_t *) _b;
1056
1057 #define r_b ((target == PIX_FMT_RGB24) ? r : b)
1058 #define b_r ((target == PIX_FMT_RGB24) ? b : r)
1059         dest[i * 6 + 0] = r_b[Y1];
1060         dest[i * 6 + 1] =   g[Y1];
1061         dest[i * 6 + 2] = b_r[Y1];
1062         dest[i * 6 + 3] = r_b[Y2];
1063         dest[i * 6 + 4] =   g[Y2];
1064         dest[i * 6 + 5] = b_r[Y2];
1065 #undef r_b
1066 #undef b_r
1067     } else if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565 ||
1068                target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555 ||
1069                target == PIX_FMT_RGB444 || target == PIX_FMT_BGR444) {
1070         uint16_t *dest = (uint16_t *) _dest;
1071         const uint16_t *r = (const uint16_t *) _r;
1072         const uint16_t *g = (const uint16_t *) _g;
1073         const uint16_t *b = (const uint16_t *) _b;
1074         int dr1, dg1, db1, dr2, dg2, db2;
1075
1076         if (target == PIX_FMT_RGB565 || target == PIX_FMT_BGR565) {
1077             dr1 = dither_2x2_8[ y & 1     ][0];
1078             dg1 = dither_2x2_4[ y & 1     ][0];
1079             db1 = dither_2x2_8[(y & 1) ^ 1][0];
1080             dr2 = dither_2x2_8[ y & 1     ][1];
1081             dg2 = dither_2x2_4[ y & 1     ][1];
1082             db2 = dither_2x2_8[(y & 1) ^ 1][1];
1083         } else if (target == PIX_FMT_RGB555 || target == PIX_FMT_BGR555) {
1084             dr1 = dither_2x2_8[ y & 1     ][0];
1085             dg1 = dither_2x2_8[ y & 1     ][1];
1086             db1 = dither_2x2_8[(y & 1) ^ 1][0];
1087             dr2 = dither_2x2_8[ y & 1     ][1];
1088             dg2 = dither_2x2_8[ y & 1     ][0];
1089             db2 = dither_2x2_8[(y & 1) ^ 1][1];
1090         } else {
1091             dr1 = dither_4x4_16[ y & 3     ][0];
1092             dg1 = dither_4x4_16[ y & 3     ][1];
1093             db1 = dither_4x4_16[(y & 3) ^ 3][0];
1094             dr2 = dither_4x4_16[ y & 3     ][1];
1095             dg2 = dither_4x4_16[ y & 3     ][0];
1096             db2 = dither_4x4_16[(y & 3) ^ 3][1];
1097         }
1098
1099         dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
1100         dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
1101     } else /* 8/4-bit */ {
1102         uint8_t *dest = (uint8_t *) _dest;
1103         const uint8_t *r = (const uint8_t *) _r;
1104         const uint8_t *g = (const uint8_t *) _g;
1105         const uint8_t *b = (const uint8_t *) _b;
1106         int dr1, dg1, db1, dr2, dg2, db2;
1107
1108         if (target == PIX_FMT_RGB8 || target == PIX_FMT_BGR8) {
1109             const uint8_t * const d64 = dither_8x8_73[y & 7];
1110             const uint8_t * const d32 = dither_8x8_32[y & 7];
1111             dr1 = dg1 = d32[(i * 2 + 0) & 7];
1112             db1 =       d64[(i * 2 + 0) & 7];
1113             dr2 = dg2 = d32[(i * 2 + 1) & 7];
1114             db2 =       d64[(i * 2 + 1) & 7];
1115         } else {
1116             const uint8_t * const d64  = dither_8x8_73 [y & 7];
1117             const uint8_t * const d128 = dither_8x8_220[y & 7];
1118             dr1 = db1 = d128[(i * 2 + 0) & 7];
1119             dg1 =        d64[(i * 2 + 0) & 7];
1120             dr2 = db2 = d128[(i * 2 + 1) & 7];
1121             dg2 =        d64[(i * 2 + 1) & 7];
1122         }
1123
1124         if (target == PIX_FMT_RGB4 || target == PIX_FMT_BGR4) {
1125             dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
1126                     ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
1127         } else {
1128             dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
1129             dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
1130         }
1131     }
1132 }
1133
1134 static av_always_inline void
1135 yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter,
1136                      const int16_t **lumSrc, int lumFilterSize,
1137                      const int16_t *chrFilter, const int16_t **chrUSrc,
1138                      const int16_t **chrVSrc, int chrFilterSize,
1139                      const int16_t **alpSrc, uint8_t *dest, int dstW,
1140                      int y, enum PixelFormat target, int hasAlpha)
1141 {
1142     int i;
1143
1144     for (i = 0; i < (dstW >> 1); i++) {
1145         int j;
1146         int Y1 = 1 << 18;
1147         int Y2 = 1 << 18;
1148         int U  = 1 << 18;
1149         int V  = 1 << 18;
1150         int av_unused A1, A2;
1151         const void *r, *g, *b;
1152
1153         for (j = 0; j < lumFilterSize; j++) {
1154             Y1 += lumSrc[j][i * 2]     * lumFilter[j];
1155             Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
1156         }
1157         for (j = 0; j < chrFilterSize; j++) {
1158             U += chrUSrc[j][i] * chrFilter[j];
1159             V += chrVSrc[j][i] * chrFilter[j];
1160         }
1161         Y1 >>= 19;
1162         Y2 >>= 19;
1163         U  >>= 19;
1164         V  >>= 19;
1165         if ((Y1 | Y2 | U | V) & 0x100) {
1166             Y1 = av_clip_uint8(Y1);
1167             Y2 = av_clip_uint8(Y2);
1168             U  = av_clip_uint8(U);
1169             V  = av_clip_uint8(V);
1170         }
1171         if (hasAlpha) {
1172             A1 = 1 << 18;
1173             A2 = 1 << 18;
1174             for (j = 0; j < lumFilterSize; j++) {
1175                 A1 += alpSrc[j][i * 2    ] * lumFilter[j];
1176                 A2 += alpSrc[j][i * 2 + 1] * lumFilter[j];
1177             }
1178             A1 >>= 19;
1179             A2 >>= 19;
1180             if ((A1 | A2) & 0x100) {
1181                 A1 = av_clip_uint8(A1);
1182                 A2 = av_clip_uint8(A2);
1183             }
1184         }
1185
1186         /* FIXME fix tables so that clipping is not needed and then use _NOCLIP*/
1187         r =  c->table_rV[V];
1188         g = (c->table_gU[U] + c->table_gV[V]);
1189         b =  c->table_bU[U];
1190
1191         yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1192                       r, g, b, y, target, hasAlpha);
1193     }
1194 }
1195
1196 static av_always_inline void
1197 yuv2rgb_2_c_template(SwsContext *c, const int16_t *buf[2],
1198                      const int16_t *ubuf[2], const int16_t *vbuf[2],
1199                      const int16_t *abuf[2], uint8_t *dest, int dstW,
1200                      int yalpha, int uvalpha, int y,
1201                      enum PixelFormat target, int hasAlpha)
1202 {
1203     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
1204                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1205                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
1206                   *abuf0 = hasAlpha ? abuf[0] : NULL,
1207                   *abuf1 = hasAlpha ? abuf[1] : NULL;
1208     int  yalpha1 = 4095 - yalpha;
1209     int uvalpha1 = 4095 - uvalpha;
1210     int i;
1211
1212     for (i = 0; i < (dstW >> 1); i++) {
1213         int Y1 = (buf0[i * 2]     * yalpha1  + buf1[i * 2]     * yalpha)  >> 19;
1214         int Y2 = (buf0[i * 2 + 1] * yalpha1  + buf1[i * 2 + 1] * yalpha)  >> 19;
1215         int U  = (ubuf0[i]        * uvalpha1 + ubuf1[i]        * uvalpha) >> 19;
1216         int V  = (vbuf0[i]        * uvalpha1 + vbuf1[i]        * uvalpha) >> 19;
1217         int A1, A2;
1218         const void *r =  c->table_rV[V],
1219                    *g = (c->table_gU[U] + c->table_gV[V]),
1220                    *b =  c->table_bU[U];
1221
1222         if (hasAlpha) {
1223             A1 = (abuf0[i * 2    ] * yalpha1 + abuf1[i * 2    ] * yalpha) >> 19;
1224             A2 = (abuf0[i * 2 + 1] * yalpha1 + abuf1[i * 2 + 1] * yalpha) >> 19;
1225         }
1226
1227         yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1228                       r, g, b, y, target, hasAlpha);
1229     }
1230 }
1231
1232 static av_always_inline void
1233 yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0,
1234                      const int16_t *ubuf[2], const int16_t *vbuf[2],
1235                      const int16_t *abuf0, uint8_t *dest, int dstW,
1236                      int uvalpha, int y, enum PixelFormat target,
1237                      int hasAlpha)
1238 {
1239     const int16_t *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1240                   *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
1241     int i;
1242
1243     if (uvalpha < 2048) {
1244         for (i = 0; i < (dstW >> 1); i++) {
1245             int Y1 = buf0[i * 2]     >> 7;
1246             int Y2 = buf0[i * 2 + 1] >> 7;
1247             int U  = ubuf1[i]        >> 7;
1248             int V  = vbuf1[i]        >> 7;
1249             int A1, A2;
1250             const void *r =  c->table_rV[V],
1251                        *g = (c->table_gU[U] + c->table_gV[V]),
1252                        *b =  c->table_bU[U];
1253
1254             if (hasAlpha) {
1255                 A1 = abuf0[i * 2    ] >> 7;
1256                 A2 = abuf0[i * 2 + 1] >> 7;
1257             }
1258
1259             yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1260                           r, g, b, y, target, hasAlpha);
1261         }
1262     } else {
1263         for (i = 0; i < (dstW >> 1); i++) {
1264             int Y1 =  buf0[i * 2]          >> 7;
1265             int Y2 =  buf0[i * 2 + 1]      >> 7;
1266             int U  = (ubuf0[i] + ubuf1[i]) >> 8;
1267             int V  = (vbuf0[i] + vbuf1[i]) >> 8;
1268             int A1, A2;
1269             const void *r =  c->table_rV[V],
1270                        *g = (c->table_gU[U] + c->table_gV[V]),
1271                        *b =  c->table_bU[U];
1272
1273             if (hasAlpha) {
1274                 A1 = abuf0[i * 2    ] >> 7;
1275                 A2 = abuf0[i * 2 + 1] >> 7;
1276             }
1277
1278             yuv2rgb_write(dest, i, Y1, Y2, U, V, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
1279                           r, g, b, y, target, hasAlpha);
1280         }
1281     }
1282 }
1283
/*
 * Define the function <name><ext>_X_c(), the multi-tap vertical output
 * entry point for one pixel format. It forwards all of its arguments to
 * <name><base>_X_c_template() and appends `fmt` as the target format and
 * `hasAlpha` as the alpha flag. `hasAlpha` is pasted textually into the
 * generated body, so it may be an expression that refers to the generated
 * function's parameter `c`.
 */
#define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
                                const int16_t **lumSrc, int lumFilterSize, \
                                const int16_t *chrFilter, const int16_t **chrUSrc, \
                                const int16_t **chrVSrc, int chrFilterSize, \
                                const int16_t **alpSrc, uint8_t *dest, int dstW, \
                                int y) \
{ \
    name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
                                  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
                                  alpSrc, dest, dstW, y, fmt, hasAlpha); \
}
/*
 * Define the complete set of output entry points for one pixel format:
 * <name><ext>_X_c() (via YUV2RGBWRAPPERX), <name><ext>_2_c() (two-line
 * blend) and <name><ext>_1_c() (single luma line). Each one forwards its
 * arguments to the matching <name><base>_{X,2,1}_c_template() and appends
 * `fmt` as the target format and `hasAlpha` as the alpha flag.
 */
#define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
                                const int16_t *ubuf[2], const int16_t *vbuf[2], \
                                const int16_t *abuf[2], uint8_t *dest, int dstW, \
                                int yalpha, int uvalpha, int y) \
{ \
    name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
                                  dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha); \
} \
 \
static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
                                const int16_t *ubuf[2], const int16_t *vbuf[2], \
                                const int16_t *abuf0, uint8_t *dest, int dstW, \
                                int uvalpha, int y) \
{ \
    name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
                                  dstW, uvalpha, y, fmt, hasAlpha); \
}
1315
/*
 * Instantiate the table-driven RGB output functions.
 *
 * For the 32-bit formats a CONFIG_SMALL build generates one function set
 * per format that decides at run time (CONFIG_SWSCALE_ALPHA &&
 * c->alpPixBuf) whether alpha is written. Other builds generate separate
 * sets with a constant alpha flag: "a32"/"a32_1" with alpha (only when
 * CONFIG_SWSCALE_ALPHA is enabled) and "x32"/"x32_1" without.
 * All remaining formats never carry alpha.
 */
#if CONFIG_SMALL
YUV2RGBWRAPPER(yuv2rgb,,  32_1,  PIX_FMT_RGB32_1,   CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
YUV2RGBWRAPPER(yuv2rgb,,  32,    PIX_FMT_RGB32,     CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
#else
#if CONFIG_SWSCALE_ALPHA
YUV2RGBWRAPPER(yuv2rgb,, a32_1,  PIX_FMT_RGB32_1,   1);
YUV2RGBWRAPPER(yuv2rgb,, a32,    PIX_FMT_RGB32,     1);
#endif
YUV2RGBWRAPPER(yuv2rgb,, x32_1,  PIX_FMT_RGB32_1,   0);
YUV2RGBWRAPPER(yuv2rgb,, x32,    PIX_FMT_RGB32,     0);
#endif
YUV2RGBWRAPPER(yuv2, rgb, rgb24, PIX_FMT_RGB24,   0);
YUV2RGBWRAPPER(yuv2, rgb, bgr24, PIX_FMT_BGR24,   0);
YUV2RGBWRAPPER(yuv2rgb,,  16,    PIX_FMT_RGB565,    0);
YUV2RGBWRAPPER(yuv2rgb,,  15,    PIX_FMT_RGB555,    0);
YUV2RGBWRAPPER(yuv2rgb,,  12,    PIX_FMT_RGB444,    0);
YUV2RGBWRAPPER(yuv2rgb,,   8,    PIX_FMT_RGB8,      0);
YUV2RGBWRAPPER(yuv2rgb,,   4,    PIX_FMT_RGB4,      0);
YUV2RGBWRAPPER(yuv2rgb,,   4b,   PIX_FMT_RGB4_BYTE, 0);
1335
1336 static av_always_inline void
1337 yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
1338                           const int16_t **lumSrc, int lumFilterSize,
1339                           const int16_t *chrFilter, const int16_t **chrUSrc,
1340                           const int16_t **chrVSrc, int chrFilterSize,
1341                           const int16_t **alpSrc, uint8_t *dest,
1342                           int dstW, int y, enum PixelFormat target, int hasAlpha)
1343 {
1344     int i;
1345     int step = (target == PIX_FMT_RGB24 || target == PIX_FMT_BGR24) ? 3 : 4;
1346
1347     for (i = 0; i < dstW; i++) {
1348         int j;
1349         int Y = 0;
1350         int U = -128 << 19;
1351         int V = -128 << 19;
1352         int av_unused A;
1353         int R, G, B;
1354
1355         for (j = 0; j < lumFilterSize; j++) {
1356             Y += lumSrc[j][i] * lumFilter[j];
1357         }
1358         for (j = 0; j < chrFilterSize; j++) {
1359             U += chrUSrc[j][i] * chrFilter[j];
1360             V += chrVSrc[j][i] * chrFilter[j];
1361         }
1362         Y >>= 10;
1363         U >>= 10;
1364         V >>= 10;
1365         if (hasAlpha) {
1366             A = 1 << 21;
1367             for (j = 0; j < lumFilterSize; j++) {
1368                 A += alpSrc[j][i] * lumFilter[j];
1369             }
1370             A >>= 19;
1371             if (A & 0x100)
1372                 A = av_clip_uint8(A);
1373         }
1374         Y -= c->yuv2rgb_y_offset;
1375         Y *= c->yuv2rgb_y_coeff;
1376         Y += 1 << 21;
1377         R = Y + V*c->yuv2rgb_v2r_coeff;
1378         G = Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;
1379         B = Y +                          U*c->yuv2rgb_u2b_coeff;
1380         if ((R | G | B) & 0xC0000000) {
1381             R = av_clip_uintp2(R, 30);
1382             G = av_clip_uintp2(G, 30);
1383             B = av_clip_uintp2(B, 30);
1384         }
1385
1386         switch(target) {
1387         case PIX_FMT_ARGB:
1388             dest[0] = hasAlpha ? A : 255;
1389             dest[1] = R >> 22;
1390             dest[2] = G >> 22;
1391             dest[3] = B >> 22;
1392             break;
1393         case PIX_FMT_RGB24:
1394             dest[0] = R >> 22;
1395             dest[1] = G >> 22;
1396             dest[2] = B >> 22;
1397             break;
1398         case PIX_FMT_RGBA:
1399             dest[0] = R >> 22;
1400             dest[1] = G >> 22;
1401             dest[2] = B >> 22;
1402             dest[3] = hasAlpha ? A : 255;
1403             break;
1404         case PIX_FMT_ABGR:
1405             dest[0] = hasAlpha ? A : 255;
1406             dest[1] = B >> 22;
1407             dest[2] = G >> 22;
1408             dest[3] = R >> 22;
1409             dest += 4;
1410             break;
1411         case PIX_FMT_BGR24:
1412             dest[0] = B >> 22;
1413             dest[1] = G >> 22;
1414             dest[2] = R >> 22;
1415             break;
1416         case PIX_FMT_BGRA:
1417             dest[0] = B >> 22;
1418             dest[1] = G >> 22;
1419             dest[2] = R >> 22;
1420             dest[3] = hasAlpha ? A : 255;
1421             break;
1422         }
1423         dest += step;
1424     }
1425 }
1426
/*
 * Instantiate the full-chroma-resolution RGB output functions; only the
 * multi-tap (_X_c) entry point exists for them.
 *
 * For the 32-bit formats a CONFIG_SMALL build generates one function per
 * format that decides at run time (CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
 * whether alpha is written. Other builds generate a variant with alpha
 * (only when CONFIG_SWSCALE_ALPHA is enabled) and an "x" variant without.
 * The 24-bit formats never carry alpha.
 */
#if CONFIG_SMALL
YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  CONFIG_SWSCALE_ALPHA && c->alpPixBuf);
#else
#if CONFIG_SWSCALE_ALPHA
YUV2RGBWRAPPERX(yuv2, rgb_full, bgra32_full, PIX_FMT_BGRA,  1);
YUV2RGBWRAPPERX(yuv2, rgb_full, abgr32_full, PIX_FMT_ABGR,  1);
YUV2RGBWRAPPERX(yuv2, rgb_full, rgba32_full, PIX_FMT_RGBA,  1);
YUV2RGBWRAPPERX(yuv2, rgb_full, argb32_full, PIX_FMT_ARGB,  1);
#endif
YUV2RGBWRAPPERX(yuv2, rgb_full, bgrx32_full, PIX_FMT_BGRA,  0);
YUV2RGBWRAPPERX(yuv2, rgb_full, xbgr32_full, PIX_FMT_ABGR,  0);
YUV2RGBWRAPPERX(yuv2, rgb_full, rgbx32_full, PIX_FMT_RGBA,  0);
YUV2RGBWRAPPERX(yuv2, rgb_full, xrgb32_full, PIX_FMT_ARGB,  0);
#endif
YUV2RGBWRAPPERX(yuv2, rgb_full, bgr24_full,  PIX_FMT_BGR24, 0);
YUV2RGBWRAPPERX(yuv2, rgb_full, rgb24_full,  PIX_FMT_RGB24, 0);
1446
1447 static av_always_inline void fillPlane(uint8_t* plane, int stride,
1448                                        int width, int height,
1449                                        int y, uint8_t val)
1450 {
1451     int i;
1452     uint8_t *ptr = plane + stride*y;
1453     for (i=0; i<height; i++) {
1454         memset(ptr, val, width);
1455         ptr += stride;
1456     }
1457 }
1458
/*
 * Read one 16-bit component at `pos`, big-endian if isBE(origin) is true
 * and little-endian otherwise. `origin` is taken from the expanding
 * function.
 */
#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))

/*
 * Map the component names r and b onto the local variables r_b (first
 * stored component) and b_r (third stored component) of the rgb48
 * templates below: for BGR48 origins r is b_r and b is r_b, for all other
 * origins r is r_b and b is b_r.
 * While these macros are defined, the bare identifiers `r` and `b` cannot
 * be used as variable names.
 */
#define r ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? b_r : r_b)
#define b ((origin == PIX_FMT_BGR48BE || origin == PIX_FMT_BGR48LE) ? r_b : b_r)
1463
1464 static av_always_inline void
1465 rgb48ToY_c_template(uint16_t *dst, const uint16_t *src, int width,
1466                     enum PixelFormat origin)
1467 {
1468     int i;
1469     for (i = 0; i < width; i++) {
1470         unsigned int r_b = input_pixel(&src[i*3+0]);
1471         unsigned int   g = input_pixel(&src[i*3+1]);
1472         unsigned int b_r = input_pixel(&src[i*3+2]);
1473
1474         dst[i] = (RY*r + GY*g + BY*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1475     }
1476 }
1477
1478 static av_always_inline void
1479 rgb48ToUV_c_template(uint16_t *dstU, uint16_t *dstV,
1480                     const uint16_t *src1, const uint16_t *src2,
1481                     int width, enum PixelFormat origin)
1482 {
1483     int i;
1484     assert(src1==src2);
1485     for (i = 0; i < width; i++) {
1486         int r_b = input_pixel(&src1[i*3+0]);
1487         int   g = input_pixel(&src1[i*3+1]);
1488         int b_r = input_pixel(&src1[i*3+2]);
1489
1490         dstU[i] = (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1491         dstV[i] = (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1492     }
1493 }
1494
1495 static av_always_inline void
1496 rgb48ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
1497                           const uint16_t *src1, const uint16_t *src2,
1498                           int width, enum PixelFormat origin)
1499 {
1500     int i;
1501     assert(src1==src2);
1502     for (i = 0; i < width; i++) {
1503         int r_b = (input_pixel(&src1[6 * i + 0]) + input_pixel(&src1[6 * i + 3]) + 1) >> 1;
1504         int   g = (input_pixel(&src1[6 * i + 1]) + input_pixel(&src1[6 * i + 4]) + 1) >> 1;
1505         int b_r = (input_pixel(&src1[6 * i + 2]) + input_pixel(&src1[6 * i + 5]) + 1) >> 1;
1506
1507         dstU[i]= (RU*r + GU*g + BU*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1508         dstV[i]= (RV*r + GV*g + BV*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
1509     }
1510 }
1511
/* the single-letter selectors and the 16-bit reader are private to the
 * rgb48To*_c_template() functions above */
#undef r
#undef b
#undef input_pixel
1515
/*
 * Define the three input converters for one 48-bit packed format:
 * <pattern>48<BE_LE>ToY_c(), <pattern>48<BE_LE>ToUV_c() and
 * <pattern>48<BE_LE>ToUV_half_c(). Each one casts its byte pointers to
 * uint16_t pointers and forwards to the matching rgb48To*_c_template()
 * with `origin` as the input pixel format. The trailing uint32_t pointer
 * argument is not used.
 */
#define rgb48funcs(pattern, BE_LE, origin) \
static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src, \
                                    int width, uint32_t *unused) \
{ \
    const uint16_t *src = (const uint16_t *) _src; \
    uint16_t *dst = (uint16_t *) _dst; \
    rgb48ToY_c_template(dst, src, width, origin); \
} \
 \
static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
                                    const uint8_t *_src1, const uint8_t *_src2, \
                                    int width, uint32_t *unused) \
{ \
    const uint16_t *src1 = (const uint16_t *) _src1, \
                   *src2 = (const uint16_t *) _src2; \
    uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
    rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin); \
} \
 \
static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, \
                                    const uint8_t *_src1, const uint8_t *_src2, \
                                    int width, uint32_t *unused) \
{ \
    const uint16_t *src1 = (const uint16_t *) _src1, \
                   *src2 = (const uint16_t *) _src2; \
    uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
    rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin); \
}
1544
1545 rgb48funcs(rgb, LE, PIX_FMT_RGB48LE);
1546 rgb48funcs(rgb, BE, PIX_FMT_RGB48BE);
1547 rgb48funcs(bgr, LE, PIX_FMT_BGR48LE);
1548 rgb48funcs(bgr, BE, PIX_FMT_BGR48BE);
1549
/*
 * Fetch packed pixel number i for the rgb16_32 templates below. Expands
 * against variables named `src` and `origin` in the enclosing scope.
 * For the four 32-bit RGBA/BGRA/ARGB/ABGR formats the pixel is read with
 * AV_RN32A from byte offset i*4; otherwise a 16-bit value is read from byte
 * offset i*2 with AV_RB16 or AV_RL16, chosen by isBE(origin).
 * NOTE(review): AV_RN32A is presumably a native-endian, aligned 32-bit
 * read - confirm against libavutil/intreadwrite.h.
 */
#define input_pixel(i) ((origin == PIX_FMT_RGBA || origin == PIX_FMT_BGRA || \
                         origin == PIX_FMT_ARGB || origin == PIX_FMT_ABGR) ? AV_RN32A(&src[(i)*4]) : \
                        (isBE(origin) ? AV_RB16(&src[(i)*2]) : AV_RL16(&src[(i)*2])))
1553
1554 static av_always_inline void
1555 rgb16_32ToY_c_template(uint8_t *dst, const uint8_t *src,
1556                        int width, enum PixelFormat origin,
1557                        int shr,   int shg,   int shb, int shp,
1558                        int maskr, int maskg, int maskb,
1559                        int rsh,   int gsh,   int bsh, int S)
1560 {
1561     const int ry = RY << rsh, gy = GY << gsh, by = BY << bsh,
1562               rnd = 33 << (S - 1);
1563     int i;
1564
1565     for (i = 0; i < width; i++) {
1566         int px = input_pixel(i) >> shp;
1567         int b = (px & maskb) >> shb;
1568         int g = (px & maskg) >> shg;
1569         int r = (px & maskr) >> shr;
1570
1571         dst[i] = (ry * r + gy * g + by * b + rnd) >> S;
1572     }
1573 }
1574
1575 static av_always_inline void
1576 rgb16_32ToUV_c_template(uint8_t *dstU, uint8_t *dstV,
1577                         const uint8_t *src, int width,
1578                         enum PixelFormat origin,
1579                         int shr,   int shg,   int shb, int shp,
1580                         int maskr, int maskg, int maskb,
1581                         int rsh,   int gsh,   int bsh, int S)
1582 {
1583     const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
1584               rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
1585               rnd = 257 << (S - 1);
1586     int i;
1587
1588     for (i = 0; i < width; i++) {
1589         int px = input_pixel(i) >> shp;
1590         int b = (px & maskb) >> shb;
1591         int g = (px & maskg) >> shg;
1592         int r = (px & maskr) >> shr;
1593
1594         dstU[i] = (ru * r + gu * g + bu * b + rnd) >> S;
1595         dstV[i] = (rv * r + gv * g + bv * b + rnd) >> S;
1596     }
1597 }
1598
/**
 * Horizontally subsampling chroma conversion for packed 16/32-bit RGB:
 * each output U/V pair is derived from the sum of two adjacent pixels.
 *
 * The two pixels are added while still packed. The green field is summed
 * separately (everything outside the red and blue masks), so that the
 * red/blue sum can be recovered by subtraction. The coefficients are the
 * same as in the non-subsampling variant; the doubled component sums are
 * compensated by shifting right by S + 1 instead of S.
 *
 * @param dstU   receives `width` U bytes
 * @param dstV   receives `width` V bytes
 * @param src    packed source line holding 2 * width pixels
 * @param width  number of output chroma samples
 * @param origin source pixel format (used by input_pixel() and to pick the
 *               green extraction path)
 */
static av_always_inline void
rgb16_32ToUV_half_c_template(uint8_t *dstU, uint8_t *dstV,
                             const uint8_t *src, int width,
                             enum PixelFormat origin,
                             int shr,   int shg,   int shb, int shp,
                             int maskr, int maskg, int maskb,
                             int rsh,   int gsh,   int bsh, int S)
{
    /* maskgx selects every bit that belongs neither to red nor to blue */
    const int ru = RU << rsh, gu = GU << gsh, bu = BU << bsh,
              rv = RV << rsh, gv = GV << gsh, bv = BV << bsh,
              rnd = 257 << S, maskgx = ~(maskr | maskb);
    int i;

    /* widen every mask by one bit: the sum of two fields needs one more bit */
    maskr |= maskr << 1; maskb |= maskb << 1; maskg |= maskg << 1;
    for (i = 0; i < width; i++) {
        int px0 = input_pixel(2 * i + 0) >> shp;
        int px1 = input_pixel(2 * i + 1) >> shp;
        /* g: summed non-red/non-blue bits; rb: what remains, i.e. the
         * summed red and blue fields */
        int b, r, g = (px0 & maskgx) + (px1 & maskgx);
        int rb = px0 + px1 - g;

        b = (rb & maskb) >> shb;
        /* for the 565 formats, or when shp is non-zero, the green sum is
         * only shifted; otherwise it is also masked with the widened maskg */
        if (shp || origin == PIX_FMT_BGR565LE || origin == PIX_FMT_BGR565BE ||
            origin == PIX_FMT_RGB565LE || origin == PIX_FMT_RGB565BE) {
            g >>= shg;
        } else {
            g = (g  & maskg) >> shg;
        }
        r = (rb & maskr) >> shr;

        dstU[i] = (ru * r + gu * g + bu * b + rnd) >> (S + 1);
        dstV[i] = (rv * r + gv * g + bv * b + rnd) >> (S + 1);
    }
}
1632
1633 #undef input_pixel
1634
/*
 * rgb16_32_wrapper() defines <name>ToY_c, <name>ToUV_c and
 * <name>ToUV_half_c, each forwarding to the matching rgb16_32To*_c_template()
 * with the per-format constants baked in:
 *   fmt                 pixel format passed to the template as `origin`
 *   shr, shg, shb       right shift applied to each masked component
 *   shp                 right shift applied to the whole pixel before masking
 *   maskr, maskg, maskb bit masks selecting each component
 *   rsh, gsh, bsh       left shift applied to the conversion coefficients
 *   S                   final right shift of the weighted sum
 * The second source pointer and the trailing uint32_t* are not used.
 */
#define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr, \
                         maskg, maskb, rsh, gsh, bsh, S) \
static void name ## ToY_c(uint8_t *dst, const uint8_t *src, \
                          int width, uint32_t *unused) \
{ \
    rgb16_32ToY_c_template(dst, src, width, fmt, shr, shg, shb, shp, \
                           maskr, maskg, maskb, rsh, gsh, bsh, S); \
} \
 \
static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
                           const uint8_t *src, const uint8_t *dummy, \
                           int width, uint32_t *unused) \
{ \
    rgb16_32ToUV_c_template(dstU, dstV, src, width, fmt, shr, shg, shb, shp, \
                            maskr, maskg, maskb, rsh, gsh, bsh, S); \
} \
 \
static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
                                const uint8_t *src, const uint8_t *dummy, \
                                int width, uint32_t *unused) \
{ \
    rgb16_32ToUV_half_c_template(dstU, dstV, src, width, fmt, shr, shg, shb, shp, \
                                 maskr, maskg, maskb, rsh, gsh, bsh, S); \
}

/* Instantiations: 32-bit formats first, then the 16/15-bit LE and BE ones. */
rgb16_32_wrapper(PIX_FMT_BGR32,    bgr32,  16, 0,  0, 0, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8);
rgb16_32_wrapper(PIX_FMT_BGR32_1,  bgr321, 16, 0,  0, 8, 0xFF0000, 0xFF00,   0x00FF,  8, 0,  8, RGB2YUV_SHIFT+8);
rgb16_32_wrapper(PIX_FMT_RGB32,    rgb32,   0, 0, 16, 0,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8);
rgb16_32_wrapper(PIX_FMT_RGB32_1,  rgb321,  0, 0, 16, 8,   0x00FF, 0xFF00, 0xFF0000,  8, 0,  8, RGB2YUV_SHIFT+8);
rgb16_32_wrapper(PIX_FMT_BGR565LE, bgr16le, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8);
rgb16_32_wrapper(PIX_FMT_BGR555LE, bgr15le, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7);
rgb16_32_wrapper(PIX_FMT_RGB565LE, rgb16le, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8);
rgb16_32_wrapper(PIX_FMT_RGB555LE, rgb15le, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7);
rgb16_32_wrapper(PIX_FMT_BGR565BE, bgr16be, 0, 0,  0, 0,   0x001F, 0x07E0,   0xF800, 11, 5,  0, RGB2YUV_SHIFT+8);
rgb16_32_wrapper(PIX_FMT_BGR555BE, bgr15be, 0, 0,  0, 0,   0x001F, 0x03E0,   0x7C00, 10, 5,  0, RGB2YUV_SHIFT+7);
rgb16_32_wrapper(PIX_FMT_RGB565BE, rgb16be, 0, 0,  0, 0,   0xF800, 0x07E0,   0x001F,  0, 5, 11, RGB2YUV_SHIFT+8);
rgb16_32_wrapper(PIX_FMT_RGB555BE, rgb15be, 0, 0,  0, 0,   0x7C00, 0x03E0,   0x001F,  0, 5, 10, RGB2YUV_SHIFT+7);
1672
/**
 * Extract the alpha plane from a packed 4-byte-per-pixel line whose alpha
 * byte comes first: dst[x] receives byte 0 of pixel x.
 *
 * @param dst    receives `width` alpha bytes
 * @param src    packed source line, 4 bytes per pixel
 * @param width  number of pixels
 * @param unused ignored
 */
static void abgrToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    int left;

    for (left = width; left > 0; left--) {
        *dst++ = *src;
        src   += 4;
    }
}
1680
/**
 * Extract the alpha plane from a packed 4-byte-per-pixel line whose alpha
 * byte comes last: dst[x] receives byte 3 of pixel x.
 *
 * @param dst    receives `width` alpha bytes
 * @param src    packed source line, 4 bytes per pixel
 * @param width  number of pixels
 * @param unused ignored
 */
static void rgbaToA_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        dst[x] = src[4 * x + 3];
        x++;
    }
}
1688
/**
 * Look up luma for a line of palette indices: dst[x] is the lowest byte of
 * the palette entry selected by src[x].
 *
 * @param dst   receives `width` luma bytes
 * @param src   palette indices, one byte per pixel
 * @param width number of pixels
 * @param pal   palette table indexed by the source byte
 */
static void palToY_c(uint8_t *dst, const uint8_t *src, int width, uint32_t *pal)
{
    int x = 0;

    while (x < width) {
        const int index = src[x];

        dst[x] = pal[index] & 0xFF;
        x++;
    }
}
1698
/**
 * Look up chroma for a line of palette indices: U is taken from bits 8-15
 * and V from bits 16-23 of the palette entry selected by src1[x].
 *
 * @param dstU  receives `width` U bytes
 * @param dstV  receives `width` V bytes
 * @param src1  palette indices; asserted to be the same pointer as src2
 * @param src2  only used by the assert
 * @param width number of pixels
 * @param pal   palette table indexed by the source byte
 */
static void palToUV_c(uint8_t *dstU, uint8_t *dstV,
                      const uint8_t *src1, const uint8_t *src2,
                      int width, uint32_t *pal)
{
    int x = 0;

    assert(src1 == src2);
    while (x < width) {
        const int entry = pal[src1[x]];

        dstU[x] = entry >> 8;
        dstV[x] = entry >> 16;
        x++;
    }
}
1712
/**
 * Expand a 1 bit-per-pixel line (most significant bit first) to 8-bit luma
 * where a cleared bit becomes 255 and a set bit becomes 0.
 *
 * Exactly `width` output bytes are written. The previous implementation
 * only processed whole source bytes (width / 8 groups), so the trailing
 * width % 8 pixels of a line were left unconverted.
 *
 * @param dst    receives `width` luma bytes
 * @param src    packed source bits, (width + 7) / 8 bytes are read
 * @param width  number of pixels
 * @param unused ignored
 */
static void monowhite2Y_c(uint8_t *dst, const uint8_t *src,
                          int width, uint32_t *unused)
{
    int i;
    for (i = 0; i < width; i++) {
        /* bit 7 of each byte is the leftmost pixel of its group of eight */
        const int bit = (src[i >> 3] >> (7 - (i & 7))) & 1;
        dst[i] = bit ? 0 : 255;
    }
}
1723
/**
 * Expand a 1 bit-per-pixel line (most significant bit first) to 8-bit luma
 * where a set bit becomes 255 and a cleared bit becomes 0.
 *
 * Exactly `width` output bytes are written. The previous implementation
 * only processed whole source bytes (width / 8 groups), so the trailing
 * width % 8 pixels of a line were left unconverted.
 *
 * @param dst    receives `width` luma bytes
 * @param src    packed source bits, (width + 7) / 8 bytes are read
 * @param width  number of pixels
 * @param unused ignored
 */
static void monoblack2Y_c(uint8_t *dst, const uint8_t *src,
                          int width, uint32_t *unused)
{
    int i;
    for (i = 0; i < width; i++) {
        /* bit 7 of each byte is the leftmost pixel of its group of eight */
        const int bit = (src[i >> 3] >> (7 - (i & 7))) & 1;
        dst[i] = bit ? 255 : 0;
    }
}
1734
1735 //FIXME yuy2* can read up to 7 samples too much
1736
/**
 * Pull the luma bytes out of a packed line where luma sits at every even
 * byte offset: dst[x] = src[2 * x].
 *
 * @param dst    receives `width` luma bytes
 * @param src    packed source line
 * @param width  number of luma samples
 * @param unused ignored
 */
static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        dst[x] = src[2 * x];
        x++;
    }
}
1744
/**
 * Pull the chroma bytes out of a packed 4-byte group layout: U is byte 1
 * and V is byte 3 of every group.
 *
 * @param dstU   receives `width` U bytes
 * @param dstV   receives `width` V bytes
 * @param src1   packed source line; asserted (after the copy) to equal src2
 * @param src2   only used by the assert
 * @param width  number of chroma pairs
 * @param unused ignored
 */
static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        const int base = 4 * x;

        dstU[x] = src1[base + 1];
        dstV[x] = src1[base + 3];
        x++;
    }
    assert(src1 == src2);
}
1755
/**
 * Copy a line of 16-bit samples, passing each one through av_bswap16().
 *
 * @param _dst   destination, treated as `width` uint16_t samples
 * @param _src   source, treated as `width` uint16_t samples
 * @param width  number of 16-bit samples
 * @param unused ignored
 */
static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, int width, uint32_t *unused)
{
    const uint16_t *in  = (const uint16_t *) _src;
    uint16_t       *out = (uint16_t *) _dst;
    int x = 0;

    while (x < width) {
        out[x] = av_bswap16(in[x]);
        x++;
    }
}
1765
/**
 * Copy two lines of 16-bit chroma samples, passing each sample through
 * av_bswap16(): U is taken from _src1, V from _src2.
 *
 * @param _dstU  U destination, treated as `width` uint16_t samples
 * @param _dstV  V destination, treated as `width` uint16_t samples
 * @param _src1  U source, treated as `width` uint16_t samples
 * @param _src2  V source, treated as `width` uint16_t samples
 * @param width  number of 16-bit samples per plane
 * @param unused ignored
 */
static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *_src1,
                        const uint8_t *_src2, int width, uint32_t *unused)
{
    const uint16_t *in_u  = (const uint16_t *) _src1;
    const uint16_t *in_v  = (const uint16_t *) _src2;
    uint16_t       *out_u = (uint16_t *) _dstU;
    uint16_t       *out_v = (uint16_t *) _dstV;
    int x = 0;

    while (x < width) {
        out_u[x] = av_bswap16(in_u[x]);
        out_v[x] = av_bswap16(in_v[x]);
        x++;
    }
}
1778
/* This is almost identical to the previous, and exists only because
 * yuy2To(Y/UV)(dst, src + 1, ...) would have 100% unaligned accesses. */
/**
 * Pull the luma bytes out of a packed line where luma sits at every odd
 * byte offset: dst[x] = src[2 * x + 1].
 *
 * @param dst    receives `width` luma bytes
 * @param src    packed source line
 * @param width  number of luma samples
 * @param unused ignored
 */
static void uyvyToY_c(uint8_t *dst, const uint8_t *src, int width,
                      uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        dst[x] = src[2 * x + 1];
        x++;
    }
}
1788
/**
 * Pull the chroma bytes out of a packed 4-byte group layout: U is byte 0
 * and V is byte 2 of every group.
 *
 * @param dstU   receives `width` U bytes
 * @param dstV   receives `width` V bytes
 * @param src1   packed source line; asserted (after the copy) to equal src2
 * @param src2   only used by the assert
 * @param width  number of chroma pairs
 * @param unused ignored
 */
static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
                       const uint8_t *src2, int width, uint32_t *unused)
{
    int x = 0;

    while (x < width) {
        const int base = 4 * x;

        dstU[x] = src1[base + 0];
        dstV[x] = src1[base + 2];
        x++;
    }
    assert(src1 == src2);
}
1799
1800 static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
1801                                         const uint8_t *src, int width)
1802 {
1803     int i;
1804     for (i = 0; i < width; i++) {
1805         dst1[i] = src[2*i+0];
1806         dst2[i] = src[2*i+1];
1807     }
1808 }
1809
/**
 * Split an interleaved chroma line via nvXXtoUV_c(): even source bytes are
 * written to dstU and odd source bytes to dstV.
 * src2 and the trailing uint32_t* are ignored.
 */
static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
                       const uint8_t *src1, const uint8_t *src2,
                       int width, uint32_t *unused)
{
    nvXXtoUV_c(dstU, dstV, src1, width);
}
1816
/**
 * Split an interleaved chroma line via nvXXtoUV_c() with the outputs
 * swapped: even source bytes are written to dstV and odd source bytes to
 * dstU. src2 and the trailing uint32_t* are ignored.
 */
static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
                       const uint8_t *src1, const uint8_t *src2,
                       int width, uint32_t *unused)
{
    nvXXtoUV_c(dstV, dstU, src1, width);
}
1823
/* Read one 16-bit value at `pos` with AV_RB16 or AV_RL16, chosen by
 * isBE(origin); expands against a variable named `origin` in the
 * enclosing scope.
 * NOTE(review): none of the 24-bit converters that follow use this macro -
 * confirm where it is consumed and #undef'd. */
#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
1825
1826 static void bgr24ToY_c(uint8_t *dst, const uint8_t *src,
1827                        int width, uint32_t *unused)
1828 {
1829     int i;
1830     for (i=0; i<width; i++) {
1831         int b= src[i*3+0];
1832         int g= src[i*3+1];
1833         int r= src[i*3+2];
1834
1835         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1836     }
1837 }
1838
1839 static void bgr24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1840                         const uint8_t *src2, int width, uint32_t *unused)
1841 {
1842     int i;
1843     for (i=0; i<width; i++) {
1844         int b= src1[3*i + 0];
1845         int g= src1[3*i + 1];
1846         int r= src1[3*i + 2];
1847
1848         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1849         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1850     }
1851     assert(src1 == src2);
1852 }
1853
1854 static void bgr24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1855                              const uint8_t *src2, int width, uint32_t *unused)
1856 {
1857     int i;
1858     for (i=0; i<width; i++) {
1859         int b= src1[6*i + 0] + src1[6*i + 3];
1860         int g= src1[6*i + 1] + src1[6*i + 4];
1861         int r= src1[6*i + 2] + src1[6*i + 5];
1862
1863         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1864         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1865     }
1866     assert(src1 == src2);
1867 }
1868
1869 static void rgb24ToY_c(uint8_t *dst, const uint8_t *src, int width,
1870                        uint32_t *unused)
1871 {
1872     int i;
1873     for (i=0; i<width; i++) {
1874         int r= src[i*3+0];
1875         int g= src[i*3+1];
1876         int b= src[i*3+2];
1877
1878         dst[i]= ((RY*r + GY*g + BY*b + (33<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT);
1879     }
1880 }
1881
1882 static void rgb24ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1883                         const uint8_t *src2, int width, uint32_t *unused)
1884 {
1885     int i;
1886     assert(src1==src2);
1887     for (i=0; i<width; i++) {
1888         int r= src1[3*i + 0];
1889         int g= src1[3*i + 1];
1890         int b= src1[3*i + 2];
1891
1892         dstU[i]= (RU*r + GU*g + BU*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1893         dstV[i]= (RV*r + GV*g + BV*b + (257<<(RGB2YUV_SHIFT-1)))>>RGB2YUV_SHIFT;
1894     }
1895 }
1896
1897 static void rgb24ToUV_half_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *src1,
1898                              const uint8_t *src2, int width, uint32_t *unused)
1899 {
1900     int i;
1901     assert(src1==src2);
1902     for (i=0; i<width; i++) {
1903         int r= src1[6*i + 0] + src1[6*i + 3];
1904         int g= src1[6*i + 1] + src1[6*i + 4];
1905         int b= src1[6*i + 2] + src1[6*i + 5];
1906
1907         dstU[i]= (RU*r + GU*g + BU*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1908         dstV[i]= (RV*r + GV*g + BV*b + (257<<RGB2YUV_SHIFT))>>(RGB2YUV_SHIFT+1);
1909     }
1910 }
1911
1912 static void hScale16To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *_src,
1913                            const int16_t *filter,
1914                            const int16_t *filterPos, int filterSize)
1915 {
1916     int i;
1917     int32_t *dst = (int32_t *) _dst;
1918     const uint16_t *src = (const uint16_t *) _src;
1919     int bits = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
1920     int sh = bits - 4;
1921
1922     for (i = 0; i < dstW; i++) {
1923         int j;
1924         int srcPos = filterPos[i];
1925         int val = 0;
1926
1927         for (j = 0; j < filterSize; j++) {
1928             val += src[srcPos + j] * filter[filterSize * i + j];
1929         }
1930         // filter=14 bit, input=16 bit, output=30 bit, >> 11 makes 19 bit
1931         dst[i] = FFMIN(val >> sh, (1 << 19) - 1);
1932     }
1933 }
1934
1935 static void hScale16To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *_src,
1936                            const int16_t *filter,
1937                            const int16_t *filterPos, int filterSize)
1938 {
1939     int i;
1940     const uint16_t *src = (const uint16_t *) _src;
1941     int sh = av_pix_fmt_descriptors[c->srcFormat].comp[0].depth_minus1;
1942
1943     for (i = 0; i < dstW; i++) {
1944         int j;
1945         int srcPos = filterPos[i];
1946         int val = 0;
1947
1948         for (j = 0; j < filterSize; j++) {
1949             val += src[srcPos + j] * filter[filterSize * i + j];
1950         }
1951         // filter=14 bit, input=16 bit, output=30 bit, >> 15 makes 15 bit
1952         dst[i] = FFMIN(val >> sh, (1 << 15) - 1);
1953     }
1954 }
1955
1956 // bilinear / bicubic scaling
1957 static void hScale8To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *src,
1958                           const int16_t *filter, const int16_t *filterPos,
1959                           int filterSize)
1960 {
1961     int i;
1962     for (i=0; i<dstW; i++) {
1963         int j;
1964         int srcPos= filterPos[i];
1965         int val=0;
1966         for (j=0; j<filterSize; j++) {
1967             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
1968         }
1969         //filter += hFilterSize;
1970         dst[i] = FFMIN(val>>7, (1<<15)-1); // the cubic equation does overflow ...
1971         //dst[i] = val>>7;
1972     }
1973 }
1974
1975 static void hScale8To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *src,
1976                           const int16_t *filter, const int16_t *filterPos,
1977                           int filterSize)
1978 {
1979     int i;
1980     int32_t *dst = (int32_t *) _dst;
1981     for (i=0; i<dstW; i++) {
1982         int j;
1983         int srcPos= filterPos[i];
1984         int val=0;
1985         for (j=0; j<filterSize; j++) {
1986             val += ((int)src[srcPos + j])*filter[filterSize*i + j];
1987         }
1988         //filter += hFilterSize;
1989         dst[i] = FFMIN(val>>3, (1<<19)-1); // the cubic equation does overflow ...
1990         //dst[i] = val>>7;
1991     }
1992 }
1993
//FIXME all pal and rgb srcFormats could do this conversion as well
//FIXME all scalers more complex than bilinear could do half of this transform
/**
 * In-place chroma range expansion on 15-bit intermediates: each sample is
 * clipped from above to 30775, then mapped as (x * 4663 - 9289992) >> 12.
 *
 * @param dstU  U samples, modified in place
 * @param dstV  V samples, modified in place
 * @param width number of samples per plane
 */
static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int x = 0;

    while (x < width) {
        dstU[x] = (FFMIN(dstU[x], 30775) * 4663 - 9289992) >> 12; //-264
        dstV[x] = (FFMIN(dstV[x], 30775) * 4663 - 9289992) >> 12; //-264
        x++;
    }
}
/**
 * In-place chroma range compression on 15-bit intermediates: each sample
 * is mapped as (x * 1799 + 4081085) >> 11.
 *
 * @param dstU  U samples, modified in place
 * @param dstV  V samples, modified in place
 * @param width number of samples per plane
 */
static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
    int16_t *u = dstU;
    int16_t *v = dstV;
    int left;

    for (left = width; left > 0; left--, u++, v++) {
        *u = (*u * 1799 + 4081085) >> 11; //1469
        *v = (*v * 1799 + 4081085) >> 11; //1469
    }
}
/**
 * In-place luma range expansion on 15-bit intermediates: each sample is
 * clipped from above to 30189, then mapped as
 * (x * 19077 - 39057361) >> 14.
 *
 * @param dst   luma samples, modified in place
 * @param width number of samples
 */
static void lumRangeToJpeg_c(int16_t *dst, int width)
{
    int x = 0;

    while (x < width) {
        dst[x] = (FFMIN(dst[x], 30189) * 19077 - 39057361) >> 14;
        x++;
    }
}
/**
 * In-place luma range compression on 15-bit intermediates: each sample is
 * mapped as (x * 14071 + 33561947) >> 14.
 *
 * @param dst   luma samples, modified in place
 * @param width number of samples
 */
static void lumRangeFromJpeg_c(int16_t *dst, int width)
{
    int16_t *p = dst;
    int left;

    for (left = width; left > 0; left--, p++)
        *p = (*p * 14071 + 33561947) >> 14;
}
2024
/**
 * In-place chroma range expansion on 19-bit intermediates stored as
 * int32_t: each sample is clipped from above to 30775 << 4, then mapped as
 * (x * 4663 - (9289992 << 4)) >> 12.
 *
 * The product is formed in 64 bits: at the clip limit it is
 * (30775 << 4) * 4663 = 2296061200, which does not fit a 32-bit signed int
 * (signed overflow is undefined behaviour). The final result always fits
 * in int32_t again.
 *
 * @param _dstU U samples, reinterpreted as int32_t, modified in place
 * @param _dstV V samples, reinterpreted as int32_t, modified in place
 * @param width number of samples per plane
 */
static void chrRangeToJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
    int i;
    int32_t *dstU = (int32_t *) _dstU;
    int32_t *dstV = (int32_t *) _dstV;
    const int32_t limit = 30775 << 4;

    for (i = 0; i < width; i++) {
        const int64_t u = dstU[i] < limit ? dstU[i] : limit;
        const int64_t v = dstV[i] < limit ? dstV[i] : limit;

        dstU[i] = (u * 4663 - (9289992 << 4)) >> 12; //-264
        dstV[i] = (v * 4663 - (9289992 << 4)) >> 12; //-264
    }
}
/**
 * In-place chroma range compression on intermediates stored as int32_t:
 * each sample is mapped as (x * 1799 + (4081085 << 4)) >> 11.
 *
 * @param _dstU U samples, reinterpreted as int32_t, modified in place
 * @param _dstV V samples, reinterpreted as int32_t, modified in place
 * @param width number of samples per plane
 */
static void chrRangeFromJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
    int32_t *u = (int32_t *) _dstU;
    int32_t *v = (int32_t *) _dstV;
    int left;

    for (left = width; left > 0; left--, u++, v++) {
        *u = (*u * 1799 + (4081085 << 4)) >> 11; //1469
        *v = (*v * 1799 + (4081085 << 4)) >> 11; //1469
    }
}
/**
 * In-place luma range expansion on 19-bit intermediates stored as int32_t:
 * each sample is clipped from above to 30189 << 4, then mapped as
 * (x * 4769 - (39057361 << 2)) >> 12.
 *
 * The product is formed in 64 bits: at the clip limit it is
 * (30189 << 4) * 4769 = 2303541456, which does not fit a 32-bit signed int
 * (signed overflow is undefined behaviour). The final result always fits
 * in int32_t again.
 *
 * @param _dst  luma samples, reinterpreted as int32_t, modified in place
 * @param width number of samples
 */
static void lumRangeToJpeg16_c(int16_t *_dst, int width)
{
    int i;
    int32_t *dst = (int32_t *) _dst;
    const int32_t limit = 30189 << 4;

    for (i = 0; i < width; i++) {
        const int64_t y = dst[i] < limit ? dst[i] : limit;

        dst[i] = (y * 4769 - (39057361 << 2)) >> 12;
    }
}
/**
 * In-place luma range compression on 19-bit intermediates stored as
 * int32_t: each sample is mapped as
 * (x * 14071 + (33561947 << 4)) >> 14.
 *
 * The expression is evaluated in 64 bits. In 32-bit arithmetic it
 * overflows for inputs above roughly 114000 (the full 19-bit maximum gives
 * about 7.9e9 before the shift), which produced wrong results for most of
 * the input range. The shifted result always fits in int32_t.
 *
 * @param _dst  luma samples, reinterpreted as int32_t, modified in place
 * @param width number of samples
 */
static void lumRangeFromJpeg16_c(int16_t *_dst, int width)
{
    int i;
    int32_t *dst = (int32_t *) _dst;

    for (i = 0; i < width; i++)
        dst[i] = ((int64_t) dst[i] * 14071 + (33561947 << 4)) >> 14;
}
2059
2060 static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
2061                            const uint8_t *src, int srcW, int xInc)
2062 {
2063     int i;
2064     unsigned int xpos=0;
2065     for (i=0;i<dstWidth;i++) {
2066         register unsigned int xx=xpos>>16;
2067         register unsigned int xalpha=(xpos&0xFFFF)>>9;
2068         dst[i]= (src[xx]<<7) + (src[xx+1] - src[xx])*xalpha;
2069         xpos+=xInc;
2070     }
2071 }
2072
2073 // *** horizontal scale Y line to temp buffer
2074 static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth,
2075                                      const uint8_t *src, int srcW, int xInc,
2076                                      const int16_t *hLumFilter,
2077                                      const int16_t *hLumFilterPos, int hLumFilterSize,
2078                                      uint8_t *formatConvBuffer,
2079                                      uint32_t *pal, int isAlpha)
2080 {
2081     void (*toYV12)(uint8_t *, const uint8_t *, int, uint32_t *) = isAlpha ? c->alpToYV12 : c->lumToYV12;
2082     void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
2083
2084     if (toYV12) {
2085         toYV12(formatConvBuffer, src, srcW, pal);
2086         src= formatConvBuffer;
2087     }
2088
2089     if (!c->hyscale_fast) {
2090         c->hyScale(c, dst, dstWidth, src, hLumFilter, hLumFilterPos, hLumFilterSize);
2091     } else { // fast bilinear upscale / crap downscale
2092         c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
2093     }
2094
2095     if (convertRange)
2096         convertRange(dst, dstWidth);
2097 }
2098
2099 static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
2100                            int dstWidth, const uint8_t *src1,
2101                            const uint8_t *src2, int srcW, int xInc)
2102 {
2103     int i;
2104     unsigned int xpos=0;
2105     for (i=0;i<dstWidth;i++) {
2106         register unsigned int xx=xpos>>16;
2107         register unsigned int xalpha=(xpos&0xFFFF)>>9;
2108         dst1[i]=(src1[xx]*(xalpha^127)+src1[xx+1]*xalpha);
2109         dst2[i]=(src2[xx]*(xalpha^127)+src2[xx+1]*xalpha);
2110         xpos+=xInc;
2111     }
2112 }
2113
2114 static av_always_inline void hcscale(SwsContext *c, int16_t *dst1, int16_t *dst2, int dstWidth,
2115                                      const uint8_t *src1, const uint8_t *src2,
2116                                      int srcW, int xInc, const int16_t *hChrFilter,
2117                                      const int16_t *hChrFilterPos, int hChrFilterSize,
2118                                      uint8_t *formatConvBuffer, uint32_t *pal)
2119 {
2120     if (c->chrToYV12) {
2121         uint8_t *buf2 = formatConvBuffer + FFALIGN(srcW * FFALIGN(c->srcBpc, 8) >> 3, 16);
2122         c->chrToYV12(formatConvBuffer, buf2, src1, src2, srcW, pal);
2123         src1= formatConvBuffer;
2124         src2= buf2;
2125     }
2126
2127     if (!c->hcscale_fast) {
2128         c->hcScale(c, dst1, dstWidth, src1, hChrFilter, hChrFilterPos, hChrFilterSize);
2129         c->hcScale(c, dst2, dstWidth, src2, hChrFilter, hChrFilterPos, hChrFilterSize);
2130     } else { // fast bilinear upscale / crap downscale
2131         c->hcscale_fast(c, dst1, dst2, dstWidth, src1, src2, srcW, xInc);
2132     }
2133
2134     if (c->chrConvertRange)
2135         c->chrConvertRange(dst1, dst2, dstWidth);
2136 }
2137
/**
 * Select the C output functions matching c->dstFormat and store them
 * through the given pointers.
 *
 * Planar writers: NV12/NV21, 16-bit and 9/10-bit destinations only set
 * *yuv2yuvX; every other destination sets both *yuv2yuv1 and *yuv2yuvX.
 *
 * Packed writers: when SWS_FULL_CHR_H_INT is set in c->flags, only
 * *yuv2packedX is assigned, and only for RGBA, ARGB, BGRA, ABGR, RGB24 and
 * BGR24. Otherwise *yuv2packed1, *yuv2packed2 and *yuv2packedX are
 * assigned per format. Neither switch has a default case, so pointers for
 * formats that are not listed are left untouched.
 *
 * For the 32-bit RGB formats the variant depends on the build
 * configuration: with CONFIG_SMALL a single generic function is used;
 * otherwise the alpha-writing variant is picked when CONFIG_SWSCALE_ALPHA
 * is enabled and c->alpPixBuf is set, and the "x" variant in all other
 * cases.
 *
 * @param c           scaler context; dstFormat, flags and alpPixBuf are read
 * @param yuv2yuv1    out: unscaled planar writer
 * @param yuv2yuvX    out: filtered planar writer
 * @param yuv2packed1 out: packed writer variant "1"
 * @param yuv2packed2 out: packed writer variant "2"
 * @param yuv2packedX out: packed writer variant "X"
 */
static av_always_inline void
find_c_packed_planar_out_funcs(SwsContext *c,
                               yuv2planar1_fn *yuv2yuv1,    yuv2planarX_fn *yuv2yuvX,
                               yuv2packed1_fn *yuv2packed1, yuv2packed2_fn *yuv2packed2,
                               yuv2packedX_fn *yuv2packedX)
{
    enum PixelFormat dstFormat = c->dstFormat;

    /* planar output writers */
    if (dstFormat == PIX_FMT_NV12 || dstFormat == PIX_FMT_NV21) {
        *yuv2yuvX     = yuv2nv12X_c;
    } else if (is16BPS(dstFormat)) {
        *yuv2yuvX     = isBE(dstFormat) ? yuv2yuvX16BE_c  : yuv2yuvX16LE_c;
    } else if (is9_OR_10BPS(dstFormat)) {
        /* depth_minus1 == 8 means 9 bits per component */
        if (av_pix_fmt_descriptors[dstFormat].comp[0].depth_minus1 == 8) {
            *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX9BE_c :  yuv2yuvX9LE_c;
        } else {
            *yuv2yuvX = isBE(dstFormat) ? yuv2yuvX10BE_c : yuv2yuvX10LE_c;
        }
    } else {
        *yuv2yuv1     = yuv2yuv1_c;
        *yuv2yuvX     = yuv2yuvX_c;
    }
    /* packed output writers */
    if(c->flags & SWS_FULL_CHR_H_INT) {
        /* only the X variant is assigned in this branch */
        switch (dstFormat) {
            case PIX_FMT_RGBA:
#if CONFIG_SMALL
                *yuv2packedX = yuv2rgba32_full_X_c;
#else
#if CONFIG_SWSCALE_ALPHA
                if (c->alpPixBuf) {
                    *yuv2packedX = yuv2rgba32_full_X_c;
                } else
#endif /* CONFIG_SWSCALE_ALPHA */
                {
                    *yuv2packedX = yuv2rgbx32_full_X_c;
                }
#endif /* !CONFIG_SMALL */
                break;
            case PIX_FMT_ARGB:
#if CONFIG_SMALL
                *yuv2packedX = yuv2argb32_full_X_c;
#else
#if CONFIG_SWSCALE_ALPHA
                if (c->alpPixBuf) {
                    *yuv2packedX = yuv2argb32_full_X_c;
                } else
#endif /* CONFIG_SWSCALE_ALPHA */
                {
                    *yuv2packedX = yuv2xrgb32_full_X_c;
                }
#endif /* !CONFIG_SMALL */
                break;
            case PIX_FMT_BGRA:
#if CONFIG_SMALL
                *yuv2packedX = yuv2bgra32_full_X_c;
#else
#if CONFIG_SWSCALE_ALPHA
                if (c->alpPixBuf) {
                    *yuv2packedX = yuv2bgra32_full_X_c;
                } else
#endif /* CONFIG_SWSCALE_ALPHA */
                {
                    *yuv2packedX = yuv2bgrx32_full_X_c;
                }
#endif /* !CONFIG_SMALL */
                break;
            case PIX_FMT_ABGR:
#if CONFIG_SMALL
                *yuv2packedX = yuv2abgr32_full_X_c;
#else
#if CONFIG_SWSCALE_ALPHA
                if (c->alpPixBuf) {
                    *yuv2packedX = yuv2abgr32_full_X_c;
                } else
#endif /* CONFIG_SWSCALE_ALPHA */
                {
                    *yuv2packedX = yuv2xbgr32_full_X_c;
                }
#endif /* !CONFIG_SMALL */
                break;
            case PIX_FMT_RGB24:
            *yuv2packedX = yuv2rgb24_full_X_c;
            break;
        case PIX_FMT_BGR24:
            *yuv2packedX = yuv2bgr24_full_X_c;
            break;
        }
    } else {
        /* all three packed variants are assigned per destination format */
        switch (dstFormat) {
        case PIX_FMT_GRAY16BE:
            *yuv2packed1 = yuv2gray16BE_1_c;
            *yuv2packed2 = yuv2gray16BE_2_c;
            *yuv2packedX = yuv2gray16BE_X_c;
            break;
        case PIX_FMT_GRAY16LE:
            *yuv2packed1 = yuv2gray16LE_1_c;
            *yuv2packed2 = yuv2gray16LE_2_c;
            *yuv2packedX = yuv2gray16LE_X_c;
            break;
        case PIX_FMT_MONOWHITE:
            *yuv2packed1 = yuv2monowhite_1_c;
            *yuv2packed2 = yuv2monowhite_2_c;
            *yuv2packedX = yuv2monowhite_X_c;
            break;
        case PIX_FMT_MONOBLACK:
            *yuv2packed1 = yuv2monoblack_1_c;
            *yuv2packed2 = yuv2monoblack_2_c;
            *yuv2packedX = yuv2monoblack_X_c;
            break;
        case PIX_FMT_YUYV422:
            *yuv2packed1 = yuv2yuyv422_1_c;
            *yuv2packed2 = yuv2yuyv422_2_c;
            *yuv2packedX = yuv2yuyv422_X_c;
            break;
        case PIX_FMT_UYVY422:
            *yuv2packed1 = yuv2uyvy422_1_c;
            *yuv2packed2 = yuv2uyvy422_2_c;
            *yuv2packedX = yuv2uyvy422_X_c;
            break;
        case PIX_FMT_RGB48LE:
            *yuv2packed1 = yuv2rgb48le_1_c;
            *yuv2packed2 = yuv2rgb48le_2_c;
            *yuv2packedX = yuv2rgb48le_X_c;
            break;
        case PIX_FMT_RGB48BE:
            *yuv2packed1 = yuv2rgb48be_1_c;
            *yuv2packed2 = yuv2rgb48be_2_c;
            *yuv2packedX = yuv2rgb48be_X_c;
            break;
        case PIX_FMT_BGR48LE:
            *yuv2packed1 = yuv2bgr48le_1_c;
            *yuv2packed2 = yuv2bgr48le_2_c;
            *yuv2packedX = yuv2bgr48le_X_c;
            break;
        case PIX_FMT_BGR48BE:
            *yuv2packed1 = yuv2bgr48be_1_c;
            *yuv2packed2 = yuv2bgr48be_2_c;
            *yuv2packedX = yuv2bgr48be_X_c;
            break;
        case PIX_FMT_RGB32:
        case PIX_FMT_BGR32:
#if CONFIG_SMALL
            *yuv2packed1 = yuv2rgb32_1_c;
            *yuv2packed2 = yuv2rgb32_2_c;
            *yuv2packedX = yuv2rgb32_X_c;
#else
#if CONFIG_SWSCALE_ALPHA
                if (c->alpPixBuf) {
                    *yuv2packed1 = yuv2rgba32_1_c;
                    *yuv2packed2 = yuv2rgba32_2_c;
                    *yuv2packedX = yuv2rgba32_X_c;
                } else
#endif /* CONFIG_SWSCALE_ALPHA */
                {
                    *yuv2packed1 = yuv2rgbx32_1_c;
                    *yuv2packed2 = yuv2rgbx32_2_c;
                    *yuv2packedX = yuv2rgbx32_X_c;
                }
#endif /* !CONFIG_SMALL */
            break;
        case PIX_FMT_RGB32_1:
        case PIX_FMT_BGR32_1:
#if CONFIG_SMALL
                *yuv2packed1 = yuv2rgb32_1_1_c;
                *yuv2packed2 = yuv2rgb32_1_2_c;
                *yuv2packedX = yuv2rgb32_1_X_c;
#else
#if CONFIG_SWSCALE_ALPHA
                if (c->alpPixBuf) {
                    *yuv2packed1 = yuv2rgba32_1_1_c;
                    *yuv2packed2 = yuv2rgba32_1_2_c;
                    *yuv2packedX = yuv2rgba32_1_X_c;
                } else
#endif /* CONFIG_SWSCALE_ALPHA */
                {
                    *yuv2packed1 = yuv2rgbx32_1_1_c;
                    *yuv2packed2 = yuv2rgbx32_1_2_c;
                    *yuv2packedX = yuv2rgbx32_1_X_c;
                }
#endif /* !CONFIG_SMALL */
                break;
        case PIX_FMT_RGB24:
            *yuv2packed1 = yuv2rgb24_1_c;
            *yuv2packed2 = yuv2rgb24_2_c;
            *yuv2packedX = yuv2rgb24_X_c;
            break;
        case PIX_FMT_BGR24:
            *yuv2packed1 = yuv2bgr24_1_c;
            *yuv2packed2 = yuv2bgr24_2_c;
            *yuv2packedX = yuv2bgr24_X_c;
            break;
        /* RGB and BGR, LE and BE share one writer per bit depth below */
        case PIX_FMT_RGB565LE:
        case PIX_FMT_RGB565BE:
        case PIX_FMT_BGR565LE:
        case PIX_FMT_BGR565BE:
            *yuv2packed1 = yuv2rgb16_1_c;
            *yuv2packed2 = yuv2rgb16_2_c;
            *yuv2packedX = yuv2rgb16_X_c;
            break;
        case PIX_FMT_RGB555LE:
        case PIX_FMT_RGB555BE:
        case PIX_FMT_BGR555LE:
        case PIX_FMT_BGR555BE:
            *yuv2packed1 = yuv2rgb15_1_c;
            *yuv2packed2 = yuv2rgb15_2_c;
            *yuv2packedX = yuv2rgb15_X_c;
            break;
        case PIX_FMT_RGB444LE:
        case PIX_FMT_RGB444BE:
        case PIX_FMT_BGR444LE:
        case PIX_FMT_BGR444BE:
            *yuv2packed1 = yuv2rgb12_1_c;
            *yuv2packed2 = yuv2rgb12_2_c;
            *yuv2packedX = yuv2rgb12_X_c;
            break;
        case PIX_FMT_RGB8:
        case PIX_FMT_BGR8:
            *yuv2packed1 = yuv2rgb8_1_c;
            *yuv2packed2 = yuv2rgb8_2_c;
            *yuv2packedX = yuv2rgb8_X_c;
            break;
        case PIX_FMT_RGB4:
        case PIX_FMT_BGR4:
            *yuv2packed1 = yuv2rgb4_1_c;
            *yuv2packed2 = yuv2rgb4_2_c;
            *yuv2packedX = yuv2rgb4_X_c;
            break;
        case PIX_FMT_RGB4_BYTE:
        case PIX_FMT_BGR4_BYTE:
            *yuv2packed1 = yuv2rgb4b_1_c;
            *yuv2packed2 = yuv2rgb4b_2_c;
            *yuv2packedX = yuv2rgb4b_X_c;
            break;
        }
    }
}
2374
/* Set to 1 to have swScale() log its line-buffer bookkeeping at debug level. */
#define DEBUG_SWSCALE_BUFFERS 0
/*
 * Debug logging helper for swScale(); expects a context named `c` in scope.
 * Wrapped in do { } while (0) so that it behaves as a single statement: a
 * bare `if (...) av_log(...)` would capture an `else` that follows the macro
 * invocation at the call site.
 */
#define DEBUG_BUFFERS(...) \
    do { \
        if (DEBUG_SWSCALE_BUFFERS) \
            av_log(c, AV_LOG_DEBUG, __VA_ARGS__); \
    } while (0)
2377
/**
 * Scale one horizontal slice of the source picture into the destination.
 *
 * Starting at c->dstY, each destination line is produced in two steps: the
 * source lines it needs are horizontally scaled into the line buffers
 * (lumPixBuf, chrUPixBuf, chrVPixBuf and, with alpha, alpPixBuf), then the
 * vertical scaler / output function writes the line (yuv2yuv1 or yuv2yuvX for
 * planar YUV and GRAY8 output, yuv2packed1/2/X for everything else).
 * When the slice does not contain every source line a destination line needs,
 * the lines that are available are buffered and the loop stops; the progress
 * (dstY, buffer indices, last buffered lines) is stored back in the context
 * so a later call can continue from there.
 *
 * Note that src[] and srcStride[] are modified in place: for packed source
 * formats all four entries are overwritten with entry 0, and the two chroma
 * strides are shifted left by c->vChrDrop.
 *
 * @param c          scaler context
 * @param src        source plane pointers; each points at the first line of
 *                   the slice (lines are addressed relative to srcSliceY)
 * @param srcStride  source plane strides
 * @param srcSliceY  index of the first source line contained in the slice
 * @param srcSliceH  number of source lines in the slice
 * @param dst        destination plane pointers (addressed by absolute line)
 * @param dstStride  destination plane strides
 * @return number of destination lines written by this call
 */
static int swScale(SwsContext *c, const uint8_t* src[],
                   int srcStride[], int srcSliceY,
                   int srcSliceH, uint8_t* dst[], int dstStride[])
{
    /* load a few things into local vars to make the code more readable? and faster */
    const int srcW= c->srcW;
    const int dstW= c->dstW;
    const int dstH= c->dstH;
    const int chrDstW= c->chrDstW;
    const int chrSrcW= c->chrSrcW;
    const int lumXInc= c->lumXInc;
    const int chrXInc= c->chrXInc;
    const enum PixelFormat dstFormat= c->dstFormat;
    const int flags= c->flags;
    int16_t *vLumFilterPos= c->vLumFilterPos;
    int16_t *vChrFilterPos= c->vChrFilterPos;
    int16_t *hLumFilterPos= c->hLumFilterPos;
    int16_t *hChrFilterPos= c->hChrFilterPos;
    int16_t *vLumFilter= c->vLumFilter;
    int16_t *vChrFilter= c->vChrFilter;
    int16_t *hLumFilter= c->hLumFilter;
    int16_t *hChrFilter= c->hChrFilter;
    int32_t *lumMmxFilter= c->lumMmxFilter;
    int32_t *chrMmxFilter= c->chrMmxFilter;
    int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
    const int vLumFilterSize= c->vLumFilterSize;
    const int vChrFilterSize= c->vChrFilterSize;
    const int hLumFilterSize= c->hLumFilterSize;
    const int hChrFilterSize= c->hChrFilterSize;
    int16_t **lumPixBuf= c->lumPixBuf;
    int16_t **chrUPixBuf= c->chrUPixBuf;
    int16_t **chrVPixBuf= c->chrVPixBuf;
    int16_t **alpPixBuf= c->alpPixBuf;
    const int vLumBufSize= c->vLumBufSize;
    const int vChrBufSize= c->vChrBufSize;
    uint8_t *formatConvBuffer= c->formatConvBuffer;
    /* slice position and height in chroma lines; the height is rounded up
       (negate, shift, negate; relies on an arithmetic right shift) */
    const int chrSrcSliceY= srcSliceY >> c->chrSrcVSubSample;
    const int chrSrcSliceH= -((-srcSliceH) >> c->chrSrcVSubSample);
    int lastDstY;
    uint32_t *pal=c->pal_yuv;
    /* local copies of the output functions: they are replaced by the C
       versions for the last destination lines (see the loop below) without
       touching the pointers stored in the context */
    yuv2planar1_fn yuv2yuv1 = c->yuv2yuv1;
    yuv2planarX_fn yuv2yuvX = c->yuv2yuvX;
    yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
    yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
    yuv2packedX_fn yuv2packedX = c->yuv2packedX;
    /* per-line dither rows are only selected for 9/10-bit and 16-bit sources */
    int should_dither = is9_OR_10BPS(c->srcFormat) || is16BPS(c->srcFormat);

    /* vars which will change and which we need to store back in the context */
    int dstY= c->dstY;
    int lumBufIndex= c->lumBufIndex;
    int chrBufIndex= c->chrBufIndex;
    int lastInLumBuf= c->lastInLumBuf;
    int lastInChrBuf= c->lastInChrBuf;

    /* packed input: make every plane pointer and stride refer to plane 0
       (this writes into the caller's arrays) */
    if (isPacked(c->srcFormat)) {
        src[0]=
        src[1]=
        src[2]=
        src[3]= src[0];
        srcStride[0]=
        srcStride[1]=
        srcStride[2]=
        srcStride[3]= srcStride[0];
    }
    /* chroma line dropping: one chroma stride step now spans 2^vChrDrop
       source chroma lines (also written into the caller's array) */
    srcStride[1]<<= c->vChrDrop;
    srcStride[2]<<= c->vChrDrop;

    DEBUG_BUFFERS("swScale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
                  src[0], srcStride[0], src[1], srcStride[1], src[2], srcStride[2], src[3], srcStride[3],
                  dst[0], dstStride[0], dst[1], dstStride[1], dst[2], dstStride[2], dst[3], dstStride[3]);
    DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
                   srcSliceY,    srcSliceH,    dstY,    dstH);
    DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
                   vLumFilterSize,    vLumBufSize,    vChrFilterSize,    vChrBufSize);

    /* warn about destination strides that are not a multiple of 8; the flag
       is function-static, so the warning is printed at most once per process
       rather than once per context */
    if (dstStride[0]%8 !=0 || dstStride[1]%8 !=0 || dstStride[2]%8 !=0 || dstStride[3]%8 != 0) {
        static int warnedAlready=0; //FIXME move this into the context perhaps
        if (flags & SWS_PRINT_INFO && !warnedAlready) {
            av_log(c, AV_LOG_WARNING, "Warning: dstStride is not aligned!\n"
                   "         ->cannot do aligned memory accesses anymore\n");
            warnedAlready=1;
        }
    }

    /* Note the user might start scaling the picture in the middle so this
       will not get executed. This is not really intended but works
       currently, so people might do it. */
    /* a slice starting at source line 0 restarts the picture: reset the
       output position and mark the line buffers as empty */
    if (srcSliceY ==0) {
        lumBufIndex=-1;
        chrBufIndex=-1;
        dstY=0;
        lastInLumBuf= -1;
        lastInChrBuf= -1;
    }

    /* without per-line dithering both dither pointers use the fixed
       ff_sws_pb_64 table for the whole slice */
    if (!should_dither) {
        c->chrDither8 = c->lumDither8 = ff_sws_pb_64;
    }
    /* remember where this call started; used for the return value and for
       the alpha fill after the loop */
    lastDstY= dstY;

    for (;dstY < dstH; dstY++) {
        const int chrDstY= dstY>>c->chrDstVSubSample;
        /* output pointers for this line; the alpha plane is only written
           when alpha line buffers exist */
        uint8_t *dest[4] = {
            dst[0] + dstStride[0] * dstY,
            dst[1] + dstStride[1] * chrDstY,
            dst[2] + dstStride[2] * chrDstY,
            (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3] + dstStride[3] * dstY : NULL,
        };

        const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
        /* same, but for the last destination line that shares chroma line
           chrDstY with dstY (clamped to the last destination line) */
        const int firstLumSrcY2= vLumFilterPos[FFMIN(dstY | ((1<<c->chrDstVSubSample) - 1), dstH-1)];
        const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
        int lastLumSrcY= firstLumSrcY + vLumFilterSize -1; // Last line needed as input
        int lastLumSrcY2=firstLumSrcY2+ vLumFilterSize -1; // Last line needed as input
        int lastChrSrcY= firstChrSrcY + vChrFilterSize -1; // Last line needed as input
        int enough_lines;

        //handle holes (FAST_BILINEAR & weird filters)
        /* if the first needed line lies beyond what has been buffered, skip
           the unused source lines in between instead of scaling them */
        if (firstLumSrcY > lastInLumBuf) lastInLumBuf= firstLumSrcY-1;
        if (firstChrSrcY > lastInChrBuf) lastInChrBuf= firstChrSrcY-1;
        /* the first needed line must still be among the last
           vLumBufSize / vChrBufSize buffered lines */
        assert(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
        assert(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);

        DEBUG_BUFFERS("dstY: %d\n", dstY);
        DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
                         firstLumSrcY,    lastLumSrcY,    lastInLumBuf);
        DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
                         firstChrSrcY,    lastChrSrcY,    lastInChrBuf);

        // Do we have enough lines in this slice to output the dstY line
        /* the chroma bound is the end of the slice converted to chroma
           lines, rounded up */
        enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH && lastChrSrcY < -((-srcSliceY - srcSliceH)>>c->chrSrcVSubSample);

        /* not enough input: buffer everything up to the end of the slice so
           the next call can continue from there */
        if (!enough_lines) {
            lastLumSrcY = srcSliceY + srcSliceH - 1;
            lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
            DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
                                            lastLumSrcY, lastChrSrcY);
        }

        //Do horizontal scaling
        /* luma (and alpha) lines: scale every line up to lastLumSrcY that is
           not in the buffer yet; src[] is addressed relative to srcSliceY */
        while(lastInLumBuf < lastLumSrcY) {
            const uint8_t *src1= src[0]+(lastInLumBuf + 1 - srcSliceY)*srcStride[0];
            const uint8_t *src2= src[3]+(lastInLumBuf + 1 - srcSliceY)*srcStride[3];
            lumBufIndex++;
            assert(lumBufIndex < 2*vLumBufSize);
            assert(lastInLumBuf + 1 - srcSliceY < srcSliceH);
            assert(lastInLumBuf + 1 - srcSliceY >= 0);
            hyscale(c, lumPixBuf[ lumBufIndex ], dstW, src1, srcW, lumXInc,
                    hLumFilter, hLumFilterPos, hLumFilterSize,
                    formatConvBuffer,
                    pal, 0);
            /* alpha shares the luma filter and buffer index; the last
               argument selects the alpha input path */
            if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
                hyscale(c, alpPixBuf[ lumBufIndex ], dstW, src2, srcW,
                        lumXInc, hLumFilter, hLumFilterPos, hLumFilterSize,
                        formatConvBuffer,
                        pal, 1);
            lastInLumBuf++;
            DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
                               lumBufIndex,    lastInLumBuf);
        }
        /* chroma lines: same scheme, addressed relative to chrSrcSliceY */
        while(lastInChrBuf < lastChrSrcY) {
            const uint8_t *src1= src[1]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[1];
            const uint8_t *src2= src[2]+(lastInChrBuf + 1 - chrSrcSliceY)*srcStride[2];
            chrBufIndex++;
            assert(chrBufIndex < 2*vChrBufSize);
            assert(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
            assert(lastInChrBuf + 1 - chrSrcSliceY >= 0);
            //FIXME replace parameters through context struct (some at least)

            /* the buffer bookkeeping advances even when chroma scaling is
               not needed (needs_hcscale unset) */
            if (c->needs_hcscale)
                hcscale(c, chrUPixBuf[chrBufIndex], chrVPixBuf[chrBufIndex],
                          chrDstW, src1, src2, chrSrcW, chrXInc,
                          hChrFilter, hChrFilterPos, hChrFilterSize,
                          formatConvBuffer, pal);
            lastInChrBuf++;
            DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
                               chrBufIndex,    lastInChrBuf);
        }
        //wrap buf index around to stay inside the ring buffer
        if (lumBufIndex >= vLumBufSize) lumBufIndex-= vLumBufSize;
        if (chrBufIndex >= vChrBufSize) chrBufIndex-= vChrBufSize;
        if (!enough_lines)
            break; //we can't output a dstY line so let's try with the next slice

#if HAVE_MMX
        updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex, lastInLumBuf, lastInChrBuf);
#endif
        /* pick the dither rows for this chroma / luma output line */
        if (should_dither) {
            c->chrDither8 = dither_8x8_128[chrDstY & 7];
            c->lumDither8 = dither_8x8_128[dstY & 7];
        }
        /* for the last two destination lines switch the local function
           pointers to the C output functions */
        if (dstY >= dstH-2) {
            // hmm looks like we can't use MMX here without overwriting this array's tail
            find_c_packed_planar_out_funcs(c, &yuv2yuv1, &yuv2yuvX,
                                           &yuv2packed1, &yuv2packed2,
                                           &yuv2packedX);
        }

        {
            /* Pointers to the first buffered line needed by the vertical
               filter: lumBufIndex/chrBufIndex is the slot holding source line
               lastInLumBuf/lastInChrBuf, so step back to the first needed
               line and add one buffer size to keep the index non-negative.
               NOTE(review): this relies on the pointer arrays holding
               2*vLumBufSize / 2*vChrBufSize entries (see the asserts),
               presumably with the second half mirroring the first - confirm
               against the allocation code. */
            const int16_t **lumSrcPtr= (const int16_t **) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
            const int16_t **chrUSrcPtr= (const int16_t **) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **chrVSrcPtr= (const int16_t **) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
            const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
            if (isPlanarYUV(dstFormat) || dstFormat==PIX_FMT_GRAY8) { //YV12 like
                /* chroma is only written on the first luma line of each
                   vertical subsampling group, and never for gray output */
                const int chrSkipMask= (1<<c->chrDstVSubSample)-1;
                if ((dstY&chrSkipMask) || isGray(dstFormat))
                    dest[1] = dest[2] = NULL; //FIXME split functions in lumi / chromi
                /* availability is tested on the context pointer, the call
                   goes through the (possibly replaced) local pointer */
                if (c->yuv2yuv1 && vLumFilterSize == 1 && vChrFilterSize == 1) { // unscaled YV12
                    const int16_t *alpBuf= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? alpSrcPtr[0] : NULL;
                    yuv2yuv1(c, lumSrcPtr[0], chrUSrcPtr[0], chrVSrcPtr[0], alpBuf,
                             dest, dstW, chrDstW);
                } else { //General YV12
                    yuv2yuvX(c, vLumFilter + dstY * vLumFilterSize,
                             lumSrcPtr, vLumFilterSize,
                             vChrFilter + chrDstY * vChrFilterSize,
                             chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
                             alpSrcPtr, dest, dstW, chrDstW);
                }
            } else {
                /* packed output: the whole vertical filter window must lie
                   inside the doubled pointer arrays */
                assert(lumSrcPtr  + vLumFilterSize - 1 < lumPixBuf  + vLumBufSize*2);
                assert(chrUSrcPtr + vChrFilterSize - 1 < chrUPixBuf + vChrBufSize*2);
                if (c->yuv2packed1 && vLumFilterSize == 1 && vChrFilterSize == 2) { //unscaled RGB
                    /* second chroma coefficient of this line, passed on as
                       the chroma blend weight */
                    int chrAlpha = vChrFilter[2 * dstY + 1];
                    yuv2packed1(c, *lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
                                alpPixBuf ? *alpSrcPtr : NULL,
                                dest[0], dstW, chrAlpha, dstY);
                } else if (c->yuv2packed2 && vLumFilterSize == 2 && vChrFilterSize == 2) { //bilinear upscale RGB
                    int lumAlpha = vLumFilter[2 * dstY + 1];
                    int chrAlpha = vChrFilter[2 * dstY + 1];
                    /* first coefficient of each filter, multiplied by
                       0x10001 so it appears in both 16-bit halves of the
                       32-bit entries; presumably read by the MMX
                       implementation of yuv2packed2 - confirm */
                    lumMmxFilter[2] =
                    lumMmxFilter[3] = vLumFilter[2 * dstY   ] * 0x10001;
                    chrMmxFilter[2] =
                    chrMmxFilter[3] = vChrFilter[2 * chrDstY] * 0x10001;
                    yuv2packed2(c, lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
                                alpPixBuf ? alpSrcPtr : NULL,
                                dest[0], dstW, lumAlpha, chrAlpha, dstY);
                } else { //general RGB
                    yuv2packedX(c, vLumFilter + dstY * vLumFilterSize,
                                lumSrcPtr, vLumFilterSize,
                                vChrFilter + dstY * vChrFilterSize,
                                chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
                                alpSrcPtr, dest[0], dstW, dstY);
                }
            }
        }
    }

    /* YUVA420P output without alpha line buffers: fill the alpha plane of
       the lines written by this call with 255 */
    if ((dstFormat == PIX_FMT_YUVA420P) && !alpPixBuf)
        fillPlane(dst[3], dstStride[3], dstW, dstY-lastDstY, lastDstY, 255);

    /* issue a store fence when the CPU reports MMX2, then emms_c() */
#if HAVE_MMX2
    if (av_get_cpu_flags() & AV_CPU_FLAG_MMX2)
        __asm__ volatile("sfence":::"memory");
#endif
    emms_c();

    /* store changed local vars back in the context */
    c->dstY= dstY;
    c->lumBufIndex= lumBufIndex;
    c->chrBufIndex= chrBufIndex;
    c->lastInLumBuf= lastInLumBuf;
    c->lastInChrBuf= lastInChrBuf;

    /* number of destination lines produced by this call */
    return dstY - lastDstY;
}
2643
2644 static av_cold void sws_init_swScale_c(SwsContext *c)
2645 {
2646     enum PixelFormat srcFormat = c->srcFormat;
2647
2648     find_c_packed_planar_out_funcs(c, &c->yuv2yuv1, &c->yuv2yuvX,
2649                                    &c->yuv2packed1, &c->yuv2packed2,
2650                                    &c->yuv2packedX);
2651
2652     c->chrToYV12 = NULL;
2653     switch(srcFormat) {
2654         case PIX_FMT_YUYV422  : c->chrToYV12 = yuy2ToUV_c; break;
2655         case PIX_FMT_UYVY422  : c->chrToYV12 = uyvyToUV_c; break;
2656         case PIX_FMT_NV12     : c->chrToYV12 = nv12ToUV_c; break;
2657         case PIX_FMT_NV21     : c->chrToYV12 = nv21ToUV_c; break;
2658         case PIX_FMT_RGB8     :
2659         case PIX_FMT_BGR8     :
2660         case PIX_FMT_PAL8     :
2661         case PIX_FMT_BGR4_BYTE:
2662         case PIX_FMT_RGB4_BYTE: c->chrToYV12 = palToUV_c; break;
2663 #if HAVE_BIGENDIAN
2664         case PIX_FMT_YUV444P9LE:
2665         case PIX_FMT_YUV420P9LE:
2666         case PIX_FMT_YUV422P10LE:
2667         case PIX_FMT_YUV444P10LE:
2668         case PIX_FMT_YUV420P10LE:
2669         case PIX_FMT_YUV420P16LE:
2670         case PIX_FMT_YUV422P16LE:
2671         case PIX_FMT_YUV444P16LE: c->chrToYV12 = bswap16UV_c; break;
2672 #else
2673         case PIX_FMT_YUV444P9BE:
2674         case PIX_FMT_YUV420P9BE:
2675         case PIX_FMT_YUV444P10BE:
2676         case PIX_FMT_YUV422P10BE:
2677         case PIX_FMT_YUV420P10BE:
2678         case PIX_FMT_YUV420P16BE:
2679         case PIX_FMT_YUV422P16BE:
2680         case PIX_FMT_YUV444P16BE: c->chrToYV12 = bswap16UV_c; break;
2681 #endif
2682     }
2683     if (c->chrSrcHSubSample) {
2684         switch(srcFormat) {
2685         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_half_c; break;
2686         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_half_c; break;
2687         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_half_c; break;
2688         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_half_c; break;
2689         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_half_c;   break;
2690         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_half_c;  break;
2691         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_half_c;   break;
2692         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_half_c; break;
2693         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_half_c; break;
2694         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_half_c; break;
2695         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_half_c; break;
2696         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_half_c;   break;
2697         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_half_c;  break;
2698         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_half_c;   break;
2699         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_half_c; break;
2700         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_half_c; break;
2701         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_half_c; break;
2702         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_half_c; break;
2703         }
2704     } else {
2705         switch(srcFormat) {
2706         case PIX_FMT_RGB48BE : c->chrToYV12 = rgb48BEToUV_c; break;
2707         case PIX_FMT_RGB48LE : c->chrToYV12 = rgb48LEToUV_c; break;
2708         case PIX_FMT_BGR48BE : c->chrToYV12 = bgr48BEToUV_c; break;
2709         case PIX_FMT_BGR48LE : c->chrToYV12 = bgr48LEToUV_c; break;
2710         case PIX_FMT_RGB32   : c->chrToYV12 = bgr32ToUV_c;   break;
2711         case PIX_FMT_RGB32_1 : c->chrToYV12 = bgr321ToUV_c;  break;
2712         case PIX_FMT_BGR24   : c->chrToYV12 = bgr24ToUV_c;   break;
2713         case PIX_FMT_BGR565LE: c->chrToYV12 = bgr16leToUV_c; break;
2714         case PIX_FMT_BGR565BE: c->chrToYV12 = bgr16beToUV_c; break;
2715         case PIX_FMT_BGR555LE: c->chrToYV12 = bgr15leToUV_c; break;
2716         case PIX_FMT_BGR555BE: c->chrToYV12 = bgr15beToUV_c; break;
2717         case PIX_FMT_BGR32   : c->chrToYV12 = rgb32ToUV_c;   break;
2718         case PIX_FMT_BGR32_1 : c->chrToYV12 = rgb321ToUV_c;  break;
2719         case PIX_FMT_RGB24   : c->chrToYV12 = rgb24ToUV_c;   break;
2720         case PIX_FMT_RGB565LE: c->chrToYV12 = rgb16leToUV_c; break;
2721         case PIX_FMT_RGB565BE: c->chrToYV12 = rgb16beToUV_c; break;
2722         case PIX_FMT_RGB555LE: c->chrToYV12 = rgb15leToUV_c; break;
2723         case PIX_FMT_RGB555BE: c->chrToYV12 = rgb15beToUV_c; break;
2724         }
2725     }
2726
2727     c->lumToYV12 = NULL;
2728     c->alpToYV12 = NULL;
2729     switch (srcFormat) {
2730 #if HAVE_BIGENDIAN
2731     case PIX_FMT_YUV444P9LE:
2732     case PIX_FMT_YUV420P9LE:
2733     case PIX_FMT_YUV444P10LE:
2734     case PIX_FMT_YUV422P10LE:
2735     case PIX_FMT_YUV420P10LE:
2736     case PIX_FMT_YUV420P16LE:
2737     case PIX_FMT_YUV422P16LE:
2738     case PIX_FMT_YUV444P16LE:
2739     case PIX_FMT_GRAY16LE: c->lumToYV12 = bswap16Y_c; break;
2740 #else
2741     case PIX_FMT_YUV444P9BE:
2742     case PIX_FMT_YUV420P9BE:
2743     case PIX_FMT_YUV444P10BE:
2744     case PIX_FMT_YUV422P10BE:
2745     case PIX_FMT_YUV420P10BE:
2746     case PIX_FMT_YUV420P16BE:
2747     case PIX_FMT_YUV422P16BE:
2748     case PIX_FMT_YUV444P16BE:
2749     case PIX_FMT_GRAY16BE: c->lumToYV12 = bswap16Y_c; break;
2750 #endif
2751     case PIX_FMT_YUYV422  :
2752     case PIX_FMT_Y400A    : c->lumToYV12 = yuy2ToY_c; break;
2753     case PIX_FMT_UYVY422  : c->lumToYV12 = uyvyToY_c;    break;
2754     case PIX_FMT_BGR24    : c->lumToYV12 = bgr24ToY_c;   break;
2755     case PIX_FMT_BGR565LE : c->lumToYV12 = bgr16leToY_c; break;
2756     case PIX_FMT_BGR565BE : c->lumToYV12 = bgr16beToY_c; break;
2757     case PIX_FMT_BGR555LE : c->lumToYV12 = bgr15leToY_c; break;
2758     case PIX_FMT_BGR555BE : c->lumToYV12 = bgr15beToY_c; break;
2759     case PIX_FMT_RGB24    : c->lumToYV12 = rgb24ToY_c;   break;
2760     case PIX_FMT_RGB565LE : c->lumToYV12 = rgb16leToY_c; break;
2761     case PIX_FMT_RGB565BE : c->lumToYV12 = rgb16beToY_c; break;
2762     case PIX_FMT_RGB555LE : c->lumToYV12 = rgb15leToY_c; break;
2763     case PIX_FMT_RGB555BE : c->lumToYV12 = rgb15beToY_c; break;
2764     case PIX_FMT_RGB8     :
2765     case PIX_FMT_BGR8     :
2766     case PIX_FMT_PAL8     :
2767     case PIX_FMT_BGR4_BYTE:
2768     case PIX_FMT_RGB4_BYTE: c->lumToYV12 = palToY_c; break;
2769     case PIX_FMT_MONOBLACK: c->lumToYV12 = monoblack2Y_c; break;
2770     case PIX_FMT_MONOWHITE: c->lumToYV12 = monowhite2Y_c; break;
2771     case PIX_FMT_RGB32  : c->lumToYV12 = bgr32ToY_c;  break;
2772     case PIX_FMT_RGB32_1: c->lumToYV12 = bgr321ToY_c; break;
2773     case PIX_FMT_BGR32  : c->lumToYV12 = rgb32ToY_c;  break;
2774     case PIX_FMT_BGR32_1: c->lumToYV12 = rgb321ToY_c; break;
2775     case PIX_FMT_RGB48BE: c->lumToYV12 = rgb48BEToY_c; break;
2776     case PIX_FMT_RGB48LE: c->lumToYV12 = rgb48LEToY_c; break;
2777     case PIX_FMT_BGR48BE: c->lumToYV12 = bgr48BEToY_c; break;
2778     case PIX_FMT_BGR48LE: c->lumToYV12 = bgr48LEToY_c; break;
2779     }
2780     if (c->alpPixBuf) {
2781         switch (srcFormat) {
2782         case PIX_FMT_BGRA:
2783         case PIX_FMT_RGBA:  c->alpToYV12 = rgbaToA_c; break;
2784         case PIX_FMT_ABGR:
2785         case PIX_FMT_ARGB:  c->alpToYV12 = abgrToA_c; break;
2786         case PIX_FMT_Y400A: c->alpToYV12 = uyvyToY_c; break;
2787         }
2788     }
2789
2790     if (c->srcBpc == 8) {
2791         if (c->dstBpc <= 10) {
2792             c->hyScale = c->hcScale = hScale8To15_c;
2793             if (c->flags & SWS_FAST_BILINEAR) {
2794                 c->hyscale_fast = hyscale_fast_c;
2795                 c->hcscale_fast = hcscale_fast_c;
2796             }
2797         } else {
2798             c->hyScale = c->hcScale = hScale8To19_c;
2799         }
2800     } else {
2801         c->hyScale = c->hcScale = c->dstBpc > 10 ? hScale16To19_c : hScale16To15_c;
2802     }
2803
2804     if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
2805         if (c->dstBpc <= 10) {
2806             if (c->srcRange) {
2807                 c->lumConvertRange = lumRangeFromJpeg_c;
2808                 c->chrConvertRange = chrRangeFromJpeg_c;
2809             } else {
2810                 c->lumConvertRange = lumRangeToJpeg_c;
2811                 c->chrConvertRange = chrRangeToJpeg_c;
2812             }
2813         } else {
2814             if (c->srcRange) {
2815                 c->lumConvertRange = lumRangeFromJpeg16_c;
2816                 c->chrConvertRange = chrRangeFromJpeg16_c;
2817             } else {
2818                 c->lumConvertRange = lumRangeToJpeg16_c;
2819                 c->chrConvertRange = chrRangeToJpeg16_c;
2820             }
2821         }
2822     }
2823
2824     if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
2825           srcFormat == PIX_FMT_MONOBLACK || srcFormat == PIX_FMT_MONOWHITE))
2826         c->needs_hcscale = 1;
2827 }
2828
2829 SwsFunc ff_getSwsFunc(SwsContext *c)
2830 {
2831     sws_init_swScale_c(c);
2832
2833     if (HAVE_MMX)
2834         ff_sws_init_swScale_mmx(c);
2835     if (HAVE_ALTIVEC)
2836         ff_sws_init_swScale_altivec(c);
2837
2838     return swScale;
2839 }