libavcodec/wmavoice.c

   1 /*
   2  * Windows Media Audio Voice decoder.
   3  * Copyright (c) 2009 Ronald S. Bultje
   4  *
   5  * This file is part of Libav.
   6  *
   7  * Libav is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * Libav is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with Libav; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file
  24  * @brief Windows Media Audio Voice compatible decoder
  25  * @author Ronald S. Bultje <rsbultje@gmail.com>
  26  */
  27
  28 #include <math.h>
  29 #include "avcodec.h"
  30 #include "get_bits.h"
  31 #include "put_bits.h"
  32 #include "wmavoice_data.h"
  33 #include "celp_math.h"
  34 #include "celp_filters.h"
  35 #include "acelp_vectors.h"
  36 #include "acelp_filters.h"
  37 #include "lsp.h"
  38 #include "libavutil/lzo.h"
  39 #include "dct.h"
  40 #include "rdft.h"
  41 #include "sinewin.h"
  42
   43 #define MAX_BLOCKS           8   ///< maximum number of blocks per frame
   44 #define MAX_LSPS             16  ///< maximum filter order (LSPs per frame)
   45 #define MAX_LSPS_ALIGN16     16  ///< same as #MAX_LSPS; needs to be multiple
   46                                  ///< of 16 for ASM input buffer alignment
   47 #define MAX_FRAMES           3   ///< maximum number of frames per superframe
   48 #define MAX_FRAMESIZE        160 ///< maximum number of samples per frame
   49 #define MAX_SIGNAL_HISTORY   416 ///< maximum excitation signal history,
                                       ///< in samples
   50 #define MAX_SFRAMESIZE       (MAX_FRAMESIZE * MAX_FRAMES)
   51                                  ///< maximum number of samples per superframe
   52 #define SFRAME_CACHE_MAXSIZE 256 ///< maximum cache size (in bytes) for frame
   53                                  ///< data that was split over two packets
   54 #define VLC_NBITS            6   ///< number of bits to read per VLC iteration
  55
   56 /**
   57  * Frame type VLC coding.
       *
       * File-scope static, so the table is shared by every decoder instance.
       * It is (re)built by decode_vbmtree() with #VLC_NBITS bits per lookup.
   58  */
   59 static VLC frame_type_vlc;
  60
   61 /**
   62  * Adaptive codebook types.
       *
       * One of these values is stored per frame type in
       * frame_type_desc::acb_type.
   63  */
   64 enum {
   65     ACB_TYPE_NONE       = 0, ///< no adaptive codebook (only hardcoded fixed)
   66     ACB_TYPE_ASYMMETRIC = 1, ///< adaptive codebook with per-frame pitch, which
   67                              ///< we interpolate to get a per-sample pitch.
   68                              ///< Signal is generated using an asymmetric sinc
   69                              ///< window function
   70                              ///< @note see #wmavoice_ipol1_coeffs
   71     ACB_TYPE_HAMMING    = 2  ///< Per-block pitch with signal generation using
   72                              ///< a Hamming sinc window function
   73                              ///< @note see #wmavoice_ipol2_coeffs
   74 };
  75
   76 /**
   77  * Fixed codebook types.
       *
       * One of these values is stored per frame type in
       * frame_type_desc::fcb_type.
   78  */
   79 enum {
   80     FCB_TYPE_SILENCE    = 0, ///< comfort noise during silence
   81                              ///< generated from a hardcoded (fixed) codebook
   82                              ///< with per-frame (low) gain values
   83     FCB_TYPE_HARDCODED  = 1, ///< hardcoded (fixed) codebook with per-block
   84                              ///< gain values
   85     FCB_TYPE_AW_PULSES  = 2, ///< Pitch-adaptive window (AW) pulse signals,
   86                              ///< used in particular for low-bitrate streams
   87     FCB_TYPE_EXC_PULSES = 3, ///< Innovation (fixed) codebook pulse sets in
   88                              ///< combinations of either single pulses or
   89                              ///< pulse pairs
   90 };
  91
   92 /**
   93  * Description of frame types.
       *
       * The table has 17 entries; presumably it is indexed by the frame type
       * number that the VBM tree yields (0-16) - confirm against the frame
       * parser, which is not part of this excerpt.
   94  */
   95 static const struct frame_type_desc {
   96     uint8_t n_blocks;     ///< amount of blocks per frame (each block
   97                           ///< contains 160/#n_blocks samples)
   98     uint8_t log_n_blocks; ///< log2(#n_blocks)
   99     uint8_t acb_type;     ///< Adaptive codebook type (ACB_TYPE_*)
  100     uint8_t fcb_type;     ///< Fixed codebook type (FCB_TYPE_*)
  101     uint8_t dbl_pulses;   ///< how many pulse vectors have pulse pairs
  102                           ///< (rather than just one single pulse)
  103                           ///< only if #fcb_type == #FCB_TYPE_EXC_PULSES
  104     uint16_t frame_size;  ///< the amount of bits that make up the block
  105                           ///< data (per frame)
  106 } frame_descs[17] = {
  107     { 1, 0, ACB_TYPE_NONE,       FCB_TYPE_SILENCE,    0,   0 },
  108     { 2, 1, ACB_TYPE_NONE,       FCB_TYPE_HARDCODED,  0,  28 },
  109     { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_AW_PULSES,  0,  46 },
  110     { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2,  80 },
  111     { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 104 },
  112     { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 0, 108 },
  113     { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2, 132 },
  114     { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 168 },
  115     { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0,  64 },
  116     { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2,  80 },
  117     { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5, 104 },
  118     { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0, 108 },
  119     { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2, 132 },
  120     { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5, 168 },
  121     { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0, 176 },
  122     { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2, 208 },
  123     { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5, 256 }
  124 };
 125
  126 /**
  127  * WMA Voice decoding context.
       *
       * Holds the stream-wide configuration parsed at init, per-packet
       * bookkeeping, the inter-frame history and the postfilter state.
  128  */
  129 typedef struct {
  130     /**
  131      * @name Global values specified in the stream header / extradata or used all over.
  132      * @{
  133      */
  134     GetBitContext gb;             ///< packet bitreader. During decoder init,
  135                                   ///< it contains the extradata from the
  136                                   ///< demuxer. During decoding, it contains
  137                                   ///< packet data.
  138     int8_t vbm_tree[25];          ///< converts VLC codes to frame type
  139
  140     int spillover_bitsize;        ///< number of bits used to specify
  141                                   ///< #spillover_nbits in the packet header
  142                                   ///< = ceil(log2(ctx->block_align << 3))
  143     int history_nsamples;         ///< number of samples in history for signal
  144                                   ///< prediction (through ACB)
  145
  146     /* postfilter specific values */
  147     int do_apf;                   ///< whether to apply the averaged
  148                                   ///< projection filter (APF)
  149     int denoise_strength;         ///< strength of denoising in Wiener filter
  150                                   ///< [0-11]
  151     int denoise_tilt_corr;        ///< Whether to apply tilt correction to the
  152                                   ///< Wiener filter coefficients (postfilter)
  153     int dc_level;                 ///< Predicted amount of DC noise, based
  154                                   ///< on which a DC removal filter is used
  155
  156     int lsps;                     ///< number of LSPs per frame [10 or 16]
  157     int lsp_q_mode;               ///< defines quantizer defaults [0, 1]
  158     int lsp_def_mode;             ///< defines different sets of LSP defaults
  159                                   ///< [0, 1]
  160     int frame_lsp_bitsize;        ///< size (in bits) of LSPs, when encoded
  161                                   ///< per-frame (independent coding)
  162     int sframe_lsp_bitsize;       ///< size (in bits) of LSPs, when encoded
  163                                   ///< per superframe (residual coding)
  164
  165     int min_pitch_val;            ///< base value for pitch parsing code
  166     int max_pitch_val;            ///< max value + 1 for pitch parsing
  167     int pitch_nbits;              ///< number of bits used to specify the
  168                                   ///< pitch value in the frame header
  169     int block_pitch_nbits;        ///< number of bits used to specify the
  170                                   ///< first block's pitch value
  171     int block_pitch_range;        ///< range of the block pitch
  172     int block_delta_pitch_nbits;  ///< number of bits used to specify the
  173                                   ///< delta pitch between this and the last
  174                                   ///< block's pitch value, used in all but
  175                                   ///< first block
  176     int block_delta_pitch_hrange; ///< 1/2 range of the delta (full range is
  177                                   ///< from -this to +this-1)
  178     uint16_t block_conv_table[4]; ///< boundaries for block pitch unit/scale
  179                                   ///< conversion
  180
  181     /**
  182      * @}
  183      *
  184      * @name Packet values specified in the packet header or related to a packet.
  185      *
  186      * A packet is considered to be a single unit of data provided to this
  187      * decoder by the demuxer.
  188      * @{
  189      */
  190     int spillover_nbits;          ///< number of bits of the previous packet's
  191                                   ///< last superframe preceding this
  192                                   ///< packet's first full superframe (useful
  193                                   ///< for re-synchronization also)
  194     int has_residual_lsps;        ///< if set, superframes contain one set of
  195                                   ///< LSPs that cover all frames, encoded as
  196                                   ///< independent and residual LSPs; if not
  197                                   ///< set, each frame contains its own, fully
  198                                   ///< independent, LSPs
  199     int skip_bits_next;           ///< number of bits to skip at the next call
  200                                   ///< to #wmavoice_decode_packet() (since
  201                                   ///< they're part of the previous superframe)
  202
  203     uint8_t sframe_cache[SFRAME_CACHE_MAXSIZE + FF_INPUT_BUFFER_PADDING_SIZE];
  204                                   ///< cache for superframe data split over
  205                                   ///< multiple packets
  206     int sframe_cache_size;        ///< set to >0 if we have data from an
  207                                   ///< (incomplete) superframe from a previous
  208                                   ///< packet that spilled over in the current
  209                                   ///< packet; specifies the amount of bits in
  210                                   ///< #sframe_cache
  211     PutBitContext pb;             ///< bitstream writer for #sframe_cache
  212
  213     /**
  214      * @}
  215      *
  216      * @name Frame and superframe values
  217      * Superframe and frame data - these can change from frame to frame,
  218      * although some of them do in that case serve as a cache / history for
  219      * the next frame or superframe.
  220      * @{
  221      */
  222     double prev_lsps[MAX_LSPS];   ///< LSPs of the last frame of the previous
  223                                   ///< superframe
  224     int last_pitch_val;           ///< pitch value of the previous frame
  225     int last_acb_type;            ///< frame type [0-2] of the previous frame
  226     int pitch_diff_sh16;          ///< ((cur_pitch_val - #last_pitch_val)
  227                                   ///< << 16) / #MAX_FRAMESIZE
  228     float silence_gain;           ///< set for use in blocks if #ACB_TYPE_NONE
  229
  230     int aw_idx_is_ext;            ///< whether the AW index was encoded in
  231                                   ///< 8 bits (instead of 6)
  232     int aw_pulse_range;           ///< the range over which #aw_pulse_set1()
  233                                   ///< can apply the pulse, relative to the
  234                                   ///< value in aw_first_pulse_off. The exact
  235                                   ///< position of the first AW-pulse is within
  236                                   ///< [pulse_off, pulse_off + this], and
  237                                   ///< depends on bitstream values; [16 or 24]
  238     int aw_n_pulses[2];           ///< number of AW-pulses in each block; note
  239                                   ///< that this number can be negative (in
  240                                   ///< which case it basically means "zero")
  241     int aw_first_pulse_off[2];    ///< index of first sample to which to
  242                                   ///< apply AW-pulses, or -0xff if unset
  243     int aw_next_pulse_off_cache;  ///< the position (relative to start of the
  244                                   ///< second block) at which pulses should
  245                                   ///< start to be positioned, serves as a
  246                                   ///< cache for pitch-adaptive window pulses
  247                                   ///< between blocks
  248
  249     int frame_cntr;               ///< current frame index [0 - 0xFFFE]; is
  250                                   ///< only used for comfort noise in #pRNG()
  251     float gain_pred_err[6];       ///< cache for gain prediction
  252     float excitation_history[MAX_SIGNAL_HISTORY];
  253                                   ///< cache of the signal of previous
  254                                   ///< superframes, used as a history for
  255                                   ///< signal generation
  256     float synth_history[MAX_LSPS]; ///< see #excitation_history
  257     /**
  258      * @}
  259      *
  260      * @name Postfilter values
  261      *
  262      * Variables used for postfilter implementation, mostly history for
  263      * smoothing and so on, and context variables for FFT/iFFT.
  264      * @{
  265      */
  266     RDFTContext rdft, irdft;      ///< contexts for FFT-calculation in the
  267                                   ///< postfilter (for denoise filter)
  268     DCTContext dct, dst;          ///< contexts for phase shift (in Hilbert
  269                                   ///< transform, part of postfilter)
  270     float sin[511], cos[511];     ///< 8-bit cosine/sine windows over [-pi,pi]
  271                                   ///< range
  272     float postfilter_agc;         ///< gain control memory, used in
  273                                   ///< #adaptive_gain_control()
  274     float dcf_mem[2];             ///< DC filter history
  275     float zero_exc_pf[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE];
  276                                   ///< zero filter output (i.e. excitation)
  277                                   ///< by postfilter
  278     float denoise_filter_cache[MAX_FRAMESIZE];
                                        ///< presumably the denoise filter output
                                        ///< that extends past the frame boundary,
                                        ///< kept for overlap-add with the next
                                        ///< frame - TODO confirm in
                                        ///< wiener_denoise()
  279     int   denoise_filter_cache_size; ///< samples in #denoise_filter_cache
  280     DECLARE_ALIGNED(32, float, tilted_lpcs_pf)[0x80];
  281                                   ///< aligned buffer for LPC tilting
  282     DECLARE_ALIGNED(32, float, denoise_coeffs_pf)[0x80];
  283                                   ///< aligned buffer for denoise coefficients
  284     DECLARE_ALIGNED(32, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
  285                                   ///< aligned buffer for postfilter speech
  286                                   ///< synthesis
  287     /**
  288      * @}
  289      */
  290 } WMAVoiceContext;
 291
 292 /**
 293  * Set up the variable bit mode (VBM) tree from container extradata.
 294  * @param gb bit I/O context.
 295  *           The bit context (s->gb) should be loaded with byte 23-46 of the
 296  *           container extradata (i.e. the ones containing the VBM tree).
 297  * @param vbm_tree pointer to array to which the decoded VBM tree will be
 298  *                 written.
 299  * @return 0 on success, <0 on error.
 300  */
 301 static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
 302 {
 303     static const uint8_t bits[] = {
 304          2,  2,  2,  4,  4,  4,
 305          6,  6,  6,  8,  8,  8,
 306         10, 10, 10, 12, 12, 12,
 307         14, 14, 14, 14
 308     };
 309     static const uint16_t codes[] = {
 310           0x0000, 0x0001, 0x0002,        //              00/01/10
 311           0x000c, 0x000d, 0x000e,        //           11+00/01/10
 312           0x003c, 0x003d, 0x003e,        //         1111+00/01/10
 313           0x00fc, 0x00fd, 0x00fe,        //       111111+00/01/10
 314           0x03fc, 0x03fd, 0x03fe,        //     11111111+00/01/10
 315           0x0ffc, 0x0ffd, 0x0ffe,        //   1111111111+00/01/10
 316           0x3ffc, 0x3ffd, 0x3ffe, 0x3fff // 111111111111+xx
 317     };
 318     int cntr[8], n, res;
 319
 320     memset(vbm_tree, 0xff, sizeof(vbm_tree[0]) * 25);
 321     memset(cntr,     0,    sizeof(cntr));
 322     for (n = 0; n < 17; n++) {
 323         res = get_bits(gb, 3);
 324         if (cntr[res] > 3) // should be >= 3 + (res == 7))
 325             return -1;
 326         vbm_tree[res * 3 + cntr[res]++] = n;
 327     }
 328     INIT_VLC_STATIC(&frame_type_vlc, VLC_NBITS, sizeof(bits),
 329                     bits, 1, 1, codes, 2, 2, 132);
 330     return 0;
 331 }
 332
 333 /**
 334  * Set up decoder with parameters from demuxer (extradata etc.).
 335  */
 336 static av_cold int wmavoice_decode_init(AVCodecContext *ctx)
 337 {
 338     int n, flags, pitch_range, lsp16_flag;
 339     WMAVoiceContext *s = ctx->priv_data;
 340
 341     /**
 342      * Extradata layout:
 343      * - byte  0-18: WMAPro-in-WMAVoice extradata (see wmaprodec.c),
 344      * - byte 19-22: flags field (annoyingly in LE; see below for known
 345      *               values),
 346      * - byte 23-46: variable bitmode tree (really just 17 * 3 bits,
 347      *               rest is 0).
 348      */
 349     if (ctx->extradata_size != 46) {
 350         av_log(ctx, AV_LOG_ERROR,
 351                "Invalid extradata size %d (should be 46)\n",
 352                ctx->extradata_size);
 353         return -1;
 354     }
 355     flags                = AV_RL32(ctx->extradata + 18);
 356     s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align);
 357     s->do_apf            =    flags & 0x1;
 358     if (s->do_apf) {
 359         ff_rdft_init(&s->rdft,  7, DFT_R2C);
 360         ff_rdft_init(&s->irdft, 7, IDFT_C2R);
 361         ff_dct_init(&s->dct,  6, DCT_I);
 362         ff_dct_init(&s->dst,  6, DST_I);
 363
 364         ff_sine_window_init(s->cos, 256);
 365         memcpy(&s->sin[255], s->cos, 256 * sizeof(s->cos[0]));
 366         for (n = 0; n < 255; n++) {
 367             s->sin[n]       = -s->sin[510 - n];
 368             s->cos[510 - n] =  s->cos[n];
 369         }
 370     }
 371     s->denoise_strength  =   (flags >> 2) & 0xF;
 372     if (s->denoise_strength >= 12) {
 373         av_log(ctx, AV_LOG_ERROR,
 374                "Invalid denoise filter strength %d (max=11)\n",
 375                s->denoise_strength);
 376         return -1;
 377     }
 378     s->denoise_tilt_corr = !!(flags & 0x40);
 379     s->dc_level          =   (flags >> 7) & 0xF;
 380     s->lsp_q_mode        = !!(flags & 0x2000);
 381     s->lsp_def_mode      = !!(flags & 0x4000);
 382     lsp16_flag           =    flags & 0x1000;
 383     if (lsp16_flag) {
 384         s->lsps               = 16;
 385         s->frame_lsp_bitsize  = 34;
 386         s->sframe_lsp_bitsize = 60;
 387     } else {
 388         s->lsps               = 10;
 389         s->frame_lsp_bitsize  = 24;
 390         s->sframe_lsp_bitsize = 48;
 391     }
 392     for (n = 0; n < s->lsps; n++)
 393         s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
 394
 395     init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3);
 396     if (decode_vbmtree(&s->gb, s->vbm_tree) < 0) {
 397         av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree; broken extradata?\n");
 398         return -1;
 399     }
 400
 401     s->min_pitch_val    = ((ctx->sample_rate << 8)      /  400 + 50) >> 8;
 402     s->max_pitch_val    = ((ctx->sample_rate << 8) * 37 / 2000 + 50) >> 8;
 403     pitch_range         = s->max_pitch_val - s->min_pitch_val;
 404     s->pitch_nbits      = av_ceil_log2(pitch_range);
 405     s->last_pitch_val   = 40;
 406     s->last_acb_type    = ACB_TYPE_NONE;
 407     s->history_nsamples = s->max_pitch_val + 8;
 408
 409     if (s->min_pitch_val < 1 || s->history_nsamples > MAX_SIGNAL_HISTORY) {
 410         int min_sr = ((((1 << 8) - 50) * 400) + 0xFF) >> 8,
 411             max_sr = ((((MAX_SIGNAL_HISTORY - 8) << 8) + 205) * 2000 / 37) >> 8;
 412
 413         av_log(ctx, AV_LOG_ERROR,
 414                "Unsupported samplerate %d (min=%d, max=%d)\n",
 415                ctx->sample_rate, min_sr, max_sr); // 322-22097 Hz
 416
 417         return -1;
 418     }
 419
 420     s->block_conv_table[0]      = s->min_pitch_val;
 421     s->block_conv_table[1]      = (pitch_range * 25) >> 6;
 422     s->block_conv_table[2]      = (pitch_range * 44) >> 6;
 423     s->block_conv_table[3]      = s->max_pitch_val - 1;
 424     s->block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF;
 425     s->block_delta_pitch_nbits  = 1 + av_ceil_log2(s->block_delta_pitch_hrange);
 426     s->block_pitch_range        = s->block_conv_table[2] +
 427                                   s->block_conv_table[3] + 1 +
 428                                   2 * (s->block_conv_table[1] - 2 * s->min_pitch_val);
 429     s->block_pitch_nbits        = av_ceil_log2(s->block_pitch_range);
 430
 431     ctx->sample_fmt             = AV_SAMPLE_FMT_FLT;
 432
 433     return 0;
 434 }
 435
 436 /**
 437  * @name Postfilter functions
 438  * Postfilter functions (gain control, wiener denoise filter, DC filter,
 439  * kalman smoothening, plus surrounding code to wrap it)
 440  * @{
 441  */
 442 /**
 443  * Adaptive gain control (as used in postfilter).
 444  *
 445  * Identical to #ff_adaptive_gain_control() in acelp_vectors.c, except
 446  * that the energy here is calculated using sum(abs(...)), whereas the
 447  * other codecs (e.g. AMR-NB, SIPRO) use sqrt(dotproduct(...)).
 448  *
 449  * @param out output buffer for filtered samples
 450  * @param in input buffer containing the samples as they are after the
 451  *           postfilter steps so far
 452  * @param speech_synth input buffer containing speech synth before postfilter
 453  * @param size input buffer size
 454  * @param alpha exponential filter factor
 455  * @param gain_mem pointer to filter memory (single float)
 456  */
 457 static void adaptive_gain_control(float *out, const float *in,
 458                                   const float *speech_synth,
 459                                   int size, float alpha, float *gain_mem)
 460 {
 461     int i;
 462     float speech_energy = 0.0, postfilter_energy = 0.0, gain_scale_factor;
 463     float mem = *gain_mem;
 464
 465     for (i = 0; i < size; i++) {
 466         speech_energy     += fabsf(speech_synth[i]);
 467         postfilter_energy += fabsf(in[i]);
 468     }
 469     gain_scale_factor = (1.0 - alpha) * speech_energy / postfilter_energy;
 470
 471     for (i = 0; i < size; i++) {
 472         mem = alpha * mem + gain_scale_factor;
 473         out[i] = in[i] * mem;
 474     }
 475
 476     *gain_mem = mem;
 477 }
 478
 479 /**
 480  * Kalman smoothing function.
 481  *
 482  * This function looks back pitch +/- 3 samples back into history to find
 483  * the best fitting curve (that one giving the optimal gain of the two
 484  * signals, i.e. the highest dot product between the two), and then
 485  * uses that signal history to smoothen the output of the speech synthesis
 486  * filter.
 487  *
 488  * @param s WMA Voice decoding context
 489  * @param pitch pitch of the speech signal
 490  * @param in input speech signal
 491  * @param out output pointer for smoothened signal
 492  * @param size input/output buffer size
 493  *
 494  * @returns -1 if no smoothening took place, e.g. because no optimal
 495  *          fit could be found, or 0 on success.
 496  */
 497 static int kalman_smoothen(WMAVoiceContext *s, int pitch,
 498                            const float *in, float *out, int size)
 499 {
 500     int n;
 501     float optimal_gain = 0, dot;
 502     const float *ptr = &in[-FFMAX(s->min_pitch_val, pitch - 3)],
 503                 *end = &in[-FFMIN(s->max_pitch_val, pitch + 3)],
 504                 *best_hist_ptr;
 505
 506     /* find best fitting point in history */
 507     do {
 508         dot = ff_dot_productf(in, ptr, size);
 509         if (dot > optimal_gain) {
 510             optimal_gain  = dot;
 511             best_hist_ptr = ptr;
 512         }
 513     } while (--ptr >= end);
 514
 515     if (optimal_gain <= 0)
 516         return -1;
 517     dot = ff_dot_productf(best_hist_ptr, best_hist_ptr, size);
 518     if (dot <= 0) // would be 1.0
 519         return -1;
 520
 521     if (optimal_gain <= dot) {
 522         dot = dot / (dot + 0.6 * optimal_gain); // 0.625-1.000
 523     } else
 524         dot = 0.625;
 525
 526     /* actual smoothing */
 527     for (n = 0; n < size; n++)
 528         out[n] = best_hist_ptr[n] + dot * (in[n] - best_hist_ptr[n]);
 529
 530     return 0;
 531 }
 532
 533 /**
 534  * Get the tilt factor of a formant filter from its transfer function
 535  * @see #tilt_factor() in amrnbdec.c, which does essentially the same,
 536  *      but somehow (??) it does a speech synthesis filter in the
 537  *      middle, which is missing here
 538  *
 539  * @param lpcs LPC coefficients
 540  * @param n_lpcs Size of LPC buffer
 541  * @returns the tilt factor
 542  */
 543 static float tilt_factor(const float *lpcs, int n_lpcs)
 544 {
 545     float rh0, rh1;
 546
 547     rh0 = 1.0     + ff_dot_productf(lpcs,  lpcs,    n_lpcs);
 548     rh1 = lpcs[0] + ff_dot_productf(lpcs, &lpcs[1], n_lpcs - 1);
 549
 550     return rh1 / rh0;
 551 }
 552
  553 /**
  554  * Derive denoise filter coefficients (in real domain) from the LPCs.
       *
       * @param s WMA Voice decoding context; the RDFT/IRDFT/DCT/DST contexts,
       *          the sin/cos tables, the denoise strength and the tilt
       *          correction flag are read from it
       * @param lpcs in: LPC buffer to take the spectrum of; indices 0-127 are
       *             accessed and the buffer is overwritten (it is reused as
       *             scratch space for every intermediate step)
       * @param fcb_type fixed codebook type of the frame; only decides which
       *                 gain multiplier is used (#FCB_TYPE_HARDCODED or other)
       * @param coeffs out: filter coefficients; indices 0-127 are written,
       *               those from index remainder onwards are zeroed
       * @param remainder number of leading coefficients that are kept and
       *                  normalized; not range-checked here, and the tilt
       *                  correction path writes coeffs[remainder - 1], so it
       *                  is assumed to be at least 1
  555  */
  556 static void calc_input_response(WMAVoiceContext *s, float *lpcs,
  557                                 int fcb_type, float *coeffs, int remainder)
  558 {
  559     float last_coeff, min = 15.0, max = -15.0;
  560     float irange, angle_mul, gain_mul, range, sq;
  561     int n, idx;
  562
  563     /* Create frequency power spectrum of speech input (i.e. RDFT of LPCs) */
  564     s->rdft.rdft_calc(&s->rdft, lpcs);
          /* log_range(var, x): var = log10(x), and widen [min, max] to hold it */
  565 #define log_range(var, assign) do { \
  566         float tmp = log10f(assign);  var = tmp; \
  567         max       = FFMAX(max, tmp); min = FFMIN(min, tmp); \
  568     } while (0)
          /* The log-powers are written back into lpcs[] in place. lpcs[1] is
           * consumed first (into last_coeff) and lpcs[0] last, because the loop
           * overwrites lpcs[n] while still reading lpcs[2n] and lpcs[2n + 1].
           * lpcs[0] and lpcs[1] are squared on their own - presumably the packed
           * real-only DC and Nyquist terms; confirm against the RDFT layout. */
  569     log_range(last_coeff,  lpcs[1]         * lpcs[1]);
  570     for (n = 1; n < 64; n++)
  571         log_range(lpcs[n], lpcs[n * 2]     * lpcs[n * 2] +
  572                            lpcs[n * 2 + 1] * lpcs[n * 2 + 1]);
  573     log_range(lpcs[0],     lpcs[0]         * lpcs[0]);
  574 #undef log_range
  575     range    = max - min;
  576     lpcs[64] = last_coeff;
  577
  578     /* Now, use this spectrum to pick out these frequencies with higher
  579      * (relative) power/energy (which we then take to be "not noise"),
  580      * and set up a table (still in lpc[]) of (relative) gains per frequency.
  581      * These frequencies will be maintained, while others ("noise") will be
  582      * decreased in the filter output. */
          /* NOTE(review): range is 0 when all 65 log-powers are equal, and a
           * zero power gives log10f() = -inf; neither case is guarded here. */
  583     irange    = 64.0 / range; // so irange*(max-value) is in the range [0, 63]
  584     gain_mul  = range * (fcb_type == FCB_TYPE_HARDCODED ? (5.0 / 13.0) :
  585                                                           (5.0 / 14.7));
  586     angle_mul = gain_mul * (8.0 * M_LN10 / M_PI);
  587     for (n = 0; n <= 64; n++) {
  588         float pwr;
  589
              /* table column: distance below the spectral peak, clamped at 0 */
  590         idx = FFMAX(0, lrint((max - lpcs[n]) * irange) - 1);
  591         pwr = wmavoice_denoise_power_table[s->denoise_strength][idx];
  592         lpcs[n] = angle_mul * pwr;
  593
  594         /* 70.57 =~ 1/log10(1.0331663) */
  595         idx = (pwr * gain_mul - 0.0295) * 70.570526123;
  596         if (idx > 127) { // fallback if index falls outside table range
  597             coeffs[n] = wmavoice_energy_table[127] *
  598                         powf(1.0331663, idx - 127);
  599         } else
  600             coeffs[n] = wmavoice_energy_table[FFMAX(0, idx)];
  601     }
  602
  603     /* calculate the Hilbert transform of the gains, which we do (since this
  604      * is a sinus input) by doing a phase shift (in theory, H(sin())=cos()).
  605      * Hilbert_Transform(RDFT(x)) = Laplace_Transform(x), which calculates the
  606      * "moment" of the LPCs in this filter. */
  607     s->dct.dct_calc(&s->dct, lpcs);
  608     s->dst.dct_calc(&s->dst, lpcs);
  609
  610     /* Split out the coefficient indexes into phase/magnitude pairs */
          /* coeffs[n] (n = 0..64) currently holds one magnitude per bin and is
           * expanded in place to interleaved pairs at coeffs[2n], coeffs[2n + 1].
           * Bin 0 is scaled where it is; bin 64 is saved to last_coeff up front
           * because expanding bin 32 overwrites coeffs[64]. */
  611     idx = 255 + av_clip(lpcs[64],               -255, 255);
  612     coeffs[0]  = coeffs[0]  * s->cos[idx];
  613     idx = 255 + av_clip(lpcs[64] - 2 * lpcs[63], -255, 255);
  614     last_coeff = coeffs[64] * s->cos[idx];
          /* Walk n downwards so each source coeffs[n] is read before a higher
           * bin's pair can overwrite it. Two bins are handled per pass with
           * the sign of lpcs[64] alternating; the loop leaves through the
           * break after bin 1 has been expanded. */
  615     for (n = 63;; n--) {
  616         idx = 255 + av_clip(-lpcs[64] - 2 * lpcs[n - 1], -255, 255);
  617         coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
  618         coeffs[n * 2]     = coeffs[n] * s->cos[idx];
  619
  620         if (!--n) break;
  621
  622         idx = 255 + av_clip( lpcs[64] - 2 * lpcs[n - 1], -255, 255);
  623         coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
  624         coeffs[n * 2]     = coeffs[n] * s->cos[idx];
  625     }
          /* slot 1 receives the saved bin-64 value */
  626     coeffs[1] = last_coeff;
  627
  628     /* move into real domain */
  629     s->irdft.rdft_calc(&s->irdft, coeffs);
  630
  631     /* tilt correction and normalize scale */
          /* everything from index remainder up to 127 is discarded */
  632     memset(&coeffs[remainder], 0, sizeof(coeffs[0]) * (128 - remainder));
  633     if (s->denoise_tilt_corr) {
  634         float tilt_mem = 0;
  635
  636         coeffs[remainder - 1] = 0;
  637         ff_tilt_compensation(&tilt_mem,
  638                              -1.8 * tilt_factor(coeffs, remainder - 1),
  639                              coeffs, remainder);
  640     }
          /* scale the kept coefficients to an energy of (1/64)^2;
           * NOTE(review): an all-zero coeffs[] makes this a division by zero */
  641     sq = (1.0 / 64.0) * sqrtf(1 / ff_dot_productf(coeffs, coeffs, remainder));
  642     for (n = 0; n < remainder; n++)
  643         coeffs[n] *= sq;
  644 }
 645
 646 /**
 647  * This function applies a Wiener filter on the (noisy) speech signal as
 648  * a means to denoise it.
 649  *
 650  * - take RDFT of LPCs to get the power spectrum of the noise + speech;
 651  * - using this power spectrum, calculate (for each frequency) the Wiener
 652  *    filter gain, which depends on the frequency power and desired level
 653  *    of noise subtraction (when set too high, this leads to artifacts)
 654  *    We can do this symmetrically over the X-axis (so 0-4kHz is the inverse
 655  *    of 4-8kHz);
 656  * - by doing a phase shift, calculate the Hilbert transform of this array
 657  *    of per-frequency filter-gains to get the filtering coefficients;
 658  * - smoothen/normalize/de-tilt these filter coefficients as desired;
 659  * - take RDFT of noisy sound, apply the coefficients and take its IRDFT
 660  *    to get the denoised speech signal;
 661  * - the leftover (i.e. output of the IRDFT on denoised speech data beyond
 662  *    the frame boundary) are saved and applied to subsequent frames by an
 663  *    overlap-add method (otherwise you get clicking-artifacts).
 664  *
 665  * @param s WMA Voice decoding context
 666  * @param fcb_type Frame (codebook) type
 667  * @param synth_pf input: the noisy speech signal, output: denoised speech
 668  *                 data; should be 16-byte aligned (for ASM purposes)
 669  * @param size size of the speech data
 670  * @param lpcs LPCs used to synthesize this frame's speech data
 671  */
 672 static void wiener_denoise(WMAVoiceContext *s, int fcb_type,
 673                            float *synth_pf, int size,
 674                            const float *lpcs)
 675 {
 676     int remainder, lim, n;
 677
 678     if (fcb_type != FCB_TYPE_SILENCE) {
 679         float *tilted_lpcs = s->tilted_lpcs_pf,
 680               *coeffs = s->denoise_coeffs_pf, tilt_mem = 0;
 681
 682         tilted_lpcs[0]           = 1.0;
 683         memcpy(&tilted_lpcs[1], lpcs, sizeof(lpcs[0]) * s->lsps);
 684         memset(&tilted_lpcs[s->lsps + 1], 0,
 685                sizeof(tilted_lpcs[0]) * (128 - s->lsps - 1));
 686         ff_tilt_compensation(&tilt_mem, 0.7 * tilt_factor(lpcs, s->lsps),
 687                              tilted_lpcs, s->lsps + 2);
 688
 689         /* The IRDFT output (127 samples for 7-bit filter) beyond the frame
 690          * size is applied to the next frame. All input beyond this is zero,
 691          * and thus all output beyond this will go towards zero, hence we can
 692          * limit to min(size-1, 127-size) as a performance consideration. */
 693         remainder = FFMIN(127 - size, size - 1);
 694         calc_input_response(s, tilted_lpcs, fcb_type, coeffs, remainder);
 695
 696         /* apply coefficients (in frequency spectrum domain), i.e. complex
 697          * number multiplication */
 698         memset(&synth_pf[size], 0, sizeof(synth_pf[0]) * (128 - size));
 699         s->rdft.rdft_calc(&s->rdft, synth_pf);
 700         s->rdft.rdft_calc(&s->rdft, coeffs);
 701         synth_pf[0] *= coeffs[0];
 702         synth_pf[1] *= coeffs[1];
 703         for (n = 1; n < 64; n++) {
 704             float v1 = synth_pf[n * 2], v2 = synth_pf[n * 2 + 1];
 705             synth_pf[n * 2]     = v1 * coeffs[n * 2] - v2 * coeffs[n * 2 + 1];
 706             synth_pf[n * 2 + 1] = v2 * coeffs[n * 2] + v1 * coeffs[n * 2 + 1];
 707         }
 708         s->irdft.rdft_calc(&s->irdft, synth_pf);
 709     }
 710
 711     /* merge filter output with the history of previous runs */
 712     if (s->denoise_filter_cache_size) {
 713         lim = FFMIN(s->denoise_filter_cache_size, size);
 714         for (n = 0; n < lim; n++)
 715             synth_pf[n] += s->denoise_filter_cache[n];
 716         s->denoise_filter_cache_size -= lim;
 717         memmove(s->denoise_filter_cache, &s->denoise_filter_cache[size],
 718                 sizeof(s->denoise_filter_cache[0]) * s->denoise_filter_cache_size);
 719     }
 720
 721     /* move remainder of filter output into a cache for future runs */
 722     if (fcb_type != FCB_TYPE_SILENCE) {
 723         lim = FFMIN(remainder, s->denoise_filter_cache_size);
 724         for (n = 0; n < lim; n++)
 725             s->denoise_filter_cache[n] += synth_pf[size + n];
 726         if (lim < remainder) {
 727             memcpy(&s->denoise_filter_cache[lim], &synth_pf[size + lim],
 728                    sizeof(s->denoise_filter_cache[0]) * (remainder - lim));
 729             s->denoise_filter_cache_size = remainder;
 730         }
 731     }
 732 }
 733
 734 /**
 735  * Averaging projection filter, the postfilter used in WMAVoice.
 736  *
 737  * This uses the following steps:
 738  * - A zero-synthesis filter (generate excitation from synth signal)
 739  * - Kalman smoothing on excitation, based on pitch
 740  * - Re-synthesized smoothened output
 741  * - Iterative Wiener denoise filter
 742  * - Adaptive gain filter
 743  * - DC filter
 744  *
 745  * @param s WMAVoice decoding context
 746  * @param synth Speech synthesis output (before postfilter)
 747  * @param samples Output buffer for filtered samples
 748  * @param size Buffer size of synth & samples
 749  * @param lpcs Generated LPCs used for speech synthesis
 750  * @param zero_exc_pf destination for zero synthesis filter (16-byte aligned)
 751  * @param fcb_type Frame type (silence, hardcoded, AW-pulses or FCB-pulses)
 752  * @param pitch Pitch of the input signal
 753  */
 754 static void postfilter(WMAVoiceContext *s, const float *synth,
 755                        float *samples,    int size,
 756                        const float *lpcs, float *zero_exc_pf,
 757                        int fcb_type,      int pitch)
 758 {
 759     float synth_filter_in_buf[MAX_FRAMESIZE / 2],
 760           *synth_pf = &s->synth_filter_out_buf[MAX_LSPS_ALIGN16],
 761           *synth_filter_in = zero_exc_pf;
 762
 763     assert(size <= MAX_FRAMESIZE / 2);
 764
 765     /* generate excitation from input signal */
 766     ff_celp_lp_zero_synthesis_filterf(zero_exc_pf, lpcs, synth, size, s->lsps);
 767
 768     if (fcb_type >= FCB_TYPE_AW_PULSES &&
 769         !kalman_smoothen(s, pitch, zero_exc_pf, synth_filter_in_buf, size))
 770         synth_filter_in = synth_filter_in_buf;
 771
 772     /* re-synthesize speech after smoothening, and keep history */
 773     ff_celp_lp_synthesis_filterf(synth_pf, lpcs,
 774                                  synth_filter_in, size, s->lsps);
 775     memcpy(&synth_pf[-s->lsps], &synth_pf[size - s->lsps],
 776            sizeof(synth_pf[0]) * s->lsps);
 777
 778     wiener_denoise(s, fcb_type, synth_pf, size, lpcs);
 779
 780     adaptive_gain_control(samples, synth_pf, synth, size, 0.99,
 781                           &s->postfilter_agc);
 782
 783     if (s->dc_level > 8) {
 784         /* remove ultra-low frequency DC noise / highpass filter;
 785          * coefficients are identical to those used in SIPR decoding,
 786          * and very closely resemble those used in AMR-NB decoding. */
 787         ff_acelp_apply_order_2_transfer_function(samples, samples,
 788             (const float[2]) { -1.99997,      1.0 },
 789             (const float[2]) { -1.9330735188, 0.93589198496 },
 790             0.93980580475, s->dcf_mem, size);
 791     }
 792 }
 793 /**
 794  * @}
 795  */
 796
 797 /**
 798  * Dequantize LSPs
 799  * @param lsps output pointer to the array that will hold the LSPs
 800  * @param num number of LSPs to be dequantized
 801  * @param values quantized values, contains n_stages values
 802  * @param sizes range (i.e. max value) of each quantized value
 803  * @param n_stages number of dequantization runs
 804  * @param table dequantization table to be used
 805  * @param mul_q LSF multiplier
 806  * @param base_q base (lowest) LSF values
 807  */
 808 static void dequant_lsps(double *lsps, int num,
 809                          const uint16_t *values,
 810                          const uint16_t *sizes,
 811                          int n_stages, const uint8_t *table,
 812                          const double *mul_q,
 813                          const double *base_q)
 814 {
 815     int n, m;
 816
 817     memset(lsps, 0, num * sizeof(*lsps));
 818     for (n = 0; n < n_stages; n++) {
 819         const uint8_t *t_off = &table[values[n] * num];
 820         double base = base_q[n], mul = mul_q[n];
 821
 822         for (m = 0; m < num; m++)
 823             lsps[m] += base + mul * t_off[m];
 824
 825         table += sizes[n] * num;
 826     }
 827 }
 828
 829 /**
 830  * @name LSP dequantization routines
 831  * LSP dequantization routines, for 10/16LSPs and independent/residual coding.
 832  * @note we assume enough bits are available, caller should check.
 833  * lsp10i() consumes 24 bits; lsp10r() consumes an additional 24 bits;
 834  * lsp16i() consumes 34 bits; lsp16r() consumes an additional 26 bits.
 835  * @{
 836  */
 837 /**
 838  * Parse 10 independently-coded LSPs.
 839  */
 840 static void dequant_lsp10i(GetBitContext *gb, double *lsps)
 841 {
 842     static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
 843     static const double mul_lsf[4] = {
 844         5.2187144800e-3,    1.4626986422e-3,
 845         9.6179549166e-4,    1.1325736225e-3
 846     };
 847     static const double base_lsf[4] = {
 848         M_PI * -2.15522e-1, M_PI * -6.1646e-2,
 849         M_PI * -3.3486e-2,  M_PI * -5.7408e-2
 850     };
 851     uint16_t v[4];
 852
 853     v[0] = get_bits(gb, 8);
 854     v[1] = get_bits(gb, 6);
 855     v[2] = get_bits(gb, 5);
 856     v[3] = get_bits(gb, 5);
 857
 858     dequant_lsps(lsps, 10, v, vec_sizes, 4, wmavoice_dq_lsp10i,
 859                  mul_lsf, base_lsf);
 860 }
 861
 862 /**
 863  * Parse 10 independently-coded LSPs, and then derive the tables to
 864  * generate LSPs for the other frames from them (residual coding).
 865  */
 866 static void dequant_lsp10r(GetBitContext *gb,
 867                            double *i_lsps, const double *old,
 868                            double *a1, double *a2, int q_mode)
 869 {
 870     static const uint16_t vec_sizes[3] = { 128, 64, 64 };
 871     static const double mul_lsf[3] = {
 872         2.5807601174e-3,    1.2354460219e-3,   1.1763821673e-3
 873     };
 874     static const double base_lsf[3] = {
 875         M_PI * -1.07448e-1, M_PI * -5.2706e-2, M_PI * -5.1634e-2
 876     };
 877     const float (*ipol_tab)[2][10] = q_mode ?
 878         wmavoice_lsp10_intercoeff_b : wmavoice_lsp10_intercoeff_a;
 879     uint16_t interpol, v[3];
 880     int n;
 881
 882     dequant_lsp10i(gb, i_lsps);
 883
 884     interpol = get_bits(gb, 5);
 885     v[0]     = get_bits(gb, 7);
 886     v[1]     = get_bits(gb, 6);
 887     v[2]     = get_bits(gb, 6);
 888
 889     for (n = 0; n < 10; n++) {
 890         double delta = old[n] - i_lsps[n];
 891         a1[n]        = ipol_tab[interpol][0][n] * delta + i_lsps[n];
 892         a1[10 + n]   = ipol_tab[interpol][1][n] * delta + i_lsps[n];
 893     }
 894
 895     dequant_lsps(a2, 20, v, vec_sizes, 3, wmavoice_dq_lsp10r,
 896                  mul_lsf, base_lsf);
 897 }
 898
 899 /**
 900  * Parse 16 independently-coded LSPs.
 901  */
 902 static void dequant_lsp16i(GetBitContext *gb, double *lsps)
 903 {
 904     static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 };
 905     static const double mul_lsf[5] = {
 906         3.3439586280e-3,    6.9908173703e-4,
 907         3.3216608306e-3,    1.0334960326e-3,
 908         3.1899104283e-3
 909     };
 910     static const double base_lsf[5] = {
 911         M_PI * -1.27576e-1, M_PI * -2.4292e-2,
 912         M_PI * -1.28094e-1, M_PI * -3.2128e-2,
 913         M_PI * -1.29816e-1
 914     };
 915     uint16_t v[5];
 916
 917     v[0] = get_bits(gb, 8);
 918     v[1] = get_bits(gb, 6);
 919     v[2] = get_bits(gb, 7);
 920     v[3] = get_bits(gb, 6);
 921     v[4] = get_bits(gb, 7);
 922
 923     dequant_lsps( lsps,     5,  v,     vec_sizes,    2,
 924                  wmavoice_dq_lsp16i1,  mul_lsf,     base_lsf);
 925     dequant_lsps(&lsps[5],  5, &v[2], &vec_sizes[2], 2,
 926                  wmavoice_dq_lsp16i2, &mul_lsf[2], &base_lsf[2]);
 927     dequant_lsps(&lsps[10], 6, &v[4], &vec_sizes[4], 1,
 928                  wmavoice_dq_lsp16i3, &mul_lsf[4], &base_lsf[4]);
 929 }
 930
 931 /**
 932  * Parse 16 independently-coded LSPs, and then derive the tables to
 933  * generate LSPs for the other frames from them (residual coding).
 934  */
 935 static void dequant_lsp16r(GetBitContext *gb,
 936                            double *i_lsps, const double *old,
 937                            double *a1, double *a2, int q_mode)
 938 {
 939     static const uint16_t vec_sizes[3] = { 128, 128, 128 };
 940     static const double mul_lsf[3] = {
 941         1.2232979501e-3,   1.4062241527e-3,   1.6114744851e-3
 942     };
 943     static const double base_lsf[3] = {
 944         M_PI * -5.5830e-2, M_PI * -5.2908e-2, M_PI * -5.4776e-2
 945     };
 946     const float (*ipol_tab)[2][16] = q_mode ?
 947         wmavoice_lsp16_intercoeff_b : wmavoice_lsp16_intercoeff_a;
 948     uint16_t interpol, v[3];
 949     int n;
 950
 951     dequant_lsp16i(gb, i_lsps);
 952
 953     interpol = get_bits(gb, 5);
 954     v[0]     = get_bits(gb, 7);
 955     v[1]     = get_bits(gb, 7);
 956     v[2]     = get_bits(gb, 7);
 957
 958     for (n = 0; n < 16; n++) {
 959         double delta = old[n] - i_lsps[n];
 960         a1[n]        = ipol_tab[interpol][0][n] * delta + i_lsps[n];
 961         a1[16 + n]   = ipol_tab[interpol][1][n] * delta + i_lsps[n];
 962     }
 963
 964     dequant_lsps( a2,     10,  v,     vec_sizes,    1,
 965                  wmavoice_dq_lsp16r1,  mul_lsf,     base_lsf);
 966     dequant_lsps(&a2[10], 10, &v[1], &vec_sizes[1], 1,
 967                  wmavoice_dq_lsp16r2, &mul_lsf[1], &base_lsf[1]);
 968     dequant_lsps(&a2[20], 12, &v[2], &vec_sizes[2], 1,
 969                  wmavoice_dq_lsp16r3, &mul_lsf[2], &base_lsf[2]);
 970 }
 971
 972 /**
 973  * @}
 974  * @name Pitch-adaptive window coding functions
 975  * The next few functions are for pitch-adaptive window coding.
 976  * @{
 977  */
 978 /**
 979  * Parse the offset of the first pitch-adaptive window pulses, and
 980  * the distribution of pulses between the two blocks in this frame.
 981  * @param s WMA Voice decoding context private data
 982  * @param gb bit I/O context
 983  * @param pitch pitch for each block in this frame
 984  */
 985 static void aw_parse_coords(WMAVoiceContext *s, GetBitContext *gb,
 986                             const int *pitch)
 987 {
 988     static const int16_t start_offset[94] = {
 989         -11,  -9,  -7,  -5,  -3,  -1,   1,   3,   5,   7,   9,  11,
 990          13,  15,  18,  17,  19,  20,  21,  22,  23,  24,  25,  26,
 991          27,  28,  29,  30,  31,  32,  33,  35,  37,  39,  41,  43,
 992          45,  47,  49,  51,  53,  55,  57,  59,  61,  63,  65,  67,
 993          69,  71,  73,  75,  77,  79,  81,  83,  85,  87,  89,  91,
 994          93,  95,  97,  99, 101, 103, 105, 107, 109, 111, 113, 115,
 995         117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139,
 996         141, 143, 145, 147, 149, 151, 153, 155, 157, 159
 997     };
 998     int bits, offset;
 999
1000     /* position of pulse */
1001     s->aw_idx_is_ext = 0;
1002     if ((bits = get_bits(gb, 6)) >= 54) {
1003         s->aw_idx_is_ext = 1;
1004         bits += (bits - 54) * 3 + get_bits(gb, 2);
1005     }
1006
1007     /* for a repeated pulse at pulse_off with a pitch_lag of pitch[], count
1008      * the distribution of the pulses in each block contained in this frame. */
1009     s->aw_pulse_range        = FFMIN(pitch[0], pitch[1]) > 32 ? 24 : 16;
1010     for (offset = start_offset[bits]; offset < 0; offset += pitch[0]) ;
1011     s->aw_n_pulses[0]        = (pitch[0] - 1 + MAX_FRAMESIZE / 2 - offset) / pitch[0];
1012     s->aw_first_pulse_off[0] = offset - s->aw_pulse_range / 2;
1013     offset                  += s->aw_n_pulses[0] * pitch[0];
1014     s->aw_n_pulses[1]        = (pitch[1] - 1 + MAX_FRAMESIZE - offset) / pitch[1];
1015     s->aw_first_pulse_off[1] = offset - (MAX_FRAMESIZE + s->aw_pulse_range) / 2;
1016
1017     /* if continuing from a position before the block, reset position to
1018      * start of block (when corrected for the range over which it can be
1019      * spread in aw_pulse_set1()). */
1020     if (start_offset[bits] < MAX_FRAMESIZE / 2) {
1021         while (s->aw_first_pulse_off[1] - pitch[1] + s->aw_pulse_range > 0)
1022             s->aw_first_pulse_off[1] -= pitch[1];
1023         if (start_offset[bits] < 0)
1024             while (s->aw_first_pulse_off[0] - pitch[0] + s->aw_pulse_range > 0)
1025                 s->aw_first_pulse_off[0] -= pitch[0];
1026     }
1027 }
1028
1029 /**
1030  * Apply second set of pitch-adaptive window pulses.
1031  * @param s WMA Voice decoding context private data
1032  * @param gb bit I/O context
1033  * @param block_idx block index in frame [0, 1]
1034  * @param fcb structure containing fixed codebook vector info
1035  */
1036 static void aw_pulse_set2(WMAVoiceContext *s, GetBitContext *gb,
1037                           int block_idx, AMRFixed *fcb)
1038 {
1039     uint16_t use_mask_mem[9]; // only 5 are used, rest is padding
1040     uint16_t *use_mask = use_mask_mem + 2;
1041     /* in this function, idx is the index in the 80-bit (+ padding) use_mask
1042      * bit-array. Since use_mask consists of 16-bit values, the lower 4 bits
1043      * of idx are the position of the bit within a particular item in the
1044      * array (0 being the most significant bit, and 15 being the least
1045      * significant bit), and the remainder (>> 4) is the index in the
1046      * use_mask[]-array. This is faster and uses less memory than using a
1047      * 80-byte/80-int array. */
1048     int pulse_off = s->aw_first_pulse_off[block_idx],
1049         pulse_start, n, idx, range, aidx, start_off = 0;
1050
1051     /* set offset of first pulse to within this block */
1052     if (s->aw_n_pulses[block_idx] > 0)
1053         while (pulse_off + s->aw_pulse_range < 1)
1054             pulse_off += fcb->pitch_lag;
1055
1056     /* find range per pulse */
1057     if (s->aw_n_pulses[0] > 0) {
1058         if (block_idx == 0) {
1059             range = 32;
1060         } else /* block_idx = 1 */ {
1061             range = 8;
1062             if (s->aw_n_pulses[block_idx] > 0)
1063                 pulse_off = s->aw_next_pulse_off_cache;
1064         }
1065     } else
1066         range = 16;
1067     pulse_start = s->aw_n_pulses[block_idx] > 0 ? pulse_off - range / 2 : 0;
1068
1069     /* aw_pulse_set1() already applies pulses around pulse_off (to be exactly,
1070      * in the range of [pulse_off, pulse_off + s->aw_pulse_range], and thus
1071      * we exclude that range from being pulsed again in this function. */
1072     memset(&use_mask[-2], 0, 2 * sizeof(use_mask[0]));
1073     memset( use_mask,   -1, 5 * sizeof(use_mask[0]));
1074     memset(&use_mask[5], 0, 2 * sizeof(use_mask[0]));
1075     if (s->aw_n_pulses[block_idx] > 0)
1076         for (idx = pulse_off; idx < MAX_FRAMESIZE / 2; idx += fcb->pitch_lag) {
1077             int excl_range         = s->aw_pulse_range; // always 16 or 24
1078             uint16_t *use_mask_ptr = &use_mask[idx >> 4];
1079             int first_sh           = 16 - (idx & 15);
1080             *use_mask_ptr++       &= 0xFFFF << first_sh;
1081             excl_range            -= first_sh;
1082             if (excl_range >= 16) {
1083                 *use_mask_ptr++    = 0;
1084                 *use_mask_ptr     &= 0xFFFF >> (excl_range - 16);
1085             } else
1086                 *use_mask_ptr     &= 0xFFFF >> excl_range;
1087         }
1088
1089     /* find the 'aidx'th offset that is not excluded */
1090     aidx = get_bits(gb, s->aw_n_pulses[0] > 0 ? 5 - 2 * block_idx : 4);
1091     for (n = 0; n <= aidx; pulse_start++) {
1092         for (idx = pulse_start; idx < 0; idx += fcb->pitch_lag) ;
1093         if (idx >= MAX_FRAMESIZE / 2) { // find from zero
1094             if (use_mask[0])      idx = 0x0F;
1095             else if (use_mask[1]) idx = 0x1F;
1096             else if (use_mask[2]) idx = 0x2F;
1097             else if (use_mask[3]) idx = 0x3F;
1098             else if (use_mask[4]) idx = 0x4F;
1099             else                  return;
1100             idx -= av_log2_16bit(use_mask[idx >> 4]);
1101         }
1102         if (use_mask[idx >> 4] & (0x8000 >> (idx & 15))) {
1103             use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15));
1104             n++;
1105             start_off = idx;
1106         }
1107     }
1108
1109     fcb->x[fcb->n] = start_off;
1110     fcb->y[fcb->n] = get_bits1(gb) ? -1.0 : 1.0;
1111     fcb->n++;
1112
1113     /* set offset for next block, relative to start of that block */
1114     n = (MAX_FRAMESIZE / 2 - start_off) % fcb->pitch_lag;
1115     s->aw_next_pulse_off_cache = n ? fcb->pitch_lag - n : 0;
1116 }
1117
1118 /**
1119  * Apply first set of pitch-adaptive window pulses.
1120  * @param s WMA Voice decoding context private data
1121  * @param gb bit I/O context
1122  * @param block_idx block index in frame [0, 1]
1123  * @param fcb storage location for fixed codebook pulse info
1124  */
1125 static void aw_pulse_set1(WMAVoiceContext *s, GetBitContext *gb,
1126                           int block_idx, AMRFixed *fcb)
1127 {
1128     int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
1129     float v;
1130
1131     if (s->aw_n_pulses[block_idx] > 0) {
1132         int n, v_mask, i_mask, sh, n_pulses;
1133
1134         if (s->aw_pulse_range == 24) { // 3 pulses, 1:sign + 3:index each
1135             n_pulses = 3;
1136             v_mask   = 8;
1137             i_mask   = 7;
1138             sh       = 4;
1139         } else { // 4 pulses, 1:sign + 2:index each
1140             n_pulses = 4;
1141             v_mask   = 4;
1142             i_mask   = 3;
1143             sh       = 3;
1144         }
1145
1146         for (n = n_pulses - 1; n >= 0; n--, val >>= sh) {
1147             fcb->y[fcb->n] = (val & v_mask) ? -1.0 : 1.0;
1148             fcb->x[fcb->n] = (val & i_mask) * n_pulses + n +
1149                                  s->aw_first_pulse_off[block_idx];
1150             while (fcb->x[fcb->n] < 0)
1151                 fcb->x[fcb->n] += fcb->pitch_lag;
1152             if (fcb->x[fcb->n] < MAX_FRAMESIZE / 2)
1153                 fcb->n++;
1154         }
1155     } else {
1156         int num2 = (val & 0x1FF) >> 1, delta, idx;
1157
1158         if (num2 < 1 * 79)      { delta = 1; idx = num2 + 1; }
1159         else if (num2 < 2 * 78) { delta = 3; idx = num2 + 1 - 1 * 77; }
1160         else if (num2 < 3 * 77) { delta = 5; idx = num2 + 1 - 2 * 76; }
1161         else                    { delta = 7; idx = num2 + 1 - 3 * 75; }
1162         v = (val & 0x200) ? -1.0 : 1.0;
1163
1164         fcb->no_repeat_mask |= 3 << fcb->n;
1165         fcb->x[fcb->n]       = idx - delta;
1166         fcb->y[fcb->n]       = v;
1167         fcb->x[fcb->n + 1]   = idx;
1168         fcb->y[fcb->n + 1]   = (val & 1) ? -v : v;
1169         fcb->n              += 2;
1170     }
1171 }
1172
/**
 * @}
 *
 * Generate a pseudo-random number from frame_cntr and block_num, which
 * will lie in the range [0, 1000 - block_size) (so it can be used as a
 * start index in a table of size 1000 of which you want to read
 * block_size entries).
 *
 * @param frame_cntr current frame number
 * @param block_num current block index
 * @param block_size amount of entries we want to read from a table
 *                   that has 1000 entries
 * @return a (non-)random number in the [0, 1000 - block_size) range.
 */
static int pRNG(int frame_cntr, int block_num, int block_size)
{
    /* lookup table to simplify the calculation of quot:
     * rem9 = seed % 9, divisor = rem9 * 5 + 6;
     * quot = (49995 * seed) / divisor;
     * As the divisor only takes 9 values, the division can be replaced by
     * a LUT and FASTDIV-style divisions. For each of the 9 divisors,
     * quot can be rewritten as:
     * quot = seed * (49995 / divisor) + seed * ((49995 % divisor) / divisor)
     * Each row of the table represents one possible divisor; the first
     * number is 49995 / divisor, and the second is the FASTDIV variant of
     * 49995 % divisor / divisor. */
    static const unsigned int div_tbl[9][2] = {
        { 8332,  3 * 715827883U }, // divisor =  6
        { 4545,  0 * 390451573U }, // divisor = 11
        { 3124, 11 * 268435456U }, // divisor = 16
        { 2380, 15 * 204522253U }, // divisor = 21
        { 1922, 23 * 165191050U }, // divisor = 26
        { 1612, 23 * 138547333U }, // divisor = 31
        { 1388, 27 * 119304648U }, // divisor = 36
        { 1219, 16 * 104755300U }, // divisor = 41
        { 1086, 39 *  93368855U }  // divisor = 46
    };
    unsigned int seed, rem9, quot;

    seed = MUL16(block_num, 1877) + frame_cntr;
    /* max value of seed is 8*1877+0xFFFE=0x13AA6, so a single subtraction
     * is effectively a modulo (%) */
    if (seed >= 0xFFFF)
        seed -= 0xFFFF;

    /* seed % 9 */
    rem9 = seed - 9 * MULH(477218589, seed);

    /* seed * 49995 / (rem9 * 5 + 6), truncated to 16 bits */
    quot = (uint16_t) (seed * div_tbl[rem9][0] + UMULH(seed, div_tbl[rem9][1]));

    return quot % (1000 - block_size);
}
1217
1218 /**
1219  * Parse hardcoded signal for a single block.
1220  * @note see #synth_block().
1221  */
1222 static void synth_block_hardcoded(WMAVoiceContext *s, GetBitContext *gb,
1223                                  int block_idx, int size,
1224                                  const struct frame_type_desc *frame_desc,
1225                                  float *excitation)
1226 {
1227     float gain;
1228     int n, r_idx;
1229
1230     assert(size <= MAX_FRAMESIZE);
1231
1232     /* Set the offset from which we start reading wmavoice_std_codebook */
1233     if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1234         r_idx = pRNG(s->frame_cntr, block_idx, size);
1235         gain  = s->silence_gain;
1236     } else /* FCB_TYPE_HARDCODED */ {
1237         r_idx = get_bits(gb, 8);
1238         gain  = wmavoice_gain_universal[get_bits(gb, 6)];
1239     }
1240
1241     /* Clear gain prediction parameters */
1242     memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err));
1243
1244     /* Apply gain to hardcoded codebook and use that as excitation signal */
1245     for (n = 0; n < size; n++)
1246         excitation[n] = wmavoice_std_codebook[r_idx + n] * gain;
1247 }
1248
/**
 * Parse FCB/ACB signal for a single block.
 *
 * Steps performed, in bitstream order:
 * - read the fixed codebook (FCB) pulses, either as pitch-adaptive window
 *   pulses (aw_pulse_set1() + aw_pulse_set2()) or as 5 single/double
 *   pulses, and render them into a local pulse vector;
 * - read a 7-bit gain index, derive the FCB and ACB gains from it and
 *   update the gain prediction history in s->gain_pred_err;
 * - build the adaptive codebook (ACB) contribution by interpolating or
 *   copying earlier excitation samples into excitation[0..size-1];
 * - combine ACB and FCB contributions, weighted by their gains.
 *
 * @param s WMA Voice decoding context private data
 * @param gb bit I/O context
 * @param block_idx index of this block within the frame
 * @param size number of samples in this block (at most MAX_FRAMESIZE / 2)
 * @param block_pitch_sh2 pitch for this block << 2
 * @param frame_desc frame type descriptor
 * @param excitation output excitation for this block; samples before
 *                   excitation[0] are read as pitch history
 * @note see #synth_block().
 */
static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb,
                                int block_idx, int size,
                                int block_pitch_sh2,
                                const struct frame_type_desc *frame_desc,
                                float *excitation)
{
    static const float gain_coeff[6] = {
        0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
    };
    float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
    int n, idx, gain_weight;
    AMRFixed fcb;

    assert(size <= MAX_FRAMESIZE / 2);
    memset(pulses, 0, sizeof(*pulses) * size);

    fcb.pitch_lag      = block_pitch_sh2 >> 2;
    fcb.pitch_fac      = 1.0;
    fcb.no_repeat_mask = 0;
    fcb.n              = 0;

    /* For the other frame types, this is where we apply the innovation
     * (fixed) codebook pulses of the speech signal. */
    if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
        aw_pulse_set1(s, gb, block_idx, &fcb);
        aw_pulse_set2(s, gb, block_idx, &fcb);
    } else /* FCB_TYPE_EXC_PULSES */ {
        /* number of bits per pulse position shrinks as the frame is split
         * into more blocks */
        int offset_nbits = 5 - frame_desc->log_n_blocks;

        fcb.no_repeat_mask = -1;
        /* similar to ff_decode_10_pulses_35bits(), but with single pulses
         * (instead of double) for a subset of pulses */
        for (n = 0; n < 5; n++) {
            float sign;
            int pos1, pos2;

            sign           = get_bits1(gb) ? 1.0 : -1.0;
            pos1           = get_bits(gb, offset_nbits);
            fcb.x[fcb.n]   = n + 5 * pos1;
            fcb.y[fcb.n++] = sign;
            /* the first dbl_pulses tracks carry a second pulse, whose sign
             * is flipped when it lies after the first one */
            if (n < frame_desc->dbl_pulses) {
                pos2           = get_bits(gb, offset_nbits);
                fcb.x[fcb.n]   = n + 5 * pos2;
                fcb.y[fcb.n++] = (pos1 < pos2) ? -sign : sign;
            }
        }
    }
    ff_set_fixed_vector(pulses, &fcb, 1.0, size);

    /* Calculate gain for adaptive & fixed codebook signal.
     * see ff_amr_set_fixed_gain(). */
    idx = get_bits(gb, 7);
    fcb_gain = expf(ff_dot_productf(s->gain_pred_err, gain_coeff, 6) -
                    5.2409161640 + wmavoice_gain_codebook_fcb[idx]);
    acb_gain = wmavoice_gain_codebook_acb[idx];
    pred_err = av_clipf(wmavoice_gain_codebook_fcb[idx],
                        -2.9957322736 /* log(0.05) */,
                         1.6094379124 /* log(5.0)  */);

    /* shift the prediction error history by gain_weight entries and fill
     * the freed front entries with this block's (clipped) error; the memmove
     * length assumes gain_weight <= 6, i.e. log_n_blocks >= 1 here --
     * NOTE(review): confirm against frame_descs */
    gain_weight = 8 >> frame_desc->log_n_blocks;
    memmove(&s->gain_pred_err[gain_weight], s->gain_pred_err,
            sizeof(*s->gain_pred_err) * (6 - gain_weight));
    for (n = 0; n < gain_weight; n++)
        s->gain_pred_err[n] = pred_err;

    /* Calculation of adaptive codebook */
    if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
        /* the pitch changes linearly over the frame (s->pitch_diff_sh16 per
         * sample, 16.16 fixed point); the block is processed in runs of
         * len samples, each using one integer pitch and one interpolation
         * index */
        int len;
        for (n = 0; n < size; n += len) {
            int next_idx_sh16;
            int abs_idx    = block_idx * size + n;
            int pitch_sh16 = (s->last_pitch_val << 16) +
                             s->pitch_diff_sh16 * abs_idx;
            int pitch      = (pitch_sh16 + 0x6FFF) >> 16;
            int idx_sh16   = ((pitch << 16) - pitch_sh16) * 8 + 0x58000;
            idx            = idx_sh16 >> 16;
            if (s->pitch_diff_sh16) {
                if (s->pitch_diff_sh16 > 0) {
                    next_idx_sh16 = (idx_sh16) &~ 0xFFFF;
                } else
                    next_idx_sh16 = (idx_sh16 + 0x10000) &~ 0xFFFF;
                /* run length, clipped to [1, samples left in the block] */
                len = av_clip((idx_sh16 - next_idx_sh16) / s->pitch_diff_sh16 / 8,
                              1, size - n);
            } else
                /* constant pitch: a single run covers the whole block */
                len = size;

            ff_acelp_interpolatef(&excitation[n], &excitation[n - pitch],
                                  wmavoice_ipol1_coeffs, 17,
                                  idx, 9, len);
        }
    } else /* ACB_TYPE_HAMMING */ {
        /* block_pitch_sh2 holds the integer pitch in its upper bits and a
         * quarter-sample fraction in its lowest 2 bits */
        int block_pitch = block_pitch_sh2 >> 2;
        idx             = block_pitch_sh2 & 3;
        if (idx) {
            ff_acelp_interpolatef(excitation, &excitation[-block_pitch],
                                  wmavoice_ipol2_coeffs, 4,
                                  idx, 8, size);
        } else
            /* integer pitch: plain copy from block_pitch samples back */
            av_memcpy_backptr((uint8_t *) excitation, sizeof(float) * block_pitch,
                              sizeof(float) * size);
    }

    /* Interpolate ACB/FCB and use as excitation signal */
    ff_weighted_vector_sumf(excitation, excitation, pulses,
                            acb_gain, fcb_gain, size);
}
1359
1360 /**
1361  * Parse data in a single block.
1362  * @note we assume enough bits are available, caller should check.
1363  *
1364  * @param s WMA Voice decoding context private data
1365  * @param gb bit I/O context
1366  * @param block_idx index of the to-be-read block
1367  * @param size amount of samples to be read in this block
1368  * @param block_pitch_sh2 pitch for this block << 2
1369  * @param lsps LSPs for (the end of) this frame
1370  * @param prev_lsps LSPs for the last frame
1371  * @param frame_desc frame type descriptor
1372  * @param excitation target memory for the ACB+FCB interpolated signal
1373  * @param synth target memory for the speech synthesis filter output
1374  * @return 0 on success, <0 on error.
1375  */
1376 static void synth_block(WMAVoiceContext *s, GetBitContext *gb,
1377                         int block_idx, int size,
1378                         int block_pitch_sh2,
1379                         const double *lsps, const double *prev_lsps,
1380                         const struct frame_type_desc *frame_desc,
1381                         float *excitation, float *synth)
1382 {
1383     double i_lsps[MAX_LSPS];
1384     float lpcs[MAX_LSPS];
1385     float fac;
1386     int n;
1387
1388     if (frame_desc->acb_type == ACB_TYPE_NONE)
1389         synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation);
1390     else
1391         synth_block_fcb_acb(s, gb, block_idx, size, block_pitch_sh2,
1392                             frame_desc, excitation);
1393
1394     /* convert interpolated LSPs to LPCs */
1395     fac = (block_idx + 0.5) / frame_desc->n_blocks;
1396     for (n = 0; n < s->lsps; n++) // LSF -> LSP
1397         i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n]));
1398     ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1399
1400     /* Speech synthesis */
1401     ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps);
1402 }
1403
1404 /**
1405  * Synthesize output samples for a single frame.
1406  * @note we assume enough bits are available, caller should check.
1407  *
1408  * @param ctx WMA Voice decoder context
1409  * @param gb bit I/O context (s->gb or one for cross-packet superframes)
1410  * @param frame_idx Frame number within superframe [0-2]
1411  * @param samples pointer to output sample buffer, has space for at least 160
1412  *                samples
1413  * @param lsps LSP array
1414  * @param prev_lsps array of previous frame's LSPs
1415  * @param excitation target buffer for excitation signal
1416  * @param synth target buffer for synthesized speech data
1417  * @return 0 on success, <0 on error.
1418  */
1419 static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
1420                        float *samples,
1421                        const double *lsps, const double *prev_lsps,
1422                        float *excitation, float *synth)
1423 {
1424     WMAVoiceContext *s = ctx->priv_data;
1425     int n, n_blocks_x2, log_n_blocks_x2, cur_pitch_val;
1426     int pitch[MAX_BLOCKS], last_block_pitch;
1427
1428     /* Parse frame type ("frame header"), see frame_descs */
1429     int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)],
1430         block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;
1431
1432     if (bd_idx < 0) {
1433         av_log(ctx, AV_LOG_ERROR,
1434                "Invalid frame type VLC code, skipping\n");
1435         return -1;
1436     }
1437
1438     /* Pitch calculation for ACB_TYPE_ASYMMETRIC ("pitch-per-frame") */
1439     if (frame_descs[bd_idx].acb_type == ACB_TYPE_ASYMMETRIC) {
1440         /* Pitch is provided per frame, which is interpreted as the pitch of
1441          * the last sample of the last block of this frame. We can interpolate
1442          * the pitch of other blocks (and even pitch-per-sample) by gradually
1443          * incrementing/decrementing prev_frame_pitch to cur_pitch_val. */
1444         n_blocks_x2      = frame_descs[bd_idx].n_blocks << 1;
1445         log_n_blocks_x2  = frame_descs[bd_idx].log_n_blocks + 1;
1446         cur_pitch_val    = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
1447         cur_pitch_val    = FFMIN(cur_pitch_val, s->max_pitch_val - 1);
1448         if (s->last_acb_type == ACB_TYPE_NONE ||
1449             20 * abs(cur_pitch_val - s->last_pitch_val) >
1450                 (cur_pitch_val + s->last_pitch_val))
1451             s->last_pitch_val = cur_pitch_val;
1452
1453         /* pitch per block */
1454         for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1455             int fac = n * 2 + 1;
1456
1457             pitch[n] = (MUL16(fac,                 cur_pitch_val) +
1458                         MUL16((n_blocks_x2 - fac), s->last_pitch_val) +
1459                         frame_descs[bd_idx].n_blocks) >> log_n_blocks_x2;
1460         }
1461
1462         /* "pitch-diff-per-sample" for calculation of pitch per sample */
1463         s->pitch_diff_sh16 =
1464             ((cur_pitch_val - s->last_pitch_val) << 16) / MAX_FRAMESIZE;
1465     }
1466
1467     /* Global gain (if silence) and pitch-adaptive window coordinates */
1468     switch (frame_descs[bd_idx].fcb_type) {
1469     case FCB_TYPE_SILENCE:
1470         s->silence_gain = wmavoice_gain_silence[get_bits(gb, 8)];
1471         break;
1472     case FCB_TYPE_AW_PULSES:
1473         aw_parse_coords(s, gb, pitch);
1474         break;
1475     }
1476
1477     for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1478         int bl_pitch_sh2;
1479
1480         /* Pitch calculation for ACB_TYPE_HAMMING ("pitch-per-block") */
1481         switch (frame_descs[bd_idx].acb_type) {
1482         case ACB_TYPE_HAMMING: {
1483             /* Pitch is given per block. Per-block pitches are encoded as an
1484              * absolute value for the first block, and then delta values
1485              * relative to this value) for all subsequent blocks. The scale of
1486              * this pitch value is semi-logaritmic compared to its use in the
1487              * decoder, so we convert it to normal scale also. */
1488             int block_pitch,
1489                 t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2,
1490                 t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1,
1491                 t3 =  s->block_conv_table[3] - s->block_conv_table[2] + 1;
1492
1493             if (n == 0) {
1494                 block_pitch = get_bits(gb, s->block_pitch_nbits);
1495             } else
1496                 block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
1497                                  get_bits(gb, s->block_delta_pitch_nbits);
1498             /* Convert last_ so that any next delta is within _range */
1499             last_block_pitch = av_clip(block_pitch,
1500                                        s->block_delta_pitch_hrange,
1501                                        s->block_pitch_range -
1502                                            s->block_delta_pitch_hrange);
1503
1504             /* Convert semi-log-style scale back to normal scale */
1505             if (block_pitch < t1) {
1506                 bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch;
1507             } else {
1508                 block_pitch -= t1;
1509                 if (block_pitch < t2) {
1510                     bl_pitch_sh2 =
1511                         (s->block_conv_table[1] << 2) + (block_pitch << 1);
1512                 } else {
1513                     block_pitch -= t2;
1514                     if (block_pitch < t3) {
1515                         bl_pitch_sh2 =
1516                             (s->block_conv_table[2] + block_pitch) << 2;
1517                     } else
1518                         bl_pitch_sh2 = s->block_conv_table[3] << 2;
1519                 }
1520             }
1521             pitch[n] = bl_pitch_sh2 >> 2;
1522             break;
1523         }
1524
1525         case ACB_TYPE_ASYMMETRIC: {
1526             bl_pitch_sh2 = pitch[n] << 2;
1527             break;
1528         }
1529
1530         default: // ACB_TYPE_NONE has no pitch
1531             bl_pitch_sh2 = 0;
1532             break;
1533         }
1534
1535         synth_block(s, gb, n, block_nsamples, bl_pitch_sh2,
1536                     lsps, prev_lsps, &frame_descs[bd_idx],
1537                     &excitation[n * block_nsamples],
1538                     &synth[n * block_nsamples]);
1539     }
1540
1541     /* Averaging projection filter, if applicable. Else, just copy samples
1542      * from synthesis buffer */
1543     if (s->do_apf) {
1544         double i_lsps[MAX_LSPS];
1545         float lpcs[MAX_LSPS];
1546
1547         for (n = 0; n < s->lsps; n++) // LSF -> LSP
1548             i_lsps[n] = cos(0.5 * (prev_lsps[n] + lsps[n]));
1549         ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1550         postfilter(s, synth, samples, 80, lpcs,
1551                    &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx],
1552                    frame_descs[bd_idx].fcb_type, pitch[0]);
1553
1554         for (n = 0; n < s->lsps; n++) // LSF -> LSP
1555             i_lsps[n] = cos(lsps[n]);
1556         ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1557         postfilter(s, &synth[80], &samples[80], 80, lpcs,
1558                    &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx + 80],
1559                    frame_descs[bd_idx].fcb_type, pitch[0]);
1560     } else
1561         memcpy(samples, synth, 160 * sizeof(synth[0]));
1562
1563     /* Cache values for next frame */
1564     s->frame_cntr++;
1565     if (s->frame_cntr >= 0xFFFF) s->frame_cntr -= 0xFFFF; // i.e. modulo (%)
1566     s->last_acb_type = frame_descs[bd_idx].acb_type;
1567     switch (frame_descs[bd_idx].acb_type) {
1568     case ACB_TYPE_NONE:
1569         s->last_pitch_val = 0;
1570         break;
1571     case ACB_TYPE_ASYMMETRIC:
1572         s->last_pitch_val = cur_pitch_val;
1573         break;
1574     case ACB_TYPE_HAMMING:
1575         s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1];
1576         break;
1577     }
1578
1579     return 0;
1580 }
1581
/**
 * Force an LSF vector into a usable shape: clamp the first value from below,
 * enforce a minimum distance between neighbours, clamp the last value from
 * above and finally make sure the values are in ascending order.
 *
 * @param lsps array of LSPs, modified in place
 * @param num size of LSP array
 *
 * @note basically a double version of #ff_acelp_reorder_lsf(), might be
 *       useful to put in a generic location later on. Parts are also
 *       present in #ff_set_min_dist_lsf() + #ff_sort_nearly_sorted_floats(),
 *       which is in float.
 */
static void stabilize_lsps(double *lsps, int num)
{
    const double first_min = 0.0015 * M_PI;
    const double min_gap   = 0.0125 * M_PI;
    const double last_max  = 0.9985 * M_PI;
    int i, j, unsorted = 0;

    /* Lower bound for the first value, minimum spacing for the rest and
     * upper bound for the last value.
     * Very similar to ff_set_min_dist_lsf(), but in double. */
    lsps[0] = FFMAX(lsps[0], first_min);
    for (i = 1; i < num; i++)
        lsps[i] = FFMAX(lsps[i], lsps[i - 1] + min_gap);
    lsps[num - 1] = FFMIN(lsps[num - 1], last_max);

    /* Clamping the last value may have pushed it below its predecessor;
     * only sort if some pair is actually out of order. */
    for (i = 1; i < num && !unsorted; i++)
        unsorted = lsps[i] < lsps[i - 1];
    if (!unsorted)
        return;

    /* Insertion sort.
     * Very similar to ff_sort_nearly_sorted_floats(), but in double. */
    for (i = 1; i < num; i++) {
        const double val = lsps[i];

        j = i;
        while (j > 0 && !(lsps[j - 1] <= val)) {
            lsps[j] = lsps[j - 1];
            j--;
        }
        lsps[j] = val;
    }
}
1622
1623 /**
1624  * Test if there's enough bits to read 1 superframe.
1625  *
1626  * @param orig_gb bit I/O context used for reading. This function
1627  *                does not modify the state of the bitreader; it
1628  *                only uses it to copy the current stream position
1629  * @param s WMA Voice decoding context private data
1630  * @return -1 if unsupported, 1 on not enough bits or 0 if OK.
1631  */
1632 static int check_bits_for_superframe(GetBitContext *orig_gb,
1633                                      WMAVoiceContext *s)
1634 {
1635     GetBitContext s_gb, *gb = &s_gb;
1636     int n, need_bits, bd_idx;
1637     const struct frame_type_desc *frame_desc;
1638
1639     /* initialize a copy */
1640     init_get_bits(gb, orig_gb->buffer, orig_gb->size_in_bits);
1641     skip_bits_long(gb, get_bits_count(orig_gb));
1642     assert(get_bits_left(gb) == get_bits_left(orig_gb));
1643
1644     /* superframe header */
1645     if (get_bits_left(gb) < 14)
1646         return 1;
1647     if (!get_bits1(gb))
1648         return -1;                        // WMAPro-in-WMAVoice superframe
1649     if (get_bits1(gb)) skip_bits(gb, 12); // number of  samples in superframe
1650     if (s->has_residual_lsps) {           // residual LSPs (for all frames)
1651         if (get_bits_left(gb) < s->sframe_lsp_bitsize)
1652             return 1;
1653         skip_bits_long(gb, s->sframe_lsp_bitsize);
1654     }
1655
1656     /* frames */
1657     for (n = 0; n < MAX_FRAMES; n++) {
1658         int aw_idx_is_ext = 0;
1659
1660         if (!s->has_residual_lsps) {     // independent LSPs (per-frame)
1661            if (get_bits_left(gb) < s->frame_lsp_bitsize) return 1;
1662            skip_bits_long(gb, s->frame_lsp_bitsize);
1663         }
1664         bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)];
1665         if (bd_idx < 0)
1666             return -1;                   // invalid frame type VLC code
1667         frame_desc = &frame_descs[bd_idx];
1668         if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
1669             if (get_bits_left(gb) < s->pitch_nbits)
1670                 return 1;
1671             skip_bits_long(gb, s->pitch_nbits);
1672         }
1673         if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1674             skip_bits(gb, 8);
1675         } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1676             int tmp = get_bits(gb, 6);
1677             if (tmp >= 0x36) {
1678                 skip_bits(gb, 2);
1679                 aw_idx_is_ext = 1;
1680             }
1681         }
1682
1683         /* blocks */
1684         if (frame_desc->acb_type == ACB_TYPE_HAMMING) {
1685             need_bits = s->block_pitch_nbits +
1686                 (frame_desc->n_blocks - 1) * s->block_delta_pitch_nbits;
1687         } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1688             need_bits = 2 * !aw_idx_is_ext;
1689         } else
1690             need_bits = 0;
1691         need_bits += frame_desc->frame_size;
1692         if (get_bits_left(gb) < need_bits)
1693             return 1;
1694         skip_bits_long(gb, need_bits);
1695     }
1696
1697     return 0;
1698 }
1699
1700 /**
1701  * Synthesize output samples for a single superframe. If we have any data
1702  * cached in s->sframe_cache, that will be used instead of whatever is loaded
1703  * in s->gb.
1704  *
1705  * WMA Voice superframes contain 3 frames, each containing 160 audio samples,
1706  * to give a total of 480 samples per frame. See #synth_frame() for frame
1707  * parsing. In addition to 3 frames, superframes can also contain the LSPs
1708  * (if these are globally specified for all frames (residually); they can
1709  * also be specified individually per-frame. See the s->has_residual_lsps
1710  * option), and can specify the number of samples encoded in this superframe
1711  * (if less than 480), usually used to prevent blanks at track boundaries.
1712  *
1713  * @param ctx WMA Voice decoder context
1714  * @param samples pointer to output buffer for voice samples
1715  * @param data_size pointer containing the size of #samples on input, and the
1716  *                  amount of #samples filled on output
1717  * @return 0 on success, <0 on error or 1 if there was not enough data to
1718  *         fully parse the superframe
1719  */
1720 static int synth_superframe(AVCodecContext *ctx,
1721                             float *samples, int *data_size)
1722 {
1723     WMAVoiceContext *s = ctx->priv_data;
1724     GetBitContext *gb = &s->gb, s_gb;
1725     int n, res, n_samples = 480;
1726     double lsps[MAX_FRAMES][MAX_LSPS];
1727     const double *mean_lsf = s->lsps == 16 ?
1728         wmavoice_mean_lsf16[s->lsp_def_mode] : wmavoice_mean_lsf10[s->lsp_def_mode];
1729     float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE + 12];
1730     float synth[MAX_LSPS + MAX_SFRAMESIZE];
1731
1732     memcpy(synth,      s->synth_history,
1733            s->lsps             * sizeof(*synth));
1734     memcpy(excitation, s->excitation_history,
1735            s->history_nsamples * sizeof(*excitation));
1736
1737     if (s->sframe_cache_size > 0) {
1738         gb = &s_gb;
1739         init_get_bits(gb, s->sframe_cache, s->sframe_cache_size);
1740         s->sframe_cache_size = 0;
1741     }
1742
1743     if ((res = check_bits_for_superframe(gb, s)) == 1) return 1;
1744
1745     /* First bit is speech/music bit, it differentiates between WMAVoice
1746      * speech samples (the actual codec) and WMAVoice music samples, which
1747      * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
1748      * the wild yet. */
1749     if (!get_bits1(gb)) {
1750         av_log_missing_feature(ctx, "WMAPro-in-WMAVoice support", 1);
1751         return -1;
1752     }
1753
1754     /* (optional) nr. of samples in superframe; always <= 480 and >= 0 */
1755     if (get_bits1(gb)) {
1756         if ((n_samples = get_bits(gb, 12)) > 480) {
1757             av_log(ctx, AV_LOG_ERROR,
1758                    "Superframe encodes >480 samples (%d), not allowed\n",
1759                    n_samples);
1760             return -1;
1761         }
1762     }
1763     /* Parse LSPs, if global for the superframe (can also be per-frame). */
1764     if (s->has_residual_lsps) {
1765         double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];
1766
1767         for (n = 0; n < s->lsps; n++)
1768             prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n];
1769
1770         if (s->lsps == 10) {
1771             dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1772         } else /* s->lsps == 16 */
1773             dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1774
1775         for (n = 0; n < s->lsps; n++) {
1776             lsps[0][n]  = mean_lsf[n] + (a1[n]           - a2[n * 2]);
1777             lsps[1][n]  = mean_lsf[n] + (a1[s->lsps + n] - a2[n * 2 + 1]);
1778             lsps[2][n] += mean_lsf[n];
1779         }
1780         for (n = 0; n < 3; n++)
1781             stabilize_lsps(lsps[n], s->lsps);
1782     }
1783
1784     /* Parse frames, optionally preceeded by per-frame (independent) LSPs. */
1785     for (n = 0; n < 3; n++) {
1786         if (!s->has_residual_lsps) {
1787             int m;
1788
1789             if (s->lsps == 10) {
1790                 dequant_lsp10i(gb, lsps[n]);
1791             } else /* s->lsps == 16 */
1792                 dequant_lsp16i(gb, lsps[n]);
1793
1794             for (m = 0; m < s->lsps; m++)
1795                 lsps[n][m] += mean_lsf[m];
1796             stabilize_lsps(lsps[n], s->lsps);
1797         }
1798
1799         if ((res = synth_frame(ctx, gb, n,
1800                                &samples[n * MAX_FRAMESIZE],
1801                                lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1],
1802                                &excitation[s->history_nsamples + n * MAX_FRAMESIZE],
1803                                &synth[s->lsps + n * MAX_FRAMESIZE])))
1804             return res;
1805     }
1806
1807     /* Statistics? FIXME - we don't check for length, a slight overrun
1808      * will be caught by internal buffer padding, and anything else
1809      * will be skipped, not read. */
1810     if (get_bits1(gb)) {
1811         res = get_bits(gb, 4);
1812         skip_bits(gb, 10 * (res + 1));
1813     }
1814
1815     /* Specify nr. of output samples */
1816     *data_size = n_samples * sizeof(float);
1817
1818     /* Update history */
1819     memcpy(s->prev_lsps,           lsps[2],
1820            s->lsps             * sizeof(*s->prev_lsps));
1821     memcpy(s->synth_history,      &synth[MAX_SFRAMESIZE],
1822            s->lsps             * sizeof(*synth));
1823     memcpy(s->excitation_history, &excitation[MAX_SFRAMESIZE],
1824            s->history_nsamples * sizeof(*excitation));
1825     if (s->do_apf)
1826         memmove(s->zero_exc_pf,       &s->zero_exc_pf[MAX_SFRAMESIZE],
1827                 s->history_nsamples * sizeof(*s->zero_exc_pf));
1828
1829     return 0;
1830 }
1831
1832 /**
1833  * Parse the packet header at the start of each packet (input data to this
1834  * decoder).
1835  *
1836  * @param s WMA Voice decoding context private data
1837  * @return 1 if not enough bits were available, or 0 on success.
1838  */
1839 static int parse_packet_header(WMAVoiceContext *s)
1840 {
1841     GetBitContext *gb = &s->gb;
1842     unsigned int res;
1843
1844     if (get_bits_left(gb) < 11)
1845         return 1;
1846     skip_bits(gb, 4);          // packet sequence number
1847     s->has_residual_lsps = get_bits1(gb);
1848     do {
1849         res = get_bits(gb, 6); // number of superframes per packet
1850                                // (minus first one if there is spillover)
1851         if (get_bits_left(gb) < 6 * (res == 0x3F) + s->spillover_bitsize)
1852             return 1;
1853     } while (res == 0x3F);
1854     s->spillover_nbits   = get_bits(gb, s->spillover_bitsize);
1855
1856     return 0;
1857 }
1858
1859 /**
1860  * Copy (unaligned) bits from gb/data/size to pb.
1861  *
1862  * @param pb target buffer to copy bits into
1863  * @param data source buffer to copy bits from
1864  * @param size size of the source data, in bytes
1865  * @param gb bit I/O context specifying the current position in the source.
1866  *           data. This function might use this to align the bit position to
1867  *           a whole-byte boundary before calling #ff_copy_bits() on aligned
1868  *           source data
1869  * @param nbits the amount of bits to copy from source to target
1870  *
1871  * @note after calling this function, the current position in the input bit
1872  *       I/O context is undefined.
1873  */
1874 static void copy_bits(PutBitContext *pb,
1875                       const uint8_t *data, int size,
1876                       GetBitContext *gb, int nbits)
1877 {
1878     int rmn_bytes, rmn_bits;
1879
1880     rmn_bits = rmn_bytes = get_bits_left(gb);
1881     if (rmn_bits < nbits)
1882         return;
1883     rmn_bits &= 7; rmn_bytes >>= 3;
1884     if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0)
1885         put_bits(pb, rmn_bits, get_bits(gb, rmn_bits));
1886     ff_copy_bits(pb, data + size - rmn_bytes,
1887                  FFMIN(nbits - rmn_bits, rmn_bytes << 3));
1888 }
1889
1890 /**
1891  * Packet decoding: a packet is anything that the (ASF) demuxer contains,
1892  * and we expect that the demuxer / application provides it to us as such
1893  * (else you'll probably get garbage as output). Every packet has a size of
1894  * ctx->block_align bytes, starts with a packet header (see
1895  * #parse_packet_header()), and then a series of superframes. Superframe
1896  * boundaries may exceed packets, i.e. superframes can split data over
1897  * multiple (two) packets.
1898  *
1899  * For more information about frames, see #synth_superframe().
1900  */
1901 static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
1902                                   int *data_size, AVPacket *avpkt)
1903 {
1904     WMAVoiceContext *s = ctx->priv_data;
1905     GetBitContext *gb = &s->gb;
1906     int size, res, pos;
1907
1908     if (*data_size < 480 * sizeof(float)) {
1909         av_log(ctx, AV_LOG_ERROR,
1910                "Output buffer too small (%d given - %zu needed)\n",
1911                *data_size, 480 * sizeof(float));
1912         return -1;
1913     }
1914     *data_size = 0;
1915
1916     /* Packets are sometimes a multiple of ctx->block_align, with a packet
1917      * header at each ctx->block_align bytes. However, Libav's ASF demuxer
1918      * feeds us ASF packets, which may concatenate multiple "codec" packets
1919      * in a single "muxer" packet, so we artificially emulate that by
1920      * capping the packet size at ctx->block_align. */
1921     for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align);
1922     if (!size)
1923         return 0;
1924     init_get_bits(&s->gb, avpkt->data, size << 3);
1925
1926     /* size == ctx->block_align is used to indicate whether we are dealing with
1927      * a new packet or a packet of which we already read the packet header
1928      * previously. */
1929     if (size == ctx->block_align) { // new packet header
1930         if ((res = parse_packet_header(s)) < 0)
1931             return res;
1932
1933         /* If the packet header specifies a s->spillover_nbits, then we want
1934          * to push out all data of the previous packet (+ spillover) before
1935          * continuing to parse new superframes in the current packet. */
1936         if (s->spillover_nbits > 0) {
1937             if (s->sframe_cache_size > 0) {
1938                 int cnt = get_bits_count(gb);
1939                 copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits);
1940                 flush_put_bits(&s->pb);
1941                 s->sframe_cache_size += s->spillover_nbits;
1942                 if ((res = synth_superframe(ctx, data, data_size)) == 0 &&
1943                     *data_size > 0) {
1944                     cnt += s->spillover_nbits;
1945                     s->skip_bits_next = cnt & 7;
1946                     return cnt >> 3;
1947                 } else
1948                     skip_bits_long (gb, s->spillover_nbits - cnt +
1949                                     get_bits_count(gb)); // resync
1950             } else
1951                 skip_bits_long(gb, s->spillover_nbits);  // resync
1952         }
1953     } else if (s->skip_bits_next)
1954         skip_bits(gb, s->skip_bits_next);
1955
1956     /* Try parsing superframes in current packet */
1957     s->sframe_cache_size = 0;
1958     s->skip_bits_next = 0;
1959     pos = get_bits_left(gb);
1960     if ((res = synth_superframe(ctx, data, data_size)) < 0) {
1961         return res;
1962     } else if (*data_size > 0) {
1963         int cnt = get_bits_count(gb);
1964         s->skip_bits_next = cnt & 7;
1965         return cnt >> 3;
1966     } else if ((s->sframe_cache_size = pos) > 0) {
1967         /* rewind bit reader to start of last (incomplete) superframe... */
1968         init_get_bits(gb, avpkt->data, size << 3);
1969         skip_bits_long(gb, (size << 3) - pos);
1970         assert(get_bits_left(gb) == pos);
1971
1972         /* ...and cache it for spillover in next packet */
1973         init_put_bits(&s->pb, s->sframe_cache, SFRAME_CACHE_MAXSIZE);
1974         copy_bits(&s->pb, avpkt->data, size, gb, s->sframe_cache_size);
1975         // FIXME bad - just copy bytes as whole and add use the
1976         // skip_bits_next field
1977     }
1978
1979     return size;
1980 }
1981
1982 static av_cold int wmavoice_decode_end(AVCodecContext *ctx)
1983 {
1984     WMAVoiceContext *s = ctx->priv_data;
1985
1986     if (s->do_apf) {
1987         ff_rdft_end(&s->rdft);
1988         ff_rdft_end(&s->irdft);
1989         ff_dct_end(&s->dct);
1990         ff_dct_end(&s->dst);
1991     }
1992
1993     return 0;
1994 }
1995
1996 static av_cold void wmavoice_flush(AVCodecContext *ctx)
1997 {
1998     WMAVoiceContext *s = ctx->priv_data;
1999     int n;
2000
2001     s->postfilter_agc    = 0;
2002     s->sframe_cache_size = 0;
2003     s->skip_bits_next    = 0;
2004     for (n = 0; n < s->lsps; n++)
2005         s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
2006     memset(s->excitation_history, 0,
2007            sizeof(*s->excitation_history) * MAX_SIGNAL_HISTORY);
2008     memset(s->synth_history,      0,
2009            sizeof(*s->synth_history)      * MAX_LSPS);
2010     memset(s->gain_pred_err,      0,
2011            sizeof(s->gain_pred_err));
2012
2013     if (s->do_apf) {
2014         memset(&s->synth_filter_out_buf[MAX_LSPS_ALIGN16 - s->lsps], 0,
2015                sizeof(*s->synth_filter_out_buf) * s->lsps);
2016         memset(s->dcf_mem,              0,
2017                sizeof(*s->dcf_mem)              * 2);
2018         memset(s->zero_exc_pf,          0,
2019                sizeof(*s->zero_exc_pf)          * s->history_nsamples);
2020         memset(s->denoise_filter_cache, 0, sizeof(s->denoise_filter_cache));
2021     }
2022 }
2023
2024 AVCodec ff_wmavoice_decoder = {
2025     .name           = "wmavoice",
2026     .type           = AVMEDIA_TYPE_AUDIO,
2027     .id             = CODEC_ID_WMAVOICE,
2028     .priv_data_size = sizeof(WMAVoiceContext),
2029     .init           = wmavoice_decode_init,
2030     .close          = wmavoice_decode_end,
2031     .decode         = wmavoice_decode_packet,
2032     .capabilities   = CODEC_CAP_SUBFRAMES,
2033     .flush     = wmavoice_flush,
2034     .long_name = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"),
2035 };