sbc/sbc_primitives.c

   1 /*
   2  *
   3  *  Bluetooth low-complexity, subband codec (SBC) library
   4  *
   5  *  Copyright (C) 2004-2009  Marcel Holtmann <marcel@holtmann.org>
   6  *  Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
   7  *  Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
   8  *
   9  *
  10  *  This library is free software; you can redistribute it and/or
  11  *  modify it under the terms of the GNU Lesser General Public
  12  *  License as published by the Free Software Foundation; either
  13  *  version 2.1 of the License, or (at your option) any later version.
  14  *
  15  *  This library is distributed in the hope that it will be useful,
  16  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  *  Lesser General Public License for more details.
  19  *
  20  *  You should have received a copy of the GNU Lesser General Public
  21  *  License along with this library; if not, write to the Free Software
  22  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  23  *
  24  */
  25
  26 #include <stdint.h>
  27 #include <limits.h>
  28 #include "sbc.h"
  29 #include "sbc_math.h"
  30 #include "sbc_tables.h"
  31
  32 #include "sbc_primitives.h"
  33 #include "sbc_primitives_mmx.h"
  34 #include "sbc_primitives_neon.h"
  35
/*
 * A standard C implementation of the analysis filter.
 */
  39 static inline void sbc_analyze_four(const int16_t *in, int32_t *out)
  40 {
  41         FIXED_A t1[4];
  42         FIXED_T t2[4];
  43         int i = 0, hop = 0;
  44
  45         /* rounding coefficient */
  46         t1[0] = t1[1] = t1[2] = t1[3] =
  47                 (FIXED_A) 1 << (SBC_PROTO_FIXED4_SCALE - 1);
  48
  49         /* low pass polyphase filter */
  50         for (hop = 0; hop < 40; hop += 8) {
  51                 t1[0] += (FIXED_A) in[hop] * _sbc_proto_fixed4[hop];
  52                 t1[1] += (FIXED_A) in[hop + 1] * _sbc_proto_fixed4[hop + 1];
  53                 t1[2] += (FIXED_A) in[hop + 2] * _sbc_proto_fixed4[hop + 2];
  54                 t1[1] += (FIXED_A) in[hop + 3] * _sbc_proto_fixed4[hop + 3];
  55                 t1[0] += (FIXED_A) in[hop + 4] * _sbc_proto_fixed4[hop + 4];
  56                 t1[3] += (FIXED_A) in[hop + 5] * _sbc_proto_fixed4[hop + 5];
  57                 t1[3] += (FIXED_A) in[hop + 7] * _sbc_proto_fixed4[hop + 7];
  58         }
  59
  60         /* scaling */
  61         t2[0] = t1[0] >> SBC_PROTO_FIXED4_SCALE;
  62         t2[1] = t1[1] >> SBC_PROTO_FIXED4_SCALE;
  63         t2[2] = t1[2] >> SBC_PROTO_FIXED4_SCALE;
  64         t2[3] = t1[3] >> SBC_PROTO_FIXED4_SCALE;
  65
  66         /* do the cos transform */
  67         for (i = 0, hop = 0; i < 4; hop += 8, i++) {
  68                 out[i] = ((FIXED_A) t2[0] * cos_table_fixed_4[0 + hop] +
  69                           (FIXED_A) t2[1] * cos_table_fixed_4[1 + hop] +
  70                           (FIXED_A) t2[2] * cos_table_fixed_4[2 + hop] +
  71                           (FIXED_A) t2[3] * cos_table_fixed_4[5 + hop]) >>
  72                         (SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS);
  73         }
  74 }
  75
  76 static void sbc_analyze_4b_4s(int16_t *pcm, int16_t *x,
  77                               int32_t *out, int out_stride)
  78 {
  79         int i;
  80
  81         /* Input 4 x 4 Audio Samples */
  82         for (i = 0; i < 16; i += 4) {
  83                 x[64 + i] = x[0 + i] = pcm[15 - i];
  84                 x[65 + i] = x[1 + i] = pcm[14 - i];
  85                 x[66 + i] = x[2 + i] = pcm[13 - i];
  86                 x[67 + i] = x[3 + i] = pcm[12 - i];
  87         }
  88
  89         /* Analyze four blocks */
  90         sbc_analyze_four(x + 12, out);
  91         out += out_stride;
  92         sbc_analyze_four(x + 8, out);
  93         out += out_stride;
  94         sbc_analyze_four(x + 4, out);
  95         out += out_stride;
  96         sbc_analyze_four(x, out);
  97 }
  98
  99 static inline void sbc_analyze_eight(const int16_t *in, int32_t *out)
 100 {
 101         FIXED_A t1[8];
 102         FIXED_T t2[8];
 103         int i, hop;
 104
 105         /* rounding coefficient */
 106         t1[0] = t1[1] = t1[2] = t1[3] = t1[4] = t1[5] = t1[6] = t1[7] =
 107                 (FIXED_A) 1 << (SBC_PROTO_FIXED8_SCALE-1);
 108
 109         /* low pass polyphase filter */
 110         for (hop = 0; hop < 80; hop += 16) {
 111                 t1[0] += (FIXED_A) in[hop] * _sbc_proto_fixed8[hop];
 112                 t1[1] += (FIXED_A) in[hop + 1] * _sbc_proto_fixed8[hop + 1];
 113                 t1[2] += (FIXED_A) in[hop + 2] * _sbc_proto_fixed8[hop + 2];
 114                 t1[3] += (FIXED_A) in[hop + 3] * _sbc_proto_fixed8[hop + 3];
 115                 t1[4] += (FIXED_A) in[hop + 4] * _sbc_proto_fixed8[hop + 4];
 116                 t1[3] += (FIXED_A) in[hop + 5] * _sbc_proto_fixed8[hop + 5];
 117                 t1[2] += (FIXED_A) in[hop + 6] * _sbc_proto_fixed8[hop + 6];
 118                 t1[1] += (FIXED_A) in[hop + 7] * _sbc_proto_fixed8[hop + 7];
 119                 t1[0] += (FIXED_A) in[hop + 8] * _sbc_proto_fixed8[hop + 8];
 120                 t1[5] += (FIXED_A) in[hop + 9] * _sbc_proto_fixed8[hop + 9];
 121                 t1[6] += (FIXED_A) in[hop + 10] * _sbc_proto_fixed8[hop + 10];
 122                 t1[7] += (FIXED_A) in[hop + 11] * _sbc_proto_fixed8[hop + 11];
 123                 t1[7] += (FIXED_A) in[hop + 13] * _sbc_proto_fixed8[hop + 13];
 124                 t1[6] += (FIXED_A) in[hop + 14] * _sbc_proto_fixed8[hop + 14];
 125                 t1[5] += (FIXED_A) in[hop + 15] * _sbc_proto_fixed8[hop + 15];
 126         }
 127
 128         /* scaling */
 129         t2[0] = t1[0] >> SBC_PROTO_FIXED8_SCALE;
 130         t2[1] = t1[1] >> SBC_PROTO_FIXED8_SCALE;
 131         t2[2] = t1[2] >> SBC_PROTO_FIXED8_SCALE;
 132         t2[3] = t1[3] >> SBC_PROTO_FIXED8_SCALE;
 133         t2[4] = t1[4] >> SBC_PROTO_FIXED8_SCALE;
 134         t2[5] = t1[5] >> SBC_PROTO_FIXED8_SCALE;
 135         t2[6] = t1[6] >> SBC_PROTO_FIXED8_SCALE;
 136         t2[7] = t1[7] >> SBC_PROTO_FIXED8_SCALE;
 137
 138         /* do the cos transform */
 139         for (i = 0, hop = 0; i < 8; hop += 16, i++) {
 140                 out[i] = ((FIXED_A) t2[0] * cos_table_fixed_8[0 + hop] +
 141                           (FIXED_A) t2[1] * cos_table_fixed_8[1 + hop] +
 142                           (FIXED_A) t2[2] * cos_table_fixed_8[2 + hop] +
 143                           (FIXED_A) t2[3] * cos_table_fixed_8[3 + hop] +
 144                           (FIXED_A) t2[4] * cos_table_fixed_8[4 + hop] +
 145                           (FIXED_A) t2[5] * cos_table_fixed_8[9 + hop] +
 146                           (FIXED_A) t2[6] * cos_table_fixed_8[10 + hop] +
 147                           (FIXED_A) t2[7] * cos_table_fixed_8[11 + hop]) >>
 148                         (SBC_COS_TABLE_FIXED8_SCALE - SCALE_OUT_BITS);
 149         }
 150 }
 151
 152 static void sbc_analyze_4b_8s(int16_t *pcm, int16_t *x,
 153                               int32_t *out, int out_stride)
 154 {
 155         int i;
 156
 157         /* Input 4 x 8 Audio Samples */
 158         for (i = 0; i < 32; i += 8) {
 159                 x[128 + i] = x[0 + i] = pcm[31 - i];
 160                 x[129 + i] = x[1 + i] = pcm[30 - i];
 161                 x[130 + i] = x[2 + i] = pcm[29 - i];
 162                 x[131 + i] = x[3 + i] = pcm[28 - i];
 163                 x[132 + i] = x[4 + i] = pcm[27 - i];
 164                 x[133 + i] = x[5 + i] = pcm[26 - i];
 165                 x[134 + i] = x[6 + i] = pcm[25 - i];
 166                 x[135 + i] = x[7 + i] = pcm[24 - i];
 167         }
 168
 169         /* Analyze four blocks */
 170         sbc_analyze_eight(x + 24, out);
 171         out += out_stride;
 172         sbc_analyze_eight(x + 16, out);
 173         out += out_stride;
 174         sbc_analyze_eight(x + 8, out);
 175         out += out_stride;
 176         sbc_analyze_eight(x, out);
 177 }
 178
/*
 * A reference C implementation of the analysis filter with SIMD-friendly
 * table reordering and code layout. This code can be used to develop
 * platform specific SIMD optimizations. It may also serve as a kind of
 * test of compiler autovectorization capabilities (who knows, if the
 * compiler is very good at this stuff, hand optimized assembly may not be
 * strictly needed on some platforms).
 */
 187
 188 static inline void sbc_analyze_four_simd(const int16_t *in, int32_t *out,
 189                                          const FIXED_T *consts)
 190 {
 191         FIXED_A t1[4];
 192         FIXED_T t2[4];
 193         int hop = 0;
 194
 195         /* rounding coefficient */
 196         t1[0] = t1[1] = t1[2] = t1[3] =
 197                 (FIXED_A) 1 << (SBC_PROTO_FIXED4_SCALE - 1);
 198
 199         /* low pass polyphase filter */
 200         for (hop = 0; hop < 40; hop += 8) {
 201                 t1[0] += (FIXED_A) in[hop] * consts[hop];
 202                 t1[0] += (FIXED_A) in[hop + 1] * consts[hop + 1];
 203                 t1[1] += (FIXED_A) in[hop + 2] * consts[hop + 2];
 204                 t1[1] += (FIXED_A) in[hop + 3] * consts[hop + 3];
 205                 t1[2] += (FIXED_A) in[hop + 4] * consts[hop + 4];
 206                 t1[2] += (FIXED_A) in[hop + 5] * consts[hop + 5];
 207                 t1[3] += (FIXED_A) in[hop + 6] * consts[hop + 6];
 208                 t1[3] += (FIXED_A) in[hop + 7] * consts[hop + 7];
 209         }
 210
 211         /* scaling */
 212         t2[0] = t1[0] >> SBC_PROTO_FIXED4_SCALE;
 213         t2[1] = t1[1] >> SBC_PROTO_FIXED4_SCALE;
 214         t2[2] = t1[2] >> SBC_PROTO_FIXED4_SCALE;
 215         t2[3] = t1[3] >> SBC_PROTO_FIXED4_SCALE;
 216
 217         /* do the cos transform */
 218         t1[0]  = (FIXED_A) t2[0] * consts[40 + 0];
 219         t1[0] += (FIXED_A) t2[1] * consts[40 + 1];
 220         t1[1]  = (FIXED_A) t2[0] * consts[40 + 2];
 221         t1[1] += (FIXED_A) t2[1] * consts[40 + 3];
 222         t1[2]  = (FIXED_A) t2[0] * consts[40 + 4];
 223         t1[2] += (FIXED_A) t2[1] * consts[40 + 5];
 224         t1[3]  = (FIXED_A) t2[0] * consts[40 + 6];
 225         t1[3] += (FIXED_A) t2[1] * consts[40 + 7];
 226
 227         t1[0] += (FIXED_A) t2[2] * consts[40 + 8];
 228         t1[0] += (FIXED_A) t2[3] * consts[40 + 9];
 229         t1[1] += (FIXED_A) t2[2] * consts[40 + 10];
 230         t1[1] += (FIXED_A) t2[3] * consts[40 + 11];
 231         t1[2] += (FIXED_A) t2[2] * consts[40 + 12];
 232         t1[2] += (FIXED_A) t2[3] * consts[40 + 13];
 233         t1[3] += (FIXED_A) t2[2] * consts[40 + 14];
 234         t1[3] += (FIXED_A) t2[3] * consts[40 + 15];
 235
 236         out[0] = t1[0] >>
 237                 (SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS);
 238         out[1] = t1[1] >>
 239                 (SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS);
 240         out[2] = t1[2] >>
 241                 (SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS);
 242         out[3] = t1[3] >>
 243                 (SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS);
 244 }
 245
 246 static inline void sbc_analyze_eight_simd(const int16_t *in, int32_t *out,
 247                                           const FIXED_T *consts)
 248 {
 249         FIXED_A t1[8];
 250         FIXED_T t2[8];
 251         int i, hop;
 252
 253         /* rounding coefficient */
 254         t1[0] = t1[1] = t1[2] = t1[3] = t1[4] = t1[5] = t1[6] = t1[7] =
 255                 (FIXED_A) 1 << (SBC_PROTO_FIXED8_SCALE-1);
 256
 257         /* low pass polyphase filter */
 258         for (hop = 0; hop < 80; hop += 16) {
 259                 t1[0] += (FIXED_A) in[hop] * consts[hop];
 260                 t1[0] += (FIXED_A) in[hop + 1] * consts[hop + 1];
 261                 t1[1] += (FIXED_A) in[hop + 2] * consts[hop + 2];
 262                 t1[1] += (FIXED_A) in[hop + 3] * consts[hop + 3];
 263                 t1[2] += (FIXED_A) in[hop + 4] * consts[hop + 4];
 264                 t1[2] += (FIXED_A) in[hop + 5] * consts[hop + 5];
 265                 t1[3] += (FIXED_A) in[hop + 6] * consts[hop + 6];
 266                 t1[3] += (FIXED_A) in[hop + 7] * consts[hop + 7];
 267                 t1[4] += (FIXED_A) in[hop + 8] * consts[hop + 8];
 268                 t1[4] += (FIXED_A) in[hop + 9] * consts[hop + 9];
 269                 t1[5] += (FIXED_A) in[hop + 10] * consts[hop + 10];
 270                 t1[5] += (FIXED_A) in[hop + 11] * consts[hop + 11];
 271                 t1[6] += (FIXED_A) in[hop + 12] * consts[hop + 12];
 272                 t1[6] += (FIXED_A) in[hop + 13] * consts[hop + 13];
 273                 t1[7] += (FIXED_A) in[hop + 14] * consts[hop + 14];
 274                 t1[7] += (FIXED_A) in[hop + 15] * consts[hop + 15];
 275         }
 276
 277         /* scaling */
 278         t2[0] = t1[0] >> SBC_PROTO_FIXED8_SCALE;
 279         t2[1] = t1[1] >> SBC_PROTO_FIXED8_SCALE;
 280         t2[2] = t1[2] >> SBC_PROTO_FIXED8_SCALE;
 281         t2[3] = t1[3] >> SBC_PROTO_FIXED8_SCALE;
 282         t2[4] = t1[4] >> SBC_PROTO_FIXED8_SCALE;
 283         t2[5] = t1[5] >> SBC_PROTO_FIXED8_SCALE;
 284         t2[6] = t1[6] >> SBC_PROTO_FIXED8_SCALE;
 285         t2[7] = t1[7] >> SBC_PROTO_FIXED8_SCALE;
 286
 287
 288         /* do the cos transform */
 289         t1[0] = t1[1] = t1[2] = t1[3] = t1[4] = t1[5] = t1[6] = t1[7] = 0;
 290
 291         for (i = 0; i < 4; i++) {
 292                 t1[0] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 0];
 293                 t1[0] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 1];
 294                 t1[1] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 2];
 295                 t1[1] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 3];
 296                 t1[2] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 4];
 297                 t1[2] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 5];
 298                 t1[3] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 6];
 299                 t1[3] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 7];
 300                 t1[4] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 8];
 301                 t1[4] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 9];
 302                 t1[5] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 10];
 303                 t1[5] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 11];
 304                 t1[6] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 12];
 305                 t1[6] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 13];
 306                 t1[7] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 14];
 307                 t1[7] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 15];
 308         }
 309
 310         for (i = 0; i < 8; i++)
 311                 out[i] = t1[i] >>
 312                         (SBC_COS_TABLE_FIXED8_SCALE - SCALE_OUT_BITS);
 313 }
 314
 315 static inline void sbc_analyze_4b_4s_simd(int16_t *pcm, int16_t *x,
 316                                           int32_t *out, int out_stride)
 317 {
 318         /* Fetch audio samples and do input data reordering for SIMD */
 319         x[64] = x[0]  = pcm[8 + 7];
 320         x[65] = x[1]  = pcm[8 + 3];
 321         x[66] = x[2]  = pcm[8 + 6];
 322         x[67] = x[3]  = pcm[8 + 4];
 323         x[68] = x[4]  = pcm[8 + 0];
 324         x[69] = x[5]  = pcm[8 + 2];
 325         x[70] = x[6]  = pcm[8 + 1];
 326         x[71] = x[7]  = pcm[8 + 5];
 327
 328         x[72] = x[8]  = pcm[0 + 7];
 329         x[73] = x[9]  = pcm[0 + 3];
 330         x[74] = x[10] = pcm[0 + 6];
 331         x[75] = x[11] = pcm[0 + 4];
 332         x[76] = x[12] = pcm[0 + 0];
 333         x[77] = x[13] = pcm[0 + 2];
 334         x[78] = x[14] = pcm[0 + 1];
 335         x[79] = x[15] = pcm[0 + 5];
 336
 337         /* Analyze blocks */
 338         sbc_analyze_four_simd(x + 12, out, analysis_consts_fixed4_simd_odd);
 339         out += out_stride;
 340         sbc_analyze_four_simd(x + 8, out, analysis_consts_fixed4_simd_even);
 341         out += out_stride;
 342         sbc_analyze_four_simd(x + 4, out, analysis_consts_fixed4_simd_odd);
 343         out += out_stride;
 344         sbc_analyze_four_simd(x + 0, out, analysis_consts_fixed4_simd_even);
 345 }
 346
 347 static inline void sbc_analyze_4b_8s_simd(int16_t *pcm, int16_t *x,
 348                                           int32_t *out, int out_stride)
 349 {
 350         /* Fetch audio samples and do input data reordering for SIMD */
 351         x[128] = x[0]  = pcm[16 + 15];
 352         x[129] = x[1]  = pcm[16 + 7];
 353         x[130] = x[2]  = pcm[16 + 14];
 354         x[131] = x[3]  = pcm[16 + 8];
 355         x[132] = x[4]  = pcm[16 + 13];
 356         x[133] = x[5]  = pcm[16 + 9];
 357         x[134] = x[6]  = pcm[16 + 12];
 358         x[135] = x[7]  = pcm[16 + 10];
 359         x[136] = x[8]  = pcm[16 + 11];
 360         x[137] = x[9]  = pcm[16 + 3];
 361         x[138] = x[10] = pcm[16 + 6];
 362         x[139] = x[11] = pcm[16 + 0];
 363         x[140] = x[12] = pcm[16 + 5];
 364         x[141] = x[13] = pcm[16 + 1];
 365         x[142] = x[14] = pcm[16 + 4];
 366         x[143] = x[15] = pcm[16 + 2];
 367
 368         x[144] = x[16] = pcm[0 + 15];
 369         x[145] = x[17] = pcm[0 + 7];
 370         x[146] = x[18] = pcm[0 + 14];
 371         x[147] = x[19] = pcm[0 + 8];
 372         x[148] = x[20] = pcm[0 + 13];
 373         x[149] = x[21] = pcm[0 + 9];
 374         x[150] = x[22] = pcm[0 + 12];
 375         x[151] = x[23] = pcm[0 + 10];
 376         x[152] = x[24] = pcm[0 + 11];
 377         x[153] = x[25] = pcm[0 + 3];
 378         x[154] = x[26] = pcm[0 + 6];
 379         x[155] = x[27] = pcm[0 + 0];
 380         x[156] = x[28] = pcm[0 + 5];
 381         x[157] = x[29] = pcm[0 + 1];
 382         x[158] = x[30] = pcm[0 + 4];
 383         x[159] = x[31] = pcm[0 + 2];
 384
 385         /* Analyze blocks */
 386         sbc_analyze_eight_simd(x + 24, out, analysis_consts_fixed8_simd_odd);
 387         out += out_stride;
 388         sbc_analyze_eight_simd(x + 16, out, analysis_consts_fixed8_simd_even);
 389         out += out_stride;
 390         sbc_analyze_eight_simd(x + 8, out, analysis_consts_fixed8_simd_odd);
 391         out += out_stride;
 392         sbc_analyze_eight_simd(x + 0, out, analysis_consts_fixed8_simd_even);
 393 }
 394
 395 /*
 396  * Detect CPU features and setup function pointers
 397  */
 398 void sbc_init_primitives(struct sbc_encoder_state *state)
 399 {
 400         /* Default implementation for analyze functions */
 401         state->sbc_analyze_4b_4s = sbc_analyze_4b_4s;
 402         state->sbc_analyze_4b_8s = sbc_analyze_4b_8s;
 403
 404         /* X86/AMD64 optimizations */
 405 #ifdef SBC_BUILD_WITH_MMX_SUPPORT
 406         sbc_init_primitives_mmx(state);
 407 #endif
 408
 409         /* ARM optimizations */
 410 #ifdef SBC_BUILD_WITH_NEON_SUPPORT
 411         sbc_init_primitives_neon(state);
 412 #endif
 413 }