/*
 *
 *  Bluetooth low-complexity, subband codec (SBC) library
 *
 *  Copyright (C) 2004-2009  Marcel Holtmann <marcel@holtmann.org>
 *  Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
 *  Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
 *
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public
 *  License along with this library; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 *
 */
30 #include "sbc_tables.h"
32 #include "sbc_primitives.h"
33 #include "sbc_primitives_mmx.h"
34 #include "sbc_primitives_neon.h"
37 * A standard C code of analysis filter.
39 static inline void sbc_analyze_four(const int16_t *in, int32_t *out)
45 /* rounding coefficient */
46 t1[0] = t1[1] = t1[2] = t1[3] =
47 (FIXED_A) 1 << (SBC_PROTO_FIXED4_SCALE - 1);
49 /* low pass polyphase filter */
50 for (hop = 0; hop < 40; hop += 8) {
51 t1[0] += (FIXED_A) in[hop] * _sbc_proto_fixed4[hop];
52 t1[1] += (FIXED_A) in[hop + 1] * _sbc_proto_fixed4[hop + 1];
53 t1[2] += (FIXED_A) in[hop + 2] * _sbc_proto_fixed4[hop + 2];
54 t1[1] += (FIXED_A) in[hop + 3] * _sbc_proto_fixed4[hop + 3];
55 t1[0] += (FIXED_A) in[hop + 4] * _sbc_proto_fixed4[hop + 4];
56 t1[3] += (FIXED_A) in[hop + 5] * _sbc_proto_fixed4[hop + 5];
57 t1[3] += (FIXED_A) in[hop + 7] * _sbc_proto_fixed4[hop + 7];
61 t2[0] = t1[0] >> SBC_PROTO_FIXED4_SCALE;
62 t2[1] = t1[1] >> SBC_PROTO_FIXED4_SCALE;
63 t2[2] = t1[2] >> SBC_PROTO_FIXED4_SCALE;
64 t2[3] = t1[3] >> SBC_PROTO_FIXED4_SCALE;
66 /* do the cos transform */
67 for (i = 0, hop = 0; i < 4; hop += 8, i++) {
68 out[i] = ((FIXED_A) t2[0] * cos_table_fixed_4[0 + hop] +
69 (FIXED_A) t2[1] * cos_table_fixed_4[1 + hop] +
70 (FIXED_A) t2[2] * cos_table_fixed_4[2 + hop] +
71 (FIXED_A) t2[3] * cos_table_fixed_4[5 + hop]) >>
72 (SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS);
/*
 * Load 16 PCM samples and run the 4-subband analysis four times.
 *
 * pcm:        pcm[0] .. pcm[15] are read.
 * x:          work buffer; the samples are stored in reverse order at
 *             x[0] .. x[15] and the same values again at x[64] .. x[79].
 * out:        destination of the first analysis result.
 * out_stride: distance (in int32_t elements) between consecutive results.
 *
 * NOTE(review): the "out += out_stride" steps between the calls and the
 * declaration of i were missing from the reviewed text. They are restored
 * here because out_stride is otherwise unused -- confirm against upstream.
 */
static void sbc_analyze_4b_4s(int16_t *pcm, int16_t *x,
			      int32_t *out, int out_stride)
{
	int i;

	/* Input 4 x 4 Audio Samples */
	for (i = 0; i < 16; i += 4) {
		x[64 + i] = x[0 + i] = pcm[15 - i];
		x[65 + i] = x[1 + i] = pcm[14 - i];
		x[66 + i] = x[2 + i] = pcm[13 - i];
		x[67 + i] = x[3 + i] = pcm[12 - i];
	}

	/* Analyze four blocks */
	sbc_analyze_four(x + 12, out);
	out += out_stride;
	sbc_analyze_four(x + 8, out);
	out += out_stride;
	sbc_analyze_four(x + 4, out);
	out += out_stride;
	sbc_analyze_four(x, out);
}
99 static inline void sbc_analyze_eight(const int16_t *in, int32_t *out)
105 /* rounding coefficient */
106 t1[0] = t1[1] = t1[2] = t1[3] = t1[4] = t1[5] = t1[6] = t1[7] =
107 (FIXED_A) 1 << (SBC_PROTO_FIXED8_SCALE-1);
109 /* low pass polyphase filter */
110 for (hop = 0; hop < 80; hop += 16) {
111 t1[0] += (FIXED_A) in[hop] * _sbc_proto_fixed8[hop];
112 t1[1] += (FIXED_A) in[hop + 1] * _sbc_proto_fixed8[hop + 1];
113 t1[2] += (FIXED_A) in[hop + 2] * _sbc_proto_fixed8[hop + 2];
114 t1[3] += (FIXED_A) in[hop + 3] * _sbc_proto_fixed8[hop + 3];
115 t1[4] += (FIXED_A) in[hop + 4] * _sbc_proto_fixed8[hop + 4];
116 t1[3] += (FIXED_A) in[hop + 5] * _sbc_proto_fixed8[hop + 5];
117 t1[2] += (FIXED_A) in[hop + 6] * _sbc_proto_fixed8[hop + 6];
118 t1[1] += (FIXED_A) in[hop + 7] * _sbc_proto_fixed8[hop + 7];
119 t1[0] += (FIXED_A) in[hop + 8] * _sbc_proto_fixed8[hop + 8];
120 t1[5] += (FIXED_A) in[hop + 9] * _sbc_proto_fixed8[hop + 9];
121 t1[6] += (FIXED_A) in[hop + 10] * _sbc_proto_fixed8[hop + 10];
122 t1[7] += (FIXED_A) in[hop + 11] * _sbc_proto_fixed8[hop + 11];
123 t1[7] += (FIXED_A) in[hop + 13] * _sbc_proto_fixed8[hop + 13];
124 t1[6] += (FIXED_A) in[hop + 14] * _sbc_proto_fixed8[hop + 14];
125 t1[5] += (FIXED_A) in[hop + 15] * _sbc_proto_fixed8[hop + 15];
129 t2[0] = t1[0] >> SBC_PROTO_FIXED8_SCALE;
130 t2[1] = t1[1] >> SBC_PROTO_FIXED8_SCALE;
131 t2[2] = t1[2] >> SBC_PROTO_FIXED8_SCALE;
132 t2[3] = t1[3] >> SBC_PROTO_FIXED8_SCALE;
133 t2[4] = t1[4] >> SBC_PROTO_FIXED8_SCALE;
134 t2[5] = t1[5] >> SBC_PROTO_FIXED8_SCALE;
135 t2[6] = t1[6] >> SBC_PROTO_FIXED8_SCALE;
136 t2[7] = t1[7] >> SBC_PROTO_FIXED8_SCALE;
138 /* do the cos transform */
139 for (i = 0, hop = 0; i < 8; hop += 16, i++) {
140 out[i] = ((FIXED_A) t2[0] * cos_table_fixed_8[0 + hop] +
141 (FIXED_A) t2[1] * cos_table_fixed_8[1 + hop] +
142 (FIXED_A) t2[2] * cos_table_fixed_8[2 + hop] +
143 (FIXED_A) t2[3] * cos_table_fixed_8[3 + hop] +
144 (FIXED_A) t2[4] * cos_table_fixed_8[4 + hop] +
145 (FIXED_A) t2[5] * cos_table_fixed_8[9 + hop] +
146 (FIXED_A) t2[6] * cos_table_fixed_8[10 + hop] +
147 (FIXED_A) t2[7] * cos_table_fixed_8[11 + hop]) >>
148 (SBC_COS_TABLE_FIXED8_SCALE - SCALE_OUT_BITS);
/*
 * Load 32 PCM samples and run the 8-subband analysis four times.
 *
 * pcm:        pcm[0] .. pcm[31] are read.
 * x:          work buffer; the samples are stored in reverse order at
 *             x[0] .. x[31] and the same values again at x[128] .. x[159].
 * out:        destination of the first analysis result.
 * out_stride: distance (in int32_t elements) between consecutive results.
 *
 * NOTE(review): the "out += out_stride" steps between the calls and the
 * declaration of i were missing from the reviewed text. They are restored
 * here because out_stride is otherwise unused -- confirm against upstream.
 */
static void sbc_analyze_4b_8s(int16_t *pcm, int16_t *x,
			      int32_t *out, int out_stride)
{
	int i;

	/* Input 4 x 8 Audio Samples */
	for (i = 0; i < 32; i += 8) {
		x[128 + i] = x[0 + i] = pcm[31 - i];
		x[129 + i] = x[1 + i] = pcm[30 - i];
		x[130 + i] = x[2 + i] = pcm[29 - i];
		x[131 + i] = x[3 + i] = pcm[28 - i];
		x[132 + i] = x[4 + i] = pcm[27 - i];
		x[133 + i] = x[5 + i] = pcm[26 - i];
		x[134 + i] = x[6 + i] = pcm[25 - i];
		x[135 + i] = x[7 + i] = pcm[24 - i];
	}

	/* Analyze four blocks */
	sbc_analyze_eight(x + 24, out);
	out += out_stride;
	sbc_analyze_eight(x + 16, out);
	out += out_stride;
	sbc_analyze_eight(x + 8, out);
	out += out_stride;
	sbc_analyze_eight(x, out);
}
180 * A reference C code of analysis filter with SIMD-friendly tables
181 * reordering and code layout. This code can be used to develop platform
182 * specific SIMD optimizations. Also it may be used as some kind of test
183 * for compiler autovectorization capabilities (who knows, if the compiler
184 * is very good at this stuff, hand optimized assembly may be not strictly
185 * needed for some platform).
188 static inline void sbc_analyze_four_simd(const int16_t *in, int32_t *out,
189 const FIXED_T *consts)
195 /* rounding coefficient */
196 t1[0] = t1[1] = t1[2] = t1[3] =
197 (FIXED_A) 1 << (SBC_PROTO_FIXED4_SCALE - 1);
199 /* low pass polyphase filter */
200 for (hop = 0; hop < 40; hop += 8) {
201 t1[0] += (FIXED_A) in[hop] * consts[hop];
202 t1[0] += (FIXED_A) in[hop + 1] * consts[hop + 1];
203 t1[1] += (FIXED_A) in[hop + 2] * consts[hop + 2];
204 t1[1] += (FIXED_A) in[hop + 3] * consts[hop + 3];
205 t1[2] += (FIXED_A) in[hop + 4] * consts[hop + 4];
206 t1[2] += (FIXED_A) in[hop + 5] * consts[hop + 5];
207 t1[3] += (FIXED_A) in[hop + 6] * consts[hop + 6];
208 t1[3] += (FIXED_A) in[hop + 7] * consts[hop + 7];
212 t2[0] = t1[0] >> SBC_PROTO_FIXED4_SCALE;
213 t2[1] = t1[1] >> SBC_PROTO_FIXED4_SCALE;
214 t2[2] = t1[2] >> SBC_PROTO_FIXED4_SCALE;
215 t2[3] = t1[3] >> SBC_PROTO_FIXED4_SCALE;
217 /* do the cos transform */
218 t1[0] = (FIXED_A) t2[0] * consts[40 + 0];
219 t1[0] += (FIXED_A) t2[1] * consts[40 + 1];
220 t1[1] = (FIXED_A) t2[0] * consts[40 + 2];
221 t1[1] += (FIXED_A) t2[1] * consts[40 + 3];
222 t1[2] = (FIXED_A) t2[0] * consts[40 + 4];
223 t1[2] += (FIXED_A) t2[1] * consts[40 + 5];
224 t1[3] = (FIXED_A) t2[0] * consts[40 + 6];
225 t1[3] += (FIXED_A) t2[1] * consts[40 + 7];
227 t1[0] += (FIXED_A) t2[2] * consts[40 + 8];
228 t1[0] += (FIXED_A) t2[3] * consts[40 + 9];
229 t1[1] += (FIXED_A) t2[2] * consts[40 + 10];
230 t1[1] += (FIXED_A) t2[3] * consts[40 + 11];
231 t1[2] += (FIXED_A) t2[2] * consts[40 + 12];
232 t1[2] += (FIXED_A) t2[3] * consts[40 + 13];
233 t1[3] += (FIXED_A) t2[2] * consts[40 + 14];
234 t1[3] += (FIXED_A) t2[3] * consts[40 + 15];
237 (SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS);
239 (SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS);
241 (SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS);
243 (SBC_COS_TABLE_FIXED4_SCALE - SCALE_OUT_BITS);
246 static inline void sbc_analyze_eight_simd(const int16_t *in, int32_t *out,
247 const FIXED_T *consts)
253 /* rounding coefficient */
254 t1[0] = t1[1] = t1[2] = t1[3] = t1[4] = t1[5] = t1[6] = t1[7] =
255 (FIXED_A) 1 << (SBC_PROTO_FIXED8_SCALE-1);
257 /* low pass polyphase filter */
258 for (hop = 0; hop < 80; hop += 16) {
259 t1[0] += (FIXED_A) in[hop] * consts[hop];
260 t1[0] += (FIXED_A) in[hop + 1] * consts[hop + 1];
261 t1[1] += (FIXED_A) in[hop + 2] * consts[hop + 2];
262 t1[1] += (FIXED_A) in[hop + 3] * consts[hop + 3];
263 t1[2] += (FIXED_A) in[hop + 4] * consts[hop + 4];
264 t1[2] += (FIXED_A) in[hop + 5] * consts[hop + 5];
265 t1[3] += (FIXED_A) in[hop + 6] * consts[hop + 6];
266 t1[3] += (FIXED_A) in[hop + 7] * consts[hop + 7];
267 t1[4] += (FIXED_A) in[hop + 8] * consts[hop + 8];
268 t1[4] += (FIXED_A) in[hop + 9] * consts[hop + 9];
269 t1[5] += (FIXED_A) in[hop + 10] * consts[hop + 10];
270 t1[5] += (FIXED_A) in[hop + 11] * consts[hop + 11];
271 t1[6] += (FIXED_A) in[hop + 12] * consts[hop + 12];
272 t1[6] += (FIXED_A) in[hop + 13] * consts[hop + 13];
273 t1[7] += (FIXED_A) in[hop + 14] * consts[hop + 14];
274 t1[7] += (FIXED_A) in[hop + 15] * consts[hop + 15];
278 t2[0] = t1[0] >> SBC_PROTO_FIXED8_SCALE;
279 t2[1] = t1[1] >> SBC_PROTO_FIXED8_SCALE;
280 t2[2] = t1[2] >> SBC_PROTO_FIXED8_SCALE;
281 t2[3] = t1[3] >> SBC_PROTO_FIXED8_SCALE;
282 t2[4] = t1[4] >> SBC_PROTO_FIXED8_SCALE;
283 t2[5] = t1[5] >> SBC_PROTO_FIXED8_SCALE;
284 t2[6] = t1[6] >> SBC_PROTO_FIXED8_SCALE;
285 t2[7] = t1[7] >> SBC_PROTO_FIXED8_SCALE;
288 /* do the cos transform */
289 t1[0] = t1[1] = t1[2] = t1[3] = t1[4] = t1[5] = t1[6] = t1[7] = 0;
291 for (i = 0; i < 4; i++) {
292 t1[0] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 0];
293 t1[0] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 1];
294 t1[1] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 2];
295 t1[1] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 3];
296 t1[2] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 4];
297 t1[2] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 5];
298 t1[3] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 6];
299 t1[3] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 7];
300 t1[4] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 8];
301 t1[4] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 9];
302 t1[5] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 10];
303 t1[5] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 11];
304 t1[6] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 12];
305 t1[6] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 13];
306 t1[7] += (FIXED_A) t2[i * 2 + 0] * consts[80 + i * 16 + 14];
307 t1[7] += (FIXED_A) t2[i * 2 + 1] * consts[80 + i * 16 + 15];
310 for (i = 0; i < 8; i++)
312 (SBC_COS_TABLE_FIXED8_SCALE - SCALE_OUT_BITS);
315 static inline void sbc_analyze_4b_4s_simd(int16_t *pcm, int16_t *x,
316 int32_t *out, int out_stride)
318 /* Fetch audio samples and do input data reordering for SIMD */
319 x[64] = x[0] = pcm[8 + 7];
320 x[65] = x[1] = pcm[8 + 3];
321 x[66] = x[2] = pcm[8 + 6];
322 x[67] = x[3] = pcm[8 + 4];
323 x[68] = x[4] = pcm[8 + 0];
324 x[69] = x[5] = pcm[8 + 2];
325 x[70] = x[6] = pcm[8 + 1];
326 x[71] = x[7] = pcm[8 + 5];
328 x[72] = x[8] = pcm[0 + 7];
329 x[73] = x[9] = pcm[0 + 3];
330 x[74] = x[10] = pcm[0 + 6];
331 x[75] = x[11] = pcm[0 + 4];
332 x[76] = x[12] = pcm[0 + 0];
333 x[77] = x[13] = pcm[0 + 2];
334 x[78] = x[14] = pcm[0 + 1];
335 x[79] = x[15] = pcm[0 + 5];
338 sbc_analyze_four_simd(x + 12, out, analysis_consts_fixed4_simd_odd);
340 sbc_analyze_four_simd(x + 8, out, analysis_consts_fixed4_simd_even);
342 sbc_analyze_four_simd(x + 4, out, analysis_consts_fixed4_simd_odd);
344 sbc_analyze_four_simd(x + 0, out, analysis_consts_fixed4_simd_even);
347 static inline void sbc_analyze_4b_8s_simd(int16_t *pcm, int16_t *x,
348 int32_t *out, int out_stride)
350 /* Fetch audio samples and do input data reordering for SIMD */
351 x[128] = x[0] = pcm[16 + 15];
352 x[129] = x[1] = pcm[16 + 7];
353 x[130] = x[2] = pcm[16 + 14];
354 x[131] = x[3] = pcm[16 + 8];
355 x[132] = x[4] = pcm[16 + 13];
356 x[133] = x[5] = pcm[16 + 9];
357 x[134] = x[6] = pcm[16 + 12];
358 x[135] = x[7] = pcm[16 + 10];
359 x[136] = x[8] = pcm[16 + 11];
360 x[137] = x[9] = pcm[16 + 3];
361 x[138] = x[10] = pcm[16 + 6];
362 x[139] = x[11] = pcm[16 + 0];
363 x[140] = x[12] = pcm[16 + 5];
364 x[141] = x[13] = pcm[16 + 1];
365 x[142] = x[14] = pcm[16 + 4];
366 x[143] = x[15] = pcm[16 + 2];
368 x[144] = x[16] = pcm[0 + 15];
369 x[145] = x[17] = pcm[0 + 7];
370 x[146] = x[18] = pcm[0 + 14];
371 x[147] = x[19] = pcm[0 + 8];
372 x[148] = x[20] = pcm[0 + 13];
373 x[149] = x[21] = pcm[0 + 9];
374 x[150] = x[22] = pcm[0 + 12];
375 x[151] = x[23] = pcm[0 + 10];
376 x[152] = x[24] = pcm[0 + 11];
377 x[153] = x[25] = pcm[0 + 3];
378 x[154] = x[26] = pcm[0 + 6];
379 x[155] = x[27] = pcm[0 + 0];
380 x[156] = x[28] = pcm[0 + 5];
381 x[157] = x[29] = pcm[0 + 1];
382 x[158] = x[30] = pcm[0 + 4];
383 x[159] = x[31] = pcm[0 + 2];
386 sbc_analyze_eight_simd(x + 24, out, analysis_consts_fixed8_simd_odd);
388 sbc_analyze_eight_simd(x + 16, out, analysis_consts_fixed8_simd_even);
390 sbc_analyze_eight_simd(x + 8, out, analysis_consts_fixed8_simd_odd);
392 sbc_analyze_eight_simd(x + 0, out, analysis_consts_fixed8_simd_even);
396 * Detect CPU features and setup function pointers
398 void sbc_init_primitives(struct sbc_encoder_state *state)
400 /* Default implementation for analyze functions */
401 state->sbc_analyze_4b_4s = sbc_analyze_4b_4s;
402 state->sbc_analyze_4b_8s = sbc_analyze_4b_8s;
404 /* X86/AMD64 optimizations */
405 #ifdef SBC_BUILD_WITH_MMX_SUPPORT
406 sbc_init_primitives_mmx(state);
409 /* ARM optimizations */
410 #ifdef SBC_BUILD_WITH_NEON_SUPPORT
411 sbc_init_primitives_neon(state);