/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17 #include <cutils/bitops.h> /* for popcount() */
18 #include <audio_utils/primitives.h>
19 #include "private/private.h"
/*
 * Packs pairs of 32-bit sums into single 32-bit words.
 *
 * For each of the `pairs` pairs read from `sums`, both values are shifted
 * right by 12 bits and passed through clamp16(). The first value of the pair
 * is stored in the low 16 bits of the output word and the second value in the
 * high 16 bits.
 *
 * out    destination, one int32_t per pair
 * sums   source, two int32_t per pair
 * pairs  number of pairs to convert
 */
void ditherAndClamp(int32_t *out, const int32_t *sums, size_t pairs)
{
    for (; pairs > 0; --pairs) {
        const int32_t l = clamp16(*sums++ >> 12);
        const int32_t r = clamp16(*sums++ >> 12);
        *out++ = (r << 16) | (l & 0xFFFF);
    }
}
/*
 * Converts `count` Q4.27 samples to 16-bit samples.
 *
 * Each source value is shifted right by 12 bits and passed through clamp16()
 * before being stored.
 */
void memcpy_to_i16_from_q4_27(int16_t *dst, const int32_t *src, size_t count)
{
    for (; count > 0; --count) {
        *dst++ = clamp16(*src++ >> 12);
    }
}
/*
 * Converts `count` unsigned 8-bit samples (centered on 0x80) to signed 16-bit.
 *
 * The buffers are walked from the last sample to the first; both pointers are
 * therefore advanced to one-past-the-end before the loop.
 * NOTE(review): back-to-front order presumably permits in-place expansion
 * when dst and src start at the same address - confirm against the header.
 */
void memcpy_to_i16_from_u8(int16_t *dst, const uint8_t *src, size_t count)
{
    dst += count;
    src += count;
    for (; count > 0; --count) {
        /* Remove the 0x80 bias, then move the value into the high byte. */
        *--dst = (int16_t)(*--src - 0x80) << 8;
    }
}
/*
 * Converts `count` signed 16-bit samples to unsigned 8-bit samples.
 *
 * Keeps the high byte of each sample and adds the 0x80 bias.
 */
void memcpy_to_u8_from_i16(uint8_t *dst, const int16_t *src, size_t count)
{
    for (; count > 0; --count) {
        *dst++ = (*src++ >> 8) + 0x80;
    }
}
/*
 * Converts `count` float samples to unsigned 8-bit samples, one at a time,
 * using clamp8_from_float().
 */
void memcpy_to_u8_from_float(uint8_t *dst, const float *src, size_t count)
{
    for (; count > 0; --count) {
        *dst++ = clamp8_from_float(*src++);
    }
}
/*
 * Converts `count` signed 32-bit samples to signed 16-bit samples by keeping
 * the upper 16 bits of each value (right shift by 16).
 */
void memcpy_to_i16_from_i32(int16_t *dst, const int32_t *src, size_t count)
{
    for (; count > 0; --count) {
        *dst++ = *src++ >> 16;
    }
}
/*
 * Converts `count` float samples to signed 16-bit samples, one at a time,
 * using clamp16_from_float().
 */
void memcpy_to_i16_from_float(int16_t *dst, const float *src, size_t count)
{
    for (; count > 0; --count) {
        *dst++ = clamp16_from_float(*src++);
    }
}
/*
 * Converts `count` Q4.27 samples to float samples, one at a time, using
 * float_from_q4_27().
 */
void memcpy_to_float_from_q4_27(float *dst, const int32_t *src, size_t count)
{
    for (; count > 0; --count) {
        *dst++ = float_from_q4_27(*src++);
    }
}
/*
 * Converts `count` signed 16-bit samples to float using float_from_i16().
 *
 * The buffers are walked from the last sample to the first; both pointers are
 * advanced to one-past-the-end before the loop.
 * NOTE(review): back-to-front order presumably permits in-place expansion
 * when dst and src start at the same address - confirm against the header.
 */
void memcpy_to_float_from_i16(float *dst, const int16_t *src, size_t count)
{
    dst += count;
    src += count;
    for (; count > 0; --count) {
        *--dst = float_from_i16(*--src);
    }
}
/*
 * Converts `count` unsigned 8-bit samples to float using float_from_u8().
 *
 * The buffers are walked from the last sample to the first; both pointers are
 * advanced to one-past-the-end before the loop.
 * NOTE(review): back-to-front order presumably permits in-place expansion
 * when dst and src start at the same address - confirm against the header.
 */
void memcpy_to_float_from_u8(float *dst, const uint8_t *src, size_t count)
{
    dst += count;
    src += count;
    for (; count > 0; --count) {
        *--dst = float_from_u8(*--src);
    }
}
/*
 * Converts `count` packed 24-bit samples (3 bytes each) to float using
 * float_from_p24(), which is handed a pointer to the first byte of a sample.
 *
 * The buffers are walked from the last sample to the first: dst is advanced
 * by `count` floats and src by `count * 3` bytes before the loop, and src
 * steps back 3 bytes per sample.
 */
void memcpy_to_float_from_p24(float *dst, const uint8_t *src, size_t count)
{
    dst += count;
    src += count * 3;
    for (; count > 0; --count) {
        src -= 3;
        *--dst = float_from_p24(src);
    }
}
/*
 * Converts `count` packed 24-bit samples (3 bytes each) to signed 16-bit
 * samples by keeping the two most significant bytes of each sample.
 *
 * Which bytes are most significant depends on HAVE_BIG_ENDIAN: bytes 0 and 1
 * when it is set, bytes 2 and 1 otherwise.
 */
void memcpy_to_i16_from_p24(int16_t *dst, const uint8_t *src, size_t count)
{
    for (; count > 0; --count) {
#if HAVE_BIG_ENDIAN
        *dst++ = src[1] | (src[0] << 8);
#else
        *dst++ = src[1] | (src[2] << 8);
#endif
        src += 3;
    }
}
/*
 * Converts `count` packed 24-bit samples (3 bytes each) to signed 32-bit
 * samples, placing the 24 bits in the upper three bytes of the result and
 * leaving the low byte zero.
 *
 * The buffers are walked from the last sample to the first: dst is advanced
 * by `count` elements and src by `count * 3` bytes before the loop.
 *
 * The bytes are combined as uint32_t so that moving the top byte into bits
 * 24..31 never shifts a 1 into the sign bit of a promoted int.
 */
void memcpy_to_i32_from_p24(int32_t *dst, const uint8_t *src, size_t count)
{
    dst += count;
    src += count * 3;
    for (; count > 0; --count) {
        src -= 3;
#if HAVE_BIG_ENDIAN
        *--dst = (int32_t)(((uint32_t)src[2] << 8) | ((uint32_t)src[1] << 16) |
                ((uint32_t)src[0] << 24));
#else
        *--dst = (int32_t)(((uint32_t)src[0] << 8) | ((uint32_t)src[1] << 16) |
                ((uint32_t)src[2] << 24));
#endif
    }
}
/*
 * Converts `count` signed 16-bit samples to packed 24-bit samples (3 bytes
 * each). The 16-bit value occupies the two most significant bytes of the
 * packed sample and the least significant byte is zero.
 *
 * The buffers are walked from the last sample to the first: dst is advanced
 * by `count * 3` bytes and src by `count` elements before the loop.
 * Byte order within a packed sample is selected by HAVE_BIG_ENDIAN.
 */
void memcpy_to_p24_from_i16(uint8_t *dst, const int16_t *src, size_t count)
{
    dst += count * 3;
    src += count;
    for (; count > 0; --count) {
        dst -= 3;
        const int16_t sample = *--src;
#if HAVE_BIG_ENDIAN
        dst[0] = sample >> 8;
        dst[1] = sample;
        dst[2] = 0;
#else
        dst[0] = 0;
        dst[1] = sample;
        dst[2] = sample >> 8;
#endif
    }
}
/*
 * Converts `count` float samples to packed 24-bit samples (3 bytes each).
 *
 * Each sample is converted with clamp24_from_float() and its low three bytes
 * are written to dst.
 * NOTE(review): the byte stores follow the layout used by
 * memcpy_to_p24_from_i16 (least significant byte first unless
 * HAVE_BIG_ENDIAN is set) - confirm.
 */
void memcpy_to_p24_from_float(uint8_t *dst, const float *src, size_t count)
{
    for (; count > 0; --count) {
        int32_t ival = clamp24_from_float(*src++);

#if HAVE_BIG_ENDIAN
        *dst++ = ival >> 16;
        *dst++ = ival >> 8;
        *dst++ = ival;
#else
        *dst++ = ival;
        *dst++ = ival >> 8;
        *dst++ = ival >> 16;
#endif
    }
}
/*
 * Converts `count` Q8.23 samples to packed 24-bit samples (3 bytes each).
 *
 * Each sample is passed through clamp24_from_q8_23() and its low three bytes
 * are written to dst.
 * NOTE(review): the byte stores follow the layout used by
 * memcpy_to_p24_from_i16 (least significant byte first unless
 * HAVE_BIG_ENDIAN is set) - confirm.
 */
void memcpy_to_p24_from_q8_23(uint8_t *dst, const int32_t *src, size_t count)
{
    for (; count > 0; --count) {
        int32_t ival = clamp24_from_q8_23(*src++);

#if HAVE_BIG_ENDIAN
        *dst++ = ival >> 16;
        *dst++ = ival >> 8;
        *dst++ = ival;
#else
        *dst++ = ival;
        *dst++ = ival >> 8;
        *dst++ = ival >> 16;
#endif
    }
}
/*
 * Converts `count` signed 32-bit samples to packed 24-bit samples (3 bytes
 * each) by dropping the low 8 bits of each sample (right shift by 8) and
 * writing the remaining three bytes to dst.
 * NOTE(review): the byte stores follow the layout used by
 * memcpy_to_p24_from_i16 (least significant byte first unless
 * HAVE_BIG_ENDIAN is set) - confirm.
 */
void memcpy_to_p24_from_i32(uint8_t *dst, const int32_t *src, size_t count)
{
    for (; count > 0; --count) {
        int32_t ival = *src++ >> 8;

#if HAVE_BIG_ENDIAN
        *dst++ = ival >> 16;
        *dst++ = ival >> 8;
        *dst++ = ival;
#else
        *dst++ = ival;
        *dst++ = ival >> 8;
        *dst++ = ival >> 16;
#endif
    }
}
/*
 * Converts `count` signed 16-bit samples to Q8.23 by widening each sample to
 * 32 bits and shifting it left by 8.
 *
 * The buffers are walked from the last sample to the first; both pointers are
 * advanced to one-past-the-end before the loop.
 * NOTE(review): back-to-front order presumably permits in-place expansion
 * when dst and src start at the same address - confirm against the header.
 */
void memcpy_to_q8_23_from_i16(int32_t *dst, const int16_t *src, size_t count)
{
    dst += count;
    src += count;
    for (; count > 0; --count) {
        *--dst = (int32_t)*--src << 8;
    }
}
/*
 * Converts `count` float samples to Q8.23 samples, one at a time, using
 * clamp24_from_float().
 */
void memcpy_to_q8_23_from_float_with_clamp(int32_t *dst, const float *src, size_t count)
{
    for (; count > 0; --count) {
        *dst++ = clamp24_from_float(*src++);
    }
}
/*
 * Converts `count` packed 24-bit samples (3 bytes each) to Q8.23.
 *
 * The most significant byte is cast to int8_t before shifting so the result
 * is sign-extended to 32 bits; the other two bytes fill bits 8..15 and 0..7.
 *
 * The buffers are walked from the last sample to the first: dst is advanced
 * by `count` elements and src by `count * 3` bytes before the loop.
 * Byte order within a packed sample is selected by HAVE_BIG_ENDIAN.
 */
void memcpy_to_q8_23_from_p24(int32_t *dst, const uint8_t *src, size_t count)
{
    dst += count;
    src += count * 3;
    for (; count > 0; --count) {
        src -= 3;
#if HAVE_BIG_ENDIAN
        *--dst = (int8_t)src[0] << 16 | src[1] << 8 | src[2];
#else
        *--dst = (int8_t)src[2] << 16 | src[1] << 8 | src[0];
#endif
    }
}
/*
 * Converts `count` float samples to Q4.27 samples, one at a time, using
 * clampq4_27_from_float().
 */
void memcpy_to_q4_27_from_float(int32_t *dst, const float *src, size_t count)
{
    for (; count > 0; --count) {
        *dst++ = clampq4_27_from_float(*src++);
    }
}
/*
 * Converts `count` Q8.23 samples to signed 16-bit samples.
 *
 * Each source value is shifted right by 8 bits and passed through clamp16()
 * before being stored.
 */
void memcpy_to_i16_from_q8_23(int16_t *dst, const int32_t *src, size_t count)
{
    for (; count > 0; --count) {
        *dst++ = clamp16(*src++ >> 8);
    }
}
/*
 * Converts `count` Q8.23 samples to float samples, one at a time, using
 * float_from_q8_23().
 */
void memcpy_to_float_from_q8_23(float *dst, const int32_t *src, size_t count)
{
    for (; count > 0; --count) {
        *dst++ = float_from_q8_23(*src++);
    }
}
/*
 * Converts `count` signed 16-bit samples to signed 32-bit samples by widening
 * each sample and shifting it left by 16.
 *
 * The buffers are walked from the last sample to the first; both pointers are
 * advanced to one-past-the-end before the loop.
 * NOTE(review): back-to-front order presumably permits in-place expansion
 * when dst and src start at the same address - confirm against the header.
 */
void memcpy_to_i32_from_i16(int32_t *dst, const int16_t *src, size_t count)
{
    dst += count;
    src += count;
    for (; count > 0; --count) {
        *--dst = (int32_t)*--src << 16;
    }
}
/*
 * Converts `count` float samples to signed 32-bit samples, one at a time,
 * using clamp32_from_float().
 */
void memcpy_to_i32_from_float(int32_t *dst, const float *src, size_t count)
{
    for (; count > 0; --count) {
        *dst++ = clamp32_from_float(*src++);
    }
}
/*
 * Converts `count` signed 32-bit samples to float samples, one at a time,
 * using float_from_i32().
 */
void memcpy_to_float_from_i32(float *dst, const int32_t *src, size_t count)
{
    for (; count > 0; --count) {
        *dst++ = float_from_i32(*src++);
    }
}
279 void memcpy_to_float_from_float_with_clamping(float *dst, const float *src, size_t count,
281 // Note: using NEON intrinsics (vminq_f32, vld1q_f32...) did NOT accelerate
282 // the function when benchmarked. The compiler already vectorize using FMINNM f32x4 & similar.
283 // Note: clamping induce a ~20% overhead compared to memcpy for count in [64, 512]
284 // See primitives_benchmark
285 for (; count > 0; --count) {
286 const float sample = *src++;
287 *dst++ = fmax(-absMax, fmin(absMax, sample));
/*
 * Downmixes `count` interleaved stereo 16-bit frames to mono.
 *
 * Each output sample is the sum of the two channel samples, computed in
 * 32 bits so it cannot overflow, shifted right by one.
 */
void downmix_to_mono_i16_from_stereo_i16(int16_t *dst, const int16_t *src, size_t count)
{
    for (; count > 0; --count) {
        *dst++ = (int16_t)(((int32_t)src[0] + (int32_t)src[1]) >> 1);
        src += 2;
    }
}
/*
 * Upmixes `count` mono 16-bit samples to interleaved stereo by writing each
 * sample to both channels of the corresponding output frame.
 *
 * The buffers are walked from the last frame to the first: dst is advanced by
 * `count * 2` elements and src by `count` elements before the loop.
 * NOTE(review): back-to-front order presumably permits in-place expansion
 * when dst and src start at the same address - confirm against the header.
 */
void upmix_to_stereo_i16_from_mono_i16(int16_t *dst, const int16_t *src, size_t count)
{
    dst += count * 2;
    src += count;
    for (; count > 0; --count) {
        const int32_t temp = *--src;
        dst -= 2;
        dst[0] = temp;
        dst[1] = temp;
    }
}
/*
 * Downmixes `frames` interleaved stereo float frames to mono.
 *
 * Each output sample is the sum of the two channel samples multiplied by 0.5.
 */
void downmix_to_mono_float_from_stereo_float(float *dst, const float *src, size_t frames)
{
    for (; frames > 0; --frames) {
        *dst++ = (src[0] + src[1]) * 0.5;
        src += 2;
    }
}
/*
 * Upmixes `frames` mono float samples to interleaved stereo by writing each
 * sample to both channels of the corresponding output frame.
 *
 * The buffers are walked from the last frame to the first: dst is advanced by
 * `frames * 2` elements and src by `frames` elements before the loop.
 * NOTE(review): back-to-front order presumably permits in-place expansion
 * when dst and src start at the same address - confirm against the header.
 */
void upmix_to_stereo_float_from_mono_float(float *dst, const float *src, size_t frames)
{
    dst += frames * 2;
    src += frames;
    for (; frames > 0; --frames) {
        const float temp = *--src;
        dst -= 2;
        dst[0] = temp;
        dst[1] = temp;
    }
}
/*
 * Returns how many of the `count` 32-bit samples are non-zero.
 */
size_t nonZeroMono32(const int32_t *samples, size_t count)
{
    size_t nonZero = 0;
    for (; count > 0; --count) {
        nonZero += *samples++ != 0;
    }
    return nonZero;
}
/*
 * Returns how many of the `count` 16-bit samples are non-zero.
 */
size_t nonZeroMono16(const int16_t *samples, size_t count)
{
    size_t nonZero = 0;
    for (; count > 0; --count) {
        nonZero += *samples++ != 0;
    }
    return nonZero;
}
/*
 * Returns how many of the `count` interleaved stereo 32-bit frames have at
 * least one non-zero channel sample.
 */
size_t nonZeroStereo32(const int32_t *frames, size_t count)
{
    size_t nonZero = 0;
    for (; count > 0; --count) {
        nonZero += frames[0] != 0 || frames[1] != 0;
        frames += 2;
    }
    return nonZero;
}
/*
 * Returns how many of the `count` interleaved stereo 16-bit frames have at
 * least one non-zero channel sample.
 */
size_t nonZeroStereo16(const int16_t *frames, size_t count)
{
    size_t nonZero = 0;
    for (; count > 0; --count) {
        nonZero += frames[0] != 0 || frames[1] != 0;
        frames += 2;
    }
    return nonZero;
}
/*
 * C macro to do channel mask copying independent of dst/src sample type.
 * Don't pass in any expressions for the macro arguments here.
 *
 * For each of `count` frames, the set bits of (dmask | smask) are visited
 * from lowest to highest:
 *   - bit in dmask and smask: the next source sample is copied to dst;
 *   - bit in dmask only:      `zero` is written to dst;
 *   - bit in smask only:      the source sample is skipped.
 * dst, src and count are modified by the macro (count ends at 0).
 */
#define copy_frame_by_mask(dst, dmask, src, smask, count, zero) \
{ \
    uint32_t bit, ormask; \
    for (; (count) > 0; --(count)) { \
        ormask = (dmask) | (smask); \
        while (ormask) { \
            bit = ormask & -ormask; /* get lowest bit */ \
            ormask ^= bit; /* remove lowest bit */ \
            if ((dmask) & bit) { \
                *(dst)++ = (smask) & bit ? *(src)++ : (zero); \
            } else { /* source channel only */ \
                ++(src); \
            } \
        } \
    } \
}
390 void memcpy_by_channel_mask(void *dst, uint32_t dst_mask,
391 const void *src, uint32_t src_mask, size_t sample_size, size_t count)
394 /* alternate way of handling memcpy_by_channel_mask by using the idxary */
396 uint32_t src_channels = popcount(src_mask);
397 uint32_t dst_channels =
398 memcpy_by_index_array_initialization(idxary, 32, dst_mask, src_mask);
400 memcpy_by_idxary(dst, dst_channels, src, src_channels, idxary, sample_size, count);
402 if (dst_mask == src_mask) {
403 memcpy(dst, src, sample_size * popcount(dst_mask) * count);
406 switch (sample_size) {
408 uint8_t *udst = (uint8_t*)dst;
409 const uint8_t *usrc = (const uint8_t*)src;
411 copy_frame_by_mask(udst, dst_mask, usrc, src_mask, count, 0);
414 uint16_t *udst = (uint16_t*)dst;
415 const uint16_t *usrc = (const uint16_t*)src;
417 copy_frame_by_mask(udst, dst_mask, usrc, src_mask, count, 0);
419 case 3: { /* could be slow. use a struct to represent 3 bytes of data. */
420 uint8x3_t *udst = (uint8x3_t*)dst;
421 const uint8x3_t *usrc = (const uint8x3_t*)src;
422 static const uint8x3_t zero; /* tricky - we use this to zero out a sample */
424 copy_frame_by_mask(udst, dst_mask, usrc, src_mask, count, zero);
427 uint32_t *udst = (uint32_t*)dst;
428 const uint32_t *usrc = (const uint32_t*)src;
430 copy_frame_by_mask(udst, dst_mask, usrc, src_mask, count, 0);
433 abort(); /* illegal value */
/*
 * C macro to do copying by index array, to rearrange samples
 * within a frame.  This is independent of src/dst sample type.
 * Don't pass in any expressions for the macro arguments here.
 *
 * For each of `count` frames, dst_channels samples are written: entry i of
 * idxary selects the source sample within the current source frame, and a
 * negative entry writes `zero` instead. src then advances by src_channels.
 * dst, src and count are modified by the macro (count ends at 0).
 */
#define copy_frame_by_idx(dst, dst_channels, src, src_channels, idxary, count, zero) \
{ \
    unsigned i; \
    int index; \
    for (; (count) > 0; --(count)) { \
        for (i = 0; i < (dst_channels); ++i) { \
            index = (idxary)[i]; \
            *(dst)++ = index < 0 ? (zero) : (src)[index]; \
        } \
        (src) += (src_channels); \
    } \
}
457 void memcpy_by_index_array(void *dst, uint32_t dst_channels,
458 const void *src, uint32_t src_channels,
459 const int8_t *idxary, size_t sample_size, size_t count)
461 switch (sample_size) {
463 uint8_t *udst = (uint8_t*)dst;
464 const uint8_t *usrc = (const uint8_t*)src;
466 copy_frame_by_idx(udst, dst_channels, usrc, src_channels, idxary, count, 0);
469 uint16_t *udst = (uint16_t*)dst;
470 const uint16_t *usrc = (const uint16_t*)src;
472 copy_frame_by_idx(udst, dst_channels, usrc, src_channels, idxary, count, 0);
474 case 3: { /* could be slow. use a struct to represent 3 bytes of data. */
475 uint8x3_t *udst = (uint8x3_t*)dst;
476 const uint8x3_t *usrc = (const uint8x3_t*)src;
477 static const uint8x3_t zero;
479 copy_frame_by_idx(udst, dst_channels, usrc, src_channels, idxary, count, zero);
482 uint32_t *udst = (uint32_t*)dst;
483 const uint32_t *usrc = (const uint32_t*)src;
485 copy_frame_by_idx(udst, dst_channels, usrc, src_channels, idxary, count, 0);
488 abort(); /* illegal value */
/*
 * Fills idxary so that destination channels take their data from the source
 * channel with the same mask bit.
 *
 * The set bits of (src_mask | dst_mask) are visited from lowest to highest
 * until the masks are exhausted or idxcount entries have been written:
 *   - bit in both masks: the entry is the running source channel index;
 *   - bit in src only:   the source channel index advances, nothing written;
 *   - bit in dst only:   the entry is -1.
 *
 * Returns the number of entries written plus the number of destination
 * channels that were not reached, so the result can exceed idxcount.
 */
size_t memcpy_by_index_array_initialization(int8_t *idxary, size_t idxcount,
        uint32_t dst_mask, uint32_t src_mask)
{
    size_t n = 0;
    int srcidx = 0;
    uint32_t bit, ormask = src_mask | dst_mask;

    while (ormask && n < idxcount) {
        bit = ormask & -ormask;          /* get lowest bit */
        ormask ^= bit;                   /* remove lowest bit */
        if (src_mask & dst_mask & bit) { /* matching channel */
            idxary[n++] = srcidx++;
        } else if (src_mask & bit) {     /* source channel only */
            ++srcidx;
        } else {                         /* destination channel only */
            idxary[n++] = -1;
        }
    }
    return n + popcount(ormask & dst_mask);
}
/*
 * Fills idxary treating src_mask as a bit field indexed by destination
 * channel position: entry i receives the next source channel index when bit
 * i of src_mask is set, and -1 otherwise.
 *
 * At most min(popcount(dst_mask), idxcount) entries are written.
 * Returns the number of entries written.
 * NOTE(review): the early return for idxcount == 0 and the per-iteration
 * mask test/shift are restored from gaps in the visible text - confirm
 * against the header contract.
 */
size_t memcpy_by_index_array_initialization_src_index(int8_t *idxary, size_t idxcount,
        uint32_t dst_mask, uint32_t src_mask) {
    size_t dst_count = popcount(dst_mask);
    if (idxcount == 0) {
        return dst_count;
    }
    if (dst_count > idxcount) {
        dst_count = idxcount;
    }

    size_t src_idx, dst_idx;
    for (src_idx = 0, dst_idx = 0; dst_idx < dst_count; ++dst_idx) {
        if (src_mask & 1) {
            idxary[dst_idx] = src_idx++;
        } else {
            idxary[dst_idx] = -1;
        }
        src_mask >>= 1;
    }
    return dst_idx;
}
/*
 * Fills idxary treating dst_mask as a bit field indexed by source channel
 * position: for every set bit of dst_mask, scanned from bit 0 upward, the
 * next entry receives that bit position when it is below the number of
 * source channels (popcount of src_mask), and -1 otherwise.
 *
 * At most min(popcount(dst_mask), idxcount) entries are written.
 * Returns the number of entries written.
 * NOTE(review): the early return for idxcount == 0 and the per-iteration
 * mask test/shift are restored from gaps in the visible text - confirm
 * against the header contract.
 */
size_t memcpy_by_index_array_initialization_dst_index(int8_t *idxary, size_t idxcount,
        uint32_t dst_mask, uint32_t src_mask) {
    size_t src_idx, dst_idx;
    size_t dst_count = __builtin_popcount(dst_mask);
    size_t src_count = __builtin_popcount(src_mask);
    if (idxcount == 0) {
        return dst_count;
    }
    if (dst_count > idxcount) {
        dst_count = idxcount;
    }
    for (src_idx = 0, dst_idx = 0; dst_idx < dst_count; ++src_idx) {
        if (dst_mask & 1) {
            idxary[dst_idx++] = src_idx < src_count ? (signed)src_idx : -1;
        }
        dst_mask >>= 1;
    }
    return dst_idx;
}
/*
 * Adds `count` signed 16-bit samples from src into dst in place.
 *
 * Each sum is computed in 32 bits and passed through clamp16() before being
 * stored back.
 */
void accumulate_i16(int16_t *dst, const int16_t *src, size_t count) {
    for (; count > 0; --count) {
        *dst = clamp16((int32_t)*dst + *src++);
        ++dst;
    }
}
/*
 * Adds `count` unsigned 8-bit samples from src into dst in place, saturating
 * the result to [0, 0xff].
 */
void accumulate_u8(uint8_t *dst, const uint8_t *src, size_t count) {
    int32_t sum;
    for (; count > 0; --count) {
        // 8-bit samples are centered around 0x80.
        sum = *dst + *src++ - 0x80;
        // Clamp to [0, 0xff].
        // Bit 8 is set only when sum is out of range: ~sum >> 9 then yields
        // 0 for a negative sum and -1 (stored as 0xff) for a sum above 0xff.
        *dst++ = (sum & 0x100) ? (~sum >> 9) : sum;
    }
}
/*
 * Adds `count` packed 24-bit samples (3 bytes each) from src into dst in
 * place.
 *
 * Each pair of samples is unpacked to Q8.23 with memcpy_to_q8_23_from_p24(),
 * summed, and written back with memcpy_to_p24_from_q8_23().
 */
void accumulate_p24(uint8_t *dst, const uint8_t *src, size_t count) {
    for (; count > 0; --count) {
        // Unpack.
        int32_t dst_q8_23 = 0;
        int32_t src_q8_23 = 0;
        memcpy_to_q8_23_from_p24(&dst_q8_23, dst, 1);
        memcpy_to_q8_23_from_p24(&src_q8_23, src, 1);

        // Accumulate and overwrite.
        dst_q8_23 += src_q8_23;
        memcpy_to_p24_from_q8_23(dst, &dst_q8_23, 1);

        // Move on to next sample.
        dst += 3;
        src += 3;
    }
}
/*
 * Adds `count` Q8.23 samples from src into dst in place, passing each sum
 * through clamp24_from_q8_23() before storing it back.
 */
void accumulate_q8_23(int32_t *dst, const int32_t *src, size_t count) {
    for (; count > 0; --count) {
        *dst = clamp24_from_q8_23(*dst + *src++);
        ++dst;
    }
}
/*
 * Adds `count` signed 32-bit samples from src into dst in place.
 *
 * Each sum is computed in 64 bits and passed through clamp32() before being
 * stored back.
 */
void accumulate_i32(int32_t *dst, const int32_t *src, size_t count) {
    for (; count > 0; --count) {
        *dst = clamp32((int64_t)*dst + *src++);
        ++dst;
    }
}
605 void accumulate_float(float *dst, const float *src, size_t count) {
606 for (; count > 0; --count) {