OSDN Git Service

90b424c16651a9611435adbde2cd97ea9d672002
[android-x86/external-bluetooth-sbc.git] / sbc / sbc_primitives_neon.c
1 /*
2  *
3  *  Bluetooth low-complexity, subband codec (SBC) library
4  *
5  *  Copyright (C) 2004-2010  Marcel Holtmann <marcel@holtmann.org>
6  *  Copyright (C) 2004-2005  Henryk Ploetz <henryk@ploetzli.ch>
7  *  Copyright (C) 2005-2006  Brad Midgley <bmidgley@xmission.com>
8  *
9  *
10  *  This library is free software; you can redistribute it and/or
11  *  modify it under the terms of the GNU Lesser General Public
12  *  License as published by the Free Software Foundation; either
13  *  version 2.1 of the License, or (at your option) any later version.
14  *
15  *  This library is distributed in the hope that it will be useful,
16  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
17  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  *  Lesser General Public License for more details.
19  *
20  *  You should have received a copy of the GNU Lesser General Public
21  *  License along with this library; if not, write to the Free Software
22  *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
23  *
24  */
25
26 #include <stdint.h>
27 #include <limits.h>
28 #include "sbc.h"
29 #include "sbc_math.h"
30 #include "sbc_tables.h"
31
32 #include "sbc_primitives_neon.h"
33
34 /*
35  * ARM NEON optimizations
36  */
37
38 #ifdef SBC_BUILD_WITH_NEON_SUPPORT
39
/*
 * Compute four 32-bit output values from one window of input samples using
 * a two-stage multiply-accumulate written in NEON inline assembly.
 *
 *   in     - input samples; five 8-element 16-bit loads are issued (40
 *            elements), each with a :64 alignment qualifier.
 *   out    - destination of a single store of four 32-bit values, issued
 *            with a :128 alignment qualifier.
 *   consts - coefficient table; 40 16-bit elements are read for the first
 *            stage and 16 more for the second, all with :128 alignment
 *            qualifiers.
 *            NOTE(review): the 16-bit loads assume FIXED_T is a 16-bit type
 *            in this build - confirm against the FIXED_T definition.
 *
 * 'in' and 'consts' are advanced by the post-incrementing loads, but they are
 * by-value parameters, so the caller's pointers are not modified.
 */
40 static inline void _sbc_analyze_four_neon(const int16_t *in, int32_t *out,
41                                                         const FIXED_T *consts)
42 {
43         /* TODO: merge even and odd cases (or even merge all four calls to this
44          * function) in order to have only aligned reads from 'in' array
45          * and reduce number of load instructions */
46         asm volatile (
                   /* Stage 1: five rounds of 8 samples x 8 coefficients.
                    * Products of lanes 0-3 accumulate in q0, lanes 4-7 in q1
                    * (32-bit accumulators). Loads are interleaved with the
                    * arithmetic, alternating between two register sets. */
47                 "vld1.16    {d4, d5}, [%0, :64]!\n"
48                 "vld1.16    {d8, d9}, [%1, :128]!\n"
49
50                 "vmull.s16  q0, d4, d8\n"
51                 "vld1.16    {d6,  d7}, [%0, :64]!\n"
52                 "vmull.s16  q1, d5, d9\n"
53                 "vld1.16    {d10, d11}, [%1, :128]!\n"
54
55                 "vmlal.s16  q0, d6, d10\n"
56                 "vld1.16    {d4, d5}, [%0, :64]!\n"
57                 "vmlal.s16  q1, d7, d11\n"
58                 "vld1.16    {d8, d9}, [%1, :128]!\n"
59
60                 "vmlal.s16  q0, d4, d8\n"
61                 "vld1.16    {d6,  d7}, [%0, :64]!\n"
62                 "vmlal.s16  q1, d5, d9\n"
63                 "vld1.16    {d10, d11}, [%1, :128]!\n"
64
65                 "vmlal.s16  q0, d6, d10\n"
66                 "vld1.16    {d4, d5}, [%0, :64]!\n"
67                 "vmlal.s16  q1, d7, d11\n"
68                 "vld1.16    {d8, d9}, [%1, :128]!\n"
69
70                 "vmlal.s16  q0, d4, d8\n"
71                 "vmlal.s16  q1, d5, d9\n"
72
                   /* Pairwise-add adjacent 32-bit lanes: q0 (d0, d1) now
                    * holds the four stage-1 sums. */
73                 "vpadd.s32  d0, d0, d1\n"
74                 "vpadd.s32  d1, d2, d3\n"
75
                   /* Rounding right shift by the immediate %3 and narrow the
                    * four sums to 16 bits in d0. */
76                 "vrshrn.s32 d0, q0, %3\n"
77
                   /* Load the 16 second-stage coefficients into d2-d5. */
78                 "vld1.16    {d2, d3, d4, d5}, [%1, :128]!\n"
79
                   /* Broadcast the 16-bit pairs: d1 = {t2,t3,t2,t3} is taken
                    * first, then d0 is overwritten with {t0,t1,t0,t1}. */
80                 "vdup.i32   d1, d0[1]\n"  /* TODO: can be eliminated */
81                 "vdup.i32   d0, d0[0]\n"  /* TODO: can be eliminated */
82
                   /* Stage 2: multiply the coefficients by the broadcast
                    * stage-1 values, accumulating in q3 and q4. */
83                 "vmull.s16  q3, d2, d0\n"
84                 "vmull.s16  q4, d3, d0\n"
85                 "vmlal.s16  q3, d4, d1\n"
86                 "vmlal.s16  q4, d5, d1\n"
87
                   /* Pairwise-add q3 (d6, d7) and q4 (d8, d9) into the four
                    * final 32-bit results. */
88                 "vpadd.s32  d0, d6, d7\n" /* TODO: can be eliminated */
89                 "vpadd.s32  d1, d8, d9\n" /* TODO: can be eliminated */
90
                   /* Store the four results; %2 is not post-incremented. */
91                 "vst1.32    {d0, d1}, [%2, :128]\n"
                   /* Operands: %0 = in and %1 = consts (read-write because of
                    * post-increment), %2 = out, %3 = shift immediate.
                    * Every d register used above is listed as clobbered. */
92                 : "+r" (in), "+r" (consts)
93                 : "r" (out),
94                         "i" (SBC_PROTO_FIXED4_SCALE)
95                 : "memory",
96                         "d0", "d1", "d2", "d3", "d4", "d5",
97                         "d6", "d7", "d8", "d9", "d10", "d11");
98 }
99
/*
 * Compute eight 32-bit output values from one window of input samples using
 * a two-stage multiply-accumulate written in NEON inline assembly.
 *
 *   in     - input samples; ten 8-element 16-bit loads are issued (80
 *            elements), each with a :64 alignment qualifier.
 *   out    - destination of a single store of eight 32-bit values, issued
 *            with a :128 alignment qualifier.
 *   consts - coefficient table; 80 16-bit elements are read for the first
 *            stage and 64 more for the second, all with :128 alignment
 *            qualifiers.
 *            NOTE(review): the 16-bit loads assume FIXED_T is a 16-bit type
 *            in this build - confirm against the FIXED_T definition.
 *
 * 'in' and 'consts' are advanced by the post-incrementing loads, but they are
 * by-value parameters, so the caller's pointers are not modified.
 */
100 static inline void _sbc_analyze_eight_neon(const int16_t *in, int32_t *out,
101                                                         const FIXED_T *consts)
102 {
103         /* TODO: merge even and odd cases (or even merge all four calls to this
104          * function) in order to have only aligned reads from 'in' array
105          * and reduce number of load instructions */
106         asm volatile (
                    /* Stage 1: ten rounds of 8 samples x 8 coefficients.
                     * Loads alternate between {d4,d5}/{d8,d9} and
                     * {d6,d7}/{d10,d11}; the former feed accumulators q6/q7,
                     * the latter q8/q9 (32-bit lanes). */
107                 "vld1.16    {d4, d5}, [%0, :64]!\n"
108                 "vld1.16    {d8, d9}, [%1, :128]!\n"
109
110                 "vmull.s16  q6, d4, d8\n"
111                 "vld1.16    {d6,  d7}, [%0, :64]!\n"
112                 "vmull.s16  q7, d5, d9\n"
113                 "vld1.16    {d10, d11}, [%1, :128]!\n"
114                 "vmull.s16  q8, d6, d10\n"
115                 "vld1.16    {d4, d5}, [%0, :64]!\n"
116                 "vmull.s16  q9, d7, d11\n"
117                 "vld1.16    {d8, d9}, [%1, :128]!\n"
118
119                 "vmlal.s16  q6, d4, d8\n"
120                 "vld1.16    {d6,  d7}, [%0, :64]!\n"
121                 "vmlal.s16  q7, d5, d9\n"
122                 "vld1.16    {d10, d11}, [%1, :128]!\n"
123                 "vmlal.s16  q8, d6, d10\n"
124                 "vld1.16    {d4, d5}, [%0, :64]!\n"
125                 "vmlal.s16  q9, d7, d11\n"
126                 "vld1.16    {d8, d9}, [%1, :128]!\n"
127
128                 "vmlal.s16  q6, d4, d8\n"
129                 "vld1.16    {d6,  d7}, [%0, :64]!\n"
130                 "vmlal.s16  q7, d5, d9\n"
131                 "vld1.16    {d10, d11}, [%1, :128]!\n"
132                 "vmlal.s16  q8, d6, d10\n"
133                 "vld1.16    {d4, d5}, [%0, :64]!\n"
134                 "vmlal.s16  q9, d7, d11\n"
135                 "vld1.16    {d8, d9}, [%1, :128]!\n"
136
137                 "vmlal.s16  q6, d4, d8\n"
138                 "vld1.16    {d6,  d7}, [%0, :64]!\n"
139                 "vmlal.s16  q7, d5, d9\n"
140                 "vld1.16    {d10, d11}, [%1, :128]!\n"
141                 "vmlal.s16  q8, d6, d10\n"
142                 "vld1.16    {d4, d5}, [%0, :64]!\n"
143                 "vmlal.s16  q9, d7, d11\n"
144                 "vld1.16    {d8, d9}, [%1, :128]!\n"
145
146                 "vmlal.s16  q6, d4, d8\n"
147                 "vld1.16    {d6,  d7}, [%0, :64]!\n"
148                 "vmlal.s16  q7, d5, d9\n"
149                 "vld1.16    {d10, d11}, [%1, :128]!\n"
150
151                 "vmlal.s16  q8, d6, d10\n"
152                 "vmlal.s16  q9, d7, d11\n"
153
                    /* Pairwise-add adjacent 32-bit lanes of q6..q9: q0 and q1
                     * now hold the eight stage-1 sums. */
154                 "vpadd.s32  d0, d12, d13\n"
155                 "vpadd.s32  d1, d14, d15\n"
156                 "vpadd.s32  d2, d16, d17\n"
157                 "vpadd.s32  d3, d18, d19\n"
158
                    /* Rounding right shift by the immediate %3, then narrow
                     * the eight sums to 16 bits in d0 and d1. */
159                 "vrshr.s32 q0, q0, %3\n"
160                 "vrshr.s32 q1, q1, %3\n"
161                 "vmovn.s32 d0, q0\n"
162                 "vmovn.s32 d1, q1\n"
163
                    /* Broadcast each 16-bit pair across a d register. The
                     * order matters: d3 and d2 are taken from d1 before d1 is
                     * overwritten, and d0 is overwritten last. */
164                 "vdup.i32   d3, d1[1]\n"  /* TODO: can be eliminated */
165                 "vdup.i32   d2, d1[0]\n"  /* TODO: can be eliminated */
166                 "vdup.i32   d1, d0[1]\n"  /* TODO: can be eliminated */
167                 "vdup.i32   d0, d0[0]\n"  /* TODO: can be eliminated */
168
                    /* Stage 2: four groups of 16 coefficients, each group
                     * multiplied by one broadcast register (d0..d3) and
                     * accumulated into q6..q9. */
169                 "vld1.16    {d4, d5}, [%1, :128]!\n"
170                 "vmull.s16  q6, d4, d0\n"
171                 "vld1.16    {d6, d7}, [%1, :128]!\n"
172                 "vmull.s16  q7, d5, d0\n"
173                 "vmull.s16  q8, d6, d0\n"
174                 "vmull.s16  q9, d7, d0\n"
175
176                 "vld1.16    {d4, d5}, [%1, :128]!\n"
177                 "vmlal.s16  q6, d4, d1\n"
178                 "vld1.16    {d6, d7}, [%1, :128]!\n"
179                 "vmlal.s16  q7, d5, d1\n"
180                 "vmlal.s16  q8, d6, d1\n"
181                 "vmlal.s16  q9, d7, d1\n"
182
183                 "vld1.16    {d4, d5}, [%1, :128]!\n"
184                 "vmlal.s16  q6, d4, d2\n"
185                 "vld1.16    {d6, d7}, [%1, :128]!\n"
186                 "vmlal.s16  q7, d5, d2\n"
187                 "vmlal.s16  q8, d6, d2\n"
188                 "vmlal.s16  q9, d7, d2\n"
189
190                 "vld1.16    {d4, d5}, [%1, :128]!\n"
191                 "vmlal.s16  q6, d4, d3\n"
192                 "vld1.16    {d6, d7}, [%1, :128]!\n"
193                 "vmlal.s16  q7, d5, d3\n"
194                 "vmlal.s16  q8, d6, d3\n"
195                 "vmlal.s16  q9, d7, d3\n"
196
                    /* Pairwise-add q6..q9 into the eight final 32-bit
                     * results held in d0-d3. */
197                 "vpadd.s32  d0, d12, d13\n" /* TODO: can be eliminated */
198                 "vpadd.s32  d1, d14, d15\n" /* TODO: can be eliminated */
199                 "vpadd.s32  d2, d16, d17\n" /* TODO: can be eliminated */
200                 "vpadd.s32  d3, d18, d19\n" /* TODO: can be eliminated */
201
                    /* Store the eight results; %2 is not post-incremented. */
202                 "vst1.32    {d0, d1, d2, d3}, [%2, :128]\n"
                    /* Operands: %0 = in and %1 = consts (read-write because
                     * of post-increment), %2 = out, %3 = shift immediate.
                     * Every d register used above is listed as clobbered. */
203                 : "+r" (in), "+r" (consts)
204                 : "r" (out),
205                         "i" (SBC_PROTO_FIXED8_SCALE)
206                 : "memory",
207                         "d0", "d1", "d2", "d3", "d4", "d5",
208                         "d6", "d7", "d8", "d9", "d10", "d11",
209                         "d12", "d13", "d14", "d15", "d16", "d17",
210                         "d18", "d19");
211 }
212
213 static inline void sbc_analyze_4b_4s_neon(int16_t *x,
214                                                 int32_t *out, int out_stride)
215 {
216         /* Analyze blocks */
217         _sbc_analyze_four_neon(x + 12, out, analysis_consts_fixed4_simd_odd);
218         out += out_stride;
219         _sbc_analyze_four_neon(x + 8, out, analysis_consts_fixed4_simd_even);
220         out += out_stride;
221         _sbc_analyze_four_neon(x + 4, out, analysis_consts_fixed4_simd_odd);
222         out += out_stride;
223         _sbc_analyze_four_neon(x + 0, out, analysis_consts_fixed4_simd_even);
224 }
225
226 static inline void sbc_analyze_4b_8s_neon(int16_t *x,
227                                                 int32_t *out, int out_stride)
228 {
229         /* Analyze blocks */
230         _sbc_analyze_eight_neon(x + 24, out, analysis_consts_fixed8_simd_odd);
231         out += out_stride;
232         _sbc_analyze_eight_neon(x + 16, out, analysis_consts_fixed8_simd_even);
233         out += out_stride;
234         _sbc_analyze_eight_neon(x + 8, out, analysis_consts_fixed8_simd_odd);
235         out += out_stride;
236         _sbc_analyze_eight_neon(x + 0, out, analysis_consts_fixed8_simd_even);
237 }
238
239 void sbc_init_primitives_neon(struct sbc_encoder_state *state)
240 {
241         state->sbc_analyze_4b_4s = sbc_analyze_4b_4s_neon;
242         state->sbc_analyze_4b_8s = sbc_analyze_4b_8s_neon;
243         state->implementation_info = "NEON";
244 }
245
246 #endif