8 #ifndef __builtin_expect
9 #include <linux/compiler.h>
/*
 * rdtscll(val): execute the x86 `rdtsc` instruction and store its result in
 * val through the "=A" output constraint.
 * NOTE(review): GCC documents "A" as the edx:eax register pair on 32-bit
 * x86; confirm the intended target before building for anything else.
 */
12 #define rdtscll(val) \
13 __asm__ __volatile__("rdtsc" : "=A" (val))
/* Branch hints: tell the compiler the condition is expected to be 1 / 0. */
15 #define likely(x) __builtin_expect((x),1)
16 #define unlikely(x) __builtin_expect((x),0)
/* Sample type of the dst/src buffers handled by the mixing routines below. */
18 typedef short int s16;
/*
 * String pasted in front of the instruction text in the inline asm below:
 * either the x86 `lock` prefix or nothing.
 * NOTE(review): the preprocessor condition choosing between the two
 * definitions is not visible here; presumably SMP vs. UP builds, matching the
 * "(SMP)"/"(UP)" messages printed by main() - confirm.
 */
26 #define LOCK_PREFIX "lock ; "
28 #define LOCK_PREFIX ""
/*
 * __xg(x) reinterprets x as a pointer to struct __xchg_dummy; __cmpxchg()
 * dereferences it to form the "m" memory operand of its asm statements.
 */
31 struct __xchg_dummy { unsigned long a[100]; };
32 #define __xg(x) ((struct __xchg_dummy *)(x))
/*
 * Compare-and-exchange helper built on the x86 cmpxchg instruction.
 *
 * ptr:  memory location operated on (passed to the asm as "m"(*__xg(ptr)))
 * old:  value tied to output operand 0 via the "0" constraint
 * new:  replacement value, placed in a "q" register
 * size: operand width in bytes; the cmpxchg() macro passes sizeof(*(ptr))
 *
 * NOTE(review): the code that picks one of the three asm variants from
 * `size`, and the return statement, are not visible here - presumably the
 * byte/word/long forms correspond to sizes 1/2/4; confirm.
 */
34 static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
35 unsigned long new, int size)
/* Byte-wide variant (%b1 selects the byte form of operand 1). */
40 __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2"
42 : "q"(new), "m"(*__xg(ptr)), "0"(old)
/* Word-wide variant (%w1 selects the 16-bit form of operand 1). */
46 __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2"
48 : "q"(new), "m"(*__xg(ptr)), "0"(old)
/* Long-wide variant. */
52 __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2"
54 : "q"(new), "m"(*__xg(ptr)), "0"(old)
/*
 * cmpxchg(ptr, o, n): type-generic front end for __cmpxchg(). Passes o and n
 * as unsigned long, takes the operand size from sizeof(*(ptr)), and casts the
 * result back to the type ptr points to.
 */
61 #define cmpxchg(ptr,o,n)\
62 ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\
63 (unsigned long)(n),sizeof(*(ptr))))
/*
 * Add v to *dst with a single `addl` instruction, prefixed with LOCK_PREFIX.
 *
 * dst: integer updated in place (memory operand)
 * v:   addend (immediate or register operand, per the "ir" constraint)
 */
65 static inline void atomic_add(volatile int *dst, int v)
68 LOCK_PREFIX "addl %1,%0"
70 :"ir" (v), "m" (*dst));
/*
 * Estimate how many time-stamp-counter ticks elapse per second.
 *
 * Returns the tick difference (tsc_end - tsc_begin) divided by the wall-clock
 * time in seconds between the tm_begin and tm_end samples.
 * NOTE(review): the statements that fill tsc_begin/tsc_end and the delay
 * between the two measurements are not visible here.
 */
73 static double detect_cpu_clock()
75 struct timeval tm_begin, tm_end;
76 unsigned long long tsc_begin, tsc_end;
/*
 * tm_begin is sampled twice; the second call overwrites the first.
 * NOTE(review): presumably the first call only warms things up - confirm.
 */
79 gettimeofday(&tm_begin, 0);
82 gettimeofday(&tm_begin, 0);
87 gettimeofday(&tm_end, 0);
/* Elapsed seconds = whole-second difference + microsecond difference / 1e6. */
89 return (tsc_end - tsc_begin) / (tm_end.tv_sec - tm_begin.tv_sec + (tm_end.tv_usec - tm_begin.tv_usec) / 1e6);
/*
 * Accumulate source samples into the sum buffer using atomic_add().
 *
 * size:     NOTE(review): presumably the number of samples to process; the
 *           loop that uses it is not visible here - confirm.
 * src_step: source stride in bytes (converted to elements below)
 * sum_step: sum stride in bytes (converted to elements below)
 */
92 void mix_areas_srv(unsigned int size,
95 unsigned int src_step, unsigned int sum_step)
/* Convert the byte strides into element strides for pointer arithmetic. */
97 src_step /= sizeof(*src);
98 sum_step /= sizeof(*sum);
/* Add the current source sample to the current accumulator slot. */
100 atomic_add(sum, *src);
/*
 * Produce s16 output in dst from the s32 values in sum, with range checks
 * against -0x8000 and 0x7fff.
 *
 * size:     NOTE(review): presumably the number of samples; the loop is not
 *           visible here - confirm.
 * dst:      output buffer
 * sum:      input accumulator buffer (read-only)
 * dst_step: dst stride in bytes (converted to elements below)
 * sum_step: sum stride in bytes (converted to elements below)
 */
106 void saturate(unsigned int size,
107 s16 *dst, const s32 *sum,
108 unsigned int dst_step, unsigned int sum_step)
/* Convert the byte strides into element strides. */
110 dst_step /= sizeof(*dst);
111 sum_step /= sizeof(*sum);
/*
 * Out-of-range tests, both marked as the unexpected case.
 * NOTE(review): the stores that follow each test are not visible; presumably
 * the value is clamped to the tested bound - confirm.
 */
114 if (unlikely(sample < -0x8000))
116 else if (unlikely(sample > 0x7fff))
/*
 * Mixing variant that adds each source sample to the value already in dst.
 *
 * size:     NOTE(review): presumably the number of samples; the loop is not
 *           visible here - confirm.
 * dst:      destination buffer, read and updated
 * src:      source samples
 * dst_step, src_step, sum_step: strides in bytes (converted to elements below)
 */
125 void mix_areas0(unsigned int size,
126 volatile s16 *dst, s16 *src,
128 unsigned int dst_step,
129 unsigned int src_step,
130 unsigned int sum_step)
/* Convert the byte strides into element strides. */
132 dst_step /= sizeof(*dst);
133 src_step /= sizeof(*src);
134 sum_step /= sizeof(*sum);
/* Sum in the wider s32 type so the range tests below can see overflow. */
136 s32 sample = *dst + *src;
/*
 * Out-of-range tests, both marked as the unexpected case.
 * NOTE(review): the stores that follow each test are not visible here.
 */
137 if (unlikely(sample < -0x8000))
139 else if (unlikely(sample > 0x7fff))
/*
 * Set the name macros before including the i386 dmix header, then undefine
 * them again.
 * NOTE(review): presumably the header uses MIX_AREAS1 / MIX_AREAS1_MMX as the
 * names of the functions it defines, which is how mix_areas1() and
 * mix_areas1_mmx() called from main() come into existence - confirm against
 * pcm_dmix_i386.h.
 */
149 #define MIX_AREAS1 mix_areas1
150 #define MIX_AREAS1_MMX mix_areas1_mmx
151 #include "../src/pcm/pcm_dmix_i386.h"
153 #undef MIX_AREAS1_MMX
/*
 * Mixing variant built from cmpxchg() and atomic_add(), with a retry loop
 * that repeats until the sample it worked on equals *sum.
 *
 * size:     NOTE(review): presumably the number of samples; the outer loop is
 *           not visible here - confirm.
 * dst:      destination buffer
 * src:      source samples (read-only)
 * dst_step, src_step: strides in bytes (converted to elements below)
 */
155 void mix_areas2(unsigned int size,
156 volatile s16 *dst, const s16 *src,
158 unsigned int dst_step,
159 unsigned int src_step)
/* Convert the byte strides into element strides. */
161 dst_step /= sizeof(*dst);
162 src_step /= sizeof(*src);
/* Snapshot the accumulator before touching dst. */
165 s32 old_sample = *sum;
/*
 * Try to change *dst from 0 to 1. When cmpxchg() reports 0, the snapshot is
 * subtracted from the sample before it is added to the accumulator.
 * NOTE(review): presumably this discards a stale accumulator value when dst
 * was still empty - confirm.
 */
166 if (cmpxchg(dst, 0, 1) == 0)
167 sample -= old_sample;
168 atomic_add(sum, sample);
/* Out-of-range tests; the stores that follow each test are not visible here. */
171 if (unlikely(sample < -0x8000))
173 else if (unlikely(sample > 0x7fff))
/* Repeat while *sum no longer equals the sample just processed. */
177 } while (unlikely(sample != *sum));
/*
 * Request the SCHED_RR scheduling policy at its maximum priority for the
 * calling process (pid 0) and print the outcome.
 */
184 void setscheduler(void)
186 struct sched_param sched_param;
/* Fetch the current parameters; report if that fails. */
188 if (sched_getparam(0, &sched_param) < 0) {
189 printf("Scheduler getparam failed...\n");
/* Use the highest priority available to SCHED_RR. */
192 sched_param.sched_priority = sched_get_priority_max(SCHED_RR);
/* A zero return from sched_setscheduler() is treated as success. */
193 if (!sched_setscheduler(0, SCHED_RR, &sched_param)) {
194 printf("Scheduler set to Round Robin with priority %i...\n", sched_param.sched_priority);
/*
 * Failure message for the SCHED_RR request.
 * NOTE(review): the control flow separating the success and failure paths is
 * not visible here.
 */
198 printf("!!!Scheduler set to Round Robin with priority %i FAILED!!!\n", sched_param.sched_priority);
/*
 * Size in bytes of the scratch buffer allocated by init(). Defaults to
 * 1024*1024; main() can overwrite it with atoi(argv[4]) * 1024.
 */
201 int cache_size = 1024*1024;
/*
 * Per-iteration setup run before each timed measurement in main().
 *
 * dst:  destination sample buffer
 * sum:  accumulator buffer
 * size: iteration count of the first two loops
 */
203 void init(s16 *dst, s32 *sum, int size)
/*
 * Two countdown passes of `size` iterations each.
 * NOTE(review): the loop bodies are not visible; presumably they reset sum
 * and dst - confirm.
 */
208 for (count = size - 1; count >= 0; count--)
210 for (count = size - 1; count >= 0; count--)
/*
 * Allocate cache_size bytes and write every one of them.
 * NOTE(review): presumably done to push the sample buffers out of the CPU
 * cache (hence the name cache_size) - confirm. The malloc() result is not
 * checked in the visible lines.
 */
212 a = malloc(cache_size);
213 for (count = cache_size - 1; count >= 0; count--) {
214 a[count] = count & 0xff;
/*
 * Benchmark driver: fills n source buffers with random samples, runs each
 * mixing routine LOOP times, and prints a per-routine summary.
 *
 * Command line (as far as visible here): argv[1] overrides size and argv[4]
 * sets cache_size in units of 1024 bytes.
 */
221 int main(int argc, char **argv)
/*
 * NOTE(review): max = 32267 bounds the random sample amplitude below; it may
 * be a typo for 32767 - confirm.
 */
223 int size = 2048, n = 4, max = 32267;
226 unsigned long long begin, end, diff, diffS, diff0, diff1, diff1_mmx, diff2;
227 double cpu_clock = detect_cpu_clock();
228 s16 *dst = malloc(sizeof(*dst) * size);
229 s32 *sum = calloc(size, sizeof(*sum));
230 s16 **srcs = malloc(sizeof(*srcs) * n);
/*
 * Report the measured clock; 10e5 equals 1e6, so the value is printed in
 * millions of ticks per second. The condition selecting the UP or SMP
 * message is not visible here.
 */
234 printf("CPU clock: %fMhz (UP)\n\n", cpu_clock / 10e5);
236 printf("CPU clock: %fMhz (SMP)\n\n", cpu_clock / 10e5);
/* Command-line overrides. */
239 size = atoi(argv[1]);
244 cache_size = atoi(argv[4]) * 1024;
/* Allocate each source buffer and fill it with values in [-max, max - 1]. */
245 for (i = 0; i < n; i++) {
248 srcs[i] = s = malloc(sizeof(s16) * size);
249 for (k = 0; k < size; ++k, ++s) {
250 *s = (rand() % (max * 2)) - max;
/*
 * Each measurement loop below starts its diff* variable at -1, which is the
 * largest value of the unsigned type.
 * NOTE(review): presumably so the first measured time always replaces it;
 * only the diff1_mmx comparison is visible here.
 */
/* mix_areas_srv + saturate; strides are passed in bytes (2 and 4). */
254 for (t = 0, diffS = -1; t < LOOP; t++) {
255 init(dst, sum, size);
257 for (i = 0; i < n; i++) {
258 mix_areas_srv(size, srcs[i], sum, 2, 4);
260 saturate(size, dst, sum, 2, 4);
/*
 * NOTE(review): diff and the diff* variables are unsigned long long but are
 * printed with %lld throughout; %llu would match the type.
 */
265 printf("mix_areas_srv : %lld \r", diff); fflush(stdout);
/* mix_areas0 */
268 for (t = 0, diff0 = -1; t < LOOP; t++) {
269 init(dst, sum, size);
271 for (i = 0; i < n; i++) {
272 mix_areas0(size, dst, srcs[i], sum, 2, 2, 4);
278 printf("mix_areas0 : %lld \r", diff); fflush(stdout);
/* mix_areas1 */
281 for (t = 0, diff1 = -1; t < LOOP; t++) {
282 init(dst, sum, size);
284 for (i = 0; i < n; i++) {
285 mix_areas1(size, dst, srcs[i], sum, 2, 2, 4);
291 printf("mix_areas1 : %lld \r", diff); fflush(stdout);
/* mix_areas1_mmx */
294 for (t = 0, diff1_mmx = -1; t < LOOP; t++) {
295 init(dst, sum, size);
297 for (i = 0; i < n; i++) {
298 mix_areas1_mmx(size, dst, srcs[i], sum, 2, 2, 4);
/* Taken when this run is faster than the best mix_areas1_mmx time so far. */
302 if (diff < diff1_mmx)
304 printf("mix_areas1_mmx: %lld \r", diff); fflush(stdout);
/* mix_areas2 (no sum_step argument) */
307 for (t = 0, diff2 = -1; t < LOOP; t++) {
308 init(dst, sum, size);
310 for (i = 0; i < n; i++) {
311 mix_areas2(size, dst, srcs[i], sum, 2, 2);
317 printf("mix_areas2 : %lld \r", diff); fflush(stdout);
/*
 * Summary: best tick count per routine plus a percentage computed as
 * 100 * 2 * 44100 * ticks / (size * n * cpu_clock).
 * NOTE(review): looks like the estimated CPU share for 2 channels at
 * 44100 Hz - confirm.
 */
321 printf("Summary (the best times):\n");
322 printf("mix_areas_srv : %lld %f%%\n", diffS, 100*2*44100.0*diffS/(size*n*cpu_clock));
323 printf("mix_areas0 : %lld %f%%\n", diff0, 100*2*44100.0*diff0/(size*n*cpu_clock));
324 printf("mix_areas1 : %lld %f%%\n", diff1, 100*2*44100.0*diff1/(size*n*cpu_clock));
325 printf("mix_areas1_mmx: %lld %f%%\n", diff1_mmx, 100*2*44100.0*diff1_mmx/(size*n*cpu_clock));
326 printf("mix_areas2 : %lld %f%%\n", diff2, 100*2*44100.0*diff2/(size*n*cpu_clock));
/* Ratios of the mix_areas1 variants' best times to the mix_areas_srv best time. */
329 printf("areas1/srv ratio : %f\n", (double)diff1 / diffS);
330 printf("areas1_mmx/srv ratio : %f\n", (double)diff1_mmx / diffS);