OSDN Git Service

[flac] Update FLAC to 1.4.1
[timidity41/timidity41.git] / FLAC / src / deduplication / lpc_compute_autocorrelation_intrin_sse2.c
1 /* This code is imported several times in lpc_intrin_sse2.c with different
2  * values for MAX_LAG. Comments are for MAX_LAG == 14
   *
   * This is a function-body fragment, not a complete function: it uses
   * `data`, `data_len`, `lag` and `autoc` without declaring them, so the
   * including function has to provide them, and MAX_LAG has to be defined
   * before each inclusion.
   *
   * Result: autoc[k] receives the sum over all i of data[i] * data[i+k],
   * where samples past the end of data count as zero. The number of values
   * computed and stored is fixed at compile time by MAX_LAG (8 values, 10
   * when MAX_LAG > 8, 14 when MAX_LAG > 10); it does not depend on the
   * run-time value of lag. */
          /* i is signed so that the `i >= 0` loop condition below can become
           * false. NOTE(review): the declared type of data_len is not visible
           * in this fragment -- confirm it always fits in an int. */
3         int i;
          /* sumn holds two running sums (lags 2n and 2n+1); dn holds the two
           * matching entries of the sample queue. */
4         __m128d sum0, sum1, sum2, sum3;
5         __m128d d0, d1, d2, d3;
6 #if MAX_LAG > 8
7         __m128d d4;
8         __m128d sum4;
9 #endif
10 #if MAX_LAG > 10
11         __m128d d5, d6;
12         __m128d sum5, sum6;
13 #endif
14
           /* lag is only referenced by the assertion below; the cast keeps it
            * referenced in any case (presumably to avoid an unused-variable
            * warning when FLAC__ASSERT expands to nothing -- confirm against
            * the FLAC__ASSERT definition). */
15         (void) lag;
16         FLAC__ASSERT(lag <= MAX_LAG);
17
18         /* Initialize all sum vectors and all queue vectors with zero. The
            * zeroed queue is what makes samples past the end of data count
            * as zero during the first loop iterations. */
19         sum0 = _mm_setzero_pd();
20         sum1 = _mm_setzero_pd();
21         sum2 = _mm_setzero_pd();
22         sum3 = _mm_setzero_pd();
23         d0 = _mm_setzero_pd();
24         d1 = _mm_setzero_pd();
25         d2 = _mm_setzero_pd();
26         d3 = _mm_setzero_pd();
27 #if MAX_LAG > 8
28         sum4 = _mm_setzero_pd();
29         d4 = _mm_setzero_pd();
30 #endif
31 #if MAX_LAG > 10
32         sum5 = _mm_setzero_pd();
33         sum6 = _mm_setzero_pd();
34         d5 = _mm_setzero_pd();
35         d6 = _mm_setzero_pd();
36 #endif
37
38         /* Loop backwards through the samples, from data[data_len-1] down to data[0] */
39         for(i = data_len-1; i >= 0; i--) {
                   /* Broadcast the current sample into both lanes */
40                 __m128d d = _mm_set1_pd(data[i]);
41
42                 /* The next lines of code work like a queue. For more
43                  * information see the lag8 version of this function.
                    *
                    * With the immediate _MM_SHUFFLE(0,0,0,1) (== 1),
                    * _mm_shuffle_pd(a, b, ...) returns { low = high lane of a,
                    * high = low lane of b }. Each dn therefore moves its own
                    * low lane to its high lane and takes the high lane of the
                    * previous vector (or the new sample, for d0) as its new
                    * low lane. The updates run from the highest-numbered
                    * vector down because each one reads the not-yet-updated
                    * value of the vector below it; do not reorder them.
                    * Afterwards lane k of the queue holds data[i+k], or zero
                    * if that is past the end of data. */
44 #if MAX_LAG > 10
45                 d6 = _mm_shuffle_pd(d5, d6, _MM_SHUFFLE(0,0,0,1));
46                 d5 = _mm_shuffle_pd(d4, d5, _MM_SHUFFLE(0,0,0,1));
47 #endif
48 #if MAX_LAG > 8
49                 d4 = _mm_shuffle_pd(d3, d4, _MM_SHUFFLE(0,0,0,1));
50 #endif
51                 d3 = _mm_shuffle_pd(d2, d3, _MM_SHUFFLE(0,0,0,1));
52                 d2 = _mm_shuffle_pd(d1, d2, _MM_SHUFFLE(0,0,0,1));
53                 d1 = _mm_shuffle_pd(d0, d1, _MM_SHUFFLE(0,0,0,1));
54                 d0 = _mm_shuffle_pd(d,  d0, _MM_SHUFFLE(0,0,0,1));
55
56                 /* sumn += d*dn, i.e. lane k accumulates data[i]*data[i+k] */
57                 sum0 = _mm_add_pd(sum0, _mm_mul_pd(d, d0));
58                 sum1 = _mm_add_pd(sum1, _mm_mul_pd(d, d1));
59                 sum2 = _mm_add_pd(sum2, _mm_mul_pd(d, d2));
60                 sum3 = _mm_add_pd(sum3, _mm_mul_pd(d, d3));
61 #if MAX_LAG > 8
62                 sum4 = _mm_add_pd(sum4, _mm_mul_pd(d, d4));
63 #endif
64 #if MAX_LAG > 10
65                 sum5 = _mm_add_pd(sum5, _mm_mul_pd(d, d5));
66                 sum6 = _mm_add_pd(sum6, _mm_mul_pd(d, d6));
67 #endif
68         }
69
70         /* Store sum0..sum6 in autoc[0..13], using unaligned stores. All of
            * these elements are written whatever the run-time value of lag
            * is, so autoc needs room for them even when lag is smaller. */
71         _mm_storeu_pd(autoc,   sum0);
72         _mm_storeu_pd(autoc+2, sum1);
73         _mm_storeu_pd(autoc+4, sum2);
74         _mm_storeu_pd(autoc+6 ,sum3);
75 #if MAX_LAG > 8
76         _mm_storeu_pd(autoc+8, sum4);
77 #endif
78 #if MAX_LAG > 10
79         _mm_storeu_pd(autoc+10,sum5);
80         _mm_storeu_pd(autoc+12,sum6);
81 #endif