Line data Source code
1 : /* Copyright (c) 2014, Cisco Systems, INC
2 : Written by XiangMingZhu WeiZhou MinPeng YanWang
3 :
4 : Redistribution and use in source and binary forms, with or without
5 : modification, are permitted provided that the following conditions
6 : are met:
7 :
8 : - Redistributions of source code must retain the above copyright
9 : notice, this list of conditions and the following disclaimer.
10 :
11 : - Redistributions in binary form must reproduce the above copyright
12 : notice, this list of conditions and the following disclaimer in the
13 : documentation and/or other materials provided with the distribution.
14 :
15 : THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 : ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 : LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 : A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19 : OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 : EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 : PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 : PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 : LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 : NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 : SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 : */
27 :
28 : #ifdef HAVE_CONFIG_H
29 : #include "config.h"
30 : #endif
31 :
32 : #include "macros.h"
33 : #include "celt_lpc.h"
34 : #include "stack_alloc.h"
35 : #include "mathops.h"
36 : #include "pitch.h"
37 :
38 : #if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
39 :
40 : #include <xmmintrin.h>
41 : #include "arch.h"
42 :
/* Compute four cross-correlation taps at once:
     sum[k] += x[j]*y[j+k]  for k = 0..3, over j = 0..len-1.
   sum[] is read on entry, accumulated into, and stored back on exit. */
void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
{
   int j;
   __m128 xsum1, xsum2;
   /* Two partial accumulators, combined at the end, to shorten the
      add dependency chain inside the main loop. */
   xsum1 = _mm_loadu_ps(sum);
   xsum2 = _mm_setzero_ps();

   /* Main loop: consume 4 input samples per iteration. */
   for (j = 0; j < len-3; j += 4)
   {
      __m128 x0 = _mm_loadu_ps(x+j);   /* x[j..j+3] */
      __m128 yj = _mm_loadu_ps(y+j);   /* y[j..j+3] */
      __m128 y3 = _mm_loadu_ps(y+j+3); /* y[j+3..j+6] */

      /* Broadcast each x lane (masks 0x00/0x55/0xaa/0xff splat lanes
         0..3) and multiply by the matching 4-wide window of y:
         mask 0x49 builds {y[j+1]..y[j+4]} from (yj,y3);
         mask 0x9e builds {y[j+2]..y[j+5]}. */
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),
                                          _mm_shuffle_ps(yj,y3,0x49)));
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),
                                          _mm_shuffle_ps(yj,y3,0x9e)));
      xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3));
   }
   /* Tail: up to 3 remaining samples, one broadcast x scalar at a time,
      alternating accumulators as in the main loop. */
   if (j < len)
   {
      xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
      if (++j < len)
      {
         xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
         if (++j < len)
         {
            xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
         }
      }
   }
   _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
}
77 :
78 :
79 0 : void dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
80 : int N, opus_val32 *xy1, opus_val32 *xy2)
81 : {
82 : int i;
83 : __m128 xsum1, xsum2;
84 0 : xsum1 = _mm_setzero_ps();
85 0 : xsum2 = _mm_setzero_ps();
86 0 : for (i=0;i<N-3;i+=4)
87 : {
88 0 : __m128 xi = _mm_loadu_ps(x+i);
89 0 : __m128 y1i = _mm_loadu_ps(y01+i);
90 0 : __m128 y2i = _mm_loadu_ps(y02+i);
91 0 : xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
92 0 : xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
93 : }
94 : /* Horizontal sum */
95 0 : xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
96 0 : xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
97 : _mm_store_ss(xy1, xsum1);
98 0 : xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
99 0 : xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
100 : _mm_store_ss(xy2, xsum2);
101 0 : for (;i<N;i++)
102 : {
103 0 : *xy1 = MAC16_16(*xy1, x[i], y01[i]);
104 0 : *xy2 = MAC16_16(*xy2, x[i], y02[i]);
105 : }
106 0 : }
107 :
108 0 : opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y,
109 : int N)
110 : {
111 : int i;
112 : float xy;
113 : __m128 sum;
114 0 : sum = _mm_setzero_ps();
115 : /* FIXME: We should probably go 8-way and use 2 sums. */
116 0 : for (i=0;i<N-3;i+=4)
117 : {
118 0 : __m128 xi = _mm_loadu_ps(x+i);
119 0 : __m128 yi = _mm_loadu_ps(y+i);
120 0 : sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi));
121 : }
122 : /* Horizontal sum */
123 0 : sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
124 0 : sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
125 : _mm_store_ss(&xy, sum);
126 0 : for (;i<N;i++)
127 : {
128 0 : xy = MAC16_16(xy, x[i], y[i]);
129 : }
130 0 : return xy;
131 : }
132 :
/* Comb filter with constant (per-call) gains:
     y[i] = x[i] + g10*x[i-T] + g11*(x[i-T+1]+x[i-T-1]) + g12*(x[i-T+2]+x[i-T-2])
   computed four outputs per iteration with SSE. Reads up to x[-T-2]
   before the start of x. */
void comb_filter_const_sse(opus_val32 *y, opus_val32 *x, int T, int N,
      opus_val16 g10, opus_val16 g11, opus_val16 g12)
{
   int i;
   __m128 x0v;
   __m128 g10v, g11v, g12v;
   g10v = _mm_load1_ps(&g10);
   g11v = _mm_load1_ps(&g11);
   g12v = _mm_load1_ps(&g12);
   /* x0v caches the delayed window x[i-T-2 .. i-T+1]; it is carried across
      iterations so only one new load of the delayed signal is needed per
      loop trip. */
   x0v = _mm_loadu_ps(&x[-T-2]);
   for (i=0;i<N-3;i+=4)
   {
      __m128 yi, yi2, x1v, x2v, x3v, x4v;
      const opus_val32 *xp = &x[i-T-2];
      yi = _mm_loadu_ps(x+i);
      x4v = _mm_loadu_ps(xp+4);
#if 0
      /* Slower version with all loads */
      x1v = _mm_loadu_ps(xp+1);
      x2v = _mm_loadu_ps(xp+2);
      x3v = _mm_loadu_ps(xp+3);
#else
      /* Build the three overlapping windows xp[1..4], xp[2..5], xp[3..6]
         from x0v (= xp[0..3]) and x4v (= xp[4..7]) with shuffles instead
         of three unaligned loads: mask 0x4e picks {a2,a3,b0,b1}, mask
         0x99 picks {a1,a2,b1,b2}. */
      x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
      x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
      x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
#endif

      /* Center tap: g10 * x[i-T .. i-T+3]. */
      yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));
#if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
      yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
      yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
#else
      /* Use partial sums: groups the g11 and g12 terms before adding to
         yi, which changes float rounding slightly vs. the C version. */
      yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
                       _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
      yi = _mm_add_ps(yi, yi2);
#endif
      /* Slide the cached window forward by 4 samples. */
      x0v=x4v;
      _mm_storeu_ps(y+i, yi);
   }
#ifdef CUSTOM_MODES
   /* Scalar tail for the final N%4 samples. */
   for (;i<N;i++)
   {
      y[i] = x[i]
         + MULT16_32_Q15(g10,x[i-T])
         + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
         + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
   }
#endif
   /* NOTE(review): without CUSTOM_MODES the tail loop above is compiled
      out, so any trailing N%4 samples are never written — presumably
      callers always pass N as a multiple of 4; confirm at call sites. */
}
183 :
184 :
185 : #endif
|