Line data Source code
1 : /*
2 : * Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
3 : *
4 : * Use of this source code is governed by a BSD-style license
5 : * that can be found in the LICENSE file in the root of the source
6 : * tree. An additional intellectual property rights grant can be found
7 : * in the file PATENTS. All contributing project authors may
8 : * be found in the AUTHORS file in the root of the source tree.
9 : */
10 :
11 : #include "webrtc/modules/audio_processing/utility/ooura_fft.h"
12 :
13 : #include <emmintrin.h>
14 :
15 : #include "webrtc/modules/audio_processing/utility/ooura_fft_tables_common.h"
16 : #include "webrtc/modules/audio_processing/utility/ooura_fft_tables_neon_sse2.h"
17 :
18 : namespace webrtc {
19 :
20 : #if defined(WEBRTC_ARCH_X86_FAMILY)
21 :
namespace {
// These intrinsics were unavailable before VS 2008.
// TODO(andrew): move to a common file.
#if defined(_MSC_VER) && _MSC_VER < 1500
// Bit-level reinterpretation of an integer vector as a float vector.
// No runtime work is implied; this only exists so old MSVC accepts the
// cast intrinsics used below.
static __inline __m128 _mm_castsi128_ps(__m128i a) {
  return *(__m128*)&a;
}
// Bit-level reinterpretation of a float vector as an integer vector.
static __inline __m128i _mm_castps_si128(__m128 a) {
  return *(__m128i*)&a;
}
#endif

}  // namespace
35 :
// First butterfly stage of the 128-float (64 interleaved complex values)
// Ooura FFT, SSE2 version. Each loop iteration consumes 16 consecutive
// floats of 'a', regroups them into complex pairs, performs radix-4
// sums/differences, applies twiddle factors from the rdft_wk{1,2,3}{r,i}
// tables (indexed by k2), and stores the results back in place.
void cft1st_128_SSE2(float* a) {
  // Per-lane sign pattern (contents defined in the shared tables header);
  // multiplying a re/im-swapped vector by it implements the +/-i rotation
  // used in the butterflies below.
  const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
  int j, k2;

  for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
    // Load 16 floats and shuffle them so that a01v/a23v/a45v/a67v each hold
    // two complex (re, im) pairs drawn from the four source vectors.
    __m128 a00v = _mm_loadu_ps(&a[j + 0]);
    __m128 a04v = _mm_loadu_ps(&a[j + 4]);
    __m128 a08v = _mm_loadu_ps(&a[j + 8]);
    __m128 a12v = _mm_loadu_ps(&a[j + 12]);
    __m128 a01v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1, 0));
    __m128 a23v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3, 2));
    __m128 a45v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(1, 0, 1, 0));
    __m128 a67v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(3, 2, 3, 2));

    // Twiddle factors for this group of butterflies.
    const __m128 wk1rv = _mm_load_ps(&rdft_wk1r[k2]);
    const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2]);
    const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2]);
    const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2]);
    const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2]);
    const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2]);
    // Radix-4 butterfly core: pairwise sums and differences.
    __m128 x0v = _mm_add_ps(a01v, a23v);
    const __m128 x1v = _mm_sub_ps(a01v, a23v);
    const __m128 x2v = _mm_add_ps(a45v, a67v);
    const __m128 x3v = _mm_sub_ps(a45v, a67v);
    __m128 x0w;
    a01v = _mm_add_ps(x0v, x2v);
    x0v = _mm_sub_ps(x0v, x2v);
    // x0w: x0v with re/im swapped within each complex pair, feeding the
    // twiddle multiply below.
    x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
    {
      // a45 = wk2r * x0 + wk2i * swap(x0); the table layout makes this the
      // complex product with the wk2 twiddle.
      const __m128 a45_0v = _mm_mul_ps(wk2rv, x0v);
      const __m128 a45_1v = _mm_mul_ps(wk2iv, x0w);
      a45v = _mm_add_ps(a45_0v, a45_1v);
    }
    {
      __m128 a23_0v, a23_1v;
      // x3s: x3 with re/im swapped then sign-adjusted lane-by-lane, i.e.
      // x3 rotated by +/-i per the k_swap_sign pattern.
      const __m128 x3w = _mm_shuffle_ps(x3v, x3v, _MM_SHUFFLE(2, 3, 0, 1));
      const __m128 x3s = _mm_mul_ps(mm_swap_sign, x3w);
      // a23 = wk1r * (x1 + x3s) + wk1i * swap(x1 + x3s).
      x0v = _mm_add_ps(x1v, x3s);
      x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
      a23_0v = _mm_mul_ps(wk1rv, x0v);
      a23_1v = _mm_mul_ps(wk1iv, x0w);
      a23v = _mm_add_ps(a23_0v, a23_1v);

      // Reuse x0v/x0w for the (x1 - x3s) term consumed by the wk3 multiply
      // in the next scope — do not reorder past that block.
      x0v = _mm_sub_ps(x1v, x3s);
      x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
    }
    {
      // a67 = wk3r * (x1 - x3s) + wk3i * swap(x1 - x3s).
      const __m128 a67_0v = _mm_mul_ps(wk3rv, x0v);
      const __m128 a67_1v = _mm_mul_ps(wk3iv, x0w);
      a67v = _mm_add_ps(a67_0v, a67_1v);
    }

    // Shuffle the results back into the original interleaved memory layout
    // and store all 16 floats.
    a00v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(1, 0, 1, 0));
    a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1, 0));
    a08v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3, 2));
    a12v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3, 2));
    _mm_storeu_ps(&a[j + 0], a00v);
    _mm_storeu_ps(&a[j + 4], a04v);
    _mm_storeu_ps(&a[j + 8], a08v);
    _mm_storeu_ps(&a[j + 12], a12v);
  }
}
98 :
// Middle butterfly stage of the 128-float Ooura FFT, SSE2 version, for a
// butterfly span of l = 8 floats. The first loop handles the group starting
// at offset 0, which needs only the wk1r factor from cftmdl_wk1r; the second
// handles the group starting at offset k = 64 using the full
// rdft_wk{1,2,3}{r,i} twiddle tables at index k2 = 4.
void cftmdl_128_SSE2(float* a) {
  const int l = 8;
  // Per-lane sign pattern used to rotate re/im-swapped vectors by +/-i.
  const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
  int j0;

  __m128 wk1rv = _mm_load_ps(cftmdl_wk1r);
  for (j0 = 0; j0 < l; j0 += 2) {
    // Each 64-bit load grabs one complex (re, im) pair; pairs 32 floats
    // apart are packed side by side so two butterflies run per iteration.
    const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
    const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
    const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
    const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
    const __m128 a_00_32 =
        _mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32),
                       _MM_SHUFFLE(1, 0, 1, 0));
    const __m128 a_08_40 =
        _mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40),
                       _MM_SHUFFLE(1, 0, 1, 0));
    // Radix-4 sums/differences; variable names encode the packed lanes
    // (e.g. x0r0_0i0_0r1_x0i1 = re0, im0, re1, im1 of x0).
    __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
    const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);

    const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
    const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
    const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
    const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
    const __m128 a_16_48 =
        _mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48),
                       _MM_SHUFFLE(1, 0, 1, 0));
    const __m128 a_24_56 =
        _mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56),
                       _MM_SHUFFLE(1, 0, 1, 0));
    const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
    const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);

    const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
    const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);

    // x3 with re/im swapped, then sign-adjusted: the +/-i rotation term.
    const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32(
        _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
    const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
    const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
    const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);

    // Scale the second-lane results by wk1r (the only non-trivial twiddle
    // needed in this first group).
    const __m128 yy0 =
        _mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(2, 2, 2, 2));
    const __m128 yy1 =
        _mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(3, 3, 3, 3));
    const __m128 yy2 = _mm_mul_ps(mm_swap_sign, yy1);
    const __m128 yy3 = _mm_add_ps(yy0, yy2);
    const __m128 yy4 = _mm_mul_ps(wk1rv, yy3);

    // Scatter the packed results back as 64-bit (one complex pair) stores.
    _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx0));
    _mm_storel_epi64(
        (__m128i*)&a[j0 + 32],
        _mm_shuffle_epi32(_mm_castps_si128(xx0), _MM_SHUFFLE(3, 2, 3, 2)));

    _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx1));
    _mm_storel_epi64(
        (__m128i*)&a[j0 + 48],
        _mm_shuffle_epi32(_mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 2, 3)));
    // Scalar sign fix-up of the real part at j0 + 48 after the swapped store
    // above (the vector path cannot negate just this one lane cheaply).
    a[j0 + 48] = -a[j0 + 48];

    _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(x1_x3_add));
    _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(x1_x3_sub));

    _mm_storel_epi64((__m128i*)&a[j0 + 40], _mm_castps_si128(yy4));
    _mm_storel_epi64(
        (__m128i*)&a[j0 + 56],
        _mm_shuffle_epi32(_mm_castps_si128(yy4), _MM_SHUFFLE(2, 3, 2, 3)));
  }

  {
    // Second group: butterflies at offset k = 64 with full twiddles taken
    // from the rdft tables at k2 = 2 * k1 = 4.
    int k = 64;
    int k1 = 2;
    int k2 = 2 * k1;
    const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2 + 0]);
    const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2 + 0]);
    const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2 + 0]);
    const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2 + 0]);
    const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2 + 0]);
    wk1rv = _mm_load_ps(&rdft_wk1r[k2 + 0]);
    for (j0 = k; j0 < l + k; j0 += 2) {
      // Same gather pattern as the first loop: one complex pair per load,
      // pairs 32 floats apart packed together.
      const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
      const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
      const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
      const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
      const __m128 a_00_32 =
          _mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32),
                         _MM_SHUFFLE(1, 0, 1, 0));
      const __m128 a_08_40 =
          _mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40),
                         _MM_SHUFFLE(1, 0, 1, 0));
      __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
      const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);

      const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
      const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
      const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
      const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
      const __m128 a_16_48 =
          _mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48),
                         _MM_SHUFFLE(1, 0, 1, 0));
      const __m128 a_24_56 =
          _mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56),
                         _MM_SHUFFLE(1, 0, 1, 0));
      const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
      const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);

      const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
      const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
      // xx4 = wk2r * xx1 + wk2i * swap(xx1): the wk2 twiddle multiply.
      const __m128 xx2 = _mm_mul_ps(xx1, wk2rv);
      const __m128 xx3 = _mm_mul_ps(
          wk2iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xx1),
                                                    _MM_SHUFFLE(2, 3, 0, 1))));
      const __m128 xx4 = _mm_add_ps(xx2, xx3);

      // x3 re/im-swapped and sign-adjusted (+/-i rotation).
      const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32(
          _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
      const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
      const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
      const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);

      // xx12 = wk1r * (x1 + x3s) + wk1i * swap(x1 + x3s).
      const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv);
      const __m128 xx11 = _mm_mul_ps(
          wk1iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add),
                                                    _MM_SHUFFLE(2, 3, 0, 1))));
      const __m128 xx12 = _mm_add_ps(xx10, xx11);

      // xx22 = wk3r * (x1 - x3s) + wk3i * swap(x1 - x3s).
      const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv);
      const __m128 xx21 = _mm_mul_ps(
          wk3iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub),
                                                    _MM_SHUFFLE(2, 3, 0, 1))));
      const __m128 xx22 = _mm_add_ps(xx20, xx21);

      // Scatter results back as 64-bit complex-pair stores.
      _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx));
      _mm_storel_epi64(
          (__m128i*)&a[j0 + 32],
          _mm_shuffle_epi32(_mm_castps_si128(xx), _MM_SHUFFLE(3, 2, 3, 2)));

      _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4));
      _mm_storel_epi64(
          (__m128i*)&a[j0 + 48],
          _mm_shuffle_epi32(_mm_castps_si128(xx4), _MM_SHUFFLE(3, 2, 3, 2)));

      _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12));
      _mm_storel_epi64(
          (__m128i*)&a[j0 + 40],
          _mm_shuffle_epi32(_mm_castps_si128(xx12), _MM_SHUFFLE(3, 2, 3, 2)));

      _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22));
      _mm_storel_epi64(
          (__m128i*)&a[j0 + 56],
          _mm_shuffle_epi32(_mm_castps_si128(xx22), _MM_SHUFFLE(3, 2, 3, 2)));
    }
  }
}
254 :
// Post-processing of the 128-point real forward FFT (SSE2): combines each
// spectrum element at index j2 with its mirror at 128 - j2 using weights
// wkr = 0.5 - c[32 - j1] and wki = c[j1] drawn from the rdft_w table.
// The vector loop processes four (j2, 128 - j2) pairs per iteration; the
// scalar loop below it finishes whatever remains before j2 reaches 64.
void rftfsub_128_SSE2(float* a) {
  const float* c = rdft_w + 32;
  int j1, j2, k1, k2;
  float wkr, wki, xr, xi, yr, yi;

  static const ALIGN16_BEG float ALIGN16_END k_half[4] = {0.5f, 0.5f, 0.5f,
                                                          0.5f};
  const __m128 mm_half = _mm_load_ps(k_half);

  // Vectorized code (four at once).
  // Note: commented number are indexes for the first iteration of the loop.
  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
    // Load 'wk'.
    const __m128 c_j1 = _mm_loadu_ps(&c[j1]);         //  1,  2,  3,  4,
    const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]);    // 28, 29, 30, 31,
    const __m128 wkrt = _mm_sub_ps(mm_half, c_k1);    // 28, 29, 30, 31,
    const __m128 wkr_ =
        _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3));  // 31, 30, 29, 28,
    const __m128 wki_ = c_j1;  //  1,  2,  3,  4,
    // Load and shuffle 'a' so p0 holds the even (real) and p1 the odd
    // (imaginary) elements of both the forward and mirrored runs.
    const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]);    //   2,   3,   4,   5,
    const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]);    //   6,   7,   8,   9,
    const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]);  // 120, 121, 122, 123,
    const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]);  // 124, 125, 126, 127,
    const __m128 a_j2_p0 = _mm_shuffle_ps(
        a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0));  //   2,   4,   6,   8,
    const __m128 a_j2_p1 = _mm_shuffle_ps(
        a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1));  //   3,   5,   7,   9,
    const __m128 a_k2_p0 = _mm_shuffle_ps(
        a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2));  // 126, 124, 122, 120,
    const __m128 a_k2_p1 = _mm_shuffle_ps(
        a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3));  // 127, 125, 123, 121,
    // Calculate 'x'.
    const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
    // 2-126, 4-124, 6-122, 8-120,
    const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
    // 3-127, 5-125, 7-123, 9-121,
    // Calculate product into 'y'.
    //    yr = wkr * xr - wki * xi;
    //    yi = wkr * xi + wki * xr;
    const __m128 a_ = _mm_mul_ps(wkr_, xr_);
    const __m128 b_ = _mm_mul_ps(wki_, xi_);
    const __m128 c_ = _mm_mul_ps(wkr_, xi_);
    const __m128 d_ = _mm_mul_ps(wki_, xr_);
    const __m128 yr_ = _mm_sub_ps(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
    const __m128 yi_ = _mm_add_ps(c_, d_);  // 3-127, 5-125, 7-123, 9-121,
    // Update 'a'.
    //    a[j2 + 0] -= yr;
    //    a[j2 + 1] -= yi;
    //    a[k2 + 0] += yr;
    //    a[k2 + 1] -= yi;
    const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_);  //   2,   4,   6,   8,
    const __m128 a_j2_p1n = _mm_sub_ps(a_j2_p1, yi_);  //   3,   5,   7,   9,
    const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_);  // 126, 124, 122, 120,
    const __m128 a_k2_p1n = _mm_sub_ps(a_k2_p1, yi_);  // 127, 125, 123, 121,
    // Shuffle in right order and store (mirrored half must be re-reversed
    // before the unaligned stores below).
    const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
    // 2, 3, 4, 5,
    const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
    // 6, 7, 8, 9,
    const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
    // 122, 123, 120, 121,
    const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
    // 126, 127, 124, 125,
    const __m128 a_k2_0n = _mm_shuffle_ps(
        a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2));  // 120, 121, 122, 123,
    const __m128 a_k2_4n = _mm_shuffle_ps(
        a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2));  // 124, 125, 126, 127,
    _mm_storeu_ps(&a[0 + j2], a_j2_0n);
    _mm_storeu_ps(&a[4 + j2], a_j2_4n);
    _mm_storeu_ps(&a[122 - j2], a_k2_0n);
    _mm_storeu_ps(&a[126 - j2], a_k2_4n);
  }
  // Scalar code for the remaining items (reference form of the math above).
  for (; j2 < 64; j1 += 1, j2 += 2) {
    k2 = 128 - j2;
    k1 = 32 - j1;
    wkr = 0.5f - c[k1];
    wki = c[j1];
    xr = a[j2 + 0] - a[k2 + 0];
    xi = a[j2 + 1] + a[k2 + 1];
    yr = wkr * xr - wki * xi;
    yi = wkr * xi + wki * xr;
    a[j2 + 0] -= yr;
    a[j2 + 1] -= yi;
    a[k2 + 0] += yr;
    a[k2 + 1] -= yi;
  }
}
344 :
// Post-processing of the 128-point real backward (inverse) FFT (SSE2).
// Mirror of rftfsub_128_SSE2 with the signs in the y product and in the
// imaginary-part updates flipped, plus the a[1] / a[65] negations that
// bracket the loop. Pairs element j2 with its mirror at 128 - j2 using
// weights from the rdft_w table.
void rftbsub_128_SSE2(float* a) {
  const float* c = rdft_w + 32;
  int j1, j2, k1, k2;
  float wkr, wki, xr, xi, yr, yi;

  static const ALIGN16_BEG float ALIGN16_END k_half[4] = {0.5f, 0.5f, 0.5f,
                                                          0.5f};
  const __m128 mm_half = _mm_load_ps(k_half);

  // Sign flip applied before processing (undone at a[65] below for the
  // other half).
  a[1] = -a[1];
  // Vectorized code (four at once).
  // Note: commented number are indexes for the first iteration of the loop.
  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
    // Load 'wk'.
    const __m128 c_j1 = _mm_loadu_ps(&c[j1]);         //  1,  2,  3,  4,
    const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]);    // 28, 29, 30, 31,
    const __m128 wkrt = _mm_sub_ps(mm_half, c_k1);    // 28, 29, 30, 31,
    const __m128 wkr_ =
        _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3));  // 31, 30, 29, 28,
    const __m128 wki_ = c_j1;  //  1,  2,  3,  4,
    // Load and shuffle 'a' so p0 holds the even (real) and p1 the odd
    // (imaginary) elements of both the forward and mirrored runs.
    const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]);    //   2,   3,   4,   5,
    const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]);    //   6,   7,   8,   9,
    const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]);  // 120, 121, 122, 123,
    const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]);  // 124, 125, 126, 127,
    const __m128 a_j2_p0 = _mm_shuffle_ps(
        a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0));  //   2,   4,   6,   8,
    const __m128 a_j2_p1 = _mm_shuffle_ps(
        a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1));  //   3,   5,   7,   9,
    const __m128 a_k2_p0 = _mm_shuffle_ps(
        a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2));  // 126, 124, 122, 120,
    const __m128 a_k2_p1 = _mm_shuffle_ps(
        a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3));  // 127, 125, 123, 121,
    // Calculate 'x'.
    const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
    // 2-126, 4-124, 6-122, 8-120,
    const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
    // 3-127, 5-125, 7-123, 9-121,
    // Calculate product into 'y' (note the signs differ from rftfsub).
    //    yr = wkr * xr + wki * xi;
    //    yi = wkr * xi - wki * xr;
    const __m128 a_ = _mm_mul_ps(wkr_, xr_);
    const __m128 b_ = _mm_mul_ps(wki_, xi_);
    const __m128 c_ = _mm_mul_ps(wkr_, xi_);
    const __m128 d_ = _mm_mul_ps(wki_, xr_);
    const __m128 yr_ = _mm_add_ps(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
    const __m128 yi_ = _mm_sub_ps(c_, d_);  // 3-127, 5-125, 7-123, 9-121,
    // Update 'a'.
    //    a[j2 + 0] = a[j2 + 0] - yr;
    //    a[j2 + 1] = yi - a[j2 + 1];
    //    a[k2 + 0] = yr + a[k2 + 0];
    //    a[k2 + 1] = yi - a[k2 + 1];
    const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_);  //   2,   4,   6,   8,
    const __m128 a_j2_p1n = _mm_sub_ps(yi_, a_j2_p1);  //   3,   5,   7,   9,
    const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_);  // 126, 124, 122, 120,
    const __m128 a_k2_p1n = _mm_sub_ps(yi_, a_k2_p1);  // 127, 125, 123, 121,
    // Shuffle in right order and store (mirrored half must be re-reversed
    // before the unaligned stores below).
    const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
    // 2, 3, 4, 5,
    const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
    // 6, 7, 8, 9,
    const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
    // 122, 123, 120, 121,
    const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
    // 126, 127, 124, 125,
    const __m128 a_k2_0n = _mm_shuffle_ps(
        a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2));  // 120, 121, 122, 123,
    const __m128 a_k2_4n = _mm_shuffle_ps(
        a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2));  // 124, 125, 126, 127,
    _mm_storeu_ps(&a[0 + j2], a_j2_0n);
    _mm_storeu_ps(&a[4 + j2], a_j2_4n);
    _mm_storeu_ps(&a[122 - j2], a_k2_0n);
    _mm_storeu_ps(&a[126 - j2], a_k2_4n);
  }
  // Scalar code for the remaining items (reference form of the math above).
  for (; j2 < 64; j1 += 1, j2 += 2) {
    k2 = 128 - j2;
    k1 = 32 - j1;
    wkr = 0.5f - c[k1];
    wki = c[j1];
    xr = a[j2 + 0] - a[k2 + 0];
    xi = a[j2 + 1] + a[k2 + 1];
    yr = wkr * xr + wki * xi;
    yi = wkr * xi - wki * xr;
    a[j2 + 0] = a[j2 + 0] - yr;
    a[j2 + 1] = yi - a[j2 + 1];
    a[k2 + 0] = yr + a[k2 + 0];
    a[k2 + 1] = yi - a[k2 + 1];
  }
  a[65] = -a[65];
}
436 : #endif
437 :
438 : } // namespace webrtc
|