Line data Source code
1 : ////////////////////////////////////////////////////////////////////////////////
2 : ///
3 : /// SSE optimized routines for Pentium-III, Athlon-XP and later CPUs. All SSE
4 : /// optimized functions have been gathered into this single source
5 : /// code file, regardless to their class or original source code file, in order
6 : /// to ease porting the library to other compiler and processor platforms.
7 : ///
8 : /// The SSE-optimizations are programmed using SSE compiler intrinsics that
9 : /// are supported both by Microsoft Visual C++ and GCC compilers, so this file
10 : /// should compile with both toolsets.
11 : ///
12 : /// NOTICE: If using Visual Studio 6.0, you'll need to install the "Visual C++
13 : /// 6.0 processor pack" update to support SSE instruction set. The update is
14 : /// available for download at Microsoft Developers Network, see here:
15 : /// http://msdn.microsoft.com/en-us/vstudio/aa718349.aspx
16 : ///
17 : /// If the above URL is expired or removed, go to "http://msdn.microsoft.com" and
18 : /// perform a search with keywords "processor pack".
19 : ///
20 : /// Author : Copyright (c) Olli Parviainen
21 : /// Author e-mail : oparviai 'at' iki.fi
22 : /// SoundTouch WWW: http://www.surina.net/soundtouch
23 : ///
24 : ////////////////////////////////////////////////////////////////////////////////
25 : //
26 : // Last changed : $Date: 2015-02-21 21:24:29 +0000 (Sat, 21 Feb 2015) $
27 : // File revision : $Revision: 4 $
28 : //
29 : // $Id: sse_optimized.cpp 202 2015-02-21 21:24:29Z oparviai $
30 : //
31 : ////////////////////////////////////////////////////////////////////////////////
32 : //
33 : // License :
34 : //
35 : // SoundTouch audio processing library
36 : // Copyright (c) Olli Parviainen
37 : //
38 : // This library is free software; you can redistribute it and/or
39 : // modify it under the terms of the GNU Lesser General Public
40 : // License as published by the Free Software Foundation; either
41 : // version 2.1 of the License, or (at your option) any later version.
42 : //
43 : // This library is distributed in the hope that it will be useful,
44 : // but WITHOUT ANY WARRANTY; without even the implied warranty of
45 : // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
46 : // Lesser General Public License for more details.
47 : //
48 : // You should have received a copy of the GNU Lesser General Public
49 : // License along with this library; if not, write to the Free Software
50 : // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
51 : //
52 : ////////////////////////////////////////////////////////////////////////////////
53 :
54 : #include "cpu_detect.h"
55 : #include "STTypes.h"
56 :
57 : using namespace soundtouch;
58 :
59 : #ifdef SOUNDTOUCH_ALLOW_SSE
60 :
61 : // SSE routines available only with float sample type
62 :
63 : //////////////////////////////////////////////////////////////////////////////
64 : //
65 : // implementation of SSE optimized functions of class 'TDStretchSSE'
66 : //
67 : //////////////////////////////////////////////////////////////////////////////
68 :
69 : #include "TDStretch.h"
70 : #include <xmmintrin.h>
71 : #include <math.h>
72 :
73 : // Calculates cross correlation of two buffers
74 0 : double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2, double &anorm) const
75 : {
76 : int i;
77 : const float *pVec1;
78 : const __m128 *pVec2;
79 : __m128 vSum, vNorm;
80 :
81 : // Note. It means a major slow-down if the routine needs to tolerate
82 : // unaligned __m128 memory accesses. It's way faster if we can skip
83 : // unaligned slots and use _mm_load_ps instruction instead of _mm_loadu_ps.
84 : // This can mean up to ~ 10-fold difference (incl. part of which is
85 : // due to skipping every second round for stereo sound though).
86 : //
87 : // Compile-time define SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION is provided
88 : // for choosing if this little cheating is allowed.
89 :
90 : #ifdef SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION
91 : // Little cheating allowed, return valid correlation only for
92 : // aligned locations, meaning every second round for stereo sound.
93 :
94 : #define _MM_LOAD _mm_load_ps
95 :
96 0 : if (((ulongptr)pV1) & 15) return -1e50; // skip unaligned locations
97 :
98 : #else
99 : // No cheating allowed, use unaligned load & take the resulting
100 : // performance hit.
101 : #define _MM_LOAD _mm_loadu_ps
102 : #endif
103 :
104 : // ensure overlapLength is divisible by 8
105 0 : assert((overlapLength % 8) == 0);
106 :
107 : // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors
108 : // Note: pV2 _must_ be aligned to 16-bit boundary, pV1 need not.
109 0 : pVec1 = (const float*)pV1;
110 0 : pVec2 = (const __m128*)pV2;
111 0 : vSum = vNorm = _mm_setzero_ps();
112 :
113 : // Unroll the loop by factor of 4 * 4 operations. Use same routine for
114 : // stereo & mono, for mono it just means twice the amount of unrolling.
115 0 : for (i = 0; i < channels * overlapLength / 16; i ++)
116 : {
117 : __m128 vTemp;
118 : // vSum += pV1[0..3] * pV2[0..3]
119 0 : vTemp = _MM_LOAD(pVec1);
120 0 : vSum = _mm_add_ps(vSum, _mm_mul_ps(vTemp ,pVec2[0]));
121 0 : vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
122 :
123 : // vSum += pV1[4..7] * pV2[4..7]
124 0 : vTemp = _MM_LOAD(pVec1 + 4);
125 0 : vSum = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[1]));
126 0 : vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
127 :
128 : // vSum += pV1[8..11] * pV2[8..11]
129 0 : vTemp = _MM_LOAD(pVec1 + 8);
130 0 : vSum = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[2]));
131 0 : vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
132 :
133 : // vSum += pV1[12..15] * pV2[12..15]
134 0 : vTemp = _MM_LOAD(pVec1 + 12);
135 0 : vSum = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[3]));
136 0 : vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
137 :
138 0 : pVec1 += 16;
139 0 : pVec2 += 4;
140 : }
141 :
142 : // return value = vSum[0] + vSum[1] + vSum[2] + vSum[3]
143 0 : float *pvNorm = (float*)&vNorm;
144 0 : float norm = (pvNorm[0] + pvNorm[1] + pvNorm[2] + pvNorm[3]);
145 0 : anorm = norm;
146 :
147 0 : float *pvSum = (float*)&vSum;
148 0 : return (double)(pvSum[0] + pvSum[1] + pvSum[2] + pvSum[3]) / sqrt(norm < 1e-9 ? 1.0 : norm);
149 :
150 : /* This is approximately corresponding routine in C-language yet without normalization:
151 : double corr, norm;
152 : uint i;
153 :
154 : // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors
155 : corr = norm = 0.0;
156 : for (i = 0; i < channels * overlapLength / 16; i ++)
157 : {
158 : corr += pV1[0] * pV2[0] +
159 : pV1[1] * pV2[1] +
160 : pV1[2] * pV2[2] +
161 : pV1[3] * pV2[3] +
162 : pV1[4] * pV2[4] +
163 : pV1[5] * pV2[5] +
164 : pV1[6] * pV2[6] +
165 : pV1[7] * pV2[7] +
166 : pV1[8] * pV2[8] +
167 : pV1[9] * pV2[9] +
168 : pV1[10] * pV2[10] +
169 : pV1[11] * pV2[11] +
170 : pV1[12] * pV2[12] +
171 : pV1[13] * pV2[13] +
172 : pV1[14] * pV2[14] +
173 : pV1[15] * pV2[15];
174 :
175 : for (j = 0; j < 15; j ++) norm += pV1[j] * pV1[j];
176 :
177 : pV1 += 16;
178 : pV2 += 16;
179 : }
180 : return corr / sqrt(norm);
181 : */
182 : }
183 :
184 :
185 :
186 0 : double TDStretchSSE::calcCrossCorrAccumulate(const float *pV1, const float *pV2, double &norm) const
187 : {
188 : // call usual calcCrossCorr function because SSE does not show big benefit of
189 : // accumulating "norm" value, and also the "norm" rolling algorithm would get
190 : // complicated due to SSE-specific alignment-vs-nonexact correlation rules.
191 0 : return calcCrossCorr(pV1, pV2, norm);
192 : }
193 :
194 :
195 : //////////////////////////////////////////////////////////////////////////////
196 : //
197 : // implementation of SSE optimized functions of class 'FIRFilter'
198 : //
199 : //////////////////////////////////////////////////////////////////////////////
200 :
201 : #include "FIRFilter.h"
202 :
203 0 : FIRFilterSSE::FIRFilterSSE() : FIRFilter()
204 : {
205 0 : filterCoeffsAlign = NULL;
206 0 : filterCoeffsUnalign = NULL;
207 0 : }
208 :
209 :
210 0 : FIRFilterSSE::~FIRFilterSSE()
211 : {
212 0 : delete[] filterCoeffsUnalign;
213 0 : filterCoeffsAlign = NULL;
214 0 : filterCoeffsUnalign = NULL;
215 0 : }
216 :
217 :
218 : // (overloaded) Calculates filter coefficients for SSE routine
219 0 : void FIRFilterSSE::setCoefficients(const float *coeffs, uint newLength, uint uResultDivFactor)
220 : {
221 : uint i;
222 : float fDivider;
223 :
224 0 : FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
225 :
226 : // Scale the filter coefficients so that it won't be necessary to scale the filtering result
227 : // also rearrange coefficients suitably for SSE
228 : // Ensure that filter coeffs array is aligned to 16-byte boundary
229 0 : delete[] filterCoeffsUnalign;
230 0 : filterCoeffsUnalign = new float[2 * newLength + 4];
231 0 : filterCoeffsAlign = (float *)SOUNDTOUCH_ALIGN_POINTER_16(filterCoeffsUnalign);
232 :
233 0 : fDivider = (float)resultDivider;
234 :
235 : // rearrange the filter coefficients for mmx routines
236 0 : for (i = 0; i < newLength; i ++)
237 : {
238 0 : filterCoeffsAlign[2 * i + 0] =
239 0 : filterCoeffsAlign[2 * i + 1] = coeffs[i + 0] / fDivider;
240 : }
241 0 : }
242 :
243 :
244 :
245 : // SSE-optimized version of the filter routine for stereo sound
246 0 : uint FIRFilterSSE::evaluateFilterStereo(float *dest, const float *source, uint numSamples) const
247 : {
248 0 : int count = (int)((numSamples - length) & (uint)-2);
249 : int j;
250 :
251 0 : assert(count % 2 == 0);
252 :
253 0 : if (count < 2) return 0;
254 :
255 0 : assert(source != NULL);
256 0 : assert(dest != NULL);
257 0 : assert((length % 8) == 0);
258 0 : assert(filterCoeffsAlign != NULL);
259 0 : assert(((ulongptr)filterCoeffsAlign) % 16 == 0);
260 :
261 : // filter is evaluated for two stereo samples with each iteration, thus use of 'j += 2'
262 : #pragma omp parallel for
263 0 : for (j = 0; j < count; j += 2)
264 : {
265 : const float *pSrc;
266 : float *pDest;
267 : const __m128 *pFil;
268 : __m128 sum1, sum2;
269 : uint i;
270 :
271 0 : pSrc = (const float*)source + j * 2; // source audio data
272 0 : pDest = dest + j * 2; // destination audio data
273 0 : pFil = (const __m128*)filterCoeffsAlign; // filter coefficients. NOTE: Assumes coefficients
274 : // are aligned to 16-byte boundary
275 0 : sum1 = sum2 = _mm_setzero_ps();
276 :
277 0 : for (i = 0; i < length / 8; i ++)
278 : {
279 : // Unroll loop for efficiency & calculate filter for 2*2 stereo samples
280 : // at each pass
281 :
282 : // sum1 is accu for 2*2 filtered stereo sound data at the primary sound data offset
283 : // sum2 is accu for 2*2 filtered stereo sound data for the next sound sample offset.
284 :
285 0 : sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc) , pFil[0]));
286 0 : sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 2), pFil[0]));
287 :
288 0 : sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 4), pFil[1]));
289 0 : sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 6), pFil[1]));
290 :
291 0 : sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 8) , pFil[2]));
292 0 : sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 10), pFil[2]));
293 :
294 0 : sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 12), pFil[3]));
295 0 : sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 14), pFil[3]));
296 :
297 0 : pSrc += 16;
298 0 : pFil += 4;
299 : }
300 :
301 : // Now sum1 and sum2 both have a filtered 2-channel sample each, but we still need
302 : // to sum the two hi- and lo-floats of these registers together.
303 :
304 : // post-shuffle & add the filtered values and store to dest.
305 0 : _mm_storeu_ps(pDest, _mm_add_ps(
306 : _mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(1,0,3,2)), // s2_1 s2_0 s1_3 s1_2
307 : _mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(3,2,1,0)) // s2_3 s2_2 s1_1 s1_0
308 : ));
309 : }
310 :
311 : // Ideas for further improvement:
312 : // 1. If it could be guaranteed that 'source' were always aligned to 16-byte
313 : // boundary, a faster aligned '_mm_load_ps' instruction could be used.
314 : // 2. If it could be guaranteed that 'dest' were always aligned to 16-byte
315 : // boundary, a faster '_mm_store_ps' instruction could be used.
316 :
317 0 : return (uint)count;
318 :
319 : /* original routine in C-language. please notice the C-version has differently
320 : organized coefficients though.
321 : double suml1, suml2;
322 : double sumr1, sumr2;
323 : uint i, j;
324 :
325 : for (j = 0; j < count; j += 2)
326 : {
327 : const float *ptr;
328 : const float *pFil;
329 :
330 : suml1 = sumr1 = 0.0;
331 : suml2 = sumr2 = 0.0;
332 : ptr = src;
333 : pFil = filterCoeffs;
334 : for (i = 0; i < lengthLocal; i ++)
335 : {
336 : // unroll loop for efficiency.
337 :
338 : suml1 += ptr[0] * pFil[0] +
339 : ptr[2] * pFil[2] +
340 : ptr[4] * pFil[4] +
341 : ptr[6] * pFil[6];
342 :
343 : sumr1 += ptr[1] * pFil[1] +
344 : ptr[3] * pFil[3] +
345 : ptr[5] * pFil[5] +
346 : ptr[7] * pFil[7];
347 :
348 : suml2 += ptr[8] * pFil[0] +
349 : ptr[10] * pFil[2] +
350 : ptr[12] * pFil[4] +
351 : ptr[14] * pFil[6];
352 :
353 : sumr2 += ptr[9] * pFil[1] +
354 : ptr[11] * pFil[3] +
355 : ptr[13] * pFil[5] +
356 : ptr[15] * pFil[7];
357 :
358 : ptr += 16;
359 : pFil += 8;
360 : }
361 : dest[0] = (float)suml1;
362 : dest[1] = (float)sumr1;
363 : dest[2] = (float)suml2;
364 : dest[3] = (float)sumr2;
365 :
366 : src += 4;
367 : dest += 4;
368 : }
369 : */
370 : }
371 :
372 : #endif // SOUNDTOUCH_ALLOW_SSE
|