Line data Source code
1 : /* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 : /* This Source Code Form is subject to the terms of the Mozilla Public
3 : * License, v. 2.0. If a copy of the MPL was not distributed with this file,
4 : * You can obtain one at http://mozilla.org/MPL/2.0/. */
5 :
6 : #include "AudioNodeEngineSSE2.h"
7 : #include "AlignmentUtils.h"
8 : #include <emmintrin.h>
9 :
10 :
11 : namespace mozilla {
12 : void
13 0 : AudioBufferAddWithScale_SSE(const float* aInput,
14 : float aScale,
15 : float* aOutput,
16 : uint32_t aSize)
17 : {
18 : __m128 vin0, vin1, vin2, vin3,
19 : vscaled0, vscaled1, vscaled2, vscaled3,
20 : vout0, vout1, vout2, vout3,
21 : vgain;
22 :
23 0 : ASSERT_ALIGNED16(aInput);
24 0 : ASSERT_ALIGNED16(aOutput);
25 0 : ASSERT_MULTIPLE16(aSize);
26 :
27 0 : vgain = _mm_load1_ps(&aScale);
28 :
29 0 : for (unsigned i = 0; i < aSize; i+=16) {
30 0 : vin0 = _mm_load_ps(&aInput[i]);
31 0 : vin1 = _mm_load_ps(&aInput[i + 4]);
32 0 : vin2 = _mm_load_ps(&aInput[i + 8]);
33 0 : vin3 = _mm_load_ps(&aInput[i + 12]);
34 :
35 0 : vscaled0 = _mm_mul_ps(vin0, vgain);
36 0 : vscaled1 = _mm_mul_ps(vin1, vgain);
37 0 : vscaled2 = _mm_mul_ps(vin2, vgain);
38 0 : vscaled3 = _mm_mul_ps(vin3, vgain);
39 :
40 0 : vin0 = _mm_load_ps(&aOutput[i]);
41 0 : vin1 = _mm_load_ps(&aOutput[i + 4]);
42 0 : vin2 = _mm_load_ps(&aOutput[i + 8]);
43 0 : vin3 = _mm_load_ps(&aOutput[i + 12]);
44 :
45 0 : vout0 = _mm_add_ps(vin0, vscaled0);
46 0 : vout1 = _mm_add_ps(vin1, vscaled1);
47 0 : vout2 = _mm_add_ps(vin2, vscaled2);
48 0 : vout3 = _mm_add_ps(vin3, vscaled3);
49 :
50 0 : _mm_store_ps(&aOutput[i], vout0);
51 0 : _mm_store_ps(&aOutput[i + 4], vout1);
52 0 : _mm_store_ps(&aOutput[i + 8], vout2);
53 0 : _mm_store_ps(&aOutput[i + 12], vout3);
54 : }
55 0 : }
56 :
57 : void
58 0 : AudioBlockCopyChannelWithScale_SSE(const float* aInput,
59 : float aScale,
60 : float* aOutput)
61 : {
62 : __m128 vin0, vin1, vin2, vin3,
63 : vout0, vout1, vout2, vout3;
64 :
65 0 : ASSERT_ALIGNED16(aInput);
66 0 : ASSERT_ALIGNED16(aOutput);
67 :
68 0 : __m128 vgain = _mm_load1_ps(&aScale);
69 :
70 0 : for (unsigned i = 0 ; i < WEBAUDIO_BLOCK_SIZE; i+=16) {
71 0 : vin0 = _mm_load_ps(&aInput[i]);
72 0 : vin1 = _mm_load_ps(&aInput[i + 4]);
73 0 : vin2 = _mm_load_ps(&aInput[i + 8]);
74 0 : vin3 = _mm_load_ps(&aInput[i + 12]);
75 0 : vout0 = _mm_mul_ps(vin0, vgain);
76 0 : vout1 = _mm_mul_ps(vin1, vgain);
77 0 : vout2 = _mm_mul_ps(vin2, vgain);
78 0 : vout3 = _mm_mul_ps(vin3, vgain);
79 0 : _mm_store_ps(&aOutput[i], vout0);
80 0 : _mm_store_ps(&aOutput[i + 4], vout1);
81 0 : _mm_store_ps(&aOutput[i + 8], vout2);
82 0 : _mm_store_ps(&aOutput[i + 12], vout3);
83 : }
84 0 : }
85 :
86 : void
87 0 : AudioBlockCopyChannelWithScale_SSE(const float aInput[WEBAUDIO_BLOCK_SIZE],
88 : const float aScale[WEBAUDIO_BLOCK_SIZE],
89 : float aOutput[WEBAUDIO_BLOCK_SIZE])
90 : {
91 : __m128 vin0, vin1, vin2, vin3,
92 : vscaled0, vscaled1, vscaled2, vscaled3,
93 : vout0, vout1, vout2, vout3;
94 :
95 0 : ASSERT_ALIGNED16(aInput);
96 0 : ASSERT_ALIGNED16(aScale);
97 0 : ASSERT_ALIGNED16(aOutput);
98 :
99 0 : for (unsigned i = 0 ; i < WEBAUDIO_BLOCK_SIZE; i+=16) {
100 0 : vscaled0 = _mm_load_ps(&aScale[i]);
101 0 : vscaled1 = _mm_load_ps(&aScale[i+4]);
102 0 : vscaled2 = _mm_load_ps(&aScale[i+8]);
103 0 : vscaled3 = _mm_load_ps(&aScale[i+12]);
104 :
105 0 : vin0 = _mm_load_ps(&aInput[i]);
106 0 : vin1 = _mm_load_ps(&aInput[i + 4]);
107 0 : vin2 = _mm_load_ps(&aInput[i + 8]);
108 0 : vin3 = _mm_load_ps(&aInput[i + 12]);
109 :
110 0 : vout0 = _mm_mul_ps(vin0, vscaled0);
111 0 : vout1 = _mm_mul_ps(vin1, vscaled1);
112 0 : vout2 = _mm_mul_ps(vin2, vscaled2);
113 0 : vout3 = _mm_mul_ps(vin3, vscaled3);
114 :
115 0 : _mm_store_ps(&aOutput[i], vout0);
116 0 : _mm_store_ps(&aOutput[i + 4], vout1);
117 0 : _mm_store_ps(&aOutput[i + 8], vout2);
118 0 : _mm_store_ps(&aOutput[i + 12], vout3);
119 : }
120 0 : }
121 :
122 : void
123 0 : AudioBufferInPlaceScale_SSE(float* aBlock,
124 : float aScale,
125 : uint32_t aSize)
126 : {
127 : __m128 vout0, vout1, vout2, vout3,
128 : vin0, vin1, vin2, vin3;
129 :
130 0 : ASSERT_ALIGNED16(aBlock);
131 0 : ASSERT_MULTIPLE16(aSize);
132 :
133 0 : __m128 vgain = _mm_load1_ps(&aScale);
134 :
135 0 : for (unsigned i = 0; i < aSize; i+=16) {
136 0 : vin0 = _mm_load_ps(&aBlock[i]);
137 0 : vin1 = _mm_load_ps(&aBlock[i + 4]);
138 0 : vin2 = _mm_load_ps(&aBlock[i + 8]);
139 0 : vin3 = _mm_load_ps(&aBlock[i + 12]);
140 0 : vout0 = _mm_mul_ps(vin0, vgain);
141 0 : vout1 = _mm_mul_ps(vin1, vgain);
142 0 : vout2 = _mm_mul_ps(vin2, vgain);
143 0 : vout3 = _mm_mul_ps(vin3, vgain);
144 0 : _mm_store_ps(&aBlock[i], vout0);
145 0 : _mm_store_ps(&aBlock[i + 4], vout1);
146 0 : _mm_store_ps(&aBlock[i + 8], vout2);
147 0 : _mm_store_ps(&aBlock[i + 12], vout3);
148 : }
149 0 : }
150 :
151 : void
152 0 : AudioBlockPanStereoToStereo_SSE(const float aInputL[WEBAUDIO_BLOCK_SIZE],
153 : const float aInputR[WEBAUDIO_BLOCK_SIZE],
154 : float aGainL, float aGainR, bool aIsOnTheLeft,
155 : float aOutputL[WEBAUDIO_BLOCK_SIZE],
156 : float aOutputR[WEBAUDIO_BLOCK_SIZE])
157 : {
158 : __m128 vinl0, vinr0, vinl1, vinr1,
159 : vout0, vout1,
160 : vscaled0, vscaled1,
161 : vgainl, vgainr;
162 :
163 0 : ASSERT_ALIGNED16(aInputL);
164 0 : ASSERT_ALIGNED16(aInputR);
165 0 : ASSERT_ALIGNED16(aOutputL);
166 0 : ASSERT_ALIGNED16(aOutputR);
167 :
168 0 : vgainl = _mm_load1_ps(&aGainL);
169 0 : vgainr = _mm_load1_ps(&aGainR);
170 :
171 0 : if (aIsOnTheLeft) {
172 0 : for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE; i+=8) {
173 0 : vinl0 = _mm_load_ps(&aInputL[i]);
174 0 : vinr0 = _mm_load_ps(&aInputR[i]);
175 0 : vinl1 = _mm_load_ps(&aInputL[i+4]);
176 0 : vinr1 = _mm_load_ps(&aInputR[i+4]);
177 :
178 : /* left channel : aOutputL = aInputL + aInputR * gainL */
179 0 : vscaled0 = _mm_mul_ps(vinr0, vgainl);
180 0 : vscaled1 = _mm_mul_ps(vinr1, vgainl);
181 0 : vout0 = _mm_add_ps(vscaled0, vinl0);
182 0 : vout1 = _mm_add_ps(vscaled1, vinl1);
183 0 : _mm_store_ps(&aOutputL[i], vout0);
184 0 : _mm_store_ps(&aOutputL[i+4], vout1);
185 :
186 : /* right channel : aOutputR = aInputR * gainR */
187 0 : vscaled0 = _mm_mul_ps(vinr0, vgainr);
188 0 : vscaled1 = _mm_mul_ps(vinr1, vgainr);
189 0 : _mm_store_ps(&aOutputR[i], vscaled0);
190 0 : _mm_store_ps(&aOutputR[i+4], vscaled1);
191 : }
192 : } else {
193 0 : for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE; i+=8) {
194 0 : vinl0 = _mm_load_ps(&aInputL[i]);
195 0 : vinr0 = _mm_load_ps(&aInputR[i]);
196 0 : vinl1 = _mm_load_ps(&aInputL[i+4]);
197 0 : vinr1 = _mm_load_ps(&aInputR[i+4]);
198 :
199 : /* left channel : aInputL * gainL */
200 0 : vscaled0 = _mm_mul_ps(vinl0, vgainl);
201 0 : vscaled1 = _mm_mul_ps(vinl1, vgainl);
202 0 : _mm_store_ps(&aOutputL[i], vscaled0);
203 0 : _mm_store_ps(&aOutputL[i+4], vscaled1);
204 :
205 : /* right channel: aOutputR = aInputR + aInputL * gainR */
206 0 : vscaled0 = _mm_mul_ps(vinl0, vgainr);
207 0 : vscaled1 = _mm_mul_ps(vinl1, vgainr);
208 0 : vout0 = _mm_add_ps(vscaled0, vinr0);
209 0 : vout1 = _mm_add_ps(vscaled1, vinr1);
210 0 : _mm_store_ps(&aOutputR[i], vout0);
211 0 : _mm_store_ps(&aOutputR[i+4], vout1);
212 : }
213 : }
214 0 : }
215 :
216 0 : void BufferComplexMultiply_SSE(const float* aInput,
217 : const float* aScale,
218 : float* aOutput,
219 : uint32_t aSize)
220 : {
221 : unsigned i;
222 : __m128 in0, in1, in2, in3,
223 : outreal0, outreal1, outreal2, outreal3,
224 : outimag0, outimag1, outimag2, outimag3;
225 :
226 0 : ASSERT_ALIGNED16(aInput);
227 0 : ASSERT_ALIGNED16(aScale);
228 0 : ASSERT_ALIGNED16(aOutput);
229 0 : ASSERT_MULTIPLE16(aSize);
230 :
231 0 : for (i = 0; i < aSize * 2; i += 16) {
232 0 : in0 = _mm_load_ps(&aInput[i]);
233 0 : in1 = _mm_load_ps(&aInput[i + 4]);
234 0 : in2 = _mm_load_ps(&aInput[i + 8]);
235 0 : in3 = _mm_load_ps(&aInput[i + 12]);
236 :
237 0 : outreal0 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(2, 0, 2, 0));
238 0 : outimag0 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(3, 1, 3, 1));
239 0 : outreal2 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(2, 0, 2, 0));
240 0 : outimag2 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(3, 1, 3, 1));
241 :
242 0 : in0 = _mm_load_ps(&aScale[i]);
243 0 : in1 = _mm_load_ps(&aScale[i + 4]);
244 0 : in2 = _mm_load_ps(&aScale[i + 8]);
245 0 : in3 = _mm_load_ps(&aScale[i + 12]);
246 :
247 0 : outreal1 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(2, 0, 2, 0));
248 0 : outimag1 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(3, 1, 3, 1));
249 0 : outreal3 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(2, 0, 2, 0));
250 0 : outimag3 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(3, 1, 3, 1));
251 :
252 0 : in0 = _mm_sub_ps(_mm_mul_ps(outreal0, outreal1),
253 0 : _mm_mul_ps(outimag0, outimag1));
254 0 : in1 = _mm_add_ps(_mm_mul_ps(outreal0, outimag1),
255 0 : _mm_mul_ps(outimag0, outreal1));
256 0 : in2 = _mm_sub_ps(_mm_mul_ps(outreal2, outreal3),
257 0 : _mm_mul_ps(outimag2, outimag3));
258 0 : in3 = _mm_add_ps(_mm_mul_ps(outreal2, outimag3),
259 0 : _mm_mul_ps(outimag2, outreal3));
260 :
261 0 : outreal0 = _mm_unpacklo_ps(in0, in1);
262 0 : outreal1 = _mm_unpackhi_ps(in0, in1);
263 0 : outreal2 = _mm_unpacklo_ps(in2, in3);
264 0 : outreal3 = _mm_unpackhi_ps(in2, in3);
265 :
266 0 : _mm_store_ps(&aOutput[i], outreal0);
267 0 : _mm_store_ps(&aOutput[i + 4], outreal1);
268 0 : _mm_store_ps(&aOutput[i + 8], outreal2);
269 0 : _mm_store_ps(&aOutput[i + 12], outreal3);
270 : }
271 0 : }
272 :
273 : float
274 0 : AudioBufferSumOfSquares_SSE(const float* aInput, uint32_t aLength)
275 : {
276 : unsigned i;
277 : __m128 in0, in1, in2, in3,
278 : acc0, acc1, acc2, acc3;
279 : float out[4];
280 :
281 0 : ASSERT_ALIGNED16(aInput);
282 0 : ASSERT_MULTIPLE16(aLength);
283 :
284 0 : acc0 = _mm_setzero_ps();
285 0 : acc1 = _mm_setzero_ps();
286 0 : acc2 = _mm_setzero_ps();
287 0 : acc3 = _mm_setzero_ps();
288 :
289 0 : for (i = 0; i < aLength; i+=16) {
290 0 : in0 = _mm_load_ps(&aInput[i]);
291 0 : in1 = _mm_load_ps(&aInput[i + 4]);
292 0 : in2 = _mm_load_ps(&aInput[i + 8]);
293 0 : in3 = _mm_load_ps(&aInput[i + 12]);
294 :
295 0 : in0 = _mm_mul_ps(in0, in0);
296 0 : in1 = _mm_mul_ps(in1, in1);
297 0 : in2 = _mm_mul_ps(in2, in2);
298 0 : in3 = _mm_mul_ps(in3, in3);
299 :
300 0 : acc0 = _mm_add_ps(acc0, in0);
301 0 : acc1 = _mm_add_ps(acc1, in1);
302 0 : acc2 = _mm_add_ps(acc2, in2);
303 0 : acc3 = _mm_add_ps(acc3, in3);
304 : }
305 :
306 0 : acc0 = _mm_add_ps(acc0, acc1);
307 0 : acc0 = _mm_add_ps(acc0, acc2);
308 0 : acc0 = _mm_add_ps(acc0, acc3);
309 :
310 : _mm_store_ps(out, acc0);
311 :
312 0 : return out[0] + out[1] + out[2] + out[3];
313 : }
314 :
315 : }
|