Line data Source code
1 : /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2 : * This Source Code Form is subject to the terms of the Mozilla Public
3 : * License, v. 2.0. If a copy of the MPL was not distributed with this
4 : * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 :
6 : #ifndef _MOZILLA_GFX_SIMD_H_
7 : #define _MOZILLA_GFX_SIMD_H_
8 :
9 : /**
10 : * Consumers of this file need to #define SIMD_COMPILE_SSE2 before including it
11 : * if they want access to the SSE2 functions.
12 : */
13 :
14 : #ifdef SIMD_COMPILE_SSE2
15 : #include <emmintrin.h>
16 : #endif
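
// Example (illustrative, not part of the original header): a consumer that
// wants the SSE2 overloads defines the macro before including this file,
// e.g.
//
//   #define SIMD_COMPILE_SSE2
//   #include "Simd.h"   // file name assumed from the include guard
//
// and can then instantiate the generic templates below with __m128i/__m128
// instead of the scalar types.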
17 :
18 : namespace mozilla {
19 : namespace gfx {
20 :
21 : namespace simd {
22 :
23 : template<typename u8x16_t>
24 : u8x16_t Load8(const uint8_t* aSource);
25 :
26 : template<typename u8x16_t>
27 : u8x16_t From8(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
28 : uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p);
29 :
30 : template<typename u8x16_t>
31 : u8x16_t FromZero8();
32 :
33 : template<typename i16x8_t>
34 : i16x8_t FromI16(int16_t a, int16_t b, int16_t c, int16_t d, int16_t e, int16_t f, int16_t g, int16_t h);
35 :
36 : template<typename u16x8_t>
37 : u16x8_t FromU16(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h);
38 :
39 : template<typename i16x8_t>
40 : i16x8_t FromI16(int16_t a);
41 :
42 : template<typename u16x8_t>
43 : u16x8_t FromU16(uint16_t a);
44 :
45 : template<typename i32x4_t>
46 : i32x4_t From32(int32_t a, int32_t b, int32_t c, int32_t d);
47 :
48 : template<typename i32x4_t>
49 : i32x4_t From32(int32_t a);
50 :
51 : template<typename f32x4_t>
52 : f32x4_t FromF32(float a, float b, float c, float d);
53 :
54 : template<typename f32x4_t>
55 : f32x4_t FromF32(float a);
56 :
57 : // All SIMD backends overload these functions for their SIMD types:
58 :
59 : #if 0
60 :
61 : // Store 16 bytes to a 16-byte aligned address
62 : void Store8(uint8_t* aTarget, u8x16_t aM);
63 :
64 : // Fixed shifts
65 : template<int32_t aNumberOfBits> i16x8_t ShiftRight16(i16x8_t aM);
66 : template<int32_t aNumberOfBits> i32x4_t ShiftRight32(i32x4_t aM);
67 :
68 : i16x8_t Add16(i16x8_t aM1, i16x8_t aM2);
69 : i32x4_t Add32(i32x4_t aM1, i32x4_t aM2);
70 : i16x8_t Sub16(i16x8_t aM1, i16x8_t aM2);
71 : i32x4_t Sub32(i32x4_t aM1, i32x4_t aM2);
72 : u8x16_t Min8(u8x16_t aM1, u8x16_t aM2);
73 : u8x16_t Max8(u8x16_t aM1, u8x16_t aM2);
74 : i32x4_t Min32(i32x4_t aM1, i32x4_t aM2);
75 : i32x4_t Max32(i32x4_t aM1, i32x4_t aM2);
76 :
77 : // Truncating i16 -> i16 multiplication
78 : i16x8_t Mul16(i16x8_t aM1, i16x8_t aM2);
79 :
80 : // Long multiplication i16 -> i32
81 : // aFactorsA1B1 = (a1[4] b1[4])
82 : // aFactorsA2B2 = (a2[4] b2[4])
83 : // aProductA = a1 * a2, aProductB = b1 * b2
84 : void Mul16x4x2x2To32x4x2(i16x8_t aFactorsA1B1, i16x8_t aFactorsA2B2,
85 : i32x4_t& aProductA, i32x4_t& aProductB);
86 :
87 : // Long multiplication + pairwise addition i16 -> i32
88 : // See the scalar implementation for specifics.
89 : i32x4_t MulAdd16x8x2To32x4(i16x8_t aFactorsA, i16x8_t aFactorsB);
90 : i32x4_t MulAdd16x8x2To32x4(u16x8_t aFactorsA, u16x8_t aFactorsB);
91 :
92 : // Set all four 32-bit components to the value of the component at aIndex.
93 : template<int8_t aIndex>
94 : i32x4_t Splat32(i32x4_t aM);
95 :
96 : // Interpret the input as four 32-bit values, apply Splat32<aIndex> to them,
97 : // and re-interpret the result as sixteen 8-bit values.
98 : template<int8_t aIndex>
99 : u8x16_t Splat32On8(u8x16_t aM);
100 :
101 : template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i32x4_t Shuffle32(i32x4_t aM);
102 : template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i16x8_t ShuffleLo16(i16x8_t aM);
103 : template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i16x8_t ShuffleHi16(i16x8_t aM);
104 :
105 : u8x16_t InterleaveLo8(u8x16_t m1, u8x16_t m2);
106 : u8x16_t InterleaveHi8(u8x16_t m1, u8x16_t m2);
107 : i16x8_t InterleaveLo16(i16x8_t m1, i16x8_t m2);
108 : i16x8_t InterleaveHi16(i16x8_t m1, i16x8_t m2);
109 : i32x4_t InterleaveLo32(i32x4_t m1, i32x4_t m2);
110 :
111 : i16x8_t UnpackLo8x8ToI16x8(u8x16_t m);
112 : i16x8_t UnpackHi8x8ToI16x8(u8x16_t m);
113 : u16x8_t UnpackLo8x8ToU16x8(u8x16_t m);
114 : u16x8_t UnpackHi8x8ToU16x8(u8x16_t m);
115 :
116 : i16x8_t PackAndSaturate32To16(i32x4_t m1, i32x4_t m2);
117 : u8x16_t PackAndSaturate16To8(i16x8_t m1, i16x8_t m2);
118 : u8x16_t PackAndSaturate32To8(i32x4_t m1, i32x4_t m2, i32x4_t m3, const i32x4_t& m4);
119 :
120 : i32x4_t FastDivideBy255(i32x4_t m);
121 : i16x8_t FastDivideBy255_16(i16x8_t m);
122 :
123 : #endif
124 :
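// A minimal usage sketch (illustrative only, not part of the original header),
// showing how code is written once against the overload set above and then
// instantiated with either the scalar types or, under SIMD_COMPILE_SSE2, the
// SSE2 types. The function ClampBytes and its semantics are assumptions made
// for this example; it follows the header's own convention of keeping
// non-compiled documentation inside #if 0.
#if 0
template<typename u8x16_t>
void ClampBytes(uint8_t* aDst, const uint8_t* aSrc,
                const uint8_t* aLo, const uint8_t* aHi)
{
  // Load 16 bytes from each 16-byte aligned input, clamp component-wise
  // to [lo, hi], and store the 16 resulting bytes.
  u8x16_t v  = Load8<u8x16_t>(aSrc);
  u8x16_t lo = Load8<u8x16_t>(aLo);
  u8x16_t hi = Load8<u8x16_t>(aHi);
  Store8(aDst, Min8(Max8(v, lo), hi));
}
// Usage: ClampBytes<Scalaru8x16_t>(dst, src, lo, hi) for the scalar backend,
// or ClampBytes<__m128i>(dst, src, lo, hi) when SIMD_COMPILE_SSE2 is defined.
#endif
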
125 : // Scalar
126 :
127 : struct Scalaru8x16_t {
128 : uint8_t u8[16];
129 : };
130 :
131 : union Scalari16x8_t {
132 : int16_t i16[8];
133 : uint16_t u16[8];
134 : };
135 :
136 : typedef Scalari16x8_t Scalaru16x8_t;
137 :
138 : struct Scalari32x4_t {
139 : int32_t i32[4];
140 : };
141 :
142 : struct Scalarf32x4_t {
143 : float f32[4];
144 : };
145 :
146 : template<>
147 : inline Scalaru8x16_t
148 0 : Load8<Scalaru8x16_t>(const uint8_t* aSource)
149 : {
150 0 : return *(Scalaru8x16_t*)aSource;
151 : }
152 :
153 0 : inline void Store8(uint8_t* aTarget, Scalaru8x16_t aM)
154 : {
155 0 : *(Scalaru8x16_t*)aTarget = aM;
156 0 : }
157 :
158 : template<>
159 0 : inline Scalaru8x16_t From8<Scalaru8x16_t>(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
160 : uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p)
161 : {
162 : Scalaru8x16_t _m;
163 0 : _m.u8[0] = a;
164 0 : _m.u8[1] = b;
165 0 : _m.u8[2] = c;
166 0 : _m.u8[3] = d;
167 0 : _m.u8[4] = e;
168 0 : _m.u8[5] = f;
169 0 : _m.u8[6] = g;
170 0 : _m.u8[7] = h;
171 0 : _m.u8[8+0] = i;
172 0 : _m.u8[8+1] = j;
173 0 : _m.u8[8+2] = k;
174 0 : _m.u8[8+3] = l;
175 0 : _m.u8[8+4] = m;
176 0 : _m.u8[8+5] = n;
177 0 : _m.u8[8+6] = o;
178 0 : _m.u8[8+7] = p;
179 0 : return _m;
180 : }
181 :
182 : template<>
183 0 : inline Scalaru8x16_t FromZero8<Scalaru8x16_t>()
184 : {
185 0 : return From8<Scalaru8x16_t>(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
186 : }
187 :
188 : template<>
189 0 : inline Scalari16x8_t FromI16<Scalari16x8_t>(int16_t a, int16_t b, int16_t c, int16_t d, int16_t e, int16_t f, int16_t g, int16_t h)
190 : {
191 : Scalari16x8_t m;
192 0 : m.i16[0] = a;
193 0 : m.i16[1] = b;
194 0 : m.i16[2] = c;
195 0 : m.i16[3] = d;
196 0 : m.i16[4] = e;
197 0 : m.i16[5] = f;
198 0 : m.i16[6] = g;
199 0 : m.i16[7] = h;
200 0 : return m;
201 : }
202 :
203 : template<>
204 0 : inline Scalaru16x8_t FromU16<Scalaru16x8_t>(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h)
205 : {
206 : Scalaru16x8_t m;
207 0 : m.u16[0] = a;
208 0 : m.u16[1] = b;
209 0 : m.u16[2] = c;
210 0 : m.u16[3] = d;
211 0 : m.u16[4] = e;
212 0 : m.u16[5] = f;
213 0 : m.u16[6] = g;
214 0 : m.u16[7] = h;
215 0 : return m;
216 : }
217 :
218 : template<>
219 0 : inline Scalari16x8_t FromI16<Scalari16x8_t>(int16_t a)
220 : {
221 0 : return FromI16<Scalari16x8_t>(a, a, a, a, a, a, a, a);
222 : }
223 :
224 : template<>
225 0 : inline Scalaru16x8_t FromU16<Scalaru16x8_t>(uint16_t a)
226 : {
227 0 : return FromU16<Scalaru16x8_t>(a, a, a, a, a, a, a, a);
228 : }
229 :
230 : template<>
231 0 : inline Scalari32x4_t From32<Scalari32x4_t>(int32_t a, int32_t b, int32_t c, int32_t d)
232 : {
233 : Scalari32x4_t m;
234 0 : m.i32[0] = a;
235 0 : m.i32[1] = b;
236 0 : m.i32[2] = c;
237 0 : m.i32[3] = d;
238 0 : return m;
239 : }
240 :
241 : template<>
242 0 : inline Scalarf32x4_t FromF32<Scalarf32x4_t>(float a, float b, float c, float d)
243 : {
244 : Scalarf32x4_t m;
245 0 : m.f32[0] = a;
246 0 : m.f32[1] = b;
247 0 : m.f32[2] = c;
248 0 : m.f32[3] = d;
249 0 : return m;
250 : }
251 :
252 : template<>
253 0 : inline Scalarf32x4_t FromF32<Scalarf32x4_t>(float a)
254 : {
255 0 : return FromF32<Scalarf32x4_t>(a, a, a, a);
256 : }
257 :
258 : template<>
259 : inline Scalari32x4_t From32<Scalari32x4_t>(int32_t a)
260 : {
261 : return From32<Scalari32x4_t>(a, a, a, a);
262 : }
263 :
264 : template<int32_t aNumberOfBits>
265 : inline Scalari16x8_t ShiftRight16(Scalari16x8_t aM)
266 : {
267 : return FromI16<Scalari16x8_t>(uint16_t(aM.i16[0]) >> aNumberOfBits, uint16_t(aM.i16[1]) >> aNumberOfBits,
268 : uint16_t(aM.i16[2]) >> aNumberOfBits, uint16_t(aM.i16[3]) >> aNumberOfBits,
269 : uint16_t(aM.i16[4]) >> aNumberOfBits, uint16_t(aM.i16[5]) >> aNumberOfBits,
270 : uint16_t(aM.i16[6]) >> aNumberOfBits, uint16_t(aM.i16[7]) >> aNumberOfBits);
271 : }
272 :
273 : template<int32_t aNumberOfBits>
274 0 : inline Scalari32x4_t ShiftRight32(Scalari32x4_t aM)
275 : {
276 0 : return From32<Scalari32x4_t>(aM.i32[0] >> aNumberOfBits, aM.i32[1] >> aNumberOfBits,
277 0 : aM.i32[2] >> aNumberOfBits, aM.i32[3] >> aNumberOfBits);
278 : }
279 :
280 : inline Scalaru16x8_t Add16(Scalaru16x8_t aM1, Scalaru16x8_t aM2)
281 : {
282 : return FromU16<Scalaru16x8_t>(aM1.u16[0] + aM2.u16[0], aM1.u16[1] + aM2.u16[1],
283 : aM1.u16[2] + aM2.u16[2], aM1.u16[3] + aM2.u16[3],
284 : aM1.u16[4] + aM2.u16[4], aM1.u16[5] + aM2.u16[5],
285 : aM1.u16[6] + aM2.u16[6], aM1.u16[7] + aM2.u16[7]);
286 : }
287 :
288 0 : inline Scalari32x4_t Add32(Scalari32x4_t aM1, Scalari32x4_t aM2)
289 : {
290 0 : return From32<Scalari32x4_t>(aM1.i32[0] + aM2.i32[0], aM1.i32[1] + aM2.i32[1],
291 0 : aM1.i32[2] + aM2.i32[2], aM1.i32[3] + aM2.i32[3]);
292 : }
293 :
294 0 : inline Scalaru16x8_t Sub16(Scalaru16x8_t aM1, Scalaru16x8_t aM2)
295 : {
296 0 : return FromU16<Scalaru16x8_t>(aM1.u16[0] - aM2.u16[0], aM1.u16[1] - aM2.u16[1],
297 0 : aM1.u16[2] - aM2.u16[2], aM1.u16[3] - aM2.u16[3],
298 0 : aM1.u16[4] - aM2.u16[4], aM1.u16[5] - aM2.u16[5],
299 0 : aM1.u16[6] - aM2.u16[6], aM1.u16[7] - aM2.u16[7]);
300 : }
301 :
302 : inline Scalari32x4_t Sub32(Scalari32x4_t aM1, Scalari32x4_t aM2)
303 : {
304 : return From32<Scalari32x4_t>(aM1.i32[0] - aM2.i32[0], aM1.i32[1] - aM2.i32[1],
305 : aM1.i32[2] - aM2.i32[2], aM1.i32[3] - aM2.i32[3]);
306 : }
307 :
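// Branchless min/max helpers (used by the Min*/Max* overloads below). When
// a > b the mask -(a > b) is all ones, so a - ((a - b) & mask) yields b;
// otherwise the mask is zero and the result is a. Despite the 'u' prefix
// they operate on int32_t values.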
308 : inline int32_t
309 0 : umin(int32_t a, int32_t b)
310 : {
311 0 : return a - ((a - b) & -(a > b));
312 : }
313 :
314 : inline int32_t
315 : umax(int32_t a, int32_t b)
316 : {
317 : return a - ((a - b) & -(a < b));
318 : }
319 :
320 : inline Scalaru8x16_t Min8(Scalaru8x16_t aM1, Scalaru8x16_t aM2)
321 : {
322 : return From8<Scalaru8x16_t>(umin(aM1.u8[0], aM2.u8[0]), umin(aM1.u8[1], aM2.u8[1]),
323 : umin(aM1.u8[2], aM2.u8[2]), umin(aM1.u8[3], aM2.u8[3]),
324 : umin(aM1.u8[4], aM2.u8[4]), umin(aM1.u8[5], aM2.u8[5]),
325 : umin(aM1.u8[6], aM2.u8[6]), umin(aM1.u8[7], aM2.u8[7]),
326 : umin(aM1.u8[8+0], aM2.u8[8+0]), umin(aM1.u8[8+1], aM2.u8[8+1]),
327 : umin(aM1.u8[8+2], aM2.u8[8+2]), umin(aM1.u8[8+3], aM2.u8[8+3]),
328 : umin(aM1.u8[8+4], aM2.u8[8+4]), umin(aM1.u8[8+5], aM2.u8[8+5]),
329 : umin(aM1.u8[8+6], aM2.u8[8+6]), umin(aM1.u8[8+7], aM2.u8[8+7]));
330 : }
331 :
332 : inline Scalaru8x16_t Max8(Scalaru8x16_t aM1, Scalaru8x16_t aM2)
333 : {
334 : return From8<Scalaru8x16_t>(umax(aM1.u8[0], aM2.u8[0]), umax(aM1.u8[1], aM2.u8[1]),
335 : umax(aM1.u8[2], aM2.u8[2]), umax(aM1.u8[3], aM2.u8[3]),
336 : umax(aM1.u8[4], aM2.u8[4]), umax(aM1.u8[5], aM2.u8[5]),
337 : umax(aM1.u8[6], aM2.u8[6]), umax(aM1.u8[7], aM2.u8[7]),
338 : umax(aM1.u8[8+0], aM2.u8[8+0]), umax(aM1.u8[8+1], aM2.u8[8+1]),
339 : umax(aM1.u8[8+2], aM2.u8[8+2]), umax(aM1.u8[8+3], aM2.u8[8+3]),
340 : umax(aM1.u8[8+4], aM2.u8[8+4]), umax(aM1.u8[8+5], aM2.u8[8+5]),
341 : umax(aM1.u8[8+6], aM2.u8[8+6]), umax(aM1.u8[8+7], aM2.u8[8+7]));
342 : }
343 :
344 : inline Scalari32x4_t Min32(Scalari32x4_t aM1, Scalari32x4_t aM2)
345 : {
346 : return From32<Scalari32x4_t>(umin(aM1.i32[0], aM2.i32[0]), umin(aM1.i32[1], aM2.i32[1]),
347 : umin(aM1.i32[2], aM2.i32[2]), umin(aM1.i32[3], aM2.i32[3]));
348 : }
349 :
350 : inline Scalari32x4_t Max32(Scalari32x4_t aM1, Scalari32x4_t aM2)
351 : {
352 : return From32<Scalari32x4_t>(umax(aM1.i32[0], aM2.i32[0]), umax(aM1.i32[1], aM2.i32[1]),
353 : umax(aM1.i32[2], aM2.i32[2]), umax(aM1.i32[3], aM2.i32[3]));
354 : }
355 :
356 0 : inline Scalaru16x8_t Mul16(Scalaru16x8_t aM1, Scalaru16x8_t aM2)
357 : {
358 0 : return FromU16<Scalaru16x8_t>(uint16_t(int32_t(aM1.u16[0]) * int32_t(aM2.u16[0])), uint16_t(int32_t(aM1.u16[1]) * int32_t(aM2.u16[1])),
359 0 : uint16_t(int32_t(aM1.u16[2]) * int32_t(aM2.u16[2])), uint16_t(int32_t(aM1.u16[3]) * int32_t(aM2.u16[3])),
360 0 : uint16_t(int32_t(aM1.u16[4]) * int32_t(aM2.u16[4])), uint16_t(int32_t(aM1.u16[5]) * int32_t(aM2.u16[5])),
361 0 : uint16_t(int32_t(aM1.u16[6]) * int32_t(aM2.u16[6])), uint16_t(int32_t(aM1.u16[7]) * int32_t(aM2.u16[7])));
362 : }
363 :
364 0 : inline void Mul16x4x2x2To32x4x2(Scalari16x8_t aFactorsA1B1,
365 : Scalari16x8_t aFactorsA2B2,
366 : Scalari32x4_t& aProductA,
367 : Scalari32x4_t& aProductB)
368 : {
369 0 : aProductA = From32<Scalari32x4_t>(aFactorsA1B1.i16[0] * aFactorsA2B2.i16[0],
370 0 : aFactorsA1B1.i16[1] * aFactorsA2B2.i16[1],
371 0 : aFactorsA1B1.i16[2] * aFactorsA2B2.i16[2],
372 0 : aFactorsA1B1.i16[3] * aFactorsA2B2.i16[3]);
373 0 : aProductB = From32<Scalari32x4_t>(aFactorsA1B1.i16[4] * aFactorsA2B2.i16[4],
374 0 : aFactorsA1B1.i16[5] * aFactorsA2B2.i16[5],
375 0 : aFactorsA1B1.i16[6] * aFactorsA2B2.i16[6],
376 0 : aFactorsA1B1.i16[7] * aFactorsA2B2.i16[7]);
377 0 : }
378 :
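// Scalar model of the pairwise multiply-add described above (cf. the
// _mm_madd_epi16-based SSE2 overload below): 32-bit lane k of the result is
// aFactorsA.i16[2k] * aFactorsB.i16[2k] + aFactorsA.i16[2k+1] * aFactorsB.i16[2k+1].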
379 0 : inline Scalari32x4_t MulAdd16x8x2To32x4(Scalari16x8_t aFactorsA,
380 : Scalari16x8_t aFactorsB)
381 : {
382 0 : return From32<Scalari32x4_t>(aFactorsA.i16[0] * aFactorsB.i16[0] + aFactorsA.i16[1] * aFactorsB.i16[1],
383 0 : aFactorsA.i16[2] * aFactorsB.i16[2] + aFactorsA.i16[3] * aFactorsB.i16[3],
384 0 : aFactorsA.i16[4] * aFactorsB.i16[4] + aFactorsA.i16[5] * aFactorsB.i16[5],
385 0 : aFactorsA.i16[6] * aFactorsB.i16[6] + aFactorsA.i16[7] * aFactorsB.i16[7]);
386 : }
387 :
388 : template<int8_t aIndex>
389 0 : inline void AssertIndex()
390 : {
391 : static_assert(aIndex == 0 || aIndex == 1 || aIndex == 2 || aIndex == 3,
392 : "Invalid splat index");
393 0 : }
394 :
395 : template<int8_t aIndex>
396 : inline Scalari32x4_t Splat32(Scalari32x4_t aM)
397 : {
398 : AssertIndex<aIndex>();
399 : return From32<Scalari32x4_t>(aM.i32[aIndex], aM.i32[aIndex],
400 : aM.i32[aIndex], aM.i32[aIndex]);
401 : }
402 :
403 : template<int8_t i>
404 0 : inline Scalaru8x16_t Splat32On8(Scalaru8x16_t aM)
405 : {
406 0 : AssertIndex<i>();
407 0 : return From8<Scalaru8x16_t>(aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3],
408 : aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3],
409 : aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3],
410 0 : aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3]);
411 : }
412 :
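// Note on index order: the shuffle templates follow the _MM_SHUFFLE
// convention, so i3 selects the source lane for element 0 and i0 the source
// lane for element 3 (matching the SSE2 overloads further down).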
413 : template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
414 : inline Scalari32x4_t Shuffle32(Scalari32x4_t aM)
415 : {
416 : AssertIndex<i0>();
417 : AssertIndex<i1>();
418 : AssertIndex<i2>();
419 : AssertIndex<i3>();
420 : Scalari32x4_t m = aM;
421 : m.i32[0] = aM.i32[i3];
422 : m.i32[1] = aM.i32[i2];
423 : m.i32[2] = aM.i32[i1];
424 : m.i32[3] = aM.i32[i0];
425 : return m;
426 : }
427 :
428 : template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
429 0 : inline Scalari16x8_t ShuffleLo16(Scalari16x8_t aM)
430 : {
431 0 : AssertIndex<i0>();
432 0 : AssertIndex<i1>();
433 0 : AssertIndex<i2>();
434 0 : AssertIndex<i3>();
435 0 : Scalari16x8_t m = aM;
436 0 : m.i16[0] = aM.i16[i3];
437 0 : m.i16[1] = aM.i16[i2];
438 0 : m.i16[2] = aM.i16[i1];
439 0 : m.i16[3] = aM.i16[i0];
440 0 : return m;
441 : }
442 :
443 : template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
444 0 : inline Scalari16x8_t ShuffleHi16(Scalari16x8_t aM)
445 : {
446 0 : AssertIndex<i0>();
447 0 : AssertIndex<i1>();
448 0 : AssertIndex<i2>();
449 0 : AssertIndex<i3>();
450 0 : Scalari16x8_t m = aM;
451 0 : m.i16[4 + 0] = aM.i16[4 + i3];
452 0 : m.i16[4 + 1] = aM.i16[4 + i2];
453 0 : m.i16[4 + 2] = aM.i16[4 + i1];
454 0 : m.i16[4 + 3] = aM.i16[4 + i0];
455 0 : return m;
456 : }
457 :
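// Broadcast u16 lane aIndexLo across the low four lanes and u16 lane
// 4 + aIndexHi across the high four lanes (scalar counterpart of the SSE2
// Splat16 below).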
458 : template<int8_t aIndexLo, int8_t aIndexHi>
459 0 : inline Scalaru16x8_t Splat16(Scalaru16x8_t aM)
460 : {
461 0 : AssertIndex<aIndexLo>();
462 0 : AssertIndex<aIndexHi>();
463 : Scalaru16x8_t m;
464 0 : uint16_t chosenValueLo = aM.u16[aIndexLo];
465 0 : m.u16[0] = chosenValueLo;
466 0 : m.u16[1] = chosenValueLo;
467 0 : m.u16[2] = chosenValueLo;
468 0 : m.u16[3] = chosenValueLo;
469 0 : uint16_t chosenValueHi = aM.u16[4 + aIndexHi];
470 0 : m.u16[4] = chosenValueHi;
471 0 : m.u16[5] = chosenValueHi;
472 0 : m.u16[6] = chosenValueHi;
473 0 : m.u16[7] = chosenValueHi;
474 0 : return m;
475 : }
476 :
477 : inline Scalaru8x16_t
478 0 : InterleaveLo8(Scalaru8x16_t m1, Scalaru8x16_t m2)
479 : {
480 0 : return From8<Scalaru8x16_t>(m1.u8[0], m2.u8[0], m1.u8[1], m2.u8[1],
481 0 : m1.u8[2], m2.u8[2], m1.u8[3], m2.u8[3],
482 0 : m1.u8[4], m2.u8[4], m1.u8[5], m2.u8[5],
483 0 : m1.u8[6], m2.u8[6], m1.u8[7], m2.u8[7]);
484 : }
485 :
486 : inline Scalaru8x16_t
487 0 : InterleaveHi8(Scalaru8x16_t m1, Scalaru8x16_t m2)
488 : {
489 0 : return From8<Scalaru8x16_t>(m1.u8[8+0], m2.u8[8+0], m1.u8[8+1], m2.u8[8+1],
490 0 : m1.u8[8+2], m2.u8[8+2], m1.u8[8+3], m2.u8[8+3],
491 0 : m1.u8[8+4], m2.u8[8+4], m1.u8[8+5], m2.u8[8+5],
492 0 : m1.u8[8+6], m2.u8[8+6], m1.u8[8+7], m2.u8[8+7]);
493 : }
494 :
495 : inline Scalaru16x8_t
496 0 : InterleaveLo16(Scalaru16x8_t m1, Scalaru16x8_t m2)
497 : {
498 0 : return FromU16<Scalaru16x8_t>(m1.u16[0], m2.u16[0], m1.u16[1], m2.u16[1],
499 0 : m1.u16[2], m2.u16[2], m1.u16[3], m2.u16[3]);
500 : }
501 :
502 : inline Scalaru16x8_t
503 0 : InterleaveHi16(Scalaru16x8_t m1, Scalaru16x8_t m2)
504 : {
505 0 : return FromU16<Scalaru16x8_t>(m1.u16[4], m2.u16[4], m1.u16[5], m2.u16[5],
506 0 : m1.u16[6], m2.u16[6], m1.u16[7], m2.u16[7]);
507 : }
508 :
509 : inline Scalari32x4_t
510 : InterleaveLo32(Scalari32x4_t m1, Scalari32x4_t m2)
511 : {
512 : return From32<Scalari32x4_t>(m1.i32[0], m2.i32[0], m1.i32[1], m2.i32[1]);
513 : }
514 :
515 : inline Scalari16x8_t
516 0 : UnpackLo8x8ToI16x8(Scalaru8x16_t aM)
517 : {
518 : Scalari16x8_t m;
519 0 : m.i16[0] = aM.u8[0];
520 0 : m.i16[1] = aM.u8[1];
521 0 : m.i16[2] = aM.u8[2];
522 0 : m.i16[3] = aM.u8[3];
523 0 : m.i16[4] = aM.u8[4];
524 0 : m.i16[5] = aM.u8[5];
525 0 : m.i16[6] = aM.u8[6];
526 0 : m.i16[7] = aM.u8[7];
527 0 : return m;
528 : }
529 :
530 : inline Scalari16x8_t
531 0 : UnpackHi8x8ToI16x8(Scalaru8x16_t aM)
532 : {
533 : Scalari16x8_t m;
534 0 : m.i16[0] = aM.u8[8+0];
535 0 : m.i16[1] = aM.u8[8+1];
536 0 : m.i16[2] = aM.u8[8+2];
537 0 : m.i16[3] = aM.u8[8+3];
538 0 : m.i16[4] = aM.u8[8+4];
539 0 : m.i16[5] = aM.u8[8+5];
540 0 : m.i16[6] = aM.u8[8+6];
541 0 : m.i16[7] = aM.u8[8+7];
542 0 : return m;
543 : }
544 :
545 : inline Scalaru16x8_t
546 0 : UnpackLo8x8ToU16x8(Scalaru8x16_t aM)
547 : {
548 0 : return FromU16<Scalaru16x8_t>(uint16_t(aM.u8[0]), uint16_t(aM.u8[1]), uint16_t(aM.u8[2]), uint16_t(aM.u8[3]),
549 0 : uint16_t(aM.u8[4]), uint16_t(aM.u8[5]), uint16_t(aM.u8[6]), uint16_t(aM.u8[7]));
550 : }
551 :
552 : inline Scalaru16x8_t
553 0 : UnpackHi8x8ToU16x8(Scalaru8x16_t aM)
554 : {
555 0 : return FromU16<Scalaru16x8_t>(aM.u8[8+0], aM.u8[8+1], aM.u8[8+2], aM.u8[8+3],
556 0 : aM.u8[8+4], aM.u8[8+5], aM.u8[8+6], aM.u8[8+7]);
557 : }
558 :
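// Treat (a1234, a5678) as one 32-byte buffer and return the 16 bytes that
// start aNumBytes into it, i.e. the contents of a1234 shifted down by
// aNumBytes bytes with the low bytes of a5678 shifted in at the top.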
559 : template<uint8_t aNumBytes>
560 : inline Scalaru8x16_t
561 : Rotate8(Scalaru8x16_t a1234, Scalaru8x16_t a5678)
562 : {
563 : Scalaru8x16_t m;
564 : for (uint8_t i = 0; i < 16; i++) {
565 : uint8_t sourceByte = i + aNumBytes;
566 : m.u8[i] = sourceByte < 16 ? a1234.u8[sourceByte] : a5678.u8[sourceByte - 16];
567 : }
568 : return m;
569 : }
570 :
571 : template<typename T>
572 : inline int16_t
573 0 : SaturateTo16(T a)
574 : {
575 0 : return int16_t(a >= INT16_MIN ? (a <= INT16_MAX ? a : INT16_MAX) : INT16_MIN);
576 : }
577 :
578 : inline Scalari16x8_t
579 0 : PackAndSaturate32To16(Scalari32x4_t m1, Scalari32x4_t m2)
580 : {
581 : Scalari16x8_t m;
582 0 : m.i16[0] = SaturateTo16(m1.i32[0]);
583 0 : m.i16[1] = SaturateTo16(m1.i32[1]);
584 0 : m.i16[2] = SaturateTo16(m1.i32[2]);
585 0 : m.i16[3] = SaturateTo16(m1.i32[3]);
586 0 : m.i16[4] = SaturateTo16(m2.i32[0]);
587 0 : m.i16[5] = SaturateTo16(m2.i32[1]);
588 0 : m.i16[6] = SaturateTo16(m2.i32[2]);
589 0 : m.i16[7] = SaturateTo16(m2.i32[3]);
590 0 : return m;
591 : }
592 :
593 : template<typename T>
594 : inline uint16_t
595 0 : SaturateToU16(T a)
596 : {
597 0 : return uint16_t(umin(a & -(a >= 0), INT16_MAX));
598 : }
599 :
600 : inline Scalaru16x8_t
601 0 : PackAndSaturate32ToU16(Scalari32x4_t m1, Scalari32x4_t m2)
602 : {
603 : Scalaru16x8_t m;
604 0 : m.u16[0] = SaturateToU16(m1.i32[0]);
605 0 : m.u16[1] = SaturateToU16(m1.i32[1]);
606 0 : m.u16[2] = SaturateToU16(m1.i32[2]);
607 0 : m.u16[3] = SaturateToU16(m1.i32[3]);
608 0 : m.u16[4] = SaturateToU16(m2.i32[0]);
609 0 : m.u16[5] = SaturateToU16(m2.i32[1]);
610 0 : m.u16[6] = SaturateToU16(m2.i32[2]);
611 0 : m.u16[7] = SaturateToU16(m2.i32[3]);
612 0 : return m;
613 : }
614 :
615 : template<typename T>
616 : inline uint8_t
617 0 : SaturateTo8(T a)
618 : {
619 0 : return uint8_t(umin(a & -(a >= 0), 255));
620 : }
621 :
622 : inline Scalaru8x16_t
623 0 : PackAndSaturate32To8(Scalari32x4_t m1, Scalari32x4_t m2, Scalari32x4_t m3, const Scalari32x4_t& m4)
624 : {
625 : Scalaru8x16_t m;
626 0 : m.u8[0] = SaturateTo8(m1.i32[0]);
627 0 : m.u8[1] = SaturateTo8(m1.i32[1]);
628 0 : m.u8[2] = SaturateTo8(m1.i32[2]);
629 0 : m.u8[3] = SaturateTo8(m1.i32[3]);
630 0 : m.u8[4] = SaturateTo8(m2.i32[0]);
631 0 : m.u8[5] = SaturateTo8(m2.i32[1]);
632 0 : m.u8[6] = SaturateTo8(m2.i32[2]);
633 0 : m.u8[7] = SaturateTo8(m2.i32[3]);
634 0 : m.u8[8] = SaturateTo8(m3.i32[0]);
635 0 : m.u8[9] = SaturateTo8(m3.i32[1]);
636 0 : m.u8[10] = SaturateTo8(m3.i32[2]);
637 0 : m.u8[11] = SaturateTo8(m3.i32[3]);
638 0 : m.u8[12] = SaturateTo8(m4.i32[0]);
639 0 : m.u8[13] = SaturateTo8(m4.i32[1]);
640 0 : m.u8[14] = SaturateTo8(m4.i32[2]);
641 0 : m.u8[15] = SaturateTo8(m4.i32[3]);
642 0 : return m;
643 : }
644 :
645 : inline Scalaru8x16_t
646 0 : PackAndSaturate16To8(Scalari16x8_t m1, Scalari16x8_t m2)
647 : {
648 : Scalaru8x16_t m;
649 0 : m.u8[0] = SaturateTo8(m1.i16[0]);
650 0 : m.u8[1] = SaturateTo8(m1.i16[1]);
651 0 : m.u8[2] = SaturateTo8(m1.i16[2]);
652 0 : m.u8[3] = SaturateTo8(m1.i16[3]);
653 0 : m.u8[4] = SaturateTo8(m1.i16[4]);
654 0 : m.u8[5] = SaturateTo8(m1.i16[5]);
655 0 : m.u8[6] = SaturateTo8(m1.i16[6]);
656 0 : m.u8[7] = SaturateTo8(m1.i16[7]);
657 0 : m.u8[8] = SaturateTo8(m2.i16[0]);
658 0 : m.u8[9] = SaturateTo8(m2.i16[1]);
659 0 : m.u8[10] = SaturateTo8(m2.i16[2]);
660 0 : m.u8[11] = SaturateTo8(m2.i16[3]);
661 0 : m.u8[12] = SaturateTo8(m2.i16[4]);
662 0 : m.u8[13] = SaturateTo8(m2.i16[5]);
663 0 : m.u8[14] = SaturateTo8(m2.i16[6]);
664 0 : m.u8[15] = SaturateTo8(m2.i16[7]);
665 0 : return m;
666 : }
667 :
668 : // Fast approximate division by 255. It has the property that
669 : // for all 0 <= n <= 255*255, FastDivideBy255(n) == n/255,
670 : // but it uses only two adds and two shifts instead of an
671 : // integer division (which is expensive on many processors).
672 : //
673 : // Equivalent to v/255.
674 : template<class B, class A>
675 0 : inline B FastDivideBy255(A v)
676 : {
677 0 : return ((v << 8) + v + 255) >> 16;
678 : }
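// Why the identity holds (illustrative derivation, not from the original
// source): (v << 8) + v + 255 == 257*v + 255, and writing v = 255*q + r with
// 0 <= r < 255 gives 257*v + 255 == 65536*q + (257*r + 255 - q). For
// 0 <= v <= 255*255 we have q <= 255, so that remainder stays in [0, 65536),
// and the shift by 16 returns exactly q == v/255.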
679 :
680 : inline Scalaru16x8_t
681 0 : FastDivideBy255_16(Scalaru16x8_t m)
682 : {
683 0 : return FromU16<Scalaru16x8_t>(FastDivideBy255<uint16_t>(int32_t(m.u16[0])),
684 0 : FastDivideBy255<uint16_t>(int32_t(m.u16[1])),
685 0 : FastDivideBy255<uint16_t>(int32_t(m.u16[2])),
686 0 : FastDivideBy255<uint16_t>(int32_t(m.u16[3])),
687 0 : FastDivideBy255<uint16_t>(int32_t(m.u16[4])),
688 0 : FastDivideBy255<uint16_t>(int32_t(m.u16[5])),
689 0 : FastDivideBy255<uint16_t>(int32_t(m.u16[6])),
690 0 : FastDivideBy255<uint16_t>(int32_t(m.u16[7])));
691 : }
692 :
693 : inline Scalari32x4_t
694 0 : FastDivideBy255(Scalari32x4_t m)
695 : {
696 : return From32<Scalari32x4_t>(FastDivideBy255<int32_t>(m.i32[0]),
697 : FastDivideBy255<int32_t>(m.i32[1]),
698 : FastDivideBy255<int32_t>(m.i32[2]),
699 0 : FastDivideBy255<int32_t>(m.i32[3]));
700 : }
701 :
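// Bitwise select: for every bit position the result takes the bit from b
// where mask is set and from a where it is clear. Lanes of the mask are
// typically all-ones or all-zero.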
702 : inline Scalaru8x16_t
703 : Pick(Scalaru8x16_t mask, Scalaru8x16_t a, Scalaru8x16_t b)
704 : {
705 : return From8<Scalaru8x16_t>((a.u8[0] & (~mask.u8[0])) | (b.u8[0] & mask.u8[0]),
706 : (a.u8[1] & (~mask.u8[1])) | (b.u8[1] & mask.u8[1]),
707 : (a.u8[2] & (~mask.u8[2])) | (b.u8[2] & mask.u8[2]),
708 : (a.u8[3] & (~mask.u8[3])) | (b.u8[3] & mask.u8[3]),
709 : (a.u8[4] & (~mask.u8[4])) | (b.u8[4] & mask.u8[4]),
710 : (a.u8[5] & (~mask.u8[5])) | (b.u8[5] & mask.u8[5]),
711 : (a.u8[6] & (~mask.u8[6])) | (b.u8[6] & mask.u8[6]),
712 : (a.u8[7] & (~mask.u8[7])) | (b.u8[7] & mask.u8[7]),
713 : (a.u8[8+0] & (~mask.u8[8+0])) | (b.u8[8+0] & mask.u8[8+0]),
714 : (a.u8[8+1] & (~mask.u8[8+1])) | (b.u8[8+1] & mask.u8[8+1]),
715 : (a.u8[8+2] & (~mask.u8[8+2])) | (b.u8[8+2] & mask.u8[8+2]),
716 : (a.u8[8+3] & (~mask.u8[8+3])) | (b.u8[8+3] & mask.u8[8+3]),
717 : (a.u8[8+4] & (~mask.u8[8+4])) | (b.u8[8+4] & mask.u8[8+4]),
718 : (a.u8[8+5] & (~mask.u8[8+5])) | (b.u8[8+5] & mask.u8[8+5]),
719 : (a.u8[8+6] & (~mask.u8[8+6])) | (b.u8[8+6] & mask.u8[8+6]),
720 : (a.u8[8+7] & (~mask.u8[8+7])) | (b.u8[8+7] & mask.u8[8+7]));
721 : }
722 :
723 : inline Scalari32x4_t
724 0 : Pick(Scalari32x4_t mask, Scalari32x4_t a, Scalari32x4_t b)
725 : {
726 0 : return From32<Scalari32x4_t>((a.i32[0] & (~mask.i32[0])) | (b.i32[0] & mask.i32[0]),
727 0 : (a.i32[1] & (~mask.i32[1])) | (b.i32[1] & mask.i32[1]),
728 0 : (a.i32[2] & (~mask.i32[2])) | (b.i32[2] & mask.i32[2]),
729 0 : (a.i32[3] & (~mask.i32[3])) | (b.i32[3] & mask.i32[3]));
730 : }
731 :
732 0 : inline Scalarf32x4_t MixF32(Scalarf32x4_t a, Scalarf32x4_t b, float t)
733 : {
734 0 : return FromF32<Scalarf32x4_t>(a.f32[0] + (b.f32[0] - a.f32[0]) * t,
735 0 : a.f32[1] + (b.f32[1] - a.f32[1]) * t,
736 0 : a.f32[2] + (b.f32[2] - a.f32[2]) * t,
737 0 : a.f32[3] + (b.f32[3] - a.f32[3]) * t);
738 : }
739 :
740 0 : inline Scalarf32x4_t WSumF32(Scalarf32x4_t a, Scalarf32x4_t b, float wa, float wb)
741 : {
742 0 : return FromF32<Scalarf32x4_t>(a.f32[0] * wa + b.f32[0] * wb,
743 0 : a.f32[1] * wa + b.f32[1] * wb,
744 0 : a.f32[2] * wa + b.f32[2] * wb,
745 0 : a.f32[3] * wa + b.f32[3] * wb);
746 : }
747 :
748 0 : inline Scalarf32x4_t AbsF32(Scalarf32x4_t a)
749 : {
750 : return FromF32<Scalarf32x4_t>(fabs(a.f32[0]),
751 : fabs(a.f32[1]),
752 : fabs(a.f32[2]),
753 0 : fabs(a.f32[3]));
754 : }
755 :
756 0 : inline Scalarf32x4_t AddF32(Scalarf32x4_t a, Scalarf32x4_t b)
757 : {
758 0 : return FromF32<Scalarf32x4_t>(a.f32[0] + b.f32[0],
759 0 : a.f32[1] + b.f32[1],
760 0 : a.f32[2] + b.f32[2],
761 0 : a.f32[3] + b.f32[3]);
762 : }
763 :
764 0 : inline Scalarf32x4_t MulF32(Scalarf32x4_t a, Scalarf32x4_t b)
765 : {
766 0 : return FromF32<Scalarf32x4_t>(a.f32[0] * b.f32[0],
767 0 : a.f32[1] * b.f32[1],
768 0 : a.f32[2] * b.f32[2],
769 0 : a.f32[3] * b.f32[3]);
770 : }
771 :
772 0 : inline Scalarf32x4_t DivF32(Scalarf32x4_t a, Scalarf32x4_t b)
773 : {
774 0 : return FromF32<Scalarf32x4_t>(a.f32[0] / b.f32[0],
775 0 : a.f32[1] / b.f32[1],
776 0 : a.f32[2] / b.f32[2],
777 0 : a.f32[3] / b.f32[3]);
778 : }
779 :
780 : template<uint8_t aIndex>
781 0 : inline Scalarf32x4_t SplatF32(Scalarf32x4_t m)
782 : {
783 0 : AssertIndex<aIndex>();
784 0 : return FromF32<Scalarf32x4_t>(m.f32[aIndex],
785 : m.f32[aIndex],
786 : m.f32[aIndex],
787 0 : m.f32[aIndex]);
788 : }
789 :
790 0 : inline Scalari32x4_t F32ToI32(Scalarf32x4_t m)
791 : {
792 0 : return From32<Scalari32x4_t>(int32_t(floor(m.f32[0] + 0.5f)),
793 0 : int32_t(floor(m.f32[1] + 0.5f)),
794 0 : int32_t(floor(m.f32[2] + 0.5f)),
795 0 : int32_t(floor(m.f32[3] + 0.5f)));
796 : }
797 :
798 : #ifdef SIMD_COMPILE_SSE2
799 :
800 : // SSE2
801 :
802 : template<>
803 : inline __m128i
804 0 : Load8<__m128i>(const uint8_t* aSource)
805 : {
806 0 : return _mm_load_si128((const __m128i*)aSource);
807 : }
808 :
809 0 : inline void Store8(uint8_t* aTarget, __m128i aM)
810 : {
811 : _mm_store_si128((__m128i*)aTarget, aM);
812 0 : }
813 :
814 : template<>
815 0 : inline __m128i FromZero8<__m128i>()
816 : {
817 0 : return _mm_setzero_si128();
818 : }
819 :
820 : template<>
821 0 : inline __m128i From8<__m128i>(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
822 : uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p)
823 : {
824 0 : return _mm_setr_epi16((b << 8) + a, (d << 8) + c, (f << 8) + e, (h << 8) + g,
825 0 : (j << 8) + i, (l << 8) + k, (n << 8) + m, (p << 8) + o);
826 : }
827 :
828 : template<>
829 0 : inline __m128i FromI16<__m128i>(int16_t a, int16_t b, int16_t c, int16_t d, int16_t e, int16_t f, int16_t g, int16_t h)
830 : {
831 0 : return _mm_setr_epi16(a, b, c, d, e, f, g, h);
832 : }
833 :
834 : template<>
835 0 : inline __m128i FromU16<__m128i>(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h)
836 : {
837 0 : return _mm_setr_epi16(a, b, c, d, e, f, g, h);
838 : }
839 :
840 : template<>
841 0 : inline __m128i FromI16<__m128i>(int16_t a)
842 : {
843 0 : return _mm_set1_epi16(a);
844 : }
845 :
846 : template<>
847 0 : inline __m128i FromU16<__m128i>(uint16_t a)
848 : {
849 0 : return _mm_set1_epi16((int16_t)a);
850 : }
851 :
852 : template<>
853 0 : inline __m128i From32<__m128i>(int32_t a, int32_t b, int32_t c, int32_t d)
854 : {
855 0 : return _mm_setr_epi32(a, b, c, d);
856 : }
857 :
858 : template<>
859 : inline __m128i From32<__m128i>(int32_t a)
860 : {
861 : return _mm_set1_epi32(a);
862 : }
863 :
864 : template<>
865 0 : inline __m128 FromF32<__m128>(float a, float b, float c, float d)
866 : {
867 0 : return _mm_setr_ps(a, b, c, d);
868 : }
869 :
870 : template<>
871 0 : inline __m128 FromF32<__m128>(float a)
872 : {
873 0 : return _mm_set1_ps(a);
874 : }
875 :
876 : template<int32_t aNumberOfBits>
877 0 : inline __m128i ShiftRight16(__m128i aM)
878 : {
879 0 : return _mm_srli_epi16(aM, aNumberOfBits);
880 : }
881 :
882 : template<int32_t aNumberOfBits>
883 0 : inline __m128i ShiftRight32(__m128i aM)
884 : {
885 0 : return _mm_srai_epi32(aM, aNumberOfBits);
886 : }
887 :
888 0 : inline __m128i Add16(__m128i aM1, __m128i aM2)
889 : {
890 0 : return _mm_add_epi16(aM1, aM2);
891 : }
892 :
893 0 : inline __m128i Add32(__m128i aM1, __m128i aM2)
894 : {
895 0 : return _mm_add_epi32(aM1, aM2);
896 : }
897 :
898 0 : inline __m128i Sub16(__m128i aM1, __m128i aM2)
899 : {
900 0 : return _mm_sub_epi16(aM1, aM2);
901 : }
902 :
903 : inline __m128i Sub32(__m128i aM1, __m128i aM2)
904 : {
905 : return _mm_sub_epi32(aM1, aM2);
906 : }
907 :
908 0 : inline __m128i Min8(__m128i aM1, __m128i aM2)
909 : {
910 0 : return _mm_min_epu8(aM1, aM2);
911 : }
912 :
913 0 : inline __m128i Max8(__m128i aM1, __m128i aM2)
914 : {
915 0 : return _mm_max_epu8(aM1, aM2);
916 : }
917 :
918 0 : inline __m128i Min32(__m128i aM1, __m128i aM2)
919 : {
920 0 : __m128i m1_minus_m2 = _mm_sub_epi32(aM1, aM2);
921 0 : __m128i m1_greater_than_m2 = _mm_cmpgt_epi32(aM1, aM2);
922 0 : return _mm_sub_epi32(aM1, _mm_and_si128(m1_minus_m2, m1_greater_than_m2));
923 : }
924 :
925 0 : inline __m128i Max32(__m128i aM1, __m128i aM2)
926 : {
927 0 : __m128i m1_minus_m2 = _mm_sub_epi32(aM1, aM2);
928 0 : __m128i m2_greater_than_m1 = _mm_cmpgt_epi32(aM2, aM1);
929 0 : return _mm_sub_epi32(aM1, _mm_and_si128(m1_minus_m2, m2_greater_than_m1));
930 : }
931 :
932 0 : inline __m128i Mul16(__m128i aM1, __m128i aM2)
933 : {
934 0 : return _mm_mullo_epi16(aM1, aM2);
935 : }
936 :
937 : inline __m128i MulU16(__m128i aM1, __m128i aM2)
938 : {
939 : return _mm_mullo_epi16(aM1, aM2);
940 : }
941 :
942 0 : inline void Mul16x4x2x2To32x4x2(__m128i aFactorsA1B1,
943 : __m128i aFactorsA2B2,
944 : __m128i& aProductA,
945 : __m128i& aProductB)
946 : {
947 0 : __m128i prodAB_lo = _mm_mullo_epi16(aFactorsA1B1, aFactorsA2B2);
948 0 : __m128i prodAB_hi = _mm_mulhi_epi16(aFactorsA1B1, aFactorsA2B2);
949 0 : aProductA = _mm_unpacklo_epi16(prodAB_lo, prodAB_hi);
950 0 : aProductB = _mm_unpackhi_epi16(prodAB_lo, prodAB_hi);
951 0 : }
952 :
953 0 : inline __m128i MulAdd16x8x2To32x4(__m128i aFactorsA,
954 : __m128i aFactorsB)
955 : {
956 0 : return _mm_madd_epi16(aFactorsA, aFactorsB);
957 : }
958 :
959 : template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
960 0 : inline __m128i Shuffle32(__m128i aM)
961 : {
962 0 : AssertIndex<i0>();
963 0 : AssertIndex<i1>();
964 0 : AssertIndex<i2>();
965 0 : AssertIndex<i3>();
966 0 : return _mm_shuffle_epi32(aM, _MM_SHUFFLE(i0, i1, i2, i3));
967 : }
968 :
969 : template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
970 0 : inline __m128i ShuffleLo16(__m128i aM)
971 : {
972 0 : AssertIndex<i0>();
973 0 : AssertIndex<i1>();
974 0 : AssertIndex<i2>();
975 0 : AssertIndex<i3>();
976 0 : return _mm_shufflelo_epi16(aM, _MM_SHUFFLE(i0, i1, i2, i3));
977 : }
978 :
979 : template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
980 0 : inline __m128i ShuffleHi16(__m128i aM)
981 : {
982 0 : AssertIndex<i0>();
983 0 : AssertIndex<i1>();
984 0 : AssertIndex<i2>();
985 0 : AssertIndex<i3>();
986 0 : return _mm_shufflehi_epi16(aM, _MM_SHUFFLE(i0, i1, i2, i3));
987 : }
988 :
989 : template<int8_t aIndex>
990 : inline __m128i Splat32(__m128i aM)
991 : {
992 : return Shuffle32<aIndex,aIndex,aIndex,aIndex>(aM);
993 : }
994 :
995 : template<int8_t aIndex>
996 0 : inline __m128i Splat32On8(__m128i aM)
997 : {
998 0 : return Shuffle32<aIndex,aIndex,aIndex,aIndex>(aM);
999 : }
1000 :
1001 : template<int8_t aIndexLo, int8_t aIndexHi>
1002 0 : inline __m128i Splat16(__m128i aM)
1003 : {
1004 0 : AssertIndex<aIndexLo>();
1005 0 : AssertIndex<aIndexHi>();
1006 0 : return ShuffleHi16<aIndexHi,aIndexHi,aIndexHi,aIndexHi>(
1007 0 : ShuffleLo16<aIndexLo,aIndexLo,aIndexLo,aIndexLo>(aM));
1008 : }
1009 :
1010 : inline __m128i
1011 0 : UnpackLo8x8ToI16x8(__m128i m)
1012 : {
1013 0 : __m128i zero = _mm_set1_epi8(0);
1014 0 : return _mm_unpacklo_epi8(m, zero);
1015 : }
1016 :
1017 : inline __m128i
1018 0 : UnpackHi8x8ToI16x8(__m128i m)
1019 : {
1020 0 : __m128i zero = _mm_set1_epi8(0);
1021 0 : return _mm_unpackhi_epi8(m, zero);
1022 : }
1023 :
1024 : inline __m128i
1025 0 : UnpackLo8x8ToU16x8(__m128i m)
1026 : {
1027 0 : __m128i zero = _mm_set1_epi8(0);
1028 0 : return _mm_unpacklo_epi8(m, zero);
1029 : }
1030 :
1031 : inline __m128i
1032 0 : UnpackHi8x8ToU16x8(__m128i m)
1033 : {
1034 0 : __m128i zero = _mm_set1_epi8(0);
1035 0 : return _mm_unpackhi_epi8(m, zero);
1036 : }
1037 :
1038 : inline __m128i
1039 0 : InterleaveLo8(__m128i m1, __m128i m2)
1040 : {
1041 0 : return _mm_unpacklo_epi8(m1, m2);
1042 : }
1043 :
1044 : inline __m128i
1045 0 : InterleaveHi8(__m128i m1, __m128i m2)
1046 : {
1047 0 : return _mm_unpackhi_epi8(m1, m2);
1048 : }
1049 :
1050 : inline __m128i
1051 0 : InterleaveLo16(__m128i m1, __m128i m2)
1052 : {
1053 0 : return _mm_unpacklo_epi16(m1, m2);
1054 : }
1055 :
1056 : inline __m128i
1057 0 : InterleaveHi16(__m128i m1, __m128i m2)
1058 : {
1059 0 : return _mm_unpackhi_epi16(m1, m2);
1060 : }
1061 :
1062 : inline __m128i
1063 : InterleaveLo32(__m128i m1, __m128i m2)
1064 : {
1065 : return _mm_unpacklo_epi32(m1, m2);
1066 : }
1067 :
1068 : template<uint8_t aNumBytes>
1069 : inline __m128i
1070 0 : Rotate8(__m128i a1234, __m128i a5678)
1071 : {
1072 0 : return _mm_or_si128(_mm_srli_si128(a1234, aNumBytes), _mm_slli_si128(a5678, 16 - aNumBytes));
1073 : }
1074 :
1075 : inline __m128i
1076 0 : PackAndSaturate32To16(__m128i m1, __m128i m2)
1077 : {
1078 0 : return _mm_packs_epi32(m1, m2);
1079 : }
1080 :
1081 : inline __m128i
1082 0 : PackAndSaturate32ToU16(__m128i m1, __m128i m2)
1083 : {
1084 0 : return _mm_packs_epi32(m1, m2);
1085 : }
1086 :
1087 : inline __m128i
1088 0 : PackAndSaturate32To8(__m128i m1, __m128i m2, __m128i m3, const __m128i& m4)
1089 : {
1090 : // Pack into 8 16bit signed integers (saturating).
1091 0 : __m128i m12 = _mm_packs_epi32(m1, m2);
1092 0 : __m128i m34 = _mm_packs_epi32(m3, m4);
1093 :
1094 : // Pack into 16 8bit unsigned integers (saturating).
1095 0 : return _mm_packus_epi16(m12, m34);
1096 : }
1097 :
1098 : inline __m128i
1099 0 : PackAndSaturate16To8(__m128i m1, __m128i m2)
1100 : {
1101 : // Pack into 16 8bit unsigned integers (saturating).
1102 0 : return _mm_packus_epi16(m1, m2);
1103 : }
1104 :
1105 : inline __m128i
1106 0 : FastDivideBy255(__m128i m)
1107 : {
1108 : // v = m << 8
1109 0 : __m128i v = _mm_slli_epi32(m, 8);
1110 : // v = v + (m + (255,255,255,255))
1111 0 : v = _mm_add_epi32(v, _mm_add_epi32(m, _mm_set1_epi32(255)));
1112 : // v = v >> 16
1113 0 : return _mm_srai_epi32(v, 16);
1114 : }
1115 :
1116 : inline __m128i
1117 0 : FastDivideBy255_16(__m128i m)
1118 : {
1119 0 : __m128i zero = _mm_set1_epi16(0);
1120 0 : __m128i lo = _mm_unpacklo_epi16(m, zero);
1121 0 : __m128i hi = _mm_unpackhi_epi16(m, zero);
1122 0 : return _mm_packs_epi32(FastDivideBy255(lo), FastDivideBy255(hi));
1123 : }
1124 :
1125 : inline __m128i
1126 0 : Pick(__m128i mask, __m128i a, __m128i b)
1127 : {
1128 0 : return _mm_or_si128(_mm_andnot_si128(mask, a), _mm_and_si128(mask, b));
1129 : }
1130 :
1131 0 : inline __m128 MixF32(__m128 a, __m128 b, float t)
1132 : {
1133 0 : return _mm_add_ps(a, _mm_mul_ps(_mm_sub_ps(b, a), _mm_set1_ps(t)));
1134 : }
1135 :
1136 0 : inline __m128 WSumF32(__m128 a, __m128 b, float wa, float wb)
1137 : {
1138 0 : return _mm_add_ps(_mm_mul_ps(a, _mm_set1_ps(wa)), _mm_mul_ps(b, _mm_set1_ps(wb)));
1139 : }
1140 :
1141 0 : inline __m128 AbsF32(__m128 a)
1142 : {
1143 0 : return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), a), a);
1144 : }
1145 :
1146 0 : inline __m128 AddF32(__m128 a, __m128 b)
1147 : {
1148 0 : return _mm_add_ps(a, b);
1149 : }
1150 :
1151 0 : inline __m128 MulF32(__m128 a, __m128 b)
1152 : {
1153 0 : return _mm_mul_ps(a, b);
1154 : }
1155 :
1156 0 : inline __m128 DivF32(__m128 a, __m128 b)
1157 : {
1158 0 : return _mm_div_ps(a, b);
1159 : }
1160 :
1161 : template<uint8_t aIndex>
1162 0 : inline __m128 SplatF32(__m128 m)
1163 : {
1164 0 : AssertIndex<aIndex>();
1165 0 : return _mm_shuffle_ps(m, m, _MM_SHUFFLE(aIndex, aIndex, aIndex, aIndex));
1166 : }
1167 :
1168 0 : inline __m128i F32ToI32(__m128 m)
1169 : {
1170 0 : return _mm_cvtps_epi32(m);
1171 : }
1172 :
1173 : #endif // SIMD_COMPILE_SSE2
1174 :
1175 : } // namespace simd
1176 :
1177 : } // namespace gfx
1178 : } // namespace mozilla
1179 :
1180 : #endif // _MOZILLA_GFX_SIMD_H_
|