/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef _V128_INTRINSICS_H
#define _V128_INTRINSICS_H

#include "./v64_intrinsics_x86.h"

typedef __m128i v128;

SIMD_INLINE uint32_t v128_low_u32(v128 a) {
  return (uint32_t)_mm_cvtsi128_si32(a);
}

SIMD_INLINE v64 v128_low_v64(v128 a) {
  return _mm_unpacklo_epi64(a, v64_zero());
}

SIMD_INLINE v64 v128_high_v64(v128 a) { return _mm_srli_si128(a, 8); }

SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) {
  return _mm_unpacklo_epi64(b, a);
}

SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
  return v128_from_v64(v64_from_64(a), v64_from_64(b));
}

SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
  return _mm_set_epi32(a, b, c, d);
}

SIMD_INLINE v128 v128_load_aligned(const void *p) {
  return _mm_load_si128((__m128i *)p);
}

SIMD_INLINE v128 v128_load_unaligned(const void *p) {
#if defined(__SSSE3__)
  return (__m128i)_mm_lddqu_si128((__m128i *)p);
#else
  return _mm_loadu_si128((__m128i *)p);
#endif
}

SIMD_INLINE void v128_store_aligned(void *p, v128 a) {
  _mm_store_si128((__m128i *)p, a);
}

SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
  _mm_storeu_si128((__m128i *)p, a);
}

// The following function requires its third argument to be an immediate.
// Some compilers will check this during optimisation, others won't.
// A usage sketch follows the definitions below.
#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
#if defined(__SSSE3__)
SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) {
  return c ? _mm_alignr_epi8(a, b, c) : b;
}
#else
#define v128_align(a, b, c) \
  ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b))
#endif
#else
#if defined(__SSSE3__)
#define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, c) : (b))
#else
#define v128_align(a, b, c) \
  ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b))
#endif
#endif
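
/* Usage sketch (illustrative only, not part of the API): v128_align(a, b, c)
   returns bytes c..15 of b followed by bytes 0..c-1 of a, and the byte count
   c must be a compile-time constant. For example, to read 16 bytes starting
   3 bytes into a 32-byte aligned buffer (buf is a hypothetical
   const uint8_t pointer):

     v128 lo = v128_load_aligned(buf);       // bytes 0..15
     v128 hi = v128_load_aligned(buf + 16);  // bytes 16..31
     v128 x = v128_align(hi, lo, 3);         // bytes 3..18

   Passing a run-time variable as c may fail to compile or generate slower
   code, depending on the compiler and the branch selected above. */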

SIMD_INLINE v128 v128_zero() { return _mm_setzero_si128(); }

SIMD_INLINE v128 v128_dup_8(uint8_t x) { return _mm_set1_epi8(x); }

SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16(x); }

SIMD_INLINE v128 v128_dup_32(uint32_t x) { return _mm_set1_epi32(x); }

SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return _mm_add_epi8(a, b); }

SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return _mm_add_epi16(a, b); }

SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return _mm_adds_epi16(a, b); }

SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return _mm_add_epi32(a, b); }

SIMD_INLINE v128 v128_padd_s16(v128 a) {
  return _mm_madd_epi16(a, _mm_set1_epi16(1));
}

SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return _mm_sub_epi8(a, b); }

SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return _mm_subs_epu8(a, b); }

SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return _mm_subs_epi8(a, b); }

SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return _mm_sub_epi16(a, b); }

SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return _mm_subs_epi16(a, b); }

SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return _mm_subs_epu16(a, b); }

SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return _mm_sub_epi32(a, b); }

SIMD_INLINE v128 v128_abs_s16(v128 a) {
#if defined(__SSSE3__)
  return _mm_abs_epi16(a);
#else
  return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a));
#endif
}

SIMD_INLINE v128 v128_abs_s8(v128 a) {
#if defined(__SSSE3__)
  return _mm_abs_epi8(a);
#else
  v128 sign = _mm_cmplt_epi8(a, _mm_setzero_si128());
  return _mm_xor_si128(sign, _mm_add_epi8(a, sign));
#endif
}

SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) {
  return _mm_unpacklo_epi8(b, a);
}

SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) {
  return _mm_unpackhi_epi8(b, a);
}

SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) {
  return _mm_unpacklo_epi16(b, a);
}

SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) {
  return _mm_unpackhi_epi16(b, a);
}

SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) {
  return _mm_unpacklo_epi32(b, a);
}

SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) {
  return _mm_unpackhi_epi32(b, a);
}

SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) {
  return _mm_unpacklo_epi64(b, a);
}

SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) {
  return _mm_unpackhi_epi64(b, a);
}

SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); }

SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); }

SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); }

SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) {
  return _mm_packs_epi16(_mm_srai_epi16(b, 8), _mm_srai_epi16(a, 8));
}

SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) {
#if defined(__SSSE3__)
#ifdef __x86_64__
  v128 order = _mm_cvtsi64_si128(0x0e0c0a0806040200LL);
#else
  v128 order = _mm_set_epi32(0, 0, 0x0e0c0a08, 0x06040200);
#endif
  return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
                            _mm_shuffle_epi8(a, order));
#else
  return v128_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1));
#endif
}

SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) {
  return _mm_packs_epi32(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16));
}

SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) {
#if defined(__SSSE3__)
#ifdef __x86_64__
  v128 order = _mm_cvtsi64_si128(0x0d0c090805040100LL);
#else
  v128 order = _mm_set_epi32(0, 0, 0x0d0c0908, 0x05040100);
#endif
  return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
                            _mm_shuffle_epi8(a, order));
#else
  return v128_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2));
#endif
}

SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) {
  return _mm_castps_si128(_mm_shuffle_ps(
      _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(3, 1, 3, 1)));
}

SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) {
  return _mm_castps_si128(_mm_shuffle_ps(
      _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(2, 0, 2, 0)));
}

SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) {
  return _mm_unpacklo_epi8(a, _mm_setzero_si128());
}

SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
  return _mm_unpacklo_epi8(a, _mm_setzero_si128());
}

SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
  return _mm_unpackhi_epi8(a, _mm_setzero_si128());
}

SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) {
  return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
}

SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) {
  return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
}

SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) {
  return _mm_srai_epi16(_mm_unpackhi_epi8(a, a), 8);
}

SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
  return _mm_packs_epi32(b, a);
}

SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
  return _mm_packus_epi16(b, a);
}

SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
  return _mm_packs_epi16(b, a);
}

SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) {
  return _mm_unpacklo_epi16(a, _mm_setzero_si128());
}

SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) {
  return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
}

SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) {
  return _mm_unpacklo_epi16(a, _mm_setzero_si128());
}

SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) {
  return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
}

SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) {
  return _mm_unpackhi_epi16(a, _mm_setzero_si128());
}

SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
  return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16);
}

SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
#if defined(__SSSE3__)
  return _mm_shuffle_epi8(x, pattern);
#else
  v128 output;
  unsigned char *input = (unsigned char *)&x;
  unsigned char *index = (unsigned char *)&pattern;
  char *selected = (char *)&output;
  int counter;

  for (counter = 0; counter < 16; counter++) {
    selected[counter] = input[index[counter] & 15];
  }

  return output;
#endif
}

SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
  v128 r = _mm_madd_epi16(a, b);
#if defined(__SSE4_1__) && defined(__x86_64__)
  v128 c = _mm_add_epi64(_mm_cvtepi32_epi64(r),
                         _mm_cvtepi32_epi64(_mm_srli_si128(r, 8)));
  return _mm_cvtsi128_si64(_mm_add_epi64(c, _mm_srli_si128(c, 8)));
#else
  return (int64_t)_mm_cvtsi128_si32(r) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) +
         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12));
#endif
}

SIMD_INLINE uint64_t v128_hadd_u8(v128 a) {
  v128 t = _mm_sad_epu8(a, _mm_setzero_si128());
  return v64_low_u32(v128_low_v64(t)) + v64_low_u32(v128_high_v64(t));
}

typedef v128 sad128_internal;

SIMD_INLINE sad128_internal v128_sad_u8_init() { return _mm_setzero_si128(); }

/* Implementation-dependent return value. The result must be finalised with
   v128_sad_u8_sum() (a usage sketch follows that function below).
   The result for more than 32 v128_sad_u8() calls is undefined. */
SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
  return _mm_add_epi64(s, _mm_sad_epu8(a, b));
}

SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
  return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s)));
}
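
/* Usage sketch (illustrative only; the function and buffer names are
   hypothetical): accumulate the SAD over a 16x16 block of 8-bit pixels,
   then finalise the running total:

     SIMD_INLINE uint32_t block_sad_16x16(const uint8_t *src, int src_stride,
                                          const uint8_t *ref, int ref_stride) {
       sad128_internal s = v128_sad_u8_init();
       int i;
       for (i = 0; i < 16; i++)
         s = v128_sad_u8(s, v128_load_unaligned(src + i * src_stride),
                         v128_load_unaligned(ref + i * ref_stride));
       return v128_sad_u8_sum(s);  // 16 accumulations, within the 32-call limit
     }
*/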

typedef v128 ssd128_internal;

SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return _mm_setzero_si128(); }

/* Implementation-dependent return value. The result must be finalised with
   v128_ssd_u8_sum() (a usage sketch follows that function below). */
SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
  v128 l = _mm_sub_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
                         _mm_unpacklo_epi8(b, _mm_setzero_si128()));
  v128 h = _mm_sub_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()),
                         _mm_unpackhi_epi8(b, _mm_setzero_si128()));
  v128 rl = _mm_madd_epi16(l, l);
  v128 rh = _mm_madd_epi16(h, h);
  v128 c = _mm_cvtsi32_si128(32);
  rl = _mm_add_epi32(rl, _mm_srli_si128(rl, 8));
  rl = _mm_add_epi32(rl, _mm_srli_si128(rl, 4));
  rh = _mm_add_epi32(rh, _mm_srli_si128(rh, 8));
  rh = _mm_add_epi32(rh, _mm_srli_si128(rh, 4));
  return _mm_add_epi64(
      s, _mm_srl_epi64(_mm_sll_epi64(_mm_unpacklo_epi64(rl, rh), c), c));
}

SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) {
  return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s)));
}
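
/* Usage sketch (illustrative only; src and ref are hypothetical pointers to
   16 bytes of pixel data each): the SSD accumulator follows the same
   init / accumulate / finalise pattern as the SAD accumulator above:

     ssd128_internal s = v128_ssd_u8_init();
     s = v128_ssd_u8(s, v128_load_unaligned(src), v128_load_unaligned(ref));
     uint32_t ssd = v128_ssd_u8_sum(s);
*/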

SIMD_INLINE v128 v128_or(v128 a, v128 b) { return _mm_or_si128(a, b); }

SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return _mm_xor_si128(a, b); }

SIMD_INLINE v128 v128_and(v128 a, v128 b) { return _mm_and_si128(a, b); }

SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return _mm_andnot_si128(b, a); }

SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) {
  v64 lo_bits = v64_mullo_s16(a, b);
  v64 hi_bits = v64_mulhi_s16(a, b);
  return v128_from_v64(v64_ziphi_16(hi_bits, lo_bits),
                       v64_ziplo_16(hi_bits, lo_bits));
}

SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
  return _mm_mullo_epi16(a, b);
}

SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
  return _mm_mulhi_epi16(a, b);
}

SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
#if defined(__SSE4_1__)
  return _mm_mullo_epi32(a, b);
#else
  return _mm_unpacklo_epi32(
      _mm_shuffle_epi32(_mm_mul_epu32(a, b), 8),
      _mm_shuffle_epi32(
          _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)), 8));
#endif
}

SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return _mm_madd_epi16(a, b); }

SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
#if defined(__SSSE3__)
  return _mm_maddubs_epi16(a, b);
#else
  return _mm_packs_epi32(
      _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
                     _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8)),
      _mm_madd_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()),
                     _mm_srai_epi16(_mm_unpackhi_epi8(b, b), 8)));
#endif
}

SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return _mm_avg_epu8(a, b); }

SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) {
  return _mm_sub_epi8(_mm_avg_epu8(a, b),
                      _mm_and_si128(_mm_xor_si128(a, b), v128_dup_8(1)));
}

SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return _mm_avg_epu16(a, b); }

SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return _mm_min_epu8(a, b); }

SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return _mm_max_epu8(a, b); }

SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) {
#if defined(__SSE4_1__)
  return _mm_min_epi8(a, b);
#else
  v128 mask = _mm_cmplt_epi8(a, b);
  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
#endif
}

SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) {
#if defined(__SSE4_1__)
  return _mm_max_epi8(a, b);
#else
  v128 mask = _mm_cmplt_epi8(b, a);
  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
#endif
}

SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return _mm_min_epi16(a, b); }

SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return _mm_max_epi16(a, b); }

SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return _mm_cmpgt_epi8(a, b); }

SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return _mm_cmplt_epi8(a, b); }

SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return _mm_cmpeq_epi8(a, b); }

SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) {
  return _mm_cmpgt_epi16(a, b);
}

SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) {
  return _mm_cmplt_epi16(a, b);
}

SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return _mm_cmpeq_epi16(a, b); }

SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
  return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
                       _mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
}

SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
  return _mm_and_si128(_mm_set1_epi8(0xff >> c),
                       _mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
}

SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
  __m128i x = _mm_cvtsi32_si128(c + 8);
  return _mm_packs_epi16(_mm_sra_epi16(_mm_unpacklo_epi8(a, a), x),
                         _mm_sra_epi16(_mm_unpackhi_epi8(a, a), x));
}

SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
  return _mm_sll_epi16(a, _mm_cvtsi32_si128(c));
}

SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
  return _mm_srl_epi16(a, _mm_cvtsi32_si128(c));
}

SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
  return _mm_sra_epi16(a, _mm_cvtsi32_si128(c));
}

SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
  return _mm_sll_epi32(a, _mm_cvtsi32_si128(c));
}

SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
  return _mm_srl_epi32(a, _mm_cvtsi32_si128(c));
}

SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
  return _mm_sra_epi32(a, _mm_cvtsi32_si128(c));
}

/* These intrinsics require immediate values, so we must use #defines to
   enforce that. A usage sketch follows the macro list below. */
#define v128_shl_n_byte(a, c) _mm_slli_si128(a, c)
#define v128_shr_n_byte(a, c) _mm_srli_si128(a, c)
#define v128_shl_n_8(a, c) \
  _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
#define v128_shr_n_u8(a, c) \
  _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c))
#define v128_shr_n_s8(a, c) \
  _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), \
                  _mm_srai_epi16(_mm_unpackhi_epi8(a, a), (c) + 8))
#define v128_shl_n_16(a, c) _mm_slli_epi16(a, c)
#define v128_shr_n_u16(a, c) _mm_srli_epi16(a, c)
#define v128_shr_n_s16(a, c) _mm_srai_epi16(a, c)
#define v128_shl_n_32(a, c) _mm_slli_epi32(a, c)
#define v128_shr_n_u32(a, c) _mm_srli_epi32(a, c)
#define v128_shr_n_s32(a, c) _mm_srai_epi32(a, c)
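
/* Usage sketch (illustrative only): the shift count passed to these macros
   must be a literal or other compile-time constant, e.g.

     v128 x = v128_dup_16(1000);
     v128 y = v128_shr_n_s16(x, 2);   // arithmetic shift of each 16-bit lane
     v128 z = v128_shl_n_byte(x, 4);  // shift the whole vector left by 4 bytes

   A run-time shift count requires the function forms above, such as
   v128_shr_s16(x, n). */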

#endif /* _V128_INTRINSICS_H */