/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef _V64_INTRINSICS_H
#define _V64_INTRINSICS_H

#include <emmintrin.h>
#if defined(__SSSE3__)
#include <tmmintrin.h>
#endif
#if defined(__SSE4_1__)
#include <smmintrin.h>
#endif

typedef __m128i v64;

SIMD_INLINE uint32_t v64_low_u32(v64 a) {
  return (uint32_t)_mm_cvtsi128_si32(a);
}

SIMD_INLINE uint32_t v64_high_u32(v64 a) {
  return (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4));
}

SIMD_INLINE int32_t v64_low_s32(v64 a) { return (int32_t)_mm_cvtsi128_si32(a); }

SIMD_INLINE int32_t v64_high_s32(v64 a) {
  return (int32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4));
}

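/* The (int16_t) casts sign-extend each 16-bit input into a 32-bit lane, so
   the signed saturating pack below reproduces the bit patterns exactly and
   leaves a in the most significant 16 bits of the result. */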
SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
  return _mm_packs_epi32(
      _mm_set_epi32((int16_t)a, (int16_t)b, (int16_t)c, (int16_t)d),
      _mm_setzero_si128());
}

SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
  return _mm_set_epi32(0, 0, x, y);
}

SIMD_INLINE v64 v64_from_64(uint64_t x) {
#ifdef __x86_64__
  return _mm_cvtsi64_si128(x);
#else
  return _mm_set_epi32(0, 0, x >> 32, (uint32_t)x);
#endif
}

SIMD_INLINE uint64_t v64_u64(v64 x) {
  return (uint64_t)v64_low_u32(x) | ((uint64_t)v64_high_u32(x) << 32);
}

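/* A v64 occupies the low 64 bits of an SSE register, so loads and stores
   touch only 8 (or 4) bytes.  The unaligned variants are identical to the
   aligned ones because x86 tolerates unaligned scalar and movq accesses. */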
SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
  return *((uint32_t *)p);
}

SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
  return *((uint32_t *)p);
}

SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
  *((uint32_t *)p) = a;
}

SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
  *((uint32_t *)p) = a;
}

SIMD_INLINE v64 v64_load_aligned(const void *p) {
  return _mm_loadl_epi64((__m128i *)p);
}

SIMD_INLINE v64 v64_load_unaligned(const void *p) {
  return _mm_loadl_epi64((__m128i *)p);
}

SIMD_INLINE void v64_store_aligned(void *p, v64 a) {
  _mm_storel_epi64((__m128i *)p, a);
}

SIMD_INLINE void v64_store_unaligned(void *p, v64 a) {
  _mm_storel_epi64((__m128i *)p, a);
}

// The following function requires an immediate.
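// v64_align(a, b, c) treats a:b as a 16-byte value (b in the low half) and
// returns the 8 bytes starting c bytes into b.  The non-optimised fallback
// computes the same result with scalar shifts, since _mm_srli_si128 needs a
// compile-time immediate.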
#if defined(__OPTIMIZE__) && __OPTIMIZE__
#define v64_align(a, b, c) \
  ((c) ? _mm_srli_si128(_mm_unpacklo_epi64(b, a), (c)) : b)
#else
#define v64_align(a, b, c) \
  ((c) ? v64_from_64((v64_u64(b) >> (c)*8) | (v64_u64(a) << (8 - (c)) * 8)) \
       : (b))
#endif

SIMD_INLINE v64 v64_zero() { return _mm_setzero_si128(); }

SIMD_INLINE v64 v64_dup_8(uint8_t x) { return _mm_set1_epi8(x); }

SIMD_INLINE v64 v64_dup_16(uint16_t x) { return _mm_set1_epi16(x); }

SIMD_INLINE v64 v64_dup_32(uint32_t x) { return _mm_set1_epi32(x); }

SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return _mm_add_epi8(a, b); }

SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return _mm_add_epi16(a, b); }

SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return _mm_adds_epi16(a, b); }

SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return _mm_add_epi32(a, b); }

SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return _mm_sub_epi8(a, b); }

SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return _mm_subs_epu8(a, b); }

SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return _mm_subs_epi8(a, b); }

SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return _mm_sub_epi16(a, b); }

SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return _mm_subs_epi16(a, b); }

SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return _mm_subs_epu16(a, b); }

SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return _mm_sub_epi32(a, b); }

SIMD_INLINE v64 v64_abs_s16(v64 a) {
#if defined(__SSSE3__)
  return _mm_abs_epi16(a);
#else
  return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a));
#endif
}

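/* SSE2 has no 8-bit abs.  The fallback computes -a for negative lanes as
   (a + sign) ^ sign, where sign is the all-ones mask from _mm_cmplt_epi8;
   as with _mm_abs_epi8, INT8_MIN maps to itself. */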
SIMD_INLINE v64 v64_abs_s8(v64 a) {
#if defined(__SSSE3__)
  return _mm_abs_epi8(a);
#else
  v64 sign = _mm_cmplt_epi8(a, _mm_setzero_si128());
  return _mm_xor_si128(sign, _mm_add_epi8(a, sign));
#endif
}

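/* Interleave ("zip") the elements of two v64s.  Since the data lives in the
   low half of an __m128i, the "hi" variants interleave the low halves and
   then shift the upper 64 bits of the 128-bit result down by 8 bytes. */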
SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); }

SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) {
  return _mm_srli_si128(_mm_unpacklo_epi8(b, a), 8);
}

SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); }

SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) {
  return _mm_srli_si128(_mm_unpacklo_epi16(b, a), 8);
}

SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); }

SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) {
  return _mm_srli_si128(_mm_unpacklo_epi32(b, a), 8);
}

SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) {
  __m128i t = _mm_unpacklo_epi64(b, a);
  return _mm_packs_epi32(t, t);
}

SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) {
  __m128i t = _mm_unpacklo_epi64(b, a);
  return _mm_packus_epi16(t, t);
}

SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) {
  __m128i t = _mm_unpacklo_epi64(b, a);
  return _mm_packs_epi16(t, t);
}

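/* De-interleave ("unzip"): the "lo" variants keep the even-indexed elements
   of the pair b:a, the "hi" variants the odd-indexed ones.  With SSSE3 this
   is a single byte shuffle of the concatenation; the SSE2 fallback shifts
   the wanted half of each wider lane into place and re-packs. */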
SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) {
#if defined(__SSSE3__)
  return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
                          v64_from_64(0x0f0d0b0907050301LL));
#else
  return _mm_packus_epi16(
      _mm_unpacklo_epi64(_mm_srli_epi16(b, 8), _mm_srli_epi16(a, 8)),
      _mm_setzero_si128());
#endif
}

SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) {
#if defined(__SSSE3__)
  return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
                          v64_from_64(0x0e0c0a0806040200LL));
#else
  return v64_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1));
#endif
}

SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) {
#if defined(__SSSE3__)
  return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
                          v64_from_64(0x0f0e0b0a07060302LL));
#else
  return _mm_packs_epi32(
      _mm_unpacklo_epi64(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)),
      _mm_setzero_si128());
#endif
}

SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) {
#if defined(__SSSE3__)
  return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
                          v64_from_64(0x0d0c090805040100LL));
#else
  return v64_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2));
#endif
}

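/* Widening conversions.  Unsigned values are zero-extended by interleaving
   with zero; signed values are sign-extended by placing each element in the
   upper part of a wider lane and arithmetically shifting it back down. */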
SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) {
  return _mm_unpacklo_epi8(a, _mm_setzero_si128());
}

SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) {
  return _mm_srli_si128(_mm_unpacklo_epi8(a, _mm_setzero_si128()), 8);
}

SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) {
  return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
}

SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) {
  return _mm_srli_si128(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8), 8);
}

SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) {
  return _mm_unpacklo_epi16(a, _mm_setzero_si128());
}

SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) {
  return _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16);
}

SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) {
  return _mm_srli_si128(_mm_unpacklo_epi16(a, _mm_setzero_si128()), 8);
}

SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) {
  return _mm_srli_si128(
      _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16), 8);
}

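/* Per-byte table lookup: output byte i is x's byte pattern[i] (indices must
   be in the range 0..7).  The SSE2 fallback performs the lookup with a
   scalar loop.  For example (illustrative only), a pattern of
   v64_from_64(0x0001020304050607LL) reverses the byte order of x. */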
SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) {
#if defined(__SSSE3__)
  return _mm_shuffle_epi8(x, pattern);
#else
  v64 output;
  unsigned char *input = (unsigned char *)&x;
  unsigned char *index = (unsigned char *)&pattern;
  char *selected = (char *)&output;
  int counter;

  for (counter = 0; counter < 8; counter++) {
    selected[counter] = input[index[counter]];
  }

  return output;
#endif
}

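/* Dot product of signed 8-bit (a) and unsigned 8-bit (b) elements.  Both
   inputs are zero-extended to 16 bits; shifting a's bytes into the upper
   half of each lane restores their sign at 256x scale, so the reduced sum is
   arithmetically shifted right by 8 to remove the scaling. */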
SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) {
  __m128i r, r1, r2, z;
  z = _mm_setzero_si128();
  r1 = _mm_madd_epi16(_mm_slli_epi16(_mm_unpacklo_epi8(a, z), 8),
                      _mm_unpacklo_epi8(b, z));
  r2 = _mm_srli_si128(r1, 8);
  r = _mm_add_epi32(r1, r2);
  r = _mm_add_epi32(r, _mm_srli_si128(r, 4));
  return ((int32_t)v64_low_u32(r)) >> 8;
}

SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) {
  __m128i r = _mm_madd_epi16(a, b);
#if defined(__SSE4_1__) && defined(__x86_64__)
  __m128i x = _mm_cvtepi32_epi64(r);
  return _mm_cvtsi128_si64(_mm_add_epi64(x, _mm_srli_si128(x, 8)));
#else
  return (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
         (int64_t)_mm_cvtsi128_si32(r);
#endif
}

SIMD_INLINE uint64_t v64_hadd_u8(v64 a) {
  return v64_low_u32(_mm_sad_epu8(a, _mm_setzero_si128()));
}

SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
  return v64_dotp_s16(a, v64_dup_16(1));
}

typedef v64 sad64_internal;

SIMD_INLINE sad64_internal v64_sad_u8_init() { return _mm_setzero_si128(); }

/* Implementation dependent return value. Result must be finalised with
   v64_sad_u8_sum().
   The result for more than 32 v64_sad_u8() calls is undefined. */
SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
  return _mm_add_epi64(s, _mm_sad_epu8(a, b));
}

SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { return v64_low_u32(s); }
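
/* Typical (illustrative) use, assuming 8-byte rows and hypothetical src/ref
   pointers and stride:
     sad64_internal s = v64_sad_u8_init();
     for (int r = 0; r < rows; r++)
       s = v64_sad_u8(s, v64_load_aligned(src + r * stride),
                      v64_load_aligned(ref + r * stride));
     uint32_t sad = v64_sad_u8_sum(s);
   keeping within the 32-call limit noted above. */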

typedef v64 ssd64_internal;

SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return _mm_setzero_si128(); }

/* Implementation dependent return value. Result must be finalised with
 * v64_ssd_u8_sum(). */
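/* The byte differences are computed at 16-bit precision, squared and summed
   in pairs with _mm_madd_epi16, and each call's total is accumulated in a
   64-bit lane. */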
SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
  v64 l = v64_sub_16(v64_ziplo_8(v64_zero(), a), v64_ziplo_8(v64_zero(), b));
  v64 h = v64_sub_16(v64_ziphi_8(v64_zero(), a), v64_ziphi_8(v64_zero(), b));
  v64 r = v64_add_32(_mm_madd_epi16(l, l), _mm_madd_epi16(h, h));
  return _mm_add_epi64(
      s, v64_ziplo_32(v64_zero(), _mm_add_epi32(r, _mm_srli_si128(r, 4))));
}

SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) { return v64_low_u32(s); }

SIMD_INLINE v64 v64_or(v64 a, v64 b) { return _mm_or_si128(a, b); }

SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return _mm_xor_si128(a, b); }

SIMD_INLINE v64 v64_and(v64 a, v64 b) { return _mm_and_si128(a, b); }

SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return _mm_andnot_si128(b, a); }

SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return _mm_mullo_epi16(a, b); }

SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return _mm_mulhi_epi16(a, b); }

SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) {
#if defined(__SSE4_1__)
  return _mm_mullo_epi32(a, b);
#else
  return _mm_unpacklo_epi32(
      _mm_mul_epu32(a, b),
      _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)));
#endif
}

SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return _mm_madd_epi16(a, b); }

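/* Unsigned(a) x signed(b) multiply-add, as _mm_maddubs_epi16.  The SSE2
   fallback zero-extends a and sign-extends b, uses a plain 16-bit madd, and
   re-packs with signed saturation to match the saturating SSSE3 behaviour. */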
SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) {
#if defined(__SSSE3__)
  return _mm_maddubs_epi16(a, b);
#else
  __m128i t = _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
                             _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8));
  return _mm_packs_epi32(t, t);
#endif
}

SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return _mm_avg_epu8(a, b); }

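/* Rounding-down average: _mm_avg_epu8 rounds up, so subtract 1 wherever the
   discarded low bits of a and b differ (i.e. wherever rounding occurred). */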
SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) {
  return _mm_sub_epi8(_mm_avg_epu8(a, b),
                      _mm_and_si128(_mm_xor_si128(a, b), v64_dup_8(1)));
}

SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return _mm_avg_epu16(a, b); }

SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return _mm_min_epu8(a, b); }

SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return _mm_max_epu8(a, b); }

SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) {
#if defined(__SSE4_1__)
  return _mm_min_epi8(a, b);
#else
  v64 mask = _mm_cmplt_epi8(a, b);
  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
#endif
}

SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) {
#if defined(__SSE4_1__)
  return _mm_max_epi8(a, b);
#else
  v64 mask = _mm_cmplt_epi8(b, a);
  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
#endif
}

SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return _mm_min_epi16(a, b); }

SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return _mm_max_epi16(a, b); }

SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return _mm_cmpgt_epi8(a, b); }

SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return _mm_cmplt_epi8(a, b); }

SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return _mm_cmpeq_epi8(a, b); }

SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return _mm_cmpgt_epi16(a, b); }

SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return _mm_cmplt_epi16(a, b); }

SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return _mm_cmpeq_epi16(a, b); }

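/* SSE2 has no 8-bit shifts, so shift the 16-bit lanes and mask away the bits
   that crossed a byte boundary. */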
SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
  return _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << c)),
                       _mm_sll_epi16(a, _mm_cvtsi32_si128(c)));
}

SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
  return _mm_and_si128(_mm_set1_epi8(0xff >> c),
                       _mm_srl_epi16(a, _mm_cvtsi32_si128(c)));
}

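/* Arithmetic 8-bit shift: duplicate each byte into a 16-bit lane, shift by
   c + 8 to sign-extend and shift in one step, then pack back down with
   signed saturation (exact, since the results fit in 8 bits). */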
SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
  return _mm_packs_epi16(
      _mm_sra_epi16(_mm_unpacklo_epi8(a, a), _mm_cvtsi32_si128(c + 8)), a);
}

SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
  return _mm_sll_epi16(a, _mm_cvtsi32_si128(c));
}

SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) {
  return _mm_srl_epi16(a, _mm_cvtsi32_si128(c));
}

SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) {
  return _mm_sra_epi16(a, _mm_cvtsi32_si128(c));
}

SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) {
  return _mm_sll_epi32(a, _mm_cvtsi32_si128(c));
}

SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) {
  return _mm_srl_epi32(a, _mm_cvtsi32_si128(c));
}

SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
  return _mm_sra_epi32(a, _mm_cvtsi32_si128(c));
}

/* These intrinsics require immediate values, so we must use #defines
   to enforce that. */
#define v64_shl_n_byte(a, c) _mm_slli_si128(a, c)
#define v64_shr_n_byte(a, c) _mm_srli_si128(_mm_unpacklo_epi64(a, a), c + 8)
#define v64_shl_n_8(a, c) \
  _mm_and_si128(_mm_set1_epi8((uint8_t)(0xff << (c))), _mm_slli_epi16(a, c))
#define v64_shr_n_u8(a, c) \
  _mm_and_si128(_mm_set1_epi8(0xff >> (c)), _mm_srli_epi16(a, c))
#define v64_shr_n_s8(a, c) \
  _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), a)
#define v64_shl_n_16(a, c) _mm_slli_epi16(a, c)
#define v64_shr_n_u16(a, c) _mm_srli_epi16(a, c)
#define v64_shr_n_s16(a, c) _mm_srai_epi16(a, c)
#define v64_shl_n_32(a, c) _mm_slli_epi32(a, c)
#define v64_shr_n_u32(a, c) _mm_srli_epi32(a, c)
#define v64_shr_n_s32(a, c) _mm_srai_epi32(a, c)

#endif /* _V64_INTRINSICS_H */