/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <stdio.h>
#include <tmmintrin.h>

#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
#include "aom_dsp/blend.h"
#include "aom/aom_integer.h"
#include "aom_dsp/x86/synonyms.h"

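// Each kernel below computes a "masked SAD": the two predictors are first
// blended with a 6-bit mask,
//   pred = AOM_BLEND_A64(m, a, b) = (m * a + (64 - m) * b + 32) >> 6,
// and a SAD is then taken between the blended prediction and the source.
// A scalar sketch of the computation (illustrative only; the C reference
// implementation lives elsewhere in aom_dsp):
//
//   unsigned int masked_sad_sketch(const uint8_t *src, int src_stride,
//                                  const uint8_t *a, int a_stride,
//                                  const uint8_t *b, int b_stride,
//                                  const uint8_t *m, int m_stride,
//                                  int width, int height) {
//     unsigned int sad = 0;
//     for (int y = 0; y < height; y++) {
//       for (int x = 0; x < width; x++) {
//         const uint8_t pred = AOM_BLEND_A64(m[x], a[x], b[x]);
//         sad += abs(pred - src[x]);
//       }
//       src += src_stride, a += a_stride, b += b_stride, m += m_stride;
//     }
//     return (sad + 31) >> 6;  // same final rounding as the kernels below
//   }
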
// For width a multiple of 16
static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *a_ptr, int a_stride,
                                            const uint8_t *b_ptr, int b_stride,
                                            const uint8_t *m_ptr, int m_stride,
                                            int width, int height);

static INLINE unsigned int masked_sad8xh_ssse3(
    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height);

static INLINE unsigned int masked_sad4xh_ssse3(
    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height);

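// The wrappers below dispatch on block size and on 'invert_mask'. With
// invert_mask == 0 the mask weights 'ref'; otherwise the two predictors are
// swapped so that the mask weights 'second_pred' instead. 'second_pred' is a
// contiguous buffer whose stride equals the block width, hence the constant
// stride (m, 8 or 4) passed through for it.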
#define MASKSADMXN_SSSE3(m, n)                                                \
  unsigned int aom_masked_sad##m##x##n##_ssse3(                               \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
      int invert_mask) {                                                      \
    if (!invert_mask)                                                         \
      return masked_sad_ssse3(src, src_stride, ref, ref_stride, second_pred,  \
                              m, msk, msk_stride, m, n);                      \
    else                                                                      \
      return masked_sad_ssse3(src, src_stride, second_pred, m, ref,           \
                              ref_stride, msk, msk_stride, m, n);             \
  }

#define MASKSAD8XN_SSSE3(n)                                                   \
  unsigned int aom_masked_sad8x##n##_ssse3(                                   \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
      int invert_mask) {                                                      \
    if (!invert_mask)                                                         \
      return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride,            \
                                 second_pred, 8, msk, msk_stride, n);         \
    else                                                                      \
      return masked_sad8xh_ssse3(src, src_stride, second_pred, 8, ref,        \
                                 ref_stride, msk, msk_stride, n);             \
  }

#define MASKSAD4XN_SSSE3(n)                                                   \
  unsigned int aom_masked_sad4x##n##_ssse3(                                   \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
      int invert_mask) {                                                      \
    if (!invert_mask)                                                         \
      return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,            \
                                 second_pred, 4, msk, msk_stride, n);         \
    else                                                                      \
      return masked_sad4xh_ssse3(src, src_stride, second_pred, 4, ref,        \
                                 ref_stride, msk, msk_stride, n);             \
  }
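
// Example call (hypothetical variables, for illustration only):
//   unsigned int sad = aom_masked_sad16x16_ssse3(
//       src, src_stride, ref, ref_stride, second_pred, msk, msk_stride, 0);
// computes the masked SAD of a 16x16 block with the mask weighting 'ref'.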

#if CONFIG_EXT_PARTITION
MASKSADMXN_SSSE3(128, 128)
MASKSADMXN_SSSE3(128, 64)
MASKSADMXN_SSSE3(64, 128)
#endif  // CONFIG_EXT_PARTITION
MASKSADMXN_SSSE3(64, 64)
MASKSADMXN_SSSE3(64, 32)
MASKSADMXN_SSSE3(32, 64)
MASKSADMXN_SSSE3(32, 32)
MASKSADMXN_SSSE3(32, 16)
MASKSADMXN_SSSE3(16, 32)
MASKSADMXN_SSSE3(16, 16)
MASKSADMXN_SSSE3(16, 8)
MASKSAD8XN_SSSE3(16)
MASKSAD8XN_SSSE3(8)
MASKSAD8XN_SSSE3(4)
MASKSAD4XN_SSSE3(8)
MASKSAD4XN_SSSE3(4)

static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *a_ptr, int a_stride,
                                            const uint8_t *b_ptr, int b_stride,
                                            const uint8_t *m_ptr, int m_stride,
                                            int width, int height) {
  int x, y;
  __m128i res = _mm_setzero_si128();
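  // mask_max is 64 (AOM_BLEND_A64_MAX_ALPHA): every mask value m is paired
  // with its complement 64 - m, so each pair of blend weights sums to 64.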
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 16) {
      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
      const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
      const __m128i m_inv = _mm_sub_epi8(mask_max, m);

      // Calculate 16 predicted pixels.
      // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
      // is 64 * 255, so we have plenty of space to add rounding constants.
      const __m128i data_l = _mm_unpacklo_epi8(a, b);
      const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
      __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
      pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);

      const __m128i data_r = _mm_unpackhi_epi8(a, b);
      const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
      __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
      pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);

      const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
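      // _mm_sad_epu8 leaves two 64-bit lanes, each holding the sum of eight
      // absolute differences; accumulate them as 32-bit lanes 0 and 2.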
      res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
    }

    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'.
  int32_t sad =
      _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8));
  return (sad + 31) >> 6;
}

static INLINE unsigned int masked_sad8xh_ssse3(
    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));

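  // Process two rows per iteration: one 16-byte register then holds the
  // source pixels and mask values for both rows.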
  for (y = 0; y < height; y += 2) {
    const __m128i src = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
    const __m128i a0 = _mm_loadl_epi64((const __m128i *)a_ptr);
    const __m128i a1 = _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]);
    const __m128i b0 = _mm_loadl_epi64((const __m128i *)b_ptr);
    const __m128i b1 = _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]);
    const __m128i m =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr),
                           _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride]));
    const __m128i m_inv = _mm_sub_epi8(mask_max, m);

    const __m128i data_l = _mm_unpacklo_epi8(a0, b0);
    const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
    __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
    pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);

    const __m128i data_r = _mm_unpacklo_epi8(a1, b1);
    const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
    __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
    pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);

    const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
    res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  int32_t sad =
      _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8));
  return (sad + 31) >> 6;
}

static INLINE unsigned int masked_sad4xh_ssse3(
    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));

  for (y = 0; y < height; y += 2) {
    // Load two rows at a time; this seems to be a bit faster
    // than four rows at a time in this case.
    const __m128i src = _mm_unpacklo_epi32(
        _mm_cvtsi32_si128(*(uint32_t *)src_ptr),
        _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride]));
    const __m128i a =
        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)a_ptr),
                           _mm_cvtsi32_si128(*(uint32_t *)&a_ptr[a_stride]));
    const __m128i b =
        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr),
                           _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride]));
    const __m128i m =
        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr),
                           _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride]));
    const __m128i m_inv = _mm_sub_epi8(mask_max, m);

    const __m128i data = _mm_unpacklo_epi8(a, b);
    const __m128i mask = _mm_unpacklo_epi8(m, m_inv);
    __m128i pred_16bit = _mm_maddubs_epi16(data, mask);
    pred_16bit = xx_roundn_epu16(pred_16bit, AOM_BLEND_A64_ROUND_BITS);

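    // Both 'pred' and 'src' are zero in their upper eight bytes, so the
    // upper lane of _mm_sad_epu8 contributes nothing to the total.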
    const __m128i pred = _mm_packus_epi16(pred_16bit, _mm_setzero_si128());
    res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  // At this point, the SAD is stored in lane 0 of 'res'
  int32_t sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}

#if CONFIG_HIGHBITDEPTH
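// The high-bitdepth kernels below mirror the 8-bit ones, but operate on
// uint16_t samples (recovered from the uint8_t pointers with
// CONVERT_TO_SHORTPTR) and blend with _mm_madd_epi16 rather than
// _mm_maddubs_epi16, producing 32-bit intermediate sums.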
// For width a multiple of 8
static INLINE unsigned int highbd_masked_sad_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int width, int height);

static INLINE unsigned int highbd_masked_sad4xh_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height);

#define HIGHBD_MASKSADMXN_SSSE3(m, n)                                         \
  unsigned int aom_highbd_masked_sad##m##x##n##_ssse3(                        \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
      int ref_stride, const uint8_t *second_pred8, const uint8_t *msk,        \
      int msk_stride, int invert_mask) {                                      \
    if (!invert_mask)                                                         \
      return highbd_masked_sad_ssse3(src8, src_stride, ref8, ref_stride,      \
                                     second_pred8, m, msk, msk_stride, m, n); \
    else                                                                      \
      return highbd_masked_sad_ssse3(src8, src_stride, second_pred8, m, ref8, \
                                     ref_stride, msk, msk_stride, m, n);      \
  }

#define HIGHBD_MASKSAD4XN_SSSE3(n)                                            \
  unsigned int aom_highbd_masked_sad4x##n##_ssse3(                            \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
      int ref_stride, const uint8_t *second_pred8, const uint8_t *msk,        \
      int msk_stride, int invert_mask) {                                      \
    if (!invert_mask)                                                         \
      return highbd_masked_sad4xh_ssse3(src8, src_stride, ref8, ref_stride,   \
                                        second_pred8, 4, msk, msk_stride, n); \
    else                                                                      \
      return highbd_masked_sad4xh_ssse3(src8, src_stride, second_pred8, 4,    \
                                        ref8, ref_stride, msk, msk_stride, n);\
  }

#if CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN_SSSE3(128, 128)
HIGHBD_MASKSADMXN_SSSE3(128, 64)
HIGHBD_MASKSADMXN_SSSE3(64, 128)
#endif  // CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN_SSSE3(64, 64)
HIGHBD_MASKSADMXN_SSSE3(64, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 64)
HIGHBD_MASKSADMXN_SSSE3(32, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 16)
HIGHBD_MASKSADMXN_SSSE3(16, 32)
HIGHBD_MASKSADMXN_SSSE3(16, 16)
HIGHBD_MASKSADMXN_SSSE3(16, 8)
HIGHBD_MASKSADMXN_SSSE3(8, 16)
HIGHBD_MASKSADMXN_SSSE3(8, 8)
HIGHBD_MASKSADMXN_SSSE3(8, 4)
HIGHBD_MASKSAD4XN_SSSE3(8)
HIGHBD_MASKSAD4XN_SSSE3(4)

static INLINE unsigned int highbd_masked_sad_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int width, int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int x, y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
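  // Rounding offset for the 32-bit blend sums: add 2^(ROUND_BITS - 1)
  // before the arithmetic shift right by AOM_BLEND_A64_ROUND_BITS.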
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
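  // Multiplying by 'one' with _mm_madd_epi16 horizontally adds adjacent
  // 16-bit values into 32-bit lanes; this is how the 16-bit absolute
  // differences are accumulated below.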
  const __m128i one = _mm_set1_epi16(1);

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 8) {
      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
      // Zero-extend mask to 16 bits
      const __m128i m = _mm_unpacklo_epi8(
          _mm_loadl_epi64((const __m128i *)&m_ptr[x]), _mm_setzero_si128());
      const __m128i m_inv = _mm_sub_epi16(mask_max, m);

      const __m128i data_l = _mm_unpacklo_epi16(a, b);
      const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
      __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
      pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                              AOM_BLEND_A64_ROUND_BITS);

      const __m128i data_r = _mm_unpackhi_epi16(a, b);
      const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
      __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
      pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                              AOM_BLEND_A64_ROUND_BITS);

      // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
      // so it is safe to do signed saturation here.
      const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
      // There is no 16-bit SAD instruction, so we have to synthesize
      // an 8-element SAD. We do this by storing 4 32-bit partial SADs,
      // and accumulating them at the end.
      const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
      res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));
    }

    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // At this point, we have four 32-bit partial SADs stored in 'res'.
  res = _mm_hadd_epi32(res, res);
  res = _mm_hadd_epi32(res, res);
  int sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}

static INLINE unsigned int highbd_masked_sad4xh_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m128i one = _mm_set1_epi16(1);

  for (y = 0; y < height; y += 2) {
    const __m128i src = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
    const __m128i a =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr),
                           _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]));
    const __m128i b =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr),
                           _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]));
    // Zero-extend mask to 16 bits
    const __m128i m = _mm_unpacklo_epi8(
        _mm_unpacklo_epi32(
            _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
            _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
        _mm_setzero_si128());
    const __m128i m_inv = _mm_sub_epi16(mask_max, m);

    const __m128i data_l = _mm_unpacklo_epi16(a, b);
    const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
    __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
    pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i data_r = _mm_unpackhi_epi16(a, b);
    const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
    __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
    pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
    const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
    res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
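  // As in highbd_masked_sad_ssse3, fold the four 32-bit partial SADs in
  // 'res' down to a single total.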
  res = _mm_hadd_epi32(res, res);
  res = _mm_hadd_epi32(res, res);
  int sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}

#endif  // CONFIG_HIGHBITDEPTH