/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <stdlib.h>
#include <string.h>
#include <tmmintrin.h>

#include "./aom_config.h"
#include "./aom_dsp_rtcd.h"
#include "aom_dsp/blend.h"
#include "aom/aom_integer.h"
#include "aom_ports/mem.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/synonyms.h"

// For width a multiple of 16
static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset,
                            int yoffset, uint8_t *dst, int w, int h);

static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset,
                               int yoffset, uint8_t *dst, int h);

static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset,
                               int yoffset, uint8_t *dst, int h);

// For width a multiple of 16
static void masked_variance(const uint8_t *src_ptr, int src_stride,
                            const uint8_t *a_ptr, int a_stride,
                            const uint8_t *b_ptr, int b_stride,
                            const uint8_t *m_ptr, int m_stride, int width,
                            int height, unsigned int *sse, int *sum_);

static void masked_variance8xh(const uint8_t *src_ptr, int src_stride,
                               const uint8_t *a_ptr, const uint8_t *b_ptr,
                               const uint8_t *m_ptr, int m_stride, int height,
                               unsigned int *sse, int *sum_);

static void masked_variance4xh(const uint8_t *src_ptr, int src_stride,
                               const uint8_t *a_ptr, const uint8_t *b_ptr,
                               const uint8_t *m_ptr, int m_stride, int height,
                               unsigned int *sse, int *sum_);

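// Each MASK_SUBPIX_VAR* macro below expands to one exported function. The
// flow is the same in every case: bilinear-filter the source block into a
// temporary buffer of H + 1 rows, blend the filtered block with the second
// predictor under the 0..64 mask (optionally inverted) in the matching
// masked_variance* helper, and return the usual variance expression,
// sse - sum^2 / (W * H), with sum * sum computed in 64 bits to avoid
// overflow.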
#define MASK_SUBPIX_VAR_SSSE3(W, H)                                           \
  unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3(                \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    int sum;                                                                  \
    uint8_t temp[(H + 1) * W];                                                \
                                                                              \
    bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H);           \
                                                                              \
    if (!invert_mask)                                                         \
      masked_variance(ref, ref_stride, temp, W, second_pred, W, msk,          \
                      msk_stride, W, H, sse, &sum);                           \
    else                                                                      \
      masked_variance(ref, ref_stride, second_pred, W, temp, W, msk,          \
                      msk_stride, W, H, sse, &sum);                           \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));                 \
  }

#define MASK_SUBPIX_VAR8XH_SSSE3(H)                                           \
  unsigned int aom_masked_sub_pixel_variance8x##H##_ssse3(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    int sum;                                                                  \
    uint8_t temp[(H + 1) * 8];                                                \
                                                                              \
    bilinear_filter8xh(src, src_stride, xoffset, yoffset, temp, H);           \
                                                                              \
    if (!invert_mask)                                                         \
      masked_variance8xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \
                         H, sse, &sum);                                       \
    else                                                                      \
      masked_variance8xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \
                         H, sse, &sum);                                       \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (8 * H));                 \
  }

#define MASK_SUBPIX_VAR4XH_SSSE3(H)                                           \
  unsigned int aom_masked_sub_pixel_variance4x##H##_ssse3(                    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    int sum;                                                                  \
    uint8_t temp[(H + 1) * 4];                                                \
                                                                              \
    bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H);           \
                                                                              \
    if (!invert_mask)                                                         \
      masked_variance4xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \
                         H, sse, &sum);                                       \
    else                                                                      \
      masked_variance4xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \
                         H, sse, &sum);                                       \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H));                 \
  }

#if CONFIG_EXT_PARTITION
MASK_SUBPIX_VAR_SSSE3(128, 128)
MASK_SUBPIX_VAR_SSSE3(128, 64)
MASK_SUBPIX_VAR_SSSE3(64, 128)
#endif
MASK_SUBPIX_VAR_SSSE3(64, 64)
MASK_SUBPIX_VAR_SSSE3(64, 32)
MASK_SUBPIX_VAR_SSSE3(32, 64)
MASK_SUBPIX_VAR_SSSE3(32, 32)
MASK_SUBPIX_VAR_SSSE3(32, 16)
MASK_SUBPIX_VAR_SSSE3(16, 32)
MASK_SUBPIX_VAR_SSSE3(16, 16)
MASK_SUBPIX_VAR_SSSE3(16, 8)
MASK_SUBPIX_VAR8XH_SSSE3(16)
MASK_SUBPIX_VAR8XH_SSSE3(8)
MASK_SUBPIX_VAR8XH_SSSE3(4)
MASK_SUBPIX_VAR4XH_SSSE3(8)
MASK_SUBPIX_VAR4XH_SSSE3(4)

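// Apply the 2-tap filter 'filter' to each byte pair (a[i], b[i]) of the
// 16-byte vectors 'a' and 'b': _mm_maddubs_epi16 forms the weighted sums in
// 16 bits, xx_roundn_epu16 rounds them by FILTER_BITS, and the two halves
// are packed back down to 16 bytes.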
static INLINE __m128i filter_block(const __m128i a, const __m128i b,
                                   const __m128i filter) {
  __m128i v0 = _mm_unpacklo_epi8(a, b);
  v0 = _mm_maddubs_epi16(v0, filter);
  v0 = xx_roundn_epu16(v0, FILTER_BITS);

  __m128i v1 = _mm_unpackhi_epi8(a, b);
  v1 = _mm_maddubs_epi16(v1, filter);
  v1 = xx_roundn_epu16(v1, FILTER_BITS);

  return _mm_packus_epi16(v0, v1);
}

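// Bilinear filter for blocks whose width is a multiple of 16. The horizontal
// pass writes h + 1 rows into 'dst' so the vertical pass can read one row
// past the block. Offsets 0 and 4 (zero and half-pel) are special-cased as a
// plain copy and _mm_avg_epu8 respectively; other offsets use filter_block.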
static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset,
                            int yoffset, uint8_t *dst, int w, int h) {
  int i, j;
  // Horizontal filter
  if (xoffset == 0) {
    uint8_t *b = dst;
    for (i = 0; i < h + 1; ++i) {
      for (j = 0; j < w; j += 16) {
        __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
        _mm_storeu_si128((__m128i *)&b[j], x);
      }
      src += src_stride;
      b += w;
    }
  } else if (xoffset == 4) {
    uint8_t *b = dst;
    for (i = 0; i < h + 1; ++i) {
      for (j = 0; j < w; j += 16) {
        __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
        __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]);
        __m128i z = _mm_alignr_epi8(y, x, 1);
        _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu8(x, z));
      }
      src += src_stride;
      b += w;
    }
  } else {
    uint8_t *b = dst;
    const uint8_t *hfilter = bilinear_filters_2t[xoffset];
    const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8));
    for (i = 0; i < h + 1; ++i) {
      for (j = 0; j < w; j += 16) {
        const __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
        const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]);
        const __m128i z = _mm_alignr_epi8(y, x, 1);
        const __m128i res = filter_block(x, z, hfilter_vec);
        _mm_storeu_si128((__m128i *)&b[j], res);
      }

      src += src_stride;
      b += w;
    }
  }

  // Vertical filter
  if (yoffset == 0) {
    // The data is already in 'dst', so no need to filter
  } else if (yoffset == 4) {
    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; j += 16) {
        __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
        __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
        _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu8(x, y));
      }
      dst += w;
    }
  } else {
    const uint8_t *vfilter = bilinear_filters_2t[yoffset];
    const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8));
    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; j += 16) {
        const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
        const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
        const __m128i res = filter_block(x, y, vfilter_vec);
        _mm_storeu_si128((__m128i *)&dst[j], res);
      }

      dst += w;
    }
  }
}

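// Variant of filter_block that filters the low eight bytes of two
// independent row pairs and packs both results into one 16-byte register.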
static INLINE __m128i filter_block_2rows(const __m128i *a0, const __m128i *b0,
                                         const __m128i *a1, const __m128i *b1,
                                         const __m128i *filter) {
  __m128i v0 = _mm_unpacklo_epi8(*a0, *b0);
  v0 = _mm_maddubs_epi16(v0, *filter);
  v0 = xx_roundn_epu16(v0, FILTER_BITS);

  __m128i v1 = _mm_unpacklo_epi8(*a1, *b1);
  v1 = _mm_maddubs_epi16(v1, *filter);
  v1 = xx_roundn_epu16(v1, FILTER_BITS);

  return _mm_packus_epi16(v0, v1);
}

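// 8-wide version of bilinear_filter. The generic-offset horizontal pass
// filters two rows per iteration via filter_block_2rows, so the extra
// (h'th) row needed by the vertical pass is filtered on its own after the
// loop.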
static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset,
                               int yoffset, uint8_t *dst, int h) {
  int i;
  // Horizontal filter
  if (xoffset == 0) {
    uint8_t *b = dst;
    for (i = 0; i < h + 1; ++i) {
      __m128i x = _mm_loadl_epi64((__m128i *)src);
      _mm_storel_epi64((__m128i *)b, x);
      src += src_stride;
      b += 8;
    }
  } else if (xoffset == 4) {
    uint8_t *b = dst;
    for (i = 0; i < h + 1; ++i) {
      __m128i x = _mm_loadu_si128((__m128i *)src);
      __m128i z = _mm_srli_si128(x, 1);
      _mm_storel_epi64((__m128i *)b, _mm_avg_epu8(x, z));
      src += src_stride;
      b += 8;
    }
  } else {
    uint8_t *b = dst;
    const uint8_t *hfilter = bilinear_filters_2t[xoffset];
    const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8));
    for (i = 0; i < h; i += 2) {
      const __m128i x0 = _mm_loadu_si128((__m128i *)src);
      const __m128i z0 = _mm_srli_si128(x0, 1);
      const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]);
      const __m128i z1 = _mm_srli_si128(x1, 1);
      const __m128i res = filter_block_2rows(&x0, &z0, &x1, &z1, &hfilter_vec);
      _mm_storeu_si128((__m128i *)b, res);

      src += src_stride * 2;
      b += 16;
    }
    // Handle i = h separately
    const __m128i x0 = _mm_loadu_si128((__m128i *)src);
    const __m128i z0 = _mm_srli_si128(x0, 1);

    __m128i v0 = _mm_unpacklo_epi8(x0, z0);
    v0 = _mm_maddubs_epi16(v0, hfilter_vec);
    v0 = xx_roundn_epu16(v0, FILTER_BITS);

    _mm_storel_epi64((__m128i *)b, _mm_packus_epi16(v0, v0));
  }

  // Vertical filter
  if (yoffset == 0) {
    // The data is already in 'dst', so no need to filter
  } else if (yoffset == 4) {
    for (i = 0; i < h; ++i) {
      __m128i x = _mm_loadl_epi64((__m128i *)dst);
      __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]);
      _mm_storel_epi64((__m128i *)dst, _mm_avg_epu8(x, y));
      dst += 8;
    }
  } else {
    const uint8_t *vfilter = bilinear_filters_2t[yoffset];
    const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8));
    for (i = 0; i < h; i += 2) {
      const __m128i x = _mm_loadl_epi64((__m128i *)dst);
      const __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]);
      const __m128i z = _mm_loadl_epi64((__m128i *)&dst[16]);
      const __m128i res = filter_block_2rows(&x, &y, &y, &z, &vfilter_vec);
      _mm_storeu_si128((__m128i *)dst, res);

      dst += 16;
    }
  }
}

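// 4-wide version: the generic-offset paths gather four 4-pixel rows into a
// single 16-byte register per iteration, again filtering the extra row
// separately after the horizontal loop.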
static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset,
                               int yoffset, uint8_t *dst, int h) {
  int i;
  // Horizontal filter
  if (xoffset == 0) {
    uint8_t *b = dst;
    for (i = 0; i < h + 1; ++i) {
      __m128i x = xx_loadl_32((__m128i *)src);
      xx_storel_32((__m128i *)b, x);
      src += src_stride;
      b += 4;
    }
  } else if (xoffset == 4) {
    uint8_t *b = dst;
    for (i = 0; i < h + 1; ++i) {
      __m128i x = _mm_loadl_epi64((__m128i *)src);
      __m128i z = _mm_srli_si128(x, 1);
      xx_storel_32((__m128i *)b, _mm_avg_epu8(x, z));
      src += src_stride;
      b += 4;
    }
  } else {
    uint8_t *b = dst;
    const uint8_t *hfilter = bilinear_filters_2t[xoffset];
    const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8));
    for (i = 0; i < h; i += 4) {
      const __m128i x0 = _mm_loadl_epi64((__m128i *)src);
      const __m128i z0 = _mm_srli_si128(x0, 1);
      const __m128i x1 = _mm_loadl_epi64((__m128i *)&src[src_stride]);
      const __m128i z1 = _mm_srli_si128(x1, 1);
      const __m128i x2 = _mm_loadl_epi64((__m128i *)&src[src_stride * 2]);
      const __m128i z2 = _mm_srli_si128(x2, 1);
      const __m128i x3 = _mm_loadl_epi64((__m128i *)&src[src_stride * 3]);
      const __m128i z3 = _mm_srli_si128(x3, 1);

      const __m128i a0 = _mm_unpacklo_epi32(x0, x1);
      const __m128i b0 = _mm_unpacklo_epi32(z0, z1);
      const __m128i a1 = _mm_unpacklo_epi32(x2, x3);
      const __m128i b1 = _mm_unpacklo_epi32(z2, z3);
      const __m128i res = filter_block_2rows(&a0, &b0, &a1, &b1, &hfilter_vec);
      _mm_storeu_si128((__m128i *)b, res);

      src += src_stride * 4;
      b += 16;
    }
    // Handle i = h separately
    const __m128i x = _mm_loadl_epi64((__m128i *)src);
    const __m128i z = _mm_srli_si128(x, 1);

    __m128i v0 = _mm_unpacklo_epi8(x, z);
    v0 = _mm_maddubs_epi16(v0, hfilter_vec);
    v0 = xx_roundn_epu16(v0, FILTER_BITS);

    xx_storel_32((__m128i *)b, _mm_packus_epi16(v0, v0));
  }

  // Vertical filter
  if (yoffset == 0) {
    // The data is already in 'dst', so no need to filter
  } else if (yoffset == 4) {
    for (i = 0; i < h; ++i) {
      __m128i x = xx_loadl_32((__m128i *)dst);
      __m128i y = xx_loadl_32((__m128i *)&dst[4]);
      xx_storel_32((__m128i *)dst, _mm_avg_epu8(x, y));
      dst += 4;
    }
  } else {
    const uint8_t *vfilter = bilinear_filters_2t[yoffset];
    const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8));
    for (i = 0; i < h; i += 4) {
      const __m128i a = xx_loadl_32((__m128i *)dst);
      const __m128i b = xx_loadl_32((__m128i *)&dst[4]);
      const __m128i c = xx_loadl_32((__m128i *)&dst[8]);
      const __m128i d = xx_loadl_32((__m128i *)&dst[12]);
      const __m128i e = xx_loadl_32((__m128i *)&dst[16]);

      const __m128i a0 = _mm_unpacklo_epi32(a, b);
      const __m128i b0 = _mm_unpacklo_epi32(b, c);
      const __m128i a1 = _mm_unpacklo_epi32(c, d);
      const __m128i b1 = _mm_unpacklo_epi32(d, e);
      const __m128i res = filter_block_2rows(&a0, &b0, &a1, &b1, &vfilter_vec);
      _mm_storeu_si128((__m128i *)dst, res);

      dst += 16;
    }
  }
}

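// Blend 16 pixels of 'a' and 'b' as (m * a + (64 - m) * b + 32) >> 6, i.e.
// the AOM_BLEND_A64 scheme with mask_max = 1 << AOM_BLEND_A64_ROUND_BITS,
// subtract the source pixels, and accumulate the differences and squared
// differences into the four 32-bit lanes of '*sum' and '*sum_sq'.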
static INLINE void accumulate_block(const __m128i *src, const __m128i *a,
                                    const __m128i *b, const __m128i *m,
                                    __m128i *sum, __m128i *sum_sq) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i m_inv = _mm_sub_epi8(mask_max, *m);

  // Calculate 16 predicted pixels.
  // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
  // is 64 * 255, so we have plenty of space to add rounding constants.
  const __m128i data_l = _mm_unpacklo_epi8(*a, *b);
  const __m128i mask_l = _mm_unpacklo_epi8(*m, m_inv);
  __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
  pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);

  const __m128i data_r = _mm_unpackhi_epi8(*a, *b);
  const __m128i mask_r = _mm_unpackhi_epi8(*m, m_inv);
  __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
  pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);

  const __m128i src_l = _mm_unpacklo_epi8(*src, zero);
  const __m128i src_r = _mm_unpackhi_epi8(*src, zero);
  const __m128i diff_l = _mm_sub_epi16(pred_l, src_l);
  const __m128i diff_r = _mm_sub_epi16(pred_r, src_r);

  // Update partial sums and partial sums of squares
  *sum =
      _mm_add_epi32(*sum, _mm_madd_epi16(_mm_add_epi16(diff_l, diff_r), one));
  *sum_sq =
      _mm_add_epi32(*sum_sq, _mm_add_epi32(_mm_madd_epi16(diff_l, diff_l),
                                           _mm_madd_epi16(diff_r, diff_r)));
}

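// Masked variance for blocks whose width is a multiple of 16. The 32-bit
// lanes of 'sum_sq' cannot overflow: the squared differences total at most
// 255^2 * 128 * 128, which is below 2^31. The two _mm_hadd_epi32 steps at
// the end reduce the partial sums and partial sums of squares to one value
// each.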
static void masked_variance(const uint8_t *src_ptr, int src_stride,
                            const uint8_t *a_ptr, int a_stride,
                            const uint8_t *b_ptr, int b_stride,
                            const uint8_t *m_ptr, int m_stride, int width,
                            int height, unsigned int *sse, int *sum_) {
  int x, y;
  __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 16) {
      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
      const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
      accumulate_block(&src, &a, &b, &m, &sum, &sum_sq);
    }

    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // Reduce down to a single sum and sum of squares
  sum = _mm_hadd_epi32(sum, sum_sq);
  sum = _mm_hadd_epi32(sum, sum);
  *sum_ = _mm_cvtsi128_si32(sum);
  *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
}

static void masked_variance8xh(const uint8_t *src_ptr, int src_stride,
                               const uint8_t *a_ptr, const uint8_t *b_ptr,
                               const uint8_t *m_ptr, int m_stride, int height,
                               unsigned int *sse, int *sum_) {
  int y;
  __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();

  for (y = 0; y < height; y += 2) {
    __m128i src = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
    const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
    const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
    const __m128i m =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr),
                           _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride]));
    accumulate_block(&src, &a, &b, &m, &sum, &sum_sq);

    src_ptr += src_stride * 2;
    a_ptr += 16;
    b_ptr += 16;
    m_ptr += m_stride * 2;
  }
  // Reduce down to a single sum and sum of squares
  sum = _mm_hadd_epi32(sum, sum_sq);
  sum = _mm_hadd_epi32(sum, sum);
  *sum_ = _mm_cvtsi128_si32(sum);
  *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
}

static void masked_variance4xh(const uint8_t *src_ptr, int src_stride,
                               const uint8_t *a_ptr, const uint8_t *b_ptr,
                               const uint8_t *m_ptr, int m_stride, int height,
                               unsigned int *sse, int *sum_) {
  int y;
  __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();

  for (y = 0; y < height; y += 4) {
    // Load four rows at a time
    __m128i src =
        _mm_setr_epi32(*(uint32_t *)src_ptr, *(uint32_t *)&src_ptr[src_stride],
                       *(uint32_t *)&src_ptr[src_stride * 2],
                       *(uint32_t *)&src_ptr[src_stride * 3]);
    const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
    const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
    const __m128i m = _mm_setr_epi32(
        *(uint32_t *)m_ptr, *(uint32_t *)&m_ptr[m_stride],
        *(uint32_t *)&m_ptr[m_stride * 2], *(uint32_t *)&m_ptr[m_stride * 3]);
    accumulate_block(&src, &a, &b, &m, &sum, &sum_sq);

    src_ptr += src_stride * 4;
    a_ptr += 16;
    b_ptr += 16;
    m_ptr += m_stride * 4;
  }
  // Reduce down to a single sum and sum of squares
  sum = _mm_hadd_epi32(sum, sum_sq);
  sum = _mm_hadd_epi32(sum, sum);
  *sum_ = _mm_cvtsi128_si32(sum);
  *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
}

#if CONFIG_HIGHBITDEPTH
// For width a multiple of 8
static void highbd_bilinear_filter(const uint16_t *src, int src_stride,
                                   int xoffset, int yoffset, uint16_t *dst,
                                   int w, int h);

static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride,
                                      int xoffset, int yoffset, uint16_t *dst,
                                      int h);

// For width a multiple of 8
static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride,
                                   const uint16_t *a_ptr, int a_stride,
                                   const uint16_t *b_ptr, int b_stride,
                                   const uint8_t *m_ptr, int m_stride,
                                   int width, int height, uint64_t *sse,
                                   int *sum_);

static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride,
                                      const uint16_t *a_ptr,
                                      const uint16_t *b_ptr,
                                      const uint8_t *m_ptr, int m_stride,
                                      int height, int *sse, int *sum_);

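// As with the lowbd macros above, each HIGHBD_* macro expands to one
// function per bit depth. The 10- and 12-bit versions rescale the
// accumulated values back to an 8-bit basis before forming the variance:
// sse is rounded by 4 (resp. 8) bits and sum by 2 (resp. 4) bits via
// ROUND_POWER_OF_TWO, matching the 2- and 4-bit widening of the pixel
// values.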
#define HIGHBD_MASK_SUBPIX_VAR_SSSE3(W, H)                                    \
  unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_ssse3(       \
      const uint8_t *src8, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,       \
      const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) {   \
    uint64_t sse64;                                                           \
    int sum;                                                                  \
    uint16_t temp[(H + 1) * W];                                               \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                          \
    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                          \
    const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);          \
                                                                              \
    highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H);    \
                                                                              \
    if (!invert_mask)                                                         \
      highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk,   \
                             msk_stride, W, H, &sse64, &sum);                 \
    else                                                                      \
      highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk,   \
                             msk_stride, W, H, &sse64, &sum);                 \
    *sse = (uint32_t)sse64;                                                   \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));                 \
  }                                                                           \
  unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_ssse3(      \
      const uint8_t *src8, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,       \
      const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) {   \
    uint64_t sse64;                                                           \
    int sum;                                                                  \
    uint16_t temp[(H + 1) * W];                                               \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                          \
    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                          \
    const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);          \
                                                                              \
    highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H);    \
                                                                              \
    if (!invert_mask)                                                         \
      highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk,   \
                             msk_stride, W, H, &sse64, &sum);                 \
    else                                                                      \
      highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk,   \
                             msk_stride, W, H, &sse64, &sum);                 \
    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 4);                            \
    sum = ROUND_POWER_OF_TWO(sum, 2);                                         \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));                 \
  }                                                                           \
  unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3(      \
      const uint8_t *src8, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,       \
      const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) {   \
    uint64_t sse64;                                                           \
    int sum;                                                                  \
    uint16_t temp[(H + 1) * W];                                               \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                          \
    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                          \
    const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);          \
                                                                              \
    highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H);    \
                                                                              \
    if (!invert_mask)                                                         \
      highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk,   \
                             msk_stride, W, H, &sse64, &sum);                 \
    else                                                                      \
      highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk,   \
                             msk_stride, W, H, &sse64, &sum);                 \
    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 8);                            \
    sum = ROUND_POWER_OF_TWO(sum, 4);                                         \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H));                 \
  }

#define HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(H)                                    \
  unsigned int aom_highbd_8_masked_sub_pixel_variance4x##H##_ssse3(           \
      const uint8_t *src8, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,       \
      const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) {   \
    int sse_;                                                                 \
    int sum;                                                                  \
    uint16_t temp[(H + 1) * 4];                                               \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                          \
    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                          \
    const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);          \
                                                                              \
    highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H);    \
                                                                              \
    if (!invert_mask)                                                         \
      highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk,      \
                                msk_stride, H, &sse_, &sum);                  \
    else                                                                      \
      highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk,      \
                                msk_stride, H, &sse_, &sum);                  \
    *sse = (uint32_t)sse_;                                                    \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H));                 \
  }                                                                           \
  unsigned int aom_highbd_10_masked_sub_pixel_variance4x##H##_ssse3(          \
      const uint8_t *src8, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,       \
      const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) {   \
    int sse_;                                                                 \
    int sum;                                                                  \
    uint16_t temp[(H + 1) * 4];                                               \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                          \
    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                          \
    const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);          \
                                                                              \
    highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H);    \
                                                                              \
    if (!invert_mask)                                                         \
      highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk,      \
                                msk_stride, H, &sse_, &sum);                  \
    else                                                                      \
      highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk,      \
                                msk_stride, H, &sse_, &sum);                  \
    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 4);                             \
    sum = ROUND_POWER_OF_TWO(sum, 2);                                         \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H));                 \
  }                                                                           \
  unsigned int aom_highbd_12_masked_sub_pixel_variance4x##H##_ssse3(          \
      const uint8_t *src8, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8,       \
      const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) {   \
    int sse_;                                                                 \
    int sum;                                                                  \
    uint16_t temp[(H + 1) * 4];                                               \
    const uint16_t *src = CONVERT_TO_SHORTPTR(src8);                          \
    const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                          \
    const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8);          \
                                                                              \
    highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H);    \
                                                                              \
    if (!invert_mask)                                                         \
      highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk,      \
                                msk_stride, H, &sse_, &sum);                  \
    else                                                                      \
      highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk,      \
                                msk_stride, H, &sse_, &sum);                  \
    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 8);                             \
    sum = ROUND_POWER_OF_TWO(sum, 4);                                         \
    return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H));                 \
  }

#if CONFIG_EXT_PARTITION
HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 128)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 64)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 128)
#endif
HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 64)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 32)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 64)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 32)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 16)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 32)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 16)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 8)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 16)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 8)
HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 4)
HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(8)
HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(4)

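// 16-bit counterpart of filter_block: each (a[i], b[i]) pair is widened to
// 32 bits by _mm_madd_epi16, rounded by FILTER_BITS, and packed back into
// eight 16-bit pixels.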
static INLINE __m128i highbd_filter_block(const __m128i a, const __m128i b,
                                          const __m128i filter) {
  __m128i v0 = _mm_unpacklo_epi16(a, b);
  v0 = _mm_madd_epi16(v0, filter);
  v0 = xx_roundn_epu32(v0, FILTER_BITS);

  __m128i v1 = _mm_unpackhi_epi16(a, b);
  v1 = _mm_madd_epi16(v1, filter);
  v1 = xx_roundn_epu32(v1, FILTER_BITS);

  return _mm_packs_epi32(v0, v1);
}

static void highbd_bilinear_filter(const uint16_t *src, int src_stride,
                                   int xoffset, int yoffset, uint16_t *dst,
                                   int w, int h) {
  int i, j;
  // Horizontal filter
  if (xoffset == 0) {
    uint16_t *b = dst;
    for (i = 0; i < h + 1; ++i) {
      for (j = 0; j < w; j += 8) {
        __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
        _mm_storeu_si128((__m128i *)&b[j], x);
      }
      src += src_stride;
      b += w;
    }
  } else if (xoffset == 4) {
    uint16_t *b = dst;
    for (i = 0; i < h + 1; ++i) {
      for (j = 0; j < w; j += 8) {
        __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
        __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]);
        __m128i z = _mm_alignr_epi8(y, x, 2);
        _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu16(x, z));
      }
      src += src_stride;
      b += w;
    }
  } else {
    uint16_t *b = dst;
    const uint8_t *hfilter = bilinear_filters_2t[xoffset];
    const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16));
    for (i = 0; i < h + 1; ++i) {
      for (j = 0; j < w; j += 8) {
        const __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
        const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]);
        const __m128i z = _mm_alignr_epi8(y, x, 2);
        const __m128i res = highbd_filter_block(x, z, hfilter_vec);
        _mm_storeu_si128((__m128i *)&b[j], res);
      }

      src += src_stride;
      b += w;
    }
  }

  // Vertical filter
  if (yoffset == 0) {
    // The data is already in 'dst', so no need to filter
  } else if (yoffset == 4) {
    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; j += 8) {
        __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
        __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
        _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu16(x, y));
      }
      dst += w;
    }
  } else {
    const uint8_t *vfilter = bilinear_filters_2t[yoffset];
    const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16));
    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; j += 8) {
        const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
        const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
        const __m128i res = highbd_filter_block(x, y, vfilter_vec);
        _mm_storeu_si128((__m128i *)&dst[j], res);
      }

      dst += w;
    }
  }
}

static INLINE __m128i highbd_filter_block_2rows(const __m128i *a0,
                                                const __m128i *b0,
                                                const __m128i *a1,
                                                const __m128i *b1,
                                                const __m128i *filter) {
  __m128i v0 = _mm_unpacklo_epi16(*a0, *b0);
  v0 = _mm_madd_epi16(v0, *filter);
  v0 = xx_roundn_epu32(v0, FILTER_BITS);

  __m128i v1 = _mm_unpacklo_epi16(*a1, *b1);
  v1 = _mm_madd_epi16(v1, *filter);
  v1 = xx_roundn_epu32(v1, FILTER_BITS);

  return _mm_packs_epi32(v0, v1);
}

static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride,
                                      int xoffset, int yoffset, uint16_t *dst,
                                      int h) {
  int i;
  // Horizontal filter
  if (xoffset == 0) {
    uint16_t *b = dst;
    for (i = 0; i < h + 1; ++i) {
      __m128i x = _mm_loadl_epi64((__m128i *)src);
      _mm_storel_epi64((__m128i *)b, x);
      src += src_stride;
      b += 4;
    }
  } else if (xoffset == 4) {
    uint16_t *b = dst;
    for (i = 0; i < h + 1; ++i) {
      __m128i x = _mm_loadu_si128((__m128i *)src);
      __m128i z = _mm_srli_si128(x, 2);
      _mm_storel_epi64((__m128i *)b, _mm_avg_epu16(x, z));
      src += src_stride;
      b += 4;
    }
  } else {
    uint16_t *b = dst;
    const uint8_t *hfilter = bilinear_filters_2t[xoffset];
    const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16));
    for (i = 0; i < h; i += 2) {
      const __m128i x0 = _mm_loadu_si128((__m128i *)src);
      const __m128i z0 = _mm_srli_si128(x0, 2);
      const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]);
      const __m128i z1 = _mm_srli_si128(x1, 2);
      const __m128i res =
          highbd_filter_block_2rows(&x0, &z0, &x1, &z1, &hfilter_vec);
      _mm_storeu_si128((__m128i *)b, res);

      src += src_stride * 2;
      b += 8;
    }
    // Process i = h separately
    __m128i x = _mm_loadu_si128((__m128i *)src);
    __m128i z = _mm_srli_si128(x, 2);

    __m128i v0 = _mm_unpacklo_epi16(x, z);
    v0 = _mm_madd_epi16(v0, hfilter_vec);
    v0 = xx_roundn_epu32(v0, FILTER_BITS);

    _mm_storel_epi64((__m128i *)b, _mm_packs_epi32(v0, v0));
  }

  // Vertical filter
  if (yoffset == 0) {
    // The data is already in 'dst', so no need to filter
  } else if (yoffset == 4) {
    for (i = 0; i < h; ++i) {
      __m128i x = _mm_loadl_epi64((__m128i *)dst);
      __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]);
      _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(x, y));
      dst += 4;
    }
  } else {
    const uint8_t *vfilter = bilinear_filters_2t[yoffset];
    const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16));
    for (i = 0; i < h; i += 2) {
      const __m128i x = _mm_loadl_epi64((__m128i *)dst);
      const __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]);
      const __m128i z = _mm_loadl_epi64((__m128i *)&dst[8]);
      const __m128i res =
          highbd_filter_block_2rows(&x, &y, &y, &z, &vfilter_vec);
      _mm_storeu_si128((__m128i *)dst, res);

      dst += 8;
    }
  }
}

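// High-bitdepth masked variance for widths that are a multiple of 8. The
// blend is done in 32-bit lanes; as the in-loop comments note, the
// differences are narrow enough to re-pack to 16 bits so _mm_madd_epi16 can
// square them, after which the products are sign-extended and accumulated
// in the two 64-bit lanes of 'sum_sq'.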
static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride,
                                   const uint16_t *a_ptr, int a_stride,
                                   const uint16_t *b_ptr, int b_stride,
                                   const uint8_t *m_ptr, int m_stride,
                                   int width, int height, uint64_t *sse,
                                   int *sum_) {
  int x, y;
  // Note on bit widths:
  // The maximum value of 'sum' is (2^12 - 1) * 128 * 128 =~ 2^26,
  // so this can be kept as four 32-bit values.
  // But the maximum value of 'sum_sq' is (2^12 - 1)^2 * 128 * 128 =~ 2^38,
  // so this must be stored as two 64-bit values.
  __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m128i zero = _mm_setzero_si128();

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 8) {
      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
      const __m128i m =
          _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&m_ptr[x]), zero);
      const __m128i m_inv = _mm_sub_epi16(mask_max, m);

      // Calculate 8 predicted pixels.
      const __m128i data_l = _mm_unpacklo_epi16(a, b);
      const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
      __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
      pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                              AOM_BLEND_A64_ROUND_BITS);

      const __m128i data_r = _mm_unpackhi_epi16(a, b);
      const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
      __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
      pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                              AOM_BLEND_A64_ROUND_BITS);

      const __m128i src_l = _mm_unpacklo_epi16(src, zero);
      const __m128i src_r = _mm_unpackhi_epi16(src, zero);
      __m128i diff_l = _mm_sub_epi32(pred_l, src_l);
      __m128i diff_r = _mm_sub_epi32(pred_r, src_r);

      // Update partial sums and partial sums of squares
      sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r));
      // A trick: Now each entry of diff_l and diff_r is stored in a 32-bit
      // field, but the range of values is only [-(2^12 - 1), 2^12 - 1].
      // So we can re-pack into 16-bit fields and use _mm_madd_epi16
      // to calculate the squares and partially sum them.
      const __m128i tmp = _mm_packs_epi32(diff_l, diff_r);
      const __m128i prod = _mm_madd_epi16(tmp, tmp);
      // Then we want to sign-extend to 64 bits and accumulate
      const __m128i sign = _mm_srai_epi32(prod, 31);
      const __m128i tmp_0 = _mm_unpacklo_epi32(prod, sign);
      const __m128i tmp_1 = _mm_unpackhi_epi32(prod, sign);
      sum_sq = _mm_add_epi64(sum_sq, _mm_add_epi64(tmp_0, tmp_1));
    }

    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // Reduce down to a single sum and sum of squares
  sum = _mm_hadd_epi32(sum, zero);
  sum = _mm_hadd_epi32(sum, zero);
  *sum_ = _mm_cvtsi128_si32(sum);
  sum_sq = _mm_add_epi64(sum_sq, _mm_srli_si128(sum_sq, 8));
  _mm_storel_epi64((__m128i *)sse, sum_sq);
}

static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride,
                                      const uint16_t *a_ptr,
                                      const uint16_t *b_ptr,
                                      const uint8_t *m_ptr, int m_stride,
                                      int height, int *sse, int *sum_) {
  int y;
  // Note: For this function, h <= 8 (or maybe 16 if we add 4:1 partitions).
  // So the maximum value of sum is (2^12 - 1) * 4 * 16 =~ 2^18
  // and the maximum value of sum_sq is (2^12 - 1)^2 * 4 * 16 =~ 2^30.
  // So we can safely pack sum_sq into 32-bit fields, which is slightly more
  // convenient.
  __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m128i zero = _mm_setzero_si128();

  for (y = 0; y < height; y += 2) {
    __m128i src = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
    const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
    const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
    const __m128i m = _mm_unpacklo_epi8(
        _mm_unpacklo_epi32(
            _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
            _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
        zero);
    const __m128i m_inv = _mm_sub_epi16(mask_max, m);

    const __m128i data_l = _mm_unpacklo_epi16(a, b);
    const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
    __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
    pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i data_r = _mm_unpackhi_epi16(a, b);
    const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
    __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
    pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i src_l = _mm_unpacklo_epi16(src, zero);
    const __m128i src_r = _mm_unpackhi_epi16(src, zero);
    __m128i diff_l = _mm_sub_epi32(pred_l, src_l);
    __m128i diff_r = _mm_sub_epi32(pred_r, src_r);

    // Update partial sums and partial sums of squares
    sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r));
    const __m128i tmp = _mm_packs_epi32(diff_l, diff_r);
    const __m128i prod = _mm_madd_epi16(tmp, tmp);
    sum_sq = _mm_add_epi32(sum_sq, prod);

    src_ptr += src_stride * 2;
    a_ptr += 8;
    b_ptr += 8;
    m_ptr += m_stride * 2;
  }
  // Reduce down to a single sum and sum of squares
  sum = _mm_hadd_epi32(sum, sum_sq);
  sum = _mm_hadd_epi32(sum, zero);
  *sum_ = _mm_cvtsi128_si32(sum);
  *sse = _mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
}

#endif  // CONFIG_HIGHBITDEPTH
|