Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <emmintrin.h> // SSE2
13 :
14 : #include "./aom_config.h"
15 : #include "./aom_dsp_rtcd.h"
16 :
17 : #include "aom_ports/mem.h"
18 :
// Signature of the per-block SSE2 variance kernels used by the wrappers in
// this file: for one block of pixels the kernel writes the accumulated sum
// of squared differences to *sse and the signed sum of differences to *sum.
typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
                                       const uint16_t *ref, int ref_stride,
                                       uint32_t *sse, int *sum);

// 8x8 block kernel. Not defined in this file — presumably implemented in a
// companion SSE2 asm/intrinsics translation unit; declared here for the
// wrappers below.
uint32_t aom_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride,
                                    uint32_t *sse, int *sum);

// 16x16 block kernel; same external definition as above.
uint32_t aom_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
                                      const uint16_t *ref, int ref_stride,
                                      uint32_t *sse, int *sum);
30 :
31 0 : static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
32 : const uint16_t *ref, int ref_stride, int w,
33 : int h, uint32_t *sse, int *sum,
34 : high_variance_fn_t var_fn, int block_size) {
35 : int i, j;
36 :
37 0 : *sse = 0;
38 0 : *sum = 0;
39 :
40 0 : for (i = 0; i < h; i += block_size) {
41 0 : for (j = 0; j < w; j += block_size) {
42 : unsigned int sse0;
43 : int sum0;
44 0 : var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
45 : ref_stride, &sse0, &sum0);
46 0 : *sse += sse0;
47 0 : *sum += sum0;
48 : }
49 : }
50 0 : }
51 :
52 0 : static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
53 : const uint16_t *ref, int ref_stride, int w,
54 : int h, uint32_t *sse, int *sum,
55 : high_variance_fn_t var_fn, int block_size) {
56 : int i, j;
57 0 : uint64_t sse_long = 0;
58 0 : int32_t sum_long = 0;
59 :
60 0 : for (i = 0; i < h; i += block_size) {
61 0 : for (j = 0; j < w; j += block_size) {
62 : unsigned int sse0;
63 : int sum0;
64 0 : var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
65 : ref_stride, &sse0, &sum0);
66 0 : sse_long += sse0;
67 0 : sum_long += sum0;
68 : }
69 : }
70 0 : *sum = ROUND_POWER_OF_TWO(sum_long, 2);
71 0 : *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
72 0 : }
73 :
74 0 : static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
75 : const uint16_t *ref, int ref_stride, int w,
76 : int h, uint32_t *sse, int *sum,
77 : high_variance_fn_t var_fn, int block_size) {
78 : int i, j;
79 0 : uint64_t sse_long = 0;
80 0 : int32_t sum_long = 0;
81 :
82 0 : for (i = 0; i < h; i += block_size) {
83 0 : for (j = 0; j < w; j += block_size) {
84 : unsigned int sse0;
85 : int sum0;
86 0 : var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
87 : ref_stride, &sse0, &sum0);
88 0 : sse_long += sse0;
89 0 : sum_long += sum0;
90 : }
91 : }
92 0 : *sum = ROUND_POWER_OF_TWO(sum_long, 4);
93 0 : *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
94 0 : }
95 :
// Emits the aom_highbd{,_10,_12}_get<S>x<S>var_sse2 entry points. Each
// unwraps the uint8_t* "shortptr" handles via CONVERT_TO_SHORTPTR and runs
// the SSE2 block kernel; the 10- and 12-bit variants then round the
// accumulated sum and SSE down to the 8-bit domain (>> 2 / >> 4 and
// >> 4 / >> 8 respectively, with rounding).
#define HIGH_GET_VAR(S)                                                       \
  void aom_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
                                         const uint8_t *ref8, int ref_stride, \
                                         uint32_t *sse, int *sum) {           \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                \
    aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
                                       sum);                                  \
  }                                                                           \
                                                                              \
  void aom_highbd_10_get##S##x##S##var_sse2(                                  \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
      int ref_stride, uint32_t *sse, int *sum) {                              \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                \
    aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
                                       sum);                                  \
    *sum = ROUND_POWER_OF_TWO(*sum, 2);                                       \
    *sse = ROUND_POWER_OF_TWO(*sse, 4);                                       \
  }                                                                           \
                                                                              \
  void aom_highbd_12_get##S##x##S##var_sse2(                                  \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
      int ref_stride, uint32_t *sse, int *sum) {                              \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                \
    aom_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, sse, \
                                       sum);                                  \
    *sum = ROUND_POWER_OF_TWO(*sum, 4);                                       \
    *sse = ROUND_POWER_OF_TWO(*sse, 8);                                       \
  }

// Only the two block sizes backed by asm kernels are instantiated.
HIGH_GET_VAR(16);
HIGH_GET_VAR(8);

#undef HIGH_GET_VAR
132 :
// Emits aom_highbd_{8,10,12}_variance<w>x<h>_sse2. 'shift' must equal
// log2(w * h): variance = SSE - sum^2 / (w * h). The 10/12-bit variants
// clamp a negative intermediate to zero — possible because the rounding in
// highbd_1{0,2}_variance_sse2 perturbs SSE and sum independently.
#define VAR_FN(w, h, block_size, shift)                                    \
  uint32_t aom_highbd_8_variance##w##x##h##_sse2(                          \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
      int ref_stride, uint32_t *sse) {                                     \
    int sum;                                                               \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
    highbd_8_variance_sse2(                                                \
        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
        aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
    return *sse - (uint32_t)(((int64_t)sum * sum) >> shift);               \
  }                                                                        \
                                                                           \
  uint32_t aom_highbd_10_variance##w##x##h##_sse2(                         \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
      int ref_stride, uint32_t *sse) {                                     \
    int sum;                                                               \
    int64_t var;                                                           \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
    highbd_10_variance_sse2(                                               \
        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
        aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift);               \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }                                                                        \
                                                                           \
  uint32_t aom_highbd_12_variance##w##x##h##_sse2(                         \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
      int ref_stride, uint32_t *sse) {                                     \
    int sum;                                                               \
    int64_t var;                                                           \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
    highbd_12_variance_sse2(                                               \
        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
        aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift);               \
    return (var >= 0) ? (uint32_t)var : 0;                                 \
  }

// Instantiations: VAR_FN(w, h, block_size, shift) with shift == log2(w*h).
VAR_FN(64, 64, 16, 12);
VAR_FN(64, 32, 16, 11);
VAR_FN(32, 64, 16, 11);
VAR_FN(32, 32, 16, 10);
VAR_FN(32, 16, 16, 9);
VAR_FN(16, 32, 16, 9);
VAR_FN(16, 16, 16, 8);
VAR_FN(16, 8, 8, 7);
VAR_FN(8, 16, 8, 7);
VAR_FN(8, 8, 8, 6);

#undef VAR_FN
186 :
187 0 : unsigned int aom_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
188 : const uint8_t *ref8, int ref_stride,
189 : unsigned int *sse) {
190 : int sum;
191 0 : uint16_t *src = CONVERT_TO_SHORTPTR(src8);
192 0 : uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
193 0 : highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
194 : aom_highbd_calc16x16var_sse2, 16);
195 0 : return *sse;
196 : }
197 :
198 0 : unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
199 : const uint8_t *ref8, int ref_stride,
200 : unsigned int *sse) {
201 : int sum;
202 0 : uint16_t *src = CONVERT_TO_SHORTPTR(src8);
203 0 : uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
204 0 : highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
205 : aom_highbd_calc16x16var_sse2, 16);
206 0 : return *sse;
207 : }
208 :
209 0 : unsigned int aom_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
210 : const uint8_t *ref8, int ref_stride,
211 : unsigned int *sse) {
212 : int sum;
213 0 : uint16_t *src = CONVERT_TO_SHORTPTR(src8);
214 0 : uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
215 0 : highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
216 : aom_highbd_calc16x16var_sse2, 16);
217 0 : return *sse;
218 : }
219 :
220 0 : unsigned int aom_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
221 : const uint8_t *ref8, int ref_stride,
222 : unsigned int *sse) {
223 : int sum;
224 0 : uint16_t *src = CONVERT_TO_SHORTPTR(src8);
225 0 : uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
226 0 : highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
227 : aom_highbd_calc8x8var_sse2, 8);
228 0 : return *sse;
229 : }
230 :
231 0 : unsigned int aom_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
232 : const uint8_t *ref8, int ref_stride,
233 : unsigned int *sse) {
234 : int sum;
235 0 : uint16_t *src = CONVERT_TO_SHORTPTR(src8);
236 0 : uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
237 0 : highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
238 : aom_highbd_calc8x8var_sse2, 8);
239 0 : return *sse;
240 : }
241 :
242 0 : unsigned int aom_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
243 : const uint8_t *ref8, int ref_stride,
244 : unsigned int *sse) {
245 : int sum;
246 0 : uint16_t *src = CONVERT_TO_SHORTPTR(src8);
247 0 : uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
248 0 : highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
249 : aom_highbd_calc8x8var_sse2, 8);
250 0 : return *sse;
251 : }
252 :
// The 2 unused parameters are place holders for PIC enabled build.
// These definitions are for functions defined in
// highbd_subpel_variance_impl_sse2.asm
// By the convention of the wrappers below, each kernel returns the signed
// sum of differences (se) over a w-wide, 'height'-tall column and writes
// the sum of squared differences to *sse.
#define DECL(w, opt)                                                         \
  int aom_highbd_sub_pixel_variance##w##xh_##opt(                            \
      const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint16_t *dst, ptrdiff_t dst_stride, int height,                 \
      unsigned int *sse, void *unused0, void *unused);
#define DECLS(opt) \
  DECL(8, opt);    \
  DECL(16, opt)

DECLS(sse2);

#undef DECLS
#undef DECL
269 :
// Emits aom_highbd_{8,10,12}_sub_pixel_variance<w>x<h>_##opt on top of the
// asm column kernels declared above. Parameters:
//   wf          - column width of one kernel call (8 or 16). The hard-coded
//                 +16/+32/+48 column offsets below assume wf == 16 whenever
//                 w > wf, which holds for every FNS() instantiation.
//   wlog2/hlog2 - log2(w) and log2(h); only their sum is used, to divide
//                 se^2 by the pixel count.
// The 10-bit variant rescales se/sse to the 8-bit domain before applying
// the variance formula. The 12-bit variant additionally processes rows in
// 16-row tiles and accumulates SSE in a uint64_t: at 12 bits a 64x64 SSE
// can reach 4095^2 * 4096, which overflows 32 bits before the final >> 8.
#define FN(w, h, wf, wlog2, hlog2, opt, cast)                                  \
  uint32_t aom_highbd_8_sub_pixel_variance##w##x##h##_##opt(                   \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) {                \
    uint32_t sse;                                                              \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    int se = aom_highbd_sub_pixel_variance##wf##xh_##opt(                      \
        src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL,   \
        NULL);                                                                 \
    if (w > wf) {                                                              \
      unsigned int sse2;                                                       \
      int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h,   \
          &sse2, NULL, NULL);                                                  \
      se += se2;                                                               \
      sse += sse2;                                                             \
      if (w > wf * 2) {                                                        \
        se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
        se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
      }                                                                        \
    }                                                                          \
    *sse_ptr = sse;                                                            \
    return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2));                \
  }                                                                            \
                                                                               \
  uint32_t aom_highbd_10_sub_pixel_variance##w##x##h##_##opt(                  \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) {                \
    int64_t var;                                                               \
    uint32_t sse;                                                              \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    int se = aom_highbd_sub_pixel_variance##wf##xh_##opt(                      \
        src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL,   \
        NULL);                                                                 \
    if (w > wf) {                                                              \
      uint32_t sse2;                                                           \
      int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h,   \
          &sse2, NULL, NULL);                                                  \
      se += se2;                                                               \
      sse += sse2;                                                             \
      if (w > wf * 2) {                                                        \
        se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, h, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
        se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride, h, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
      }                                                                        \
    }                                                                          \
    se = ROUND_POWER_OF_TWO(se, 2);                                            \
    sse = ROUND_POWER_OF_TWO(sse, 4);                                          \
    *sse_ptr = sse;                                                            \
    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }                                                                            \
                                                                               \
  uint32_t aom_highbd_12_sub_pixel_variance##w##x##h##_##opt(                  \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) {                \
    int start_row;                                                             \
    uint32_t sse;                                                              \
    int se = 0;                                                                \
    int64_t var;                                                               \
    uint64_t long_sse = 0;                                                     \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    for (start_row = 0; start_row < h; start_row += 16) {                      \
      uint32_t sse2;                                                           \
      int height = h - start_row < 16 ? h - start_row : 16;                    \
      int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
          src + (start_row * src_stride), src_stride, x_offset, y_offset,      \
          dst + (start_row * dst_stride), dst_stride, height, &sse2, NULL,     \
          NULL);                                                               \
      se += se2;                                                               \
      long_sse += sse2;                                                        \
      if (w > wf) {                                                            \
        se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
            src + 16 + (start_row * src_stride), src_stride, x_offset,         \
            y_offset, dst + 16 + (start_row * dst_stride), dst_stride, height, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        long_sse += sse2;                                                      \
        if (w > wf * 2) {                                                      \
          se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
              src + 32 + (start_row * src_stride), src_stride, x_offset,       \
              y_offset, dst + 32 + (start_row * dst_stride), dst_stride,       \
              height, &sse2, NULL, NULL);                                      \
          se += se2;                                                           \
          long_sse += sse2;                                                    \
          se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
              src + 48 + (start_row * src_stride), src_stride, x_offset,       \
              y_offset, dst + 48 + (start_row * dst_stride), dst_stride,       \
              height, &sse2, NULL, NULL);                                      \
          se += se2;                                                           \
          long_sse += sse2;                                                    \
        }                                                                      \
      }                                                                        \
    }                                                                          \
    se = ROUND_POWER_OF_TWO(se, 4);                                            \
    sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8);                           \
    *sse_ptr = sse;                                                            \
    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }
389 :
// Instantiations: FN(w, h, wf, wlog2, hlog2, opt, cast) with
// wlog2 == log2(w) and hlog2 == log2(h).
#define FNS(opt)                        \
  FN(64, 64, 16, 6, 6, opt, (int64_t)); \
  FN(64, 32, 16, 6, 5, opt, (int64_t)); \
  FN(32, 64, 16, 5, 6, opt, (int64_t)); \
  FN(32, 32, 16, 5, 5, opt, (int64_t)); \
  FN(32, 16, 16, 5, 4, opt, (int64_t)); \
  FN(16, 32, 16, 4, 5, opt, (int64_t)); \
  FN(16, 16, 16, 4, 4, opt, (int64_t)); \
  FN(16, 8, 16, 4, 3, opt, (int64_t));  \
  FN(8, 16, 8, 3, 4, opt, (int64_t));   \
  FN(8, 8, 8, 3, 3, opt, (int64_t));    \
  FN(8, 4, 8, 3, 2, opt, (int64_t));

FNS(sse2);

#undef FNS
#undef FN
407 :
// The 2 unused parameters are place holders for PIC enabled build.
// Averaging variant of the sub-pixel kernels: also takes 'sec' (a second
// predictor with stride sec_stride), which is presumably averaged with the
// filtered source before differencing — asm not visible here; same
// se-return / *sse-output convention as the non-averaging kernels.
#define DECL(w, opt)                                                         \
  int aom_highbd_sub_pixel_avg_variance##w##xh_##opt(                        \
      const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec,        \
      ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0,    \
      void *unused);
#define DECLS(opt1) \
  DECL(16, opt1)    \
  DECL(8, opt1)

DECLS(sse2);
#undef DECL
#undef DECLS
422 :
// Averaging counterparts of the sub-pixel variance wrappers: each kernel
// call also receives 'sec' — the second predictor, laid out densely with
// stride w (see the 'sec + 16/32/48' column offsets and the 'w' passed as
// sec_stride). Structure and conventions match the non-averaging FN macro:
// +16/+32/+48 column offsets assume wf == 16 whenever w > wf, the 10-bit
// version rescales se/sse to the 8-bit domain, and the 12-bit version
// tiles rows 16 at a time with a 64-bit SSE accumulator to avoid overflow
// before the final >> 8.
#define FN(w, h, wf, wlog2, hlog2, opt, cast)                                  \
  uint32_t aom_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt(               \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr,                  \
      const uint8_t *sec8) {                                                   \
    uint32_t sse;                                                              \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
    int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                  \
        src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
        NULL, NULL);                                                           \
    if (w > wf) {                                                              \
      uint32_t sse2;                                                           \
      int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride,      \
          sec + 16, w, h, &sse2, NULL, NULL);                                  \
      se += se2;                                                               \
      sse += sse2;                                                             \
      if (w > wf * 2) {                                                        \
        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,    \
            sec + 32, w, h, &sse2, NULL, NULL);                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride,    \
            sec + 48, w, h, &sse2, NULL, NULL);                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
      }                                                                        \
    }                                                                          \
    *sse_ptr = sse;                                                            \
    return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2));                \
  }                                                                            \
                                                                               \
  uint32_t aom_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt(              \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr,                  \
      const uint8_t *sec8) {                                                   \
    int64_t var;                                                               \
    uint32_t sse;                                                              \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
    int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                  \
        src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
        NULL, NULL);                                                           \
    if (w > wf) {                                                              \
      uint32_t sse2;                                                           \
      int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride,      \
          sec + 16, w, h, &sse2, NULL, NULL);                                  \
      se += se2;                                                               \
      sse += sse2;                                                             \
      if (w > wf * 2) {                                                        \
        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,    \
            sec + 32, w, h, &sse2, NULL, NULL);                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride,    \
            sec + 48, w, h, &sse2, NULL, NULL);                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
      }                                                                        \
    }                                                                          \
    se = ROUND_POWER_OF_TWO(se, 2);                                            \
    sse = ROUND_POWER_OF_TWO(sse, 4);                                          \
    *sse_ptr = sse;                                                            \
    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }                                                                            \
                                                                               \
  uint32_t aom_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt(              \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr,                  \
      const uint8_t *sec8) {                                                   \
    int start_row;                                                             \
    int64_t var;                                                               \
    uint32_t sse;                                                              \
    int se = 0;                                                                \
    uint64_t long_sse = 0;                                                     \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
    for (start_row = 0; start_row < h; start_row += 16) {                      \
      uint32_t sse2;                                                           \
      int height = h - start_row < 16 ? h - start_row : 16;                    \
      int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
          src + (start_row * src_stride), src_stride, x_offset, y_offset,      \
          dst + (start_row * dst_stride), dst_stride, sec + (start_row * w),   \
          w, height, &sse2, NULL, NULL);                                       \
      se += se2;                                                               \
      long_sse += sse2;                                                        \
      if (w > wf) {                                                            \
        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
            src + 16 + (start_row * src_stride), src_stride, x_offset,         \
            y_offset, dst + 16 + (start_row * dst_stride), dst_stride,         \
            sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL);         \
        se += se2;                                                             \
        long_sse += sse2;                                                      \
        if (w > wf * 2) {                                                      \
          se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
              src + 32 + (start_row * src_stride), src_stride, x_offset,       \
              y_offset, dst + 32 + (start_row * dst_stride), dst_stride,       \
              sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL);       \
          se += se2;                                                           \
          long_sse += sse2;                                                    \
          se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
              src + 48 + (start_row * src_stride), src_stride, x_offset,       \
              y_offset, dst + 48 + (start_row * dst_stride), dst_stride,       \
              sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL);       \
          se += se2;                                                           \
          long_sse += sse2;                                                    \
        }                                                                      \
      }                                                                        \
    }                                                                          \
    se = ROUND_POWER_OF_TWO(se, 4);                                            \
    sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8);                           \
    *sse_ptr = sse;                                                            \
    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }
548 :
549 : #define FNS(opt1) \
550 : FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
551 : FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
552 : FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
553 : FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
554 : FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
555 : FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
556 : FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
557 : FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
558 : FN(8, 16, 8, 4, 3, opt1, (int64_t)); \
559 : FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
560 : FN(8, 4, 8, 3, 2, opt1, (int64_t));
561 :
562 0 : FNS(sse2);
563 :
564 : #undef FNS
565 : #undef FN
566 :
// Gathers samples at an 8-element stride out of a high-bitdepth reference
// buffer into a dense width x height block in comp_pred. The x8 sample and
// row strides imply ref8 points at an 8x upsampled plane — confirm with
// callers. The width < 8 path assumes width is a multiple of 4; the main
// path assumes a multiple of 8.
void aom_highbd_upsampled_pred_sse2(uint16_t *comp_pred, int width, int height,
                                    const uint8_t *ref8, int ref_stride) {
  int i, j;
  int stride = ref_stride << 3;  // one output row advances 8 source rows
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);

  if (width >= 8) {
    // read 8 points at one time
    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j += 8) {
        // Each 32-bit load picks up the wanted uint16 (plus one don't-care
        // neighbor) at successive 8-sample offsets; the unpack tree keeps
        // only the low word of each of s0..s7, packing 8 outputs into t0.
        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
        __m128i s4 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 32));
        __m128i s5 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 40));
        __m128i s6 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 48));
        __m128i s7 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 56));
        __m128i t0, t1, t2, t3;

        t0 = _mm_unpacklo_epi16(s0, s1);
        t1 = _mm_unpacklo_epi16(s2, s3);
        t2 = _mm_unpacklo_epi16(s4, s5);
        t3 = _mm_unpacklo_epi16(s6, s7);
        t0 = _mm_unpacklo_epi32(t0, t1);
        t2 = _mm_unpacklo_epi32(t2, t3);
        t0 = _mm_unpacklo_epi64(t0, t2);

        _mm_storeu_si128((__m128i *)(comp_pred), t0);
        comp_pred += 8;
        ref += 64;  // 8 * 8;
      }
      ref += stride - (width << 3);  // rewind columns, step one source row
    }
  } else {
    // read 4 points at one time
    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j += 4) {
        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
        __m128i t0, t1;

        // Same low-word gather as above, for 4 outputs.
        t0 = _mm_unpacklo_epi16(s0, s1);
        t1 = _mm_unpacklo_epi16(s2, s3);
        t0 = _mm_unpacklo_epi32(t0, t1);

        _mm_storel_epi64((__m128i *)(comp_pred), t0);
        comp_pred += 4;
        ref += 4 * 8;
      }
      ref += stride - (width << 3);
    }
  }
}
623 :
// Like aom_highbd_upsampled_pred_sse2, but averages the gathered reference
// samples with an existing prediction: out = (ref + pred + 1) >> 1, computed
// with saturating 16-bit adds (safe for pixel values well below 65535 —
// presumably <= 12-bit; confirm with callers).
void aom_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred,
                                             const uint8_t *pred8, int width,
                                             int height, const uint8_t *ref8,
                                             int ref_stride) {
  const __m128i one = _mm_set1_epi16(1);  // rounding bias for the average
  int i, j;
  int stride = ref_stride << 3;  // one output row advances 8 source rows
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);

  if (width >= 8) {
    // read 8 points at one time
    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j += 8) {
        // Low-word gather of 8 reference samples at an 8-sample stride
        // (see aom_highbd_upsampled_pred_sse2 for the unpack-tree layout).
        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
        __m128i s4 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 32));
        __m128i s5 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 40));
        __m128i s6 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 48));
        __m128i s7 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 56));
        __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
        __m128i t0, t1, t2, t3;

        t0 = _mm_unpacklo_epi16(s0, s1);
        t1 = _mm_unpacklo_epi16(s2, s3);
        t2 = _mm_unpacklo_epi16(s4, s5);
        t3 = _mm_unpacklo_epi16(s6, s7);
        t0 = _mm_unpacklo_epi32(t0, t1);
        t2 = _mm_unpacklo_epi32(t2, t3);
        t0 = _mm_unpacklo_epi64(t0, t2);

        // Rounding average: (ref + pred + 1) >> 1.
        p0 = _mm_adds_epu16(t0, p0);
        p0 = _mm_adds_epu16(p0, one);
        p0 = _mm_srli_epi16(p0, 1);

        _mm_storeu_si128((__m128i *)(comp_pred), p0);
        comp_pred += 8;
        pred += 8;
        ref += 8 * 8;
      }
      ref += stride - (width << 3);  // rewind columns, step one source row
    }
  } else {
    // read 4 points at one time
    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j += 4) {
        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
        __m128i p0 = _mm_loadl_epi64((const __m128i *)pred);
        __m128i t0, t1;

        t0 = _mm_unpacklo_epi16(s0, s1);
        t1 = _mm_unpacklo_epi16(s2, s3);
        t0 = _mm_unpacklo_epi32(t0, t1);

        // Rounding average: (ref + pred + 1) >> 1.
        p0 = _mm_adds_epu16(t0, p0);
        p0 = _mm_adds_epu16(p0, one);
        p0 = _mm_srli_epi16(p0, 1);

        _mm_storel_epi64((__m128i *)(comp_pred), p0);
        comp_pred += 4;
        pred += 4;
        ref += 4 * 8;
      }
      ref += stride - (width << 3);
    }
  }
}
|