/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <immintrin.h>

#include "./aom_config.h"
#include "aom_ports/mem.h"
#include "aom/aom_integer.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
#include "aom_dsp/x86/synonyms.h"

////////////////////////////////////////////////////////////////////////////////
// 8 bit
////////////////////////////////////////////////////////////////////////////////

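// All kernels in this file vectorize, in essence, the following per-pixel
// computation (a rough sketch of the C reference in aom_dsp/obmc_variance.c):
// wsrc holds the pre-weighted source and mask holds the complementary weight
// applied to pre, both scaled by 1 << 12, so the difference carries 12
// fractional bits that are rounded off before accumulation:
//
//   const int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[n] - pre[n] * mask[n], 12);
//   sum += diff;
//   sse += diff * diff;
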
static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
                                    const int32_t *wsrc, const int32_t *mask,
                                    unsigned int *const sse, int *const sum,
                                    const int h) {
  const int pre_step = pre_stride - 4;
  int n = 0;
  __m128i v_sum_d = _mm_setzero_si128();
  __m128i v_sse_d = _mm_setzero_si128();

  assert(IS_POWER_OF_TWO(h));

  do {
    const __m128i v_p_b = xx_loadl_32(pre + n);
    const __m128i v_m_d = xx_load_128(mask + n);
    const __m128i v_w_d = xx_load_128(wsrc + n);

    const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
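    // Concretely: each 32-bit lane holds its value in the low 16 bits with
    // the high 16 bits zero, so pmaddwd's per-lane lo * lo + hi * hi
    // reduces to the single product p * m, which cannot overflow 32 bits.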
    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);

    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
    const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
    const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);

    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);

    n += 4;

    if (n % 4 == 0) pre += pre_step;
  } while (n < 4 * h);

  *sum = xx_hsum_epi32_si32(v_sum_d);
  *sse = xx_hsum_epi32_si32(v_sse_d);
}

static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
                                     const int32_t *wsrc, const int32_t *mask,
                                     unsigned int *const sse, int *const sum,
                                     const int w, const int h) {
  const int pre_step = pre_stride - w;
  int n = 0;
  __m128i v_sum_d = _mm_setzero_si128();
  __m128i v_sse_d = _mm_setzero_si128();

  assert(w >= 8);
  assert(IS_POWER_OF_TWO(w));
  assert(IS_POWER_OF_TWO(h));

  do {
    const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
    const __m128i v_m1_d = xx_load_128(mask + n + 4);
    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
    const __m128i v_p0_b = xx_loadl_32(pre + n);
    const __m128i v_m0_d = xx_load_128(mask + n);
    const __m128i v_w0_d = xx_load_128(wsrc + n);

    const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
    const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);

    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);

    const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
    const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
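    // After the rounding, the differences are back on the pixel scale and
    // fit in 16 bits, so packing the two dword vectors into one word vector
    // lets a single pmaddwd square and pairwise-sum all eight differences
    // at once.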
    const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
    const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);

    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);

    n += 8;

    if (n % w == 0) pre += pre_step;
  } while (n < w * h);

  *sum = xx_hsum_epi32_si32(v_sum_d);
  *sse = xx_hsum_epi32_si32(v_sse_d);
}

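// Each wrapper computes variance as sse - sum^2 / (W * H). The sum * sum
// product is widened to 64 bits before the division, since it can exceed
// 32 bits for the larger block sizes.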
#define OBMCVARWXH(W, H)                                                 \
  unsigned int aom_obmc_variance##W##x##H##_sse4_1(                      \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,           \
      const int32_t *mask, unsigned int *sse) {                          \
    int sum;                                                             \
    if (W == 4) {                                                        \
      obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H);       \
    } else {                                                             \
      obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H);   \
    }                                                                    \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));        \
  }

#if CONFIG_EXT_PARTITION
OBMCVARWXH(128, 128)
OBMCVARWXH(128, 64)
OBMCVARWXH(64, 128)
#endif  // CONFIG_EXT_PARTITION
OBMCVARWXH(64, 64)
OBMCVARWXH(64, 32)
OBMCVARWXH(32, 64)
OBMCVARWXH(32, 32)
OBMCVARWXH(32, 16)
OBMCVARWXH(16, 32)
OBMCVARWXH(16, 16)
OBMCVARWXH(16, 8)
OBMCVARWXH(8, 16)
OBMCVARWXH(8, 8)
OBMCVARWXH(8, 4)
OBMCVARWXH(4, 8)
OBMCVARWXH(4, 4)

////////////////////////////////////////////////////////////////////////////////
// High bit-depth
////////////////////////////////////////////////////////////////////////////////

#if CONFIG_HIGHBITDEPTH
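// In high bit-depth builds, frame buffers hold uint16_t samples but are
// passed around as uint8_t pointers; CONVERT_TO_SHORTPTR recovers the
// underlying uint16_t buffer from that convention.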
static INLINE void hbd_obmc_variance_w4(
    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
    const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) {
  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
  const int pre_step = pre_stride - 4;
  int n = 0;
  __m128i v_sum_d = _mm_setzero_si128();
  __m128i v_sse_d = _mm_setzero_si128();

  assert(IS_POWER_OF_TWO(h));

  do {
    const __m128i v_p_w = xx_loadl_64(pre + n);
    const __m128i v_m_d = xx_load_128(mask + n);
    const __m128i v_w_d = xx_load_128(wsrc + n);

    const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);

    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
    const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
    const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);

    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);

    n += 4;

    if (n % 4 == 0) pre += pre_step;
  } while (n < 4 * h);

  *sum = xx_hsum_epi32_si32(v_sum_d);
  *sse = xx_hsum_epi32_si32(v_sse_d);
}

static INLINE void hbd_obmc_variance_w8n(
    const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
    const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int w,
    const int h) {
  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
  const int pre_step = pre_stride - w;
  int n = 0;
  __m128i v_sum_d = _mm_setzero_si128();
  __m128i v_sse_d = _mm_setzero_si128();

  assert(w >= 8);
  assert(IS_POWER_OF_TWO(w));
  assert(IS_POWER_OF_TWO(h));

  do {
    const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
    const __m128i v_m1_d = xx_load_128(mask + n + 4);
    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
    const __m128i v_p0_w = xx_loadl_64(pre + n);
    const __m128i v_m0_d = xx_load_128(mask + n);
    const __m128i v_w0_d = xx_load_128(wsrc + n);

    const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
    const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);

    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);

    const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
    const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
    const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
    const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);

    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);

    n += 8;

    if (n % w == 0) pre += pre_step;
  } while (n < w * h);

  *sum += xx_hsum_epi32_si64(v_sum_d);
  *sse += xx_hsum_epi32_si64(v_sse_d);
}

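// Unlike the w4 kernel above, hbd_obmc_variance_w8n accumulates (+=) into
// its 64-bit outputs rather than assigning them. The wrappers below rely on
// this: they zero sum64/sse64 once and, in the 12-bit case, call the kernel
// repeatedly over sub-blocks while keeping a single running total.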
static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
                                        const int32_t *wsrc,
                                        const int32_t *mask, int w, int h,
                                        unsigned int *sse, int *sum) {
  int64_t sum64 = 0;
  uint64_t sse64 = 0;
  if (w == 4) {
    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
  } else {
    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
  }
  *sum = (int)sum64;
  *sse = (unsigned int)sse64;
}

static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int w, int h,
                                           unsigned int *sse, int *sum) {
  int64_t sum64 = 0;
  uint64_t sse64 = 0;
  if (w == 4) {
    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
  } else {
    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
  }
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
}

static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
                                           const int32_t *wsrc,
                                           const int32_t *mask, int w, int h,
                                           unsigned int *sse, int *sum) {
  int64_t sum64 = 0;
  uint64_t sse64 = 0;
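  // The largest blocks are split into horizontal stripes so that the 32-bit
  // SIMD lane accumulators are drained into the 64-bit running totals before
  // the 12-bit squared differences can overflow them.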
  if (w == 128) {
    do {
      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, 128,
                            32);
      pre8 += 32 * pre_stride;
      wsrc += 32 * 128;
      mask += 32 * 128;
      h -= 32;
    } while (h > 0);
  } else if (w == 64 && h >= 128) {
    do {
      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, 64,
                            64);
      pre8 += 64 * pre_stride;
      wsrc += 64 * 64;
      mask += 64 * 64;
      h -= 64;
    } while (h > 0);
  } else if (w == 4) {
    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
  } else {
    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
  }
  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
}

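// As in the 8-bit case, variance is sse - sum^2 / (W * H). In the 10- and
// 12-bit wrappers, sse and sum have been rounded down to the 8-bit scale
// independently, which can leave the difference marginally negative, so the
// result is clamped to zero.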
#define HBD_OBMCVARWXH(W, H)                                                 \
  unsigned int aom_highbd_obmc_variance##W##x##H##_sse4_1(                   \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,               \
      const int32_t *mask, unsigned int *sse) {                              \
    int sum;                                                                 \
    highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);      \
    return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H));            \
  }                                                                          \
                                                                             \
  unsigned int aom_highbd_10_obmc_variance##W##x##H##_sse4_1(                \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,               \
      const int32_t *mask, unsigned int *sse) {                              \
    int sum;                                                                 \
    int64_t var;                                                             \
    highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);   \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                \
    return (var >= 0) ? (uint32_t)var : 0;                                   \
  }                                                                          \
                                                                             \
  unsigned int aom_highbd_12_obmc_variance##W##x##H##_sse4_1(                \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,               \
      const int32_t *mask, unsigned int *sse) {                              \
    int sum;                                                                 \
    int64_t var;                                                             \
    highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);   \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H));                \
    return (var >= 0) ? (uint32_t)var : 0;                                   \
  }

#if CONFIG_EXT_PARTITION
HBD_OBMCVARWXH(128, 128)
HBD_OBMCVARWXH(128, 64)
HBD_OBMCVARWXH(64, 128)
#endif  // CONFIG_EXT_PARTITION
HBD_OBMCVARWXH(64, 64)
HBD_OBMCVARWXH(64, 32)
HBD_OBMCVARWXH(32, 64)
HBD_OBMCVARWXH(32, 32)
HBD_OBMCVARWXH(32, 16)
HBD_OBMCVARWXH(16, 32)
HBD_OBMCVARWXH(16, 16)
HBD_OBMCVARWXH(16, 8)
HBD_OBMCVARWXH(8, 16)
HBD_OBMCVARWXH(8, 8)
HBD_OBMCVARWXH(8, 4)
HBD_OBMCVARWXH(4, 8)
HBD_OBMCVARWXH(4, 4)
#endif  // CONFIG_HIGHBITDEPTH