/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>  // AVX2

#include "./aom_dsp_rtcd.h"
#include "aom_ports/mem.h"

/* clang-format off */
DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
};
/* clang-format on */

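// Layout of bilinear_filters_avx2 above: each pair of identical 16-byte rows
// forms one 32-byte filter entry holding the weight pair (16 - 2k, 2k) for
// k = 0..7, repeated across the register so it can feed _mm256_maddubs_epi16
// directly. The code below selects an entry with (offset << 5), i.e. one
// 32-byte entry per offset step.
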
void aom_get16x16var_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          unsigned int *SSE, int *Sum) {
  __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
  __m256i ref_expand_high, madd_low, madd_high;
  unsigned int i, src_2strides, ref_2strides;
  __m256i zero_reg = _mm256_set1_epi16(0);
  __m256i sum_ref_src = _mm256_set1_epi16(0);
  __m256i madd_ref_src = _mm256_set1_epi16(0);

  // process two rows per 256-bit register, halving the number of loop
  // iterations relative to the SSE2 code
  src_2strides = source_stride << 1;
  ref_2strides = recon_stride << 1;
  for (i = 0; i < 8; i++) {
    src = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(src_ptr)));
    src = _mm256_inserti128_si256(
        src, _mm_loadu_si128((__m128i const *)(src_ptr + source_stride)), 1);

    ref = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(ref_ptr)));
    ref = _mm256_inserti128_si256(
        ref, _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride)), 1);

    // expand each byte to 16 bits
    src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
    src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);

    ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
    ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);

    // src - ref
    src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
    src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);

    // madd low (src - ref)
    madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);

    // add high to low
    src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);

    // madd high (src - ref)
    madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);

    sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);

    // add high to low
    madd_ref_src =
        _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high));

    src_ptr += src_2strides;
    ref_ptr += ref_2strides;
  }

  {
    __m128i sum_res, madd_res;
    __m128i expand_sum_low, expand_sum_high, expand_sum;
    __m128i expand_madd_low, expand_madd_high, expand_madd;
    __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;

    // extract the low lane and add it to the high lane
    sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src),
                            _mm256_extractf128_si256(sum_ref_src, 1));

    madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src),
                             _mm256_extractf128_si256(madd_ref_src, 1));

    // pad each 16-bit value with two zeroed bytes
    expand_sum_low =
        _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg), sum_res);
    expand_sum_high =
        _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg), sum_res);

    // arithmetic shift right by 16 to restore the sign
    expand_sum_low = _mm_srai_epi32(expand_sum_low, 16);
    expand_sum_high = _mm_srai_epi32(expand_sum_high, 16);

    expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high);

    // expand each 32 bits of the madd result to 64 bits
    expand_madd_low =
        _mm_unpacklo_epi32(madd_res, _mm256_castsi256_si128(zero_reg));
    expand_madd_high =
        _mm_unpackhi_epi32(madd_res, _mm256_castsi256_si128(zero_reg));

    expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high);

    ex_expand_sum_low =
        _mm_unpacklo_epi32(expand_sum, _mm256_castsi256_si128(zero_reg));
    ex_expand_sum_high =
        _mm_unpackhi_epi32(expand_sum, _mm256_castsi256_si128(zero_reg));

    ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high);

    // shift right by 8 bytes
    madd_res = _mm_srli_si128(expand_madd, 8);
    sum_res = _mm_srli_si128(ex_expand_sum, 8);

    madd_res = _mm_add_epi32(madd_res, expand_madd);
    sum_res = _mm_add_epi32(sum_res, ex_expand_sum);

    *((int *)SSE) = _mm_cvtsi128_si32(madd_res);

    *((int *)Sum) = _mm_cvtsi128_si32(sum_res);
  }
  _mm256_zeroupper();
}

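// For reference, a minimal scalar sketch (illustrative only, not part of the
// original file) of what the routine above computes: *Sum is the sum of
// (src - ref) and *SSE the sum of squared differences over the 16x16 block.
#if 0
static void get16x16var_c_sketch(const unsigned char *src, int src_stride,
                                 const unsigned char *ref, int ref_stride,
                                 unsigned int *sse, int *sum) {
  int i, j;
  *sse = 0;
  *sum = 0;
  for (i = 0; i < 16; i++) {
    for (j = 0; j < 16; j++) {
      const int diff = src[j] - ref[j];
      *sum += diff;
      *sse += (unsigned int)(diff * diff);
    }
    src += src_stride;
    ref += ref_stride;
  }
}
#endif
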
void aom_get32x32var_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          unsigned int *SSE, int *Sum) {
  __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
  __m256i ref_expand_high, madd_low, madd_high;
  unsigned int i;
  __m256i zero_reg = _mm256_set1_epi16(0);
  __m256i sum_ref_src = _mm256_set1_epi16(0);
  __m256i madd_ref_src = _mm256_set1_epi16(0);

  // process 32 elements in parallel
  for (i = 0; i < 16; i++) {
    src = _mm256_loadu_si256((__m256i const *)(src_ptr));

    ref = _mm256_loadu_si256((__m256i const *)(ref_ptr));

    // expand each byte to 16 bits
    src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
    src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);

    ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
    ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);

    // src - ref
    src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
    src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);

    // madd low (src - ref)
    madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);

    // add high to low
    src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);

    // madd high (src - ref)
    madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);

    sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);

    // add high to low
    madd_ref_src =
        _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high));

    src_ptr += source_stride;
    ref_ptr += recon_stride;
  }

  {
    __m256i expand_sum_low, expand_sum_high, expand_sum;
    __m256i expand_madd_low, expand_madd_high, expand_madd;
    __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;

    // pad each 16-bit value with two zeroed bytes
    expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src);
    expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src);

    // arithmetic shift right by 16 to restore the sign
    expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16);
    expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16);

    expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high);

    // expand each 32 bits of the madd result to 64 bits
    expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg);
    expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg);

    expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high);

    ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg);
    ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg);

    ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high);

    // shift right by 8 bytes
    madd_ref_src = _mm256_srli_si256(expand_madd, 8);
    sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8);

    madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd);
    sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum);

    // extract the low lane and the high lane and add the results
    *((int *)SSE) =
        _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) +
        _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1));

    *((int *)Sum) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) +
                    _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
  }
  _mm256_zeroupper();
}

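// Note: accumulating the row sums at 16-bit width above is safe. Each 16-bit
// lane of sum_ref_src absorbs at most 32 differences in [-255, 255] (two per
// iteration over 16 iterations), a magnitude of at most 8160, which fits
// comfortably in a signed 16-bit value.
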
#define FILTER_SRC(filter)                               \
  /* filter the source */                                \
  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
                                                         \
  /* add 8 to source */                                  \
  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);        \
  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);        \
                                                         \
  /* divide source by 16 */                              \
  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);         \
  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);

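// Per element, FILTER_SRC computes the rounded bilinear tap
//   out = (a * w0 + b * w1 + 8) >> 4
// where the two weights sum to 16, so an 8-bit input stays in [0, 255].
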
#define MERGE_WITH_SRC(src_reg, reg)               \
  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);

#define LOAD_SRC_DST                                    \
  /* load source and destination */                     \
  src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
  dst_reg = _mm256_loadu_si256((__m256i const *)(dst));

#define AVG_NEXT_SRC(src_reg, size_stride)                                 \
  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
  /* average between current and next stride source */                     \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);

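// AVG_NEXT_SRC handles the half-sample cases: with equal weights (8, 8) the
// bilinear tap (a * 8 + b * 8 + 8) >> 4 reduces to (a + b + 1) >> 1, which is
// exactly what _mm256_avg_epu8 computes, so no multiply is needed.
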
#define MERGE_NEXT_SRC(src_reg, size_stride)                               \
  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
  MERGE_WITH_SRC(src_reg, src_next_reg)

#define CALC_SUM_SSE_INSIDE_LOOP                          \
  /* expand each byte to 2 bytes */                       \
  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);   \
  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);   \
  /* source - dest */                                     \
  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);  \
  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);  \
  /* calculate sum */                                     \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);        \
  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);        \
  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
  /* calculate sse */                                     \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);        \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);

// final calculation of sum and sse
#define CALC_SUM_AND_SSE                                                   \
  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg);                         \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 8);                              \
  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp);                    \
  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp);                    \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi);                      \
                                                                           \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 4);                              \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 8);                              \
                                                                           \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
  *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) +     \
                  _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 4);                              \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) +               \
        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));

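// CALC_SUM_AND_SSE sign-extends the sixteen 16-bit partial sums to 32 bits
// (using the cmpgt mask as the high halves), then tree-reduces sum and sse
// within each 128-bit lane and finally adds the two lanes together.
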
unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
                                             int x_offset, int y_offset,
                                             const uint8_t *dst, int dst_stride,
                                             int height, unsigned int *sse) {
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 0 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
    // x_offset = 8 and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 8 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load the source and a second copy starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous average with the current one
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
      // x_offset = 8 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second copy starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
    // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the 16-bit results to 8 bits within each low and high lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous packed row with the current one
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src_pack = src_reg;
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second copy starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the 16-bit results to 8 bits within each low and high lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // merge the previous packed row with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter the merged rows
        FILTER_SRC(yfilter)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  _mm256_zeroupper();
  return sum;
}

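// Illustrative sketch (assumption: the real fixed-size wrappers live
// elsewhere in aom_dsp) of how the 32xh kernel is typically used: the return
// value is the signed sum, from which the variance is derived.
#if 0
static unsigned int sub_pixel_variance32x32_sketch(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, unsigned int *sse) {
  const int sum = (int)aom_sub_pixel_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse);
  // variance = sse - sum^2 / (32 * 32)
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
}
#endif
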
unsigned int aom_sub_pixel_avg_variance32xh_avx2(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
    int height, unsigned int *sse) {
  __m256i sec_reg;
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
    // x_offset = 8 and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 8 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load the source and a second copy starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous average with the current one
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
      // x_offset = 8 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second copy starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
    // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        MERGE_WITH_SRC(src_reg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the 16-bit results to 8 bits within each low and high lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous packed row with the current one
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second copy starting one byte ahead
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the 16-bit results to 8 bits within each low and high lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // merge the previous packed row with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter the merged rows
        FILTER_SRC(yfilter)
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  _mm256_zeroupper();
  return sum;
}
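
// The avg variant above folds a second predictor `sec` into the sub-pel
// source with _mm256_avg_epu8 before accumulating sum and sse, i.e. it
// measures the rounded average of the two predictions against dst, as used
// for compound (averaged) prediction.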