/*
 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>  // AVX2

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"

/* clang-format off */
DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
};
/* clang-format on */
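
// Each 32-byte row of the table above holds one (f0, f1) weight pair with
// f0 + f1 == 16, broadcast across a full AVX2 register; a row is selected
// by (offset << 5). After _mm256_maddubs_epi16, adding 8 and shifting right
// by 4, each filtered pixel is (f0 * a + f1 * b + 8) >> 4. A minimal scalar
// sketch of that computation (the helper name is illustrative, not part of
// libvpx):
static uint8_t bilinear_filter_scalar_sketch(uint8_t a, uint8_t b,
                                             int offset) {
  const int f1 = offset << 1;  // second-tap weight: 0, 2, ..., 14
  const int f0 = 16 - f1;      // first-tap weight; the pair sums to 16
  return (uint8_t)((f0 * a + f1 * b + 8) >> 4);
}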

void vpx_get16x16var_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          unsigned int *SSE, int *Sum) {
  __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
  __m256i ref_expand_high, madd_low, madd_high;
  unsigned int i, src_2strides, ref_2strides;
  __m256i zero_reg = _mm256_set1_epi16(0);
  __m256i sum_ref_src = _mm256_set1_epi16(0);
  __m256i madd_ref_src = _mm256_set1_epi16(0);

  // process two rows per 256-bit register, halving the number of loop
  // iterations compared to the SSE2 code
  src_2strides = source_stride << 1;
  ref_2strides = recon_stride << 1;
  for (i = 0; i < 8; i++) {
    src = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(src_ptr)));
    src = _mm256_inserti128_si256(
        src, _mm_loadu_si128((__m128i const *)(src_ptr + source_stride)), 1);

    ref = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(ref_ptr)));
    ref = _mm256_inserti128_si256(
        ref, _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride)), 1);

    // expand each byte to 16 bits
    src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
    src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);

    ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
    ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);

    // src - ref
    src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
    src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);

    // madd low (src - ref)
    madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);

    // add high to low
    src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);

    // madd high (src - ref)
    madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);

    sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);

    // add high to low
    madd_ref_src =
        _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high));

    src_ptr += src_2strides;
    ref_ptr += ref_2strides;
  }

  {
    __m128i sum_res, madd_res;
    __m128i expand_sum_low, expand_sum_high, expand_sum;
    __m128i expand_madd_low, expand_madd_high, expand_madd;
    __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;

    // add the high 128-bit lane to the low lane
    sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src),
                            _mm256_extractf128_si256(sum_ref_src, 1));

    madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src),
                             _mm256_extractf128_si256(madd_ref_src, 1));

    // pad each 16-bit sum with 16 zeroed bits
    expand_sum_low =
        _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg), sum_res);
    expand_sum_high =
        _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg), sum_res);

    // arithmetic shift right by 16 to sign-extend each sum to 32 bits
    expand_sum_low = _mm_srai_epi32(expand_sum_low, 16);
    expand_sum_high = _mm_srai_epi32(expand_sum_high, 16);

    expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high);

    // expand each 32 bits of the madd result to 64 bits
    expand_madd_low =
        _mm_unpacklo_epi32(madd_res, _mm256_castsi256_si128(zero_reg));
    expand_madd_high =
        _mm_unpackhi_epi32(madd_res, _mm256_castsi256_si128(zero_reg));

    expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high);

    ex_expand_sum_low =
        _mm_unpacklo_epi32(expand_sum, _mm256_castsi256_si128(zero_reg));
    ex_expand_sum_high =
        _mm_unpackhi_epi32(expand_sum, _mm256_castsi256_si128(zero_reg));

    ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high);

    // shift right by 8 bytes so the upper halves line up with the lower
    madd_res = _mm_srli_si128(expand_madd, 8);
    sum_res = _mm_srli_si128(ex_expand_sum, 8);

    madd_res = _mm_add_epi32(madd_res, expand_madd);
    sum_res = _mm_add_epi32(sum_res, ex_expand_sum);

    *((int *)SSE) = _mm_cvtsi128_si32(madd_res);

    *((int *)Sum) = _mm_cvtsi128_si32(sum_res);
  }
}
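
// For reference, the function above is equivalent to this scalar loop (an
// illustrative sketch, not part of libvpx): it accumulates the sum of the
// src - ref differences and the sum of their squares over a 16x16 block.
static void get16x16var_scalar_sketch(const unsigned char *src_ptr,
                                      int source_stride,
                                      const unsigned char *ref_ptr,
                                      int recon_stride, unsigned int *SSE,
                                      int *Sum) {
  int r, c, sum = 0;
  unsigned int sse = 0;
  for (r = 0; r < 16; r++) {
    for (c = 0; c < 16; c++) {
      const int diff = src_ptr[c] - ref_ptr[c];
      sum += diff;
      sse += (unsigned int)(diff * diff);
    }
    src_ptr += source_stride;
    ref_ptr += recon_stride;
  }
  *SSE = sse;
  *Sum = sum;
}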

void vpx_get32x32var_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          unsigned int *SSE, int *Sum) {
  __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
  __m256i ref_expand_high, madd_low, madd_high;
  unsigned int i;
  __m256i zero_reg = _mm256_set1_epi16(0);
  __m256i sum_ref_src = _mm256_set1_epi16(0);
  __m256i madd_ref_src = _mm256_set1_epi16(0);

  // process one 32-pixel row (32 elements in parallel) per iteration
  for (i = 0; i < 16; i++) {
    src = _mm256_loadu_si256((__m256i const *)(src_ptr));

    ref = _mm256_loadu_si256((__m256i const *)(ref_ptr));

    // expand each byte to 16 bits
    src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
    src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);

    ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
    ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);

    // src - ref
    src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
    src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);

    // madd low (src - ref)
    madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);

    // add high to low
    src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);

    // madd high (src - ref)
    madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);

    sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);

    // add high to low
    madd_ref_src =
        _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high));

    src_ptr += source_stride;
    ref_ptr += recon_stride;
  }

  {
    __m256i expand_sum_low, expand_sum_high, expand_sum;
    __m256i expand_madd_low, expand_madd_high, expand_madd;
    __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;

    // pad each 16-bit sum with 16 zeroed bits
    expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src);
    expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src);

    // arithmetic shift right by 16 to sign-extend each sum to 32 bits
    expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16);
    expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16);

    expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high);

    // expand each 32 bits of the madd result to 64 bits
    expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg);
    expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg);

    expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high);

    ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg);
    ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg);

    ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high);

    // shift right by 8 bytes so the upper halves line up with the lower
    madd_ref_src = _mm256_srli_si256(expand_madd, 8);
    sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8);

    madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd);
    sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum);

    // extract the low lane and the high lane and add the results
    *((int *)SSE) =
        _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) +
        _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1));

    *((int *)Sum) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) +
                    _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
  }
}
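
// Note that, despite its name, the loop above covers a 32-wide by 16-high
// region (16 iterations of one row each), so callers tile it vertically to
// cover a full block. An illustrative sketch of such a wrapper (hypothetical,
// not from this file) that accumulates a complete 32x32 block:
static void variance32x32_tiled_sketch(const unsigned char *src,
                                       int src_stride,
                                       const unsigned char *ref,
                                       int ref_stride, unsigned int *sse,
                                       int *sum) {
  int i;
  *sse = 0;
  *sum = 0;
  for (i = 0; i < 32; i += 16) {  // two 32x16 halves
    unsigned int sse_half;
    int sum_half;
    vpx_get32x32var_avx2(src + i * src_stride, src_stride,
                         ref + i * ref_stride, ref_stride, &sse_half,
                         &sum_half);
    *sse += sse_half;
    *sum += sum_half;
  }
}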

#define FILTER_SRC(filter)                               \
  /* filter the source */                                \
  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
                                                         \
  /* add 8 to source */                                  \
  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);        \
  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);        \
                                                         \
  /* divide source by 16 */                              \
  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);         \
  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);

#define MERGE_WITH_SRC(src_reg, reg)               \
  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);

#define LOAD_SRC_DST                                    \
  /* load source and destination */                     \
  src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
  dst_reg = _mm256_loadu_si256((__m256i const *)(dst));

#define AVG_NEXT_SRC(src_reg, size_stride)                                 \
  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
  /* average between current and next stride source */                     \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);

#define MERGE_NEXT_SRC(src_reg, size_stride)                               \
  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
  MERGE_WITH_SRC(src_reg, src_next_reg)

#define CALC_SUM_SSE_INSIDE_LOOP                          \
  /* expand each byte to 2 bytes */                       \
  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);   \
  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);   \
  /* source - dest */                                     \
  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);  \
  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);  \
  /* calculate sum */                                     \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);        \
  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);        \
  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
  /* calculate sse */                                     \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);        \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);

// final calculation of sum and sse
#define CALC_SUM_AND_SSE                                                   \
  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg);                         \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 8);                              \
  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp);                    \
  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp);                    \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi);                      \
                                                                           \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 4);                              \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 8);                              \
                                                                           \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
  *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) +     \
                  _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 4);                              \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) +               \
        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
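
// CALC_SUM_AND_SSE widens the signed 16-bit row sums to 32 bits by
// interleaving each value with a mask that is all ones exactly when the
// value is negative (_mm256_cmpgt_epi16 against zero); that interleave is
// sign extension. The remaining shifts and adds are a horizontal reduction
// of both accumulators. A scalar sketch of the sign-extension step
// (illustrative only, not part of libvpx):
static int32_t sign_extend_16_sketch(int16_t v) {
  // mask mirrors _mm256_cmpgt_epi16(zero, v): all ones when v is negative
  const uint32_t mask = (0 > v) ? 0xFFFF0000u : 0;
  return (int32_t)((uint16_t)v | mask);
}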

unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
                                             int x_offset, int y_offset,
                                             const uint8_t *dst, int dst_stride,
                                             int height, unsigned int *sse) {
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 0 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
    // x_offset = 8 and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 8 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load the source and the source shifted by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous and current row averages
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
      // x_offset = 8 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and the source shifted by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
    // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the 16-bit results back to 8 bits within each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous packed row with the current one
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src_pack = src_reg;
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and the source shifted by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the 16-bit results back to 8 bits within each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // interleave the previous packed row with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter in the vertical direction
        FILTER_SRC(yfilter)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  return sum;
}
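
// A typical caller derives the final variance from the accumulated SSE and
// the returned sum. An illustrative sketch for the 32x32 case (the wrapper
// name is hypothetical; 32 * 32 = 1024 pixels, hence the shift by 10):
static unsigned int sub_pixel_variance32x32_sketch(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, unsigned int *sse) {
  const int se = (int)vpx_sub_pixel_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse);
  // variance = SSE - sum^2 / N, with N a power of two
  return *sse - (unsigned int)(((int64_t)se * se) >> 10);
}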

unsigned int vpx_sub_pixel_avg_variance32xh_avx2(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
    int height, unsigned int *sse) {
  __m256i sec_reg;
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
    // x_offset = 8 and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 8 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load the source and the source shifted by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous and current row averages
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
      // x_offset = 8 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and the source shifted by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
    // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        MERGE_WITH_SRC(src_reg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the 16-bit results back to 8 bits within each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous packed row with the current one
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and the source shifted by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the 16-bit results back to 8 bits within each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // interleave the previous packed row with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter in the vertical direction
        FILTER_SRC(yfilter)
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  return sum;
}
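
// The avg variant above differs from vpx_sub_pixel_variance32xh_avx2 only in
// that each filtered source pixel is averaged with the corresponding byte of
// the second predictor `sec` before the difference against `dst` is taken,
// so it measures the variance of the compound prediction. A scalar sketch of
// that extra step (illustrative only; rounds up on ties, matching
// _mm256_avg_epu8):
static uint8_t compound_pred_scalar_sketch(uint8_t filtered_src,
                                           uint8_t sec_pixel) {
  return (uint8_t)((filtered_src + sec_pixel + 1) >> 1);
}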