/*
 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h> /* AVX2 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"

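// 16-tap (p7..q7) horizontal loop filters, AVX2.
// vpx_lpf_horizontal_16_avx2() filters an 8-pixel-wide edge, packing the
// p-side and q-side rows into single 128-bit registers so both sides are
// filtered at once.  vpx_lpf_horizontal_16_dual_avx2() filters a
// 16-pixel-wide edge, widening pixels to 16-bit words across both 128-bit
// lanes of a 256-bit register for the flat-filter arithmetic.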
void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
                                const unsigned char *_blimit,
                                const unsigned char *_limit,
                                const unsigned char *_thresh) {
  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
  __m128i abs_p1p0;

  const __m128i thresh =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
  const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
  const __m128i blimit =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));

  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
  q4p4 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
  q3p3 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
  q2p2 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
  q1p1 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
  p1q1 = _mm_shuffle_epi32(q1p1, 78);  // 78 == _MM_SHUFFLE(1, 0, 3, 2)
  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
  q0p0 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
  p0q0 = _mm_shuffle_epi32(q0p0, 78);  // swap the 64-bit halves
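
  // Each qNpN register holds the p-side row in its low 8 bytes and the
  // q-side row in its high 8 bytes; p1q1/p0q0 are the same data with the
  // halves swapped so that p/q differences can be formed in one operation.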

  {
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
    abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1));
    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
    fe = _mm_set1_epi8((int8_t)0xfe);
    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0));
    abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1));
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(abs_p1p0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;

    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), _mm_subs_epu8(q1p1, q2p2)),
        _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }
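
  // At this point mask is 0xff for columns that will be filtered: in scalar
  // terms, abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit and every
  // neighbouring-sample difference from abs(p3 - p2) to abs(q1 - q0) is
  // <= limit (cf. filter_mask() in vpx_dsp/loopfilter.c).  hev is 0xff
  // where abs(p1 - p0) or abs(q1 - q0) exceeds thresh.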

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
    const __m128i t1 = _mm_set1_epi16(0x1);
    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
    __m128i qs0 = _mm_xor_si128(p0q0, t80);
    __m128i qs1 = _mm_xor_si128(p1q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;

    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, qs0ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

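    // No 8-bit arithmetic shift exists in SSE/AVX2, so the signed >> 3 is
    // done in 16-bit lanes: unpacklo(zero, x) places each byte in the high
    // byte of a word, and an arithmetic shift by 11 (8 + 3) brings it back
    // down with the sign extended.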
    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 0xB);
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 0xB);

    /* Filter1 >> 3 */
    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);

    /* filt >> 1 */
    filt = _mm_adds_epi16(filter1, t1);
    filt = _mm_srai_epi16(filt, 1);
    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
                            filt);
    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
    // loopfilter done
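    // qs0ps0/qs1ps1 now hold the 4-tap filtered p0/q0 and p1/q1 values;
    // below they are committed only where the flat masks do not select the
    // stronger 7-tap or 15-tap outputs.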

    {
      __m128i work;
      flat = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), _mm_subs_epu8(q0p0, q2p2)),
          _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), _mm_subs_epu8(q0p0, q3p3)));
      flat = _mm_max_epu8(abs_p1p0, flat);
      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
      q5p5 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));

      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
      q6p6 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));

      flat2 = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)),
          _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5)));

      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
      q7p7 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));

      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)),
          _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), _mm_subs_epu8(q0p0, q7p7)));

      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
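    // The 7-tap (flat) and 15-tap (flat2) outputs are computed at 16-bit
    // precision.  Each output is a rounded average of a window centred on
    // the edge; instead of re-summing every window, running sums
    // (pixelFilter_*, pixetFilter_*) are kept and slid one tap per output
    // by subtracting the trailing sample and adding the leading one.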
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i four = _mm_set1_epi16(4);
      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
      __m128i pixelFilter_p, pixelFilter_q;
      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
      q7_16 = _mm_unpackhi_epi8(q7p7, zero);

      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
                                    _mm_add_epi16(p4_16, p3_16));
      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
                                    _mm_add_epi16(q4_16, q3_16));

      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
      pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
      pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
      pixelFilter_p =
          _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
      pixetFilter_p2p1p0 = _mm_add_epi16(
          four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);

      flat_q0p0 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(p7_16, p7_16);
      sum_q7 = _mm_add_epi16(q7_16, q7_16);
      sum_p3 = _mm_add_epi16(p3_16, p3_16);
      sum_q3 = _mm_add_epi16(q3_16, q3_16);

      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
      flat_q1p1 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
      sum_q3 = _mm_add_epi16(sum_q3, q3_16);

      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);

      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
      flat_q2p2 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    flat = _mm_shuffle_epi32(flat, 68);
    flat2 = _mm_shuffle_epi32(flat2, 68);

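    // Commit with bitwise selects, result = (m & strong) | (~m & weak):
    // flat picks the 7-tap output over the 4-tap output, then flat2 picks
    // the 15-tap output over that.  The shuffle immediate 68 ==
    // _MM_SHUFFLE(1, 0, 1, 0) broadcasts each reduced 64-bit mask to both
    // halves so it covers the p and q rows alike.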
    q2p2 = _mm_andnot_si128(flat, q2p2);
    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat_q2p2);

    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);

    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);

    q6p6 = _mm_andnot_si128(flat2, q6p6);
    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));

    q5p5 = _mm_andnot_si128(flat2, q5p5);
    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));

    q4p4 = _mm_andnot_si128(flat2, q4p4);
    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));

    q3p3 = _mm_andnot_si128(flat2, q3p3);
    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));

    q2p2 = _mm_andnot_si128(flat2, q2p2);
    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));

    q1p1 = _mm_andnot_si128(flat2, q1p1);
    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));

    q0p0 = _mm_andnot_si128(flat2, q0p0);
    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
  }
}
364 :
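// Shuffle control for _mm256_shuffle_epi8 that zero-extends packed bytes to
// 16-bit words within each 128-bit lane: even destination bytes select a
// source byte, odd destination bytes have bit 7 set (128), which pshufb
// defines to write zero.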
DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
  0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128,
  8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
};
369 :
void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
                                     const unsigned char *_blimit,
                                     const unsigned char *_limit,
                                     const unsigned char *_thresh) {
  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  __m128i p7, p6, p5;
  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
  __m128i q5, q6, q7;
  __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4,
      p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;

  const __m128i thresh =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
  const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
  const __m128i blimit =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));

  p256_4 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p)));
  p256_3 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));
  p256_2 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
  p256_1 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
  p256_0 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
  q256_0 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
  q256_1 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
  q256_2 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
  q256_3 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));
  q256_4 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p)));

  p4 = _mm256_castsi256_si128(p256_4);
  p3 = _mm256_castsi256_si128(p256_3);
  p2 = _mm256_castsi256_si128(p256_2);
  p1 = _mm256_castsi256_si128(p256_1);
  p0 = _mm256_castsi256_si128(p256_0);
  q0 = _mm256_castsi256_si128(q256_0);
  q1 = _mm256_castsi256_si128(q256_1);
  q2 = _mm256_castsi256_si128(q256_2);
  q3 = _mm256_castsi256_si128(q256_3);
  q4 = _mm256_castsi256_si128(q256_4);
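
  // _mm256_broadcast_pd() reads one unaligned 16-byte row into both 128-bit
  // lanes: the low lane (extracted above) drives the 128-bit mask and 4-tap
  // filter math, while the full register is later shuffled into 16-bit
  // words for the flat filters without touching memory again.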

  {
    const __m128i abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 =
        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
    const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    __m128i abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
    __m128i work;
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
    const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    __m128i ps1 = _mm_xor_si128(p1, t80);
    __m128i ps0 = _mm_xor_si128(p0, t80);
    __m128i qs0 = _mm_xor_si128(q0, t80);
    __m128i qs1 = _mm_xor_si128(q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
        flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, flat2_q5,
        flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

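    /* There is no 8-bit arithmetic shift in SSE/AVX2; emulate the signed
     * >> 3 with a logical 16-bit shift plus a sign fix-up that re-sets the
     * shifted-out high bits (te0) for negative bytes.  The >> 1 step below
     * repairs its sign bit with t80 the same way. */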
    /* Filter1 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);
    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);

    /* Filter2 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);
    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);

    /* filt >> 1 */
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);
    filt = _mm_andnot_si128(hev, filt);
    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    // loopfilter done

    {
      __m128i work;
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
          _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
      flat = _mm_max_epu8(work, flat);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
          _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
      flat = _mm_max_epu8(work, flat);
      // Note: this p4/q4 term belongs to the wide mask; it is folded into
      // flat2 below, not into flat.
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
          _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      p256_5 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s - 6 * p)));
      q256_5 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s + 5 * p)));
      p5 = _mm256_castsi256_si128(p256_5);
      q5 = _mm256_castsi256_si128(q256_5);
      flat2 = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
          _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));

      flat2 = _mm_max_epu8(work, flat2);
      p256_6 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s - 7 * p)));
      q256_6 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s + 6 * p)));
      p6 = _mm256_castsi256_si128(p256_6);
      q6 = _mm256_castsi256_si128(q256_6);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
          _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));

      flat2 = _mm_max_epu8(work, flat2);

      p256_7 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s - 8 * p)));
      q256_7 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s + 7 * p)));
      p7 = _mm256_castsi256_si128(p256_7);
      q7 = _mm256_castsi256_si128(q256_7);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
          _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));

      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    {
      const __m256i eight = _mm256_set1_epi16(8);
      const __m256i four = _mm256_set1_epi16(4);
      __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
          pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

      const __m256i filter =
          _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);
      p256_7 = _mm256_shuffle_epi8(p256_7, filter);
      p256_6 = _mm256_shuffle_epi8(p256_6, filter);
      p256_5 = _mm256_shuffle_epi8(p256_5, filter);
      p256_4 = _mm256_shuffle_epi8(p256_4, filter);
      p256_3 = _mm256_shuffle_epi8(p256_3, filter);
      p256_2 = _mm256_shuffle_epi8(p256_2, filter);
      p256_1 = _mm256_shuffle_epi8(p256_1, filter);
      p256_0 = _mm256_shuffle_epi8(p256_0, filter);
      q256_0 = _mm256_shuffle_epi8(q256_0, filter);
      q256_1 = _mm256_shuffle_epi8(q256_1, filter);
      q256_2 = _mm256_shuffle_epi8(q256_2, filter);
      q256_3 = _mm256_shuffle_epi8(q256_3, filter);
      q256_4 = _mm256_shuffle_epi8(q256_4, filter);
      q256_5 = _mm256_shuffle_epi8(q256_5, filter);
      q256_6 = _mm256_shuffle_epi8(q256_6, filter);
      q256_7 = _mm256_shuffle_epi8(q256_7, filter);
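
      // Every 256-bit register now holds its 16 pixels zero-extended to
      // 16-bit words (pixels 0-7 in the low lane, 8-15 in the high lane),
      // giving the flat-filter sums plenty of headroom.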

      pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
                                       _mm256_add_epi16(p256_4, p256_3));
      pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
                                       _mm256_add_epi16(q256_4, q256_3));

      pixetFilter_p2p1p0 =
          _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1));
      pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

      pixetFilter_q2q1q0 =
          _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1));
      pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);

      pixelFilter_p = _mm256_add_epi16(
          eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q));

      pixetFilter_p2p1p0 = _mm256_add_epi16(
          four, _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));

      // Pack each 16-bit result back to bytes and gather pixels 0-15 into
      // the low 128 bits: permute immediate 168 == _MM_SHUFFLE(2, 2, 2, 0)
      // selects the packed low lane, then the packed high lane.
      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(p256_7, p256_0)), 4);
      flat2_p0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(q256_7, q256_0)), 4);
      flat2_q0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
                                             _mm256_add_epi16(p256_3, p256_0)),
                            3);
      flat_p0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
      res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
                                             _mm256_add_epi16(q256_3, q256_0)),
                            3);
      flat_q0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(p256_7, p256_7);
      sum_q7 = _mm256_add_epi16(q256_7, q256_7);
      sum_p3 = _mm256_add_epi16(p256_3, p256_3);
      sum_q3 = _mm256_add_epi16(q256_3, q256_3);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);
      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);
      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_1)), 4);
      flat2_p1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_1)), 4);
      flat2_q1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);
      pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);
      res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
                                             _mm256_add_epi16(sum_p3, p256_1)),
                            3);
      flat_p1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
      res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0,
                                             _mm256_add_epi16(sum_q3, q256_1)),
                            3);
      flat_q1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
      sum_p3 = _mm256_add_epi16(sum_p3, p256_3);
      sum_q3 = _mm256_add_epi16(sum_q3, q256_3);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);
      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);
      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_2)), 4);
      flat2_p2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_2)), 4);
      flat2_q2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);
      pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);
      res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
                                             _mm256_add_epi16(sum_p3, p256_2)),
                            3);
      flat_p2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
      res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0,
                                             _mm256_add_epi16(sum_q3, q256_2)),
                            3);
      flat_q2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);
      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);
      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_3)), 4);
      flat2_p3 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_3)), 4);
      flat2_q3 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);
      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);
      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_4)), 4);
      flat2_p4 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_4)), 4);
      flat2_q4 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);
      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);
      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_5)), 4);
      flat2_p5 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_5)), 4);
      flat2_q5 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);
      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);
      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_6)), 4);
      flat2_p6 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_6)), 4);
      flat2_q6 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
    }

    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
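    // Bitwise selects against the nested masks (flat2 & flat & mask):
    // p2..q2 are blended with the 7-tap results under flat, then p6..q6
    // (re-blending p2..q2) with the 15-tap results under flat2, and each
    // row is stored back 16 pixels at a time.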

    p2 = _mm_andnot_si128(flat, p2);
    flat_p2 = _mm_and_si128(flat, flat_p2);
    p2 = _mm_or_si128(flat_p2, p2);

    p1 = _mm_andnot_si128(flat, ps1);
    flat_p1 = _mm_and_si128(flat, flat_p1);
    p1 = _mm_or_si128(flat_p1, p1);

    p0 = _mm_andnot_si128(flat, ps0);
    flat_p0 = _mm_and_si128(flat, flat_p0);
    p0 = _mm_or_si128(flat_p0, p0);

    q0 = _mm_andnot_si128(flat, qs0);
    flat_q0 = _mm_and_si128(flat, flat_q0);
    q0 = _mm_or_si128(flat_q0, q0);

    q1 = _mm_andnot_si128(flat, qs1);
    flat_q1 = _mm_and_si128(flat, flat_q1);
    q1 = _mm_or_si128(flat_q1, q1);

    q2 = _mm_andnot_si128(flat, q2);
    flat_q2 = _mm_and_si128(flat, flat_q2);
    q2 = _mm_or_si128(flat_q2, q2);

    p6 = _mm_andnot_si128(flat2, p6);
    flat2_p6 = _mm_and_si128(flat2, flat2_p6);
    p6 = _mm_or_si128(flat2_p6, p6);
    _mm_storeu_si128((__m128i *)(s - 7 * p), p6);

    p5 = _mm_andnot_si128(flat2, p5);
    flat2_p5 = _mm_and_si128(flat2, flat2_p5);
    p5 = _mm_or_si128(flat2_p5, p5);
    _mm_storeu_si128((__m128i *)(s - 6 * p), p5);

    p4 = _mm_andnot_si128(flat2, p4);
    flat2_p4 = _mm_and_si128(flat2, flat2_p4);
    p4 = _mm_or_si128(flat2_p4, p4);
    _mm_storeu_si128((__m128i *)(s - 5 * p), p4);

    p3 = _mm_andnot_si128(flat2, p3);
    flat2_p3 = _mm_and_si128(flat2, flat2_p3);
    p3 = _mm_or_si128(flat2_p3, p3);
    _mm_storeu_si128((__m128i *)(s - 4 * p), p3);

    p2 = _mm_andnot_si128(flat2, p2);
    flat2_p2 = _mm_and_si128(flat2, flat2_p2);
    p2 = _mm_or_si128(flat2_p2, p2);
    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);

    p1 = _mm_andnot_si128(flat2, p1);
    flat2_p1 = _mm_and_si128(flat2, flat2_p1);
    p1 = _mm_or_si128(flat2_p1, p1);
    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);

    p0 = _mm_andnot_si128(flat2, p0);
    flat2_p0 = _mm_and_si128(flat2, flat2_p0);
    p0 = _mm_or_si128(flat2_p0, p0);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);

    q0 = _mm_andnot_si128(flat2, q0);
    flat2_q0 = _mm_and_si128(flat2, flat2_q0);
    q0 = _mm_or_si128(flat2_q0, q0);
    _mm_storeu_si128((__m128i *)(s - 0 * p), q0);

    q1 = _mm_andnot_si128(flat2, q1);
    flat2_q1 = _mm_and_si128(flat2, flat2_q1);
    q1 = _mm_or_si128(flat2_q1, q1);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);

    q2 = _mm_andnot_si128(flat2, q2);
    flat2_q2 = _mm_and_si128(flat2, flat2_q2);
    q2 = _mm_or_si128(flat2_q2, q2);
    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);

    q3 = _mm_andnot_si128(flat2, q3);
    flat2_q3 = _mm_and_si128(flat2, flat2_q3);
    q3 = _mm_or_si128(flat2_q3, q3);
    _mm_storeu_si128((__m128i *)(s + 3 * p), q3);

    q4 = _mm_andnot_si128(flat2, q4);
    flat2_q4 = _mm_and_si128(flat2, flat2_q4);
    q4 = _mm_or_si128(flat2_q4, q4);
    _mm_storeu_si128((__m128i *)(s + 4 * p), q4);

    q5 = _mm_andnot_si128(flat2, q5);
    flat2_q5 = _mm_and_si128(flat2, flat2_q5);
    q5 = _mm_or_si128(flat2_q5, q5);
    _mm_storeu_si128((__m128i *)(s + 5 * p), q5);

    q6 = _mm_andnot_si128(flat2, q6);
    flat2_q6 = _mm_and_si128(flat2, flat2_q6);
    q6 = _mm_or_si128(flat2_q6, q6);
    _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
  }
}