Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <emmintrin.h> // SSE2
13 :
14 : #include "./aom_dsp_rtcd.h"
15 : #include "aom_ports/mem.h"
16 : #include "aom_ports/emmintrin_compat.h"
17 :
// Per-lane absolute difference of 16 unsigned bytes: |a - b| for each lane.
// max(a, b) - min(a, b) cannot underflow, so a plain byte subtract is exact
// (equivalent to OR-ing the two saturating subtractions).
static INLINE __m128i abs_diff(__m128i a, __m128i b) {
  const __m128i hi = _mm_max_epu8(a, b);
  const __m128i lo = _mm_min_epu8(a, b);
  return _mm_sub_epi8(hi, lo);
}
21 :
#if CONFIG_PARALLEL_DEBLOCKING
// filter_mask and hev_mask
//
// 4-tap (p1..q1 only) variant used by the parallel-deblocking path.
// Expects in the caller's scope: q1p1, q0p0, p1p0, q1q0 (pixel pairs packed
// 8+8 bytes per register), zero, thresh (16-bit lanes) and limit (blimit in
// the low 8 bytes, limit in the high 8 bytes).
// Writes two byte-mask outputs into the caller's scope:
//   hev  - lanes where max(|p1-p0|, |q1-q0|) > thresh
//   mask - lanes passing both |p0-q0|*2 + |p1-q1|/2 <= blimit and
//          max(|p1-p0|, |q1-q0|) <= limit (both tests folded into one
//          _mm_subs_epu8/_mm_cmpeq_epi8 against the packed limit register).
#define FILTER_HEV_MASK4                                                      \
  do {                                                                        \
    /* (abs(q1 - q0), abs(p1 - p0) */                                         \
    __m128i flat = abs_diff(q1p1, q0p0);                                      \
    /* abs(p1 - q1), abs(p0 - q0) */                                          \
    const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);                        \
    __m128i abs_p0q0, abs_p1q1;                                               \
                                                                              \
    /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */       \
    hev =                                                                     \
        _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
    hev = _mm_cmpgt_epi16(hev, thresh);                                       \
    hev = _mm_packs_epi16(hev, hev);                                          \
                                                                              \
    /* const int8_t mask = filter_mask2(*limit, *blimit, */                   \
    /*                                  p1, p0, q0, q1); */                   \
    abs_p0q0 =                                                                \
        _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */     \
    abs_p1q1 =                                                                \
        _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */     \
    abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);                                   \
    abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */    \
    /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */                                 \
    mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);                                 \
    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));                       \
    mask = _mm_unpacklo_epi64(mask, flat);                                    \
    mask = _mm_subs_epu8(mask, limit);                                        \
    mask = _mm_cmpeq_epi8(mask, zero);                                        \
    /* combine the blimit (low half) and limit (high half) verdicts */        \
    mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));                      \
  } while (0)
#endif  // CONFIG_PARALLEL_DEBLOCKING
55 :
// filter_mask and hev_mask
//
// 8-tap (p3..q3) mask builder.  Expects in the caller's scope: q1p1, q0p0,
// p1p0, q1q0, p3p2, p2p1, q3q2, q2q1 (pixel pairs packed 8+8 bytes per
// register), zero, thresh (16-bit lanes) and limit (blimit in the low
// 8 bytes, limit in the high 8 bytes).
// Writes two byte-mask outputs into the caller's scope:
//   hev  - lanes where max(|p1-p0|, |q1-q0|) > thresh
//   mask - lanes passing the blimit test plus
//          max over all neighbour differences up to p3/q3 <= limit.
#define FILTER_HEV_MASK                                                       \
  do {                                                                        \
    /* (abs(q1 - q0), abs(p1 - p0) */                                         \
    __m128i flat = abs_diff(q1p1, q0p0);                                      \
    /* abs(p1 - q1), abs(p0 - q0) */                                          \
    const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);                        \
    __m128i abs_p0q0, abs_p1q1, work;                                         \
                                                                              \
    /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */       \
    hev =                                                                     \
        _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
    hev = _mm_cmpgt_epi16(hev, thresh);                                       \
    hev = _mm_packs_epi16(hev, hev);                                          \
                                                                              \
    /* const int8_t mask = filter_mask(*limit, *blimit, */                    \
    /*                                 p3, p2, p1, p0, q0, q1, q2, q3); */    \
    abs_p0q0 =                                                                \
        _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */     \
    abs_p1q1 =                                                                \
        _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */     \
    abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);                                   \
    abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */    \
    /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */                                 \
    mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);                                 \
    /* abs(p3 - p2), abs(p2 - p1) */                                          \
    work = abs_diff(p3p2, p2p1);                                              \
    flat = _mm_max_epu8(work, flat);                                          \
    /* abs(q3 - q2), abs(q2 - q1) */                                          \
    work = abs_diff(q3q2, q2q1);                                              \
    flat = _mm_max_epu8(work, flat);                                          \
    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));                       \
    mask = _mm_unpacklo_epi64(mask, flat);                                    \
    mask = _mm_subs_epu8(mask, limit);                                        \
    mask = _mm_cmpeq_epi8(mask, zero);                                        \
    /* combine the blimit (low half) and limit (high half) verdicts */        \
    mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));                      \
  } while (0)
93 :
// 4-tap filter application.  Expects in the caller's scope: p1p0, q1q0
// (pixel pairs packed 8+8 bytes), hev and mask (from FILTER_HEV_MASK*), and
// ff (all-ones, used as -1 so subtracting it adds 1 for rounding).
// Writes the filtered, re-biased pixels into the caller's ps1ps0 / qs1qs0.
// Note: 'hev' is clobbered at the end (reused to hold the p-side adjust).
#define FILTER4                                                            \
  do {                                                                     \
    const __m128i t3t4 =                                                   \
        _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);     \
    const __m128i t80 = _mm_set1_epi8(0x80);                               \
    __m128i filter, filter2filter1, work;                                  \
                                                                           \
    ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */                        \
    qs1qs0 = _mm_xor_si128(q1q0, t80);                                     \
                                                                           \
    /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */              \
    work = _mm_subs_epi8(ps1ps0, qs1qs0);                                  \
    filter = _mm_and_si128(_mm_srli_si128(work, 8), hev);                  \
    /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */     \
    filter = _mm_subs_epi8(filter, work);                                  \
    filter = _mm_subs_epi8(filter, work);                                  \
    filter = _mm_subs_epi8(filter, work);  /* + 3 * (qs0 - ps0) */         \
    filter = _mm_and_si128(filter, mask);  /* & mask */                    \
    filter = _mm_unpacklo_epi64(filter, filter);                           \
                                                                           \
    /* filter1 = signed_char_clamp(filter + 4) >> 3; */                    \
    /* filter2 = signed_char_clamp(filter + 3) >> 3; */                    \
    filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */  \
    filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1);            \
    filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1);    \
    filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */        \
    filter = _mm_srai_epi16(filter, 11);                 /* >> 3 */        \
    filter2filter1 = _mm_packs_epi16(filter2filter1, filter);              \
                                                                           \
    /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */                  \
    filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 (ff == -1) */       \
    filter = _mm_unpacklo_epi8(filter, filter);                            \
    filter = _mm_srai_epi16(filter, 9); /* round */                        \
    filter = _mm_packs_epi16(filter, filter);                              \
    filter = _mm_andnot_si128(hev, filter);                                \
                                                                           \
    hev = _mm_unpackhi_epi64(filter2filter1, filter);                      \
    filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter);           \
                                                                           \
    /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */\
    qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1);                        \
    /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */\
    ps1ps0 = _mm_adds_epi8(ps1ps0, hev);                                   \
    qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */                      \
    ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */                      \
  } while (0)
140 :
// 4-tap horizontal loop filter over 8 pixels (SSE2).
// s points at the first row below the edge (q0); p is the stride in bytes.
// blimit and limit are packed into one register (blimit in the low 8 bytes,
// limit in the high 8 bytes) to match the combined test in FILTER_HEV_MASK*;
// thresh is widened to 16-bit lanes because the hev test uses
// _mm_cmpgt_epi16.  Modifies the two rows on each side of the edge
// (p1, p0, q0, q1).
void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
                               const uint8_t *_blimit, const uint8_t *_limit,
                               const uint8_t *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
                         _mm_loadl_epi64((const __m128i *)_limit));
  const __m128i thresh =
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
  const __m128i ff = _mm_cmpeq_epi8(zero, zero);  // all ones (-1 per lane)
#if !CONFIG_PARALLEL_DEBLOCKING
  __m128i p3p2, p2p1, q3q2, q2q1;
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0;
  __m128i mask, hev;
  // Load each needed row as 8 bytes and pack two rows per register.
#if !CONFIG_PARALLEL_DEBLOCKING
  p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
                            _mm_loadl_epi64((__m128i *)(s - 4 * p)));
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 0 * p)));
#if !CONFIG_PARALLEL_DEBLOCKING
  q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
#if !CONFIG_PARALLEL_DEBLOCKING
  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
#endif  // !CONFIG_PARALLEL_DEBLOCKING
#if !CONFIG_PARALLEL_DEBLOCKING
  FILTER_HEV_MASK;
#else   // CONFIG_PARALLEL_DEBLOCKING
  FILTER_HEV_MASK4;
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  FILTER4;

  // Write back the four filtered rows from the packed results.
  _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0));  // *op1
  _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);               // *op0
  _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);               // *oq0
  _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0));  // *oq1
}
186 :
// 4-tap vertical loop filter over 8 pixels (SSE2).
// s points at the first column right of the edge; p is the stride in bytes.
// Reads an 8x8 tile starting 4 columns left of the edge, transposes it into
// the same packed-row layout the horizontal filter uses, runs the shared
// FILTER_HEV_MASK*/FILTER4 kernels, then transposes the 4 modified columns
// back and stores them 4 bytes per row.
void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
                             const uint8_t *_blimit, const uint8_t *_limit,
                             const uint8_t *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  // blimit in the low 8 bytes, limit in the high 8 bytes (combined test).
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
                         _mm_loadl_epi64((const __m128i *)_limit));
  // thresh widened to 16-bit lanes for the _mm_cmpgt_epi16 hev test.
  const __m128i thresh =
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
  const __m128i ff = _mm_cmpeq_epi8(zero, zero);  // all ones (-1 per lane)
  __m128i x0, x1, x2, x3;
#if !CONFIG_PARALLEL_DEBLOCKING
  __m128i p3p2, p2p1, q3q2, q2q1;
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0;
  __m128i mask, hev;

  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)),
                           _mm_loadl_epi64((__m128i *)(s + 1 * p - 4)));

  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)),
                         _mm_loadl_epi64((__m128i *)(s + 3 * p - 4)));

  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
  x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)),
                         _mm_loadl_epi64((__m128i *)(s + 5 * p - 4)));

  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
  x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)),
                         _mm_loadl_epi64((__m128i *)(s + 7 * p - 4)));

  // Transpose 8x8
  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  p1p0 = _mm_unpacklo_epi16(q1q0, x1);
  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
  x0 = _mm_unpacklo_epi16(x2, x3);
#if !CONFIG_PARALLEL_DEBLOCKING
  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  p3p2 = _mm_unpacklo_epi32(p1p0, x0);
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  p1p0 = _mm_unpackhi_epi32(p1p0, x0);
#if !CONFIG_PARALLEL_DEBLOCKING
  p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8));  // swap lo and high
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8));  // swap lo and high

  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
  q1q0 = _mm_unpackhi_epi16(q1q0, x1);
  // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
  x2 = _mm_unpackhi_epi16(x2, x3);
#if !CONFIG_PARALLEL_DEBLOCKING
  // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
  q3q2 = _mm_unpackhi_epi32(q1q0, x2);
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
  q1q0 = _mm_unpacklo_epi32(q1q0, x2);

  // Repack into the q_x-high / p_x-low layout used by the shared kernels.
  q0p0 = _mm_unpacklo_epi64(p1p0, q1q0);
  q1p1 = _mm_unpackhi_epi64(p1p0, q1q0);
  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
#if !CONFIG_PARALLEL_DEBLOCKING
  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
#endif  // !CONFIG_PARALLEL_DEBLOCKING
#if !CONFIG_PARALLEL_DEBLOCKING
  FILTER_HEV_MASK;
#else   // CONFIG_PARALLEL_DEBLOCKING
  FILTER_HEV_MASK4;
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  FILTER4;

  // Transpose 8x4 to 4x8
  // qs1qs0: 20 21 22 23 24 25 26 27  30 31 32 33 34 34 36 37
  // ps1ps0: 10 11 12 13 14 15 16 17  00 01 02 03 04 05 06 07
  // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
  ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8));
  // 10 30 11 31 12 32 13 33  14 34 15 35 16 36 17 37
  x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0);
  // 00 20 01 21 02 22 03 23  04 24 05 25 06 26 07 27
  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0);
  // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
  qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0);
  // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);

  // Store 4 bytes (p1 p0 q0 q1) per row, rows 0-3 from ps1ps0, 4-7 from
  // qs1qs0, shifting the next 4-byte group down after each store.
  *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
  *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
  *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
  *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0);

  *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
  *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
  *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
  *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
}
291 :
// Wide (15-tap) horizontal loop filter applied to 8 pixels across one
// horizontal edge.  Each qXpX register packs row s+X*p in its high 64 bits
// and row s-(X+1)*p in its low 64 bits, so one SSE2 op handles the p and q
// sides together.  _blimit/_limit/_thresh must be 16-byte aligned
// (loaded with _mm_load_si128).  Three nested decisions select the output
// per pixel: 'mask' (filter at all), 'flat' (use 7-tap flat filter on
// p2..q2) and 'flat2' (use the 15-tap wide filter on p6..q6).
void aom_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,
                                    const unsigned char *_blimit,
                                    const unsigned char *_limit,
                                    const unsigned char *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
  __m128i mask, hev, flat, flat2;
  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
  __m128i abs_p1p0;

  // Load p4..p0 into the low halves and q4..q0 into the high halves.
  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
  q4p4 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
  q3p3 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
  q2p2 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
  q1p1 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
  p1q1 = _mm_shuffle_epi32(q1p1, 78);  // swap 64-bit halves
  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
  q0p0 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
  p0q0 = _mm_shuffle_epi32(q0p0, 78);  // swap 64-bit halves

  // Build 'mask' (filter on/off) and 'hev' (high edge variance) per pixel.
  {
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
    abs_p1p0 = abs_diff(q1p1, q0p0);
    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
    fe = _mm_set1_epi8(0xfe);
    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    abs_p0q0 = abs_diff(q0p0, p0q0);
    abs_p1q1 = abs_diff(q1p1, p1q1);
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(abs_p1p0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;

    work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i t1 = _mm_set1_epi16(0x1);
    // Bias to signed range by flipping the sign bit.
    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
    __m128i qs0 = _mm_xor_si128(p0q0, t80);
    __m128i qs1 = _mm_xor_si128(p1q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;

    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, qs0ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (aom_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    // Widen to 16 bits (value in the high byte) so srai by 11 = byte >> 3
    // with correct sign extension.
    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 0xB);
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 0xB);

    // Filter1 >> 3
    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);

    // filt >> 1
    filt = _mm_adds_epi16(filter1, t1);
    filt = _mm_srai_epi16(filt, 1);
    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
                            filt);
    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
    // loopfilter done

    // Build 'flat' (p3..q3 close to p0/q0) and 'flat2' (p7..q7 close too),
    // loading the outer rows q5..q7/p5..p7 on the way.
    {
      __m128i work;
      flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
      flat = _mm_max_epu8(abs_p1p0, flat);
      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
      q5p5 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));

      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
      q6p6 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));
      flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));

      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
      q7p7 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));
      work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    // Running 16-bit sums updated incrementally: each successive output tap
    // adds one more copy of the far sample and swaps one inner sample.
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i four = _mm_set1_epi16(4);
      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
      __m128i pixelFilter_p, pixelFilter_q;
      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
      q7_16 = _mm_unpackhi_epi8(q7p7, zero);

      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
                                    _mm_add_epi16(p4_16, p3_16));
      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
                                    _mm_add_epi16(q4_16, q3_16));

      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
      pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
      pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
      pixelFilter_p =
          _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
      pixetFilter_p2p1p0 = _mm_add_epi16(
          four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);

      flat_q0p0 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(p7_16, p7_16);
      sum_q7 = _mm_add_epi16(q7_16, q7_16);
      sum_p3 = _mm_add_epi16(p3_16, p3_16);
      sum_q3 = _mm_add_epi16(q3_16, q3_16);

      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
      flat_q1p1 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
      sum_q3 = _mm_add_epi16(sum_q3, q3_16);

      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);

      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
      flat_q2p2 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    // Broadcast the low 64-bit mask halves so both packed halves are
    // selected consistently, then blend the three candidate results and
    // store each row pair.
    flat = _mm_shuffle_epi32(flat, 68);
    flat2 = _mm_shuffle_epi32(flat2, 68);

    q2p2 = _mm_andnot_si128(flat, q2p2);
    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat_q2p2);

    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);

    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);

    q6p6 = _mm_andnot_si128(flat2, q6p6);
    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));

    q5p5 = _mm_andnot_si128(flat2, q5p5);
    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));

    q4p4 = _mm_andnot_si128(flat2, q4p4);
    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));

    q3p3 = _mm_andnot_si128(flat2, q3p3);
    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));

    q2p2 = _mm_andnot_si128(flat2, q2p2);
    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));

    q1p1 = _mm_andnot_si128(flat2, q1p1);
    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));

    q0p0 = _mm_andnot_si128(flat2, q0p0);
    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
  }
}
623 :
// Incremental update of a running 16-bit-lane sum for the flat filters:
// returns total + a1 + a2 - s1 - s2 per lane (wrap-around arithmetic, so
// the regrouping below is exact).
static INLINE __m128i filter_add2_sub2(const __m128i *const total,
                                       const __m128i *const a1,
                                       const __m128i *const a2,
                                       const __m128i *const s1,
                                       const __m128i *const s2) {
  const __m128i adds = _mm_add_epi16(*a1, *a2);
  const __m128i subs = _mm_add_epi16(*s1, *s2);
  return _mm_sub_epi16(_mm_add_epi16(*total, adds), subs);
}
633 :
// Blend for the 7-tap flat filter: where *flat is all-ones take the
// rounded sums (f8_lo/f8_hi >> 3, packed back to bytes with unsigned
// saturation); elsewhere keep *other_filt.
static INLINE __m128i filter8_mask(const __m128i *const flat,
                                   const __m128i *const other_filt,
                                   const __m128i *const f8_lo,
                                   const __m128i *const f8_hi) {
  const __m128i lo = _mm_srli_epi16(*f8_lo, 3);
  const __m128i hi = _mm_srli_epi16(*f8_hi, 3);
  const __m128i f8 = _mm_packus_epi16(lo, hi);
  const __m128i from_f8 = _mm_and_si128(*flat, f8);
  const __m128i from_other = _mm_andnot_si128(*flat, *other_filt);
  return _mm_or_si128(from_f8, from_other);
}
643 :
// Blend for the 15-tap wide flat filter: where *flat is all-ones take the
// rounded sums (f_lo/f_hi >> 4, packed back to bytes with unsigned
// saturation); elsewhere keep *other_filt.
static INLINE __m128i filter16_mask(const __m128i *const flat,
                                    const __m128i *const other_filt,
                                    const __m128i *const f_lo,
                                    const __m128i *const f_hi) {
  const __m128i lo = _mm_srli_epi16(*f_lo, 4);
  const __m128i hi = _mm_srli_epi16(*f_hi, 4);
  const __m128i f = _mm_packus_epi16(lo, hi);
  const __m128i from_f = _mm_and_si128(*flat, f);
  const __m128i from_other = _mm_andnot_si128(*flat, *other_filt);
  return _mm_or_si128(from_f, from_other);
}
653 :
654 0 : void aom_lpf_horizontal_edge_16_sse2(unsigned char *s, int p,
655 : const unsigned char *_blimit,
656 : const unsigned char *_limit,
657 : const unsigned char *_thresh) {
658 0 : const __m128i zero = _mm_set1_epi16(0);
659 0 : const __m128i one = _mm_set1_epi8(1);
660 0 : const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
661 0 : const __m128i limit = _mm_load_si128((const __m128i *)_limit);
662 0 : const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
663 : __m128i mask, hev, flat, flat2;
664 : __m128i p7, p6, p5;
665 : __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
666 : __m128i q5, q6, q7;
667 :
668 : __m128i op2, op1, op0, oq0, oq1, oq2;
669 :
670 : __m128i max_abs_p1p0q1q0;
671 :
672 0 : p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
673 0 : p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
674 0 : p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
675 0 : p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
676 0 : p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
677 0 : p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
678 0 : p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
679 0 : p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
680 0 : q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
681 0 : q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
682 0 : q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
683 0 : q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
684 0 : q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
685 0 : q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
686 0 : q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
687 0 : q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
688 :
689 : {
690 0 : const __m128i abs_p1p0 = abs_diff(p1, p0);
691 0 : const __m128i abs_q1q0 = abs_diff(q1, q0);
692 0 : const __m128i fe = _mm_set1_epi8(0xfe);
693 0 : const __m128i ff = _mm_cmpeq_epi8(zero, zero);
694 0 : __m128i abs_p0q0 = abs_diff(p0, q0);
695 0 : __m128i abs_p1q1 = abs_diff(p1, q1);
696 : __m128i work;
697 0 : max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
698 :
699 0 : abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
700 0 : abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
701 0 : mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
702 0 : mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
703 : // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
704 0 : mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
705 : // mask |= (abs(p1 - p0) > limit) * -1;
706 : // mask |= (abs(q1 - q0) > limit) * -1;
707 0 : work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
708 0 : mask = _mm_max_epu8(work, mask);
709 0 : work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
710 0 : mask = _mm_max_epu8(work, mask);
711 0 : mask = _mm_subs_epu8(mask, limit);
712 0 : mask = _mm_cmpeq_epi8(mask, zero);
713 : }
714 :
715 : {
716 : __m128i work;
717 0 : work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
718 0 : flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
719 0 : work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
720 0 : flat = _mm_max_epu8(work, flat);
721 0 : work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
722 0 : flat = _mm_subs_epu8(flat, one);
723 0 : flat = _mm_cmpeq_epi8(flat, zero);
724 0 : flat = _mm_and_si128(flat, mask);
725 0 : flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
726 0 : flat2 = _mm_max_epu8(work, flat2);
727 0 : work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
728 0 : flat2 = _mm_max_epu8(work, flat2);
729 0 : work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0));
730 0 : flat2 = _mm_max_epu8(work, flat2);
731 0 : flat2 = _mm_subs_epu8(flat2, one);
732 0 : flat2 = _mm_cmpeq_epi8(flat2, zero);
733 0 : flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
734 : }
735 :
736 : // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
737 : // filter4
738 : {
739 0 : const __m128i t4 = _mm_set1_epi8(4);
740 0 : const __m128i t3 = _mm_set1_epi8(3);
741 0 : const __m128i t80 = _mm_set1_epi8(0x80);
742 0 : const __m128i te0 = _mm_set1_epi8(0xe0);
743 0 : const __m128i t1f = _mm_set1_epi8(0x1f);
744 0 : const __m128i t1 = _mm_set1_epi8(0x1);
745 0 : const __m128i t7f = _mm_set1_epi8(0x7f);
746 0 : const __m128i ff = _mm_cmpeq_epi8(t4, t4);
747 :
748 : __m128i filt;
749 : __m128i work_a;
750 : __m128i filter1, filter2;
751 :
752 0 : op1 = _mm_xor_si128(p1, t80);
753 0 : op0 = _mm_xor_si128(p0, t80);
754 0 : oq0 = _mm_xor_si128(q0, t80);
755 0 : oq1 = _mm_xor_si128(q1, t80);
756 :
757 0 : hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
758 0 : hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
759 0 : filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
760 :
761 0 : work_a = _mm_subs_epi8(oq0, op0);
762 0 : filt = _mm_adds_epi8(filt, work_a);
763 0 : filt = _mm_adds_epi8(filt, work_a);
764 0 : filt = _mm_adds_epi8(filt, work_a);
765 : // (aom_filter + 3 * (qs0 - ps0)) & mask
766 0 : filt = _mm_and_si128(filt, mask);
767 0 : filter1 = _mm_adds_epi8(filt, t4);
768 0 : filter2 = _mm_adds_epi8(filt, t3);
769 :
770 : // Filter1 >> 3
771 0 : work_a = _mm_cmpgt_epi8(zero, filter1);
772 0 : filter1 = _mm_srli_epi16(filter1, 3);
773 0 : work_a = _mm_and_si128(work_a, te0);
774 0 : filter1 = _mm_and_si128(filter1, t1f);
775 0 : filter1 = _mm_or_si128(filter1, work_a);
776 0 : oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
777 :
778 : // Filter2 >> 3
779 0 : work_a = _mm_cmpgt_epi8(zero, filter2);
780 0 : filter2 = _mm_srli_epi16(filter2, 3);
781 0 : work_a = _mm_and_si128(work_a, te0);
782 0 : filter2 = _mm_and_si128(filter2, t1f);
783 0 : filter2 = _mm_or_si128(filter2, work_a);
784 0 : op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
785 :
786 : // filt >> 1
787 0 : filt = _mm_adds_epi8(filter1, t1);
788 0 : work_a = _mm_cmpgt_epi8(zero, filt);
789 0 : filt = _mm_srli_epi16(filt, 1);
790 0 : work_a = _mm_and_si128(work_a, t80);
791 0 : filt = _mm_and_si128(filt, t7f);
792 0 : filt = _mm_or_si128(filt, work_a);
793 0 : filt = _mm_andnot_si128(hev, filt);
794 0 : op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
795 0 : oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
796 : // loopfilter done
797 :
798 : // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
799 : // filter8
800 : {
801 0 : const __m128i four = _mm_set1_epi16(4);
802 0 : const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
803 0 : const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
804 0 : const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
805 0 : const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
806 0 : const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
807 0 : const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
808 0 : const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
809 0 : const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
810 :
811 0 : const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
812 0 : const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
813 0 : const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
814 0 : const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
815 0 : const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
816 0 : const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
817 0 : const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
818 0 : const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
819 : __m128i f8_lo, f8_hi;
820 :
821 0 : f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
822 : _mm_add_epi16(p3_lo, p2_lo));
823 0 : f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
824 : _mm_add_epi16(p2_lo, p1_lo));
825 0 : f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
826 :
827 0 : f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
828 : _mm_add_epi16(p3_hi, p2_hi));
829 0 : f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
830 : _mm_add_epi16(p2_hi, p1_hi));
831 0 : f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
832 :
833 0 : op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
834 :
835 0 : f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
836 0 : f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
837 0 : op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
838 :
839 0 : f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
840 0 : f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
841 0 : op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
842 :
843 0 : f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
844 0 : f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
845 0 : oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
846 :
847 0 : f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
848 0 : f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
849 0 : oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
850 :
851 0 : f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
852 0 : f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
853 0 : oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
854 : }
855 :
856 : // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
857 : // wide flat calculations
858 : {
859 0 : const __m128i eight = _mm_set1_epi16(8);
860 0 : const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero);
861 0 : const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
862 0 : const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
863 0 : const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
864 0 : const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
865 0 : const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
866 0 : const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
867 0 : const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
868 0 : const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
869 0 : const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
870 0 : const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
871 0 : const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
872 0 : const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
873 0 : const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
874 0 : const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
875 0 : const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero);
876 :
877 0 : const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero);
878 0 : const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
879 0 : const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
880 0 : const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
881 0 : const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
882 0 : const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
883 0 : const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
884 0 : const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
885 0 : const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
886 0 : const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
887 0 : const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
888 0 : const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
889 0 : const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
890 0 : const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
891 0 : const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
892 0 : const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero);
893 :
894 : __m128i f_lo;
895 : __m128i f_hi;
896 :
897 0 : f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo); // p7 * 7
898 0 : f_lo =
899 0 : _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), _mm_add_epi16(p4_lo, f_lo));
900 0 : f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
901 : _mm_add_epi16(p2_lo, p1_lo));
902 0 : f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
903 0 : f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo);
904 :
905 0 : f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi); // p7 * 7
906 0 : f_hi =
907 0 : _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), _mm_add_epi16(p4_hi, f_hi));
908 0 : f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
909 : _mm_add_epi16(p2_hi, p1_hi));
910 0 : f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
911 0 : f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);
912 :
913 0 : p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
914 0 : _mm_storeu_si128((__m128i *)(s - 7 * p), p6);
915 :
916 0 : f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
917 0 : f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
918 0 : p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
919 0 : _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
920 :
921 0 : f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
922 0 : f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
923 0 : p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
924 0 : _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
925 :
926 0 : f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
927 0 : f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
928 0 : p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
929 0 : _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
930 :
931 0 : f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
932 0 : f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
933 0 : op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
934 0 : _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
935 :
936 0 : f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
937 0 : f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
938 0 : op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
939 0 : _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
940 :
941 0 : f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
942 0 : f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
943 0 : op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
944 0 : _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
945 :
946 0 : f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
947 0 : f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
948 0 : oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
949 0 : _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
950 :
951 0 : f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
952 0 : f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
953 0 : oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
954 0 : _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
955 :
956 0 : f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
957 0 : f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
958 0 : oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
959 0 : _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
960 :
961 0 : f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
962 0 : f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
963 0 : q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
964 0 : _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
965 :
966 0 : f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
967 0 : f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
968 0 : q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
969 0 : _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
970 :
971 0 : f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
972 0 : f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
973 0 : q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
974 0 : _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
975 :
976 0 : f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
977 0 : f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
978 0 : q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
979 0 : _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
980 : }
981 : // wide flat
982 : // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
983 : }
984 0 : }
985 :
// Apply the "filter8" horizontal loop filter to one 8-pixel-wide edge
// segment centered on the row boundary at s, with row stride p.
// Reads rows s - 4*p .. s + 3*p (p3..p0 above the edge, q0..q3 below, 8
// bytes each) and writes back rows s - 3*p .. s + 2*p.
// _blimit/_limit/_thresh are loaded with _mm_load_si128, so they must point
// to 16-byte-aligned threshold vectors.
void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
                               const unsigned char *_blimit,
                               const unsigned char *_limit,
                               const unsigned char *_thresh) {
  // Scratch rows holding the flat ("filter8") candidate outputs; only the
  // first 8 bytes of each are produced/consumed in this function.
  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
  __m128i mask, hev, flat;
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
  // qXpX packs row pX in the low 8 bytes and row qX in the high 8 bytes,
  // so one 128-bit op works on both sides of the edge at once.
  __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;

  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 2 * p)));
  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
                            _mm_loadl_epi64((__m128i *)(s - 0 * p)));
  // Shuffle 78 = 0b01001110 swaps the two 64-bit halves: p1q1/p0q0 are the
  // halves of q1p1/q0p0 exchanged, for cross-edge differences below.
  p1q1 = _mm_shuffle_epi32(q1p1, 78);
  p0q0 = _mm_shuffle_epi32(q0p0, 78);

  {
    // filter_mask and hev_mask
    const __m128i one = _mm_set1_epi8(1);
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(fe, fe);
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
    abs_p1p0 = abs_diff(q1p1, q0p0);
    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);

    abs_p0q0 = abs_diff(q0p0, p0q0);
    abs_p1q1 = abs_diff(q1p1, p1q1);
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    // hev (high edge variance): 0xff where max(|p1-p0|, |q1-q0|) > thresh.
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    // Saturating doubling of |p0-q0|; masking with 0xfe before the 16-bit
    // shift keeps the per-byte halving from borrowing across byte lanes.
    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(abs_p1p0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;

    work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
    mask = _mm_max_epu8(work, mask);
    // Fold the q-side (high 8 bytes) maxima into the p-side lanes.
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);

    // flat_mask4

    // flat: 0xff where all of |p3..p1 - p0| and |q3..q1 - q0| <= 1,
    // restricted to pixels already selected by mask.
    flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
    flat = _mm_max_epu8(abs_p1p0, flat);
    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);
  }

  {
    // Compute the flat-path outputs: a sliding 3-bit-shift average over
    // p3..q3 (the "filter8" taps), stored to the flat_* scratch rows.
    const __m128i four = _mm_set1_epi16(4);
    unsigned char *src = s;
    {
      __m128i workp_a, workp_b, workp_shft;
      // Widen each 8-byte row to 8x16-bit so the tap sums cannot overflow.
      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);

      // op2 = (p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3
      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op2[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      // Each subsequent tap window reuses the running sums, adding the
      // entering taps and subtracting the leaving ones.
      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op1[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op0[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq0[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq1[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq2[0],
                       _mm_packus_epi16(workp_shft, workp_shft));
    }
  }
  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i t1 = _mm_set1_epi8(0x1);
    // XOR with 0x80 maps unsigned pixel bytes to signed range so the
    // saturating signed adds/subs below implement the 4-tap filter.
    const __m128i ps1 =
        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80);
    const __m128i ps0 =
        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), t80);
    const __m128i qs0 =
        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), t80);
    const __m128i qs1 =
        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (aom_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    // Filter1 >> 3
    // Signed per-byte >>3 via 16-bit lanes: place the byte in the high half
    // (unpacklo with zero low), arithmetic-shift by 8+3, re-pack.
    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 11);
    filter1 = _mm_packs_epi16(filter1, filter1);

    // Filter2 >> 3
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 11);
    filter2 = _mm_packs_epi16(filter2, zero);

    // filt >> 1
    filt = _mm_adds_epi8(filter1, t1);
    filt = _mm_unpacklo_epi8(zero, filt);
    filt = _mm_srai_epi16(filt, 9);
    filt = _mm_packs_epi16(filt, zero);

    // Outer taps (p1/q1) are only adjusted where hev is NOT set.
    filt = _mm_andnot_si128(hev, filt);

    // For each output row: blend the 4-tap result (where flat == 0) with
    // the precomputed filter8 result (where flat == 0xff), XOR 0x80 maps
    // back to unsigned pixels.
    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
    work_a = _mm_andnot_si128(flat, work_a);
    q0 = _mm_and_si128(flat, q0);
    q0 = _mm_or_si128(work_a, q0);

    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
    work_a = _mm_andnot_si128(flat, work_a);
    q1 = _mm_and_si128(flat, q1);
    q1 = _mm_or_si128(work_a, q1);

    // p2/q2 are untouched by the 4-tap path, so the non-flat source is the
    // original pixel row re-loaded from memory.
    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
    q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
    work_a = _mm_andnot_si128(flat, work_a);
    q2 = _mm_and_si128(flat, q2);
    q2 = _mm_or_si128(work_a, q2);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    p0 = _mm_loadl_epi64((__m128i *)flat_op0);
    work_a = _mm_andnot_si128(flat, work_a);
    p0 = _mm_and_si128(flat, p0);
    p0 = _mm_or_si128(work_a, p0);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    p1 = _mm_loadl_epi64((__m128i *)flat_op1);
    work_a = _mm_andnot_si128(flat, work_a);
    p1 = _mm_and_si128(flat, p1);
    p1 = _mm_or_si128(work_a, p1);

    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
    p2 = _mm_loadl_epi64((__m128i *)flat_op2);
    work_a = _mm_andnot_si128(flat, work_a);
    p2 = _mm_and_si128(flat, p2);
    p2 = _mm_or_si128(work_a, p2);

    // Write the six filtered rows (8 bytes each) back around the edge.
    _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
    _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
    _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
    _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
    _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
    _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
  }
}
1197 :
// Dual (16-pixel-wide) variant of the horizontal "filter8" loop filter:
// filters two adjacent 8-pixel edge segments at s in one pass, using an
// independent blimit/limit/thresh triple for each half. The *_0 thresholds
// apply to the low 8 byte lanes (columns 0-7) and the *_1 thresholds to the
// high 8 lanes (columns 8-15), via the epi64 unpacks below.
// Reads rows s - 4*p .. s + 3*p (16 bytes each) and writes back rows
// s - 3*p .. s + 2*p. Threshold pointers must be 16-byte aligned
// (_mm_load_si128).
void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
                                    const uint8_t *_limit0,
                                    const uint8_t *_thresh0,
                                    const uint8_t *_blimit1,
                                    const uint8_t *_limit1,
                                    const uint8_t *_thresh1) {
  // Scratch rows for the flat ("filter8") candidate outputs; the do/while
  // loop below fills bytes 0-7 (i == 0) and 8-15 (i == 1) of each.
  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
  const __m128i zero = _mm_set1_epi16(0);
  // Low 64 bits = *_0 thresholds, high 64 bits = *_1 thresholds.
  const __m128i blimit =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
                         _mm_load_si128((const __m128i *)_blimit1));
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
                         _mm_load_si128((const __m128i *)_limit1));
  const __m128i thresh =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
                         _mm_load_si128((const __m128i *)_thresh1));

  __m128i mask, hev, flat;
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;

  // Full 16-byte rows: p3..p0 above the edge, q0..q3 below.
  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
  {
    // Inline |a - b| per byte: or of the two saturating unsigned subtracts.
    const __m128i abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 =
        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
    const __m128i one = _mm_set1_epi8(1);
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    __m128i abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
    __m128i work;

    // filter_mask and hev_mask
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    // hev (high edge variance): 0xff where max(|p1-p0|, |q1-q0|) > thresh.
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    // Masking with 0xfe before the 16-bit shift keeps the per-byte halving
    // from borrowing bits across byte lanes.
    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);

    // flat_mask4
    // flat: 0xff where all of |p3..p1 - p0| and |q3..q1 - q0| <= 1,
    // restricted to pixels already selected by mask.
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
        _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
    flat = _mm_max_epu8(work, flat);
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
        _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
    flat = _mm_max_epu8(work, flat);
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);
  }
  {
    // Compute the flat-path ("filter8") outputs 8 columns at a time: each
    // iteration widens 8 pixels per row to 16-bit lanes and produces the
    // sliding >>3 tap averages into the flat_* scratch rows.
    const __m128i four = _mm_set1_epi16(4);
    unsigned char *src = s;
    int i = 0;

    do {
      __m128i workp_a, workp_b, workp_shft;
      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);

      // op2 = (p3 + p3 + p3 + p2 + p2 + p1 + p0 + q0 + 4) >> 3; later
      // windows reuse the running sums, adding entering taps and
      // subtracting leaving ones.
      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      src += 8;
    } while (++i < 2);
  }
  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    // XOR with 0x80 maps unsigned pixel bytes to signed range so the
    // saturating signed adds/subs below implement the 4-tap filter.
    const __m128i ps1 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
    const __m128i ps0 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
    const __m128i qs0 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
    const __m128i qs1 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (aom_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    // Filter1 >> 3
    // Signed per-byte >>3 emulated on 16-bit lanes: logical shift, keep the
    // 5 low result bits (t1f), then re-insert the sign-extension bits (te0)
    // for lanes that were negative.
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);

    // Filter2 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);

    // filt >> 1 (same sign-fixup trick with t80/t7f for a 1-bit shift)
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);

    // Outer taps (p1/q1) are only adjusted where hev is NOT set.
    filt = _mm_andnot_si128(hev, filt);

    // For each output row: blend the 4-tap result (where flat == 0) with
    // the precomputed filter8 result (where flat == 0xff), XOR 0x80 maps
    // back to unsigned pixels.
    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    q0 = _mm_load_si128((__m128i *)flat_oq0);
    work_a = _mm_andnot_si128(flat, work_a);
    q0 = _mm_and_si128(flat, q0);
    q0 = _mm_or_si128(work_a, q0);

    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    q1 = _mm_load_si128((__m128i *)flat_oq1);
    work_a = _mm_andnot_si128(flat, work_a);
    q1 = _mm_and_si128(flat, q1);
    q1 = _mm_or_si128(work_a, q1);

    // p2/q2 are untouched by the 4-tap path, so the non-flat source is the
    // original pixel row re-loaded from memory.
    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
    q2 = _mm_load_si128((__m128i *)flat_oq2);
    work_a = _mm_andnot_si128(flat, work_a);
    q2 = _mm_and_si128(flat, q2);
    q2 = _mm_or_si128(work_a, q2);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    p0 = _mm_load_si128((__m128i *)flat_op0);
    work_a = _mm_andnot_si128(flat, work_a);
    p0 = _mm_and_si128(flat, p0);
    p0 = _mm_or_si128(work_a, p0);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    p1 = _mm_load_si128((__m128i *)flat_op1);
    work_a = _mm_andnot_si128(flat, work_a);
    p1 = _mm_and_si128(flat, p1);
    p1 = _mm_or_si128(work_a, p1);

    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
    p2 = _mm_load_si128((__m128i *)flat_op2);
    work_a = _mm_andnot_si128(flat, work_a);
    p2 = _mm_and_si128(flat, p2);
    p2 = _mm_or_si128(work_a, p2);

    // Write the six filtered 16-byte rows back around the edge.
    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
  }
}
1439 :
// Apply the 4-tap loop filter across a horizontal edge for two adjacent
// 8-pixel groups in a single SSE2 pass.  The per-group blimit/limit/thresh
// thresholds are packed into the low (group 0) and high (group 1) 64-bit
// lanes of one 128-bit register, so both groups are filtered simultaneously
// with independent parameters.  s points at the first pixel of the row just
// below the edge; p is the row stride in bytes.
void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
                                    const unsigned char *_blimit0,
                                    const unsigned char *_limit0,
                                    const unsigned char *_thresh0,
                                    const unsigned char *_blimit1,
                                    const unsigned char *_limit1,
                                    const unsigned char *_thresh1) {
  // Pack group-0 thresholds into the low half, group-1 into the high half.
  const __m128i blimit =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
                         _mm_load_si128((const __m128i *)_blimit1));
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
                         _mm_load_si128((const __m128i *)_limit1));
  const __m128i thresh =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
                         _mm_load_si128((const __m128i *)_thresh1));
  const __m128i zero = _mm_set1_epi16(0);
#if !CONFIG_PARALLEL_DEBLOCKING
  __m128i p3, p2, q2, q3;
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  __m128i p1, p0, q0, q1;
  __m128i mask, hev, flat;
  // Load two rows on each side of the edge (four per side when the wider
  // filter mask below also needs p3/p2/q2/q3).
#if !CONFIG_PARALLEL_DEBLOCKING
  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
#if !CONFIG_PARALLEL_DEBLOCKING
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  // filter_mask and hev_mask
  {
    // Unsigned absolute differences via saturating subtract in both
    // directions (one of the two operands saturates to zero).
    const __m128i abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 =
        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);  // all-ones
    __m128i abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
#if !CONFIG_PARALLEL_DEBLOCKING
    __m128i work;
#endif  // !CONFIG_PARALLEL_DEBLOCKING
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    // hev = 0xff where max(|p1-p0|, |q1-q0|) > thresh, else 0.
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);  // |p0 - q0| * 2
    // (|p1 - q1| & 0xfe) >> 1 == |p1 - q1| / 2 without cross-lane bleed
    // from the 16-bit shift.
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
#if !CONFIG_PARALLEL_DEBLOCKING
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
#endif  // !CONFIG_PARALLEL_DEBLOCKING
    // Final mask: 0xff where every difference is within limit, else 0.
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // filter4
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);  // unsigned<->signed bias
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    // Reload the four center rows and bias into signed range so the filter
    // arithmetic can use signed saturating adds/subtracts.
    const __m128i ps1 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
    const __m128i ps0 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
    const __m128i qs0 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
    const __m128i qs1 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (aom_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    // Filter1 >> 3: emulate an arithmetic byte shift with a logical 16-bit
    // shift, masking stray bits (t1f) and re-inserting sign bits (te0).
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);

    // Filter2 >> 3 (same arithmetic-shift emulation)
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);

    // filt >> 1: (filter1 + 1) >> 1 with the same sign-patch trick.
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);

    // Outer taps (p1/q1) are adjusted only where hev is 0.
    filt = _mm_andnot_si128(hev, filt);

    // Apply the adjustments, remove the 0x80 bias, and write back.
    q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);

    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
  }
}
1583 :
// Transpose two vertically adjacent 8x8 byte tiles (rows starting at in0 and
// in1, each with stride in_p) into one 16x8 block at out (stride out_p):
// column j of the sources becomes row j of the destination, with in0
// supplying the low 8 bytes of each output row and in1 the high 8.  The
// trailing "// n" comments record the intended issue order: loads are
// interleaved with unpacks to hide load latency.
static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
                                 int in_p, unsigned char *out, int out_p) {
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  __m128i x8, x9, x10, x11, x12, x13, x14, x15;

  // 2-way interleave w/hoisting of unpacks
  x0 = _mm_loadl_epi64((__m128i *)in0);           // 1
  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));  // 3
  x0 = _mm_unpacklo_epi8(x0, x1);                 // 1

  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));  // 5
  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p));  // 7
  x1 = _mm_unpacklo_epi8(x2, x3);                     // 2

  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p));  // 9
  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p));  // 11
  x2 = _mm_unpacklo_epi8(x4, x5);                     // 3

  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p));  // 13
  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p));  // 15
  x3 = _mm_unpacklo_epi8(x6, x7);                     // 4
  x4 = _mm_unpacklo_epi16(x0, x1);                    // 9

  x8 = _mm_loadl_epi64((__m128i *)in1);           // 2
  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));  // 4
  x8 = _mm_unpacklo_epi8(x8, x9);                 // 5
  x5 = _mm_unpacklo_epi16(x2, x3);                // 10

  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));  // 6
  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p));  // 8
  x9 = _mm_unpacklo_epi8(x10, x11);                    // 6

  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p));  // 10
  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p));  // 12
  x10 = _mm_unpacklo_epi8(x12, x13);                   // 7
  x12 = _mm_unpacklo_epi16(x8, x9);                    // 11

  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p));  // 14
  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p));  // 16
  x11 = _mm_unpacklo_epi8(x14, x15);                   // 8
  x13 = _mm_unpacklo_epi16(x10, x11);                  // 12

  x6 = _mm_unpacklo_epi32(x4, x5);     // 13
  x7 = _mm_unpackhi_epi32(x4, x5);     // 14
  x14 = _mm_unpacklo_epi32(x12, x13);  // 15
  x15 = _mm_unpackhi_epi32(x12, x13);  // 16

  // Store first 4-line result
  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));

  // Second half: repeat the 16/32-bit unpack stages on the high halves of
  // the byte-interleaved registers computed above.
  x4 = _mm_unpackhi_epi16(x0, x1);
  x5 = _mm_unpackhi_epi16(x2, x3);
  x12 = _mm_unpackhi_epi16(x8, x9);
  x13 = _mm_unpackhi_epi16(x10, x11);

  x6 = _mm_unpacklo_epi32(x4, x5);
  x7 = _mm_unpackhi_epi32(x4, x5);
  x14 = _mm_unpacklo_epi32(x12, x13);
  x15 = _mm_unpackhi_epi32(x12, x13);

  // Store second 4-line result
  _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
}
1653 :
#if CONFIG_PARALLEL_DEBLOCKING
// Short aliases named after the x86 instructions they compile to, used to
// keep the transpose below readable.
#define movq(p) _mm_loadl_epi64((const __m128i *)(p))
#define punpcklbw(r0, r1) _mm_unpacklo_epi8(r0, r1)
#define punpcklwd(r0, r1) _mm_unpacklo_epi16(r0, r1)
#define punpckhwd(r0, r1) _mm_unpackhi_epi16(r0, r1)
// Store the low 32 bits of register r to address p.
#define movd(p, r) *((uint32_t *)(p)) = _mm_cvtsi128_si32(r)
#define pshufd(r, imm) _mm_shuffle_epi32(r, imm)
// _mm_shuffle_epi32 control 0x39 = (0,3,2,1): rotates the four dwords right
// by one position so the next row's 4 bytes drop into the low dword.
enum { ROTATE_DWORD_RIGHT = 0x39 };
// Transpose a 4x16 source block (4 rows of 16 bytes at pSrc, stride
// srcStride) into a 16x4 destination block (16 rows of 4 bytes at pDst,
// stride dstStride).  Processed as two 4x8 halves: each half is transposed
// with byte/word unpacks, then the dwords of the result registers are
// peeled off one output row at a time via pshufd rotation.
static INLINE void transpose16x4(uint8_t *pDst, const ptrdiff_t dstStride,
                                 const uint8_t *pSrc,
                                 const ptrdiff_t srcStride) {
  for (uint32_t idx = 0; idx < 2; idx += 1) {
    __m128i r0, r1, r2, r3;
    // load data
    r0 = movq(pSrc);
    r1 = movq(pSrc + srcStride);
    r2 = movq(pSrc + srcStride * 2);
    r3 = movq(pSrc + srcStride * 3);
    // transpose
    r0 = punpcklbw(r0, r1);
    r2 = punpcklbw(r2, r3);
    r1 = punpckhwd(r0, r2);
    r0 = punpcklwd(r0, r2);
    // store data: r0 holds output rows 0-3 (one per dword), r1 rows 4-7
    movd(pDst, r0);
    r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
    movd(pDst + dstStride, r0);
    r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
    movd(pDst + dstStride * 2, r0);
    r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
    movd(pDst + dstStride * 3, r0);
    movd(pDst + dstStride * 4, r1);
    r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
    movd(pDst + dstStride * 5, r1);
    r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
    movd(pDst + dstStride * 6, r1);
    r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
    movd(pDst + dstStride * 7, r1);
    // advance the pointers to the second 4x8 half
    pDst += dstStride * 8;
    pSrc += 8;
  }
}

#endif  // CONFIG_PARALLEL_DEBLOCKING
// Transpose num_8x8_to_transpose independent 8x8 byte tiles: tile k is read
// from src[k] (stride in_p) and written transposed to dst[k] (stride out_p).
// The inline comments track each byte's (row, column) origin in the source
// tile, e.g. "41" = source row 4, column 1.
static INLINE void transpose(unsigned char *src[], int in_p,
                             unsigned char *dst[], int out_p,
                             int num_8x8_to_transpose) {
  int idx8x8 = 0;
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  do {
    unsigned char *in = src[idx8x8];
    unsigned char *out = dst[idx8x8];

    x0 =
        _mm_loadl_epi64((__m128i *)(in + 0 * in_p));  // 00 01 02 03 04 05 06 07
    x1 =
        _mm_loadl_epi64((__m128i *)(in + 1 * in_p));  // 10 11 12 13 14 15 16 17
    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    x0 = _mm_unpacklo_epi8(x0, x1);

    x2 =
        _mm_loadl_epi64((__m128i *)(in + 2 * in_p));  // 20 21 22 23 24 25 26 27
    x3 =
        _mm_loadl_epi64((__m128i *)(in + 3 * in_p));  // 30 31 32 33 34 35 36 37
    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    x1 = _mm_unpacklo_epi8(x2, x3);

    x4 =
        _mm_loadl_epi64((__m128i *)(in + 4 * in_p));  // 40 41 42 43 44 45 46 47
    x5 =
        _mm_loadl_epi64((__m128i *)(in + 5 * in_p));  // 50 51 52 53 54 55 56 57
    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
    x2 = _mm_unpacklo_epi8(x4, x5);

    x6 =
        _mm_loadl_epi64((__m128i *)(in + 6 * in_p));  // 60 61 62 63 64 65 66 67
    x7 =
        _mm_loadl_epi64((__m128i *)(in + 7 * in_p));  // 70 71 72 73 74 75 76 77
    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
    x3 = _mm_unpacklo_epi8(x6, x7);

    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    x4 = _mm_unpacklo_epi16(x0, x1);
    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
    x5 = _mm_unpacklo_epi16(x2, x3);
    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
    x6 = _mm_unpacklo_epi32(x4, x5);
    // Each 128-bit register now holds two complete output rows; storel/storeh
    // write the low and high 64-bit halves to consecutive destination rows.
    _mm_storel_pd((double *)(out + 0 * out_p),
                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
    _mm_storeh_pd((double *)(out + 1 * out_p),
                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
    x7 = _mm_unpackhi_epi32(x4, x5);
    _mm_storel_pd((double *)(out + 2 * out_p),
                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
    _mm_storeh_pd((double *)(out + 3 * out_p),
                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73

    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
    x4 = _mm_unpackhi_epi16(x0, x1);
    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
    x5 = _mm_unpackhi_epi16(x2, x3);
    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
    x6 = _mm_unpacklo_epi32(x4, x5);
    _mm_storel_pd((double *)(out + 4 * out_p),
                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
    _mm_storeh_pd((double *)(out + 5 * out_p),
                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
    x7 = _mm_unpackhi_epi32(x4, x5);

    _mm_storel_pd((double *)(out + 6 * out_p),
                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
    _mm_storeh_pd((double *)(out + 7 * out_p),
                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
  } while (++idx8x8 < num_8x8_to_transpose);
}
1772 :
// Vertical (column-edge) 4-tap dual loop filter: transpose the 16x8 pixel
// neighborhood around the edge into a row-major scratch buffer, run the
// horizontal dual filter on it, then transpose the filtered pixels back.
void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
#if !CONFIG_PARALLEL_DEBLOCKING
  unsigned char *src[2];
  unsigned char *dst[2];
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  // Transpose 8x16: 4 columns on each side of the edge, 16 rows.
  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

  // Loop filtering: the column edge is now the horizontal edge at row 4.
  aom_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
                                 blimit1, limit1, thresh1);
#if !CONFIG_PARALLEL_DEBLOCKING
  src[0] = t_dst;
  src[1] = t_dst + 8;
  dst[0] = s - 4;
  dst[1] = s - 4 + p * 8;

  // Transpose back
  transpose(src, 16, dst, p, 2);
#else   // CONFIG_PARALLEL_DEBLOCKING
  // The 4-tap filter only modifies p1..q1, so copy back just those four
  // rows of the scratch buffer (columns 2..5 of the original frame area).
  transpose16x4(s - 2, p, t_dst + 16 * 2, 16);
#endif  // !CONFIG_PARALLEL_DEBLOCKING
}
1800 :
1801 0 : void aom_lpf_vertical_8_sse2(unsigned char *s, int p,
1802 : const unsigned char *blimit,
1803 : const unsigned char *limit,
1804 : const unsigned char *thresh) {
1805 : DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]);
1806 : unsigned char *src[1];
1807 : unsigned char *dst[1];
1808 :
1809 : // Transpose 8x8
1810 0 : src[0] = s - 4;
1811 0 : dst[0] = t_dst;
1812 :
1813 0 : transpose(src, p, dst, 8, 1);
1814 :
1815 : // Loop filtering
1816 0 : aom_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh);
1817 :
1818 0 : src[0] = t_dst;
1819 0 : dst[0] = s - 4;
1820 :
1821 : // Transpose back
1822 0 : transpose(src, 8, dst, p, 1);
1823 0 : }
1824 :
1825 0 : void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
1826 : const uint8_t *limit0, const uint8_t *thresh0,
1827 : const uint8_t *blimit1, const uint8_t *limit1,
1828 : const uint8_t *thresh1) {
1829 : DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
1830 : unsigned char *src[2];
1831 : unsigned char *dst[2];
1832 :
1833 : // Transpose 8x16
1834 0 : transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
1835 :
1836 : // Loop filtering
1837 0 : aom_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
1838 : blimit1, limit1, thresh1);
1839 0 : src[0] = t_dst;
1840 0 : src[1] = t_dst + 8;
1841 :
1842 0 : dst[0] = s - 4;
1843 0 : dst[1] = s - 4 + p * 8;
1844 :
1845 : // Transpose back
1846 0 : transpose(src, 16, dst, p, 2);
1847 0 : }
1848 :
1849 0 : void aom_lpf_vertical_16_sse2(unsigned char *s, int p,
1850 : const unsigned char *blimit,
1851 : const unsigned char *limit,
1852 : const unsigned char *thresh) {
1853 : DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]);
1854 : unsigned char *src[2];
1855 : unsigned char *dst[2];
1856 :
1857 0 : src[0] = s - 8;
1858 0 : src[1] = s;
1859 0 : dst[0] = t_dst;
1860 0 : dst[1] = t_dst + 8 * 8;
1861 :
1862 : // Transpose 16x8
1863 0 : transpose(src, p, dst, 8, 2);
1864 :
1865 : // Loop filtering
1866 0 : aom_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh);
1867 :
1868 0 : src[0] = t_dst;
1869 0 : src[1] = t_dst + 8 * 8;
1870 0 : dst[0] = s - 8;
1871 0 : dst[1] = s;
1872 :
1873 : // Transpose back
1874 0 : transpose(src, 8, dst, p, 2);
1875 0 : }
1876 :
1877 0 : void aom_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
1878 : const uint8_t *blimit, const uint8_t *limit,
1879 : const uint8_t *thresh) {
1880 : DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
1881 :
1882 : // Transpose 16x16
1883 0 : transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
1884 0 : transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
1885 :
1886 : // Loop filtering
1887 0 : aom_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh);
1888 :
1889 : // Transpose back
1890 0 : transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
1891 0 : transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
1892 0 : }
|