Line data Source code
1 : /*
2 : * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 : *
4 : * Use of this source code is governed by a BSD-style license
5 : * that can be found in the LICENSE file in the root of the source
6 : * tree. An additional intellectual property rights grant can be found
7 : * in the file PATENTS. All contributing project authors may
8 : * be found in the AUTHORS file in the root of the source tree.
9 : */
10 :
11 : #include <emmintrin.h> // SSE2
12 :
13 : #include "./vpx_dsp_rtcd.h"
14 : #include "vpx_ports/mem.h"
15 : #include "vpx_ports/emmintrin_compat.h"
16 :
17 0 : static INLINE __m128i abs_diff(__m128i a, __m128i b) {
18 0 : return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
19 : }
20 :
// filter_mask and hev_mask
//
// Computes the combined loop-filter mask and the high-edge-variance mask.
// Expects these locals of the enclosing function to be in scope:
//   p3p2, p2p1, p1p0, q1p1, q0p0, q1q0, q2q1, q3q2 - adjacent pixel rows
//     packed two-per-register (one row in each 64-bit half),
//   zero   - all-zero register,
//   thresh - hev threshold widened to 16-bit lanes,
//   limit  - blimit in the low 64 bits, limit in the high 64 bits.
// Writes the results into the caller's `mask` and `hev` registers.
#define FILTER_HEV_MASK                                                       \
  do {                                                                        \
    /* (abs(q1 - q0), abs(p1 - p0) */                                         \
    __m128i flat = abs_diff(q1p1, q0p0);                                      \
    /* abs(p1 - q1), abs(p0 - q0) */                                          \
    const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);                        \
    __m128i abs_p0q0, abs_p1q1, work;                                         \
                                                                              \
    /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */       \
    hev =                                                                     \
        _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
    hev = _mm_cmpgt_epi16(hev, thresh);                                       \
    hev = _mm_packs_epi16(hev, hev);                                          \
                                                                              \
    /* const int8_t mask = filter_mask(*limit, *blimit, */                    \
    /*                                 p3, p2, p1, p0, q0, q1, q2, q3); */    \
    abs_p0q0 =                                                                \
        _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */     \
    abs_p1q1 =                                                                \
        _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */     \
    abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);                                   \
    abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */    \
    /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */                                 \
    mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);                                 \
    /* abs(p3 - p2), abs(p2 - p1) */                                          \
    work = abs_diff(p3p2, p2p1);                                              \
    flat = _mm_max_epu8(work, flat);                                          \
    /* abs(q3 - q2), abs(q2 - q1) */                                          \
    work = abs_diff(q3q2, q2q1);                                              \
    flat = _mm_max_epu8(work, flat);                                          \
    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));                       \
    mask = _mm_unpacklo_epi64(mask, flat);                                    \
    mask = _mm_subs_epu8(mask, limit);                                        \
    mask = _mm_cmpeq_epi8(mask, zero);                                        \
    mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));                      \
  } while (0)
58 :
// Applies the 4-tap loop filter to the pixels straddling the edge.
// Expects in scope: p1p0 and q1q0 (pixel pairs, one row per 64-bit half),
// `hev` and `mask` as produced by FILTER_HEV_MASK, and `ff` (all-ones).
// Writes the filtered pixels to the caller's ps1ps0 (p1,p0) and
// qs1qs0 (q1,q0) registers.
#define FILTER4                                                              \
  do {                                                                       \
    const __m128i t3t4 =                                                     \
        _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);       \
    const __m128i t80 = _mm_set1_epi8(0x80);                                 \
    __m128i filter, filter2filter1, work;                                    \
                                                                             \
    ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */                          \
    qs1qs0 = _mm_xor_si128(q1q0, t80);                                       \
                                                                             \
    /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */                \
    work = _mm_subs_epi8(ps1ps0, qs1qs0);                                    \
    filter = _mm_and_si128(_mm_srli_si128(work, 8), hev);                    \
    /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */       \
    filter = _mm_subs_epi8(filter, work);                                    \
    filter = _mm_subs_epi8(filter, work);                                    \
    filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */            \
    filter = _mm_and_si128(filter, mask); /* & mask */                       \
    filter = _mm_unpacklo_epi64(filter, filter);                             \
                                                                             \
    /* filter1 = signed_char_clamp(filter + 4) >> 3; */                      \
    /* filter2 = signed_char_clamp(filter + 3) >> 3; */                      \
    filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */    \
    filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1);              \
    filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1);      \
    filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */          \
    filter = _mm_srai_epi16(filter, 11); /* >> 3 */                          \
    filter2filter1 = _mm_packs_epi16(filter2filter1, filter);                \
                                                                             \
    /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */                    \
    filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */                    \
    filter = _mm_unpacklo_epi8(filter, filter);                              \
    filter = _mm_srai_epi16(filter, 9); /* round */                          \
    filter = _mm_packs_epi16(filter, filter);                                \
    filter = _mm_andnot_si128(hev, filter);                                  \
                                                                             \
    hev = _mm_unpackhi_epi64(filter2filter1, filter);                        \
    filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter);             \
                                                                             \
    /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */  \
    qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1);                          \
    /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */  \
    ps1ps0 = _mm_adds_epi8(ps1ps0, hev);                                     \
    qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */                        \
    ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */                        \
  } while (0)
105 :
// Applies the 4-tap loop filter across a horizontal edge, 8 pixels wide.
// s points at the first pixel of the row just below the edge; p is the
// stride in bytes.  _blimit, _limit and _thresh are filter-strength
// vectors (the low 8 bytes of each are used here).
void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
                               const uint8_t *_blimit, const uint8_t *_limit,
                               const uint8_t *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  // Pack blimit into the low half and limit into the high half so both
  // thresholds are tested by the single subtract/compare in the mask code.
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
                         _mm_loadl_epi64((const __m128i *)_limit));
  const __m128i thresh =
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
  const __m128i ff = _mm_cmpeq_epi8(zero, zero);  // all ones
  __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
  __m128i mask, hev;

  // Load rows two-per-register: the first named row in the low 64 bits,
  // the second in the high 64 bits (e.g. q1p1 = p1 | q1 << 64).
  p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
                            _mm_loadl_epi64((__m128i *)(s - 4 * p)));
  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 0 * p)));
  q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
  // Derive the remaining adjacent-row pairings from the loads above.
  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);

  FILTER_HEV_MASK;
  FILTER4;

  // Write back the four filtered rows (p1, p0, q0, q1).
  _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0));  // *op1
  _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);               // *op0
  _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);               // *oq0
  _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0));  // *oq1
}
140 :
// Applies the 4-tap loop filter across a vertical edge, 8 rows tall.
// s points at the first pixel to the right of the edge; p is the stride
// in bytes.  The 8x8 neighborhood is transposed into the same register
// layout as the horizontal filter, filtered, then transposed back.
void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
                             const uint8_t *_blimit, const uint8_t *_limit,
                             const uint8_t *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  // blimit in the low half, limit in the high half (see FILTER_HEV_MASK).
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
                         _mm_loadl_epi64((const __m128i *)_limit));
  const __m128i thresh =
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
  const __m128i ff = _mm_cmpeq_epi8(zero, zero);  // all ones
  __m128i x0, x1, x2, x3;
  __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
  __m128i mask, hev;

  // Each load grabs 8 pixels (p3..q3) of one row; interleaving pairs of
  // rows begins the 8x8 transpose.
  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)),
                           _mm_loadl_epi64((__m128i *)(s + 1 * p - 4)));

  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)),
                         _mm_loadl_epi64((__m128i *)(s + 3 * p - 4)));

  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
  x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)),
                         _mm_loadl_epi64((__m128i *)(s + 5 * p - 4)));

  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
  x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)),
                         _mm_loadl_epi64((__m128i *)(s + 7 * p - 4)));

  // Transpose 8x8
  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  p1p0 = _mm_unpacklo_epi16(q1q0, x1);
  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
  x0 = _mm_unpacklo_epi16(x2, x3);
  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
  p3p2 = _mm_unpacklo_epi32(p1p0, x0);
  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
  p1p0 = _mm_unpackhi_epi32(p1p0, x0);
  p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8));  // swap lo and high
  p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8));  // swap lo and high

  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
  q1q0 = _mm_unpackhi_epi16(q1q0, x1);
  // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
  x2 = _mm_unpackhi_epi16(x2, x3);
  // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
  q3q2 = _mm_unpackhi_epi32(q1q0, x2);
  // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
  q1q0 = _mm_unpacklo_epi32(q1q0, x2);

  // Rebuild the row-pair registers expected by FILTER_HEV_MASK/FILTER4.
  q0p0 = _mm_unpacklo_epi64(p1p0, q1q0);
  q1p1 = _mm_unpackhi_epi64(p1p0, q1q0);
  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);

  FILTER_HEV_MASK;
  FILTER4;

  // Transpose 8x4 to 4x8
  // qs1qs0: 20 21 22 23 24 25 26 27 30 31 32 33 34 34 36 37
  // ps1ps0: 10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07
  // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
  ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8));
  // 10 30 11 31 12 32 13 33 14 34 15 35 16 36 17 37
  x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0);
  // 00 20 01 21 02 22 03 23 04 24 05 25 06 26 07 27
  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0);
  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
  qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0);
  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);

  // Scatter the filtered 4-pixel columns back, one 32-bit store per row.
  *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
  *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
  *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
  *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0);

  *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
  *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
  *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
  *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
}
231 :
// Applies the 16-wide ("mb") loop filter across a horizontal edge,
// 8 pixels wide.  Three filters are computed and blended per pixel
// according to the flatness of the neighborhood:
//   - the 4-tap filter (always computed),
//   - the flat 8-tap filter (where `flat` is set),
//   - the wide 15-tap filter (where `flat2` is set).
// s points at the row just below the edge; p is the stride in bytes.
// _blimit/_limit/_thresh must be 16-byte aligned (loaded with
// _mm_load_si128).
void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,
                                const unsigned char *_blimit,
                                const unsigned char *_limit,
                                const unsigned char *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
  __m128i mask, hev, flat, flat2;
  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
  __m128i abs_p1p0;

  // Load row pairs: pN in the low 64 bits, qN in the high 64 bits.
  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
  q4p4 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
  q3p3 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
  q2p2 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
  q1p1 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
  // shuffle imm 78 = 0b01001110: swap the 64-bit halves (q1|p1 -> p1|q1).
  p1q1 = _mm_shuffle_epi32(q1p1, 78);
  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
  q0p0 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
  p0q0 = _mm_shuffle_epi32(q0p0, 78);

  // Compute the filter mask and hev mask (see FILTER_HEV_MASK for the
  // scalar reference formulas).
  {
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
    abs_p1p0 = abs_diff(q1p1, q0p0);
    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
    fe = _mm_set1_epi8(0xfe);
    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    abs_p0q0 = abs_diff(q0p0, p0q0);
    abs_p1q1 = abs_diff(q1p1, p1q1);
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    // hev = (max(|p1-p0|, |q1-q0|) > thresh)
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(abs_p1p0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;

    work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
    mask = _mm_max_epu8(work, mask);
    // Fold the q-side half onto the p-side so one compare covers both.
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i t1 = _mm_set1_epi16(0x1);
    // XOR with 0x80 converts unsigned pixels to signed for the filter math.
    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
    __m128i qs0 = _mm_xor_si128(p0q0, t80);
    __m128i qs1 = _mm_xor_si128(p1q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;

    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, qs0ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (vpx_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    // Widen to 16 bits (value in the high byte) so the arithmetic shift
    // by 0xB = 8 + 3 performs the signed ">> 3".
    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 0xB);
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 0xB);

    // Filter1 >> 3
    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);

    // filt >> 1
    filt = _mm_adds_epi16(filter1, t1);
    filt = _mm_srai_epi16(filt, 1);
    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
                            filt);
    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
    // loopfilter done

    // Compute flat (|pN-p0|,|qN-q0| <= 1 for N in 2..3) and flat2
    // (same test extended to N in 4..7).
    {
      __m128i work;
      flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
      flat = _mm_max_epu8(abs_p1p0, flat);
      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
      q5p5 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));

      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
      q6p6 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));
      flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));

      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
      q7p7 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));
      work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    //
    // The 8-tap (flat_*) and 15-tap (flat2_*) filtered values are built
    // incrementally: each output reuses the previous running sum with one
    // tap added and one removed (a sliding-window average in 16-bit lanes).
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i four = _mm_set1_epi16(4);
      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
      __m128i pixelFilter_p, pixelFilter_q;
      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

      // Widen every row to 16 bits to accumulate sums without overflow.
      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
      q7_16 = _mm_unpackhi_epi8(q7p7, zero);

      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
                                    _mm_add_epi16(p4_16, p3_16));
      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
                                    _mm_add_epi16(q4_16, q3_16));

      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
      pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
      pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
      // pixelFilter_p now holds p6..p0 + q6..q0 + 8 (rounding term).
      pixelFilter_p =
          _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
      // pixetFilter_p2p1p0 holds p2..p0 + q2..q0 + 4 (rounding term).
      pixetFilter_p2p1p0 = _mm_add_epi16(
          four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);

      flat_q0p0 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(p7_16, p7_16);
      sum_q7 = _mm_add_epi16(q7_16, q7_16);
      sum_p3 = _mm_add_epi16(p3_16, p3_16);
      sum_q3 = _mm_add_epi16(q3_16, q3_16);

      // Slide the wide window: drop the far tap, add another edge tap.
      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
      flat_q1p1 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
      sum_q3 = _mm_add_epi16(sum_q3, q3_16);

      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);

      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
      flat_q2p2 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    // shuffle imm 68 = 0b01000100: broadcast the low 64 bits of the mask
    // to both halves so it selects on the p and q sides alike.
    flat = _mm_shuffle_epi32(flat, 68);
    flat2 = _mm_shuffle_epi32(flat2, 68);

    // Blend each row: flat/flat2 filtered value where the mask is set,
    // the narrower filter's output elsewhere; then store.
    q2p2 = _mm_andnot_si128(flat, q2p2);
    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat_q2p2);

    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);

    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);

    q6p6 = _mm_andnot_si128(flat2, q6p6);
    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));

    q5p5 = _mm_andnot_si128(flat2, q5p5);
    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));

    q4p4 = _mm_andnot_si128(flat2, q4p4);
    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));

    q3p3 = _mm_andnot_si128(flat2, q3p3);
    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));

    q2p2 = _mm_andnot_si128(flat2, q2p2);
    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));

    q1p1 = _mm_andnot_si128(flat2, q1p1);
    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));

    q0p0 = _mm_andnot_si128(flat2, q0p0);
    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
  }
}
563 :
564 0 : static INLINE __m128i filter_add2_sub2(const __m128i *const total,
565 : const __m128i *const a1,
566 : const __m128i *const a2,
567 : const __m128i *const s1,
568 : const __m128i *const s2) {
569 0 : __m128i x = _mm_add_epi16(*a1, *total);
570 0 : x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
571 0 : return x;
572 : }
573 :
574 0 : static INLINE __m128i filter8_mask(const __m128i *const flat,
575 : const __m128i *const other_filt,
576 : const __m128i *const f8_lo,
577 : const __m128i *const f8_hi) {
578 0 : const __m128i f8 =
579 0 : _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3));
580 0 : const __m128i result = _mm_and_si128(*flat, f8);
581 0 : return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
582 : }
583 :
584 0 : static INLINE __m128i filter16_mask(const __m128i *const flat,
585 : const __m128i *const other_filt,
586 : const __m128i *const f_lo,
587 : const __m128i *const f_hi) {
588 0 : const __m128i f =
589 0 : _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4));
590 0 : const __m128i result = _mm_and_si128(*flat, f);
591 0 : return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
592 : }
593 :
594 0 : void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p,
595 : const unsigned char *_blimit,
596 : const unsigned char *_limit,
597 : const unsigned char *_thresh) {
598 0 : const __m128i zero = _mm_set1_epi16(0);
599 0 : const __m128i one = _mm_set1_epi8(1);
600 0 : const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
601 0 : const __m128i limit = _mm_load_si128((const __m128i *)_limit);
602 0 : const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
603 : __m128i mask, hev, flat, flat2;
604 : __m128i p7, p6, p5;
605 : __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
606 : __m128i q5, q6, q7;
607 :
608 : __m128i op2, op1, op0, oq0, oq1, oq2;
609 :
610 : __m128i max_abs_p1p0q1q0;
611 :
612 0 : p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
613 0 : p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
614 0 : p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
615 0 : p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
616 0 : p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
617 0 : p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
618 0 : p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
619 0 : p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
620 0 : q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
621 0 : q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
622 0 : q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
623 0 : q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
624 0 : q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
625 0 : q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
626 0 : q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
627 0 : q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
628 :
629 : {
630 0 : const __m128i abs_p1p0 = abs_diff(p1, p0);
631 0 : const __m128i abs_q1q0 = abs_diff(q1, q0);
632 0 : const __m128i fe = _mm_set1_epi8(0xfe);
633 0 : const __m128i ff = _mm_cmpeq_epi8(zero, zero);
634 0 : __m128i abs_p0q0 = abs_diff(p0, q0);
635 0 : __m128i abs_p1q1 = abs_diff(p1, q1);
636 : __m128i work;
637 0 : max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
638 :
639 0 : abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
640 0 : abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
641 0 : mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
642 0 : mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
643 : // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
644 0 : mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
645 : // mask |= (abs(p1 - p0) > limit) * -1;
646 : // mask |= (abs(q1 - q0) > limit) * -1;
647 0 : work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
648 0 : mask = _mm_max_epu8(work, mask);
649 0 : work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
650 0 : mask = _mm_max_epu8(work, mask);
651 0 : mask = _mm_subs_epu8(mask, limit);
652 0 : mask = _mm_cmpeq_epi8(mask, zero);
653 : }
654 :
655 : {
656 : __m128i work;
657 0 : work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
658 0 : flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
659 0 : work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
660 0 : flat = _mm_max_epu8(work, flat);
661 0 : work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
662 0 : flat = _mm_subs_epu8(flat, one);
663 0 : flat = _mm_cmpeq_epi8(flat, zero);
664 0 : flat = _mm_and_si128(flat, mask);
665 0 : flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
666 0 : flat2 = _mm_max_epu8(work, flat2);
667 0 : work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
668 0 : flat2 = _mm_max_epu8(work, flat2);
669 0 : work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0));
670 0 : flat2 = _mm_max_epu8(work, flat2);
671 0 : flat2 = _mm_subs_epu8(flat2, one);
672 0 : flat2 = _mm_cmpeq_epi8(flat2, zero);
673 0 : flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
674 : }
675 :
676 : // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
677 : // filter4
678 : {
679 0 : const __m128i t4 = _mm_set1_epi8(4);
680 0 : const __m128i t3 = _mm_set1_epi8(3);
681 0 : const __m128i t80 = _mm_set1_epi8(0x80);
682 0 : const __m128i te0 = _mm_set1_epi8(0xe0);
683 0 : const __m128i t1f = _mm_set1_epi8(0x1f);
684 0 : const __m128i t1 = _mm_set1_epi8(0x1);
685 0 : const __m128i t7f = _mm_set1_epi8(0x7f);
686 0 : const __m128i ff = _mm_cmpeq_epi8(t4, t4);
687 :
688 : __m128i filt;
689 : __m128i work_a;
690 : __m128i filter1, filter2;
691 :
692 0 : op1 = _mm_xor_si128(p1, t80);
693 0 : op0 = _mm_xor_si128(p0, t80);
694 0 : oq0 = _mm_xor_si128(q0, t80);
695 0 : oq1 = _mm_xor_si128(q1, t80);
696 :
697 0 : hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
698 0 : hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
699 0 : filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
700 :
701 0 : work_a = _mm_subs_epi8(oq0, op0);
702 0 : filt = _mm_adds_epi8(filt, work_a);
703 0 : filt = _mm_adds_epi8(filt, work_a);
704 0 : filt = _mm_adds_epi8(filt, work_a);
705 : // (vpx_filter + 3 * (qs0 - ps0)) & mask
706 0 : filt = _mm_and_si128(filt, mask);
707 0 : filter1 = _mm_adds_epi8(filt, t4);
708 0 : filter2 = _mm_adds_epi8(filt, t3);
709 :
710 : // Filter1 >> 3
711 0 : work_a = _mm_cmpgt_epi8(zero, filter1);
712 0 : filter1 = _mm_srli_epi16(filter1, 3);
713 0 : work_a = _mm_and_si128(work_a, te0);
714 0 : filter1 = _mm_and_si128(filter1, t1f);
715 0 : filter1 = _mm_or_si128(filter1, work_a);
716 0 : oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
717 :
718 : // Filter2 >> 3
719 0 : work_a = _mm_cmpgt_epi8(zero, filter2);
720 0 : filter2 = _mm_srli_epi16(filter2, 3);
721 0 : work_a = _mm_and_si128(work_a, te0);
722 0 : filter2 = _mm_and_si128(filter2, t1f);
723 0 : filter2 = _mm_or_si128(filter2, work_a);
724 0 : op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
725 :
726 : // filt >> 1
727 0 : filt = _mm_adds_epi8(filter1, t1);
728 0 : work_a = _mm_cmpgt_epi8(zero, filt);
729 0 : filt = _mm_srli_epi16(filt, 1);
730 0 : work_a = _mm_and_si128(work_a, t80);
731 0 : filt = _mm_and_si128(filt, t7f);
732 0 : filt = _mm_or_si128(filt, work_a);
733 0 : filt = _mm_andnot_si128(hev, filt);
734 0 : op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
735 0 : oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
736 : // loopfilter done
737 :
738 : // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
739 : // filter8
740 : {
741 0 : const __m128i four = _mm_set1_epi16(4);
742 0 : const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
743 0 : const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
744 0 : const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
745 0 : const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
746 0 : const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
747 0 : const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
748 0 : const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
749 0 : const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
750 :
751 0 : const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
752 0 : const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
753 0 : const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
754 0 : const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
755 0 : const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
756 0 : const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
757 0 : const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
758 0 : const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
759 : __m128i f8_lo, f8_hi;
760 :
761 0 : f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
762 : _mm_add_epi16(p3_lo, p2_lo));
763 0 : f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
764 : _mm_add_epi16(p2_lo, p1_lo));
765 0 : f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
766 :
767 0 : f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
768 : _mm_add_epi16(p3_hi, p2_hi));
769 0 : f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
770 : _mm_add_epi16(p2_hi, p1_hi));
771 0 : f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
772 :
773 0 : op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
774 :
775 0 : f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
776 0 : f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
777 0 : op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
778 :
779 0 : f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
780 0 : f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
781 0 : op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
782 :
783 0 : f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
784 0 : f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
785 0 : oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
786 :
787 0 : f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
788 0 : f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
789 0 : oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
790 :
791 0 : f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
792 0 : f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
793 0 : oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
794 : }
795 :
796 : // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
797 : // wide flat calculations
798 : {
799 0 : const __m128i eight = _mm_set1_epi16(8);
800 0 : const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero);
801 0 : const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
802 0 : const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
803 0 : const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
804 0 : const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
805 0 : const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
806 0 : const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
807 0 : const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
808 0 : const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
809 0 : const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
810 0 : const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
811 0 : const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
812 0 : const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
813 0 : const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
814 0 : const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
815 0 : const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero);
816 :
817 0 : const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero);
818 0 : const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
819 0 : const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
820 0 : const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
821 0 : const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
822 0 : const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
823 0 : const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
824 0 : const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
825 0 : const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
826 0 : const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
827 0 : const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
828 0 : const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
829 0 : const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
830 0 : const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
831 0 : const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
832 0 : const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero);
833 :
834 : __m128i f_lo;
835 : __m128i f_hi;
836 :
837 0 : f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo); // p7 * 7
838 0 : f_lo =
839 0 : _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), _mm_add_epi16(p4_lo, f_lo));
840 0 : f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
841 : _mm_add_epi16(p2_lo, p1_lo));
842 0 : f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
843 0 : f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo);
844 :
845 0 : f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi); // p7 * 7
846 0 : f_hi =
847 0 : _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), _mm_add_epi16(p4_hi, f_hi));
848 0 : f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
849 : _mm_add_epi16(p2_hi, p1_hi));
850 0 : f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
851 0 : f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);
852 :
853 0 : p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
854 0 : _mm_storeu_si128((__m128i *)(s - 7 * p), p6);
855 :
856 0 : f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
857 0 : f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
858 0 : p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
859 0 : _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
860 :
861 0 : f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
862 0 : f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
863 0 : p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
864 0 : _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
865 :
866 0 : f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
867 0 : f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
868 0 : p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
869 0 : _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
870 :
871 0 : f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
872 0 : f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
873 0 : op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
874 0 : _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
875 :
876 0 : f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
877 0 : f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
878 0 : op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
879 0 : _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
880 :
881 0 : f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
882 0 : f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
883 0 : op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
884 0 : _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
885 :
886 0 : f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
887 0 : f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
888 0 : oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
889 0 : _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
890 :
891 0 : f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
892 0 : f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
893 0 : oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
894 0 : _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
895 :
896 0 : f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
897 0 : f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
898 0 : oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
899 0 : _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
900 :
901 0 : f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
902 0 : f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
903 0 : q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
904 0 : _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
905 :
906 0 : f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
907 0 : f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
908 0 : q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
909 0 : _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
910 :
911 0 : f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
912 0 : f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
913 0 : q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
914 0 : _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
915 :
916 0 : f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
917 0 : f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
918 0 : q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
919 0 : _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
920 : }
921 : // wide flat
922 : // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
923 : }
924 0 : }
925 :
// Apply the VP9 8-tap horizontal loop filter to 8 pixels across the
// block edge at s.  p is the row stride in bytes.  _blimit, _limit and
// _thresh are 16-byte-aligned vectors of replicated per-segment
// thresholds (loaded with _mm_load_si128 below, so alignment is
// required).  Rows p3..p0 lie above the edge (s - 4*p .. s - 1*p) and
// q0..q3 below it (s .. s + 3*p); rows p2..q2 may be rewritten.
void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
                               const unsigned char *_blimit,
                               const unsigned char *_limit,
                               const unsigned char *_thresh) {
  // Scratch rows holding the 7-tap "flat" filter outputs, blended with the
  // 4-tap filter outputs under the `flat` mask in the final section.
  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
  __m128i mask, hev, flat;
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
  __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;

  // Pack pairs of 8-pixel rows into single registers: low 64 bits hold the
  // p-side row, high 64 bits the matching q-side row (e.g. q3p3 = p3 | q3).
  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 2 * p)));
  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
                            _mm_loadl_epi64((__m128i *)(s - 0 * p)));
  // 78 == _MM_SHUFFLE(1, 0, 3, 2): swap the two 64-bit halves so the q row
  // lines up against the p row for cross-edge differences.
  p1q1 = _mm_shuffle_epi32(q1p1, 78);
  p0q0 = _mm_shuffle_epi32(q0p0, 78);

  {
    // filter_mask and hev_mask
    const __m128i one = _mm_set1_epi8(1);
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(fe, fe);
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
    // Low half: abs(p1 - p0); high half: abs(q1 - q0).
    abs_p1p0 = abs_diff(q1p1, q0p0);
    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);

    abs_p0q0 = abs_diff(q0p0, p0q0);
    abs_p1q1 = abs_diff(q1p1, p1q1);
    // hev = max(abs(p1-p0), abs(q1-q0)) > thresh, as a 0x00/0xff byte mask.
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    // abs(p0-q0) * 2 via saturating add; abs(p1-q1) / 2 by masking the low
    // bit (fe) before the 16-bit shift so bits cannot leak between bytes.
    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(abs_p1p0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;

    work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
    mask = _mm_max_epu8(work, mask);
    // Fold the q-side half (high 64 bits) into the p-side half so one
    // 8-byte mask covers both sides, then compare against limit.
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);

    // flat_mask4

    // flat = all of p3..q3 within +/-1 of p0/q0 (and mask set): selects the
    // wide 7-tap filter outputs over the 4-tap ones.
    flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
    flat = _mm_max_epu8(abs_p1p0, flat);
    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);
  }

  {
    // filter8: compute the 7-tap smoothing outputs ((sum + 4) >> 3) for
    // p2..q2 in 16-bit precision and stash them in the flat_* scratch rows.
    const __m128i four = _mm_set1_epi16(4);
    unsigned char *src = s;
    {
      __m128i workp_a, workp_b, workp_shft;
      // Widen each 8-pixel row to 16-bit lanes so the tap sums cannot
      // overflow.
      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);

      // op2 = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3
      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op2[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      // op1: reuse workp_a, swap in (q0 + q1 + p1) for the b term.
      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op1[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      // Each subsequent tap slides the window: drop one far tap, add one
      // near tap, keeping the running sums incremental.
      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op0[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq0[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq1[0],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq2[0],
                       _mm_packus_epi16(workp_shft, workp_shft));
    }
  }
  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i t1 = _mm_set1_epi8(0x1);
    // XOR with 0x80 converts unsigned pixels to signed range for the
    // saturating signed filter arithmetic; XORed back before storing.
    const __m128i ps1 =
        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80);
    const __m128i ps0 =
        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), t80);
    const __m128i qs0 =
        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), t80);
    const __m128i qs1 =
        _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    // filt = clamp(ps1 - qs1) & hev, then + 3 * (qs0 - ps0), saturating.
    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (vpx_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    // Filter1 >> 3
    // Place the signed byte in the high half of a 16-bit lane, then an
    // arithmetic shift by 11 yields a sign-correct (byte >> 3).
    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 11);
    filter1 = _mm_packs_epi16(filter1, filter1);

    // Filter2 >> 3
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 11);
    filter2 = _mm_packs_epi16(filter2, zero);

    // filt >> 1 (same high-half trick, shift by 9), applied only where hev
    // is NOT set.
    filt = _mm_adds_epi8(filter1, t1);
    filt = _mm_unpacklo_epi8(zero, filt);
    filt = _mm_srai_epi16(filt, 9);
    filt = _mm_packs_epi16(filt, zero);

    filt = _mm_andnot_si128(hev, filt);

    // For each output row: take the 4-tap result where flat == 0 and the
    // precomputed 7-tap result where flat == 0xff, then merge.
    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
    work_a = _mm_andnot_si128(flat, work_a);
    q0 = _mm_and_si128(flat, q0);
    q0 = _mm_or_si128(work_a, q0);

    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
    work_a = _mm_andnot_si128(flat, work_a);
    q1 = _mm_and_si128(flat, q1);
    q1 = _mm_or_si128(work_a, q1);

    // p2/q2 have no 4-tap output: keep the original pixels where flat is
    // clear.
    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
    q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
    work_a = _mm_andnot_si128(flat, work_a);
    q2 = _mm_and_si128(flat, q2);
    q2 = _mm_or_si128(work_a, q2);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    p0 = _mm_loadl_epi64((__m128i *)flat_op0);
    work_a = _mm_andnot_si128(flat, work_a);
    p0 = _mm_and_si128(flat, p0);
    p0 = _mm_or_si128(work_a, p0);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    p1 = _mm_loadl_epi64((__m128i *)flat_op1);
    work_a = _mm_andnot_si128(flat, work_a);
    p1 = _mm_and_si128(flat, p1);
    p1 = _mm_or_si128(work_a, p1);

    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
    p2 = _mm_loadl_epi64((__m128i *)flat_op2);
    work_a = _mm_andnot_si128(flat, work_a);
    p2 = _mm_and_si128(flat, p2);
    p2 = _mm_or_si128(work_a, p2);

    // Write back the six filtered rows (8 bytes each).
    _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
    _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
    _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
    _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
    _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
    _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
  }
}
1137 :
// Apply the VP9 8-tap horizontal loop filter to two adjacent 8-pixel
// segments (16 pixels total) across the edge at s, each segment with its
// own blimit/limit/thresh set.  The per-segment threshold vectors are
// concatenated into one 16-byte register (segment 0 in the low 8 bytes,
// segment 1 in the high 8 bytes) so the whole width is filtered at once.
// p is the row stride in bytes; rows p2..q2 may be rewritten.
void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
                                    const uint8_t *_limit0,
                                    const uint8_t *_thresh0,
                                    const uint8_t *_blimit1,
                                    const uint8_t *_limit1,
                                    const uint8_t *_thresh1) {
  // Scratch rows holding the 7-tap "flat" filter outputs (8 bytes per
  // segment, filled by the do/while loop below).
  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
  const __m128i zero = _mm_set1_epi16(0);
  // Low 8 bytes: segment-0 threshold; high 8 bytes: segment-1 threshold.
  const __m128i blimit =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
                         _mm_load_si128((const __m128i *)_blimit1));
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
                         _mm_load_si128((const __m128i *)_limit1));
  const __m128i thresh =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
                         _mm_load_si128((const __m128i *)_thresh1));

  __m128i mask, hev, flat;
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;

  // Full 16-pixel rows; unaligned loads since s has no alignment guarantee.
  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
  {
    // abs_diff spelled inline: or(subs_epu8(a,b), subs_epu8(b,a)).
    const __m128i abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 =
        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
    const __m128i one = _mm_set1_epi8(1);
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    __m128i abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
    __m128i work;

    // filter_mask and hev_mask
    // hev = max(abs(p1-p0), abs(q1-q0)) > thresh, as a 0x00/0xff byte mask.
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    // abs(p0-q0)*2 by saturating add; abs(p1-q1)/2 by masking the low bit
    // (fe) before the 16-bit shift so bits cannot leak between bytes.
    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);

    // flat_mask4
    // flat = all of p3..q3 within +/-1 of p0/q0 (and mask set): selects the
    // wide 7-tap filter outputs over the 4-tap ones.
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
        _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
    flat = _mm_max_epu8(work, flat);
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
        _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
    flat = _mm_max_epu8(work, flat);
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);
  }
  {
    // filter8: compute the 7-tap smoothing outputs ((sum + 4) >> 3) for
    // p2..q2 in 16-bit precision, 8 pixels per iteration (two iterations
    // cover the 16-pixel width), stored in the flat_* scratch rows.
    const __m128i four = _mm_set1_epi16(4);
    unsigned char *src = s;
    int i = 0;

    do {
      __m128i workp_a, workp_b, workp_shft;
      // Widen each 8-pixel row to 16-bit lanes so tap sums cannot overflow.
      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);

      // op2 = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3
      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      // Each subsequent tap slides the window: drop one far tap, add one
      // near tap, keeping the running sums incremental.
      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
      _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
                       _mm_packus_epi16(workp_shft, workp_shft));

      src += 8;  // advance to the second 8-pixel segment
    } while (++i < 2);
  }
  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    // XOR with 0x80 converts unsigned pixels to signed range for the
    // saturating signed filter arithmetic; XORed back before storing.
    const __m128i ps1 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
    const __m128i ps0 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
    const __m128i qs0 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
    const __m128i qs1 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    // filt = clamp(ps1 - qs1) & hev, then + 3 * (qs0 - ps0), saturating.
    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (vpx_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    // Filter1 >> 3
    // Emulate a signed per-byte >> 3: logical 16-bit shift, keep the low 5
    // bits (t1f), then OR in 0xe0 sign-extension bits for negative bytes.
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);

    // Filter2 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);

    // filt >> 1 (same trick with 0x7f/0x80), applied only where hev is
    // NOT set.
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);

    filt = _mm_andnot_si128(hev, filt);

    // For each output row: take the 4-tap result where flat == 0 and the
    // precomputed 7-tap result where flat == 0xff, then merge.
    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    q0 = _mm_load_si128((__m128i *)flat_oq0);
    work_a = _mm_andnot_si128(flat, work_a);
    q0 = _mm_and_si128(flat, q0);
    q0 = _mm_or_si128(work_a, q0);

    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    q1 = _mm_load_si128((__m128i *)flat_oq1);
    work_a = _mm_andnot_si128(flat, work_a);
    q1 = _mm_and_si128(flat, q1);
    q1 = _mm_or_si128(work_a, q1);

    // p2/q2 have no 4-tap output: keep the original pixels where flat is
    // clear.
    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
    q2 = _mm_load_si128((__m128i *)flat_oq2);
    work_a = _mm_andnot_si128(flat, work_a);
    q2 = _mm_and_si128(flat, q2);
    q2 = _mm_or_si128(work_a, q2);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    p0 = _mm_load_si128((__m128i *)flat_op0);
    work_a = _mm_andnot_si128(flat, work_a);
    p0 = _mm_and_si128(flat, p0);
    p0 = _mm_or_si128(work_a, p0);

    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    p1 = _mm_load_si128((__m128i *)flat_op1);
    work_a = _mm_andnot_si128(flat, work_a);
    p1 = _mm_and_si128(flat, p1);
    p1 = _mm_or_si128(work_a, p1);

    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
    p2 = _mm_load_si128((__m128i *)flat_op2);
    work_a = _mm_andnot_si128(flat, work_a);
    p2 = _mm_and_si128(flat, p2);
    p2 = _mm_or_si128(work_a, p2);

    // Write back the six filtered 16-pixel rows.
    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
  }
}
1379 :
// Apply the 4-tap loop filter to two adjacent 8-pixel horizontal edges at
// once.  The left 8 lanes of each vector use (_blimit0, _limit0, _thresh0)
// and the right 8 lanes use (_blimit1, _limit1, _thresh1); the two 16-byte
// threshold loads are fused lane-wise with _mm_unpacklo_epi64.
// s points at the first row below the edge, p is the row stride in bytes.
void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
                                    const unsigned char *_blimit0,
                                    const unsigned char *_limit0,
                                    const unsigned char *_thresh0,
                                    const unsigned char *_blimit1,
                                    const unsigned char *_limit1,
                                    const unsigned char *_thresh1) {
  const __m128i blimit =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
                         _mm_load_si128((const __m128i *)_blimit1));
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
                         _mm_load_si128((const __m128i *)_limit1));
  const __m128i thresh =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
                         _mm_load_si128((const __m128i *)_thresh1));
  const __m128i zero = _mm_set1_epi16(0);
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
  __m128i mask, hev, flat;

  // Load 4 rows above (p3..p0) and 4 rows below (q0..q3) the edge.
  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));

  // filter_mask and hev_mask
  {
    // |a - b| via saturating subtraction in both directions.
    const __m128i abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 =
        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
    const __m128i fe = _mm_set1_epi8(0xfe);
    // All-ones vector (x == x compare), used to invert compare results.
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    __m128i abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
    __m128i work;

    // hev = (max(|p1-p0|, |q1-q0|) > thresh) ? 0xff : 0, per lane.
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    // abs(p0 - q0) * 2 (saturating).
    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    // abs(p1 - q1) / 2: clear the low bit first so the 16-bit shift
    // cannot pull a bit in from the neighboring byte lane.
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    // mask lane is 0xff when every difference is <= limit, else 0.
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // filter4
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    // Convert pixels to signed range by flipping the sign bit.
    const __m128i ps1 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
    const __m128i ps0 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
    const __m128i qs0 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
    const __m128i qs1 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    // filt = clamp(ps1 - qs1) only where hev, then add 3 * (qs0 - ps0).
    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (vpx_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    // Filter1 >> 3 (arithmetic shift emulated: 16-bit logical shift,
    // mask off bits shifted across lanes, then restore sign bits).
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);

    // Filter2 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);

    // filt = (filter1 + 1) >> 1, same sign-preserving shift trick.
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);

    // The outer taps (p1/q1) are only adjusted where hev is NOT set.
    filt = _mm_andnot_si128(hev, filt);

    // Apply adjustments and flip back to unsigned pixel range.
    q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);

    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
  }
}
1515 :
// Transpose a 16x8 region of bytes into an 8x16 region: rows 0-7 come from
// in0 (stride in_p) and rows 8-15 from in1 (stride in_p); the result is
// written as 8 rows of 16 bytes at out (stride out_p).  The numeric
// comments track the hand-interleaved schedule of loads vs. unpacks.
static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
                                 int in_p, unsigned char *out, int out_p) {
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  __m128i x8, x9, x10, x11, x12, x13, x14, x15;

  // 2-way interleave w/hoisting of unpacks
  x0 = _mm_loadl_epi64((__m128i *)in0);           // 1
  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));  // 3
  x0 = _mm_unpacklo_epi8(x0, x1);                 // 1

  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));  // 5
  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p));  // 7
  x1 = _mm_unpacklo_epi8(x2, x3);                     // 2

  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p));  // 9
  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p));  // 11
  x2 = _mm_unpacklo_epi8(x4, x5);                     // 3

  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p));  // 13
  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p));  // 15
  x3 = _mm_unpacklo_epi8(x6, x7);                     // 4
  x4 = _mm_unpacklo_epi16(x0, x1);                    // 9

  x8 = _mm_loadl_epi64((__m128i *)in1);           // 2
  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));  // 4
  x8 = _mm_unpacklo_epi8(x8, x9);                 // 5
  x5 = _mm_unpacklo_epi16(x2, x3);                // 10

  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));  // 6
  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p));  // 8
  x9 = _mm_unpacklo_epi8(x10, x11);                    // 6

  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p));  // 10
  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p));  // 12
  x10 = _mm_unpacklo_epi8(x12, x13);                   // 7
  x12 = _mm_unpacklo_epi16(x8, x9);                    // 11

  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p));  // 14
  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p));  // 16
  x11 = _mm_unpacklo_epi8(x14, x15);                   // 8
  x13 = _mm_unpacklo_epi16(x10, x11);                  // 12

  x6 = _mm_unpacklo_epi32(x4, x5);     // 13
  x7 = _mm_unpackhi_epi32(x4, x5);     // 14
  x14 = _mm_unpacklo_epi32(x12, x13);  // 15
  x15 = _mm_unpackhi_epi32(x12, x13);  // 16

  // Store first 4-line result
  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));

  // Combine the high halves of the 16-bit interleaves for columns 4-7.
  x4 = _mm_unpackhi_epi16(x0, x1);
  x5 = _mm_unpackhi_epi16(x2, x3);
  x12 = _mm_unpackhi_epi16(x8, x9);
  x13 = _mm_unpackhi_epi16(x10, x11);

  x6 = _mm_unpacklo_epi32(x4, x5);
  x7 = _mm_unpackhi_epi32(x4, x5);
  x14 = _mm_unpacklo_epi32(x12, x13);
  x15 = _mm_unpackhi_epi32(x12, x13);

  // Store second 4-line result
  _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
}
1585 :
// Transpose num_8x8_to_transpose independent 8x8 byte blocks.  Block i is
// read from src[i] (row stride in_p) and written transposed to dst[i]
// (row stride out_p).  num_8x8_to_transpose must be >= 1 (do/while).
static INLINE void transpose(unsigned char *src[], int in_p,
                             unsigned char *dst[], int out_p,
                             int num_8x8_to_transpose) {
  int idx8x8 = 0;
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  do {
    unsigned char *in = src[idx8x8];
    unsigned char *out = dst[idx8x8];

    x0 =
        _mm_loadl_epi64((__m128i *)(in + 0 * in_p));  // 00 01 02 03 04 05 06 07
    x1 =
        _mm_loadl_epi64((__m128i *)(in + 1 * in_p));  // 10 11 12 13 14 15 16 17
    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    x0 = _mm_unpacklo_epi8(x0, x1);

    x2 =
        _mm_loadl_epi64((__m128i *)(in + 2 * in_p));  // 20 21 22 23 24 25 26 27
    x3 =
        _mm_loadl_epi64((__m128i *)(in + 3 * in_p));  // 30 31 32 33 34 35 36 37
    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    x1 = _mm_unpacklo_epi8(x2, x3);

    x4 =
        _mm_loadl_epi64((__m128i *)(in + 4 * in_p));  // 40 41 42 43 44 45 46 47
    x5 =
        _mm_loadl_epi64((__m128i *)(in + 5 * in_p));  // 50 51 52 53 54 55 56 57
    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
    x2 = _mm_unpacklo_epi8(x4, x5);

    x6 =
        _mm_loadl_epi64((__m128i *)(in + 6 * in_p));  // 60 61 62 63 64 65 66 67
    x7 =
        _mm_loadl_epi64((__m128i *)(in + 7 * in_p));  // 70 71 72 73 74 75 76 77
    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
    x3 = _mm_unpacklo_epi8(x6, x7);

    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    x4 = _mm_unpacklo_epi16(x0, x1);
    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
    x5 = _mm_unpacklo_epi16(x2, x3);
    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
    x6 = _mm_unpacklo_epi32(x4, x5);
    // storel/storeh write the low/high 8 bytes of the register, giving two
    // transposed output rows per 128-bit value.
    _mm_storel_pd((double *)(out + 0 * out_p),
                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
    _mm_storeh_pd((double *)(out + 1 * out_p),
                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
    x7 = _mm_unpackhi_epi32(x4, x5);
    _mm_storel_pd((double *)(out + 2 * out_p),
                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
    _mm_storeh_pd((double *)(out + 3 * out_p),
                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73

    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
    x4 = _mm_unpackhi_epi16(x0, x1);
    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
    x5 = _mm_unpackhi_epi16(x2, x3);
    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
    x6 = _mm_unpacklo_epi32(x4, x5);
    _mm_storel_pd((double *)(out + 4 * out_p),
                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
    _mm_storeh_pd((double *)(out + 5 * out_p),
                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
    x7 = _mm_unpackhi_epi32(x4, x5);

    _mm_storel_pd((double *)(out + 6 * out_p),
                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
    _mm_storeh_pd((double *)(out + 7 * out_p),
                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
  } while (++idx8x8 < num_8x8_to_transpose);
}
1659 :
1660 0 : void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
1661 : const uint8_t *limit0, const uint8_t *thresh0,
1662 : const uint8_t *blimit1, const uint8_t *limit1,
1663 : const uint8_t *thresh1) {
1664 : DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
1665 : unsigned char *src[2];
1666 : unsigned char *dst[2];
1667 :
1668 : // Transpose 8x16
1669 0 : transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
1670 :
1671 : // Loop filtering
1672 0 : vpx_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
1673 : blimit1, limit1, thresh1);
1674 0 : src[0] = t_dst;
1675 0 : src[1] = t_dst + 8;
1676 0 : dst[0] = s - 4;
1677 0 : dst[1] = s - 4 + p * 8;
1678 :
1679 : // Transpose back
1680 0 : transpose(src, 16, dst, p, 2);
1681 0 : }
1682 :
1683 0 : void vpx_lpf_vertical_8_sse2(unsigned char *s, int p,
1684 : const unsigned char *blimit,
1685 : const unsigned char *limit,
1686 : const unsigned char *thresh) {
1687 : DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]);
1688 : unsigned char *src[1];
1689 : unsigned char *dst[1];
1690 :
1691 : // Transpose 8x8
1692 0 : src[0] = s - 4;
1693 0 : dst[0] = t_dst;
1694 :
1695 0 : transpose(src, p, dst, 8, 1);
1696 :
1697 : // Loop filtering
1698 0 : vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh);
1699 :
1700 0 : src[0] = t_dst;
1701 0 : dst[0] = s - 4;
1702 :
1703 : // Transpose back
1704 0 : transpose(src, 8, dst, p, 1);
1705 0 : }
1706 :
1707 0 : void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
1708 : const uint8_t *limit0, const uint8_t *thresh0,
1709 : const uint8_t *blimit1, const uint8_t *limit1,
1710 : const uint8_t *thresh1) {
1711 : DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
1712 : unsigned char *src[2];
1713 : unsigned char *dst[2];
1714 :
1715 : // Transpose 8x16
1716 0 : transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
1717 :
1718 : // Loop filtering
1719 0 : vpx_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
1720 : blimit1, limit1, thresh1);
1721 0 : src[0] = t_dst;
1722 0 : src[1] = t_dst + 8;
1723 :
1724 0 : dst[0] = s - 4;
1725 0 : dst[1] = s - 4 + p * 8;
1726 :
1727 : // Transpose back
1728 0 : transpose(src, 16, dst, p, 2);
1729 0 : }
1730 :
1731 0 : void vpx_lpf_vertical_16_sse2(unsigned char *s, int p,
1732 : const unsigned char *blimit,
1733 : const unsigned char *limit,
1734 : const unsigned char *thresh) {
1735 : DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]);
1736 : unsigned char *src[2];
1737 : unsigned char *dst[2];
1738 :
1739 0 : src[0] = s - 8;
1740 0 : src[1] = s;
1741 0 : dst[0] = t_dst;
1742 0 : dst[1] = t_dst + 8 * 8;
1743 :
1744 : // Transpose 16x8
1745 0 : transpose(src, p, dst, 8, 2);
1746 :
1747 : // Loop filtering
1748 0 : vpx_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh);
1749 :
1750 0 : src[0] = t_dst;
1751 0 : src[1] = t_dst + 8 * 8;
1752 0 : dst[0] = s - 8;
1753 0 : dst[1] = s;
1754 :
1755 : // Transpose back
1756 0 : transpose(src, 8, dst, p, 2);
1757 0 : }
1758 :
1759 0 : void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
1760 : const uint8_t *blimit, const uint8_t *limit,
1761 : const uint8_t *thresh) {
1762 : DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
1763 :
1764 : // Transpose 16x16
1765 0 : transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
1766 0 : transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
1767 :
1768 : // Loop filtering
1769 0 : vpx_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh);
1770 :
1771 : // Transpose back
1772 0 : transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
1773 0 : transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
1774 0 : }
|