Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <emmintrin.h> // SSE2
13 :
14 : #include "./aom_dsp_rtcd.h"
15 : #include "aom_ports/mem.h"
16 : #include "aom_ports/emmintrin_compat.h"
17 :
18 0 : static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
19 : __m128i ubounded;
20 : __m128i lbounded;
21 : __m128i retval;
22 :
23 0 : const __m128i zero = _mm_set1_epi16(0);
24 0 : const __m128i one = _mm_set1_epi16(1);
25 : __m128i t80, max, min;
26 :
27 0 : if (bd == 8) {
28 0 : t80 = _mm_set1_epi16(0x80);
29 0 : max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 8), one), t80);
30 0 : } else if (bd == 10) {
31 0 : t80 = _mm_set1_epi16(0x200);
32 0 : max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 10), one), t80);
33 : } else { // bd == 12
34 0 : t80 = _mm_set1_epi16(0x800);
35 0 : max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 12), one), t80);
36 : }
37 :
38 0 : min = _mm_subs_epi16(zero, t80);
39 :
40 0 : ubounded = _mm_cmpgt_epi16(value, max);
41 0 : lbounded = _mm_cmplt_epi16(value, min);
42 0 : retval = _mm_andnot_si128(_mm_or_si128(ubounded, lbounded), value);
43 0 : ubounded = _mm_and_si128(ubounded, max);
44 0 : lbounded = _mm_and_si128(lbounded, min);
45 0 : retval = _mm_or_si128(retval, ubounded);
46 0 : retval = _mm_or_si128(retval, lbounded);
47 0 : return retval;
48 : }
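// A scalar sketch of the clamp above (illustrative, not code from this
// file): each 16-bit lane is limited to the signed range implied by the
// bit depth,
//   clamp(value, -(1 << (bd - 1)), (1 << (bd - 1)) - 1)
// so bd == 8 gives [-128, 127], bd == 10 gives [-512, 511], and
// bd == 12 gives [-2048, 2047].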
49 :
50 : // TODO(debargha, peter): Break up large functions into smaller ones
51 : // in this file.
52 0 : void aom_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p,
53 : const uint8_t *_blimit,
54 : const uint8_t *_limit,
55 : const uint8_t *_thresh, int bd) {
56 0 : const __m128i zero = _mm_set1_epi16(0);
57 0 : const __m128i one = _mm_set1_epi16(1);
58 : __m128i blimit, limit, thresh;
59 : __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
60 : __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0;
61 : __m128i ps1, qs1, ps0, qs0;
62 : __m128i abs_p0q0, abs_p1q1, ffff, work;
63 : __m128i filt, work_a, filter1, filter2;
64 : __m128i flat2_q6, flat2_p6, flat2_q5, flat2_p5, flat2_q4, flat2_p4;
65 : __m128i flat2_q3, flat2_p3, flat2_q2, flat2_p2, flat2_q1, flat2_p1;
66 : __m128i flat2_q0, flat2_p0;
67 : __m128i flat_q2, flat_p2, flat_q1, flat_p1, flat_q0, flat_p0;
68 : __m128i pixelFilter_p, pixelFilter_q;
69 : __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
70 : __m128i sum_p7, sum_q7, sum_p3, sum_q3;
71 : __m128i t4, t3, t80, t1;
72 : __m128i eight, four;
73 :
74 0 : if (bd == 8) {
75 0 : blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
76 0 : limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
77 0 : thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
78 0 : } else if (bd == 10) {
79 0 : blimit = _mm_slli_epi16(
80 : _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
81 0 : limit = _mm_slli_epi16(
82 : _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
83 0 : thresh = _mm_slli_epi16(
84 : _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
85 : } else { // bd == 12
86 0 : blimit = _mm_slli_epi16(
87 : _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
88 0 : limit = _mm_slli_epi16(
89 : _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
90 0 : thresh = _mm_slli_epi16(
91 : _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
92 : }
93 :
94 0 : q4 = _mm_load_si128((__m128i *)(s + 4 * p));
95 0 : p4 = _mm_load_si128((__m128i *)(s - 5 * p));
96 0 : q3 = _mm_load_si128((__m128i *)(s + 3 * p));
97 0 : p3 = _mm_load_si128((__m128i *)(s - 4 * p));
98 0 : q2 = _mm_load_si128((__m128i *)(s + 2 * p));
99 0 : p2 = _mm_load_si128((__m128i *)(s - 3 * p));
100 0 : q1 = _mm_load_si128((__m128i *)(s + 1 * p));
101 0 : p1 = _mm_load_si128((__m128i *)(s - 2 * p));
102 0 : q0 = _mm_load_si128((__m128i *)(s + 0 * p));
103 0 : p0 = _mm_load_si128((__m128i *)(s - 1 * p));
104 :
105 : // highbd_filter_mask
106 0 : abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
107 0 : abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
108 :
109 0 : ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
110 :
111 0 : abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
112 0 : abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
113 :
114 : // highbd_hev_mask (in C code this is actually called from highbd_filter4)
115 0 : flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
116 0 : hev = _mm_subs_epu16(flat, thresh);
117 0 : hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
118 :
119 0 : abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0); // abs(p0 - q0) * 2
120 0 : abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // abs(p1 - q1) / 2
121 0 : mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
122 0 : mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
123 0 : mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
124 0 : work = _mm_max_epi16(
125 : _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)),
126 : _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)));
127 0 : mask = _mm_max_epi16(work, mask);
128 0 : work = _mm_max_epi16(
129 : _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
130 : _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)));
131 0 : mask = _mm_max_epi16(work, mask);
132 0 : work = _mm_max_epi16(
133 : _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
134 : _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
135 0 : mask = _mm_max_epi16(work, mask);
136 :
137 0 : mask = _mm_subs_epu16(mask, limit);
138 0 : mask = _mm_cmpeq_epi16(mask, zero); // return ~mask
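// Per lane, mask ends up all ones (filtering enabled) exactly when
//   abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit
// and every neighbouring difference abs(p1 - p0), abs(q1 - q0),
// abs(p2 - p1), abs(q2 - q1), abs(p3 - p2), abs(q3 - q2) is <= limit;
// the saturating ops above are one way to evaluate that test branchlessly.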
139 :
140 : // lp filter
141 : // highbd_filter4
142 0 : t4 = _mm_set1_epi16(4);
143 0 : t3 = _mm_set1_epi16(3);
144 0 : if (bd == 8)
145 0 : t80 = _mm_set1_epi16(0x80);
146 0 : else if (bd == 10)
147 0 : t80 = _mm_set1_epi16(0x200);
148 : else // bd == 12
149 0 : t80 = _mm_set1_epi16(0x800);
150 :
151 0 : t1 = _mm_set1_epi16(0x1);
152 :
153 0 : ps1 = _mm_subs_epi16(p1, t80);
154 0 : qs1 = _mm_subs_epi16(q1, t80);
155 0 : ps0 = _mm_subs_epi16(p0, t80);
156 0 : qs0 = _mm_subs_epi16(q0, t80);
157 :
158 0 : filt = _mm_and_si128(signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd),
159 : hev);
160 0 : work_a = _mm_subs_epi16(qs0, ps0);
161 0 : filt = _mm_adds_epi16(filt, work_a);
162 0 : filt = _mm_adds_epi16(filt, work_a);
163 0 : filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
164 0 : filt = _mm_and_si128(filt, mask);
165 0 : filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
166 0 : filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
167 :
168 : // Filter1 >> 3, Filter2 >> 3
169 0 : filter1 = _mm_srai_epi16(filter1, 0x3);
170 0 : filter2 = _mm_srai_epi16(filter2, 0x3);
171 :
172 0 : qs0 = _mm_adds_epi16(
173 : signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
174 0 : ps0 = _mm_adds_epi16(
175 : signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
176 0 : filt = _mm_adds_epi16(filter1, t1);
177 0 : filt = _mm_srai_epi16(filt, 1);
178 0 : filt = _mm_andnot_si128(hev, filt);
179 0 : qs1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
180 : t80);
181 0 : ps1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
182 : t80);
183 :
184 : // end highbd_filter4
185 : // loopfilter done
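// A per-lane sketch of the 4-tap filter implemented above (biases and
// clamps abbreviated):
//   filt    = clamp(ps1 - qs1) & hev;
//   filt    = clamp(filt + 3 * (qs0 - ps0)) & mask;
//   filter1 = clamp(filt + 4) >> 3;   filter2 = clamp(filt + 3) >> 3;
//   qs0     = clamp(qs0 - filter1);   ps0 = clamp(ps0 + filter2);
//   filt    = ((filter1 + 1) >> 1) & ~hev;
//   qs1     = clamp(qs1 - filt);      ps1 = clamp(ps1 + filt);
// with t80 added back at the end to return to the unsigned pixel range.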
186 :
187 : // highbd_flat_mask4
188 0 : flat = _mm_max_epi16(
189 : _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)),
190 : _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)));
191 0 : work = _mm_max_epi16(
192 : _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)),
193 : _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3)));
194 0 : flat = _mm_max_epi16(work, flat);
195 0 : work = _mm_max_epi16(abs_p1p0, abs_q1q0);
196 0 : flat = _mm_max_epi16(work, flat);
197 :
198 0 : if (bd == 8)
199 0 : flat = _mm_subs_epu16(flat, one);
200 0 : else if (bd == 10)
201 0 : flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
202 : else // bd == 12
203 0 : flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
204 :
205 0 : flat = _mm_cmpeq_epi16(flat, zero);
206 : // end flat_mask4
207 :
208 : // flat & mask = flat && mask (as used in filter8)
209 : // (because, in both vars, each block of 16 bits is either all 1s or all 0s)
210 0 : flat = _mm_and_si128(flat, mask);
211 :
212 0 : p5 = _mm_load_si128((__m128i *)(s - 6 * p));
213 0 : q5 = _mm_load_si128((__m128i *)(s + 5 * p));
214 0 : p6 = _mm_load_si128((__m128i *)(s - 7 * p));
215 0 : q6 = _mm_load_si128((__m128i *)(s + 6 * p));
216 0 : p7 = _mm_load_si128((__m128i *)(s - 8 * p));
217 0 : q7 = _mm_load_si128((__m128i *)(s + 7 * p));
218 :
219 : // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7
220 : // but referred to as p0-p4 & q0-q4 in fn)
221 0 : flat2 = _mm_max_epi16(
222 : _mm_or_si128(_mm_subs_epu16(p4, p0), _mm_subs_epu16(p0, p4)),
223 : _mm_or_si128(_mm_subs_epu16(q4, q0), _mm_subs_epu16(q0, q4)));
224 :
225 0 : work = _mm_max_epi16(
226 : _mm_or_si128(_mm_subs_epu16(p5, p0), _mm_subs_epu16(p0, p5)),
227 : _mm_or_si128(_mm_subs_epu16(q5, q0), _mm_subs_epu16(q0, q5)));
228 0 : flat2 = _mm_max_epi16(work, flat2);
229 :
230 0 : work = _mm_max_epi16(
231 : _mm_or_si128(_mm_subs_epu16(p6, p0), _mm_subs_epu16(p0, p6)),
232 : _mm_or_si128(_mm_subs_epu16(q6, q0), _mm_subs_epu16(q0, q6)));
233 0 : flat2 = _mm_max_epi16(work, flat2);
234 :
235 0 : work = _mm_max_epi16(
236 : _mm_or_si128(_mm_subs_epu16(p7, p0), _mm_subs_epu16(p0, p7)),
237 : _mm_or_si128(_mm_subs_epu16(q7, q0), _mm_subs_epu16(q0, q7)));
238 0 : flat2 = _mm_max_epi16(work, flat2);
239 :
240 0 : if (bd == 8)
241 0 : flat2 = _mm_subs_epu16(flat2, one);
242 0 : else if (bd == 10)
243 0 : flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 2));
244 : else // bd == 12
245 0 : flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 4));
246 :
247 0 : flat2 = _mm_cmpeq_epi16(flat2, zero);
248 0 : flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
249 : // end highbd_flat_mask5
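// Both flat and flat2 are all ones only where every difference tested is at
// most 1 scaled by the bit depth (1, 4 or 16 for bd = 8, 10, 12), mirroring
// the thresholds of the scalar flat_mask4 / flat_mask5 checks.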
250 :
251 : // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
252 : // flat and wide flat calculations
253 0 : eight = _mm_set1_epi16(8);
254 0 : four = _mm_set1_epi16(4);
255 :
256 0 : pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5), _mm_add_epi16(p4, p3));
257 0 : pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5), _mm_add_epi16(q4, q3));
258 :
259 0 : pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1));
260 0 : pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
261 :
262 0 : pixetFilter_q2q1q0 = _mm_add_epi16(q0, _mm_add_epi16(q2, q1));
263 0 : pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
264 0 : pixelFilter_p =
265 0 : _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
266 0 : pixetFilter_p2p1p0 = _mm_add_epi16(
267 : four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
268 0 : flat2_p0 =
269 0 : _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7, p0)), 4);
270 0 : flat2_q0 =
271 0 : _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7, q0)), 4);
272 0 : flat_p0 = _mm_srli_epi16(
273 : _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3, p0)), 3);
274 0 : flat_q0 = _mm_srli_epi16(
275 : _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3, q0)), 3);
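// Written out per lane, the sums above amount to (a sketch):
//   flat2_p0 = (p7 + p6 + p5 + p4 + p3 + p2 + p1 + 2 * p0
//               + q0 + q1 + q2 + q3 + q4 + q5 + q6 + 8) >> 4
//   flat_p0  = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3
// with flat2_q0 / flat_q0 the mirror images (p and q exchanged).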
276 :
277 0 : sum_p7 = _mm_add_epi16(p7, p7);
278 0 : sum_q7 = _mm_add_epi16(q7, q7);
279 0 : sum_p3 = _mm_add_epi16(p3, p3);
280 0 : sum_q3 = _mm_add_epi16(q3, q3);
281 :
282 0 : pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6);
283 0 : pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6);
284 0 : flat2_p1 = _mm_srli_epi16(
285 : _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1)), 4);
286 0 : flat2_q1 = _mm_srli_epi16(
287 : _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1)), 4);
288 :
289 0 : pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2);
290 0 : pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2);
291 0 : flat_p1 = _mm_srli_epi16(
292 : _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1)), 3);
293 0 : flat_q1 = _mm_srli_epi16(
294 : _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1)), 3);
295 :
296 0 : sum_p7 = _mm_add_epi16(sum_p7, p7);
297 0 : sum_q7 = _mm_add_epi16(sum_q7, q7);
298 0 : sum_p3 = _mm_add_epi16(sum_p3, p3);
299 0 : sum_q3 = _mm_add_epi16(sum_q3, q3);
300 :
301 0 : pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5);
302 0 : pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5);
303 0 : flat2_p2 = _mm_srli_epi16(
304 : _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2)), 4);
305 0 : flat2_q2 = _mm_srli_epi16(
306 : _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2)), 4);
307 :
308 0 : pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1);
309 0 : pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1);
310 0 : flat_p2 = _mm_srli_epi16(
311 : _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2)), 3);
312 0 : flat_q2 = _mm_srli_epi16(
313 : _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2)), 3);
314 :
315 0 : sum_p7 = _mm_add_epi16(sum_p7, p7);
316 0 : sum_q7 = _mm_add_epi16(sum_q7, q7);
317 0 : pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4);
318 0 : pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4);
319 0 : flat2_p3 = _mm_srli_epi16(
320 : _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3)), 4);
321 0 : flat2_q3 = _mm_srli_epi16(
322 : _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3)), 4);
323 :
324 0 : sum_p7 = _mm_add_epi16(sum_p7, p7);
325 0 : sum_q7 = _mm_add_epi16(sum_q7, q7);
326 0 : pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3);
327 0 : pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3);
328 0 : flat2_p4 = _mm_srli_epi16(
329 : _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4)), 4);
330 0 : flat2_q4 = _mm_srli_epi16(
331 : _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4)), 4);
332 :
333 0 : sum_p7 = _mm_add_epi16(sum_p7, p7);
334 0 : sum_q7 = _mm_add_epi16(sum_q7, q7);
335 0 : pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2);
336 0 : pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2);
337 0 : flat2_p5 = _mm_srli_epi16(
338 : _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5)), 4);
339 0 : flat2_q5 = _mm_srli_epi16(
340 : _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5)), 4);
341 :
342 0 : sum_p7 = _mm_add_epi16(sum_p7, p7);
343 0 : sum_q7 = _mm_add_epi16(sum_q7, q7);
344 0 : pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1);
345 0 : pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1);
346 0 : flat2_p6 = _mm_srli_epi16(
347 : _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6)), 4);
348 0 : flat2_q6 = _mm_srli_epi16(
349 : _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6)), 4);
350 :
351 : // wide flat
352 : // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
353 :
354 : // highbd_filter8
355 0 : p2 = _mm_andnot_si128(flat, p2);
356 : // p2 remains unchanged if !(flat && mask)
357 0 : flat_p2 = _mm_and_si128(flat, flat_p2);
358 : // when (flat && mask)
359 0 : p2 = _mm_or_si128(p2, flat_p2); // full list of p2 values
360 0 : q2 = _mm_andnot_si128(flat, q2);
361 0 : flat_q2 = _mm_and_si128(flat, flat_q2);
362 0 : q2 = _mm_or_si128(q2, flat_q2); // full list of q2 values
363 :
364 0 : ps1 = _mm_andnot_si128(flat, ps1);
365 : // p1 takes the value assigned to it in filter4 if !(flat && mask)
366 0 : flat_p1 = _mm_and_si128(flat, flat_p1);
367 : // when (flat && mask)
368 0 : p1 = _mm_or_si128(ps1, flat_p1); // full list of p1 values
369 0 : qs1 = _mm_andnot_si128(flat, qs1);
370 0 : flat_q1 = _mm_and_si128(flat, flat_q1);
371 0 : q1 = _mm_or_si128(qs1, flat_q1); // full list of q1 values
372 :
373 0 : ps0 = _mm_andnot_si128(flat, ps0);
374 : // p0 takes the value assigned to it in filter4 if !(flat && mask)
375 0 : flat_p0 = _mm_and_si128(flat, flat_p0);
376 : // when (flat && mask)
377 0 : p0 = _mm_or_si128(ps0, flat_p0); // full list of p0 values
378 0 : qs0 = _mm_andnot_si128(flat, qs0);
379 0 : flat_q0 = _mm_and_si128(flat, flat_q0);
380 0 : q0 = _mm_or_si128(qs0, flat_q0); // full list of q0 values
381 : // end highbd_filter8
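// The andnot/and/or triples above are the usual SSE2 branchless select:
// each lane of flat is either 0x0000 or 0xFFFF, so
//   result = (flat & eight_tap) | (~flat & other)
// keeps the 8-tap value where (flat && mask) holds and otherwise the
// filter4 output (p1, p0, q0, q1) or the untouched pixel (p2, q2). The
// filter16 section below repeats the same idiom with flat2.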
382 :
383 : // highbd_filter16
384 0 : p6 = _mm_andnot_si128(flat2, p6);
385 : // p6 remains unchanged if !(flat2 && flat && mask)
386 0 : flat2_p6 = _mm_and_si128(flat2, flat2_p6);
387 : // get values for when (flat2 && flat && mask)
388 0 : p6 = _mm_or_si128(p6, flat2_p6); // full list of p6 values
389 0 : q6 = _mm_andnot_si128(flat2, q6);
390 : // q6 remains unchanged if !(flat2 && flat && mask)
391 0 : flat2_q6 = _mm_and_si128(flat2, flat2_q6);
392 : // get values for when (flat2 && flat && mask)
393 0 : q6 = _mm_or_si128(q6, flat2_q6); // full list of q6 values
394 0 : _mm_store_si128((__m128i *)(s - 7 * p), p6);
395 0 : _mm_store_si128((__m128i *)(s + 6 * p), q6);
396 :
397 0 : p5 = _mm_andnot_si128(flat2, p5);
398 : // p5 remains unchanged if !(flat2 && flat && mask)
399 0 : flat2_p5 = _mm_and_si128(flat2, flat2_p5);
400 : // get values for when (flat2 && flat && mask)
401 0 : p5 = _mm_or_si128(p5, flat2_p5);
402 : // full list of p5 values
403 0 : q5 = _mm_andnot_si128(flat2, q5);
404 : // q5 remains unchanged if !(flat2 && flat && mask)
405 0 : flat2_q5 = _mm_and_si128(flat2, flat2_q5);
406 : // get values for when (flat2 && flat && mask)
407 0 : q5 = _mm_or_si128(q5, flat2_q5);
408 : // full list of q5 values
409 0 : _mm_store_si128((__m128i *)(s - 6 * p), p5);
410 0 : _mm_store_si128((__m128i *)(s + 5 * p), q5);
411 :
412 0 : p4 = _mm_andnot_si128(flat2, p4);
413 : // p4 remains unchanged if !(flat2 && flat && mask)
414 0 : flat2_p4 = _mm_and_si128(flat2, flat2_p4);
415 : // get values for when (flat2 && flat && mask)
416 0 : p4 = _mm_or_si128(p4, flat2_p4); // full list of p4 values
417 0 : q4 = _mm_andnot_si128(flat2, q4);
418 : // q4 remains unchanged if !(flat2 && flat && mask)
419 0 : flat2_q4 = _mm_and_si128(flat2, flat2_q4);
420 : // get values for when (flat2 && flat && mask)
421 0 : q4 = _mm_or_si128(q4, flat2_q4); // full list of q4 values
422 0 : _mm_store_si128((__m128i *)(s - 5 * p), p4);
423 0 : _mm_store_si128((__m128i *)(s + 4 * p), q4);
424 :
425 0 : p3 = _mm_andnot_si128(flat2, p3);
426 : // p3 takes value from highbd_filter8 if !(flat2 && flat && mask)
427 0 : flat2_p3 = _mm_and_si128(flat2, flat2_p3);
428 : // get values for when (flat2 && flat && mask)
429 0 : p3 = _mm_or_si128(p3, flat2_p3); // full list of p3 values
430 0 : q3 = _mm_andnot_si128(flat2, q3);
431 : // q3 takes value from highbd_filter8 if !(flat2 && flat && mask)
432 0 : flat2_q3 = _mm_and_si128(flat2, flat2_q3);
433 : // get values for when (flat2 && flat && mask)
434 0 : q3 = _mm_or_si128(q3, flat2_q3); // full list of q3 values
435 0 : _mm_store_si128((__m128i *)(s - 4 * p), p3);
436 0 : _mm_store_si128((__m128i *)(s + 3 * p), q3);
437 :
438 0 : p2 = _mm_andnot_si128(flat2, p2);
439 : // p2 takes value from highbd_filter8 if !(flat2 && flat && mask)
440 0 : flat2_p2 = _mm_and_si128(flat2, flat2_p2);
441 : // get values for when (flat2 && flat && mask)
442 0 : p2 = _mm_or_si128(p2, flat2_p2);
443 : // full list of p2 values
444 0 : q2 = _mm_andnot_si128(flat2, q2);
445 : // q2 takes value from highbd_filter8 if !(flat2 && flat && mask)
446 0 : flat2_q2 = _mm_and_si128(flat2, flat2_q2);
447 : // get values for when (flat2 && flat && mask)
448 0 : q2 = _mm_or_si128(q2, flat2_q2); // full list of q2 values
449 0 : _mm_store_si128((__m128i *)(s - 3 * p), p2);
450 0 : _mm_store_si128((__m128i *)(s + 2 * p), q2);
451 :
452 0 : p1 = _mm_andnot_si128(flat2, p1);
453 : // p1 takes value from highbd_filter8 if !(flat2 && flat && mask)
454 0 : flat2_p1 = _mm_and_si128(flat2, flat2_p1);
455 : // get values for when (flat2 && flat && mask)
456 0 : p1 = _mm_or_si128(p1, flat2_p1); // full list of p1 values
457 0 : q1 = _mm_andnot_si128(flat2, q1);
458 : // q1 takes value from highbd_filter8 if !(flat2 && flat && mask)
459 0 : flat2_q1 = _mm_and_si128(flat2, flat2_q1);
460 : // get values for when (flat2 && flat && mask)
461 0 : q1 = _mm_or_si128(q1, flat2_q1); // full list of q1 values
462 0 : _mm_store_si128((__m128i *)(s - 2 * p), p1);
463 0 : _mm_store_si128((__m128i *)(s + 1 * p), q1);
464 :
465 0 : p0 = _mm_andnot_si128(flat2, p0);
466 : // p0 takes value from highbd_filter8 if !(flat2 && flat && mask)
467 0 : flat2_p0 = _mm_and_si128(flat2, flat2_p0);
468 : // get values for when (flat2 && flat && mask)
469 0 : p0 = _mm_or_si128(p0, flat2_p0); // full list of p0 values
470 0 : q0 = _mm_andnot_si128(flat2, q0);
471 : // q0 takes value from highbd_filter8 if !(flat2 && flat && mask)
472 0 : flat2_q0 = _mm_and_si128(flat2, flat2_q0);
473 : // get values for when (flat2 && flat && mask)
474 0 : q0 = _mm_or_si128(q0, flat2_q0); // full list of q0 values
475 0 : _mm_store_si128((__m128i *)(s - 1 * p), p0);
476 : _mm_store_si128((__m128i *)(s - 0 * p), q0);
477 0 : }
478 :
479 0 : void aom_highbd_lpf_horizontal_edge_16_sse2(uint16_t *s, int p,
480 : const uint8_t *_blimit,
481 : const uint8_t *_limit,
482 : const uint8_t *_thresh, int bd) {
483 0 : aom_highbd_lpf_horizontal_edge_8_sse2(s, p, _blimit, _limit, _thresh, bd);
484 0 : aom_highbd_lpf_horizontal_edge_8_sse2(s + 8, p, _blimit, _limit, _thresh, bd);
485 0 : }
486 :
487 0 : void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
488 : const uint8_t *_blimit,
489 : const uint8_t *_limit,
490 : const uint8_t *_thresh, int bd) {
491 : DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);
492 : DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
493 : DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
494 : DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
495 : DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
496 : DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
497 0 : const __m128i zero = _mm_set1_epi16(0);
498 : __m128i blimit, limit, thresh;
499 : __m128i mask, hev, flat;
500 0 : __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p));
501 0 : __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p));
502 0 : __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p));
503 0 : __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p));
504 0 : __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p));
505 0 : __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p));
506 0 : __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p));
507 0 : __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p));
508 0 : const __m128i one = _mm_set1_epi16(1);
509 0 : const __m128i ffff = _mm_cmpeq_epi16(one, one);
510 : __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
511 0 : const __m128i four = _mm_set1_epi16(4);
512 : __m128i workp_a, workp_b, workp_shft;
513 :
514 0 : const __m128i t4 = _mm_set1_epi16(4);
515 0 : const __m128i t3 = _mm_set1_epi16(3);
516 : __m128i t80;
517 0 : const __m128i t1 = _mm_set1_epi16(0x1);
518 : __m128i ps1, ps0, qs0, qs1;
519 : __m128i filt;
520 : __m128i work_a;
521 : __m128i filter1, filter2;
522 :
523 0 : if (bd == 8) {
524 0 : blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
525 0 : limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
526 0 : thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
527 0 : t80 = _mm_set1_epi16(0x80);
528 0 : } else if (bd == 10) {
529 0 : blimit = _mm_slli_epi16(
530 : _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
531 0 : limit = _mm_slli_epi16(
532 : _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
533 0 : thresh = _mm_slli_epi16(
534 : _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
535 0 : t80 = _mm_set1_epi16(0x200);
536 : } else { // bd == 12
537 0 : blimit = _mm_slli_epi16(
538 : _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
539 0 : limit = _mm_slli_epi16(
540 : _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
541 0 : thresh = _mm_slli_epi16(
542 : _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
543 0 : t80 = _mm_set1_epi16(0x800);
544 : }
545 :
546 0 : ps1 = _mm_subs_epi16(p1, t80);
547 0 : ps0 = _mm_subs_epi16(p0, t80);
548 0 : qs0 = _mm_subs_epi16(q0, t80);
549 0 : qs1 = _mm_subs_epi16(q1, t80);
550 :
551 : // filter_mask and hev_mask
552 0 : abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
553 0 : abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
554 :
555 0 : abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
556 0 : abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
557 0 : flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
558 0 : hev = _mm_subs_epu16(flat, thresh);
559 0 : hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
560 :
561 0 : abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
562 0 : abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
563 0 : mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
564 0 : mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
565 : // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
566 : // So taking maximums continues to work:
567 0 : mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
568 0 : mask = _mm_max_epi16(abs_p1p0, mask);
569 : // mask |= (abs(p1 - p0) > limit) * -1;
570 0 : mask = _mm_max_epi16(abs_q1q0, mask);
571 : // mask |= (abs(q1 - q0) > limit) * -1;
572 :
573 0 : work = _mm_max_epi16(
574 : _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
575 : _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)));
576 0 : mask = _mm_max_epi16(work, mask);
577 0 : work = _mm_max_epi16(
578 : _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
579 : _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
580 0 : mask = _mm_max_epi16(work, mask);
581 0 : mask = _mm_subs_epu16(mask, limit);
582 0 : mask = _mm_cmpeq_epi16(mask, zero);
583 :
584 : // flat_mask4
585 0 : flat = _mm_max_epi16(
586 : _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)),
587 : _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)));
588 0 : work = _mm_max_epi16(
589 : _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)),
590 : _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3)));
591 0 : flat = _mm_max_epi16(work, flat);
592 0 : flat = _mm_max_epi16(abs_p1p0, flat);
593 0 : flat = _mm_max_epi16(abs_q1q0, flat);
594 :
595 0 : if (bd == 8)
596 0 : flat = _mm_subs_epu16(flat, one);
597 0 : else if (bd == 10)
598 0 : flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
599 : else // bd == 12
600 0 : flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));
601 :
602 0 : flat = _mm_cmpeq_epi16(flat, zero);
603 0 : flat = _mm_and_si128(flat, mask); // flat & mask
604 :
605 : // The constant four is added before the shift to give the rounding part of ROUND_POWER_OF_TWO
606 :
607 0 : workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
608 0 : workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
609 0 : workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
610 0 : workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
611 : _mm_store_si128((__m128i *)&flat_op2[0], workp_shft);
612 :
613 0 : workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
614 0 : workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
615 : _mm_store_si128((__m128i *)&flat_op1[0], workp_shft);
616 :
617 0 : workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
618 0 : workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
619 0 : workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
620 : _mm_store_si128((__m128i *)&flat_op0[0], workp_shft);
621 :
622 0 : workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
623 0 : workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
624 0 : workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
625 : _mm_store_si128((__m128i *)&flat_oq0[0], workp_shft);
626 :
627 0 : workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
628 0 : workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
629 0 : workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
630 : _mm_store_si128((__m128i *)&flat_oq1[0], workp_shft);
631 :
632 0 : workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
633 0 : workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
634 0 : workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
635 : _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft);
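// Per lane these stores hold the 8-tap (flat) results, built with the same
// sliding-sum trick as the 16-wide filter; written out (a sketch):
//   flat_op2[i] = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3
//   flat_oq0[i] = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3
// and analogously for op1, op0, oq1 and oq2.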
636 :
637 : // lp filter
638 0 : filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
639 0 : filt = _mm_and_si128(filt, hev);
640 0 : work_a = _mm_subs_epi16(qs0, ps0);
641 0 : filt = _mm_adds_epi16(filt, work_a);
642 0 : filt = _mm_adds_epi16(filt, work_a);
643 0 : filt = _mm_adds_epi16(filt, work_a);
644 : // (aom_filter + 3 * (qs0 - ps0)) & mask
645 0 : filt = signed_char_clamp_bd_sse2(filt, bd);
646 0 : filt = _mm_and_si128(filt, mask);
647 :
648 0 : filter1 = _mm_adds_epi16(filt, t4);
649 0 : filter2 = _mm_adds_epi16(filt, t3);
650 :
651 : // Filter1 >> 3
652 0 : filter1 = signed_char_clamp_bd_sse2(filter1, bd);
653 0 : filter1 = _mm_srai_epi16(filter1, 3);
654 :
655 : // Filter2 >> 3
656 0 : filter2 = signed_char_clamp_bd_sse2(filter2, bd);
657 0 : filter2 = _mm_srai_epi16(filter2, 3);
658 :
659 : // filt >> 1
660 0 : filt = _mm_adds_epi16(filter1, t1);
661 0 : filt = _mm_srai_epi16(filt, 1);
662 : // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
663 0 : filt = _mm_andnot_si128(hev, filt);
664 :
665 0 : work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd);
666 0 : work_a = _mm_adds_epi16(work_a, t80);
667 0 : q0 = _mm_load_si128((__m128i *)flat_oq0);
668 0 : work_a = _mm_andnot_si128(flat, work_a);
669 0 : q0 = _mm_and_si128(flat, q0);
670 0 : q0 = _mm_or_si128(work_a, q0);
671 :
672 0 : work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd);
673 0 : work_a = _mm_adds_epi16(work_a, t80);
674 0 : q1 = _mm_load_si128((__m128i *)flat_oq1);
675 0 : work_a = _mm_andnot_si128(flat, work_a);
676 0 : q1 = _mm_and_si128(flat, q1);
677 0 : q1 = _mm_or_si128(work_a, q1);
678 :
679 0 : work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
680 0 : q2 = _mm_load_si128((__m128i *)flat_oq2);
681 0 : work_a = _mm_andnot_si128(flat, work_a);
682 0 : q2 = _mm_and_si128(flat, q2);
683 0 : q2 = _mm_or_si128(work_a, q2);
684 :
685 0 : work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd);
686 0 : work_a = _mm_adds_epi16(work_a, t80);
687 0 : p0 = _mm_load_si128((__m128i *)flat_op0);
688 0 : work_a = _mm_andnot_si128(flat, work_a);
689 0 : p0 = _mm_and_si128(flat, p0);
690 0 : p0 = _mm_or_si128(work_a, p0);
691 :
692 0 : work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd);
693 0 : work_a = _mm_adds_epi16(work_a, t80);
694 0 : p1 = _mm_load_si128((__m128i *)flat_op1);
695 0 : work_a = _mm_andnot_si128(flat, work_a);
696 0 : p1 = _mm_and_si128(flat, p1);
697 0 : p1 = _mm_or_si128(work_a, p1);
698 :
699 0 : work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
700 0 : p2 = _mm_load_si128((__m128i *)flat_op2);
701 0 : work_a = _mm_andnot_si128(flat, work_a);
702 0 : p2 = _mm_and_si128(flat, p2);
703 0 : p2 = _mm_or_si128(work_a, p2);
704 :
705 0 : _mm_store_si128((__m128i *)(s - 3 * p), p2);
706 0 : _mm_store_si128((__m128i *)(s - 2 * p), p1);
707 0 : _mm_store_si128((__m128i *)(s - 1 * p), p0);
708 : _mm_store_si128((__m128i *)(s + 0 * p), q0);
709 0 : _mm_store_si128((__m128i *)(s + 1 * p), q1);
710 : _mm_store_si128((__m128i *)(s + 2 * p), q2);
711 0 : }
712 :
713 0 : void aom_highbd_lpf_horizontal_8_dual_sse2(
714 : uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
715 : const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
716 : const uint8_t *_thresh1, int bd) {
717 0 : aom_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
718 0 : aom_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
719 0 : }
720 :
721 0 : void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
722 : const uint8_t *_blimit,
723 : const uint8_t *_limit,
724 : const uint8_t *_thresh, int bd) {
725 0 : const __m128i zero = _mm_set1_epi16(0);
726 : __m128i blimit, limit, thresh;
727 : __m128i mask, hev, flat;
728 0 : __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
729 0 : __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
730 0 : __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
731 0 : __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
732 0 : __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
733 0 : __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
734 0 : __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
735 0 : __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
736 0 : const __m128i abs_p1p0 =
737 0 : _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
738 0 : const __m128i abs_q1q0 =
739 0 : _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
740 0 : const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
741 0 : const __m128i one = _mm_set1_epi16(1);
742 0 : __m128i abs_p0q0 =
743 0 : _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
744 0 : __m128i abs_p1q1 =
745 0 : _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
746 : __m128i work;
747 0 : const __m128i t4 = _mm_set1_epi16(4);
748 0 : const __m128i t3 = _mm_set1_epi16(3);
749 : __m128i t80;
750 : __m128i tff80;
751 : __m128i tffe0;
752 : __m128i t1f;
753 : // t1f is equivalent to shifting 0x1f left by bitdepth - 8
754 : // and setting the new bits to 1
755 0 : const __m128i t1 = _mm_set1_epi16(0x1);
756 : __m128i t7f;
757 : // t7f is equivalent to shifting 0x7f left by bitdepth - 8
758 : // and setting the new bits to 1
759 : __m128i ps1, ps0, qs0, qs1;
760 : __m128i filt;
761 : __m128i work_a;
762 : __m128i filter1, filter2;
763 :
764 0 : if (bd == 8) {
765 0 : blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
766 0 : limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
767 0 : thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
768 0 : t80 = _mm_set1_epi16(0x80);
769 0 : tff80 = _mm_set1_epi16(0xff80);
770 0 : tffe0 = _mm_set1_epi16(0xffe0);
771 0 : t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8);
772 0 : t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8);
773 0 : } else if (bd == 10) {
774 0 : blimit = _mm_slli_epi16(
775 : _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
776 0 : limit = _mm_slli_epi16(
777 : _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
778 0 : thresh = _mm_slli_epi16(
779 : _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
780 0 : t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2);
781 0 : tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2);
782 0 : tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2);
783 0 : t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6);
784 0 : t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6);
785 : } else { // bd == 12
786 0 : blimit = _mm_slli_epi16(
787 : _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
788 0 : limit = _mm_slli_epi16(
789 : _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
790 0 : thresh = _mm_slli_epi16(
791 : _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
792 0 : t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4);
793 0 : tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4);
794 0 : tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4);
795 0 : t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4);
796 0 : t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4);
797 : }
798 :
799 0 : ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
800 0 : ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
801 0 : qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
802 0 : qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
803 :
804 : // filter_mask and hev_mask
805 0 : flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
806 0 : hev = _mm_subs_epu16(flat, thresh);
807 0 : hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);
808 :
809 0 : abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
810 0 : abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
811 0 : mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
812 0 : mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
813 : // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
814 : // So taking maximums continues to work:
815 0 : mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
816 0 : mask = _mm_max_epi16(flat, mask);
817 : // mask |= (abs(p1 - p0) > limit) * -1;
818 : // mask |= (abs(q1 - q0) > limit) * -1;
819 0 : work = _mm_max_epi16(
820 : _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
821 : _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)));
822 0 : mask = _mm_max_epi16(work, mask);
823 0 : work = _mm_max_epi16(
824 : _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)),
825 : _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
826 0 : mask = _mm_max_epi16(work, mask);
827 0 : mask = _mm_subs_epu16(mask, limit);
828 0 : mask = _mm_cmpeq_epi16(mask, zero);
829 :
830 : // filter4
831 0 : filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
832 0 : filt = _mm_and_si128(filt, hev);
833 0 : work_a = _mm_subs_epi16(qs0, ps0);
834 0 : filt = _mm_adds_epi16(filt, work_a);
835 0 : filt = _mm_adds_epi16(filt, work_a);
836 0 : filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
837 :
838 : // (aom_filter + 3 * (qs0 - ps0)) & mask
839 0 : filt = _mm_and_si128(filt, mask);
840 :
841 0 : filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
842 0 : filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);
843 :
844 : // Filter1 >> 3
845 0 : work_a = _mm_cmpgt_epi16(zero, filter1); // get the values that are <0
846 0 : filter1 = _mm_srli_epi16(filter1, 3);
847 0 : work_a = _mm_and_si128(work_a, tffe0); // sign bits for the values < 0
848 0 : filter1 = _mm_and_si128(filter1, t1f); // clamp the range
849 0 : filter1 = _mm_or_si128(filter1, work_a); // reinsert the sign bits
850 :
851 : // Filter2 >> 3
852 0 : work_a = _mm_cmpgt_epi16(zero, filter2);
853 0 : filter2 = _mm_srli_epi16(filter2, 3);
854 0 : work_a = _mm_and_si128(work_a, tffe0);
855 0 : filter2 = _mm_and_si128(filter2, t1f);
856 0 : filter2 = _mm_or_si128(filter2, work_a);
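// The cmpgt/srli/and/or sequences above emulate an arithmetic shift right:
// work_a is 0xFFFF in the negative lanes, t1f keeps the shifted magnitude
// bits and tffe0 re-inserts the sign bits for the bit depth in use. The
// same pattern with t7f/tff80 implements the rounded filt >> 1 below.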
857 :
858 : // filt >> 1
859 0 : filt = _mm_adds_epi16(filter1, t1);
860 0 : work_a = _mm_cmpgt_epi16(zero, filt);
861 0 : filt = _mm_srli_epi16(filt, 1);
862 0 : work_a = _mm_and_si128(work_a, tff80);
863 0 : filt = _mm_and_si128(filt, t7f);
864 0 : filt = _mm_or_si128(filt, work_a);
865 :
866 0 : filt = _mm_andnot_si128(hev, filt);
867 :
868 0 : q0 = _mm_adds_epi16(
869 : signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
870 0 : q1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
871 : t80);
872 0 : p0 = _mm_adds_epi16(
873 : signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
874 0 : p1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
875 : t80);
876 :
877 0 : _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
878 0 : _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
879 : _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
880 0 : _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
881 0 : }
882 :
883 0 : void aom_highbd_lpf_horizontal_4_dual_sse2(
884 : uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
885 : const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
886 : const uint8_t *_thresh1, int bd) {
887 0 : aom_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
888 0 : aom_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
889 0 : }
890 :
891 0 : static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[],
892 : int out_p, int num_8x8_to_transpose) {
893 0 : int idx8x8 = 0;
894 : __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7;
895 : do {
896 0 : uint16_t *in = src[idx8x8];
897 0 : uint16_t *out = dst[idx8x8];
898 :
899 0 : p0 =
900 : _mm_loadu_si128((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07
901 0 : p1 =
902 0 : _mm_loadu_si128((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17
903 0 : p2 =
904 0 : _mm_loadu_si128((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27
905 0 : p3 =
906 0 : _mm_loadu_si128((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37
907 0 : p4 =
908 0 : _mm_loadu_si128((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47
909 0 : p5 =
910 0 : _mm_loadu_si128((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57
911 0 : p6 =
912 0 : _mm_loadu_si128((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67
913 0 : p7 =
914 0 : _mm_loadu_si128((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77
915 : // 00 10 01 11 02 12 03 13
916 0 : x0 = _mm_unpacklo_epi16(p0, p1);
917 : // 20 30 21 31 22 32 23 33
918 0 : x1 = _mm_unpacklo_epi16(p2, p3);
919 : // 40 50 41 51 42 52 43 53
920 0 : x2 = _mm_unpacklo_epi16(p4, p5);
921 : // 60 70 61 71 62 72 63 73
922 0 : x3 = _mm_unpacklo_epi16(p6, p7);
923 : // 00 10 20 30 01 11 21 31
924 0 : x4 = _mm_unpacklo_epi32(x0, x1);
925 : // 40 50 60 70 41 51 61 71
926 0 : x5 = _mm_unpacklo_epi32(x2, x3);
927 : // 00 10 20 30 40 50 60 70
928 0 : x6 = _mm_unpacklo_epi64(x4, x5);
929 : // 01 11 21 31 41 51 61 71
930 0 : x7 = _mm_unpackhi_epi64(x4, x5);
931 :
932 : _mm_storeu_si128((__m128i *)(out + 0 * out_p), x6);
933 : // 00 10 20 30 40 50 60 70
934 0 : _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7);
935 : // 01 11 21 31 41 51 61 71
936 :
937 : // 02 12 22 32 03 13 23 33
938 0 : x4 = _mm_unpackhi_epi32(x0, x1);
939 : // 42 52 62 72 43 53 63 73
940 0 : x5 = _mm_unpackhi_epi32(x2, x3);
941 : // 02 12 22 32 42 52 62 72
942 0 : x6 = _mm_unpacklo_epi64(x4, x5);
943 : // 03 13 23 33 43 53 63 73
944 0 : x7 = _mm_unpackhi_epi64(x4, x5);
945 :
946 0 : _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6);
947 : // 02 12 22 32 42 52 62 72
948 0 : _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7);
949 : // 03 13 23 33 43 53 63 73
950 :
951 : // 04 14 05 15 06 16 07 17
952 0 : x0 = _mm_unpackhi_epi16(p0, p1);
953 : // 24 34 25 35 26 36 27 37
954 0 : x1 = _mm_unpackhi_epi16(p2, p3);
955 : // 44 54 45 55 46 56 47 57
956 0 : x2 = _mm_unpackhi_epi16(p4, p5);
957 : // 64 74 65 75 66 76 67 77
958 0 : x3 = _mm_unpackhi_epi16(p6, p7);
959 : // 04 14 24 34 05 15 25 35
960 0 : x4 = _mm_unpacklo_epi32(x0, x1);
961 : // 44 54 64 74 45 55 65 75
962 0 : x5 = _mm_unpacklo_epi32(x2, x3);
963 : // 04 14 24 34 44 54 64 74
964 0 : x6 = _mm_unpacklo_epi64(x4, x5);
965 : // 05 15 25 35 45 55 65 75
966 0 : x7 = _mm_unpackhi_epi64(x4, x5);
967 :
968 0 : _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6);
969 : // 04 14 24 34 44 54 64 74
970 0 : _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7);
971 : // 05 15 25 35 45 55 65 75
972 :
973 : // 06 16 26 36 07 17 27 37
974 0 : x4 = _mm_unpackhi_epi32(x0, x1);
975 : // 46 56 66 76 47 57 67 77
976 0 : x5 = _mm_unpackhi_epi32(x2, x3);
977 : // 06 16 26 36 46 56 66 76
978 0 : x6 = _mm_unpacklo_epi64(x4, x5);
979 : // 07 17 27 37 47 57 67 77
980 0 : x7 = _mm_unpackhi_epi64(x4, x5);
981 :
982 0 : _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6);
983 : // 06 16 26 36 46 56 66 76
984 0 : _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7);
985 : // 07 17 27 37 47 57 67 77
986 0 : } while (++idx8x8 < num_8x8_to_transpose);
987 0 : }
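// Each iteration transposes one 8x8 tile of 16-bit pixels via three levels
// of unpacks; in scalar terms (a sketch) it performs
//   out[j * out_p + i] = in[i * in_p + j]   for 0 <= i, j < 8
// for every src[k]/dst[k] pair passed in.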
988 :
989 0 : static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p,
990 : uint16_t *out, int out_p) {
991 : uint16_t *src0[1];
992 : uint16_t *src1[1];
993 : uint16_t *dest0[1];
994 : uint16_t *dest1[1];
995 0 : src0[0] = in0;
996 0 : src1[0] = in1;
997 0 : dest0[0] = out;
998 0 : dest1[0] = out + 8;
999 0 : highbd_transpose(src0, in_p, dest0, out_p, 1);
1000 0 : highbd_transpose(src1, in_p, dest1, out_p, 1);
1001 0 : }
1002 :
1003 0 : void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
1004 : const uint8_t *limit, const uint8_t *thresh,
1005 : int bd) {
1006 : DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
1007 : uint16_t *src[1];
1008 : uint16_t *dst[1];
1009 :
1010 : // Transpose 8x8
1011 0 : src[0] = s - 4;
1012 0 : dst[0] = t_dst;
1013 :
1014 0 : highbd_transpose(src, p, dst, 8, 1);
1015 :
1016 : // Loop filtering
1017 0 : aom_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
1018 :
1019 0 : src[0] = t_dst;
1020 0 : dst[0] = s - 4;
1021 :
1022 : // Transpose back
1023 0 : highbd_transpose(src, 8, dst, p, 1);
1024 0 : }
1025 :
1026 0 : void aom_highbd_lpf_vertical_4_dual_sse2(
1027 : uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
1028 : const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
1029 : const uint8_t *thresh1, int bd) {
1030 : DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
1031 : uint16_t *src[2];
1032 : uint16_t *dst[2];
1033 :
1034 : // Transpose 8x16
1035 0 : highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
1036 :
1037 : // Loop filtering
1038 0 : aom_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
1039 : thresh0, blimit1, limit1, thresh1, bd);
1040 0 : src[0] = t_dst;
1041 0 : src[1] = t_dst + 8;
1042 0 : dst[0] = s - 4;
1043 0 : dst[1] = s - 4 + p * 8;
1044 :
1045 : // Transpose back
1046 0 : highbd_transpose(src, 16, dst, p, 2);
1047 0 : }
1048 :
1049 0 : void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
1050 : const uint8_t *limit, const uint8_t *thresh,
1051 : int bd) {
1052 : DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
1053 : uint16_t *src[1];
1054 : uint16_t *dst[1];
1055 :
1056 : // Transpose 8x8
1057 0 : src[0] = s - 4;
1058 0 : dst[0] = t_dst;
1059 :
1060 0 : highbd_transpose(src, p, dst, 8, 1);
1061 :
1062 : // Loop filtering
1063 0 : aom_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
1064 :
1065 0 : src[0] = t_dst;
1066 0 : dst[0] = s - 4;
1067 :
1068 : // Transpose back
1069 0 : highbd_transpose(src, 8, dst, p, 1);
1070 0 : }
1071 :
1072 0 : void aom_highbd_lpf_vertical_8_dual_sse2(
1073 : uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
1074 : const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
1075 : const uint8_t *thresh1, int bd) {
1076 : DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
1077 : uint16_t *src[2];
1078 : uint16_t *dst[2];
1079 :
1080 : // Transpose 8x16
1081 0 : highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
1082 :
1083 : // Loop filtering
1084 0 : aom_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
1085 : thresh0, blimit1, limit1, thresh1, bd);
1086 0 : src[0] = t_dst;
1087 0 : src[1] = t_dst + 8;
1088 :
1089 0 : dst[0] = s - 4;
1090 0 : dst[1] = s - 4 + p * 8;
1091 :
1092 : // Transpose back
1093 0 : highbd_transpose(src, 16, dst, p, 2);
1094 0 : }
1095 :
1096 0 : void aom_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
1097 : const uint8_t *limit,
1098 : const uint8_t *thresh, int bd) {
1099 : DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]);
1100 : uint16_t *src[2];
1101 : uint16_t *dst[2];
1102 :
1103 0 : src[0] = s - 8;
1104 0 : src[1] = s;
1105 0 : dst[0] = t_dst;
1106 0 : dst[1] = t_dst + 8 * 8;
1107 :
1108 : // Transpose 16x8
1109 0 : highbd_transpose(src, p, dst, 8, 2);
1110 :
1111 : // Loop filtering
1112 0 : aom_highbd_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh,
1113 : bd);
1114 0 : src[0] = t_dst;
1115 0 : src[1] = t_dst + 8 * 8;
1116 0 : dst[0] = s - 8;
1117 0 : dst[1] = s;
1118 :
1119 : // Transpose back
1120 0 : highbd_transpose(src, 8, dst, p, 2);
1121 0 : }
1122 :
1123 0 : void aom_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p,
1124 : const uint8_t *blimit,
1125 : const uint8_t *limit,
1126 : const uint8_t *thresh, int bd) {
1127 : DECLARE_ALIGNED(16, uint16_t, t_dst[256]);
1128 :
1129 : // Transpose 16x16
1130 0 : highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
1131 0 : highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
1132 :
1133 : // Loop filtering
1134 0 : aom_highbd_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit,
1135 : thresh, bd);
1136 :
1137 : // Transpose back
1138 0 : highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
1139 0 : highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
1140 0 : }