/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "./av1_rtcd.h"
#include "./cdef_simd.h"
#include "./od_dering.h"

/* partial A is a 16-bit vector of the form:
   [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
   [0  y1 y2 y3 y4 y5 y6 y7].
   This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
   (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
   and const2. */
static INLINE v128 fold_mul_and_sum(v128 partiala, v128 partialb, v128 const1,
                                    v128 const2) {
  v128 tmp;
  /* Reverse partial B. */
  partialb = v128_shuffle_8(
      partialb, v128_from_32(0x0f0e0100, 0x03020504, 0x07060908, 0x0b0a0d0c));
  /* Interleave the x and y values of identical indices and pair x8 with 0. */
  tmp = partiala;
  partiala = v128_ziplo_16(partialb, partiala);
  partialb = v128_ziphi_16(partialb, tmp);
  /* Square and add the corresponding x and y values. */
  partiala = v128_madd_s16(partiala, partiala);
  partialb = v128_madd_s16(partialb, partialb);
  /* Multiply by constant. */
  partiala = v128_mullo_s32(partiala, const1);
  partialb = v128_mullo_s32(partialb, const2);
  /* Sum all results. */
  partiala = v128_add_32(partiala, partialb);
  return partiala;
}

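/* Illustrative sketch, not part of the original SIMD code (wrapped in #if 0
   so it never enters the build): a scalar model of the value folded together
   by fold_mul_and_sum().  x[k] holds x(k+1) from partial A, y[k] holds y(k+1)
   from partial B (with y[7] = 0 standing in for the missing y8) and c[k]
   holds C(k+1) from const1/const2.  The SIMD version leaves these products
   spread across four 32-bit lanes; summing the lanes, as hsum4() below
   eventually does, gives this scalar value. */
#if 0
static int32_t fold_mul_and_sum_scalar(const int16_t x[8], const int16_t y[8],
                                       const int32_t c[8]) {
  int32_t sum = 0;
  int k;
  for (k = 0; k < 8; k++) sum += (x[k] * x[k] + y[k] * y[k]) * c[k];
  return sum;
}
#endif
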
/* Horizontal sum: returns a vector whose 32-bit lane i holds the sum of the
   four 32-bit lanes of xi. */
static INLINE v128 hsum4(v128 x0, v128 x1, v128 x2, v128 x3) {
  v128 t0, t1, t2, t3;
  t0 = v128_ziplo_32(x1, x0);
  t1 = v128_ziplo_32(x3, x2);
  t2 = v128_ziphi_32(x1, x0);
  t3 = v128_ziphi_32(x3, x2);
  x0 = v128_ziplo_64(t1, t0);
  x1 = v128_ziphi_64(t1, t0);
  x2 = v128_ziplo_64(t3, t2);
  x3 = v128_ziphi_64(t3, t2);
  return v128_add_32(v128_add_32(x0, x1), v128_add_32(x2, x3));
}

/* Computes cost for directions 0, 5, 6 and 7. We can call this function again
   to compute the remaining directions. */
static INLINE v128 compute_directions(v128 lines[8], int32_t tmp_cost1[4]) {
  v128 partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
  v128 partial6;
  v128 tmp;
  /* Partial sums for lines 0 and 1. */
  partial4a = v128_shl_n_byte(lines[0], 14);
  partial4b = v128_shr_n_byte(lines[0], 2);
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[1], 12));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[1], 4));
  tmp = v128_add_16(lines[0], lines[1]);
  partial5a = v128_shl_n_byte(tmp, 10);
  partial5b = v128_shr_n_byte(tmp, 6);
  partial7a = v128_shl_n_byte(tmp, 4);
  partial7b = v128_shr_n_byte(tmp, 12);
  partial6 = tmp;

  /* Partial sums for lines 2 and 3. */
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[2], 10));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[2], 6));
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[3], 8));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[3], 8));
  tmp = v128_add_16(lines[2], lines[3]);
  partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 8));
  partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 8));
  partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 6));
  partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 10));
  partial6 = v128_add_16(partial6, tmp);

  /* Partial sums for lines 4 and 5. */
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[4], 6));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[4], 10));
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[5], 4));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[5], 12));
  tmp = v128_add_16(lines[4], lines[5]);
  partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 6));
  partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 10));
  partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 8));
  partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 8));
  partial6 = v128_add_16(partial6, tmp);

  /* Partial sums for lines 6 and 7. */
  partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[6], 2));
  partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[6], 14));
  partial4a = v128_add_16(partial4a, lines[7]);
  tmp = v128_add_16(lines[6], lines[7]);
  partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 4));
  partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 12));
  partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 10));
  partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 6));
  partial6 = v128_add_16(partial6, tmp);

  /* Compute costs in terms of partial sums. */
  partial4a =
      fold_mul_and_sum(partial4a, partial4b, v128_from_32(210, 280, 420, 840),
                       v128_from_32(105, 120, 140, 168));
  partial7a =
      fold_mul_and_sum(partial7a, partial7b, v128_from_32(210, 420, 0, 0),
                       v128_from_32(105, 105, 105, 140));
  partial5a =
      fold_mul_and_sum(partial5a, partial5b, v128_from_32(210, 420, 0, 0),
                       v128_from_32(105, 105, 105, 140));
  partial6 = v128_madd_s16(partial6, partial6);
  partial6 = v128_mullo_s32(partial6, v128_dup_32(105));

  partial4a = hsum4(partial4a, partial5a, partial6, partial7a);
  v128_store_unaligned(tmp_cost1, partial4a);
  return partial4a;
}

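/* Illustrative sketch, not part of the original code (kept out of the build
   with #if 0).  Assuming the weighting implied by the constants above, which
   are all of the form 840/k for line lengths k = 1..8, each direction cost
   accumulated by compute_directions() follows the model

     cost[d] = sum_i (840 / N[d][i]) * (sum of the pixels on line i of d)^2

   where N[d][i] is the number of pixels on the i-th line of direction d and
   840 is the least common multiple of the possible line lengths 1..8, so
   every weight stays an integer.  In scalar form: */
#if 0
static int32_t direction_cost_sketch(const int32_t line_sum[],
                                     const int line_len[], int nb_lines) {
  int32_t cost = 0;
  int i;
  for (i = 0; i < nb_lines; i++)
    cost += (840 / line_len[i]) * line_sum[i] * line_sum[i];
  return cost;
}
#endif
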
/* transpose and reverse the order of the lines -- equivalent to a 90-degree
   counter-clockwise rotation of the pixels. */
static INLINE void array_reverse_transpose_8x8(v128 *in, v128 *res) {
  const v128 tr0_0 = v128_ziplo_16(in[1], in[0]);
  const v128 tr0_1 = v128_ziplo_16(in[3], in[2]);
  const v128 tr0_2 = v128_ziphi_16(in[1], in[0]);
  const v128 tr0_3 = v128_ziphi_16(in[3], in[2]);
  const v128 tr0_4 = v128_ziplo_16(in[5], in[4]);
  const v128 tr0_5 = v128_ziplo_16(in[7], in[6]);
  const v128 tr0_6 = v128_ziphi_16(in[5], in[4]);
  const v128 tr0_7 = v128_ziphi_16(in[7], in[6]);

  const v128 tr1_0 = v128_ziplo_32(tr0_1, tr0_0);
  const v128 tr1_1 = v128_ziplo_32(tr0_5, tr0_4);
  const v128 tr1_2 = v128_ziphi_32(tr0_1, tr0_0);
  const v128 tr1_3 = v128_ziphi_32(tr0_5, tr0_4);
  const v128 tr1_4 = v128_ziplo_32(tr0_3, tr0_2);
  const v128 tr1_5 = v128_ziplo_32(tr0_7, tr0_6);
  const v128 tr1_6 = v128_ziphi_32(tr0_3, tr0_2);
  const v128 tr1_7 = v128_ziphi_32(tr0_7, tr0_6);

  res[7] = v128_ziplo_64(tr1_1, tr1_0);
  res[6] = v128_ziphi_64(tr1_1, tr1_0);
  res[5] = v128_ziplo_64(tr1_3, tr1_2);
  res[4] = v128_ziphi_64(tr1_3, tr1_2);
  res[3] = v128_ziplo_64(tr1_5, tr1_4);
  res[2] = v128_ziphi_64(tr1_5, tr1_4);
  res[1] = v128_ziplo_64(tr1_7, tr1_6);
  res[0] = v128_ziphi_64(tr1_7, tr1_6);
}

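/* Illustrative scalar sketch of the index mapping above, not part of the
   original code (kept out of the build with #if 0).  Treating each v128 as a
   row of eight 16-bit values, output row k is input column 7 - k, i.e. a
   90-degree counter-clockwise rotation. */
#if 0
static void reverse_transpose_8x8_sketch(const int16_t in[8][8],
                                         int16_t out[8][8]) {
  int i, k;
  for (k = 0; k < 8; k++)
    for (i = 0; i < 8; i++) out[k][i] = in[i][7 - k];
}
#endif
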
int SIMD_FUNC(od_dir_find8)(const od_dering_in *img, int stride, int32_t *var,
                            int coeff_shift) {
  int i;
  int32_t cost[8];
  int32_t best_cost = 0;
  int best_dir = 0;
  v128 lines[8];
  for (i = 0; i < 8; i++) {
    lines[i] = v128_load_unaligned(&img[i * stride]);
    lines[i] =
        v128_sub_16(v128_shr_s16(lines[i], coeff_shift), v128_dup_16(128));
  }

#if defined(__SSE4_1__)
  /* Compute "mostly vertical" directions. */
  __m128i dir47 = compute_directions(lines, cost + 4);

  array_reverse_transpose_8x8(lines, lines);

  /* Compute "mostly horizontal" directions. */
  __m128i dir03 = compute_directions(lines, cost);

  __m128i max = _mm_max_epi32(dir03, dir47);
  max = _mm_max_epi32(max, _mm_shuffle_epi32(max, _MM_SHUFFLE(1, 0, 3, 2)));
  max = _mm_max_epi32(max, _mm_shuffle_epi32(max, _MM_SHUFFLE(2, 3, 0, 1)));
  best_cost = _mm_cvtsi128_si32(max);
  __m128i t =
      _mm_packs_epi32(_mm_cmpeq_epi32(max, dir03), _mm_cmpeq_epi32(max, dir47));
  best_dir = _mm_movemask_epi8(_mm_packs_epi16(t, t));
  best_dir = get_msb(best_dir ^ (best_dir - 1));  // Count trailing zeros
#else
  /* Compute "mostly vertical" directions. */
  compute_directions(lines, cost + 4);

  array_reverse_transpose_8x8(lines, lines);

  /* Compute "mostly horizontal" directions. */
  compute_directions(lines, cost);

  for (i = 0; i < 8; i++) {
    if (cost[i] > best_cost) {
      best_cost = cost[i];
      best_dir = i;
    }
  }
#endif

  /* Difference between the optimal variance and the variance along the
     orthogonal direction. Again, the sum(x^2) terms cancel out. */
  *var = best_cost - cost[(best_dir + 4) & 7];
  /* We'd normally divide by 840, but dividing by 1024 is close enough
     for what we're going to do with this. */
  *var >>= 10;
  return best_dir;
}

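/* Illustrative note, not part of the original code (kept out of the build
   with #if 0): every cost above carries the common scale factor 840, so the
   exact variance difference would be (best_cost - cost[(best_dir + 4) & 7])
   divided by 840.  The >> 10 above divides by 1024 instead, so the returned
   *var is the exact value scaled by roughly 840/1024 ~= 0.82, which the
   comment above deems close enough. */
#if 0
static int32_t dir_variance_exact_sketch(int32_t best_cost,
                                         int32_t ortho_cost) {
  return (best_cost - ortho_cost) / 840;
}
#endif
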
void SIMD_FUNC(od_filter_dering_direction_4x4)(uint16_t *y, int ystride,
                                               const uint16_t *in,
                                               int threshold, int dir,
                                               int damping) {
  int i;
  v128 p0, p1, sum, row, res;
  int o1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
  int o2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];

  /* Fold log2(threshold) into the damping shift used by constrain16(). */
  if (threshold) damping -= get_msb(threshold);
  for (i = 0; i < 4; i += 2) {
    sum = v128_zero();
    row = v128_from_v64(v64_load_aligned(&in[i * OD_FILT_BSTRIDE]),
                        v64_load_aligned(&in[(i + 1) * OD_FILT_BSTRIDE]));

    // p0 = constrain16(in[i*OD_FILT_BSTRIDE + offset], row, threshold, damping)
    p0 = v128_from_v64(v64_load_unaligned(&in[i * OD_FILT_BSTRIDE + o1]),
                       v64_load_unaligned(&in[(i + 1) * OD_FILT_BSTRIDE + o1]));
    p0 = constrain16(p0, row, threshold, damping);

    // p1 = constrain16(in[i*OD_FILT_BSTRIDE - offset], row, threshold, damping)
    p1 = v128_from_v64(v64_load_unaligned(&in[i * OD_FILT_BSTRIDE - o1]),
                       v64_load_unaligned(&in[(i + 1) * OD_FILT_BSTRIDE - o1]));
    p1 = constrain16(p1, row, threshold, damping);

    // sum += 4 * (p0 + p1)
    sum = v128_add_16(sum, v128_shl_n_16(v128_add_16(p0, p1), 2));

    // p0 = constrain16(in[i*OD_FILT_BSTRIDE + offset], row, threshold, damping)
    p0 = v128_from_v64(v64_load_unaligned(&in[i * OD_FILT_BSTRIDE + o2]),
                       v64_load_unaligned(&in[(i + 1) * OD_FILT_BSTRIDE + o2]));
    p0 = constrain16(p0, row, threshold, damping);

    // p1 = constrain16(in[i*OD_FILT_BSTRIDE - offset], row, threshold, damping)
    p1 = v128_from_v64(v64_load_unaligned(&in[i * OD_FILT_BSTRIDE - o2]),
                       v64_load_unaligned(&in[(i + 1) * OD_FILT_BSTRIDE - o2]));
    p1 = constrain16(p1, row, threshold, damping);

    // sum += 1 * (p0 + p1)
    sum = v128_add_16(sum, v128_add_16(p0, p1));

    // res = row + ((sum + 8) >> 4)
    res = v128_add_16(sum, v128_dup_16(8));
    res = v128_shr_n_s16(res, 4);
    res = v128_add_16(row, res);
    v64_store_aligned(&y[i * ystride], v128_high_v64(res));
    v64_store_aligned(&y[(i + 1) * ystride], v128_low_v64(res));
  }
}

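/* Illustrative scalar sketch, not part of the original code (kept out of the
   build with #if 0), of the per-pixel update performed by the directional
   filters once the constrained differences are available: d[k][0]/d[k][1]
   are the constrain16() outputs for the pixels at +/- the k-th direction
   offset, w[k] is the tap weight (4, 1 in the 4x4 filter above; 3, 2, 1 in
   the 8x8 filter below) and x is the unfiltered pixel. */
#if 0
static uint16_t dering_filter_pixel_sketch(uint16_t x, const int d[][2],
                                           const int w[], int ntaps) {
  int sum = 0, k;
  for (k = 0; k < ntaps; k++) sum += w[k] * (d[k][0] + d[k][1]);
  /* Same (sum + 8) >> 4 rounding as the vector code, which uses an
     arithmetic 16-bit shift since sum may be negative. */
  return (uint16_t)(x + ((sum + 8) >> 4));
}
#endif
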
void SIMD_FUNC(od_filter_dering_direction_8x8)(uint16_t *y, int ystride,
                                               const uint16_t *in,
                                               int threshold, int dir,
                                               int damping) {
  int i;
  v128 sum, p0, p1, row, res;
  int o1 = OD_DIRECTION_OFFSETS_TABLE[dir][0];
  int o2 = OD_DIRECTION_OFFSETS_TABLE[dir][1];
  int o3 = OD_DIRECTION_OFFSETS_TABLE[dir][2];

  /* Fold log2(threshold) into the damping shift used by constrain16(). */
  if (threshold) damping -= get_msb(threshold);
  for (i = 0; i < 8; i++) {
    sum = v128_zero();
    row = v128_load_aligned(&in[i * OD_FILT_BSTRIDE]);

    // p0 = constrain16(in[i*OD_FILT_BSTRIDE + offset], row, threshold, damping)
    p0 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + o1]);
    p0 = constrain16(p0, row, threshold, damping);

    // p1 = constrain16(in[i*OD_FILT_BSTRIDE - offset], row, threshold, damping)
    p1 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - o1]);
    p1 = constrain16(p1, row, threshold, damping);

    // sum += 3 * (p0 + p1)
    p0 = v128_add_16(p0, p1);
    p0 = v128_add_16(p0, v128_shl_n_16(p0, 1));
    sum = v128_add_16(sum, p0);

    // p0 = constrain16(in[i*OD_FILT_BSTRIDE + offset], row, threshold, damping)
    p0 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + o2]);
    p0 = constrain16(p0, row, threshold, damping);

    // p1 = constrain16(in[i*OD_FILT_BSTRIDE - offset], row, threshold, damping)
    p1 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - o2]);
    p1 = constrain16(p1, row, threshold, damping);

    // sum += 2 * (p0 + p1)
    p0 = v128_shl_n_16(v128_add_16(p0, p1), 1);
    sum = v128_add_16(sum, p0);

    // p0 = constrain16(in[i*OD_FILT_BSTRIDE + offset], row, threshold, damping)
    p0 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE + o3]);
    p0 = constrain16(p0, row, threshold, damping);

    // p1 = constrain16(in[i*OD_FILT_BSTRIDE - offset], row, threshold, damping)
    p1 = v128_load_unaligned(&in[i * OD_FILT_BSTRIDE - o3]);
    p1 = constrain16(p1, row, threshold, damping);

    // sum += (p0 + p1)
    p0 = v128_add_16(p0, p1);
    sum = v128_add_16(sum, p0);

    // res = row + ((sum + 8) >> 4)
    res = v128_add_16(sum, v128_dup_16(8));
    res = v128_shr_n_s16(res, 4);
    res = v128_add_16(row, res);
    v128_store_unaligned(&y[i * ystride], res);
  }
}

void SIMD_FUNC(copy_8x8_16bit_to_8bit)(uint8_t *dst, int dstride,
                                       const uint16_t *src, int sstride) {
  int i;
  for (i = 0; i < 8; i++) {
    v128 row = v128_load_unaligned(&src[i * sstride]);
    row = v128_pack_s16_u8(row, row);
    v64_store_unaligned(&dst[i * dstride], v128_low_v64(row));
  }
}

void SIMD_FUNC(copy_4x4_16bit_to_8bit)(uint8_t *dst, int dstride,
                                       const uint16_t *src, int sstride) {
  int i;
  for (i = 0; i < 4; i++) {
    v128 row = v128_load_unaligned(&src[i * sstride]);
    row = v128_pack_s16_u8(row, row);
    u32_store_unaligned(&dst[i * dstride], v128_low_u32(row));
  }
}

void SIMD_FUNC(copy_8x8_16bit_to_16bit)(uint16_t *dst, int dstride,
                                        const uint16_t *src, int sstride) {
  int i;
  for (i = 0; i < 8; i++) {
    v128 row = v128_load_unaligned(&src[i * sstride]);
    v128_store_unaligned(&dst[i * dstride], row);
  }
}

void SIMD_FUNC(copy_4x4_16bit_to_16bit)(uint16_t *dst, int dstride,
                                        const uint16_t *src, int sstride) {
  int i;
  for (i = 0; i < 4; i++) {
    v64 row = v64_load_unaligned(&src[i * sstride]);
    v64_store_unaligned(&dst[i * dstride], row);
  }
}

void SIMD_FUNC(copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride,
                                         const uint8_t *src, int sstride, int v,
                                         int h) {
  int i, j;
  for (i = 0; i < v; i++) {
    for (j = 0; j < (h & ~0x7); j += 8) {
      v64 row = v64_load_unaligned(&src[i * sstride + j]);
      v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row));
    }
    for (; j < h; j++) {
      dst[i * dstride + j] = src[i * sstride + j];
    }
  }
}

void SIMD_FUNC(copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride,
                                          const uint16_t *src, int sstride,
                                          int v, int h) {
  int i, j;
  for (i = 0; i < v; i++) {
    for (j = 0; j < (h & ~0x7); j += 8) {
      v128 row = v128_load_unaligned(&src[i * sstride + j]);
      v128_store_unaligned(&dst[i * dstride + j], row);
    }
    for (; j < h; j++) {
      dst[i * dstride + j] = src[i * sstride + j];
    }
  }
}