/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "./av1_rtcd.h"
#include "./cdef_simd.h"
#include "aom_ports/bitops.h"
#include "aom_ports/mem.h"

// sign(a - b) * min(abs(a - b), max(0, strength - (abs(a - b) >> adjdamp)))
SIMD_INLINE v128 constrain(v256 a, v256 b, unsigned int strength,
                           unsigned int adjdamp) {
  const v256 diff16 = v256_sub_16(a, b);
  v128 diff = v128_pack_s16_s8(v256_high_v128(diff16), v256_low_v128(diff16));
  const v128 sign = v128_cmplt_s8(diff, v128_zero());
  diff = v128_abs_s8(diff);
  return v128_xor(
      v128_add_8(sign,
                 v128_min_u8(diff, v128_ssub_u8(v128_dup_8(strength),
                                                v128_shr_u8(diff, adjdamp)))),
      sign);
}
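
// For clarity, a scalar model of constrain() above. This is an illustrative
// sketch only (the helper and its name are not part of the original filter).
// The SIMD version computes sign * x as (x + sign) ^ sign, using the two's
// complement identity -x == (x - 1) ^ -1 with sign == -1 for negative diffs.
SIMD_INLINE int constrain_scalar_model(int a, int b, unsigned int strength,
                                       unsigned int adjdamp) {
  const int diff = a - b;
  const int sign = diff < 0 ? -1 : 1;
  const int mag = diff < 0 ? -diff : diff;            // abs(a - b)
  const int room = (int)strength - (mag >> adjdamp);  // may go negative
  const int limit = room < 0 ? 0 : room;              // max(0, ...)
  return sign * (mag < limit ? mag : limit);          // sign * min(...)
}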

// delta = 1/16 * constrain(a, x, s, d) + 3/16 * constrain(b, x, s, d) +
//         1/16 * constrain(c, x, s, d) + 3/16 * constrain(d, x, s, d) +
//         3/16 * constrain(e, x, s, d) + 1/16 * constrain(f, x, s, d) +
//         3/16 * constrain(g, x, s, d) + 1/16 * constrain(h, x, s, d)
SIMD_INLINE v128 calc_delta(v256 x, v256 a, v256 b, v256 c, v256 d, v256 e,
                            v256 f, v256 g, v256 h, unsigned int s,
                            unsigned int dmp) {
  const v128 bdeg =
      v128_add_8(v128_add_8(constrain(b, x, s, dmp), constrain(d, x, s, dmp)),
                 v128_add_8(constrain(e, x, s, dmp), constrain(g, x, s, dmp)));
  const v128 delta = v128_add_8(
      v128_add_8(v128_add_8(constrain(a, x, s, dmp), constrain(c, x, s, dmp)),
                 v128_add_8(constrain(f, x, s, dmp), constrain(h, x, s, dmp))),
      v128_add_8(v128_add_8(bdeg, bdeg), bdeg));
  return v128_add_8(
      v128_pack_s16_u8(v256_high_v128(x), v256_low_v128(x)),
      v128_shr_s8(
          v128_add_8(v128_dup_8(8),
                     v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
          4));
}
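
// In scalar terms the value returned above is
//   x + ((8 + delta - (delta < 0)) >> 4)
// i.e. delta/16 rounded to nearest with ties away from zero; the
// v128_cmplt_s8() mask supplies the extra -1 for negative deltas.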

// delta = 1/8 * constrain(a, x, s, d) + 3/8 * constrain(b, x, s, d) +
//         3/8 * constrain(c, x, s, d) + 1/8 * constrain(d, x, s, d)
SIMD_INLINE v128 calc_hdelta(v256 x, v256 a, v256 b, v256 c, v256 d,
                             unsigned int s, unsigned int dmp) {
  const v128 bc = v128_add_8(constrain(b, x, s, dmp), constrain(c, x, s, dmp));
  const v128 delta =
      v128_add_8(v128_add_8(constrain(a, x, s, dmp), constrain(d, x, s, dmp)),
                 v128_add_8(v128_add_8(bc, bc), bc));
  return v128_add_8(
      v128_pack_s16_u8(v256_high_v128(x), v256_low_v128(x)),
      v128_shr_s8(
          v128_add_8(v128_dup_8(4),
                     v128_add_8(delta, v128_cmplt_s8(delta, v128_zero()))),
          3));
}
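
// Scalar equivalent of the return value: x + ((4 + delta - (delta < 0)) >> 3),
// i.e. delta/8 rounded to nearest, ties away from zero.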

// Process blocks of width 8, two lines at a time, 8 bit.
static void SIMD_FUNC(clpf_block8)(uint8_t *dst, const uint16_t *src,
                                   int dstride, int sstride, int sizey,
                                   unsigned int strength,
                                   unsigned int adjdamp) {
  int y;

  for (y = 0; y < sizey; y += 2) {
    const v128 l1 = v128_load_aligned(src);
    const v128 l2 = v128_load_aligned(src + sstride);
    const v128 l3 = v128_load_aligned(src - sstride);
    const v128 l4 = v128_load_aligned(src + 2 * sstride);
    const v256 a = v256_from_v128(v128_load_aligned(src - 2 * sstride), l3);
    const v256 b = v256_from_v128(l3, l1);
    const v256 g = v256_from_v128(l2, l4);
    const v256 h = v256_from_v128(l4, v128_load_aligned(src + 3 * sstride));
    const v256 c = v256_from_v128(v128_load_unaligned(src - 2),
                                  v128_load_unaligned(src - 2 + sstride));
    const v256 d = v256_from_v128(v128_load_unaligned(src - 1),
                                  v128_load_unaligned(src - 1 + sstride));
    const v256 e = v256_from_v128(v128_load_unaligned(src + 1),
                                  v128_load_unaligned(src + 1 + sstride));
    const v256 f = v256_from_v128(v128_load_unaligned(src + 2),
                                  v128_load_unaligned(src + 2 + sstride));
    const v128 o = calc_delta(v256_from_v128(l1, l2), a, b, c, d, e, f, g, h,
                              strength, adjdamp);

    v64_store_aligned(dst, v128_high_v64(o));
    v64_store_aligned(dst + dstride, v128_low_v64(o));
    src += sstride * 2;
    dst += dstride * 2;
  }
}

// Process blocks of width 4, four lines at a time, 8 bit.
static void SIMD_FUNC(clpf_block4)(uint8_t *dst, const uint16_t *src,
                                   int dstride, int sstride, int sizey,
                                   unsigned int strength,
                                   unsigned int adjdamp) {
  int y;

  for (y = 0; y < sizey; y += 4) {
    const v64 l0 = v64_load_aligned(src - 2 * sstride);
    const v64 l1 = v64_load_aligned(src - sstride);
    const v64 l2 = v64_load_aligned(src);
    const v64 l3 = v64_load_aligned(src + sstride);
    const v64 l4 = v64_load_aligned(src + 2 * sstride);
    const v64 l5 = v64_load_aligned(src + 3 * sstride);
    const v64 l6 = v64_load_aligned(src + 4 * sstride);
    const v64 l7 = v64_load_aligned(src + 5 * sstride);
    const v128 o = calc_delta(
        v256_from_v64(l2, l3, l4, l5), v256_from_v64(l0, l1, l2, l3),
        v256_from_v64(l1, l2, l3, l4),
        v256_from_v64(v64_load_unaligned(src - 2),
                      v64_load_unaligned(src + sstride - 2),
                      v64_load_unaligned(src + 2 * sstride - 2),
                      v64_load_unaligned(src + 3 * sstride - 2)),
        v256_from_v64(v64_load_unaligned(src - 1),
                      v64_load_unaligned(src + sstride - 1),
                      v64_load_unaligned(src + 2 * sstride - 1),
                      v64_load_unaligned(src + 3 * sstride - 1)),
        v256_from_v64(v64_load_unaligned(src + 1),
                      v64_load_unaligned(src + sstride + 1),
                      v64_load_unaligned(src + 2 * sstride + 1),
                      v64_load_unaligned(src + 3 * sstride + 1)),
        v256_from_v64(v64_load_unaligned(src + 2),
                      v64_load_unaligned(src + sstride + 2),
                      v64_load_unaligned(src + 2 * sstride + 2),
                      v64_load_unaligned(src + 3 * sstride + 2)),
        v256_from_v64(l3, l4, l5, l6), v256_from_v64(l4, l5, l6, l7),
        strength, adjdamp);

    u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12)));
    u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8)));
    u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4)));
    u32_store_aligned(dst + 3 * dstride, v128_low_u32(o));

    dst += 4 * dstride;
    src += 4 * sstride;
  }
}

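// Process blocks of width 8, horizontal filter, two lines at a time, 8 bit.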
static void SIMD_FUNC(clpf_hblock8)(uint8_t *dst, const uint16_t *src,
                                    int dstride, int sstride, int sizey,
                                    unsigned int strength,
                                    unsigned int adjdamp) {
  int y;

  for (y = 0; y < sizey; y += 2) {
    const v256 x = v256_from_v128(v128_load_aligned(src),
                                  v128_load_aligned(src + sstride));
    const v256 a = v256_from_v128(v128_load_unaligned(src - 2),
                                  v128_load_unaligned(src - 2 + sstride));
    const v256 b = v256_from_v128(v128_load_unaligned(src - 1),
                                  v128_load_unaligned(src - 1 + sstride));
    const v256 c = v256_from_v128(v128_load_unaligned(src + 1),
                                  v128_load_unaligned(src + 1 + sstride));
    const v256 d = v256_from_v128(v128_load_unaligned(src + 2),
                                  v128_load_unaligned(src + 2 + sstride));
    const v128 o = calc_hdelta(x, a, b, c, d, strength, adjdamp);

    v64_store_aligned(dst, v128_high_v64(o));
    v64_store_aligned(dst + dstride, v128_low_v64(o));
    src += sstride * 2;
    dst += dstride * 2;
  }
}

// Process blocks of width 4, horizontal filter, four lines at a time, 8 bit.
static void SIMD_FUNC(clpf_hblock4)(uint8_t *dst, const uint16_t *src,
                                    int dstride, int sstride, int sizey,
                                    unsigned int strength,
                                    unsigned int adjdamp) {
  int y;

  for (y = 0; y < sizey; y += 4) {
    const v256 a = v256_from_v64(v64_load_unaligned(src - 2),
                                 v64_load_unaligned(src + sstride - 2),
                                 v64_load_unaligned(src + 2 * sstride - 2),
                                 v64_load_unaligned(src + 3 * sstride - 2));
    const v256 b = v256_from_v64(v64_load_unaligned(src - 1),
                                 v64_load_unaligned(src + sstride - 1),
                                 v64_load_unaligned(src + 2 * sstride - 1),
                                 v64_load_unaligned(src + 3 * sstride - 1));
    const v256 c = v256_from_v64(v64_load_unaligned(src + 1),
                                 v64_load_unaligned(src + sstride + 1),
                                 v64_load_unaligned(src + 2 * sstride + 1),
                                 v64_load_unaligned(src + 3 * sstride + 1));
    const v256 d = v256_from_v64(v64_load_unaligned(src + 2),
                                 v64_load_unaligned(src + sstride + 2),
                                 v64_load_unaligned(src + 2 * sstride + 2),
                                 v64_load_unaligned(src + 3 * sstride + 2));

    const v128 o = calc_hdelta(
        v256_from_v64(v64_load_aligned(src), v64_load_aligned(src + sstride),
                      v64_load_aligned(src + 2 * sstride),
                      v64_load_aligned(src + 3 * sstride)),
        a, b, c, d, strength, adjdamp);

    u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12)));
    u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8)));
    u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4)));
    u32_store_aligned(dst + 3 * dstride, v128_low_u32(o));

    dst += 4 * dstride;
    src += 4 * sstride;
  }
}

void SIMD_FUNC(aom_clpf_block)(uint8_t *dst, const uint16_t *src, int dstride,
                               int sstride, int sizex, int sizey,
                               unsigned int strength, unsigned int dmp) {
  if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) {
    // Fall back to C for odd sizes:
    // * block widths not 4 or 8
    // * block heights not a multiple of 4 if the block width is 4
    aom_clpf_block_c(dst, src, dstride, sstride, sizex, sizey, strength, dmp);
  } else {
    (sizex == 4 ? SIMD_FUNC(clpf_block4) : SIMD_FUNC(clpf_block8))(
        dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength));
  }
}
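
// Note the damping adjustment passed to the kernels: adjdamp is
// dmp - log2(strength).  With illustrative numbers, strength == 4 and
// dmp == 6 give adjdamp == 4, so constrain() tapers to zero once
// |a - x| >= strength << adjdamp == 1 << dmp == 64 (for power-of-two
// strengths).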

void SIMD_FUNC(aom_clpf_hblock)(uint8_t *dst, const uint16_t *src, int dstride,
                                int sstride, int sizex, int sizey,
                                unsigned int strength, unsigned int dmp) {
  if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) {
    // Fall back to C for odd sizes:
    // * block widths not 4 or 8
    // * block heights not a multiple of 4 if the block width is 4
    aom_clpf_hblock_c(dst, src, dstride, sstride, sizex, sizey, strength, dmp);
  } else {
    (sizex == 4 ? SIMD_FUNC(clpf_hblock4) : SIMD_FUNC(clpf_hblock8))(
        dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength));
  }
}

// delta = 1/16 * constrain(a, x, s, d) + 3/16 * constrain(b, x, s, d) +
//         1/16 * constrain(c, x, s, d) + 3/16 * constrain(d, x, s, d) +
//         3/16 * constrain(e, x, s, d) + 1/16 * constrain(f, x, s, d) +
//         3/16 * constrain(g, x, s, d) + 1/16 * constrain(h, x, s, d)
SIMD_INLINE v128 calc_delta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d, v128 e,
                                v128 f, v128 g, v128 h, unsigned int s,
                                unsigned int dmp) {
  const v128 bdeg = v128_add_16(
      v128_add_16(constrain16(b, x, s, dmp), constrain16(d, x, s, dmp)),
      v128_add_16(constrain16(e, x, s, dmp), constrain16(g, x, s, dmp)));
  const v128 delta = v128_add_16(
      v128_add_16(
          v128_add_16(constrain16(a, x, s, dmp), constrain16(c, x, s, dmp)),
          v128_add_16(constrain16(f, x, s, dmp), constrain16(h, x, s, dmp))),
      v128_add_16(v128_add_16(bdeg, bdeg), bdeg));
  return v128_add_16(
      x,
      v128_shr_s16(
          v128_add_16(v128_dup_16(8),
                      v128_add_16(delta, v128_cmplt_s16(delta, v128_zero()))),
          4));
}
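
// Same weighting and rounding as calc_delta(), but everything stays in
// 16-bit lanes (constrain16(), presumably via the included cdef_simd.h), so
// no packing against the pixel values is needed.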

static void calc_delta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
                            v128 f, v128 g, v128 h, uint16_t *dst,
                            unsigned int s, unsigned int dmp, int dstride) {
  o = calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, dmp);
  v64_store_aligned(dst, v128_high_v64(o));
  v64_store_aligned(dst + dstride, v128_low_v64(o));
}

static void calc_delta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
                            v128 f, v128 g, v128 h, uint16_t *dst,
                            unsigned int s, unsigned int adjdamp) {
  v128_store_aligned(dst,
                     calc_delta_hbd(o, a, b, c, d, e, f, g, h, s, adjdamp));
}

// delta = 1/8 * constrain(a, x, s, dmp) + 3/8 * constrain(b, x, s, dmp) +
//         3/8 * constrain(c, x, s, dmp) + 1/8 * constrain(d, x, s, dmp)
SIMD_INLINE v128 calc_hdelta_hbd(v128 x, v128 a, v128 b, v128 c, v128 d,
                                 unsigned int s, unsigned int dmp) {
  const v128 bc =
      v128_add_16(constrain16(b, x, s, dmp), constrain16(c, x, s, dmp));
  const v128 delta = v128_add_16(
      v128_add_16(constrain16(a, x, s, dmp), constrain16(d, x, s, dmp)),
      v128_add_16(v128_add_16(bc, bc), bc));
  return v128_add_16(
      x,
      v128_shr_s16(
          v128_add_16(v128_dup_16(4),
                      v128_add_16(delta, v128_cmplt_s16(delta, v128_zero()))),
          3));
}

static void calc_hdelta_hbd4(v128 o, v128 a, v128 b, v128 c, v128 d,
                             uint16_t *dst, unsigned int s,
                             unsigned int adjdamp, int dstride) {
  o = calc_hdelta_hbd(o, a, b, c, d, s, adjdamp);
  v64_store_aligned(dst, v128_high_v64(o));
  v64_store_aligned(dst + dstride, v128_low_v64(o));
}

static void calc_hdelta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d,
                             uint16_t *dst, unsigned int s,
                             unsigned int adjdamp) {
  v128_store_aligned(dst, calc_hdelta_hbd(o, a, b, c, d, s, adjdamp));
}

// Process blocks of width 4, two lines at a time, high bitdepth.
static void SIMD_FUNC(clpf_block_hbd4)(uint16_t *dst, const uint16_t *src,
                                       int dstride, int sstride, int sizey,
                                       unsigned int strength,
                                       unsigned int adjdamp) {
  int y;

  for (y = 0; y < sizey; y += 2) {
    const v64 l1 = v64_load_aligned(src);
    const v64 l2 = v64_load_aligned(src + sstride);
    const v64 l3 = v64_load_aligned(src - sstride);
    const v64 l4 = v64_load_aligned(src + 2 * sstride);
    const v128 a = v128_from_v64(v64_load_aligned(src - 2 * sstride), l3);
    const v128 b = v128_from_v64(l3, l1);
    const v128 g = v128_from_v64(l2, l4);
    const v128 h = v128_from_v64(l4, v64_load_aligned(src + 3 * sstride));
    const v128 c = v128_from_v64(v64_load_unaligned(src - 2),
                                 v64_load_unaligned(src - 2 + sstride));
    const v128 d = v128_from_v64(v64_load_unaligned(src - 1),
                                 v64_load_unaligned(src - 1 + sstride));
    const v128 e = v128_from_v64(v64_load_unaligned(src + 1),
                                 v64_load_unaligned(src + 1 + sstride));
    const v128 f = v128_from_v64(v64_load_unaligned(src + 2),
                                 v64_load_unaligned(src + 2 + sstride));

    calc_delta_hbd4(v128_from_v64(l1, l2), a, b, c, d, e, f, g, h, dst,
                    strength, adjdamp, dstride);
    src += sstride * 2;
    dst += dstride * 2;
  }
}

// The simplest case: process blocks of width 8, one line at a time, high
// bitdepth. Start here if you need to understand the functions.
static void SIMD_FUNC(clpf_block_hbd)(uint16_t *dst, const uint16_t *src,
                                      int dstride, int sstride, int sizey,
                                      unsigned int strength,
                                      unsigned int adjdamp) {
  int y;

  for (y = 0; y < sizey; y++) {
    const v128 o = v128_load_aligned(src);
    const v128 a = v128_load_aligned(src - 2 * sstride);
    const v128 b = v128_load_aligned(src - 1 * sstride);
    const v128 g = v128_load_aligned(src + sstride);
    const v128 h = v128_load_aligned(src + 2 * sstride);
    const v128 c = v128_load_unaligned(src - 2);
    const v128 d = v128_load_unaligned(src - 1);
    const v128 e = v128_load_unaligned(src + 1);
    const v128 f = v128_load_unaligned(src + 2);

    calc_delta_hbd8(o, a, b, c, d, e, f, g, h, dst, strength, adjdamp);
    src += sstride;
    dst += dstride;
  }
}
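
// In scalar terms, one output pixel of the function above looks like the
// sketch below, built on the illustrative constrain_scalar_model() from
// earlier in this file (the helper name is ours, not libaom's):
SIMD_INLINE uint16_t clpf_pixel_model(const uint16_t *src, int sstride,
                                      unsigned int s, unsigned int d) {
  const int x = src[0];
  const int delta =
      constrain_scalar_model(src[-2 * sstride], x, s, d) +  // weight 1/16
      3 * constrain_scalar_model(src[-sstride], x, s, d) +  // weight 3/16
      constrain_scalar_model(src[-2], x, s, d) +            // weight 1/16
      3 * constrain_scalar_model(src[-1], x, s, d) +        // weight 3/16
      3 * constrain_scalar_model(src[1], x, s, d) +         // weight 3/16
      constrain_scalar_model(src[2], x, s, d) +             // weight 1/16
      3 * constrain_scalar_model(src[sstride], x, s, d) +   // weight 3/16
      constrain_scalar_model(src[2 * sstride], x, s, d);    // weight 1/16
  return (uint16_t)(x + ((8 + delta - (delta < 0)) >> 4));
}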

// Process blocks of width 4, horizontal filter, two lines at a time.
static void SIMD_FUNC(clpf_hblock_hbd4)(uint16_t *dst, const uint16_t *src,
                                        int dstride, int sstride, int sizey,
                                        unsigned int strength,
                                        unsigned int adjdamp) {
  int y;

  for (y = 0; y < sizey; y += 2) {
    const v128 a = v128_from_v64(v64_load_unaligned(src - 2),
                                 v64_load_unaligned(src - 2 + sstride));
    const v128 b = v128_from_v64(v64_load_unaligned(src - 1),
                                 v64_load_unaligned(src - 1 + sstride));
    const v128 c = v128_from_v64(v64_load_unaligned(src + 1),
                                 v64_load_unaligned(src + 1 + sstride));
    const v128 d = v128_from_v64(v64_load_unaligned(src + 2),
                                 v64_load_unaligned(src + 2 + sstride));

    calc_hdelta_hbd4(v128_from_v64(v64_load_unaligned(src),
                                   v64_load_unaligned(src + sstride)),
                     a, b, c, d, dst, strength, adjdamp, dstride);
    src += sstride * 2;
    dst += dstride * 2;
  }
}

// Process blocks of width 8, horizontal filter, one line at a time.
static void SIMD_FUNC(clpf_hblock_hbd)(uint16_t *dst, const uint16_t *src,
                                       int dstride, int sstride, int sizey,
                                       unsigned int strength,
                                       unsigned int adjdamp) {
  int y;

  for (y = 0; y < sizey; y++) {
    const v128 o = v128_load_aligned(src);
    const v128 a = v128_load_unaligned(src - 2);
    const v128 b = v128_load_unaligned(src - 1);
    const v128 c = v128_load_unaligned(src + 1);
    const v128 d = v128_load_unaligned(src + 2);

    calc_hdelta_hbd8(o, a, b, c, d, dst, strength, adjdamp);
    src += sstride;
    dst += dstride;
  }
}

void SIMD_FUNC(aom_clpf_block_hbd)(uint16_t *dst, const uint16_t *src,
                                   int dstride, int sstride, int sizex,
                                   int sizey, unsigned int strength,
                                   unsigned int dmp) {
  if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) {
    // Fall back to C for odd sizes:
    // * block widths not 4 or 8
    // * block heights not a multiple of 2 if the block width is 4
    aom_clpf_block_hbd_c(dst, src, dstride, sstride, sizex, sizey, strength,
                         dmp);
  } else {
    (sizex == 4 ? SIMD_FUNC(clpf_block_hbd4) : SIMD_FUNC(clpf_block_hbd))(
        dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength));
  }
}

void SIMD_FUNC(aom_clpf_hblock_hbd)(uint16_t *dst, const uint16_t *src,
                                    int dstride, int sstride, int sizex,
                                    int sizey, unsigned int strength,
                                    unsigned int dmp) {
  if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) {
    // Fall back to C for odd sizes:
    // * block widths not 4 or 8
    // * block heights not a multiple of 2 if the block width is 4
    aom_clpf_hblock_hbd_c(dst, src, dstride, sstride, sizex, sizey, strength,
                          dmp);
  } else {
    (sizex == 4 ? SIMD_FUNC(clpf_hblock_hbd4) : SIMD_FUNC(clpf_hblock_hbd))(
        dst, src, dstride, sstride, sizey, strength, dmp - get_msb(strength));
  }
}
|