Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <assert.h>
13 : #include <string.h>
14 :
15 : #include "./aom_dsp_rtcd.h"
16 : #include "./av1_rtcd.h"
17 : #include "av1/common/convolve.h"
18 : #include "av1/common/filter.h"
19 : #include "av1/common/onyxc_int.h"
20 : #include "aom_dsp/aom_dsp_common.h"
21 : #include "aom_ports/mem.h"
22 :
// Upper bounds that convolve_helper() and av1_highbd_convolve() assert on:
// block width/height against MAX_BLOCK_WIDTH/MAX_BLOCK_HEIGHT, and the
// x/y step arguments against MAX_STEP.
23 : #define MAX_BLOCK_WIDTH (MAX_SB_SIZE)
24 : #define MAX_BLOCK_HEIGHT (MAX_SB_SIZE)
25 : #define MAX_STEP (32)
26 :
27 0 : void av1_convolve_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst,
28 : int dst_stride, int w, int h,
29 : const InterpFilterParams filter_params,
30 : const int subpel_x_q4, int x_step_q4,
31 : ConvolveParams *conv_params) {
32 : int x, y;
33 0 : int filter_size = filter_params.taps;
34 0 : assert(conv_params->round == CONVOLVE_OPT_ROUND);
35 0 : src -= filter_size / 2 - 1;
36 0 : for (y = 0; y < h; ++y) {
37 0 : int x_q4 = subpel_x_q4;
38 0 : for (x = 0; x < w; ++x) {
39 0 : const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
40 0 : const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
41 : filter_params, x_q4 & SUBPEL_MASK);
42 0 : int k, sum = 0;
43 0 : for (k = 0; k < filter_size; ++k) sum += src_x[k] * x_filter[k];
44 :
45 0 : sum = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
46 0 : if (conv_params->ref)
47 0 : dst[x] = ROUND_POWER_OF_TWO(dst[x] + sum, 1);
48 : else
49 0 : dst[x] = sum;
50 :
51 0 : x_q4 += x_step_q4;
52 : }
53 0 : src += src_stride;
54 0 : dst += dst_stride;
55 : }
56 0 : }
57 :
58 0 : void av1_convolve_vert_c(const uint8_t *src, int src_stride, uint8_t *dst,
59 : int dst_stride, int w, int h,
60 : const InterpFilterParams filter_params,
61 : const int subpel_y_q4, int y_step_q4,
62 : ConvolveParams *conv_params) {
63 : int x, y;
64 0 : int filter_size = filter_params.taps;
65 0 : assert(conv_params->round == CONVOLVE_OPT_ROUND);
66 0 : src -= src_stride * (filter_size / 2 - 1);
67 0 : for (x = 0; x < w; ++x) {
68 0 : int y_q4 = subpel_y_q4;
69 0 : for (y = 0; y < h; ++y) {
70 0 : const uint8_t *const src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
71 0 : const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
72 : filter_params, y_q4 & SUBPEL_MASK);
73 0 : int k, sum = 0;
74 0 : for (k = 0; k < filter_size; ++k)
75 0 : sum += src_y[k * src_stride] * y_filter[k];
76 :
77 0 : sum = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
78 0 : if (conv_params->ref)
79 0 : dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + sum, 1);
80 : else
81 0 : dst[y * dst_stride] = sum;
82 :
83 0 : y_q4 += y_step_q4;
84 : }
85 0 : ++src;
86 0 : ++dst;
87 : }
88 0 : }
89 :
90 0 : static void convolve_copy(const uint8_t *src, int src_stride, uint8_t *dst,
91 : int dst_stride, int w, int h,
92 : ConvolveParams *conv_params) {
93 0 : assert(conv_params->round == CONVOLVE_OPT_ROUND);
94 0 : if (conv_params->ref == 0) {
95 : int r;
96 0 : for (r = 0; r < h; ++r) {
97 0 : memcpy(dst, src, w);
98 0 : src += src_stride;
99 0 : dst += dst_stride;
100 : }
101 : } else {
102 : int r, c;
103 0 : for (r = 0; r < h; ++r) {
104 0 : for (c = 0; c < w; ++c) {
105 0 : dst[c] = clip_pixel(ROUND_POWER_OF_TWO(dst[c] + src[c], 1));
106 : }
107 0 : src += src_stride;
108 0 : dst += dst_stride;
109 : }
110 : }
111 0 : }
112 :
113 0 : void av1_convolve_horiz_facade(const uint8_t *src, int src_stride, uint8_t *dst,
114 : int dst_stride, int w, int h,
115 : const InterpFilterParams filter_params,
116 : const int subpel_x_q4, int x_step_q4,
117 : ConvolveParams *conv_params) {
118 0 : assert(conv_params->round == CONVOLVE_OPT_ROUND);
119 0 : if (filter_params.taps == SUBPEL_TAPS) {
120 0 : const int16_t *filter_x =
121 : av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4);
122 0 : if (conv_params->ref == 0)
123 0 : aom_convolve8_horiz(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
124 : NULL, -1, w, h);
125 : else
126 0 : aom_convolve8_avg_horiz(src, src_stride, dst, dst_stride, filter_x,
127 : x_step_q4, NULL, -1, w, h);
128 : } else {
129 0 : av1_convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params,
130 : subpel_x_q4, x_step_q4, conv_params);
131 : }
132 0 : }
133 :
134 0 : void av1_convolve_horiz_facade_c(const uint8_t *src, int src_stride,
135 : uint8_t *dst, int dst_stride, int w, int h,
136 : const InterpFilterParams filter_params,
137 : const int subpel_x_q4, int x_step_q4,
138 : ConvolveParams *conv_params) {
139 0 : assert(conv_params->round == CONVOLVE_OPT_ROUND);
140 0 : if (filter_params.taps == SUBPEL_TAPS) {
141 0 : const int16_t *filter_x =
142 : av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4);
143 0 : if (conv_params->ref == 0)
144 0 : aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x,
145 : x_step_q4, NULL, -1, w, h);
146 : else
147 0 : aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
148 : x_step_q4, NULL, -1, w, h);
149 : } else {
150 0 : av1_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, filter_params,
151 : subpel_x_q4, x_step_q4, conv_params);
152 : }
153 0 : }
154 :
155 0 : void av1_convolve_vert_facade(const uint8_t *src, int src_stride, uint8_t *dst,
156 : int dst_stride, int w, int h,
157 : const InterpFilterParams filter_params,
158 : const int subpel_y_q4, int y_step_q4,
159 : ConvolveParams *conv_params) {
160 0 : assert(conv_params->round == CONVOLVE_OPT_ROUND);
161 0 : if (filter_params.taps == SUBPEL_TAPS) {
162 0 : const int16_t *filter_y =
163 : av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);
164 0 : if (conv_params->ref == 0) {
165 0 : aom_convolve8_vert(src, src_stride, dst, dst_stride, NULL, -1, filter_y,
166 : y_step_q4, w, h);
167 : } else {
168 0 : aom_convolve8_avg_vert(src, src_stride, dst, dst_stride, NULL, -1,
169 : filter_y, y_step_q4, w, h);
170 : }
171 : } else {
172 0 : av1_convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params,
173 : subpel_y_q4, y_step_q4, conv_params);
174 : }
175 0 : }
176 :
177 0 : void av1_convolve_vert_facade_c(const uint8_t *src, int src_stride,
178 : uint8_t *dst, int dst_stride, int w, int h,
179 : const InterpFilterParams filter_params,
180 : const int subpel_y_q4, int y_step_q4,
181 : ConvolveParams *conv_params) {
182 0 : assert(conv_params->round == CONVOLVE_OPT_ROUND);
183 0 : if (filter_params.taps == SUBPEL_TAPS) {
184 0 : const int16_t *filter_y =
185 : av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);
186 0 : if (conv_params->ref == 0) {
187 0 : aom_convolve8_vert_c(src, src_stride, dst, dst_stride, NULL, -1, filter_y,
188 : y_step_q4, w, h);
189 : } else {
190 0 : aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, NULL, -1,
191 : filter_y, y_step_q4, w, h);
192 : }
193 : } else {
194 0 : av1_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params,
195 : subpel_y_q4, y_step_q4, conv_params);
196 : }
197 0 : }
198 :
199 : #if CONFIG_CONVOLVE_ROUND
// Converts a w x h block of 32-bit intermediate values to 8-bit samples:
// each value is rounded down by `bits` (signed rounding) and clipped with
// clip_pixel before being stored in dst.
void av1_convolve_rounding(const int32_t *src, int src_stride, uint8_t *dst,
                           int dst_stride, int w, int h, int bits) {
  for (int row = 0; row < h; ++row) {
    const int src_base = row * src_stride;
    const int dst_base = row * dst_stride;
    for (int col = 0; col < w; ++col) {
      dst[dst_base + col] =
          clip_pixel(ROUND_POWER_OF_TWO_SIGNED(src[src_base + col], bits));
    }
  }
}
210 :
211 : void av1_convolve_2d(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst,
212 : int dst_stride, int w, int h,
213 : InterpFilterParams *filter_params_x,
214 : InterpFilterParams *filter_params_y, const int subpel_x_q4,
215 : const int subpel_y_q4, ConvolveParams *conv_params) {
216 : int x, y, k;
217 : CONV_BUF_TYPE im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
218 : int im_h = h + filter_params_y->taps - 1;
219 : int im_stride = w;
220 : const int fo_vert = filter_params_y->taps / 2 - 1;
221 : const int fo_horiz = filter_params_x->taps / 2 - 1;
222 : (void)conv_params;
223 : // horizontal filter
224 : const uint8_t *src_horiz = src - fo_vert * src_stride;
225 : const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
226 : *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
227 : for (y = 0; y < im_h; ++y) {
228 : for (x = 0; x < w; ++x) {
229 : CONV_BUF_TYPE sum = 0;
230 : for (k = 0; k < filter_params_x->taps; ++k) {
231 : sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
232 : }
233 : #if CONFIG_COMPOUND_ROUND
234 : im_block[y * im_stride + x] =
235 : clip_pixel(ROUND_POWER_OF_TWO_SIGNED(sum, conv_params->round_0));
236 : #else
237 : im_block[y * im_stride + x] =
238 : ROUND_POWER_OF_TWO_SIGNED(sum, conv_params->round_0);
239 : #endif
240 : }
241 : }
242 :
243 : // vertical filter
244 : CONV_BUF_TYPE *src_vert = im_block + fo_vert * im_stride;
245 : const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
246 : *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
247 : for (y = 0; y < h; ++y) {
248 : for (x = 0; x < w; ++x) {
249 : CONV_BUF_TYPE sum = 0;
250 : for (k = 0; k < filter_params_y->taps; ++k) {
251 : sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
252 : }
253 : dst[y * dst_stride + x] +=
254 : ROUND_POWER_OF_TWO_SIGNED(sum, conv_params->round_1);
255 : }
256 : }
257 : }
258 :
// Writes the transpose of the w-wide, h-tall uint8_t block at src into dst:
// source element (row, col) is stored at destination (col, row).
static INLINE void transpose_uint8(uint8_t *dst, int dst_stride,
                                   const uint8_t *src, int src_stride, int w,
                                   int h) {
  for (int row = 0; row < h; ++row) {
    const int src_base = row * src_stride;
    for (int col = 0; col < w; ++col) {
      dst[col * dst_stride + row] = src[src_base + col];
    }
  }
}
267 :
// Writes the transpose of the w-wide, h-tall int32_t block at src into dst:
// source element (row, col) is stored at destination (col, row).
static INLINE void transpose_int32(int32_t *dst, int dst_stride,
                                   const int32_t *src, int src_stride, int w,
                                   int h) {
  for (int row = 0; row < h; ++row) {
    const int src_base = row * src_stride;
    for (int col = 0; col < w; ++col) {
      dst[col * dst_stride + row] = src[src_base + col];
    }
  }
}
276 :
277 : void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
278 : int dst_stride, int w, int h,
279 : const InterpFilter *interp_filter,
280 : const int subpel_x_q4, int x_step_q4,
281 : const int subpel_y_q4, int y_step_q4,
282 : ConvolveParams *conv_params) {
283 : (void)x_step_q4;
284 : (void)y_step_q4;
285 : (void)dst;
286 : (void)dst_stride;
287 : #if CONFIG_DUAL_FILTER
288 : InterpFilterParams filter_params_x =
289 : av1_get_interp_filter_params(interp_filter[1 + 2 * conv_params->ref]);
290 : InterpFilterParams filter_params_y =
291 : av1_get_interp_filter_params(interp_filter[0 + 2 * conv_params->ref]);
292 :
293 : #if USE_EXTRA_FILTER
294 : if (filter_params_x.interp_filter == MULTITAP_SHARP &&
295 : filter_params_y.interp_filter == MULTITAP_SHARP) {
296 : // Avoid two directions both using 12-tap filter.
297 : // This will reduce hardware implementation cost.
298 : filter_params_y = av1_get_interp_filter_params(EIGHTTAP_SHARP);
299 : }
300 : #endif // USE_EXTRA_FILTER
301 : #else
302 : InterpFilterParams filter_params_x =
303 : av1_get_interp_filter_params(*interp_filter);
304 : InterpFilterParams filter_params_y =
305 : av1_get_interp_filter_params(*interp_filter);
306 : #endif
307 :
308 : if (filter_params_y.taps < filter_params_x.taps) {
309 : uint8_t tr_src[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) *
310 : (MAX_SB_SIZE + MAX_FILTER_TAP - 1)];
311 : int tr_src_stride = MAX_SB_SIZE + MAX_FILTER_TAP - 1;
312 : CONV_BUF_TYPE tr_dst[MAX_SB_SIZE * MAX_SB_SIZE];
313 : int tr_dst_stride = MAX_SB_SIZE;
314 : int fo_vert = filter_params_y.taps / 2 - 1;
315 : int fo_horiz = filter_params_x.taps / 2 - 1;
316 :
317 : transpose_uint8(tr_src, tr_src_stride,
318 : src - fo_vert * src_stride - fo_horiz, src_stride,
319 : w + filter_params_x.taps - 1, h + filter_params_y.taps - 1);
320 : transpose_int32(tr_dst, tr_dst_stride, conv_params->dst,
321 : conv_params->dst_stride, w, h);
322 :
323 : // horizontal and vertical parameters are swapped because of the transpose
324 : av1_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert, tr_src_stride,
325 : tr_dst, tr_dst_stride, h, w, &filter_params_y,
326 : &filter_params_x, subpel_y_q4, subpel_x_q4, conv_params);
327 : transpose_int32(conv_params->dst, conv_params->dst_stride, tr_dst,
328 : tr_dst_stride, h, w);
329 : } else {
330 : av1_convolve_2d(src, src_stride, conv_params->dst, conv_params->dst_stride,
331 : w, h, &filter_params_x, &filter_params_y, subpel_x_q4,
332 : subpel_y_q4, conv_params);
333 : }
334 : }
335 :
336 : #if CONFIG_HIGHBITDEPTH
// Writes the transpose of the w-wide, h-tall uint16_t block at src into dst:
// source element (row, col) is stored at destination (col, row).
static INLINE void transpose_uint16(uint16_t *dst, int dst_stride,
                                    const uint16_t *src, int src_stride, int w,
                                    int h) {
  for (int row = 0; row < h; ++row) {
    const int src_base = row * src_stride;
    for (int col = 0; col < w; ++col) {
      dst[col * dst_stride + row] = src[src_base + col];
    }
  }
}
344 :
// High-bit-depth counterpart of av1_convolve_rounding: each 32-bit
// intermediate value is rounded down by `bits` (signed rounding), clipped
// with clip_pixel_highbd for bit depth `bd`, and stored as a 16-bit sample.
// dst8 is converted to a uint16_t pointer with CONVERT_TO_SHORTPTR.
void av1_highbd_convolve_rounding(const int32_t *src, int src_stride,
                                  uint8_t *dst8, int dst_stride, int w, int h,
                                  int bits, int bd) {
  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
  for (int row = 0; row < h; ++row) {
    const int src_base = row * src_stride;
    const int dst_base = row * dst_stride;
    for (int col = 0; col < w; ++col) {
      dst[dst_base + col] = clip_pixel_highbd(
          ROUND_POWER_OF_TWO_SIGNED(src[src_base + col], bits), bd);
    }
  }
}
357 :
358 : void av1_highbd_convolve_2d(const uint16_t *src, int src_stride,
359 : CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
360 : InterpFilterParams *filter_params_x,
361 : InterpFilterParams *filter_params_y,
362 : const int subpel_x_q4, const int subpel_y_q4,
363 : ConvolveParams *conv_params, int bd) {
364 : int x, y, k;
365 : CONV_BUF_TYPE im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
366 : int im_h = h + filter_params_y->taps - 1;
367 : int im_stride = w;
368 : const int fo_vert = filter_params_y->taps / 2 - 1;
369 : const int fo_horiz = filter_params_x->taps / 2 - 1;
370 : (void)conv_params;
371 : // horizontal filter
372 : const uint16_t *src_horiz = src - fo_vert * src_stride;
373 : const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
374 : *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
375 : for (y = 0; y < im_h; ++y) {
376 : for (x = 0; x < w; ++x) {
377 : CONV_BUF_TYPE sum = 0;
378 : for (k = 0; k < filter_params_x->taps; ++k) {
379 : sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
380 : }
381 : #if CONFIG_COMPOUND_ROUND
382 : im_block[y * im_stride + x] = clip_pixel_highbd(
383 : ROUND_POWER_OF_TWO_SIGNED(sum, conv_params->round_0), bd);
384 : #else
385 : (void)bd;
386 : im_block[y * im_stride + x] =
387 : ROUND_POWER_OF_TWO_SIGNED(sum, conv_params->round_0);
388 : #endif
389 : }
390 : }
391 :
392 : // vertical filter
393 : CONV_BUF_TYPE *src_vert = im_block + fo_vert * im_stride;
394 : const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
395 : *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
396 : for (y = 0; y < h; ++y) {
397 : for (x = 0; x < w; ++x) {
398 : CONV_BUF_TYPE sum = 0;
399 : for (k = 0; k < filter_params_y->taps; ++k) {
400 : sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
401 : }
402 : dst[y * dst_stride + x] +=
403 : ROUND_POWER_OF_TWO_SIGNED(sum, conv_params->round_1);
404 : }
405 : }
406 : }
407 :
408 : void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
409 : uint8_t *dst, int dst_stride, int w, int h,
410 : const InterpFilter *interp_filter,
411 : const int subpel_x_q4, int x_step_q4,
412 : const int subpel_y_q4, int y_step_q4,
413 : ConvolveParams *conv_params, int bd) {
414 : (void)x_step_q4;
415 : (void)y_step_q4;
416 : (void)dst;
417 : (void)dst_stride;
418 : #if CONFIG_DUAL_FILTER
419 : InterpFilterParams filter_params_x =
420 : av1_get_interp_filter_params(interp_filter[1 + 2 * conv_params->ref]);
421 : InterpFilterParams filter_params_y =
422 : av1_get_interp_filter_params(interp_filter[0 + 2 * conv_params->ref]);
423 :
424 : #if USE_EXTRA_FILTER
425 : if (filter_params_x.interp_filter == MULTITAP_SHARP &&
426 : filter_params_y.interp_filter == MULTITAP_SHARP) {
427 : // Avoid two directions both using 12-tap filter.
428 : // This will reduce hardware implementation cost.
429 : filter_params_y = av1_get_interp_filter_params(EIGHTTAP_SHARP);
430 : }
431 : #endif
432 : #else
433 : InterpFilterParams filter_params_x =
434 : av1_get_interp_filter_params(*interp_filter);
435 : InterpFilterParams filter_params_y =
436 : av1_get_interp_filter_params(*interp_filter);
437 : #endif
438 : const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
439 : if (filter_params_y.taps < filter_params_x.taps) {
440 : uint16_t tr_src[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) *
441 : (MAX_SB_SIZE + MAX_FILTER_TAP - 1)];
442 : int tr_src_stride = MAX_SB_SIZE + MAX_FILTER_TAP - 1;
443 : CONV_BUF_TYPE tr_dst[MAX_SB_SIZE * MAX_SB_SIZE];
444 : int tr_dst_stride = MAX_SB_SIZE;
445 : int fo_vert = filter_params_y.taps / 2 - 1;
446 : int fo_horiz = filter_params_x.taps / 2 - 1;
447 :
448 : transpose_uint16(
449 : tr_src, tr_src_stride, src - fo_vert * src_stride - fo_horiz,
450 : src_stride, w + filter_params_x.taps - 1, h + filter_params_y.taps - 1);
451 : transpose_int32(tr_dst, tr_dst_stride, conv_params->dst,
452 : conv_params->dst_stride, w, h);
453 :
454 : // horizontal and vertical parameters are swapped because of the transpose
455 : av1_highbd_convolve_2d(tr_src + fo_horiz * tr_src_stride + fo_vert,
456 : tr_src_stride, tr_dst, tr_dst_stride, h, w,
457 : &filter_params_y, &filter_params_x, subpel_y_q4,
458 : subpel_x_q4, conv_params, bd);
459 : transpose_int32(conv_params->dst, conv_params->dst_stride, tr_dst,
460 : tr_dst_stride, h, w);
461 : } else {
462 : av1_highbd_convolve_2d(src, src_stride, conv_params->dst,
463 : conv_params->dst_stride, w, h, &filter_params_x,
464 : &filter_params_y, subpel_x_q4, subpel_y_q4,
465 : conv_params, bd);
466 : }
467 : }
468 : #endif // CONFIG_HIGHBITDEPTH
469 :
470 : #endif // CONFIG_CONVOLVE_ROUND
471 :
// Signature shared by the 1-D convolution entry points that convolve_helper()
// receives as its convolve_horiz / convolve_vert callbacks (the
// av1_convolve_{horiz,vert}_facade functions and their _c variants).
472 : typedef void (*ConvolveFunc)(const uint8_t *src, int src_stride, uint8_t *dst,
473 : int dst_stride, int w, int h,
474 : const InterpFilterParams filter_params,
475 : const int subpel_q4, int step_q4,
476 : ConvolveParams *conv_params);
477 :
478 0 : static void convolve_helper(const uint8_t *src, int src_stride, uint8_t *dst,
479 : int dst_stride, int w, int h,
480 : #if CONFIG_DUAL_FILTER
481 : const InterpFilter *interp_filter,
482 : #else
483 : const InterpFilter interp_filter,
484 : #endif
485 : const int subpel_x_q4, int x_step_q4,
486 : const int subpel_y_q4, int y_step_q4,
487 : ConvolveParams *conv_params,
488 : ConvolveFunc convolve_horiz,
489 : ConvolveFunc convolve_vert) {
490 0 : int ignore_horiz = x_step_q4 == 16 && subpel_x_q4 == 0;
491 0 : int ignore_vert = y_step_q4 == 16 && subpel_y_q4 == 0;
492 : #if CONFIG_DUAL_FILTER
493 0 : InterpFilterParams filter_params_x =
494 0 : av1_get_interp_filter_params(interp_filter[1 + 2 * conv_params->ref]);
495 0 : InterpFilterParams filter_params_y =
496 0 : av1_get_interp_filter_params(interp_filter[0 + 2 * conv_params->ref]);
497 : InterpFilterParams filter_params;
498 : #else
499 : InterpFilterParams filter_params =
500 : av1_get_interp_filter_params(interp_filter);
501 : #endif
502 0 : assert(conv_params->round == CONVOLVE_OPT_ROUND);
503 :
504 0 : assert(w <= MAX_BLOCK_WIDTH);
505 0 : assert(h <= MAX_BLOCK_HEIGHT);
506 0 : assert(y_step_q4 <= MAX_STEP);
507 0 : assert(x_step_q4 <= MAX_STEP);
508 :
509 0 : if (ignore_horiz && ignore_vert) {
510 0 : convolve_copy(src, src_stride, dst, dst_stride, w, h, conv_params);
511 0 : } else if (ignore_vert) {
512 : #if CONFIG_DUAL_FILTER
513 0 : filter_params = filter_params_x;
514 : #endif
515 0 : assert(filter_params.taps <= MAX_FILTER_TAP);
516 0 : convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params,
517 : subpel_x_q4, x_step_q4, conv_params);
518 0 : } else if (ignore_horiz) {
519 : #if CONFIG_DUAL_FILTER
520 0 : filter_params = filter_params_y;
521 : #endif
522 0 : assert(filter_params.taps <= MAX_FILTER_TAP);
523 0 : convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params,
524 : subpel_y_q4, y_step_q4, conv_params);
525 : } else {
526 : // temp's size is set to a 256 aligned value to facilitate SIMD
527 : // implementation. The value is greater than (maximum possible intermediate
528 : // height or width) * MAX_SB_SIZE
529 : DECLARE_ALIGNED(16, uint8_t,
530 : temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
531 0 : int max_intermediate_size = ((MAX_SB_SIZE * 2 + 16) + 16);
532 : int filter_size;
533 : #if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
534 : if (interp_filter[0 + 2 * conv_params->ref] == MULTITAP_SHARP &&
535 : interp_filter[1 + 2 * conv_params->ref] == MULTITAP_SHARP) {
536 : // Avoid two directions both using 12-tap filter.
537 : // This will reduce hardware implementation cost.
538 : filter_params_y = av1_get_interp_filter_params(EIGHTTAP_SHARP);
539 : }
540 :
541 : // we do filter with fewer taps first to reduce hardware implementation
542 : // complexity
543 : if (filter_params_y.taps < filter_params_x.taps) {
544 : int intermediate_width;
545 : int temp_stride = max_intermediate_size;
546 : ConvolveParams temp_conv_params;
547 : temp_conv_params.ref = 0;
548 : temp_conv_params.round = CONVOLVE_OPT_ROUND;
549 : filter_params = filter_params_y;
550 : filter_size = filter_params_x.taps;
551 : intermediate_width =
552 : (((w - 1) * x_step_q4 + subpel_x_q4) >> SUBPEL_BITS) + filter_size;
553 : assert(intermediate_width <= max_intermediate_size);
554 :
555 : assert(filter_params.taps <= MAX_FILTER_TAP);
556 :
557 : convolve_vert(src - (filter_size / 2 - 1), src_stride, temp, temp_stride,
558 : intermediate_width, h, filter_params, subpel_y_q4,
559 : y_step_q4, &temp_conv_params);
560 :
561 : filter_params = filter_params_x;
562 : assert(filter_params.taps <= MAX_FILTER_TAP);
563 : convolve_horiz(temp + (filter_size / 2 - 1), temp_stride, dst, dst_stride,
564 : w, h, filter_params, subpel_x_q4, x_step_q4, conv_params);
565 : } else
566 : #endif // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
567 : {
568 : int intermediate_height;
569 0 : int temp_stride = MAX_SB_SIZE;
570 : ConvolveParams temp_conv_params;
571 0 : temp_conv_params.ref = 0;
572 0 : temp_conv_params.round = CONVOLVE_OPT_ROUND;
573 : #if CONFIG_DUAL_FILTER
574 0 : filter_params = filter_params_x;
575 0 : filter_size = filter_params_y.taps;
576 : #else
577 : filter_size = filter_params.taps;
578 : #endif
579 0 : intermediate_height =
580 0 : (((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + filter_size;
581 0 : assert(intermediate_height <= max_intermediate_size);
582 : (void)max_intermediate_size;
583 :
584 0 : assert(filter_params.taps <= MAX_FILTER_TAP);
585 :
586 0 : convolve_horiz(src - src_stride * (filter_size / 2 - 1), src_stride, temp,
587 : temp_stride, w, intermediate_height, filter_params,
588 : subpel_x_q4, x_step_q4, &temp_conv_params);
589 :
590 : #if CONFIG_DUAL_FILTER
591 0 : filter_params = filter_params_y;
592 : #endif
593 0 : assert(filter_params.taps <= MAX_FILTER_TAP);
594 :
595 0 : convolve_vert(temp + temp_stride * (filter_size / 2 - 1), temp_stride,
596 : dst, dst_stride, w, h, filter_params, subpel_y_q4,
597 : y_step_q4, conv_params);
598 : }
599 : }
600 0 : }
601 :
602 0 : void av1_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
603 : int dst_stride, int w, int h,
604 : #if CONFIG_DUAL_FILTER
605 : const InterpFilter *interp_filter,
606 : #else
607 : const InterpFilter interp_filter,
608 : #endif
609 : const int subpel_x_q4, int x_step_q4, const int subpel_y_q4,
610 : int y_step_q4, ConvolveParams *conv_params) {
611 0 : convolve_helper(src, src_stride, dst, dst_stride, w, h, interp_filter,
612 : subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, conv_params,
613 : av1_convolve_horiz_facade, av1_convolve_vert_facade);
614 0 : }
615 :
616 0 : void av1_convolve_c(const uint8_t *src, int src_stride, uint8_t *dst,
617 : int dst_stride, int w, int h,
618 : #if CONFIG_DUAL_FILTER
619 : const InterpFilter *interp_filter,
620 : #else
621 : const InterpFilter interp_filter,
622 : #endif
623 : const int subpel_x_q4, int x_step_q4, const int subpel_y_q4,
624 : int y_step_q4, ConvolveParams *conv_params) {
625 0 : convolve_helper(src, src_stride, dst, dst_stride, w, h, interp_filter,
626 : subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, conv_params,
627 : av1_convolve_horiz_facade_c, av1_convolve_vert_facade_c);
628 0 : }
629 :
// A placeholder for SIMD initialization; the C version has nothing to do.
void av1_lowbd_convolve_init_c(void) {}
634 :
// A placeholder for SIMD initialization; the C version has nothing to do.
void av1_highbd_convolve_init_c(void) {}
639 :
640 0 : void av1_convolve_init(AV1_COMMON *cm) {
641 : #if CONFIG_HIGHBITDEPTH
642 0 : if (cm->use_highbitdepth)
643 0 : av1_highbd_convolve_init();
644 : else
645 0 : av1_lowbd_convolve_init();
646 : #else
647 : (void)cm;
648 : av1_lowbd_convolve_init();
649 : #endif
650 0 : return;
651 : }
652 :
653 : #if CONFIG_HIGHBITDEPTH
654 0 : void av1_highbd_convolve_horiz_c(const uint16_t *src, int src_stride,
655 : uint16_t *dst, int dst_stride, int w, int h,
656 : const InterpFilterParams filter_params,
657 : const int subpel_x_q4, int x_step_q4, int avg,
658 : int bd) {
659 : int x, y;
660 0 : int filter_size = filter_params.taps;
661 0 : src -= filter_size / 2 - 1;
662 0 : for (y = 0; y < h; ++y) {
663 0 : int x_q4 = subpel_x_q4;
664 0 : for (x = 0; x < w; ++x) {
665 0 : const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
666 0 : const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
667 : filter_params, x_q4 & SUBPEL_MASK);
668 0 : int k, sum = 0;
669 0 : for (k = 0; k < filter_size; ++k) sum += src_x[k] * x_filter[k];
670 0 : if (avg)
671 0 : dst[x] = ROUND_POWER_OF_TWO(
672 : dst[x] +
673 : clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
674 : 1);
675 : else
676 0 : dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
677 0 : x_q4 += x_step_q4;
678 : }
679 0 : src += src_stride;
680 0 : dst += dst_stride;
681 : }
682 0 : }
683 :
684 0 : void av1_highbd_convolve_vert_c(const uint16_t *src, int src_stride,
685 : uint16_t *dst, int dst_stride, int w, int h,
686 : const InterpFilterParams filter_params,
687 : const int subpel_y_q4, int y_step_q4, int avg,
688 : int bd) {
689 : int x, y;
690 0 : int filter_size = filter_params.taps;
691 0 : src -= src_stride * (filter_size / 2 - 1);
692 :
693 0 : for (x = 0; x < w; ++x) {
694 0 : int y_q4 = subpel_y_q4;
695 0 : for (y = 0; y < h; ++y) {
696 0 : const uint16_t *const src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
697 0 : const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
698 : filter_params, y_q4 & SUBPEL_MASK);
699 0 : int k, sum = 0;
700 0 : for (k = 0; k < filter_size; ++k)
701 0 : sum += src_y[k * src_stride] * y_filter[k];
702 0 : if (avg) {
703 0 : dst[y * dst_stride] = ROUND_POWER_OF_TWO(
704 : dst[y * dst_stride] +
705 : clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
706 : 1);
707 : } else {
708 0 : dst[y * dst_stride] =
709 0 : clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
710 : }
711 0 : y_q4 += y_step_q4;
712 : }
713 0 : ++src;
714 0 : ++dst;
715 : }
716 0 : }
717 :
// Copies a w x h block of 16-bit samples from src to dst without filtering.
// When `avg` is non-zero each source sample is instead averaged (with
// rounding) into the sample already present in dst and clipped with
// clip_pixel_highbd for bit depth `bd`.
static void highbd_convolve_copy(const uint16_t *src, int src_stride,
                                 uint16_t *dst, int dst_stride, int w, int h,
                                 int avg, int bd) {
  for (int row = 0; row < h; ++row) {
    if (avg == 0) {
      memcpy(dst, src, w * sizeof(*src));
    } else {
      for (int col = 0; col < w; ++col) {
        dst[col] =
            clip_pixel_highbd(ROUND_POWER_OF_TWO(dst[col] + src[col], 1), bd);
      }
    }
    src += src_stride;
    dst += dst_stride;
  }
}
739 :
740 0 : void av1_highbd_convolve_horiz_facade(const uint8_t *src8, int src_stride,
741 : uint8_t *dst8, int dst_stride, int w,
742 : int h,
743 : const InterpFilterParams filter_params,
744 : const int subpel_x_q4, int x_step_q4,
745 : int avg, int bd) {
746 0 : uint16_t *src = CONVERT_TO_SHORTPTR(src8);
747 0 : uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
748 0 : if (filter_params.taps == SUBPEL_TAPS) {
749 0 : const int16_t *filter_x =
750 : av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4);
751 0 : if (avg == 0)
752 0 : aom_highbd_convolve8_horiz(src8, src_stride, dst8, dst_stride, filter_x,
753 : x_step_q4, NULL, -1, w, h, bd);
754 : else
755 0 : aom_highbd_convolve8_avg_horiz(src8, src_stride, dst8, dst_stride,
756 : filter_x, x_step_q4, NULL, -1, w, h, bd);
757 : } else {
758 0 : av1_highbd_convolve_horiz(src, src_stride, dst, dst_stride, w, h,
759 : filter_params, subpel_x_q4, x_step_q4, avg, bd);
760 : }
761 0 : }
762 :
763 0 : void av1_highbd_convolve_vert_facade(const uint8_t *src8, int src_stride,
764 : uint8_t *dst8, int dst_stride, int w,
765 : int h,
766 : const InterpFilterParams filter_params,
767 : const int subpel_y_q4, int y_step_q4,
768 : int avg, int bd) {
769 0 : uint16_t *src = CONVERT_TO_SHORTPTR(src8);
770 0 : uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
771 :
772 0 : if (filter_params.taps == SUBPEL_TAPS) {
773 0 : const int16_t *filter_y =
774 : av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);
775 0 : if (avg == 0) {
776 0 : aom_highbd_convolve8_vert(src8, src_stride, dst8, dst_stride, NULL, -1,
777 : filter_y, y_step_q4, w, h, bd);
778 : } else {
779 0 : aom_highbd_convolve8_avg_vert(src8, src_stride, dst8, dst_stride, NULL,
780 : -1, filter_y, y_step_q4, w, h, bd);
781 : }
782 : } else {
783 0 : av1_highbd_convolve_vert(src, src_stride, dst, dst_stride, w, h,
784 : filter_params, subpel_y_q4, y_step_q4, avg, bd);
785 : }
786 0 : }
787 :
788 0 : void av1_highbd_convolve(const uint8_t *src8, int src_stride, uint8_t *dst8,
789 : int dst_stride, int w, int h,
790 : #if CONFIG_DUAL_FILTER
791 : const InterpFilter *interp_filter,
792 : #else
793 : const InterpFilter interp_filter,
794 : #endif
795 : const int subpel_x_q4, int x_step_q4,
796 : const int subpel_y_q4, int y_step_q4, int ref_idx,
797 : int bd) {
798 0 : uint16_t *src = CONVERT_TO_SHORTPTR(src8);
799 0 : uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
800 0 : int ignore_horiz = x_step_q4 == 16 && subpel_x_q4 == 0;
801 0 : int ignore_vert = y_step_q4 == 16 && subpel_y_q4 == 0;
802 :
803 0 : assert(w <= MAX_BLOCK_WIDTH);
804 0 : assert(h <= MAX_BLOCK_HEIGHT);
805 0 : assert(y_step_q4 <= MAX_STEP);
806 0 : assert(x_step_q4 <= MAX_STEP);
807 :
808 0 : if (ignore_horiz && ignore_vert) {
809 0 : highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h, ref_idx, bd);
810 0 : } else if (ignore_vert) {
811 : #if CONFIG_DUAL_FILTER
812 0 : InterpFilterParams filter_params =
813 0 : av1_get_interp_filter_params(interp_filter[1 + 2 * ref_idx]);
814 : #else
815 : InterpFilterParams filter_params =
816 : av1_get_interp_filter_params(interp_filter);
817 : #endif
818 0 : av1_highbd_convolve_horiz_facade(src8, src_stride, dst8, dst_stride, w, h,
819 : filter_params, subpel_x_q4, x_step_q4,
820 : ref_idx, bd);
821 0 : } else if (ignore_horiz) {
822 : #if CONFIG_DUAL_FILTER
823 0 : InterpFilterParams filter_params =
824 0 : av1_get_interp_filter_params(interp_filter[0 + 2 * ref_idx]);
825 : #else
826 : InterpFilterParams filter_params =
827 : av1_get_interp_filter_params(interp_filter);
828 : #endif
829 0 : av1_highbd_convolve_vert_facade(src8, src_stride, dst8, dst_stride, w, h,
830 : filter_params, subpel_y_q4, y_step_q4,
831 : ref_idx, bd);
832 : } else {
833 : // temp's size is set to a 256 aligned value to facilitate SIMD
834 : // implementation. The value is greater than (maximum possible intermediate
835 : // height or width) * MAX_SB_SIZE
836 : DECLARE_ALIGNED(16, uint16_t,
837 : temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
838 0 : uint8_t *temp8 = CONVERT_TO_BYTEPTR(temp);
839 0 : int max_intermediate_size = ((MAX_SB_SIZE * 2 + 16) + 16);
840 : int filter_size;
841 : InterpFilterParams filter_params;
842 : #if CONFIG_DUAL_FILTER
843 0 : InterpFilterParams filter_params_x =
844 0 : av1_get_interp_filter_params(interp_filter[1 + 2 * ref_idx]);
845 0 : InterpFilterParams filter_params_y =
846 0 : av1_get_interp_filter_params(interp_filter[0 + 2 * ref_idx]);
847 : #endif
848 :
849 : #if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
850 : if (interp_filter[0 + 2 * ref_idx] == MULTITAP_SHARP &&
851 : interp_filter[1 + 2 * ref_idx] == MULTITAP_SHARP) {
852 : // Avoid two directions both using 12-tap filter.
853 : // This will reduce hardware implementation cost.
854 : filter_params_y = av1_get_interp_filter_params(EIGHTTAP_SHARP);
855 : }
856 : if (filter_params_y.taps < filter_params_x.taps) {
857 : int intermediate_width;
858 : int temp_stride = max_intermediate_size;
859 : filter_params = filter_params_y;
860 : filter_size = filter_params_x.taps;
861 : intermediate_width =
862 : (((w - 1) * x_step_q4 + subpel_x_q4) >> SUBPEL_BITS) + filter_size;
863 : assert(intermediate_width <= max_intermediate_size);
864 :
865 : assert(filter_params.taps <= MAX_FILTER_TAP);
866 :
867 : av1_highbd_convolve_vert_facade(
868 : src8 - (filter_size / 2 - 1), src_stride, temp8, temp_stride,
869 : intermediate_width, h, filter_params, subpel_y_q4, y_step_q4, 0, bd);
870 :
871 : filter_params = filter_params_x;
872 : assert(filter_params.taps <= MAX_FILTER_TAP);
873 :
874 : av1_highbd_convolve_horiz_facade(
875 : temp8 + (filter_size / 2 - 1), temp_stride, dst8, dst_stride, w, h,
876 : filter_params, subpel_x_q4, x_step_q4, ref_idx, bd);
877 : } else
878 : #endif // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
879 : {
880 : int intermediate_height;
881 0 : int temp_stride = MAX_SB_SIZE;
882 : #if CONFIG_DUAL_FILTER
883 0 : filter_params = filter_params_x;
884 0 : filter_size = filter_params_y.taps;
885 : #else
886 : filter_params = av1_get_interp_filter_params(interp_filter);
887 : filter_size = filter_params.taps;
888 : #endif
889 0 : intermediate_height =
890 0 : (((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + filter_size;
891 0 : assert(intermediate_height <= max_intermediate_size);
892 : (void)max_intermediate_size;
893 :
894 0 : av1_highbd_convolve_horiz_facade(
895 0 : src8 - src_stride * (filter_size / 2 - 1), src_stride, temp8,
896 : temp_stride, w, intermediate_height, filter_params, subpel_x_q4,
897 : x_step_q4, 0, bd);
898 :
899 : #if CONFIG_DUAL_FILTER
900 0 : filter_params = filter_params_y;
901 : #endif
902 0 : filter_size = filter_params.taps;
903 0 : assert(filter_params.taps <= MAX_FILTER_TAP);
904 :
905 0 : av1_highbd_convolve_vert_facade(
906 0 : temp8 + temp_stride * (filter_size / 2 - 1), temp_stride, dst8,
907 : dst_stride, w, h, filter_params, subpel_y_q4, y_step_q4, ref_idx, bd);
908 : }
909 : }
910 0 : }
911 : #endif // CONFIG_HIGHBITDEPTH
|