Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <assert.h>
13 : #include <string.h>
14 :
15 : #include "./aom_config.h"
16 : #include "./aom_dsp_rtcd.h"
17 : #include "aom/aom_integer.h"
18 : #include "aom_dsp/aom_convolve.h"
19 : #include "aom_dsp/aom_dsp_common.h"
20 : #include "aom_dsp/aom_filter.h"
21 : #include "aom_ports/mem.h"
22 :
23 0 : static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
24 : uint8_t *dst, ptrdiff_t dst_stride,
25 : const InterpKernel *x_filters, int x0_q4,
26 : int x_step_q4, int w, int h) {
27 : int x, y;
28 0 : src -= SUBPEL_TAPS / 2 - 1;
29 0 : for (y = 0; y < h; ++y) {
30 0 : int x_q4 = x0_q4;
31 0 : for (x = 0; x < w; ++x) {
32 0 : const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
33 0 : const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
34 0 : int k, sum = 0;
35 0 : for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
36 0 : dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
37 0 : x_q4 += x_step_q4;
38 : }
39 0 : src += src_stride;
40 0 : dst += dst_stride;
41 : }
42 0 : }
43 :
44 0 : static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
45 : uint8_t *dst, ptrdiff_t dst_stride,
46 : const InterpKernel *x_filters, int x0_q4,
47 : int x_step_q4, int w, int h) {
48 : int x, y;
49 0 : src -= SUBPEL_TAPS / 2 - 1;
50 0 : for (y = 0; y < h; ++y) {
51 0 : int x_q4 = x0_q4;
52 0 : for (x = 0; x < w; ++x) {
53 0 : const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
54 0 : const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
55 0 : int k, sum = 0;
56 0 : for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
57 0 : dst[x] = ROUND_POWER_OF_TWO(
58 : dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
59 0 : x_q4 += x_step_q4;
60 : }
61 0 : src += src_stride;
62 0 : dst += dst_stride;
63 : }
64 0 : }
65 :
66 0 : static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
67 : uint8_t *dst, ptrdiff_t dst_stride,
68 : const InterpKernel *y_filters, int y0_q4,
69 : int y_step_q4, int w, int h) {
70 : int x, y;
71 0 : src -= src_stride * (SUBPEL_TAPS / 2 - 1);
72 :
73 0 : for (x = 0; x < w; ++x) {
74 0 : int y_q4 = y0_q4;
75 0 : for (y = 0; y < h; ++y) {
76 0 : const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
77 0 : const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
78 0 : int k, sum = 0;
79 0 : for (k = 0; k < SUBPEL_TAPS; ++k)
80 0 : sum += src_y[k * src_stride] * y_filter[k];
81 0 : dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
82 0 : y_q4 += y_step_q4;
83 : }
84 0 : ++src;
85 0 : ++dst;
86 : }
87 0 : }
88 :
89 0 : static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
90 : uint8_t *dst, ptrdiff_t dst_stride,
91 : const InterpKernel *y_filters, int y0_q4,
92 : int y_step_q4, int w, int h) {
93 : int x, y;
94 0 : src -= src_stride * (SUBPEL_TAPS / 2 - 1);
95 :
96 0 : for (x = 0; x < w; ++x) {
97 0 : int y_q4 = y0_q4;
98 0 : for (y = 0; y < h; ++y) {
99 0 : const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
100 0 : const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
101 0 : int k, sum = 0;
102 0 : for (k = 0; k < SUBPEL_TAPS; ++k)
103 0 : sum += src_y[k * src_stride] * y_filter[k];
104 0 : dst[y * dst_stride] = ROUND_POWER_OF_TWO(
105 : dst[y * dst_stride] +
106 : clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
107 : 1);
108 0 : y_q4 += y_step_q4;
109 : }
110 0 : ++src;
111 0 : ++dst;
112 : }
113 0 : }
114 :
115 0 : static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
116 : ptrdiff_t dst_stride, const InterpKernel *const x_filters,
117 : int x0_q4, int x_step_q4,
118 : const InterpKernel *const y_filters, int y0_q4,
119 : int y_step_q4, int w, int h) {
120 : // Note: Fixed size intermediate buffer, temp, places limits on parameters.
121 : // 2d filtering proceeds in 2 steps:
122 : // (1) Interpolate horizontally into an intermediate buffer, temp.
123 : // (2) Interpolate temp vertically to derive the sub-pixel result.
124 : // Deriving the maximum number of rows in the temp buffer (135):
125 : // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
126 : // --Largest block size is 64x64 pixels.
127 : // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
128 : // original frame (in 1/16th pixel units).
129 : // --Must round-up because block may be located at sub-pixel position.
130 : // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
131 : // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
132 : uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
133 0 : int intermediate_height =
134 0 : (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
135 :
136 0 : assert(w <= MAX_SB_SIZE);
137 0 : assert(h <= MAX_SB_SIZE);
138 :
139 0 : assert(y_step_q4 <= 32);
140 0 : assert(x_step_q4 <= 32);
141 :
142 0 : convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp,
143 : MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
144 : intermediate_height);
145 0 : convolve_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst,
146 : dst_stride, y_filters, y0_q4, y_step_q4, w, h);
147 0 : }
148 :
149 0 : static const InterpKernel *get_filter_base(const int16_t *filter) {
150 : // NOTE: This assumes that the filter table is 256-byte aligned.
151 : // TODO(agrange) Modify to make independent of table alignment.
152 0 : return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
153 : }
154 :
155 0 : static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
156 0 : return (int)((const InterpKernel *)(intptr_t)f - base);
157 : }
158 :
159 0 : void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
160 : uint8_t *dst, ptrdiff_t dst_stride,
161 : const int16_t *filter_x, int x_step_q4,
162 : const int16_t *filter_y, int y_step_q4, int w,
163 : int h) {
164 0 : const InterpKernel *const filters_x = get_filter_base(filter_x);
165 0 : const int x0_q4 = get_filter_offset(filter_x, filters_x);
166 :
167 : (void)filter_y;
168 : (void)y_step_q4;
169 :
170 0 : convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
171 : w, h);
172 0 : }
173 :
174 0 : void aom_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
175 : uint8_t *dst, ptrdiff_t dst_stride,
176 : const int16_t *filter_x, int x_step_q4,
177 : const int16_t *filter_y, int y_step_q4, int w,
178 : int h) {
179 0 : const InterpKernel *const filters_x = get_filter_base(filter_x);
180 0 : const int x0_q4 = get_filter_offset(filter_x, filters_x);
181 :
182 : (void)filter_y;
183 : (void)y_step_q4;
184 :
185 0 : convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
186 : x_step_q4, w, h);
187 0 : }
188 :
189 0 : void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
190 : uint8_t *dst, ptrdiff_t dst_stride,
191 : const int16_t *filter_x, int x_step_q4,
192 : const int16_t *filter_y, int y_step_q4, int w,
193 : int h) {
194 0 : const InterpKernel *const filters_y = get_filter_base(filter_y);
195 0 : const int y0_q4 = get_filter_offset(filter_y, filters_y);
196 :
197 : (void)filter_x;
198 : (void)x_step_q4;
199 :
200 0 : convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
201 : w, h);
202 0 : }
203 :
204 0 : void aom_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
205 : uint8_t *dst, ptrdiff_t dst_stride,
206 : const int16_t *filter_x, int x_step_q4,
207 : const int16_t *filter_y, int y_step_q4, int w,
208 : int h) {
209 0 : const InterpKernel *const filters_y = get_filter_base(filter_y);
210 0 : const int y0_q4 = get_filter_offset(filter_y, filters_y);
211 :
212 : (void)filter_x;
213 : (void)x_step_q4;
214 :
215 0 : convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
216 : y_step_q4, w, h);
217 0 : }
218 :
219 0 : void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
220 : ptrdiff_t dst_stride, const int16_t *filter_x,
221 : int x_step_q4, const int16_t *filter_y, int y_step_q4,
222 : int w, int h) {
223 0 : const InterpKernel *const filters_x = get_filter_base(filter_x);
224 0 : const int x0_q4 = get_filter_offset(filter_x, filters_x);
225 :
226 0 : const InterpKernel *const filters_y = get_filter_base(filter_y);
227 0 : const int y0_q4 = get_filter_offset(filter_y, filters_y);
228 :
229 0 : convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
230 : filters_y, y0_q4, y_step_q4, w, h);
231 0 : }
232 :
233 0 : void aom_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
234 : ptrdiff_t dst_stride, const int16_t *filter_x,
235 : int x_step_q4, const int16_t *filter_y, int y_step_q4,
236 : int w, int h) {
237 : /* Fixed size intermediate buffer places limits on parameters. */
238 : DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
239 0 : assert(w <= MAX_SB_SIZE);
240 0 : assert(h <= MAX_SB_SIZE);
241 :
242 0 : aom_convolve8_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, x_step_q4,
243 : filter_y, y_step_q4, w, h);
244 0 : aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w,
245 : h);
246 0 : }
247 :
// Copies a w x h block of bytes from src to dst, one row per memcpy. All of
// the filter arguments are ignored.
void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int filter_x_stride, const int16_t *filter_y,
                         int filter_y_stride, int w, int h) {
  int row;

  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  for (row = 0; row < h; ++row) {
    memcpy(dst, src, w);
    src += src_stride;
    dst += dst_stride;
  }
}
265 :
// Averages (with rounding) a w x h block of src into dst, pixel by pixel. All
// of the filter arguments are ignored.
void aom_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const int16_t *filter_x,
                        int filter_x_stride, const int16_t *filter_y,
                        int filter_y_stride, int w, int h) {
  int row;

  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  for (row = 0; row < h; ++row, src += src_stride, dst += dst_stride) {
    int col;
    for (col = 0; col < w; ++col) {
      dst[col] = ROUND_POWER_OF_TWO(dst[col] + src[col], 1);
    }
  }
}
284 :
// Scaled horizontal prediction: forwards every argument unchanged to
// aom_convolve8_horiz_c().
void aom_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const int16_t *filter_x,
                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
                        int w, int h) {
  aom_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                        filter_x, x_step_q4,
                        filter_y, y_step_q4,
                        w, h);
}
292 :
// Scaled vertical prediction: forwards every argument unchanged to
// aom_convolve8_vert_c().
void aom_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                       ptrdiff_t dst_stride, const int16_t *filter_x,
                       int x_step_q4, const int16_t *filter_y, int y_step_q4,
                       int w, int h) {
  aom_convolve8_vert_c(src, src_stride, dst, dst_stride,
                       filter_x, x_step_q4,
                       filter_y, y_step_q4,
                       w, h);
}
300 :
// Scaled 2-D prediction: forwards every argument unchanged to
// aom_convolve8_c().
void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                     ptrdiff_t dst_stride, const int16_t *filter_x,
                     int x_step_q4, const int16_t *filter_y, int y_step_q4,
                     int w, int h) {
  aom_convolve8_c(src, src_stride, dst, dst_stride,
                  filter_x, x_step_q4,
                  filter_y, y_step_q4,
                  w, h);
}
308 :
// Scaled, averaging horizontal prediction: forwards every argument unchanged
// to aom_convolve8_avg_horiz_c().
void aom_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4, int w,
                            int h) {
  aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                            filter_x, x_step_q4,
                            filter_y, y_step_q4,
                            w, h);
}
317 :
// Scaled, averaging vertical prediction: forwards every argument unchanged to
// aom_convolve8_avg_vert_c().
void aom_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4, int w,
                           int h) {
  aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
                           filter_x, x_step_q4,
                           filter_y, y_step_q4,
                           w, h);
}
326 :
// Scaled, averaging 2-D prediction: forwards every argument unchanged to
// aom_convolve8_avg_c().
void aom_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
  aom_convolve8_avg_c(src, src_stride, dst, dst_stride,
                      filter_x, x_step_q4,
                      filter_y, y_step_q4,
                      w, h);
}
334 :
335 : #if CONFIG_LOOP_RESTORATION
336 : static void convolve_add_src_horiz(const uint8_t *src, ptrdiff_t src_stride,
337 : uint8_t *dst, ptrdiff_t dst_stride,
338 : const InterpKernel *x_filters, int x0_q4,
339 : int x_step_q4, int w, int h) {
340 : int x, y, k;
341 : src -= SUBPEL_TAPS / 2 - 1;
342 : for (y = 0; y < h; ++y) {
343 : int x_q4 = x0_q4;
344 : for (x = 0; x < w; ++x) {
345 : const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
346 : const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
347 : int sum = 0;
348 : for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
349 : dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
350 : src_x[SUBPEL_TAPS / 2 - 1]);
351 : x_q4 += x_step_q4;
352 : }
353 : src += src_stride;
354 : dst += dst_stride;
355 : }
356 : }
357 :
358 : static void convolve_add_src_vert(const uint8_t *src, ptrdiff_t src_stride,
359 : uint8_t *dst, ptrdiff_t dst_stride,
360 : const InterpKernel *y_filters, int y0_q4,
361 : int y_step_q4, int w, int h) {
362 : int x, y, k;
363 : src -= src_stride * (SUBPEL_TAPS / 2 - 1);
364 :
365 : for (x = 0; x < w; ++x) {
366 : int y_q4 = y0_q4;
367 : for (y = 0; y < h; ++y) {
368 : const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
369 : const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
370 : int sum = 0;
371 : for (k = 0; k < SUBPEL_TAPS; ++k)
372 : sum += src_y[k * src_stride] * y_filter[k];
373 : dst[y * dst_stride] =
374 : clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
375 : src_y[(SUBPEL_TAPS / 2 - 1) * src_stride]);
376 : y_q4 += y_step_q4;
377 : }
378 : ++src;
379 : ++dst;
380 : }
381 : }
382 :
383 : static void convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
384 : uint8_t *dst, ptrdiff_t dst_stride,
385 : const InterpKernel *const x_filters, int x0_q4,
386 : int x_step_q4, const InterpKernel *const y_filters,
387 : int y0_q4, int y_step_q4, int w, int h) {
388 : uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
389 : int intermediate_height =
390 : (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
391 :
392 : assert(w <= MAX_SB_SIZE);
393 : assert(h <= MAX_SB_SIZE);
394 :
395 : assert(y_step_q4 <= 32);
396 : assert(x_step_q4 <= 32);
397 :
398 : convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
399 : temp, MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
400 : intermediate_height);
401 : convolve_add_src_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
402 : dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h);
403 : }
404 :
405 : void aom_convolve8_add_src_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
406 : uint8_t *dst, ptrdiff_t dst_stride,
407 : const int16_t *filter_x, int x_step_q4,
408 : const int16_t *filter_y, int y_step_q4,
409 : int w, int h) {
410 : const InterpKernel *const filters_x = get_filter_base(filter_x);
411 : const int x0_q4 = get_filter_offset(filter_x, filters_x);
412 :
413 : (void)filter_y;
414 : (void)y_step_q4;
415 :
416 : convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
417 : x_step_q4, w, h);
418 : }
419 :
420 : void aom_convolve8_add_src_vert_c(const uint8_t *src, ptrdiff_t src_stride,
421 : uint8_t *dst, ptrdiff_t dst_stride,
422 : const int16_t *filter_x, int x_step_q4,
423 : const int16_t *filter_y, int y_step_q4, int w,
424 : int h) {
425 : const InterpKernel *const filters_y = get_filter_base(filter_y);
426 : const int y0_q4 = get_filter_offset(filter_y, filters_y);
427 :
428 : (void)filter_x;
429 : (void)x_step_q4;
430 :
431 : convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
432 : y_step_q4, w, h);
433 : }
434 :
435 : void aom_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
436 : uint8_t *dst, ptrdiff_t dst_stride,
437 : const int16_t *filter_x, int x_step_q4,
438 : const int16_t *filter_y, int y_step_q4, int w,
439 : int h) {
440 : const InterpKernel *const filters_x = get_filter_base(filter_x);
441 : const int x0_q4 = get_filter_offset(filter_x, filters_x);
442 :
443 : const InterpKernel *const filters_y = get_filter_base(filter_y);
444 : const int y0_q4 = get_filter_offset(filter_y, filters_y);
445 :
446 : convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4,
447 : x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
448 : }
449 :
450 : static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
451 : uint16_t *dst, ptrdiff_t dst_stride,
452 : const InterpKernel *x_filters, int x0_q4,
453 : int x_step_q4, int w, int h) {
454 : const int bd = 8;
455 : int x, y, k;
456 : src -= SUBPEL_TAPS / 2 - 1;
457 : for (y = 0; y < h; ++y) {
458 : int x_q4 = x0_q4;
459 : for (x = 0; x < w; ++x) {
460 : const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
461 : const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
462 : int sum = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
463 : (1 << (bd + FILTER_BITS - 1));
464 : for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
465 : dst[x] =
466 : (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS),
467 : 0, EXTRAPREC_CLAMP_LIMIT(bd) - 1);
468 : x_q4 += x_step_q4;
469 : }
470 : src += src_stride;
471 : dst += dst_stride;
472 : }
473 : }
474 :
475 : static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
476 : uint8_t *dst, ptrdiff_t dst_stride,
477 : const InterpKernel *y_filters, int y0_q4,
478 : int y_step_q4, int w, int h) {
479 : const int bd = 8;
480 : int x, y, k;
481 : src -= src_stride * (SUBPEL_TAPS / 2 - 1);
482 :
483 : for (x = 0; x < w; ++x) {
484 : int y_q4 = y0_q4;
485 : for (y = 0; y < h; ++y) {
486 : const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
487 : const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
488 : int sum =
489 : ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
490 : (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1));
491 : for (k = 0; k < SUBPEL_TAPS; ++k)
492 : sum += src_y[k * src_stride] * y_filter[k];
493 : dst[y * dst_stride] =
494 : clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS));
495 : y_q4 += y_step_q4;
496 : }
497 : ++src;
498 : ++dst;
499 : }
500 : }
501 :
502 : static void convolve_add_src_hip(const uint8_t *src, ptrdiff_t src_stride,
503 : uint8_t *dst, ptrdiff_t dst_stride,
504 : const InterpKernel *const x_filters, int x0_q4,
505 : int x_step_q4,
506 : const InterpKernel *const y_filters, int y0_q4,
507 : int y_step_q4, int w, int h) {
508 : uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
509 : int intermediate_height =
510 : (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
511 :
512 : assert(w <= MAX_SB_SIZE);
513 : assert(h <= MAX_SB_SIZE);
514 :
515 : assert(y_step_q4 <= 32);
516 : assert(x_step_q4 <= 32);
517 :
518 : convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
519 : src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
520 : x_step_q4, w, intermediate_height);
521 : convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
522 : MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
523 : y_step_q4, w, h);
524 : }
525 :
526 : void aom_convolve8_add_src_horiz_hip_c(const uint8_t *src, ptrdiff_t src_stride,
527 : uint16_t *dst, ptrdiff_t dst_stride,
528 : const int16_t *filter_x, int x_step_q4,
529 : const int16_t *filter_y, int y_step_q4,
530 : int w, int h) {
531 : const InterpKernel *const filters_x = get_filter_base(filter_x);
532 : const int x0_q4 = get_filter_offset(filter_x, filters_x);
533 :
534 : (void)filter_y;
535 : (void)y_step_q4;
536 :
537 : convolve_add_src_horiz_hip(src, src_stride, dst, dst_stride, filters_x, x0_q4,
538 : x_step_q4, w, h);
539 : }
540 :
541 : void aom_convolve8_add_src_vert_hip_c(const uint16_t *src, ptrdiff_t src_stride,
542 : uint8_t *dst, ptrdiff_t dst_stride,
543 : const int16_t *filter_x, int x_step_q4,
544 : const int16_t *filter_y, int y_step_q4,
545 : int w, int h) {
546 : const InterpKernel *const filters_y = get_filter_base(filter_y);
547 : const int y0_q4 = get_filter_offset(filter_y, filters_y);
548 :
549 : (void)filter_x;
550 : (void)x_step_q4;
551 :
552 : convolve_add_src_vert_hip(src, src_stride, dst, dst_stride, filters_y, y0_q4,
553 : y_step_q4, w, h);
554 : }
555 :
556 : void aom_convolve8_add_src_hip_c(const uint8_t *src, ptrdiff_t src_stride,
557 : uint8_t *dst, ptrdiff_t dst_stride,
558 : const int16_t *filter_x, int x_step_q4,
559 : const int16_t *filter_y, int y_step_q4, int w,
560 : int h) {
561 : const InterpKernel *const filters_x = get_filter_base(filter_x);
562 : const int x0_q4 = get_filter_offset(filter_x, filters_x);
563 :
564 : const InterpKernel *const filters_y = get_filter_base(filter_y);
565 : const int y0_q4 = get_filter_offset(filter_y, filters_y);
566 :
567 : convolve_add_src_hip(src, src_stride, dst, dst_stride, filters_x, x0_q4,
568 : x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
569 : }
570 : #endif // CONFIG_LOOP_RESTORATION
571 :
572 : #if CONFIG_HIGHBITDEPTH
573 0 : static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
574 : uint8_t *dst8, ptrdiff_t dst_stride,
575 : const InterpKernel *x_filters, int x0_q4,
576 : int x_step_q4, int w, int h, int bd) {
577 : int x, y;
578 0 : uint16_t *src = CONVERT_TO_SHORTPTR(src8);
579 0 : uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
580 0 : src -= SUBPEL_TAPS / 2 - 1;
581 0 : for (y = 0; y < h; ++y) {
582 0 : int x_q4 = x0_q4;
583 0 : for (x = 0; x < w; ++x) {
584 0 : const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
585 0 : const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
586 0 : int k, sum = 0;
587 0 : for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
588 0 : dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
589 0 : x_q4 += x_step_q4;
590 : }
591 0 : src += src_stride;
592 0 : dst += dst_stride;
593 : }
594 0 : }
595 :
596 0 : static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
597 : uint8_t *dst8, ptrdiff_t dst_stride,
598 : const InterpKernel *x_filters, int x0_q4,
599 : int x_step_q4, int w, int h, int bd) {
600 : int x, y;
601 0 : uint16_t *src = CONVERT_TO_SHORTPTR(src8);
602 0 : uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
603 0 : src -= SUBPEL_TAPS / 2 - 1;
604 0 : for (y = 0; y < h; ++y) {
605 0 : int x_q4 = x0_q4;
606 0 : for (x = 0; x < w; ++x) {
607 0 : const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
608 0 : const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
609 0 : int k, sum = 0;
610 0 : for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
611 0 : dst[x] = ROUND_POWER_OF_TWO(
612 : dst[x] + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
613 : 1);
614 0 : x_q4 += x_step_q4;
615 : }
616 0 : src += src_stride;
617 0 : dst += dst_stride;
618 : }
619 0 : }
620 :
621 0 : static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
622 : uint8_t *dst8, ptrdiff_t dst_stride,
623 : const InterpKernel *y_filters, int y0_q4,
624 : int y_step_q4, int w, int h, int bd) {
625 : int x, y;
626 0 : uint16_t *src = CONVERT_TO_SHORTPTR(src8);
627 0 : uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
628 0 : src -= src_stride * (SUBPEL_TAPS / 2 - 1);
629 0 : for (x = 0; x < w; ++x) {
630 0 : int y_q4 = y0_q4;
631 0 : for (y = 0; y < h; ++y) {
632 0 : const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
633 0 : const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
634 0 : int k, sum = 0;
635 0 : for (k = 0; k < SUBPEL_TAPS; ++k)
636 0 : sum += src_y[k * src_stride] * y_filter[k];
637 0 : dst[y * dst_stride] =
638 0 : clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
639 0 : y_q4 += y_step_q4;
640 : }
641 0 : ++src;
642 0 : ++dst;
643 : }
644 0 : }
645 :
646 0 : static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
647 : uint8_t *dst8, ptrdiff_t dst_stride,
648 : const InterpKernel *y_filters, int y0_q4,
649 : int y_step_q4, int w, int h, int bd) {
650 : int x, y;
651 0 : uint16_t *src = CONVERT_TO_SHORTPTR(src8);
652 0 : uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
653 0 : src -= src_stride * (SUBPEL_TAPS / 2 - 1);
654 0 : for (x = 0; x < w; ++x) {
655 0 : int y_q4 = y0_q4;
656 0 : for (y = 0; y < h; ++y) {
657 0 : const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
658 0 : const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
659 0 : int k, sum = 0;
660 0 : for (k = 0; k < SUBPEL_TAPS; ++k)
661 0 : sum += src_y[k * src_stride] * y_filter[k];
662 0 : dst[y * dst_stride] = ROUND_POWER_OF_TWO(
663 : dst[y * dst_stride] +
664 : clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
665 : 1);
666 0 : y_q4 += y_step_q4;
667 : }
668 0 : ++src;
669 0 : ++dst;
670 : }
671 0 : }
672 :
673 0 : static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
674 : uint8_t *dst, ptrdiff_t dst_stride,
675 : const InterpKernel *const x_filters, int x0_q4,
676 : int x_step_q4, const InterpKernel *const y_filters,
677 : int y0_q4, int y_step_q4, int w, int h, int bd) {
678 : // Note: Fixed size intermediate buffer, temp, places limits on parameters.
679 : // 2d filtering proceeds in 2 steps:
680 : // (1) Interpolate horizontally into an intermediate buffer, temp.
681 : // (2) Interpolate temp vertically to derive the sub-pixel result.
682 : // Deriving the maximum number of rows in the temp buffer (135):
683 : // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
684 : // --Largest block size is 64x64 pixels.
685 : // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
686 : // original frame (in 1/16th pixel units).
687 : // --Must round-up because block may be located at sub-pixel position.
688 : // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
689 : // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
690 : uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
691 0 : int intermediate_height =
692 0 : (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
693 :
694 0 : assert(w <= MAX_SB_SIZE);
695 0 : assert(h <= MAX_SB_SIZE);
696 0 : assert(y_step_q4 <= 32);
697 0 : assert(x_step_q4 <= 32);
698 :
699 0 : highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
700 0 : CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, x_filters, x0_q4,
701 : x_step_q4, w, intermediate_height, bd);
702 0 : highbd_convolve_vert(
703 0 : CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
704 : MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
705 0 : }
706 :
707 0 : void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
708 : uint8_t *dst, ptrdiff_t dst_stride,
709 : const int16_t *filter_x, int x_step_q4,
710 : const int16_t *filter_y, int y_step_q4, int w,
711 : int h, int bd) {
712 0 : const InterpKernel *const filters_x = get_filter_base(filter_x);
713 0 : const int x0_q4 = get_filter_offset(filter_x, filters_x);
714 : (void)filter_y;
715 : (void)y_step_q4;
716 :
717 0 : highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
718 : x_step_q4, w, h, bd);
719 0 : }
720 :
721 0 : void aom_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
722 : uint8_t *dst, ptrdiff_t dst_stride,
723 : const int16_t *filter_x, int x_step_q4,
724 : const int16_t *filter_y, int y_step_q4,
725 : int w, int h, int bd) {
726 0 : const InterpKernel *const filters_x = get_filter_base(filter_x);
727 0 : const int x0_q4 = get_filter_offset(filter_x, filters_x);
728 : (void)filter_y;
729 : (void)y_step_q4;
730 :
731 0 : highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
732 : x_step_q4, w, h, bd);
733 0 : }
734 :
735 0 : void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
736 : uint8_t *dst, ptrdiff_t dst_stride,
737 : const int16_t *filter_x, int x_step_q4,
738 : const int16_t *filter_y, int y_step_q4, int w,
739 : int h, int bd) {
740 0 : const InterpKernel *const filters_y = get_filter_base(filter_y);
741 0 : const int y0_q4 = get_filter_offset(filter_y, filters_y);
742 : (void)filter_x;
743 : (void)x_step_q4;
744 :
745 0 : highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
746 : y_step_q4, w, h, bd);
747 0 : }
748 :
749 0 : void aom_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
750 : uint8_t *dst, ptrdiff_t dst_stride,
751 : const int16_t *filter_x, int x_step_q4,
752 : const int16_t *filter_y, int y_step_q4,
753 : int w, int h, int bd) {
754 0 : const InterpKernel *const filters_y = get_filter_base(filter_y);
755 0 : const int y0_q4 = get_filter_offset(filter_y, filters_y);
756 : (void)filter_x;
757 : (void)x_step_q4;
758 :
759 0 : highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
760 : y_step_q4, w, h, bd);
761 0 : }
762 :
763 0 : void aom_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
764 : uint8_t *dst, ptrdiff_t dst_stride,
765 : const int16_t *filter_x, int x_step_q4,
766 : const int16_t *filter_y, int y_step_q4, int w,
767 : int h, int bd) {
768 0 : const InterpKernel *const filters_x = get_filter_base(filter_x);
769 0 : const int x0_q4 = get_filter_offset(filter_x, filters_x);
770 :
771 0 : const InterpKernel *const filters_y = get_filter_base(filter_y);
772 0 : const int y0_q4 = get_filter_offset(filter_y, filters_y);
773 :
774 0 : highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
775 : filters_y, y0_q4, y_step_q4, w, h, bd);
776 0 : }
777 :
778 0 : void aom_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
779 : uint8_t *dst, ptrdiff_t dst_stride,
780 : const int16_t *filter_x, int x_step_q4,
781 : const int16_t *filter_y, int y_step_q4, int w,
782 : int h, int bd) {
783 : // Fixed size intermediate buffer places limits on parameters.
784 : DECLARE_ALIGNED(16, uint16_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
785 0 : assert(w <= MAX_SB_SIZE);
786 0 : assert(h <= MAX_SB_SIZE);
787 :
788 0 : aom_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE,
789 : filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
790 0 : aom_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, dst,
791 : dst_stride, NULL, 0, NULL, 0, w, h, bd);
792 0 : }
793 :
// Copies a w x h block of 16-bit samples from src8 to dst8 (both converted
// with CONVERT_TO_SHORTPTR), one row per memcpy. The filter arguments and bd
// are ignored.
void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
                                uint8_t *dst8, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int filter_x_stride,
                                const int16_t *filter_y, int filter_y_stride,
                                int w, int h, int bd) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  int row;
  (void)filter_x;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_y_stride;
  (void)bd;

  for (row = 0; row < h; ++row) {
    memcpy(dst, src, w * sizeof(uint16_t));
    src += src_stride;
    dst += dst_stride;
  }
}
814 :
// Averages (with rounding) a w x h block of 16-bit samples from src8 into
// dst8 (both converted with CONVERT_TO_SHORTPTR). The filter arguments and bd
// are ignored.
void aom_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
                               uint8_t *dst8, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int filter_x_stride,
                               const int16_t *filter_y, int filter_y_stride,
                               int w, int h, int bd) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  int row;
  (void)filter_x;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_y_stride;
  (void)bd;

  for (row = 0; row < h; ++row, src += src_stride, dst += dst_stride) {
    int col;
    for (col = 0; col < w; ++col) {
      dst[col] = ROUND_POWER_OF_TWO(dst[col] + src[col], 1);
    }
  }
}
837 :
838 : #if CONFIG_LOOP_RESTORATION
839 : static void highbd_convolve_add_src_horiz(const uint8_t *src8,
840 : ptrdiff_t src_stride, uint8_t *dst8,
841 : ptrdiff_t dst_stride,
842 : const InterpKernel *x_filters,
843 : int x0_q4, int x_step_q4, int w,
844 : int h, int bd) {
845 : int x, y, k;
846 : uint16_t *src = CONVERT_TO_SHORTPTR(src8);
847 : uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
848 : src -= SUBPEL_TAPS / 2 - 1;
849 : for (y = 0; y < h; ++y) {
850 : int x_q4 = x0_q4;
851 : for (x = 0; x < w; ++x) {
852 : const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
853 : const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
854 : int sum = 0;
855 : for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
856 : dst[x] = clip_pixel_highbd(
857 : ROUND_POWER_OF_TWO(sum, FILTER_BITS) + src_x[SUBPEL_TAPS / 2 - 1],
858 : bd);
859 : x_q4 += x_step_q4;
860 : }
861 : src += src_stride;
862 : dst += dst_stride;
863 : }
864 : }
865 :
866 : static void highbd_convolve_add_src_vert(const uint8_t *src8,
867 : ptrdiff_t src_stride, uint8_t *dst8,
868 : ptrdiff_t dst_stride,
869 : const InterpKernel *y_filters,
870 : int y0_q4, int y_step_q4, int w, int h,
871 : int bd) {
872 : int x, y, k;
873 : uint16_t *src = CONVERT_TO_SHORTPTR(src8);
874 : uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
875 : src -= src_stride * (SUBPEL_TAPS / 2 - 1);
876 : for (x = 0; x < w; ++x) {
877 : int y_q4 = y0_q4;
878 : for (y = 0; y < h; ++y) {
879 : const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
880 : const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
881 : int sum = 0;
882 : for (k = 0; k < SUBPEL_TAPS; ++k)
883 : sum += src_y[k * src_stride] * y_filter[k];
884 : dst[y * dst_stride] =
885 : clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
886 : src_y[(SUBPEL_TAPS / 2 - 1) * src_stride],
887 : bd);
888 : y_q4 += y_step_q4;
889 : }
890 : ++src;
891 : ++dst;
892 : }
893 : }
894 :
895 : static void highbd_convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
896 : uint8_t *dst, ptrdiff_t dst_stride,
897 : const InterpKernel *const x_filters,
898 : int x0_q4, int x_step_q4,
899 : const InterpKernel *const y_filters,
900 : int y0_q4, int y_step_q4, int w, int h,
901 : int bd) {
902 : // Note: Fixed size intermediate buffer, temp, places limits on parameters.
903 : // 2d filtering proceeds in 2 steps:
904 : // (1) Interpolate horizontally into an intermediate buffer, temp.
905 : // (2) Interpolate temp vertically to derive the sub-pixel result.
906 : // Deriving the maximum number of rows in the temp buffer (135):
907 : // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
908 : // --Largest block size is 64x64 pixels.
909 : // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
910 : // original frame (in 1/16th pixel units).
911 : // --Must round-up because block may be located at sub-pixel position.
912 : // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
913 : // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
914 : uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
915 : int intermediate_height =
916 : (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
917 :
918 : assert(w <= MAX_SB_SIZE);
919 : assert(h <= MAX_SB_SIZE);
920 : assert(y_step_q4 <= 32);
921 : assert(x_step_q4 <= 32);
922 :
923 : highbd_convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
924 : src_stride, CONVERT_TO_BYTEPTR(temp),
925 : MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
926 : intermediate_height, bd);
927 : highbd_convolve_add_src_vert(
928 : CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
929 : MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
930 : }
931 :
932 : void aom_highbd_convolve8_add_src_horiz_c(
933 : const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
934 : ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
935 : const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
936 : const InterpKernel *const filters_x = get_filter_base(filter_x);
937 : const int x0_q4 = get_filter_offset(filter_x, filters_x);
938 : (void)filter_y;
939 : (void)y_step_q4;
940 :
941 : highbd_convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x,
942 : x0_q4, x_step_q4, w, h, bd);
943 : }
944 :
945 : void aom_highbd_convolve8_add_src_vert_c(const uint8_t *src,
946 : ptrdiff_t src_stride, uint8_t *dst,
947 : ptrdiff_t dst_stride,
948 : const int16_t *filter_x, int x_step_q4,
949 : const int16_t *filter_y, int y_step_q4,
950 : int w, int h, int bd) {
951 : const InterpKernel *const filters_y = get_filter_base(filter_y);
952 : const int y0_q4 = get_filter_offset(filter_y, filters_y);
953 : (void)filter_x;
954 : (void)x_step_q4;
955 :
956 : highbd_convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y,
957 : y0_q4, y_step_q4, w, h, bd);
958 : }
959 :
960 : void aom_highbd_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
961 : uint8_t *dst, ptrdiff_t dst_stride,
962 : const int16_t *filter_x, int x_step_q4,
963 : const int16_t *filter_y, int y_step_q4,
964 : int w, int h, int bd) {
965 : const InterpKernel *const filters_x = get_filter_base(filter_x);
966 : const int x0_q4 = get_filter_offset(filter_x, filters_x);
967 :
968 : const InterpKernel *const filters_y = get_filter_base(filter_y);
969 : const int y0_q4 = get_filter_offset(filter_y, filters_y);
970 :
971 : highbd_convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4,
972 : x_step_q4, filters_y, y0_q4, y_step_q4, w, h, bd);
973 : }
974 :
975 : static void highbd_convolve_add_src_horiz_hip(
976 : const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
977 : ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
978 : int x_step_q4, int w, int h, int bd) {
979 : const int extraprec_clamp_limit = EXTRAPREC_CLAMP_LIMIT(bd);
980 : int x, y, k;
981 : uint16_t *src = CONVERT_TO_SHORTPTR(src8);
982 : src -= SUBPEL_TAPS / 2 - 1;
983 : for (y = 0; y < h; ++y) {
984 : int x_q4 = x0_q4;
985 : for (x = 0; x < w; ++x) {
986 : const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
987 : const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
988 : int sum = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
989 : (1 << (bd + FILTER_BITS - 1));
990 : for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
991 : dst[x] =
992 : (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS),
993 : 0, extraprec_clamp_limit - 1);
994 : x_q4 += x_step_q4;
995 : }
996 : src += src_stride;
997 : dst += dst_stride;
998 : }
999 : }
1000 :
1001 : static void highbd_convolve_add_src_vert_hip(
1002 : const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
1003 : ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
1004 : int y_step_q4, int w, int h, int bd) {
1005 : int x, y, k;
1006 : uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1007 : src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1008 : for (x = 0; x < w; ++x) {
1009 : int y_q4 = y0_q4;
1010 : for (y = 0; y < h; ++y) {
1011 : const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1012 : const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1013 : int sum =
1014 : ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1015 : (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1));
1016 : for (k = 0; k < SUBPEL_TAPS; ++k)
1017 : sum += src_y[k * src_stride] * y_filter[k];
1018 : dst[y * dst_stride] = clip_pixel_highbd(
1019 : ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS), bd);
1020 : y_q4 += y_step_q4;
1021 : }
1022 : ++src;
1023 : ++dst;
1024 : }
1025 : }
1026 :
1027 : static void highbd_convolve_add_src_hip(
1028 : const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
1029 : ptrdiff_t dst_stride, const InterpKernel *const x_filters, int x0_q4,
1030 : int x_step_q4, const InterpKernel *const y_filters, int y0_q4,
1031 : int y_step_q4, int w, int h, int bd) {
1032 : // Note: Fixed size intermediate buffer, temp, places limits on parameters.
1033 : // 2d filtering proceeds in 2 steps:
1034 : // (1) Interpolate horizontally into an intermediate buffer, temp.
1035 : // (2) Interpolate temp vertically to derive the sub-pixel result.
1036 : // Deriving the maximum number of rows in the temp buffer (135):
1037 : // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
1038 : // --Largest block size is 64x64 pixels.
1039 : // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
1040 : // original frame (in 1/16th pixel units).
1041 : // --Must round-up because block may be located at sub-pixel position.
1042 : // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
1043 : // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
1044 : uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
1045 : int intermediate_height =
1046 : (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
1047 :
1048 : assert(w <= MAX_SB_SIZE);
1049 : assert(h <= MAX_SB_SIZE);
1050 : assert(y_step_q4 <= 32);
1051 : assert(x_step_q4 <= 32);
1052 :
1053 : highbd_convolve_add_src_horiz_hip(
1054 : src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, MAX_SB_SIZE,
1055 : x_filters, x0_q4, x_step_q4, w, intermediate_height, bd);
1056 : highbd_convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
1057 : MAX_SB_SIZE, dst, dst_stride, y_filters,
1058 : y0_q4, y_step_q4, w, h, bd);
1059 : }
1060 :
1061 : void aom_highbd_convolve8_add_src_horiz_hip_c(
1062 : const uint8_t *src, ptrdiff_t src_stride, uint16_t *dst,
1063 : ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
1064 : const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
1065 : const InterpKernel *const filters_x = get_filter_base(filter_x);
1066 : const int x0_q4 = get_filter_offset(filter_x, filters_x);
1067 : (void)filter_y;
1068 : (void)y_step_q4;
1069 :
1070 : highbd_convolve_add_src_horiz_hip(src, src_stride, dst, dst_stride, filters_x,
1071 : x0_q4, x_step_q4, w, h, bd);
1072 : }
1073 :
1074 : void aom_highbd_convolve8_add_src_vert_hip_c(
1075 : const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst,
1076 : ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
1077 : const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
1078 : const InterpKernel *const filters_y = get_filter_base(filter_y);
1079 : const int y0_q4 = get_filter_offset(filter_y, filters_y);
1080 : (void)filter_x;
1081 : (void)x_step_q4;
1082 :
1083 : highbd_convolve_add_src_vert_hip(src, src_stride, dst, dst_stride, filters_y,
1084 : y0_q4, y_step_q4, w, h, bd);
1085 : }
1086 :
1087 : void aom_highbd_convolve8_add_src_hip_c(const uint8_t *src,
1088 : ptrdiff_t src_stride, uint8_t *dst,
1089 : ptrdiff_t dst_stride,
1090 : const int16_t *filter_x, int x_step_q4,
1091 : const int16_t *filter_y, int y_step_q4,
1092 : int w, int h, int bd) {
1093 : const InterpKernel *const filters_x = get_filter_base(filter_x);
1094 : const int x0_q4 = get_filter_offset(filter_x, filters_x);
1095 :
1096 : const InterpKernel *const filters_y = get_filter_base(filter_y);
1097 : const int y0_q4 = get_filter_offset(filter_y, filters_y);
1098 :
1099 : highbd_convolve_add_src_hip(src, src_stride, dst, dst_stride, filters_x,
1100 : x0_q4, x_step_q4, filters_y, y0_q4, y_step_q4, w,
1101 : h, bd);
1102 : }
1103 :
1104 : #endif // CONFIG_LOOP_RESTORATION
1105 : #endif // CONFIG_HIGHBITDEPTH
|