Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
#include <assert.h>
#include <smmintrin.h>
#include <string.h>

#include "./av1_rtcd.h"
#include "av1/common/filter.h"
17 :
// Pre-packed MULTITAP_SHARP coefficients, filled by init_simd_filter() from
// av1_highbd_convolve_init_sse4_1(). Indexed as [shift - 1][tap pair][lane];
// each row of 8 lanes holds one pair of adjacent taps repeated four times.
18 : #if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
19 : DECLARE_ALIGNED(16, static int16_t, subpel_filters_sharp[15][6][8]);
20 : #endif
21 :
// Same layout as above, for TEMPORALFILTER_12TAP.
22 : #if USE_TEMPORALFILTER_12TAP
23 : DECLARE_ALIGNED(16, static int16_t, subpel_temporalfilter[15][6][8]);
24 : #endif
25 :
// Pointer to the rows (8 x int16_t each) of one sub-pixel position's packed
// coefficients; the convolve entry points read six consecutive rows from it.
26 : typedef int16_t (*HbdSubpelFilterCoeffs)[8];
27 :
// Signature shared by trans_save_4x4() and trans_accum_save_4x4(): consumes a
// 4x4 tile of 32-bit filter sums from src and stores pixels to dst.
28 : typedef void (*TransposeSave)(int width, int pixelsNum, uint32_t *src,
29 : int src_stride, uint16_t *dst, int dst_stride,
30 : int bd);
31 :
32 : static INLINE HbdSubpelFilterCoeffs
33 0 : hbd_get_subpel_filter_ver_signal_dir(const InterpFilterParams p, int index) {
34 : #if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
35 : if (p.interp_filter == MULTITAP_SHARP) {
36 : return &subpel_filters_sharp[index][0];
37 : }
38 : #endif
39 : #if USE_TEMPORALFILTER_12TAP
40 0 : if (p.interp_filter == TEMPORALFILTER_12TAP) {
41 0 : return &subpel_temporalfilter[index][0];
42 : }
43 : #endif
44 : (void)p;
45 : (void)index;
46 0 : return NULL;
47 : }
48 :
49 0 : static void init_simd_filter(const int16_t *filter_ptr, int taps,
50 : int16_t (*simd_filter)[6][8]) {
51 : int shift;
52 0 : int offset = (12 - taps) / 2;
53 0 : for (shift = 1; shift < SUBPEL_SHIFTS; ++shift) {
54 0 : const int16_t *filter_row = filter_ptr + shift * taps;
55 : int i, j;
56 0 : for (i = 0; i < 12; ++i) {
57 0 : for (j = 0; j < 4; ++j) {
58 0 : int r = i / 2;
59 0 : int c = j * 2 + (i % 2);
60 0 : if (i - offset >= 0 && i - offset < taps)
61 0 : simd_filter[shift - 1][r][c] = filter_row[i - offset];
62 : else
63 0 : simd_filter[shift - 1][r][c] = 0;
64 : }
65 : }
66 : }
67 0 : }
68 :
// Fills the packed coefficient tables used by the SSE4.1 high-bitdepth
// convolve functions. Only the tables enabled by the build configuration are
// initialised.
void av1_highbd_convolve_init_sse4_1(void) {
#if USE_TEMPORALFILTER_12TAP
  {
    const InterpFilterParams temporal =
        av1_get_interp_filter_params(TEMPORALFILTER_12TAP);
    init_simd_filter(temporal.filter_ptr, temporal.taps,
                     subpel_temporalfilter);
  }
#endif
#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
  {
    const InterpFilterParams sharp =
        av1_get_interp_filter_params(MULTITAP_SHARP);
    init_simd_filter(sharp.filter_ptr, sharp.taps, subpel_filters_sharp);
  }
#endif
}
89 :
// Stores rows of pixels held in the low lanes of u[0..3] to dst.
//
// u          : one vector per row; the low 2 (width == 2) or 4 (any other
//              width) 16-bit lanes of each vector are written.
// width      : block width; only distinguishes the 2-pixel case.
// pixelsNum  : 0 writes all 4 rows, 1/2/3 writes only that many residual
//              rows. Any other value writes nothing.
// dst        : destination of row 0; row r is written at dst + r * dst_stride.
//
// The 2-pixel store goes through memcpy() because dst is a uint16_t buffer:
// writing it through an int lvalue would break the effective-type rule and
// assumes 4-byte alignment that uint16_t rows do not guarantee.
static void writePixel(__m128i *u, int width, int pixelsNum, uint16_t *dst,
                       int dst_stride) {
  int rows;
  int r;

  if (pixelsNum == 0) {
    rows = 4;
  } else if (pixelsNum >= 1 && pixelsNum <= 3) {
    rows = pixelsNum;
  } else {
    return;
  }

  for (r = 0; r < rows; ++r) {
    uint16_t *const row = dst + r * dst_stride;
    if (width == 2) {
      const int two_pixels = _mm_cvtsi128_si32(u[r]);
      memcpy(row, &two_pixels, sizeof(two_pixels));
    } else {
      _mm_storel_epi64((__m128i *)row, u[r]);
    }
  }
}
128 :
// Clamps every 16-bit lane of p[0..numVecs-1] to [0, (1 << bd) - 1].
//
// Lanes are treated as UNSIGNED 16-bit values, which is what every caller in
// this file provides (the vectors come straight from _mm_packus_epi32()).
// A signed compare would see lanes >= 0x8000 as negative and clip them to 0
// instead of the maximum pixel value.
//
// p        : vectors to clamp in place.
// numVecs  : number of vectors in p.
// bd       : bit depth (10 or 12 in practice).
static void highbd_clip(__m128i *p, int numVecs, int bd) {
  const __m128i one = _mm_set1_epi16(1);
  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
  int i;

  for (i = 0; i < numVecs; i++) {
    // Unsigned min(p, max): the saturating subtract yields the amount by
    // which a lane exceeds max (0 when it does not), which is then removed.
    const __m128i excess = _mm_subs_epu16(p[i], max);
    p[i] = _mm_sub_epi16(p[i], excess);
  }
}
146 :
147 0 : static void transClipPixel(uint32_t *src, int src_stride, __m128i *u, int bd) {
148 : __m128i v0, v1;
149 0 : __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1));
150 :
151 0 : u[0] = _mm_loadu_si128((__m128i const *)src);
152 0 : u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride));
153 0 : u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
154 0 : u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
155 :
156 0 : u[0] = _mm_add_epi32(u[0], rnd);
157 0 : u[1] = _mm_add_epi32(u[1], rnd);
158 0 : u[2] = _mm_add_epi32(u[2], rnd);
159 0 : u[3] = _mm_add_epi32(u[3], rnd);
160 :
161 0 : u[0] = _mm_srai_epi32(u[0], FILTER_BITS);
162 0 : u[1] = _mm_srai_epi32(u[1], FILTER_BITS);
163 0 : u[2] = _mm_srai_epi32(u[2], FILTER_BITS);
164 0 : u[3] = _mm_srai_epi32(u[3], FILTER_BITS);
165 :
166 0 : u[0] = _mm_packus_epi32(u[0], u[1]);
167 0 : u[1] = _mm_packus_epi32(u[2], u[3]);
168 :
169 0 : highbd_clip(u, 2, bd);
170 :
171 0 : v0 = _mm_unpacklo_epi16(u[0], u[1]);
172 0 : v1 = _mm_unpackhi_epi16(u[0], u[1]);
173 :
174 0 : u[0] = _mm_unpacklo_epi16(v0, v1);
175 0 : u[2] = _mm_unpackhi_epi16(v0, v1);
176 :
177 0 : u[1] = _mm_srli_si128(u[0], 8);
178 0 : u[3] = _mm_srli_si128(u[2], 8);
179 0 : }
180 :
// Rounds, clips and transposes a 4x4 tile of filter sums, then stores it.
// pixelsNum = 0     : all 4 rows of pixels are saved.
// pixelsNum = 1/2/3 : only the residual 1/2/3 rows of pixels are saved.
void trans_save_4x4(int width, int pixelsNum, uint32_t *src, int src_stride,
                    uint16_t *dst, int dst_stride, int bd) {
  __m128i rows[4];

  transClipPixel(src, src_stride, rows, bd);
  writePixel(rows, width, pixelsNum, dst, dst_stride);
}
189 :
// Same as trans_save_4x4(), but averages the new pixels with the pixels
// already in dst: dst = (dst + new + 1) >> 1.
//
// width      : block width; 2 means only two pixels per row belong to the
//              block, anything else means four.
// pixelsNum  : 0 processes all 4 rows, 1/2/3 only that many residual rows;
//              any other value stores nothing.
//
// Only the rows and pixels that will be written back are read from dst, so
// the function never touches memory outside the block being averaged. Rows
// that are not written keep their un-averaged value in the local vector and
// are ignored by writePixel().
void trans_accum_save_4x4(int width, int pixelsNum, uint32_t *src,
                          int src_stride, uint16_t *dst, int dst_stride,
                          int bd) {
  const __m128i ones = _mm_set1_epi16(1);
  const int rows =
      (pixelsNum == 0) ? 4 : ((pixelsNum > 0 && pixelsNum < 4) ? pixelsNum : 0);
  __m128i u[4];
  int r;

  transClipPixel(src, src_stride, u, bd);

  for (r = 0; r < rows; ++r) {
    const uint16_t *const row = dst + r * dst_stride;
    __m128i prev;

    if (width == 2) {
      // Two pixels only: a 4-byte load via memcpy avoids over-reading and
      // does not rely on int alignment of the uint16_t row.
      int two_pixels;
      memcpy(&two_pixels, row, sizeof(two_pixels));
      prev = _mm_cvtsi32_si128(two_pixels);
    } else {
      prev = _mm_loadl_epi64((__m128i const *)row);
    }

    u[r] = _mm_add_epi16(u[r], prev);
    u[r] = _mm_add_epi16(u[r], ones);
    u[r] = _mm_srai_epi16(u[r], 1);
  }

  writePixel(u, width, pixelsNum, dst, dst_stride);
}
220 :
221 : static TransposeSave transSaveTab[2] = { trans_save_4x4, trans_accum_save_4x4 };
222 :
223 0 : static INLINE void transpose_pair(__m128i *in, __m128i *out) {
224 : __m128i x0, x1;
225 :
226 0 : x0 = _mm_unpacklo_epi32(in[0], in[1]);
227 0 : x1 = _mm_unpacklo_epi32(in[2], in[3]);
228 :
229 0 : out[0] = _mm_unpacklo_epi64(x0, x1);
230 0 : out[1] = _mm_unpackhi_epi64(x0, x1);
231 :
232 0 : x0 = _mm_unpackhi_epi32(in[0], in[1]);
233 0 : x1 = _mm_unpackhi_epi32(in[2], in[3]);
234 :
235 0 : out[2] = _mm_unpacklo_epi64(x0, x1);
236 0 : out[3] = _mm_unpackhi_epi64(x0, x1);
237 :
238 0 : x0 = _mm_unpacklo_epi32(in[4], in[5]);
239 0 : x1 = _mm_unpacklo_epi32(in[6], in[7]);
240 :
241 0 : out[4] = _mm_unpacklo_epi64(x0, x1);
242 0 : out[5] = _mm_unpackhi_epi64(x0, x1);
243 0 : }
244 :
// Applies the packed 12-tap window horizontally at one column position for
// four consecutive rows.
//
// src/src_stride : first pixel of the filter window in row 0; rows 1..3
//                  follow at src_stride. 16 pixels are loaded per row.
// f              : six vectors of packed coefficient pairs.
// tapsNum        : 10 or 12. For 10 taps the window starts one pixel earlier
//                  to match the zero padding of the packed kernel.
// buf            : receives four 32-bit sums, one per row (unrounded).
static void highbd_filter_horiz(const uint16_t *src, int src_stride, __m128i *f,
                                int tapsNum, uint32_t *buf) {
  __m128i pixels[8], pairs[6], prod[6];
  __m128i lesser, greater, sum;
  int r, k;

  assert(tapsNum == 10 || tapsNum == 12);
  if (tapsNum == 10) {
    src -= 1;
  }

  for (r = 0; r < 4; ++r) {
    pixels[r] = _mm_loadu_si128((__m128i const *)(src + r * src_stride));
  }
  for (r = 0; r < 4; ++r) {
    pixels[4 + r] =
        _mm_loadu_si128((__m128i const *)(src + r * src_stride + 8));
  }

  // After this, pairs[k] holds pixels 2k and 2k+1 of each of the four rows.
  transpose_pair(pixels, pairs);

  for (k = 0; k < 6; ++k) {
    prod[k] = _mm_madd_epi16(pairs[k], f[k]);
  }

  // The two centre products are added as min + max, which equals their sum.
  lesser = _mm_min_epi32(prod[2], prod[3]);
  greater = _mm_max_epi32(prod[2], prod[3]);

  sum = _mm_add_epi32(prod[0], prod[1]);
  sum = _mm_add_epi32(sum, prod[5]);
  sum = _mm_add_epi32(sum, prod[4]);
  sum = _mm_add_epi32(sum, lesser);
  sum = _mm_add_epi32(sum, greater);

  _mm_storeu_si128((__m128i *)buf, sum);
}
284 :
285 0 : void av1_highbd_convolve_horiz_sse4_1(const uint16_t *src, int src_stride,
286 : uint16_t *dst, int dst_stride, int w,
287 : int h,
288 : const InterpFilterParams filter_params,
289 : const int subpel_x_q4, int x_step_q4,
290 : int avg, int bd) {
291 : DECLARE_ALIGNED(16, uint32_t, temp[4 * 4]);
292 : __m128i verf[6];
293 : HbdSubpelFilterCoeffs vCoeffs;
294 : const uint16_t *srcPtr;
295 0 : const int tapsNum = filter_params.taps;
296 : int i, col, count, blkResidu, blkHeight;
297 0 : TransposeSave transSave = transSaveTab[avg];
298 : (void)x_step_q4;
299 :
300 0 : if (0 == subpel_x_q4 || 16 != x_step_q4) {
301 0 : av1_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h,
302 : filter_params, subpel_x_q4, x_step_q4, avg, bd);
303 0 : return;
304 : }
305 :
306 0 : vCoeffs =
307 0 : hbd_get_subpel_filter_ver_signal_dir(filter_params, subpel_x_q4 - 1);
308 0 : if (!vCoeffs) {
309 0 : av1_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h,
310 : filter_params, subpel_x_q4, x_step_q4, avg, bd);
311 0 : return;
312 : }
313 :
314 0 : verf[0] = *((const __m128i *)(vCoeffs));
315 0 : verf[1] = *((const __m128i *)(vCoeffs + 1));
316 0 : verf[2] = *((const __m128i *)(vCoeffs + 2));
317 0 : verf[3] = *((const __m128i *)(vCoeffs + 3));
318 0 : verf[4] = *((const __m128i *)(vCoeffs + 4));
319 0 : verf[5] = *((const __m128i *)(vCoeffs + 5));
320 :
321 0 : src -= (tapsNum >> 1) - 1;
322 0 : srcPtr = src;
323 :
324 0 : count = 0;
325 0 : blkHeight = h >> 2;
326 0 : blkResidu = h & 3;
327 :
328 0 : while (blkHeight != 0) {
329 0 : for (col = 0; col < w; col += 4) {
330 0 : for (i = 0; i < 4; ++i) {
331 0 : highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4));
332 0 : srcPtr += 1;
333 : }
334 0 : transSave(w, 0, temp, 4, dst + col, dst_stride, bd);
335 : }
336 0 : count++;
337 0 : srcPtr = src + count * src_stride * 4;
338 0 : dst += dst_stride * 4;
339 0 : blkHeight--;
340 : }
341 :
342 0 : if (blkResidu == 0) return;
343 :
344 0 : for (col = 0; col < w; col += 4) {
345 0 : for (i = 0; i < 4; ++i) {
346 0 : highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4));
347 0 : srcPtr += 1;
348 : }
349 0 : transSave(w, blkResidu, temp, 4, dst + col, dst_stride, bd);
350 : }
351 : }
352 :
353 : // Vertical convolutional filter
354 :
// Signature of the per-row writers used by the vertical filter: u[0] holds
// the 32-bit filter sums, which the writer rounds, clips to bd bits and
// stores (or averages) into dst.
355 : typedef void (*WritePixels)(__m128i *u, int bd, uint16_t *dst);
356 :
357 0 : static void highbdRndingPacks(__m128i *u) {
358 0 : __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1));
359 0 : u[0] = _mm_add_epi32(u[0], rnd);
360 0 : u[0] = _mm_srai_epi32(u[0], FILTER_BITS);
361 0 : u[0] = _mm_packus_epi32(u[0], u[0]);
362 0 : }
363 :
// Rounds and clips the sums in u[0], then stores the first two pixels to dst.
//
// u   : u[0] holds 32-bit filter sums on entry; it is modified in place.
// bd  : bit depth used for clipping.
// dst : receives exactly two uint16_t pixels.
//
// The store uses memcpy() because dst is a uint16_t buffer: a uint32_t lvalue
// store would violate the effective-type rule and assume 4-byte alignment.
static void write2pixelsOnly(__m128i *u, int bd, uint16_t *dst) {
  int two_pixels;

  highbdRndingPacks(u);
  highbd_clip(u, 1, bd);

  two_pixels = _mm_cvtsi128_si32(u[0]);
  memcpy(dst, &two_pixels, sizeof(two_pixels));
}
369 :
// Rounds and clips the sums in u[0], then averages the first two pixels with
// the two pixels already in dst: dst = (dst + new + 1) >> 1.
//
// u   : u[0] holds 32-bit filter sums on entry; it is modified in place.
// bd  : bit depth used for clipping.
// dst : two uint16_t pixels, read and then overwritten.
//
// Exactly four bytes are read from and written to dst. An 8-byte load would
// read two pixels beyond the ones this function owns. memcpy() is used for
// both accesses so no alignment or aliasing assumption is made about dst.
static void write2pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
  const __m128i ones = _mm_set1_epi16(1);
  int two_pixels;
  __m128i v;

  memcpy(&two_pixels, dst, sizeof(two_pixels));
  v = _mm_cvtsi32_si128(two_pixels);

  highbdRndingPacks(u);
  highbd_clip(u, 1, bd);

  v = _mm_add_epi16(v, u[0]);
  v = _mm_add_epi16(v, ones);
  v = _mm_srai_epi16(v, 1);

  two_pixels = _mm_cvtsi128_si32(v);
  memcpy(dst, &two_pixels, sizeof(two_pixels));
}
382 :
383 : WritePixels write2pixelsTab[2] = { write2pixelsOnly, write2pixelsAccum };
384 :
// Rounds and clips the sums in u[0], then stores four pixels (8 bytes) to
// dst. u[0] is modified in place.
static void write4pixelsOnly(__m128i *u, int bd, uint16_t *dst) {
  __m128i *const out = (__m128i *)dst;

  highbdRndingPacks(u);
  highbd_clip(u, 1, bd);
  _mm_storel_epi64(out, *u);
}
390 :
// Rounds and clips the sums in u[0], then averages four pixels with the four
// pixels already in dst: dst = (dst + new + 1) >> 1. u[0] is modified in
// place.
static void write4pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
  const __m128i prev = _mm_loadl_epi64((__m128i const *)dst);
  __m128i sum;

  highbdRndingPacks(u);
  highbd_clip(u, 1, bd);

  sum = _mm_add_epi16(_mm_add_epi16(prev, u[0]), _mm_set1_epi16(1));
  _mm_storel_epi64((__m128i *)dst, _mm_srai_epi16(sum, 1));
}
403 :
404 : WritePixels write4pixelsTab[2] = { write4pixelsOnly, write4pixelsAccum };
405 :
406 0 : static void filter_vert_horiz_parallel(const uint16_t *src, int src_stride,
407 : const __m128i *f, int taps,
408 : uint16_t *dst, WritePixels saveFunc,
409 : int bd) {
410 : __m128i s[12];
411 0 : __m128i zero = _mm_setzero_si128();
412 0 : int i = 0;
413 0 : int r = 0;
414 :
415 : // TODO(luoyi) treat s[12] as a circular buffer in width = 2 case
416 0 : assert(taps == 10 || taps == 12);
417 0 : if (10 == taps) {
418 0 : i += 1;
419 0 : s[0] = zero;
420 : }
421 0 : while (i < 12) {
422 0 : s[i] = _mm_loadu_si128((__m128i const *)(src + r * src_stride));
423 0 : i += 1;
424 0 : r += 1;
425 : }
426 :
427 0 : s[0] = _mm_unpacklo_epi16(s[0], s[1]);
428 0 : s[2] = _mm_unpacklo_epi16(s[2], s[3]);
429 0 : s[4] = _mm_unpacklo_epi16(s[4], s[5]);
430 0 : s[6] = _mm_unpacklo_epi16(s[6], s[7]);
431 0 : s[8] = _mm_unpacklo_epi16(s[8], s[9]);
432 0 : s[10] = _mm_unpacklo_epi16(s[10], s[11]);
433 :
434 0 : s[0] = _mm_madd_epi16(s[0], f[0]);
435 0 : s[2] = _mm_madd_epi16(s[2], f[1]);
436 0 : s[4] = _mm_madd_epi16(s[4], f[2]);
437 0 : s[6] = _mm_madd_epi16(s[6], f[3]);
438 0 : s[8] = _mm_madd_epi16(s[8], f[4]);
439 0 : s[10] = _mm_madd_epi16(s[10], f[5]);
440 :
441 0 : s[1] = _mm_min_epi32(s[4], s[6]);
442 0 : s[3] = _mm_max_epi32(s[4], s[6]);
443 :
444 0 : s[0] = _mm_add_epi32(s[0], s[2]);
445 0 : s[0] = _mm_add_epi32(s[0], s[10]);
446 0 : s[0] = _mm_add_epi32(s[0], s[8]);
447 0 : s[0] = _mm_add_epi32(s[0], s[1]);
448 0 : s[0] = _mm_add_epi32(s[0], s[3]);
449 :
450 0 : saveFunc(s, bd, dst);
451 0 : }
452 :
453 0 : static void highbd_filter_vert_compute_large(const uint16_t *src,
454 : int src_stride, const __m128i *f,
455 : int taps, int w, int h,
456 : uint16_t *dst, int dst_stride,
457 : int avg, int bd) {
458 : int col;
459 0 : int rowIndex = 0;
460 0 : const uint16_t *src_ptr = src;
461 0 : uint16_t *dst_ptr = dst;
462 0 : const int step = 4;
463 0 : WritePixels write4pixels = write4pixelsTab[avg];
464 :
465 : do {
466 0 : for (col = 0; col < w; col += step) {
467 0 : filter_vert_horiz_parallel(src_ptr, src_stride, f, taps, dst_ptr,
468 : write4pixels, bd);
469 0 : src_ptr += step;
470 0 : dst_ptr += step;
471 : }
472 0 : rowIndex++;
473 0 : src_ptr = src + rowIndex * src_stride;
474 0 : dst_ptr = dst + rowIndex * dst_stride;
475 0 : } while (rowIndex < h);
476 0 : }
477 :
478 0 : static void highbd_filter_vert_compute_small(const uint16_t *src,
479 : int src_stride, const __m128i *f,
480 : int taps, int w, int h,
481 : uint16_t *dst, int dst_stride,
482 : int avg, int bd) {
483 0 : int rowIndex = 0;
484 0 : WritePixels write2pixels = write2pixelsTab[avg];
485 : (void)w;
486 :
487 : do {
488 0 : filter_vert_horiz_parallel(src, src_stride, f, taps, dst, write2pixels, bd);
489 0 : rowIndex++;
490 0 : src += src_stride;
491 0 : dst += dst_stride;
492 0 : } while (rowIndex < h);
493 0 : }
494 :
495 0 : void av1_highbd_convolve_vert_sse4_1(const uint16_t *src, int src_stride,
496 : uint16_t *dst, int dst_stride, int w,
497 : int h,
498 : const InterpFilterParams filter_params,
499 : const int subpel_y_q4, int y_step_q4,
500 : int avg, int bd) {
501 : __m128i verf[6];
502 : HbdSubpelFilterCoeffs vCoeffs;
503 0 : const int tapsNum = filter_params.taps;
504 :
505 0 : if (0 == subpel_y_q4 || 16 != y_step_q4) {
506 0 : av1_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h,
507 : filter_params, subpel_y_q4, y_step_q4, avg, bd);
508 0 : return;
509 : }
510 :
511 0 : vCoeffs =
512 0 : hbd_get_subpel_filter_ver_signal_dir(filter_params, subpel_y_q4 - 1);
513 0 : if (!vCoeffs) {
514 0 : av1_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h,
515 : filter_params, subpel_y_q4, y_step_q4, avg, bd);
516 0 : return;
517 : }
518 :
519 0 : verf[0] = *((const __m128i *)(vCoeffs));
520 0 : verf[1] = *((const __m128i *)(vCoeffs + 1));
521 0 : verf[2] = *((const __m128i *)(vCoeffs + 2));
522 0 : verf[3] = *((const __m128i *)(vCoeffs + 3));
523 0 : verf[4] = *((const __m128i *)(vCoeffs + 4));
524 0 : verf[5] = *((const __m128i *)(vCoeffs + 5));
525 :
526 0 : src -= src_stride * ((tapsNum >> 1) - 1);
527 :
528 0 : if (w > 2) {
529 0 : highbd_filter_vert_compute_large(src, src_stride, verf, tapsNum, w, h, dst,
530 : dst_stride, avg, bd);
531 : } else {
532 0 : highbd_filter_vert_compute_small(src, src_stride, verf, tapsNum, w, h, dst,
533 : dst_stride, avg, bd);
534 : }
535 : }
|