Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
#include <assert.h>
#include <string.h>
#include <tmmintrin.h>

#include "./aom_config.h"
#include "./av1_rtcd.h"
#include "av1/common/filter.h"
18 :
19 : #define WIDTH_BOUND (16)
20 : #define HEIGHT_BOUND (16)
21 :
22 : #if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
23 : DECLARE_ALIGNED(16, static int8_t,
24 : sub_pel_filters_12sharp_signal_dir[15][2][16]);
25 :
26 : DECLARE_ALIGNED(16, static int8_t,
27 : sub_pel_filters_12sharp_ver_signal_dir[15][6][16]);
28 : #endif // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
29 :
30 : #if USE_TEMPORALFILTER_12TAP
31 : DECLARE_ALIGNED(16, static int8_t,
32 : sub_pel_filters_temporalfilter_12_signal_dir[15][2][16]);
33 :
34 : DECLARE_ALIGNED(16, static int8_t,
35 : sub_pel_filters_temporalfilter_12_ver_signal_dir[15][6][16]);
36 : #endif
37 :
38 : typedef int8_t (*SubpelFilterCoeffs)[16];
39 :
// Returns the packed horizontal-direction filter coefficients for the given
// interpolation filter, or NULL when no signal-direction table is compiled in
// for this filter type (the caller then falls back to the C convolve path).
// |index| is the subpel phase minus one (callers pass subpel_x_q4 - 1, so the
// valid range is [0, 14], matching the [15] table dimension).
static INLINE SubpelFilterCoeffs
get_subpel_filter_signal_dir(const InterpFilterParams p, int index) {
#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
  if (p.interp_filter == MULTITAP_SHARP) {
    return &sub_pel_filters_12sharp_signal_dir[index][0];
  }
#endif
#if USE_TEMPORALFILTER_12TAP
  if (p.interp_filter == TEMPORALFILTER_12TAP) {
    return &sub_pel_filters_temporalfilter_12_signal_dir[index][0];
  }
#endif
  // Silence unused-parameter warnings when both tables are compiled out.
  (void)p;
  (void)index;
  return NULL;
}
56 :
// Returns the packed vertical-direction filter coefficients for the given
// interpolation filter, or NULL when no table is compiled in (caller falls
// back to the C path). |index| is the subpel phase minus one, range [0, 14].
static INLINE SubpelFilterCoeffs
get_subpel_filter_ver_signal_dir(const InterpFilterParams p, int index) {
#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
  if (p.interp_filter == MULTITAP_SHARP) {
    return &sub_pel_filters_12sharp_ver_signal_dir[index][0];
  }
#endif
#if USE_TEMPORALFILTER_12TAP
  if (p.interp_filter == TEMPORALFILTER_12TAP) {
    return &sub_pel_filters_temporalfilter_12_ver_signal_dir[index][0];
  }
#endif
  // Silence unused-parameter warnings when both tables are compiled out.
  (void)p;
  (void)index;
  return NULL;
}
73 :
// Transposes four row vectors of 16-bit partial sums (|in|) into column
// vectors (|out|); each output holds one column's four values in its low
// 64 bits. Only out[0..5] are produced: the last two columns are zero for
// the 10/12-tap coefficient layouts used by the callers.
static INLINE void transpose_4x8(const __m128i *in, __m128i *out) {
  __m128i t0, t1;

  // Interleave rows pairwise at 16-bit granularity (low halves).
  t0 = _mm_unpacklo_epi16(in[0], in[1]);
  t1 = _mm_unpacklo_epi16(in[2], in[3]);

  out[0] = _mm_unpacklo_epi32(t0, t1);
  out[1] = _mm_srli_si128(out[0], 8);  // odd column from the high 64 bits
  out[2] = _mm_unpackhi_epi32(t0, t1);
  out[3] = _mm_srli_si128(out[2], 8);

  // Same interleave for the high halves of the input rows.
  t0 = _mm_unpackhi_epi16(in[0], in[1]);
  t1 = _mm_unpackhi_epi16(in[2], in[3]);

  out[4] = _mm_unpacklo_epi32(t0, t1);
  out[5] = _mm_srli_si128(out[4], 8);
  // Note: We ignore out[6] and out[7] because
  // they're zero vectors.
}
93 :
94 : typedef void (*store_pixel_t)(const __m128i *x, uint8_t *dst);
95 :
// Averages the filtered pixels in |*x| (8 lanes of 16-bit values) with the
// 8 pixels already stored at |src|, with rounding: (x + src + 1) >> 1.
// Returns the result saturated and packed to unsigned 8-bit in both halves.
static INLINE __m128i accumulate_store(const __m128i *x, uint8_t *src) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  __m128i y = _mm_loadl_epi64((__m128i const *)src);  // 8 existing pixels
  y = _mm_unpacklo_epi8(y, zero);                     // widen u8 -> s16
  y = _mm_add_epi16(*x, y);
  y = _mm_add_epi16(y, one);                          // +1 for round-to-nearest
  y = _mm_srai_epi16(y, 1);                           // divide by 2
  y = _mm_packus_epi16(y, y);                         // clamp back to u8
  return y;
}
107 :
// Packs |*x| down to unsigned 8-bit and writes only the first 2 pixels to
// |dst| (used for width-2 blocks).
// NOTE(review): the *(uint16_t *)dst write assumes dst is suitably aligned
// and punning through uint16_t is safe — technically UB per the C standard;
// a memcpy would be the strictly-conforming form.
static INLINE void store_2_pixel_only(const __m128i *x, uint8_t *dst) {
  uint32_t temp;
  __m128i u = _mm_packus_epi16(*x, *x);
  temp = _mm_cvtsi128_si32(u);
  *(uint16_t *)dst = (uint16_t)temp;
}
114 :
115 0 : static INLINE void accumulate_store_2_pixel(const __m128i *x, uint8_t *dst) {
116 : uint32_t temp;
117 0 : __m128i y = accumulate_store(x, dst);
118 0 : temp = _mm_cvtsi128_si32(y);
119 0 : *(uint16_t *)dst = (uint16_t)temp;
120 0 : }
121 :
// Width-2 store dispatch, indexed by conv_params->ref:
// [0] overwrite destination, [1] average with destination.
static store_pixel_t store2pixelTab[2] = { store_2_pixel_only,
                                           accumulate_store_2_pixel };
124 :
125 0 : static INLINE void store_4_pixel_only(const __m128i *x, uint8_t *dst) {
126 0 : __m128i u = _mm_packus_epi16(*x, *x);
127 0 : *(int *)dst = _mm_cvtsi128_si32(u);
128 0 : }
129 :
130 0 : static INLINE void accumulate_store_4_pixel(const __m128i *x, uint8_t *dst) {
131 0 : __m128i y = accumulate_store(x, dst);
132 0 : *(int *)dst = _mm_cvtsi128_si32(y);
133 0 : }
134 :
// Width-4 store dispatch, indexed by conv_params->ref:
// [0] overwrite destination, [1] average with destination.
static store_pixel_t store4pixelTab[2] = { store_4_pixel_only,
                                           accumulate_store_4_pixel };
137 :
// Horizontally filters 4 output pixels of one row with a 10- or 12-tap
// filter. |f| holds two coefficient vectors laid out for _mm_maddubs_epi16
// (two adjacent taps per 16-bit lane); |store_func| overwrites or averages
// into |dst|. For 10 taps the source pointer is moved back by one, which
// presumably re-centers a 10-tap filter stored in the 12-tap coefficient
// layout — TODO(review): confirm against the table initialization.
static void horiz_w4_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
                           store_pixel_t store_func, uint8_t *dst) {
  __m128i sumPairRow[4];
  __m128i sumPairCol[8];
  __m128i pixel;
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i zero = _mm_setzero_si128();

  assert(tapsNum == 10 || tapsNum == 12);
  if (10 == tapsNum) {
    src -= 1;
  }

  // Two loads at src and src+1 produce tap-pair sums for even/odd output
  // phases; f[1] covers the trailing taps and is shifted into alignment.
  pixel = _mm_loadu_si128((__m128i const *)src);
  sumPairRow[0] = _mm_maddubs_epi16(pixel, f[0]);
  sumPairRow[2] = _mm_maddubs_epi16(pixel, f[1]);
  sumPairRow[2] = _mm_srli_si128(sumPairRow[2], 2);

  pixel = _mm_loadu_si128((__m128i const *)(src + 1));
  sumPairRow[1] = _mm_maddubs_epi16(pixel, f[0]);
  sumPairRow[3] = _mm_maddubs_epi16(pixel, f[1]);
  sumPairRow[3] = _mm_srli_si128(sumPairRow[3], 2);

  // Rearrange so each vector holds one partial-sum column per output pixel.
  transpose_4x8(sumPairRow, sumPairCol);

  sumPairRow[0] = _mm_adds_epi16(sumPairCol[0], sumPairCol[1]);
  sumPairRow[1] = _mm_adds_epi16(sumPairCol[4], sumPairCol[5]);

  // min/max ordering makes the saturating-add sequence deterministic for the
  // two middle partial sums regardless of their relative magnitudes.
  sumPairRow[2] = _mm_min_epi16(sumPairCol[2], sumPairCol[3]);
  sumPairRow[3] = _mm_max_epi16(sumPairCol[2], sumPairCol[3]);

  sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[1]);
  sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[2]);
  sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[3]);

  // mulhrs by 1<<8 computes (x * 512 + 32768) >> 16 == (x + 64) >> 7,
  // i.e. rounded descaling of the filter sum.
  sumPairRow[1] = _mm_mulhrs_epi16(sumPairRow[0], k_256);
  sumPairRow[1] = _mm_packus_epi16(sumPairRow[1], sumPairRow[1]);
  sumPairRow[1] = _mm_unpacklo_epi8(sumPairRow[1], zero);

  store_func(&sumPairRow[1], dst);
}
179 :
180 0 : static void horiz_w8_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
181 : store_pixel_t store, uint8_t *buf) {
182 0 : horiz_w4_ssse3(src, f, tapsNum, store, buf);
183 0 : src += 4;
184 0 : buf += 4;
185 0 : horiz_w4_ssse3(src, f, tapsNum, store, buf);
186 0 : }
187 :
188 0 : static void horiz_w16_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
189 : store_pixel_t store, uint8_t *buf) {
190 0 : horiz_w8_ssse3(src, f, tapsNum, store, buf);
191 0 : src += 8;
192 0 : buf += 8;
193 0 : horiz_w8_ssse3(src, f, tapsNum, store, buf);
194 0 : }
195 :
196 0 : static void horiz_w32_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
197 : store_pixel_t store, uint8_t *buf) {
198 0 : horiz_w16_ssse3(src, f, tapsNum, store, buf);
199 0 : src += 16;
200 0 : buf += 16;
201 0 : horiz_w16_ssse3(src, f, tapsNum, store, buf);
202 0 : }
203 :
204 0 : static void horiz_w64_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
205 : store_pixel_t store, uint8_t *buf) {
206 0 : horiz_w32_ssse3(src, f, tapsNum, store, buf);
207 0 : src += 32;
208 0 : buf += 32;
209 0 : horiz_w32_ssse3(src, f, tapsNum, store, buf);
210 0 : }
211 :
212 0 : static void horiz_w128_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
213 : store_pixel_t store, uint8_t *buf) {
214 0 : horiz_w64_ssse3(src, f, tapsNum, store, buf);
215 0 : src += 64;
216 0 : buf += 64;
217 0 : horiz_w64_ssse3(src, f, tapsNum, store, buf);
218 0 : }
219 :
// Width dispatch table for single-row horizontal filtering; entry i handles
// width 4 << i (widths 2 and 4 share entry 0 with different store functions).
static void (*horizTab[6])(const uint8_t *, const __m128i *, int, store_pixel_t,
                           uint8_t *) = {
  horiz_w4_ssse3,  horiz_w8_ssse3,  horiz_w16_ssse3,
  horiz_w32_ssse3, horiz_w64_ssse3, horiz_w128_ssse3,
};
225 :
226 0 : static void filter_horiz_ssse3(const uint8_t *src, __m128i *f, int tapsNum,
227 : int width, store_pixel_t store, uint8_t *dst) {
228 0 : switch (width) {
229 : // Note:
230 : // For width=2 and 4, store function must be different
231 : case 2:
232 0 : case 4: horizTab[0](src, f, tapsNum, store, dst); break;
233 0 : case 8: horizTab[1](src, f, tapsNum, store, dst); break;
234 0 : case 16: horizTab[2](src, f, tapsNum, store, dst); break;
235 0 : case 32: horizTab[3](src, f, tapsNum, store, dst); break;
236 0 : case 64: horizTab[4](src, f, tapsNum, store, dst); break;
237 0 : case 128: horizTab[5](src, f, tapsNum, store, dst); break;
238 0 : default: assert(0);
239 : }
240 0 : }
241 :
242 : // Vertical 8-pixel parallel
243 : typedef void (*transpose_to_dst_t)(const uint16_t *src, int src_stride,
244 : uint8_t *dst, int dst_stride);
245 :
// Rounds/descales an 8x8 block of 16-bit intermediate results (rows at |src|,
// stride |src_stride| in elements), transposes it, and stores the 8-bit
// result rows to |dst| (overwriting; see transpose8x8_accumu_to_dst for the
// averaging variant).
static INLINE void transpose8x8_direct_to_dst(const uint16_t *src,
                                              int src_stride, uint8_t *dst,
                                              int dst_stride) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  __m128i v0, v1, v2, v3;

  __m128i u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
  __m128i u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
  __m128i u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
  __m128i u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
  __m128i u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
  __m128i u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
  __m128i u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
  __m128i u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));

  // Rounded descale: mulhrs by 1<<8 is (x + 64) >> 7 per 16-bit lane.
  u0 = _mm_mulhrs_epi16(u0, k_256);
  u1 = _mm_mulhrs_epi16(u1, k_256);
  u2 = _mm_mulhrs_epi16(u2, k_256);
  u3 = _mm_mulhrs_epi16(u3, k_256);
  u4 = _mm_mulhrs_epi16(u4, k_256);
  u5 = _mm_mulhrs_epi16(u5, k_256);
  u6 = _mm_mulhrs_epi16(u6, k_256);
  u7 = _mm_mulhrs_epi16(u7, k_256);

  // Saturate to u8, then perform the 8x8 byte transpose via unpack stages.
  v0 = _mm_packus_epi16(u0, u1);
  v1 = _mm_packus_epi16(u2, u3);
  v2 = _mm_packus_epi16(u4, u5);
  v3 = _mm_packus_epi16(u6, u7);

  u0 = _mm_unpacklo_epi8(v0, v1);
  u1 = _mm_unpackhi_epi8(v0, v1);
  u2 = _mm_unpacklo_epi8(v2, v3);
  u3 = _mm_unpackhi_epi8(v2, v3);

  u4 = _mm_unpacklo_epi8(u0, u1);
  u5 = _mm_unpacklo_epi8(u2, u3);
  u6 = _mm_unpackhi_epi8(u0, u1);
  u7 = _mm_unpackhi_epi8(u2, u3);

  u0 = _mm_unpacklo_epi32(u4, u5);
  u1 = _mm_unpackhi_epi32(u4, u5);
  u2 = _mm_unpacklo_epi32(u6, u7);
  u3 = _mm_unpackhi_epi32(u6, u7);

  // Each register now holds two transposed rows; split them for the stores.
  u4 = _mm_srli_si128(u0, 8);
  u5 = _mm_srli_si128(u1, 8);
  u6 = _mm_srli_si128(u2, 8);
  u7 = _mm_srli_si128(u3, 8);

  _mm_storel_epi64((__m128i *)dst, u0);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), u4);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), u1);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), u5);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), u2);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), u6);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), u3);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), u7);
}
304 :
// Like transpose8x8_direct_to_dst, but instead of overwriting |dst| it
// averages with the pixels already there: out = (new + old + 1) >> 1 per
// pixel (the compound-prediction path, conv_params->ref == 1).
static INLINE void transpose8x8_accumu_to_dst(const uint16_t *src,
                                              int src_stride, uint8_t *dst,
                                              int dst_stride) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;

  __m128i u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
  __m128i u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
  __m128i u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
  __m128i u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
  __m128i u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
  __m128i u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
  __m128i u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
  __m128i u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));

  // Rounded descale: mulhrs by 1<<8 is (x + 64) >> 7 per 16-bit lane.
  u0 = _mm_mulhrs_epi16(u0, k_256);
  u1 = _mm_mulhrs_epi16(u1, k_256);
  u2 = _mm_mulhrs_epi16(u2, k_256);
  u3 = _mm_mulhrs_epi16(u3, k_256);
  u4 = _mm_mulhrs_epi16(u4, k_256);
  u5 = _mm_mulhrs_epi16(u5, k_256);
  u6 = _mm_mulhrs_epi16(u6, k_256);
  u7 = _mm_mulhrs_epi16(u7, k_256);

  // 8x8 byte transpose via unpack stages (same as the direct variant).
  v0 = _mm_packus_epi16(u0, u1);
  v1 = _mm_packus_epi16(u2, u3);
  v2 = _mm_packus_epi16(u4, u5);
  v3 = _mm_packus_epi16(u6, u7);

  u0 = _mm_unpacklo_epi8(v0, v1);
  u1 = _mm_unpackhi_epi8(v0, v1);
  u2 = _mm_unpacklo_epi8(v2, v3);
  u3 = _mm_unpackhi_epi8(v2, v3);

  u4 = _mm_unpacklo_epi8(u0, u1);
  u5 = _mm_unpacklo_epi8(u2, u3);
  u6 = _mm_unpackhi_epi8(u0, u1);
  u7 = _mm_unpackhi_epi8(u2, u3);

  u0 = _mm_unpacklo_epi32(u4, u5);
  u1 = _mm_unpackhi_epi32(u4, u5);
  u2 = _mm_unpacklo_epi32(u6, u7);
  u3 = _mm_unpackhi_epi32(u6, u7);

  u4 = _mm_srli_si128(u0, 8);
  u5 = _mm_srli_si128(u1, 8);
  u6 = _mm_srli_si128(u2, 8);
  u7 = _mm_srli_si128(u3, 8);

  // Load the existing destination rows to average against.
  v0 = _mm_loadl_epi64((__m128i const *)(dst + 0 * dst_stride));
  v1 = _mm_loadl_epi64((__m128i const *)(dst + 1 * dst_stride));
  v2 = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride));
  v3 = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride));
  v4 = _mm_loadl_epi64((__m128i const *)(dst + 4 * dst_stride));
  v5 = _mm_loadl_epi64((__m128i const *)(dst + 5 * dst_stride));
  v6 = _mm_loadl_epi64((__m128i const *)(dst + 6 * dst_stride));
  v7 = _mm_loadl_epi64((__m128i const *)(dst + 7 * dst_stride));

  // Widen both operands to 16-bit for the rounded average.
  u0 = _mm_unpacklo_epi8(u0, zero);
  u1 = _mm_unpacklo_epi8(u1, zero);
  u2 = _mm_unpacklo_epi8(u2, zero);
  u3 = _mm_unpacklo_epi8(u3, zero);
  u4 = _mm_unpacklo_epi8(u4, zero);
  u5 = _mm_unpacklo_epi8(u5, zero);
  u6 = _mm_unpacklo_epi8(u6, zero);
  u7 = _mm_unpacklo_epi8(u7, zero);

  v0 = _mm_unpacklo_epi8(v0, zero);
  v1 = _mm_unpacklo_epi8(v1, zero);
  v2 = _mm_unpacklo_epi8(v2, zero);
  v3 = _mm_unpacklo_epi8(v3, zero);
  v4 = _mm_unpacklo_epi8(v4, zero);
  v5 = _mm_unpacklo_epi8(v5, zero);
  v6 = _mm_unpacklo_epi8(v6, zero);
  v7 = _mm_unpacklo_epi8(v7, zero);

  // Pair the transposed rows with their destinations in row order
  // (u0,u4,u1,u5,... matches the store order of the direct variant).
  v0 = _mm_adds_epi16(u0, v0);
  v1 = _mm_adds_epi16(u4, v1);
  v2 = _mm_adds_epi16(u1, v2);
  v3 = _mm_adds_epi16(u5, v3);
  v4 = _mm_adds_epi16(u2, v4);
  v5 = _mm_adds_epi16(u6, v5);
  v6 = _mm_adds_epi16(u3, v6);
  v7 = _mm_adds_epi16(u7, v7);

  v0 = _mm_adds_epi16(v0, one);
  v1 = _mm_adds_epi16(v1, one);
  v2 = _mm_adds_epi16(v2, one);
  v3 = _mm_adds_epi16(v3, one);
  v4 = _mm_adds_epi16(v4, one);
  v5 = _mm_adds_epi16(v5, one);
  v6 = _mm_adds_epi16(v6, one);
  v7 = _mm_adds_epi16(v7, one);

  v0 = _mm_srai_epi16(v0, 1);
  v1 = _mm_srai_epi16(v1, 1);
  v2 = _mm_srai_epi16(v2, 1);
  v3 = _mm_srai_epi16(v3, 1);
  v4 = _mm_srai_epi16(v4, 1);
  v5 = _mm_srai_epi16(v5, 1);
  v6 = _mm_srai_epi16(v6, 1);
  v7 = _mm_srai_epi16(v7, 1);

  u0 = _mm_packus_epi16(v0, v1);
  u1 = _mm_packus_epi16(v2, v3);
  u2 = _mm_packus_epi16(v4, v5);
  u3 = _mm_packus_epi16(v6, v7);

  u4 = _mm_srli_si128(u0, 8);
  u5 = _mm_srli_si128(u1, 8);
  u6 = _mm_srli_si128(u2, 8);
  u7 = _mm_srli_si128(u3, 8);

  _mm_storel_epi64((__m128i *)dst, u0);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), u4);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), u1);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), u5);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), u2);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), u6);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), u3);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), u7);
}
429 :
// 8x8 transpose-store dispatch, indexed by conv_params->ref:
// [0] overwrite destination, [1] average with destination.
static transpose_to_dst_t trans8x8Tab[2] = { transpose8x8_direct_to_dst,
                                             transpose8x8_accumu_to_dst };
432 :
// Transposes 8 source rows (each treated as adjacent byte pairs by the
// epi16 unpacks) into 6 column-pair vectors for the vertical maddubs stage.
// out[6]/out[7] are never produced — those positions are zero for the
// 10/12-tap layout used by the caller.
static INLINE void transpose_8x16(const __m128i *in, __m128i *out) {
  __m128i t0, t1, t2, t3, u0, u1;

  t0 = _mm_unpacklo_epi16(in[0], in[1]);
  t1 = _mm_unpacklo_epi16(in[2], in[3]);
  t2 = _mm_unpacklo_epi16(in[4], in[5]);
  t3 = _mm_unpacklo_epi16(in[6], in[7]);

  u0 = _mm_unpacklo_epi32(t0, t1);
  u1 = _mm_unpacklo_epi32(t2, t3);

  out[0] = _mm_unpacklo_epi64(u0, u1);
  out[1] = _mm_unpackhi_epi64(u0, u1);

  u0 = _mm_unpackhi_epi32(t0, t1);
  u1 = _mm_unpackhi_epi32(t2, t3);

  out[2] = _mm_unpacklo_epi64(u0, u1);
  out[3] = _mm_unpackhi_epi64(u0, u1);

  t0 = _mm_unpackhi_epi16(in[0], in[1]);
  t1 = _mm_unpackhi_epi16(in[2], in[3]);
  t2 = _mm_unpackhi_epi16(in[4], in[5]);
  t3 = _mm_unpackhi_epi16(in[6], in[7]);

  u0 = _mm_unpacklo_epi32(t0, t1);
  u1 = _mm_unpacklo_epi32(t2, t3);

  out[4] = _mm_unpacklo_epi64(u0, u1);
  out[5] = _mm_unpackhi_epi64(u0, u1);

  // Ignore out[6] and out[7]
  // they're zero vectors.
}
467 :
// Horizontally filters 8 vertically-adjacent rows at once (one output pixel
// per row) and writes the eight 16-bit sums, still unscaled, to |buf|.
// The caller later transposes/rounds the accumulated 8x8 tile.
// For 10 taps the source is moved back one pixel — presumably because the
// 10-tap coefficients use the 12-tap layout; confirm against the tables.
static void filter_horiz_v8p_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                   __m128i *f, int tapsNum, uint16_t *buf) {
  __m128i s[8], t[6];
  __m128i min_x2x3, max_x2x3;
  __m128i temp;

  assert(tapsNum == 10 || tapsNum == 12);
  if (tapsNum == 10) {
    src_ptr -= 1;
  }
  s[0] = _mm_loadu_si128((const __m128i *)src_ptr);
  s[1] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
  s[2] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
  s[3] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
  s[4] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
  s[5] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
  s[6] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
  s[7] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));

  // TRANSPOSE...
  // Vector represents column pixel pairs instead of a row
  transpose_8x16(s, t);

  // multiply 2 adjacent elements with the filter and add the result
  s[0] = _mm_maddubs_epi16(t[0], f[0]);
  s[1] = _mm_maddubs_epi16(t[1], f[1]);
  s[2] = _mm_maddubs_epi16(t[2], f[2]);
  s[3] = _mm_maddubs_epi16(t[3], f[3]);
  s[4] = _mm_maddubs_epi16(t[4], f[4]);
  s[5] = _mm_maddubs_epi16(t[5], f[5]);

  // add and saturate the results together
  // (min/max first fixes the saturating-add order for the middle terms)
  min_x2x3 = _mm_min_epi16(s[2], s[3]);
  max_x2x3 = _mm_max_epi16(s[2], s[3]);
  temp = _mm_adds_epi16(s[0], s[1]);
  temp = _mm_adds_epi16(temp, s[5]);
  temp = _mm_adds_epi16(temp, s[4]);

  temp = _mm_adds_epi16(temp, min_x2x3);
  temp = _mm_adds_epi16(temp, max_x2x3);

  _mm_storeu_si128((__m128i *)buf, temp);
}
511 :
// Vertical 4-pixel parallel
// Rounds/descales a 4x4 block of 16-bit intermediates, transposes it, and
// stores the 8-bit result rows to |dst| (overwriting).
static INLINE void transpose4x4_direct_to_dst(const uint16_t *src,
                                              int src_stride, uint8_t *dst,
                                              int dst_stride) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  __m128i v0, v1, v2, v3;

  // TODO(luoyi): two loads, 8 elements per load (two bytes per element)
  __m128i u0 = _mm_loadl_epi64((__m128i const *)(src + 0 * src_stride));
  __m128i u1 = _mm_loadl_epi64((__m128i const *)(src + 1 * src_stride));
  __m128i u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
  __m128i u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));

  // 4x4 16-bit transpose via two unpack stages.
  v0 = _mm_unpacklo_epi16(u0, u1);
  v1 = _mm_unpacklo_epi16(u2, u3);

  v2 = _mm_unpacklo_epi32(v0, v1);
  v3 = _mm_unpackhi_epi32(v0, v1);

  // Rounded descale: mulhrs by 1<<8 is (x + 64) >> 7 per lane.
  u0 = _mm_mulhrs_epi16(v2, k_256);
  u1 = _mm_mulhrs_epi16(v3, k_256);

  u0 = _mm_packus_epi16(u0, u1);
  u1 = _mm_srli_si128(u0, 4);
  u2 = _mm_srli_si128(u0, 8);
  u3 = _mm_srli_si128(u0, 12);

  *(int *)(dst) = _mm_cvtsi128_si32(u0);
  *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u1);
  *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(u2);
  *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(u3);
}
544 :
// Like transpose4x4_direct_to_dst, but averages with the pixels already at
// |dst|: out = (new + old + 1) >> 1 (compound prediction, ref == 1).
static INLINE void transpose4x4_accumu_to_dst(const uint16_t *src,
                                              int src_stride, uint8_t *dst,
                                              int dst_stride) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);

  __m128i v0, v1, v2, v3;

  __m128i u0 = _mm_loadl_epi64((__m128i const *)(src));
  __m128i u1 = _mm_loadl_epi64((__m128i const *)(src + src_stride));
  __m128i u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
  __m128i u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));

  // 4x4 16-bit transpose via two unpack stages.
  v0 = _mm_unpacklo_epi16(u0, u1);
  v1 = _mm_unpacklo_epi16(u2, u3);

  v2 = _mm_unpacklo_epi32(v0, v1);
  v3 = _mm_unpackhi_epi32(v0, v1);

  // Rounded descale: mulhrs by 1<<8 is (x + 64) >> 7 per lane.
  u0 = _mm_mulhrs_epi16(v2, k_256);
  u1 = _mm_mulhrs_epi16(v3, k_256);

  u2 = _mm_packus_epi16(u0, u1);
  u0 = _mm_unpacklo_epi8(u2, zero);
  u1 = _mm_unpackhi_epi8(u2, zero);

  // load pixel values
  v0 = _mm_loadl_epi64((__m128i const *)(dst));
  v1 = _mm_loadl_epi64((__m128i const *)(dst + dst_stride));
  v2 = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride));
  v3 = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride));

  v0 = _mm_unpacklo_epi8(v0, zero);
  v1 = _mm_unpacklo_epi8(v1, zero);
  v2 = _mm_unpacklo_epi8(v2, zero);
  v3 = _mm_unpacklo_epi8(v3, zero);

  v0 = _mm_unpacklo_epi64(v0, v1);
  v1 = _mm_unpacklo_epi64(v2, v3);

  // (new + old + 1) >> 1, two rows per register.
  u0 = _mm_adds_epi16(u0, v0);
  u1 = _mm_adds_epi16(u1, v1);

  u0 = _mm_adds_epi16(u0, one);
  u1 = _mm_adds_epi16(u1, one);

  u0 = _mm_srai_epi16(u0, 1);
  u1 = _mm_srai_epi16(u1, 1);

  // saturation and pack to pixels
  u0 = _mm_packus_epi16(u0, u1);
  u1 = _mm_srli_si128(u0, 4);
  u2 = _mm_srli_si128(u0, 8);
  u3 = _mm_srli_si128(u0, 12);

  *(int *)(dst) = _mm_cvtsi128_si32(u0);
  *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u1);
  *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(u2);
  *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(u3);
}
606 :
// 4x4 transpose-store dispatch, indexed by conv_params->ref:
// [0] overwrite destination, [1] average with destination.
static transpose_to_dst_t trans4x4Tab[2] = { transpose4x4_direct_to_dst,
                                             transpose4x4_accumu_to_dst };
609 :
// Horizontally filters 4 vertically-adjacent rows at once (one output pixel
// per row) and stores the four 16-bit sums, still unscaled, to |buf|.
// For 10 taps the source is moved back one pixel — presumably because the
// 10-tap coefficients use the 12-tap layout; confirm against the tables.
static void filter_horiz_v4p_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                   __m128i *f, int tapsNum, uint16_t *buf) {
  __m128i A, B, C, D;
  __m128i tr0_0, tr0_1, s1s0, s3s2, s5s4, s7s6, s9s8, sbsa;
  __m128i x0, x1, x2, x3, x4, x5;
  __m128i min_x2x3, max_x2x3, temp;

  assert(tapsNum == 10 || tapsNum == 12);
  if (tapsNum == 10) {
    src_ptr -= 1;
  }
  A = _mm_loadu_si128((const __m128i *)src_ptr);
  B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
  C = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
  D = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));

  // TRANSPOSE...
  // Vector represents column pixel pairs instead of a row
  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
  tr0_0 = _mm_unpacklo_epi16(A, B);
  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
  tr0_1 = _mm_unpacklo_epi16(C, D);
  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
  s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
  s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // 02 03 12 13 22 23 32 33
  s3s2 = _mm_srli_si128(s1s0, 8);
  // 06 07 16 17 26 27 36 37
  s7s6 = _mm_srli_si128(s5s4, 8);

  tr0_0 = _mm_unpackhi_epi16(A, B);
  tr0_1 = _mm_unpackhi_epi16(C, D);
  s9s8 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  sbsa = _mm_srli_si128(s9s8, 8);

  // multiply 2 adjacent elements with the filter and add the result
  x0 = _mm_maddubs_epi16(s1s0, f[0]);
  x1 = _mm_maddubs_epi16(s3s2, f[1]);
  x2 = _mm_maddubs_epi16(s5s4, f[2]);
  x3 = _mm_maddubs_epi16(s7s6, f[3]);
  x4 = _mm_maddubs_epi16(s9s8, f[4]);
  x5 = _mm_maddubs_epi16(sbsa, f[5]);
  // add and saturate the results together
  // (min/max first fixes the saturating-add order for the middle terms)
  min_x2x3 = _mm_min_epi16(x2, x3);
  max_x2x3 = _mm_max_epi16(x2, x3);
  temp = _mm_adds_epi16(x0, x1);
  temp = _mm_adds_epi16(temp, x5);
  temp = _mm_adds_epi16(temp, x4);

  temp = _mm_adds_epi16(temp, min_x2x3);
  temp = _mm_adds_epi16(temp, max_x2x3);
  _mm_storel_epi64((__m128i *)buf, temp);
}
664 :
// Note:
// This function assumes:
// (1) 10/12-taps filters
// (2) x_step_q4 = 16 then filter is fixed at the call
//
// Horizontal convolution entry point. Falls back to av1_convolve_horiz_c
// when the phase is zero, the step is fractional, or no SSSE3 coefficient
// table exists for this filter. Large blocks (w and h above the 16-pixel
// bounds) use the 8-row vertical-parallel kernel plus an 8x8 transpose;
// smaller blocks use the 4-row kernel; width <= 2 is done row by row.
// conv_params->ref selects overwrite (0) vs. average-with-dst (1) stores.
void av1_convolve_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst,
                              int dst_stride, int w, int h,
                              const InterpFilterParams filter_params,
                              const int subpel_x_q4, int x_step_q4,
                              ConvolveParams *conv_params) {
  DECLARE_ALIGNED(16, uint16_t, temp[8 * 8]);  // per-tile intermediate sums
  __m128i verf[6];
  __m128i horf[2];
  SubpelFilterCoeffs hCoeffs, vCoeffs;
  const uint8_t *src_ptr;
  store_pixel_t store2p = store2pixelTab[conv_params->ref];
  store_pixel_t store4p = store4pixelTab[conv_params->ref];
  transpose_to_dst_t transpose_4x4 = trans4x4Tab[conv_params->ref];
  transpose_to_dst_t transpose_8x8 = trans8x8Tab[conv_params->ref];

  const int tapsNum = filter_params.taps;
  int block_height, block_residu;
  int i, col, count;
  (void)x_step_q4;

  // Integer phase 0 or non-unit step: use the generic C implementation.
  if (0 == subpel_x_q4 || 16 != x_step_q4) {
    av1_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, filter_params,
                         subpel_x_q4, x_step_q4, conv_params);
    return;
  }

  hCoeffs = get_subpel_filter_signal_dir(filter_params, subpel_x_q4 - 1);
  vCoeffs = get_subpel_filter_ver_signal_dir(filter_params, subpel_x_q4 - 1);

  // No packed tables compiled in for this filter type: fall back to C.
  if (!hCoeffs || !vCoeffs) {
    av1_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, filter_params,
                         subpel_x_q4, x_step_q4, conv_params);
    return;
  }

  verf[0] = *((const __m128i *)(vCoeffs));
  verf[1] = *((const __m128i *)(vCoeffs + 1));
  verf[2] = *((const __m128i *)(vCoeffs + 2));
  verf[3] = *((const __m128i *)(vCoeffs + 3));
  verf[4] = *((const __m128i *)(vCoeffs + 4));
  verf[5] = *((const __m128i *)(vCoeffs + 5));

  horf[0] = *((const __m128i *)(hCoeffs));
  horf[1] = *((const __m128i *)(hCoeffs + 1));

  count = 0;

  // here tapsNum is filter size
  src -= (tapsNum >> 1) - 1;  // back up to the first tap's source pixel
  src_ptr = src;
  if (w > WIDTH_BOUND && h > HEIGHT_BOUND) {
    // 8-pixels parallel
    block_height = h >> 3;
    block_residu = h & 7;

    do {
      for (col = 0; col < w; col += 8) {
        // Filter 8 rows for each of 8 consecutive columns into temp, then
        // transpose the 8x8 tile (with rounding) out to dst.
        for (i = 0; i < 8; ++i) {
          filter_horiz_v8p_ssse3(src_ptr, src_stride, verf, tapsNum,
                                 temp + (i * 8));
          src_ptr += 1;
        }
        transpose_8x8(temp, 8, dst + col, dst_stride);
      }
      count++;
      src_ptr = src + count * src_stride * 8;
      dst += dst_stride * 8;
    } while (count < block_height);

    // Remaining 0-7 rows are filtered one row at a time.
    for (i = 0; i < block_residu; ++i) {
      filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst);
      src_ptr += src_stride;
      dst += dst_stride;
    }
  } else {
    if (w > 2) {
      // 4-pixels parallel
      block_height = h >> 2;
      block_residu = h & 3;

      do {
        for (col = 0; col < w; col += 4) {
          for (i = 0; i < 4; ++i) {
            filter_horiz_v4p_ssse3(src_ptr, src_stride, verf, tapsNum,
                                   temp + (i * 4));
            src_ptr += 1;
          }
          transpose_4x4(temp, 4, dst + col, dst_stride);
        }
        count++;
        src_ptr = src + count * src_stride * 4;
        dst += dst_stride * 4;
      } while (count < block_height);

      for (i = 0; i < block_residu; ++i) {
        filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst);
        src_ptr += src_stride;
        dst += dst_stride;
      }
    } else {
      // Width 2: plain row-by-row filtering with the 2-pixel store.
      for (i = 0; i < h; i++) {
        filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store2p, dst);
        src_ptr += src_stride;
        dst += dst_stride;
      }
    }
  }
}
778 :
779 : // Vertical convolution filtering
780 0 : static INLINE void store_8_pixel_only(const __m128i *x, uint8_t *dst) {
781 0 : __m128i u = _mm_packus_epi16(*x, *x);
782 : _mm_storel_epi64((__m128i *)dst, u);
783 0 : }
784 :
785 0 : static INLINE void accumulate_store_8_pixel(const __m128i *x, uint8_t *dst) {
786 0 : __m128i y = accumulate_store(x, dst);
787 : _mm_storel_epi64((__m128i *)dst, y);
788 0 : }
789 :
// Width-8 store dispatch, indexed by conv_params->ref:
// [0] overwrite destination, [1] average with destination.
static store_pixel_t store8pixelTab[2] = { store_8_pixel_only,
                                           accumulate_store_8_pixel };
792 :
// Vertically filters 8 horizontally-adjacent pixels with a 10/12-tap filter.
// Reads up to 12 rows starting at |src|; for 10 taps the first row slot is
// zeroed so the same six coefficient pairs in |f| apply unchanged. Returns
// the rounded result widened back to 8 lanes of 16-bit values.
static __m128i filter_vert_ssse3(const uint8_t *src, int src_stride,
                                 int tapsNum, __m128i *f) {
  __m128i s[12];
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i zero = _mm_setzero_si128();
  __m128i min_x2x3, max_x2x3, sum;
  int i = 0;
  int r = 0;

  if (10 == tapsNum) {
    i += 1;
    s[0] = zero;  // pad the unused leading tap row with zeros
  }
  while (i < 12) {
    s[i] = _mm_loadu_si128((__m128i const *)(src + r * src_stride));
    i += 1;
    r += 1;
  }

  // Interleave consecutive row pairs so maddubs applies two taps per lane.
  s[0] = _mm_unpacklo_epi8(s[0], s[1]);
  s[2] = _mm_unpacklo_epi8(s[2], s[3]);
  s[4] = _mm_unpacklo_epi8(s[4], s[5]);
  s[6] = _mm_unpacklo_epi8(s[6], s[7]);
  s[8] = _mm_unpacklo_epi8(s[8], s[9]);
  s[10] = _mm_unpacklo_epi8(s[10], s[11]);

  s[0] = _mm_maddubs_epi16(s[0], f[0]);
  s[2] = _mm_maddubs_epi16(s[2], f[1]);
  s[4] = _mm_maddubs_epi16(s[4], f[2]);
  s[6] = _mm_maddubs_epi16(s[6], f[3]);
  s[8] = _mm_maddubs_epi16(s[8], f[4]);
  s[10] = _mm_maddubs_epi16(s[10], f[5]);

  // min/max ordering makes the saturating-add sequence deterministic for
  // the two middle partial sums.
  min_x2x3 = _mm_min_epi16(s[4], s[6]);
  max_x2x3 = _mm_max_epi16(s[4], s[6]);
  sum = _mm_adds_epi16(s[0], s[2]);
  sum = _mm_adds_epi16(sum, s[10]);
  sum = _mm_adds_epi16(sum, s[8]);

  sum = _mm_adds_epi16(sum, min_x2x3);
  sum = _mm_adds_epi16(sum, max_x2x3);

  // Rounded descale ((x + 64) >> 7), clamp to u8, widen back to 16-bit.
  sum = _mm_mulhrs_epi16(sum, k_256);
  sum = _mm_packus_epi16(sum, sum);
  sum = _mm_unpacklo_epi8(sum, zero);
  return sum;
}
840 :
841 0 : static void filter_vert_horiz_parallel_ssse3(const uint8_t *src, int src_stride,
842 : __m128i *f, int tapsNum,
843 : store_pixel_t store_func,
844 : uint8_t *dst) {
845 0 : __m128i sum = filter_vert_ssse3(src, src_stride, tapsNum, f);
846 0 : store_func(&sum, dst);
847 0 : }
848 :
849 0 : static void filter_vert_compute_small(const uint8_t *src, int src_stride,
850 : __m128i *f, int tapsNum,
851 : store_pixel_t store_func, int h,
852 : uint8_t *dst, int dst_stride) {
853 0 : int rowIndex = 0;
854 : do {
855 0 : filter_vert_horiz_parallel_ssse3(src, src_stride, f, tapsNum, store_func,
856 : dst);
857 0 : rowIndex++;
858 0 : src += src_stride;
859 0 : dst += dst_stride;
860 0 : } while (rowIndex < h);
861 0 : }
862 :
863 0 : static void filter_vert_compute_large(const uint8_t *src, int src_stride,
864 : __m128i *f, int tapsNum,
865 : store_pixel_t store_func, int w, int h,
866 : uint8_t *dst, int dst_stride) {
867 : int col;
868 0 : int rowIndex = 0;
869 0 : const uint8_t *src_ptr = src;
870 0 : uint8_t *dst_ptr = dst;
871 :
872 : do {
873 0 : for (col = 0; col < w; col += 8) {
874 0 : filter_vert_horiz_parallel_ssse3(src_ptr, src_stride, f, tapsNum,
875 : store_func, dst_ptr);
876 0 : src_ptr += 8;
877 0 : dst_ptr += 8;
878 : }
879 0 : rowIndex++;
880 0 : src_ptr = src + rowIndex * src_stride;
881 0 : dst_ptr = dst + rowIndex * dst_stride;
882 0 : } while (rowIndex < h);
883 0 : }
884 :
885 0 : void av1_convolve_vert_ssse3(const uint8_t *src, int src_stride, uint8_t *dst,
886 : int dst_stride, int w, int h,
887 : const InterpFilterParams filter_params,
888 : const int subpel_y_q4, int y_step_q4,
889 : ConvolveParams *conv_params) {
890 : __m128i verf[6];
891 : SubpelFilterCoeffs vCoeffs;
892 : const uint8_t *src_ptr;
893 0 : uint8_t *dst_ptr = dst;
894 0 : store_pixel_t store2p = store2pixelTab[conv_params->ref];
895 0 : store_pixel_t store4p = store4pixelTab[conv_params->ref];
896 0 : store_pixel_t store8p = store8pixelTab[conv_params->ref];
897 0 : const int tapsNum = filter_params.taps;
898 :
899 0 : if (0 == subpel_y_q4 || 16 != y_step_q4) {
900 0 : av1_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params,
901 : subpel_y_q4, y_step_q4, conv_params);
902 0 : return;
903 : }
904 :
905 0 : vCoeffs = get_subpel_filter_ver_signal_dir(filter_params, subpel_y_q4 - 1);
906 :
907 0 : if (!vCoeffs) {
908 0 : av1_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params,
909 : subpel_y_q4, y_step_q4, conv_params);
910 0 : return;
911 : }
912 :
913 0 : verf[0] = *((const __m128i *)(vCoeffs));
914 0 : verf[1] = *((const __m128i *)(vCoeffs + 1));
915 0 : verf[2] = *((const __m128i *)(vCoeffs + 2));
916 0 : verf[3] = *((const __m128i *)(vCoeffs + 3));
917 0 : verf[4] = *((const __m128i *)(vCoeffs + 4));
918 0 : verf[5] = *((const __m128i *)(vCoeffs + 5));
919 :
920 0 : src -= src_stride * ((tapsNum >> 1) - 1);
921 0 : src_ptr = src;
922 :
923 0 : if (w > 4) {
924 0 : filter_vert_compute_large(src_ptr, src_stride, verf, tapsNum, store8p, w, h,
925 : dst_ptr, dst_stride);
926 0 : } else if (4 == w) {
927 0 : filter_vert_compute_small(src_ptr, src_stride, verf, tapsNum, store4p, h,
928 : dst_ptr, dst_stride);
929 0 : } else if (2 == w) {
930 0 : filter_vert_compute_small(src_ptr, src_stride, verf, tapsNum, store2p, h,
931 : dst_ptr, dst_stride);
932 : } else {
933 0 : assert(0);
934 : }
935 : }
936 :
937 0 : static void init_simd_horiz_filter(const int16_t *filter_ptr, int taps,
938 : int8_t (*simd_horiz_filter)[2][16]) {
939 : int shift;
940 0 : int offset = (12 - taps) / 2;
941 : const int16_t *filter_row;
942 0 : for (shift = 1; shift < SUBPEL_SHIFTS; ++shift) {
943 : int i;
944 0 : filter_row = filter_ptr + shift * taps;
945 0 : for (i = 0; i < offset; ++i) simd_horiz_filter[shift - 1][0][i] = 0;
946 :
947 0 : for (i = 0; i < offset + 2; ++i) simd_horiz_filter[shift - 1][1][i] = 0;
948 :
949 0 : for (i = 0; i < taps; ++i) {
950 0 : simd_horiz_filter[shift - 1][0][i + offset] = (int8_t)filter_row[i];
951 0 : simd_horiz_filter[shift - 1][1][i + offset + 2] = (int8_t)filter_row[i];
952 : }
953 :
954 0 : for (i = offset + taps; i < 16; ++i) simd_horiz_filter[shift - 1][0][i] = 0;
955 :
956 0 : for (i = offset + 2 + taps; i < 16; ++i)
957 0 : simd_horiz_filter[shift - 1][1][i] = 0;
958 : }
959 0 : }
960 :
961 0 : static void init_simd_vert_filter(const int16_t *filter_ptr, int taps,
962 : int8_t (*simd_vert_filter)[6][16]) {
963 : int shift;
964 0 : int offset = (12 - taps) / 2;
965 : const int16_t *filter_row;
966 0 : for (shift = 1; shift < SUBPEL_SHIFTS; ++shift) {
967 : int i;
968 0 : filter_row = filter_ptr + shift * taps;
969 0 : for (i = 0; i < 6; ++i) {
970 : int j;
971 0 : for (j = 0; j < 16; ++j) {
972 0 : int c = i * 2 + (j % 2) - offset;
973 0 : if (c >= 0 && c < taps)
974 0 : simd_vert_filter[shift - 1][i][j] = (int8_t)filter_row[c];
975 : else
976 0 : simd_vert_filter[shift - 1][i][j] = 0;
977 : }
978 : }
979 : }
980 0 : }
981 :
// Pairs an interpolation filter type with the statically allocated SIMD
// coefficient tables it expands into: 2 x 16 bytes per phase for the
// horizontal kernels, 6 x 16 bytes per phase for the vertical kernels.
typedef struct SimdFilter {
  InterpFilter interp_filter;
  int8_t (*simd_horiz_filter)[2][16];
  int8_t (*simd_vert_filter)[6][16];
} SimdFilter;
987 :
#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
#define MULTITAP_FILTER_NUM 1
// Extra multi-tap filters whose coefficient tables are filled in at runtime
// by av1_lowbd_convolve_init_ssse3().
SimdFilter simd_filters[MULTITAP_FILTER_NUM] = {
  { MULTITAP_SHARP, &sub_pel_filters_12sharp_signal_dir[0],
    &sub_pel_filters_12sharp_ver_signal_dir[0] },
};
#endif
995 :
#if USE_TEMPORALFILTER_12TAP
// 12-tap temporal filter; its tables are likewise filled in at runtime by
// av1_lowbd_convolve_init_ssse3().
SimdFilter temporal_simd_filter = {
  TEMPORALFILTER_12TAP, &sub_pel_filters_temporalfilter_12_signal_dir[0],
  &sub_pel_filters_temporalfilter_12_ver_signal_dir[0]
};
#endif
1002 :
// One-time initialization: expand every SIMD-accelerated interpolation
// filter's 16-bit tap banks into the signed 8-bit horizontal and vertical
// coefficient layouts consumed by the SSSE3 convolve kernels above.
void av1_lowbd_convolve_init_ssse3(void) {
#if USE_TEMPORALFILTER_12TAP
  {
    const InterpFilterParams params =
        av1_get_interp_filter_params(temporal_simd_filter.interp_filter);
    init_simd_horiz_filter(params.filter_ptr, params.taps,
                           temporal_simd_filter.simd_horiz_filter);
    init_simd_vert_filter(params.filter_ptr, params.taps,
                          temporal_simd_filter.simd_vert_filter);
  }
#endif
#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
  {
    int i;
    for (i = 0; i < MULTITAP_FILTER_NUM; ++i) {
      const InterpFilterParams params =
          av1_get_interp_filter_params(simd_filters[i].interp_filter);
      init_simd_horiz_filter(params.filter_ptr, params.taps,
                             simd_filters[i].simd_horiz_filter);
      init_simd_vert_filter(params.filter_ptr, params.taps,
                            simd_filters[i].simd_vert_filter);
    }
  }
#endif
}
|