Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <tmmintrin.h>
13 :
14 : #include "./aom_dsp_rtcd.h"
15 : #include "aom_dsp/aom_filter.h"
16 : #include "aom_dsp/x86/convolve.h"
17 : #include "aom_mem/aom_mem.h"
18 : #include "aom_ports/mem.h"
19 : #include "aom_ports/emmintrin_compat.h"
20 :
21 : // filters only for the 4_h8 convolution
22 : DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
23 : 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
24 : };
25 :
26 : DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
27 : 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
28 : };
29 :
30 : // filters for 8_h8 and 16_h8
31 : DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
32 : 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
33 : };
34 :
35 : DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
36 : 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
37 : };
38 :
39 : DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
40 : 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
41 : };
42 :
43 : DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
44 : 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
45 : };
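    : // Each table above is a byte-shuffle (pshufb) mask for _mm_shuffle_epi8: it
    : // gathers overlapping pairs of adjacent source pixels so that a single
    : // _mm_maddubs_epi16 can multiply each pair by one pair of filter taps.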
46 :
47 : // These are reused by the avx2 intrinsics.
48 : filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3;
49 : filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3;
50 : filter8_1dfunction aom_filter_block1d4_h8_intrin_ssse3;
51 :
52 0 : void aom_filter_block1d4_h8_intrin_ssse3(
53 : const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
54 : ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
55 : __m128i firstFilters, secondFilters, shuffle1, shuffle2;
56 : __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
57 : __m128i addFilterReg64, filtersReg, srcReg, minReg;
58 : unsigned int i;
59 :
60 : // create a register with 64 in each 16-bit lane (rounding offset for the final >> 7)
61 0 : addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
62 0 : filtersReg = _mm_loadu_si128((const __m128i *)filter);
63 : // convert the 16-bit (short) filter taps to 8-bit (byte) values and replicate
64 : // them in both lanes of the 128-bit register.
65 0 : filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
66 :
67 : // duplicate only the first 16 bits in the filter into the first lane
68 0 : firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
69 : // duplicate only the third 16 bits in the filter into the first lane
70 0 : secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
71 : // duplicate only the second 16 bits in the filter into the second lane
72 : // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
73 0 : firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
74 : // duplicate only the fourth 16 bits in the filter into the second lane
75 : // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
76 0 : secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);
77 :
78 : // loading the local filters
79 0 : shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8);
80 0 : shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
81 :
82 0 : for (i = 0; i < output_height; i++) {
83 0 : srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
84 :
85 : // filter the source buffer
86 0 : srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
87 0 : srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);
88 :
89 : // multiply 2 adjacent elements with the filter and add the result
90 0 : srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
91 0 : srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
92 :
93 : // move the upper 64 bits of each result down into the lower half
94 0 : srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
95 0 : srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);
96 :
97 0 : minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);
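    : // the two middle partial sums (taps k2..k5) are combined with min/max so the
    : // larger one is added last, which helps keep the intermediate saturating
    : // adds from clipping before the final sum is formed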
98 :
99 : // add and saturate all the results together
100 0 : srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
101 0 : srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
102 0 : srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
103 0 : srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
104 0 : srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
105 :
106 : // shift each 16-bit value right by 7 bits
107 0 : srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
108 :
109 : // shrink each 16-bit value to 8 bits with unsigned saturation
110 0 : srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
111 0 : src_ptr += src_pixels_per_line;
112 :
113 : // save only 4 bytes
114 0 : *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);
115 :
116 0 : output_ptr += output_pitch;
117 : }
118 0 : }
119 :
120 0 : void aom_filter_block1d8_h8_intrin_ssse3(
121 : const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
122 : ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
123 : __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
124 : __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
125 : __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
126 : __m128i addFilterReg64, filtersReg, minReg;
127 : unsigned int i;
128 :
129 : // create a register with 64 in each 16-bit lane (rounding offset for the final >> 7)
130 0 : addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
131 0 : filtersReg = _mm_loadu_si128((const __m128i *)filter);
132 : // convert the 16-bit (short) filter taps to 8-bit (byte) values and replicate
133 : // them in both lanes of the 128-bit register.
134 0 : filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
135 :
136 : // duplicate only the first 16 bits (first and second byte)
137 : // across 128 bit register
138 0 : firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
139 : // duplicate only the second 16 bits (third and fourth byte)
140 : // across 128 bit register
141 0 : secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
142 : // duplicate only the third 16 bits (fifth and sixth byte)
143 : // across 128 bit register
144 0 : thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
145 : // duplicate only the fourth 16 bits (seventh and eighth byte)
146 : // across 128 bit register
147 0 : forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
148 :
149 0 : filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
150 0 : filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
151 0 : filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
152 0 : filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
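    : // in the loop below each shuffle/maddubs pair produces, for output pixel x,
    : // one partial sum s[x + 2*i]*k[2*i] + s[x + 2*i + 1]*k[2*i + 1]; the four
    : // partial sums are then added to form the full 8-tap result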
153 :
154 0 : for (i = 0; i < output_height; i++) {
155 0 : srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
156 :
157 : // filter the source buffer
158 0 : srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg);
159 0 : srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg);
160 :
161 : // multiply 2 adjacent elements with the filter and add the result
162 0 : srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
163 0 : srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);
164 :
165 : // filter the source buffer
166 0 : srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg);
167 0 : srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg);
168 :
169 : // multiply 2 adjacent elements with the filter and add the result
170 0 : srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
171 0 : srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);
172 :
173 : // add and saturate all the results together
174 0 : minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
175 0 : srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
176 :
177 0 : srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
178 0 : srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
179 0 : srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
180 0 : srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
181 :
182 : // shift each 16-bit value right by 7 bits
183 0 : srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
184 :
185 : // shrink each 16-bit value to 8 bits with unsigned saturation
186 0 : srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
187 :
188 0 : src_ptr += src_pixels_per_line;
189 :
190 : // save only 8 bytes
191 : _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
192 :
193 0 : output_ptr += output_pitch;
194 : }
195 0 : }
196 :
197 0 : void aom_filter_block1d8_v8_intrin_ssse3(
198 : const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
199 : ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
200 : __m128i addFilterReg64, filtersReg, minReg;
201 : __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
202 : __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
203 : __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
204 : __m128i srcReg8;
205 : unsigned int i;
206 :
207 : // create a register with 64 in each 16-bit lane (rounding offset for the final >> 7)
208 0 : addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
209 0 : filtersReg = _mm_loadu_si128((const __m128i *)filter);
210 : // convert the 16-bit (short) filter taps to 8-bit (byte) values and replicate
211 : // them in both lanes of the 128-bit register.
212 0 : filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
213 :
214 : // duplicate only the first 16 bits in the filter
215 0 : firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
216 : // duplicate only the second 16 bits in the filter
217 0 : secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
218 : // duplicate only the third 16 bits in the filter
219 0 : thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
220 : // duplicate only the fourth 16 bits in the filter
221 0 : forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
222 :
223 : // load the first 7 rows of 8 bytes
224 0 : srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
225 0 : srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
226 0 : srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
227 0 : srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
228 0 : srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
229 0 : srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
230 0 : srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
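    : // in the loop below pairs of rows are interleaved byte-wise so that each
    : // _mm_maddubs_epi16 applies one pair of filter taps down every column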
231 :
232 0 : for (i = 0; i < output_height; i++) {
233 : // load the next (eighth) row of 8 bytes
234 0 : srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
235 :
236 : // merge the result together
237 0 : srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
238 0 : srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
239 :
240 : // merge the result together
241 0 : srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
242 0 : srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);
243 :
244 : // multiply 2 adjacent elements with the filter and add the result
245 0 : srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
246 0 : srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
247 0 : srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
248 0 : srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);
249 :
250 : // add and saturate the results together
251 0 : minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
252 0 : srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
253 0 : srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
254 0 : srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
255 0 : srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
256 0 : srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);
257 :
258 : // shift each 16-bit value right by 7 bits
259 0 : srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
260 :
261 : // shrink each 16-bit value to 8 bits with unsigned saturation
262 0 : srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
263 :
264 0 : src_ptr += src_pitch;
265 :
266 : // shift down a row
267 0 : srcReg1 = srcReg2;
268 0 : srcReg2 = srcReg3;
269 0 : srcReg3 = srcReg4;
270 0 : srcReg4 = srcReg5;
271 0 : srcReg5 = srcReg6;
272 0 : srcReg6 = srcReg7;
273 0 : srcReg7 = srcReg8;
274 :
275 : // save only 8 bytes convolve result
276 : _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);
277 :
278 0 : output_ptr += out_pitch;
279 : }
280 0 : }
281 :
282 : filter8_1dfunction aom_filter_block1d16_v8_ssse3;
283 : filter8_1dfunction aom_filter_block1d16_h8_ssse3;
284 : filter8_1dfunction aom_filter_block1d8_v8_ssse3;
285 : filter8_1dfunction aom_filter_block1d8_h8_ssse3;
286 : filter8_1dfunction aom_filter_block1d4_v8_ssse3;
287 : filter8_1dfunction aom_filter_block1d4_h8_ssse3;
288 : filter8_1dfunction aom_filter_block1d16_v8_avg_ssse3;
289 : filter8_1dfunction aom_filter_block1d16_h8_avg_ssse3;
290 : filter8_1dfunction aom_filter_block1d8_v8_avg_ssse3;
291 : filter8_1dfunction aom_filter_block1d8_h8_avg_ssse3;
292 : filter8_1dfunction aom_filter_block1d4_v8_avg_ssse3;
293 : filter8_1dfunction aom_filter_block1d4_h8_avg_ssse3;
294 : #if CONFIG_LOOP_RESTORATION
295 : filter8_1dfunction aom_filter_block1d16_v8_add_src_ssse3;
296 : filter8_1dfunction aom_filter_block1d16_h8_add_src_ssse3;
297 : filter8_1dfunction aom_filter_block1d8_v8_add_src_ssse3;
298 : filter8_1dfunction aom_filter_block1d8_h8_add_src_ssse3;
299 : filter8_1dfunction aom_filter_block1d4_v8_add_src_ssse3;
300 : filter8_1dfunction aom_filter_block1d4_h8_add_src_ssse3;
301 : #endif
302 :
303 : filter8_1dfunction aom_filter_block1d16_v2_ssse3;
304 : filter8_1dfunction aom_filter_block1d16_h2_ssse3;
305 : filter8_1dfunction aom_filter_block1d8_v2_ssse3;
306 : filter8_1dfunction aom_filter_block1d8_h2_ssse3;
307 : filter8_1dfunction aom_filter_block1d4_v2_ssse3;
308 : filter8_1dfunction aom_filter_block1d4_h2_ssse3;
309 : filter8_1dfunction aom_filter_block1d16_v2_avg_ssse3;
310 : filter8_1dfunction aom_filter_block1d16_h2_avg_ssse3;
311 : filter8_1dfunction aom_filter_block1d8_v2_avg_ssse3;
312 : filter8_1dfunction aom_filter_block1d8_h2_avg_ssse3;
313 : filter8_1dfunction aom_filter_block1d4_v2_avg_ssse3;
314 : filter8_1dfunction aom_filter_block1d4_h2_avg_ssse3;
315 :
316 : // void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
317 : // uint8_t *dst, ptrdiff_t dst_stride,
318 : // const int16_t *filter_x, int x_step_q4,
319 : // const int16_t *filter_y, int y_step_q4,
320 : // int w, int h);
321 : // void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
322 : // uint8_t *dst, ptrdiff_t dst_stride,
323 : // const int16_t *filter_x, int x_step_q4,
324 : // const int16_t *filter_y, int y_step_q4,
325 : // int w, int h);
326 : // void aom_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
327 : // uint8_t *dst, ptrdiff_t dst_stride,
328 : // const int16_t *filter_x, int x_step_q4,
329 : // const int16_t *filter_y, int y_step_q4,
330 : // int w, int h);
331 : // void aom_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
332 : // uint8_t *dst, ptrdiff_t dst_stride,
333 : // const int16_t *filter_x, int x_step_q4,
334 : // const int16_t *filter_y, int y_step_q4,
335 : // int w, int h);
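    : // The FUN_CONV_1D macros below expand to the wrappers prototyped above,
    : // dispatching to the 8-tap block1d kernels or the 2-tap bilinear (v2/h2)
    : // variants declared earlier depending on the filter taps.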
336 0 : FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
337 0 : FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
338 0 : FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
339 0 : FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
340 : ssse3);
341 :
342 : #if CONFIG_LOOP_RESTORATION
343 : FUN_CONV_1D_NO_BILINEAR(add_src_horiz, x_step_q4, filter_x, h, src, add_src_,
344 : ssse3);
345 : FUN_CONV_1D_NO_BILINEAR(add_src_vert, y_step_q4, filter_y, v,
346 : src - src_stride * 3, add_src_, ssse3);
347 : #endif
348 :
349 : #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
350 : out2, out3, out4, out5, out6, out7) \
351 : { \
352 : const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1); \
353 : const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3); \
354 : const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5); \
355 : const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7); \
356 : \
357 : const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1); \
358 : const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1); \
359 : const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3); \
360 : const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3); \
361 : \
362 : const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2); \
363 : const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2); \
364 : const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3); \
365 : const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \
366 : \
367 : out0 = _mm_unpacklo_epi64(tr2_0, tr2_0); \
368 : out1 = _mm_unpackhi_epi64(tr2_0, tr2_0); \
369 : out2 = _mm_unpacklo_epi64(tr2_1, tr2_1); \
370 : out3 = _mm_unpackhi_epi64(tr2_1, tr2_1); \
371 : out4 = _mm_unpacklo_epi64(tr2_2, tr2_2); \
372 : out5 = _mm_unpackhi_epi64(tr2_2, tr2_2); \
373 : out6 = _mm_unpacklo_epi64(tr2_3, tr2_3); \
374 : out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \
375 : }
376 :
377 0 : static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
378 : uint8_t *dst, const int16_t *x_filter) {
379 0 : const __m128i k_256 = _mm_set1_epi16(1 << 8);
380 0 : const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
381 : // pack and duplicate the filter values
382 0 : const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
383 0 : const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
384 0 : const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
385 0 : const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
386 0 : const __m128i A = _mm_loadl_epi64((const __m128i *)src_x);
387 0 : const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch));
388 0 : const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2));
389 0 : const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3));
390 0 : const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4));
391 0 : const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5));
392 0 : const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6));
393 0 : const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7));
394 : // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
395 0 : const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
396 : // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
397 0 : const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
398 : // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57
399 0 : const __m128i tr0_2 = _mm_unpacklo_epi16(E, F);
400 : // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77
401 0 : const __m128i tr0_3 = _mm_unpacklo_epi16(G, H);
402 : // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
403 0 : const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
404 : // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
405 0 : const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
406 : // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73
407 0 : const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3);
408 : // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77
409 0 : const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
410 : // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
411 0 : const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2);
412 0 : const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2);
413 0 : const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3);
414 0 : const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3);
415 : // multiply 2 adjacent elements with the filter and add the result
416 0 : const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
417 0 : const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
418 0 : const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
419 0 : const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
420 : // add and saturate the results together
421 0 : const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
422 0 : const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
423 0 : __m128i temp = _mm_adds_epi16(x0, x3);
424 0 : temp = _mm_adds_epi16(temp, min_x2x1);
425 0 : temp = _mm_adds_epi16(temp, max_x2x1);
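    : // _mm_mulhrs_epi16(x, 256) computes (x * 256 + (1 << 14)) >> 15, which equals
    : // (x + 64) >> 7, i.e. the same rounded shift used by the non-scaled paths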
426 : // round and shift each 16-bit value right by 7 bits
427 0 : temp = _mm_mulhrs_epi16(temp, k_256);
428 : // shrink each 16-bit value to 8 bits with unsigned saturation
429 0 : temp = _mm_packus_epi16(temp, temp);
430 : // save only 8 bytes convolve result
431 : _mm_storel_epi64((__m128i *)dst, temp);
432 0 : }
433 :
434 0 : static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride,
435 : uint8_t *dst, ptrdiff_t dst_stride) {
436 : __m128i A, B, C, D, E, F, G, H;
437 :
438 0 : A = _mm_loadl_epi64((const __m128i *)src);
439 0 : B = _mm_loadl_epi64((const __m128i *)(src + src_stride));
440 0 : C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
441 0 : D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
442 0 : E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4));
443 0 : F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5));
444 0 : G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6));
445 0 : H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7));
446 :
447 0 : TRANSPOSE_8X8(A, B, C, D, E, F, G, H, A, B, C, D, E, F, G, H);
448 :
449 : _mm_storel_epi64((__m128i *)dst, A);
450 0 : _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), B);
451 0 : _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), C);
452 0 : _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), D);
453 0 : _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), E);
454 0 : _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), F);
455 0 : _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), G);
456 0 : _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), H);
457 0 : }
458 :
459 0 : static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
460 : uint8_t *dst, ptrdiff_t dst_stride,
461 : const InterpKernel *x_filters, int x0_q4,
462 : int x_step_q4, int w, int h) {
463 : DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
464 : int x, y, z;
465 0 : src -= SUBPEL_TAPS / 2 - 1;
466 :
467 : // This function processes 8x8 areas. The intermediate height is not always
468 : // a multiple of 8, so force it to be a multiple of 8 here.
469 0 : y = h + (8 - (h & 0x7));
470 :
471 : do {
472 0 : int x_q4 = x0_q4;
473 0 : for (x = 0; x < w; x += 8) {
474 : // process 8 src_x steps
475 0 : for (z = 0; z < 8; ++z) {
476 0 : const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
477 0 : const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
478 0 : if (x_q4 & SUBPEL_MASK) {
479 0 : filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
480 : } else {
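    : // a filter phase of 0 is an identity copy; the +3 below undoes the earlier
    : // "src -= SUBPEL_TAPS / 2 - 1" adjustment to reach the source pixel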
481 : int i;
482 0 : for (i = 0; i < 8; ++i) {
483 0 : temp[z * 8 + i] = src_x[i * src_stride + 3];
484 : }
485 : }
486 0 : x_q4 += x_step_q4;
487 : }
488 :
489 : // transpose the 8x8 filtered values back to dst
490 0 : transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
491 : }
492 :
493 0 : src += src_stride * 8;
494 0 : dst += dst_stride * 8;
495 0 : } while (y -= 8);
496 0 : }
497 :
498 0 : static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
499 : uint8_t *dst, const int16_t *filter) {
500 0 : const __m128i k_256 = _mm_set1_epi16(1 << 8);
501 0 : const __m128i f_values = _mm_load_si128((const __m128i *)filter);
502 : // pack and duplicate the filter values
503 0 : const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
504 0 : const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
505 0 : const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
506 0 : const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
507 0 : const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
508 0 : const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
509 0 : const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
510 0 : const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
511 : // TRANSPOSE...
512 : // 00 01 02 03 04 05 06 07
513 : // 10 11 12 13 14 15 16 17
514 : // 20 21 22 23 24 25 26 27
515 : // 30 31 32 33 34 35 36 37
516 : //
517 : // TO
518 : //
519 : // 00 10 20 30
520 : // 01 11 21 31
521 : // 02 12 22 32
522 : // 03 13 23 33
523 : // 04 14 24 34
524 : // 05 15 25 35
525 : // 06 16 26 36
526 : // 07 17 27 37
527 : //
528 : // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
529 0 : const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
530 : // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
531 0 : const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
532 : // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
533 0 : const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
534 : // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
535 0 : const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
536 : // 02 03 12 13 22 23 32 33
537 0 : const __m128i s3s2 = _mm_srli_si128(s1s0, 8);
538 : // 06 07 16 17 26 27 36 37
539 0 : const __m128i s7s6 = _mm_srli_si128(s5s4, 8);
540 : // multiply 2 adjacent elements with the filter and add the result
541 0 : const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
542 0 : const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
543 0 : const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
544 0 : const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
545 : // add and saturate the results together
546 0 : const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
547 0 : const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
548 0 : __m128i temp = _mm_adds_epi16(x0, x3);
549 0 : temp = _mm_adds_epi16(temp, min_x2x1);
550 0 : temp = _mm_adds_epi16(temp, max_x2x1);
551 : // round and shift each 16-bit value right by 7 bits
552 0 : temp = _mm_mulhrs_epi16(temp, k_256);
553 : // shrink each 16-bit value to 8 bits with unsigned saturation
554 0 : temp = _mm_packus_epi16(temp, temp);
555 : // save only 4 bytes
556 0 : *(int *)dst = _mm_cvtsi128_si32(temp);
557 0 : }
558 :
559 0 : static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride,
560 : uint8_t *dst, ptrdiff_t dst_stride) {
561 0 : __m128i A = _mm_cvtsi32_si128(*(const int *)src);
562 0 : __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride));
563 0 : __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2));
564 0 : __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3));
565 : // 00 10 01 11 02 12 03 13
566 0 : const __m128i tr0_0 = _mm_unpacklo_epi8(A, B);
567 : // 20 30 21 31 22 32 23 33
568 0 : const __m128i tr0_1 = _mm_unpacklo_epi8(C, D);
569 : // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
570 0 : A = _mm_unpacklo_epi16(tr0_0, tr0_1);
571 0 : B = _mm_srli_si128(A, 4);
572 0 : C = _mm_srli_si128(A, 8);
573 0 : D = _mm_srli_si128(A, 12);
574 :
575 0 : *(int *)(dst) = _mm_cvtsi128_si32(A);
576 0 : *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B);
577 0 : *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C);
578 0 : *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D);
579 0 : }
580 :
581 0 : static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
582 : uint8_t *dst, ptrdiff_t dst_stride,
583 : const InterpKernel *x_filters, int x0_q4,
584 : int x_step_q4, int w, int h) {
585 : DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
586 : int x, y, z;
587 0 : src -= SUBPEL_TAPS / 2 - 1;
588 :
589 0 : for (y = 0; y < h; y += 4) {
590 0 : int x_q4 = x0_q4;
591 0 : for (x = 0; x < w; x += 4) {
592 : // process 4 src_x steps
593 0 : for (z = 0; z < 4; ++z) {
594 0 : const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
595 0 : const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
596 0 : if (x_q4 & SUBPEL_MASK) {
597 0 : filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
598 : } else {
599 : int i;
600 0 : for (i = 0; i < 4; ++i) {
601 0 : temp[z * 4 + i] = src_x[i * src_stride + 3];
602 : }
603 : }
604 0 : x_q4 += x_step_q4;
605 : }
606 :
607 : // transpose the 4x4 filtered values back to dst
608 0 : transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
609 : }
610 :
611 0 : src += src_stride * 4;
612 0 : dst += dst_stride * 4;
613 : }
614 0 : }
615 :
616 0 : static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
617 : uint8_t *dst, const int16_t *filter) {
618 0 : const __m128i k_256 = _mm_set1_epi16(1 << 8);
619 0 : const __m128i f_values = _mm_load_si128((const __m128i *)filter);
620 : // pack and duplicate the filter values
621 0 : const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
622 0 : const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
623 0 : const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
624 0 : const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
625 0 : const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr);
626 0 : const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch));
627 0 : const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2));
628 0 : const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3));
629 0 : const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4));
630 0 : const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5));
631 0 : const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6));
632 0 : const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7));
633 0 : const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
634 0 : const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
635 0 : const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
636 0 : const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
637 : // multiply 2 adjacent elements with the filter and add the result
638 0 : const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
639 0 : const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
640 0 : const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
641 0 : const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
642 : // add and saturate the results together
643 0 : const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
644 0 : const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
645 0 : __m128i temp = _mm_adds_epi16(x0, x3);
646 0 : temp = _mm_adds_epi16(temp, min_x2x1);
647 0 : temp = _mm_adds_epi16(temp, max_x2x1);
648 : // round and shift each 16-bit value right by 7 bits
649 0 : temp = _mm_mulhrs_epi16(temp, k_256);
650 : // shrink each 16-bit value to 8 bits with unsigned saturation
651 0 : temp = _mm_packus_epi16(temp, temp);
652 : // save only 4 bytes
653 0 : *(int *)dst = _mm_cvtsi128_si32(temp);
654 0 : }
655 :
656 0 : static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
657 : uint8_t *dst, ptrdiff_t dst_stride,
658 : const InterpKernel *y_filters, int y0_q4,
659 : int y_step_q4, int w, int h) {
660 : int y;
661 0 : int y_q4 = y0_q4;
662 :
663 0 : src -= src_stride * (SUBPEL_TAPS / 2 - 1);
664 0 : for (y = 0; y < h; ++y) {
665 0 : const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
666 0 : const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
667 :
668 0 : if (y_q4 & SUBPEL_MASK) {
669 0 : filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
670 : } else {
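    : // phase 0 is an identity copy; the centre row is 3 rows down because src
    : // was moved back by SUBPEL_TAPS / 2 - 1 rows above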
671 0 : memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
672 : }
673 :
674 0 : y_q4 += y_step_q4;
675 : }
676 0 : }
677 :
678 0 : static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
679 : uint8_t *dst, const int16_t *filter) {
680 0 : const __m128i k_256 = _mm_set1_epi16(1 << 8);
681 0 : const __m128i f_values = _mm_load_si128((const __m128i *)filter);
682 : // pack and duplicate the filter values
683 0 : const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
684 0 : const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
685 0 : const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
686 0 : const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
687 0 : const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
688 0 : const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
689 0 : const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
690 0 : const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
691 0 : const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
692 0 : const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
693 0 : const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
694 0 : const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
695 0 : const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
696 0 : const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
697 0 : const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
698 0 : const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
699 : // multiply 2 adjacent elements with the filter and add the result
700 0 : const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
701 0 : const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
702 0 : const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
703 0 : const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
704 : // add and saturate the results together
705 0 : const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
706 0 : const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
707 0 : __m128i temp = _mm_adds_epi16(x0, x3);
708 0 : temp = _mm_adds_epi16(temp, min_x2x1);
709 0 : temp = _mm_adds_epi16(temp, max_x2x1);
710 : // round and shift each 16-bit value right by 7 bits
711 0 : temp = _mm_mulhrs_epi16(temp, k_256);
712 : // shrink each 16-bit value to 8 bits with unsigned saturation
713 0 : temp = _mm_packus_epi16(temp, temp);
714 : // save only 8 bytes convolve result
715 : _mm_storel_epi64((__m128i *)dst, temp);
716 0 : }
717 :
718 0 : static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
719 : uint8_t *dst, ptrdiff_t dst_stride,
720 : const InterpKernel *y_filters, int y0_q4,
721 : int y_step_q4, int w, int h) {
722 : int y;
723 0 : int y_q4 = y0_q4;
724 :
725 0 : src -= src_stride * (SUBPEL_TAPS / 2 - 1);
726 0 : for (y = 0; y < h; ++y) {
727 0 : const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
728 0 : const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
729 0 : if (y_q4 & SUBPEL_MASK) {
730 0 : filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
731 : } else {
732 0 : memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
733 : }
734 0 : y_q4 += y_step_q4;
735 : }
736 0 : }
737 :
738 0 : static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
739 : uint8_t *dst, const int16_t *filter, int w) {
740 0 : const __m128i k_256 = _mm_set1_epi16(1 << 8);
741 0 : const __m128i f_values = _mm_load_si128((const __m128i *)filter);
742 : // pack and duplicate the filter values
743 0 : const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
744 0 : const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
745 0 : const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
746 0 : const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
747 : int i;
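    : // each iteration filters 16 columns: the rows are interleaved into low and
    : // high byte halves, so two maddubs per tap pair cover all 16 results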
748 :
749 0 : for (i = 0; i < w; i += 16) {
750 0 : const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr);
751 0 : const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
752 0 : const __m128i C =
753 0 : _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
754 0 : const __m128i D =
755 0 : _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
756 0 : const __m128i E =
757 0 : _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
758 0 : const __m128i F =
759 0 : _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
760 0 : const __m128i G =
761 0 : _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
762 0 : const __m128i H =
763 0 : _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
764 : // merge the result together
765 0 : const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B);
766 0 : const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H);
767 0 : const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B);
768 0 : const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H);
769 : // multiply 2 adjacent elements with the filter and add the result
770 0 : const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0);
771 0 : const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6);
772 0 : const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0);
773 0 : const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6);
774 : // add and saturate the results together
775 0 : const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo);
776 0 : const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi);
777 : // merge the result together
778 0 : const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D);
779 0 : const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D);
780 : // multiply 2 adjacent elements with the filter and add the result
781 0 : const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2);
782 0 : const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2);
783 : // merge the result together
784 0 : const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F);
785 0 : const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F);
786 : // multiply 2 adjacent elements with the filter and add the result
787 0 : const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4);
788 0 : const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4);
789 : // add and saturate the results together
790 0 : __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo));
791 0 : __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi));
792 :
793 : // add and saturate the results together
794 0 : temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo));
795 0 : temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi));
796 : // round and shift each 16-bit value right by 7 bits
797 0 : temp_lo = _mm_mulhrs_epi16(temp_lo, k_256);
798 0 : temp_hi = _mm_mulhrs_epi16(temp_hi, k_256);
799 : // shrink each 16-bit value to 8 bits with unsigned saturation; the low half
800 : // holds the first convolve result and the high half holds the second
801 : // convolve result
802 0 : temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
803 0 : src_ptr += 16;
804 : // save 16 bytes convolve result
805 0 : _mm_store_si128((__m128i *)&dst[i], temp_hi);
806 : }
807 0 : }
808 :
809 0 : static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride,
810 : uint8_t *dst, ptrdiff_t dst_stride,
811 : const InterpKernel *y_filters, int y0_q4,
812 : int y_step_q4, int w, int h) {
813 : int y;
814 0 : int y_q4 = y0_q4;
815 :
816 0 : src -= src_stride * (SUBPEL_TAPS / 2 - 1);
817 0 : for (y = 0; y < h; ++y) {
818 0 : const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
819 0 : const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
820 0 : if (y_q4 & SUBPEL_MASK) {
821 0 : filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
822 : w);
823 : } else {
824 0 : memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
825 : }
826 0 : y_q4 += y_step_q4;
827 : }
828 0 : }
829 :
830 0 : static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,
831 : uint8_t *dst, ptrdiff_t dst_stride,
832 : const InterpKernel *const x_filters, int x0_q4,
833 : int x_step_q4, const InterpKernel *const y_filters,
834 : int y0_q4, int y_step_q4, int w, int h) {
835 : // Note: Fixed size intermediate buffer, temp, places limits on parameters.
836 : // 2d filtering proceeds in 2 steps:
837 : // (1) Interpolate horizontally into an intermediate buffer, temp.
838 : // (2) Interpolate temp vertically to derive the sub-pixel result.
839 : // Deriving the maximum number of rows in the temp buffer (135):
840 : // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
841 : // --Largest block size is 64x64 pixels.
842 : // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
843 : // original frame (in 1/16th pixel units).
844 : // --Must round-up because block may be located at sub-pixel position.
845 : // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
846 : // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
847 : // --Require an additional 8 rows for the horiz_w8 transpose tail.
848 : DECLARE_ALIGNED(16, uint8_t, temp[(MAX_EXT_SIZE + 8) * MAX_SB_SIZE]);
849 0 : const int intermediate_height =
850 0 : (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
851 :
852 0 : assert(w <= MAX_SB_SIZE);
853 0 : assert(h <= MAX_SB_SIZE);
854 0 : assert(y_step_q4 <= 32);
855 0 : assert(x_step_q4 <= 32);
856 :
857 0 : if (w >= 8) {
858 0 : scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
859 : src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
860 : x_step_q4, w, intermediate_height);
861 : } else {
862 0 : scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
863 : src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
864 : x_step_q4, w, intermediate_height);
865 : }
866 :
867 0 : if (w >= 16) {
868 0 : scaledconvolve_vert_w16(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
869 : MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
870 : y_step_q4, w, h);
871 0 : } else if (w == 8) {
872 0 : scaledconvolve_vert_w8(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
873 : MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
874 : y_step_q4, w, h);
875 : } else {
876 0 : scaledconvolve_vert_w4(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
877 : MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
878 : y_step_q4, w, h);
879 : }
880 0 : }
881 :
882 0 : static const InterpKernel *get_filter_base(const int16_t *filter) {
883 : // NOTE: This assumes that the filter table is 256-byte aligned.
884 : // TODO(agrange) Modify to make independent of table alignment.
885 0 : return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
886 : }
887 :
888 0 : static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
889 0 : return (int)((const InterpKernel *)(intptr_t)f - base);
890 : }
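    : // filter_x/filter_y point into a 16-entry table of 8-tap kernels (16 bytes
    : // per kernel, hence the 256-byte alignment); the offset recovered above is
    : // the sub-pixel phase in 1/16-pel (q4) units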
891 :
892 0 : void aom_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
893 : ptrdiff_t dst_stride, const int16_t *filter_x,
894 : int x_step_q4, const int16_t *filter_y, int y_step_q4,
895 : int w, int h) {
896 0 : const InterpKernel *const filters_x = get_filter_base(filter_x);
897 0 : const int x0_q4 = get_filter_offset(filter_x, filters_x);
898 :
899 0 : const InterpKernel *const filters_y = get_filter_base(filter_y);
900 0 : const int y0_q4 = get_filter_offset(filter_y, filters_y);
901 :
902 0 : scaledconvolve2d(src, src_stride, dst, dst_stride, filters_x, x0_q4,
903 : x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
904 0 : }
905 :
906 : // void aom_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
907 : // uint8_t *dst, ptrdiff_t dst_stride,
908 : // const int16_t *filter_x, int x_step_q4,
909 : // const int16_t *filter_y, int y_step_q4,
910 : // int w, int h);
911 : // void aom_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
912 : // uint8_t *dst, ptrdiff_t dst_stride,
913 : // const int16_t *filter_x, int x_step_q4,
914 : // const int16_t *filter_y, int y_step_q4,
915 : // int w, int h);
916 0 : FUN_CONV_2D(, ssse3);
917 0 : FUN_CONV_2D(avg_, ssse3);
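    : // FUN_CONV_2D builds the 2-D convolutions prototyped above by running the
    : // horizontal pass into an intermediate buffer and then the vertical pass.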
918 : #if CONFIG_LOOP_RESTORATION
919 : FUN_CONV_2D_NO_BILINEAR(add_src_, add_src_, ssse3);
920 : #endif