Line data Source code
1 : /*
2 : * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 : *
4 : * Use of this source code is governed by a BSD-style license
5 : * that can be found in the LICENSE file in the root of the source
6 : * tree. An additional intellectual property rights grant can be found
7 : * in the file PATENTS. All contributing project authors may
8 : * be found in the AUTHORS file in the root of the source tree.
9 : */
10 :
11 : #include <tmmintrin.h>
12 :
13 : #include "./vpx_dsp_rtcd.h"
14 : #include "vpx_dsp/vpx_filter.h"
15 : #include "vpx_dsp/x86/convolve.h"
16 : #include "vpx_mem/vpx_mem.h"
17 : #include "vpx_ports/mem.h"
18 : #include "vpx_ports/emmintrin_compat.h"
19 :
20 : // filters only for the 4_h8 convolution
21 : DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = {
22 : 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
23 : };
24 :
25 : DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = {
26 : 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
27 : };
28 :
29 : // filters for 8_h8 and 16_h8
30 : DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = {
31 : 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
32 : };
33 :
34 : DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = {
35 : 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
36 : };
37 :
38 : DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = {
39 : 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
40 : };
41 :
42 : DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
43 : 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
44 : };
45 :
46 : // These are reused by the avx2 intrinsics.
47 : filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
48 : filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
49 : filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
50 :
// 8-tap horizontal filter producing 4 output pixels per row.
// For each of output_height rows, reads the 16 bytes at src_ptr - 3 (which
// covers the 8-tap windows of 4 adjacent outputs), convolves with the taps,
// rounds (add 64, arithmetic shift right by 7) and saturates to 8 bits
// before storing 4 bytes to output_ptr.
// Note: the taps are narrowed to signed bytes via packs_epi16, so each
// 16-bit tap is assumed to fit in [-128, 127].
void vpx_filter_block1d4_h8_intrin_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i firstFilters, secondFilters, shuffle1, shuffle2;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
  __m128i addFilterReg64, filtersReg, srcReg, minReg;
  unsigned int i;

  // rounding constant: eight 16-bit lanes of 64 (for the final >> 7)
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits in the filter into the first lane
  firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
  // duplicate only the third 16 bit in the filter into the first lane
  secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
  // duplicate only the second 16 bits in the filter into the second lane
  // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
  firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
  // duplicate only the fourth 16 bits in the filter into the second lane
  // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
  secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);

  // loading the local shuffle masks
  shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8);
  shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);

  for (i = 0; i < output_height; i++) {
    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // gather the source byte pairs for taps k0..k3 / k4..k7
    srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);

    // multiply 2 adjacent elements with the filter and add the result:
    // low half of srcRegFilt1 holds the k0/k1 terms, high half k2/k3;
    // srcRegFilt2 likewise holds k4/k5 and k6/k7 terms
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

    // extract the higher half of the lane
    srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
    srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);

    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);

    // add and saturate all four partial sums; summing via min then max adds
    // the same two values but fixes the order in which 16-bit saturation can
    // occur, keeping results consistent with the assembly version
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift by 7 bit each 16 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
    src_ptr += src_pixels_per_line;

    // save only 4 bytes
    // NOTE(review): 4-byte store through an int cast — presumably output_ptr
    // is suitably aligned for the target; confirm if porting
    *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);

    output_ptr += output_pitch;
  }
}
118 :
// 8-tap horizontal filter producing 8 output pixels per row.
// For each of output_height rows, reads the 16 bytes at src_ptr - 3 (the
// union of the 8-tap windows of 8 adjacent outputs), convolves with the
// taps, rounds (add 64, shift right by 7) and saturates to 8 bits before
// storing 8 bytes to output_ptr.
// Note: the taps are narrowed to signed bytes via packs_epi16, so each
// 16-bit tap is assumed to fit in [-128, 127].
void vpx_filter_block1d8_h8_intrin_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
  __m128i addFilterReg64, filtersReg, minReg;
  unsigned int i;

  // rounding constant: eight 16-bit lanes of 64 (for the final >> 7)
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 128 bit register: taps k0/k1
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across 128 bit register: taps k2/k3
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 128 bit register: taps k4/k5
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across 128 bit register: taps k6/k7
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);

  for (i = 0; i < output_height; i++) {
    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // gather the byte pairs for tap pairs k0/k1 and k2/k3
    srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

    // gather the byte pairs for tap pairs k4/k5 and k6/k7
    srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg);
    srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);

    // add and saturate all four partial sums; summing the middle pair via
    // min then max adds the same two values but fixes the order in which
    // 16-bit saturation can occur
    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);

    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift by 7 bit each 16 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

    src_ptr += src_pixels_per_line;

    // save only 8 bytes
    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);

    output_ptr += output_pitch;
  }
}
195 :
// 8-tap vertical filter producing one 8-pixel-wide output row per iteration.
// A sliding window of seven 8-byte source rows is kept in registers; each
// iteration loads the eighth row, convolves the eight rows with the taps,
// rounds (add 64, shift right by 7), saturates to 8 bits and stores 8 bytes.
// Note: the taps are narrowed to signed bytes via packs_epi16, so each
// 16-bit tap is assumed to fit in [-128, 127].
void vpx_filter_block1d8_v8_intrin_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i addFilterReg64, filtersReg, minReg;
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
  __m128i srcReg8;
  unsigned int i;

  // rounding constant: eight 16-bit lanes of 64 (for the final >> 7)
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate taps k0/k1 across every byte pair of the register
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate taps k2/k3
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate taps k4/k5
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate taps k6/k7
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  // load the first 7 rows of 8 bytes
  srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
  srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
  srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));

  for (i = 0; i < output_height; i++) {
    // load the last 8 bytes
    srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));

    // interleave adjacent rows so maddubs can pair each pixel with the one
    // below it
    srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);

    // merge the remaining row pairs
    srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
    srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);

    // add and saturate the four partial sums; summing the middle pair via
    // min then max adds the same two values but fixes the order in which
    // 16-bit saturation can occur
    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

    src_ptr += src_pitch;

    // shift the register window down a row
    srcReg1 = srcReg2;
    srcReg2 = srcReg3;
    srcReg3 = srcReg4;
    srcReg4 = srcReg5;
    srcReg5 = srcReg6;
    srcReg6 = srcReg7;
    srcReg7 = srcReg8;

    // save only 8 bytes convolve result
    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);

    output_ptr += out_pitch;
  }
}
280 :
281 : filter8_1dfunction vpx_filter_block1d16_v8_ssse3;
282 : filter8_1dfunction vpx_filter_block1d16_h8_ssse3;
283 : filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
284 : filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
285 : filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
286 : filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
287 : filter8_1dfunction vpx_filter_block1d16_v8_avg_ssse3;
288 : filter8_1dfunction vpx_filter_block1d16_h8_avg_ssse3;
289 : filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3;
290 : filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3;
291 : filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3;
292 : filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3;
293 :
294 : filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
295 : filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
296 : filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
297 : filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
298 : filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
299 : filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
300 : filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3;
301 : filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3;
302 : filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3;
303 : filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3;
304 : filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3;
305 : filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3;
306 :
307 : // void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
308 : // uint8_t *dst, ptrdiff_t dst_stride,
309 : // const int16_t *filter_x, int x_step_q4,
310 : // const int16_t *filter_y, int y_step_q4,
311 : // int w, int h);
312 : // void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
313 : // uint8_t *dst, ptrdiff_t dst_stride,
314 : // const int16_t *filter_x, int x_step_q4,
315 : // const int16_t *filter_y, int y_step_q4,
316 : // int w, int h);
317 : // void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
318 : // uint8_t *dst, ptrdiff_t dst_stride,
319 : // const int16_t *filter_x, int x_step_q4,
320 : // const int16_t *filter_y, int y_step_q4,
321 : // int w, int h);
322 : // void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
323 : // uint8_t *dst, ptrdiff_t dst_stride,
324 : // const int16_t *filter_x, int x_step_q4,
325 : // const int16_t *filter_y, int y_step_q4,
326 : // int w, int h);
// Instantiate the public vpx_convolve8_{horiz,vert,avg_horiz,avg_vert}_ssse3
// entry points (prototyped above) from the 1-D filter primitives declared in
// this file; the vertical variants start 3 rows above src for the 8-tap
// window.
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
            ssse3);
332 :
// Transpose an 8x8 block of bytes held in the low 8 bytes of eight XMM
// registers: in0..in7 are the source rows, out0..out7 receive the transposed
// rows in their low halves.  All intermediates are computed before any out
// is written, so in/out may alias the same variables.
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                      out2, out3, out4, out5, out6, out7)                 \
  {                                                                       \
    const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1);                    \
    const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3);                    \
    const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5);                    \
    const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7);                    \
                                                                          \
    const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1);               \
    const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1);               \
    const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3);               \
    const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3);               \
                                                                          \
    const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2);               \
    const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2);               \
    const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3);               \
    const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3);               \
                                                                          \
    out0 = _mm_unpacklo_epi64(tr2_0, tr2_0);                              \
    out1 = _mm_unpackhi_epi64(tr2_0, tr2_0);                              \
    out2 = _mm_unpacklo_epi64(tr2_1, tr2_1);                              \
    out3 = _mm_unpackhi_epi64(tr2_1, tr2_1);                              \
    out4 = _mm_unpacklo_epi64(tr2_2, tr2_2);                              \
    out5 = _mm_unpackhi_epi64(tr2_2, tr2_2);                              \
    out6 = _mm_unpacklo_epi64(tr2_3, tr2_3);                              \
    out7 = _mm_unpackhi_epi64(tr2_3, tr2_3);                              \
  }
360 :
// Apply the 8-tap x_filter horizontally to the first 8 pixels of each of 8
// consecutive source rows; the 8 results (one per row) are written as 8
// contiguous bytes to dst, i.e. one transposed output column.  Only the low
// byte of each 16-bit tap is used, so taps are assumed to fit in a signed
// byte.  Rounding is done with mulhrs by 256, which equals (x + 64) >> 7.
static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *x_filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
  // pack and duplicate the filter values (low byte of each tap)
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_x);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3));
  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4));
  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5));
  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6));
  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7));
  // transpose in byte pairs so each register holds one tap pair per row:
  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
  // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57
  const __m128i tr0_2 = _mm_unpacklo_epi16(E, F);
  // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77
  const __m128i tr0_3 = _mm_unpacklo_epi16(G, H);
  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
  const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73
  const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
  const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2);
  const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2);
  const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3);
  const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together; summing the middle pair via min
  // then max adds the same two values but fixes the saturation order
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 8 bytes convolve result
  _mm_storel_epi64((__m128i *)dst, temp);
}
417 :
// Transpose an 8x8 byte block: load one 8-byte row per register, transpose
// in-register with TRANSPOSE_8X8, then store each resulting row out with the
// destination stride.
static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride) {
  __m128i r0 = _mm_loadl_epi64((const __m128i *)(src + src_stride * 0));
  __m128i r1 = _mm_loadl_epi64((const __m128i *)(src + src_stride * 1));
  __m128i r2 = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
  __m128i r3 = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
  __m128i r4 = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4));
  __m128i r5 = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5));
  __m128i r6 = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6));
  __m128i r7 = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7));

  // in-place transpose: inputs and outputs may alias
  TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, r0, r1, r2, r3, r4, r5, r6,
                r7);

  _mm_storel_epi64((__m128i *)(dst + dst_stride * 0), r0);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), r1);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), r2);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), r3);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), r4);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), r5);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), r6);
  _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), r7);
}
442 :
443 0 : static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
444 : uint8_t *dst, ptrdiff_t dst_stride,
445 : const InterpKernel *x_filters, int x0_q4,
446 : int x_step_q4, int w, int h) {
447 : DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
448 : int x, y, z;
449 0 : src -= SUBPEL_TAPS / 2 - 1;
450 :
451 : // This function processes 8x8 areas. The intermediate height is not always
452 : // a multiple of 8, so force it to be a multiple of 8 here.
453 0 : y = h + (8 - (h & 0x7));
454 :
455 : do {
456 0 : int x_q4 = x0_q4;
457 0 : for (x = 0; x < w; x += 8) {
458 : // process 8 src_x steps
459 0 : for (z = 0; z < 8; ++z) {
460 0 : const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
461 0 : const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
462 0 : if (x_q4 & SUBPEL_MASK) {
463 0 : filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
464 : } else {
465 : int i;
466 0 : for (i = 0; i < 8; ++i) {
467 0 : temp[z * 8 + i] = src_x[i * src_stride + 3];
468 : }
469 : }
470 0 : x_q4 += x_step_q4;
471 : }
472 :
473 : // transpose the 8x8 filters values back to dst
474 0 : transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
475 : }
476 :
477 0 : src += src_stride * 8;
478 0 : dst += dst_stride * 8;
479 0 : } while (y -= 8);
480 0 : }
481 :
// Apply the 8-tap filter horizontally to the first 8 pixels of each of 4
// consecutive source rows; the 4 results (one per row) are written as 4
// contiguous bytes to dst, i.e. one transposed output column.  Only the low
// byte of each 16-bit tap is used, so taps are assumed to fit in a signed
// byte.  Rounding is done with mulhrs by 256, which equals (x + 64) >> 7.
static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values (low byte of each tap)
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  // TRANSPOSE...
  // 00 01 02 03 04 05 06 07
  // 10 11 12 13 14 15 16 17
  // 20 21 22 23 24 25 26 27
  // 30 31 32 33 34 35 36 37
  //
  // TO
  //
  // 00 10 20 30
  // 01 11 21 31
  // 02 12 22 32
  // 03 13 23 33
  // 04 14 24 34
  // 05 15 25 35
  // 06 16 26 36
  // 07 17 27 37
  //
  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
  const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
  const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // 02 03 12 13 22 23 32 33
  const __m128i s3s2 = _mm_srli_si128(s1s0, 8);
  // 06 07 16 17 26 27 36 37
  const __m128i s7s6 = _mm_srli_si128(s5s4, 8);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together; summing the middle pair via min
  // then max adds the same two values but fixes the saturation order
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 4 bytes
  *(int *)dst = _mm_cvtsi128_si32(temp);
}
542 :
// Transpose a 4x4 byte block from src (src_stride apart) into dst
// (dst_stride apart) using SSE2 unpacks.
static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride) {
  // gather the four 4-byte source rows
  const __m128i row0 = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 0));
  const __m128i row1 = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 1));
  const __m128i row2 = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2));
  const __m128i row3 = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3));
  // interleave bytes: 00 10 01 11 02 12 03 13
  const __m128i lo = _mm_unpacklo_epi8(row0, row1);
  // interleave bytes: 20 30 21 31 22 32 23 33
  const __m128i hi = _mm_unpacklo_epi8(row2, row3);
  // interleave 16-bit pairs, giving the full transpose:
  // 00 10 20 30 | 01 11 21 31 | 02 12 22 32 | 03 13 23 33
  const __m128i t = _mm_unpacklo_epi16(lo, hi);

  // peel off one transposed 4-byte row at a time
  *(int *)(dst + dst_stride * 0) = _mm_cvtsi128_si32(t);
  *(int *)(dst + dst_stride * 1) = _mm_cvtsi128_si32(_mm_srli_si128(t, 4));
  *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(_mm_srli_si128(t, 8));
  *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(_mm_srli_si128(t, 12));
}
564 :
565 0 : static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
566 : uint8_t *dst, ptrdiff_t dst_stride,
567 : const InterpKernel *x_filters, int x0_q4,
568 : int x_step_q4, int w, int h) {
569 : DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
570 : int x, y, z;
571 0 : src -= SUBPEL_TAPS / 2 - 1;
572 :
573 0 : for (y = 0; y < h; y += 4) {
574 0 : int x_q4 = x0_q4;
575 0 : for (x = 0; x < w; x += 4) {
576 : // process 4 src_x steps
577 0 : for (z = 0; z < 4; ++z) {
578 0 : const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
579 0 : const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
580 0 : if (x_q4 & SUBPEL_MASK) {
581 0 : filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
582 : } else {
583 : int i;
584 0 : for (i = 0; i < 4; ++i) {
585 0 : temp[z * 4 + i] = src_x[i * src_stride + 3];
586 : }
587 : }
588 0 : x_q4 += x_step_q4;
589 : }
590 :
591 : // transpose the 4x4 filters values back to dst
592 0 : transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
593 : }
594 :
595 0 : src += src_stride * 4;
596 0 : dst += dst_stride * 4;
597 : }
598 0 : }
599 :
// Apply the 8-tap filter vertically to 4 columns: reads 4 bytes from each of
// 8 rows (src_pitch apart), convolves column-wise, rounds (mulhrs by 256 ==
// (x + 64) >> 7), saturates to 8 bits and writes 4 bytes to dst.  Only the
// low byte of each 16-bit tap is used, so taps are assumed to fit in a
// signed byte.
static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                 uint8_t *dst, const int16_t *filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values (low byte of each tap)
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr);
  const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch));
  const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3));
  const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4));
  const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5));
  const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6));
  const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7));
  // interleave adjacent rows so maddubs pairs each pixel with the one below
  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together; summing the middle pair via min
  // then max adds the same two values but fixes the saturation order
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 4 bytes
  *(int *)dst = _mm_cvtsi128_si32(temp);
}
639 :
640 0 : static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
641 : uint8_t *dst, ptrdiff_t dst_stride,
642 : const InterpKernel *y_filters, int y0_q4,
643 : int y_step_q4, int w, int h) {
644 : int y;
645 0 : int y_q4 = y0_q4;
646 :
647 0 : src -= src_stride * (SUBPEL_TAPS / 2 - 1);
648 0 : for (y = 0; y < h; ++y) {
649 0 : const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
650 0 : const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
651 :
652 0 : if (y_q4 & SUBPEL_MASK) {
653 0 : filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
654 : } else {
655 0 : memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
656 : }
657 :
658 0 : y_q4 += y_step_q4;
659 : }
660 0 : }
661 :
// Apply the 8-tap filter vertically to 8 columns: reads 8 bytes from each of
// 8 rows (src_pitch apart), convolves column-wise, rounds (mulhrs by 256 ==
// (x + 64) >> 7), saturates to 8 bits and writes 8 bytes to dst.  Only the
// low byte of each 16-bit tap is used, so taps are assumed to fit in a
// signed byte.
static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                 uint8_t *dst, const int16_t *filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values (low byte of each tap)
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
  // interleave adjacent rows so maddubs pairs each pixel with the one below
  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together; summing the middle pair via min
  // then max adds the same two values but fixes the saturation order
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 8 bytes convolve result
  _mm_storel_epi64((__m128i *)dst, temp);
}
701 :
702 0 : static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
703 : uint8_t *dst, ptrdiff_t dst_stride,
704 : const InterpKernel *y_filters, int y0_q4,
705 : int y_step_q4, int w, int h) {
706 : int y;
707 0 : int y_q4 = y0_q4;
708 :
709 0 : src -= src_stride * (SUBPEL_TAPS / 2 - 1);
710 0 : for (y = 0; y < h; ++y) {
711 0 : const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
712 0 : const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
713 0 : if (y_q4 & SUBPEL_MASK) {
714 0 : filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
715 : } else {
716 0 : memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
717 : }
718 0 : y_q4 += y_step_q4;
719 : }
720 0 : }
721 :
// Apply one 8-tap vertical FIR filter to a single output row of width w
// (w must be a multiple of 16). Processes 16 pixels per iteration: reads 8
// consecutive source rows (stride src_pitch) and writes the filtered,
// rounded, clamped bytes to dst. Taps are int16 values that must fit in
// int8 so they can be re-packed for _mm_maddubs_epi16.
static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *filter, int w) {
  // _mm_mulhrs_epi16(x, 1 << 8) == (x + 64) >> 7, i.e. round then shift
  // right by FILTER_BITS.
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values: each epi16 shuffle constant picks
  // the low bytes of two adjacent taps, broadcast across all 8 lanes.
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  int i;

  for (i = 0; i < w; i += 16) {
    // Load 16 bytes from each of the 8 source rows in the tap window.
    const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr);
    const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
    const __m128i C =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
    const __m128i D =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
    const __m128i E =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
    const __m128i F =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
    const __m128i G =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
    const __m128i H =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
    // Interleave row pairs; the low/high unpacks cover the first/second 8
    // pixels of the 16-pixel chunk respectively.
    const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B);
    const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H);
    const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B);
    const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H);
    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0);
    const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6);
    const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0);
    const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6);
    // add and saturate the results together
    const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo);
    const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi);
    // merge the result together
    const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D);
    const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D);
    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2);
    const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2);
    // merge the result together
    const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F);
    const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F);
    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4);
    const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4);
    // add and saturate the results together
    // NOTE(review): the middle partial sums are added in (min, max) order via
    // saturating adds; this ordering appears deliberate so intermediate
    // saturation behaves consistently — do not reorder these adds.
    __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo));
    __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi));

    // add and saturate the results together
    temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo));
    temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi));
    // round and shift by 7 bit each 16 bit
    temp_lo = _mm_mulhrs_epi16(temp_lo, k_256);
    temp_hi = _mm_mulhrs_epi16(temp_hi, k_256);
    // shrink to 8 bit each 16 bits; the low lane holds the first 8 filtered
    // pixels and the high lane holds the second 8
    temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
    src_ptr += 16;
    // save 16 bytes convolve result
    _mm_store_si128((__m128i *)&dst[i], temp_hi);
  }
}
792 :
793 0 : static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride,
794 : uint8_t *dst, ptrdiff_t dst_stride,
795 : const InterpKernel *y_filters, int y0_q4,
796 : int y_step_q4, int w, int h) {
797 : int y;
798 0 : int y_q4 = y0_q4;
799 :
800 0 : src -= src_stride * (SUBPEL_TAPS / 2 - 1);
801 0 : for (y = 0; y < h; ++y) {
802 0 : const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
803 0 : const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
804 0 : if (y_q4 & SUBPEL_MASK) {
805 0 : filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
806 : w);
807 : } else {
808 0 : memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
809 : }
810 0 : y_q4 += y_step_q4;
811 : }
812 0 : }
813 :
814 0 : static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,
815 : uint8_t *dst, ptrdiff_t dst_stride,
816 : const InterpKernel *const x_filters, int x0_q4,
817 : int x_step_q4, const InterpKernel *const y_filters,
818 : int y0_q4, int y_step_q4, int w, int h) {
819 : // Note: Fixed size intermediate buffer, temp, places limits on parameters.
820 : // 2d filtering proceeds in 2 steps:
821 : // (1) Interpolate horizontally into an intermediate buffer, temp.
822 : // (2) Interpolate temp vertically to derive the sub-pixel result.
823 : // Deriving the maximum number of rows in the temp buffer (135):
824 : // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
825 : // --Largest block size is 64x64 pixels.
826 : // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
827 : // original frame (in 1/16th pixel units).
828 : // --Must round-up because block may be located at sub-pixel position.
829 : // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
830 : // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
831 : // --Require an additional 8 rows for the horiz_w8 transpose tail.
832 : DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
833 0 : const int intermediate_height =
834 0 : (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
835 :
836 0 : assert(w <= 64);
837 0 : assert(h <= 64);
838 0 : assert(y_step_q4 <= 32);
839 0 : assert(x_step_q4 <= 32);
840 :
841 0 : if (w >= 8) {
842 0 : scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
843 : src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
844 : w, intermediate_height);
845 : } else {
846 0 : scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
847 : src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
848 : w, intermediate_height);
849 : }
850 :
851 0 : if (w >= 16) {
852 0 : scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
853 : dst_stride, y_filters, y0_q4, y_step_q4, w, h);
854 0 : } else if (w == 8) {
855 0 : scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
856 : dst_stride, y_filters, y0_q4, y_step_q4, w, h);
857 : } else {
858 0 : scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
859 : dst_stride, y_filters, y0_q4, y_step_q4, w, h);
860 : }
861 0 : }
862 :
863 0 : void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
864 : ptrdiff_t dst_stride, const int16_t *filter_x,
865 : int x_step_q4, const int16_t *filter_y, int y_step_q4,
866 : int w, int h) {
867 0 : const InterpKernel *const filters_x = get_filter_base(filter_x);
868 0 : const int x0_q4 = get_filter_offset(filter_x, filters_x);
869 :
870 0 : const InterpKernel *const filters_y = get_filter_base(filter_y);
871 0 : const int y0_q4 = get_filter_offset(filter_y, filters_y);
872 :
873 0 : scaledconvolve2d(src, src_stride, dst, dst_stride, filters_x, x0_q4,
874 : x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
875 0 : }
876 :
// void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                          uint8_t *dst, ptrdiff_t dst_stride,
//                          const int16_t *filter_x, int x_step_q4,
//                          const int16_t *filter_y, int y_step_q4,
//                          int w, int h);
882 : // void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
883 : // uint8_t *dst, ptrdiff_t dst_stride,
884 : // const int16_t *filter_x, int x_step_q4,
885 : // const int16_t *filter_y, int y_step_q4,
886 : // int w, int h);
// The FUN_CONV_2D macro (from vpx_dsp/x86/convolve.h, included above) expands
// into the full definitions of the plain and averaging 2-D convolve entry
// points for this SSSE3 specialization.
FUN_CONV_2D(, ssse3);
FUN_CONV_2D(avg_, ssse3);
|