Line data Source code
1 : /*
2 : * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 : *
4 : * Use of this source code is governed by a BSD-style license
5 : * that can be found in the LICENSE file in the root of the source
6 : * tree. An additional intellectual property rights grant can be found
7 : * in the file PATENTS. All contributing project authors may
8 : * be found in the AUTHORS file in the root of the source tree.
9 : */
10 :
11 : #include <immintrin.h>
12 :
13 : #include "./vpx_dsp_rtcd.h"
14 : #include "vpx_dsp/x86/convolve.h"
15 : #include "vpx_ports/mem.h"
16 :
// filters for 16_h8 and 16_v8
// Byte-shuffle masks for _mm256_shuffle_epi8. Each mask gathers 8 pairs of
// adjacent source bytes so that _mm256_maddubs_epi16 can multiply-accumulate
// one pair of filter taps per 16-bit result. filtN selects byte pairs
// starting at offset 2*(N-1), i.e. taps (0,1), (2,3), (4,5), (6,7) of the
// 8-tap filter. The 16-byte pattern is repeated in both 128-bit lanes
// because shuffle_epi8 operates per-lane.
DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
};

DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
};

DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
};

DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};
37 :
// Compiler-compatibility shim: old clang (<= 3.3, and the Apple clang
// releases that report as 4.x/5.0) and gcc <= 4.7 shipped the AVX2
// broadcast intrinsic under a different name/signature
// (_mm_broadcastsi128_si256, taking a pointer on the oldest versions).
// MM256_BROADCASTSI128_SI256 papers over the differences so the filter
// code below can use a single spelling.
#if defined(__clang__)
#if (__clang_major__ > 0 && __clang_major__ < 3) ||            \
    (__clang_major__ == 3 && __clang_minor__ <= 3) ||          \
    (defined(__APPLE__) && defined(__apple_build_version__) && \
     ((__clang_major__ == 4 && __clang_minor__ <= 2) ||        \
      (__clang_major__ == 5 && __clang_minor__ == 0)))
#define MM256_BROADCASTSI128_SI256(x) \
  _mm_broadcastsi128_si256((__m128i const *)&(x))
#else  // clang > 3.3, and not 5.0 on macosx.
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // clang <= 3.3
#elif defined(__GNUC__)
#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
#define MM256_BROADCASTSI128_SI256(x) \
  _mm_broadcastsi128_si256((__m128i const *)&(x))
#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
#else  // gcc > 4.7
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // gcc <= 4.6
#else  // !(gcc || clang)
#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif  // __clang__
61 :
// Horizontal 8-tap convolution of a block 16 pixels wide.
//
// Two rows are produced per loop iteration: row n lives in the low 128-bit
// lane and row n+1 in the high lane of each 256-bit register. Per output
// pixel the filter reads src_ptr[-3 .. +4]; across the 16-wide block the
// loads touch src_ptr[-3 .. +20]. Each result is rounded (add 64, arithmetic
// shift right by 7) and saturated to 8 bits before being stored.
//
//   src_ptr             - upper-left source pixel (filter center tap)
//   src_pixels_per_line - source stride in bytes
//   output_ptr          - destination; written with _mm_store_si128, so the
//                         destination rows must be 16-byte aligned
//   output_pitch        - destination stride in bytes
//   output_height       - number of rows to produce (odd counts handled by
//                         the single-row tail below)
//   filter              - 8 16-bit taps; narrowed to 8 bits with packs_epi16
static void vpx_filter_block1d16_h8_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1, srcReg32b2, filtersReg32;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  // (the rounding constant: 64 == 1 << 6, half of the 1 << 7 divisor)
  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and forth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
  // duplicate only the forth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));

  filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
  filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
  filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
  filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);

  // multiply the size of the source and destination stride by two
  // (two rows are emitted per iteration)
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load the 2 strides of source
    srcReg32b1 =
        _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3)));
    srcReg32b1 = _mm256_inserti128_si256(
        srcReg32b1,
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)),
        1);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    // (min now, max below: min(a,b) + max(a,b) == a + b, and the split
    // ordering reduces transient 16-bit saturation from the middle taps)
    srcRegFilt32b1_1 = _mm256_adds_epi16(
        srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // reading 2 strides of the next 16 bytes
    // (part of it was being read by earlier read)
    srcReg32b2 =
        _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5)));
    srcReg32b2 = _mm256_inserti128_si256(
        srcReg32b2,
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)),
        1);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(
        srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // filter the source buffer
    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(
        srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
    srcRegFilt32b2_1 = _mm256_adds_epi16(
        srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));

    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);

    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);

    src_ptr += src_stride;

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr,
                    _mm256_castsi256_si128(srcRegFilt32b1_1));

    // save the next 16 bytes
    _mm_store_si128((__m128i *)(output_ptr + output_pitch),
                    _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
    output_ptr += dst_stride;
  }

  // if the number of strides is odd.
  // process only 16 bytes (one row, using 128-bit ops and the low lanes
  // of the prepared 256-bit filter registers)
  if (i > 0) {
    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
    __m128i srcRegFilt2, srcRegFilt3;

    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 =
        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 =
        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));

    // reading the next 16 bytes
    // (part of it was being read by earlier read)
    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));

    // add and saturate the results together
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // filter the source buffer
    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2_1 =
        _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 =
        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt2_1 =
        _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));
    srcRegFilt2_1 =
        _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64));

    srcRegFilt2_1 =
        _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64));

    // shift by 7 bit each 16 bit
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
  }
}
289 :
// Vertical 8-tap convolution of a block 16 pixels wide.
//
// Seven source rows are preloaded and pairs of vertically adjacent rows are
// interleaved byte-wise (unpacklo/unpackhi) so that _mm256_maddubs_epi16 can
// apply one pair of taps per 16-bit product. The main loop then produces two
// output rows per iteration, loading only the two new source rows and
// rotating the interleaved registers forward. Results are rounded
// (add 64, arithmetic shift right 7) and saturated to 8 bits.
//
//   src_ptr       - top-left source pixel of the 8-row filter window
//   src_pitch     - source stride in bytes
//   output_ptr    - destination; written with _mm_store_si128, so the
//                   destination rows must be 16-byte aligned
//   out_pitch     - destination stride in bytes
//   output_height - number of rows to produce (odd counts handled by the
//                   single-row tail below)
//   filter        - 8 16-bit taps; narrowed to 8 bits with packs_epi16
static void vpx_filter_block1d16_v8_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64;
  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
  __m256i srcReg32b11, srcReg32b12, filtersReg32;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  // (the rounding constant: 64 == 1 << 6, half of the 1 << 7 divisor)
  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the
  // same data in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and forth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
  // duplicate only the forth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));

  // multiply the size of the source and destination stride by two
  // (two rows are emitted per iteration)
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  // load 16 bytes 7 times in stride of src_pitch
  srcReg32b1 =
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr)));
  srcReg32b2 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
  srcReg32b3 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
  srcReg32b4 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
  srcReg32b5 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
  srcReg32b6 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
  srcReg32b7 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));

  // have each consecutive loads on the same 256 register
  // (register N holds row N in the low lane and row N+1 in the high lane)
  srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
                                       _mm256_castsi256_si128(srcReg32b2), 1);
  srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
                                       _mm256_castsi256_si128(srcReg32b3), 1);
  srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
                                       _mm256_castsi256_si128(srcReg32b4), 1);
  srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
                                       _mm256_castsi256_si128(srcReg32b5), 1);
  srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
                                       _mm256_castsi256_si128(srcReg32b6), 1);
  srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
                                       _mm256_castsi256_si128(srcReg32b7), 1);

  // merge every two consecutive registers except the last one
  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);

  // save
  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);

  // save
  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);

  // save
  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);

  // save
  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);

  for (i = output_height; i > 1; i -= 2) {
    // load the last 2 loads of 16 bytes and have every two
    // consecutive loads in the same 256 bit register
    srcReg32b8 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
    srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
                                         _mm256_castsi256_si128(srcReg32b8), 1);
    srcReg32b9 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
    srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
                                         _mm256_castsi256_si128(srcReg32b9), 1);

    // merge every two consecutive registers
    // save
    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
    srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);

    // add and saturate the results together
    // (min(a,b) + max(a,b) == a + b; the split ordering reduces transient
    // 16-bit saturation from the middle taps)
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                                    _mm256_min_epi16(srcReg32b8, srcReg32b12));
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                                    _mm256_max_epi16(srcReg32b8, srcReg32b12));

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);

    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);

    // add and saturate the results together
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                                   _mm256_min_epi16(srcReg32b8, srcReg32b12));
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                                   _mm256_max_epi16(srcReg32b8, srcReg32b12));

    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
    srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);

    src_ptr += src_stride;

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(srcReg32b1));

    // save the next 16 bytes
    _mm_store_si128((__m128i *)(output_ptr + out_pitch),
                    _mm256_extractf128_si256(srcReg32b1, 1));

    output_ptr += dst_stride;

    // save part of the registers for next strides
    // (rotate the interleaved row pairs down by two rows)
    srcReg32b10 = srcReg32b11;
    srcReg32b1 = srcReg32b3;
    srcReg32b11 = srcReg32b2;
    srcReg32b3 = srcReg32b5;
    srcReg32b2 = srcReg32b4;
    srcReg32b5 = srcReg32b7;
    srcReg32b7 = srcReg32b9;
  }
  // if the number of strides is odd, produce the final single row using
  // 128-bit ops on the low lanes of the rotated registers
  if (i > 0) {
    __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
    __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
    // load the last 16 bytes
    srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));

    // merge the last 2 results together
    srcRegFilt4 =
        _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
    srcRegFilt7 =
        _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
                                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt4 =
        _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
    srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
                                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt7 =
        _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
                                    _mm256_castsi256_si128(secondFilters));
    srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
                                    _mm256_castsi256_si128(secondFilters));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
                                    _mm256_castsi256_si128(thirdFilters));
    srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
                                    _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1 =
        _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 =
        _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7));

    // add and saturate the results together
    srcRegFilt1 =
        _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 =
        _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7));

    srcRegFilt1 =
        _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt3 =
        _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64));

    // shift by 7 bit each 16 bit
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
  }
}
524 :
#if HAVE_AVX2 && HAVE_SSSE3
// Only the 16-wide 8-tap kernels have true AVX2 implementations above; the
// narrower (4/8-wide) and 2-tap kernels fall back to the SSSE3 versions via
// the #defines below. On x86-64 the intrinsics-based SSSE3 kernels are used;
// on 32-bit x86 the assembly versions are used instead.
filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
#if ARCH_X86_64
filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3;
#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3
#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_intrin_ssse3
#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3
#else  // ARCH_X86
filter8_1dfunction vpx_filter_block1d8_v8_ssse3;
filter8_1dfunction vpx_filter_block1d8_h8_ssse3;
filter8_1dfunction vpx_filter_block1d4_h8_ssse3;
#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3
#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3
#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3
#endif  // ARCH_X86_64
filter8_1dfunction vpx_filter_block1d16_v2_ssse3;
filter8_1dfunction vpx_filter_block1d16_h2_ssse3;
filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
#define vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3
#define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3
#define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3
#define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3
#define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3
#define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3
#define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3
// The FUN_CONV_* macros (vpx_dsp/x86/convolve.h) expand to the public
// convolve entry points, dispatching to the block1d kernels bound above.
// void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                               uint8_t *dst, ptrdiff_t dst_stride,
//                               const int16_t *filter_x, int x_step_q4,
//                               const int16_t *filter_y, int y_step_q4,
//                               int w, int h);
// void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                              uint8_t *dst, ptrdiff_t dst_stride,
//                              const int16_t *filter_x, int x_step_q4,
//                              const int16_t *filter_y, int y_step_q4,
//                              int w, int h);
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);

// void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
//                         uint8_t *dst, ptrdiff_t dst_stride,
//                         const int16_t *filter_x, int x_step_q4,
//                         const int16_t *filter_y, int y_step_q4,
//                         int w, int h);
FUN_CONV_2D(, avx2);
#endif  // HAVE_AVX2 && HAVE_SSSE3
|