Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <immintrin.h>
13 :
14 : #include "./aom_dsp_rtcd.h"
15 : #include "aom_dsp/x86/convolve.h"
16 : #include "aom_ports/mem.h"
17 :
18 : // filters for 16_h8 and 16_v8
19 : DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = {
20 : 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
21 : 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
22 : };
23 :
24 : DECLARE_ALIGNED(32, static const uint8_t, filt2_global_avx2[32]) = {
25 : 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
26 : 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
27 : };
28 :
29 : DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
30 : 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
31 : 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
32 : };
33 :
34 : DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
35 : 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
36 : 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
37 : };
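// Each mask above is used with _mm256_shuffle_epi8 to gather, within every
// 128-bit lane, eight overlapping pairs of adjacent source bytes: filt1 pairs
// bytes (k, k+1), filt2 pairs (k+2, k+3), filt3 pairs (k+4, k+5) and filt4
// pairs (k+6, k+7). _mm256_maddubs_epi16 then multiplies each byte pair by
// the matching pair of filter taps and adds the two products into a 16-bit
// partial sum.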
38 :
39 : #if defined(__clang__)
40 : #if (__clang_major__ > 0 && __clang_major__ < 3) || \
41 : (__clang_major__ == 3 && __clang_minor__ <= 3) || \
42 : (defined(__APPLE__) && defined(__apple_build_version__) && \
43 : ((__clang_major__ == 4 && __clang_minor__ <= 2) || \
44 : (__clang_major__ == 5 && __clang_minor__ == 0)))
45 : #define MM256_BROADCASTSI128_SI256(x) \
46 : _mm_broadcastsi128_si256((__m128i const *)&(x))
47 : #else // clang > 3.3, and not 5.0 on macosx.
48 : #define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
49 : #endif // clang <= 3.3
50 : #elif defined(__GNUC__)
51 : #if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
52 : #define MM256_BROADCASTSI128_SI256(x) \
53 : _mm_broadcastsi128_si256((__m128i const *)&(x))
54 : #elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
55 : #define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
56 : #else // gcc > 4.7
57 : #define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
58 : #endif // gcc <= 4.6
59 : #else // !(gcc || clang)
60 : #define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
61 : #endif // __clang__
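// Older clang (<= 3.3, and the Apple clang 4.2/5.0 builds) and gcc (<= 4.7)
// exposed the 128-to-256 bit broadcast under the name
// _mm_broadcastsi128_si256 (the oldest versions taking a pointer argument);
// the macro above hides that naming difference so the code below can use a
// single spelling.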
62 :
63 0 : static void aom_filter_block1d16_h8_avx2(
64 : const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
65 : ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
66 : __m128i filtersReg;
67 : __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
68 : __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
69 : __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
70 : __m256i srcReg32b1, srcReg32b2, filtersReg32;
71 : unsigned int i;
72 : ptrdiff_t src_stride, dst_stride;
73 :
74 :   // create a register with the rounding constant 64 in every 16-bit lane
75 0 : addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
76 0 : filtersReg = _mm_loadu_si128((const __m128i *)filter);
77 :   // convert the 16 bit (short) coefficients to 8 bit (byte) and replicate
78 :   // the same data in both halves of the 128 bit register.
79 0 : filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
80 : // have the same data in both lanes of a 256 bit register
81 0 : filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
82 :
83 : // duplicate only the first 16 bits (first and second byte)
84 : // across 256 bit register
85 0 : firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
86 :   // duplicate only the second 16 bits (third and fourth byte)
87 : // across 256 bit register
88 0 : secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
89 : // duplicate only the third 16 bits (fifth and sixth byte)
90 : // across 256 bit register
91 0 : thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
92 :   // duplicate only the fourth 16 bits (seventh and eighth byte)
93 : // across 256 bit register
94 0 : forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
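  // After the shuffles above each 256-bit register holds one pair of adjacent
  // 8-bit taps ((f0,f1), (f2,f3), (f4,f5) or (f6,f7)) replicated across every
  // 16-bit lane, matching the pixel pairs produced by filt1Reg..filt4Reg so
  // that _mm256_maddubs_epi16 yields src[k]*f[k] + src[k+1]*f[k+1].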
95 :
96 0 : filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
97 0 : filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
98 0 : filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
99 0 : filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);
100 :
101 :   // multiply the source and destination strides by two
102 0 : src_stride = src_pixels_per_line << 1;
103 0 : dst_stride = output_pitch << 1;
104 0 : for (i = output_height; i > 1; i -= 2) {
105 :     // load 16 bytes from each of the 2 source rows
106 0 : srcReg32b1 =
107 0 : _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3)));
108 0 : srcReg32b1 = _mm256_inserti128_si256(
109 : srcReg32b1,
110 : _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)),
111 : 1);
112 :
113 : // filter the source buffer
114 0 : srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
115 0 : srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
116 :
117 : // multiply 2 adjacent elements with the filter and add the result
118 0 : srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
119 0 : srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
120 :
121 : // add and saturate the results together
122 0 : srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
123 :
124 : // filter the source buffer
125 0 : srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
126 0 : srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
127 :
128 : // multiply 2 adjacent elements with the filter and add the result
129 0 : srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
130 0 : srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
131 :
132 : // add and saturate the results together
133 0 : srcRegFilt32b1_1 = _mm256_adds_epi16(
134 : srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
135 :
136 :     // load the next 16 bytes of the 2 rows
137 :     // (part of this data was already read above)
138 0 : srcReg32b2 =
139 0 : _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5)));
140 0 : srcReg32b2 = _mm256_inserti128_si256(
141 : srcReg32b2,
142 : _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)),
143 : 1);
144 :
145 : // add and saturate the results together
146 0 : srcRegFilt32b1_1 = _mm256_adds_epi16(
147 : srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
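    // The two centre partial sums are accumulated as min() first and then
    // max(); presumably this fixed ordering of the saturating 16-bit adds is
    // meant to keep intermediate values from saturating differently from the
    // reference implementation.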
148 :
149 : // filter the source buffer
150 0 : srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
151 0 : srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
152 :
153 : // multiply 2 adjacent elements with the filter and add the result
154 0 : srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
155 0 : srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
156 :
157 : // add and saturate the results together
158 0 : srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
159 :
160 : // filter the source buffer
161 0 : srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
162 0 : srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
163 :
164 : // multiply 2 adjacent elements with the filter and add the result
165 0 : srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
166 0 : srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
167 :
168 : // add and saturate the results together
169 0 : srcRegFilt32b2_1 = _mm256_adds_epi16(
170 : srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
171 0 : srcRegFilt32b2_1 = _mm256_adds_epi16(
172 : srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
173 :
174 0 : srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);
175 :
176 0 : srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);
177 :
178 : // shift by 7 bit each 16 bit
179 0 : srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
180 0 : srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);
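    // together with the 64 added above this computes (sum + 64) >> 7, i.e.
    // the filter output rounded to nearest for the 7-bit tap precision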
181 :
182 :     // shrink each 16-bit value to 8 bits; the first lane contains the
183 :     // first convolve result and the second lane contains the second
184 :     // convolve result
185 0 : srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
186 :
187 0 : src_ptr += src_stride;
188 :
189 : // save 16 bytes
190 0 : _mm_store_si128((__m128i *)output_ptr,
191 : _mm256_castsi256_si128(srcRegFilt32b1_1));
192 :
193 :     // save the next 16 bytes
194 0 : _mm_store_si128((__m128i *)(output_ptr + output_pitch),
195 0 : _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
196 0 : output_ptr += dst_stride;
197 : }
198 :
199 :   // if the number of rows is odd,
200 :   // process only the last 16 bytes
201 0 : if (i > 0) {
202 : __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
203 : __m128i srcRegFilt2, srcRegFilt3;
204 :
205 0 : srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
206 :
207 : // filter the source buffer
208 0 : srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
209 0 : srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));
210 :
211 : // multiply 2 adjacent elements with the filter and add the result
212 0 : srcRegFilt1_1 =
213 0 : _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
214 0 : srcRegFilt2 =
215 0 : _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
216 :
217 : // add and saturate the results together
218 0 : srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
219 :
220 : // filter the source buffer
221 0 : srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
222 0 : srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
223 :
224 : // multiply 2 adjacent elements with the filter and add the result
225 0 : srcRegFilt3 =
226 0 : _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
227 0 : srcRegFilt2 =
228 0 : _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
229 :
230 : // add and saturate the results together
231 0 : srcRegFilt1_1 =
232 0 : _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));
233 :
234 : // reading the next 16 bytes
235 :     // (part of this data was already read above)
236 0 : srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
237 :
238 : // add and saturate the results together
239 0 : srcRegFilt1_1 =
240 0 : _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));
241 :
242 : // filter the source buffer
243 0 : srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg));
244 0 : srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg));
245 :
246 : // multiply 2 adjacent elements with the filter and add the result
247 0 : srcRegFilt2_1 =
248 0 : _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters));
249 0 : srcRegFilt2 =
250 0 : _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
251 :
252 : // add and saturate the results together
253 0 : srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
254 :
255 : // filter the source buffer
256 0 : srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg));
257 0 : srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg));
258 :
259 : // multiply 2 adjacent elements with the filter and add the result
260 0 : srcRegFilt3 =
261 0 : _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
262 0 : srcRegFilt2 =
263 0 : _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
264 :
265 : // add and saturate the results together
266 0 : srcRegFilt2_1 =
267 0 : _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));
268 0 : srcRegFilt2_1 =
269 0 : _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));
270 :
271 0 : srcRegFilt1_1 =
272 0 : _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64));
273 :
274 0 : srcRegFilt2_1 =
275 0 : _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64));
276 :
277 : // shift by 7 bit each 16 bit
278 0 : srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
279 0 : srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
280 :
281 :     // shrink each 16-bit value to 8 bits; the first lane contains the
282 :     // first convolve result and the second lane contains the second
283 :     // convolve result
284 0 : srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
285 :
286 : // save 16 bytes
287 : _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
288 : }
289 0 : }
290 :
291 0 : static void aom_filter_block1d16_v8_avx2(
292 : const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
293 : ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
294 : __m128i filtersReg;
295 : __m256i addFilterReg64;
296 : __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
297 : __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
298 : __m256i srcReg32b11, srcReg32b12, filtersReg32;
299 : __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
300 : unsigned int i;
301 : ptrdiff_t src_stride, dst_stride;
302 :
303 :   // create a register with the rounding constant 64 in every 16-bit lane
304 0 : addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
305 0 : filtersReg = _mm_loadu_si128((const __m128i *)filter);
306 :   // convert the 16 bit (short) coefficients to 8 bit (byte) and replicate
307 :   // the same data in both halves of the 128 bit register.
308 0 : filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
309 : // have the same data in both lanes of a 256 bit register
310 0 : filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
311 :
312 : // duplicate only the first 16 bits (first and second byte)
313 : // across 256 bit register
314 0 : firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
315 :   // duplicate only the second 16 bits (third and fourth byte)
316 : // across 256 bit register
317 0 : secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
318 : // duplicate only the third 16 bits (fifth and sixth byte)
319 : // across 256 bit register
320 0 : thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
321 :   // duplicate only the fourth 16 bits (seventh and eighth byte)
322 : // across 256 bit register
323 0 : forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
324 :
325 :   // multiply the source and destination strides by two
326 0 : src_stride = src_pitch << 1;
327 0 : dst_stride = out_pitch << 1;
328 :
329 :   // load 16 bytes from each of the first 7 rows, src_pitch apart
330 0 : srcReg32b1 =
331 0 : _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr)));
332 0 : srcReg32b2 = _mm256_castsi128_si256(
333 0 : _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
334 0 : srcReg32b3 = _mm256_castsi128_si256(
335 0 : _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
336 0 : srcReg32b4 = _mm256_castsi128_si256(
337 0 : _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
338 0 : srcReg32b5 = _mm256_castsi128_si256(
339 0 : _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
340 0 : srcReg32b6 = _mm256_castsi128_si256(
341 0 : _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
342 0 : srcReg32b7 = _mm256_castsi128_si256(
343 0 : _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
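  // an 8-tap vertical filter needs 8 input rows per output row; rows 0..6 are
  // preloaded here and the remaining rows are loaded inside the loop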
344 :
345 :   // place each pair of consecutive rows in the same 256 bit register
346 0 : srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
347 : _mm256_castsi256_si128(srcReg32b2), 1);
348 0 : srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
349 : _mm256_castsi256_si128(srcReg32b3), 1);
350 0 : srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
351 : _mm256_castsi256_si128(srcReg32b4), 1);
352 0 : srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
353 : _mm256_castsi256_si128(srcReg32b5), 1);
354 0 : srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
355 : _mm256_castsi256_si128(srcReg32b6), 1);
356 0 : srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
357 : _mm256_castsi256_si128(srcReg32b7), 1);
358 :
359 : // merge every two consecutive registers except the last one
360 0 : srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
361 0 : srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
362 :
363 : // save
364 0 : srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
365 :
366 : // save
367 0 : srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
368 :
369 : // save
370 0 : srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
371 :
372 : // save
373 0 : srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
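  // after the byte interleaves each 16-bit element of the unpacked registers
  // holds a vertical pair of pixels from consecutive rows of the same column,
  // so the same maddubs-with-tap-pairs scheme used by the horizontal filter
  // applies down a column as well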
374 :
375 0 : for (i = output_height; i > 1; i -= 2) {
376 :     // load the last 2 rows of 16 bytes and place every two
377 :     // consecutive rows in the same 256 bit register
378 0 : srcReg32b8 = _mm256_castsi128_si256(
379 0 : _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
380 0 : srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
381 : _mm256_castsi256_si128(srcReg32b8), 1);
382 0 : srcReg32b9 = _mm256_castsi128_si256(
383 0 : _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
384 0 : srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
385 : _mm256_castsi256_si128(srcReg32b9), 1);
386 :
387 : // merge every two consecutive registers
388 : // save
389 0 : srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
390 0 : srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);
391 :
392 : // multiply 2 adjacent elements with the filter and add the result
393 0 : srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
394 0 : srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
395 :
396 : // add and saturate the results together
397 0 : srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
398 :
399 : // multiply 2 adjacent elements with the filter and add the result
400 0 : srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
401 0 : srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
402 :
403 : // add and saturate the results together
404 0 : srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
405 : _mm256_min_epi16(srcReg32b8, srcReg32b12));
406 0 : srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
407 : _mm256_max_epi16(srcReg32b8, srcReg32b12));
408 :
409 : // multiply 2 adjacent elements with the filter and add the result
410 0 : srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
411 0 : srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
412 :
413 0 : srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
414 :
415 : // multiply 2 adjacent elements with the filter and add the result
416 0 : srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
417 0 : srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
418 :
419 : // add and saturate the results together
420 0 : srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
421 : _mm256_min_epi16(srcReg32b8, srcReg32b12));
422 0 : srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
423 : _mm256_max_epi16(srcReg32b8, srcReg32b12));
424 :
425 0 : srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
426 0 : srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);
427 :
428 : // shift by 7 bit each 16 bit
429 0 : srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
430 0 : srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);
431 :
432 :     // shrink each 16-bit value to 8 bits; the first lane contains the
433 :     // first convolve result and the second lane contains the second
434 :     // convolve result
435 0 : srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);
436 :
437 0 : src_ptr += src_stride;
438 :
439 : // save 16 bytes
440 0 : _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(srcReg32b1));
441 :
442 :     // save the next 16 bytes
443 0 : _mm_store_si128((__m128i *)(output_ptr + out_pitch),
444 0 : _mm256_extractf128_si256(srcReg32b1, 1));
445 :
446 0 : output_ptr += dst_stride;
447 :
448 :     // save part of the registers for the next iteration
449 0 : srcReg32b10 = srcReg32b11;
450 0 : srcReg32b1 = srcReg32b3;
451 0 : srcReg32b11 = srcReg32b2;
452 0 : srcReg32b3 = srcReg32b5;
453 0 : srcReg32b2 = srcReg32b4;
454 0 : srcReg32b5 = srcReg32b7;
455 0 : srcReg32b7 = srcReg32b9;
456 : }
457 0 : if (i > 0) {
458 : __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
459 : __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
460 : // load the last 16 bytes
461 0 : srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
462 :
463 :     // merge the last 2 rows together
464 0 : srcRegFilt4 =
465 0 : _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
466 0 : srcRegFilt7 =
467 0 : _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
468 :
469 : // multiply 2 adjacent elements with the filter and add the result
470 0 : srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
471 : _mm256_castsi256_si128(firstFilters));
472 0 : srcRegFilt4 =
473 0 : _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
474 0 : srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
475 : _mm256_castsi256_si128(firstFilters));
476 0 : srcRegFilt7 =
477 0 : _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters));
478 :
479 : // add and saturate the results together
480 0 : srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
481 0 : srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);
482 :
483 : // multiply 2 adjacent elements with the filter and add the result
484 0 : srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
485 : _mm256_castsi256_si128(secondFilters));
486 0 : srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
487 : _mm256_castsi256_si128(secondFilters));
488 :
489 : // multiply 2 adjacent elements with the filter and add the result
490 0 : srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
491 : _mm256_castsi256_si128(thirdFilters));
492 0 : srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
493 : _mm256_castsi256_si128(thirdFilters));
494 :
495 : // add and saturate the results together
496 0 : srcRegFilt1 =
497 0 : _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6));
498 0 : srcRegFilt3 =
499 0 : _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7));
500 :
501 : // add and saturate the results together
502 0 : srcRegFilt1 =
503 0 : _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6));
504 0 : srcRegFilt3 =
505 0 : _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7));
506 :
507 0 : srcRegFilt1 =
508 0 : _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64));
509 0 : srcRegFilt3 =
510 0 : _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64));
511 :
512 : // shift by 7 bit each 16 bit
513 0 : srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
514 0 : srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);
515 :
516 :     // shrink each 16-bit value to 8 bits; the first lane contains the
517 :     // first convolve result and the second lane contains the second
518 :     // convolve result
519 0 : srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
520 :
521 : // save 16 bytes
522 : _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
523 : }
524 0 : }
525 :
526 : #if HAVE_AVX2 && HAVE_SSSE3
527 : filter8_1dfunction aom_filter_block1d4_v8_ssse3;
528 : #if ARCH_X86_64
529 : filter8_1dfunction aom_filter_block1d8_v8_intrin_ssse3;
530 : filter8_1dfunction aom_filter_block1d8_h8_intrin_ssse3;
531 : filter8_1dfunction aom_filter_block1d4_h8_intrin_ssse3;
532 : #define aom_filter_block1d8_v8_avx2 aom_filter_block1d8_v8_intrin_ssse3
533 : #define aom_filter_block1d8_h8_avx2 aom_filter_block1d8_h8_intrin_ssse3
534 : #define aom_filter_block1d4_h8_avx2 aom_filter_block1d4_h8_intrin_ssse3
535 : #else // ARCH_X86
536 : filter8_1dfunction aom_filter_block1d8_v8_ssse3;
537 : filter8_1dfunction aom_filter_block1d8_h8_ssse3;
538 : filter8_1dfunction aom_filter_block1d4_h8_ssse3;
539 : #define aom_filter_block1d8_v8_avx2 aom_filter_block1d8_v8_ssse3
540 : #define aom_filter_block1d8_h8_avx2 aom_filter_block1d8_h8_ssse3
541 : #define aom_filter_block1d4_h8_avx2 aom_filter_block1d4_h8_ssse3
542 : #endif // ARCH_X86_64
543 : filter8_1dfunction aom_filter_block1d16_v2_ssse3;
544 : filter8_1dfunction aom_filter_block1d16_h2_ssse3;
545 : filter8_1dfunction aom_filter_block1d8_v2_ssse3;
546 : filter8_1dfunction aom_filter_block1d8_h2_ssse3;
547 : filter8_1dfunction aom_filter_block1d4_v2_ssse3;
548 : filter8_1dfunction aom_filter_block1d4_h2_ssse3;
549 : #define aom_filter_block1d4_v8_avx2 aom_filter_block1d4_v8_ssse3
550 : #define aom_filter_block1d16_v2_avx2 aom_filter_block1d16_v2_ssse3
551 : #define aom_filter_block1d16_h2_avx2 aom_filter_block1d16_h2_ssse3
552 : #define aom_filter_block1d8_v2_avx2 aom_filter_block1d8_v2_ssse3
553 : #define aom_filter_block1d8_h2_avx2 aom_filter_block1d8_h2_ssse3
554 : #define aom_filter_block1d4_v2_avx2 aom_filter_block1d4_v2_ssse3
555 : #define aom_filter_block1d4_h2_avx2 aom_filter_block1d4_h2_ssse3
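// Only the 16-wide 8-tap horizontal and vertical kernels have dedicated AVX2
// implementations above; the 4- and 8-wide 8-tap variants and all 2-tap
// variants are mapped onto the existing SSSE3 kernels.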
556 : // void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
557 : // uint8_t *dst, ptrdiff_t dst_stride,
558 : // const int16_t *filter_x, int x_step_q4,
559 : // const int16_t *filter_y, int y_step_q4,
560 : // int w, int h);
561 : // void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
562 : // uint8_t *dst, ptrdiff_t dst_stride,
563 : // const int16_t *filter_x, int x_step_q4,
564 : // const int16_t *filter_y, int y_step_q4,
565 : // int w, int h);
566 0 : FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
567 0 : FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
568 :
569 : // void aom_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
570 : // uint8_t *dst, ptrdiff_t dst_stride,
571 : // const int16_t *filter_x, int x_step_q4,
572 : // const int16_t *filter_y, int y_step_q4,
573 : // int w, int h);
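// FUN_CONV_2D (see aom_dsp/x86/convolve.h) builds the 2-D convolve from the
// two 1-D passes above: a horizontal pass into an intermediate buffer
// followed by a vertical pass.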
574 0 : FUN_CONV_2D(, avx2);
575 : #endif  // HAVE_AVX2 && HAVE_SSSE3