/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <emmintrin.h>

#include "./av1_rtcd.h"
#include "av1/common/warped_motion.h"

void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
                          int height, int stride, uint8_t *pred, int p_col,
                          int p_row, int p_width, int p_height, int p_stride,
                          int subsampling_x, int subsampling_y, int comp_avg,
                          int16_t alpha, int16_t beta, int16_t gamma,
                          int16_t delta) {
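  // tmp caches one 8-pixel-wide column of horizontally filtered rows for the
  // 8-tap vertical filter: 8 output rows plus 7 rows of overlap (8 + 7 = 15).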
  __m128i tmp[15];
  int i, j, k;
  const int bd = 8;

  /* Note: For this code to work, the left/right frame borders need to be
     extended by at least 13 pixels each. By the time we get here, other
     code will have set up this border, but we allow an explicit check
     for debugging purposes.
  */
  /*for (i = 0; i < height; ++i) {
    for (j = 0; j < 13; ++j) {
      assert(ref[i * stride - 13 + j] == ref[i * stride]);
      assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
    }
  }*/

  for (i = 0; i < p_height; i += 8) {
    for (j = 0; j < p_width; j += 8) {
      // (x, y) coordinates of the center of this block in the destination
      // image
      const int32_t dst_x = p_col + j + 4;
      const int32_t dst_y = p_row + i + 4;

      int32_t x4, y4, ix4, sx4, iy4, sy4;
      if (subsampling_x)
        x4 = (mat[2] * 4 * dst_x + mat[3] * 4 * dst_y + mat[0] * 2 +
              (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS))) /
             4;
      else
        x4 = mat[2] * dst_x + mat[3] * dst_y + mat[0];

      if (subsampling_y)
        y4 = (mat[4] * 4 * dst_x + mat[5] * 4 * dst_y + mat[1] * 2 +
              (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS))) /
             4;
      else
        y4 = mat[4] * dst_x + mat[5] * dst_y + mat[1];

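      // Split the warped source coordinates into integer pixel positions
      // (ix4, iy4) and fractional offsets (sx4, sy4) with
      // WARPEDMODEL_PREC_BITS bits of precision.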
      ix4 = x4 >> WARPEDMODEL_PREC_BITS;
      sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      iy4 = y4 >> WARPEDMODEL_PREC_BITS;
      sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Add in all the constant terms, including rounding and offset
      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);

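      // Clear the low bits so that every per-pixel filter offset falls on a
      // multiple of (1 << WARP_PARAM_REDUCE_BITS), limiting the number of
      // distinct filters a block can use.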
      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      // If the block is aligned such that, after clamping, every sample
      // would be taken from the leftmost/rightmost column, then we can
      // skip the expensive horizontal filter.
      if (ix4 <= -7) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
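          // Every tap reads the same clamped border pixel, so the filter
          // (whose taps sum to 1 << WARPEDPIXEL_FILTER_BITS) reduces to a
          // single multiply plus the usual offset.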
          tmp[k + 7] = _mm_set1_epi16(
              (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
                     1)) +
              ref[iy * stride] *
                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
        }
      } else if (ix4 >= width + 6) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] = _mm_set1_epi16(
              (1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
                     1)) +
              ref[iy * stride + (width - 1)] *
                  (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
        }
      } else {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          int sx = sx4 + beta * (k + 4);

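          // One unaligned 16-byte load (ref[ix4 - 7] ... ref[ix4 + 8])
          // covers the 8-tap windows of all eight output pixels.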
          // Load source pixels
          const __m128i zero = _mm_setzero_si128();
          const __m128i src =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));

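          // Each warped_filter entry is one 8-tap int16 filter;
          // (sx + k * alpha) >> WARPEDDIFF_PREC_BITS selects the filter
          // for output pixel k.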
          // Filter even-index pixels
          const __m128i tmp_0 = _mm_loadu_si128(
              (__m128i *)(warped_filter +
                          ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
          const __m128i tmp_2 = _mm_loadu_si128(
              (__m128i *)(warped_filter +
                          ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
          const __m128i tmp_4 = _mm_loadu_si128(
              (__m128i *)(warped_filter +
                          ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
          const __m128i tmp_6 = _mm_loadu_si128(
              (__m128i *)(warped_filter +
                          ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));

          // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
          const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
          // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
          const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
          // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
          const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
          // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
          const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

          // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
          const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
          // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
          const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
          // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
          const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
          // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
          const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);

          const __m128i round_const =
              _mm_set1_epi32((1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) +
                             ((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1));
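          // The first term offsets the filtered sum so it stays
          // non-negative; the second adds half of
          // 1 << HORSHEAR_REDUCE_PREC_BITS for round-to-nearest in the
          // shift below.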

          // Calculate filtered results
          const __m128i src_0 = _mm_unpacklo_epi8(src, zero);
          const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
          const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(src, 2), zero);
          const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
          const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(src, 4), zero);
          const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
          const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(src, 6), zero);
          const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);

          __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
                                           _mm_add_epi32(res_2, res_6));
          res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
                                    HORSHEAR_REDUCE_PREC_BITS);

          // Filter odd-index pixels
          const __m128i tmp_1 = _mm_loadu_si128(
              (__m128i *)(warped_filter +
                          ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
          const __m128i tmp_3 = _mm_loadu_si128(
              (__m128i *)(warped_filter +
                          ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
          const __m128i tmp_5 = _mm_loadu_si128(
              (__m128i *)(warped_filter +
                          ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
          const __m128i tmp_7 = _mm_loadu_si128(
              (__m128i *)(warped_filter +
                          ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));

          const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
          const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
          const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
          const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

          const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
          const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
          const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
          const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);

          const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(src, 1), zero);
          const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
          const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(src, 3), zero);
          const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
          const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(src, 5), zero);
          const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
          const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(src, 7), zero);
          const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);

          __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
                                          _mm_add_epi32(res_3, res_7));
          res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
                                   HORSHEAR_REDUCE_PREC_BITS);

          // Combine results into one register.
          // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
          // as this order helps with the vertical filter.
          tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
        }
      }

      // Vertical filter
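      // k + 4 is the output row within this 8x8 block; row (i + k + 4) of
      // 'pred' is filtered from tmp[k + 4] ... tmp[k + 11].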
      for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);

        // Load from tmp and rearrange pairs of consecutive rows into the
        // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
        const __m128i *src = tmp + (k + 4);
        const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
        const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
        const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
        const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);

        // Filter even-index pixels
        const __m128i tmp_0 = _mm_loadu_si128(
            (__m128i *)(warped_filter +
                        ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_2 = _mm_loadu_si128(
            (__m128i *)(warped_filter +
                        ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_4 = _mm_loadu_si128(
            (__m128i *)(warped_filter +
                        ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_6 = _mm_loadu_si128(
            (__m128i *)(warped_filter +
                        ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

        const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
        const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
        const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
        const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

        const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
        const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
        const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
        const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);

        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);

        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
                                               _mm_add_epi32(res_4, res_6));

        // Filter odd-index pixels
        const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
        const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
        const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
        const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);

        const __m128i tmp_1 = _mm_loadu_si128(
            (__m128i *)(warped_filter +
                        ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_3 = _mm_loadu_si128(
            (__m128i *)(warped_filter +
                        ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_5 = _mm_loadu_si128(
            (__m128i *)(warped_filter +
                        ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_7 = _mm_loadu_si128(
            (__m128i *)(warped_filter +
                        ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

        const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
        const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
        const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
        const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

        const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
        const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
        const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
        const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);

        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);

        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
                                              _mm_add_epi32(res_5, res_7));

        // Rearrange pixels back into the order 0 ... 7
        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);

        // Round and pack into 8 bits
        const __m128i round_const =
            _mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
                           ((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
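        // The negative term undoes the offset added in the horizontal
        // stage; the second term gives round-to-nearest in the shift below.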

        const __m128i res_lo_round = _mm_srai_epi32(
            _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
        const __m128i res_hi_round = _mm_srai_epi32(
            _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS);

        const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
        __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);

        // Store, blending with 'pred' if needed
        __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];

        // Note: If we're outputting a 4x4 block, we need to be very careful
        // to only output 4 pixels at this point, to avoid encode/decode
        // mismatches when encoding with multiple threads.
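        // For comp_avg, blend with the existing contents of 'pred' using a
        // rounded per-byte average before storing.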
        if (p_width == 4) {
          if (comp_avg) {
            const __m128i orig = _mm_cvtsi32_si128(*(uint32_t *)p);
            res_8bit = _mm_avg_epu8(res_8bit, orig);
          }
          *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);
        } else {
          if (comp_avg) res_8bit = _mm_avg_epu8(res_8bit, _mm_loadl_epi64(p));
          _mm_storel_epi64(p, res_8bit);
        }
      }
    }
  }
}