Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <emmintrin.h>
13 :
14 : #include "aom_dsp/x86/synonyms.h"
15 :
16 : #include "./aom_dsp_rtcd.h"
17 : #include "aom_ports/mem.h"
18 :
// Computes the minimum and maximum 16-bit absolute difference between the
// 8x8 block at 's' (row stride 'p') and the 8x8 block at 'd' (row stride
// 'dp'), writing the results to *min and *max.
void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
                         int *min, int *max) {
  const __m128i zero = _mm_setzero_si128();
  int row;
  // Row 0 seeds the running min/max vectors.
  __m128i src16 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)s), zero);
  __m128i dst16 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)d), zero);
  __m128i diff = _mm_subs_epi16(src16, dst16);
  // abs(diff) via max(diff, -diff) on 16-bit lanes.
  __m128i absdiff = _mm_max_epi16(diff, _mm_subs_epi16(zero, diff));
  __m128i vmax = absdiff;
  __m128i vmin = absdiff;

  // Fold the remaining seven rows into the running min/max.
  for (row = 1; row < 8; ++row) {
    src16 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + row * p)),
                              zero);
    dst16 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + row * dp)),
                              zero);
    diff = _mm_subs_epi16(src16, dst16);
    absdiff = _mm_max_epi16(diff, _mm_subs_epi16(zero, diff));
    vmax = _mm_max_epi16(vmax, absdiff);
    vmin = _mm_min_epi16(vmin, absdiff);
  }

  // Horizontal reduction of the eight 16-bit lanes to a single scalar.
  vmax = _mm_max_epi16(vmax, _mm_srli_si128(vmax, 8));
  vmax = _mm_max_epi16(vmax, _mm_srli_epi64(vmax, 32));
  vmax = _mm_max_epi16(vmax, _mm_srli_epi64(vmax, 16));
  *max = _mm_extract_epi16(vmax, 0);

  vmin = _mm_min_epi16(vmin, _mm_srli_si128(vmin, 8));
  vmin = _mm_min_epi16(vmin, _mm_srli_epi64(vmin, 32));
  vmin = _mm_min_epi16(vmin, _mm_srli_epi64(vmin, 16));
  *min = _mm_extract_epi16(vmin, 0);
}
96 :
// Applies one dimension of an 8-point Hadamard transform, in place, to the
// eight vectors in 'in' (each __m128i holds eight int16 values).
//
// iter == 0: first pass — three butterfly stages followed by an 8x8
//            transpose, so the second pass can again operate along rows.
// iter == 1: second pass — three butterfly stages only; results are written
//            back in a permuted row order (see the index pattern below).
//
// NOTE(review): plain (non-saturating) 16-bit adds/subs are used throughout;
// presumably callers keep input magnitudes small enough to avoid overflow —
// confirm against the callers' input range.
static void hadamard_col8_sse2(__m128i *in, int iter) {
  __m128i a0 = in[0];
  __m128i a1 = in[1];
  __m128i a2 = in[2];
  __m128i a3 = in[3];
  __m128i a4 = in[4];
  __m128i a5 = in[5];
  __m128i a6 = in[6];
  __m128i a7 = in[7];

  // Stage 1: butterflies on adjacent pairs (0,1), (2,3), (4,5), (6,7).
  __m128i b0 = _mm_add_epi16(a0, a1);
  __m128i b1 = _mm_sub_epi16(a0, a1);
  __m128i b2 = _mm_add_epi16(a2, a3);
  __m128i b3 = _mm_sub_epi16(a2, a3);
  __m128i b4 = _mm_add_epi16(a4, a5);
  __m128i b5 = _mm_sub_epi16(a4, a5);
  __m128i b6 = _mm_add_epi16(a6, a7);
  __m128i b7 = _mm_sub_epi16(a6, a7);

  // Stage 2: butterflies at distance 2.
  a0 = _mm_add_epi16(b0, b2);
  a1 = _mm_add_epi16(b1, b3);
  a2 = _mm_sub_epi16(b0, b2);
  a3 = _mm_sub_epi16(b1, b3);
  a4 = _mm_add_epi16(b4, b6);
  a5 = _mm_add_epi16(b5, b7);
  a6 = _mm_sub_epi16(b4, b6);
  a7 = _mm_sub_epi16(b5, b7);

  if (iter == 0) {
    // Stage 3: butterflies at distance 4, assigned in the transform's
    // permuted output order (note b0,b7,b3,b4 for sums; b2,b6,b1,b5 for
    // differences).
    b0 = _mm_add_epi16(a0, a4);
    b7 = _mm_add_epi16(a1, a5);
    b3 = _mm_add_epi16(a2, a6);
    b4 = _mm_add_epi16(a3, a7);
    b2 = _mm_sub_epi16(a0, a4);
    b6 = _mm_sub_epi16(a1, a5);
    b1 = _mm_sub_epi16(a2, a6);
    b5 = _mm_sub_epi16(a3, a7);

    // 8x8 transpose, step 1: interleave 16-bit elements.
    a0 = _mm_unpacklo_epi16(b0, b1);
    a1 = _mm_unpacklo_epi16(b2, b3);
    a2 = _mm_unpackhi_epi16(b0, b1);
    a3 = _mm_unpackhi_epi16(b2, b3);
    a4 = _mm_unpacklo_epi16(b4, b5);
    a5 = _mm_unpacklo_epi16(b6, b7);
    a6 = _mm_unpackhi_epi16(b4, b5);
    a7 = _mm_unpackhi_epi16(b6, b7);

    // Transpose, step 2: interleave 32-bit elements.
    b0 = _mm_unpacklo_epi32(a0, a1);
    b1 = _mm_unpacklo_epi32(a4, a5);
    b2 = _mm_unpackhi_epi32(a0, a1);
    b3 = _mm_unpackhi_epi32(a4, a5);
    b4 = _mm_unpacklo_epi32(a2, a3);
    b5 = _mm_unpacklo_epi32(a6, a7);
    b6 = _mm_unpackhi_epi32(a2, a3);
    b7 = _mm_unpackhi_epi32(a6, a7);

    // Transpose, step 3: interleave 64-bit halves and write back.
    in[0] = _mm_unpacklo_epi64(b0, b1);
    in[1] = _mm_unpackhi_epi64(b0, b1);
    in[2] = _mm_unpacklo_epi64(b2, b3);
    in[3] = _mm_unpackhi_epi64(b2, b3);
    in[4] = _mm_unpacklo_epi64(b4, b5);
    in[5] = _mm_unpackhi_epi64(b4, b5);
    in[6] = _mm_unpacklo_epi64(b6, b7);
    in[7] = _mm_unpackhi_epi64(b6, b7);
  } else {
    // Stage 3 for the second pass: same permuted order, no transpose.
    in[0] = _mm_add_epi16(a0, a4);
    in[7] = _mm_add_epi16(a1, a5);
    in[3] = _mm_add_epi16(a2, a6);
    in[4] = _mm_add_epi16(a3, a7);
    in[2] = _mm_sub_epi16(a0, a4);
    in[6] = _mm_sub_epi16(a1, a5);
    in[1] = _mm_sub_epi16(a2, a6);
    in[5] = _mm_sub_epi16(a3, a7);
  }
}
172 :
// 8x8 Hadamard transform of the residual block at 'src_diff' (row stride
// 'src_stride'); writes 64 coefficients to 'coeff'.  Both passes run through
// hadamard_col8_sse2, which transposes between them.
// Loads and stores are aligned: src_diff and coeff must be 16-byte aligned.
void aom_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
                           int16_t *coeff) {
  __m128i buf[8];
  int i;

  buf[0] = _mm_load_si128((const __m128i *)src_diff);
  for (i = 1; i < 8; ++i) {
    src_diff += src_stride;
    buf[i] = _mm_load_si128((const __m128i *)src_diff);
  }

  hadamard_col8_sse2(buf, 0);  // columns (includes transpose)
  hadamard_col8_sse2(buf, 1);  // rows

  for (i = 0; i < 8; ++i) {
    _mm_store_si128((__m128i *)(coeff + 8 * i), buf[i]);
  }
}
204 :
// 16x16 Hadamard transform built from four 8x8 transforms: each quadrant is
// transformed independently, then a cross-quadrant butterfly combines them.
// Sums/differences are halved (arithmetic shift by 1) to keep values in
// 16-bit range.  'coeff' receives 256 coefficients.
void aom_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
                             int16_t *coeff) {
  int i;

  // Stage 1: 8x8 Hadamard on each of the four quadrants.
  for (i = 0; i < 4; ++i) {
    int16_t const *quad =
        src_diff + (i >> 1) * 8 * src_stride + (i & 1) * 8;
    aom_hadamard_8x8_sse2(quad, src_stride, coeff + i * 64);
  }

  // Stage 2: butterfly across the four quadrant outputs, 8 coefficients at
  // a time, with a >>1 normalization after the first add/sub level.
  for (i = 0; i < 64; i += 8) {
    const __m128i q0 = _mm_load_si128((const __m128i *)coeff);
    const __m128i q1 = _mm_load_si128((const __m128i *)(coeff + 64));
    const __m128i q2 = _mm_load_si128((const __m128i *)(coeff + 128));
    const __m128i q3 = _mm_load_si128((const __m128i *)(coeff + 192));

    const __m128i s01 = _mm_srai_epi16(_mm_add_epi16(q0, q1), 1);
    const __m128i d01 = _mm_srai_epi16(_mm_sub_epi16(q0, q1), 1);
    const __m128i s23 = _mm_srai_epi16(_mm_add_epi16(q2, q3), 1);
    const __m128i d23 = _mm_srai_epi16(_mm_sub_epi16(q2, q3), 1);

    _mm_store_si128((__m128i *)coeff, _mm_add_epi16(s01, s23));
    _mm_store_si128((__m128i *)(coeff + 64), _mm_add_epi16(d01, d23));
    _mm_store_si128((__m128i *)(coeff + 128), _mm_sub_epi16(s01, s23));
    _mm_store_si128((__m128i *)(coeff + 192), _mm_sub_epi16(d01, d23));

    coeff += 8;
  }
}
243 :
// Sum of absolute values of 'length' 16-bit coefficients (length must be a
// multiple of 8; 'coeff' must be 16-byte aligned).  Absolute values are
// widened to 32 bits before accumulation.
int aom_satd_sse2(const int16_t *coeff, int length) {
  const __m128i zero = _mm_setzero_si128();
  __m128i total = zero;
  int i;

  for (i = 0; i < length; i += 8) {
    const __m128i v = _mm_load_si128((const __m128i *)coeff);
    // abs(v) via max(v, -v).
    const __m128i abs_v = _mm_max_epi16(v, _mm_sub_epi16(zero, v));
    // Zero-extend the 16-bit magnitudes to 32 bits and accumulate.
    total = _mm_add_epi32(total, _mm_unpacklo_epi16(abs_v, zero));
    total = _mm_add_epi32(total, _mm_unpackhi_epi16(abs_v, zero));
    coeff += 8;
  }

  // Horizontal reduction of the four 32-bit partial sums.
  total = _mm_add_epi32(total, _mm_srli_si128(total, 8));
  total = _mm_add_epi32(total, _mm_srli_epi64(total, 32));
  return _mm_cvtsi128_si32(total);
}
269 :
// Vertical projection: for each of 16 pixel columns, sums 'height' 8-bit
// pixels (saturating 16-bit accumulation) and normalizes by an arithmetic
// right shift keyed off 'height'.  Writes 16 int16 results to 'hbuf'.
// Loads/stores are unaligned, so no alignment requirement on 'ref'/'hbuf'.
void aom_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref, int ref_stride,
                          int height) {
  const __m128i zero = _mm_setzero_si128();
  __m128i sum_lo = zero;  // columns 0..7
  __m128i sum_hi = zero;  // columns 8..15
  int row;
  int shift;

  // Widen each row of 16 pixels to 16 bits and accumulate per column.
  for (row = 0; row < height; ++row) {
    const __m128i pixels = _mm_loadu_si128((const __m128i *)ref);
    sum_lo = _mm_adds_epu16(sum_lo, _mm_unpacklo_epi8(pixels, zero));
    sum_hi = _mm_adds_epu16(sum_hi, _mm_unpackhi_epi8(pixels, zero));
    ref += ref_stride;
  }

  // Normalization shift chosen per block height.
  if (height == 64) {
    shift = 5;
  } else if (height == 32) {
    shift = 4;
  } else {
    shift = 3;
  }

  _mm_storeu_si128((__m128i *)hbuf, _mm_srai_epi16(sum_lo, shift));
  _mm_storeu_si128((__m128i *)(hbuf + 8), _mm_srai_epi16(sum_hi, shift));
}
318 :
// Horizontal projection: sums 'width' 8-bit pixels of a single row using
// SAD-against-zero, 16 at a time (width must be a multiple of 16 and 'ref'
// 16-byte aligned).  Returns the (saturating 16-bit) total.
int16_t aom_int_pro_col_sse2(uint8_t const *ref, int width) {
  const __m128i zero = _mm_setzero_si128();
  // _mm_sad_epu8 against zero yields the sum of each 8-byte half in the
  // low 16 bits of the corresponding 64-bit lane.
  __m128i total = _mm_sad_epu8(_mm_load_si128((const __m128i *)ref), zero);
  int i;

  for (i = 16; i < width; i += 16) {
    ref += 16;
    total = _mm_adds_epu16(
        total, _mm_sad_epu8(_mm_load_si128((const __m128i *)ref), zero));
  }

  // Combine the two 64-bit lane sums.
  total = _mm_adds_epu16(total, _mm_srli_si128(total, 8));
  return (int16_t)_mm_extract_epi16(total, 0);
}
338 :
// Variance proxy of the difference vector ref[] - src[] over 4 << bwl
// elements: returns sum(diff^2) - (sum(diff)^2 >> (bwl + 2)).
// 'src' must be 16-byte aligned ('ref' may be unaligned); the element count
// is assumed to be a multiple of 8.  The running sum is kept in 16-bit
// lanes, so it can wrap for large inputs — matching the original behavior.
int aom_vector_var_sse2(int16_t const *ref, int16_t const *src, int bwl) {
  const int width = 4 << bwl;
  __m128i r, s, diff, tmp;
  __m128i sum, sse;
  int16_t mean;
  int i;

  // First group of 8 seeds the accumulators.
  r = _mm_loadu_si128((const __m128i *)ref);
  s = _mm_load_si128((const __m128i *)src);
  diff = _mm_subs_epi16(r, s);
  sum = diff;
  sse = _mm_madd_epi16(diff, diff);

  for (i = 8; i < width; i += 8) {
    ref += 8;
    src += 8;
    r = _mm_loadu_si128((const __m128i *)ref);
    s = _mm_load_si128((const __m128i *)src);
    diff = _mm_subs_epi16(r, s);
    sum = _mm_add_epi16(sum, diff);
    sse = _mm_add_epi32(sse, _mm_madd_epi16(diff, diff));
  }

  // Horizontal reduction of the eight 16-bit partial sums into lane 0.
  tmp = _mm_srli_si128(sum, 8);
  sum = _mm_add_epi16(sum, tmp);
  tmp = _mm_srli_epi64(sum, 32);
  sum = _mm_add_epi16(sum, tmp);
  tmp = _mm_srli_epi32(sum, 16);
  sum = _mm_add_epi16(sum, tmp);
  mean = (int16_t)_mm_extract_epi16(sum, 0);

  // Horizontal reduction of the four 32-bit squared sums.
  tmp = _mm_srli_si128(sse, 8);
  sse = _mm_add_epi32(sse, tmp);
  tmp = _mm_srli_epi64(sse, 32);
  sse = _mm_add_epi32(sse, tmp);

  return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
}
|