Line data Source code
1 : /*
2 : * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 : *
4 : * Use of this source code is governed by a BSD-style license
5 : * that can be found in the LICENSE file in the root of the source
6 : * tree. An additional intellectual property rights grant can be found
7 : * in the file PATENTS. All contributing project authors may
8 : * be found in the AUTHORS file in the root of the source tree.
9 : */
10 :
11 : #include <emmintrin.h>
12 :
13 : #include "./vpx_dsp_rtcd.h"
14 : #include "vpx_ports/mem.h"
15 :
// Computes the 16-bit absolute difference |s[i] - d[i]| for one 8-pixel row.
// Pixels are zero-extended to 16 bits first, so the saturating subtract can
// never clip (the true difference range is [-255, 255]).
static __m128i minmax_row_absdiff(const uint8_t *s, const uint8_t *d,
                                  const __m128i zero) {
  const __m128i s0 =
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)s), zero);
  const __m128i d0 =
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)d), zero);
  const __m128i diff = _mm_subs_epi16(s0, d0);
  const __m128i negdiff = _mm_subs_epi16(zero, diff);
  return _mm_max_epi16(diff, negdiff);  // abs(diff)
}

// Finds the minimum and maximum absolute pixel difference between two 8x8
// blocks.  s/p: source block and stride; d/dp: reference block and stride.
// The extremes are taken over all 64 pixel positions and written to *min and
// *max.  (Rewrites the original 8x-unrolled body as a helper + loop; the
// SIMD operations performed are identical.)
void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
                         int *min, int *max) {
  const __m128i zero = _mm_setzero_si128();
  int row;
  // Row 0 seeds both running extremes.
  __m128i maxabsdiff = minmax_row_absdiff(s, d, zero);
  __m128i minabsdiff = maxabsdiff;

  for (row = 1; row < 8; ++row) {
    const __m128i absdiff = minmax_row_absdiff(s + row * p, d + row * dp, zero);
    maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
    minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
  }

  // Horizontal max reduction across the 8 lanes: 8-byte, 4-byte, 2-byte folds.
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
  *max = _mm_extract_epi16(maxabsdiff, 0);

  // Same reduction for the minimum.
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
  *min = _mm_extract_epi16(minabsdiff, 0);
}
93 :
// Returns the rounded average of an 8x8 block of pixels (stride p).
unsigned int vpx_avg_8x8_sse2(const uint8_t *s, int p) {
  const __m128i zero = _mm_setzero_si128();
  __m128i sum;
  unsigned int total;
  int row;

  // Zero-extend each 8-pixel row to 16-bit lanes and accumulate column sums.
  // Max per-lane sum is 8 * 255 = 2040, far below the epu16 saturation point.
  sum = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)s), zero);
  for (row = 1; row < 8; ++row) {
    const __m128i pix =
        _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + row * p)), zero);
    sum = _mm_adds_epu16(sum, pix);
  }

  // Horizontal reduction: fold upper half, then 32-bit, then 16-bit halves.
  sum = _mm_adds_epu16(sum, _mm_srli_si128(sum, 8));
  sum = _mm_adds_epu16(sum, _mm_srli_epi64(sum, 32));
  sum = _mm_adds_epu16(sum, _mm_srli_epi64(sum, 16));
  total = (unsigned int)_mm_extract_epi16(sum, 0);

  // Round to nearest and divide by 64.
  return (total + 32) >> 6;
}
120 :
// Returns the rounded average of a 4x4 block of pixels (stride p).
// NOTE(review): each row load fetches 8 bytes even though only 4 belong to
// the block; the reduction below only folds the low four lanes into the
// result, but the caller's buffer must be readable 8 bytes past each row
// start — matches the original implementation.
unsigned int vpx_avg_4x4_sse2(const uint8_t *s, int p) {
  const __m128i zero = _mm_setzero_si128();
  __m128i sum;
  unsigned int total;
  int row;

  sum = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)s), zero);
  for (row = 1; row < 4; ++row) {
    const __m128i pix =
        _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + row * p)), zero);
    sum = _mm_adds_epu16(sum, pix);
  }

  // Fold lanes 2..3 onto 0..1 (4-byte shift), then lane 1 onto lane 0;
  // lanes 4..7 (the extra loaded bytes) never reach lane 0.
  sum = _mm_adds_epu16(sum, _mm_srli_si128(sum, 4));
  sum = _mm_adds_epu16(sum, _mm_srli_epi64(sum, 16));
  total = (unsigned int)_mm_extract_epi16(sum, 0);

  // Round to nearest and divide by 16.
  return (total + 8) >> 4;
}
138 :
// One pass of an 8-point Hadamard butterfly network applied to the eight
// vectors in `in` (each vector holds eight 16-bit lanes).
//
// iter == 0: first (column) pass — after the three butterfly stages the
//   8x8 result is transposed in-register so the second call operates on
//   what were originally rows.
// iter == 1: second (row) pass — no transpose; the third-stage results are
//   written back in the permuted order 0,7,3,4,2,6,1,5 established by the
//   b-register naming of the iter==0 path, keeping both passes' output
//   ordering consistent.
//
// The exact statement order matters: each stage consumes the previous
// stage's registers in place.
static void hadamard_col8_sse2(__m128i *in, int iter) {
  __m128i a0 = in[0];
  __m128i a1 = in[1];
  __m128i a2 = in[2];
  __m128i a3 = in[3];
  __m128i a4 = in[4];
  __m128i a5 = in[5];
  __m128i a6 = in[6];
  __m128i a7 = in[7];

  // Stage 1: butterflies on adjacent pairs (0,1), (2,3), (4,5), (6,7).
  __m128i b0 = _mm_add_epi16(a0, a1);
  __m128i b1 = _mm_sub_epi16(a0, a1);
  __m128i b2 = _mm_add_epi16(a2, a3);
  __m128i b3 = _mm_sub_epi16(a2, a3);
  __m128i b4 = _mm_add_epi16(a4, a5);
  __m128i b5 = _mm_sub_epi16(a4, a5);
  __m128i b6 = _mm_add_epi16(a6, a7);
  __m128i b7 = _mm_sub_epi16(a6, a7);

  // Stage 2: butterflies at distance 2 within each half.
  a0 = _mm_add_epi16(b0, b2);
  a1 = _mm_add_epi16(b1, b3);
  a2 = _mm_sub_epi16(b0, b2);
  a3 = _mm_sub_epi16(b1, b3);
  a4 = _mm_add_epi16(b4, b6);
  a5 = _mm_add_epi16(b5, b7);
  a6 = _mm_sub_epi16(b4, b6);
  a7 = _mm_sub_epi16(b5, b7);

  if (iter == 0) {
    // Stage 3 (distance 4); the b-index permutation encodes the final
    // output row order before transposition.
    b0 = _mm_add_epi16(a0, a4);
    b7 = _mm_add_epi16(a1, a5);
    b3 = _mm_add_epi16(a2, a6);
    b4 = _mm_add_epi16(a3, a7);
    b2 = _mm_sub_epi16(a0, a4);
    b6 = _mm_sub_epi16(a1, a5);
    b1 = _mm_sub_epi16(a2, a6);
    b5 = _mm_sub_epi16(a3, a7);

    // 8x8 16-bit transpose, step 1: interleave 16-bit lanes.
    a0 = _mm_unpacklo_epi16(b0, b1);
    a1 = _mm_unpacklo_epi16(b2, b3);
    a2 = _mm_unpackhi_epi16(b0, b1);
    a3 = _mm_unpackhi_epi16(b2, b3);
    a4 = _mm_unpacklo_epi16(b4, b5);
    a5 = _mm_unpacklo_epi16(b6, b7);
    a6 = _mm_unpackhi_epi16(b4, b5);
    a7 = _mm_unpackhi_epi16(b6, b7);

    // Transpose, step 2: interleave 32-bit pairs.
    b0 = _mm_unpacklo_epi32(a0, a1);
    b1 = _mm_unpacklo_epi32(a4, a5);
    b2 = _mm_unpackhi_epi32(a0, a1);
    b3 = _mm_unpackhi_epi32(a4, a5);
    b4 = _mm_unpacklo_epi32(a2, a3);
    b5 = _mm_unpacklo_epi32(a6, a7);
    b6 = _mm_unpackhi_epi32(a2, a3);
    b7 = _mm_unpackhi_epi32(a6, a7);

    // Transpose, step 3: interleave 64-bit halves and write back.
    in[0] = _mm_unpacklo_epi64(b0, b1);
    in[1] = _mm_unpackhi_epi64(b0, b1);
    in[2] = _mm_unpacklo_epi64(b2, b3);
    in[3] = _mm_unpackhi_epi64(b2, b3);
    in[4] = _mm_unpacklo_epi64(b4, b5);
    in[5] = _mm_unpackhi_epi64(b4, b5);
    in[6] = _mm_unpacklo_epi64(b6, b7);
    in[7] = _mm_unpackhi_epi64(b6, b7);
  } else {
    // Stage 3 for the second pass: same butterflies, results stored
    // directly in the permuted output order (no transpose needed).
    in[0] = _mm_add_epi16(a0, a4);
    in[7] = _mm_add_epi16(a1, a5);
    in[3] = _mm_add_epi16(a2, a6);
    in[4] = _mm_add_epi16(a3, a7);
    in[2] = _mm_sub_epi16(a0, a4);
    in[6] = _mm_sub_epi16(a1, a5);
    in[1] = _mm_sub_epi16(a2, a6);
    in[5] = _mm_sub_epi16(a3, a7);
  }
}
214 :
// 8x8 Hadamard transform of a residual block.
// src_diff must be 16-byte aligned (aligned loads); coeff receives the 64
// transform coefficients, 16-byte aligned as well.
void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
                           int16_t *coeff) {
  __m128i rows[8];
  int i;

  // Gather the eight 8-coefficient rows.
  rows[0] = _mm_load_si128((const __m128i *)src_diff);
  for (i = 1; i < 8; ++i) {
    src_diff += src_stride;
    rows[i] = _mm_load_si128((const __m128i *)src_diff);
  }

  // Column pass (includes an in-register transpose), then row pass.
  hadamard_col8_sse2(rows, 0);
  hadamard_col8_sse2(rows, 1);

  // Write out the 64 coefficients, 8 per vector.
  for (i = 0; i < 8; ++i) {
    _mm_store_si128((__m128i *)(coeff + 8 * i), rows[i]);
  }
}
246 :
// 16x16 Hadamard transform, built from four 8x8 transforms plus one extra
// butterfly stage combining the quadrants.  coeff holds 256 values laid out
// as four consecutive 64-coefficient quadrants.
void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
                             int16_t *coeff) {
  int idx;

  // Stage 1: 8x8 Hadamard on each quadrant (top-left, top-right,
  // bottom-left, bottom-right) into its 64-coefficient slot.
  for (idx = 0; idx < 4; ++idx) {
    int16_t const *quad =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
    vpx_hadamard_8x8_sse2(quad, src_stride, coeff + idx * 64);
  }

  // Stage 2: butterfly the four quadrants together, 8 coefficients at a time.
  for (idx = 0; idx < 64; idx += 8) {
    __m128i q0 = _mm_load_si128((const __m128i *)coeff);
    __m128i q1 = _mm_load_si128((const __m128i *)(coeff + 64));
    __m128i q2 = _mm_load_si128((const __m128i *)(coeff + 128));
    __m128i q3 = _mm_load_si128((const __m128i *)(coeff + 192));

    __m128i s01 = _mm_add_epi16(q0, q1);
    __m128i d01 = _mm_sub_epi16(q0, q1);
    __m128i s23 = _mm_add_epi16(q2, q3);
    __m128i d23 = _mm_sub_epi16(q2, q3);

    // Halve the intermediates so the final sums stay within 16 bits.
    s01 = _mm_srai_epi16(s01, 1);
    d01 = _mm_srai_epi16(d01, 1);
    s23 = _mm_srai_epi16(s23, 1);
    d23 = _mm_srai_epi16(d23, 1);

    _mm_store_si128((__m128i *)coeff, _mm_add_epi16(s01, s23));
    _mm_store_si128((__m128i *)(coeff + 64), _mm_add_epi16(d01, d23));
    _mm_store_si128((__m128i *)(coeff + 128), _mm_sub_epi16(s01, s23));
    _mm_store_si128((__m128i *)(coeff + 192), _mm_sub_epi16(d01, d23));

    coeff += 8;
  }
}
285 :
// Sums the absolute values of `length` 16-bit coefficients.
// coeff must be 16-byte aligned and length a multiple of 8.
int vpx_satd_sse2(const int16_t *coeff, int length) {
  const __m128i zero = _mm_setzero_si128();
  __m128i accum = zero;
  int i;

  for (i = 0; i < length; i += 8, coeff += 8) {
    const __m128i v = _mm_load_si128((const __m128i *)coeff);
    const __m128i neg = _mm_sub_epi16(zero, v);
    // max(v, -v) == |v|.  For v == -32768 both operands wrap to -32768,
    // but the zero-extending unpacks below read that bit pattern as
    // +32768, so the 32-bit accumulation is still correct.
    const __m128i absval = _mm_max_epi16(v, neg);
    const __m128i lo = _mm_unpacklo_epi16(absval, zero);
    const __m128i hi = _mm_unpackhi_epi16(absval, zero);
    accum = _mm_add_epi32(accum, _mm_add_epi32(lo, hi));
  }

  // Fold the four 32-bit partial sums into lane 0.
  accum = _mm_add_epi32(accum, _mm_srli_si128(accum, 8));
  accum = _mm_add_epi32(accum, _mm_srli_epi64(accum, 32));

  return _mm_cvtsi128_si32(accum);
}
311 :
// Vertical projection: for each of the 16 pixel columns, averages `height`
// rows of ref into hbuf[0..15].  The divisor is approximated by an
// arithmetic shift chosen from the expected heights (64, 32, else 16).
void vpx_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref,
                          const int ref_stride, const int height) {
  const __m128i zero = _mm_setzero_si128();
  const int norm_shift = (height == 64) ? 5 : (height == 32) ? 4 : 3;
  __m128i row, sum_lo, sum_hi;
  int i;

  // First row seeds the two 8-lane accumulators (low/high byte halves).
  row = _mm_loadu_si128((const __m128i *)ref);
  sum_lo = _mm_unpacklo_epi8(row, zero);
  sum_hi = _mm_unpackhi_epi8(row, zero);
  ref += ref_stride;

  // Middle rows, processed in pairs.
  for (i = 1; i < height - 1; i += 2) {
    row = _mm_loadu_si128((const __m128i *)ref);
    sum_lo = _mm_adds_epu16(sum_lo, _mm_unpacklo_epi8(row, zero));
    sum_hi = _mm_adds_epu16(sum_hi, _mm_unpackhi_epi8(row, zero));
    ref += ref_stride;

    row = _mm_loadu_si128((const __m128i *)ref);
    sum_lo = _mm_adds_epu16(sum_lo, _mm_unpacklo_epi8(row, zero));
    sum_hi = _mm_adds_epu16(sum_hi, _mm_unpackhi_epi8(row, zero));
    ref += ref_stride;
  }

  // Final row.
  row = _mm_loadu_si128((const __m128i *)ref);
  sum_lo = _mm_adds_epu16(sum_lo, _mm_unpacklo_epi8(row, zero));
  sum_hi = _mm_adds_epu16(sum_hi, _mm_unpackhi_epi8(row, zero));

  // Normalize the column sums.
  sum_lo = _mm_srai_epi16(sum_lo, norm_shift);
  sum_hi = _mm_srai_epi16(sum_hi, norm_shift);

  _mm_storeu_si128((__m128i *)hbuf, sum_lo);
  _mm_storeu_si128((__m128i *)(hbuf + 8), sum_hi);
}
360 :
// Horizontal projection: sums all `width` bytes of one pixel row.
// ref must be 16-byte aligned and width a multiple of 16.
int16_t vpx_int_pro_col_sse2(uint8_t const *ref, const int width) {
  const __m128i zero = _mm_setzero_si128();
  // SAD against zero gives two 16-byte partial byte-sums (one per 64-bit half).
  __m128i sad = _mm_sad_epu8(_mm_load_si128((const __m128i *)ref), zero);
  int i;

  for (i = 16; i < width; i += 16) {
    ref += 16;
    sad = _mm_adds_epu16(
        sad, _mm_sad_epu8(_mm_load_si128((const __m128i *)ref), zero));
  }

  // Combine the two 64-bit halves and return the low 16 bits.
  sad = _mm_adds_epu16(sad, _mm_srli_si128(sad, 8));
  return (int16_t)_mm_extract_epi16(sad, 0);
}
380 :
// Variance of the difference between two vectors of (4 << bwl) int16
// elements: returns sum(d^2) - mean(d)^2 * n, with the mean term computed
// via the >> (bwl + 2) normalization.  src must be 16-byte aligned; ref may
// be unaligned.
int vpx_vector_var_sse2(int16_t const *ref, int16_t const *src, const int bwl) {
  const int width = 4 << bwl;
  int idx;
  int16_t mean;
  __m128i r = _mm_loadu_si128((const __m128i *)ref);
  __m128i s = _mm_load_si128((const __m128i *)src);
  __m128i d = _mm_subs_epi16(r, s);
  __m128i sum = d;                        // running 16-bit lane sums of d
  __m128i sse = _mm_madd_epi16(d, d);     // running 32-bit sums of d^2

  ref += 8;
  src += 8;

  for (idx = 8; idx < width; idx += 8) {
    r = _mm_loadu_si128((const __m128i *)ref);
    s = _mm_load_si128((const __m128i *)src);
    d = _mm_subs_epi16(r, s);

    sum = _mm_add_epi16(sum, d);
    sse = _mm_add_epi32(sse, _mm_madd_epi16(d, d));

    ref += 8;
    src += 8;
  }

  // Fold the eight 16-bit lane sums into lane 0 (non-saturating adds,
  // same wrap behavior as the original).
  sum = _mm_add_epi16(sum, _mm_srli_si128(sum, 8));
  sum = _mm_add_epi16(sum, _mm_srli_epi64(sum, 32));
  sum = _mm_add_epi16(sum, _mm_srli_epi32(sum, 16));
  mean = (int16_t)_mm_extract_epi16(sum, 0);

  // Fold the four 32-bit squared sums into lane 0.
  sse = _mm_add_epi32(sse, _mm_srli_si128(sse, 8));
  sse = _mm_add_epi32(sse, _mm_srli_epi64(sse, 32));

  return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
}
|