/*
 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  // SSE2

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"

#include "vpx_ports/mem.h"

// Signature shared by the fixed-size sum/SSE helpers below; variance_sse2()
// uses it to tile larger blocks out of 4x4, 8x8 or 16x16 pieces.
typedef void (*getNxMvar_fn_t)(const unsigned char *src, int src_stride,
                               const unsigned char *ref, int ref_stride,
                               unsigned int *sse, int *sum);

unsigned int vpx_get_mb_ss_sse2(const int16_t *src) {
  __m128i vsum = _mm_setzero_si128();
  int i;

  // 32 iterations x 8 lanes = 256 int16 coefficients, i.e. one 16x16 block.
  // _mm_madd_epi16(v, v) squares each lane and adds adjacent pairs into four
  // 32-bit accumulators.
  for (i = 0; i < 32; ++i) {
    const __m128i v = _mm_loadu_si128((const __m128i *)src);
    vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
    src += 8;
  }

  // Fold the four 32-bit partial sums down to lane 0.
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
  return _mm_cvtsi128_si32(vsum);
}
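
// For reference, a minimal scalar sketch of the computation above (an
// illustrative addition, not part of the original file). Both versions
// accumulate in 32 bits, so they agree modulo 2^32.
static unsigned int get_mb_ss_ref(const int16_t *src) {
  unsigned int ss = 0;
  int i;
  for (i = 0; i < 256; ++i) ss += (unsigned int)(src[i] * src[i]);
  return ss;
}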

// Loads two 4-byte rows (rows i and i + 1) and interleaves them byte-wise
// into the low half of an XMM register.
#define READ64(p, stride, i)                                  \
  _mm_unpacklo_epi8(                                          \
      _mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
      _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))

static void get4x4var_sse2(const uint8_t *src, int src_stride,
                           const uint8_t *ref, int ref_stride,
                           unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  // Widen two rows at a time to 16 bits, then form src - ref differences.
  const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
  const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
  const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
  const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
  const __m128i diff0 = _mm_sub_epi16(src0, ref0);
  const __m128i diff1 = _mm_sub_epi16(src1, ref1);

  // sum
  __m128i vsum = _mm_add_epi16(diff0, diff1);
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);

  // sse
  vsum =
      _mm_add_epi32(_mm_madd_epi16(diff0, diff0), _mm_madd_epi16(diff1, diff1));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
  *sse = _mm_cvtsi128_si32(vsum);
}

void vpx_get8x8var_sse2(const uint8_t *src, int src_stride, const uint8_t *ref,
                        int ref_stride, unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  __m128i vsse = _mm_setzero_si128();
  int i;

  // Two 8-pixel rows per iteration.
  for (i = 0; i < 8; i += 2) {
    const __m128i src0 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(src + i * src_stride)), zero);
    const __m128i ref0 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(ref + i * ref_stride)), zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    const __m128i src1 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(src + (i + 1) * src_stride)), zero);
    const __m128i ref1 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(ref + (i + 1) * ref_stride)), zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
  }

  // sum
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);

  // sse
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);
}

void vpx_get16x16var_sse2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride, unsigned int *sse,
                          int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  __m128i vsse = _mm_setzero_si128();
  int i;

  for (i = 0; i < 16; ++i) {
    const __m128i s = _mm_loadu_si128((const __m128i *)src);
    const __m128i r = _mm_loadu_si128((const __m128i *)ref);

    const __m128i src0 = _mm_unpacklo_epi8(s, zero);
    const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    const __m128i src1 = _mm_unpackhi_epi8(s, zero);
    const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));

    src += src_stride;
    ref += ref_stride;
  }

  // sum
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  // After the folds above, lanes 0 and 1 can each reach +/-32640, so one
  // more 16-bit add could overflow; the final add is done in scalar code.
  *sum =
      (int16_t)_mm_extract_epi16(vsum, 0) + (int16_t)_mm_extract_epi16(vsum, 1);

  // sse
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);
}

static void variance_sse2(const unsigned char *src, int src_stride,
                          const unsigned char *ref, int ref_stride, int w,
                          int h, unsigned int *sse, int *sum,
                          getNxMvar_fn_t var_fn, int block_size) {
  int i, j;

  *sse = 0;
  *sum = 0;

  // Tile the w x h block into block_size x block_size pieces and accumulate
  // each tile's sum and SSE into the caller's totals.
  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
             ref_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}
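
// An equivalent scalar reference for the helpers above (an illustrative
// sketch added for clarity; it is not part of the original file):
static void get_var_ref(const uint8_t *src, int src_stride, const uint8_t *ref,
                        int ref_stride, int w, int h, unsigned int *sse,
                        int *sum) {
  int i, j;
  *sse = 0;
  *sum = 0;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = src[i * src_stride + j] - ref[i * ref_stride + j];
      *sum += diff;
      *sse += diff * diff;
    }
  }
}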

unsigned int vpx_variance4x4_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - ((sum * sum) >> 4);
}
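
// The returns above and below use the identity
//   variance = SSE - Sum^2 / N,   N = w * h,
// with the division implemented as a shift by log2(N): 4 for 4x4, 5 for
// 8x4, ..., 12 for 64x64. Sizes of 16x16 and up widen Sum^2 to 64 bits
// first; e.g. a 32x32 sum can reach 1024 * 255 = 261120, whose square does
// not fit in 32 bits.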

unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 4, sse, &sum,
                get4x4var_sse2, 4);
  return *sse - ((sum * sum) >> 5);
}

unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 4, 8, sse, &sum,
                get4x4var_sse2, 4);
  return *sse - ((sum * sum) >> 5);
}

unsigned int vpx_variance8x8_sse2(const unsigned char *src, int src_stride,
                                  const unsigned char *ref, int ref_stride,
                                  unsigned int *sse) {
  int sum;
  vpx_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - ((sum * sum) >> 6);
}

unsigned int vpx_variance16x8_sse2(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 8, sse, &sum,
                vpx_get8x8var_sse2, 8);
  return *sse - ((sum * sum) >> 7);
}

unsigned int vpx_variance8x16_sse2(const unsigned char *src, int src_stride,
                                   const unsigned char *ref, int ref_stride,
                                   unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 8, 16, sse, &sum,
                vpx_get8x8var_sse2, 8);
  return *sse - ((sum * sum) >> 7);
}

unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride,
                                    const unsigned char *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
}

unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
                vpx_get16x16var_sse2, 16);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 10);
}

unsigned int vpx_variance32x16_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
                vpx_get16x16var_sse2, 16);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
}

unsigned int vpx_variance16x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 16, 32, sse, &sum,
                vpx_get16x16var_sse2, 16);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 9);
}

unsigned int vpx_variance64x64_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
                vpx_get16x16var_sse2, 16);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 12);
}

unsigned int vpx_variance64x32_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
                vpx_get16x16var_sse2, 16);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
}

unsigned int vpx_variance32x64_sse2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_sse2(src, src_stride, ref, ref_stride, 32, 64, sse, &sum,
                vpx_get16x16var_sse2, 16);
  return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
}

// The mse wrappers reuse the variance kernels but return the raw SSE
// without subtracting the squared-sum term.
unsigned int vpx_mse8x8_sse2(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             unsigned int *sse) {
  vpx_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vpx_mse8x16_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  vpx_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vpx_mse16x8_sse2(const uint8_t *src, int src_stride,
                              const uint8_t *ref, int ref_stride,
                              unsigned int *sse) {
  vpx_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
  return *sse;
}

// The 2 unused parameters are placeholders for the PIC-enabled build.
// These declarations are for the functions defined in subpel_variance.asm.
// (The opt2 argument of DECLS is unused; only opt1 selects the flavor.)
#define DECL(w, opt)                                                           \
  int vpx_sub_pixel_variance##w##xh_##opt(                                     \
      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset,    \
      const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \
      void *unused0, void *unused)
#define DECLS(opt1, opt2) \
  DECL(4, opt1);          \
  DECL(8, opt1);          \
  DECL(16, opt1)

DECLS(sse2, sse2);
DECLS(ssse3, ssse3);
#undef DECLS
#undef DECL

#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                     \
  unsigned int vpx_sub_pixel_variance##w##x##h##_##opt(                      \
      const uint8_t *src, int src_stride, int x_offset, int y_offset,        \
      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {           \
    unsigned int sse;                                                        \
    int se = vpx_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
                                                  y_offset, dst, dst_stride, \
                                                  h, &sse, NULL, NULL);      \
    if (w > wf) {                                                            \
      unsigned int sse2;                                                     \
      int se2 = vpx_sub_pixel_variance##wf##xh_##opt(                        \
          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride, h, \
          &sse2, NULL, NULL);                                                \
      se += se2;                                                             \
      sse += sse2;                                                           \
      if (w > wf * 2) {                                                      \
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                          \
            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,  \
            h, &sse2, NULL, NULL);                                           \
        se += se2;                                                           \
        sse += sse2;                                                         \
        se2 = vpx_sub_pixel_variance##wf##xh_##opt(                          \
            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride,  \
            h, &sse2, NULL, NULL);                                           \
        se += se2;                                                           \
        sse += sse2;                                                         \
      }                                                                      \
    }                                                                        \
    *sse_ptr = sse;                                                          \
    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
  }
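
// For illustration (a hand-traced expansion, not generated output),
// FN(8, 8, 8, 3, 3, sse2, (int32_t), (int32_t)) defines roughly:
//
//   unsigned int vpx_sub_pixel_variance8x8_sse2(
//       const uint8_t *src, int src_stride, int x_offset, int y_offset,
//       const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {
//     unsigned int sse;
//     int se = vpx_sub_pixel_variance8xh_sse2(src, src_stride, x_offset,
//                                             y_offset, dst, dst_stride, 8,
//                                             &sse, NULL, NULL);
//     *sse_ptr = sse;  // the w > wf branches compile away since w == wf
//     return sse - (unsigned int)(((int32_t)se * se) >> 6);
//   }
//
// Wider blocks (w > wf) call the 16-wide asm kernel on each 16-column strip
// and accumulate se/sse across strips before applying the same identity.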

#define FNS(opt1, opt2)                              \
  FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t));  \
  FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t));  \
  FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t));  \
  FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t));  \
  FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t));  \
  FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t));  \
  FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
  FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t));   \
  FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t));    \
  FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t));     \
  FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t));     \
  FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t));     \
  FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t))

FNS(sse2, sse2);
FNS(ssse3, ssse3);

#undef FNS
#undef FN

// The 2 unused parameters are placeholders for the PIC-enabled build. These
// variants additionally blend with the second predictor `sec` in the asm.
#define DECL(w, opt)                                                        \
  int vpx_sub_pixel_avg_variance##w##xh_##opt(                              \
      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec,         \
      ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0,   \
      void *unused)
#define DECLS(opt1, opt2) \
  DECL(4, opt1);          \
  DECL(8, opt1);          \
  DECL(16, opt1)

DECLS(sse2, sse2);
DECLS(ssse3, ssse3);
#undef DECL
#undef DECLS

#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                     \
  unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt(                  \
      const uint8_t *src, int src_stride, int x_offset, int y_offset,        \
      const uint8_t *dst, int dst_stride, unsigned int *sseptr,              \
      const uint8_t *sec) {                                                  \
    unsigned int sse;                                                        \
    int se = vpx_sub_pixel_avg_variance##wf##xh_##opt(                       \
        src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h,     \
        &sse, NULL, NULL);                                                   \
    if (w > wf) {                                                            \
      unsigned int sse2;                                                     \
      int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(                    \
          src + 16, src_stride, x_offset, y_offset, dst + 16, dst_stride,    \
          sec + 16, w, h, &sse2, NULL, NULL);                                \
      se += se2;                                                             \
      sse += sse2;                                                           \
      if (w > wf * 2) {                                                      \
        se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(                      \
            src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride,  \
            sec + 32, w, h, &sse2, NULL, NULL);                              \
        se += se2;                                                           \
        sse += sse2;                                                         \
        se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(                      \
            src + 48, src_stride, x_offset, y_offset, dst + 48, dst_stride,  \
            sec + 48, w, h, &sse2, NULL, NULL);                              \
        se += se2;                                                           \
        sse += sse2;                                                         \
      }                                                                      \
    }                                                                        \
    *sseptr = sse;                                                           \
    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
  }

#define FNS(opt1, opt2)                              \
  FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t));  \
  FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t));  \
  FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t));  \
  FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t));  \
  FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t));  \
  FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t));  \
  FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
  FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t));  \
  FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t));   \
  FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t));    \
  FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t));    \
  FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t));    \
  FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t))

FNS(sse2, sse);
FNS(ssse3, ssse3);

#undef FNS
#undef FN