/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include "./vpx_dsp_rtcd.h"

typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             unsigned int *sse, int *sum);

void vpx_get32x32var_avx2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride, unsigned int *sse,
                          int *sum);

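// Accumulate SSE and sum over a w x h block by walking it in 16-row strips
// of block_size-wide tiles and invoking the supplied AVX2 kernel on each
// tile.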
static void variance_avx2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride, int w, int h,
                          unsigned int *sse, int *sum, get_var_avx2 var_fn,
                          int block_size) {
  int i, j;

  *sse = 0;
  *sum = 0;

  for (i = 0; i < h; i += 16) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(&src[src_stride * i + j], src_stride, &ref[ref_stride * i + j],
             ref_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}

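// variance = SSE - sum^2 / N; for a 16x16 block N = 256, so the division
// becomes a right shift by 8.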
unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
                vpx_get16x16var_avx2, 16);
  return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
}

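// MSE is the raw sum of squared differences, so no mean correction is
// applied; the block sum computed by the kernel is unused.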
unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  int sum;
  vpx_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
  return *sse;
}

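// The larger full-pixel variances below tile the block with the 32-wide
// kernel and shift by log2(w * h): 9 for 32x16, 10 for 32x32, 12 for 64x64
// and 11 for 64x32.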
unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
                vpx_get32x32var_avx2, 32);
  return *sse - (uint32_t)(((int64_t)sum * sum) >> 9);
}

unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
                vpx_get32x32var_avx2, 32);
  return *sse - (uint32_t)(((int64_t)sum * sum) >> 10);
}

unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
                vpx_get32x32var_avx2, 32);
  return *sse - (uint32_t)(((int64_t)sum * sum) >> 12);
}

unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
                vpx_get32x32var_avx2, 32);
  return *sse - (uint32_t)(((int64_t)sum * sum) >> 11);
}

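// 32-pixel-wide sub-pixel kernels (declared here, defined elsewhere) that
// return the sum of differences and write the SSE for a strip of `height`
// rows.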
unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
                                             int x_offset, int y_offset,
                                             const uint8_t *dst, int dst_stride,
                                             int height, unsigned int *sse);

unsigned int vpx_sub_pixel_avg_variance32xh_avx2(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
    int height, unsigned int *sseptr);

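// A 64x64 sub-pixel variance is computed as two adjacent 32x64 halves; the
// partial sums and SSEs are combined before applying the variance formula
// (N = 4096, hence the shift by 12).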
unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src,
                                              int src_stride, int x_offset,
                                              int y_offset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  unsigned int sse1;
  const int se1 = vpx_sub_pixel_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse1);
  unsigned int sse2;
  const int se2 =
      vpx_sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset,
                                      dst + 32, dst_stride, 64, &sse2);
  const int se = se1 + se2;
  *sse = sse1 + sse2;
  return *sse - (uint32_t)(((int64_t)se * se) >> 12);
}

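// 32x32 needs only a single 32-wide call; N = 1024, hence the shift by 10.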
unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src,
                                              int src_stride, int x_offset,
                                              int y_offset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  const int se = vpx_sub_pixel_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse);
  return *sse - (uint32_t)(((int64_t)se * se) >> 10);
}

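// Same two-half split as the plain 64x64 case, with the second-prediction
// buffer sec advanced by 32 for the right half.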
unsigned int vpx_sub_pixel_avg_variance64x64_avx2(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) {
  unsigned int sse1;
  const int se1 = vpx_sub_pixel_avg_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64, &sse1);
  unsigned int sse2;
  const int se2 = vpx_sub_pixel_avg_variance32xh_avx2(
      src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32,
      64, 64, &sse2);
  const int se = se1 + se2;

  *sse = sse1 + sse2;

  return *sse - (uint32_t)(((int64_t)se * se) >> 12);
}

unsigned int vpx_sub_pixel_avg_variance32x32_avx2(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) {
  // Process 32 elements in parallel.
  const int se = vpx_sub_pixel_avg_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse);
  return *sse - (uint32_t)(((int64_t)se * se) >> 10);
}