Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <immintrin.h>
13 : #include "./aom_dsp_rtcd.h"
14 :
/* Signature shared by the per-block SSE/sum kernels that variance_avx2()
 * tiles over larger blocks: each kernel writes the sum of squared
 * differences to *sse and the (signed) sum of differences to *sum. */
typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             unsigned int *sse, int *sum);

/* Defined in the AVX2 intrinsics source; presumably computes SSE and sum
 * for one 32-wide block (name suggests 32x32 — confirm against the
 * kernel's definition, since callers below also tile it over 16 rows). */
void aom_get32x32var_avx2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride, unsigned int *sse,
                          int *sum);
22 :
23 0 : static void variance_avx2(const uint8_t *src, int src_stride,
24 : const uint8_t *ref, int ref_stride, int w, int h,
25 : unsigned int *sse, int *sum, get_var_avx2 var_fn,
26 : int block_size) {
27 : int i, j;
28 :
29 0 : *sse = 0;
30 0 : *sum = 0;
31 :
32 0 : for (i = 0; i < h; i += 16) {
33 0 : for (j = 0; j < w; j += block_size) {
34 : unsigned int sse0;
35 : int sum0;
36 0 : var_fn(&src[src_stride * i + j], src_stride, &ref[ref_stride * i + j],
37 : ref_stride, &sse0, &sum0);
38 0 : *sse += sse0;
39 0 : *sum += sum0;
40 : }
41 : }
42 0 : }
43 :
44 0 : unsigned int aom_variance16x16_avx2(const uint8_t *src, int src_stride,
45 : const uint8_t *ref, int ref_stride,
46 : unsigned int *sse) {
47 : int sum;
48 : unsigned int variance;
49 0 : variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
50 : aom_get16x16var_avx2, 16);
51 :
52 0 : variance = *sse - (((uint32_t)((int64_t)sum * sum)) >> 8);
53 : _mm256_zeroupper();
54 0 : return variance;
55 : }
56 :
/* Mean squared error of a 16x16 block: exactly the SSE term of the
 * variance, so the sum of differences is computed but discarded. */
unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               unsigned int *sse) {
  int unused_sum;
  aom_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &unused_sum);
  _mm256_zeroupper();
  return *sse;
}
65 :
66 0 : unsigned int aom_variance32x16_avx2(const uint8_t *src, int src_stride,
67 : const uint8_t *ref, int ref_stride,
68 : unsigned int *sse) {
69 : int sum;
70 : unsigned int variance;
71 0 : variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
72 : aom_get32x32var_avx2, 32);
73 :
74 0 : variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 9);
75 : _mm256_zeroupper();
76 0 : return variance;
77 : }
78 :
79 0 : unsigned int aom_variance32x32_avx2(const uint8_t *src, int src_stride,
80 : const uint8_t *ref, int ref_stride,
81 : unsigned int *sse) {
82 : int sum;
83 : unsigned int variance;
84 0 : variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum,
85 : aom_get32x32var_avx2, 32);
86 :
87 0 : variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 10);
88 : _mm256_zeroupper();
89 0 : return variance;
90 : }
91 :
92 0 : unsigned int aom_variance64x64_avx2(const uint8_t *src, int src_stride,
93 : const uint8_t *ref, int ref_stride,
94 : unsigned int *sse) {
95 : int sum;
96 : unsigned int variance;
97 0 : variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum,
98 : aom_get32x32var_avx2, 32);
99 :
100 0 : variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 12);
101 : _mm256_zeroupper();
102 0 : return variance;
103 : }
104 :
105 0 : unsigned int aom_variance64x32_avx2(const uint8_t *src, int src_stride,
106 : const uint8_t *ref, int ref_stride,
107 : unsigned int *sse) {
108 : int sum;
109 : unsigned int variance;
110 0 : variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
111 : aom_get32x32var_avx2, 32);
112 :
113 0 : variance = *sse - (uint32_t)(((int64_t)sum * sum) >> 11);
114 : _mm256_zeroupper();
115 0 : return variance;
116 : }
117 :
/* 32-wide sub-pixel variance kernel (defined in the AVX2 intrinsics
 * source): returns the sum of differences and writes the SSE to *sse for
 * a 32 x height region filtered by (x_offset, y_offset). */
unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
                                             int x_offset, int y_offset,
                                             const uint8_t *dst, int dst_stride,
                                             int height, unsigned int *sse);

/* Same as above, but the filtered prediction is first averaged with the
 * second predictor `sec` (stride sec_stride) before differencing. */
unsigned int aom_sub_pixel_avg_variance32xh_avx2(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
    int height, unsigned int *sseptr);
127 :
/* Sub-pixel variance of a 64x64 block, computed as two independent
 * 32x64 halves whose SSE and sum are then combined. */
unsigned int aom_sub_pixel_variance64x64_avx2(const uint8_t *src,
                                              int src_stride, int x_offset,
                                              int y_offset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  unsigned int sse_left, sse_right;
  const int se_left = aom_sub_pixel_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse_left);
  const int se_right = aom_sub_pixel_variance32xh_avx2(
      src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, 64,
      &sse_right);
  const int se = se_left + se_right;
  *sse = sse_left + sse_right;

  /* 64x64 = 4096 pixels, so divide se^2 by 2^12. */
  const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 12);
  _mm256_zeroupper();
  return variance;
}
148 :
/* Sub-pixel variance of a 32x32 block; handled by a single call to the
 * 32-wide kernel. */
unsigned int aom_sub_pixel_variance32x32_avx2(const uint8_t *src,
                                              int src_stride, int x_offset,
                                              int y_offset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  const int se = aom_sub_pixel_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse);

  /* 32x32 = 1024 pixels, so divide se^2 by 2^10. */
  const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10);
  _mm256_zeroupper();
  return variance;
}
161 :
/* Sub-pixel variance of a 64x64 block with second-predictor averaging,
 * computed as two independent 32x64 halves. */
unsigned int aom_sub_pixel_avg_variance64x64_avx2(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) {
  unsigned int sse_left, sse_right;
  const int se_left = aom_sub_pixel_avg_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64,
      &sse_left);
  const int se_right = aom_sub_pixel_avg_variance32xh_avx2(
      src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32,
      64, 64, &sse_right);
  const int se = se_left + se_right;
  *sse = sse_left + sse_right;

  /* 64x64 = 4096 pixels, so divide se^2 by 2^12. */
  const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 12);
  _mm256_zeroupper();
  return variance;
}
181 :
/* Sub-pixel variance of a 32x32 block with second-predictor averaging;
 * a single call to the 32-wide kernel covers the whole block. */
unsigned int aom_sub_pixel_avg_variance32x32_avx2(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) {
  // Process 32 elements in parallel.
  const int se = aom_sub_pixel_avg_variance32xh_avx2(
      src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse);

  /* 32x32 = 1024 pixels, so divide se^2 by 2^10. */
  const unsigned int variance = *sse - (uint32_t)(((int64_t)se * se) >> 10);
  _mm256_zeroupper();
  return variance;
}
|