Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <smmintrin.h> /* SSE4.1 */
13 :
14 : #include "./aom_config.h"
15 : #include "./aom_dsp_rtcd.h"
16 :
17 : #include "aom_dsp/variance.h"
18 : #include "aom_dsp/aom_filter.h"
19 :
20 0 : static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride,
21 : const uint8_t *b8, int b_stride,
22 : uint64_t *sse, int64_t *sum) {
23 : __m128i u0, u1, u2, u3;
24 : __m128i s0, s1, s2, s3;
25 : __m128i t0, t1, x0, y0;
26 : __m128i a0, a1, a2, a3;
27 : __m128i b0, b1, b2, b3;
28 0 : __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1);
29 :
30 0 : uint16_t *a = CONVERT_TO_SHORTPTR(a8);
31 0 : uint16_t *b = CONVERT_TO_SHORTPTR(b8);
32 :
33 0 : a0 = _mm_loadl_epi64((__m128i const *)(a + 0 * a_stride));
34 0 : a1 = _mm_loadl_epi64((__m128i const *)(a + 1 * a_stride));
35 0 : a2 = _mm_loadl_epi64((__m128i const *)(a + 2 * a_stride));
36 0 : a3 = _mm_loadl_epi64((__m128i const *)(a + 3 * a_stride));
37 :
38 0 : b0 = _mm_loadl_epi64((__m128i const *)(b + 0 * b_stride));
39 0 : b1 = _mm_loadl_epi64((__m128i const *)(b + 1 * b_stride));
40 0 : b2 = _mm_loadl_epi64((__m128i const *)(b + 2 * b_stride));
41 0 : b3 = _mm_loadl_epi64((__m128i const *)(b + 3 * b_stride));
42 :
43 0 : u0 = _mm_unpacklo_epi16(a0, a1);
44 0 : u1 = _mm_unpacklo_epi16(a2, a3);
45 0 : u2 = _mm_unpacklo_epi16(b0, b1);
46 0 : u3 = _mm_unpacklo_epi16(b2, b3);
47 :
48 0 : s0 = _mm_sub_epi16(u0, u2);
49 0 : s1 = _mm_sub_epi16(u1, u3);
50 :
51 0 : t0 = _mm_madd_epi16(s0, k_one_epi16);
52 0 : t1 = _mm_madd_epi16(s1, k_one_epi16);
53 :
54 0 : s2 = _mm_hadd_epi32(t0, t1);
55 0 : s3 = _mm_hadd_epi32(s2, s2);
56 0 : y0 = _mm_hadd_epi32(s3, s3);
57 :
58 0 : t0 = _mm_madd_epi16(s0, s0);
59 0 : t1 = _mm_madd_epi16(s1, s1);
60 :
61 0 : s2 = _mm_hadd_epi32(t0, t1);
62 0 : s3 = _mm_hadd_epi32(s2, s2);
63 0 : x0 = _mm_hadd_epi32(s3, s3);
64 :
65 0 : *sse = (uint64_t)_mm_extract_epi32(x0, 0);
66 0 : *sum = (int64_t)_mm_extract_epi32(y0, 0);
67 0 : }
68 :
/*
 * 4x4 variance for the "highbd 8" variant.
 *
 * Stores the sum of squared differences (truncated to 32 bits) in *sse and
 * returns sse - sum^2 / 16, clamped so the result is never negative.
 */
uint32_t aom_highbd_8_variance4x4_sse4_1(const uint8_t *a, int a_stride,
                                         const uint8_t *b, int b_stride,
                                         uint32_t *sse) {
  uint64_t sse64;
  int64_t sum64;
  int64_t var;

  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &sse64, &sum64);
  *sse = (uint32_t)sse64;

  // >> 4 divides the squared sum by 16, the sample count of a 4x4 block.
  var = (int64_t)*sse - ((sum64 * sum64) >> 4);
  if (var < 0) return 0;
  return (uint32_t)var;
}
81 :
/*
 * 4x4 variance for the "highbd 10" variant.
 *
 * The raw SSE is rounded down by 4 bits and the raw sum by 2 bits before the
 * variance is formed (presumably to express 10-bit statistics on the 8-bit
 * scale - confirm against the reference C implementation). The scaled SSE is
 * stored in *sse; the return value is sse - sum^2 / 16, clamped at zero.
 */
uint32_t aom_highbd_10_variance4x4_sse4_1(const uint8_t *a, int a_stride,
                                          const uint8_t *b, int b_stride,
                                          uint32_t *sse) {
  uint64_t raw_sse;
  int64_t block_sum;
  int64_t var;

  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &raw_sse, &block_sum);

  *sse = (uint32_t)ROUND_POWER_OF_TWO(raw_sse, 4);
  block_sum = ROUND_POWER_OF_TWO(block_sum, 2);

  // >> 4 divides the squared sum by 16, the sample count of a 4x4 block.
  var = (int64_t)*sse - ((block_sum * block_sum) >> 4);
  if (var < 0) return 0;
  return (uint32_t)var;
}
95 :
/*
 * 4x4 variance for the "highbd 12" variant.
 *
 * The raw SSE is rounded down by 8 bits and the raw sum by 4 bits before the
 * variance is formed (presumably to express 12-bit statistics on the 8-bit
 * scale - confirm against the reference C implementation). The scaled SSE is
 * stored in *sse; the return value is sse - sum^2 / 16, clamped at zero.
 */
uint32_t aom_highbd_12_variance4x4_sse4_1(const uint8_t *a, int a_stride,
                                          const uint8_t *b, int b_stride,
                                          uint32_t *sse) {
  uint64_t raw_sse;
  int64_t block_sum;
  int64_t var;

  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &raw_sse, &block_sum);

  *sse = (uint32_t)ROUND_POWER_OF_TWO(raw_sse, 8);
  block_sum = ROUND_POWER_OF_TWO(block_sum, 4);

  // >> 4 divides the squared sum by 16, the sample count of a 4x4 block.
  var = (int64_t)*sse - ((block_sum * block_sum) >> 4);
  if (var < 0) return 0;
  return (uint32_t)var;
}
109 :
110 : // Sub-pixel
111 0 : uint32_t aom_highbd_8_sub_pixel_variance4x4_sse4_1(
112 : const uint8_t *src, int src_stride, int xoffset, int yoffset,
113 : const uint8_t *dst, int dst_stride, uint32_t *sse) {
114 : uint16_t fdata3[(4 + 1) * 4];
115 : uint16_t temp2[4 * 4];
116 :
117 0 : aom_highbd_var_filter_block2d_bil_first_pass(
118 0 : src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
119 0 : aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
120 0 : bilinear_filters_2t[yoffset]);
121 :
122 0 : return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, dst_stride,
123 : sse);
124 : }
125 :
126 0 : uint32_t aom_highbd_10_sub_pixel_variance4x4_sse4_1(
127 : const uint8_t *src, int src_stride, int xoffset, int yoffset,
128 : const uint8_t *dst, int dst_stride, uint32_t *sse) {
129 : uint16_t fdata3[(4 + 1) * 4];
130 : uint16_t temp2[4 * 4];
131 :
132 0 : aom_highbd_var_filter_block2d_bil_first_pass(
133 0 : src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
134 0 : aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
135 0 : bilinear_filters_2t[yoffset]);
136 :
137 0 : return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
138 : dst_stride, sse);
139 : }
140 :
141 0 : uint32_t aom_highbd_12_sub_pixel_variance4x4_sse4_1(
142 : const uint8_t *src, int src_stride, int xoffset, int yoffset,
143 : const uint8_t *dst, int dst_stride, uint32_t *sse) {
144 : uint16_t fdata3[(4 + 1) * 4];
145 : uint16_t temp2[4 * 4];
146 :
147 0 : aom_highbd_var_filter_block2d_bil_first_pass(
148 0 : src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
149 0 : aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
150 0 : bilinear_filters_2t[yoffset]);
151 :
152 0 : return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
153 : dst_stride, sse);
154 : }
155 :
156 : // Sub-pixel average
157 :
158 0 : uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1(
159 : const uint8_t *src, int src_stride, int xoffset, int yoffset,
160 : const uint8_t *dst, int dst_stride, uint32_t *sse,
161 : const uint8_t *second_pred) {
162 : uint16_t fdata3[(4 + 1) * 4];
163 : uint16_t temp2[4 * 4];
164 : DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
165 :
166 0 : aom_highbd_var_filter_block2d_bil_first_pass(
167 0 : src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
168 0 : aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
169 0 : bilinear_filters_2t[yoffset]);
170 :
171 0 : aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
172 : 4);
173 :
174 0 : return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride,
175 : sse);
176 : }
177 :
178 0 : uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1(
179 : const uint8_t *src, int src_stride, int xoffset, int yoffset,
180 : const uint8_t *dst, int dst_stride, uint32_t *sse,
181 : const uint8_t *second_pred) {
182 : uint16_t fdata3[(4 + 1) * 4];
183 : uint16_t temp2[4 * 4];
184 : DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
185 :
186 0 : aom_highbd_var_filter_block2d_bil_first_pass(
187 0 : src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
188 0 : aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
189 0 : bilinear_filters_2t[yoffset]);
190 :
191 0 : aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
192 : 4);
193 :
194 0 : return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
195 : dst_stride, sse);
196 : }
197 :
198 0 : uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1(
199 : const uint8_t *src, int src_stride, int xoffset, int yoffset,
200 : const uint8_t *dst, int dst_stride, uint32_t *sse,
201 : const uint8_t *second_pred) {
202 : uint16_t fdata3[(4 + 1) * 4];
203 : uint16_t temp2[4 * 4];
204 : DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
205 :
206 0 : aom_highbd_var_filter_block2d_bil_first_pass(
207 0 : src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
208 0 : aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
209 0 : bilinear_filters_2t[yoffset]);
210 :
211 0 : aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
212 : 4);
213 :
214 0 : return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
215 : dst_stride, sse);
216 : }
|