Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <assert.h>
13 : #include <immintrin.h>
14 :
15 : #include "./aom_config.h"
16 : #include "aom_ports/mem.h"
17 : #include "aom/aom_integer.h"
18 :
19 : #include "aom_dsp/aom_dsp_common.h"
20 : #include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
21 : #include "aom_dsp/x86/synonyms.h"
22 :
23 : ////////////////////////////////////////////////////////////////////////////////
24 : // 8 bit
25 : ////////////////////////////////////////////////////////////////////////////////
26 :
27 0 : static INLINE unsigned int obmc_sad_w4(const uint8_t *pre, const int pre_stride,
28 : const int32_t *wsrc, const int32_t *mask,
29 : const int height) {
30 0 : const int pre_step = pre_stride - 4;
31 0 : int n = 0;
32 0 : __m128i v_sad_d = _mm_setzero_si128();
33 :
34 : do {
35 0 : const __m128i v_p_b = xx_loadl_32(pre + n);
36 0 : const __m128i v_m_d = xx_load_128(mask + n);
37 0 : const __m128i v_w_d = xx_load_128(wsrc + n);
38 :
39 0 : const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
40 :
41 : // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
42 : // boundaries. We use pmaddwd, as it has lower latency on Haswell
43 : // than pmulld but produces the same result with these inputs.
44 0 : const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
45 :
46 0 : const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
47 0 : const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
48 :
49 : // Rounded absolute difference
50 0 : const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
51 :
52 0 : v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
53 :
54 0 : n += 4;
55 :
56 0 : if (n % 4 == 0) pre += pre_step;
57 0 : } while (n < 4 * height);
58 :
59 0 : return xx_hsum_epi32_si32(v_sad_d);
60 : }
61 :
62 0 : static INLINE unsigned int obmc_sad_w8n(const uint8_t *pre,
63 : const int pre_stride,
64 : const int32_t *wsrc,
65 : const int32_t *mask, const int width,
66 : const int height) {
67 0 : const int pre_step = pre_stride - width;
68 0 : int n = 0;
69 0 : __m128i v_sad_d = _mm_setzero_si128();
70 :
71 0 : assert(width >= 8);
72 0 : assert(IS_POWER_OF_TWO(width));
73 :
74 : do {
75 0 : const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
76 0 : const __m128i v_m1_d = xx_load_128(mask + n + 4);
77 0 : const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
78 0 : const __m128i v_p0_b = xx_loadl_32(pre + n);
79 0 : const __m128i v_m0_d = xx_load_128(mask + n);
80 0 : const __m128i v_w0_d = xx_load_128(wsrc + n);
81 :
82 0 : const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
83 0 : const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
84 :
85 : // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
86 : // boundaries. We use pmaddwd, as it has lower latency on Haswell
87 : // than pmulld but produces the same result with these inputs.
88 0 : const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
89 0 : const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
90 :
91 0 : const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
92 0 : const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
93 0 : const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
94 0 : const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
95 :
96 : // Rounded absolute difference
97 0 : const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
98 0 : const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
99 :
100 0 : v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
101 0 : v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
102 :
103 0 : n += 8;
104 :
105 0 : if (n % width == 0) pre += pre_step;
106 0 : } while (n < width * height);
107 :
108 0 : return xx_hsum_epi32_si32(v_sad_d);
109 : }
110 :
// Emits the public SSE4.1 OBMC SAD entry point for a w x h block.  The
// 4-wide case dispatches to the dedicated single-column kernel; all other
// widths use the generic multiple-of-8 kernel.  Since w is a compile-time
// constant, the branch is folded away.
// NOTE: the parameter is named `mask` (was `msk`) to match the high
// bit-depth variant below; callers are unaffected by the rename.
#define OBMCSADWXH(w, h)                                          \
  unsigned int aom_obmc_sad##w##x##h##_sse4_1(                    \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,    \
      const int32_t *mask) {                                      \
    if (w == 4) {                                                 \
      return obmc_sad_w4(pre, pre_stride, wsrc, mask, h);         \
    } else {                                                      \
      return obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h);     \
    }                                                             \
  }

#if CONFIG_EXT_PARTITION
OBMCSADWXH(128, 128)
OBMCSADWXH(128, 64)
OBMCSADWXH(64, 128)
#endif  // CONFIG_EXT_PARTITION
OBMCSADWXH(64, 64)
OBMCSADWXH(64, 32)
OBMCSADWXH(32, 64)
OBMCSADWXH(32, 32)
OBMCSADWXH(32, 16)
OBMCSADWXH(16, 32)
OBMCSADWXH(16, 16)
OBMCSADWXH(16, 8)
OBMCSADWXH(8, 16)
OBMCSADWXH(8, 8)
OBMCSADWXH(8, 4)
OBMCSADWXH(4, 8)
OBMCSADWXH(4, 4)
140 :
141 : ////////////////////////////////////////////////////////////////////////////////
142 : // High bit-depth
143 : ////////////////////////////////////////////////////////////////////////////////
144 :
145 : #if CONFIG_HIGHBITDEPTH
146 0 : static INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8,
147 : const int pre_stride,
148 : const int32_t *wsrc,
149 : const int32_t *mask,
150 : const int height) {
151 0 : const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
152 0 : const int pre_step = pre_stride - 4;
153 0 : int n = 0;
154 0 : __m128i v_sad_d = _mm_setzero_si128();
155 :
156 : do {
157 0 : const __m128i v_p_w = xx_loadl_64(pre + n);
158 0 : const __m128i v_m_d = xx_load_128(mask + n);
159 0 : const __m128i v_w_d = xx_load_128(wsrc + n);
160 :
161 0 : const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);
162 :
163 : // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
164 : // boundaries. We use pmaddwd, as it has lower latency on Haswell
165 : // than pmulld but produces the same result with these inputs.
166 0 : const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
167 :
168 0 : const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
169 0 : const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
170 :
171 : // Rounded absolute difference
172 0 : const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
173 :
174 0 : v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
175 :
176 0 : n += 4;
177 :
178 0 : if (n % 4 == 0) pre += pre_step;
179 0 : } while (n < 4 * height);
180 :
181 0 : return xx_hsum_epi32_si32(v_sad_d);
182 : }
183 :
// High bit-depth version of obmc_sad_w8n: computes the OBMC SAD for blocks
// at least 8 pixels wide (width must be a power of two), operating on
// 16-bit samples.  Each iteration consumes eight pixels as two 4-lane
// halves.  Returns sum over all pixels of ROUND(|wsrc - pre * mask|, 12).
static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8,
                                            const int pre_stride,
                                            const int32_t *wsrc,
                                            const int32_t *mask,
                                            const int width, const int height) {
  // pre8 is a CONVERT_TO_BYTEPTR-style pointer; recover the uint16_t view.
  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
  const int pre_step = pre_stride - width;
  int n = 0;
  __m128i v_sad_d = _mm_setzero_si128();

  assert(width >= 8);
  assert(IS_POWER_OF_TWO(width));

  do {
    // Load both 4-lane halves of the current 8-pixel group.
    const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
    const __m128i v_m1_d = xx_load_128(mask + n + 4);
    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
    const __m128i v_p0_w = xx_loadl_64(pre + n);
    const __m128i v_m0_d = xx_load_128(mask + n);
    const __m128i v_w0_d = xx_load_128(wsrc + n);

    // Widen the 16-bit prediction samples to 32-bit lanes.
    const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
    const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);

    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
    const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
    const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);

    // Rounded absolute difference
    const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
    const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);

    v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
    v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);

    n += 8;

    // Step over the stride padding once a full row has been consumed.
    if (n % width == 0) pre += pre_step;
  } while (n < width * height);

  return xx_hsum_epi32_si32(v_sad_d);
}
233 :
// Emits the public SSE4.1 high bit-depth OBMC SAD entry point for a w x h
// block.  The 4-wide case dispatches to the dedicated single-column kernel;
// all other widths use the generic multiple-of-8 kernel.  Since w is a
// compile-time constant, the branch is folded away.
#define HBD_OBMCSADWXH(w, h)                                      \
  unsigned int aom_highbd_obmc_sad##w##x##h##_sse4_1(             \
      const uint8_t *pre, int pre_stride, const int32_t *wsrc,    \
      const int32_t *mask) {                                      \
    if (w == 4) {                                                 \
      return hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, h);     \
    } else {                                                      \
      return hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h); \
    }                                                             \
  }

#if CONFIG_EXT_PARTITION
HBD_OBMCSADWXH(128, 128)
HBD_OBMCSADWXH(128, 64)
HBD_OBMCSADWXH(64, 128)
#endif  // CONFIG_EXT_PARTITION
HBD_OBMCSADWXH(64, 64)
HBD_OBMCSADWXH(64, 32)
HBD_OBMCSADWXH(32, 64)
HBD_OBMCSADWXH(32, 32)
HBD_OBMCSADWXH(32, 16)
HBD_OBMCSADWXH(16, 32)
HBD_OBMCSADWXH(16, 16)
HBD_OBMCSADWXH(16, 8)
HBD_OBMCSADWXH(8, 16)
HBD_OBMCSADWXH(8, 8)
HBD_OBMCSADWXH(8, 4)
HBD_OBMCSADWXH(4, 8)
HBD_OBMCSADWXH(4, 4)
263 : #endif // CONFIG_HIGHBITDEPTH
|