Line data Source code
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11 :
12 : #include <assert.h>
13 : #include <emmintrin.h>
14 : #include <stddef.h>
15 :
16 : #include "./aom_config.h"
17 : #include "./aom_dsp_rtcd.h"
18 :
/*
 * Common signature of the fixed-size subtract kernels in this file.
 * Each stride is added to its pointer as an element offset (not bytes) to
 * move from one row to the next.
 */
typedef void (*SubtractWxHFuncType)(int16_t *diff,
                                    ptrdiff_t diff_stride,
                                    const uint16_t *src,
                                    ptrdiff_t src_stride,
                                    const uint16_t *pred,
                                    ptrdiff_t pred_stride);
23 :
/*
 * Writes diff = src - pred for a block 4 samples wide and 4 rows tall.
 *
 * diff/src/pred point at the top-left sample of each plane; the strides are
 * element offsets between consecutive rows. The subtraction is done on
 * 16-bit lanes and wraps modulo 2^16.
 *
 * Only 64 bits (four 16-bit samples) are loaded and stored per row, so the
 * kernel never touches memory to the right of the 4-wide block.
 */
static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride,
                         const uint16_t *src, ptrdiff_t src_stride,
                         const uint16_t *pred, ptrdiff_t pred_stride) {
  __m128i delta[4];
  int r;

  /* All loads are issued before any store, row by row. */
  for (r = 0; r < 4; ++r) {
    const __m128i s = _mm_loadl_epi64((const __m128i *)(src + r * src_stride));
    const __m128i p =
        _mm_loadl_epi64((const __m128i *)(pred + r * pred_stride));
    delta[r] = _mm_sub_epi16(s, p);
  }

  /* The low 64 bits of each result hold the four differences of that row. */
  for (r = 0; r < 4; ++r) {
    _mm_storel_epi64((__m128i *)(diff + r * diff_stride), delta[r]);
  }
}
55 :
/*
 * Writes diff = src - pred for a block 4 samples wide and 8 rows tall.
 *
 * diff/src/pred point at the top-left sample of each plane; the strides are
 * element offsets between consecutive rows. The subtraction is done on
 * 16-bit lanes and wraps modulo 2^16.
 *
 * Only 64 bits (four 16-bit samples) are loaded and stored per row, so the
 * kernel never touches memory to the right of the 4-wide block.
 */
static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride,
                         const uint16_t *src, ptrdiff_t src_stride,
                         const uint16_t *pred, ptrdiff_t pred_stride) {
  __m128i delta[8];
  int r;

  /* All loads are issued before any store, row by row. */
  for (r = 0; r < 8; ++r) {
    const __m128i s = _mm_loadl_epi64((const __m128i *)(src + r * src_stride));
    const __m128i p =
        _mm_loadl_epi64((const __m128i *)(pred + r * pred_stride));
    delta[r] = _mm_sub_epi16(s, p);
  }

  /* The low 64 bits of each result hold the four differences of that row. */
  for (r = 0; r < 8; ++r) {
    _mm_storel_epi64((__m128i *)(diff + r * diff_stride), delta[r]);
  }
}
107 :
/*
 * Writes diff = src - pred for a block 8 samples wide and 4 rows tall.
 * Each row is one unaligned 128-bit load from src and pred, a 16-bit lane
 * subtraction, and one unaligned 128-bit store to diff. Strides are element
 * offsets between rows.
 */
static void subtract_8x4(int16_t *diff, ptrdiff_t diff_stride,
                         const uint16_t *src, ptrdiff_t src_stride,
                         const uint16_t *pred, ptrdiff_t pred_stride) {
  __m128i src_row[4];
  __m128i pred_row[4];
  __m128i delta[4];
  int r;

  for (r = 0; r < 4; ++r) {
    src_row[r] = _mm_loadu_si128((const __m128i *)(src + r * src_stride));
  }
  for (r = 0; r < 4; ++r) {
    pred_row[r] = _mm_loadu_si128((const __m128i *)(pred + r * pred_stride));
  }
  for (r = 0; r < 4; ++r) {
    delta[r] = _mm_sub_epi16(src_row[r], pred_row[r]);
  }
  /* Stores happen only after every row has been read. */
  for (r = 0; r < 4; ++r) {
    _mm_storeu_si128((__m128i *)(diff + r * diff_stride), delta[r]);
  }
}
135 :
/*
 * Writes diff = src - pred for a block 8 samples wide and 8 rows tall.
 * Each row is one unaligned 128-bit load from src and pred, a 16-bit lane
 * subtraction, and one unaligned 128-bit store to diff. Strides are element
 * offsets between rows.
 */
static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride,
                         const uint16_t *src, ptrdiff_t src_stride,
                         const uint16_t *pred, ptrdiff_t pred_stride) {
  __m128i src_row[8];
  __m128i pred_row[8];
  __m128i delta[8];
  int r;

  for (r = 0; r < 8; ++r) {
    src_row[r] = _mm_loadu_si128((const __m128i *)(src + r * src_stride));
  }
  for (r = 0; r < 8; ++r) {
    pred_row[r] = _mm_loadu_si128((const __m128i *)(pred + r * pred_stride));
  }
  for (r = 0; r < 8; ++r) {
    delta[r] = _mm_sub_epi16(src_row[r], pred_row[r]);
  }
  /* Stores happen only after every row has been read. */
  for (r = 0; r < 8; ++r) {
    _mm_storeu_si128((__m128i *)(diff + r * diff_stride), delta[r]);
  }
}
179 :
/* 8 wide, 16 tall: an 8x8 block on top of a second 8x8 block 8 rows down. */
static void subtract_8x16(int16_t *diff, ptrdiff_t diff_stride,
                          const uint16_t *src, ptrdiff_t src_stride,
                          const uint16_t *pred, ptrdiff_t pred_stride) {
  const ptrdiff_t row_step = 8;

  subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
  subtract_8x8(diff + row_step * diff_stride, diff_stride,
               src + row_step * src_stride, src_stride,
               pred + row_step * pred_stride, pred_stride);
}
189 :
/* 16 wide, 8 tall: an 8x8 block followed by a second one 8 columns right. */
static void subtract_16x8(int16_t *diff, ptrdiff_t diff_stride,
                          const uint16_t *src, ptrdiff_t src_stride,
                          const uint16_t *pred, ptrdiff_t pred_stride) {
  const ptrdiff_t col_step = 8;

  subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
  subtract_8x8(diff + col_step, diff_stride, src + col_step, src_stride,
               pred + col_step, pred_stride);
}
199 :
/* 16 wide, 16 tall: a 16x8 block on top of a second 16x8 block 8 rows down. */
static void subtract_16x16(int16_t *diff, ptrdiff_t diff_stride,
                           const uint16_t *src, ptrdiff_t src_stride,
                           const uint16_t *pred, ptrdiff_t pred_stride) {
  const ptrdiff_t row_step = 8;

  subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride);
  subtract_16x8(diff + row_step * diff_stride, diff_stride,
                src + row_step * src_stride, src_stride,
                pred + row_step * pred_stride, pred_stride);
}
209 :
/* 16 wide, 32 tall: a 16x16 block on top of a second one 16 rows down. */
static void subtract_16x32(int16_t *diff, ptrdiff_t diff_stride,
                           const uint16_t *src, ptrdiff_t src_stride,
                           const uint16_t *pred, ptrdiff_t pred_stride) {
  const ptrdiff_t row_step = 16;

  subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
  subtract_16x16(diff + row_step * diff_stride, diff_stride,
                 src + row_step * src_stride, src_stride,
                 pred + row_step * pred_stride, pred_stride);
}
219 :
/* 32 wide, 16 tall: a 16x16 block followed by a second one 16 columns right. */
static void subtract_32x16(int16_t *diff, ptrdiff_t diff_stride,
                           const uint16_t *src, ptrdiff_t src_stride,
                           const uint16_t *pred, ptrdiff_t pred_stride) {
  const ptrdiff_t col_step = 16;

  subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
  subtract_16x16(diff + col_step, diff_stride, src + col_step, src_stride,
                 pred + col_step, pred_stride);
}
229 :
/* 32 wide, 32 tall: a 32x16 block on top of a second one 16 rows down. */
static void subtract_32x32(int16_t *diff, ptrdiff_t diff_stride,
                           const uint16_t *src, ptrdiff_t src_stride,
                           const uint16_t *pred, ptrdiff_t pred_stride) {
  const ptrdiff_t row_step = 16;

  subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride);
  subtract_32x16(diff + row_step * diff_stride, diff_stride,
                 src + row_step * src_stride, src_stride,
                 pred + row_step * pred_stride, pred_stride);
}
239 :
/* 32 wide, 64 tall: a 32x32 block on top of a second one 32 rows down. */
static void subtract_32x64(int16_t *diff, ptrdiff_t diff_stride,
                           const uint16_t *src, ptrdiff_t src_stride,
                           const uint16_t *pred, ptrdiff_t pred_stride) {
  const ptrdiff_t row_step = 32;

  subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
  subtract_32x32(diff + row_step * diff_stride, diff_stride,
                 src + row_step * src_stride, src_stride,
                 pred + row_step * pred_stride, pred_stride);
}
249 :
/* 64 wide, 32 tall: a 32x32 block followed by a second one 32 columns right. */
static void subtract_64x32(int16_t *diff, ptrdiff_t diff_stride,
                           const uint16_t *src, ptrdiff_t src_stride,
                           const uint16_t *pred, ptrdiff_t pred_stride) {
  const ptrdiff_t col_step = 32;

  subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
  subtract_32x32(diff + col_step, diff_stride, src + col_step, src_stride,
                 pred + col_step, pred_stride);
}
259 :
/* 64 wide, 64 tall: a 64x32 block on top of a second one 32 rows down. */
static void subtract_64x64(int16_t *diff, ptrdiff_t diff_stride,
                           const uint16_t *src, ptrdiff_t src_stride,
                           const uint16_t *pred, ptrdiff_t pred_stride) {
  const ptrdiff_t row_step = 32;

  subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride);
  subtract_64x32(diff + row_step * diff_stride, diff_stride,
                 src + row_step * src_stride, src_stride,
                 pred + row_step * pred_stride, pred_stride);
}
269 :
/* 64 wide, 128 tall: a 64x64 block on top of a second one 64 rows down. */
static void subtract_64x128(int16_t *diff, ptrdiff_t diff_stride,
                            const uint16_t *src, ptrdiff_t src_stride,
                            const uint16_t *pred, ptrdiff_t pred_stride) {
  const ptrdiff_t row_step = 64;

  subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
  subtract_64x64(diff + row_step * diff_stride, diff_stride,
                 src + row_step * src_stride, src_stride,
                 pred + row_step * pred_stride, pred_stride);
}
279 :
/* 128 wide, 64 tall: a 64x64 block followed by a second one 64 columns right. */
static void subtract_128x64(int16_t *diff, ptrdiff_t diff_stride,
                            const uint16_t *src, ptrdiff_t src_stride,
                            const uint16_t *pred, ptrdiff_t pred_stride) {
  const ptrdiff_t col_step = 64;

  subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
  subtract_64x64(diff + col_step, diff_stride, src + col_step, src_stride,
                 pred + col_step, pred_stride);
}
289 :
/* 128 wide, 128 tall: a 128x64 block on top of a second one 64 rows down. */
static void subtract_128x128(int16_t *diff, ptrdiff_t diff_stride,
                             const uint16_t *src, ptrdiff_t src_stride,
                             const uint16_t *pred, ptrdiff_t pred_stride) {
  const ptrdiff_t row_step = 64;

  subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride);
  subtract_128x64(diff + row_step * diff_stride, diff_stride,
                  src + row_step * src_stride, src_stride,
                  pred + row_step * pred_stride, pred_stride);
}
299 :
300 0 : static SubtractWxHFuncType getSubtractFunc(int rows, int cols) {
301 0 : SubtractWxHFuncType ret_func_ptr = NULL;
302 0 : if (rows == 4) {
303 0 : if (cols == 4) {
304 0 : ret_func_ptr = subtract_4x4;
305 0 : } else if (cols == 8) {
306 0 : ret_func_ptr = subtract_8x4;
307 : }
308 0 : } else if (rows == 8) {
309 0 : if (cols == 4) {
310 0 : ret_func_ptr = subtract_4x8;
311 0 : } else if (cols == 8) {
312 0 : ret_func_ptr = subtract_8x8;
313 0 : } else if (cols == 16) {
314 0 : ret_func_ptr = subtract_16x8;
315 : }
316 0 : } else if (rows == 16) {
317 0 : if (cols == 8) {
318 0 : ret_func_ptr = subtract_8x16;
319 0 : } else if (cols == 16) {
320 0 : ret_func_ptr = subtract_16x16;
321 0 : } else if (cols == 32) {
322 0 : ret_func_ptr = subtract_32x16;
323 : }
324 0 : } else if (rows == 32) {
325 0 : if (cols == 16) {
326 0 : ret_func_ptr = subtract_16x32;
327 0 : } else if (cols == 32) {
328 0 : ret_func_ptr = subtract_32x32;
329 0 : } else if (cols == 64) {
330 0 : ret_func_ptr = subtract_64x32;
331 : }
332 0 : } else if (rows == 64) {
333 0 : if (cols == 32) {
334 0 : ret_func_ptr = subtract_32x64;
335 0 : } else if (cols == 64) {
336 0 : ret_func_ptr = subtract_64x64;
337 0 : } else if (cols == 128) {
338 0 : ret_func_ptr = subtract_128x64;
339 : }
340 0 : } else if (rows == 128) {
341 0 : if (cols == 64) {
342 0 : ret_func_ptr = subtract_64x128;
343 0 : } else if (cols == 128) {
344 0 : ret_func_ptr = subtract_128x128;
345 : }
346 : }
347 0 : if (!ret_func_ptr) {
348 0 : assert(0);
349 : }
350 0 : return ret_func_ptr;
351 : }
352 :
353 0 : void aom_highbd_subtract_block_sse2(int rows, int cols, int16_t *diff,
354 : ptrdiff_t diff_stride, const uint8_t *src8,
355 : ptrdiff_t src_stride, const uint8_t *pred8,
356 : ptrdiff_t pred_stride, int bd) {
357 0 : uint16_t *src = CONVERT_TO_SHORTPTR(src8);
358 0 : uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
359 : SubtractWxHFuncType func;
360 : (void)bd;
361 :
362 0 : func = getSubtractFunc(rows, cols);
363 0 : func(diff, diff_stride, src, src_stride, pred, pred_stride);
364 0 : }
|