Line data Source code
1 : /*
2 : * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 : *
4 : * Use of this source code is governed by a BSD-style license
5 : * that can be found in the LICENSE file in the root of the source
6 : * tree. An additional intellectual property rights grant can be found
7 : * in the file PATENTS. All contributing project authors may
8 : * be found in the AUTHORS file in the root of the source tree.
9 : */
10 :
11 : #include "vpx_config.h"
12 : #include "vp8_rtcd.h"
13 : #include "vpx_ports/mem.h"
14 : #include "filter_x86.h"
15 :
/*
 * Filter tap table indexed by the x/y sub-pixel offset, followed by the
 * prototypes of the MMX/SSE2 filter kernels called by the wrappers below.
 * None of these are defined in this file.
 */
extern const short vp8_six_tap_x86[8][6 * 8];

/* MMX kernels: horizontal pass into a 16-bit buffer, vertical pass out of it.
 */
extern void vp8_filter_block1d_h6_mmx(unsigned char *src_ptr,
                                      unsigned short *output_ptr,
                                      unsigned int src_pixels_per_line,
                                      unsigned int pixel_step,
                                      unsigned int output_height,
                                      unsigned int output_width,
                                      const short *vp8_filter);
extern void vp8_filter_block1dc_v6_mmx(
    unsigned short *src_ptr, unsigned char *output_ptr, int output_pitch,
    unsigned int pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);

/* SSE2 two-pass kernels: 8- and 16-wide horizontal passes into a 16-bit
 * buffer, and the matching vertical passes from that buffer to 8-bit output.
 */
extern void vp8_filter_block1d8_h6_sse2(unsigned char *src_ptr,
                                        unsigned short *output_ptr,
                                        unsigned int src_pixels_per_line,
                                        unsigned int pixel_step,
                                        unsigned int output_height,
                                        unsigned int output_width,
                                        const short *vp8_filter);
extern void vp8_filter_block1d16_h6_sse2(unsigned char *src_ptr,
                                         unsigned short *output_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned int pixel_step,
                                         unsigned int output_height,
                                         unsigned int output_width,
                                         const short *vp8_filter);
extern void vp8_filter_block1d8_v6_sse2(
    unsigned short *src_ptr, unsigned char *output_ptr, int dst_pitch,
    unsigned int pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);
extern void vp8_filter_block1d16_v6_sse2(
    unsigned short *src_ptr, unsigned char *output_ptr, int dst_pitch,
    unsigned int pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const short *vp8_filter);

/* Takes no filter argument; writes 16-bit data for the vertical kernel. */
extern void vp8_unpack_block1d16_h6_sse2(unsigned char *src_ptr,
                                         unsigned short *output_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned int output_height,
                                         unsigned int output_width);

/* SSE2 single-pass kernels: 8-bit source straight to 8-bit destination. */
extern void vp8_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,
                                             unsigned int src_pixels_per_line,
                                             unsigned char *output_ptr,
                                             int dst_pitch,
                                             unsigned int output_height,
                                             const short *vp8_filter);
extern void vp8_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,
                                              unsigned int src_pixels_per_line,
                                              unsigned char *output_ptr,
                                              int dst_pitch,
                                              unsigned int output_height,
                                              const short *vp8_filter);
extern void vp8_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,
                                             unsigned int src_pixels_per_line,
                                             unsigned char *output_ptr,
                                             int dst_pitch,
                                             unsigned int output_height,
                                             const short *vp8_filter);
77 :
78 : #if HAVE_MMX
79 0 : void vp8_sixtap_predict4x4_mmx(unsigned char *src_ptr, int src_pixels_per_line,
80 : int xoffset, int yoffset, unsigned char *dst_ptr,
81 : int dst_pitch) {
82 : DECLARE_ALIGNED(16, unsigned short,
83 : FData2[16 * 16]); /* Temp data bufffer used in filtering */
84 : const short *HFilter, *VFilter;
85 0 : HFilter = vp8_six_tap_x86[xoffset];
86 0 : vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2,
87 : src_pixels_per_line, 1, 9, 8, HFilter);
88 0 : VFilter = vp8_six_tap_x86[yoffset];
89 0 : vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4, 4, 4,
90 : VFilter);
91 0 : }
92 : #endif
93 :
94 : #if HAVE_SSE2
95 0 : void vp8_sixtap_predict16x16_sse2(unsigned char *src_ptr,
96 : int src_pixels_per_line, int xoffset,
97 : int yoffset, unsigned char *dst_ptr,
98 : int dst_pitch
99 :
100 : ) {
101 : DECLARE_ALIGNED(16, unsigned short,
102 : FData2[24 * 24]); /* Temp data bufffer used in filtering */
103 :
104 : const short *HFilter, *VFilter;
105 :
106 0 : if (xoffset) {
107 0 : if (yoffset) {
108 0 : HFilter = vp8_six_tap_x86[xoffset];
109 0 : vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
110 : src_pixels_per_line, 1, 21, 32, HFilter);
111 0 : VFilter = vp8_six_tap_x86[yoffset];
112 0 : vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
113 : dst_pitch, VFilter);
114 : } else {
115 : /* First-pass only */
116 0 : HFilter = vp8_six_tap_x86[xoffset];
117 0 : vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
118 : dst_pitch, 16, HFilter);
119 : }
120 : } else {
121 : /* Second-pass only */
122 0 : VFilter = vp8_six_tap_x86[yoffset];
123 0 : vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
124 : src_pixels_per_line, 21, 32);
125 0 : vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16, 16,
126 : dst_pitch, VFilter);
127 : }
128 0 : }
129 :
130 0 : void vp8_sixtap_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_line,
131 : int xoffset, int yoffset,
132 : unsigned char *dst_ptr, int dst_pitch) {
133 : DECLARE_ALIGNED(16, unsigned short,
134 : FData2[256]); /* Temp data bufffer used in filtering */
135 : const short *HFilter, *VFilter;
136 :
137 0 : if (xoffset) {
138 0 : if (yoffset) {
139 0 : HFilter = vp8_six_tap_x86[xoffset];
140 0 : vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
141 : src_pixels_per_line, 1, 13, 16, HFilter);
142 0 : VFilter = vp8_six_tap_x86[yoffset];
143 0 : vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 8,
144 : dst_pitch, VFilter);
145 : } else {
146 : /* First-pass only */
147 0 : HFilter = vp8_six_tap_x86[xoffset];
148 0 : vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
149 : dst_pitch, 8, HFilter);
150 : }
151 : } else {
152 : /* Second-pass only */
153 0 : VFilter = vp8_six_tap_x86[yoffset];
154 0 : vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
155 : src_pixels_per_line, dst_ptr, dst_pitch, 8,
156 : VFilter);
157 : }
158 0 : }
159 :
160 0 : void vp8_sixtap_predict8x4_sse2(unsigned char *src_ptr, int src_pixels_per_line,
161 : int xoffset, int yoffset,
162 : unsigned char *dst_ptr, int dst_pitch) {
163 : DECLARE_ALIGNED(16, unsigned short,
164 : FData2[256]); /* Temp data bufffer used in filtering */
165 : const short *HFilter, *VFilter;
166 :
167 0 : if (xoffset) {
168 0 : if (yoffset) {
169 0 : HFilter = vp8_six_tap_x86[xoffset];
170 0 : vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2,
171 : src_pixels_per_line, 1, 9, 16, HFilter);
172 0 : VFilter = vp8_six_tap_x86[yoffset];
173 0 : vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8, 4,
174 : dst_pitch, VFilter);
175 : } else {
176 : /* First-pass only */
177 0 : HFilter = vp8_six_tap_x86[xoffset];
178 0 : vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr,
179 : dst_pitch, 4, HFilter);
180 : }
181 : } else {
182 : /* Second-pass only */
183 0 : VFilter = vp8_six_tap_x86[yoffset];
184 0 : vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
185 : src_pixels_per_line, dst_ptr, dst_pitch, 4,
186 : VFilter);
187 : }
188 0 : }
189 :
190 : #endif
191 :
192 : #if HAVE_SSSE3
193 :
/*
 * Prototypes of the SSSE3 filter kernels used by the wrappers below; they are
 * not defined in this file. All six share one parameter layout: source pointer
 * and pitch, destination pointer and pitch, number of output rows, and a
 * filter index (the wrappers pass xoffset or yoffset) rather than a pointer to
 * filter taps.
 */

/* Horizontal (h6) kernels for 4-, 8- and 16-wide blocks. */
extern void vp8_filter_block1d4_h6_ssse3(
    unsigned char *src_ptr, unsigned int src_pixels_per_line,
    unsigned char *output_ptr, unsigned int output_pitch,
    unsigned int output_height, unsigned int vp8_filter_index);

extern void vp8_filter_block1d8_h6_ssse3(
    unsigned char *src_ptr, unsigned int src_pixels_per_line,
    unsigned char *output_ptr, unsigned int output_pitch,
    unsigned int output_height, unsigned int vp8_filter_index);

extern void vp8_filter_block1d16_h6_ssse3(
    unsigned char *src_ptr, unsigned int src_pixels_per_line,
    unsigned char *output_ptr, unsigned int output_pitch,
    unsigned int output_height, unsigned int vp8_filter_index);

/* Vertical (v6) kernels for 4-, 8- and 16-wide blocks. */
extern void vp8_filter_block1d4_v6_ssse3(
    unsigned char *src_ptr, unsigned int src_pitch, unsigned char *output_ptr,
    unsigned int out_pitch, unsigned int output_height,
    unsigned int vp8_filter_index);

extern void vp8_filter_block1d8_v6_ssse3(
    unsigned char *src_ptr, unsigned int src_pitch, unsigned char *output_ptr,
    unsigned int out_pitch, unsigned int output_height,
    unsigned int vp8_filter_index);

extern void vp8_filter_block1d16_v6_ssse3(
    unsigned char *src_ptr, unsigned int src_pitch, unsigned char *output_ptr,
    unsigned int out_pitch, unsigned int output_height,
    unsigned int vp8_filter_index);
235 :
236 0 : void vp8_sixtap_predict16x16_ssse3(unsigned char *src_ptr,
237 : int src_pixels_per_line, int xoffset,
238 : int yoffset, unsigned char *dst_ptr,
239 : int dst_pitch
240 :
241 : ) {
242 : DECLARE_ALIGNED(16, unsigned char, FData2[24 * 24]);
243 :
244 0 : if (xoffset) {
245 0 : if (yoffset) {
246 0 : vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
247 : src_pixels_per_line, FData2, 16, 21,
248 : xoffset);
249 0 : vp8_filter_block1d16_v6_ssse3(FData2, 16, dst_ptr, dst_pitch, 16,
250 : yoffset);
251 : } else {
252 : /* First-pass only */
253 0 : vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
254 : dst_pitch, 16, xoffset);
255 : }
256 : } else {
257 0 : if (yoffset) {
258 : /* Second-pass only */
259 0 : vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
260 : src_pixels_per_line, dst_ptr, dst_pitch, 16,
261 : yoffset);
262 : } else {
263 : /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
264 : * yoffset==0) case correctly. Add copy function here to guarantee
265 : * six-tap function handles all possible offsets. */
266 0 : vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
267 : }
268 : }
269 0 : }
270 :
271 0 : void vp8_sixtap_predict8x8_ssse3(unsigned char *src_ptr,
272 : int src_pixels_per_line, int xoffset,
273 : int yoffset, unsigned char *dst_ptr,
274 : int dst_pitch) {
275 : DECLARE_ALIGNED(16, unsigned char, FData2[256]);
276 :
277 0 : if (xoffset) {
278 0 : if (yoffset) {
279 0 : vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
280 : src_pixels_per_line, FData2, 8, 13, xoffset);
281 0 : vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 8, yoffset);
282 : } else {
283 0 : vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
284 : dst_pitch, 8, xoffset);
285 : }
286 : } else {
287 0 : if (yoffset) {
288 : /* Second-pass only */
289 0 : vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
290 : src_pixels_per_line, dst_ptr, dst_pitch, 8,
291 : yoffset);
292 : } else {
293 : /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
294 : * yoffset==0) case correctly. Add copy function here to guarantee
295 : * six-tap function handles all possible offsets. */
296 0 : vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
297 : }
298 : }
299 0 : }
300 :
301 0 : void vp8_sixtap_predict8x4_ssse3(unsigned char *src_ptr,
302 : int src_pixels_per_line, int xoffset,
303 : int yoffset, unsigned char *dst_ptr,
304 : int dst_pitch) {
305 : DECLARE_ALIGNED(16, unsigned char, FData2[256]);
306 :
307 0 : if (xoffset) {
308 0 : if (yoffset) {
309 0 : vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
310 : src_pixels_per_line, FData2, 8, 9, xoffset);
311 0 : vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 4, yoffset);
312 : } else {
313 : /* First-pass only */
314 0 : vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
315 : dst_pitch, 4, xoffset);
316 : }
317 : } else {
318 0 : if (yoffset) {
319 : /* Second-pass only */
320 0 : vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
321 : src_pixels_per_line, dst_ptr, dst_pitch, 4,
322 : yoffset);
323 : } else {
324 : /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
325 : * yoffset==0) case correctly. Add copy function here to guarantee
326 : * six-tap function handles all possible offsets. */
327 0 : vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch);
328 : }
329 : }
330 0 : }
331 :
332 0 : void vp8_sixtap_predict4x4_ssse3(unsigned char *src_ptr,
333 : int src_pixels_per_line, int xoffset,
334 : int yoffset, unsigned char *dst_ptr,
335 : int dst_pitch) {
336 : DECLARE_ALIGNED(16, unsigned char, FData2[4 * 9]);
337 :
338 0 : if (xoffset) {
339 0 : if (yoffset) {
340 0 : vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
341 : src_pixels_per_line, FData2, 4, 9, xoffset);
342 0 : vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, 4, yoffset);
343 : } else {
344 0 : vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, dst_ptr,
345 : dst_pitch, 4, xoffset);
346 : }
347 : } else {
348 0 : if (yoffset) {
349 0 : vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
350 : src_pixels_per_line, dst_ptr, dst_pitch, 4,
351 : yoffset);
352 : } else {
353 : /* ssse3 second-pass only function couldn't handle (xoffset==0 &&
354 : * yoffset==0) case correctly. Add copy function here to guarantee
355 : * six-tap function handles all possible offsets. */
356 : int r;
357 :
358 0 : for (r = 0; r < 4; ++r) {
359 0 : dst_ptr[0] = src_ptr[0];
360 0 : dst_ptr[1] = src_ptr[1];
361 0 : dst_ptr[2] = src_ptr[2];
362 0 : dst_ptr[3] = src_ptr[3];
363 0 : dst_ptr += dst_pitch;
364 0 : src_ptr += src_pixels_per_line;
365 : }
366 : }
367 : }
368 0 : }
369 :
370 : #endif
|