Line data Source code
1 : /*
2 : * Copyright 2015 The LibYuv Project Authors. All rights reserved.
3 : *
4 : * Use of this source code is governed by a BSD-style license
5 : * that can be found in the LICENSE file in the root of the source
6 : * tree. An additional intellectual property rights grant can be found
7 : * in the file PATENTS. All contributing project authors may
8 : * be found in the AUTHORS file in the root of the source tree.
9 : */
10 :
11 : #include "libyuv/row.h"
12 : #include "libyuv/rotate_row.h"
13 :
14 : #ifdef __cplusplus
15 : namespace libyuv {
16 : extern "C" {
17 : #endif
18 :
19 : // This module is for GCC x86 and x64.
20 : #if !defined(LIBYUV_DISABLE_X86) && \
21 : (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
22 :
23 : // Transpose 8x8. 32 or 64 bit, but not NaCL for 64 bit.
24 : #if defined(HAS_TRANSPOSEWX8_SSSE3)
// Transpose an 8-row byte block: reads 8 rows (consecutive rows are
// src_stride apart) and writes them out as columns to dst (output rows
// dst_stride apart).  Each loop iteration consumes an 8x8 tile and
// subtracts 8 from width, looping while width > 0, so callers are
// expected to pass a width that is a multiple of 8 -- TODO(review):
// confirm against callers.
// Asm operand roles: %0=src, %1=dst, %2=width, %3=src_stride,
// %4=dst_stride; xmm0-xmm7 hold the 8 rows being interleaved.
25 0 : void TransposeWx8_SSSE3(const uint8* src,
26 : int src_stride,
27 : uint8* dst,
28 : int dst_stride,
29 : int width) {
30 : asm volatile(
31 : // Read in the data from the source pointer.
32 : // First round of bit swap.
33 : LABELALIGN
34 : "1: \n"
35 : "movq (%0),%%xmm0 \n"
36 : "movq (%0,%3),%%xmm1 \n"
37 : "lea (%0,%3,2),%0 \n"
38 : "punpcklbw %%xmm1,%%xmm0 \n"
39 : "movq (%0),%%xmm2 \n"
40 : "movdqa %%xmm0,%%xmm1 \n"
// palignr $8 with identical source/destination rotates the register so
// its high 8 bytes land in the low half (used throughout as a cheap
// "move high qword to low qword").
41 : "palignr $0x8,%%xmm1,%%xmm1 \n"
42 : "movq (%0,%3),%%xmm3 \n"
43 : "lea (%0,%3,2),%0 \n"
44 : "punpcklbw %%xmm3,%%xmm2 \n"
45 : "movdqa %%xmm2,%%xmm3 \n"
46 : "movq (%0),%%xmm4 \n"
47 : "palignr $0x8,%%xmm3,%%xmm3 \n"
48 : "movq (%0,%3),%%xmm5 \n"
49 : "lea (%0,%3,2),%0 \n"
50 : "punpcklbw %%xmm5,%%xmm4 \n"
51 : "movdqa %%xmm4,%%xmm5 \n"
52 : "movq (%0),%%xmm6 \n"
53 : "palignr $0x8,%%xmm5,%%xmm5 \n"
54 : "movq (%0,%3),%%xmm7 \n"
55 : "lea (%0,%3,2),%0 \n"
56 : "punpcklbw %%xmm7,%%xmm6 \n"
// src has advanced 8 rows (four lea steps of 2*stride).  Negate the
// stride so the lea below steps back 8*src_stride and forward 8 bytes,
// landing on the next 8-column tile of the original rows; the second
// neg restores the stride's sign for the next iteration.
57 : "neg %3 \n"
58 : "movdqa %%xmm6,%%xmm7 \n"
59 : "lea 0x8(%0,%3,8),%0 \n"
60 : "palignr $0x8,%%xmm7,%%xmm7 \n"
61 : "neg %3 \n"
62 : // Second round of bit swap.
63 : "punpcklwd %%xmm2,%%xmm0 \n"
64 : "punpcklwd %%xmm3,%%xmm1 \n"
65 : "movdqa %%xmm0,%%xmm2 \n"
66 : "movdqa %%xmm1,%%xmm3 \n"
67 : "palignr $0x8,%%xmm2,%%xmm2 \n"
68 : "palignr $0x8,%%xmm3,%%xmm3 \n"
69 : "punpcklwd %%xmm6,%%xmm4 \n"
70 : "punpcklwd %%xmm7,%%xmm5 \n"
71 : "movdqa %%xmm4,%%xmm6 \n"
72 : "movdqa %%xmm5,%%xmm7 \n"
73 : "palignr $0x8,%%xmm6,%%xmm6 \n"
74 : "palignr $0x8,%%xmm7,%%xmm7 \n"
75 : // Third round of bit swap.
76 : // Write to the destination pointer.
// Stores are interleaved with the final unpacks: each pair of movq
// writes emits two transposed output rows, advancing dst by
// 2*dst_stride via lea.
77 : "punpckldq %%xmm4,%%xmm0 \n"
78 : "movq %%xmm0,(%1) \n"
79 : "movdqa %%xmm0,%%xmm4 \n"
80 : "palignr $0x8,%%xmm4,%%xmm4 \n"
81 : "movq %%xmm4,(%1,%4) \n"
82 : "lea (%1,%4,2),%1 \n"
83 : "punpckldq %%xmm6,%%xmm2 \n"
84 : "movdqa %%xmm2,%%xmm6 \n"
85 : "movq %%xmm2,(%1) \n"
86 : "palignr $0x8,%%xmm6,%%xmm6 \n"
87 : "punpckldq %%xmm5,%%xmm1 \n"
88 : "movq %%xmm6,(%1,%4) \n"
89 : "lea (%1,%4,2),%1 \n"
90 : "movdqa %%xmm1,%%xmm5 \n"
91 : "movq %%xmm1,(%1) \n"
92 : "palignr $0x8,%%xmm5,%%xmm5 \n"
93 : "movq %%xmm5,(%1,%4) \n"
94 : "lea (%1,%4,2),%1 \n"
95 : "punpckldq %%xmm7,%%xmm3 \n"
96 : "movq %%xmm3,(%1) \n"
97 : "movdqa %%xmm3,%%xmm7 \n"
98 : "palignr $0x8,%%xmm7,%%xmm7 \n"
// Loop control: 8 columns consumed per pass; jg tests the flags set by
// this sub, placed early to hide its latency before the branch.
99 : "sub $0x8,%2 \n"
100 : "movq %%xmm7,(%1,%4) \n"
101 : "lea (%1,%4,2),%1 \n"
102 : "jg 1b \n"
103 : : "+r"(src), // %0
104 : "+r"(dst), // %1
105 : "+r"(width) // %2
106 0 : : "r"((intptr_t)(src_stride)), // %3
107 0 : "r"((intptr_t)(dst_stride)) // %4
108 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
109 0 : "xmm7");
110 0 : }
111 : #endif // defined(HAS_TRANSPOSEWX8_SSSE3)
112 :
113 : // Transpose 16x8. 64 bit
114 : #if defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
// Fast variant of TransposeWx8: processes a 16x8 tile per iteration by
// loading full 16-byte rows (movdqu) and keeping both the low and high
// halves live in xmm0-xmm7 / xmm8-xmm15 respectively.  Use of
// xmm8-xmm15 means this path is x86_64-only (matches the
// HAS_TRANSPOSEWX8_FAST_SSSE3 guard).  Consumes 16 columns per pass
// (sub $0x10 from width), looping while width > 0.
// Asm operand roles: %0=src, %1=dst, %2=width, %3=src_stride,
// %4=dst_stride.
115 0 : void TransposeWx8_Fast_SSSE3(const uint8* src,
116 : int src_stride,
117 : uint8* dst,
118 : int dst_stride,
119 : int width) {
120 : asm volatile(
121 : // Read in the data from the source pointer.
122 : // First round of bit swap.
123 : LABELALIGN
124 : "1: \n"
125 : "movdqu (%0),%%xmm0 \n"
126 : "movdqu (%0,%3),%%xmm1 \n"
127 : "lea (%0,%3,2),%0 \n"
// Each row pair is split: punpcklbw interleaves the low 8 bytes into
// the xmm0-7 bank, punpckhbw the high 8 bytes into the xmm8-15 bank.
128 : "movdqa %%xmm0,%%xmm8 \n"
129 : "punpcklbw %%xmm1,%%xmm0 \n"
130 : "punpckhbw %%xmm1,%%xmm8 \n"
131 : "movdqu (%0),%%xmm2 \n"
132 : "movdqa %%xmm0,%%xmm1 \n"
133 : "movdqa %%xmm8,%%xmm9 \n"
// palignr $8 with identical source/destination rotates the high 8
// bytes of the register into the low half.
134 : "palignr $0x8,%%xmm1,%%xmm1 \n"
135 : "palignr $0x8,%%xmm9,%%xmm9 \n"
136 : "movdqu (%0,%3),%%xmm3 \n"
137 : "lea (%0,%3,2),%0 \n"
138 : "movdqa %%xmm2,%%xmm10 \n"
139 : "punpcklbw %%xmm3,%%xmm2 \n"
140 : "punpckhbw %%xmm3,%%xmm10 \n"
141 : "movdqa %%xmm2,%%xmm3 \n"
142 : "movdqa %%xmm10,%%xmm11 \n"
143 : "movdqu (%0),%%xmm4 \n"
144 : "palignr $0x8,%%xmm3,%%xmm3 \n"
145 : "palignr $0x8,%%xmm11,%%xmm11 \n"
146 : "movdqu (%0,%3),%%xmm5 \n"
147 : "lea (%0,%3,2),%0 \n"
148 : "movdqa %%xmm4,%%xmm12 \n"
149 : "punpcklbw %%xmm5,%%xmm4 \n"
150 : "punpckhbw %%xmm5,%%xmm12 \n"
151 : "movdqa %%xmm4,%%xmm5 \n"
152 : "movdqa %%xmm12,%%xmm13 \n"
153 : "movdqu (%0),%%xmm6 \n"
154 : "palignr $0x8,%%xmm5,%%xmm5 \n"
155 : "palignr $0x8,%%xmm13,%%xmm13 \n"
156 : "movdqu (%0,%3),%%xmm7 \n"
157 : "lea (%0,%3,2),%0 \n"
158 : "movdqa %%xmm6,%%xmm14 \n"
159 : "punpcklbw %%xmm7,%%xmm6 \n"
160 : "punpckhbw %%xmm7,%%xmm14 \n"
// src has advanced 8 rows; with the stride negated, the lea steps back
// 8*src_stride and forward 16 bytes to the next 16-column tile, then
// the second neg restores the stride's sign.
161 : "neg %3 \n"
162 : "movdqa %%xmm6,%%xmm7 \n"
163 : "movdqa %%xmm14,%%xmm15 \n"
164 : "lea 0x10(%0,%3,8),%0 \n"
165 : "palignr $0x8,%%xmm7,%%xmm7 \n"
166 : "palignr $0x8,%%xmm15,%%xmm15 \n"
167 : "neg %3 \n"
168 : // Second round of bit swap.
169 : "punpcklwd %%xmm2,%%xmm0 \n"
170 : "punpcklwd %%xmm3,%%xmm1 \n"
171 : "movdqa %%xmm0,%%xmm2 \n"
172 : "movdqa %%xmm1,%%xmm3 \n"
173 : "palignr $0x8,%%xmm2,%%xmm2 \n"
174 : "palignr $0x8,%%xmm3,%%xmm3 \n"
175 : "punpcklwd %%xmm6,%%xmm4 \n"
176 : "punpcklwd %%xmm7,%%xmm5 \n"
177 : "movdqa %%xmm4,%%xmm6 \n"
178 : "movdqa %%xmm5,%%xmm7 \n"
179 : "palignr $0x8,%%xmm6,%%xmm6 \n"
180 : "palignr $0x8,%%xmm7,%%xmm7 \n"
181 : "punpcklwd %%xmm10,%%xmm8 \n"
182 : "punpcklwd %%xmm11,%%xmm9 \n"
183 : "movdqa %%xmm8,%%xmm10 \n"
184 : "movdqa %%xmm9,%%xmm11 \n"
185 : "palignr $0x8,%%xmm10,%%xmm10 \n"
186 : "palignr $0x8,%%xmm11,%%xmm11 \n"
187 : "punpcklwd %%xmm14,%%xmm12 \n"
188 : "punpcklwd %%xmm15,%%xmm13 \n"
189 : "movdqa %%xmm12,%%xmm14 \n"
190 : "movdqa %%xmm13,%%xmm15 \n"
191 : "palignr $0x8,%%xmm14,%%xmm14 \n"
192 : "palignr $0x8,%%xmm15,%%xmm15 \n"
193 : // Third round of bit swap.
194 : // Write to the destination pointer.
// First the low bank (columns 0-7) is stored, then the high bank
// (columns 8-15); each movq pair emits two output rows, advancing dst
// by 2*dst_stride.
195 : "punpckldq %%xmm4,%%xmm0 \n"
196 : "movq %%xmm0,(%1) \n"
197 : "movdqa %%xmm0,%%xmm4 \n"
198 : "palignr $0x8,%%xmm4,%%xmm4 \n"
199 : "movq %%xmm4,(%1,%4) \n"
200 : "lea (%1,%4,2),%1 \n"
201 : "punpckldq %%xmm6,%%xmm2 \n"
202 : "movdqa %%xmm2,%%xmm6 \n"
203 : "movq %%xmm2,(%1) \n"
204 : "palignr $0x8,%%xmm6,%%xmm6 \n"
205 : "punpckldq %%xmm5,%%xmm1 \n"
206 : "movq %%xmm6,(%1,%4) \n"
207 : "lea (%1,%4,2),%1 \n"
208 : "movdqa %%xmm1,%%xmm5 \n"
209 : "movq %%xmm1,(%1) \n"
210 : "palignr $0x8,%%xmm5,%%xmm5 \n"
211 : "movq %%xmm5,(%1,%4) \n"
212 : "lea (%1,%4,2),%1 \n"
213 : "punpckldq %%xmm7,%%xmm3 \n"
214 : "movq %%xmm3,(%1) \n"
215 : "movdqa %%xmm3,%%xmm7 \n"
216 : "palignr $0x8,%%xmm7,%%xmm7 \n"
217 : "movq %%xmm7,(%1,%4) \n"
218 : "lea (%1,%4,2),%1 \n"
219 : "punpckldq %%xmm12,%%xmm8 \n"
220 : "movq %%xmm8,(%1) \n"
221 : "movdqa %%xmm8,%%xmm12 \n"
222 : "palignr $0x8,%%xmm12,%%xmm12 \n"
223 : "movq %%xmm12,(%1,%4) \n"
224 : "lea (%1,%4,2),%1 \n"
225 : "punpckldq %%xmm14,%%xmm10 \n"
226 : "movdqa %%xmm10,%%xmm14 \n"
227 : "movq %%xmm10,(%1) \n"
228 : "palignr $0x8,%%xmm14,%%xmm14 \n"
229 : "punpckldq %%xmm13,%%xmm9 \n"
230 : "movq %%xmm14,(%1,%4) \n"
231 : "lea (%1,%4,2),%1 \n"
232 : "movdqa %%xmm9,%%xmm13 \n"
233 : "movq %%xmm9,(%1) \n"
234 : "palignr $0x8,%%xmm13,%%xmm13 \n"
235 : "movq %%xmm13,(%1,%4) \n"
236 : "lea (%1,%4,2),%1 \n"
237 : "punpckldq %%xmm15,%%xmm11 \n"
238 : "movq %%xmm11,(%1) \n"
239 : "movdqa %%xmm11,%%xmm15 \n"
240 : "palignr $0x8,%%xmm15,%%xmm15 \n"
// Loop control: 16 columns consumed per pass; the jg below tests the
// flags this sub sets.
241 : "sub $0x10,%2 \n"
242 : "movq %%xmm15,(%1,%4) \n"
243 : "lea (%1,%4,2),%1 \n"
244 : "jg 1b \n"
245 : : "+r"(src), // %0
246 : "+r"(dst), // %1
247 : "+r"(width) // %2
248 0 : : "r"((intptr_t)(src_stride)), // %3
249 0 : "r"((intptr_t)(dst_stride)) // %4
250 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
251 : "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
252 0 : "xmm15");
253 0 : }
254 : #endif // defined(HAS_TRANSPOSEWX8_FAST_SSSE3)
255 :
256 : // Transpose UV 8x8. 64 bit.
257 : #if defined(HAS_TRANSPOSEUVWX8_SSE2)
// Transpose-and-deinterleave for interleaved UV data: reads 8 rows of
// 16 bytes (8 U/V byte pairs per row, rows src_stride apart) and, per
// iteration, writes the transposed U plane rows to dst_a and the V
// plane rows to dst_b (each plane with its own stride).  Consumes 8
// UV columns per pass (sub $0x8 from width), looping while width > 0.
// Uses xmm8/xmm9 as scratch, so this path is x86_64-only (matches the
// HAS_TRANSPOSEUVWX8_SSE2 guard).
// Asm operand roles: %0=src, %1=dst_a, %2=dst_b, %3=width,
// %4=src_stride, %5=dst_stride_a, %6=dst_stride_b.
258 0 : void TransposeUVWx8_SSE2(const uint8* src,
259 : int src_stride,
260 : uint8* dst_a,
261 : int dst_stride_a,
262 : uint8* dst_b,
263 : int dst_stride_b,
264 : int width) {
265 : asm volatile(
266 : // Read in the data from the source pointer.
267 : // First round of bit swap.
268 : LABELALIGN
269 : "1: \n"
270 : "movdqu (%0),%%xmm0 \n"
271 : "movdqu (%0,%4),%%xmm1 \n"
272 : "lea (%0,%4,2),%0 \n"
// Each row pair is split via xmm8: low halves interleave into the
// even register, high halves into the odd one.
273 : "movdqa %%xmm0,%%xmm8 \n"
274 : "punpcklbw %%xmm1,%%xmm0 \n"
275 : "punpckhbw %%xmm1,%%xmm8 \n"
276 : "movdqa %%xmm8,%%xmm1 \n"
277 : "movdqu (%0),%%xmm2 \n"
278 : "movdqu (%0,%4),%%xmm3 \n"
279 : "lea (%0,%4,2),%0 \n"
280 : "movdqa %%xmm2,%%xmm8 \n"
281 : "punpcklbw %%xmm3,%%xmm2 \n"
282 : "punpckhbw %%xmm3,%%xmm8 \n"
283 : "movdqa %%xmm8,%%xmm3 \n"
284 : "movdqu (%0),%%xmm4 \n"
285 : "movdqu (%0,%4),%%xmm5 \n"
286 : "lea (%0,%4,2),%0 \n"
287 : "movdqa %%xmm4,%%xmm8 \n"
288 : "punpcklbw %%xmm5,%%xmm4 \n"
289 : "punpckhbw %%xmm5,%%xmm8 \n"
290 : "movdqa %%xmm8,%%xmm5 \n"
291 : "movdqu (%0),%%xmm6 \n"
292 : "movdqu (%0,%4),%%xmm7 \n"
293 : "lea (%0,%4,2),%0 \n"
294 : "movdqa %%xmm6,%%xmm8 \n"
295 : "punpcklbw %%xmm7,%%xmm6 \n"
// src has advanced 8 rows; with the stride negated, the lea steps
// back 8*src_stride and forward 16 bytes to the next 8-pair tile,
// then the second neg restores the stride's sign.
296 : "neg %4 \n"
297 : "lea 0x10(%0,%4,8),%0 \n"
298 : "punpckhbw %%xmm7,%%xmm8 \n"
299 : "movdqa %%xmm8,%%xmm7 \n"
300 : "neg %4 \n"
301 : // Second round of bit swap.
302 : "movdqa %%xmm0,%%xmm8 \n"
303 : "movdqa %%xmm1,%%xmm9 \n"
304 : "punpckhwd %%xmm2,%%xmm8 \n"
305 : "punpckhwd %%xmm3,%%xmm9 \n"
306 : "punpcklwd %%xmm2,%%xmm0 \n"
307 : "punpcklwd %%xmm3,%%xmm1 \n"
308 : "movdqa %%xmm8,%%xmm2 \n"
309 : "movdqa %%xmm9,%%xmm3 \n"
310 : "movdqa %%xmm4,%%xmm8 \n"
311 : "movdqa %%xmm5,%%xmm9 \n"
312 : "punpckhwd %%xmm6,%%xmm8 \n"
313 : "punpckhwd %%xmm7,%%xmm9 \n"
314 : "punpcklwd %%xmm6,%%xmm4 \n"
315 : "punpcklwd %%xmm7,%%xmm5 \n"
316 : "movdqa %%xmm8,%%xmm6 \n"
317 : "movdqa %%xmm9,%%xmm7 \n"
318 : // Third round of bit swap.
319 : // Write to the destination pointer.
// movlpd stores the low 8 bytes of the register (the U lane) and
// movhpd the high 8 bytes (the V lane), splitting each transposed
// 16-byte row across the two destination planes.
320 : "movdqa %%xmm0,%%xmm8 \n"
321 : "punpckldq %%xmm4,%%xmm0 \n"
322 : "movlpd %%xmm0,(%1) \n" // Write back U channel
323 : "movhpd %%xmm0,(%2) \n" // Write back V channel
324 : "punpckhdq %%xmm4,%%xmm8 \n"
325 : "movlpd %%xmm8,(%1,%5) \n"
326 : "lea (%1,%5,2),%1 \n"
327 : "movhpd %%xmm8,(%2,%6) \n"
328 : "lea (%2,%6,2),%2 \n"
329 : "movdqa %%xmm2,%%xmm8 \n"
330 : "punpckldq %%xmm6,%%xmm2 \n"
331 : "movlpd %%xmm2,(%1) \n"
332 : "movhpd %%xmm2,(%2) \n"
333 : "punpckhdq %%xmm6,%%xmm8 \n"
334 : "movlpd %%xmm8,(%1,%5) \n"
335 : "lea (%1,%5,2),%1 \n"
336 : "movhpd %%xmm8,(%2,%6) \n"
337 : "lea (%2,%6,2),%2 \n"
338 : "movdqa %%xmm1,%%xmm8 \n"
339 : "punpckldq %%xmm5,%%xmm1 \n"
340 : "movlpd %%xmm1,(%1) \n"
341 : "movhpd %%xmm1,(%2) \n"
342 : "punpckhdq %%xmm5,%%xmm8 \n"
343 : "movlpd %%xmm8,(%1,%5) \n"
344 : "lea (%1,%5,2),%1 \n"
345 : "movhpd %%xmm8,(%2,%6) \n"
346 : "lea (%2,%6,2),%2 \n"
347 : "movdqa %%xmm3,%%xmm8 \n"
348 : "punpckldq %%xmm7,%%xmm3 \n"
349 : "movlpd %%xmm3,(%1) \n"
350 : "movhpd %%xmm3,(%2) \n"
351 : "punpckhdq %%xmm7,%%xmm8 \n"
// Loop control: 8 UV columns consumed per pass; the jg below tests
// the flags this sub sets.
352 : "sub $0x8,%3 \n"
353 : "movlpd %%xmm8,(%1,%5) \n"
354 : "lea (%1,%5,2),%1 \n"
355 : "movhpd %%xmm8,(%2,%6) \n"
356 : "lea (%2,%6,2),%2 \n"
357 : "jg 1b \n"
358 : : "+r"(src), // %0
359 : "+r"(dst_a), // %1
360 : "+r"(dst_b), // %2
361 : "+r"(width) // %3
362 0 : : "r"((intptr_t)(src_stride)), // %4
363 0 : "r"((intptr_t)(dst_stride_a)), // %5
364 0 : "r"((intptr_t)(dst_stride_b)) // %6
365 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
366 0 : "xmm7", "xmm8", "xmm9");
367 0 : }
368 : #endif // defined(HAS_TRANSPOSEUVWX8_SSE2)
369 : #endif // defined(__x86_64__) || defined(__i386__)
370 :
371 : #ifdef __cplusplus
372 : } // extern "C"
373 : } // namespace libyuv
374 : #endif
|