Line data Source code
1 : /*
2 : * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 : *
4 : * Use of this source code is governed by a BSD-style license
5 : * that can be found in the LICENSE file in the root of the source
6 : * tree. An additional intellectual property rights grant can be found
7 : * in the file PATENTS. All contributing project authors may
8 : * be found in the AUTHORS file in the root of the source tree.
9 : */
10 :
11 : #include "libyuv/row.h"
12 :
13 : #ifdef __cplusplus
14 : namespace libyuv {
15 : extern "C" {
16 : #endif
17 :
18 : // This module is for GCC x86 and x64.
19 : #if !defined(LIBYUV_DISABLE_X86) && \
20 : (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
21 :
22 : #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
23 :
24 : // Constants for ARGB
// 7-bit fixed-point luma weights, laid out in the libyuv ARGB memory byte
// order (B, G, R, A per pixel; the alpha weight is 0).  Consumed pairwise
// by pmaddubsw in the ARGBToY* row functions below.
25 : static vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
26 : 13, 65, 33, 0, 13, 65, 33, 0};
27 :
28 : // JPeg full range.
// Full-range (JPEG) luma weights: larger than kARGBToY because the output
// is not compressed to the 16..235 studio-swing range and no +16 is added.
29 : static vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
30 : 15, 75, 38, 0, 15, 75, 38, 0};
31 : #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
32 :
33 : #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
34 :
// Chroma (U/V) weights, signed 7-bit fixed point, one set per supported
// channel ordering (ARGB/BGRA/ABGR/RGBA).  The *J variants are the
// full-range JPEG coefficients.  Byte positions mirror the pixel layout of
// each format; the alpha position always carries weight 0.
35 : static vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
36 : 112, -74, -38, 0, 112, -74, -38, 0};
37 :
38 : static vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
39 : 127, -84, -43, 0, 127, -84, -43, 0};
40 :
41 : static vec8 kARGBToV = {
42 : -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
43 : };
44 :
45 : static vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
46 : -20, -107, 127, 0, -20, -107, 127, 0};
47 :
48 : // Constants for BGRA
49 : static vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
50 : 0, 33, 65, 13, 0, 33, 65, 13};
51 :
52 : static vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
53 : 0, -38, -74, 112, 0, -38, -74, 112};
54 :
55 : static vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
56 : 0, 112, -94, -18, 0, 112, -94, -18};
57 :
58 : // Constants for ABGR
59 : static vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
60 : 33, 65, 13, 0, 33, 65, 13, 0};
61 :
62 : static vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
63 : -38, -74, 112, 0, -38, -74, 112, 0};
64 :
65 : static vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
66 : 112, -94, -18, 0, 112, -94, -18, 0};
67 :
68 : // Constants for RGBA.
69 : static vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
70 : 0, 13, 65, 33, 0, 13, 65, 33};
71 :
72 : static vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
73 : 0, 112, -74, -38, 0, 112, -74, -38};
74 :
75 : static vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
76 : 0, -18, -94, 112, 0, -18, -94, 112};
77 :
// Bias added to the scaled luma result to produce studio-swing Y (16..235).
78 : static uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
79 : 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};
80 :
81 : // 7 bit fixed point 0.5.
82 : static vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};
83 :
// Per-byte +128 bias that recenters signed chroma into the unsigned 0..255
// range (added with paddb after packsswb).
84 : static uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
85 : 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
86 :
// Same +128 per byte expressed as 16-bit lanes (0x8080) for word-wise adds.
87 : static uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
88 : 0x8080u, 0x8080u, 0x8080u, 0x8080u};
89 : #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
90 :
91 : #ifdef HAS_RGB24TOARGBROW_SSSE3
92 :
// pshufb control masks.  Each byte selects a source-lane index; a value of
// 128 has the high bit set, which makes pshufb write 0 to that output byte.
93 : // Shuffle table for converting RGB24 to ARGB.
94 : static uvec8 kShuffleMaskRGB24ToARGB = {0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u,
95 : 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};
96 :
97 : // Shuffle table for converting RAW to ARGB.
98 : static uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u,
99 : 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};
100 :
101 : // Shuffle table for converting RAW to RGB24. First 8.
102 : static const uvec8 kShuffleMaskRAWToRGB24_0 = {
103 : 2u, 1u, 0u, 5u, 4u, 3u, 8u, 7u,
104 : 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
105 :
106 : // Shuffle table for converting RAW to RGB24. Middle 8.
107 : static const uvec8 kShuffleMaskRAWToRGB24_1 = {
108 : 2u, 7u, 6u, 5u, 10u, 9u, 8u, 13u,
109 : 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
110 :
111 : // Shuffle table for converting RAW to RGB24. Last 8.
112 : static const uvec8 kShuffleMaskRAWToRGB24_2 = {
113 : 8u, 7u, 12u, 11u, 10u, 15u, 14u, 13u,
114 : 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
115 :
116 : // Shuffle table for converting ARGB to RGB24.
117 : static uvec8 kShuffleMaskARGBToRGB24 = {
118 : 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};
119 :
120 : // Shuffle table for converting ARGB to RAW.
121 : static uvec8 kShuffleMaskARGBToRAW = {
122 : 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};
123 :
124 : // Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
125 : static uvec8 kShuffleMaskARGBToRGB24_0 = {
126 : 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};
127 :
// The lvec8 (256-bit) masks below repeat the same 16-byte pattern in both
// halves because AVX2 vpshufb shuffles within each 128-bit lane.
128 : // YUY2 shuf 16 Y to 32 Y.
129 : static const lvec8 kShuffleYUY2Y = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10,
130 : 10, 12, 12, 14, 14, 0, 0, 2, 2, 4, 4,
131 : 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
132 :
133 : // YUY2 shuf 8 UV to 16 UV.
134 : static const lvec8 kShuffleYUY2UV = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9,
135 : 11, 13, 15, 13, 15, 1, 3, 1, 3, 5, 7,
136 : 5, 7, 9, 11, 9, 11, 13, 15, 13, 15};
137 :
138 : // UYVY shuf 16 Y to 32 Y.
139 : static const lvec8 kShuffleUYVYY = {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11,
140 : 11, 13, 13, 15, 15, 1, 1, 3, 3, 5, 5,
141 : 7, 7, 9, 9, 11, 11, 13, 13, 15, 15};
142 :
143 : // UYVY shuf 8 UV to 16 UV.
144 : static const lvec8 kShuffleUYVYUV = {0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8,
145 : 10, 12, 14, 12, 14, 0, 2, 0, 2, 4, 6,
146 : 4, 6, 8, 10, 8, 10, 12, 14, 12, 14};
147 :
148 : // NV21 shuf 8 VU to 16 UV.
149 : static const lvec8 kShuffleNV21 = {
150 : 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
151 : 1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
152 : };
153 : #endif // HAS_RGB24TOARGBROW_SSSE3
154 :
155 : #ifdef HAS_J400TOARGBROW_SSE2
// Expand one row of 8-bit grayscale (J400) to ARGB: each Y byte is
// replicated into B, G and R, and alpha is forced to 0xff.  Handles 8
// pixels per iteration; the jg-style loop assumes width > 0 (and widths
// that are not a multiple of 8 would overrun — callers are expected to
// pass aligned widths; TODO confirm against call sites).
156 0 : void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
157 : asm volatile (
// xmm5 = 0xff000000 per pixel: all-ones bytes shifted left 24 bits per dword.
158 : "pcmpeqb %%xmm5,%%xmm5 \n"
159 : "pslld $0x18,%%xmm5 \n"
160 : LABELALIGN
161 : "1: \n"
162 : "movq " MEMACCESS(0) ",%%xmm0 \n"
163 : "lea " MEMLEA(0x8,0) ",%0 \n"
// Duplicate each Y byte: bytes -> YY words -> YYYY dwords (two halves).
164 : "punpcklbw %%xmm0,%%xmm0 \n"
165 : "movdqa %%xmm0,%%xmm1 \n"
166 : "punpcklwd %%xmm0,%%xmm0 \n"
167 : "punpckhwd %%xmm1,%%xmm1 \n"
// OR in the opaque alpha channel and store 8 ARGB pixels (32 bytes).
168 : "por %%xmm5,%%xmm0 \n"
169 : "por %%xmm5,%%xmm1 \n"
170 : "movdqu %%xmm0," MEMACCESS(1) " \n"
171 : "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
172 : "lea " MEMLEA(0x20,1) ",%1 \n"
173 : "sub $0x8,%2 \n"
174 : "jg 1b \n"
175 : : "+r"(src_y), // %0
176 : "+r"(dst_argb), // %1
177 : "+r"(width) // %2
178 : :: "memory", "cc", "xmm0", "xmm1", "xmm5"
179 0 : );
180 0 : }
181 : #endif // HAS_J400TOARGBROW_SSE2
182 :
183 : #ifdef HAS_RGB24TOARGBROW_SSSE3
// Convert one row of packed 24-bit RGB24 (B,G,R) to 32-bit ARGB, setting
// alpha to 0xff.  Reads 48 bytes (16 pixels) and writes 64 bytes per
// iteration; realigns the three 16-byte loads into four pixel groups with
// palignr, then expands each group via the kShuffleMaskRGB24ToARGB pshufb.
184 0 : void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
185 : asm volatile (
186 : "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
187 : "pslld $0x18,%%xmm5 \n"
188 : "movdqa %3,%%xmm4 \n"
189 : LABELALIGN
190 : "1: \n"
191 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
192 : "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
193 : "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
194 : "lea " MEMLEA(0x30,0) ",%0 \n"
// Stitch pixel groups that straddle the 16-byte load boundaries.
195 : "movdqa %%xmm3,%%xmm2 \n"
196 : "palignr $0x8,%%xmm1,%%xmm2 \n"
197 : "pshufb %%xmm4,%%xmm2 \n"
198 : "por %%xmm5,%%xmm2 \n"
199 : "palignr $0xc,%%xmm0,%%xmm1 \n"
200 : "pshufb %%xmm4,%%xmm0 \n"
201 : "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
202 : "por %%xmm5,%%xmm0 \n"
203 : "pshufb %%xmm4,%%xmm1 \n"
204 : "movdqu %%xmm0," MEMACCESS(1) " \n"
205 : "por %%xmm5,%%xmm1 \n"
206 : "palignr $0x4,%%xmm3,%%xmm3 \n"
207 : "pshufb %%xmm4,%%xmm3 \n"
208 : "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
209 : "por %%xmm5,%%xmm3 \n"
210 : "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
211 : "lea " MEMLEA(0x40,1) ",%1 \n"
212 : "sub $0x10,%2 \n"
213 : "jg 1b \n"
214 : : "+r"(src_rgb24), // %0
215 : "+r"(dst_argb), // %1
216 : "+r"(width) // %2
217 : : "m"(kShuffleMaskRGB24ToARGB) // %3
218 : : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
219 0 : );
220 0 : }
221 :
// Convert one row of packed 24-bit RAW (R,G,B byte order) to ARGB with
// alpha = 0xff.  Identical instruction flow to RGB24ToARGBRow_SSSE3 above;
// only the pshufb mask differs (kShuffleMaskRAWToARGB swaps R and B).
222 0 : void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) {
223 : asm volatile (
224 : "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
225 : "pslld $0x18,%%xmm5 \n"
226 : "movdqa %3,%%xmm4 \n"
227 : LABELALIGN
228 : "1: \n"
229 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
230 : "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
231 : "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
232 : "lea " MEMLEA(0x30,0) ",%0 \n"
233 : "movdqa %%xmm3,%%xmm2 \n"
234 : "palignr $0x8,%%xmm1,%%xmm2 \n"
235 : "pshufb %%xmm4,%%xmm2 \n"
236 : "por %%xmm5,%%xmm2 \n"
237 : "palignr $0xc,%%xmm0,%%xmm1 \n"
238 : "pshufb %%xmm4,%%xmm0 \n"
239 : "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
240 : "por %%xmm5,%%xmm0 \n"
241 : "pshufb %%xmm4,%%xmm1 \n"
242 : "movdqu %%xmm0," MEMACCESS(1) " \n"
243 : "por %%xmm5,%%xmm1 \n"
244 : "palignr $0x4,%%xmm3,%%xmm3 \n"
245 : "pshufb %%xmm4,%%xmm3 \n"
246 : "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
247 : "por %%xmm5,%%xmm3 \n"
248 : "movdqu %%xmm3," MEMACCESS2(0x30,1) " \n"
249 : "lea " MEMLEA(0x40,1) ",%1 \n"
250 : "sub $0x10,%2 \n"
251 : "jg 1b \n"
252 : : "+r"(src_raw), // %0
253 : "+r"(dst_argb), // %1
254 : "+r"(width) // %2
255 : : "m"(kShuffleMaskRAWToARGB) // %3
256 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
257 0 : );
258 0 : }
259 :
// Convert one row of RAW (R,G,B) to RGB24 (B,G,R) — a byte swap of R and B
// within each 3-byte pixel.  Processes 8 pixels (24 bytes) per iteration
// using three overlapping loads (+0, +4, +8) and three 8-byte shuffles.
// Note the loads overlap and read up to 16 bytes past each offset.
260 0 : void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
261 : asm volatile (
262 : "movdqa %3,%%xmm3 \n"
263 : "movdqa %4,%%xmm4 \n"
264 : "movdqa %5,%%xmm5 \n"
265 : LABELALIGN
266 : "1: \n"
267 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
268 : "movdqu " MEMACCESS2(0x4,0) ",%%xmm1 \n"
269 : "movdqu " MEMACCESS2(0x8,0) ",%%xmm2 \n"
270 : "lea " MEMLEA(0x18,0) ",%0 \n"
271 : "pshufb %%xmm3,%%xmm0 \n"
272 : "pshufb %%xmm4,%%xmm1 \n"
273 : "pshufb %%xmm5,%%xmm2 \n"
// Each shuffle produced 8 valid bytes in the low half; store them back to
// back for 24 contiguous output bytes.
274 : "movq %%xmm0," MEMACCESS(1) " \n"
275 : "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
276 : "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
277 : "lea " MEMLEA(0x18,1) ",%1 \n"
278 : "sub $0x8,%2 \n"
279 : "jg 1b \n"
280 : : "+r"(src_raw), // %0
281 : "+r"(dst_rgb24), // %1
282 : "+r"(width) // %2
283 : : "m"(kShuffleMaskRAWToRGB24_0), // %3
284 : "m"(kShuffleMaskRAWToRGB24_1), // %4
285 : "m"(kShuffleMaskRAWToRGB24_2) // %5
286 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
287 0 : );
288 0 : }
289 :
// Convert one row of RGB565 (5:6:5 packed shorts) to ARGB, 8 pixels per
// iteration.  Each 5/6-bit channel is expanded to 8 bits by replicating
// its top bits (via pmulhuw with the 0x0108/0x2080 scale words); alpha is
// set to 0xff.
290 0 : void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
291 : asm volatile (
// xmm5 = 0x0108 per word: multiplier that maps a 5-bit value placed in the
// high bits of a word back down to 8 bits with bit replication.
292 : "mov $0x1080108,%%eax \n"
293 : "movd %%eax,%%xmm5 \n"
294 : "pshufd $0x0,%%xmm5,%%xmm5 \n"
// xmm6 = 0x2080 per word: equivalent scale for the 6-bit green field.
295 : "mov $0x20802080,%%eax \n"
296 : "movd %%eax,%%xmm6 \n"
297 : "pshufd $0x0,%%xmm6,%%xmm6 \n"
// xmm3 = 0xf800 red mask, xmm4 = 0x07e0 green mask, xmm7 = 0xff00 alpha.
298 : "pcmpeqb %%xmm3,%%xmm3 \n"
299 : "psllw $0xb,%%xmm3 \n"
300 : "pcmpeqb %%xmm4,%%xmm4 \n"
301 : "psllw $0xa,%%xmm4 \n"
302 : "psrlw $0x5,%%xmm4 \n"
303 : "pcmpeqb %%xmm7,%%xmm7 \n"
304 : "psllw $0x8,%%xmm7 \n"
// dst -= 2*src so the stores below can address dst via (%1,%0,2): the
// output advances 4 bytes for every 2 input bytes.
305 : "sub %0,%1 \n"
306 : "sub %0,%1 \n"
307 : LABELALIGN
308 : "1: \n"
309 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
310 : "movdqa %%xmm0,%%xmm1 \n"
311 : "movdqa %%xmm0,%%xmm2 \n"
312 : "pand %%xmm3,%%xmm1 \n"
313 : "psllw $0xb,%%xmm2 \n"
314 : "pmulhuw %%xmm5,%%xmm1 \n"
315 : "pmulhuw %%xmm5,%%xmm2 \n"
316 : "psllw $0x8,%%xmm1 \n"
317 : "por %%xmm2,%%xmm1 \n"
318 : "pand %%xmm4,%%xmm0 \n"
319 : "pmulhuw %%xmm6,%%xmm0 \n"
320 : "por %%xmm7,%%xmm0 \n"
// Interleave the BR words (xmm1) with the GA words (xmm0) into BGRA bytes.
321 : "movdqa %%xmm1,%%xmm2 \n"
322 : "punpcklbw %%xmm0,%%xmm1 \n"
323 : "punpckhbw %%xmm0,%%xmm2 \n"
324 : MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
325 : MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
326 : "lea " MEMLEA(0x10,0) ",%0 \n"
327 : "sub $0x8,%2 \n"
328 : "jg 1b \n"
329 : : "+r"(src), // %0
330 : "+r"(dst), // %1
331 : "+r"(width) // %2
332 : :
333 : : "memory", "cc", "eax", NACL_R14
334 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
335 0 : );
336 0 : }
337 :
// Convert one row of ARGB1555 (1-bit alpha, 5:5:5 color) to ARGB8888,
// 8 pixels per iteration.  The 5-bit channels are expanded to 8 bits by
// bit replication (pmulhuw scaling); the 1-bit alpha is sign-extended to
// 0x00 or 0xff via psraw of the top bit.
338 0 : void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
339 : asm volatile (
340 : "mov $0x1080108,%%eax \n"
341 : "movd %%eax,%%xmm5 \n"
342 : "pshufd $0x0,%%xmm5,%%xmm5 \n"
// xmm6 = 0x4200 per word: scale for the green field in its shifted position.
343 : "mov $0x42004200,%%eax \n"
344 : "movd %%eax,%%xmm6 \n"
345 : "pshufd $0x0,%%xmm6,%%xmm6 \n"
// xmm3 = 0xf800 mask, xmm4 = xmm3 >> 6 = 0x03e0, xmm7 = 0xff00 (alpha byte).
346 : "pcmpeqb %%xmm3,%%xmm3 \n"
347 : "psllw $0xb,%%xmm3 \n"
348 : "movdqa %%xmm3,%%xmm4 \n"
349 : "psrlw $0x6,%%xmm4 \n"
350 : "pcmpeqb %%xmm7,%%xmm7 \n"
351 : "psllw $0x8,%%xmm7 \n"
// dst -= 2*src: stores use (%1,%0,2) since output is 2x the input stride.
352 : "sub %0,%1 \n"
353 : "sub %0,%1 \n"
354 : LABELALIGN
355 : "1: \n"
356 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
357 : "movdqa %%xmm0,%%xmm1 \n"
358 : "movdqa %%xmm0,%%xmm2 \n"
359 : "psllw $0x1,%%xmm1 \n"
360 : "psllw $0xb,%%xmm2 \n"
361 : "pand %%xmm3,%%xmm1 \n"
362 : "pmulhuw %%xmm5,%%xmm2 \n"
363 : "pmulhuw %%xmm5,%%xmm1 \n"
364 : "psllw $0x8,%%xmm1 \n"
365 : "por %%xmm2,%%xmm1 \n"
366 : "movdqa %%xmm0,%%xmm2 \n"
367 : "pand %%xmm4,%%xmm0 \n"
// psraw replicates the alpha bit (bit 15) across the high byte.
368 : "psraw $0x8,%%xmm2 \n"
369 : "pmulhuw %%xmm6,%%xmm0 \n"
370 : "pand %%xmm7,%%xmm2 \n"
371 : "por %%xmm2,%%xmm0 \n"
372 : "movdqa %%xmm1,%%xmm2 \n"
373 : "punpcklbw %%xmm0,%%xmm1 \n"
374 : "punpckhbw %%xmm0,%%xmm2 \n"
375 : MEMOPMEM(movdqu,xmm1,0x00,1,0,2) // movdqu %%xmm1,(%1,%0,2)
376 : MEMOPMEM(movdqu,xmm2,0x10,1,0,2) // movdqu %%xmm2,0x10(%1,%0,2)
377 : "lea " MEMLEA(0x10,0) ",%0 \n"
378 : "sub $0x8,%2 \n"
379 : "jg 1b \n"
380 : : "+r"(src), // %0
381 : "+r"(dst), // %1
382 : "+r"(width) // %2
383 : :
384 : : "memory", "cc", "eax", NACL_R14
385 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
386 0 : );
387 0 : }
388 :
// Convert one row of ARGB4444 (4 bits per channel) to ARGB8888, 8 pixels
// per iteration.  Each nibble is expanded to a byte by duplicating it into
// both halves (x | x<<4 for low nibbles, x | x>>4 for high nibbles).
389 0 : void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
390 : asm volatile (
// xmm4 = 0x0f0f... low-nibble mask; xmm5 = 0xf0f0... high-nibble mask.
391 : "mov $0xf0f0f0f,%%eax \n"
392 : "movd %%eax,%%xmm4 \n"
393 : "pshufd $0x0,%%xmm4,%%xmm4 \n"
394 : "movdqa %%xmm4,%%xmm5 \n"
395 : "pslld $0x4,%%xmm5 \n"
// dst -= 2*src: stores use (%1,%0,2) since output is 2x the input stride.
396 : "sub %0,%1 \n"
397 : "sub %0,%1 \n"
398 : LABELALIGN
399 : "1: \n"
400 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
401 : "movdqa %%xmm0,%%xmm2 \n"
402 : "pand %%xmm4,%%xmm0 \n"
403 : "pand %%xmm5,%%xmm2 \n"
404 : "movdqa %%xmm0,%%xmm1 \n"
405 : "movdqa %%xmm2,%%xmm3 \n"
406 : "psllw $0x4,%%xmm1 \n"
407 : "psrlw $0x4,%%xmm3 \n"
408 : "por %%xmm1,%%xmm0 \n"
409 : "por %%xmm3,%%xmm2 \n"
410 : "movdqa %%xmm0,%%xmm1 \n"
411 : "punpcklbw %%xmm2,%%xmm0 \n"
412 : "punpckhbw %%xmm2,%%xmm1 \n"
413 : MEMOPMEM(movdqu,xmm0,0x00,1,0,2) // movdqu %%xmm0,(%1,%0,2)
414 : MEMOPMEM(movdqu,xmm1,0x10,1,0,2) // movdqu %%xmm1,0x10(%1,%0,2)
415 : "lea " MEMLEA(0x10,0) ",%0 \n"
416 : "sub $0x8,%2 \n"
417 : "jg 1b \n"
418 : : "+r"(src), // %0
419 : "+r"(dst), // %1
420 : "+r"(width) // %2
421 : :
422 : : "memory", "cc", "eax", NACL_R14
423 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
424 0 : );
425 0 : }
426 :
// Convert one row of ARGB to packed 24-bit RGB24 (drops alpha).  Reads 64
// bytes (16 pixels), shuffles each 16-byte register down to 12 valid bytes
// with kShuffleMaskARGBToRGB24, then splices the four 12-byte pieces into
// three contiguous 16-byte stores (48 output bytes).
427 0 : void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) {
428 : asm volatile (
429 : "movdqa %3,%%xmm6 \n"
430 : LABELALIGN
431 : "1: \n"
432 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
433 : "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
434 : "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
435 : "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
436 : "lea " MEMLEA(0x40,0) ",%0 \n"
437 : "pshufb %%xmm6,%%xmm0 \n"
438 : "pshufb %%xmm6,%%xmm1 \n"
439 : "pshufb %%xmm6,%%xmm2 \n"
440 : "pshufb %%xmm6,%%xmm3 \n"
// Shift/OR the 12-byte fragments together so the output is contiguous.
441 : "movdqa %%xmm1,%%xmm4 \n"
442 : "psrldq $0x4,%%xmm1 \n"
443 : "pslldq $0xc,%%xmm4 \n"
444 : "movdqa %%xmm2,%%xmm5 \n"
445 : "por %%xmm4,%%xmm0 \n"
446 : "pslldq $0x8,%%xmm5 \n"
447 : "movdqu %%xmm0," MEMACCESS(1) " \n"
448 : "por %%xmm5,%%xmm1 \n"
449 : "psrldq $0x8,%%xmm2 \n"
450 : "pslldq $0x4,%%xmm3 \n"
451 : "por %%xmm3,%%xmm2 \n"
452 : "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
453 : "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
454 : "lea " MEMLEA(0x30,1) ",%1 \n"
455 : "sub $0x10,%2 \n"
456 : "jg 1b \n"
457 : : "+r"(src), // %0
458 : "+r"(dst), // %1
459 : "+r"(width) // %2
460 : : "m"(kShuffleMaskARGBToRGB24) // %3
461 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
462 0 : );
463 0 : }
464 :
// Convert one row of ARGB to packed 24-bit RAW (R,G,B order, alpha
// dropped).  Identical instruction flow to ARGBToRGB24Row_SSSE3 above;
// only the pshufb mask differs (kShuffleMaskARGBToRAW swaps R and B).
465 0 : void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) {
466 : asm volatile (
467 : "movdqa %3,%%xmm6 \n"
468 : LABELALIGN
469 : "1: \n"
470 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
471 : "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
472 : "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
473 : "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
474 : "lea " MEMLEA(0x40,0) ",%0 \n"
475 : "pshufb %%xmm6,%%xmm0 \n"
476 : "pshufb %%xmm6,%%xmm1 \n"
477 : "pshufb %%xmm6,%%xmm2 \n"
478 : "pshufb %%xmm6,%%xmm3 \n"
479 : "movdqa %%xmm1,%%xmm4 \n"
480 : "psrldq $0x4,%%xmm1 \n"
481 : "pslldq $0xc,%%xmm4 \n"
482 : "movdqa %%xmm2,%%xmm5 \n"
483 : "por %%xmm4,%%xmm0 \n"
484 : "pslldq $0x8,%%xmm5 \n"
485 : "movdqu %%xmm0," MEMACCESS(1) " \n"
486 : "por %%xmm5,%%xmm1 \n"
487 : "psrldq $0x8,%%xmm2 \n"
488 : "pslldq $0x4,%%xmm3 \n"
489 : "por %%xmm3,%%xmm2 \n"
490 : "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
491 : "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
492 : "lea " MEMLEA(0x30,1) ",%1 \n"
493 : "sub $0x10,%2 \n"
494 : "jg 1b \n"
495 : : "+r"(src), // %0
496 : "+r"(dst), // %1
497 : "+r"(width) // %2
498 : : "m"(kShuffleMaskARGBToRAW) // %3
499 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
500 0 : );
501 0 : }
502 :
// Convert one row of ARGB to RGB565 (5:6:5), 4 pixels per iteration.
// Truncates each channel (no dithering) by shifting it into position and
// masking, then packs the dword results into 16-bit pixels.
503 0 : void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) {
504 : asm volatile (
// xmm3 = 0x1f (blue mask), xmm4 = 0x3f<<5 (green mask), xmm5 = red in
// bits 11..15 per dword lane.
505 : "pcmpeqb %%xmm3,%%xmm3 \n"
506 : "psrld $0x1b,%%xmm3 \n"
507 : "pcmpeqb %%xmm4,%%xmm4 \n"
508 : "psrld $0x1a,%%xmm4 \n"
509 : "pslld $0x5,%%xmm4 \n"
510 : "pcmpeqb %%xmm5,%%xmm5 \n"
511 : "pslld $0xb,%%xmm5 \n"
512 : LABELALIGN
513 : "1: \n"
514 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
515 : "movdqa %%xmm0,%%xmm1 \n"
516 : "movdqa %%xmm0,%%xmm2 \n"
517 : "pslld $0x8,%%xmm0 \n"
518 : "psrld $0x3,%%xmm1 \n"
519 : "psrld $0x5,%%xmm2 \n"
520 : "psrad $0x10,%%xmm0 \n"
521 : "pand %%xmm3,%%xmm1 \n"
522 : "pand %%xmm4,%%xmm2 \n"
523 : "pand %%xmm5,%%xmm0 \n"
524 : "por %%xmm2,%%xmm1 \n"
525 : "por %%xmm1,%%xmm0 \n"
// Each dword now holds one 565 pixel in its low 16 bits; pack to words.
526 : "packssdw %%xmm0,%%xmm0 \n"
527 : "lea " MEMLEA(0x10,0) ",%0 \n"
528 : "movq %%xmm0," MEMACCESS(1) " \n"
529 : "lea " MEMLEA(0x8,1) ",%1 \n"
530 : "sub $0x4,%2 \n"
531 : "jg 1b \n"
532 : : "+r"(src), // %0
533 : "+r"(dst), // %1
534 : "+r"(width) // %2
535 : :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
536 0 : );
537 0 : }
538 :
// Convert one row of ARGB to RGB565 with ordered dithering: dither4 packs
// four per-column dither bytes which are expanded (byte -> dword lanes)
// and added to the pixels with unsigned saturation before truncation.
// Otherwise identical to ARGBToRGB565Row_SSE2.  4 pixels per iteration.
539 0 : void ARGBToRGB565DitherRow_SSE2(const uint8* src,
540 : uint8* dst,
541 : const uint32 dither4,
542 : int width) {
543 : asm volatile(
// Expand the 4 dither bytes so each byte is replicated across one pixel's
// 4 channels (xmm6 = low 2 pixels, xmm7 is built but only xmm6 is used in
// the loop — xmm7 appears to be a leftover; TODO confirm intent upstream).
544 : "movd %3,%%xmm6 \n"
545 : "punpcklbw %%xmm6,%%xmm6 \n"
546 : "movdqa %%xmm6,%%xmm7 \n"
547 : "punpcklwd %%xmm6,%%xmm6 \n"
548 : "punpckhwd %%xmm7,%%xmm7 \n"
549 : "pcmpeqb %%xmm3,%%xmm3 \n"
550 : "psrld $0x1b,%%xmm3 \n"
551 : "pcmpeqb %%xmm4,%%xmm4 \n"
552 : "psrld $0x1a,%%xmm4 \n"
553 : "pslld $0x5,%%xmm4 \n"
554 : "pcmpeqb %%xmm5,%%xmm5 \n"
555 : "pslld $0xb,%%xmm5 \n"
556 :
557 : LABELALIGN
558 : "1: \n"
559 : "movdqu (%0),%%xmm0 \n"
// Saturating add of the dither bias before channel truncation.
560 : "paddusb %%xmm6,%%xmm0 \n"
561 : "movdqa %%xmm0,%%xmm1 \n"
562 : "movdqa %%xmm0,%%xmm2 \n"
563 : "pslld $0x8,%%xmm0 \n"
564 : "psrld $0x3,%%xmm1 \n"
565 : "psrld $0x5,%%xmm2 \n"
566 : "psrad $0x10,%%xmm0 \n"
567 : "pand %%xmm3,%%xmm1 \n"
568 : "pand %%xmm4,%%xmm2 \n"
569 : "pand %%xmm5,%%xmm0 \n"
570 : "por %%xmm2,%%xmm1 \n"
571 : "por %%xmm1,%%xmm0 \n"
572 : "packssdw %%xmm0,%%xmm0 \n"
573 : "lea 0x10(%0),%0 \n"
574 : "movq %%xmm0,(%1) \n"
575 : "lea 0x8(%1),%1 \n"
576 : "sub $0x4,%2 \n"
577 : "jg 1b \n"
578 : : "+r"(src), // %0
579 : "+r"(dst), // %1
580 : "+r"(width) // %2
581 : : "m"(dither4) // %3
582 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
583 0 : "xmm7");
584 0 : }
585 :
586 : #ifdef HAS_ARGBTORGB565DITHERROW_AVX2
// AVX2 variant of ARGBToRGB565DitherRow: 8 pixels per iteration.  The
// dither word is broadcast and expanded so each dither byte covers one
// pixel; vpackusdw + vpermq repair the cross-lane ordering before the
// 16-byte store.  NOTE(review): xmm7 is in the clobber list but no ymm7/
// xmm7 register is used in this body — harmless but likely vestigial.
587 0 : void ARGBToRGB565DitherRow_AVX2(const uint8* src,
588 : uint8* dst,
589 : const uint32 dither4,
590 : int width) {
591 : asm volatile(
592 : "vbroadcastss %3,%%xmm6 \n"
593 : "vpunpcklbw %%xmm6,%%xmm6,%%xmm6 \n"
594 : "vpermq $0xd8,%%ymm6,%%ymm6 \n"
595 : "vpunpcklwd %%ymm6,%%ymm6,%%ymm6 \n"
// ymm3 = 0x1f, ymm4 = 0x3f<<5, ymm5 = 0x1f<<11 channel masks per dword.
596 : "vpcmpeqb %%ymm3,%%ymm3,%%ymm3 \n"
597 : "vpsrld $0x1b,%%ymm3,%%ymm3 \n"
598 : "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
599 : "vpsrld $0x1a,%%ymm4,%%ymm4 \n"
600 : "vpslld $0x5,%%ymm4,%%ymm4 \n"
601 : "vpslld $0xb,%%ymm3,%%ymm5 \n"
602 :
603 : LABELALIGN
604 : "1: \n"
605 : "vmovdqu (%0),%%ymm0 \n"
606 : "vpaddusb %%ymm6,%%ymm0,%%ymm0 \n"
607 : "vpsrld $0x5,%%ymm0,%%ymm2 \n"
608 : "vpsrld $0x3,%%ymm0,%%ymm1 \n"
609 : "vpsrld $0x8,%%ymm0,%%ymm0 \n"
610 : "vpand %%ymm4,%%ymm2,%%ymm2 \n"
611 : "vpand %%ymm3,%%ymm1,%%ymm1 \n"
612 : "vpand %%ymm5,%%ymm0,%%ymm0 \n"
613 : "vpor %%ymm2,%%ymm1,%%ymm1 \n"
614 : "vpor %%ymm1,%%ymm0,%%ymm0 \n"
615 : "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
616 : "vpermq $0xd8,%%ymm0,%%ymm0 \n" // undo in-lane pack ordering
617 : "lea 0x20(%0),%0 \n"
618 : "vmovdqu %%xmm0,(%1) \n"
619 : "lea 0x10(%1),%1 \n"
620 : "sub $0x8,%2 \n"
621 : "jg 1b \n"
622 : "vzeroupper \n"
623 : : "+r"(src), // %0
624 : "+r"(dst), // %1
625 : "+r"(width) // %2
626 : : "m"(dither4) // %3
627 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
628 0 : "xmm7");
629 0 : }
630 : #endif // HAS_ARGBTORGB565DITHERROW_AVX2
631 :
// Convert one row of ARGB8888 to ARGB1555 (1-bit alpha, 5:5:5 color),
// 4 pixels per iteration.  Each channel is truncated to 5 bits and OR'd
// into position; alpha keeps only its top bit (bit 15).
632 0 : void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) {
633 : asm volatile (
// xmm4 = 0x1f, xmm5 = 0x1f<<5, xmm6 = 0x1f<<10, xmm7 = 1<<15 per dword.
634 : "pcmpeqb %%xmm4,%%xmm4 \n"
635 : "psrld $0x1b,%%xmm4 \n"
636 : "movdqa %%xmm4,%%xmm5 \n"
637 : "pslld $0x5,%%xmm5 \n"
638 : "movdqa %%xmm4,%%xmm6 \n"
639 : "pslld $0xa,%%xmm6 \n"
640 : "pcmpeqb %%xmm7,%%xmm7 \n"
641 : "pslld $0xf,%%xmm7 \n"
642 :
643 : LABELALIGN
644 : "1: \n"
645 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
646 : "movdqa %%xmm0,%%xmm1 \n"
647 : "movdqa %%xmm0,%%xmm2 \n"
648 : "movdqa %%xmm0,%%xmm3 \n"
649 : "psrad $0x10,%%xmm0 \n"
650 : "psrld $0x3,%%xmm1 \n"
651 : "psrld $0x6,%%xmm2 \n"
652 : "psrld $0x9,%%xmm3 \n"
653 : "pand %%xmm7,%%xmm0 \n"
654 : "pand %%xmm4,%%xmm1 \n"
655 : "pand %%xmm5,%%xmm2 \n"
656 : "pand %%xmm6,%%xmm3 \n"
657 : "por %%xmm1,%%xmm0 \n"
658 : "por %%xmm3,%%xmm2 \n"
659 : "por %%xmm2,%%xmm0 \n"
660 : "packssdw %%xmm0,%%xmm0 \n"
661 : "lea " MEMLEA(0x10,0) ",%0 \n"
662 : "movq %%xmm0," MEMACCESS(1) " \n"
663 : "lea " MEMLEA(0x8,1) ",%1 \n"
664 : "sub $0x4,%2 \n"
665 : "jg 1b \n"
666 : : "+r"(src), // %0
667 : "+r"(dst), // %1
668 : "+r"(width) // %2
669 : :: "memory", "cc",
670 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
671 0 : );
672 0 : }
673 :
// Convert one row of ARGB8888 to ARGB4444 (4 bits per channel), 4 pixels
// per iteration.  Keeps the top nibble of each byte and packs neighboring
// bytes' nibbles into one byte, then narrows with packuswb.
674 0 : void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
675 : asm volatile (
// xmm4 = 0xf000 per word (high nibble of odd bytes), xmm3 = xmm4 >> 8.
676 : "pcmpeqb %%xmm4,%%xmm4 \n"
677 : "psllw $0xc,%%xmm4 \n"
678 : "movdqa %%xmm4,%%xmm3 \n"
679 : "psrlw $0x8,%%xmm3 \n"
680 :
681 : LABELALIGN
682 : "1: \n"
683 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
684 : "movdqa %%xmm0,%%xmm1 \n"
685 : "pand %%xmm3,%%xmm0 \n"
686 : "pand %%xmm4,%%xmm1 \n"
687 : "psrlq $0x4,%%xmm0 \n"
688 : "psrlq $0x8,%%xmm1 \n"
689 : "por %%xmm1,%%xmm0 \n"
690 : "packuswb %%xmm0,%%xmm0 \n"
691 : "lea " MEMLEA(0x10,0) ",%0 \n"
692 : "movq %%xmm0," MEMACCESS(1) " \n"
693 : "lea " MEMLEA(0x8,1) ",%1 \n"
694 : "sub $0x4,%2 \n"
695 : "jg 1b \n"
696 : : "+r"(src), // %0
697 : "+r"(dst), // %1
698 : "+r"(width) // %2
699 : :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
700 0 : );
701 0 : }
702 : #endif // HAS_RGB24TOARGBROW_SSSE3
703 :
704 : #ifdef HAS_ARGBTOYROW_SSSE3
705 : // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
705 : // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
// Studio-swing luma: dot-product each pixel with kARGBToY (7-bit fixed
// point) via pmaddubsw, horizontal-add to one word per pixel, shift right
// 7, pack to bytes, then add the +16 bias (kAddY16).
706 0 : void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
707 : asm volatile (
708 : "movdqa %3,%%xmm4 \n"
709 : "movdqa %4,%%xmm5 \n"
710 :
711 : LABELALIGN
712 : "1: \n"
713 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
714 : "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
715 : "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
716 : "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
717 : "pmaddubsw %%xmm4,%%xmm0 \n"
718 : "pmaddubsw %%xmm4,%%xmm1 \n"
719 : "pmaddubsw %%xmm4,%%xmm2 \n"
720 : "pmaddubsw %%xmm4,%%xmm3 \n"
721 : "lea " MEMLEA(0x40,0) ",%0 \n"
722 : "phaddw %%xmm1,%%xmm0 \n"
723 : "phaddw %%xmm3,%%xmm2 \n"
724 : "psrlw $0x7,%%xmm0 \n"
725 : "psrlw $0x7,%%xmm2 \n"
726 : "packuswb %%xmm2,%%xmm0 \n"
727 : "paddb %%xmm5,%%xmm0 \n"
728 : "movdqu %%xmm0," MEMACCESS(1) " \n"
729 : "lea " MEMLEA(0x10,1) ",%1 \n"
730 : "sub $0x10,%2 \n"
731 : "jg 1b \n"
732 : : "+r"(src_argb), // %0
733 : "+r"(dst_y), // %1
734 : "+r"(width) // %2
735 : : "m"(kARGBToY), // %3
736 : "m"(kAddY16) // %4
737 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
738 0 : );
739 0 : }
740 : #endif // HAS_ARGBTOYROW_SSSE3
741 :
742 : #ifdef HAS_ARGBTOYJROW_SSSE3
743 : // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
744 : // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
744 : // Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
745 : // Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
// Full-range (JPEG) luma: kAddYJ64 (= 0.5 in 7-bit fixed point) is added
// before the >>7 shift to round rather than truncate; no +16 bias after.
745 0 : void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
746 : asm volatile (
747 : "movdqa %3,%%xmm4 \n"
748 : "movdqa %4,%%xmm5 \n"
749 :
750 : LABELALIGN
751 : "1: \n"
752 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
753 : "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
754 : "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
755 : "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
756 : "pmaddubsw %%xmm4,%%xmm0 \n"
757 : "pmaddubsw %%xmm4,%%xmm1 \n"
758 : "pmaddubsw %%xmm4,%%xmm2 \n"
759 : "pmaddubsw %%xmm4,%%xmm3 \n"
760 : "lea " MEMLEA(0x40,0) ",%0 \n"
761 : "phaddw %%xmm1,%%xmm0 \n"
762 : "phaddw %%xmm3,%%xmm2 \n"
763 : "paddw %%xmm5,%%xmm0 \n"
764 : "paddw %%xmm5,%%xmm2 \n"
765 : "psrlw $0x7,%%xmm0 \n"
766 : "psrlw $0x7,%%xmm2 \n"
767 : "packuswb %%xmm2,%%xmm0 \n"
768 : "movdqu %%xmm0," MEMACCESS(1) " \n"
769 : "lea " MEMLEA(0x10,1) ",%1 \n"
770 : "sub $0x10,%2 \n"
771 : "jg 1b \n"
772 : : "+r"(src_argb), // %0
773 : "+r"(dst_y), // %1
774 : "+r"(width) // %2
775 : : "m"(kARGBToYJ), // %3
776 : "m"(kAddYJ64) // %4
777 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
778 0 : );
779 0 : }
780 : #endif // HAS_ARGBTOYJROW_SSSE3
781 :
782 : #ifdef HAS_ARGBTOYROW_AVX2
783 : // vpermd for vphaddw + vpackuswb vpermd.
// Dword permutation that restores linear pixel order after the in-lane
// mutations of vphaddw and vpackuswb.
784 : static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
785 :
786 : // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
786 : // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// AVX2 version of ARGBToYRow_SSSE3: same math (pmaddubsw dot product,
// >>7, +16), but 256-bit registers require kPermdARGBToY_AVX to undo the
// in-lane ordering of vphaddw/vpackuswb before storing.
787 0 : void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
788 : asm volatile (
789 : "vbroadcastf128 %3,%%ymm4 \n"
790 : "vbroadcastf128 %4,%%ymm5 \n"
791 : "vmovdqu %5,%%ymm6 \n"
792 :
793 : LABELALIGN
794 : "1: \n"
795 : "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
796 : "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
797 : "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
798 : "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
799 : "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
800 : "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
801 : "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
802 : "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
803 : "lea " MEMLEA(0x80,0) ",%0 \n"
804 : "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
805 : "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
806 : "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
807 : "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
808 : "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
809 : "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
810 : "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" // add 16 for Y
811 : "vmovdqu %%ymm0," MEMACCESS(1) " \n"
812 : "lea " MEMLEA(0x20,1) ",%1 \n"
813 : "sub $0x20,%2 \n"
814 : "jg 1b \n"
815 : "vzeroupper \n"
816 : : "+r"(src_argb), // %0
817 : "+r"(dst_y), // %1
818 : "+r"(width) // %2
819 : : "m"(kARGBToY), // %3
820 : "m"(kAddY16), // %4
821 : "m"(kPermdARGBToY_AVX) // %5
822 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
823 0 : );
824 0 : }
825 : #endif // HAS_ARGBTOYROW_AVX2
826 :
827 : #ifdef HAS_ARGBTOYJROW_AVX2
828 : // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
828 : // Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// AVX2 version of ARGBToYJRow_SSSE3: full-range luma with +64 rounding
// before the >>7 shift, no +16 bias; vpermd restores pixel order.
829 0 : void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
830 : asm volatile (
831 : "vbroadcastf128 %3,%%ymm4 \n"
832 : "vbroadcastf128 %4,%%ymm5 \n"
833 : "vmovdqu %5,%%ymm6 \n"
834 :
835 : LABELALIGN
836 : "1: \n"
837 : "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
838 : "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
839 : "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
840 : "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
841 : "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
842 : "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
843 : "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
844 : "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
845 : "lea " MEMLEA(0x80,0) ",%0 \n"
846 : "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // mutates.
847 : "vphaddw %%ymm3,%%ymm2,%%ymm2 \n"
848 : "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" // Add .5 for rounding.
849 : "vpaddw %%ymm5,%%ymm2,%%ymm2 \n"
850 : "vpsrlw $0x7,%%ymm0,%%ymm0 \n"
851 : "vpsrlw $0x7,%%ymm2,%%ymm2 \n"
852 : "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
853 : "vpermd %%ymm0,%%ymm6,%%ymm0 \n" // unmutate.
854 : "vmovdqu %%ymm0," MEMACCESS(1) " \n"
855 : "lea " MEMLEA(0x20,1) ",%1 \n"
856 : "sub $0x20,%2 \n"
857 : "jg 1b \n"
858 : "vzeroupper \n"
859 : : "+r"(src_argb), // %0
860 : "+r"(dst_y), // %1
861 : "+r"(width) // %2
862 : : "m"(kARGBToYJ), // %3
863 : "m"(kAddYJ64), // %4
864 : "m"(kPermdARGBToY_AVX) // %5
865 : : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
866 0 : );
867 0 : }
868 : #endif // HAS_ARGBTOYJROW_AVX2
869 :
870 : #ifdef HAS_ARGBTOUVROW_SSSE3
// Convert 16 ARGB pixels from two adjacent rows to 8 U and 8 V values
// (2x2 subsampling): pavgb averages vertically with the row at
// src_stride_argb, shufps+pavgb averages horizontally, then pmaddubsw
// applies the kARGBToU/kARGBToV weights and kAddUV128 recenters to
// unsigned.  dst_v is addressed relative to dst_u via the initial
// "sub %1,%2".
// NOTE(review): xmm3/xmm4/xmm5 are written (loaded from %5-%7) but are
// missing from the clobber list — latent constraint bug; flag upstream.
871 0 : void ARGBToUVRow_SSSE3(const uint8* src_argb0,
872 : int src_stride_argb,
873 : uint8* dst_u,
874 : uint8* dst_v,
875 : int width) {
876 : asm volatile (
877 : "movdqa %5,%%xmm3 \n"
878 : "movdqa %6,%%xmm4 \n"
879 : "movdqa %7,%%xmm5 \n"
880 : "sub %1,%2 \n"
881 :
882 : LABELALIGN
883 : "1: \n"
// Load 16 pixels from each of the two rows and average vertically.
884 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
885 : MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
886 : "pavgb %%xmm7,%%xmm0 \n"
887 : "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
888 : MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
889 : "pavgb %%xmm7,%%xmm1 \n"
890 : "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
891 : MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
892 : "pavgb %%xmm7,%%xmm2 \n"
893 : "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
894 : MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
895 : "pavgb %%xmm7,%%xmm6 \n"
896 :
897 : "lea " MEMLEA(0x40,0) ",%0 \n"
// Gather even/odd pixel pairs and average horizontally (2x2 box filter).
898 : "movdqa %%xmm0,%%xmm7 \n"
899 : "shufps $0x88,%%xmm1,%%xmm0 \n"
900 : "shufps $0xdd,%%xmm1,%%xmm7 \n"
901 : "pavgb %%xmm7,%%xmm0 \n"
902 : "movdqa %%xmm2,%%xmm7 \n"
903 : "shufps $0x88,%%xmm6,%%xmm2 \n"
904 : "shufps $0xdd,%%xmm6,%%xmm7 \n"
905 : "pavgb %%xmm7,%%xmm2 \n"
906 : "movdqa %%xmm0,%%xmm1 \n"
907 : "movdqa %%xmm2,%%xmm6 \n"
// U in xmm0/xmm2 (kARGBToU), V in xmm1/xmm6 (kARGBToV).
908 : "pmaddubsw %%xmm4,%%xmm0 \n"
909 : "pmaddubsw %%xmm4,%%xmm2 \n"
910 : "pmaddubsw %%xmm3,%%xmm1 \n"
911 : "pmaddubsw %%xmm3,%%xmm6 \n"
912 : "phaddw %%xmm2,%%xmm0 \n"
913 : "phaddw %%xmm6,%%xmm1 \n"
914 : "psraw $0x8,%%xmm0 \n"
915 : "psraw $0x8,%%xmm1 \n"
916 : "packsswb %%xmm1,%%xmm0 \n"
917 : "paddb %%xmm5,%%xmm0 \n"
// Low 8 bytes -> dst_u, high 8 bytes -> dst_v.
918 : "movlps %%xmm0," MEMACCESS(1) " \n"
919 : MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
920 : "lea " MEMLEA(0x8,1) ",%1 \n"
921 : "sub $0x10,%3 \n"
922 : "jg 1b \n"
923 : : "+r"(src_argb0), // %0
924 : "+r"(dst_u), // %1
925 : "+r"(dst_v), // %2
926 : "+rm"(width) // %3
927 0 : : "r"((intptr_t)(src_stride_argb)), // %4
928 : "m"(kARGBToV), // %5
929 : "m"(kARGBToU), // %6
930 : "m"(kAddUV128) // %7
931 : : "memory", "cc", NACL_R14
932 : "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
933 0 : );
934 0 : }
935 : #endif // HAS_ARGBTOUVROW_SSSE3
936 :
937 : #ifdef HAS_ARGBTOUVROW_AVX2
938 : // vpshufb for vphaddw + vpackuswb packed to shorts.
// In-lane byte shuffle that reorders the packed U/V bytes after the
// lane-local mutations of vphaddw and vpacksswb (same 16-byte pattern in
// both halves, as vpshufb operates per 128-bit lane).
939 : static const lvec8 kShufARGBToUV_AVX = {
940 : 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
941 : 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
// AVX2 version of ARGBToUVRow: 32 ARGB pixels from two rows -> 16 U and
// 16 V values per iteration.  Same 2x2 box filter (vpavgb vertical, then
// vshufps+vpavgb horizontal), weights applied with vpmaddubsw, and the
// vpermq/vpshufb pair undoes the cross-lane mutation before the split
// store (low 128 bits -> dst_u, high -> dst_v via vextractf128).
942 0 : void ARGBToUVRow_AVX2(const uint8* src_argb0,
943 : int src_stride_argb,
944 : uint8* dst_u,
945 : uint8* dst_v,
946 : int width) {
947 : asm volatile (
// ymm5 = +128 bias, ymm6 = V weights, ymm7 = U weights.
948 : "vbroadcastf128 %5,%%ymm5 \n"
949 : "vbroadcastf128 %6,%%ymm6 \n"
950 : "vbroadcastf128 %7,%%ymm7 \n"
951 : "sub %1,%2 \n"
952 :
953 : LABELALIGN
954 : "1: \n"
955 : "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
956 : "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
957 : "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
958 : "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
959 : VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
960 : VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
961 : VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
962 : VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
963 : "lea " MEMLEA(0x80,0) ",%0 \n"
964 : "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
965 : "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
966 : "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
967 : "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
968 : "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
969 : "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
970 :
971 : "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
972 : "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
973 : "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
974 : "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
975 : "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
976 : "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
977 : "vpsraw $0x8,%%ymm1,%%ymm1 \n"
978 : "vpsraw $0x8,%%ymm0,%%ymm0 \n"
979 : "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
980 : "vpermq $0xd8,%%ymm0,%%ymm0 \n"
981 : "vpshufb %8,%%ymm0,%%ymm0 \n"
982 : "vpaddb %%ymm5,%%ymm0,%%ymm0 \n"
983 :
984 : "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
985 : VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
986 : "lea " MEMLEA(0x10,1) ",%1 \n"
987 : "sub $0x20,%3 \n"
988 : "jg 1b \n"
989 : "vzeroupper \n"
990 : : "+r"(src_argb0), // %0
991 : "+r"(dst_u), // %1
992 : "+r"(dst_v), // %2
993 : "+rm"(width) // %3
994 0 : : "r"((intptr_t)(src_stride_argb)), // %4
995 : "m"(kAddUV128), // %5
996 : "m"(kARGBToV), // %6
997 : "m"(kARGBToU), // %7
998 : "m"(kShufARGBToUV_AVX) // %8
999 : : "memory", "cc", NACL_R14
1000 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
1001 0 : );
1002 0 : }
1003 : #endif // HAS_ARGBTOUVROW_AVX2
1004 :
1005 : #ifdef HAS_ARGBTOUVJROW_AVX2
// Convert ARGB to subsampled U and V planes using the JPeg (full-range)
// coefficients kARGBToUJ/kARGBToVJ, AVX2 version.  Structure matches
// ARGBToUVRow_AVX2; the difference is the constants and that kAddUVJ128
// is added as words before the shift ("vpaddw" below) for rounding,
// instead of a byte bias after packing.
void ARGBToUVJRow_AVX2(const uint8* src_argb0,
                       int src_stride_argb,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
  asm volatile (
    "vbroadcastf128 %5,%%ymm5                  \n"
    "vbroadcastf128 %6,%%ymm6                  \n"
    "vbroadcastf128 %7,%%ymm7                  \n"
    // dst_v becomes an offset from dst_u.
    "sub        %1,%2                          \n"

    LABELALIGN
  "1:                                          \n"
    // 2x2 box average of 32 pixels across two rows.
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
    "lea        " MEMLEA(0x80,0) ",%0          \n"
    "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
    "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
    "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
    "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
    "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
    "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"

    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
    "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
    "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
    // Rounding bias added in the 16-bit domain before the arithmetic shift.
    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm5,%%ymm1,%%ymm1           \n"
    "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
    "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
    "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vpshufb    %8,%%ymm0,%%ymm0               \n"

    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
    "lea        " MEMLEA(0x10,1) ",%1          \n"
    "sub        $0x20,%3                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb)), // %4
    "m"(kAddUVJ128),  // %5
    "m"(kARGBToVJ),  // %6
    "m"(kARGBToUJ),  // %7
    "m"(kShufARGBToUV_AVX)  // %8
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
1068 : #endif // HAS_ARGBTOUVJROW_AVX2
1069 :
1070 : #ifdef HAS_ARGBTOUVJROW_SSSE3
1071 0 : void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
1072 : int src_stride_argb,
1073 : uint8* dst_u,
1074 : uint8* dst_v,
1075 : int width) {
1076 : asm volatile (
1077 : "movdqa %5,%%xmm3 \n"
1078 : "movdqa %6,%%xmm4 \n"
1079 : "movdqa %7,%%xmm5 \n"
1080 : "sub %1,%2 \n"
1081 :
1082 : LABELALIGN
1083 : "1: \n"
1084 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1085 : MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
1086 : "pavgb %%xmm7,%%xmm0 \n"
1087 : "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1088 : MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
1089 : "pavgb %%xmm7,%%xmm1 \n"
1090 : "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1091 : MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
1092 : "pavgb %%xmm7,%%xmm2 \n"
1093 : "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1094 : MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
1095 : "pavgb %%xmm7,%%xmm6 \n"
1096 :
1097 : "lea " MEMLEA(0x40,0) ",%0 \n"
1098 : "movdqa %%xmm0,%%xmm7 \n"
1099 : "shufps $0x88,%%xmm1,%%xmm0 \n"
1100 : "shufps $0xdd,%%xmm1,%%xmm7 \n"
1101 : "pavgb %%xmm7,%%xmm0 \n"
1102 : "movdqa %%xmm2,%%xmm7 \n"
1103 : "shufps $0x88,%%xmm6,%%xmm2 \n"
1104 : "shufps $0xdd,%%xmm6,%%xmm7 \n"
1105 : "pavgb %%xmm7,%%xmm2 \n"
1106 : "movdqa %%xmm0,%%xmm1 \n"
1107 : "movdqa %%xmm2,%%xmm6 \n"
1108 : "pmaddubsw %%xmm4,%%xmm0 \n"
1109 : "pmaddubsw %%xmm4,%%xmm2 \n"
1110 : "pmaddubsw %%xmm3,%%xmm1 \n"
1111 : "pmaddubsw %%xmm3,%%xmm6 \n"
1112 : "phaddw %%xmm2,%%xmm0 \n"
1113 : "phaddw %%xmm6,%%xmm1 \n"
1114 : "paddw %%xmm5,%%xmm0 \n"
1115 : "paddw %%xmm5,%%xmm1 \n"
1116 : "psraw $0x8,%%xmm0 \n"
1117 : "psraw $0x8,%%xmm1 \n"
1118 : "packsswb %%xmm1,%%xmm0 \n"
1119 : "movlps %%xmm0," MEMACCESS(1) " \n"
1120 : MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
1121 : "lea " MEMLEA(0x8,1) ",%1 \n"
1122 : "sub $0x10,%3 \n"
1123 : "jg 1b \n"
1124 : : "+r"(src_argb0), // %0
1125 : "+r"(dst_u), // %1
1126 : "+r"(dst_v), // %2
1127 : "+rm"(width) // %3
1128 0 : : "r"((intptr_t)(src_stride_argb)), // %4
1129 : "m"(kARGBToVJ), // %5
1130 : "m"(kARGBToUJ), // %6
1131 : "m"(kAddUVJ128) // %7
1132 : : "memory", "cc", NACL_R14
1133 : "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1134 0 : );
1135 0 : }
1136 : #endif // HAS_ARGBTOUVJROW_SSSE3
1137 :
1138 : #ifdef HAS_ARGBTOUV444ROW_SSSE3
1139 0 : void ARGBToUV444Row_SSSE3(const uint8* src_argb,
1140 : uint8* dst_u,
1141 : uint8* dst_v,
1142 : int width) {
1143 : asm volatile (
1144 : "movdqa %4,%%xmm3 \n"
1145 : "movdqa %5,%%xmm4 \n"
1146 : "movdqa %6,%%xmm5 \n"
1147 : "sub %1,%2 \n"
1148 :
1149 : LABELALIGN
1150 : "1: \n"
1151 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1152 : "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1153 : "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1154 : "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1155 : "pmaddubsw %%xmm4,%%xmm0 \n"
1156 : "pmaddubsw %%xmm4,%%xmm1 \n"
1157 : "pmaddubsw %%xmm4,%%xmm2 \n"
1158 : "pmaddubsw %%xmm4,%%xmm6 \n"
1159 : "phaddw %%xmm1,%%xmm0 \n"
1160 : "phaddw %%xmm6,%%xmm2 \n"
1161 : "psraw $0x8,%%xmm0 \n"
1162 : "psraw $0x8,%%xmm2 \n"
1163 : "packsswb %%xmm2,%%xmm0 \n"
1164 : "paddb %%xmm5,%%xmm0 \n"
1165 : "movdqu %%xmm0," MEMACCESS(1) " \n"
1166 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1167 : "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1168 : "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1169 : "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1170 : "pmaddubsw %%xmm3,%%xmm0 \n"
1171 : "pmaddubsw %%xmm3,%%xmm1 \n"
1172 : "pmaddubsw %%xmm3,%%xmm2 \n"
1173 : "pmaddubsw %%xmm3,%%xmm6 \n"
1174 : "phaddw %%xmm1,%%xmm0 \n"
1175 : "phaddw %%xmm6,%%xmm2 \n"
1176 : "psraw $0x8,%%xmm0 \n"
1177 : "psraw $0x8,%%xmm2 \n"
1178 : "packsswb %%xmm2,%%xmm0 \n"
1179 : "paddb %%xmm5,%%xmm0 \n"
1180 : "lea " MEMLEA(0x40,0) ",%0 \n"
1181 : MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1)
1182 : "lea " MEMLEA(0x10,1) ",%1 \n"
1183 : "sub $0x10,%3 \n"
1184 : "jg 1b \n"
1185 : : "+r"(src_argb), // %0
1186 : "+r"(dst_u), // %1
1187 : "+r"(dst_v), // %2
1188 : "+rm"(width) // %3
1189 : : "m"(kARGBToV), // %4
1190 : "m"(kARGBToU), // %5
1191 : "m"(kAddUV128) // %6
1192 : : "memory", "cc", NACL_R14
1193 : "xmm0", "xmm1", "xmm2", "xmm6"
1194 0 : );
1195 0 : }
1196 : #endif // HAS_ARGBTOUV444ROW_SSSE3
1197 :
// Convert BGRA pixels to luma (Y) bytes using kBGRAToY coefficients and the
// kAddY16 bias.  Each iteration reads 0x40 bytes (16 pixels) and writes 16
// Y bytes; width counts pixels ("sub $0x10,%2").
void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    // Weighted sum of channels per pixel, then horizontal add to 16-bit Y.
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
1232 :
1233 0 : void BGRAToUVRow_SSSE3(const uint8* src_bgra0,
1234 : int src_stride_bgra,
1235 : uint8* dst_u,
1236 : uint8* dst_v,
1237 : int width) {
1238 : asm volatile (
1239 : "movdqa %5,%%xmm3 \n"
1240 : "movdqa %6,%%xmm4 \n"
1241 : "movdqa %7,%%xmm5 \n"
1242 : "sub %1,%2 \n"
1243 :
1244 : LABELALIGN
1245 : "1: \n"
1246 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1247 : MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
1248 : "pavgb %%xmm7,%%xmm0 \n"
1249 : "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1250 : MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
1251 : "pavgb %%xmm7,%%xmm1 \n"
1252 : "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1253 : MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
1254 : "pavgb %%xmm7,%%xmm2 \n"
1255 : "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1256 : MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
1257 : "pavgb %%xmm7,%%xmm6 \n"
1258 :
1259 : "lea " MEMLEA(0x40,0) ",%0 \n"
1260 : "movdqa %%xmm0,%%xmm7 \n"
1261 : "shufps $0x88,%%xmm1,%%xmm0 \n"
1262 : "shufps $0xdd,%%xmm1,%%xmm7 \n"
1263 : "pavgb %%xmm7,%%xmm0 \n"
1264 : "movdqa %%xmm2,%%xmm7 \n"
1265 : "shufps $0x88,%%xmm6,%%xmm2 \n"
1266 : "shufps $0xdd,%%xmm6,%%xmm7 \n"
1267 : "pavgb %%xmm7,%%xmm2 \n"
1268 : "movdqa %%xmm0,%%xmm1 \n"
1269 : "movdqa %%xmm2,%%xmm6 \n"
1270 : "pmaddubsw %%xmm4,%%xmm0 \n"
1271 : "pmaddubsw %%xmm4,%%xmm2 \n"
1272 : "pmaddubsw %%xmm3,%%xmm1 \n"
1273 : "pmaddubsw %%xmm3,%%xmm6 \n"
1274 : "phaddw %%xmm2,%%xmm0 \n"
1275 : "phaddw %%xmm6,%%xmm1 \n"
1276 : "psraw $0x8,%%xmm0 \n"
1277 : "psraw $0x8,%%xmm1 \n"
1278 : "packsswb %%xmm1,%%xmm0 \n"
1279 : "paddb %%xmm5,%%xmm0 \n"
1280 : "movlps %%xmm0," MEMACCESS(1) " \n"
1281 : MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
1282 : "lea " MEMLEA(0x8,1) ",%1 \n"
1283 : "sub $0x10,%3 \n"
1284 : "jg 1b \n"
1285 : : "+r"(src_bgra0), // %0
1286 : "+r"(dst_u), // %1
1287 : "+r"(dst_v), // %2
1288 : "+rm"(width) // %3
1289 0 : : "r"((intptr_t)(src_stride_bgra)), // %4
1290 : "m"(kBGRAToV), // %5
1291 : "m"(kBGRAToU), // %6
1292 : "m"(kAddUV128) // %7
1293 : : "memory", "cc", NACL_R14
1294 : "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1295 0 : );
1296 0 : }
1297 :
// Convert ABGR pixels to luma (Y) bytes using kABGRToY coefficients and the
// kAddY16 bias.  Same structure as BGRAToYRow_SSSE3; 16 pixels per loop.
void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    // Weighted sum of channels per pixel, then horizontal add to 16-bit Y.
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
1332 :
// Convert RGBA pixels to luma (Y) bytes using kRGBAToY coefficients and the
// kAddY16 bias.  Same structure as BGRAToYRow_SSSE3; 16 pixels per loop.
void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"
    "movdqa    %3,%%xmm4                       \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    // Weighted sum of channels per pixel, then horizontal add to 16-bit Y.
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_rgba),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)      // %2
  : "m"(kRGBAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
1367 :
1368 0 : void ABGRToUVRow_SSSE3(const uint8* src_abgr0,
1369 : int src_stride_abgr,
1370 : uint8* dst_u,
1371 : uint8* dst_v,
1372 : int width) {
1373 : asm volatile (
1374 : "movdqa %5,%%xmm3 \n"
1375 : "movdqa %6,%%xmm4 \n"
1376 : "movdqa %7,%%xmm5 \n"
1377 : "sub %1,%2 \n"
1378 :
1379 : LABELALIGN
1380 : "1: \n"
1381 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1382 : MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
1383 : "pavgb %%xmm7,%%xmm0 \n"
1384 : "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1385 : MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
1386 : "pavgb %%xmm7,%%xmm1 \n"
1387 : "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1388 : MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
1389 : "pavgb %%xmm7,%%xmm2 \n"
1390 : "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1391 : MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
1392 : "pavgb %%xmm7,%%xmm6 \n"
1393 :
1394 : "lea " MEMLEA(0x40,0) ",%0 \n"
1395 : "movdqa %%xmm0,%%xmm7 \n"
1396 : "shufps $0x88,%%xmm1,%%xmm0 \n"
1397 : "shufps $0xdd,%%xmm1,%%xmm7 \n"
1398 : "pavgb %%xmm7,%%xmm0 \n"
1399 : "movdqa %%xmm2,%%xmm7 \n"
1400 : "shufps $0x88,%%xmm6,%%xmm2 \n"
1401 : "shufps $0xdd,%%xmm6,%%xmm7 \n"
1402 : "pavgb %%xmm7,%%xmm2 \n"
1403 : "movdqa %%xmm0,%%xmm1 \n"
1404 : "movdqa %%xmm2,%%xmm6 \n"
1405 : "pmaddubsw %%xmm4,%%xmm0 \n"
1406 : "pmaddubsw %%xmm4,%%xmm2 \n"
1407 : "pmaddubsw %%xmm3,%%xmm1 \n"
1408 : "pmaddubsw %%xmm3,%%xmm6 \n"
1409 : "phaddw %%xmm2,%%xmm0 \n"
1410 : "phaddw %%xmm6,%%xmm1 \n"
1411 : "psraw $0x8,%%xmm0 \n"
1412 : "psraw $0x8,%%xmm1 \n"
1413 : "packsswb %%xmm1,%%xmm0 \n"
1414 : "paddb %%xmm5,%%xmm0 \n"
1415 : "movlps %%xmm0," MEMACCESS(1) " \n"
1416 : MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
1417 : "lea " MEMLEA(0x8,1) ",%1 \n"
1418 : "sub $0x10,%3 \n"
1419 : "jg 1b \n"
1420 : : "+r"(src_abgr0), // %0
1421 : "+r"(dst_u), // %1
1422 : "+r"(dst_v), // %2
1423 : "+rm"(width) // %3
1424 0 : : "r"((intptr_t)(src_stride_abgr)), // %4
1425 : "m"(kABGRToV), // %5
1426 : "m"(kABGRToU), // %6
1427 : "m"(kAddUV128) // %7
1428 : : "memory", "cc", NACL_R14
1429 : "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1430 0 : );
1431 0 : }
1432 :
1433 0 : void RGBAToUVRow_SSSE3(const uint8* src_rgba0,
1434 : int src_stride_rgba,
1435 : uint8* dst_u,
1436 : uint8* dst_v,
1437 : int width) {
1438 : asm volatile (
1439 : "movdqa %5,%%xmm3 \n"
1440 : "movdqa %6,%%xmm4 \n"
1441 : "movdqa %7,%%xmm5 \n"
1442 : "sub %1,%2 \n"
1443 :
1444 : LABELALIGN
1445 : "1: \n"
1446 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1447 : MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
1448 : "pavgb %%xmm7,%%xmm0 \n"
1449 : "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1450 : MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
1451 : "pavgb %%xmm7,%%xmm1 \n"
1452 : "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
1453 : MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
1454 : "pavgb %%xmm7,%%xmm2 \n"
1455 : "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
1456 : MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
1457 : "pavgb %%xmm7,%%xmm6 \n"
1458 :
1459 : "lea " MEMLEA(0x40,0) ",%0 \n"
1460 : "movdqa %%xmm0,%%xmm7 \n"
1461 : "shufps $0x88,%%xmm1,%%xmm0 \n"
1462 : "shufps $0xdd,%%xmm1,%%xmm7 \n"
1463 : "pavgb %%xmm7,%%xmm0 \n"
1464 : "movdqa %%xmm2,%%xmm7 \n"
1465 : "shufps $0x88,%%xmm6,%%xmm2 \n"
1466 : "shufps $0xdd,%%xmm6,%%xmm7 \n"
1467 : "pavgb %%xmm7,%%xmm2 \n"
1468 : "movdqa %%xmm0,%%xmm1 \n"
1469 : "movdqa %%xmm2,%%xmm6 \n"
1470 : "pmaddubsw %%xmm4,%%xmm0 \n"
1471 : "pmaddubsw %%xmm4,%%xmm2 \n"
1472 : "pmaddubsw %%xmm3,%%xmm1 \n"
1473 : "pmaddubsw %%xmm3,%%xmm6 \n"
1474 : "phaddw %%xmm2,%%xmm0 \n"
1475 : "phaddw %%xmm6,%%xmm1 \n"
1476 : "psraw $0x8,%%xmm0 \n"
1477 : "psraw $0x8,%%xmm1 \n"
1478 : "packsswb %%xmm1,%%xmm0 \n"
1479 : "paddb %%xmm5,%%xmm0 \n"
1480 : "movlps %%xmm0," MEMACCESS(1) " \n"
1481 : MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
1482 : "lea " MEMLEA(0x8,1) ",%1 \n"
1483 : "sub $0x10,%3 \n"
1484 : "jg 1b \n"
1485 : : "+r"(src_rgba0), // %0
1486 : "+r"(dst_u), // %1
1487 : "+r"(dst_v), // %2
1488 : "+rm"(width) // %3
1489 0 : : "r"((intptr_t)(src_stride_rgba)), // %4
1490 : "m"(kRGBAToV), // %5
1491 : "m"(kRGBAToU), // %6
1492 : "m"(kAddUV128) // %7
1493 : : "memory", "cc", NACL_R14
1494 : "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1495 0 : );
1496 0 : }
1497 :
1498 : #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
1499 :
// The READ* macros below load one batch of source pixels for the YUV->RGB
// row converters.  All of them leave interleaved U/V bytes in xmm0 and Y
// duplicated to 16 bits in xmm4 (as expected by YUVTORGB), and advance the
// source pointer operand(s).  Note: callers first do
// "sub %[u_buf],%[v_buf]" so v_buf is addressed as an offset from u_buf.

// Read 8 UV from 444 (one U and one V per pixel).
#define READYUV444                                                             \
  "movq       " MEMACCESS([u_buf]) ",%%xmm0                     \n"            \
  MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1)                              \
  "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                 \n"            \
  "punpcklbw  %%xmm1,%%xmm0                                     \n"            \
  "movq       " MEMACCESS([y_buf]) ",%%xmm4                     \n"            \
  "punpcklbw  %%xmm4,%%xmm4                                     \n"            \
  "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]                 \n"

// Read 4 UV from 422, upsample to 8 UV (each UV pair shared by 2 pixels).
#define READYUV422                                                             \
  "movd       " MEMACCESS([u_buf]) ",%%xmm0                     \n"            \
  MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                              \
  "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]                 \n"            \
  "punpcklbw  %%xmm1,%%xmm0                                     \n"            \
  "punpcklwd  %%xmm0,%%xmm0                                     \n"            \
  "movq       " MEMACCESS([y_buf]) ",%%xmm4                     \n"            \
  "punpcklbw  %%xmm4,%%xmm4                                     \n"            \
  "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]                 \n"

// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha bytes loaded into xmm5.
#define READYUVA422                                                            \
  "movd       " MEMACCESS([u_buf]) ",%%xmm0                     \n"            \
  MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                              \
  "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]                 \n"            \
  "punpcklbw  %%xmm1,%%xmm0                                     \n"            \
  "punpcklwd  %%xmm0,%%xmm0                                     \n"            \
  "movq       " MEMACCESS([y_buf]) ",%%xmm4                     \n"            \
  "punpcklbw  %%xmm4,%%xmm4                                     \n"            \
  "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]                 \n"            \
  "movq       " MEMACCESS([a_buf]) ",%%xmm5                     \n"            \
  "lea        " MEMLEA(0x8, [a_buf]) ",%[a_buf]                 \n"

// Read 4 UV from NV12 (interleaved UV plane), upsample to 8 UV.
#define READNV12                                                               \
  "movq       " MEMACCESS([uv_buf]) ",%%xmm0                    \n"            \
  "lea        " MEMLEA(0x8, [uv_buf]) ",%[uv_buf]               \n"            \
  "punpcklwd  %%xmm0,%%xmm0                                     \n"            \
  "movq       " MEMACCESS([y_buf]) ",%%xmm4                     \n"            \
  "punpcklbw  %%xmm4,%%xmm4                                     \n"            \
  "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]                 \n"

// Read 4 VU from NV21, upsample to 8 UV.  pshufb with kShuffleNV21
// (defined elsewhere) reorders the VU bytes into the UV layout.
#define READNV21                                                               \
  "movq       " MEMACCESS([vu_buf]) ",%%xmm0                    \n"            \
  "lea        " MEMLEA(0x8, [vu_buf]) ",%[vu_buf]               \n"            \
  "pshufb     %[kShuffleNV21], %%xmm0                           \n"            \
  "movq       " MEMACCESS([y_buf]) ",%%xmm4                     \n"            \
  "punpcklbw  %%xmm4,%%xmm4                                     \n"            \
  "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]                 \n"

// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.  The packed YUY2 block is
// loaded twice and split into Y (xmm4) and UV (xmm0) via pshufb masks.
#define READYUY2                                                               \
  "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm4                  \n"            \
  "pshufb     %[kShuffleYUY2Y], %%xmm4                          \n"            \
  "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm0                  \n"            \
  "pshufb     %[kShuffleYUY2UV], %%xmm0                         \n"            \
  "lea        " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf]          \n"

// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.  Same two-load/pshufb
// scheme as READYUY2 for the UYVY byte order.
#define READUYVY                                                               \
  "movdqu     " MEMACCESS([uyvy_buf]) ",%%xmm4                  \n"            \
  "pshufb     %[kShuffleUYVYY], %%xmm4                          \n"            \
  "movdqu     " MEMACCESS([uyvy_buf]) ",%%xmm0                  \n"            \
  "pshufb     %[kShuffleUYVYUV], %%xmm0                         \n"            \
  "lea        " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf]          \n"
1567 :
// YUVTORGB converts 8 pixels (UV in xmm0, Y in xmm4, from a READ* macro)
// to packed B/G/R bytes in xmm0/xmm1/xmm2.  On x86-64 the conversion
// constants are preloaded once into xmm8-xmm14 by YUVTORGB_SETUP; on
// 32-bit x86 (the #else branch) only 8 xmm registers exist, so the
// constants are re-read from the yuvconstants table on every use and
// YUVTORGB_SETUP/YUVTORGB_REGS are empty.
#if defined(__x86_64__)
#define YUVTORGB_SETUP(yuvconstants)                                           \
  "movdqa     " MEMACCESS([yuvconstants]) ",%%xmm8            \n"              \
  "movdqa     " MEMACCESS2(32, [yuvconstants]) ",%%xmm9       \n"              \
  "movdqa     " MEMACCESS2(64, [yuvconstants]) ",%%xmm10      \n"              \
  "movdqa     " MEMACCESS2(96, [yuvconstants]) ",%%xmm11      \n"              \
  "movdqa     " MEMACCESS2(128, [yuvconstants]) ",%%xmm12     \n"              \
  "movdqa     " MEMACCESS2(160, [yuvconstants]) ",%%xmm13     \n"              \
  "movdqa     " MEMACCESS2(192, [yuvconstants]) ",%%xmm14     \n"
// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB(yuvconstants)                                                 \
  "movdqa     %%xmm0,%%xmm1                                     \n"            \
  "movdqa     %%xmm0,%%xmm2                                     \n"            \
  "movdqa     %%xmm0,%%xmm3                                     \n"            \
  "movdqa     %%xmm11,%%xmm0                                    \n"            \
  "pmaddubsw  %%xmm8,%%xmm1                                     \n"            \
  "psubw      %%xmm1,%%xmm0                                     \n"            \
  "movdqa     %%xmm12,%%xmm1                                    \n"            \
  "pmaddubsw  %%xmm9,%%xmm2                                     \n"            \
  "psubw      %%xmm2,%%xmm1                                     \n"            \
  "movdqa     %%xmm13,%%xmm2                                    \n"            \
  "pmaddubsw  %%xmm10,%%xmm3                                    \n"            \
  "psubw      %%xmm3,%%xmm2                                     \n"            \
  "pmulhuw    %%xmm14,%%xmm4                                    \n"            \
  "paddsw     %%xmm4,%%xmm0                                     \n"            \
  "paddsw     %%xmm4,%%xmm1                                     \n"            \
  "paddsw     %%xmm4,%%xmm2                                     \n"            \
  "psraw      $0x6,%%xmm0                                       \n"            \
  "psraw      $0x6,%%xmm1                                       \n"            \
  "psraw      $0x6,%%xmm2                                       \n"            \
  "packuswb   %%xmm0,%%xmm0                                     \n"            \
  "packuswb   %%xmm1,%%xmm1                                     \n"            \
  "packuswb   %%xmm2,%%xmm2                                     \n"
#define YUVTORGB_REGS \
  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",

#else
#define YUVTORGB_SETUP(yuvconstants)
// Convert 8 pixels: 8 UV and 8 Y
#define YUVTORGB(yuvconstants)                                                 \
  "movdqa     %%xmm0,%%xmm1                                     \n"            \
  "movdqa     %%xmm0,%%xmm2                                     \n"            \
  "movdqa     %%xmm0,%%xmm3                                     \n"            \
  "movdqa     " MEMACCESS2(96, [yuvconstants]) ",%%xmm0         \n"            \
  "pmaddubsw  " MEMACCESS([yuvconstants]) ",%%xmm1              \n"            \
  "psubw      %%xmm1,%%xmm0                                     \n"            \
  "movdqa     " MEMACCESS2(128, [yuvconstants]) ",%%xmm1        \n"            \
  "pmaddubsw  " MEMACCESS2(32, [yuvconstants]) ",%%xmm2         \n"            \
  "psubw      %%xmm2,%%xmm1                                     \n"            \
  "movdqa     " MEMACCESS2(160, [yuvconstants]) ",%%xmm2        \n"            \
  "pmaddubsw  " MEMACCESS2(64, [yuvconstants]) ",%%xmm3         \n"            \
  "psubw      %%xmm3,%%xmm2                                     \n"            \
  "pmulhuw    " MEMACCESS2(192, [yuvconstants]) ",%%xmm4        \n"            \
  "paddsw     %%xmm4,%%xmm0                                     \n"            \
  "paddsw     %%xmm4,%%xmm1                                     \n"            \
  "paddsw     %%xmm4,%%xmm2                                     \n"            \
  "psraw      $0x6,%%xmm0                                       \n"            \
  "psraw      $0x6,%%xmm1                                       \n"            \
  "psraw      $0x6,%%xmm2                                       \n"            \
  "packuswb   %%xmm0,%%xmm0                                     \n"            \
  "packuswb   %%xmm1,%%xmm1                                     \n"            \
  "packuswb   %%xmm2,%%xmm2                                     \n"
#define YUVTORGB_REGS
#endif
1632 :
// Store 8 ARGB values.  Interleaves B (xmm0), G (xmm1), R (xmm2) and alpha
// (xmm5, set by the caller) into 32 bytes at dst_argb and advances it.
#define STOREARGB                                                              \
  "punpcklbw  %%xmm1,%%xmm0                                      \n"           \
  "punpcklbw  %%xmm5,%%xmm2                                      \n"           \
  "movdqa     %%xmm0,%%xmm1                                      \n"           \
  "punpcklwd  %%xmm2,%%xmm0                                      \n"           \
  "punpckhwd  %%xmm2,%%xmm1                                      \n"           \
  "movdqu     %%xmm0," MEMACCESS([dst_argb]) "                   \n"           \
  "movdqu     %%xmm1," MEMACCESS2(0x10, [dst_argb]) "            \n"           \
  "lea        " MEMLEA(0x20, [dst_argb]) ", %[dst_argb]          \n"

// Store 8 RGBA values.  Regenerates the 0xff alpha in xmm5 itself
// (pcmpeqb), then interleaves channels in RGBA byte order and advances
// dst_rgba by 32 bytes.
#define STORERGBA                                                              \
  "pcmpeqb   %%xmm5,%%xmm5                                       \n"           \
  "punpcklbw %%xmm2,%%xmm1                                       \n"           \
  "punpcklbw %%xmm0,%%xmm5                                       \n"           \
  "movdqa    %%xmm5,%%xmm0                                       \n"           \
  "punpcklwd %%xmm1,%%xmm5                                       \n"           \
  "punpckhwd %%xmm1,%%xmm0                                       \n"           \
  "movdqu    %%xmm5," MEMACCESS([dst_rgba]) "                    \n"           \
  "movdqu    %%xmm0," MEMACCESS2(0x10, [dst_rgba]) "             \n"           \
  "lea       " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba]            \n"
1655 :
// Convert I444 (full-resolution planar YUV) to ARGB, 8 pixels per loop.
// Alpha is forced to 0xff via pcmpeqb on xmm5; yuvconstants selects the
// colorspace matrix.
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    // v_buf becomes an offset from u_buf (see READYUV444's indexed load).
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"

    LABELALIGN
  "1:                                          \n"
    READYUV444
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
1684 :
// Convert I422 planar YUV to 24-bit RGB24 (no alpha), 8 pixels (0x18
// output bytes) per loop.  The two kShuffleMaskARGBToRGB24* masks drop the
// alpha bytes when repacking the intermediate ARGB layout.
void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* dst_rgb24,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
    "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
    // v_buf becomes an offset from u_buf.
    "sub       %[u_buf],%[v_buf]               \n"

    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB(yuvconstants)
    // Interleave B/G/R, then shuffle the 32 ARGB bytes down to 24 RGB bytes.
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm6,%%xmm1                   \n"
    "palignr   $0xc,%%xmm0,%%xmm1              \n"
    "movq      %%xmm0," MEMACCESS([dst_rgb24]) "\n"
    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
    "lea       " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
    "subl      $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
// On i386 the "subl" above requires width to live in memory.
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
    [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
1730 :
// Convert I422 planar YUV (UV at half horizontal resolution) to ARGB,
// 8 pixels per loop.  Alpha forced to 0xff (pcmpeqb xmm5).
void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    // v_buf becomes an offset from u_buf.
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"

    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
1759 :
1760 : #ifdef HAS_I422ALPHATOARGBROW_SSSE3
// Convert I422 planar YUV plus a separate alpha plane (a_buf) to ARGB,
// 8 pixels per loop.  READYUVA422 loads the 8 alpha bytes into xmm5, which
// STOREARGB interleaves as the A channel (so no pcmpeqb here).
void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
                                     const uint8* u_buf,
                                     const uint8* v_buf,
                                     const uint8* a_buf,
                                     uint8* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    // v_buf becomes an offset from u_buf.
    "sub       %[u_buf],%[v_buf]               \n"

    LABELALIGN
  "1:                                          \n"
    READYUVA422
    YUVTORGB(yuvconstants)
    STOREARGB
    "subl      $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
// On i386 the "subl" above requires width to live in memory.
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
1796 : #endif // HAS_I422ALPHATOARGBROW_SSSE3
1797 :
// Convert NV12 (Y plane + interleaved UV plane) to ARGB, 8 pixels per loop.
// Alpha forced to 0xff (pcmpeqb xmm5).
void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* uv_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb   %%xmm5,%%xmm5                   \n"

    LABELALIGN
  "1:                                          \n"
    READNV12
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
1825 :
// Convert NV21 (Y plane + interleaved VU plane) to ARGB, 8 pixels per loop.
// READNV21 uses the kShuffleNV21 mask to reorder VU bytes; alpha forced to
// 0xff (pcmpeqb xmm5).
void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* vu_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb   %%xmm5,%%xmm5                   \n"

    LABELALIGN
  "1:                                          \n"
    READNV21
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [vu_buf]"+r"(vu_buf),    // %[vu_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleNV21]"m"(kShuffleNV21)
  : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
1854 :
// Convert packed YUY2 to ARGB, 8 pixels (0x10 source bytes) per loop.
// READYUY2 splits Y and UV via the kShuffleYUY2* masks; alpha forced to
// 0xff (pcmpeqb xmm5).
void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb   %%xmm5,%%xmm5                   \n"

    LABELALIGN
  "1:                                          \n"
    READYUY2
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
    [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
  : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
1882 :
// Convert packed UYVY to ARGB, 8 pixels (0x10 source bytes) per loop.
// READUYVY splits Y and UV via the kShuffleUYVY* masks; alpha forced to
// 0xff (pcmpeqb xmm5).
void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb   %%xmm5,%%xmm5                   \n"

    LABELALIGN
  "1:                                          \n"
    READUYVY
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleUYVYY]"m"(kShuffleUYVYY),
    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
  : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
1910 :
// Convert 8 I422 pixels (planar Y, U, V) to 8 RGBA pixels per loop
// iteration.  v_buf is rewritten as an offset from u_buf so one register
// indexes both chroma planes; alpha is 0xff from the all-ones xmm5.
void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_rgba,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"  // v_buf = v_buf - u_buf (offset)
    "pcmpeqb %%xmm5,%%xmm5 \n"  // xmm5 = all ones (alpha 0xff)

    LABELALIGN
    "1: \n"
    READYUV422
    YUVTORGB(yuvconstants)
    STORERGBA
    "sub $0x8,%[width] \n"  // 8 pixels per iteration
    "jg 1b \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
1939 :
1940 : #endif // HAS_I422TOARGBROW_SSSE3
1941 :
// Read 16 UV from 444.  Loads 16 U and 16 V bytes (V addressed as an offset
// from u_buf), interleaves them into ymm0 as U/V pairs, and loads 16 Y bytes
// duplicated into ymm4 for the fixed-point multiply in YUVTORGB_AVX2.
#define READYUV444_AVX2 \
  "vmovdqu " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1) \
  "lea " MEMLEA(0x10, [u_buf]) ",%[u_buf] \n" \
  "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  "vpermq $0xd8,%%ymm1,%%ymm1 \n" \
  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
1954 :
// Read 8 UV from 422, upsample to 16 UV.  Each U/V pair is duplicated
// horizontally (vpunpcklwd with itself) so 8 chroma samples cover 16 Y.
#define READYUV422_AVX2 \
  "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
  "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
  "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
1967 :
// Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.  Same as
// READYUV422_AVX2 but additionally loads 16 alpha bytes into ymm5
// (so the caller must NOT preload ymm5 with a constant alpha).
#define READYUVA422_AVX2 \
  "vmovq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
  MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1) \
  "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
  "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \
  "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \
  "vpermq $0xd8,%%ymm5,%%ymm5 \n" \
  "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n"
1983 :
// Read 8 UV from NV12, upsample to 16 UV.  The UV plane is already
// interleaved, so only word duplication (vpunpcklwd) is needed.
#define READNV12_AVX2 \
  "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
  "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \
  "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \
  "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
1994 :
// Read 8 VU from NV21, upsample to 16 UV.  kShuffleNV21 both swaps the
// V/U byte order and duplicates each pair (replaces the vpunpcklwd used
// in READNV12_AVX2).
#define READNV21_AVX2 \
  "vmovdqu " MEMACCESS([vu_buf]) ",%%xmm0 \n" \
  "lea " MEMLEA(0x10, [vu_buf]) ",%[vu_buf] \n" \
  "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  "vpshufb %[kShuffleNV21], %%ymm0, %%ymm0 \n" \
  "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
  "vpermq $0xd8,%%ymm4,%%ymm4 \n" \
  "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \
  "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
2005 :
// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.  The same 32 packed
// bytes are loaded twice and shuffled into Y (ymm4) and UV (ymm0).
#define READYUY2_AVX2 \
  "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm4 \n" \
  "vpshufb %[kShuffleYUY2Y], %%ymm4, %%ymm4 \n" \
  "vmovdqu " MEMACCESS([yuy2_buf]) ",%%ymm0 \n" \
  "vpshufb %[kShuffleYUY2UV], %%ymm0, %%ymm0 \n" \
  "lea " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf] \n"
2013 :
// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.  Same approach as
// READYUY2_AVX2 but with the UYVY shuffle tables.
#define READUYVY_AVX2 \
  "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm4 \n" \
  "vpshufb %[kShuffleUYVYY], %%ymm4, %%ymm4 \n" \
  "vmovdqu " MEMACCESS([uyvy_buf]) ",%%ymm0 \n" \
  "vpshufb %[kShuffleUYVYUV], %%ymm0, %%ymm0 \n" \
  "lea " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf] \n"
2021 :
2022 : #if defined(__x86_64__)
2023 : #define YUVTORGB_SETUP_AVX2(yuvconstants) \
2024 : "vmovdqa " MEMACCESS([yuvconstants]) ",%%ymm8 \n" \
2025 : "vmovdqa " MEMACCESS2(32, [yuvconstants]) ",%%ymm9 \n" \
2026 : "vmovdqa " MEMACCESS2(64, [yuvconstants]) ",%%ymm10 \n" \
2027 : "vmovdqa " MEMACCESS2(96, [yuvconstants]) ",%%ymm11 \n" \
2028 : "vmovdqa " MEMACCESS2(128, [yuvconstants]) ",%%ymm12 \n" \
2029 : "vmovdqa " MEMACCESS2(160, [yuvconstants]) ",%%ymm13 \n" \
2030 : "vmovdqa " MEMACCESS2(192, [yuvconstants]) ",%%ymm14 \n"
2031 :
2032 : #define YUVTORGB_AVX2(yuvconstants) \
2033 : "vpmaddubsw %%ymm10,%%ymm0,%%ymm2 \n" \
2034 : "vpmaddubsw %%ymm9,%%ymm0,%%ymm1 \n" \
2035 : "vpmaddubsw %%ymm8,%%ymm0,%%ymm0 \n" \
2036 : "vpsubw %%ymm2,%%ymm13,%%ymm2 \n" \
2037 : "vpsubw %%ymm1,%%ymm12,%%ymm1 \n" \
2038 : "vpsubw %%ymm0,%%ymm11,%%ymm0 \n" \
2039 : "vpmulhuw %%ymm14,%%ymm4,%%ymm4 \n" \
2040 : "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
2041 : "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
2042 : "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
2043 : "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
2044 : "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
2045 : "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
2046 : "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
2047 : "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
2048 : "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
2049 :
2050 : #define YUVTORGB_REGS_AVX2 \
2051 : "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
2052 :
2053 : #else // Convert 16 pixels: 16 UV and 16 Y.
2054 :
2055 : #define YUVTORGB_SETUP_AVX2(yuvconstants)
2056 : #define YUVTORGB_AVX2(yuvconstants) \
2057 : "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \
2058 : "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \
2059 : "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \
2060 : "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \
2061 : "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \
2062 : "vmovdqu " MEMACCESS2(128, [yuvconstants]) ",%%ymm3 \n" \
2063 : "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \
2064 : "vmovdqu " MEMACCESS2(96, [yuvconstants]) ",%%ymm3 \n" \
2065 : "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \
2066 : "vpmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4 \n" \
2067 : "vpaddsw %%ymm4,%%ymm0,%%ymm0 \n" \
2068 : "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \
2069 : "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \
2070 : "vpsraw $0x6,%%ymm0,%%ymm0 \n" \
2071 : "vpsraw $0x6,%%ymm1,%%ymm1 \n" \
2072 : "vpsraw $0x6,%%ymm2,%%ymm2 \n" \
2073 : "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \
2074 : "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \
2075 : "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
2076 : #define YUVTORGB_REGS_AVX2
2077 : #endif
2078 :
// Store 16 ARGB values.  Weaves B (ymm0), G (ymm1), R (ymm2) and alpha
// (ymm5) into BGRA byte order and writes 64 bytes to dst_argb.
#define STOREARGB_AVX2 \
  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \
  "vpermq $0xd8,%%ymm0,%%ymm0 \n" \
  "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n" \
  "vpermq $0xd8,%%ymm2,%%ymm2 \n" \
  "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n" \
  "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n" \
  "vmovdqu %%ymm1," MEMACCESS([dst_argb]) " \n" \
  "vmovdqu %%ymm0," MEMACCESS2(0x20, [dst_argb]) " \n" \
  "lea " MEMLEA(0x40, [dst_argb]) ", %[dst_argb] \n"
2090 :
#ifdef HAS_I444TOARGBROW_AVX2
// 16 pixels
// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
// Planar 4:4:4 input: one U and one V per Y, so no chroma upsampling.
void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"  // v_buf = v_buf - u_buf (offset)
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // ymm5 = all ones (alpha 0xff)

    LABELALIGN
    "1: \n"
    READYUV444_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub $0x10,%[width] \n"  // 16 pixels per iteration
    "jg 1b \n"
    "vzeroupper \n"  // avoid AVX-SSE transition penalty
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I444TOARGBROW_AVX2
2124 :
#if defined(HAS_I422TOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"  // v_buf = v_buf - u_buf (offset)
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // ymm5 = all ones (alpha 0xff)

    LABELALIGN
    "1: \n"
    READYUV422_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub $0x10,%[width] \n"  // 16 pixels per iteration
    "jg 1b \n"

    "vzeroupper \n"  // avoid AVX-SSE transition penalty
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I422TOARGBROW_AVX2
2159 :
#if defined(HAS_I422ALPHATOARGBROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
// Note: no vpcmpeqb preload of ymm5 here — READYUVA422_AVX2 fills ymm5 with
// the per-pixel alpha bytes from a_buf instead.
void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    const uint8* a_buf,
                                    uint8* dst_argb,
                                    const struct YuvConstants* yuvconstants,
                                    int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"  // v_buf = v_buf - u_buf (offset)

    LABELALIGN
    "1: \n"
    READYUVA422_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "subl $0x10,%[width] \n"  // 16 pixels; subl as width may be a memory operand on i386
    "jg 1b \n"
    "vzeroupper \n"  // avoid AVX-SSE transition penalty
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_I422ALPHATOARGBROW_AVX2
2200 :
#if defined(HAS_I422TORGBAROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
// Uses an inline RGBA store (alpha first) instead of STOREARGB_AVX2.
void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    "sub %[u_buf],%[v_buf] \n"  // v_buf = v_buf - u_buf (offset)
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // ymm5 = all ones (alpha 0xff)

    LABELALIGN
    "1: \n"
    READYUV422_AVX2
    YUVTORGB_AVX2(yuvconstants)

    // Step 3: Weave into RGBA
    "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"  // GR bytes
    "vpermq $0xd8,%%ymm1,%%ymm1 \n"
    "vpunpcklbw %%ymm0,%%ymm5,%%ymm2 \n"  // AB bytes
    "vpermq $0xd8,%%ymm2,%%ymm2 \n"
    "vpunpcklwd %%ymm1,%%ymm2,%%ymm0 \n"  // ABGR low
    "vpunpckhwd %%ymm1,%%ymm2,%%ymm1 \n"  // ABGR high
    "vmovdqu %%ymm0," MEMACCESS([dst_argb]) "\n"
    "vmovdqu %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
    "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
    "sub $0x10,%[width] \n"  // 16 pixels per iteration
    "jg 1b \n"
    "vzeroupper \n"  // avoid AVX-SSE transition penalty
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_I422TORGBAROW_AVX2
2244 :
#if defined(HAS_NV12TOARGBROW_AVX2)
// 16 pixels.
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// Alpha is forced to 0xff via the all-ones ymm5 from vpcmpeqb.
void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
                               const uint8* uv_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // ymm5 = all ones (alpha 0xff)

    LABELALIGN
    "1: \n"
    READNV12_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub $0x10,%[width] \n"  // 16 pixels per iteration
    "jg 1b \n"
    "vzeroupper \n"  // avoid AVX-SSE transition penalty
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
    // Fixed clobber list: "xmm0" was listed twice; each register once.
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_NV12TOARGBROW_AVX2
2277 :
#if defined(HAS_NV21TOARGBROW_AVX2)
// 16 pixels.
// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
// READNV21_AVX2 swaps VU to UV via kShuffleNV21; alpha is forced to 0xff.
void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,
                               const uint8* vu_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // ymm5 = all ones (alpha 0xff)

    LABELALIGN
    "1: \n"
    READNV21_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub $0x10,%[width] \n"  // 16 pixels per iteration
    "jg 1b \n"
    "vzeroupper \n"  // avoid AVX-SSE transition penalty
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [vu_buf]"+r"(vu_buf),    // %[vu_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleNV21]"m"(kShuffleNV21)
  : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
    // Fixed clobber list: "xmm0" was listed twice; each register once.
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_NV21TOARGBROW_AVX2
2311 :
#if defined(HAS_YUY2TOARGBROW_AVX2)
// 16 pixels.
// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
// Alpha is forced to 0xff via the all-ones ymm5 from vpcmpeqb.
void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // ymm5 = all ones (alpha 0xff)

    LABELALIGN
    "1: \n"
    READYUY2_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub $0x10,%[width] \n"  // 16 pixels per iteration
    "jg 1b \n"
    "vzeroupper \n"  // avoid AVX-SSE transition penalty
  : [yuy2_buf]"+r"(yuy2_buf),  // %[yuy2_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
    [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
  : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
    // Fixed clobber list: "xmm0" was listed twice; each register once.
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_YUY2TOARGBROW_AVX2
2344 :
#if defined(HAS_UYVYTOARGBROW_AVX2)
// 16 pixels.
// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
// Alpha is forced to 0xff via the all-ones ymm5 from vpcmpeqb.
void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"  // ymm5 = all ones (alpha 0xff)

    LABELALIGN
    "1: \n"
    READUYVY_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub $0x10,%[width] \n"  // 16 pixels per iteration
    "jg 1b \n"
    "vzeroupper \n"  // avoid AVX-SSE transition penalty
  : [uyvy_buf]"+r"(uyvy_buf),  // %[uyvy_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleUYVYY]"m"(kShuffleUYVYY),
    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
  : "memory", "cc", YUVTORGB_REGS_AVX2 // Does not use r14.
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_UYVYTOARGBROW_AVX2
2377 :
#ifdef HAS_I400TOARGBROW_SSE2
// Convert 8 grey (Y-only) pixels to 8 ARGB pixels per loop iteration:
// scale Y by 1.164 with a -16 bias (BT.601 video range), replicate the
// result into B, G and R, and OR in an opaque alpha channel.
void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
  asm volatile (
    "mov $0x4a354a35,%%eax \n"  // 4a35 = 18997 = 1.164
    "movd %%eax,%%xmm2 \n"
    "pshufd $0x0,%%xmm2,%%xmm2 \n"
    "mov $0x04880488,%%eax \n"  // 0488 = 1160 = 1.164 * 16
    "movd %%eax,%%xmm3 \n"
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"  // xmm4 = 0xff000000 per pixel (alpha mask)
    "pslld $0x18,%%xmm4 \n"

    LABELALIGN
    "1: \n"
    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "pmulhuw %%xmm2,%%xmm0 \n"
    "psubusw %%xmm3,%%xmm0 \n"  // unsigned saturate clamps at zero
    "psrlw $6, %%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"

    // Step 2: Weave into ARGB
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm1 \n"
    "por %%xmm4,%%xmm0 \n"  // set alpha to 0xff
    "por %%xmm4,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"

    "sub $0x8,%2 \n"  // 8 pixels per iteration
    "jg 1b \n"
  : "+r"(y_buf),     // %0
    "+r"(dst_argb),  // %1
    "+rm"(width)     // %2
  :
  : "memory", "cc", "eax"
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
#endif  // HAS_I400TOARGBROW_SSE2
2423 :
#ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates.
// Same math as I400ToARGBRow_SSE2: G = (y - 16) * 1.164, replicated to BGR
// with opaque alpha.  (The two constant comments were previously swapped.)
void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
  asm volatile (
    "mov $0x4a354a35,%%eax \n"  // 4a35 = 18997 = 1.164
    "vmovd %%eax,%%xmm2 \n"
    "vbroadcastss %%xmm2,%%ymm2 \n"
    "mov $0x4880488,%%eax \n"  // 0488 = 1160 = 1.164 * 16
    "vmovd %%eax,%%xmm3 \n"
    "vbroadcastss %%xmm3,%%ymm3 \n"
    "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"  // ymm4 = 0xff000000 per pixel
    "vpslld $0x18,%%ymm4,%%ymm4 \n"

    LABELALIGN
    "1: \n"
    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
    "vmovdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
    "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n"  // unsigned saturate clamps at zero
    "vpsrlw $0x6,%%ymm0,%%ymm0 \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
    "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n"
    "vpermq $0xd8,%%ymm1,%%ymm1 \n"
    "vpunpcklwd %%ymm1,%%ymm1,%%ymm0 \n"
    "vpunpckhwd %%ymm1,%%ymm1,%%ymm1 \n"
    "vpor %%ymm4,%%ymm0,%%ymm0 \n"  // set alpha to 0xff
    "vpor %%ymm4,%%ymm1,%%ymm1 \n"
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x10,%2 \n"  // 16 pixels per iteration
    "jg 1b \n"
    "vzeroupper \n"  // avoid AVX-SSE transition penalty
  : "+r"(y_buf),     // %0
    "+r"(dst_argb),  // %1
    "+rm"(width)     // %2
  :
  : "memory", "cc", "eax"
  , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
#endif  // HAS_I400TOARGBROW_AVX2
2470 :
#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
static uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
                               7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};

// Horizontally mirror a row of bytes, 16 per loop iteration: reads the
// source from the end backwards (src + width - 16) and byte-reverses each
// 16-byte chunk with pshufb.
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);  // pointer-width for addressing
  asm volatile (
    "movdqa %3,%%xmm5 \n"  // xmm5 = byte-reverse shuffle table

    LABELALIGN
    "1: \n"
    MEMOPREG(movdqu,-0x10,0,2,1,xmm0)           //  movdqu -0x10(%0,%2),%%xmm0
    "pshufb %%xmm5,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"  // 16 bytes per iteration
    "jg 1b \n"
  : "+r"(src),         // %0
    "+r"(dst),         // %1
    "+r"(temp_width)   // %2
  : "m"(kShuffleMirror) // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm5"
  );
}
#endif  // HAS_MIRRORROW_SSSE3
2498 :
#ifdef HAS_MIRRORROW_AVX2
// Horizontally mirror a row of bytes, 32 per loop iteration.  vpshufb
// reverses bytes within each 128-bit lane; vpermq $0x4e then swaps the
// two lanes to complete the 32-byte reversal.
void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);  // pointer-width for addressing
  asm volatile (
    "vbroadcastf128 %3,%%ymm5 \n"  // same 16-byte table in both lanes

    LABELALIGN
    "1: \n"
    MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0)          //  vmovdqu -0x20(%0,%2),%%ymm0
    "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
    "vpermq $0x4e,%%ymm0,%%ymm0 \n"  // swap 128-bit lanes
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"  // 32 bytes per iteration
    "jg 1b \n"
    "vzeroupper \n"  // avoid AVX-SSE transition penalty
  : "+r"(src),         // %0
    "+r"(dst),         // %1
    "+r"(temp_width)   // %2
  : "m"(kShuffleMirror) // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm5"
  );
}
#endif  // HAS_MIRRORROW_AVX2
2524 :
#ifdef HAS_MIRRORUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
static uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
                                 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
// Mirror an interleaved UV row into separate U and V planes, 8 UV pairs
// per loop iteration.  One pshufb reverses and deinterleaves: U bytes land
// in the low 8 bytes (movlpd to dst_u), V in the high 8 (movhpd to dst_v,
// addressed as an offset after "sub %1,%2").
void MirrorUVRow_SSSE3(const uint8* src,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
  intptr_t temp_width = (intptr_t)(width);  // pointer-width for addressing
  asm volatile (
    "movdqa %4,%%xmm1 \n"  // xmm1 = reverse+deinterleave table
    "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n"  // src += width*2 - 16 (last chunk)
    "sub %1,%2 \n"  // dst_v = dst_v - dst_u (offset)

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(-0x10,0) ",%0 \n"
    "pshufb %%xmm1,%%xmm0 \n"
    "movlpd %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movhpd,xmm0,0x00,1,2,1)            //  movhpd %%xmm0,(%1,%2)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $8,%3 \n"  // 8 UV pairs per iteration
    "jg 1b \n"
  : "+r"(src),         // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(temp_width)   // %3
  : "m"(kShuffleMirrorUV) // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1"
  );
}
#endif  // HAS_MIRRORUVROW_SSSE3
2559 :
#ifdef HAS_ARGBMIRRORROW_SSE2

// Horizontally mirror a row of ARGB pixels, 4 per loop iteration: reads
// from the end backwards and reverses pixel order (whole 32-bit pixels,
// not bytes) with pshufd $0x1b.
void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);  // pointer-width for addressing
  asm volatile (
    "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n"  // src += width*4 - 16 (last chunk)

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "pshufd $0x1b,%%xmm0,%%xmm0 \n"  // reverse the 4 dwords
    "lea " MEMLEA(-0x10,0) ",%0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%2 \n"  // 4 pixels per iteration
    "jg 1b \n"
  : "+r"(src),         // %0
    "+r"(dst),         // %1
    "+r"(temp_width)   // %2
  :
  : "memory", "cc"
  , "xmm0"
  );
}
#endif  // HAS_ARGBMIRRORROW_SSE2
2585 :
#ifdef HAS_ARGBMIRRORROW_AVX2
// Shuffle table for reversing the bytes.
static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
// Horizontally mirror a row of ARGB pixels, 8 per loop iteration: a single
// vpermd with the reversed-index table both loads and reverses 8 pixels.
void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);  // pointer-width for addressing
  asm volatile (
    "vmovdqu %3,%%ymm5 \n"  // ymm5 = dword indices 7..0

    LABELALIGN
    "1: \n"
    VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0)     //  vpermd -0x20(%0,%2,4),ymm5,ymm0
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"  // 8 pixels per iteration
    "jg 1b \n"
    "vzeroupper \n"  // avoid AVX-SSE transition penalty
  : "+r"(src),         // %0
    "+r"(dst),         // %1
    "+r"(temp_width)   // %2
  : "m"(kARGBShuffleMirror_AVX2) // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm5"
  );
}
#endif  // HAS_ARGBMIRRORROW_AVX2
2611 :
#ifdef HAS_SPLITUVROW_AVX2
// Deinterleave a UV row into separate U and V planes, 32 UV pairs per
// loop iteration.  Even bytes (U) are isolated with a 0x00ff mask, odd
// bytes (V) by a right shift, then both are repacked to bytes.  dst_v is
// addressed as an offset from dst_u after "sub %1,%2".
void SplitUVRow_AVX2(const uint8* src_uv,
                     uint8* dst_u,
                     uint8* dst_v,
                     int width) {
  asm volatile (
    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
    "vpsrlw $0x8,%%ymm5,%%ymm5 \n"  // ymm5 = 0x00ff per word (low-byte mask)
    "sub %1,%2 \n"  // dst_v = dst_v - dst_u (offset)

    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "vpsrlw $0x8,%%ymm0,%%ymm2 \n"  // V bytes
    "vpsrlw $0x8,%%ymm1,%%ymm3 \n"
    "vpand %%ymm5,%%ymm0,%%ymm0 \n"  // U bytes
    "vpand %%ymm5,%%ymm1,%%ymm1 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"  // undo cross-lane pack interleave
    "vpermq $0xd8,%%ymm2,%%ymm2 \n"
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1)             //  vmovdqu %%ymm2,(%1,%2)
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%3 \n"  // 32 UV pairs per iteration
    "jg 1b \n"
    "vzeroupper \n"  // avoid AVX-SSE transition penalty
  : "+r"(src_uv),  // %0
    "+r"(dst_u),   // %1
    "+r"(dst_v),   // %2
    "+r"(width)    // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_SPLITUVROW_AVX2
2651 :
#ifdef HAS_SPLITUVROW_SSE2
// Deinterleave a UV row into separate U and V planes, 16 UV pairs per
// loop iteration (SSE2 version of SplitUVRow_AVX2): mask for U, shift for
// V, repack to bytes.  dst_v is addressed as an offset from dst_u.
void SplitUVRow_SSE2(const uint8* src_uv,
                     uint8* dst_u,
                     uint8* dst_v,
                     int width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"  // xmm5 = 0x00ff per word (low-byte mask)
    "sub %1,%2 \n"  // dst_v = dst_v - dst_u (offset)

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "pand %%xmm5,%%xmm0 \n"  // U bytes
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "psrlw $0x8,%%xmm2 \n"  // V bytes
    "psrlw $0x8,%%xmm3 \n"
    "packuswb %%xmm3,%%xmm2 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    MEMOPMEM(movdqu,xmm2,0x00,1,2,1)             //  movdqu %%xmm2,(%1,%2)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%3 \n"  // 16 UV pairs per iteration
    "jg 1b \n"
  : "+r"(src_uv),  // %0
    "+r"(dst_u),   // %1
    "+r"(dst_v),   // %2
    "+r"(width)    // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_SPLITUVROW_SSE2
2690 :
#ifdef HAS_MERGEUVROW_AVX2
// Interleave separate U and V planes into a UV row, 32 pairs per loop
// iteration.  src_v is addressed as an offset from src_u; vextractf128
// stores fix up the cross-lane ordering of vpunpck{l,h}bw.
void MergeUVRow_AVX2(const uint8* src_u,
                     const uint8* src_v,
                     uint8* dst_uv,
                     int width) {
  asm volatile (
    "sub %0,%1 \n"  // src_v = src_v - src_u (offset)

    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    MEMOPREG(vmovdqu,0x00,0,1,1,ymm1)             //  vmovdqu (%0,%1,1),%%ymm1
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
    "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
    "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n"
    "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
    "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
    "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
    "lea " MEMLEA(0x40,2) ",%2 \n"
    "sub $0x20,%3 \n"  // 32 UV pairs per iteration
    "jg 1b \n"
    "vzeroupper \n"  // avoid AVX-SSE transition penalty
  : "+r"(src_u),   // %0
    "+r"(src_v),   // %1
    "+r"(dst_uv),  // %2
    "+r"(width)    // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2"
  );
}
#endif  // HAS_MERGEUVROW_AVX2
2724 :
#ifdef HAS_MERGEUVROW_SSE2
// Interleave separate U and V planes into a UV row, 16 pairs per loop
// iteration (SSE2 version of MergeUVRow_AVX2).
void MergeUVRow_SSE2(const uint8* src_u,
                     const uint8* src_v,
                     uint8* dst_uv,
                     int width) {
  asm volatile (
    "sub %0,%1 \n"  // src_v = src_v - src_u (offset)

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1)             //  movdqu (%0,%1,1),%%xmm1
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "punpcklbw %%xmm1,%%xmm0 \n"  // low 8 UV pairs
    "punpckhbw %%xmm1,%%xmm2 \n"  // high 8 UV pairs
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
    "lea " MEMLEA(0x20,2) ",%2 \n"
    "sub $0x10,%3 \n"  // 16 UV pairs per iteration
    "jg 1b \n"
  : "+r"(src_u),   // %0
    "+r"(src_v),   // %1
    "+r"(dst_uv),  // %2
    "+r"(width)    // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2"
  );
}
#endif  // HAS_MERGEUVROW_SSE2
2756 :
#ifdef HAS_COPYROW_SSE2
// Copy a row of bytes, 32 per loop iteration.  Dispatches once up front:
// if both src and dst are 16-byte aligned it uses aligned movdqa (loop 1),
// otherwise unaligned movdqu (loop 2).
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  asm volatile (
    "test $0xf,%0 \n"  // src aligned to 16?
    "jne 2f \n"
    "test $0xf,%1 \n"  // dst aligned to 16?
    "jne 2f \n"

    LABELALIGN
    "1: \n"  // aligned copy path
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "jmp 9f \n"

    LABELALIGN
    "2: \n"  // unaligned copy path
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 2b \n"
    "9: \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(count)  // %2
  :
  : "memory", "cc"
  , "xmm0", "xmm1"
  );
}
#endif  // HAS_COPYROW_SSE2
2797 :
#ifdef HAS_COPYROW_AVX
// Copy a row of bytes, 64 per loop iteration, using unaligned AVX loads
// and stores.
void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x40,%2 \n"  // 64 bytes per iteration
    "jg 1b \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(count)  // %2
  :
  : "memory", "cc"
  , "xmm0", "xmm1"
  );
}
#endif  // HAS_COPYROW_AVX
2820 :
#ifdef HAS_COPYROW_ERMS
// Multiple of 1.
// Copy a row of bytes with "rep movsb" (fast on CPUs with Enhanced REP
// MOVSB).  The string instruction requires src in rSI, dst in rDI and the
// count in rCX — hence the "S", "D" and "c" constraints.
void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
  size_t width_tmp = (size_t)(width);  // rep count must be register-width
  asm volatile("rep movsb " MEMMOVESTRING(0, 1) " \n"
               : "+S"(src),       // %0
                 "+D"(dst),       // %1
                 "+c"(width_tmp)  // %2
               :
               : "memory", "cc");
}
#endif  // HAS_COPYROW_ERMS
2833 :
2834 : #ifdef HAS_ARGBCOPYALPHAROW_SSE2
2835 : // width in pixels
// Copy only the alpha channel of 'width' ARGB pixels from src to dst,
// leaving dst's RGB bytes untouched.  8 pixels (32 bytes) per iteration;
// width is expected to be a multiple of 8.
// xmm0 = per-dword mask 0xFF000000 (alpha), xmm1 = 0x00FFFFFF (RGB).
2836 0 : void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
2837 : asm volatile (
2838 : "pcmpeqb %%xmm0,%%xmm0 \n"
2839 : "pslld $0x18,%%xmm0 \n"
2840 : "pcmpeqb %%xmm1,%%xmm1 \n"
2841 : "psrld $0x8,%%xmm1 \n"
2842 :
2843 : LABELALIGN
2844 : "1: \n"
2845 : "movdqu " MEMACCESS(0) ",%%xmm2 \n"
2846 : "movdqu " MEMACCESS2(0x10,0) ",%%xmm3 \n"
2847 : "lea " MEMLEA(0x20,0) ",%0 \n"
2848 : "movdqu " MEMACCESS(1) ",%%xmm4 \n"
2849 : "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
// Keep src alpha (xmm2/xmm3) and dst RGB (xmm4/xmm5), then merge with por.
2850 : "pand %%xmm0,%%xmm2 \n"
2851 : "pand %%xmm0,%%xmm3 \n"
2852 : "pand %%xmm1,%%xmm4 \n"
2853 : "pand %%xmm1,%%xmm5 \n"
2854 : "por %%xmm4,%%xmm2 \n"
2855 : "por %%xmm5,%%xmm3 \n"
2856 : "movdqu %%xmm2," MEMACCESS(1) " \n"
2857 : "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
2858 : "lea " MEMLEA(0x20,1) ",%1 \n"
2859 : "sub $0x8,%2 \n"
2860 : "jg 1b \n"
2861 : : "+r"(src), // %0
2862 : "+r"(dst), // %1
2863 : "+r"(width) // %2
2864 : :
2865 : : "memory", "cc"
2866 : , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2867 0 : );
2868 0 : }
2869 : #endif // HAS_ARGBCOPYALPHAROW_SSE2
2870 :
2871 : #ifdef HAS_ARGBCOPYALPHAROW_AVX2
2872 : // width in pixels
// AVX2 version of ARGBCopyAlphaRow: copy alpha from src ARGB into dst ARGB,
// 16 pixels (64 bytes) per iteration.  ymm0 = 0x00FFFFFF per dword; since
// vpblendvb selects from its second source where the mask byte's high bit is
// set, the three RGB bytes come from dst memory and the alpha byte from src.
2873 0 : void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
2874 : asm volatile (
2875 : "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
2876 : "vpsrld $0x8,%%ymm0,%%ymm0 \n"
2877 :
2878 : LABELALIGN
2879 : "1: \n"
2880 : "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
2881 : "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n"
2882 : "lea " MEMLEA(0x40,0) ",%0 \n"
2883 : "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
2884 : "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
2885 : "vmovdqu %%ymm1," MEMACCESS(1) " \n"
2886 : "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
2887 : "lea " MEMLEA(0x40,1) ",%1 \n"
2888 : "sub $0x10,%2 \n"
2889 : "jg 1b \n"
2890 : "vzeroupper \n"
2891 : : "+r"(src), // %0
2892 : "+r"(dst), // %1
2893 : "+r"(width) // %2
2894 : :
2895 : : "memory", "cc"
2896 : , "xmm0", "xmm1", "xmm2"
2897 0 : );
2898 0 : }
2899 : #endif // HAS_ARGBCOPYALPHAROW_AVX2
2900 :
2901 : #ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
2902 : // width in pixels
// Extract the alpha byte of each ARGB pixel into a planar buffer dst_a.
// 8 pixels per iteration: psrld moves alpha to the low byte of each dword,
// then packssdw+packuswb narrow 8 dwords to 8 bytes (values are <= 255, so
// the signed dword->word saturation is lossless).
2903 0 : void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
2904 : asm volatile (
2905 : LABELALIGN
2906 : "1: \n"
2907 : "movdqu " MEMACCESS(0) ", %%xmm0 \n"
2908 : "movdqu " MEMACCESS2(0x10, 0) ", %%xmm1 \n"
2909 : "lea " MEMLEA(0x20, 0) ", %0 \n"
2910 : "psrld $0x18, %%xmm0 \n"
2911 : "psrld $0x18, %%xmm1 \n"
2912 : "packssdw %%xmm1, %%xmm0 \n"
2913 : "packuswb %%xmm0, %%xmm0 \n"
2914 : "movq %%xmm0," MEMACCESS(1) " \n"
2915 : "lea " MEMLEA(0x8, 1) ", %1 \n"
2916 : "sub $0x8, %2 \n"
2917 : "jg 1b \n"
2918 : : "+r"(src_argb), // %0
2919 : "+r"(dst_a), // %1
2920 : "+rm"(width) // %2
2921 : :
2922 : : "memory", "cc"
2923 : , "xmm0", "xmm1"
2924 0 : );
2925 0 : }
2926 : #endif // HAS_ARGBEXTRACTALPHAROW_SSE2
2927 :
2928 : #ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
// vpshufb mask: moves each ARGB pixel's alpha (byte offsets 3/7/11/15) to the
// low byte of its 32-bit lane; 128u entries (high bit set) write zero.
// Equivalent to a per-dword logical shift right by 24 — see the comment at
// its use site below.
2929 : static const uvec8 kShuffleAlphaShort_AVX2 = {
2930 :     3u, 128u, 128u, 128u, 7u, 128u, 128u, 128u,
2931 :     11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};
2932 :
// AVX2 alpha extraction: 32 ARGB pixels -> 32 alpha bytes per iteration.
// ymm5 = kShuffleAlphaShort_AVX2 (isolates alpha per dword); ymm4 =
// kPermdARGBToY_AVX (declared elsewhere in this file) restores byte order
// after the lane-interleaving vpackssdw/vpackuswb sequence.
2933 0 : void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width) {
2934 : asm volatile (
2935 : "vmovdqa %3,%%ymm4 \n"
2936 : "vbroadcastf128 %4,%%ymm5 \n"
2937 :
2938 : LABELALIGN
2939 : "1: \n"
2940 : "vmovdqu " MEMACCESS(0) ", %%ymm0 \n"
2941 : "vmovdqu " MEMACCESS2(0x20, 0) ", %%ymm1 \n"
2942 : "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" // vpsrld $0x18, %%ymm0
2943 : "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
2944 : "vmovdqu " MEMACCESS2(0x40, 0) ", %%ymm2 \n"
2945 : "vmovdqu " MEMACCESS2(0x60, 0) ", %%ymm3 \n"
2946 : "lea " MEMLEA(0x80, 0) ", %0 \n"
2947 : "vpackssdw %%ymm1, %%ymm0, %%ymm0 \n" // mutates
2948 : "vpshufb %%ymm5,%%ymm2,%%ymm2 \n"
2949 : "vpshufb %%ymm5,%%ymm3,%%ymm3 \n"
2950 : "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n" // mutates
2951 : "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" // mutates.
2952 : "vpermd %%ymm0,%%ymm4,%%ymm0 \n" // unmutate.
2953 : "vmovdqu %%ymm0," MEMACCESS(1) " \n"
2954 : "lea " MEMLEA(0x20,1) ",%1 \n"
2955 : "sub $0x20, %2 \n"
2956 : "jg 1b \n"
2957 : "vzeroupper \n"
2958 : : "+r"(src_argb), // %0
2959 : "+r"(dst_a), // %1
2960 : "+rm"(width) // %2
2961 : : "m"(kPermdARGBToY_AVX), // %3
2962 : "m"(kShuffleAlphaShort_AVX2) // %4
2963 : : "memory", "cc"
2964 : , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2965 0 : );
2966 0 : }
2967 : #endif // HAS_ARGBEXTRACTALPHAROW_AVX2
2968 :
2969 : #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
2970 : // width in pixels
// Write 8 Y (luma) bytes from src into the alpha channel of 8 dst ARGB
// pixels, preserving dst RGB.  xmm0 = 0xFF000000 mask, xmm1 = 0x00FFFFFF.
2971 0 : void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
2972 : asm volatile (
2973 : "pcmpeqb %%xmm0,%%xmm0 \n"
2974 : "pslld $0x18,%%xmm0 \n"
2975 : "pcmpeqb %%xmm1,%%xmm1 \n"
2976 : "psrld $0x8,%%xmm1 \n"
2977 :
2978 : LABELALIGN
2979 : "1: \n"
2980 : "movq " MEMACCESS(0) ",%%xmm2 \n"
2981 : "lea " MEMLEA(0x8,0) ",%0 \n"
// Duplicate each Y byte into a word, then spread to dwords so the Y value
// lands in byte 3 (alpha position) of each pixel.  punpckhwd interleaves
// with xmm3's stale contents, but those garbage words occupy the low bytes
// of each dword and are discarded by the 0xFF000000 mask below.
2982 : "punpcklbw %%xmm2,%%xmm2 \n"
2983 : "punpckhwd %%xmm2,%%xmm3 \n"
2984 : "punpcklwd %%xmm2,%%xmm2 \n"
2985 : "movdqu " MEMACCESS(1) ",%%xmm4 \n"
2986 : "movdqu " MEMACCESS2(0x10,1) ",%%xmm5 \n"
2987 : "pand %%xmm0,%%xmm2 \n"
2988 : "pand %%xmm0,%%xmm3 \n"
2989 : "pand %%xmm1,%%xmm4 \n"
2990 : "pand %%xmm1,%%xmm5 \n"
2991 : "por %%xmm4,%%xmm2 \n"
2992 : "por %%xmm5,%%xmm3 \n"
2993 : "movdqu %%xmm2," MEMACCESS(1) " \n"
2994 : "movdqu %%xmm3," MEMACCESS2(0x10,1) " \n"
2995 : "lea " MEMLEA(0x20,1) ",%1 \n"
2996 : "sub $0x8,%2 \n"
2997 : "jg 1b \n"
2998 : : "+r"(src), // %0
2999 : "+r"(dst), // %1
3000 : "+r"(width) // %2
3001 : :
3002 : : "memory", "cc"
3003 : , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3004 0 : );
3005 0 : }
3006 : #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
3007 :
3008 : #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
3009 : // width in pixels
// AVX2 version: 16 Y bytes -> alpha of 16 dst ARGB pixels per iteration.
// vpmovzxbd widens 8 Y bytes to dwords, vpslld 24 moves them to the alpha
// byte, and vpblendvb (ymm0 = 0x00FFFFFF) keeps dst's RGB bytes.
3010 0 : void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
3011 : asm volatile (
3012 : "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
3013 : "vpsrld $0x8,%%ymm0,%%ymm0 \n"
3014 :
3015 : LABELALIGN
3016 : "1: \n"
3017 : "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n"
3018 : "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n"
3019 : "lea " MEMLEA(0x10,0) ",%0 \n"
3020 : "vpslld $0x18,%%ymm1,%%ymm1 \n"
3021 : "vpslld $0x18,%%ymm2,%%ymm2 \n"
3022 : "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
3023 : "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
3024 : "vmovdqu %%ymm1," MEMACCESS(1) " \n"
3025 : "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
3026 : "lea " MEMLEA(0x40,1) ",%1 \n"
3027 : "sub $0x10,%2 \n"
3028 : "jg 1b \n"
3029 : "vzeroupper \n"
3030 : : "+r"(src), // %0
3031 : "+r"(dst), // %1
3032 : "+r"(width) // %2
3033 : :
3034 : : "memory", "cc"
3035 : , "xmm0", "xmm1", "xmm2"
3036 0 : );
3037 0 : }
3038 : #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
3039 :
3040 : #ifdef HAS_SETROW_X86
// Fill 'width' bytes of dst with byte v8 using 'rep stosl' (4 bytes/store).
// Note: width >> 2 truncates — any trailing width % 4 bytes are NOT written;
// callers use a width that is a multiple of 4 (or the _Any dispatch).
3041 0 : void SetRow_X86(uint8* dst, uint8 v8, int width) {
3042 0 :   size_t width_tmp = (size_t)(width >> 2);
3043 0 :   const uint32 v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.
3044 :   asm volatile("rep stosl " MEMSTORESTRING(eax, 0) "       \n"
3045 :                : "+D"(dst),       // %0
3046 :                  "+c"(width_tmp)  // %1
3047 :                : "a"(v32)         // %2
3048 0 :                : "memory", "cc");
3049 0 : }
3050 :
// Fill 'width' bytes of dst with v8 via 'rep stosb' (any width; fast on
// ERMS-capable CPUs).  dst is tied to %edi, the count to %ecx, v8 to %al.
3051 0 : void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
3052 0 :   size_t width_tmp = (size_t)(width);
3053 :   asm volatile("rep stosb " MEMSTORESTRING(al, 0) "        \n"
3054 :                : "+D"(dst),       // %0
3055 :                  "+c"(width_tmp)  // %1
3056 :                : "a"(v8)          // %2
3057 0 :                : "memory", "cc");
3058 0 : }
3059 :
// Fill 'width' ARGB pixels (width is a pixel count, one 32-bit store each)
// of dst_argb with the packed pixel value v32 via 'rep stosl'.
3060 5 : void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
3061 5 :   size_t width_tmp = (size_t)(width);
3062 :   asm volatile("rep stosl " MEMSTORESTRING(eax, 0) "       \n"
3063 :                : "+D"(dst_argb),  // %0
3064 :                  "+c"(width_tmp)  // %1
3065 :                : "a"(v32)         // %2
3066 5 :                : "memory", "cc");
3067 5 : }
3068 : #endif // HAS_SETROW_X86
3069 :
3070 : #ifdef HAS_YUY2TOYROW_SSE2
// Extract the Y plane from packed YUY2 (Y0 U Y1 V ...): Y occupies the even
// bytes, so mask each word with 0x00FF (xmm5) and pack.  16 pixels/iter.
3071 0 : void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) {
3072 : asm volatile (
3073 : "pcmpeqb %%xmm5,%%xmm5 \n"
3074 : "psrlw $0x8,%%xmm5 \n"
3075 :
3076 : LABELALIGN
3077 : "1: \n"
3078 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3079 : "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3080 : "lea " MEMLEA(0x20,0) ",%0 \n"
3081 : "pand %%xmm5,%%xmm0 \n"
3082 : "pand %%xmm5,%%xmm1 \n"
3083 : "packuswb %%xmm1,%%xmm0 \n"
3084 : "movdqu %%xmm0," MEMACCESS(1) " \n"
3085 : "lea " MEMLEA(0x10,1) ",%1 \n"
3086 : "sub $0x10,%2 \n"
3087 : "jg 1b \n"
3088 : : "+r"(src_yuy2), // %0
3089 : "+r"(dst_y), // %1
3090 : "+r"(width) // %2
3091 : :
3092 : : "memory", "cc"
3093 : , "xmm0", "xmm1", "xmm5"
3094 0 : );
3095 0 : }
3096 :
// Extract U and V planes from two YUY2 rows, vertically averaged (pavgb with
// the row at src_yuy2 + stride_yuy2).  dst_v is addressed relative to dst_u
// ("sub %1,%2" turns %2 into the v-u offset so one register walks both).
// 16 source pixels -> 8 U + 8 V bytes per iteration.
3097 0 : void YUY2ToUVRow_SSE2(const uint8* src_yuy2,
3098 :                       int stride_yuy2,
3099 :                       uint8* dst_u,
3100 :                       uint8* dst_v,
3101 :                       int width) {
3102 : asm volatile (
3103 : "pcmpeqb %%xmm5,%%xmm5 \n"
3104 : "psrlw $0x8,%%xmm5 \n"
3105 : "sub %1,%2 \n"
3106 :
3107 : LABELALIGN
3108 : "1: \n"
3109 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3110 : "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3111 : MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
3112 : MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
3113 : "lea " MEMLEA(0x20,0) ",%0 \n"
3114 : "pavgb %%xmm2,%%xmm0 \n"
3115 : "pavgb %%xmm3,%%xmm1 \n"
// Drop Y (even bytes), leaving U/V pairs; then split even bytes -> U,
// odd bytes -> V.
3116 : "psrlw $0x8,%%xmm0 \n"
3117 : "psrlw $0x8,%%xmm1 \n"
3118 : "packuswb %%xmm1,%%xmm0 \n"
3119 : "movdqa %%xmm0,%%xmm1 \n"
3120 : "pand %%xmm5,%%xmm0 \n"
3121 : "packuswb %%xmm0,%%xmm0 \n"
3122 : "psrlw $0x8,%%xmm1 \n"
3123 : "packuswb %%xmm1,%%xmm1 \n"
3124 : "movq %%xmm0," MEMACCESS(1) " \n"
3125 : MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
3126 : "lea " MEMLEA(0x8,1) ",%1 \n"
3127 : "sub $0x10,%3 \n"
3128 : "jg 1b \n"
3129 : : "+r"(src_yuy2), // %0
3130 : "+r"(dst_u), // %1
3131 : "+r"(dst_v), // %2
3132 : "+r"(width) // %3
3133 0 : : "r"((intptr_t)(stride_yuy2)) // %4
3134 : : "memory", "cc", NACL_R14
3135 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3136 0 : );
3137 0 : }
3138 :
// Extract U and V from a single YUY2 row (no vertical averaging — 4:2:2
// output).  Same register trick as YUY2ToUVRow_SSE2: %2 holds dst_v - dst_u.
3139 0 : void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
3140 :                          uint8* dst_u,
3141 :                          uint8* dst_v,
3142 :                          int width) {
3143 : asm volatile (
3144 : "pcmpeqb %%xmm5,%%xmm5 \n"
3145 : "psrlw $0x8,%%xmm5 \n"
3146 : "sub %1,%2 \n"
3147 :
3148 : LABELALIGN
3149 : "1: \n"
3150 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3151 : "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3152 : "lea " MEMLEA(0x20,0) ",%0 \n"
3153 : "psrlw $0x8,%%xmm0 \n"
3154 : "psrlw $0x8,%%xmm1 \n"
3155 : "packuswb %%xmm1,%%xmm0 \n"
3156 : "movdqa %%xmm0,%%xmm1 \n"
3157 : "pand %%xmm5,%%xmm0 \n"
3158 : "packuswb %%xmm0,%%xmm0 \n"
3159 : "psrlw $0x8,%%xmm1 \n"
3160 : "packuswb %%xmm1,%%xmm1 \n"
3161 : "movq %%xmm0," MEMACCESS(1) " \n"
3162 : MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
3163 : "lea " MEMLEA(0x8,1) ",%1 \n"
3164 : "sub $0x10,%3 \n"
3165 : "jg 1b \n"
3166 : : "+r"(src_yuy2), // %0
3167 : "+r"(dst_u), // %1
3168 : "+r"(dst_v), // %2
3169 : "+r"(width) // %3
3170 : :
3171 : : "memory", "cc", NACL_R14
3172 : "xmm0", "xmm1", "xmm5"
3173 0 : );
3174 0 : }
3175 :
// Extract the Y plane from packed UYVY (U Y0 V Y1 ...): Y occupies the odd
// bytes, so shift each word right by 8 and pack.  16 pixels/iter.
3176 0 : void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) {
3177 : asm volatile (
3178 : LABELALIGN
3179 : "1: \n"
3180 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3181 : "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3182 : "lea " MEMLEA(0x20,0) ",%0 \n"
3183 : "psrlw $0x8,%%xmm0 \n"
3184 : "psrlw $0x8,%%xmm1 \n"
3185 : "packuswb %%xmm1,%%xmm0 \n"
3186 : "movdqu %%xmm0," MEMACCESS(1) " \n"
3187 : "lea " MEMLEA(0x10,1) ",%1 \n"
3188 : "sub $0x10,%2 \n"
3189 : "jg 1b \n"
3190 : : "+r"(src_uyvy), // %0
3191 : "+r"(dst_y), // %1
3192 : "+r"(width) // %2
3193 : :
3194 : : "memory", "cc"
3195 : , "xmm0", "xmm1"
3196 0 : );
3197 0 : }
3198 :
// Extract U and V from two UYVY rows, vertically averaged.  In UYVY the
// chroma occupies the even bytes, so 'pand xmm5' (0x00FF words) isolates
// U/V instead of the psrlw used in the YUY2 variant.  %2 = dst_v - dst_u.
3199 0 : void UYVYToUVRow_SSE2(const uint8* src_uyvy,
3200 :                       int stride_uyvy,
3201 :                       uint8* dst_u,
3202 :                       uint8* dst_v,
3203 :                       int width) {
3204 : asm volatile (
3205 : "pcmpeqb %%xmm5,%%xmm5 \n"
3206 : "psrlw $0x8,%%xmm5 \n"
3207 : "sub %1,%2 \n"
3208 :
3209 : LABELALIGN
3210 : "1: \n"
3211 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3212 : "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3213 : MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
3214 : MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
3215 : "lea " MEMLEA(0x20,0) ",%0 \n"
3216 : "pavgb %%xmm2,%%xmm0 \n"
3217 : "pavgb %%xmm3,%%xmm1 \n"
3218 : "pand %%xmm5,%%xmm0 \n"
3219 : "pand %%xmm5,%%xmm1 \n"
3220 : "packuswb %%xmm1,%%xmm0 \n"
// xmm0 now holds interleaved U/V bytes; split even -> U, odd -> V.
3221 : "movdqa %%xmm0,%%xmm1 \n"
3222 : "pand %%xmm5,%%xmm0 \n"
3223 : "packuswb %%xmm0,%%xmm0 \n"
3224 : "psrlw $0x8,%%xmm1 \n"
3225 : "packuswb %%xmm1,%%xmm1 \n"
3226 : "movq %%xmm0," MEMACCESS(1) " \n"
3227 : MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
3228 : "lea " MEMLEA(0x8,1) ",%1 \n"
3229 : "sub $0x10,%3 \n"
3230 : "jg 1b \n"
3231 : : "+r"(src_uyvy), // %0
3232 : "+r"(dst_u), // %1
3233 : "+r"(dst_v), // %2
3234 : "+r"(width) // %3
3235 0 : : "r"((intptr_t)(stride_uyvy)) // %4
3236 : : "memory", "cc", NACL_R14
3237 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3238 0 : );
3239 0 : }
3240 :
// Extract U and V from a single UYVY row (no vertical averaging — 4:2:2
// output).  Chroma is in the even bytes; %2 holds dst_v - dst_u.
3241 0 : void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
3242 :                          uint8* dst_u,
3243 :                          uint8* dst_v,
3244 :                          int width) {
3245 : asm volatile (
3246 : "pcmpeqb %%xmm5,%%xmm5 \n"
3247 : "psrlw $0x8,%%xmm5 \n"
3248 : "sub %1,%2 \n"
3249 :
3250 : LABELALIGN
3251 : "1: \n"
3252 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3253 : "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
3254 : "lea " MEMLEA(0x20,0) ",%0 \n"
3255 : "pand %%xmm5,%%xmm0 \n"
3256 : "pand %%xmm5,%%xmm1 \n"
3257 : "packuswb %%xmm1,%%xmm0 \n"
3258 : "movdqa %%xmm0,%%xmm1 \n"
3259 : "pand %%xmm5,%%xmm0 \n"
3260 : "packuswb %%xmm0,%%xmm0 \n"
3261 : "psrlw $0x8,%%xmm1 \n"
3262 : "packuswb %%xmm1,%%xmm1 \n"
3263 : "movq %%xmm0," MEMACCESS(1) " \n"
3264 : MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
3265 : "lea " MEMLEA(0x8,1) ",%1 \n"
3266 : "sub $0x10,%3 \n"
3267 : "jg 1b \n"
3268 : : "+r"(src_uyvy), // %0
3269 : "+r"(dst_u), // %1
3270 : "+r"(dst_v), // %2
3271 : "+r"(width) // %3
3272 : :
3273 : : "memory", "cc", NACL_R14
3274 : "xmm0", "xmm1", "xmm5"
3275 0 : );
3276 0 : }
3277 : #endif // HAS_YUY2TOYROW_SSE2
3278 :
3279 : #ifdef HAS_YUY2TOYROW_AVX2
// AVX2 Y extraction from YUY2: 32 pixels/iter.  vpermq $0xd8 repairs the
// 128-bit lane interleave introduced by vpackuswb before storing.
3280 0 : void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
3281 : asm volatile (
3282 : "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3283 : "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3284 :
3285 : LABELALIGN
3286 : "1: \n"
3287 : "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3288 : "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3289 : "lea " MEMLEA(0x40,0) ",%0 \n"
3290 : "vpand %%ymm5,%%ymm0,%%ymm0 \n"
3291 : "vpand %%ymm5,%%ymm1,%%ymm1 \n"
3292 : "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3293 : "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3294 : "vmovdqu %%ymm0," MEMACCESS(1) " \n"
3295 : "lea " MEMLEA(0x20,1) ",%1 \n"
3296 : "sub $0x20,%2 \n"
3297 : "jg 1b \n"
3298 : "vzeroupper \n"
3299 : : "+r"(src_yuy2), // %0
3300 : "+r"(dst_y), // %1
3301 : "+r"(width) // %2
3302 : :
3303 : : "memory", "cc"
3304 : , "xmm0", "xmm1", "xmm5"
3305 0 : );
3306 0 : }
3307 :
// AVX2 U/V extraction from two YUY2 rows with vertical averaging (vpavgb
// against the row at +stride).  32 source pixels -> 16 U + 16 V bytes per
// iteration; %2 holds dst_v - dst_u so both planes share one pointer walk.
3308 0 : void YUY2ToUVRow_AVX2(const uint8* src_yuy2,
3309 :                       int stride_yuy2,
3310 :                       uint8* dst_u,
3311 :                       uint8* dst_v,
3312 :                       int width) {
3313 : asm volatile (
3314 : "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3315 : "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3316 : "sub %1,%2 \n"
3317 :
3318 : LABELALIGN
3319 : "1: \n"
3320 : "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3321 : "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3322 : VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
3323 : VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
3324 : "lea " MEMLEA(0x40,0) ",%0 \n"
// Shift out Y, pack to bytes, then split even bytes -> U and odd -> V;
// the vpermq shuffles undo vpackuswb's per-lane interleave.
3325 : "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3326 : "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
3327 : "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3328 : "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3329 : "vpand %%ymm5,%%ymm0,%%ymm1 \n"
3330 : "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3331 : "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
3332 : "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
3333 : "vpermq $0xd8,%%ymm1,%%ymm1 \n"
3334 : "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3335 : "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3336 : VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3337 : "lea " MEMLEA(0x10,1) ",%1 \n"
3338 : "sub $0x20,%3 \n"
3339 : "jg 1b \n"
3340 : "vzeroupper \n"
3341 : : "+r"(src_yuy2), // %0
3342 : "+r"(dst_u), // %1
3343 : "+r"(dst_v), // %2
3344 : "+r"(width) // %3
3345 0 : : "r"((intptr_t)(stride_yuy2)) // %4
3346 : : "memory", "cc", NACL_R14
3347 : "xmm0", "xmm1", "xmm5"
3348 0 : );
3349 0 : }
3350 :
// AVX2 U/V extraction from a single YUY2 row (4:2:2 — no vertical average).
// Identical pipeline to YUY2ToUVRow_AVX2 minus the vpavgb pair.
3351 0 : void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
3352 :                          uint8* dst_u,
3353 :                          uint8* dst_v,
3354 :                          int width) {
3355 : asm volatile (
3356 : "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3357 : "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3358 : "sub %1,%2 \n"
3359 :
3360 : LABELALIGN
3361 : "1: \n"
3362 : "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3363 : "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3364 : "lea " MEMLEA(0x40,0) ",%0 \n"
3365 : "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3366 : "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
3367 : "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3368 : "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3369 : "vpand %%ymm5,%%ymm0,%%ymm1 \n"
3370 : "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3371 : "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
3372 : "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
3373 : "vpermq $0xd8,%%ymm1,%%ymm1 \n"
3374 : "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3375 : "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3376 : VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3377 : "lea " MEMLEA(0x10,1) ",%1 \n"
3378 : "sub $0x20,%3 \n"
3379 : "jg 1b \n"
3380 : "vzeroupper \n"
3381 : : "+r"(src_yuy2), // %0
3382 : "+r"(dst_u), // %1
3383 : "+r"(dst_v), // %2
3384 : "+r"(width) // %3
3385 : :
3386 : : "memory", "cc", NACL_R14
3387 : "xmm0", "xmm1", "xmm5"
3388 0 : );
3389 0 : }
3390 :
// AVX2 Y extraction from UYVY: Y is in the odd bytes, so vpsrlw $8 isolates
// it.  32 pixels/iter.  (xmm5 appears in the clobber list but is not used
// by this asm body — harmless over-declaration.)
3391 0 : void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) {
3392 : asm volatile (
3393 : LABELALIGN
3394 : "1: \n"
3395 : "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3396 : "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3397 : "lea " MEMLEA(0x40,0) ",%0 \n"
3398 : "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3399 : "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
3400 : "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3401 : "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3402 : "vmovdqu %%ymm0," MEMACCESS(1) " \n"
3403 : "lea " MEMLEA(0x20,1) ",%1 \n"
3404 : "sub $0x20,%2 \n"
3405 : "jg 1b \n"
3406 : "vzeroupper \n"
3407 : : "+r"(src_uyvy), // %0
3408 : "+r"(dst_y), // %1
3409 : "+r"(width) // %2
3410 : :
3411 : : "memory", "cc"
3412 : , "xmm0", "xmm1", "xmm5"
3413 0 : );
3414 0 : }
// AVX2 U/V extraction from two UYVY rows with vertical averaging.  Chroma
// occupies the even bytes, hence vpand (0x00FF) instead of the shift used
// for YUY2.  %2 = dst_v - dst_u; 32 pixels -> 16 U + 16 V per iteration.
3415 0 : void UYVYToUVRow_AVX2(const uint8* src_uyvy,
3416 :                       int stride_uyvy,
3417 :                       uint8* dst_u,
3418 :                       uint8* dst_v,
3419 :                       int width) {
3420 : asm volatile (
3421 : "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3422 : "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3423 : "sub %1,%2 \n"
3424 :
3425 : LABELALIGN
3426 : "1: \n"
3427 : "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3428 : "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3429 : VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
3430 : VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
3431 : "lea " MEMLEA(0x40,0) ",%0 \n"
3432 : "vpand %%ymm5,%%ymm0,%%ymm0 \n"
3433 : "vpand %%ymm5,%%ymm1,%%ymm1 \n"
3434 : "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3435 : "vpermq $0xd8,%%ymm0,%%ymm0 \n"
// ymm0 holds interleaved U/V; split even -> U (ymm1), odd -> V (ymm0).
3436 : "vpand %%ymm5,%%ymm0,%%ymm1 \n"
3437 : "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3438 : "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
3439 : "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
3440 : "vpermq $0xd8,%%ymm1,%%ymm1 \n"
3441 : "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3442 : "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3443 : VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3444 : "lea " MEMLEA(0x10,1) ",%1 \n"
3445 : "sub $0x20,%3 \n"
3446 : "jg 1b \n"
3447 : "vzeroupper \n"
3448 : : "+r"(src_uyvy), // %0
3449 : "+r"(dst_u), // %1
3450 : "+r"(dst_v), // %2
3451 : "+r"(width) // %3
3452 0 : : "r"((intptr_t)(stride_uyvy)) // %4
3453 : : "memory", "cc", NACL_R14
3454 : "xmm0", "xmm1", "xmm5"
3455 0 : );
3456 0 : }
3457 :
// AVX2 U/V extraction from a single UYVY row (4:2:2 — no vertical average).
// Identical pipeline to UYVYToUVRow_AVX2 minus the vpavgb pair.
3458 0 : void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
3459 :                          uint8* dst_u,
3460 :                          uint8* dst_v,
3461 :                          int width) {
3462 : asm volatile (
3463 : "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3464 : "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
3465 : "sub %1,%2 \n"
3466 :
3467 : LABELALIGN
3468 : "1: \n"
3469 : "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
3470 : "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
3471 : "lea " MEMLEA(0x40,0) ",%0 \n"
3472 : "vpand %%ymm5,%%ymm0,%%ymm0 \n"
3473 : "vpand %%ymm5,%%ymm1,%%ymm1 \n"
3474 : "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3475 : "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3476 : "vpand %%ymm5,%%ymm0,%%ymm1 \n"
3477 : "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3478 : "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
3479 : "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
3480 : "vpermq $0xd8,%%ymm1,%%ymm1 \n"
3481 : "vpermq $0xd8,%%ymm0,%%ymm0 \n"
3482 : "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3483 : VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3484 : "lea " MEMLEA(0x10,1) ",%1 \n"
3485 : "sub $0x20,%3 \n"
3486 : "jg 1b \n"
3487 : "vzeroupper \n"
3488 : : "+r"(src_uyvy), // %0
3489 : "+r"(dst_u), // %1
3490 : "+r"(dst_v), // %2
3491 : "+r"(width) // %3
3492 : :
3493 : : "memory", "cc", NACL_R14
3494 : "xmm0", "xmm1", "xmm5"
3495 0 : );
3496 0 : }
3497 : #endif // HAS_YUY2TOYROW_AVX2
3498 :
3499 : #ifdef HAS_ARGBBLENDROW_SSSE3
3500 : // Shuffle table for isolating alpha.
// pshufb mask: duplicates each pixel's alpha (byte 3/7/11/15) into a word
// pair for pmullw; 0x80 entries write zero.
3501 : static uvec8 kShuffleAlpha = {3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
3502 :                               11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
3503 :
3504 : // Blend 8 pixels at a time
// Alpha-blend src_argb0 over src_argb1 into dst_argb.
// Per channel: dst = src0 + (src1 * (256 - a)) >> 8, where the multiplier is
// built as (a ^ 255) + 1; dst alpha is forced to 255 (por xmm4).
// Constants: xmm7 = 0x0001 per word, xmm6 = 0x00FF per word,
// xmm5 = 0xFF00 per word, xmm4 = 0xFF000000 per dword.
// Main loop does 4 pixels; a scalar loop handles the 1-3 pixel remainder.
3505 0 : void ARGBBlendRow_SSSE3(const uint8* src_argb0,
3506 :                         const uint8* src_argb1,
3507 :                         uint8* dst_argb,
3508 :                         int width) {
3509 : asm volatile (
3510 : "pcmpeqb %%xmm7,%%xmm7 \n"
3511 : "psrlw $0xf,%%xmm7 \n"
3512 : "pcmpeqb %%xmm6,%%xmm6 \n"
3513 : "psrlw $0x8,%%xmm6 \n"
3514 : "pcmpeqb %%xmm5,%%xmm5 \n"
3515 : "psllw $0x8,%%xmm5 \n"
3516 : "pcmpeqb %%xmm4,%%xmm4 \n"
3517 : "pslld $0x18,%%xmm4 \n"
3518 : "sub $0x4,%3 \n"
3519 : "jl 49f \n"
3520 :
3521 : // 4 pixel loop.
3522 : LABELALIGN
3523 : "40: \n"
3524 : "movdqu " MEMACCESS(0) ",%%xmm3 \n"
3525 : "lea " MEMLEA(0x10,0) ",%0 \n"
3526 : "movdqa %%xmm3,%%xmm0 \n"
// xmm3 = 255-a in alpha position; pshufb spreads it to word pairs and
// paddw +1 makes the 256-a multiplier.
3527 : "pxor %%xmm4,%%xmm3 \n"
3528 : "movdqu " MEMACCESS(1) ",%%xmm2 \n"
3529 : "pshufb %4,%%xmm3 \n"
3530 : "pand %%xmm6,%%xmm2 \n"
3531 : "paddw %%xmm7,%%xmm3 \n"
3532 : "pmullw %%xmm3,%%xmm2 \n"
3533 : "movdqu " MEMACCESS(1) ",%%xmm1 \n"
3534 : "lea " MEMLEA(0x10,1) ",%1 \n"
3535 : "psrlw $0x8,%%xmm1 \n"
3536 : "por %%xmm4,%%xmm0 \n"
3537 : "pmullw %%xmm3,%%xmm1 \n"
3538 : "psrlw $0x8,%%xmm2 \n"
3539 : "paddusb %%xmm2,%%xmm0 \n"
3540 : "pand %%xmm5,%%xmm1 \n"
3541 : "paddusb %%xmm1,%%xmm0 \n"
3542 : "movdqu %%xmm0," MEMACCESS(2) " \n"
3543 : "lea " MEMLEA(0x10,2) ",%2 \n"
3544 : "sub $0x4,%3 \n"
3545 : "jge 40b \n"
3546 :
3547 : "49: \n"
3548 : "add $0x3,%3 \n"
3549 : "jl 99f \n"
3550 :
3551 : // 1 pixel loop.
3552 : "91: \n"
3553 : "movd " MEMACCESS(0) ",%%xmm3 \n"
3554 : "lea " MEMLEA(0x4,0) ",%0 \n"
3555 : "movdqa %%xmm3,%%xmm0 \n"
3556 : "pxor %%xmm4,%%xmm3 \n"
3557 : "movd " MEMACCESS(1) ",%%xmm2 \n"
3558 : "pshufb %4,%%xmm3 \n"
3559 : "pand %%xmm6,%%xmm2 \n"
3560 : "paddw %%xmm7,%%xmm3 \n"
3561 : "pmullw %%xmm3,%%xmm2 \n"
3562 : "movd " MEMACCESS(1) ",%%xmm1 \n"
3563 : "lea " MEMLEA(0x4,1) ",%1 \n"
3564 : "psrlw $0x8,%%xmm1 \n"
3565 : "por %%xmm4,%%xmm0 \n"
3566 : "pmullw %%xmm3,%%xmm1 \n"
3567 : "psrlw $0x8,%%xmm2 \n"
3568 : "paddusb %%xmm2,%%xmm0 \n"
3569 : "pand %%xmm5,%%xmm1 \n"
3570 : "paddusb %%xmm1,%%xmm0 \n"
3571 : "movd %%xmm0," MEMACCESS(2) " \n"
3572 : "lea " MEMLEA(0x4,2) ",%2 \n"
3573 : "sub $0x1,%3 \n"
3574 : "jge 91b \n"
3575 : "99: \n"
3576 : : "+r"(src_argb0), // %0
3577 : "+r"(src_argb1), // %1
3578 : "+r"(dst_argb), // %2
3579 : "+r"(width) // %3
3580 : : "m"(kShuffleAlpha) // %4
3581 : : "memory", "cc"
3582 : , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3583 0 : );
3584 0 : }
3585 : #endif // HAS_ARGBBLENDROW_SSSE3
3586 :
3587 : #ifdef HAS_BLENDPLANEROW_SSSE3
3588 : // Blend 8 pixels at a time.
3589 : // unsigned version of math
3590 : // =((A2*C2)+(B2*(255-C2))+255)/256
3591 : // signed version of math
3592 : // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
// Blend two planar rows by per-pixel alpha:
//   dst = (src0*a + src1*(255-a) + 255) / 256
// computed in signed form via pmaddubsw (see math comment above the #ifdef).
// xmm5 = 0xFF00 words (alpha complement via pxor), xmm6 = 0x80 bias bytes,
// xmm7 = 0x807f rounding words.  All three pointers are rebased relative to
// 'alpha' (%2) so one register advances everything.  8 pixels/iteration.
3593 0 : void BlendPlaneRow_SSSE3(const uint8* src0,
3594 :                          const uint8* src1,
3595 :                          const uint8* alpha,
3596 :                          uint8* dst,
3597 :                          int width) {
3598 :   asm volatile(
3599 :       "pcmpeqb    %%xmm5,%%xmm5                  \n"
3600 :       "psllw      $0x8,%%xmm5                    \n"
3601 :       "mov        $0x80808080,%%eax              \n"
3602 :       "movd       %%eax,%%xmm6                   \n"
3603 :       "pshufd     $0x0,%%xmm6,%%xmm6             \n"
3604 :       "mov        $0x807f807f,%%eax              \n"
3605 :       "movd       %%eax,%%xmm7                   \n"
3606 :       "pshufd     $0x0,%%xmm7,%%xmm7             \n"
3607 :       "sub        %2,%0                          \n"
3608 :       "sub        %2,%1                          \n"
3609 :       "sub        %2,%3                          \n"
3610 :
3611 :       // 8 pixel loop.
3612 :       LABELALIGN
3613 :       "1:                                        \n"
3614 :       "movq       (%2),%%xmm0                    \n"
3615 :       "punpcklbw  %%xmm0,%%xmm0                  \n"
3616 :       "pxor       %%xmm5,%%xmm0                  \n"
3617 :       "movq       (%0,%2,1),%%xmm1               \n"
3618 :       "movq       (%1,%2,1),%%xmm2               \n"
3619 :       "punpcklbw  %%xmm2,%%xmm1                  \n"
3620 :       "psubb      %%xmm6,%%xmm1                  \n"
3621 :       "pmaddubsw  %%xmm1,%%xmm0                  \n"
3622 :       "paddw      %%xmm7,%%xmm0                  \n"
3623 :       "psrlw      $0x8,%%xmm0                    \n"
3624 :       "packuswb   %%xmm0,%%xmm0                  \n"
3625 :       "movq       %%xmm0,(%3,%2,1)               \n"
3626 :       "lea        0x8(%2),%2                     \n"
3627 :       "sub        $0x8,%4                        \n"
3628 :       "jg         1b                             \n"
3629 :       : "+r"(src0),   // %0
3630 :         "+r"(src1),   // %1
3631 :         "+r"(alpha),  // %2
3632 :         "+r"(dst),    // %3
3633 :         "+rm"(width)  // %4
3634 :       ::"memory",
3635 0 :         "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
3636 0 : }
3637 : #endif // HAS_BLENDPLANEROW_SSSE3
3638 :
3639 : #ifdef HAS_BLENDPLANEROW_AVX2
3640 : // Blend 32 pixels at a time.
3641 : // unsigned version of math
3642 : // =((A2*C2)+(B2*(255-C2))+255)/256
3643 : // signed version of math
3644 : // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
// AVX2 planar blend: dst = (src0*a + src1*(255-a) + 255) / 256, signed
// pmaddubsw formulation (see math comment above the #ifdef).  Same
// constants and pointer-rebasing scheme as the SSSE3 version, but 32
// pixels per iteration with high/low unpack halves processed in parallel.
3645 0 : void BlendPlaneRow_AVX2(const uint8* src0,
3646 :                         const uint8* src1,
3647 :                         const uint8* alpha,
3648 :                         uint8* dst,
3649 :                         int width) {
3650 :   asm volatile(
3651 :       "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
3652 :       "vpsllw     $0x8,%%ymm5,%%ymm5             \n"
3653 :       "mov        $0x80808080,%%eax              \n"
3654 :       "vmovd      %%eax,%%xmm6                   \n"
3655 :       "vbroadcastss %%xmm6,%%ymm6                \n"
3656 :       "mov        $0x807f807f,%%eax              \n"
3657 :       "vmovd      %%eax,%%xmm7                   \n"
3658 :       "vbroadcastss %%xmm7,%%ymm7                \n"
3659 :       "sub        %2,%0                          \n"
3660 :       "sub        %2,%1                          \n"
3661 :       "sub        %2,%3                          \n"
3662 :
3663 :       // 32 pixel loop.
3664 :       LABELALIGN
3665 :       "1:                                        \n"
3666 :       "vmovdqu    (%2),%%ymm0                    \n"
3667 :       "vpunpckhbw %%ymm0,%%ymm0,%%ymm3           \n"
3668 :       "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
3669 :       "vpxor      %%ymm5,%%ymm3,%%ymm3           \n"
3670 :       "vpxor      %%ymm5,%%ymm0,%%ymm0           \n"
3671 :       "vmovdqu    (%0,%2,1),%%ymm1               \n"
3672 :       "vmovdqu    (%1,%2,1),%%ymm2               \n"
3673 :       "vpunpckhbw %%ymm2,%%ymm1,%%ymm4           \n"
3674 :       "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
3675 :       "vpsubb     %%ymm6,%%ymm4,%%ymm4           \n"
3676 :       "vpsubb     %%ymm6,%%ymm1,%%ymm1           \n"
3677 :       "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
3678 :       "vpmaddubsw %%ymm1,%%ymm0,%%ymm0           \n"
3679 :       "vpaddw     %%ymm7,%%ymm3,%%ymm3           \n"
3680 :       "vpaddw     %%ymm7,%%ymm0,%%ymm0           \n"
3681 :       "vpsrlw     $0x8,%%ymm3,%%ymm3             \n"
3682 :       "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
3683 :       "vpackuswb  %%ymm3,%%ymm0,%%ymm0           \n"
3684 :       "vmovdqu    %%ymm0,(%3,%2,1)               \n"
3685 :       "lea        0x20(%2),%2                    \n"
3686 :       "sub        $0x20,%4                       \n"
3687 :       "jg         1b                             \n"
3688 :       "vzeroupper                                \n"
3689 :       : "+r"(src0),   // %0
3690 :         "+r"(src1),   // %1
3691 :         "+r"(alpha),  // %2
3692 :         "+r"(dst),    // %3
3693 :         "+rm"(width)  // %4
3694 :       ::"memory",
3695 :         "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
3696 0 :         "xmm7");
3697 0 : }
3698 : #endif // HAS_BLENDPLANEROW_AVX2
3699 :
3700 : #ifdef HAS_ARGBATTENUATEROW_SSSE3
3701 : // Shuffle table duplicating alpha
// pshufb masks for attenuation: replicate the low (kShuffleAlpha0) / high
// (kShuffleAlpha1) pixels' alpha across their B/G/R words; 128u zeroes the
// alpha word itself so the alpha channel is not attenuated.
3702 : static uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
3703 :                                7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
3704 : static uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
3705 :                                15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
3706 : // Attenuate 4 pixels at a time.
// Premultiply RGB by alpha: dst.rgb = (src.rgb * a) >> 8 via pmulhuw on
// bytes duplicated into words (punpck*bw x,x), dst.a = src.a (merged back
// through the xmm3 = 0xFF000000 mask).  4 pixels per iteration.
3707 0 : void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3708 : asm volatile (
3709 : "pcmpeqb %%xmm3,%%xmm3 \n"
3710 : "pslld $0x18,%%xmm3 \n"
3711 : "movdqa %3,%%xmm4 \n"
3712 : "movdqa %4,%%xmm5 \n"
3713 :
3714 : // 4 pixel loop.
3715 : LABELALIGN
3716 : "1: \n"
3717 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3718 : "pshufb %%xmm4,%%xmm0 \n"
3719 : "movdqu " MEMACCESS(0) ",%%xmm1 \n"
3720 : "punpcklbw %%xmm1,%%xmm1 \n"
3721 : "pmulhuw %%xmm1,%%xmm0 \n"
3722 : "movdqu " MEMACCESS(0) ",%%xmm1 \n"
3723 : "pshufb %%xmm5,%%xmm1 \n"
3724 : "movdqu " MEMACCESS(0) ",%%xmm2 \n"
3725 : "punpckhbw %%xmm2,%%xmm2 \n"
3726 : "pmulhuw %%xmm2,%%xmm1 \n"
3727 : "movdqu " MEMACCESS(0) ",%%xmm2 \n"
3728 : "lea " MEMLEA(0x10,0) ",%0 \n"
3729 : "pand %%xmm3,%%xmm2 \n"
3730 : "psrlw $0x8,%%xmm0 \n"
3731 : "psrlw $0x8,%%xmm1 \n"
3732 : "packuswb %%xmm1,%%xmm0 \n"
3733 : "por %%xmm2,%%xmm0 \n"
3734 : "movdqu %%xmm0," MEMACCESS(1) " \n"
3735 : "lea " MEMLEA(0x10,1) ",%1 \n"
3736 : "sub $0x4,%2 \n"
3737 : "jg 1b \n"
3738 : : "+r"(src_argb), // %0
3739 : "+r"(dst_argb), // %1
3740 : "+r"(width) // %2
3741 : : "m"(kShuffleAlpha0), // %3
3742 : "m"(kShuffleAlpha1) // %4
3743 : : "memory", "cc"
3744 : , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3745 0 : );
3746 0 : }
3747 : #endif // HAS_ARGBATTENUATEROW_SSSE3
3748 :
3749 : #ifdef HAS_ARGBATTENUATEROW_AVX2
3750 : // Shuffle table duplicating alpha.
// pshufb mask applied to word-expanded (punpck*bw) pixels: replicates each
// pixel's alpha word across its B/G/R words; 128u zeroes the alpha word so
// alpha itself is not attenuated.
3751 : static const uvec8 kShuffleAlpha_AVX2 = {6u,   7u,   6u,   7u,  6u,  7u,
3752 :                                          128u, 128u, 14u,  15u, 14u, 15u,
3753 :                                          14u,  15u,  128u, 128u};
3754 : // Attenuate 8 pixels at a time.
// AVX2 premultiply: 8 pixels per iteration; same math as the SSSE3 version
// (vpmulhuw on byte-duplicated words, alpha channel restored via the
// ymm5 = 0xFF000000 mask).  dst is addressed as src + (dst - src) so only
// %0 advances ("sub %0,%1" up front).
3755 0 : void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
3756 : asm volatile (
3757 : "vbroadcastf128 %3,%%ymm4 \n"
3758 : "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
3759 : "vpslld $0x18,%%ymm5,%%ymm5 \n"
3760 : "sub %0,%1 \n"
3761 :
3762 : // 8 pixel loop.
3763 : LABELALIGN
3764 : "1: \n"
3765 : "vmovdqu " MEMACCESS(0) ",%%ymm6 \n"
3766 : "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
3767 : "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
3768 : "vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
3769 : "vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
3770 : "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
3771 : "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
3772 : "vpand %%ymm5,%%ymm6,%%ymm6 \n"
3773 : "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3774 : "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
3775 : "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
3776 : "vpor %%ymm6,%%ymm0,%%ymm0 \n"
3777 : MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1) // vmovdqu %%ymm0,(%0,%1)
3778 : "lea " MEMLEA(0x20,0) ",%0 \n"
3779 : "sub $0x8,%2 \n"
3780 : "jg 1b \n"
3781 : "vzeroupper \n"
3782 : : "+r"(src_argb), // %0
3783 : "+r"(dst_argb), // %1
3784 : "+r"(width) // %2
3785 : : "m"(kShuffleAlpha_AVX2) // %3
3786 : : "memory", "cc"
3787 : , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3788 0 : );
3789 0 : }
3790 : #endif // HAS_ARGBATTENUATEROW_AVX2
3791 :
3792 : #ifdef HAS_ARGBUNATTENUATEROW_SSE2
3793 : // Unattenuate 4 pixels at a time.
// Undo premultiplication: dst.rgb = src.rgb * fixed_invtbl8[a] >> 8, where
// fixed_invtbl8 (declared elsewhere) holds 8.8 fixed-point reciprocals of
// alpha.  Each pixel's alpha is loaded with movzb, its reciprocal fetched by
// table lookup, spread across B/G/R words with pshuflw $0x40, and applied
// via pmulhuw to byte-duplicated words.  4 pixels per iteration.
// (xmm4/xmm5 appear in the clobber list but are unused by this body.)
3794 0 : void ARGBUnattenuateRow_SSE2(const uint8* src_argb,
3795 :                              uint8* dst_argb,
3796 :                              int width) {
3797 :   uintptr_t alpha;
3798 : asm volatile (
3799 : // 4 pixel loop.
3800 : LABELALIGN
3801 : "1: \n"
3802 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
3803 : "movzb " MEMACCESS2(0x03,0) ",%3 \n"
3804 : "punpcklbw %%xmm0,%%xmm0 \n"
3805 : MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
3806 : "movzb " MEMACCESS2(0x07,0) ",%3 \n"
3807 : MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
3808 : "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3809 : "pshuflw $0x40,%%xmm3,%%xmm3 \n"
3810 : "movlhps %%xmm3,%%xmm2 \n"
3811 : "pmulhuw %%xmm2,%%xmm0 \n"
3812 : "movdqu " MEMACCESS(0) ",%%xmm1 \n"
3813 : "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
3814 : "punpckhbw %%xmm1,%%xmm1 \n"
3815 : MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
3816 : "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
3817 : MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
3818 : "pshuflw $0x40,%%xmm2,%%xmm2 \n"
3819 : "pshuflw $0x40,%%xmm3,%%xmm3 \n"
3820 : "movlhps %%xmm3,%%xmm2 \n"
3821 : "pmulhuw %%xmm2,%%xmm1 \n"
3822 : "lea " MEMLEA(0x10,0) ",%0 \n"
3823 : "packuswb %%xmm1,%%xmm0 \n"
3824 : "movdqu %%xmm0," MEMACCESS(1) " \n"
3825 : "lea " MEMLEA(0x10,1) ",%1 \n"
3826 : "sub $0x4,%2 \n"
3827 : "jg 1b \n"
3828 : : "+r"(src_argb), // %0
3829 : "+r"(dst_argb), // %1
3830 : "+r"(width), // %2
3831 : "=&r"(alpha) // %3
3832 : : "r"(fixed_invtbl8) // %4
3833 : : "memory", "cc", NACL_R14
3834 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3835 0 : );
3836 0 : }
3837 : #endif // HAS_ARGBUNATTENUATEROW_SSE2
3838 :
#ifdef HAS_ARGBUNATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const uvec8 kUnattenShuffleAlpha_AVX2 = {
    0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
// Unattenuate 8 pixels at a time.
// AVX2 version of ARGBUnattenuateRow_SSE2: the 8 reciprocal lookups from
// fixed_invtbl8 are done with scalar movzb + vmovd loads (replacing
// VPGATHER, which is slow on some CPUs -- see the in-loop comment),
// assembled into one ymm, then broadcast per pixel via vpshufb with
// kUnattenShuffleAlpha_AVX2 before the vpmulhuw divide-by-alpha.
// Loop handles exactly 8 pixels per iteration; no remainder path.
void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
                             uint8* dst_argb,
                             int width) {
  uintptr_t alpha;  // Scratch register (%3): alpha byte used as table index.
  asm volatile (
    "sub        %0,%1                          \n"  // %1 = dst - src offset
    "vbroadcastf128 %5,%%ymm5                  \n"  // alpha-dup shuffle mask

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    // replace VPGATHER
    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm0)            //   vmovd 0x0(%4,%3,4),%%xmm0
    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm1)            //   vmovd 0x0(%4,%3,4),%%xmm1
    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
    "vpunpckldq %%xmm1,%%xmm0,%%xmm6           \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm2)            //   vmovd 0x0(%4,%3,4),%%xmm2
    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm3)            //   vmovd 0x0(%4,%3,4),%%xmm3
    "movzb     " MEMACCESS2(0x13,0) ",%3       \n"
    "vpunpckldq %%xmm3,%%xmm2,%%xmm7           \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm0)            //   vmovd 0x0(%4,%3,4),%%xmm0
    "movzb     " MEMACCESS2(0x17,0) ",%3       \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm1)            //   vmovd 0x0(%4,%3,4),%%xmm1
    "movzb     " MEMACCESS2(0x1b,0) ",%3       \n"
    "vpunpckldq %%xmm1,%%xmm0,%%xmm0           \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm2)            //   vmovd 0x0(%4,%3,4),%%xmm2
    "movzb     " MEMACCESS2(0x1f,0) ",%3       \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm3)            //   vmovd 0x0(%4,%3,4),%%xmm3
    "vpunpckldq %%xmm3,%%xmm2,%%xmm2           \n"
    "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3          \n"
    "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0          \n"
    "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3     \n"  // ymm3 = 8 reciprocals
    // end of VPGATHER

    "vmovdqu   " MEMACCESS(0) ",%%ymm6         \n"  // 8 ARGB pixels
    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"  // pixels to 8.8
    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
    "vpunpcklwd %%ymm3,%%ymm3,%%ymm2           \n"  // widen reciprocals
    "vpunpckhwd %%ymm3,%%ymm3,%%ymm3           \n"
    "vpshufb   %%ymm5,%%ymm2,%%ymm2            \n"  // dup per channel
    "vpshufb   %%ymm5,%%ymm3,%%ymm3            \n"
    "vpmulhuw  %%ymm2,%%ymm0,%%ymm0            \n"  // channels * 1/alpha
    "vpmulhuw  %%ymm3,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //   vmovdqu %%ymm0,(%0,%1)
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),     // %0
    "+r"(dst_argb),     // %1
    "+r"(width),        // %2
    "=&r"(alpha)        // %3
  : "r"(fixed_invtbl8),  // %4
    "m"(kUnattenShuffleAlpha_AVX2)  // %5
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBUNATTENUATEROW_AVX2
3907 :
#ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
// Luma is computed with the JPeg full-range coefficients (kARGBToYJ) plus the
// kAddYJ64 rounding constant (declared elsewhere; presumably +64 for rounding
// before the >>7 -- confirm against row.h).  The original alpha byte is
// preserved; R, G and B are all replaced by the computed luma.
// Loop handles exactly 8 pixels per iteration; no remainder path.
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"  // kARGBToYJ coefficients
    "movdqa    %4,%%xmm5                       \n"  // kAddYJ64 rounding

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // weighted B+G, R
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"  // 8 luma words
    "paddw     %%xmm5,%%xmm0                   \n"  // + rounding
    "psrlw     $0x7,%%xmm0                     \n"  // fixed point -> 8 bit
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"  // reload for alpha
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrld     $0x18,%%xmm2                    \n"  // isolate alpha bytes
    "psrld     $0x18,%%xmm3                    \n"
    "packuswb  %%xmm3,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"  // interleave Y,Y
    "punpcklbw %%xmm2,%%xmm3                   \n"  // interleave Y,A
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm3,%%xmm0                   \n"  // YYYA pixels, low
    "punpckhwd %%xmm3,%%xmm1                   \n"  // YYYA pixels, high
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  : "m"(kARGBToYJ),   // %3
    "m"(kAddYJ64)     // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBGRAYROW_SSSE3
3954 :
#ifdef HAS_ARGBSEPIAROW_SSSE3
// b = (r * 35 + g * 68 + b * 17) >> 7
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone.
// Note: coefficients below are stored in B,G,R,A memory order, so each row
// reads {b,g,r,0} to match pmaddubsw on little-endian ARGB.
static vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
                             17, 68, 35, 0, 17, 68, 35, 0};

static vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
                             22, 88, 45, 0, 22, 88, 45, 0};

static vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
                             24, 98, 50, 0, 24, 98, 50, 0};

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// In-place: reads and writes dst_argb.  Each of the three output channels is
// a separate pmaddubsw/phaddw pass over the same 8 source pixels; alpha is
// extracted unchanged.  Loop handles exactly 8 pixels; no remainder path.
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %2,%%xmm2                       \n"  // sepia B coefficients
    "movdqa    %3,%%xmm3                       \n"  // sepia G coefficients
    "movdqa    %4,%%xmm4                       \n"  // sepia R coefficients

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"  // new B
    "pmaddubsw %%xmm2,%%xmm6                   \n"
    "phaddw    %%xmm6,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm3,%%xmm5                   \n"  // new G
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"  // interleave B,G
    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm4,%%xmm5                   \n"  // new R
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"  // reload for alpha
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "psrld     $0x18,%%xmm6                    \n"  // isolate alpha
    "psrld     $0x18,%%xmm1                    \n"
    "packuswb  %%xmm1,%%xmm6                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm5                   \n"  // interleave R,A
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm5,%%xmm0                   \n"  // BGRA pixels, low
    "punpckhwd %%xmm5,%%xmm1                   \n"  // BGRA pixels, high
    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "sub       $0x8,%1                         \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),      // %0
    "+r"(width)          // %1
  : "m"(kARGBToSepiaB),  // %2
    "m"(kARGBToSepiaG),  // %3
    "m"(kARGBToSepiaR)   // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBSEPIAROW_SSSE3
4026 :
#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except the 4x4 signed coefficient matrix is provided by the
// caller (matrix_argb, 16 int8 values, one row per output channel).  Results
// use phaddsw (signed saturating add) and >>6, unlike Sepia's >>7.
// Loop handles exactly 8 pixels per iteration; no remainder path.
void ARGBColorMatrixRow_SSSE3(const uint8* src_argb,
                              uint8* dst_argb,
                              const int8* matrix_argb,
                              int width) {
  asm volatile (
    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"  // load 4x4 matrix
    "pshufd    $0x00,%%xmm5,%%xmm2             \n"  // row 0 -> B coeffs
    "pshufd    $0x55,%%xmm5,%%xmm3             \n"  // row 1 -> G coeffs
    "pshufd    $0xaa,%%xmm5,%%xmm4             \n"  // row 2 -> R coeffs
    "pshufd    $0xff,%%xmm5,%%xmm5             \n"  // row 3 -> A coeffs

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"  // new B
    "pmaddubsw %%xmm2,%%xmm7                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"  // new G
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddsw   %%xmm7,%%xmm0                   \n"
    "phaddsw   %%xmm1,%%xmm6                   \n"
    "psraw     $0x6,%%xmm0                     \n"  // signed >>6
    "psraw     $0x6,%%xmm6                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm0                   \n"  // interleave B,G
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"  // new R
    "pmaddubsw %%xmm4,%%xmm7                   \n"
    "phaddsw   %%xmm7,%%xmm1                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"  // new A
    "pmaddubsw %%xmm5,%%xmm7                   \n"
    "phaddsw   %%xmm7,%%xmm6                   \n"
    "psraw     $0x6,%%xmm1                     \n"
    "psraw     $0x6,%%xmm6                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm1                   \n"  // interleave R,A
    "movdqa    %%xmm0,%%xmm6                   \n"
    "punpcklwd %%xmm1,%%xmm0                   \n"  // BGRA pixels, low
    "punpckhwd %%xmm1,%%xmm6                   \n"  // BGRA pixels, high
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width)        // %2
  : "r"(matrix_argb)   // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
4092 :
#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
// In-place posterize: each color channel becomes
//   (channel * scale >> 16) * interval_size + interval_offset
// (scale is applied via pmulhuw, i.e. the high 16 bits of the product).
// Alpha is masked out with xmm6 and restored unchanged.
// Loop handles exactly 4 pixels per iteration; no remainder path.
void ARGBQuantizeRow_SSE2(uint8* dst_argb,
                          int scale,
                          int interval_size,
                          int interval_offset,
                          int width) {
  asm volatile (
    "movd      %2,%%xmm2                       \n"
    "movd      %3,%%xmm3                       \n"
    "movd      %4,%%xmm4                       \n"
    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"  // broadcast scale
    "pshufd    $0x44,%%xmm2,%%xmm2             \n"
    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"  // broadcast interval_size
    "pshufd    $0x44,%%xmm3,%%xmm3             \n"
    "pshuflw   $0x40,%%xmm4,%%xmm4             \n"  // broadcast offset
    "pshufd    $0x44,%%xmm4,%%xmm4             \n"
    "pxor      %%xmm5,%%xmm5                   \n"  // zero for unpack
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "pslld     $0x18,%%xmm6                    \n"  // alpha mask 0xff000000

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"  // low 2 pixels to words
    "pmulhuw   %%xmm2,%%xmm0                   \n"  // * scale >> 16
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"  // high 2 pixels to words
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "pmullw    %%xmm3,%%xmm0                   \n"  // * interval_size
    "movdqu    " MEMACCESS(0) ",%%xmm7         \n"  // reload for alpha
    "pmullw    %%xmm3,%%xmm1                   \n"
    "pand      %%xmm6,%%xmm7                   \n"  // keep only alpha
    "paddw     %%xmm4,%%xmm0                   \n"  // + interval_offset
    "paddw     %%xmm4,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm7,%%xmm0                   \n"  // restore alpha
    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "sub       $0x4,%1                         \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),        // %0
    "+r"(width)            // %1
  : "r"(scale),            // %2
    "r"(interval_size),    // %3
    "r"(interval_offset)   // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBQUANTIZEROW_SSE2
4145 :
#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
// 'value' is a packed ARGB multiplier; each source channel is multiplied by
// the corresponding value channel as 8.8 fixed point (punpcklbw self-dup +
// pmulhuw + psrlw 8, i.e. result = src * value / 256, approximately).
// Loop handles exactly 4 pixels per iteration; no remainder path.
void ARGBShadeRow_SSE2(const uint8* src_argb,
                       uint8* dst_argb,
                       int width,
                       uint32 value) {
  asm volatile (
    "movd      %3,%%xmm2                       \n"
    "punpcklbw %%xmm2,%%xmm2                   \n"  // value channels to 8.8
    "punpcklqdq %%xmm2,%%xmm2                  \n"  // dup across 2 pixels

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"  // low 2 pixels to 8.8
    "punpckhbw %%xmm1,%%xmm1                   \n"  // high 2 pixels to 8.8
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(value)       // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2"
  );
}
#endif  // HAS_ARGBSHADEROW_SSE2
4183 :
#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
// Per channel: dst = src0 * src1 / 255 (approximated as 8.8 fixed point:
// src0 is widened by self-duplication, src1 zero-extended, then pmulhuw).
// Loop handles exactly 4 pixels per iteration; no remainder path.
void ARGBMultiplyRow_SSE2(const uint8* src_argb0,
                          const uint8* src_argb1,
                          uint8* dst_argb,
                          int width) {
  asm volatile (
    "pxor      %%xmm5,%%xmm5                   \n"  // zero for unpack

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "movdqu    %%xmm0,%%xmm1                   \n"
    "movdqu    %%xmm2,%%xmm3                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"  // src0 low as 8.8
    "punpckhbw %%xmm1,%%xmm1                   \n"  // src0 high as 8.8
    "punpcklbw %%xmm5,%%xmm2                   \n"  // src1 low zero-extended
    "punpckhbw %%xmm5,%%xmm3                   \n"  // src1 high zero-extended
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "pmulhuw   %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_ARGBMULTIPLYROW_SSE2
4223 :
#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
// AVX2 version of ARGBMultiplyRow_SSE2; same 8.8 fixed-point multiply.
// Loop handles exactly 8 pixels per iteration; no remainder path.
void ARGBMultiplyRow_AVX2(const uint8* src_argb0,
                          const uint8* src_argb1,
                          uint8* dst_argb,
                          int width) {
  asm volatile (
    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"  // zero for unpack

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm1         \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "vmovdqu   " MEMACCESS(1) ",%%ymm3         \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "vpunpcklbw %%ymm1,%%ymm1,%%ymm0           \n"  // src0 as 8.8
    "vpunpckhbw %%ymm1,%%ymm1,%%ymm1           \n"
    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"  // src1 zero-extended
    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
    "vpmulhuw  %%ymm2,%%ymm0,%%ymm0            \n"
    "vpmulhuw  %%ymm3,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
    "vmovdqu   %%ymm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x20,2) ",%2           \n"
    "sub       $0x8,%3                         \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
#if defined(__AVX2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBMULTIPLYROW_AVX2
4264 :
#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// Per-byte saturating add (paddusb); clamps at 255.
// Loop handles exactly 4 pixels per iteration; no remainder path.
void ARGBAddRow_SSE2(const uint8* src_argb0,
                     const uint8* src_argb1,
                     uint8* dst_argb,
                     int width) {
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "paddusb   %%xmm1,%%xmm0                   \n"  // saturating byte add
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    , "xmm0", "xmm1"
  );
}
#endif  // HAS_ARGBADDROW_SSE2
4294 :
#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
// AVX2 version of ARGBAddRow_SSE2; saturating byte add.
// Loop handles exactly 8 pixels per iteration; no remainder path.
void ARGBAddRow_AVX2(const uint8* src_argb0,
                     const uint8* src_argb1,
                     uint8* dst_argb,
                     int width) {
  asm volatile (
    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "vpaddusb  " MEMACCESS(1) ",%%ymm0,%%ymm0  \n"  // saturating byte add
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "vmovdqu   %%ymm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x20,2) ",%2           \n"
    "sub       $0x8,%3                         \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    , "xmm0"
  );
}
#endif  // HAS_ARGBADDROW_AVX2
4324 :
#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
// Per-byte saturating subtract (psubusb); clamps at 0.
// Loop handles exactly 4 pixels per iteration; no remainder path.
void ARGBSubtractRow_SSE2(const uint8* src_argb0,
                          const uint8* src_argb1,
                          uint8* dst_argb,
                          int width) {
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "psubusb   %%xmm1,%%xmm0                   \n"  // saturating byte sub
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    , "xmm0", "xmm1"
  );
}
#endif  // HAS_ARGBSUBTRACTROW_SSE2
4354 :
#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
// AVX2 version of ARGBSubtractRow_SSE2; saturating byte subtract.
// Loop handles exactly 8 pixels per iteration; no remainder path.
void ARGBSubtractRow_AVX2(const uint8* src_argb0,
                          const uint8* src_argb1,
                          uint8* dst_argb,
                          int width) {
  asm volatile (
    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "vpsubusb  " MEMACCESS(1) ",%%ymm0,%%ymm0  \n"  // saturating byte sub
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "vmovdqu   %%ymm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x20,2) ",%2           \n"
    "sub       $0x8,%3                         \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    , "xmm0"
  );
}
#endif  // HAS_ARGBSUBTRACTROW_AVX2
4384 :
#ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
// Computes |sobel_x| for 8 single-byte pixels per iteration from three input
// rows (src_y0/1/2 are the rows above/at/below; the +2 byte offsets supply
// the right column of the 3x3 kernel).  Absolute value is taken via
// negate + pmaxsw, then saturated to bytes.  Row pointers %1..%3 are
// converted to offsets relative to %0 up front so one register advances all
// four streams.  No remainder path; width expected to be a multiple of 8.
void SobelXRow_SSE2(const uint8* src_y0,
                    const uint8* src_y1,
                    const uint8* src_y2,
                    uint8* dst_sobelx,
                    int width) {
  asm volatile (
    "sub       %0,%1                           \n"  // %1 = y1 - y0 offset
    "sub       %0,%2                           \n"  // %2 = y2 - y0 offset
    "sub       %0,%3                           \n"  // %3 = dst - y0 offset
    "pxor      %%xmm5,%%xmm5                   \n"  // zero for unpack

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"  // row0 left column
    "movq      " MEMACCESS2(0x2,0) ",%%xmm1    \n"  // row0 right column
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "psubw     %%xmm1,%%xmm0                   \n"  // row0 diff (weight 1)
    MEMOPREG(movq,0x00,0,1,1,xmm1)             //   movq (%0,%1,1),%%xmm1
    MEMOPREG(movq,0x02,0,1,1,xmm2)             //   movq 0x2(%0,%1,1),%%xmm2
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "psubw     %%xmm2,%%xmm1                   \n"  // row1 diff (weight 2)
    MEMOPREG(movq,0x00,0,2,1,xmm2)             //   movq (%0,%2,1),%%xmm2
    MEMOPREG(movq,0x02,0,2,1,xmm3)             //   movq 0x2(%0,%2,1),%%xmm3
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm3                   \n"
    "psubw     %%xmm3,%%xmm2                   \n"  // row2 diff (weight 1)
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm1,%%xmm0                   \n"  // row1 added twice
    "paddw     %%xmm1,%%xmm0                   \n"
    "pxor      %%xmm1,%%xmm1                   \n"
    "psubw     %%xmm0,%%xmm1                   \n"
    "pmaxsw    %%xmm1,%%xmm0                   \n"  // abs(x)
    "packuswb  %%xmm0,%%xmm0                   \n"
    MEMOPMEM(movq,xmm0,0x00,0,3,1)             //   movq %%xmm0,(%0,%3,1)
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "sub       $0x8,%4                         \n"
    "jg        1b                              \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(src_y2),      // %2
    "+r"(dst_sobelx),  // %3
    "+r"(width)        // %4
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_SOBELXROW_SSE2
4441 :
#ifdef HAS_SOBELYROW_SSE2
// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
// Computes |sobel_y| for 8 single-byte pixels per iteration from two input
// rows (src_y0 above, src_y1 below; byte offsets +1/+2 supply the kernel
// columns).  Same negate + pmaxsw absolute value as SobelXRow_SSE2.
// No remainder path; width expected to be a multiple of 8.
void SobelYRow_SSE2(const uint8* src_y0,
                    const uint8* src_y1,
                    uint8* dst_sobely,
                    int width) {
  asm volatile (
    "sub       %0,%1                           \n"  // %1 = y1 - y0 offset
    "sub       %0,%2                           \n"  // %2 = dst - y0 offset
    "pxor      %%xmm5,%%xmm5                   \n"  // zero for unpack

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"  // col0, top row
    MEMOPREG(movq,0x00,0,1,1,xmm1)             //   movq (%0,%1,1),%%xmm1
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "psubw     %%xmm1,%%xmm0                   \n"  // col0 diff (weight 1)
    "movq      " MEMACCESS2(0x1,0) ",%%xmm1    \n"
    MEMOPREG(movq,0x01,0,1,1,xmm2)             //   movq 0x1(%0,%1,1),%%xmm2
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "psubw     %%xmm2,%%xmm1                   \n"  // col1 diff (weight 2)
    "movq      " MEMACCESS2(0x2,0) ",%%xmm2    \n"
    MEMOPREG(movq,0x02,0,1,1,xmm3)             //   movq 0x2(%0,%1,1),%%xmm3
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm3                   \n"
    "psubw     %%xmm3,%%xmm2                   \n"  // col2 diff (weight 1)
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm1,%%xmm0                   \n"  // col1 added twice
    "paddw     %%xmm1,%%xmm0                   \n"
    "pxor      %%xmm1,%%xmm1                   \n"
    "psubw     %%xmm0,%%xmm1                   \n"
    "pmaxsw    %%xmm1,%%xmm0                   \n"  // abs(y)
    "packuswb  %%xmm0,%%xmm0                   \n"
    MEMOPMEM(movq,xmm0,0x00,0,2,1)             //   movq %%xmm0,(%0,%2,1)
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "sub       $0x8,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(dst_sobely),  // %2
    "+r"(width)        // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_SOBELYROW_SSE2
4495 :
#ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
// sobel = min(255, sobelx + sobely) via paddusb, then the byte is expanded
// to a gray ARGB pixel with alpha forced to 0xff (xmm5 mask).
// Loop handles 16 pixels per iteration (16 input bytes -> 64 output bytes);
// no remainder path.
void SobelRow_SSE2(const uint8* src_sobelx,
                   const uint8* src_sobely,
                   uint8* dst_argb,
                   int width) {
  asm volatile (
    "sub       %0,%1                           \n"  // %1 = sobely - sobelx
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pslld     $0x18,%%xmm5                    \n"  // alpha mask 0xff000000

    // 16 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //   movdqu (%0,%1,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "paddusb   %%xmm1,%%xmm0                   \n"  // sobel = satadd(x, y)
    "movdqa    %%xmm0,%%xmm2                   \n"
    "punpcklbw %%xmm0,%%xmm2                   \n"  // expand bytes to words
    "punpckhbw %%xmm0,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm1                   \n"  // words to dwords (SSSS)
    "punpckhwd %%xmm2,%%xmm2                   \n"
    "por       %%xmm5,%%xmm1                   \n"  // set alpha = 0xff
    "por       %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklwd %%xmm0,%%xmm3                   \n"
    "punpckhwd %%xmm0,%%xmm0                   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "por       %%xmm5,%%xmm0                   \n"
    "movdqu    %%xmm1," MEMACCESS(2) "         \n"
    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "   \n"
    "movdqu    %%xmm3," MEMACCESS2(0x20,2) "   \n"
    "movdqu    %%xmm0," MEMACCESS2(0x30,2) "   \n"
    "lea       " MEMLEA(0x40,2) ",%2           \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_SOBELROW_SSE2
4548 :
#ifdef HAS_SOBELTOPLANEROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into a plane.
// dst_y[i] = min(255, sobelx[i] + sobely[i]) via paddusb.
// Loop handles 16 pixels per iteration; no remainder path.
// Note: xmm5 is initialized as an alpha mask but never used here
// (presumably copied from SobelRow_SSE2).
void SobelToPlaneRow_SSE2(const uint8* src_sobelx,
                          const uint8* src_sobely,
                          uint8* dst_y,
                          int width) {
  asm volatile (
    "sub       %0,%1                           \n"  // %1 = sobely - sobelx
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pslld     $0x18,%%xmm5                    \n"

    // 16 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //   movdqu (%0,%1,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "paddusb   %%xmm1,%%xmm0                   \n"  // saturating byte add
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_y),       // %2
    "+r"(width)        // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1"
  );
}
#endif  // HAS_SOBELTOPLANEROW_SSE2
4581 :
#ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
// G = min(255, sobelx + sobely); alpha is forced to 0xff via the all-ones
// xmm5 interleaved as the high byte of each R,A word.
// Loop handles 16 pixels per iteration (16 input bytes -> 64 output bytes);
// no remainder path.
void SobelXYRow_SSE2(const uint8* src_sobelx,
                     const uint8* src_sobely,
                     uint8* dst_argb,
                     int width) {
  asm volatile (
    "sub       %0,%1                           \n"  // %1 = sobely - sobelx
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // 0xff bytes (alpha)

    // 16 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // sobelx
    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //   movdqu (%0,%1,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "paddusb   %%xmm1,%%xmm2                   \n"  // sobel = satadd(x, y)
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklbw %%xmm5,%%xmm3                   \n"  // R,A words (low)
    "punpckhbw %%xmm5,%%xmm0                   \n"  // R,A words (high)
    "movdqa    %%xmm1,%%xmm4                   \n"
    "punpcklbw %%xmm2,%%xmm4                   \n"  // B,G words (low)
    "punpckhbw %%xmm2,%%xmm1                   \n"  // B,G words (high)
    "movdqa    %%xmm4,%%xmm6                   \n"
    "punpcklwd %%xmm3,%%xmm6                   \n"  // BGRA pixels 0-3
    "punpckhwd %%xmm3,%%xmm4                   \n"  // BGRA pixels 4-7
    "movdqa    %%xmm1,%%xmm7                   \n"
    "punpcklwd %%xmm0,%%xmm7                   \n"  // BGRA pixels 8-11
    "punpckhwd %%xmm0,%%xmm1                   \n"  // BGRA pixels 12-15
    "movdqu    %%xmm6," MEMACCESS(2) "         \n"
    "movdqu    %%xmm4," MEMACCESS2(0x10,2) "   \n"
    "movdqu    %%xmm7," MEMACCESS2(0x20,2) "   \n"
    "movdqu    %%xmm1," MEMACCESS2(0x30,2) "   \n"
    "lea       " MEMLEA(0x40,2) ",%2           \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_SOBELXYROW_SSE2
4633 :
#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value, inclusive of the value.
// Each output int32 quad is: running row sum (per channel, held in xmm0)
// plus the cumulative sum of the row above (previous_cumsum).
// Fast path processes 4 pixels per iteration; it is only taken when cumsum
// is 16-byte aligned ("test $0xf,%1"), otherwise everything goes through the
// 1-pixel tail loop.  width is in pixels (4 bytes each).
void ComputeCumulativeSumRow_SSE2(const uint8* row,
                                  int32* cumsum,
                                  const int32* previous_cumsum,
                                  int width) {
  asm volatile (
    "pxor      %%xmm0,%%xmm0                   \n"  // running sum = 0
    "pxor      %%xmm1,%%xmm1                   \n"  // zero for unpack
    "sub       $0x4,%3                         \n"
    "jl        49f                             \n"
    "test      $0xf,%1                         \n"  // cumsum 16-byte aligned?
    "jne       49f                             \n"

    // 4 pixel loop.
    LABELALIGN
  "40:                                         \n"
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"  // 4 pixels
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm2,%%xmm4                   \n"
    "punpcklbw %%xmm1,%%xmm2                   \n"  // widen bytes -> int32
    "movdqa    %%xmm2,%%xmm3                   \n"
    "punpcklwd %%xmm1,%%xmm2                   \n"  // pixel 0
    "punpckhwd %%xmm1,%%xmm3                   \n"  // pixel 1
    "punpckhbw %%xmm1,%%xmm4                   \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "punpcklwd %%xmm1,%%xmm4                   \n"  // pixel 2
    "punpckhwd %%xmm1,%%xmm5                   \n"  // pixel 3
    "paddd     %%xmm2,%%xmm0                   \n"  // accumulate row sum,
    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"  // add previous_cumsum
    "paddd     %%xmm0,%%xmm2                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "movdqu    " MEMACCESS2(0x10,2) ",%%xmm3   \n"
    "paddd     %%xmm0,%%xmm3                   \n"
    "paddd     %%xmm4,%%xmm0                   \n"
    "movdqu    " MEMACCESS2(0x20,2) ",%%xmm4   \n"
    "paddd     %%xmm0,%%xmm4                   \n"
    "paddd     %%xmm5,%%xmm0                   \n"
    "movdqu    " MEMACCESS2(0x30,2) ",%%xmm5   \n"
    "lea       " MEMLEA(0x40,2) ",%2           \n"
    "paddd     %%xmm0,%%xmm5                   \n"
    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
    "movdqu    %%xmm4," MEMACCESS2(0x20,1) "   \n"
    "movdqu    %%xmm5," MEMACCESS2(0x30,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x4,%3                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    "add       $0x3,%3                         \n"  // undo bias; remainder
    "jl        19f                             \n"

    // 1 pixel loop.
    LABELALIGN
  "10:                                         \n"
    "movd      " MEMACCESS(0) ",%%xmm2         \n"  // 1 pixel
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "punpcklwd %%xmm1,%%xmm2                   \n"
    "paddd     %%xmm2,%%xmm0                   \n"  // accumulate row sum
    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "paddd     %%xmm0,%%xmm2                   \n"  // + previous_cumsum
    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x1,%3                         \n"
    "jge       10b                             \n"

  "19:                                         \n"
  : "+r"(row),              // %0
    "+r"(cumsum),           // %1
    "+r"(previous_cumsum),  // %2
    "+r"(width)             // %3
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
4715 :
#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
// Average a box of pixels from a summed-area table (cumulative sums).
// For each output pixel, the box sum is computed from four corners of the
// table -- topleft[i] - topleft[i+width] - botleft[i] + botleft[i+width] --
// then divided by 'area' using a reciprocal (rcpss) in xmm4.
// Three paths:
//   - small areas (area <= 0x80): integer path using a precomputed 16-bit
//     scale in xmm5 and pmulhuw, 4 pixels per iteration;
//   - larger areas: float path (cvtdq2ps/mulps/cvtps2dq), 4 pixels per
//     iteration;
//   - remainder: 1 pixel per iteration through the float path.
// 'count' is the number of output pixels; 'width' here is the box width
// used as the corner offset into the table (in int32 quads).
void CumulativeSumToAverageRow_SSE2(const int32* topleft,
                                    const int32* botleft,
                                    int width,
                                    int area,
                                    uint8* dst,
                                    int count) {
  asm volatile (
    "movd      %5,%%xmm5                       \n"  // area
    "cvtdq2ps  %%xmm5,%%xmm5                   \n"
    "rcpss     %%xmm5,%%xmm4                   \n"  // ~1.0f / area
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
    "sub       $0x4,%3                         \n"
    "jl        49f                             \n"
    "cmpl      $0x80,%5                        \n"  // small area?
    "ja        40f                             \n"

    // Build 16-bit integer scale = (area + 65535) * (1/area) for pmulhuw.
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrld     $0x10,%%xmm6                    \n"  // 0x0000ffff
    "cvtdq2ps  %%xmm6,%%xmm6                   \n"
    "addps     %%xmm6,%%xmm5                   \n"
    "mulps     %%xmm4,%%xmm5                   \n"
    "cvtps2dq  %%xmm5,%%xmm5                   \n"
    "packssdw  %%xmm5,%%xmm5                   \n"

    // 4 pixel small loop.
    LABELALIGN
  "4:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    MEMOPREG(psubd,0x00,0,4,4,xmm0)            //   psubd 0x00(%0,%4,4),%%xmm0
    MEMOPREG(psubd,0x10,0,4,4,xmm1)            //   psubd 0x10(%0,%4,4),%%xmm1
    MEMOPREG(psubd,0x20,0,4,4,xmm2)            //   psubd 0x20(%0,%4,4),%%xmm2
    MEMOPREG(psubd,0x30,0,4,4,xmm3)            //   psubd 0x30(%0,%4,4),%%xmm3
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
    MEMOPREG(paddd,0x00,1,4,4,xmm0)            //   paddd 0x00(%1,%4,4),%%xmm0
    MEMOPREG(paddd,0x10,1,4,4,xmm1)            //   paddd 0x10(%1,%4,4),%%xmm1
    MEMOPREG(paddd,0x20,1,4,4,xmm2)            //   paddd 0x20(%1,%4,4),%%xmm2
    MEMOPREG(paddd,0x30,1,4,4,xmm3)            //   paddd 0x30(%1,%4,4),%%xmm3
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "packssdw  %%xmm1,%%xmm0                   \n"
    "packssdw  %%xmm3,%%xmm2                   \n"
    "pmulhuw   %%xmm5,%%xmm0                   \n"  // box sum * scale >> 16
    "pmulhuw   %%xmm5,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jge       4b                              \n"
    "jmp       49f                             \n"

    // 4 pixel loop (float path for large areas).
    LABELALIGN
  "40:                                         \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    MEMOPREG(psubd,0x00,0,4,4,xmm0)            //   psubd 0x00(%0,%4,4),%%xmm0
    MEMOPREG(psubd,0x10,0,4,4,xmm1)            //   psubd 0x10(%0,%4,4),%%xmm1
    MEMOPREG(psubd,0x20,0,4,4,xmm2)            //   psubd 0x20(%0,%4,4),%%xmm2
    MEMOPREG(psubd,0x30,0,4,4,xmm3)            //   psubd 0x30(%0,%4,4),%%xmm3
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
    MEMOPREG(paddd,0x00,1,4,4,xmm0)            //   paddd 0x00(%1,%4,4),%%xmm0
    MEMOPREG(paddd,0x10,1,4,4,xmm1)            //   paddd 0x10(%1,%4,4),%%xmm1
    MEMOPREG(paddd,0x20,1,4,4,xmm2)            //   paddd 0x20(%1,%4,4),%%xmm2
    MEMOPREG(paddd,0x30,1,4,4,xmm3)            //   paddd 0x30(%1,%4,4),%%xmm3
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"  // box sum * (1/area)
    "cvtdq2ps  %%xmm1,%%xmm1                   \n"
    "mulps     %%xmm4,%%xmm0                   \n"
    "mulps     %%xmm4,%%xmm1                   \n"
    "cvtdq2ps  %%xmm2,%%xmm2                   \n"
    "cvtdq2ps  %%xmm3,%%xmm3                   \n"
    "mulps     %%xmm4,%%xmm2                   \n"
    "mulps     %%xmm4,%%xmm3                   \n"
    "cvtps2dq  %%xmm0,%%xmm0                   \n"
    "cvtps2dq  %%xmm1,%%xmm1                   \n"
    "cvtps2dq  %%xmm2,%%xmm2                   \n"
    "cvtps2dq  %%xmm3,%%xmm3                   \n"
    "packssdw  %%xmm1,%%xmm0                   \n"
    "packssdw  %%xmm3,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    "add       $0x3,%3                         \n"  // undo bias; remainder
    "jl        19f                             \n"

    // 1 pixel loop.
    LABELALIGN
  "10:                                         \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(psubd,0x00,0,4,4,xmm0)            //   psubd 0x00(%0,%4,4),%%xmm0
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(paddd,0x00,1,4,4,xmm0)            //   paddd 0x00(%1,%4,4),%%xmm0
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
    "mulps     %%xmm4,%%xmm0                   \n"
    "cvtps2dq  %%xmm0,%%xmm0                   \n"
    "packssdw  %%xmm0,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x4,2) ",%2            \n"
    "sub       $0x1,%3                         \n"
    "jge       10b                             \n"
  "19:                                         \n"
  : "+r"(topleft),  // %0
    "+r"(botleft),  // %1
    "+r"(dst),      // %2
    "+rm"(count)    // %3
  : "r"((intptr_t)(width)),  // %4
    "rm"(area)     // %5
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
4849 :
4850 : #ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from source image with slope to a row of destination.
// src_dudv points at 4 floats {x, y, du, dv}: the starting source
// coordinate and the per-destination-pixel increment. For each output
// pixel the integer byte offset into the source is formed as
// x * 4 + y * stride (via pmaddwd against a packed (4, stride) constant
// built in xmm5), and one 32-bit ARGB pixel is fetched from that offset.
// Main loop handles 4 pixels per iteration, then a 1-pixel tail loop.
// NOTE(review): no clipping is performed — caller must guarantee every
// sampled (x, y) lies inside the source image.
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb,
                        int src_argb_stride,
                        uint8* dst_argb,
                        const float* src_dudv,
                        int width) {
  intptr_t src_argb_stride_temp = src_argb_stride;
  intptr_t temp;
  asm volatile (
    // xmm2 = {x, y}; xmm7 = {du, dv}.
    "movq " MEMACCESS(3) ",%%xmm2 \n"
    "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n"
    // Build packed shorts (4, stride) in every lane of xmm5 for pmaddwd.
    "shl $0x10,%1 \n"
    "add $0x4,%1 \n"
    "movd %1,%%xmm5 \n"
    "sub $0x4,%4 \n"
    "jl 49f \n"

    // Set up xmm2/xmm3 as coordinates for pixels {0,1} and {2,3};
    // xmm4 becomes the step for 4 pixels (4 * {du, dv}).
    "pshufd $0x44,%%xmm7,%%xmm7 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "movdqa %%xmm2,%%xmm0 \n"
    "addps %%xmm7,%%xmm0 \n"
    "movlhps %%xmm0,%%xmm2 \n"
    "movdqa %%xmm7,%%xmm4 \n"
    "addps %%xmm4,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "addps %%xmm4,%%xmm3 \n"
    "addps %%xmm4,%%xmm4 \n"

    // 4 pixel loop
    LABELALIGN
  "40: \n"
    "cvttps2dq %%xmm2,%%xmm0 \n"  // x, y float to int first 2
    "cvttps2dq %%xmm3,%%xmm1 \n"  // x, y float to int next 2
    "packssdw %%xmm1,%%xmm0 \n"  // x, y as 8 shorts
    "pmaddwd %%xmm5,%%xmm0 \n"  // off = x * 4 + y * stride
    "movd %%xmm0,%k1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    "movd %%xmm0,%k5 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    MEMOPREG(movd,0x00,0,1,1,xmm1)  //  movd      (%0,%1,1),%%xmm1
    MEMOPREG(movd,0x00,0,5,1,xmm6)  //  movd      (%0,%5,1),%%xmm6
    "punpckldq %%xmm6,%%xmm1 \n"
    "addps %%xmm4,%%xmm2 \n"
    "movq %%xmm1," MEMACCESS(2) " \n"
    "movd %%xmm0,%k1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"
    "movd %%xmm0,%k5 \n"
    MEMOPREG(movd,0x00,0,1,1,xmm0)  //  movd      (%0,%1,1),%%xmm0
    MEMOPREG(movd,0x00,0,5,1,xmm6)  //  movd      (%0,%5,1),%%xmm6
    "punpckldq %%xmm6,%%xmm0 \n"
    "addps %%xmm4,%%xmm3 \n"
    "movq %%xmm0," MEMACCESS2(0x08,2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%4 \n"
    "jge 40b \n"

  "49: \n"
    "add $0x3,%4 \n"
    "jl 19f \n"

    // 1 pixel loop
    LABELALIGN
  "10: \n"
    "cvttps2dq %%xmm2,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "pmaddwd %%xmm5,%%xmm0 \n"
    "addps %%xmm7,%%xmm2 \n"
    "movd %%xmm0,%k1 \n"
    MEMOPREG(movd,0x00,0,1,1,xmm0)  //  movd      (%0,%1,1),%%xmm0
    "movd %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x04,2) ",%2 \n"
    "sub $0x1,%4 \n"
    "jge 10b \n"
  "19: \n"
  : "+r"(src_argb),  // %0
    "+r"(src_argb_stride_temp),  // %1
    "+r"(dst_argb),  // %2
    "+r"(src_dudv),  // %3
    "+rm"(width),  // %4
    "=&r"(temp)  // %5
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
4937 : #endif // HAS_ARGBAFFINEROW_SSE2
4938 :
4939 : #ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
// Blends two rows: dst = src_ptr * (256 - f) + src_ptr[stride] * f, >> 8,
// where f = source_y_fraction in [0, 256). Fast paths: f == 0 copies the
// first row unchanged, f == 128 uses pavgb (50/50 average). The general
// path biases bytes by 0x80 so pmaddubsw can be used on unsigned pixels,
// then un-biases after the multiply-add.
// Processes 16 pixels (bytes) per iteration; dst_width must be a
// multiple of 16.
void InterpolateRow_SSSE3(uint8* dst_ptr,
                          const uint8* src_ptr,
                          ptrdiff_t src_stride,
                          int dst_width,
                          int source_y_fraction) {
  asm volatile (
    "sub %1,%0 \n"
    "cmp $0x0,%3 \n"
    "je 100f \n"
    "cmp $0x80,%3 \n"
    "je 50f \n"

    // xmm5 = interleaved weights {256-f, f} replicated; xmm4 = 0x80 bias.
    "movd %3,%%xmm0 \n"
    "neg %3 \n"
    "add $0x100,%3 \n"
    "movd %3,%%xmm5 \n"
    "punpcklbw %%xmm0,%%xmm5 \n"
    "punpcklwd %%xmm5,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x80808080,%%eax \n"
    "movd %%eax,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"

    // General purpose row blend.
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm2)
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    "psubb %%xmm4,%%xmm0 \n"
    "psubb %%xmm4,%%xmm1 \n"
    "movdqa %%xmm5,%%xmm2 \n"
    "movdqa %%xmm5,%%xmm3 \n"
    "pmaddubsw %%xmm0,%%xmm2 \n"
    "pmaddubsw %%xmm1,%%xmm3 \n"
    "paddw %%xmm4,%%xmm2 \n"
    "paddw %%xmm4,%%xmm3 \n"
    "psrlw $0x8,%%xmm2 \n"
    "psrlw $0x8,%%xmm3 \n"
    "packuswb %%xmm3,%%xmm2 \n"
    MEMOPMEM(movdqu,xmm2,0x00,1,0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
    "jmp 99f \n"

    // Blend 50 / 50.
    LABELALIGN
  "50: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
    "pavgb %%xmm1,%%xmm0 \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 50b \n"
    "jmp 99f \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
  "100: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 100b \n"

  "99: \n"
  : "+r"(dst_ptr),  // %0
    "+r"(src_ptr),  // %1
    "+rm"(dst_width),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
5020 : #endif // HAS_INTERPOLATEROW_SSSE3
5021 :
5022 : #ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1
// AVX2 version of InterpolateRow: dst = row0 * (256 - f) + row1 * f >> 8.
// Fast paths: f == 0 copies via "rep movsb" (hence the D/S/c register
// constraints), f == 128 uses vpavgb. General path uses the 0x80 bias
// trick so vpmaddubsw can operate on unsigned bytes.
// Processes 32 pixels (bytes) per iteration.
void InterpolateRow_AVX2(uint8* dst_ptr,
                         const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         int dst_width,
                         int source_y_fraction) {
  asm volatile (
    "cmp $0x0,%3 \n"
    "je 100f \n"
    "sub %1,%0 \n"
    "cmp $0x80,%3 \n"
    "je 50f \n"

    // ymm5 = interleaved weights {256-f, f} replicated; ymm4 = 0x80 bias.
    "vmovd %3,%%xmm0 \n"
    "neg %3 \n"
    "add $0x100,%3 \n"
    "vmovd %3,%%xmm5 \n"
    "vpunpcklbw %%xmm0,%%xmm5,%%xmm5 \n"
    "vpunpcklwd %%xmm5,%%xmm5,%%xmm5 \n"
    "vbroadcastss %%xmm5,%%ymm5 \n"
    "mov $0x80808080,%%eax \n"
    "vmovd %%eax,%%xmm4 \n"
    "vbroadcastss %%xmm4,%%ymm4 \n"

    // General purpose row blend.
    LABELALIGN
  "1: \n"
    "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
    MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
    "vpunpckhbw %%ymm2,%%ymm0,%%ymm1 \n"
    "vpunpcklbw %%ymm2,%%ymm0,%%ymm0 \n"
    "vpsubb %%ymm4,%%ymm1,%%ymm1 \n"
    "vpsubb %%ymm4,%%ymm0,%%ymm0 \n"
    "vpmaddubsw %%ymm1,%%ymm5,%%ymm1 \n"
    "vpmaddubsw %%ymm0,%%ymm5,%%ymm0 \n"
    "vpaddw %%ymm4,%%ymm1,%%ymm1 \n"
    "vpaddw %%ymm4,%%ymm0,%%ymm0 \n"
    "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
    "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 1b \n"
    "jmp 99f \n"

    // Blend 50 / 50.
    LABELALIGN
  "50: \n"
    "vmovdqu " MEMACCESS(1) ",%%ymm0 \n"
    VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0)  // vpavgb (%1,%4,1),%%ymm0,%%ymm0
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x20,%2 \n"
    "jg 50b \n"
    "jmp 99f \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
  "100: \n"
    "rep movsb " MEMMOVESTRING(1,0) " \n"
    "jmp 999f \n"

  "99: \n"
    "vzeroupper \n"
  "999: \n"
  : "+D"(dst_ptr),    // %0
    "+S"(src_ptr),    // %1
    "+cm"(dst_width),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
  );
}
5098 : #endif // HAS_INTERPOLATEROW_AVX2
5099 :
5100 : #ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorders the 4 bytes of each ARGB pixel according to the 16-byte
// pshufb control mask pointed to by 'shuffler'.
// Processes 8 pixels (32 bytes) per iteration.
void ARGBShuffleRow_SSSE3(const uint8* src_argb,
                          uint8* dst_argb,
                          const uint8* shuffler,
                          int width) {
  asm volatile (
    "movdqu " MEMACCESS(3) ",%%xmm5 \n"
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pshufb %%xmm5,%%xmm0 \n"
    "pshufb %%xmm5,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)        // %2
  : "r"(shuffler)    // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
5128 : #endif // HAS_ARGBSHUFFLEROW_SSSE3
5129 :
5130 : #ifdef HAS_ARGBSHUFFLEROW_AVX2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// AVX2 version: the 16-byte shuffler mask is broadcast to both lanes so
// vpshufb applies the same per-pixel byte reorder to each 128-bit half.
// Processes 16 pixels (64 bytes) per iteration.
void ARGBShuffleRow_AVX2(const uint8* src_argb,
                         uint8* dst_argb,
                         const uint8* shuffler,
                         int width) {
  asm volatile (
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n"
    LABELALIGN
  "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
    "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
    "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)        // %2
  : "r"(shuffler)    // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
5159 : #endif // HAS_ARGBSHUFFLEROW_AVX2
5160 :
5161 : #ifdef HAS_ARGBSHUFFLEROW_SSE2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// SSE2 fallback (no pshufb). The first 4 bytes of the shuffler are read
// as a little-endian dword and compared against the four common channel
// orders; each match dispatches to a vectorized loop that expands bytes
// to words and reorders them with pshufhw/pshuflw. Any other shuffler
// falls through to a generic 1-pixel-at-a-time byte-gather loop.
void ARGBShuffleRow_SSE2(const uint8* src_argb,
                         uint8* dst_argb,
                         const uint8* shuffler,
                         int width) {
  uintptr_t pixel_temp;
  asm volatile (
    "pxor %%xmm5,%%xmm5 \n"
    "mov " MEMACCESS(4) ",%k2 \n"
    "cmp $0x3000102,%k2 \n"
    "je 3012f \n"
    "cmp $0x10203,%k2 \n"
    "je 123f \n"
    "cmp $0x30201,%k2 \n"
    "je 321f \n"
    "cmp $0x2010003,%k2 \n"
    "je 2103f \n"

    // Generic path: gather each of the 4 bytes through the shuffler table.
    LABELALIGN
  "1: \n"
    "movzb " MEMACCESS(4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n"  //  movzb     (%0,%2,1),%2
    "mov %b2," MEMACCESS(1) " \n"
    "movzb " MEMACCESS2(0x1,4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n"  //  movzb     (%0,%2,1),%2
    "mov %b2," MEMACCESS2(0x1,1) " \n"
    "movzb " MEMACCESS2(0x2,4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n"  //  movzb     (%0,%2,1),%2
    "mov %b2," MEMACCESS2(0x2,1) " \n"
    "movzb " MEMACCESS2(0x3,4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n"  //  movzb     (%0,%2,1),%2
    "mov %b2," MEMACCESS2(0x3,1) " \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    "lea " MEMLEA(0x4,1) ",%1 \n"
    "sub $0x1,%3 \n"
    "jg 1b \n"
    "jmp 99f \n"

    // Shuffler {3,2,1,0}: reverse bytes within each pixel ($0x1b).
    LABELALIGN
  "123: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
    "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
    "pshufhw $0x1b,%%xmm1,%%xmm1 \n"
    "pshuflw $0x1b,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jg 123b \n"
    "jmp 99f \n"

    // Shuffler {1,2,3,0}: rotate bytes within each pixel ($0x39).
    LABELALIGN
  "321: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0x39,%%xmm0,%%xmm0 \n"
    "pshuflw $0x39,%%xmm0,%%xmm0 \n"
    "pshufhw $0x39,%%xmm1,%%xmm1 \n"
    "pshuflw $0x39,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jg 321b \n"
    "jmp 99f \n"

    // Shuffler {3,0,1,2}: rotate the other way ($0x93).
    LABELALIGN
  "2103: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0x93,%%xmm0,%%xmm0 \n"
    "pshuflw $0x93,%%xmm0,%%xmm0 \n"
    "pshufhw $0x93,%%xmm1,%%xmm1 \n"
    "pshuflw $0x93,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jg 2103b \n"
    "jmp 99f \n"

    // Shuffler {2,1,0,3}: swap bytes 0 and 2 of each pixel ($0xc6).
    LABELALIGN
  "3012: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0xc6,%%xmm0,%%xmm0 \n"
    "pshuflw $0xc6,%%xmm0,%%xmm0 \n"
    "pshufhw $0xc6,%%xmm1,%%xmm1 \n"
    "pshuflw $0xc6,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%3 \n"
    "jg 3012b \n"

  "99: \n"
  : "+r"(src_argb),     // %0
    "+r"(dst_argb),     // %1
    "=&d"(pixel_temp),  // %2
    "+r"(width)         // %3
  : "r"(shuffler)       // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
5281 : #endif // HAS_ARGBSHUFFLEROW_SSE2
5282 :
#ifdef HAS_I422TOYUY2ROW_SSE2
// Interleave planar I422 (Y plane + half-width U and V planes) into
// packed YUY2 (Y0 U0 Y1 V0 ...). U and V are interleaved into a UV
// vector first, then interleaved with Y.
// Processes 16 Y pixels (32 output bytes) per iteration.
void I422ToYUY2Row_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame,
                        int width) {
 asm volatile (
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movq " MEMACCESS(1) ",%%xmm2 \n"
    MEMOPREG(movq,0x00,1,2,1,xmm3)  //  movq    (%1,%2,1),%%xmm3
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(3) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n"
    "lea " MEMLEA(0x20,3) ",%3 \n"
    "sub $0x10,%4 \n"
    "jg 1b \n"
    : "+r"(src_y),  // %0
      "+r"(src_u),  // %1
      "+r"(src_v),  // %2
      "+r"(dst_frame),  // %3
      "+rm"(width)  // %4
    :
    : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
#endif  // HAS_I422TOYUY2ROW_SSE2
5317 : #endif // HAS_I422TOYUY2ROW_SSE2
5318 :
#ifdef HAS_I422TOUYVYROW_SSE2
// Interleave planar I422 into packed UYVY (U0 Y0 V0 Y1 ...). Same
// structure as I422ToYUY2Row_SSE2 but the UV vector is interleaved
// *before* Y in the output bytes.
// Processes 16 Y pixels (32 output bytes) per iteration.
void I422ToUYVYRow_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame,
                        int width) {
 asm volatile (
    "sub %1,%2 \n"
    LABELALIGN
  "1: \n"
    "movq " MEMACCESS(1) ",%%xmm2 \n"
    MEMOPREG(movq,0x00,1,2,1,xmm3)  //  movq    (%1,%2,1),%%xmm3
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    "movdqu %%xmm1," MEMACCESS(3) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n"
    "lea " MEMLEA(0x20,3) ",%3 \n"
    "sub $0x10,%4 \n"
    "jg 1b \n"
    : "+r"(src_y),  // %0
      "+r"(src_u),  // %1
      "+r"(src_v),  // %2
      "+r"(dst_frame),  // %3
      "+rm"(width)  // %4
    :
    : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
#endif  // HAS_I422TOUYVYROW_SSE2
5353 : #endif // HAS_I422TOUYVYROW_SSE2
5354 :
#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
// Apply a cubic polynomial to each ARGB channel value X:
//   out = C0 + C1*X + C2*X^2 + C3*X^3
// where 'poly' holds 4 sets of 4 float coefficients (C0..C3, one float
// per channel). Result is converted back to bytes with unsigned
// saturation. Processes 2 pixels per iteration; width must be even.
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile (
    "pxor %%xmm3,%%xmm3 \n"

    // 2 pixel loop.
    LABELALIGN
  "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "punpcklbw %%xmm3,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm4 \n"
    "punpcklwd %%xmm3,%%xmm0 \n"
    "punpckhwd %%xmm3,%%xmm4 \n"
    "cvtdq2ps %%xmm0,%%xmm0 \n"
    "cvtdq2ps %%xmm4,%%xmm4 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n"
    "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n"
    "addps " MEMACCESS(3) ",%%xmm0 \n"
    "addps " MEMACCESS(3) ",%%xmm4 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "movdqa %%xmm5,%%xmm6 \n"
    "mulps %%xmm1,%%xmm2 \n"
    "mulps %%xmm5,%%xmm6 \n"
    "mulps %%xmm2,%%xmm1 \n"
    "mulps %%xmm6,%%xmm5 \n"
    "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n"
    "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n"
    "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n"
    "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n"
    "addps %%xmm2,%%xmm0 \n"
    "addps %%xmm6,%%xmm4 \n"
    "addps %%xmm1,%%xmm0 \n"
    "addps %%xmm5,%%xmm4 \n"
    "cvttps2dq %%xmm0,%%xmm0 \n"
    "cvttps2dq %%xmm4,%%xmm4 \n"
    "packuswb %%xmm4,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x2,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
5410 : #endif // HAS_ARGBPOLYNOMIALROW_SSE2
5411 :
#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
// AVX2/FMA3 version of ARGBPolynomialRow: out = C0 + C1*X + C2*X^2 +
// C3*X^3 per channel, with the 4 coefficient vectors broadcast once
// outside the loop and Horner-style vfmadd accumulation inside.
// Processes 2 pixels per iteration; width must be even.
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile (
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n"
    "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
    "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
    "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"

    // 2 pixel loop.
    LABELALIGN
  "1: \n"
    "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n"  // 2 ARGB pixels
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "vcvtdq2ps %%ymm0,%%ymm0 \n"  // X 8 floats
    "vmulps %%ymm0,%%ymm0,%%ymm2 \n"  // X * X
    "vmulps %%ymm7,%%ymm0,%%ymm3 \n"  // C3 * X
    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n"  // result = C0 + C1 * X
    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n"  // result += C2 * X * X
    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n"  // result += C3 * X * X * X
    "vcvttps2dq %%ymm0,%%ymm0 \n"
    "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
    "vmovq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x2,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
5451 : #endif // HAS_ARGBPOLYNOMIALROW_AVX2
5452 :
#ifdef HAS_HALFFLOATROW_SSE2
// 1.9259299444e-34f == 2^-112. Pre-multiplying a float by this shifts
// its exponent down so that, after psrld $0xd (drop 13 mantissa bits),
// the low 16 bits hold the IEEE half-float encoding of scale * value.
static float kScaleBias = 1.9259299444e-34f;
// Convert a row of uint16 values to scaled IEEE half-floats:
// dst[i] = half(src[i] * scale). Uses the exponent-bias trick above
// instead of an F16C instruction. Processes 8 values per iteration.
void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
  asm volatile (
    "pshufd $0x0,%3,%%xmm4 \n"
    "pxor %%xmm5,%%xmm5 \n"
    "sub %0,%1 \n"

    // 8 pixel loop.
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm2 \n"  // 8 shorts
    "add $0x10,%0 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "punpcklwd %%xmm5,%%xmm2 \n"  // 8 ints in xmm2/1
    "cvtdq2ps %%xmm2,%%xmm2 \n"  // 8 floats
    "punpckhwd %%xmm5,%%xmm3 \n"
    "cvtdq2ps %%xmm3,%%xmm3 \n"
    "mulps %%xmm4,%%xmm2 \n"
    "mulps %%xmm4,%%xmm3 \n"
    "psrld $0xd,%%xmm2 \n"
    "psrld $0xd,%%xmm3 \n"
    "packssdw %%xmm3,%%xmm2 \n"
    MEMOPMEM(movdqu,xmm2,-0x10,0,1,1)
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  : "x"(scale * kScaleBias)   // %3
  : "memory", "cc",
    "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_HALFFLOATROW_SSE2
5487 : #endif // HAS_HALFFLOATROW_SSE2
5488 :
#ifdef HAS_HALFFLOATROW_AVX2
// AVX2 version of HalfFloatRow: dst[i] = half(src[i] * scale), using the
// same kScaleBias exponent trick as the SSE2 version (no F16C needed).
// Processes 16 values per iteration.
void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
  asm volatile (
    "vbroadcastss  %3, %%ymm4 \n"
    "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
    "sub %0,%1 \n"

    // 16 pixel loop.
    LABELALIGN
  "1: \n"
    "vmovdqu " MEMACCESS(0) ",%%ymm2 \n"  // 16 shorts
    "add $0x20,%0 \n"
    "vpunpckhwd %%ymm5,%%ymm2,%%ymm3 \n"  // mutates
    "vpunpcklwd %%ymm5,%%ymm2,%%ymm2 \n"
    "vcvtdq2ps %%ymm3,%%ymm3 \n"
    "vcvtdq2ps %%ymm2,%%ymm2 \n"
    "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
    "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
    "vpsrld $0xd,%%ymm3,%%ymm3 \n"
    "vpsrld $0xd,%%ymm2,%%ymm2 \n"
    "vpackssdw %%ymm3, %%ymm2, %%ymm2 \n"  // unmutates
    MEMOPMEM(vmovdqu,ymm2,-0x20,0,1,1)
    "sub $0x10,%2 \n"
    "jg 1b \n"

    "vzeroupper \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  : "x"(scale * kScaleBias)   // %3
  : "memory", "cc",
    "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_HALFFLOATROW_AVX2
5523 : #endif // HAS_HALFFLOATROW_AVX2
5524 :
#ifdef HAS_HALFFLOATROW_F16C
// F16C version of HalfFloatRow: dst[i] = half(src[i] * scale) using the
// hardware vcvtps2ph conversion ($3 = truncate rounding mode).
// Processes 16 values per iteration.
void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
  asm volatile (
    "vbroadcastss  %3, %%ymm4 \n"
    "sub %0,%1 \n"

    // 16 pixel loop.
    LABELALIGN
  "1: \n"
    "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n"  // 16 shorts -> 16 ints
    "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n"
    "vcvtdq2ps %%ymm2,%%ymm2 \n"
    "vcvtdq2ps %%ymm3,%%ymm3 \n"
    "vmulps %%ymm2,%%ymm4,%%ymm2 \n"
    "vmulps %%ymm3,%%ymm4,%%ymm3 \n"
    "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
    "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
    MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1)
    MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1)
    "add $0x20,%0 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  : "x"(scale)   // %3
  : "memory", "cc",
    "xmm2", "xmm3", "xmm4"
  );
}
#endif  // HAS_HALFFLOATROW_F16C
5556 : #endif // HAS_HALFFLOATROW_F16C
5557 :
#ifdef HAS_HALFFLOATROW_F16C
// F16C half-float conversion without scaling: dst[i] = half(src[i]).
// The unused float parameter keeps the signature compatible with the
// scaled HalfFloatRow variants. Processes 16 values per iteration.
void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float, int width) {
  asm volatile (
    "sub %0,%1 \n"
    // 16 pixel loop.
    LABELALIGN
  "1: \n"
    "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n"  // 16 shorts -> 16 ints
    "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n"
    "vcvtdq2ps %%ymm2,%%ymm2 \n"
    "vcvtdq2ps %%ymm3,%%ymm3 \n"
    "vcvtps2ph $3, %%ymm2, %%xmm2 \n"
    "vcvtps2ph $3, %%ymm3, %%xmm3 \n"
    MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1)
    MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1)
    "add $0x20,%0 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc",
    "xmm2", "xmm3"
  );
}
#endif  // HAS_HALFFLOATROW_F16C
5585 : #endif // HAS_HALFFLOATROW_F16C
5586 :
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table, in place. Each of the 4
// channel bytes indexes its own column of the 4-byte-stride table:
// dst[c] = table_argb[dst[c] * 4 + c]. Scalar x86; 1 pixel per loop.
void ARGBColorTableRow_X86(uint8* dst_argb,
                           const uint8* table_argb,
                           int width) {
  uintptr_t pixel_temp;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
  "1: \n"
    "movzb " MEMACCESS(0) ",%1 \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    MEMOPARG(movzb,0x00,3,1,4,1) " \n"  // movzb (%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x4,0) " \n"
    "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
    MEMOPARG(movzb,0x01,3,1,4,1) " \n"  // movzb 0x1(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x3,0) " \n"
    "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
    MEMOPARG(movzb,0x02,3,1,4,1) " \n"  // movzb 0x2(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x2,0) " \n"
    "movzb " MEMACCESS2(-0x1,0) ",%1 \n"
    MEMOPARG(movzb,0x03,3,1,4,1) " \n"  // movzb 0x3(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x1,0) " \n"
    "dec %2 \n"
    "jg 1b \n"
  : "+r"(dst_argb),     // %0
    "=&d"(pixel_temp),  // %1
    "+r"(width)         // %2
  : "r"(table_argb)     // %3
  : "memory", "cc");
}
#endif  // HAS_ARGBCOLORTABLEROW_X86
5618 : #endif // HAS_ARGBCOLORTABLEROW_X86
5619 :
#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table, in place. Same as
// ARGBColorTableRow_X86 but only the first 3 channel bytes of each
// 4-byte pixel are remapped; the alpha byte is left untouched.
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  uintptr_t pixel_temp;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
  "1: \n"
    "movzb " MEMACCESS(0) ",%1 \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    MEMOPARG(movzb,0x00,3,1,4,1) " \n"  // movzb (%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x4,0) " \n"
    "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
    MEMOPARG(movzb,0x01,3,1,4,1) " \n"  // movzb 0x1(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x3,0) " \n"
    "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
    MEMOPARG(movzb,0x02,3,1,4,1) " \n"  // movzb 0x2(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x2,0) " \n"
    "dec %2 \n"
    "jg 1b \n"
  : "+r"(dst_argb),     // %0
    "=&d"(pixel_temp),  // %1
    "+r"(width)         // %2
  : "r"(table_argb)     // %3
  : "memory", "cc");
}
#endif  // HAS_RGBCOLORTABLEROW_X86
5646 : #endif // HAS_RGBCOLORTABLEROW_X86
5647 :
#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform RGB pixels with luma table. For each ARGB pixel a luma value
// is computed via pmaddubsw with the packed coefficients in 'lumacoeff',
// masked to select a 256-byte-aligned sub-table of 'luma'; R, G and B
// are then remapped through that sub-table while alpha is copied
// unchanged. SIMD computes 4 lumas at a time; the byte remapping is
// scalar. Processes 4 pixels per iteration.
// NOTE(review): %1 (table_temp) is reloaded from xmm0 and re-biased with
// 'luma' before each pixel, so the repeated "add %5,%1" is intentional.
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
                                 uint8* dst_argb,
                                 int width,
                                 const uint8* luma,
                                 uint32 lumacoeff) {
  uintptr_t pixel_temp;
  uintptr_t table_temp;
  asm volatile (
    "movd %6,%%xmm3 \n"
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0x8,%%xmm4 \n"
    "pxor %%xmm5,%%xmm5 \n"

    // 4 pixel loop.
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(2) ",%%xmm0 \n"
    "pmaddubsw %%xmm3,%%xmm0 \n"
    "phaddw %%xmm0,%%xmm0 \n"
    "pand %%xmm4,%%xmm0 \n"
    "punpcklwd %%xmm5,%%xmm0 \n"
    "movd %%xmm0,%k1 \n"  // 32 bit offset
    "add %5,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"

    "movzb " MEMACCESS(2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb     (%1,%0,1),%0
    "mov %b0," MEMACCESS(3) " \n"
    "movzb " MEMACCESS2(0x1,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb     (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x1,3) " \n"
    "movzb " MEMACCESS2(0x2,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb     (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x2,3) " \n"
    "movzb " MEMACCESS2(0x3,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0x3,3) " \n"

    "movd %%xmm0,%k1 \n"  // 32 bit offset
    "add %5,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"

    "movzb " MEMACCESS2(0x4,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb     (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x4,3) " \n"
    "movzb " MEMACCESS2(0x5,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb     (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x5,3) " \n"
    "movzb " MEMACCESS2(0x6,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb     (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x6,3) " \n"
    "movzb " MEMACCESS2(0x7,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0x7,3) " \n"

    "movd %%xmm0,%k1 \n"  // 32 bit offset
    "add %5,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"

    "movzb " MEMACCESS2(0x8,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb     (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x8,3) " \n"
    "movzb " MEMACCESS2(0x9,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb     (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x9,3) " \n"
    "movzb " MEMACCESS2(0xa,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb     (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xa,3) " \n"
    "movzb " MEMACCESS2(0xb,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0xb,3) " \n"

    "movd %%xmm0,%k1 \n"  // 32 bit offset
    "add %5,%1 \n"

    "movzb " MEMACCESS2(0xc,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb     (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xc,3) " \n"
    "movzb " MEMACCESS2(0xd,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb     (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xd,3) " \n"
    "movzb " MEMACCESS2(0xe,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb     (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xe,3) " \n"
    "movzb " MEMACCESS2(0xf,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0xf,3) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "lea " MEMLEA(0x10,3) ",%3 \n"
    "sub $0x4,%4 \n"
    "jg 1b \n"
  : "=&d"(pixel_temp),  // %0
    "=&a"(table_temp),  // %1
    "+r"(src_argb),     // %2
    "+r"(dst_argb),     // %3
    "+rm"(width)        // %4
  : "r"(luma),          // %5
    "rm"(lumacoeff)     // %6
  : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
5748 : #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
5749 :
5750 : #endif // defined(__x86_64__) || defined(__i386__)
5751 :
5752 : #ifdef __cplusplus
5753 : } // extern "C"
5754 : } // namespace libyuv
5755 : #endif
|