Line data Source code
1 : /*
2 : * Copyright 2013 The LibYuv Project Authors. All rights reserved.
3 : *
4 : * Use of this source code is governed by a BSD-style license
5 : * that can be found in the LICENSE file in the root of the source
6 : * tree. An additional intellectual property rights grant can be found
7 : * in the file PATENTS. All contributing project authors may
8 : * be found in the AUTHORS file in the root of the source tree.
9 : */
10 :
11 : #include "libyuv/row.h"
12 : #include "libyuv/scale_row.h"
13 :
14 : #ifdef __cplusplus
15 : namespace libyuv {
16 : extern "C" {
17 : #endif
18 :
19 : // This module is for GCC x86 and x64.
20 : #if !defined(LIBYUV_DISABLE_X86) && \
21 : (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
22 :
// Offsets for source bytes 0 to 9
static uvec8 kShuf0 = {0, 1, 3, 4, 5, 7, 8, 9,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 = {3, 4, 5, 7, 8, 9, 11, 12,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 = {5, 7, 8, 9, 11, 12, 13, 15,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};

// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
static uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 = {5, 6, 6, 7, 8, 9, 9, 10,
                        10, 11, 12, 13, 13, 14, 14, 15};

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};

// Rounding constant added before the >>2 in the 3/4 box filters
// (rounds the divide-by-4 to nearest).
static vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};

// Shuffle 16 source bytes down to the first 6 of every 8/3 samples.
static uvec8 kShuf38a = {0, 3, 6, 8, 11, 14, 128, 128,
                         128, 128, 128, 128, 128, 128, 128, 128};

// Same samples placed at bytes 6..11 so the two halves can be summed.
static uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0, 3,
                         6, 8, 11, 14, 128, 128, 128, 128};

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc = {0, 1, 6, 7, 12, 13, 128, 128,
                        128, 128, 128, 128, 128, 128, 128, 128};

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0, 1,
                         6, 7, 12, 13, 128, 128, 128, 128};

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
                            65536 / 9, 65536 / 6, 0, 0};

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 = {0, 128, 3, 128, 6, 128, 8, 128,
                         11, 128, 14, 128, 128, 128, 128, 128};

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 = {1, 128, 4, 128, 7, 128, 9, 128,
                         12, 128, 15, 128, 128, 128, 128, 128};

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 = {2, 128, 5, 128, 128, 128, 10, 128,
                         13, 128, 128, 128, 128, 128, 128, 128};

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
                           65536 / 3, 65536 / 2, 0, 0};
90 :
91 : // GCC versions of row functions are verbatim conversions from Visual C.
92 : // Generated using gcc disassembly on Visual C object file:
93 : // objdump -D yuvscaler.obj >yuvscaler.txt
94 :
// 1/2 width point sampler. Reads 32 source bytes, writes 16: the
// psrlw $8 + packuswb sequence keeps the odd-indexed byte of each
// 16-bit pair. Processes 16 output bytes per loop; src_stride unused.
void ScaleRowDown2_SSSE3(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile (
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"  // keep high byte of each word
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}
119 :
// 1/2 width horizontal-average scaler. xmm4 is built as 16 bytes of 1,
// so pmaddubsw sums each adjacent byte pair into a word; pavgw against
// zero rounds the halving ((x+1)>>1). Reads 32 bytes, writes 16.
// src_stride is unused (single-row filter).
void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr,
                               int dst_width) {
  (void)src_stride;
  asm volatile (
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psrlw     $0xf,%%xmm4                     \n"  // words of 0x0001
    "packuswb  %%xmm4,%%xmm4                   \n"  // bytes of 0x01
    "pxor      %%xmm5,%%xmm5                   \n"  // zero for pavgw rounding

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10, 0) ",%%xmm1  \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // horizontal pair sums
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pavgw     %%xmm5,%%xmm0                   \n"  // (sum+1)>>1
    "pavgw     %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}
151 :
152 0 : void ScaleRowDown2Box_SSSE3(const uint8* src_ptr,
153 : ptrdiff_t src_stride,
154 : uint8* dst_ptr,
155 : int dst_width) {
156 : asm volatile (
157 : "pcmpeqb %%xmm4,%%xmm4 \n"
158 : "psrlw $0xf,%%xmm4 \n"
159 : "packuswb %%xmm4,%%xmm4 \n"
160 : "pxor %%xmm5,%%xmm5 \n"
161 :
162 : LABELALIGN
163 : "1: \n"
164 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
165 : "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
166 : MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
167 : MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
168 : "lea " MEMLEA(0x20,0) ",%0 \n"
169 : "pmaddubsw %%xmm4,%%xmm0 \n"
170 : "pmaddubsw %%xmm4,%%xmm1 \n"
171 : "pmaddubsw %%xmm4,%%xmm2 \n"
172 : "pmaddubsw %%xmm4,%%xmm3 \n"
173 : "paddw %%xmm2,%%xmm0 \n"
174 : "paddw %%xmm3,%%xmm1 \n"
175 : "psrlw $0x1,%%xmm0 \n"
176 : "psrlw $0x1,%%xmm1 \n"
177 : "pavgw %%xmm5,%%xmm0 \n"
178 : "pavgw %%xmm5,%%xmm1 \n"
179 : "packuswb %%xmm1,%%xmm0 \n"
180 : "movdqu %%xmm0," MEMACCESS(1) " \n"
181 : "lea " MEMLEA(0x10,1) ",%1 \n"
182 : "sub $0x10,%2 \n"
183 : "jg 1b \n"
184 : : "+r"(src_ptr), // %0
185 : "+r"(dst_ptr), // %1
186 : "+r"(dst_width) // %2
187 : : "r"((intptr_t)(src_stride)) // %3
188 : : "memory", "cc", NACL_R14
189 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
190 0 : );
191 0 : }
192 :
193 : #ifdef HAS_SCALEROWDOWN2_AVX2
// 1/2 width point sampler, AVX2. Reads 64 source bytes, writes the 32
// odd-indexed bytes; vpermq repairs the lane order after the in-lane
// vpackuswb. src_stride is unused.
void ScaleRowDown2_AVX2(const uint8* src_ptr,
                        ptrdiff_t src_stride,
                        uint8* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile (
    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"  // keep high byte of each word
    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"  // fix 128-bit lane interleave
    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}
220 :
// 1/2 width horizontal-average scaler, AVX2. ymm4 = bytes of 1, so
// vpmaddubsw sums adjacent byte pairs; vpavgw with zero rounds the
// halving. Reads 64 bytes, writes 32. src_stride is unused.
void ScaleRowDown2Linear_AVX2(const uint8* src_ptr,
                              ptrdiff_t src_stride,
                              uint8* dst_ptr,
                              int dst_width) {
  (void)src_stride;
  asm volatile (
    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"  // words of 0x0001
    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"  // bytes of 0x01
    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"  // zero for vpavgw rounding

    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20, 0) ",%%ymm1 \n"
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"  // horizontal pair sums
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"  // (sum+1)>>1
    "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // fix 128-bit lane interleave
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x20,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}
254 :
255 0 : void ScaleRowDown2Box_AVX2(const uint8* src_ptr,
256 : ptrdiff_t src_stride,
257 : uint8* dst_ptr,
258 : int dst_width) {
259 : asm volatile (
260 : "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
261 : "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
262 : "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
263 : "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
264 :
265 : LABELALIGN
266 : "1: \n"
267 : "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
268 : "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
269 : MEMOPREG(vmovdqu,0x00,0,3,1,ymm2) // vmovdqu (%0,%3,1),%%ymm2
270 : MEMOPREG(vmovdqu,0x20,0,3,1,ymm3) // vmovdqu 0x20(%0,%3,1),%%ymm3
271 : "lea " MEMLEA(0x40,0) ",%0 \n"
272 : "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"
273 : "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
274 : "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
275 : "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
276 : "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
277 : "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
278 : "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
279 : "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
280 : "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
281 : "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
282 : "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
283 : "vpermq $0xd8,%%ymm0,%%ymm0 \n"
284 : "vmovdqu %%ymm0," MEMACCESS(1) " \n"
285 : "lea " MEMLEA(0x20,1) ",%1 \n"
286 : "sub $0x20,%2 \n"
287 : "jg 1b \n"
288 : "vzeroupper \n"
289 : : "+r"(src_ptr), // %0
290 : "+r"(dst_ptr), // %1
291 : "+r"(dst_width) // %2
292 : : "r"((intptr_t)(src_stride)) // %3
293 : : "memory", "cc", NACL_R14
294 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
295 0 : );
296 0 : }
297 : #endif // HAS_SCALEROWDOWN2_AVX2
298 :
// 1/4 width point sampler. xmm5 masks to 0x00ff0000 per dword, so byte 2
// of every 4 source bytes survives the two packs. Reads 32 bytes, writes
// 8 per loop. src_stride is unused.
void ScaleRowDown4_SSSE3(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrld     $0x18,%%xmm5                    \n"  // dwords of 0x000000ff
    "pslld     $0x10,%%xmm5                    \n"  // dwords of 0x00ff0000

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"  // isolate byte 2 of each dword
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}
329 :
// 1/4 width, 1/4 height 4x4 box filter. Four rows are pair-summed with
// pmaddubsw (bytes of 1) and accumulated; phaddw folds the pair sums to
// 4-wide sums, then +8 (xmm5) and >>4 average the 16 pixels with
// rounding. Reads four 32-byte rows, writes 8 bytes per loop.
void ScaleRowDown4Box_SSSE3(const uint8* src_ptr,
                            ptrdiff_t src_stride,
                            uint8* dst_ptr,
                            int dst_width) {
  intptr_t stridex3;  // src_stride * 3, computed below with lea
  asm volatile (
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psrlw     $0xf,%%xmm4                     \n"  // words of 0x0001
    "movdqa    %%xmm4,%%xmm5                   \n"
    "packuswb  %%xmm4,%%xmm4                   \n"  // xmm4 = bytes of 0x01
    "psllw     $0x3,%%xmm5                     \n"  // xmm5 = words of 8 (rounding)
    "lea       " MEMLEA4(0x00,4,4,2) ",%3      \n"  // %3 = src_stride * 3

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)            //  movdqu  (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)            //  movdqu  0x10(%0,%4,1),%%xmm3
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // row 0 pair sums
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"  // row 1 pair sums
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm3,%%xmm1                   \n"
    MEMOPREG(movdqu,0x00,0,4,2,xmm2)            //  movdqu  (%0,%4,2),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,2,xmm3)            //  movdqu  0x10(%0,%4,2),%%xmm3
    "pmaddubsw %%xmm4,%%xmm2                   \n"  // row 2 pair sums
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm3,%%xmm1                   \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)            //  movdqu  (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)            //  movdqu  0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"  // row 3 pair sums
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm3,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"  // fold pairs -> 4x4 sums
    "paddw     %%xmm5,%%xmm0                   \n"  // +8 for rounding
    "psrlw     $0x4,%%xmm0                     \n"  // /16
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width),  // %2
    "=&r"(stridex3)   // %3
  : "r"((intptr_t)(src_stride))   // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
385 :
386 : #ifdef HAS_SCALEROWDOWN4_AVX2
// 1/4 width point sampler, AVX2. ymm5 masks to 0x00ff0000 per dword so
// byte 2 of every 4 survives; vpermq repairs lane order after each
// in-lane pack. Reads 64 bytes, writes 16 per loop. src_stride unused.
void ScaleRowDown4_AVX2(const uint8* src_ptr,
                        ptrdiff_t src_stride,
                        uint8* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile (
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
    "vpsrld    $0x18,%%ymm5,%%ymm5             \n"  // dwords of 0x000000ff
    "vpslld    $0x10,%%ymm5,%%ymm5             \n"  // dwords of 0x00ff0000
    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"  // isolate byte 2 of each dword
    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"  // fix 128-bit lane interleave
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vmovdqu   %%xmm0," MEMACCESS(1) "         \n"  // low 16 result bytes
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}
419 :
// 1/4 width, 1/4 height 4x4 box filter, AVX2. Same algorithm as the
// SSSE3 version: four rows pair-summed with vpmaddubsw and accumulated,
// vphaddw folds to 4-wide sums, then +8 (ymm5) and >>4 average 16
// pixels with rounding. Reads four 64-byte rows, writes 16 per loop.
void ScaleRowDown4Box_AVX2(const uint8* src_ptr,
                           ptrdiff_t src_stride,
                           uint8* dst_ptr,
                           int dst_width) {
  asm volatile (
    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"  // words of 0x0001
    "vpsllw     $0x3,%%ymm4,%%ymm5             \n"  // ymm5 = words of 8 (rounding)
    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"  // ymm4 = bytes of 0x01

    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)           //  vmovdqu  (%0,%3,1),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)           //  vmovdqu  0x20(%0,%3,1),%%ymm3
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"  // row 0 pair sums
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"  // row 1 pair sums
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
    MEMOPREG(vmovdqu,0x00,0,3,2,ymm2)           //  vmovdqu  (%0,%3,2),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,3,2,ymm3)           //  vmovdqu  0x20(%0,%3,2),%%ymm3
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"  // row 2 pair sums
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
    MEMOPREG(vmovdqu,0x00,0,4,1,ymm2)           //  vmovdqu  (%0,%4,1),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,4,1,ymm3)           //  vmovdqu  0x20(%0,%4,1),%%ymm3
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"  // row 3 pair sums
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // fold pairs -> 4x4 sums
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // fix 128-bit lane interleave
    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"  // +8 for rounding
    "vpsrlw     $0x4,%%ymm0,%%ymm0             \n"  // /16
    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%xmm0," MEMACCESS(1) "        \n"  // low 16 result bytes
    "lea        " MEMLEA(0x10,1) ",%1          \n"
    "sub        $0x10,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride)),     // %3
    "r"((intptr_t)(src_stride * 3))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
475 : #endif // HAS_SCALEROWDOWN4_AVX2
476 :
477 0 : void ScaleRowDown34_SSSE3(const uint8* src_ptr,
478 : ptrdiff_t src_stride,
479 : uint8* dst_ptr,
480 : int dst_width) {
481 : (void)src_stride;
482 : asm volatile(
483 : "movdqa %0,%%xmm3 \n"
484 : "movdqa %1,%%xmm4 \n"
485 : "movdqa %2,%%xmm5 \n"
486 : :
487 : : "m"(kShuf0), // %0
488 : "m"(kShuf1), // %1
489 : "m"(kShuf2) // %2
490 0 : );
491 : asm volatile (
492 : LABELALIGN
493 : "1: \n"
494 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
495 : "movdqu " MEMACCESS2(0x10,0) ",%%xmm2 \n"
496 : "lea " MEMLEA(0x20,0) ",%0 \n"
497 : "movdqa %%xmm2,%%xmm1 \n"
498 : "palignr $0x8,%%xmm0,%%xmm1 \n"
499 : "pshufb %%xmm3,%%xmm0 \n"
500 : "pshufb %%xmm4,%%xmm1 \n"
501 : "pshufb %%xmm5,%%xmm2 \n"
502 : "movq %%xmm0," MEMACCESS(1) " \n"
503 : "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
504 : "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
505 : "lea " MEMLEA(0x18,1) ",%1 \n"
506 : "sub $0x18,%2 \n"
507 : "jg 1b \n"
508 : : "+r"(src_ptr), // %0
509 : "+r"(dst_ptr), // %1
510 : "+r"(dst_width) // %2
511 : :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
512 0 : );
513 0 : }
514 :
515 0 : void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
516 : ptrdiff_t src_stride,
517 : uint8* dst_ptr,
518 : int dst_width) {
519 : asm volatile(
520 : "movdqa %0,%%xmm2 \n" // kShuf01
521 : "movdqa %1,%%xmm3 \n" // kShuf11
522 : "movdqa %2,%%xmm4 \n" // kShuf21
523 : :
524 : : "m"(kShuf01), // %0
525 : "m"(kShuf11), // %1
526 : "m"(kShuf21) // %2
527 0 : );
528 : asm volatile(
529 : "movdqa %0,%%xmm5 \n" // kMadd01
530 : "movdqa %1,%%xmm0 \n" // kMadd11
531 : "movdqa %2,%%xmm1 \n" // kRound34
532 : :
533 : : "m"(kMadd01), // %0
534 : "m"(kMadd11), // %1
535 : "m"(kRound34) // %2
536 0 : );
537 : asm volatile (
538 : LABELALIGN
539 : "1: \n"
540 : "movdqu " MEMACCESS(0) ",%%xmm6 \n"
541 : MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3),%%xmm7
542 : "pavgb %%xmm7,%%xmm6 \n"
543 : "pshufb %%xmm2,%%xmm6 \n"
544 : "pmaddubsw %%xmm5,%%xmm6 \n"
545 : "paddsw %%xmm1,%%xmm6 \n"
546 : "psrlw $0x2,%%xmm6 \n"
547 : "packuswb %%xmm6,%%xmm6 \n"
548 : "movq %%xmm6," MEMACCESS(1) " \n"
549 : "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
550 : MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3),%%xmm7
551 : "pavgb %%xmm7,%%xmm6 \n"
552 : "pshufb %%xmm3,%%xmm6 \n"
553 : "pmaddubsw %%xmm0,%%xmm6 \n"
554 : "paddsw %%xmm1,%%xmm6 \n"
555 : "psrlw $0x2,%%xmm6 \n"
556 : "packuswb %%xmm6,%%xmm6 \n"
557 : "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
558 : "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
559 : MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3),%%xmm7
560 : "lea " MEMLEA(0x20,0) ",%0 \n"
561 : "pavgb %%xmm7,%%xmm6 \n"
562 : "pshufb %%xmm4,%%xmm6 \n"
563 : "pmaddubsw %4,%%xmm6 \n"
564 : "paddsw %%xmm1,%%xmm6 \n"
565 : "psrlw $0x2,%%xmm6 \n"
566 : "packuswb %%xmm6,%%xmm6 \n"
567 : "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
568 : "lea " MEMLEA(0x18,1) ",%1 \n"
569 : "sub $0x18,%2 \n"
570 : "jg 1b \n"
571 : : "+r"(src_ptr), // %0
572 : "+r"(dst_ptr), // %1
573 : "+r"(dst_width) // %2
574 : : "r"((intptr_t)(src_stride)), // %3
575 : "m"(kMadd21) // %4
576 : : "memory", "cc", NACL_R14
577 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
578 0 : );
579 0 : }
580 :
581 0 : void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
582 : ptrdiff_t src_stride,
583 : uint8* dst_ptr,
584 : int dst_width) {
585 : asm volatile(
586 : "movdqa %0,%%xmm2 \n" // kShuf01
587 : "movdqa %1,%%xmm3 \n" // kShuf11
588 : "movdqa %2,%%xmm4 \n" // kShuf21
589 : :
590 : : "m"(kShuf01), // %0
591 : "m"(kShuf11), // %1
592 : "m"(kShuf21) // %2
593 0 : );
594 : asm volatile(
595 : "movdqa %0,%%xmm5 \n" // kMadd01
596 : "movdqa %1,%%xmm0 \n" // kMadd11
597 : "movdqa %2,%%xmm1 \n" // kRound34
598 : :
599 : : "m"(kMadd01), // %0
600 : "m"(kMadd11), // %1
601 : "m"(kRound34) // %2
602 0 : );
603 :
604 : asm volatile (
605 : LABELALIGN
606 : "1: \n"
607 : "movdqu " MEMACCESS(0) ",%%xmm6 \n"
608 : MEMOPREG(movdqu,0x00,0,3,1,xmm7) // movdqu (%0,%3,1),%%xmm7
609 : "pavgb %%xmm6,%%xmm7 \n"
610 : "pavgb %%xmm7,%%xmm6 \n"
611 : "pshufb %%xmm2,%%xmm6 \n"
612 : "pmaddubsw %%xmm5,%%xmm6 \n"
613 : "paddsw %%xmm1,%%xmm6 \n"
614 : "psrlw $0x2,%%xmm6 \n"
615 : "packuswb %%xmm6,%%xmm6 \n"
616 : "movq %%xmm6," MEMACCESS(1) " \n"
617 : "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
618 : MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3,1),%%xmm7
619 : "pavgb %%xmm6,%%xmm7 \n"
620 : "pavgb %%xmm7,%%xmm6 \n"
621 : "pshufb %%xmm3,%%xmm6 \n"
622 : "pmaddubsw %%xmm0,%%xmm6 \n"
623 : "paddsw %%xmm1,%%xmm6 \n"
624 : "psrlw $0x2,%%xmm6 \n"
625 : "packuswb %%xmm6,%%xmm6 \n"
626 : "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
627 : "movdqu " MEMACCESS2(0x10,0) ",%%xmm6 \n"
628 : MEMOPREG(movdqu,0x10,0,3,1,xmm7) // movdqu 0x10(%0,%3,1),%%xmm7
629 : "lea " MEMLEA(0x20,0) ",%0 \n"
630 : "pavgb %%xmm6,%%xmm7 \n"
631 : "pavgb %%xmm7,%%xmm6 \n"
632 : "pshufb %%xmm4,%%xmm6 \n"
633 : "pmaddubsw %4,%%xmm6 \n"
634 : "paddsw %%xmm1,%%xmm6 \n"
635 : "psrlw $0x2,%%xmm6 \n"
636 : "packuswb %%xmm6,%%xmm6 \n"
637 : "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
638 : "lea " MEMLEA(0x18,1) ",%1 \n"
639 : "sub $0x18,%2 \n"
640 : "jg 1b \n"
641 : : "+r"(src_ptr), // %0
642 : "+r"(dst_ptr), // %1
643 : "+r"(dst_width) // %2
644 : : "r"((intptr_t)(src_stride)), // %3
645 : "m"(kMadd21) // %4
646 : : "memory", "cc", NACL_R14
647 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
648 0 : );
649 0 : }
650 :
// 3/8 width point sampler. kShuf38a/kShuf38b pick 6 bytes from each
// 16-byte half (zeroing the rest) and paddusb merges them into 12
// contiguous output bytes per 32 input bytes. src_stride is unused.
void ScaleRowDown38_SSSE3(const uint8* src_ptr,
                          ptrdiff_t src_stride,
                          uint8* dst_ptr,
                          int dst_width) {
  (void)src_stride;
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"  // kShuf38a
    "movdqa    %4,%%xmm5                       \n"  // kShuf38b

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"  // merge the two shuffles
    "movq      %%xmm0," MEMACCESS(1) "         \n"  // first 8 bytes
    "movhlps   %%xmm0,%%xmm1                   \n"
    "movd      %%xmm1," MEMACCESS2(0x8,1) "    \n"  // remaining 4 bytes
    "lea       " MEMLEA(0xc,1) ",%1            \n"
    "sub       $0xc,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "m"(kShuf38a),   // %3
    "m"(kShuf38b)    // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}
682 :
683 0 : void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
684 : ptrdiff_t src_stride,
685 : uint8* dst_ptr,
686 : int dst_width) {
687 : asm volatile(
688 : "movdqa %0,%%xmm2 \n"
689 : "movdqa %1,%%xmm3 \n"
690 : "movdqa %2,%%xmm4 \n"
691 : "movdqa %3,%%xmm5 \n"
692 : :
693 : : "m"(kShufAb0), // %0
694 : "m"(kShufAb1), // %1
695 : "m"(kShufAb2), // %2
696 : "m"(kScaleAb2) // %3
697 0 : );
698 : asm volatile (
699 : LABELALIGN
700 : "1: \n"
701 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
702 : MEMOPREG(movdqu,0x00,0,3,1,xmm1) // movdqu (%0,%3,1),%%xmm1
703 : "lea " MEMLEA(0x10,0) ",%0 \n"
704 : "pavgb %%xmm1,%%xmm0 \n"
705 : "movdqa %%xmm0,%%xmm1 \n"
706 : "pshufb %%xmm2,%%xmm1 \n"
707 : "movdqa %%xmm0,%%xmm6 \n"
708 : "pshufb %%xmm3,%%xmm6 \n"
709 : "paddusw %%xmm6,%%xmm1 \n"
710 : "pshufb %%xmm4,%%xmm0 \n"
711 : "paddusw %%xmm0,%%xmm1 \n"
712 : "pmulhuw %%xmm5,%%xmm1 \n"
713 : "packuswb %%xmm1,%%xmm1 \n"
714 : "movd %%xmm1," MEMACCESS(1) " \n"
715 : "psrlq $0x10,%%xmm1 \n"
716 : "movd %%xmm1," MEMACCESS2(0x2,1) " \n"
717 : "lea " MEMLEA(0x6,1) ",%1 \n"
718 : "sub $0x6,%2 \n"
719 : "jg 1b \n"
720 : : "+r"(src_ptr), // %0
721 : "+r"(dst_ptr), // %1
722 : "+r"(dst_width) // %2
723 : : "r"((intptr_t)(src_stride)) // %3
724 : : "memory", "cc", NACL_R14
725 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
726 0 : );
727 0 : }
728 :
729 0 : void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
730 : ptrdiff_t src_stride,
731 : uint8* dst_ptr,
732 : int dst_width) {
733 : asm volatile(
734 : "movdqa %0,%%xmm2 \n"
735 : "movdqa %1,%%xmm3 \n"
736 : "movdqa %2,%%xmm4 \n"
737 : "pxor %%xmm5,%%xmm5 \n"
738 : :
739 : : "m"(kShufAc), // %0
740 : "m"(kShufAc3), // %1
741 : "m"(kScaleAc33) // %2
742 0 : );
743 : asm volatile (
744 : LABELALIGN
745 : "1: \n"
746 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
747 : MEMOPREG(movdqu,0x00,0,3,1,xmm6) // movdqu (%0,%3,1),%%xmm6
748 : "movhlps %%xmm0,%%xmm1 \n"
749 : "movhlps %%xmm6,%%xmm7 \n"
750 : "punpcklbw %%xmm5,%%xmm0 \n"
751 : "punpcklbw %%xmm5,%%xmm1 \n"
752 : "punpcklbw %%xmm5,%%xmm6 \n"
753 : "punpcklbw %%xmm5,%%xmm7 \n"
754 : "paddusw %%xmm6,%%xmm0 \n"
755 : "paddusw %%xmm7,%%xmm1 \n"
756 : MEMOPREG(movdqu,0x00,0,3,2,xmm6) // movdqu (%0,%3,2),%%xmm6
757 : "lea " MEMLEA(0x10,0) ",%0 \n"
758 : "movhlps %%xmm6,%%xmm7 \n"
759 : "punpcklbw %%xmm5,%%xmm6 \n"
760 : "punpcklbw %%xmm5,%%xmm7 \n"
761 : "paddusw %%xmm6,%%xmm0 \n"
762 : "paddusw %%xmm7,%%xmm1 \n"
763 : "movdqa %%xmm0,%%xmm6 \n"
764 : "psrldq $0x2,%%xmm0 \n"
765 : "paddusw %%xmm0,%%xmm6 \n"
766 : "psrldq $0x2,%%xmm0 \n"
767 : "paddusw %%xmm0,%%xmm6 \n"
768 : "pshufb %%xmm2,%%xmm6 \n"
769 : "movdqa %%xmm1,%%xmm7 \n"
770 : "psrldq $0x2,%%xmm1 \n"
771 : "paddusw %%xmm1,%%xmm7 \n"
772 : "psrldq $0x2,%%xmm1 \n"
773 : "paddusw %%xmm1,%%xmm7 \n"
774 : "pshufb %%xmm3,%%xmm7 \n"
775 : "paddusw %%xmm7,%%xmm6 \n"
776 : "pmulhuw %%xmm4,%%xmm6 \n"
777 : "packuswb %%xmm6,%%xmm6 \n"
778 : "movd %%xmm6," MEMACCESS(1) " \n"
779 : "psrlq $0x10,%%xmm6 \n"
780 : "movd %%xmm6," MEMACCESS2(0x2,1) " \n"
781 : "lea " MEMLEA(0x6,1) ",%1 \n"
782 : "sub $0x6,%2 \n"
783 : "jg 1b \n"
784 : : "+r"(src_ptr), // %0
785 : "+r"(dst_ptr), // %1
786 : "+r"(dst_width) // %2
787 : : "r"((intptr_t)(src_stride)) // %3
788 : : "memory", "cc", NACL_R14
789 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
790 0 : );
791 0 : }
792 :
// Reads 16xN bytes and produces 16 shorts at a time.
// Accumulates (saturating) 16 source bytes into 16 uint16 accumulators
// in dst_ptr per loop; dst_ptr is both read and written.
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  asm volatile (
    "pxor      %%xmm5,%%xmm5                   \n"  // zero for unpack

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_ptr += 16
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"  // current accumulators
    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm1   \n"
    "movdqa    %%xmm3,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"  // widen low 8 bytes to words
    "punpckhbw %%xmm5,%%xmm3                   \n"  // widen high 8 bytes
    "paddusw   %%xmm2,%%xmm0                   \n"  // saturating accumulate
    "paddusw   %%xmm3,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(src_width)  // %2
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
821 :
822 : #ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
// AVX2 variant of ScaleAddRow_SSE2: widens 32 source bytes to words
// (vpermq first so the unpacks produce in-order lanes) and saturating-
// adds them into the uint16 accumulators at dst_ptr.
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  asm volatile (
    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"  // zero for unpack

    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm3        \n"
    "lea        " MEMLEA(0x20,0) ",%0          \n"  // src_ptr += 32
    "vpermq     $0xd8,%%ymm3,%%ymm3            \n"  // order lanes for unpack
    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
    "vpaddusw   " MEMACCESS(1) ",%%ymm2,%%ymm0 \n"  // saturating accumulate
    "vpaddusw   " MEMACCESS2(0x20,1) ",%%ymm3,%%ymm1 \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(src_width)  // %2
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
850 : #endif // HAS_SCALEADDROW_AVX2
851 :
// Constant for making pixels signed to avoid pmaddubsw
// saturation: subtracting 0x80 recenters 0..255 bytes around zero.
static uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};

// Constant for making pixels unsigned and adding .5 for rounding
// (0x4040 = 0x80 << 7, restoring the bias after the 7-bit multiply).
static uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
                         0x4040, 0x4040, 0x4040, 0x4040};
860 :
// Bilinear column filtering. SSSE3 version.
// x and dx are fixed-point source positions/steps (the high 16 bits,
// extracted with pextrw $1/$3, index the source pixel; the fraction is
// shifted to a 7-bit blend weight with psrlw $9). Two output pixels are
// produced per loop, with a single-pixel tail at label 29.
void ScaleFilterCols_SSSE3(uint8* dst_ptr,
                           const uint8* src_ptr,
                           int dst_width,
                           int x,
                           int dx) {
  intptr_t x0, x1, temp_pixel;
  asm volatile (
    "movd      %6,%%xmm2                       \n"  // xmm2 = x
    "movd      %7,%%xmm3                       \n"  // xmm3 = dx
    "movl      $0x04040000,%k2                 \n"
    "movd      %k2,%%xmm5                      \n"  // shuffle for weight bytes
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x9,%%xmm6                     \n"  // 0x007f007f
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psrlw     $15,%%xmm7                      \n"  // 0x00010001

    "pextrw    $0x1,%%xmm2,%k3                 \n"  // x0 = integer part of x
    "subl      $0x2,%5                         \n"
    "jl        29f                             \n"
    "movdqa    %%xmm2,%%xmm0                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "punpckldq %%xmm0,%%xmm2                   \n"  // xmm2 = {x, x+dx}
    "punpckldq %%xmm3,%%xmm3                   \n"
    "paddd     %%xmm3,%%xmm3                   \n"  // xmm3 = 2*dx per iteration
    "pextrw    $0x3,%%xmm2,%k4                 \n"  // x1 = integer part of x+dx

    LABELALIGN
    "2:                                        \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "paddd     %%xmm3,%%xmm2                   \n"  // advance both positions
    MEMOPARG(movzwl,0x00,1,3,1,k2)              //  movzwl  (%1,%3,1),%k2
    "movd      %k2,%%xmm0                      \n"  // pixel pair at x0
    "psrlw     $0x9,%%xmm1                     \n"  // 7-bit fractions
    MEMOPARG(movzwl,0x00,1,4,1,k2)              //  movzwl  (%1,%4,1),%k2
    "movd      %k2,%%xmm4                      \n"  // pixel pair at x1
    "pshufb    %%xmm5,%%xmm1                   \n"
    "punpcklwd %%xmm4,%%xmm0                   \n"
    "psubb     %8,%%xmm0                       \n"  // make pixels signed.
    "pxor      %%xmm6,%%xmm1                   \n"  // 128 - f = (f ^ 127 ) + 1
    "paddusb   %%xmm7,%%xmm1                   \n"
    "pmaddubsw %%xmm0,%%xmm1                   \n"  // blend the pixel pairs
    "pextrw    $0x1,%%xmm2,%k3                 \n"
    "pextrw    $0x3,%%xmm2,%k4                 \n"
    "paddw     %9,%%xmm1                       \n"  // make pixels unsigned.
    "psrlw     $0x7,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movd      %%xmm1,%k2                      \n"
    "mov       %w2," MEMACCESS(0) "            \n"  // store two pixels
    "lea       " MEMLEA(0x2,0) ",%0            \n"
    "subl      $0x2,%5                         \n"
    "jge       2b                              \n"

    LABELALIGN
    "29:                                       \n"  // 0 or 1 pixel remainder
    "addl      $0x1,%5                         \n"
    "jl        99f                             \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)              //  movzwl  (%1,%3,1),%k2
    "movd      %k2,%%xmm0                      \n"
    "psrlw     $0x9,%%xmm2                     \n"
    "pshufb    %%xmm5,%%xmm2                   \n"
    "psubb     %8,%%xmm0                       \n"  // make pixels signed.
    "pxor      %%xmm6,%%xmm2                   \n"
    "paddusb   %%xmm7,%%xmm2                   \n"
    "pmaddubsw %%xmm0,%%xmm2                   \n"
    "paddw     %9,%%xmm2                       \n"  // make pixels unsigned.
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm2                   \n"
    "movd      %%xmm2,%k2                      \n"
    "mov       %b2," MEMACCESS(0) "            \n"  // store single pixel
    "99:                                       \n"
  : "+r"(dst_ptr),      // %0
    "+r"(src_ptr),      // %1
    "=&a"(temp_pixel),  // %2
    "=&r"(x0),          // %3
    "=&r"(x1),          // %4
#if defined(__x86_64__)
    "+rm"(dst_width)    // %5
#else
    "+m"(dst_width)     // %5
#endif
  : "rm"(x),            // %6
    "rm"(dx),           // %7
#if defined(__x86_64__)
    "x"(kFsub80),       // %8
    "x"(kFadd40)        // %9
#else
    "m"(kFsub80),       // %8
    "m"(kFadd40)        // %9
#endif
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
955 :
// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
// (2x horizontal upsample: punpcklbw/punpckhbw against self duplicate
// every byte. x and dx are ignored — the step is fixed at 1/2.)
void ScaleColsUp2_SSE2(uint8* dst_ptr,
                       const uint8* src_ptr,
                       int dst_width,
                       int x,
                       int dx) {
  (void)x;
  (void)dx;
  asm volatile (
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"  // duplicate low 8 bytes
    "punpckhbw %%xmm1,%%xmm1                   \n"  // duplicate high 8 bytes
    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"

  : "+r"(dst_ptr),   // %0
    "+r"(src_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}
985 :
// 1/2 width ARGB point sampler: shufps $0xdd keeps the odd-indexed
// 32-bit pixels of each 8-pixel group. Reads 8 ARGB pixels, writes 4.
// src_stride is unused.
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb,
                            int dst_width) {
  (void)src_stride;
  asm volatile (
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "shufps    $0xdd,%%xmm1,%%xmm0             \n"  // keep odd pixels
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}
1008 :
1009 0 : void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
1010 : ptrdiff_t src_stride,
1011 : uint8* dst_argb,
1012 : int dst_width) {
1013 : (void)src_stride;
1014 : asm volatile (
1015 : LABELALIGN
1016 : "1: \n"
1017 : "movdqu " MEMACCESS(0) ",%%xmm0 \n"
1018 : "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
1019 : "lea " MEMLEA(0x20,0) ",%0 \n"
1020 : "movdqa %%xmm0,%%xmm2 \n"
1021 : "shufps $0x88,%%xmm1,%%xmm0 \n"
1022 : "shufps $0xdd,%%xmm1,%%xmm2 \n"
1023 : "pavgb %%xmm2,%%xmm0 \n"
1024 : "movdqu %%xmm0," MEMACCESS(1) " \n"
1025 : "lea " MEMLEA(0x10,1) ",%1 \n"
1026 : "sub $0x4,%2 \n"
1027 : "jg 1b \n"
1028 : : "+r"(src_argb), // %0
1029 : "+r"(dst_argb), // %1
1030 : "+r"(dst_width) // %2
1031 : :: "memory", "cc", "xmm0", "xmm1"
1032 0 : );
1033 0 : }
1034 :
// 1/2x ARGB downsample with full 2x2 box filtering: averages each pixel
// with its horizontal neighbor and with the corresponding pair on the next
// row (src_stride bytes away). Reads 8 ARGB pixels from each of two rows,
// writes 4 per iteration.
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb,
                               int dst_width) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"   // row 0, pixels 0-3
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"  // row 0, pixels 4-7
    MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2  row 1, 0-3
    MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3  row 1, 4-7
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"              // vertical average
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"       // even-indexed pixels
    "shufps $0xdd,%%xmm1,%%xmm2 \n"       // odd-indexed pixels
    "pavgb %%xmm2,%%xmm0 \n"              // horizontal average
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
1065 :
// Point-samples every src_stepx-th ARGB pixel from one row: gathers 4
// pixels at stride src_stepx and writes 4 contiguous ARGB pixels per
// iteration. src_stride is unused (no vertical filtering).
// NOTE(review): the destination store is movdqu, so the historical 16-byte
// alignment requirement appears obsolete — confirm against callers.
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb,
                               int dst_width) {
  // Scaled to a byte step (x4, 4 bytes per ARGB pixel) by the first lea.
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  // Holds 3 * byte step, for addressing the fourth pixel of each group.
  intptr_t src_stepx_x12;
  (void)src_stride;
  asm volatile (
    "lea " MEMLEA3(0x00,1,4) ",%1 \n"     // %1 = src_stepx * 4 bytes
    "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"   // %4 = %1 * 3
    LABELALIGN
    "1: \n"
    "movd " MEMACCESS(0) ",%%xmm0 \n"     // sample 0
    MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1  sample 1
    "punpckldq %%xmm1,%%xmm0 \n"
    MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2  sample 2
    MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3  sample 3
    "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"   // advance src by 4 steps
    "punpckldq %%xmm3,%%xmm2 \n"
    "punpcklqdq %%xmm2,%%xmm0 \n"         // pack 4 pixels into xmm0
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb),       // %0
    "+r"(src_stepx_x4),   // %1
    "+r"(dst_argb),       // %2
    "+r"(dst_width),      // %3
    "=&r"(src_stepx_x12)  // %4
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
1102 :
// Box-filtered variant of ScaleARGBRowDownEven: each output pixel is the
// average of a 2x2 block (a pixel, its right neighbor, and the same pair on
// the row src_stride bytes below). Samples step src_stepx pixels apart;
// writes 4 ARGB pixels per iteration.
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb,
                                  int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);  // becomes byte step (x4)
  intptr_t src_stepx_x12;                         // becomes 3 * byte step
  intptr_t row1 = (intptr_t)(src_stride);         // becomes ptr to second row
  asm volatile (
    "lea " MEMLEA3(0x00,1,4) ",%1 \n"     // %1 = src_stepx * 4 bytes
    "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"   // %4 = %1 * 3
    "lea " MEMLEA4(0x00,0,5,1) ",%5 \n"   // %5 = src + src_stride

    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"     // row 0: pixel pair at step 0
    MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0  pair at step 1
    MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1  pair at step 2
    MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1  pair at step 3
    "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"   // advance row 0 by 4 steps
    "movq " MEMACCESS(5) ",%%xmm2 \n"     // row 1: same four pairs
    MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2
    MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3
    MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3
    "lea " MEMLEA4(0x00,5,1,4) ",%5 \n"   // advance row 1 by 4 steps
    "pavgb %%xmm2,%%xmm0 \n"              // vertical average
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"       // even-indexed pixels
    "shufps $0xdd,%%xmm1,%%xmm2 \n"       // odd-indexed pixels
    "pavgb %%xmm2,%%xmm0 \n"              // horizontal average
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%3 \n"
    "jg 1b \n"
  : "+r"(src_argb),        // %0
    "+r"(src_stepx_x4),    // %1
    "+r"(dst_argb),        // %2
    "+rm"(dst_width),      // %3
    "=&r"(src_stepx_x12),  // %4
    "+r"(row1)             // %5
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
1150 :
// Nearest-pixel (unfiltered) horizontal ARGB scaler. x and dx are 16.16
// fixed point: output pixel i is fetched from src_argb[(x + i*dx) >> 16].
// Gathers 4 pixels per main-loop pass, then handles 2- and 1-pixel tails.
void ScaleARGBCols_SSE2(uint8* dst_argb,
                        const uint8* src_argb,
                        int dst_width,
                        int x,
                        int dx) {
  intptr_t x0, x1;  // integer pixel indices extracted from the x accumulator
  asm volatile (
    // Build xmm2 = {x, x+dx, x+2*dx, x+3*dx} and broadcast 4*dx into xmm3.
    "movd %5,%%xmm2 \n"
    "movd %6,%%xmm3 \n"
    "pshufd $0x0,%%xmm2,%%xmm2 \n"        // broadcast x
    "pshufd $0x11,%%xmm3,%%xmm0 \n"       // {0, dx, 0, dx}
    "paddd %%xmm0,%%xmm2 \n"
    "paddd %%xmm3,%%xmm3 \n"              // 2*dx
    "pshufd $0x5,%%xmm3,%%xmm0 \n"        // {0, 0, 2*dx, 2*dx}
    "paddd %%xmm0,%%xmm2 \n"
    "paddd %%xmm3,%%xmm3 \n"              // 4*dx
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    "pextrw $0x1,%%xmm2,%k0 \n"           // index for pixel 0 (x >> 16)
    "pextrw $0x3,%%xmm2,%k1 \n"           // index for pixel 1
    "cmp $0x0,%4 \n"
    "jl 99f \n"                           // dst_width < 0: nothing to do
    "sub $0x4,%4 \n"
    "jl 49f \n"

    // Main loop: gather 4 pixels per pass.
    LABELALIGN
    "40: \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
    "pextrw $0x5,%%xmm2,%k0 \n"           // indices for pixels 2 and 3
    "pextrw $0x7,%%xmm2,%k1 \n"
    "paddd %%xmm3,%%xmm2 \n"              // advance accumulator by 4*dx
    "punpckldq %%xmm1,%%xmm0 \n"
    MEMOPREG(movd,0x00,3,0,4,xmm1) // movd (%3,%0,4),%%xmm1
    MEMOPREG(movd,0x00,3,1,4,xmm4) // movd (%3,%1,4),%%xmm4
    "pextrw $0x1,%%xmm2,%k0 \n"           // indices for the next pass
    "pextrw $0x3,%%xmm2,%k1 \n"
    "punpckldq %%xmm4,%%xmm1 \n"
    "punpcklqdq %%xmm1,%%xmm0 \n"         // pack 4 pixels
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "sub $0x4,%4 \n"
    "jge 40b \n"

    "49: \n"
    "test $0x2,%4 \n"                     // 2-pixel tail?
    "je 29f \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
    "pextrw $0x5,%%xmm2,%k0 \n"
    "punpckldq %%xmm1,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x8,2) ",%2 \n"
    "29: \n"
    "test $0x1,%4 \n"                     // 1-pixel tail?
    "je 99f \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
    "movd %%xmm0," MEMACCESS(2) " \n"
    "99: \n"
  : "=&a"(x0),       // %0
    "=&d"(x1),       // %1
    "+r"(dst_argb),  // %2
    "+r"(src_argb),  // %3
    "+r"(dst_width)  // %4
  : "rm"(x),   // %5
    "rm"(dx)   // %6
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
1220 :
// Reads 4 ARGB pixels, duplicates each and writes 8 ARGB pixels: 2x
// horizontal upsample. x and dx are ignored (fixed 2x step).
// NOTE(review): loads/stores are movdqu, so the historical 16-byte
// alignment requirement appears obsolete — confirm against callers.
void ScaleARGBColsUp2_SSE2(uint8* dst_argb,
                           const uint8* src_argb,
                           int dst_width,
                           int x,
                           int dx) {
  (void)x;   // unused: fixed 2x scale
  (void)dx;  // unused: fixed 2x scale
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(1) ",%%xmm0 \n"   // 4 source ARGB pixels
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpckldq %%xmm0,%%xmm0 \n"          // duplicate low 2 pixels
    "punpckhdq %%xmm1,%%xmm1 \n"          // duplicate high 2 pixels
    "movdqu %%xmm0," MEMACCESS(0) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,0) " \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "sub $0x8,%2 \n"                      // 8 output pixels per pass
    "jg 1b \n"

  : "+r"(dst_argb),  // %0
    "+r"(src_argb),  // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", NACL_R14
    "xmm0", "xmm1"
  );
}
1251 :
// Shuffle table for arranging 2 ARGB pixels into channel pairs for
// pmaddubsw: byte i of pixel 0 is interleaved with byte i of pixel 1, so
// each 16-bit pmaddubsw lane blends the same channel of both pixels.
static uvec8 kShuffleColARGB = {
    0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
    8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each (one
// fraction per pixel, repeated across that pixel's 8 interleaved bytes).
static uvec8 kShuffleFractions = {
    0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};
1262 :
1263 : // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
1264 0 : void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
1265 : const uint8* src_argb,
1266 : int dst_width,
1267 : int x,
1268 : int dx) {
1269 : intptr_t x0, x1;
1270 : asm volatile(
1271 : "movdqa %0,%%xmm4 \n"
1272 : "movdqa %1,%%xmm5 \n"
1273 : :
1274 : : "m"(kShuffleColARGB), // %0
1275 : "m"(kShuffleFractions) // %1
1276 0 : );
1277 :
1278 : asm volatile (
1279 : "movd %5,%%xmm2 \n"
1280 : "movd %6,%%xmm3 \n"
1281 : "pcmpeqb %%xmm6,%%xmm6 \n"
1282 : "psrlw $0x9,%%xmm6 \n"
1283 : "pextrw $0x1,%%xmm2,%k3 \n"
1284 : "sub $0x2,%2 \n"
1285 : "jl 29f \n"
1286 : "movdqa %%xmm2,%%xmm0 \n"
1287 : "paddd %%xmm3,%%xmm0 \n"
1288 : "punpckldq %%xmm0,%%xmm2 \n"
1289 : "punpckldq %%xmm3,%%xmm3 \n"
1290 : "paddd %%xmm3,%%xmm3 \n"
1291 : "pextrw $0x3,%%xmm2,%k4 \n"
1292 :
1293 : LABELALIGN
1294 : "2: \n"
1295 : "movdqa %%xmm2,%%xmm1 \n"
1296 : "paddd %%xmm3,%%xmm2 \n"
1297 : MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
1298 : "psrlw $0x9,%%xmm1 \n"
1299 : MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0
1300 : "pshufb %%xmm5,%%xmm1 \n"
1301 : "pshufb %%xmm4,%%xmm0 \n"
1302 : "pxor %%xmm6,%%xmm1 \n"
1303 : "pmaddubsw %%xmm1,%%xmm0 \n"
1304 : "psrlw $0x7,%%xmm0 \n"
1305 : "pextrw $0x1,%%xmm2,%k3 \n"
1306 : "pextrw $0x3,%%xmm2,%k4 \n"
1307 : "packuswb %%xmm0,%%xmm0 \n"
1308 : "movq %%xmm0," MEMACCESS(0) " \n"
1309 : "lea " MEMLEA(0x8,0) ",%0 \n"
1310 : "sub $0x2,%2 \n"
1311 : "jge 2b \n"
1312 :
1313 : LABELALIGN
1314 : "29: \n"
1315 : "add $0x1,%2 \n"
1316 : "jl 99f \n"
1317 : "psrlw $0x9,%%xmm2 \n"
1318 : MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
1319 : "pshufb %%xmm5,%%xmm2 \n"
1320 : "pshufb %%xmm4,%%xmm0 \n"
1321 : "pxor %%xmm6,%%xmm2 \n"
1322 : "pmaddubsw %%xmm2,%%xmm0 \n"
1323 : "psrlw $0x7,%%xmm0 \n"
1324 : "packuswb %%xmm0,%%xmm0 \n"
1325 : "movd %%xmm0," MEMACCESS(0) " \n"
1326 :
1327 : LABELALIGN
1328 : "99: \n"
1329 : : "+r"(dst_argb), // %0
1330 : "+r"(src_argb), // %1
1331 : "+rm"(dst_width), // %2
1332 : "=&r"(x0), // %3
1333 : "=&r"(x1) // %4
1334 : : "rm"(x), // %5
1335 : "rm"(dx) // %6
1336 : : "memory", "cc", NACL_R14
1337 : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
1338 0 : );
1339 0 : }
1340 :
// Divide num by div and return as 16.16 fixed point result.
// cdq + shld/shl build the 64-bit dividend ((int64)num << 16) in edx:eax,
// then idiv divides by div (held in ecx).
int FixedDiv_X86(int num, int div) {
  asm volatile(
      "cdq \n"
      "shld $0x10,%%eax,%%edx \n"  // edx = high 16 bits of num<<16
      "shl $0x10,%%eax \n"         // eax = low 32 bits of num<<16
      "idiv %1 \n"
      "mov %0, %%eax \n"  // %0 is constrained to eax, so this is a no-op
      : "+a"(num)  // %0
      : "c"(div)   // %1
      : "memory", "cc", "edx");
  return num;
}
1354 :
1355 : // Divide num - 1 by div - 1 and return as 16.16 fixed point result.
1356 0 : int FixedDiv1_X86(int num, int div) {
1357 : asm volatile(
1358 : "cdq \n"
1359 : "shld $0x10,%%eax,%%edx \n"
1360 : "shl $0x10,%%eax \n"
1361 : "sub $0x10001,%%eax \n"
1362 : "sbb $0x0,%%edx \n"
1363 : "sub $0x1,%1 \n"
1364 : "idiv %1 \n"
1365 : "mov %0, %%eax \n"
1366 : : "+a"(num) // %0
1367 : : "c"(div) // %1
1368 0 : : "memory", "cc", "edx");
1369 0 : return num;
1370 : }
1371 :
1372 : #endif // defined(__x86_64__) || defined(__i386__)
1373 :
1374 : #ifdef __cplusplus
1375 : } // extern "C"
1376 : } // namespace libyuv
1377 : #endif
|