Line data Source code
1 : /********************************************************************
2 : * *
3 : * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
4 : * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
5 : * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6 : * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
7 : * *
8 : * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
9 : * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
10 : * *
11 : ********************************************************************
12 :
13 : function:
14 : last mod: $Id: mmxidct.c 17446 2010-09-23 20:06:20Z tterribe $
15 :
16 : ********************************************************************/
17 :
18 : /*MMX acceleration of Theora's iDCT.
19 : Originally written by Rudolf Marek, based on code from On2's VP3.*/
20 : #include "x86int.h"
21 : #include "../dct.h"
22 :
23 : #if defined(OC_X86_ASM)
24 :
25 : /*These are offsets into the table of constants below.
  NOTE(review): no table is defined in the code shown here, and neither macro
   is referenced by it; the assembly addresses the [c] operand
   (OC_IDCT_CONSTS) with literal byte offsets instead: 0x00 for the value that
   is added before each final >>4, and 0x10...0x70 for the multipliers.
  Confirm that these two offsets still match the table layout before relying
   on them.*/
26 : /*7 rows of cosines, in order: pi/16 * (1 ... 7).*/
27 : #define OC_COSINE_OFFSET (0)
28 : /*A row of 8's.*/
29 : #define OC_EIGHT_OFFSET (56)
30 :
31 :
32 :
33 : /*38 cycles.
  Shared first stage of the 1-D iDCT, applied to four 16-bit lanes at once.
  It loads the eight inputs I(0)...I(3) and J(4)...J(7) of _x, scales them by
   the words at byte offsets 0x10...0x70 of the constant operand [c], and
   leaves the following state for OC_ROW_IDCT/OC_COLUMN_IDCT to finish (the
   names follow the comments in those two macros):
    r0 = C'   r1 = H'   r2 = R2 (A''-H', as implied by the R1 computation)
    r4 = E    r5 = B''  r6 = F'   r7 = G
    I(2) of _y = D'
  I(1) of _y is used as a spill slot (its value is reloaded into r0 on the
   last lines), and r3 is scratch on exit.
  The stores to I(1) and I(2) of _y are issued only after the matching inputs
   of _x have been loaded, so _x and _y may name the same rows (which is how
   OC_COLUMN_IDCT uses this macro).
  pmulhw keeps only the high 16 bits of each signed 16x16 product.
  For the multipliers at 0x10...0x50 the multiplicand is added back with a
   paddw afterwards; presumably those constants do not fit in a signed 16-bit
   word (TODO confirm against the table).
  The multipliers at 0x60 and 0x70 are used without that correction.
  OC_I, OC_J and OC_MEM_OFFS must be defined where this macro is expanded.*/
34 : #define OC_IDCT_BEGIN(_y,_x) \
35 : "#OC_IDCT_BEGIN\n\t" \
36 : "movq "OC_I(3,_x)",%%mm2\n\t" \
37 : "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
38 : "movq %%mm2,%%mm4\n\t" \
39 : "movq "OC_J(5,_x)",%%mm7\n\t" \
40 : "pmulhw %%mm6,%%mm4\n\t" \
41 : "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
42 : "pmulhw %%mm7,%%mm6\n\t" \
43 : "movq %%mm1,%%mm5\n\t" \
44 : "pmulhw %%mm2,%%mm1\n\t" \
45 : "movq "OC_I(1,_x)",%%mm3\n\t" \
46 : "pmulhw %%mm7,%%mm5\n\t" \
47 : "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
48 : "paddw %%mm2,%%mm4\n\t" \
49 : "paddw %%mm7,%%mm6\n\t" \
50 : "paddw %%mm1,%%mm2\n\t" \
51 : "movq "OC_J(7,_x)",%%mm1\n\t" \
52 : "paddw %%mm5,%%mm7\n\t" \
53 : "movq %%mm0,%%mm5\n\t" \
54 : "pmulhw %%mm3,%%mm0\n\t" \
55 : "paddw %%mm7,%%mm4\n\t" \
56 : "pmulhw %%mm1,%%mm5\n\t" \
57 : "movq "OC_MEM_OFFS(0x70,c)",%%mm7\n\t" \
58 : "psubw %%mm2,%%mm6\n\t" \
59 : "paddw %%mm3,%%mm0\n\t" \
60 : "pmulhw %%mm7,%%mm3\n\t" \
61 : "movq "OC_I(2,_x)",%%mm2\n\t" \
62 : "pmulhw %%mm1,%%mm7\n\t" \
63 : "paddw %%mm1,%%mm5\n\t" \
64 : "movq %%mm2,%%mm1\n\t" \
65 : "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm2\n\t" \
66 : "psubw %%mm5,%%mm3\n\t" \
67 : "movq "OC_J(6,_x)",%%mm5\n\t" \
68 : "paddw %%mm7,%%mm0\n\t" \
69 : "movq %%mm5,%%mm7\n\t" \
70 : "psubw %%mm4,%%mm0\n\t" \
71 : "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
72 : "paddw %%mm1,%%mm2\n\t" \
73 : "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
74 : "paddw %%mm4,%%mm4\n\t" \
75 : "paddw %%mm0,%%mm4\n\t" \
76 : "psubw %%mm6,%%mm3\n\t" \
77 : "paddw %%mm7,%%mm5\n\t" \
78 : "paddw %%mm6,%%mm6\n\t" \
79 : "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
80 : "paddw %%mm3,%%mm6\n\t" \
81 : "movq %%mm4,"OC_I(1,_y)"\n\t" \
82 : "psubw %%mm5,%%mm1\n\t" \
83 : "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
84 : "movq %%mm3,%%mm5\n\t" \
85 : "pmulhw %%mm4,%%mm3\n\t" \
86 : "paddw %%mm2,%%mm7\n\t" \
87 : "movq %%mm6,"OC_I(2,_y)"\n\t" \
88 : "movq %%mm0,%%mm2\n\t" \
89 : "movq "OC_I(0,_x)",%%mm6\n\t" \
90 : "pmulhw %%mm4,%%mm0\n\t" \
91 : "paddw %%mm3,%%mm5\n\t" \
92 : "movq "OC_J(4,_x)",%%mm3\n\t" \
93 : "psubw %%mm1,%%mm5\n\t" \
94 : "paddw %%mm0,%%mm2\n\t" \
95 : "psubw %%mm3,%%mm6\n\t" \
96 : "movq %%mm6,%%mm0\n\t" \
97 : "pmulhw %%mm4,%%mm6\n\t" \
98 : "paddw %%mm3,%%mm3\n\t" \
99 : "paddw %%mm1,%%mm1\n\t" \
100 : "paddw %%mm0,%%mm3\n\t" \
101 : "paddw %%mm5,%%mm1\n\t" \
102 : "pmulhw %%mm3,%%mm4\n\t" \
103 : "paddw %%mm0,%%mm6\n\t" \
104 : "psubw %%mm2,%%mm6\n\t" \
105 : "paddw %%mm2,%%mm2\n\t" \
106 : "movq "OC_I(1,_y)",%%mm0\n\t" \
107 : "paddw %%mm6,%%mm2\n\t" \
108 : "paddw %%mm3,%%mm4\n\t" \
109 : "psubw %%mm1,%%mm2\n\t" \
110 : "#end OC_IDCT_BEGIN\n\t" \
111 :
112 : /*38+8=46 cycles.
  Row pass of the 1-D iDCT: runs OC_IDCT_BEGIN on _x and then forms the eight
   outputs R0...R7 with butterflies, without any rounding or shifting.
  Each butterfly is written as "psubw a,b; paddw a,a; paddw b,a", which leaves
   b=b-a and a=a+b (in terms of the values before the triple).
  On exit r0 and r2...r7 hold R0 and R2...R7, and R1 has been stored at I(1)
   of _y, which is the layout OC_TRANSPOSE expects on entry.*/
113 : #define OC_ROW_IDCT(_y,_x) \
114 : "#OC_ROW_IDCT\n" \
115 : OC_IDCT_BEGIN(_y,_x) \
116 : /*r3=D'*/ \
117 : "movq "OC_I(2,_y)",%%mm3\n\t" \
118 : /*r4=E'=E-G*/ \
119 : "psubw %%mm7,%%mm4\n\t" \
120 : /*r1=H'+H'*/ \
121 : "paddw %%mm1,%%mm1\n\t" \
122 : /*r7=G+G*/ \
123 : "paddw %%mm7,%%mm7\n\t" \
124 : /*r1=R1=A''+H'*/ \
125 : "paddw %%mm2,%%mm1\n\t" \
126 : /*r7=G'=E+G*/ \
127 : "paddw %%mm4,%%mm7\n\t" \
128 : /*r4=R4=E'-D'*/ \
129 : "psubw %%mm3,%%mm4\n\t" \
130 : "paddw %%mm3,%%mm3\n\t" \
131 : /*r6=R6=F'-B''*/ \
132 : "psubw %%mm5,%%mm6\n\t" \
133 : "paddw %%mm5,%%mm5\n\t" \
134 : /*r3=R3=E'+D'*/ \
135 : "paddw %%mm4,%%mm3\n\t" \
136 : /*r5=R5=F'+B''*/ \
137 : "paddw %%mm6,%%mm5\n\t" \
138 : /*r7=R7=G'-C'*/ \
139 : "psubw %%mm0,%%mm7\n\t" \
140 : "paddw %%mm0,%%mm0\n\t" \
141 : /*Save R1.*/ \
142 : "movq %%mm1,"OC_I(1,_y)"\n\t" \
143 : /*r0=R0=G'+C'*/ \
144 : "paddw %%mm7,%%mm0\n\t" \
145 : "#end OC_ROW_IDCT\n\t" \
146 :
147 : /*The following macro does two 4x4 transposes in place.
148 : At entry, we assume (words listed from most to least significant):
149 : r0 = a3 a2 a1 a0
150 : I(1) = b3 b2 b1 b0
151 : r2 = c3 c2 c1 c0
152 : r3 = d3 d2 d1 d0
153 :
154 : r4 = e3 e2 e1 e0
155 : r5 = f3 f2 f1 f0
156 : r6 = g3 g2 g1 g0
157 : r7 = h3 h2 h1 h0
158 :
159 : At exit, we have:
160 : I(0) = d0 c0 b0 a0
161 : I(1) = d1 c1 b1 a1
162 : I(2) = d2 c2 b2 a2
163 : I(3) = d3 c3 b3 a3
164 :
165 : J(4) = h0 g0 f0 e0
166 : J(5) = h1 g1 f1 e1
167 : J(6) = h2 g2 f2 e2
168 : J(7) = h3 g3 f3 e3
169 :
170 : I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
171 : J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
172 :
173 : Since r1 is free at entry, we calculate the Js first.
  r0 is parked at I(0) while that happens, which frees a second scratch
   register; it is reloaded (into r4) for the transpose of the Is.
  I(k) and J(k) expand through OC_I and OC_J, which must be defined where
   this macro is expanded; all registers r0...r7 are overwritten.*/
174 : /*19 cycles.*/
175 : #define OC_TRANSPOSE(_y) \
176 : "#OC_TRANSPOSE\n\t" \
177 : "movq %%mm4,%%mm1\n\t" \
178 : "punpcklwd %%mm5,%%mm4\n\t" \
179 : "movq %%mm0,"OC_I(0,_y)"\n\t" \
180 : "punpckhwd %%mm5,%%mm1\n\t" \
181 : "movq %%mm6,%%mm0\n\t" \
182 : "punpcklwd %%mm7,%%mm6\n\t" \
183 : "movq %%mm4,%%mm5\n\t" \
184 : "punpckldq %%mm6,%%mm4\n\t" \
185 : "punpckhdq %%mm6,%%mm5\n\t" \
186 : "movq %%mm1,%%mm6\n\t" \
187 : "movq %%mm4,"OC_J(4,_y)"\n\t" \
188 : "punpckhwd %%mm7,%%mm0\n\t" \
189 : "movq %%mm5,"OC_J(5,_y)"\n\t" \
190 : "punpckhdq %%mm0,%%mm6\n\t" \
191 : "movq "OC_I(0,_y)",%%mm4\n\t" \
192 : "punpckldq %%mm0,%%mm1\n\t" \
193 : "movq "OC_I(1,_y)",%%mm5\n\t" \
194 : "movq %%mm4,%%mm0\n\t" \
195 : "movq %%mm6,"OC_J(7,_y)"\n\t" \
196 : "punpcklwd %%mm5,%%mm0\n\t" \
197 : "movq %%mm1,"OC_J(6,_y)"\n\t" \
198 : "punpckhwd %%mm5,%%mm4\n\t" \
199 : "movq %%mm2,%%mm5\n\t" \
200 : "punpcklwd %%mm3,%%mm2\n\t" \
201 : "movq %%mm0,%%mm1\n\t" \
202 : "punpckldq %%mm2,%%mm0\n\t" \
203 : "punpckhdq %%mm2,%%mm1\n\t" \
204 : "movq %%mm4,%%mm2\n\t" \
205 : "movq %%mm0,"OC_I(0,_y)"\n\t" \
206 : "punpckhwd %%mm3,%%mm5\n\t" \
207 : "movq %%mm1,"OC_I(1,_y)"\n\t" \
208 : "punpckhdq %%mm5,%%mm4\n\t" \
209 : "punpckldq %%mm5,%%mm2\n\t" \
210 : "movq %%mm4,"OC_I(3,_y)"\n\t" \
211 : "movq %%mm2,"OC_I(2,_y)"\n\t" \
212 : "#end OC_TRANSPOSE\n\t" \
213 :
214 : /*38+19=57 cycles.
  Column pass of the 1-D iDCT, done in place on _y: runs OC_IDCT_BEGIN with
   _y as both source and destination, then forms the same butterflies as
   OC_ROW_IDCT.
  Before each butterfly's second member is derived from the first, the word
   at byte offset 0x00 of [c] is added to the first, so both members carry
   that bias; every result is then shifted right arithmetically by 4 (psraw).
  NOTE(review): the value at offset 0x00 is presumably 8, i.e., the rounding
   term for the >>4; the table is not visible here, so confirm.
  The shifted results NR0...NR3 are stored at I(0)...I(3) and NR4...NR7 at
   J(4)...J(7) of _y.*/
215 : #define OC_COLUMN_IDCT(_y) \
216 : "#OC_COLUMN_IDCT\n" \
217 : OC_IDCT_BEGIN(_y,_y) \
218 : "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
219 : /*r1=H'+H'*/ \
220 : "paddw %%mm1,%%mm1\n\t" \
221 : /*r1=R1=A''+H'*/ \
222 : "paddw %%mm2,%%mm1\n\t" \
223 : /*r2=NR2*/ \
224 : "psraw $4,%%mm2\n\t" \
225 : /*r4=E'=E-G*/ \
226 : "psubw %%mm7,%%mm4\n\t" \
227 : /*r1=NR1*/ \
228 : "psraw $4,%%mm1\n\t" \
229 : /*r3=D'*/ \
230 : "movq "OC_I(2,_y)",%%mm3\n\t" \
231 : /*r7=G+G*/ \
232 : "paddw %%mm7,%%mm7\n\t" \
233 : /*Store NR2 at I(2).*/ \
234 : "movq %%mm2,"OC_I(2,_y)"\n\t" \
235 : /*r7=G'=E+G*/ \
236 : "paddw %%mm4,%%mm7\n\t" \
237 : /*Store NR1 at I(1).*/ \
238 : "movq %%mm1,"OC_I(1,_y)"\n\t" \
239 : /*r4=R4=E'-D'*/ \
240 : "psubw %%mm3,%%mm4\n\t" \
241 : "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
242 : /*r3=D'+D'*/ \
243 : "paddw %%mm3,%%mm3\n\t" \
244 : /*r3=R3=E'+D'*/ \
245 : "paddw %%mm4,%%mm3\n\t" \
246 : /*r4=NR4*/ \
247 : "psraw $4,%%mm4\n\t" \
248 : /*r6=R6=F'-B''*/ \
249 : "psubw %%mm5,%%mm6\n\t" \
250 : /*r3=NR3*/ \
251 : "psraw $4,%%mm3\n\t" \
252 : "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
253 : /*r5=B''+B''*/ \
254 : "paddw %%mm5,%%mm5\n\t" \
255 : /*r5=R5=F'+B''*/ \
256 : "paddw %%mm6,%%mm5\n\t" \
257 : /*r6=NR6*/ \
258 : "psraw $4,%%mm6\n\t" \
259 : /*Store NR4 at J(4).*/ \
260 : "movq %%mm4,"OC_J(4,_y)"\n\t" \
261 : /*r5=NR5*/ \
262 : "psraw $4,%%mm5\n\t" \
263 : /*Store NR3 at I(3).*/ \
264 : "movq %%mm3,"OC_I(3,_y)"\n\t" \
265 : /*r7=R7=G'-C'*/ \
266 : "psubw %%mm0,%%mm7\n\t" \
267 : "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
268 : /*r0=C'+C'*/ \
269 : "paddw %%mm0,%%mm0\n\t" \
270 : /*r0=R0=G'+C'*/ \
271 : "paddw %%mm7,%%mm0\n\t" \
272 : /*r7=NR7*/ \
273 : "psraw $4,%%mm7\n\t" \
274 : /*Store NR6 at J(6).*/ \
275 : "movq %%mm6,"OC_J(6,_y)"\n\t" \
276 : /*r0=NR0*/ \
277 : "psraw $4,%%mm0\n\t" \
278 : /*Store NR5 at J(5).*/ \
279 : "movq %%mm5,"OC_J(5,_y)"\n\t" \
280 : /*Store NR7 at J(7).*/ \
281 : "movq %%mm7,"OC_J(7,_y)"\n\t" \
282 : /*Store NR0 at I(0).*/ \
283 : "movq %%mm0,"OC_I(0,_y)"\n\t" \
284 : "#end OC_COLUMN_IDCT\n\t" \
285 :
/*Full 8x8 inverse DCT: two row passes (each followed by a transpose) and two
   column passes, all inside a single asm statement.
  _y: Receives the 64 output values.
  _x: The 64 input coefficients, in the partially transposed layout described
       below.
      If _x and _y are different arrays, all 64 entries of _x are set to zero
       before returning.
  OC_IDCT_CONSTS, OC_MEM_OFFS and the *_ARRAY_OPERAND helpers are not defined
   in the code shown here.
  NOTE(review): none of the asm statements lists the MMX registers as
   clobbered and no emms is executed here; presumably the caller is
   responsible for the FPU/MMX state.*/
286 0 : static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
287 : /*This routine accepts an 8x8 matrix, but in partially transposed form.
288 : Every 4x4 block is transposed.*/
289 0 : __asm__ __volatile__(
/*Row pass over the first four 16-byte rows: I(k) is the first 8 bytes of row
   k, J(k) is the second 8 bytes of row k-4.*/
290 : #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
291 : #define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y)
292 : OC_ROW_IDCT(y,x)
293 : OC_TRANSPOSE(y)
294 : #undef OC_I
295 : #undef OC_J
/*Row pass over the last four rows: the same mapping, 64 bytes further on.*/
296 : #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+64,_y)
297 : #define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+72,_y)
298 : OC_ROW_IDCT(y,x)
299 : OC_TRANSPOSE(y)
300 : #undef OC_I
301 : #undef OC_J
/*Column pass over the left four columns: I(k) and J(k) both name the first 8
   bytes of row k.*/
302 : #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
303 : #define OC_J(_k,_y) OC_I(_k,_y)
304 : OC_COLUMN_IDCT(y)
305 : #undef OC_I
306 : #undef OC_J
/*Column pass over the right four columns: as above, 8 bytes further on.*/
307 : #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y)
308 : #define OC_J(_k,_y) OC_I(_k,_y)
309 : OC_COLUMN_IDCT(y)
310 : #undef OC_I
311 : #undef OC_J
312 0 : :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
313 0 : :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
314 0 : [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
315 : );
/*Clear the input block unless the transform was done in place.*/
316 0 : if(_x!=_y){
317 : int i;
/*NOTE(review): mm0 is zeroed in its own asm statement and then used by the
   statements in the loop; this relies on nothing modifying mm0 in between.*/
318 0 : __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
/*Each iteration stores four quadwords, i.e., 16 coefficients (32 bytes), so
   four iterations cover all 64.*/
319 0 : for(i=0;i<4;i++){
320 0 : __asm__ __volatile__(
321 : "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
322 : "movq %%mm0,"OC_MEM_OFFS(0x08,x)"\n\t"
323 : "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
324 : "movq %%mm0,"OC_MEM_OFFS(0x18,x)"\n\t"
325 0 : :[x]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_x+16*i,16)
326 : );
327 : }
328 : }
329 0 : }
330 :
331 : /*25 cycles.
  Shortened form of OC_IDCT_BEGIN that never reads J(4)...J(7): only I(0),
   I(1), I(2) and I(3) of _x are loaded, and the terms that would depend on
   the J inputs are omitted (equivalent to treating those inputs as zero).
  It is followed by the same finishing steps as OC_IDCT_BEGIN, so it leaves
   the same state (names as in the comments of OC_ROW_IDCT_10):
    r0 = C'   r1 = H'   r2 = R2   r4 = E   r5 = B''   r6 = F'   r7 = G
    I(2) of _y = D', with I(1) of _y used as a spill slot that is reloaded
     into r0 at the end.
  The two nop instructions do no work.
  NOTE(review): they are presumably scheduling padding; confirm before
   removing them.
  OC_I and OC_MEM_OFFS must be defined where this macro is expanded.*/
332 : #define OC_IDCT_BEGIN_10(_y,_x) \
333 : "#OC_IDCT_BEGIN_10\n\t" \
334 : "movq "OC_I(3,_x)",%%mm2\n\t" \
335 : "nop\n\t" \
336 : "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
337 : "movq %%mm2,%%mm4\n\t" \
338 : "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
339 : "pmulhw %%mm6,%%mm4\n\t" \
340 : "movq "OC_I(1,_x)",%%mm3\n\t" \
341 : "pmulhw %%mm2,%%mm1\n\t" \
342 : "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
343 : "paddw %%mm2,%%mm4\n\t" \
344 : "pxor %%mm6,%%mm6\n\t" \
345 : "paddw %%mm1,%%mm2\n\t" \
346 : "movq "OC_I(2,_x)",%%mm5\n\t" \
347 : "pmulhw %%mm3,%%mm0\n\t" \
348 : "movq %%mm5,%%mm1\n\t" \
349 : "paddw %%mm3,%%mm0\n\t" \
350 : "pmulhw "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
351 : "psubw %%mm2,%%mm6\n\t" \
352 : "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
353 : "psubw %%mm4,%%mm0\n\t" \
354 : "movq "OC_I(2,_x)",%%mm7\n\t" \
355 : "paddw %%mm4,%%mm4\n\t" \
356 : "paddw %%mm5,%%mm7\n\t" \
357 : "paddw %%mm0,%%mm4\n\t" \
358 : "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
359 : "psubw %%mm6,%%mm3\n\t" \
360 : "movq %%mm4,"OC_I(1,_y)"\n\t" \
361 : "paddw %%mm6,%%mm6\n\t" \
362 : "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
363 : "paddw %%mm3,%%mm6\n\t" \
364 : "movq %%mm3,%%mm5\n\t" \
365 : "pmulhw %%mm4,%%mm3\n\t" \
366 : "movq %%mm6,"OC_I(2,_y)"\n\t" \
367 : "movq %%mm0,%%mm2\n\t" \
368 : "movq "OC_I(0,_x)",%%mm6\n\t" \
369 : "pmulhw %%mm4,%%mm0\n\t" \
370 : "paddw %%mm3,%%mm5\n\t" \
371 : "paddw %%mm0,%%mm2\n\t" \
372 : "psubw %%mm1,%%mm5\n\t" \
373 : "pmulhw %%mm4,%%mm6\n\t" \
374 : "paddw "OC_I(0,_x)",%%mm6\n\t" \
375 : "paddw %%mm1,%%mm1\n\t" \
376 : "movq %%mm6,%%mm4\n\t" \
377 : "paddw %%mm5,%%mm1\n\t" \
378 : "psubw %%mm2,%%mm6\n\t" \
379 : "paddw %%mm2,%%mm2\n\t" \
380 : "movq "OC_I(1,_y)",%%mm0\n\t" \
381 : "paddw %%mm6,%%mm2\n\t" \
382 : "psubw %%mm1,%%mm2\n\t" \
383 : "nop\n\t" \
384 : "#end OC_IDCT_BEGIN_10\n\t" \
385 :
386 : /*25+8=33 cycles.
  Row pass built on OC_IDCT_BEGIN_10: the finishing butterflies are the same
   instruction sequence as in OC_ROW_IDCT (no rounding, no shift).
  On exit r0 and r2...r7 hold R0 and R2...R7, and R1 has been stored at I(1)
   of _y, which is the layout OC_TRANSPOSE expects on entry.*/
387 : #define OC_ROW_IDCT_10(_y,_x) \
388 : "#OC_ROW_IDCT_10\n\t" \
389 : OC_IDCT_BEGIN_10(_y,_x) \
390 : /*r3=D'*/ \
391 : "movq "OC_I(2,_y)",%%mm3\n\t" \
392 : /*r4=E'=E-G*/ \
393 : "psubw %%mm7,%%mm4\n\t" \
394 : /*r1=H'+H'*/ \
395 : "paddw %%mm1,%%mm1\n\t" \
396 : /*r7=G+G*/ \
397 : "paddw %%mm7,%%mm7\n\t" \
398 : /*r1=R1=A''+H'*/ \
399 : "paddw %%mm2,%%mm1\n\t" \
400 : /*r7=G'=E+G*/ \
401 : "paddw %%mm4,%%mm7\n\t" \
402 : /*r4=R4=E'-D'*/ \
403 : "psubw %%mm3,%%mm4\n\t" \
404 : "paddw %%mm3,%%mm3\n\t" \
405 : /*r6=R6=F'-B''*/ \
406 : "psubw %%mm5,%%mm6\n\t" \
407 : "paddw %%mm5,%%mm5\n\t" \
408 : /*r3=R3=E'+D'*/ \
409 : "paddw %%mm4,%%mm3\n\t" \
410 : /*r5=R5=F'+B''*/ \
411 : "paddw %%mm6,%%mm5\n\t" \
412 : /*r7=R7=G'-C'*/ \
413 : "psubw %%mm0,%%mm7\n\t" \
414 : "paddw %%mm0,%%mm0\n\t" \
415 : /*Save R1.*/ \
416 : "movq %%mm1,"OC_I(1,_y)"\n\t" \
417 : /*r0=R0=G'+C'*/ \
418 : "paddw %%mm7,%%mm0\n\t" \
419 : "#end OC_ROW_IDCT_10\n\t" \
420 :
421 : /*25+19=44 cycles.
  Column pass built on OC_IDCT_BEGIN_10, done in place on _y; only I(0)...I(3)
   of _y are read as inputs.
  The finishing steps are the same instruction sequence as in OC_COLUMN_IDCT:
   the word at byte offset 0x00 of [c] is added to one member of each
   butterfly before its partner is derived from it, every result is shifted
   right arithmetically by 4, and NR0...NR3 are stored at I(0)...I(3) and
   NR4...NR7 at J(4)...J(7) of _y.
  NOTE(review): the value at offset 0x00 is presumably the rounding term for
   the >>4; the table is not visible here, so confirm.*/
422 : #define OC_COLUMN_IDCT_10(_y) \
423 : "#OC_COLUMN_IDCT_10\n\t" \
424 : OC_IDCT_BEGIN_10(_y,_y) \
425 : "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
426 : /*r1=H'+H'*/ \
427 : "paddw %%mm1,%%mm1\n\t" \
428 : /*r1=R1=A''+H'*/ \
429 : "paddw %%mm2,%%mm1\n\t" \
430 : /*r2=NR2*/ \
431 : "psraw $4,%%mm2\n\t" \
432 : /*r4=E'=E-G*/ \
433 : "psubw %%mm7,%%mm4\n\t" \
434 : /*r1=NR1*/ \
435 : "psraw $4,%%mm1\n\t" \
436 : /*r3=D'*/ \
437 : "movq "OC_I(2,_y)",%%mm3\n\t" \
438 : /*r7=G+G*/ \
439 : "paddw %%mm7,%%mm7\n\t" \
440 : /*Store NR2 at I(2).*/ \
441 : "movq %%mm2,"OC_I(2,_y)"\n\t" \
442 : /*r7=G'=E+G*/ \
443 : "paddw %%mm4,%%mm7\n\t" \
444 : /*Store NR1 at I(1).*/ \
445 : "movq %%mm1,"OC_I(1,_y)"\n\t" \
446 : /*r4=R4=E'-D'*/ \
447 : "psubw %%mm3,%%mm4\n\t" \
448 : "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
449 : /*r3=D'+D'*/ \
450 : "paddw %%mm3,%%mm3\n\t" \
451 : /*r3=R3=E'+D'*/ \
452 : "paddw %%mm4,%%mm3\n\t" \
453 : /*r4=NR4*/ \
454 : "psraw $4,%%mm4\n\t" \
455 : /*r6=R6=F'-B''*/ \
456 : "psubw %%mm5,%%mm6\n\t" \
457 : /*r3=NR3*/ \
458 : "psraw $4,%%mm3\n\t" \
459 : "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
460 : /*r5=B''+B''*/ \
461 : "paddw %%mm5,%%mm5\n\t" \
462 : /*r5=R5=F'+B''*/ \
463 : "paddw %%mm6,%%mm5\n\t" \
464 : /*r6=NR6*/ \
465 : "psraw $4,%%mm6\n\t" \
466 : /*Store NR4 at J(4).*/ \
467 : "movq %%mm4,"OC_J(4,_y)"\n\t" \
468 : /*r5=NR5*/ \
469 : "psraw $4,%%mm5\n\t" \
470 : /*Store NR3 at I(3).*/ \
471 : "movq %%mm3,"OC_I(3,_y)"\n\t" \
472 : /*r7=R7=G'-C'*/ \
473 : "psubw %%mm0,%%mm7\n\t" \
474 : "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
475 : /*r0=C'+C'*/ \
476 : "paddw %%mm0,%%mm0\n\t" \
477 : /*r0=R0=G'+C'*/ \
478 : "paddw %%mm7,%%mm0\n\t" \
479 : /*r7=NR7*/ \
480 : "psraw $4,%%mm7\n\t" \
481 : /*Store NR6 at J(6).*/ \
482 : "movq %%mm6,"OC_J(6,_y)"\n\t" \
483 : /*r0=NR0*/ \
484 : "psraw $4,%%mm0\n\t" \
485 : /*Store NR5 at J(5).*/ \
486 : "movq %%mm5,"OC_J(5,_y)"\n\t" \
487 : /*Store NR7 at J(7).*/ \
488 : "movq %%mm7,"OC_J(7,_y)"\n\t" \
489 : /*Store NR0 at I(0).*/ \
490 : "movq %%mm0,"OC_I(0,_y)"\n\t" \
491 : "#end OC_COLUMN_IDCT_10\n\t" \
492 :
/*Reduced 8x8 inverse DCT: one row pass plus transpose, then two column
   passes, all inside a single asm statement.
  Only the first four words of input rows 0...3 are read (16-byte rows);
   every other entry of _x is ignored, i.e., treated as zero.
  NOTE(review): callers must therefore guarantee that the remaining
   coefficients are zero; oc_idct8x8_mmx selects this routine when
   _last_zzi<=10.
  _y: Receives the 64 output values.
  _x: The input coefficients.
      If _x and _y are different arrays, the 16 words that were read are set
       to zero before returning.
  NOTE(review): no MMX registers are listed as clobbered and no emms is
   executed here; presumably the caller is responsible for the FPU/MMX
   state.*/
493 0 : static void oc_idct8x8_10_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
494 0 : __asm__ __volatile__(
/*Row pass: I(k) is the first 8 bytes of row k, J(k) is the second 8 bytes of
   row k-4, so the transpose fills both halves of output rows 0...3.*/
495 : #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
496 : #define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y)
497 : /*Done with dequant, descramble, and partial transpose.
498 : Now do the iDCT itself.*/
499 : OC_ROW_IDCT_10(y,x)
500 : OC_TRANSPOSE(y)
501 : #undef OC_I
502 : #undef OC_J
/*Column pass over the left four columns: I(k) and J(k) both name the first 8
   bytes of row k.*/
503 : #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
504 : #define OC_J(_k,_y) OC_I(_k,_y)
505 : OC_COLUMN_IDCT_10(y)
506 : #undef OC_I
507 : #undef OC_J
/*Column pass over the right four columns: as above, 8 bytes further on.*/
508 : #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y)
509 : #define OC_J(_k,_y) OC_I(_k,_y)
510 : OC_COLUMN_IDCT_10(y)
511 : #undef OC_I
512 : #undef OC_J
513 0 : :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
514 0 : :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
515 0 : [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
516 : );
/*Clear the input words that were read unless the transform was done in
   place: one quadword at the start of each of rows 0...3.
  The operand spans 28 words, which reaches the end of the last store (byte
   offset 0x30 plus 8 bytes).*/
517 0 : if(_x!=_y){
518 0 : __asm__ __volatile__(
519 : "pxor %%mm0,%%mm0\n\t"
520 : "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
521 : "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
522 : "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
523 : "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
524 0 : :[x]"+m"OC_ARRAY_OPERAND(ogg_int16_t,_x,28)
525 : );
526 : }
527 0 : }
528 :
529 : /*Performs an inverse 8x8 Type-II DCT transform.
530 : The input is assumed to be scaled by a factor of 4 relative to the
531 : orthonormal version of the transform.
  _y:        Receives the 64 output values.
  _x:        The 64 input coefficients.
             Both implementations clear the entries of _x that they read when
              _x and _y are different arrays.
  _last_zzi: Selects the implementation: the reduced transform
              (oc_idct8x8_10_mmx) is used when this is at most 10, and the
              full one (oc_idct8x8_slow_mmx) otherwise.
             See the comment in the body for what this value means.*/
532 0 : void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
533 : /*_last_zzi is subtly different from an actual count of the number of
534 : coefficients we decoded for this block.
535 : It contains the value of zzi BEFORE the final token in the block was
536 : decoded.
537 : In most cases this is an EOB token (the continuation of an EOB run from a
538 : previous block counts), and so this is the same as the coefficient count.
539 : However, in the case that the last token was NOT an EOB token, but filled
540 : the block up with exactly 64 coefficients, _last_zzi will be less than 64.
541 : Provided the last token was not a pure zero run, the minimum value it can
542 : be is 46, and so that doesn't affect any of the cases in this routine.
543 : However, if the last token WAS a pure zero run of length 63, then _last_zzi
544 : will be 1 while the number of coefficients decoded is 64.
545 : Thus, we will trigger the following special case, where the real
546 : coefficient count would not.
547 : Note also that a zero run of length 64 will give _last_zzi a value of 0,
548 : but we still process the DC coefficient, which might have a non-zero value
549 : due to DC prediction.
550 : Although convoluted, this is arguably the correct behavior: it allows us to
551 : use a smaller transform when the block ends with a long zero run instead
552 : of a normal EOB token.
553 : It could be smarter... multiple separate zero runs at the end of a block
554 : will fool it, but an encoder that generates these really deserves what it
555 : gets.
556 : Needless to say we inherited this approach from VP3.*/
557 : /*Perform the iDCT, using the reduced transform when _last_zzi allows it.*/
558 0 : if(_last_zzi<=10)oc_idct8x8_10_mmx(_y,_x);
559 0 : else oc_idct8x8_slow_mmx(_y,_x);
560 0 : }
561 :
562 : #endif
|