Line data Source code
1 : /********************************************************************
2 : * *
3 : * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
4 : * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
5 : * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6 : * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
7 : * *
8 : * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
9 : * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
10 : * *
11 : ********************************************************************
12 :
13 : function:
14 : last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $
15 :
16 : ********************************************************************/
17 :
18 : /*SSE2 acceleration of Theora's iDCT.*/
19 : #include "x86int.h"
20 : #include "sse2trans.h"
21 : #include "../dct.h"
22 :
23 : #if defined(OC_X86_ASM)
24 :
25 : /*Constant table for the iDCT routines below: eight 16-bit copies each of the rounding bias 8 and of OC_C1S7 through OC_C7S1, so that every 16-byte row (at offsets 0x00, 0x10, ..., 0x70) can be loaded as one XMM vector, or its first 8 bytes as one MMX vector.*/
26 : const unsigned short __attribute__((aligned(16),used)) OC_IDCT_CONSTS[64]={
27 : 8, 8, 8, 8, 8, 8, 8, 8,
28 : OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,
29 : OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,
30 : OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,
31 : OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,
32 : OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,
33 : OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,
34 : OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1
35 : };
36 :
37 :
38 : /*Performs the first three stages of the iDCT.
39 : On input, xmm2, xmm6, xmm3, and xmm5 must contain rows 2, 6, 3, and 5 of the
40 : input, respectively (they are consumed in that order).
41 : The remaining rows (0, 1, 4, and 7) are loaded from _x at their corresponding locations, the constants from operand c, and 32 bytes of operand buf serve as scratch.
42 : On output, xmm7 down to xmm0 contain the intermediate terms t[0] through
43 : t[7] (xmm7=t[0], ..., xmm0=t[7]) that the final stage consumes.*/
44 : #define OC_IDCT_8x8_ABC(_x) \
45 : "#OC_IDCT_8x8_ABC\n\t" \
46 : /*Stage 1:*/ \
47 : /*2-3 rotation by 6pi/16. \
48 : xmm4=xmm7=C6, xmm0=xmm1=C2, xmm2=X2, xmm6=X6.*/ \
49 : "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm1\n\t" \
50 : "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm4\n\t" \
51 : "movdqa %%xmm1,%%xmm0\n\t" \
52 : "pmulhw %%xmm2,%%xmm1\n\t" \
53 : "movdqa %%xmm4,%%xmm7\n\t" \
54 : "pmulhw %%xmm6,%%xmm0\n\t" \
55 : "pmulhw %%xmm2,%%xmm7\n\t" \
56 : "pmulhw %%xmm6,%%xmm4\n\t" \
57 : "paddw %%xmm6,%%xmm0\n\t" \
58 : "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm6\n\t" \
59 : "paddw %%xmm1,%%xmm2\n\t" \
60 : "psubw %%xmm0,%%xmm7\n\t" \
61 : "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
62 : "paddw %%xmm4,%%xmm2\n\t" \
63 : "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm4\n\t" \
64 : "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
65 : /*5-6 rotation by 3pi/16. \
66 : xmm4=xmm2=C5, xmm1=xmm6=C3, xmm3=X3, xmm5=X5.*/ \
67 : "movdqa %%xmm4,%%xmm2\n\t" \
68 : "movdqa %%xmm6,%%xmm1\n\t" \
69 : "pmulhw %%xmm3,%%xmm4\n\t" \
70 : "pmulhw %%xmm5,%%xmm1\n\t" \
71 : "pmulhw %%xmm3,%%xmm6\n\t" \
72 : "pmulhw %%xmm5,%%xmm2\n\t" \
73 : "paddw %%xmm3,%%xmm4\n\t" \
74 : "paddw %%xmm5,%%xmm3\n\t" \
75 : "paddw %%xmm6,%%xmm3\n\t" \
76 : "movdqa "OC_MEM_OFFS(0x70,_x)",%%xmm6\n\t" \
77 : "paddw %%xmm5,%%xmm1\n\t" \
78 : "movdqa "OC_MEM_OFFS(0x10,_x)",%%xmm5\n\t" \
79 : "paddw %%xmm3,%%xmm2\n\t" \
80 : "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
81 : "psubw %%xmm4,%%xmm1\n\t" \
82 : "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm4\n\t" \
83 : /*4-7 rotation by 7pi/16. \
84 : xmm4=xmm7=C1, xmm3=xmm0=C7, xmm5=X1, xmm6=X7.*/ \
85 : "movdqa %%xmm3,%%xmm0\n\t" \
86 : "movdqa %%xmm4,%%xmm7\n\t" \
87 : "pmulhw %%xmm5,%%xmm3\n\t" \
88 : "pmulhw %%xmm5,%%xmm7\n\t" \
89 : "pmulhw %%xmm6,%%xmm4\n\t" \
90 : "pmulhw %%xmm6,%%xmm0\n\t" \
91 : "paddw %%xmm6,%%xmm4\n\t" \
92 : "movdqa "OC_MEM_OFFS(0x40,_x)",%%xmm6\n\t" \
93 : "paddw %%xmm5,%%xmm7\n\t" \
94 : "psubw %%xmm4,%%xmm3\n\t" \
95 : "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
96 : "paddw %%xmm7,%%xmm0\n\t" \
97 : "movdqa "OC_MEM_OFFS(0x00,_x)",%%xmm7\n\t" \
98 : /*0-1 butterfly. \
99 : xmm4=xmm5=C4, xmm7=X0, xmm6=X4.*/ \
100 : "paddw %%xmm7,%%xmm6\n\t" \
101 : "movdqa %%xmm4,%%xmm5\n\t" \
102 : "pmulhw %%xmm6,%%xmm4\n\t" \
103 : "paddw %%xmm7,%%xmm7\n\t" \
104 : "psubw %%xmm6,%%xmm7\n\t" \
105 : "paddw %%xmm6,%%xmm4\n\t" \
106 : /*Stage 2:*/ \
107 : /*4-5 butterfly: xmm3=t[4], xmm1=t[5] \
108 : 7-6 butterfly: xmm2=t[6], xmm0=t[7]*/ \
109 : "movdqa %%xmm3,%%xmm6\n\t" \
110 : "paddw %%xmm1,%%xmm3\n\t" \
111 : "psubw %%xmm1,%%xmm6\n\t" \
112 : "movdqa %%xmm5,%%xmm1\n\t" \
113 : "pmulhw %%xmm7,%%xmm5\n\t" \
114 : "paddw %%xmm7,%%xmm5\n\t" \
115 : "movdqa %%xmm0,%%xmm7\n\t" \
116 : "paddw %%xmm2,%%xmm0\n\t" \
117 : "psubw %%xmm2,%%xmm7\n\t" \
118 : "movdqa %%xmm1,%%xmm2\n\t" \
119 : "pmulhw %%xmm6,%%xmm1\n\t" \
120 : "pmulhw %%xmm7,%%xmm2\n\t" \
121 : "paddw %%xmm6,%%xmm1\n\t" \
122 : "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
123 : "paddw %%xmm7,%%xmm2\n\t" \
124 : "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
125 : /*Stage 3: \
126 : 6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
127 : 0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
128 : 1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
129 : "paddw %%xmm2,%%xmm1\n\t" \
130 : "paddw %%xmm5,%%xmm6\n\t" \
131 : "paddw %%xmm4,%%xmm7\n\t" \
132 : "paddw %%xmm2,%%xmm2\n\t" \
133 : "paddw %%xmm4,%%xmm4\n\t" \
134 : "paddw %%xmm5,%%xmm5\n\t" \
135 : "psubw %%xmm1,%%xmm2\n\t" \
136 : "psubw %%xmm7,%%xmm4\n\t" \
137 : "psubw %%xmm6,%%xmm5\n\t" \
138 :
139 : /*Performs the last stage of the iDCT.
140 : On input, xmm7 down to xmm0 contain the intermediate terms t[0] through
141 : t[7] (xmm7=t[0], ..., xmm0=t[7]).
142 : On output, xmm0 through xmm7 contain the corresponding rows.*/
143 : #define OC_IDCT_8x8_D \
144 : "#OC_IDCT_8x8_D\n\t" \
145 : /*Stage 4: \
146 : 0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
147 : 1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
148 : 2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
149 : 3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
150 : "psubw %%xmm0,%%xmm7\n\t" \
151 : "psubw %%xmm1,%%xmm6\n\t" \
152 : "psubw %%xmm2,%%xmm5\n\t" \
153 : "psubw %%xmm3,%%xmm4\n\t" \
154 : "paddw %%xmm0,%%xmm0\n\t" \
155 : "paddw %%xmm1,%%xmm1\n\t" \
156 : "paddw %%xmm2,%%xmm2\n\t" \
157 : "paddw %%xmm3,%%xmm3\n\t" \
158 : "paddw %%xmm7,%%xmm0\n\t" \
159 : "paddw %%xmm6,%%xmm1\n\t" \
160 : "paddw %%xmm5,%%xmm2\n\t" \
161 : "paddw %%xmm4,%%xmm3\n\t" \
162 :
163 : /*Performs the last stage of the iDCT, then rounds and stores the result.
164 : On input, xmm7 down to xmm0 contain the intermediate terms t[0] through
165 : t[7] (xmm7=t[0], ..., xmm0=t[7]).
166 : Each output row has the bias of 8 from the first row of c added and is shifted right by 4 before being stored to its location in y; row 4 of y doubles as scratch space along the way.*/
167 : #define OC_IDCT_8x8_D_STORE \
168 : "#OC_IDCT_8x8_D_STORE\n\t" \
169 : /*Stage 4: \
170 : 0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
171 : 1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
172 : 2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
173 : 3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
174 : "psubw %%xmm3,%%xmm4\n\t" \
175 : "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
176 : "movdqa "OC_MEM_OFFS(0x00,c)",%%xmm4\n\t" \
177 : "psubw %%xmm0,%%xmm7\n\t" \
178 : "psubw %%xmm1,%%xmm6\n\t" \
179 : "psubw %%xmm2,%%xmm5\n\t" \
180 : "paddw %%xmm4,%%xmm7\n\t" \
181 : "paddw %%xmm4,%%xmm6\n\t" \
182 : "paddw %%xmm4,%%xmm5\n\t" \
183 : "paddw "OC_MEM_OFFS(0x40,y)",%%xmm4\n\t" \
184 : "paddw %%xmm0,%%xmm0\n\t" \
185 : "paddw %%xmm1,%%xmm1\n\t" \
186 : "paddw %%xmm2,%%xmm2\n\t" \
187 : "paddw %%xmm3,%%xmm3\n\t" \
188 : "paddw %%xmm7,%%xmm0\n\t" \
189 : "paddw %%xmm6,%%xmm1\n\t" \
190 : "psraw $4,%%xmm0\n\t" \
191 : "paddw %%xmm5,%%xmm2\n\t" \
192 : "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" \
193 : "psraw $4,%%xmm1\n\t" \
194 : "paddw %%xmm4,%%xmm3\n\t" \
195 : "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" \
196 : "psraw $4,%%xmm2\n\t" \
197 : "movdqa %%xmm2,"OC_MEM_OFFS(0x20,y)"\n\t" \
198 : "psraw $4,%%xmm3\n\t" \
199 : "movdqa %%xmm3,"OC_MEM_OFFS(0x30,y)"\n\t" \
200 : "psraw $4,%%xmm4\n\t" \
201 : "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
202 : "psraw $4,%%xmm5\n\t" \
203 : "movdqa %%xmm5,"OC_MEM_OFFS(0x50,y)"\n\t" \
204 : "psraw $4,%%xmm6\n\t" \
205 : "movdqa %%xmm6,"OC_MEM_OFFS(0x60,y)"\n\t" \
206 : "psraw $4,%%xmm7\n\t" \
207 : "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" \
208 :
209 0 : static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
210 : OC_ALIGN16(ogg_int16_t buf[16]);
211 : /*Full 8x8 iDCT: _x holds the pre-transposed coefficients and _y receives the result; two 1-D passes are run with a transpose in between, and _x is zeroed afterwards unless the transform was done in place (_x==_y). NOTE(review): none of the asm statements here declares xmm clobbers.*/
212 0 : __asm__ __volatile__(
213 : /*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/
214 : "movdqa "OC_MEM_OFFS(0x20,x)",%%xmm2\n\t"
215 : "movdqa "OC_MEM_OFFS(0x60,x)",%%xmm6\n\t"
216 : "movdqa "OC_MEM_OFFS(0x30,x)",%%xmm3\n\t"
217 : "movdqa "OC_MEM_OFFS(0x50,x)",%%xmm5\n\t"
218 : OC_IDCT_8x8_ABC(x)
219 : OC_IDCT_8x8_D
220 : OC_TRANSPOSE_8x8
221 : /*Spill rows 0, 1, 4, and 7 to _y, where the second pass loads them from; rows 2, 3, 5, and 6 stay in registers (this assumes OC_TRANSPOSE_8x8 leaves row i in xmm<i> - TODO confirm).*/
222 : "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t"
223 : "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t"
224 : "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t"
225 : "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t"
226 : OC_IDCT_8x8_ABC(y)
227 : OC_IDCT_8x8_D_STORE
228 0 : :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)),
229 0 : [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
230 0 : :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
231 0 : [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,64))
232 : );
233 0 : if(_x!=_y){
234 : int i;
235 : /*Clear input data for next block (decoder only). xmm0 is zeroed inside each asm statement, since register contents are not guaranteed to survive between separate asm statements.*/
236 0 : for(i=0;i<2;i++){
237 0 : __asm__ __volatile__(
238 : "pxor %%xmm0,%%xmm0\n\t"
239 : "movdqa %%xmm0,"OC_MEM_OFFS(0x00,x)"\n\t"
240 : "movdqa %%xmm0,"OC_MEM_OFFS(0x10,x)"\n\t"
241 : "movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t"
242 : "movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t"
243 0 : :[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32))
244 : );
245 : }
246 : }
247 0 : }
248 :
249 : /*For the first step of the 10-coefficient version of the 8x8 iDCT, we only
250 : need to work with four columns at a time, and rows 4 through 7 of the input are taken to be zero.
251 : Doing this in MMX is faster on processors with a 64-bit data path; mm0 through mm3 must hold rows 0 through 3 on input, all four stages are performed using the operands c and buf, and mm0 through mm7 hold the corresponding rows on output.*/
252 : #define OC_IDCT_8x8_10_MMX \
253 : "#OC_IDCT_8x8_10_MMX\n\t" \
254 : /*Stage 1:*/ \
255 : /*2-3 rotation by 6pi/16. \
256 : mm7=C6, mm6=C2, mm2=X2, X6=0.*/ \
257 : "movq "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
258 : "movq "OC_MEM_OFFS(0x20,c)",%%mm6\n\t" \
259 : "pmulhw %%mm2,%%mm6\n\t" \
260 : "pmulhw %%mm2,%%mm7\n\t" \
261 : "movq "OC_MEM_OFFS(0x50,c)",%%mm5\n\t" \
262 : "paddw %%mm6,%%mm2\n\t" \
263 : "movq %%mm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
264 : "movq "OC_MEM_OFFS(0x30,c)",%%mm2\n\t" \
265 : "movq %%mm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
266 : /*5-6 rotation by 3pi/16. \
267 : mm5=C5, mm2=C3, mm3=X3, X5=0.*/ \
268 : "pmulhw %%mm3,%%mm5\n\t" \
269 : "pmulhw %%mm3,%%mm2\n\t" \
270 : "movq "OC_MEM_OFFS(0x10,c)",%%mm7\n\t" \
271 : "paddw %%mm3,%%mm5\n\t" \
272 : "paddw %%mm3,%%mm2\n\t" \
273 : "movq "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
274 : /*4-7 rotation by 7pi/16. \
275 : mm7=C1, mm3=C7, mm1=X1, X7=0.*/ \
276 : "pmulhw %%mm1,%%mm3\n\t" \
277 : "pmulhw %%mm1,%%mm7\n\t" \
278 : "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
279 : "movq %%mm3,%%mm6\n\t" \
280 : "paddw %%mm1,%%mm7\n\t" \
281 : /*0-1 butterfly. \
282 : mm4=C4, mm0=X0, X4=0.*/ \
283 : /*Stage 2:*/ \
284 : /*4-5 butterfly: mm3=t[4], mm5=-t[5] (negated, hence the swapped psubw/paddw) \
285 : 7-6 butterfly: mm2=t[6], mm7=t[7]*/ \
286 : "psubw %%mm5,%%mm3\n\t" \
287 : "paddw %%mm5,%%mm6\n\t" \
288 : "movq %%mm4,%%mm1\n\t" \
289 : "pmulhw %%mm0,%%mm4\n\t" \
290 : "paddw %%mm0,%%mm4\n\t" \
291 : "movq %%mm7,%%mm0\n\t" \
292 : "movq %%mm4,%%mm5\n\t" \
293 : "paddw %%mm2,%%mm0\n\t" \
294 : "psubw %%mm2,%%mm7\n\t" \
295 : "movq %%mm1,%%mm2\n\t" \
296 : "pmulhw %%mm6,%%mm1\n\t" \
297 : "pmulhw %%mm7,%%mm2\n\t" \
298 : "paddw %%mm6,%%mm1\n\t" \
299 : "movq "OC_MEM_OFFS(0x00,buf)",%%mm6\n\t" \
300 : "paddw %%mm7,%%mm2\n\t" \
301 : "movq "OC_MEM_OFFS(0x10,buf)",%%mm7\n\t" \
302 : /*Stage 3: \
303 : 6-5 butterfly: mm1=t[5], mm2=t[6] -> mm1=t[6]+t[5], mm2=t[6]-t[5] \
304 : 0-3 butterfly: mm4=t[0], mm7=t[3] -> mm7=t[0]+t[3], mm4=t[0]-t[3] \
305 : 1-2 butterfly: mm5=t[1], mm6=t[2] -> mm6=t[1]+t[2], mm5=t[1]-t[2]*/ \
306 : "paddw %%mm2,%%mm1\n\t" \
307 : "paddw %%mm5,%%mm6\n\t" \
308 : "paddw %%mm4,%%mm7\n\t" \
309 : "paddw %%mm2,%%mm2\n\t" \
310 : "paddw %%mm4,%%mm4\n\t" \
311 : "paddw %%mm5,%%mm5\n\t" \
312 : "psubw %%mm1,%%mm2\n\t" \
313 : "psubw %%mm7,%%mm4\n\t" \
314 : "psubw %%mm6,%%mm5\n\t" \
315 : /*Stage 4: \
316 : 0-7 butterfly: mm7=t[0], mm0=t[7] -> mm0=t[0]+t[7], mm7=t[0]-t[7] \
317 : 1-6 butterfly: mm6=t[1], mm1=t[6] -> mm1=t[1]+t[6], mm6=t[1]-t[6] \
318 : 2-5 butterfly: mm5=t[2], mm2=t[5] -> mm2=t[2]+t[5], mm5=t[2]-t[5] \
319 : 3-4 butterfly: mm4=t[3], mm3=t[4] -> mm3=t[3]+t[4], mm4=t[3]-t[4]*/ \
320 : "psubw %%mm0,%%mm7\n\t" \
321 : "psubw %%mm1,%%mm6\n\t" \
322 : "psubw %%mm2,%%mm5\n\t" \
323 : "psubw %%mm3,%%mm4\n\t" \
324 : "paddw %%mm0,%%mm0\n\t" \
325 : "paddw %%mm1,%%mm1\n\t" \
326 : "paddw %%mm2,%%mm2\n\t" \
327 : "paddw %%mm3,%%mm3\n\t" \
328 : "paddw %%mm7,%%mm0\n\t" \
329 : "paddw %%mm6,%%mm1\n\t" \
330 : "paddw %%mm5,%%mm2\n\t" \
331 : "paddw %%mm4,%%mm3\n\t" \
332 :
333 : #define OC_IDCT_8x8_10_ABC \
334 : "#OC_IDCT_8x8_10_ABC\n\t" \
335 : /*First three stages of the iDCT for the 10-coefficient case: xmm0 through xmm3 hold input rows 0 through 3, rows 4 through 7 are taken to be zero, and xmm7 down to xmm0 end up holding t[0] through t[7]. Stage 1:*/ \
336 : /*2-3 rotation by 6pi/16. \
337 : xmm7=C6, xmm6=C2, xmm2=X2, X6=0.*/ \
338 : "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm7\n\t" \
339 : "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm6\n\t" \
340 : "pmulhw %%xmm2,%%xmm6\n\t" \
341 : "pmulhw %%xmm2,%%xmm7\n\t" \
342 : "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm5\n\t" \
343 : "paddw %%xmm6,%%xmm2\n\t" \
344 : "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
345 : "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm2\n\t" \
346 : "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
347 : /*5-6 rotation by 3pi/16. \
348 : xmm5=C5, xmm2=C3, xmm3=X3, X5=0.*/ \
349 : "pmulhw %%xmm3,%%xmm5\n\t" \
350 : "pmulhw %%xmm3,%%xmm2\n\t" \
351 : "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm7\n\t" \
352 : "paddw %%xmm3,%%xmm5\n\t" \
353 : "paddw %%xmm3,%%xmm2\n\t" \
354 : "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
355 : /*4-7 rotation by 7pi/16. \
356 : xmm7=C1, xmm3=C7, xmm1=X1, X7=0.*/ \
357 : "pmulhw %%xmm1,%%xmm3\n\t" \
358 : "pmulhw %%xmm1,%%xmm7\n\t" \
359 : "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
360 : "movdqa %%xmm3,%%xmm6\n\t" \
361 : "paddw %%xmm1,%%xmm7\n\t" \
362 : /*0-1 butterfly. \
363 : xmm4=C4, xmm0=X0, X4=0.*/ \
364 : /*Stage 2:*/ \
365 : /*4-5 butterfly: xmm3=t[4], xmm5=-t[5] (negated, hence the swapped psubw/paddw) \
366 : 7-6 butterfly: xmm2=t[6], xmm7=t[7]*/ \
367 : "psubw %%xmm5,%%xmm3\n\t" \
368 : "paddw %%xmm5,%%xmm6\n\t" \
369 : "movdqa %%xmm4,%%xmm1\n\t" \
370 : "pmulhw %%xmm0,%%xmm4\n\t" \
371 : "paddw %%xmm0,%%xmm4\n\t" \
372 : "movdqa %%xmm7,%%xmm0\n\t" \
373 : "movdqa %%xmm4,%%xmm5\n\t" \
374 : "paddw %%xmm2,%%xmm0\n\t" \
375 : "psubw %%xmm2,%%xmm7\n\t" \
376 : "movdqa %%xmm1,%%xmm2\n\t" \
377 : "pmulhw %%xmm6,%%xmm1\n\t" \
378 : "pmulhw %%xmm7,%%xmm2\n\t" \
379 : "paddw %%xmm6,%%xmm1\n\t" \
380 : "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
381 : "paddw %%xmm7,%%xmm2\n\t" \
382 : "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
383 : /*Stage 3: \
384 : 6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
385 : 0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
386 : 1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
387 : "paddw %%xmm2,%%xmm1\n\t" \
388 : "paddw %%xmm5,%%xmm6\n\t" \
389 : "paddw %%xmm4,%%xmm7\n\t" \
390 : "paddw %%xmm2,%%xmm2\n\t" \
391 : "paddw %%xmm4,%%xmm4\n\t" \
392 : "paddw %%xmm5,%%xmm5\n\t" \
393 : "psubw %%xmm1,%%xmm2\n\t" \
394 : "psubw %%xmm7,%%xmm4\n\t" \
395 : "psubw %%xmm6,%%xmm5\n\t" \
396 :
397 0 : static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
398 : OC_ALIGN16(ogg_int16_t buf[16]);
399 : /*Reduced iDCT that reads only the first four coefficients of rows 0 through 3 of the pre-transposed input _x and writes the full 8x8 result to _y; those coefficients are zeroed afterwards unless the transform was done in place (_x==_y). NOTE(review): MMX registers are used and no emms is issued here - presumably the caller takes care of that; confirm.*/
400 0 : __asm__ __volatile__(
401 : "movq "OC_MEM_OFFS(0x20,x)",%%mm2\n\t"
402 : "movq "OC_MEM_OFFS(0x30,x)",%%mm3\n\t"
403 : "movq "OC_MEM_OFFS(0x10,x)",%%mm1\n\t"
404 : "movq "OC_MEM_OFFS(0x00,x)",%%mm0\n\t"
405 : OC_IDCT_8x8_10_MMX
406 : OC_TRANSPOSE_8x4_MMX2SSE
407 : OC_IDCT_8x8_10_ABC
408 : OC_IDCT_8x8_D_STORE
409 0 : :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)),
410 0 : [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
411 0 : :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
412 0 : [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,64))
413 : );
414 0 : if(_x!=_y){
415 : /*Clear input data for next block (decoder only). Only the four coefficients read from each of rows 0 through 3 are zeroed, so the operand spans the 28 elements up to the last one written and is read-write ("+m") because the elements in between are left untouched.*/
416 0 : __asm__ __volatile__(
417 : "pxor %%mm0,%%mm0\n\t"
418 : "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
419 : "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
420 : "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
421 : "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
422 0 : :[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28))
423 : );
424 : }
425 0 : }
426 :
427 : /*Computes the inverse of an 8x8 Type-II DCT.
428 : The input is assumed to be scaled by a factor of 4 relative to the
429 : orthonormal version of the transform.*/
430 0 : void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
431 : /*Beware that _last_zzi is not quite a count of the coefficients that were
432 : decoded for this block.
433 : It holds the value zzi had BEFORE the block's final token was
434 : decoded.
435 : Usually that token is an EOB token (the continuation of an EOB run from a
436 : previous block counts too), and then the two values agree.
437 : They differ when the final token was NOT an EOB token, but instead filled
438 : the block up to exactly 64 coefficients: _last_zzi is then less than 64.
439 : As long as that token was not a pure zero run, _last_zzi is at least 46,
440 : which does not change the branch taken below.
441 : If that token WAS a pure zero run of length 63, however, _last_zzi is 1
442 : even though 64 coefficients were decoded.
443 : The 10-coefficient path below is then taken, which the real coefficient
444 : count would not have selected.
445 : Similarly, a zero run of length 64 leaves _last_zzi at 0, yet the DC
446 : coefficient is still processed, as DC prediction may have made it
447 : non-zero.
448 : Convoluted as it is, this is arguably the right behavior: a block ending
449 : in a long zero run rather than a normal EOB token still gets the smaller
450 : transform.
451 : It could be smarter: several separate zero runs at the end of a block
452 : will fool it, but an encoder that emits those really deserves what it
453 : gets.
454 : Needless to say, this approach was inherited from VP3.*/
455 : /*Dispatch to the matching iDCT.*/
456 0 : if(_last_zzi>10)oc_idct8x8_slow_sse2(_y,_x);
457 0 : else oc_idct8x8_10_sse2(_y,_x);
458 0 : }
459 :
460 : #endif
|