Line data Source code
1 : /********************************************************************
2 : * *
3 : * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
4 : * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
5 : * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6 : * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
7 : * *
8 : * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
9 : * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
10 : * *
11 : ********************************************************************
12 :
13 : function:
14 : last mod: $Id: mmxstate.c 17563 2010-10-25 17:40:54Z tterribe $
15 :
16 : ********************************************************************/
17 :
18 : /*MMX acceleration of complete fragment reconstruction algorithm.
19 : Originally written by Rudolf Marek.*/
20 : #include <string.h>
21 : #include "x86int.h"
22 : #include "mmxloop.h"
23 :
24 : #if defined(OC_X86_ASM)
25 :
26 0 : void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
27 : int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
28 : unsigned char *dst;
29 : ptrdiff_t frag_buf_off;
30 : int ystride;
31 : int refi;
32 : /*Apply the inverse transform.*/
33 : /*Special case only having a DC component.*/
34 0 : if(_last_zzi<2){
35 : /*Note that this value must be unsigned, to keep the __asm__ block from
36 : sign-extending it when it puts it in a register.*/
37 : ogg_uint16_t p;
38 : int i;
39 : /*We round this dequant product (and not any of the others) because there's
40 : no iDCT rounding.*/
41 0 : p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
42 : /*Fill _dct_coeffs with p.*/
43 0 : __asm__ __volatile__(
44 : /*mm0=0000 0000 0000 AAAA*/
45 : "movd %[p],%%mm0\n\t"
46 : /*mm0=0000 0000 AAAA AAAA*/
47 : "punpcklwd %%mm0,%%mm0\n\t"
48 : /*mm0=AAAA AAAA AAAA AAAA*/
49 : "punpckldq %%mm0,%%mm0\n\t"
50 : :
51 0 : :[p]"r"((unsigned)p)
52 : );
53 0 : for(i=0;i<4;i++){
54 0 : __asm__ __volatile__(
55 : "movq %%mm0,"OC_MEM_OFFS(0x00,y)"\n\t"
56 : "movq %%mm0,"OC_MEM_OFFS(0x08,y)"\n\t"
57 : "movq %%mm0,"OC_MEM_OFFS(0x10,y)"\n\t"
58 : "movq %%mm0,"OC_MEM_OFFS(0x18,y)"\n\t"
59 0 : :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_dct_coeffs+64+16*i,16)
60 : );
61 : }
62 : }
63 : else{
64 : /*Dequantize the DC coefficient.*/
65 0 : _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
66 0 : oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi);
67 : }
68 : /*Fill in the target buffer.*/
69 0 : frag_buf_off=_state->frag_buf_offs[_fragi];
70 0 : refi=_state->frags[_fragi].refi;
71 0 : ystride=_state->ref_ystride[_pli];
72 0 : dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
73 0 : if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
74 : else{
75 : const unsigned char *ref;
76 : int mvoffsets[2];
77 0 : ref=_state->ref_frame_data[refi]+frag_buf_off;
78 0 : if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
79 0 : _state->frag_mvs[_fragi])>1){
80 0 : oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
81 0 : _dct_coeffs+64);
82 : }
83 0 : else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
84 : }
85 0 : }
86 :
87 : /*We copy these entire functions to inline the actual MMX routines so that we
88 : use only a single indirect call.*/
89 :
/*Initializes the bounding values for the MMX loop filter.
  The low byte of _flimit is replicated into the first 8 entries of _bv; the
   remaining entries are left untouched.
  _bv:     The bounding values array to initialize.
  _flimit: The loop filter limit.*/
void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
  /*The number of leading entries of _bv that receive the limit.*/
  enum{OC_MMX_NLIMIT_BYTES=8};
  memset(_bv,_flimit,OC_MMX_NLIMIT_BYTES);
}
93 :
94 : /*Apply the loop filter to a given set of fragment rows in the given plane.
95 : The filter may be run on the bottom edge, affecting pixels in the next row of
96 : fragments, so this row also needs to be available.
97 : _bv: The bounding values array.
98 : _refi: The index of the frame buffer to filter.
99 : _pli: The color plane to filter.
100 : _fragy0: The Y coordinate of the first fragment row to filter.
101 : _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
102 0 : void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
103 : signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
104 : OC_ALIGN8(unsigned char ll[8]);
105 : const oc_fragment_plane *fplane;
106 : const oc_fragment *frags;
107 : const ptrdiff_t *frag_buf_offs;
108 : unsigned char *ref_frame_data;
109 : ptrdiff_t fragi_top;
110 : ptrdiff_t fragi_bot;
111 : ptrdiff_t fragi0;
112 : ptrdiff_t fragi0_end;
113 : int ystride;
114 : int nhfrags;
115 0 : memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll));
116 0 : fplane=_state->fplanes+_pli;
117 0 : nhfrags=fplane->nhfrags;
118 0 : fragi_top=fplane->froffset;
119 0 : fragi_bot=fragi_top+fplane->nfrags;
120 0 : fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
121 0 : fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
122 0 : ystride=_state->ref_ystride[_pli];
123 0 : frags=_state->frags;
124 0 : frag_buf_offs=_state->frag_buf_offs;
125 0 : ref_frame_data=_state->ref_frame_data[_refi];
126 : /*The following loops are constructed somewhat non-intuitively on purpose.
127 : The main idea is: if a block boundary has at least one coded fragment on
128 : it, the filter is applied to it.
129 : However, the order that the filters are applied in matters, and VP3 chose
130 : the somewhat strange ordering used below.*/
131 0 : while(fragi0<fragi0_end){
132 : ptrdiff_t fragi;
133 : ptrdiff_t fragi_end;
134 0 : fragi=fragi0;
135 0 : fragi_end=fragi+nhfrags;
136 0 : while(fragi<fragi_end){
137 0 : if(frags[fragi].coded){
138 : unsigned char *ref;
139 0 : ref=ref_frame_data+frag_buf_offs[fragi];
140 0 : if(fragi>fragi0){
141 0 : OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
142 : }
143 0 : if(fragi0>fragi_top){
144 0 : OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
145 : }
146 0 : if(fragi+1<fragi_end&&!frags[fragi+1].coded){
147 0 : OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref+8,ystride,ll);
148 : }
149 0 : if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
150 0 : OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref+(ystride<<3),ystride,ll);
151 : }
152 : }
153 0 : fragi++;
154 : }
155 0 : fragi0+=nhfrags;
156 : }
157 0 : }
158 :
/*Initializes the bounding values for the MMXEXT loop filter.
  The low byte of ~(_flimit<<1) is replicated into the first 8 entries of
   _bv; the remaining entries are left untouched.
  _bv:     The bounding values array to initialize.
  _flimit: The loop filter limit.*/
void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit){
  /*The number of leading entries of _bv that receive the fill value.*/
  enum{OC_MMXEXT_NLIMIT_BYTES=8};
  int fill;
  fill=~(_flimit<<1);
  memset(_bv,fill,OC_MMXEXT_NLIMIT_BYTES);
}
162 :
163 : /*Apply the loop filter to a given set of fragment rows in the given plane.
164 : The filter may be run on the bottom edge, affecting pixels in the next row of
165 : fragments, so this row also needs to be available.
166 : _bv: The bounding values array.
167 : _refi: The index of the frame buffer to filter.
168 : _pli: The color plane to filter.
169 : _fragy0: The Y coordinate of the first fragment row to filter.
170 : _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
171 0 : void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
172 : signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
173 : const oc_fragment_plane *fplane;
174 : const oc_fragment *frags;
175 : const ptrdiff_t *frag_buf_offs;
176 : unsigned char *ref_frame_data;
177 : ptrdiff_t fragi_top;
178 : ptrdiff_t fragi_bot;
179 : ptrdiff_t fragi0;
180 : ptrdiff_t fragi0_end;
181 : int ystride;
182 : int nhfrags;
183 0 : fplane=_state->fplanes+_pli;
184 0 : nhfrags=fplane->nhfrags;
185 0 : fragi_top=fplane->froffset;
186 0 : fragi_bot=fragi_top+fplane->nfrags;
187 0 : fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
188 0 : fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
189 0 : ystride=_state->ref_ystride[_pli];
190 0 : frags=_state->frags;
191 0 : frag_buf_offs=_state->frag_buf_offs;
192 0 : ref_frame_data=_state->ref_frame_data[_refi];
193 : /*The following loops are constructed somewhat non-intuitively on purpose.
194 : The main idea is: if a block boundary has at least one coded fragment on
195 : it, the filter is applied to it.
196 : However, the order that the filters are applied in matters, and VP3 chose
197 : the somewhat strange ordering used below.*/
198 0 : while(fragi0<fragi0_end){
199 : ptrdiff_t fragi;
200 : ptrdiff_t fragi_end;
201 0 : fragi=fragi0;
202 0 : fragi_end=fragi+nhfrags;
203 0 : while(fragi<fragi_end){
204 0 : if(frags[fragi].coded){
205 : unsigned char *ref;
206 0 : ref=ref_frame_data+frag_buf_offs[fragi];
207 0 : if(fragi>fragi0){
208 0 : OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
209 : }
210 0 : if(fragi0>fragi_top){
211 0 : OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
212 : }
213 0 : if(fragi+1<fragi_end&&!frags[fragi+1].coded){
214 0 : OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref+8,ystride,_bv);
215 : }
216 0 : if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
217 0 : OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref+(ystride<<3),ystride,_bv);
218 : }
219 : }
220 0 : fragi++;
221 : }
222 0 : fragi0+=nhfrags;
223 : }
224 0 : }
225 :
226 : #endif
|