Line data Source code
1 : /*
2 : * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 : *
4 : * Use of this source code is governed by a BSD-style license
5 : * that can be found in the LICENSE file in the root of the source
6 : * tree. An additional intellectual property rights grant can be found
7 : * in the file PATENTS. All contributing project authors may
8 : * be found in the AUTHORS file in the root of the source tree.
9 : */
10 :
11 : #include <assert.h>
12 : #include <math.h>
13 :
14 : #include "./vp9_rtcd.h"
15 : #include "./vpx_config.h"
16 : #include "./vpx_dsp_rtcd.h"
17 :
18 : #include "vp9/common/vp9_blockd.h"
19 : #include "vp9/common/vp9_idct.h"
20 : #include "vpx_dsp/fwd_txfm.h"
21 : #include "vpx_ports/mem.h"
22 :
23 0 : static void fdct4(const tran_low_t *input, tran_low_t *output) {
24 : tran_high_t step[4];
25 : tran_high_t temp1, temp2;
26 :
27 0 : step[0] = input[0] + input[3];
28 0 : step[1] = input[1] + input[2];
29 0 : step[2] = input[1] - input[2];
30 0 : step[3] = input[0] - input[3];
31 :
32 0 : temp1 = (step[0] + step[1]) * cospi_16_64;
33 0 : temp2 = (step[0] - step[1]) * cospi_16_64;
34 0 : output[0] = (tran_low_t)fdct_round_shift(temp1);
35 0 : output[2] = (tran_low_t)fdct_round_shift(temp2);
36 0 : temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
37 0 : temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
38 0 : output[1] = (tran_low_t)fdct_round_shift(temp1);
39 0 : output[3] = (tran_low_t)fdct_round_shift(temp2);
40 0 : }
41 :
42 0 : static void fdct8(const tran_low_t *input, tran_low_t *output) {
43 : tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
44 : tran_high_t t0, t1, t2, t3; // needs32
45 : tran_high_t x0, x1, x2, x3; // canbe16
46 :
47 : // stage 1
48 0 : s0 = input[0] + input[7];
49 0 : s1 = input[1] + input[6];
50 0 : s2 = input[2] + input[5];
51 0 : s3 = input[3] + input[4];
52 0 : s4 = input[3] - input[4];
53 0 : s5 = input[2] - input[5];
54 0 : s6 = input[1] - input[6];
55 0 : s7 = input[0] - input[7];
56 :
57 : // fdct4(step, step);
58 0 : x0 = s0 + s3;
59 0 : x1 = s1 + s2;
60 0 : x2 = s1 - s2;
61 0 : x3 = s0 - s3;
62 0 : t0 = (x0 + x1) * cospi_16_64;
63 0 : t1 = (x0 - x1) * cospi_16_64;
64 0 : t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
65 0 : t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
66 0 : output[0] = (tran_low_t)fdct_round_shift(t0);
67 0 : output[2] = (tran_low_t)fdct_round_shift(t2);
68 0 : output[4] = (tran_low_t)fdct_round_shift(t1);
69 0 : output[6] = (tran_low_t)fdct_round_shift(t3);
70 :
71 : // Stage 2
72 0 : t0 = (s6 - s5) * cospi_16_64;
73 0 : t1 = (s6 + s5) * cospi_16_64;
74 0 : t2 = (tran_low_t)fdct_round_shift(t0);
75 0 : t3 = (tran_low_t)fdct_round_shift(t1);
76 :
77 : // Stage 3
78 0 : x0 = s4 + t2;
79 0 : x1 = s4 - t2;
80 0 : x2 = s7 - t3;
81 0 : x3 = s7 + t3;
82 :
83 : // Stage 4
84 0 : t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
85 0 : t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
86 0 : t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
87 0 : t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
88 0 : output[1] = (tran_low_t)fdct_round_shift(t0);
89 0 : output[3] = (tran_low_t)fdct_round_shift(t2);
90 0 : output[5] = (tran_low_t)fdct_round_shift(t1);
91 0 : output[7] = (tran_low_t)fdct_round_shift(t3);
92 0 : }
93 :
94 0 : static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
95 : tran_high_t step1[8]; // canbe16
96 : tran_high_t step2[8]; // canbe16
97 : tran_high_t step3[8]; // canbe16
98 : tran_high_t input[8]; // canbe16
99 : tran_high_t temp1, temp2; // needs32
100 :
101 : // step 1
102 0 : input[0] = in[0] + in[15];
103 0 : input[1] = in[1] + in[14];
104 0 : input[2] = in[2] + in[13];
105 0 : input[3] = in[3] + in[12];
106 0 : input[4] = in[4] + in[11];
107 0 : input[5] = in[5] + in[10];
108 0 : input[6] = in[6] + in[9];
109 0 : input[7] = in[7] + in[8];
110 :
111 0 : step1[0] = in[7] - in[8];
112 0 : step1[1] = in[6] - in[9];
113 0 : step1[2] = in[5] - in[10];
114 0 : step1[3] = in[4] - in[11];
115 0 : step1[4] = in[3] - in[12];
116 0 : step1[5] = in[2] - in[13];
117 0 : step1[6] = in[1] - in[14];
118 0 : step1[7] = in[0] - in[15];
119 :
120 : // fdct8(step, step);
121 : {
122 : tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
123 : tran_high_t t0, t1, t2, t3; // needs32
124 : tran_high_t x0, x1, x2, x3; // canbe16
125 :
126 : // stage 1
127 0 : s0 = input[0] + input[7];
128 0 : s1 = input[1] + input[6];
129 0 : s2 = input[2] + input[5];
130 0 : s3 = input[3] + input[4];
131 0 : s4 = input[3] - input[4];
132 0 : s5 = input[2] - input[5];
133 0 : s6 = input[1] - input[6];
134 0 : s7 = input[0] - input[7];
135 :
136 : // fdct4(step, step);
137 0 : x0 = s0 + s3;
138 0 : x1 = s1 + s2;
139 0 : x2 = s1 - s2;
140 0 : x3 = s0 - s3;
141 0 : t0 = (x0 + x1) * cospi_16_64;
142 0 : t1 = (x0 - x1) * cospi_16_64;
143 0 : t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
144 0 : t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
145 0 : out[0] = (tran_low_t)fdct_round_shift(t0);
146 0 : out[4] = (tran_low_t)fdct_round_shift(t2);
147 0 : out[8] = (tran_low_t)fdct_round_shift(t1);
148 0 : out[12] = (tran_low_t)fdct_round_shift(t3);
149 :
150 : // Stage 2
151 0 : t0 = (s6 - s5) * cospi_16_64;
152 0 : t1 = (s6 + s5) * cospi_16_64;
153 0 : t2 = fdct_round_shift(t0);
154 0 : t3 = fdct_round_shift(t1);
155 :
156 : // Stage 3
157 0 : x0 = s4 + t2;
158 0 : x1 = s4 - t2;
159 0 : x2 = s7 - t3;
160 0 : x3 = s7 + t3;
161 :
162 : // Stage 4
163 0 : t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
164 0 : t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
165 0 : t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
166 0 : t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
167 0 : out[2] = (tran_low_t)fdct_round_shift(t0);
168 0 : out[6] = (tran_low_t)fdct_round_shift(t2);
169 0 : out[10] = (tran_low_t)fdct_round_shift(t1);
170 0 : out[14] = (tran_low_t)fdct_round_shift(t3);
171 : }
172 :
173 : // step 2
174 0 : temp1 = (step1[5] - step1[2]) * cospi_16_64;
175 0 : temp2 = (step1[4] - step1[3]) * cospi_16_64;
176 0 : step2[2] = fdct_round_shift(temp1);
177 0 : step2[3] = fdct_round_shift(temp2);
178 0 : temp1 = (step1[4] + step1[3]) * cospi_16_64;
179 0 : temp2 = (step1[5] + step1[2]) * cospi_16_64;
180 0 : step2[4] = fdct_round_shift(temp1);
181 0 : step2[5] = fdct_round_shift(temp2);
182 :
183 : // step 3
184 0 : step3[0] = step1[0] + step2[3];
185 0 : step3[1] = step1[1] + step2[2];
186 0 : step3[2] = step1[1] - step2[2];
187 0 : step3[3] = step1[0] - step2[3];
188 0 : step3[4] = step1[7] - step2[4];
189 0 : step3[5] = step1[6] - step2[5];
190 0 : step3[6] = step1[6] + step2[5];
191 0 : step3[7] = step1[7] + step2[4];
192 :
193 : // step 4
194 0 : temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
195 0 : temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
196 0 : step2[1] = fdct_round_shift(temp1);
197 0 : step2[2] = fdct_round_shift(temp2);
198 0 : temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
199 0 : temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
200 0 : step2[5] = fdct_round_shift(temp1);
201 0 : step2[6] = fdct_round_shift(temp2);
202 :
203 : // step 5
204 0 : step1[0] = step3[0] + step2[1];
205 0 : step1[1] = step3[0] - step2[1];
206 0 : step1[2] = step3[3] + step2[2];
207 0 : step1[3] = step3[3] - step2[2];
208 0 : step1[4] = step3[4] - step2[5];
209 0 : step1[5] = step3[4] + step2[5];
210 0 : step1[6] = step3[7] - step2[6];
211 0 : step1[7] = step3[7] + step2[6];
212 :
213 : // step 6
214 0 : temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
215 0 : temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
216 0 : out[1] = (tran_low_t)fdct_round_shift(temp1);
217 0 : out[9] = (tran_low_t)fdct_round_shift(temp2);
218 :
219 0 : temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
220 0 : temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
221 0 : out[5] = (tran_low_t)fdct_round_shift(temp1);
222 0 : out[13] = (tran_low_t)fdct_round_shift(temp2);
223 :
224 0 : temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
225 0 : temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
226 0 : out[3] = (tran_low_t)fdct_round_shift(temp1);
227 0 : out[11] = (tran_low_t)fdct_round_shift(temp2);
228 :
229 0 : temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
230 0 : temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
231 0 : out[7] = (tran_low_t)fdct_round_shift(temp1);
232 0 : out[15] = (tran_low_t)fdct_round_shift(temp2);
233 0 : }
234 :
235 0 : static void fadst4(const tran_low_t *input, tran_low_t *output) {
236 : tran_high_t x0, x1, x2, x3;
237 : tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
238 :
239 0 : x0 = input[0];
240 0 : x1 = input[1];
241 0 : x2 = input[2];
242 0 : x3 = input[3];
243 :
244 0 : if (!(x0 | x1 | x2 | x3)) {
245 0 : output[0] = output[1] = output[2] = output[3] = 0;
246 0 : return;
247 : }
248 :
249 0 : s0 = sinpi_1_9 * x0;
250 0 : s1 = sinpi_4_9 * x0;
251 0 : s2 = sinpi_2_9 * x1;
252 0 : s3 = sinpi_1_9 * x1;
253 0 : s4 = sinpi_3_9 * x2;
254 0 : s5 = sinpi_4_9 * x3;
255 0 : s6 = sinpi_2_9 * x3;
256 0 : s7 = x0 + x1 - x3;
257 :
258 0 : x0 = s0 + s2 + s5;
259 0 : x1 = sinpi_3_9 * s7;
260 0 : x2 = s1 - s3 + s6;
261 0 : x3 = s4;
262 :
263 0 : s0 = x0 + x3;
264 0 : s1 = x1;
265 0 : s2 = x2 - x3;
266 0 : s3 = x2 - x0 + x3;
267 :
268 : // 1-D transform scaling factor is sqrt(2).
269 0 : output[0] = (tran_low_t)fdct_round_shift(s0);
270 0 : output[1] = (tran_low_t)fdct_round_shift(s1);
271 0 : output[2] = (tran_low_t)fdct_round_shift(s2);
272 0 : output[3] = (tran_low_t)fdct_round_shift(s3);
273 : }
274 :
275 0 : static void fadst8(const tran_low_t *input, tran_low_t *output) {
276 : tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
277 :
278 0 : tran_high_t x0 = input[7];
279 0 : tran_high_t x1 = input[0];
280 0 : tran_high_t x2 = input[5];
281 0 : tran_high_t x3 = input[2];
282 0 : tran_high_t x4 = input[3];
283 0 : tran_high_t x5 = input[4];
284 0 : tran_high_t x6 = input[1];
285 0 : tran_high_t x7 = input[6];
286 :
287 : // stage 1
288 0 : s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
289 0 : s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
290 0 : s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
291 0 : s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
292 0 : s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
293 0 : s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
294 0 : s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
295 0 : s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
296 :
297 0 : x0 = fdct_round_shift(s0 + s4);
298 0 : x1 = fdct_round_shift(s1 + s5);
299 0 : x2 = fdct_round_shift(s2 + s6);
300 0 : x3 = fdct_round_shift(s3 + s7);
301 0 : x4 = fdct_round_shift(s0 - s4);
302 0 : x5 = fdct_round_shift(s1 - s5);
303 0 : x6 = fdct_round_shift(s2 - s6);
304 0 : x7 = fdct_round_shift(s3 - s7);
305 :
306 : // stage 2
307 0 : s0 = x0;
308 0 : s1 = x1;
309 0 : s2 = x2;
310 0 : s3 = x3;
311 0 : s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
312 0 : s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
313 0 : s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
314 0 : s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
315 :
316 0 : x0 = s0 + s2;
317 0 : x1 = s1 + s3;
318 0 : x2 = s0 - s2;
319 0 : x3 = s1 - s3;
320 0 : x4 = fdct_round_shift(s4 + s6);
321 0 : x5 = fdct_round_shift(s5 + s7);
322 0 : x6 = fdct_round_shift(s4 - s6);
323 0 : x7 = fdct_round_shift(s5 - s7);
324 :
325 : // stage 3
326 0 : s2 = cospi_16_64 * (x2 + x3);
327 0 : s3 = cospi_16_64 * (x2 - x3);
328 0 : s6 = cospi_16_64 * (x6 + x7);
329 0 : s7 = cospi_16_64 * (x6 - x7);
330 :
331 0 : x2 = fdct_round_shift(s2);
332 0 : x3 = fdct_round_shift(s3);
333 0 : x6 = fdct_round_shift(s6);
334 0 : x7 = fdct_round_shift(s7);
335 :
336 0 : output[0] = (tran_low_t)x0;
337 0 : output[1] = (tran_low_t)-x4;
338 0 : output[2] = (tran_low_t)x6;
339 0 : output[3] = (tran_low_t)-x2;
340 0 : output[4] = (tran_low_t)x3;
341 0 : output[5] = (tran_low_t)-x7;
342 0 : output[6] = (tran_low_t)x5;
343 0 : output[7] = (tran_low_t)-x1;
344 0 : }
345 :
346 0 : static void fadst16(const tran_low_t *input, tran_low_t *output) {
347 : tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
348 : tran_high_t s9, s10, s11, s12, s13, s14, s15;
349 :
350 0 : tran_high_t x0 = input[15];
351 0 : tran_high_t x1 = input[0];
352 0 : tran_high_t x2 = input[13];
353 0 : tran_high_t x3 = input[2];
354 0 : tran_high_t x4 = input[11];
355 0 : tran_high_t x5 = input[4];
356 0 : tran_high_t x6 = input[9];
357 0 : tran_high_t x7 = input[6];
358 0 : tran_high_t x8 = input[7];
359 0 : tran_high_t x9 = input[8];
360 0 : tran_high_t x10 = input[5];
361 0 : tran_high_t x11 = input[10];
362 0 : tran_high_t x12 = input[3];
363 0 : tran_high_t x13 = input[12];
364 0 : tran_high_t x14 = input[1];
365 0 : tran_high_t x15 = input[14];
366 :
367 : // stage 1
368 0 : s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
369 0 : s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
370 0 : s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
371 0 : s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
372 0 : s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
373 0 : s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
374 0 : s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
375 0 : s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
376 0 : s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
377 0 : s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
378 0 : s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
379 0 : s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
380 0 : s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
381 0 : s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
382 0 : s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
383 0 : s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
384 :
385 0 : x0 = fdct_round_shift(s0 + s8);
386 0 : x1 = fdct_round_shift(s1 + s9);
387 0 : x2 = fdct_round_shift(s2 + s10);
388 0 : x3 = fdct_round_shift(s3 + s11);
389 0 : x4 = fdct_round_shift(s4 + s12);
390 0 : x5 = fdct_round_shift(s5 + s13);
391 0 : x6 = fdct_round_shift(s6 + s14);
392 0 : x7 = fdct_round_shift(s7 + s15);
393 0 : x8 = fdct_round_shift(s0 - s8);
394 0 : x9 = fdct_round_shift(s1 - s9);
395 0 : x10 = fdct_round_shift(s2 - s10);
396 0 : x11 = fdct_round_shift(s3 - s11);
397 0 : x12 = fdct_round_shift(s4 - s12);
398 0 : x13 = fdct_round_shift(s5 - s13);
399 0 : x14 = fdct_round_shift(s6 - s14);
400 0 : x15 = fdct_round_shift(s7 - s15);
401 :
402 : // stage 2
403 0 : s0 = x0;
404 0 : s1 = x1;
405 0 : s2 = x2;
406 0 : s3 = x3;
407 0 : s4 = x4;
408 0 : s5 = x5;
409 0 : s6 = x6;
410 0 : s7 = x7;
411 0 : s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
412 0 : s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
413 0 : s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
414 0 : s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
415 0 : s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
416 0 : s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
417 0 : s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
418 0 : s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
419 :
420 0 : x0 = s0 + s4;
421 0 : x1 = s1 + s5;
422 0 : x2 = s2 + s6;
423 0 : x3 = s3 + s7;
424 0 : x4 = s0 - s4;
425 0 : x5 = s1 - s5;
426 0 : x6 = s2 - s6;
427 0 : x7 = s3 - s7;
428 0 : x8 = fdct_round_shift(s8 + s12);
429 0 : x9 = fdct_round_shift(s9 + s13);
430 0 : x10 = fdct_round_shift(s10 + s14);
431 0 : x11 = fdct_round_shift(s11 + s15);
432 0 : x12 = fdct_round_shift(s8 - s12);
433 0 : x13 = fdct_round_shift(s9 - s13);
434 0 : x14 = fdct_round_shift(s10 - s14);
435 0 : x15 = fdct_round_shift(s11 - s15);
436 :
437 : // stage 3
438 0 : s0 = x0;
439 0 : s1 = x1;
440 0 : s2 = x2;
441 0 : s3 = x3;
442 0 : s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
443 0 : s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
444 0 : s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
445 0 : s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
446 0 : s8 = x8;
447 0 : s9 = x9;
448 0 : s10 = x10;
449 0 : s11 = x11;
450 0 : s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
451 0 : s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
452 0 : s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
453 0 : s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
454 :
455 0 : x0 = s0 + s2;
456 0 : x1 = s1 + s3;
457 0 : x2 = s0 - s2;
458 0 : x3 = s1 - s3;
459 0 : x4 = fdct_round_shift(s4 + s6);
460 0 : x5 = fdct_round_shift(s5 + s7);
461 0 : x6 = fdct_round_shift(s4 - s6);
462 0 : x7 = fdct_round_shift(s5 - s7);
463 0 : x8 = s8 + s10;
464 0 : x9 = s9 + s11;
465 0 : x10 = s8 - s10;
466 0 : x11 = s9 - s11;
467 0 : x12 = fdct_round_shift(s12 + s14);
468 0 : x13 = fdct_round_shift(s13 + s15);
469 0 : x14 = fdct_round_shift(s12 - s14);
470 0 : x15 = fdct_round_shift(s13 - s15);
471 :
472 : // stage 4
473 0 : s2 = (-cospi_16_64) * (x2 + x3);
474 0 : s3 = cospi_16_64 * (x2 - x3);
475 0 : s6 = cospi_16_64 * (x6 + x7);
476 0 : s7 = cospi_16_64 * (-x6 + x7);
477 0 : s10 = cospi_16_64 * (x10 + x11);
478 0 : s11 = cospi_16_64 * (-x10 + x11);
479 0 : s14 = (-cospi_16_64) * (x14 + x15);
480 0 : s15 = cospi_16_64 * (x14 - x15);
481 :
482 0 : x2 = fdct_round_shift(s2);
483 0 : x3 = fdct_round_shift(s3);
484 0 : x6 = fdct_round_shift(s6);
485 0 : x7 = fdct_round_shift(s7);
486 0 : x10 = fdct_round_shift(s10);
487 0 : x11 = fdct_round_shift(s11);
488 0 : x14 = fdct_round_shift(s14);
489 0 : x15 = fdct_round_shift(s15);
490 :
491 0 : output[0] = (tran_low_t)x0;
492 0 : output[1] = (tran_low_t)-x8;
493 0 : output[2] = (tran_low_t)x12;
494 0 : output[3] = (tran_low_t)-x4;
495 0 : output[4] = (tran_low_t)x6;
496 0 : output[5] = (tran_low_t)x14;
497 0 : output[6] = (tran_low_t)x10;
498 0 : output[7] = (tran_low_t)x2;
499 0 : output[8] = (tran_low_t)x3;
500 0 : output[9] = (tran_low_t)x11;
501 0 : output[10] = (tran_low_t)x15;
502 0 : output[11] = (tran_low_t)x7;
503 0 : output[12] = (tran_low_t)x5;
504 0 : output[13] = (tran_low_t)-x13;
505 0 : output[14] = (tran_low_t)x9;
506 0 : output[15] = (tran_low_t)-x1;
507 0 : }
508 :
509 : static const transform_2d FHT_4[] = {
510 : { fdct4, fdct4 }, // DCT_DCT = 0
511 : { fadst4, fdct4 }, // ADST_DCT = 1
512 : { fdct4, fadst4 }, // DCT_ADST = 2
513 : { fadst4, fadst4 } // ADST_ADST = 3
514 : };
515 :
516 : static const transform_2d FHT_8[] = {
517 : { fdct8, fdct8 }, // DCT_DCT = 0
518 : { fadst8, fdct8 }, // ADST_DCT = 1
519 : { fdct8, fadst8 }, // DCT_ADST = 2
520 : { fadst8, fadst8 } // ADST_ADST = 3
521 : };
522 :
523 : static const transform_2d FHT_16[] = {
524 : { fdct16, fdct16 }, // DCT_DCT = 0
525 : { fadst16, fdct16 }, // ADST_DCT = 1
526 : { fdct16, fadst16 }, // DCT_ADST = 2
527 : { fadst16, fadst16 } // ADST_ADST = 3
528 : };
529 :
530 0 : void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
531 : int tx_type) {
532 0 : if (tx_type == DCT_DCT) {
533 0 : vpx_fdct4x4_c(input, output, stride);
534 : } else {
535 : tran_low_t out[4 * 4];
536 : int i, j;
537 : tran_low_t temp_in[4], temp_out[4];
538 0 : const transform_2d ht = FHT_4[tx_type];
539 :
540 : // Columns
541 0 : for (i = 0; i < 4; ++i) {
542 0 : for (j = 0; j < 4; ++j) temp_in[j] = input[j * stride + i] * 16;
543 0 : if (i == 0 && temp_in[0]) temp_in[0] += 1;
544 0 : ht.cols(temp_in, temp_out);
545 0 : for (j = 0; j < 4; ++j) out[j * 4 + i] = temp_out[j];
546 : }
547 :
548 : // Rows
549 0 : for (i = 0; i < 4; ++i) {
550 0 : for (j = 0; j < 4; ++j) temp_in[j] = out[j + i * 4];
551 0 : ht.rows(temp_in, temp_out);
552 0 : for (j = 0; j < 4; ++j) output[j + i * 4] = (temp_out[j] + 1) >> 2;
553 : }
554 : }
555 0 : }
556 :
557 0 : void vp9_fdct8x8_quant_c(const int16_t *input, int stride,
558 : tran_low_t *coeff_ptr, intptr_t n_coeffs,
559 : int skip_block, const int16_t *zbin_ptr,
560 : const int16_t *round_ptr, const int16_t *quant_ptr,
561 : const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
562 : tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
563 : uint16_t *eob_ptr, const int16_t *scan,
564 : const int16_t *iscan) {
565 0 : int eob = -1;
566 :
567 : int i, j;
568 : tran_low_t intermediate[64];
569 :
570 : // Transform columns
571 : {
572 0 : tran_low_t *output = intermediate;
573 : tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
574 : tran_high_t t0, t1, t2, t3; // needs32
575 : tran_high_t x0, x1, x2, x3; // canbe16
576 :
577 : int i;
578 0 : for (i = 0; i < 8; i++) {
579 : // stage 1
580 0 : s0 = (input[0 * stride] + input[7 * stride]) * 4;
581 0 : s1 = (input[1 * stride] + input[6 * stride]) * 4;
582 0 : s2 = (input[2 * stride] + input[5 * stride]) * 4;
583 0 : s3 = (input[3 * stride] + input[4 * stride]) * 4;
584 0 : s4 = (input[3 * stride] - input[4 * stride]) * 4;
585 0 : s5 = (input[2 * stride] - input[5 * stride]) * 4;
586 0 : s6 = (input[1 * stride] - input[6 * stride]) * 4;
587 0 : s7 = (input[0 * stride] - input[7 * stride]) * 4;
588 :
589 : // fdct4(step, step);
590 0 : x0 = s0 + s3;
591 0 : x1 = s1 + s2;
592 0 : x2 = s1 - s2;
593 0 : x3 = s0 - s3;
594 0 : t0 = (x0 + x1) * cospi_16_64;
595 0 : t1 = (x0 - x1) * cospi_16_64;
596 0 : t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
597 0 : t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
598 0 : output[0 * 8] = (tran_low_t)fdct_round_shift(t0);
599 0 : output[2 * 8] = (tran_low_t)fdct_round_shift(t2);
600 0 : output[4 * 8] = (tran_low_t)fdct_round_shift(t1);
601 0 : output[6 * 8] = (tran_low_t)fdct_round_shift(t3);
602 :
603 : // Stage 2
604 0 : t0 = (s6 - s5) * cospi_16_64;
605 0 : t1 = (s6 + s5) * cospi_16_64;
606 0 : t2 = fdct_round_shift(t0);
607 0 : t3 = fdct_round_shift(t1);
608 :
609 : // Stage 3
610 0 : x0 = s4 + t2;
611 0 : x1 = s4 - t2;
612 0 : x2 = s7 - t3;
613 0 : x3 = s7 + t3;
614 :
615 : // Stage 4
616 0 : t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
617 0 : t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
618 0 : t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
619 0 : t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
620 0 : output[1 * 8] = (tran_low_t)fdct_round_shift(t0);
621 0 : output[3 * 8] = (tran_low_t)fdct_round_shift(t2);
622 0 : output[5 * 8] = (tran_low_t)fdct_round_shift(t1);
623 0 : output[7 * 8] = (tran_low_t)fdct_round_shift(t3);
624 0 : input++;
625 0 : output++;
626 : }
627 : }
628 :
629 : // Rows
630 0 : for (i = 0; i < 8; ++i) {
631 0 : fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]);
632 0 : for (j = 0; j < 8; ++j) coeff_ptr[j + i * 8] /= 2;
633 : }
634 :
635 : // TODO(jingning) Decide the need of these arguments after the
636 : // quantization process is completed.
637 : (void)zbin_ptr;
638 : (void)quant_shift_ptr;
639 : (void)iscan;
640 :
641 0 : memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
642 0 : memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
643 :
644 0 : if (!skip_block) {
645 : // Quantization pass: All coefficients with index >= zero_flag are
646 : // skippable. Note: zero_flag can be zero.
647 0 : for (i = 0; i < n_coeffs; i++) {
648 0 : const int rc = scan[i];
649 0 : const int coeff = coeff_ptr[rc];
650 0 : const int coeff_sign = (coeff >> 31);
651 0 : const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
652 :
653 0 : int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
654 0 : tmp = (tmp * quant_ptr[rc != 0]) >> 16;
655 :
656 0 : qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
657 0 : dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
658 :
659 0 : if (tmp) eob = i;
660 : }
661 : }
662 0 : *eob_ptr = eob + 1;
663 0 : }
664 :
665 0 : void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
666 : int tx_type) {
667 0 : if (tx_type == DCT_DCT) {
668 0 : vpx_fdct8x8_c(input, output, stride);
669 : } else {
670 : tran_low_t out[64];
671 : int i, j;
672 : tran_low_t temp_in[8], temp_out[8];
673 0 : const transform_2d ht = FHT_8[tx_type];
674 :
675 : // Columns
676 0 : for (i = 0; i < 8; ++i) {
677 0 : for (j = 0; j < 8; ++j) temp_in[j] = input[j * stride + i] * 4;
678 0 : ht.cols(temp_in, temp_out);
679 0 : for (j = 0; j < 8; ++j) out[j * 8 + i] = temp_out[j];
680 : }
681 :
682 : // Rows
683 0 : for (i = 0; i < 8; ++i) {
684 0 : for (j = 0; j < 8; ++j) temp_in[j] = out[j + i * 8];
685 0 : ht.rows(temp_in, temp_out);
686 0 : for (j = 0; j < 8; ++j)
687 0 : output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
688 : }
689 : }
690 0 : }
691 :
692 : /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
693 : pixel. */
694 0 : void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
695 : int i;
696 : tran_high_t a1, b1, c1, d1, e1;
697 0 : const int16_t *ip_pass0 = input;
698 0 : const tran_low_t *ip = NULL;
699 0 : tran_low_t *op = output;
700 :
701 0 : for (i = 0; i < 4; i++) {
702 0 : a1 = ip_pass0[0 * stride];
703 0 : b1 = ip_pass0[1 * stride];
704 0 : c1 = ip_pass0[2 * stride];
705 0 : d1 = ip_pass0[3 * stride];
706 :
707 0 : a1 += b1;
708 0 : d1 = d1 - c1;
709 0 : e1 = (a1 - d1) >> 1;
710 0 : b1 = e1 - b1;
711 0 : c1 = e1 - c1;
712 0 : a1 -= c1;
713 0 : d1 += b1;
714 0 : op[0] = (tran_low_t)a1;
715 0 : op[4] = (tran_low_t)c1;
716 0 : op[8] = (tran_low_t)d1;
717 0 : op[12] = (tran_low_t)b1;
718 :
719 0 : ip_pass0++;
720 0 : op++;
721 : }
722 0 : ip = output;
723 0 : op = output;
724 :
725 0 : for (i = 0; i < 4; i++) {
726 0 : a1 = ip[0];
727 0 : b1 = ip[1];
728 0 : c1 = ip[2];
729 0 : d1 = ip[3];
730 :
731 0 : a1 += b1;
732 0 : d1 -= c1;
733 0 : e1 = (a1 - d1) >> 1;
734 0 : b1 = e1 - b1;
735 0 : c1 = e1 - c1;
736 0 : a1 -= c1;
737 0 : d1 += b1;
738 0 : op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR);
739 0 : op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR);
740 0 : op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR);
741 0 : op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR);
742 :
743 0 : ip += 4;
744 0 : op += 4;
745 : }
746 0 : }
747 :
748 0 : void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
749 : int tx_type) {
750 0 : if (tx_type == DCT_DCT) {
751 0 : vpx_fdct16x16_c(input, output, stride);
752 : } else {
753 : tran_low_t out[256];
754 : int i, j;
755 : tran_low_t temp_in[16], temp_out[16];
756 0 : const transform_2d ht = FHT_16[tx_type];
757 :
758 : // Columns
759 0 : for (i = 0; i < 16; ++i) {
760 0 : for (j = 0; j < 16; ++j) temp_in[j] = input[j * stride + i] * 4;
761 0 : ht.cols(temp_in, temp_out);
762 0 : for (j = 0; j < 16; ++j)
763 0 : out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
764 : }
765 :
766 : // Rows
767 0 : for (i = 0; i < 16; ++i) {
768 0 : for (j = 0; j < 16; ++j) temp_in[j] = out[j + i * 16];
769 0 : ht.rows(temp_in, temp_out);
770 0 : for (j = 0; j < 16; ++j) output[j + i * 16] = temp_out[j];
771 : }
772 : }
773 0 : }
774 :
#if CONFIG_VP9_HIGHBITDEPTH
// High-bitdepth entry points. Each one forwards its arguments unchanged to
// the matching function above; they are only compiled when
// CONFIG_VP9_HIGHBITDEPTH is enabled.

// Forwards to vp9_fht4x4_c().
void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
                         int tx_type) {
  vp9_fht4x4_c(input, output, stride, tx_type);
}

// Forwards to vp9_fht8x8_c().
void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride,
                         int tx_type) {
  vp9_fht8x8_c(input, output, stride, tx_type);
}

// Forwards to vp9_fwht4x4_c().
void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output,
                          int stride) {
  vp9_fwht4x4_c(input, output, stride);
}

// Forwards to vp9_fht16x16_c().
void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
                           int tx_type) {
  vp9_fht16x16_c(input, output, stride, tx_type);
}
#endif  // CONFIG_VP9_HIGHBITDEPTH
|