Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <emmintrin.h> // SSE2
13 :
14 : #include "aom_dsp/fwd_txfm.h"
15 : #include "aom_dsp/txfm_common.h"
16 : #include "aom_dsp/x86/txfm_common_sse2.h"
17 :
18 : // Apply a 32-element forward DCT to 8 columns. This does not do any transposition
19 : // of its output - the caller is expected to do that.
20 : // The input buffers are the top and bottom halves of an 8x32 block.
21 0 : void fdct32_8col(__m128i *in0, __m128i *in1) {
22 : // Constants
23 : // When we use them, in one case, they are all the same. In all others
24 : // it's a pair of them that we need to repeat four times. This is done
25 : // by constructing the 32 bit constant corresponding to that pair.
26 0 : const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
27 0 : const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
28 0 : const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
29 0 : const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
30 0 : const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
31 0 : const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
32 0 : const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
33 0 : const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
34 0 : const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
35 0 : const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
36 0 : const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
37 0 : const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
38 0 : const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
39 0 : const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
40 0 : const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
41 0 : const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
42 0 : const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
43 0 : const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
44 0 : const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
45 0 : const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
46 0 : const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
47 0 : const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
48 0 : const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
49 0 : const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
50 0 : const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
51 0 : const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
52 0 : const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
53 0 : const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
54 0 : const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
55 0 : const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
56 0 : const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
57 0 : const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
58 0 : const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
59 0 : const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
60 0 : const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
61 0 : const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
62 :
63 : __m128i step1[32];
64 : __m128i step2[32];
65 : __m128i step3[32];
66 : __m128i out[32];
67 : // Stage 1
68 : {
69 0 : const __m128i *ina = in0;
70 0 : const __m128i *inb = in1 + 15;
71 0 : __m128i *step1a = &step1[0];
72 0 : __m128i *step1b = &step1[31];
73 0 : const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
74 0 : const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
75 0 : const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
76 0 : const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
77 0 : const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
78 0 : const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
79 0 : const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
80 0 : const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
81 0 : step1a[0] = _mm_add_epi16(ina0, inb0);
82 0 : step1a[1] = _mm_add_epi16(ina1, inb1);
83 0 : step1a[2] = _mm_add_epi16(ina2, inb2);
84 0 : step1a[3] = _mm_add_epi16(ina3, inb3);
85 0 : step1b[-3] = _mm_sub_epi16(ina3, inb3);
86 0 : step1b[-2] = _mm_sub_epi16(ina2, inb2);
87 0 : step1b[-1] = _mm_sub_epi16(ina1, inb1);
88 0 : step1b[-0] = _mm_sub_epi16(ina0, inb0);
89 : }
90 : {
91 0 : const __m128i *ina = in0 + 4;
92 0 : const __m128i *inb = in1 + 11;
93 0 : __m128i *step1a = &step1[4];
94 0 : __m128i *step1b = &step1[27];
95 0 : const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
96 0 : const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
97 0 : const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
98 0 : const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
99 0 : const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
100 0 : const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
101 0 : const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
102 0 : const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
103 0 : step1a[0] = _mm_add_epi16(ina0, inb0);
104 0 : step1a[1] = _mm_add_epi16(ina1, inb1);
105 0 : step1a[2] = _mm_add_epi16(ina2, inb2);
106 0 : step1a[3] = _mm_add_epi16(ina3, inb3);
107 0 : step1b[-3] = _mm_sub_epi16(ina3, inb3);
108 0 : step1b[-2] = _mm_sub_epi16(ina2, inb2);
109 0 : step1b[-1] = _mm_sub_epi16(ina1, inb1);
110 0 : step1b[-0] = _mm_sub_epi16(ina0, inb0);
111 : }
112 : {
113 0 : const __m128i *ina = in0 + 8;
114 0 : const __m128i *inb = in1 + 7;
115 0 : __m128i *step1a = &step1[8];
116 0 : __m128i *step1b = &step1[23];
117 0 : const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
118 0 : const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
119 0 : const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
120 0 : const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
121 0 : const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
122 0 : const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
123 0 : const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
124 0 : const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
125 0 : step1a[0] = _mm_add_epi16(ina0, inb0);
126 0 : step1a[1] = _mm_add_epi16(ina1, inb1);
127 0 : step1a[2] = _mm_add_epi16(ina2, inb2);
128 0 : step1a[3] = _mm_add_epi16(ina3, inb3);
129 0 : step1b[-3] = _mm_sub_epi16(ina3, inb3);
130 0 : step1b[-2] = _mm_sub_epi16(ina2, inb2);
131 0 : step1b[-1] = _mm_sub_epi16(ina1, inb1);
132 0 : step1b[-0] = _mm_sub_epi16(ina0, inb0);
133 : }
134 : {
135 0 : const __m128i *ina = in0 + 12;
136 0 : const __m128i *inb = in1 + 3;
137 0 : __m128i *step1a = &step1[12];
138 0 : __m128i *step1b = &step1[19];
139 0 : const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
140 0 : const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + 1));
141 0 : const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + 2));
142 0 : const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + 3));
143 0 : const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - 3));
144 0 : const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - 2));
145 0 : const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - 1));
146 0 : const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
147 0 : step1a[0] = _mm_add_epi16(ina0, inb0);
148 0 : step1a[1] = _mm_add_epi16(ina1, inb1);
149 0 : step1a[2] = _mm_add_epi16(ina2, inb2);
150 0 : step1a[3] = _mm_add_epi16(ina3, inb3);
151 0 : step1b[-3] = _mm_sub_epi16(ina3, inb3);
152 0 : step1b[-2] = _mm_sub_epi16(ina2, inb2);
153 0 : step1b[-1] = _mm_sub_epi16(ina1, inb1);
154 0 : step1b[-0] = _mm_sub_epi16(ina0, inb0);
155 : }
156 : // Stage 2
157 : {
158 0 : step2[0] = _mm_add_epi16(step1[0], step1[15]);
159 0 : step2[1] = _mm_add_epi16(step1[1], step1[14]);
160 0 : step2[2] = _mm_add_epi16(step1[2], step1[13]);
161 0 : step2[3] = _mm_add_epi16(step1[3], step1[12]);
162 0 : step2[4] = _mm_add_epi16(step1[4], step1[11]);
163 0 : step2[5] = _mm_add_epi16(step1[5], step1[10]);
164 0 : step2[6] = _mm_add_epi16(step1[6], step1[9]);
165 0 : step2[7] = _mm_add_epi16(step1[7], step1[8]);
166 0 : step2[8] = _mm_sub_epi16(step1[7], step1[8]);
167 0 : step2[9] = _mm_sub_epi16(step1[6], step1[9]);
168 0 : step2[10] = _mm_sub_epi16(step1[5], step1[10]);
169 0 : step2[11] = _mm_sub_epi16(step1[4], step1[11]);
170 0 : step2[12] = _mm_sub_epi16(step1[3], step1[12]);
171 0 : step2[13] = _mm_sub_epi16(step1[2], step1[13]);
172 0 : step2[14] = _mm_sub_epi16(step1[1], step1[14]);
173 0 : step2[15] = _mm_sub_epi16(step1[0], step1[15]);
174 : }
175 : {
176 0 : const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
177 0 : const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
178 0 : const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
179 0 : const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
180 0 : const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
181 0 : const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
182 0 : const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
183 0 : const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
184 0 : const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
185 0 : const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
186 0 : const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
187 0 : const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
188 0 : const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
189 0 : const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
190 0 : const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
191 0 : const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
192 0 : const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
193 0 : const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
194 0 : const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
195 0 : const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
196 0 : const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
197 0 : const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
198 0 : const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
199 0 : const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
200 : // dct_const_round_shift
201 0 : const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
202 0 : const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
203 0 : const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
204 0 : const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
205 0 : const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
206 0 : const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
207 0 : const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
208 0 : const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
209 0 : const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
210 0 : const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
211 0 : const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
212 0 : const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
213 0 : const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
214 0 : const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
215 0 : const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
216 0 : const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
217 0 : const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
218 0 : const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
219 0 : const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
220 0 : const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
221 0 : const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
222 0 : const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
223 0 : const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
224 0 : const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
225 0 : const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
226 0 : const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
227 0 : const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
228 0 : const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
229 0 : const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
230 0 : const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
231 0 : const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
232 0 : const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
233 : // Combine
234 0 : step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
235 0 : step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
236 0 : step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
237 0 : step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
238 0 : step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
239 0 : step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
240 0 : step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
241 0 : step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
242 : }
243 : // Stage 3
244 : {
245 0 : step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]);
246 0 : step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]);
247 0 : step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]);
248 0 : step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]);
249 0 : step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]);
250 0 : step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]);
251 0 : step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]);
252 0 : step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]);
253 : }
254 : {
255 0 : const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
256 0 : const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
257 0 : const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
258 0 : const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
259 0 : const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
260 0 : const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
261 0 : const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
262 0 : const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
263 0 : const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
264 0 : const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
265 0 : const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
266 0 : const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
267 : // dct_const_round_shift
268 0 : const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
269 0 : const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
270 0 : const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
271 0 : const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
272 0 : const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
273 0 : const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
274 0 : const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
275 0 : const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
276 0 : const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
277 0 : const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
278 0 : const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
279 0 : const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
280 0 : const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
281 0 : const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
282 0 : const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
283 0 : const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
284 : // Combine
285 0 : step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
286 0 : step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
287 0 : step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
288 0 : step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
289 : }
290 : {
291 0 : step3[16] = _mm_add_epi16(step2[23], step1[16]);
292 0 : step3[17] = _mm_add_epi16(step2[22], step1[17]);
293 0 : step3[18] = _mm_add_epi16(step2[21], step1[18]);
294 0 : step3[19] = _mm_add_epi16(step2[20], step1[19]);
295 0 : step3[20] = _mm_sub_epi16(step1[19], step2[20]);
296 0 : step3[21] = _mm_sub_epi16(step1[18], step2[21]);
297 0 : step3[22] = _mm_sub_epi16(step1[17], step2[22]);
298 0 : step3[23] = _mm_sub_epi16(step1[16], step2[23]);
299 0 : step3[24] = _mm_sub_epi16(step1[31], step2[24]);
300 0 : step3[25] = _mm_sub_epi16(step1[30], step2[25]);
301 0 : step3[26] = _mm_sub_epi16(step1[29], step2[26]);
302 0 : step3[27] = _mm_sub_epi16(step1[28], step2[27]);
303 0 : step3[28] = _mm_add_epi16(step2[27], step1[28]);
304 0 : step3[29] = _mm_add_epi16(step2[26], step1[29]);
305 0 : step3[30] = _mm_add_epi16(step2[25], step1[30]);
306 0 : step3[31] = _mm_add_epi16(step2[24], step1[31]);
307 : }
308 :
309 : // Stage 4
310 : {
311 0 : step1[0] = _mm_add_epi16(step3[3], step3[0]);
312 0 : step1[1] = _mm_add_epi16(step3[2], step3[1]);
313 0 : step1[2] = _mm_sub_epi16(step3[1], step3[2]);
314 0 : step1[3] = _mm_sub_epi16(step3[0], step3[3]);
315 0 : step1[8] = _mm_add_epi16(step3[11], step2[8]);
316 0 : step1[9] = _mm_add_epi16(step3[10], step2[9]);
317 0 : step1[10] = _mm_sub_epi16(step2[9], step3[10]);
318 0 : step1[11] = _mm_sub_epi16(step2[8], step3[11]);
319 0 : step1[12] = _mm_sub_epi16(step2[15], step3[12]);
320 0 : step1[13] = _mm_sub_epi16(step2[14], step3[13]);
321 0 : step1[14] = _mm_add_epi16(step3[13], step2[14]);
322 0 : step1[15] = _mm_add_epi16(step3[12], step2[15]);
323 : }
324 : {
325 0 : const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
326 0 : const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
327 0 : const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
328 0 : const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
329 0 : const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
330 0 : const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
331 : // dct_const_round_shift
332 0 : const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
333 0 : const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
334 0 : const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
335 0 : const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
336 0 : const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
337 0 : const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
338 0 : const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
339 0 : const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
340 : // Combine
341 0 : step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
342 0 : step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
343 : }
344 : {
345 0 : const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
346 0 : const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
347 0 : const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
348 0 : const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
349 0 : const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
350 0 : const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
351 0 : const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
352 0 : const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
353 0 : const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
354 0 : const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
355 0 : const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
356 0 : const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
357 0 : const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
358 0 : const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
359 0 : const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
360 0 : const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
361 0 : const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
362 0 : const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
363 0 : const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
364 0 : const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
365 0 : const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
366 0 : const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
367 0 : const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
368 0 : const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
369 : // dct_const_round_shift
370 0 : const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
371 0 : const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
372 0 : const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
373 0 : const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
374 0 : const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
375 0 : const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
376 0 : const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
377 0 : const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
378 0 : const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
379 0 : const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
380 0 : const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
381 0 : const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
382 0 : const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
383 0 : const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
384 0 : const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
385 0 : const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
386 0 : const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
387 0 : const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
388 0 : const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
389 0 : const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
390 0 : const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
391 0 : const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
392 0 : const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
393 0 : const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
394 0 : const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
395 0 : const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
396 0 : const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
397 0 : const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
398 0 : const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
399 0 : const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
400 0 : const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
401 0 : const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
402 : // Combine
403 0 : step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
404 0 : step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
405 0 : step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
406 0 : step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
407 0 : step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
408 0 : step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
409 0 : step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
410 0 : step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
411 : }
412 : // Stage 5
413 : {
414 0 : step2[4] = _mm_add_epi16(step1[5], step3[4]);
415 0 : step2[5] = _mm_sub_epi16(step3[4], step1[5]);
416 0 : step2[6] = _mm_sub_epi16(step3[7], step1[6]);
417 0 : step2[7] = _mm_add_epi16(step1[6], step3[7]);
418 : }
419 : {
420 0 : const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
421 0 : const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
422 0 : const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
423 0 : const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
424 0 : const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
425 0 : const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
426 0 : const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
427 0 : const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
428 0 : const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
429 0 : const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
430 0 : const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
431 0 : const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
432 : // dct_const_round_shift
433 0 : const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
434 0 : const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
435 0 : const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
436 0 : const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
437 0 : const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
438 0 : const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
439 0 : const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
440 0 : const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
441 0 : const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
442 0 : const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
443 0 : const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
444 0 : const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
445 0 : const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
446 0 : const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
447 0 : const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
448 0 : const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
449 : // Combine
450 0 : out[0] = _mm_packs_epi32(out_00_6, out_00_7);
451 0 : out[16] = _mm_packs_epi32(out_16_6, out_16_7);
452 0 : out[8] = _mm_packs_epi32(out_08_6, out_08_7);
453 0 : out[24] = _mm_packs_epi32(out_24_6, out_24_7);
454 : }
455 : {
456 0 : const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[9], step1[14]);
457 0 : const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[9], step1[14]);
458 0 : const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
459 0 : const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
460 0 : const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
461 0 : const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
462 0 : const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
463 0 : const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
464 0 : const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
465 0 : const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
466 0 : const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
467 0 : const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
468 : // dct_const_round_shift
469 0 : const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
470 0 : const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
471 0 : const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
472 0 : const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
473 0 : const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
474 0 : const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
475 0 : const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
476 0 : const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
477 0 : const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
478 0 : const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
479 0 : const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
480 0 : const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
481 0 : const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
482 0 : const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
483 0 : const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
484 0 : const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
485 : // Combine
486 0 : step2[9] = _mm_packs_epi32(s2_09_6, s2_09_7);
487 0 : step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
488 0 : step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
489 0 : step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
490 : }
491 : {
492 0 : step2[16] = _mm_add_epi16(step1[19], step3[16]);
493 0 : step2[17] = _mm_add_epi16(step1[18], step3[17]);
494 0 : step2[18] = _mm_sub_epi16(step3[17], step1[18]);
495 0 : step2[19] = _mm_sub_epi16(step3[16], step1[19]);
496 0 : step2[20] = _mm_sub_epi16(step3[23], step1[20]);
497 0 : step2[21] = _mm_sub_epi16(step3[22], step1[21]);
498 0 : step2[22] = _mm_add_epi16(step1[21], step3[22]);
499 0 : step2[23] = _mm_add_epi16(step1[20], step3[23]);
500 0 : step2[24] = _mm_add_epi16(step1[27], step3[24]);
501 0 : step2[25] = _mm_add_epi16(step1[26], step3[25]);
502 0 : step2[26] = _mm_sub_epi16(step3[25], step1[26]);
503 0 : step2[27] = _mm_sub_epi16(step3[24], step1[27]);
504 0 : step2[28] = _mm_sub_epi16(step3[31], step1[28]);
505 0 : step2[29] = _mm_sub_epi16(step3[30], step1[29]);
506 0 : step2[30] = _mm_add_epi16(step1[29], step3[30]);
507 0 : step2[31] = _mm_add_epi16(step1[28], step3[31]);
508 : }
509 : // Stage 6
510 : {
511 0 : const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
512 0 : const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
513 0 : const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
514 0 : const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
515 0 : const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
516 0 : const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
517 0 : const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
518 0 : const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
519 0 : const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
520 0 : const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
521 0 : const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
522 0 : const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
523 0 : const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
524 0 : const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
525 0 : const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
526 0 : const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
527 : // dct_const_round_shift
528 0 : const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
529 0 : const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
530 0 : const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
531 0 : const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
532 0 : const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
533 0 : const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
534 0 : const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
535 0 : const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
536 0 : const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
537 0 : const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
538 0 : const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
539 0 : const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
540 0 : const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
541 0 : const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
542 0 : const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
543 0 : const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
544 : // Combine
545 0 : out[4] = _mm_packs_epi32(out_04_6, out_04_7);
546 0 : out[20] = _mm_packs_epi32(out_20_6, out_20_7);
547 0 : out[12] = _mm_packs_epi32(out_12_6, out_12_7);
548 0 : out[28] = _mm_packs_epi32(out_28_6, out_28_7);
549 : }
550 : {
551 0 : step3[8] = _mm_add_epi16(step2[9], step1[8]);
552 0 : step3[9] = _mm_sub_epi16(step1[8], step2[9]);
553 0 : step3[10] = _mm_sub_epi16(step1[11], step2[10]);
554 0 : step3[11] = _mm_add_epi16(step2[10], step1[11]);
555 0 : step3[12] = _mm_add_epi16(step2[13], step1[12]);
556 0 : step3[13] = _mm_sub_epi16(step1[12], step2[13]);
557 0 : step3[14] = _mm_sub_epi16(step1[15], step2[14]);
558 0 : step3[15] = _mm_add_epi16(step2[14], step1[15]);
559 : }
560 : {
561 0 : const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
562 0 : const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
563 0 : const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
564 0 : const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
565 0 : const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
566 0 : const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
567 0 : const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
568 0 : const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
569 0 : const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
570 0 : const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
571 0 : const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
572 0 : const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
573 0 : const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
574 0 : const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
575 0 : const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
576 0 : const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
577 0 : const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
578 0 : const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
579 0 : const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
580 0 : const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
581 0 : const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
582 0 : const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
583 0 : const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
584 0 : const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
585 : // dct_const_round_shift
586 0 : const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
587 0 : const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
588 0 : const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
589 0 : const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
590 0 : const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
591 0 : const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
592 0 : const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
593 0 : const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
594 0 : const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
595 0 : const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
596 0 : const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
597 0 : const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
598 0 : const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
599 0 : const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
600 0 : const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
601 0 : const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
602 0 : const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
603 0 : const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
604 0 : const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
605 0 : const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
606 0 : const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
607 0 : const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
608 0 : const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
609 0 : const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
610 0 : const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
611 0 : const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
612 0 : const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
613 0 : const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
614 0 : const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
615 0 : const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
616 0 : const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
617 0 : const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
618 : // Combine
619 0 : step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
620 0 : step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
621 0 : step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
622 0 : step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
623 : // Combine
624 0 : step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
625 0 : step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
626 0 : step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
627 0 : step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
628 : }
629 : // Stage 7
630 : {
631 0 : const __m128i out_02_0 = _mm_unpacklo_epi16(step3[8], step3[15]);
632 0 : const __m128i out_02_1 = _mm_unpackhi_epi16(step3[8], step3[15]);
633 0 : const __m128i out_18_0 = _mm_unpacklo_epi16(step3[9], step3[14]);
634 0 : const __m128i out_18_1 = _mm_unpackhi_epi16(step3[9], step3[14]);
635 0 : const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
636 0 : const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
637 0 : const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
638 0 : const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
639 0 : const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
640 0 : const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
641 0 : const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
642 0 : const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
643 0 : const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
644 0 : const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
645 0 : const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
646 0 : const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
647 0 : const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
648 0 : const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
649 0 : const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
650 0 : const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
651 0 : const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
652 0 : const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
653 0 : const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
654 0 : const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
655 : // dct_const_round_shift
656 0 : const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
657 0 : const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
658 0 : const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
659 0 : const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
660 0 : const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
661 0 : const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
662 0 : const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
663 0 : const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
664 0 : const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
665 0 : const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
666 0 : const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
667 0 : const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
668 0 : const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
669 0 : const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
670 0 : const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
671 0 : const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
672 0 : const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
673 0 : const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
674 0 : const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
675 0 : const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
676 0 : const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
677 0 : const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
678 0 : const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
679 0 : const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
680 0 : const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
681 0 : const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
682 0 : const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
683 0 : const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
684 0 : const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
685 0 : const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
686 0 : const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
687 0 : const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
688 : // Combine
689 0 : out[2] = _mm_packs_epi32(out_02_6, out_02_7);
690 0 : out[18] = _mm_packs_epi32(out_18_6, out_18_7);
691 0 : out[10] = _mm_packs_epi32(out_10_6, out_10_7);
692 0 : out[26] = _mm_packs_epi32(out_26_6, out_26_7);
693 0 : out[6] = _mm_packs_epi32(out_06_6, out_06_7);
694 0 : out[22] = _mm_packs_epi32(out_22_6, out_22_7);
695 0 : out[14] = _mm_packs_epi32(out_14_6, out_14_7);
696 0 : out[30] = _mm_packs_epi32(out_30_6, out_30_7);
697 : }
698 : {
699 0 : step1[16] = _mm_add_epi16(step3[17], step2[16]);
700 0 : step1[17] = _mm_sub_epi16(step2[16], step3[17]);
701 0 : step1[18] = _mm_sub_epi16(step2[19], step3[18]);
702 0 : step1[19] = _mm_add_epi16(step3[18], step2[19]);
703 0 : step1[20] = _mm_add_epi16(step3[21], step2[20]);
704 0 : step1[21] = _mm_sub_epi16(step2[20], step3[21]);
705 0 : step1[22] = _mm_sub_epi16(step2[23], step3[22]);
706 0 : step1[23] = _mm_add_epi16(step3[22], step2[23]);
707 0 : step1[24] = _mm_add_epi16(step3[25], step2[24]);
708 0 : step1[25] = _mm_sub_epi16(step2[24], step3[25]);
709 0 : step1[26] = _mm_sub_epi16(step2[27], step3[26]);
710 0 : step1[27] = _mm_add_epi16(step3[26], step2[27]);
711 0 : step1[28] = _mm_add_epi16(step3[29], step2[28]);
712 0 : step1[29] = _mm_sub_epi16(step2[28], step3[29]);
713 0 : step1[30] = _mm_sub_epi16(step2[31], step3[30]);
714 0 : step1[31] = _mm_add_epi16(step3[30], step2[31]);
715 : }
716 : // Final stage --- output indices are bit-reversed.
717 : {
718 0 : const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
719 0 : const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
720 0 : const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
721 0 : const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
722 0 : const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
723 0 : const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
724 0 : const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
725 0 : const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
726 0 : const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
727 0 : const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
728 0 : const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
729 0 : const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
730 0 : const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
731 0 : const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
732 0 : const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
733 0 : const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
734 0 : const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
735 0 : const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
736 0 : const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
737 0 : const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
738 0 : const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
739 0 : const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
740 0 : const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
741 0 : const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
742 : // dct_const_round_shift
743 0 : const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
744 0 : const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
745 0 : const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
746 0 : const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
747 0 : const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
748 0 : const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
749 0 : const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
750 0 : const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
751 0 : const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
752 0 : const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
753 0 : const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
754 0 : const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
755 0 : const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
756 0 : const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
757 0 : const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
758 0 : const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
759 0 : const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
760 0 : const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
761 0 : const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
762 0 : const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
763 0 : const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
764 0 : const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
765 0 : const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
766 0 : const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
767 0 : const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
768 0 : const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
769 0 : const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
770 0 : const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
771 0 : const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
772 0 : const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
773 0 : const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
774 0 : const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
775 : // Combine
776 0 : out[1] = _mm_packs_epi32(out_01_6, out_01_7);
777 0 : out[17] = _mm_packs_epi32(out_17_6, out_17_7);
778 0 : out[9] = _mm_packs_epi32(out_09_6, out_09_7);
779 0 : out[25] = _mm_packs_epi32(out_25_6, out_25_7);
780 0 : out[7] = _mm_packs_epi32(out_07_6, out_07_7);
781 0 : out[23] = _mm_packs_epi32(out_23_6, out_23_7);
782 0 : out[15] = _mm_packs_epi32(out_15_6, out_15_7);
783 0 : out[31] = _mm_packs_epi32(out_31_6, out_31_7);
784 : }
785 : {
786 0 : const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
787 0 : const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
788 0 : const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
789 0 : const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
790 0 : const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
791 0 : const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
792 0 : const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
793 0 : const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
794 0 : const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
795 0 : const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
796 0 : const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
797 0 : const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
798 0 : const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
799 0 : const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
800 0 : const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
801 0 : const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
802 0 : const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
803 0 : const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
804 0 : const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
805 0 : const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
806 0 : const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
807 0 : const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
808 0 : const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
809 0 : const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
810 : // dct_const_round_shift
811 0 : const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
812 0 : const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
813 0 : const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
814 0 : const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
815 0 : const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
816 0 : const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
817 0 : const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
818 0 : const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
819 0 : const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
820 0 : const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
821 0 : const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
822 0 : const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
823 0 : const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
824 0 : const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
825 0 : const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
826 0 : const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
827 0 : const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
828 0 : const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
829 0 : const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
830 0 : const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
831 0 : const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
832 0 : const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
833 0 : const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
834 0 : const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
835 0 : const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
836 0 : const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
837 0 : const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
838 0 : const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
839 0 : const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
840 0 : const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
841 0 : const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
842 0 : const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
843 : // Combine
844 0 : out[5] = _mm_packs_epi32(out_05_6, out_05_7);
845 0 : out[21] = _mm_packs_epi32(out_21_6, out_21_7);
846 0 : out[13] = _mm_packs_epi32(out_13_6, out_13_7);
847 0 : out[29] = _mm_packs_epi32(out_29_6, out_29_7);
848 0 : out[3] = _mm_packs_epi32(out_03_6, out_03_7);
849 0 : out[19] = _mm_packs_epi32(out_19_6, out_19_7);
850 0 : out[11] = _mm_packs_epi32(out_11_6, out_11_7);
851 0 : out[27] = _mm_packs_epi32(out_27_6, out_27_7);
852 : }
853 :
854 : // Output results
855 : {
856 : int j;
857 0 : for (j = 0; j < 16; ++j) {
858 0 : _mm_storeu_si128((__m128i *)(in0 + j), out[j]);
859 0 : _mm_storeu_si128((__m128i *)(in1 + j), out[j + 16]);
860 : }
861 : }
862 0 : } // NOLINT
|