Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #ifndef AOM_DSP_X86_TXFM_COMMON_SSE2_H_
13 : #define AOM_DSP_X86_TXFM_COMMON_SSE2_H_
14 :
15 : #include <emmintrin.h>
16 : #include "aom/aom_integer.h"
17 : #include "aom_dsp/x86/synonyms.h"
18 :
// Builds an __m128i of eight 16-bit lanes holding the pair (a, b) repeated
// four times: a lands in the even lanes (lane 0 is the lowest), b in the odd
// lanes. Both arguments are converted to int16_t.
#define pair_set_epi16(a, b)                                             \
  _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(a), (int16_t)(b), \
                 (int16_t)(a), (int16_t)(b), (int16_t)(a), (int16_t)(b))
22 :
// Builds an __m128i of eight 16-bit lanes whose four lowest lanes (0..3) all
// hold a and whose four highest lanes (4..7) all hold b. Both arguments are
// converted to int16_t.
#define dual_set_epi16(a, b)                                             \
  _mm_setr_epi16((int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a), \
                 (int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b))
26 :
// Builds an __m128i from eight individual 16-bit values given in memory
// order: a goes to lane 0 (the lowest) and h to lane 7 (the highest). Every
// argument is converted to int16_t.
#define octa_set_epi16(a, b, c, d, e, f, g, h)                          \
  _mm_set_epi16((int16_t)(h), (int16_t)(g), (int16_t)(f), (int16_t)(e), \
                (int16_t)(d), (int16_t)(c), (int16_t)(b), (int16_t)(a))
30 :
31 : // Reverse the 8 16 bit words in __m128i
32 0 : static INLINE __m128i mm_reverse_epi16(const __m128i x) {
33 0 : const __m128i a = _mm_shufflelo_epi16(x, 0x1b);
34 0 : const __m128i b = _mm_shufflehi_epi16(a, 0x1b);
35 0 : return _mm_shuffle_epi32(b, 0x4e);
36 : }
37 :
38 : #if CONFIG_EXT_TX
39 : // Identity transform (both forward and inverse).
40 0 : static INLINE void idtx16_8col(__m128i *in) {
41 0 : const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
42 0 : const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2);
43 0 : const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
44 :
45 : __m128i v0, v1, v2, v3, v4, v5, v6, v7;
46 : __m128i u0, u1, u2, u3, u4, u5, u6, u7;
47 : __m128i x0, x1, x2, x3, x4, x5, x6, x7;
48 : __m128i y0, y1, y2, y3, y4, y5, y6, y7;
49 :
50 0 : in[0] = _mm_slli_epi16(in[0], 1);
51 0 : in[1] = _mm_slli_epi16(in[1], 1);
52 0 : in[2] = _mm_slli_epi16(in[2], 1);
53 0 : in[3] = _mm_slli_epi16(in[3], 1);
54 0 : in[4] = _mm_slli_epi16(in[4], 1);
55 0 : in[5] = _mm_slli_epi16(in[5], 1);
56 0 : in[6] = _mm_slli_epi16(in[6], 1);
57 0 : in[7] = _mm_slli_epi16(in[7], 1);
58 0 : in[8] = _mm_slli_epi16(in[8], 1);
59 0 : in[9] = _mm_slli_epi16(in[9], 1);
60 0 : in[10] = _mm_slli_epi16(in[10], 1);
61 0 : in[11] = _mm_slli_epi16(in[11], 1);
62 0 : in[12] = _mm_slli_epi16(in[12], 1);
63 0 : in[13] = _mm_slli_epi16(in[13], 1);
64 0 : in[14] = _mm_slli_epi16(in[14], 1);
65 0 : in[15] = _mm_slli_epi16(in[15], 1);
66 :
67 0 : v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16);
68 0 : v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16);
69 0 : v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16);
70 0 : v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16);
71 0 : v4 = _mm_unpacklo_epi16(in[4], k__zero_epi16);
72 0 : v5 = _mm_unpacklo_epi16(in[5], k__zero_epi16);
73 0 : v6 = _mm_unpacklo_epi16(in[6], k__zero_epi16);
74 0 : v7 = _mm_unpacklo_epi16(in[7], k__zero_epi16);
75 :
76 0 : u0 = _mm_unpacklo_epi16(in[8], k__zero_epi16);
77 0 : u1 = _mm_unpacklo_epi16(in[9], k__zero_epi16);
78 0 : u2 = _mm_unpacklo_epi16(in[10], k__zero_epi16);
79 0 : u3 = _mm_unpacklo_epi16(in[11], k__zero_epi16);
80 0 : u4 = _mm_unpacklo_epi16(in[12], k__zero_epi16);
81 0 : u5 = _mm_unpacklo_epi16(in[13], k__zero_epi16);
82 0 : u6 = _mm_unpacklo_epi16(in[14], k__zero_epi16);
83 0 : u7 = _mm_unpacklo_epi16(in[15], k__zero_epi16);
84 :
85 0 : x0 = _mm_unpackhi_epi16(in[0], k__zero_epi16);
86 0 : x1 = _mm_unpackhi_epi16(in[1], k__zero_epi16);
87 0 : x2 = _mm_unpackhi_epi16(in[2], k__zero_epi16);
88 0 : x3 = _mm_unpackhi_epi16(in[3], k__zero_epi16);
89 0 : x4 = _mm_unpackhi_epi16(in[4], k__zero_epi16);
90 0 : x5 = _mm_unpackhi_epi16(in[5], k__zero_epi16);
91 0 : x6 = _mm_unpackhi_epi16(in[6], k__zero_epi16);
92 0 : x7 = _mm_unpackhi_epi16(in[7], k__zero_epi16);
93 :
94 0 : y0 = _mm_unpackhi_epi16(in[8], k__zero_epi16);
95 0 : y1 = _mm_unpackhi_epi16(in[9], k__zero_epi16);
96 0 : y2 = _mm_unpackhi_epi16(in[10], k__zero_epi16);
97 0 : y3 = _mm_unpackhi_epi16(in[11], k__zero_epi16);
98 0 : y4 = _mm_unpackhi_epi16(in[12], k__zero_epi16);
99 0 : y5 = _mm_unpackhi_epi16(in[13], k__zero_epi16);
100 0 : y6 = _mm_unpackhi_epi16(in[14], k__zero_epi16);
101 0 : y7 = _mm_unpackhi_epi16(in[15], k__zero_epi16);
102 :
103 0 : v0 = _mm_madd_epi16(v0, k__sqrt2_epi16);
104 0 : v1 = _mm_madd_epi16(v1, k__sqrt2_epi16);
105 0 : v2 = _mm_madd_epi16(v2, k__sqrt2_epi16);
106 0 : v3 = _mm_madd_epi16(v3, k__sqrt2_epi16);
107 0 : v4 = _mm_madd_epi16(v4, k__sqrt2_epi16);
108 0 : v5 = _mm_madd_epi16(v5, k__sqrt2_epi16);
109 0 : v6 = _mm_madd_epi16(v6, k__sqrt2_epi16);
110 0 : v7 = _mm_madd_epi16(v7, k__sqrt2_epi16);
111 :
112 0 : x0 = _mm_madd_epi16(x0, k__sqrt2_epi16);
113 0 : x1 = _mm_madd_epi16(x1, k__sqrt2_epi16);
114 0 : x2 = _mm_madd_epi16(x2, k__sqrt2_epi16);
115 0 : x3 = _mm_madd_epi16(x3, k__sqrt2_epi16);
116 0 : x4 = _mm_madd_epi16(x4, k__sqrt2_epi16);
117 0 : x5 = _mm_madd_epi16(x5, k__sqrt2_epi16);
118 0 : x6 = _mm_madd_epi16(x6, k__sqrt2_epi16);
119 0 : x7 = _mm_madd_epi16(x7, k__sqrt2_epi16);
120 :
121 0 : u0 = _mm_madd_epi16(u0, k__sqrt2_epi16);
122 0 : u1 = _mm_madd_epi16(u1, k__sqrt2_epi16);
123 0 : u2 = _mm_madd_epi16(u2, k__sqrt2_epi16);
124 0 : u3 = _mm_madd_epi16(u3, k__sqrt2_epi16);
125 0 : u4 = _mm_madd_epi16(u4, k__sqrt2_epi16);
126 0 : u5 = _mm_madd_epi16(u5, k__sqrt2_epi16);
127 0 : u6 = _mm_madd_epi16(u6, k__sqrt2_epi16);
128 0 : u7 = _mm_madd_epi16(u7, k__sqrt2_epi16);
129 :
130 0 : y0 = _mm_madd_epi16(y0, k__sqrt2_epi16);
131 0 : y1 = _mm_madd_epi16(y1, k__sqrt2_epi16);
132 0 : y2 = _mm_madd_epi16(y2, k__sqrt2_epi16);
133 0 : y3 = _mm_madd_epi16(y3, k__sqrt2_epi16);
134 0 : y4 = _mm_madd_epi16(y4, k__sqrt2_epi16);
135 0 : y5 = _mm_madd_epi16(y5, k__sqrt2_epi16);
136 0 : y6 = _mm_madd_epi16(y6, k__sqrt2_epi16);
137 0 : y7 = _mm_madd_epi16(y7, k__sqrt2_epi16);
138 :
139 0 : v0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
140 0 : v1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
141 0 : v2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
142 0 : v3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
143 0 : v4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
144 0 : v5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
145 0 : v6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
146 0 : v7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
147 :
148 0 : x0 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING);
149 0 : x1 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING);
150 0 : x2 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING);
151 0 : x3 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING);
152 0 : x4 = _mm_add_epi32(x4, k__DCT_CONST_ROUNDING);
153 0 : x5 = _mm_add_epi32(x5, k__DCT_CONST_ROUNDING);
154 0 : x6 = _mm_add_epi32(x6, k__DCT_CONST_ROUNDING);
155 0 : x7 = _mm_add_epi32(x7, k__DCT_CONST_ROUNDING);
156 :
157 0 : u0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
158 0 : u1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
159 0 : u2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
160 0 : u3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
161 0 : u4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
162 0 : u5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
163 0 : u6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
164 0 : u7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
165 :
166 0 : y0 = _mm_add_epi32(y0, k__DCT_CONST_ROUNDING);
167 0 : y1 = _mm_add_epi32(y1, k__DCT_CONST_ROUNDING);
168 0 : y2 = _mm_add_epi32(y2, k__DCT_CONST_ROUNDING);
169 0 : y3 = _mm_add_epi32(y3, k__DCT_CONST_ROUNDING);
170 0 : y4 = _mm_add_epi32(y4, k__DCT_CONST_ROUNDING);
171 0 : y5 = _mm_add_epi32(y5, k__DCT_CONST_ROUNDING);
172 0 : y6 = _mm_add_epi32(y6, k__DCT_CONST_ROUNDING);
173 0 : y7 = _mm_add_epi32(y7, k__DCT_CONST_ROUNDING);
174 :
175 0 : v0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
176 0 : v1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
177 0 : v2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
178 0 : v3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
179 0 : v4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
180 0 : v5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
181 0 : v6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
182 0 : v7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
183 :
184 0 : x0 = _mm_srai_epi32(x0, DCT_CONST_BITS);
185 0 : x1 = _mm_srai_epi32(x1, DCT_CONST_BITS);
186 0 : x2 = _mm_srai_epi32(x2, DCT_CONST_BITS);
187 0 : x3 = _mm_srai_epi32(x3, DCT_CONST_BITS);
188 0 : x4 = _mm_srai_epi32(x4, DCT_CONST_BITS);
189 0 : x5 = _mm_srai_epi32(x5, DCT_CONST_BITS);
190 0 : x6 = _mm_srai_epi32(x6, DCT_CONST_BITS);
191 0 : x7 = _mm_srai_epi32(x7, DCT_CONST_BITS);
192 :
193 0 : u0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
194 0 : u1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
195 0 : u2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
196 0 : u3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
197 0 : u4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
198 0 : u5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
199 0 : u6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
200 0 : u7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
201 :
202 0 : y0 = _mm_srai_epi32(y0, DCT_CONST_BITS);
203 0 : y1 = _mm_srai_epi32(y1, DCT_CONST_BITS);
204 0 : y2 = _mm_srai_epi32(y2, DCT_CONST_BITS);
205 0 : y3 = _mm_srai_epi32(y3, DCT_CONST_BITS);
206 0 : y4 = _mm_srai_epi32(y4, DCT_CONST_BITS);
207 0 : y5 = _mm_srai_epi32(y5, DCT_CONST_BITS);
208 0 : y6 = _mm_srai_epi32(y6, DCT_CONST_BITS);
209 0 : y7 = _mm_srai_epi32(y7, DCT_CONST_BITS);
210 :
211 0 : in[0] = _mm_packs_epi32(v0, x0);
212 0 : in[1] = _mm_packs_epi32(v1, x1);
213 0 : in[2] = _mm_packs_epi32(v2, x2);
214 0 : in[3] = _mm_packs_epi32(v3, x3);
215 0 : in[4] = _mm_packs_epi32(v4, x4);
216 0 : in[5] = _mm_packs_epi32(v5, x5);
217 0 : in[6] = _mm_packs_epi32(v6, x6);
218 0 : in[7] = _mm_packs_epi32(v7, x7);
219 :
220 0 : in[8] = _mm_packs_epi32(u0, y0);
221 0 : in[9] = _mm_packs_epi32(u1, y1);
222 0 : in[10] = _mm_packs_epi32(u2, y2);
223 0 : in[11] = _mm_packs_epi32(u3, y3);
224 0 : in[12] = _mm_packs_epi32(u4, y4);
225 0 : in[13] = _mm_packs_epi32(u5, y5);
226 0 : in[14] = _mm_packs_epi32(u6, y6);
227 0 : in[15] = _mm_packs_epi32(u7, y7);
228 0 : }
229 : #endif // CONFIG_EXT_TX
230 :
231 0 : static INLINE void scale_sqrt2_8x4(__m128i *in) {
232 : // Implements ROUND_POWER_OF_TWO(input * Sqrt2, DCT_CONST_BITS), for 32
233 : // consecutive elements.
234 0 : const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2);
235 :
236 0 : const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
237 0 : const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
238 0 : const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
239 0 : const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);
240 0 : const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w);
241 0 : const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w);
242 0 : const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w);
243 0 : const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w);
244 :
245 0 : const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
246 0 : const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
247 0 : const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
248 0 : const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);
249 0 : const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w);
250 0 : const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w);
251 0 : const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w);
252 0 : const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w);
253 :
254 0 : in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS),
255 : xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS));
256 0 : in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS),
257 : xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS));
258 0 : in[2] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p2a_d, DCT_CONST_BITS),
259 : xx_roundn_epi32_unsigned(v_p2b_d, DCT_CONST_BITS));
260 0 : in[3] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p3a_d, DCT_CONST_BITS),
261 : xx_roundn_epi32_unsigned(v_p3b_d, DCT_CONST_BITS));
262 0 : }
263 :
264 0 : static INLINE void scale_sqrt2_8x8(__m128i *in) {
265 : // Implements 'ROUND_POWER_OF_TWO_SIGNED(input * Sqrt2, DCT_CONST_BITS)'
266 : // for each element.
267 0 : const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2);
268 :
269 0 : const __m128i v_p0l_w = _mm_mullo_epi16(in[0], v_scale_w);
270 0 : const __m128i v_p0h_w = _mm_mulhi_epi16(in[0], v_scale_w);
271 0 : const __m128i v_p1l_w = _mm_mullo_epi16(in[1], v_scale_w);
272 0 : const __m128i v_p1h_w = _mm_mulhi_epi16(in[1], v_scale_w);
273 0 : const __m128i v_p2l_w = _mm_mullo_epi16(in[2], v_scale_w);
274 0 : const __m128i v_p2h_w = _mm_mulhi_epi16(in[2], v_scale_w);
275 0 : const __m128i v_p3l_w = _mm_mullo_epi16(in[3], v_scale_w);
276 0 : const __m128i v_p3h_w = _mm_mulhi_epi16(in[3], v_scale_w);
277 0 : const __m128i v_p4l_w = _mm_mullo_epi16(in[4], v_scale_w);
278 0 : const __m128i v_p4h_w = _mm_mulhi_epi16(in[4], v_scale_w);
279 0 : const __m128i v_p5l_w = _mm_mullo_epi16(in[5], v_scale_w);
280 0 : const __m128i v_p5h_w = _mm_mulhi_epi16(in[5], v_scale_w);
281 0 : const __m128i v_p6l_w = _mm_mullo_epi16(in[6], v_scale_w);
282 0 : const __m128i v_p6h_w = _mm_mulhi_epi16(in[6], v_scale_w);
283 0 : const __m128i v_p7l_w = _mm_mullo_epi16(in[7], v_scale_w);
284 0 : const __m128i v_p7h_w = _mm_mulhi_epi16(in[7], v_scale_w);
285 :
286 0 : const __m128i v_p0a_d = _mm_unpacklo_epi16(v_p0l_w, v_p0h_w);
287 0 : const __m128i v_p0b_d = _mm_unpackhi_epi16(v_p0l_w, v_p0h_w);
288 0 : const __m128i v_p1a_d = _mm_unpacklo_epi16(v_p1l_w, v_p1h_w);
289 0 : const __m128i v_p1b_d = _mm_unpackhi_epi16(v_p1l_w, v_p1h_w);
290 0 : const __m128i v_p2a_d = _mm_unpacklo_epi16(v_p2l_w, v_p2h_w);
291 0 : const __m128i v_p2b_d = _mm_unpackhi_epi16(v_p2l_w, v_p2h_w);
292 0 : const __m128i v_p3a_d = _mm_unpacklo_epi16(v_p3l_w, v_p3h_w);
293 0 : const __m128i v_p3b_d = _mm_unpackhi_epi16(v_p3l_w, v_p3h_w);
294 0 : const __m128i v_p4a_d = _mm_unpacklo_epi16(v_p4l_w, v_p4h_w);
295 0 : const __m128i v_p4b_d = _mm_unpackhi_epi16(v_p4l_w, v_p4h_w);
296 0 : const __m128i v_p5a_d = _mm_unpacklo_epi16(v_p5l_w, v_p5h_w);
297 0 : const __m128i v_p5b_d = _mm_unpackhi_epi16(v_p5l_w, v_p5h_w);
298 0 : const __m128i v_p6a_d = _mm_unpacklo_epi16(v_p6l_w, v_p6h_w);
299 0 : const __m128i v_p6b_d = _mm_unpackhi_epi16(v_p6l_w, v_p6h_w);
300 0 : const __m128i v_p7a_d = _mm_unpacklo_epi16(v_p7l_w, v_p7h_w);
301 0 : const __m128i v_p7b_d = _mm_unpackhi_epi16(v_p7l_w, v_p7h_w);
302 :
303 0 : in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS),
304 : xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS));
305 0 : in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS),
306 : xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS));
307 0 : in[2] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p2a_d, DCT_CONST_BITS),
308 : xx_roundn_epi32_unsigned(v_p2b_d, DCT_CONST_BITS));
309 0 : in[3] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p3a_d, DCT_CONST_BITS),
310 : xx_roundn_epi32_unsigned(v_p3b_d, DCT_CONST_BITS));
311 0 : in[4] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p4a_d, DCT_CONST_BITS),
312 : xx_roundn_epi32_unsigned(v_p4b_d, DCT_CONST_BITS));
313 0 : in[5] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p5a_d, DCT_CONST_BITS),
314 : xx_roundn_epi32_unsigned(v_p5b_d, DCT_CONST_BITS));
315 0 : in[6] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p6a_d, DCT_CONST_BITS),
316 : xx_roundn_epi32_unsigned(v_p6b_d, DCT_CONST_BITS));
317 0 : in[7] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p7a_d, DCT_CONST_BITS),
318 : xx_roundn_epi32_unsigned(v_p7b_d, DCT_CONST_BITS));
319 0 : }
320 :
321 0 : static INLINE void scale_sqrt2_8x16(__m128i *in) {
322 0 : scale_sqrt2_8x8(in);
323 0 : scale_sqrt2_8x8(in + 8);
324 0 : }
325 :
326 : #endif // AOM_DSP_X86_TXFM_COMMON_SSE2_H_
|