Line data Source code
1 : /*
2 : * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 : *
4 : * Use of this source code is governed by a BSD-style license
5 : * that can be found in the LICENSE file in the root of the source
6 : * tree. An additional intellectual property rights grant can be found
7 : * in the file PATENTS. All contributing project authors may
8 : * be found in the AUTHORS file in the root of the source tree.
9 : */
10 :
11 : #include <emmintrin.h> // SSE2
12 :
13 : #include "./vpx_config.h"
14 : #include "./vpx_dsp_rtcd.h"
15 : #include "vpx_dsp/vpx_dsp_common.h"
16 : #include "vpx_dsp/x86/fwd_txfm_sse2.h"
17 :
18 0 : void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
19 : __m128i in0, in1;
20 : __m128i tmp;
21 0 : const __m128i zero = _mm_setzero_si128();
22 0 : in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
23 0 : in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
24 0 : in1 = _mm_unpacklo_epi64(
25 0 : in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
26 0 : in0 = _mm_unpacklo_epi64(
27 0 : in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
28 :
29 0 : tmp = _mm_add_epi16(in0, in1);
30 0 : in0 = _mm_unpacklo_epi16(zero, tmp);
31 0 : in1 = _mm_unpackhi_epi16(zero, tmp);
32 0 : in0 = _mm_srai_epi32(in0, 16);
33 0 : in1 = _mm_srai_epi32(in1, 16);
34 :
35 0 : tmp = _mm_add_epi32(in0, in1);
36 0 : in0 = _mm_unpacklo_epi32(tmp, zero);
37 0 : in1 = _mm_unpackhi_epi32(tmp, zero);
38 :
39 0 : tmp = _mm_add_epi32(in0, in1);
40 0 : in0 = _mm_srli_si128(tmp, 8);
41 :
42 0 : in1 = _mm_add_epi32(tmp, in0);
43 0 : in0 = _mm_slli_epi32(in1, 1);
44 0 : output[0] = (tran_low_t)_mm_cvtsi128_si32(in0);
45 0 : }
46 :
47 0 : void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
48 0 : __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
49 0 : __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
50 0 : __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
51 0 : __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
52 : __m128i u0, u1, sum;
53 :
54 0 : u0 = _mm_add_epi16(in0, in1);
55 0 : u1 = _mm_add_epi16(in2, in3);
56 :
57 0 : in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
58 0 : in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
59 0 : in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
60 0 : in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
61 :
62 0 : sum = _mm_add_epi16(u0, u1);
63 :
64 0 : in0 = _mm_add_epi16(in0, in1);
65 0 : in2 = _mm_add_epi16(in2, in3);
66 0 : sum = _mm_add_epi16(sum, in0);
67 :
68 0 : u0 = _mm_setzero_si128();
69 0 : sum = _mm_add_epi16(sum, in2);
70 :
71 0 : in0 = _mm_unpacklo_epi16(u0, sum);
72 0 : in1 = _mm_unpackhi_epi16(u0, sum);
73 0 : in0 = _mm_srai_epi32(in0, 16);
74 0 : in1 = _mm_srai_epi32(in1, 16);
75 :
76 0 : sum = _mm_add_epi32(in0, in1);
77 0 : in0 = _mm_unpacklo_epi32(sum, u0);
78 0 : in1 = _mm_unpackhi_epi32(sum, u0);
79 :
80 0 : sum = _mm_add_epi32(in0, in1);
81 0 : in0 = _mm_srli_si128(sum, 8);
82 :
83 0 : in1 = _mm_add_epi32(sum, in0);
84 0 : output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
85 0 : }
86 :
87 0 : void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
88 : int stride) {
89 : __m128i in0, in1, in2, in3;
90 : __m128i u0, u1;
91 0 : __m128i sum = _mm_setzero_si128();
92 : int i;
93 :
94 0 : for (i = 0; i < 2; ++i) {
95 0 : in0 = _mm_load_si128((const __m128i *)(input + 0 * stride + 0));
96 0 : in1 = _mm_load_si128((const __m128i *)(input + 0 * stride + 8));
97 0 : in2 = _mm_load_si128((const __m128i *)(input + 1 * stride + 0));
98 0 : in3 = _mm_load_si128((const __m128i *)(input + 1 * stride + 8));
99 :
100 0 : u0 = _mm_add_epi16(in0, in1);
101 0 : u1 = _mm_add_epi16(in2, in3);
102 0 : sum = _mm_add_epi16(sum, u0);
103 :
104 0 : in0 = _mm_load_si128((const __m128i *)(input + 2 * stride + 0));
105 0 : in1 = _mm_load_si128((const __m128i *)(input + 2 * stride + 8));
106 0 : in2 = _mm_load_si128((const __m128i *)(input + 3 * stride + 0));
107 0 : in3 = _mm_load_si128((const __m128i *)(input + 3 * stride + 8));
108 :
109 0 : sum = _mm_add_epi16(sum, u1);
110 0 : u0 = _mm_add_epi16(in0, in1);
111 0 : u1 = _mm_add_epi16(in2, in3);
112 0 : sum = _mm_add_epi16(sum, u0);
113 :
114 0 : in0 = _mm_load_si128((const __m128i *)(input + 4 * stride + 0));
115 0 : in1 = _mm_load_si128((const __m128i *)(input + 4 * stride + 8));
116 0 : in2 = _mm_load_si128((const __m128i *)(input + 5 * stride + 0));
117 0 : in3 = _mm_load_si128((const __m128i *)(input + 5 * stride + 8));
118 :
119 0 : sum = _mm_add_epi16(sum, u1);
120 0 : u0 = _mm_add_epi16(in0, in1);
121 0 : u1 = _mm_add_epi16(in2, in3);
122 0 : sum = _mm_add_epi16(sum, u0);
123 :
124 0 : in0 = _mm_load_si128((const __m128i *)(input + 6 * stride + 0));
125 0 : in1 = _mm_load_si128((const __m128i *)(input + 6 * stride + 8));
126 0 : in2 = _mm_load_si128((const __m128i *)(input + 7 * stride + 0));
127 0 : in3 = _mm_load_si128((const __m128i *)(input + 7 * stride + 8));
128 :
129 0 : sum = _mm_add_epi16(sum, u1);
130 0 : u0 = _mm_add_epi16(in0, in1);
131 0 : u1 = _mm_add_epi16(in2, in3);
132 0 : sum = _mm_add_epi16(sum, u0);
133 :
134 0 : sum = _mm_add_epi16(sum, u1);
135 0 : input += 8 * stride;
136 : }
137 :
138 0 : u0 = _mm_setzero_si128();
139 0 : in0 = _mm_unpacklo_epi16(u0, sum);
140 0 : in1 = _mm_unpackhi_epi16(u0, sum);
141 0 : in0 = _mm_srai_epi32(in0, 16);
142 0 : in1 = _mm_srai_epi32(in1, 16);
143 :
144 0 : sum = _mm_add_epi32(in0, in1);
145 0 : in0 = _mm_unpacklo_epi32(sum, u0);
146 0 : in1 = _mm_unpackhi_epi32(sum, u0);
147 :
148 0 : sum = _mm_add_epi32(in0, in1);
149 0 : in0 = _mm_srli_si128(sum, 8);
150 :
151 0 : in1 = _mm_add_epi32(sum, in0);
152 0 : in1 = _mm_srai_epi32(in1, 1);
153 0 : output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
154 0 : }
155 :
156 0 : void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
157 : int stride) {
158 : __m128i in0, in1, in2, in3;
159 : __m128i u0, u1;
160 0 : __m128i sum = _mm_setzero_si128();
161 : int i;
162 :
163 0 : for (i = 0; i < 8; ++i) {
164 0 : in0 = _mm_load_si128((const __m128i *)(input + 0));
165 0 : in1 = _mm_load_si128((const __m128i *)(input + 8));
166 0 : in2 = _mm_load_si128((const __m128i *)(input + 16));
167 0 : in3 = _mm_load_si128((const __m128i *)(input + 24));
168 :
169 0 : input += stride;
170 0 : u0 = _mm_add_epi16(in0, in1);
171 0 : u1 = _mm_add_epi16(in2, in3);
172 0 : sum = _mm_add_epi16(sum, u0);
173 :
174 0 : in0 = _mm_load_si128((const __m128i *)(input + 0));
175 0 : in1 = _mm_load_si128((const __m128i *)(input + 8));
176 0 : in2 = _mm_load_si128((const __m128i *)(input + 16));
177 0 : in3 = _mm_load_si128((const __m128i *)(input + 24));
178 :
179 0 : input += stride;
180 0 : sum = _mm_add_epi16(sum, u1);
181 0 : u0 = _mm_add_epi16(in0, in1);
182 0 : u1 = _mm_add_epi16(in2, in3);
183 0 : sum = _mm_add_epi16(sum, u0);
184 :
185 0 : in0 = _mm_load_si128((const __m128i *)(input + 0));
186 0 : in1 = _mm_load_si128((const __m128i *)(input + 8));
187 0 : in2 = _mm_load_si128((const __m128i *)(input + 16));
188 0 : in3 = _mm_load_si128((const __m128i *)(input + 24));
189 :
190 0 : input += stride;
191 0 : sum = _mm_add_epi16(sum, u1);
192 0 : u0 = _mm_add_epi16(in0, in1);
193 0 : u1 = _mm_add_epi16(in2, in3);
194 0 : sum = _mm_add_epi16(sum, u0);
195 :
196 0 : in0 = _mm_load_si128((const __m128i *)(input + 0));
197 0 : in1 = _mm_load_si128((const __m128i *)(input + 8));
198 0 : in2 = _mm_load_si128((const __m128i *)(input + 16));
199 0 : in3 = _mm_load_si128((const __m128i *)(input + 24));
200 :
201 0 : input += stride;
202 0 : sum = _mm_add_epi16(sum, u1);
203 0 : u0 = _mm_add_epi16(in0, in1);
204 0 : u1 = _mm_add_epi16(in2, in3);
205 0 : sum = _mm_add_epi16(sum, u0);
206 :
207 0 : sum = _mm_add_epi16(sum, u1);
208 : }
209 :
210 0 : u0 = _mm_setzero_si128();
211 0 : in0 = _mm_unpacklo_epi16(u0, sum);
212 0 : in1 = _mm_unpackhi_epi16(u0, sum);
213 0 : in0 = _mm_srai_epi32(in0, 16);
214 0 : in1 = _mm_srai_epi32(in1, 16);
215 :
216 0 : sum = _mm_add_epi32(in0, in1);
217 0 : in0 = _mm_unpacklo_epi32(sum, u0);
218 0 : in1 = _mm_unpackhi_epi32(sum, u0);
219 :
220 0 : sum = _mm_add_epi32(in0, in1);
221 0 : in0 = _mm_srli_si128(sum, 8);
222 :
223 0 : in1 = _mm_add_epi32(sum, in0);
224 0 : in1 = _mm_srai_epi32(in1, 3);
225 0 : output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
226 0 : }
227 : // The headers below are included several times, each time under a different set of macros. Presumably each inclusion emits functions named by the FDCT*_2D macros -- confirm in the headers.
228 : #define DCT_HIGH_BIT_DEPTH 0 // stays defined across the next three inclusions, until the #undef that closes this group
229 : #define FDCT4x4_2D vpx_fdct4x4_sse2
230 : #define FDCT8x8_2D vpx_fdct8x8_sse2
231 : #define FDCT16x16_2D vpx_fdct16x16_sse2
232 : #include "vpx_dsp/x86/fwd_txfm_impl_sse2.h"
233 : #undef FDCT4x4_2D
234 : #undef FDCT8x8_2D
235 : #undef FDCT16x16_2D
236 : // 32x32 header, first inclusion: FDCT32x32_HIGH_PRECISION is 0 and the name macro is the _rd_ variant.
237 : #define FDCT32x32_2D vpx_fdct32x32_rd_sse2
238 : #define FDCT32x32_HIGH_PRECISION 0
239 : #include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"
240 : #undef FDCT32x32_2D
241 : #undef FDCT32x32_HIGH_PRECISION
242 : // Same 32x32 header again, now with FDCT32x32_HIGH_PRECISION set to 1 and the non-_rd_ name.
243 : #define FDCT32x32_2D vpx_fdct32x32_sse2
244 : #define FDCT32x32_HIGH_PRECISION 1
245 : #include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
246 : #undef FDCT32x32_2D
247 : #undef FDCT32x32_HIGH_PRECISION
248 : #undef DCT_HIGH_BIT_DEPTH
249 : // The same three inclusions are repeated with DCT_HIGH_BIT_DEPTH set to 1 and vpx_highbd_ names, compiled only when CONFIG_VP9_HIGHBITDEPTH is nonzero.
250 : #if CONFIG_VP9_HIGHBITDEPTH
251 : #define DCT_HIGH_BIT_DEPTH 1
252 : #define FDCT4x4_2D vpx_highbd_fdct4x4_sse2
253 : #define FDCT8x8_2D vpx_highbd_fdct8x8_sse2
254 : #define FDCT16x16_2D vpx_highbd_fdct16x16_sse2
255 : #include "vpx_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT
256 : #undef FDCT4x4_2D
257 : #undef FDCT8x8_2D
258 : #undef FDCT16x16_2D
259 : // High-bit-depth 32x32, FDCT32x32_HIGH_PRECISION 0, _rd_ name.
260 : #define FDCT32x32_2D vpx_highbd_fdct32x32_rd_sse2
261 : #define FDCT32x32_HIGH_PRECISION 0
262 : #include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
263 : #undef FDCT32x32_2D
264 : #undef FDCT32x32_HIGH_PRECISION
265 : // High-bit-depth 32x32, FDCT32x32_HIGH_PRECISION 1, non-_rd_ name.
266 : #define FDCT32x32_2D vpx_highbd_fdct32x32_sse2
267 : #define FDCT32x32_HIGH_PRECISION 1
268 : #include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
269 : #undef FDCT32x32_2D
270 : #undef FDCT32x32_HIGH_PRECISION
271 : #undef DCT_HIGH_BIT_DEPTH
272 : #endif // CONFIG_VP9_HIGHBITDEPTH
|