Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <emmintrin.h> // SSE2
13 :
14 : #include "./aom_config.h"
15 : #include "./aom_dsp_rtcd.h"
16 : #include "aom_dsp/aom_dsp_common.h"
17 : #include "aom_dsp/x86/fwd_txfm_sse2.h"
18 :
19 0 : void aom_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
20 : __m128i in0, in1;
21 : __m128i tmp;
22 0 : const __m128i zero = _mm_setzero_si128();
23 0 : in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
24 0 : in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
25 0 : in1 = _mm_unpacklo_epi64(
26 0 : in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
27 0 : in0 = _mm_unpacklo_epi64(
28 0 : in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
29 :
30 0 : tmp = _mm_add_epi16(in0, in1);
31 0 : in0 = _mm_unpacklo_epi16(zero, tmp);
32 0 : in1 = _mm_unpackhi_epi16(zero, tmp);
33 0 : in0 = _mm_srai_epi32(in0, 16);
34 0 : in1 = _mm_srai_epi32(in1, 16);
35 :
36 0 : tmp = _mm_add_epi32(in0, in1);
37 0 : in0 = _mm_unpacklo_epi32(tmp, zero);
38 0 : in1 = _mm_unpackhi_epi32(tmp, zero);
39 :
40 0 : tmp = _mm_add_epi32(in0, in1);
41 0 : in0 = _mm_srli_si128(tmp, 8);
42 :
43 0 : in1 = _mm_add_epi32(tmp, in0);
44 0 : in0 = _mm_slli_epi32(in1, 1);
45 0 : output[0] = (tran_low_t)_mm_cvtsi128_si32(in0);
46 0 : }
47 :
48 0 : void aom_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
49 0 : __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
50 0 : __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
51 0 : __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
52 0 : __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
53 : __m128i u0, u1, sum;
54 :
55 0 : u0 = _mm_add_epi16(in0, in1);
56 0 : u1 = _mm_add_epi16(in2, in3);
57 :
58 0 : in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
59 0 : in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
60 0 : in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
61 0 : in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
62 :
63 0 : sum = _mm_add_epi16(u0, u1);
64 :
65 0 : in0 = _mm_add_epi16(in0, in1);
66 0 : in2 = _mm_add_epi16(in2, in3);
67 0 : sum = _mm_add_epi16(sum, in0);
68 :
69 0 : u0 = _mm_setzero_si128();
70 0 : sum = _mm_add_epi16(sum, in2);
71 :
72 0 : in0 = _mm_unpacklo_epi16(u0, sum);
73 0 : in1 = _mm_unpackhi_epi16(u0, sum);
74 0 : in0 = _mm_srai_epi32(in0, 16);
75 0 : in1 = _mm_srai_epi32(in1, 16);
76 :
77 0 : sum = _mm_add_epi32(in0, in1);
78 0 : in0 = _mm_unpacklo_epi32(sum, u0);
79 0 : in1 = _mm_unpackhi_epi32(sum, u0);
80 :
81 0 : sum = _mm_add_epi32(in0, in1);
82 0 : in0 = _mm_srli_si128(sum, 8);
83 :
84 0 : in1 = _mm_add_epi32(sum, in0);
85 0 : output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
86 0 : }
87 :
88 0 : void aom_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
89 : int stride) {
90 : __m128i in0, in1, in2, in3;
91 : __m128i u0, u1;
92 0 : __m128i sum = _mm_setzero_si128();
93 : int i;
94 :
95 0 : for (i = 0; i < 2; ++i) {
96 0 : in0 = _mm_load_si128((const __m128i *)(input + 0 * stride + 0));
97 0 : in1 = _mm_load_si128((const __m128i *)(input + 0 * stride + 8));
98 0 : in2 = _mm_load_si128((const __m128i *)(input + 1 * stride + 0));
99 0 : in3 = _mm_load_si128((const __m128i *)(input + 1 * stride + 8));
100 :
101 0 : u0 = _mm_add_epi16(in0, in1);
102 0 : u1 = _mm_add_epi16(in2, in3);
103 0 : sum = _mm_add_epi16(sum, u0);
104 :
105 0 : in0 = _mm_load_si128((const __m128i *)(input + 2 * stride + 0));
106 0 : in1 = _mm_load_si128((const __m128i *)(input + 2 * stride + 8));
107 0 : in2 = _mm_load_si128((const __m128i *)(input + 3 * stride + 0));
108 0 : in3 = _mm_load_si128((const __m128i *)(input + 3 * stride + 8));
109 :
110 0 : sum = _mm_add_epi16(sum, u1);
111 0 : u0 = _mm_add_epi16(in0, in1);
112 0 : u1 = _mm_add_epi16(in2, in3);
113 0 : sum = _mm_add_epi16(sum, u0);
114 :
115 0 : in0 = _mm_load_si128((const __m128i *)(input + 4 * stride + 0));
116 0 : in1 = _mm_load_si128((const __m128i *)(input + 4 * stride + 8));
117 0 : in2 = _mm_load_si128((const __m128i *)(input + 5 * stride + 0));
118 0 : in3 = _mm_load_si128((const __m128i *)(input + 5 * stride + 8));
119 :
120 0 : sum = _mm_add_epi16(sum, u1);
121 0 : u0 = _mm_add_epi16(in0, in1);
122 0 : u1 = _mm_add_epi16(in2, in3);
123 0 : sum = _mm_add_epi16(sum, u0);
124 :
125 0 : in0 = _mm_load_si128((const __m128i *)(input + 6 * stride + 0));
126 0 : in1 = _mm_load_si128((const __m128i *)(input + 6 * stride + 8));
127 0 : in2 = _mm_load_si128((const __m128i *)(input + 7 * stride + 0));
128 0 : in3 = _mm_load_si128((const __m128i *)(input + 7 * stride + 8));
129 :
130 0 : sum = _mm_add_epi16(sum, u1);
131 0 : u0 = _mm_add_epi16(in0, in1);
132 0 : u1 = _mm_add_epi16(in2, in3);
133 0 : sum = _mm_add_epi16(sum, u0);
134 :
135 0 : sum = _mm_add_epi16(sum, u1);
136 0 : input += 8 * stride;
137 : }
138 :
139 0 : u0 = _mm_setzero_si128();
140 0 : in0 = _mm_unpacklo_epi16(u0, sum);
141 0 : in1 = _mm_unpackhi_epi16(u0, sum);
142 0 : in0 = _mm_srai_epi32(in0, 16);
143 0 : in1 = _mm_srai_epi32(in1, 16);
144 :
145 0 : sum = _mm_add_epi32(in0, in1);
146 0 : in0 = _mm_unpacklo_epi32(sum, u0);
147 0 : in1 = _mm_unpackhi_epi32(sum, u0);
148 :
149 0 : sum = _mm_add_epi32(in0, in1);
150 0 : in0 = _mm_srli_si128(sum, 8);
151 :
152 0 : in1 = _mm_add_epi32(sum, in0);
153 0 : in1 = _mm_srai_epi32(in1, 1);
154 0 : output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
155 0 : }
156 :
157 0 : void aom_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
158 : int stride) {
159 : __m128i in0, in1, in2, in3;
160 : __m128i u0, u1;
161 0 : __m128i sum = _mm_setzero_si128();
162 : int i;
163 :
164 0 : for (i = 0; i < 8; ++i) {
165 0 : in0 = _mm_load_si128((const __m128i *)(input + 0));
166 0 : in1 = _mm_load_si128((const __m128i *)(input + 8));
167 0 : in2 = _mm_load_si128((const __m128i *)(input + 16));
168 0 : in3 = _mm_load_si128((const __m128i *)(input + 24));
169 :
170 0 : input += stride;
171 0 : u0 = _mm_add_epi16(in0, in1);
172 0 : u1 = _mm_add_epi16(in2, in3);
173 0 : sum = _mm_add_epi16(sum, u0);
174 :
175 0 : in0 = _mm_load_si128((const __m128i *)(input + 0));
176 0 : in1 = _mm_load_si128((const __m128i *)(input + 8));
177 0 : in2 = _mm_load_si128((const __m128i *)(input + 16));
178 0 : in3 = _mm_load_si128((const __m128i *)(input + 24));
179 :
180 0 : input += stride;
181 0 : sum = _mm_add_epi16(sum, u1);
182 0 : u0 = _mm_add_epi16(in0, in1);
183 0 : u1 = _mm_add_epi16(in2, in3);
184 0 : sum = _mm_add_epi16(sum, u0);
185 :
186 0 : in0 = _mm_load_si128((const __m128i *)(input + 0));
187 0 : in1 = _mm_load_si128((const __m128i *)(input + 8));
188 0 : in2 = _mm_load_si128((const __m128i *)(input + 16));
189 0 : in3 = _mm_load_si128((const __m128i *)(input + 24));
190 :
191 0 : input += stride;
192 0 : sum = _mm_add_epi16(sum, u1);
193 0 : u0 = _mm_add_epi16(in0, in1);
194 0 : u1 = _mm_add_epi16(in2, in3);
195 0 : sum = _mm_add_epi16(sum, u0);
196 :
197 0 : in0 = _mm_load_si128((const __m128i *)(input + 0));
198 0 : in1 = _mm_load_si128((const __m128i *)(input + 8));
199 0 : in2 = _mm_load_si128((const __m128i *)(input + 16));
200 0 : in3 = _mm_load_si128((const __m128i *)(input + 24));
201 :
202 0 : input += stride;
203 0 : sum = _mm_add_epi16(sum, u1);
204 0 : u0 = _mm_add_epi16(in0, in1);
205 0 : u1 = _mm_add_epi16(in2, in3);
206 0 : sum = _mm_add_epi16(sum, u0);
207 :
208 0 : sum = _mm_add_epi16(sum, u1);
209 : }
210 :
211 0 : u0 = _mm_setzero_si128();
212 0 : in0 = _mm_unpacklo_epi16(u0, sum);
213 0 : in1 = _mm_unpackhi_epi16(u0, sum);
214 0 : in0 = _mm_srai_epi32(in0, 16);
215 0 : in1 = _mm_srai_epi32(in1, 16);
216 :
217 0 : sum = _mm_add_epi32(in0, in1);
218 0 : in0 = _mm_unpacklo_epi32(sum, u0);
219 0 : in1 = _mm_unpackhi_epi32(sum, u0);
220 :
221 0 : sum = _mm_add_epi32(in0, in1);
222 0 : in0 = _mm_srli_si128(sum, 8);
223 :
224 0 : in1 = _mm_add_epi32(sum, in0);
225 0 : in1 = _mm_srai_epi32(in1, 3);
226 0 : output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
227 0 : }
228 :
// Low-bitdepth 2-D forward DCTs: with DCT_HIGH_BIT_DEPTH set to 0, the shared
// implementation header is included while FDCT4x4_2D / FDCT8x8_2D /
// FDCT16x16_2D hold the aom_fdct*_sse2 names. Presumably the header uses these
// macros to name the functions it defines -- confirm in fwd_txfm_impl_sse2.h.
229 : #define DCT_HIGH_BIT_DEPTH 0
230 : #define FDCT4x4_2D aom_fdct4x4_sse2
231 : #define FDCT8x8_2D aom_fdct8x8_sse2
232 : #define FDCT16x16_2D aom_fdct16x16_sse2
233 : #include "aom_dsp/x86/fwd_txfm_impl_sse2.h"
234 : #undef FDCT4x4_2D
235 : #undef FDCT8x8_2D
236 : #undef FDCT16x16_2D
237 :
// The 32x32 implementation header is included twice: first as
// aom_fdct32x32_rd_sse2 with FDCT32x32_HIGH_PRECISION 0, then as
// aom_fdct32x32_sse2 with FDCT32x32_HIGH_PRECISION 1. The macros are
// undefined after each inclusion so the next one can redefine them.
238 : #define FDCT32x32_2D aom_fdct32x32_rd_sse2
239 : #define FDCT32x32_HIGH_PRECISION 0
240 : #include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h"
241 : #undef FDCT32x32_2D
242 : #undef FDCT32x32_HIGH_PRECISION
243 :
244 : #define FDCT32x32_2D aom_fdct32x32_sse2
245 : #define FDCT32x32_HIGH_PRECISION 1
246 : #include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
247 : #undef FDCT32x32_2D
248 : #undef FDCT32x32_HIGH_PRECISION
249 : #undef DCT_HIGH_BIT_DEPTH
250 :
// Only when CONFIG_HIGHBITDEPTH is enabled: the same sequence of inclusions is
// repeated with DCT_HIGH_BIT_DEPTH set to 1 and aom_highbd_* function names.
251 : #if CONFIG_HIGHBITDEPTH
252 : #define DCT_HIGH_BIT_DEPTH 1
253 : #define FDCT4x4_2D aom_highbd_fdct4x4_sse2
254 : #define FDCT8x8_2D aom_highbd_fdct8x8_sse2
255 : #define FDCT16x16_2D aom_highbd_fdct16x16_sse2
256 : #include "aom_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT
257 : #undef FDCT4x4_2D
258 : #undef FDCT8x8_2D
259 : #undef FDCT16x16_2D
260 :
261 : #define FDCT32x32_2D aom_highbd_fdct32x32_rd_sse2
262 : #define FDCT32x32_HIGH_PRECISION 0
263 : #include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
264 : #undef FDCT32x32_2D
265 : #undef FDCT32x32_HIGH_PRECISION
266 :
267 : #define FDCT32x32_2D aom_highbd_fdct32x32_sse2
268 : #define FDCT32x32_HIGH_PRECISION 1
269 : #include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
270 : #undef FDCT32x32_2D
271 : #undef FDCT32x32_HIGH_PRECISION
272 : #undef DCT_HIGH_BIT_DEPTH
273 : #endif // CONFIG_HIGHBITDEPTH
|