Line data Source code
1 : /*
2 : * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3 : *
4 : * Use of this source code is governed by a BSD-style license
5 : * that can be found in the LICENSE file in the root of the source
6 : * tree. An additional intellectual property rights grant can be found
7 : * in the file PATENTS. All contributing project authors may
8 : * be found in the AUTHORS file in the root of the source tree.
9 : */
10 :
#include <assert.h>
#include <string.h>

#include "./vp9_rtcd.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
#include "vpx_ports/mem.h"
15 :
16 0 : void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
17 : int tx_type) {
18 : __m128i in[2];
19 0 : const __m128i zero = _mm_setzero_si128();
20 0 : const __m128i eight = _mm_set1_epi16(8);
21 :
22 0 : in[0] = load_input_data(input);
23 0 : in[1] = load_input_data(input + 8);
24 :
25 0 : switch (tx_type) {
26 : case 0: // DCT_DCT
27 0 : idct4_sse2(in);
28 0 : idct4_sse2(in);
29 0 : break;
30 : case 1: // ADST_DCT
31 0 : idct4_sse2(in);
32 0 : iadst4_sse2(in);
33 0 : break;
34 : case 2: // DCT_ADST
35 0 : iadst4_sse2(in);
36 0 : idct4_sse2(in);
37 0 : break;
38 : case 3: // ADST_ADST
39 0 : iadst4_sse2(in);
40 0 : iadst4_sse2(in);
41 0 : break;
42 0 : default: assert(0); break;
43 : }
44 :
45 : // Final round and shift
46 0 : in[0] = _mm_add_epi16(in[0], eight);
47 0 : in[1] = _mm_add_epi16(in[1], eight);
48 :
49 0 : in[0] = _mm_srai_epi16(in[0], 4);
50 0 : in[1] = _mm_srai_epi16(in[1], 4);
51 :
52 : // Reconstruction and Store
53 : {
54 0 : __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
55 0 : __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
56 0 : d0 = _mm_unpacklo_epi32(d0,
57 0 : _mm_cvtsi32_si128(*(const int *)(dest + stride)));
58 0 : d2 = _mm_unpacklo_epi32(
59 0 : d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)));
60 0 : d0 = _mm_unpacklo_epi8(d0, zero);
61 0 : d2 = _mm_unpacklo_epi8(d2, zero);
62 0 : d0 = _mm_add_epi16(d0, in[0]);
63 0 : d2 = _mm_add_epi16(d2, in[1]);
64 0 : d0 = _mm_packus_epi16(d0, d2);
65 : // store result[0]
66 0 : *(int *)dest = _mm_cvtsi128_si32(d0);
67 : // store result[1]
68 0 : d0 = _mm_srli_si128(d0, 4);
69 0 : *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
70 : // store result[2]
71 0 : d0 = _mm_srli_si128(d0, 4);
72 0 : *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
73 : // store result[3]
74 0 : d0 = _mm_srli_si128(d0, 4);
75 0 : *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
76 : }
77 0 : }
78 :
79 0 : void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
80 : int tx_type) {
81 : __m128i in[8];
82 0 : const __m128i zero = _mm_setzero_si128();
83 0 : const __m128i final_rounding = _mm_set1_epi16(1 << 4);
84 :
85 : // load input data
86 0 : in[0] = load_input_data(input);
87 0 : in[1] = load_input_data(input + 8 * 1);
88 0 : in[2] = load_input_data(input + 8 * 2);
89 0 : in[3] = load_input_data(input + 8 * 3);
90 0 : in[4] = load_input_data(input + 8 * 4);
91 0 : in[5] = load_input_data(input + 8 * 5);
92 0 : in[6] = load_input_data(input + 8 * 6);
93 0 : in[7] = load_input_data(input + 8 * 7);
94 :
95 0 : switch (tx_type) {
96 : case 0: // DCT_DCT
97 0 : idct8_sse2(in);
98 0 : idct8_sse2(in);
99 0 : break;
100 : case 1: // ADST_DCT
101 0 : idct8_sse2(in);
102 0 : iadst8_sse2(in);
103 0 : break;
104 : case 2: // DCT_ADST
105 0 : iadst8_sse2(in);
106 0 : idct8_sse2(in);
107 0 : break;
108 : case 3: // ADST_ADST
109 0 : iadst8_sse2(in);
110 0 : iadst8_sse2(in);
111 0 : break;
112 0 : default: assert(0); break;
113 : }
114 :
115 : // Final rounding and shift
116 0 : in[0] = _mm_adds_epi16(in[0], final_rounding);
117 0 : in[1] = _mm_adds_epi16(in[1], final_rounding);
118 0 : in[2] = _mm_adds_epi16(in[2], final_rounding);
119 0 : in[3] = _mm_adds_epi16(in[3], final_rounding);
120 0 : in[4] = _mm_adds_epi16(in[4], final_rounding);
121 0 : in[5] = _mm_adds_epi16(in[5], final_rounding);
122 0 : in[6] = _mm_adds_epi16(in[6], final_rounding);
123 0 : in[7] = _mm_adds_epi16(in[7], final_rounding);
124 :
125 0 : in[0] = _mm_srai_epi16(in[0], 5);
126 0 : in[1] = _mm_srai_epi16(in[1], 5);
127 0 : in[2] = _mm_srai_epi16(in[2], 5);
128 0 : in[3] = _mm_srai_epi16(in[3], 5);
129 0 : in[4] = _mm_srai_epi16(in[4], 5);
130 0 : in[5] = _mm_srai_epi16(in[5], 5);
131 0 : in[6] = _mm_srai_epi16(in[6], 5);
132 0 : in[7] = _mm_srai_epi16(in[7], 5);
133 :
134 0 : RECON_AND_STORE(dest + 0 * stride, in[0]);
135 0 : RECON_AND_STORE(dest + 1 * stride, in[1]);
136 0 : RECON_AND_STORE(dest + 2 * stride, in[2]);
137 0 : RECON_AND_STORE(dest + 3 * stride, in[3]);
138 0 : RECON_AND_STORE(dest + 4 * stride, in[4]);
139 0 : RECON_AND_STORE(dest + 5 * stride, in[5]);
140 0 : RECON_AND_STORE(dest + 6 * stride, in[6]);
141 0 : RECON_AND_STORE(dest + 7 * stride, in[7]);
142 0 : }
143 :
144 0 : void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
145 : int stride, int tx_type) {
146 : __m128i in0[16], in1[16];
147 :
148 0 : load_buffer_8x16(input, in0);
149 0 : input += 8;
150 0 : load_buffer_8x16(input, in1);
151 :
152 0 : switch (tx_type) {
153 : case 0: // DCT_DCT
154 0 : idct16_sse2(in0, in1);
155 0 : idct16_sse2(in0, in1);
156 0 : break;
157 : case 1: // ADST_DCT
158 0 : idct16_sse2(in0, in1);
159 0 : iadst16_sse2(in0, in1);
160 0 : break;
161 : case 2: // DCT_ADST
162 0 : iadst16_sse2(in0, in1);
163 0 : idct16_sse2(in0, in1);
164 0 : break;
165 : case 3: // ADST_ADST
166 0 : iadst16_sse2(in0, in1);
167 0 : iadst16_sse2(in0, in1);
168 0 : break;
169 0 : default: assert(0); break;
170 : }
171 :
172 0 : write_buffer_8x16(dest, in0, stride);
173 0 : dest += 8;
174 0 : write_buffer_8x16(dest, in1, stride);
175 0 : }
|