Line data Source code
1 : #ifndef AV1_TXMF1D_SSE2_H_
2 : #define AV1_TXMF1D_SSE2_H_
3 :
4 : #include <smmintrin.h>
5 : #include "av1/common/av1_txfm.h"
6 :
7 : #ifdef __cplusplus
8 : extern "C" {
9 : #endif
10 :
// "fdct" family: one entry point per transform length (4, 8, 16, 32, 64).
// Only the declarations are visible here; the definitions live elsewhere.
// All share one signature: a read-only __m128i input array, a writable
// __m128i output array, and two read-only int8_t tables (cos_bit,
// stage_range).
// NOTE(review): presumably forward DCT kernels, judging by the names only —
// confirm against the implementation.
void av1_fdct4_new_sse4_1(const __m128i *input, __m128i *output,
                          const int8_t *cos_bit, const int8_t *stage_range);
void av1_fdct8_new_sse4_1(const __m128i *input, __m128i *output,
                          const int8_t *cos_bit, const int8_t *stage_range);
void av1_fdct16_new_sse4_1(const __m128i *input, __m128i *output,
                           const int8_t *cos_bit, const int8_t *stage_range);
void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
                           const int8_t *cos_bit, const int8_t *stage_range);
void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
                           const int8_t *cos_bit, const int8_t *stage_range);
21 :
// "fadst" family: one entry point per transform length (4, 8, 16, 32).
// Only the declarations are visible here; the definitions live elsewhere.
// Same signature as the other 1-D kernels declared in this header.
// NOTE(review): presumably forward ADST kernels, judging by the names only —
// confirm against the implementation.
void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
                           const int8_t *cos_bit, const int8_t *stage_range);
void av1_fadst8_new_sse4_1(const __m128i *input, __m128i *output,
                           const int8_t *cos_bit, const int8_t *stage_range);
void av1_fadst16_new_sse4_1(const __m128i *input, __m128i *output,
                            const int8_t *cos_bit, const int8_t *stage_range);
void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output,
                            const int8_t *cos_bit, const int8_t *stage_range);
30 :
// "idct" family: one entry point per transform length (4, 8, 16, 32, 64).
// Only the declarations are visible here; the definitions live elsewhere.
// Same signature as the other 1-D kernels declared in this header.
// NOTE(review): presumably inverse DCT kernels, judging by the names only —
// confirm against the implementation.
void av1_idct4_new_sse4_1(const __m128i *input, __m128i *output,
                          const int8_t *cos_bit, const int8_t *stage_range);
void av1_idct8_new_sse4_1(const __m128i *input, __m128i *output,
                          const int8_t *cos_bit, const int8_t *stage_range);
void av1_idct16_new_sse4_1(const __m128i *input, __m128i *output,
                           const int8_t *cos_bit, const int8_t *stage_range);
void av1_idct32_new_sse4_1(const __m128i *input, __m128i *output,
                           const int8_t *cos_bit, const int8_t *stage_range);
void av1_idct64_new_sse4_1(const __m128i *input, __m128i *output,
                           const int8_t *cos_bit, const int8_t *stage_range);
41 :
// "iadst" family: one entry point per transform length (4, 8, 16, 32).
// Only the declarations are visible here; the definitions live elsewhere.
// Same signature as the other 1-D kernels declared in this header.
// NOTE(review): presumably inverse ADST kernels, judging by the names only —
// confirm against the implementation.
void av1_iadst4_new_sse4_1(const __m128i *input, __m128i *output,
                           const int8_t *cos_bit, const int8_t *stage_range);
void av1_iadst8_new_sse4_1(const __m128i *input, __m128i *output,
                           const int8_t *cos_bit, const int8_t *stage_range);
void av1_iadst16_new_sse4_1(const __m128i *input, __m128i *output,
                            const int8_t *cos_bit, const int8_t *stage_range);
void av1_iadst32_new_sse4_1(const __m128i *input, __m128i *output,
                            const int8_t *cos_bit, const int8_t *stage_range);
50 :
51 0 : static INLINE void transpose_32_4x4(int stride, const __m128i *input,
52 : __m128i *output) {
53 0 : __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]);
54 0 : __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]);
55 0 : __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]);
56 0 : __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]);
57 :
58 0 : output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2);
59 0 : output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2);
60 0 : output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3);
61 0 : output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3);
62 0 : }
63 :
64 : // the entire input block can be represent by a grid of 4x4 blocks
65 : // each 4x4 blocks can be represent by 4 vertical __m128i
66 : // we first transpose each 4x4 block internally
67 : // than transpose the grid
68 0 : static INLINE void transpose_32(int txfm_size, const __m128i *input,
69 : __m128i *output) {
70 0 : const int num_per_128 = 4;
71 0 : const int row_size = txfm_size;
72 0 : const int col_size = txfm_size / num_per_128;
73 : int r, c;
74 :
75 : // transpose each 4x4 block internally
76 0 : for (r = 0; r < row_size; r += 4) {
77 0 : for (c = 0; c < col_size; c++) {
78 0 : transpose_32_4x4(col_size, &input[r * col_size + c],
79 0 : &output[c * 4 * col_size + r / 4]);
80 : }
81 : }
82 0 : }
83 :
84 0 : static INLINE __m128i round_shift_32_sse4_1(__m128i vec, int bit) {
85 : __m128i tmp, round;
86 0 : round = _mm_set1_epi32(1 << (bit - 1));
87 0 : tmp = _mm_add_epi32(vec, round);
88 0 : return _mm_srai_epi32(tmp, bit);
89 : }
90 :
91 0 : static INLINE void round_shift_array_32_sse4_1(__m128i *input, __m128i *output,
92 : const int size, const int bit) {
93 0 : if (bit > 0) {
94 : int i;
95 0 : for (i = 0; i < size; i++) {
96 0 : output[i] = round_shift_32_sse4_1(input[i], bit);
97 : }
98 : } else {
99 : int i;
100 0 : for (i = 0; i < size; i++) {
101 0 : output[i] = _mm_slli_epi32(input[i], -bit);
102 : }
103 : }
104 0 : }
105 :
// Butterfly on four 32-bit lanes, type 0:
//   out0 = round_shift(in0 * w0 + in1 * w1, bit)
//   out1 = round_shift(in0 * w1 - in1 * w0, bit)
//
// w0 and w1 are scalar ints broadcast to all lanes; in0/in1 are __m128i
// values; out0/out1 are __m128i lvalues; bit is the rounding shift passed to
// round_shift_32_sse4_1 and must be >= 1. Products keep only the low 32 bits
// (_mm_mullo_epi32).
//
// Every argument is evaluated exactly once, and both inputs are captured
// before either output is stored, so the outputs may alias the inputs
// (in-place butterfly). Locals use a btf_..._ prefix so they cannot capture
// identifiers passed in by the caller.
#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit)          \
  do {                                                                  \
    const __m128i btf_w0_ = _mm_set1_epi32(w0);                         \
    const __m128i btf_w1_ = _mm_set1_epi32(w1);                         \
    const __m128i btf_a_ = (in0);                                       \
    const __m128i btf_b_ = (in1);                                       \
    const int btf_bit_ = (bit);                                         \
    const __m128i btf_a_w0_ = _mm_mullo_epi32(btf_a_, btf_w0_);         \
    const __m128i btf_b_w1_ = _mm_mullo_epi32(btf_b_, btf_w1_);         \
    const __m128i btf_a_w1_ = _mm_mullo_epi32(btf_a_, btf_w1_);         \
    const __m128i btf_b_w0_ = _mm_mullo_epi32(btf_b_, btf_w0_);         \
    const __m128i btf_sum_ = _mm_add_epi32(btf_a_w0_, btf_b_w1_);       \
    const __m128i btf_diff_ = _mm_sub_epi32(btf_a_w1_, btf_b_w0_);      \
    (out0) = round_shift_32_sse4_1(btf_sum_, btf_bit_);                 \
    (out1) = round_shift_32_sse4_1(btf_diff_, btf_bit_);                \
  } while (0)
122 :
// Butterfly on four 32-bit lanes, type 1:
//   out0 = round_shift(in0 * w0 + in1 * w1, bit)
//   out1 = round_shift(in1 * w0 - in0 * w1, bit)
//
// w0 and w1 are scalar ints broadcast to all lanes; in0/in1 are __m128i
// values; out0/out1 are __m128i lvalues; bit is the rounding shift passed to
// round_shift_32_sse4_1 and must be >= 1. Products keep only the low 32 bits
// (_mm_mullo_epi32).
//
// Every argument is evaluated exactly once, and both inputs are captured
// before either output is stored, so the outputs may alias the inputs
// (in-place butterfly). Locals use a btf_..._ prefix so they cannot capture
// identifiers passed in by the caller.
#define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit)          \
  do {                                                                  \
    const __m128i btf_w0_ = _mm_set1_epi32(w0);                         \
    const __m128i btf_w1_ = _mm_set1_epi32(w1);                         \
    const __m128i btf_a_ = (in0);                                       \
    const __m128i btf_b_ = (in1);                                       \
    const int btf_bit_ = (bit);                                         \
    const __m128i btf_a_w0_ = _mm_mullo_epi32(btf_a_, btf_w0_);         \
    const __m128i btf_b_w1_ = _mm_mullo_epi32(btf_b_, btf_w1_);         \
    const __m128i btf_a_w1_ = _mm_mullo_epi32(btf_a_, btf_w1_);         \
    const __m128i btf_b_w0_ = _mm_mullo_epi32(btf_b_, btf_w0_);         \
    const __m128i btf_sum_ = _mm_add_epi32(btf_a_w0_, btf_b_w1_);       \
    const __m128i btf_diff_ = _mm_sub_epi32(btf_b_w0_, btf_a_w1_);      \
    (out0) = round_shift_32_sse4_1(btf_sum_, btf_bit_);                 \
    (out1) = round_shift_32_sse4_1(btf_diff_, btf_bit_);                \
  } while (0)
139 :
140 : #ifdef __cplusplus
141 : }
142 : #endif
143 :
144 : #endif // AV1_TXMF1D_SSE2_H_
|