Line data Source code
1 : /*
2 : * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 : *
4 : * This source code is subject to the terms of the BSD 2 Clause License and
5 : * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 : * was not distributed with this source code in the LICENSE file, you can
7 : * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 : * Media Patent License 1.0 was not distributed with this source code in the
9 : * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 : */
11 :
12 : #include <smmintrin.h>
13 : #include <stdint.h>
14 :
15 : #include "./av1_rtcd.h"
16 : #include "aom_dsp/aom_dsp_common.h"
17 :
18 : // Coefficient quantization phase 1
19 : // param[0-2] : rounding/quan/dequan constants
20 0 : static INLINE void quantize_coeff_phase1(__m128i *coeff, const __m128i *param,
21 : const int shift, const int scale,
22 : __m128i *qcoeff, __m128i *dquan,
23 : __m128i *sign) {
24 0 : const __m128i zero = _mm_setzero_si128();
25 0 : const __m128i one = _mm_set1_epi32(1);
26 :
27 0 : *sign = _mm_cmplt_epi32(*coeff, zero);
28 0 : *sign = _mm_or_si128(*sign, one);
29 0 : *coeff = _mm_abs_epi32(*coeff);
30 :
31 0 : qcoeff[0] = _mm_add_epi32(*coeff, param[0]);
32 0 : qcoeff[1] = _mm_unpackhi_epi32(qcoeff[0], zero);
33 0 : qcoeff[0] = _mm_unpacklo_epi32(qcoeff[0], zero);
34 :
35 0 : qcoeff[0] = _mm_mul_epi32(qcoeff[0], param[1]);
36 0 : qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift);
37 0 : dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]);
38 0 : dquan[0] = _mm_srli_epi64(dquan[0], scale);
39 0 : }
40 :
41 : // Coefficient quantization phase 2
42 0 : static INLINE void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan,
43 : const __m128i *sign,
44 : const __m128i *param, const int shift,
45 : const int scale, tran_low_t *qAddr,
46 : tran_low_t *dqAddr) {
47 0 : __m128i mask0L = _mm_set_epi32(-1, -1, 0, 0);
48 0 : __m128i mask0H = _mm_set_epi32(0, 0, -1, -1);
49 :
50 0 : qcoeff[1] = _mm_mul_epi32(qcoeff[1], param[1]);
51 0 : qcoeff[1] = _mm_srli_epi64(qcoeff[1], shift);
52 0 : dquan[1] = _mm_mul_epi32(qcoeff[1], param[2]);
53 0 : dquan[1] = _mm_srli_epi64(dquan[1], scale);
54 :
55 : // combine L&H
56 0 : qcoeff[0] = _mm_shuffle_epi32(qcoeff[0], 0xd8);
57 0 : qcoeff[1] = _mm_shuffle_epi32(qcoeff[1], 0x8d);
58 :
59 0 : qcoeff[0] = _mm_and_si128(qcoeff[0], mask0H);
60 0 : qcoeff[1] = _mm_and_si128(qcoeff[1], mask0L);
61 :
62 0 : dquan[0] = _mm_shuffle_epi32(dquan[0], 0xd8);
63 0 : dquan[1] = _mm_shuffle_epi32(dquan[1], 0x8d);
64 :
65 0 : dquan[0] = _mm_and_si128(dquan[0], mask0H);
66 0 : dquan[1] = _mm_and_si128(dquan[1], mask0L);
67 :
68 0 : qcoeff[0] = _mm_or_si128(qcoeff[0], qcoeff[1]);
69 0 : dquan[0] = _mm_or_si128(dquan[0], dquan[1]);
70 :
71 0 : qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign);
72 0 : dquan[0] = _mm_sign_epi32(dquan[0], *sign);
73 :
74 0 : _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]);
75 0 : _mm_storeu_si128((__m128i *)dqAddr, dquan[0]);
76 0 : }
77 :
78 0 : static INLINE void find_eob(tran_low_t *qcoeff_ptr, const int16_t *iscan,
79 : __m128i *eob) {
80 0 : const __m128i zero = _mm_setzero_si128();
81 : __m128i mask, iscanIdx;
82 0 : const __m128i q0 = _mm_loadu_si128((__m128i const *)qcoeff_ptr);
83 0 : const __m128i q1 = _mm_loadu_si128((__m128i const *)(qcoeff_ptr + 4));
84 0 : __m128i nz_flag0 = _mm_cmpeq_epi32(q0, zero);
85 0 : __m128i nz_flag1 = _mm_cmpeq_epi32(q1, zero);
86 :
87 0 : nz_flag0 = _mm_cmpeq_epi32(nz_flag0, zero);
88 0 : nz_flag1 = _mm_cmpeq_epi32(nz_flag1, zero);
89 :
90 0 : mask = _mm_packs_epi32(nz_flag0, nz_flag1);
91 0 : iscanIdx = _mm_loadu_si128((__m128i const *)iscan);
92 0 : iscanIdx = _mm_sub_epi16(iscanIdx, mask);
93 0 : iscanIdx = _mm_and_si128(iscanIdx, mask);
94 0 : *eob = _mm_max_epi16(*eob, iscanIdx);
95 0 : }
96 :
97 0 : static INLINE uint16_t get_accumulated_eob(__m128i *eob) {
98 : __m128i eob_shuffled;
99 : uint16_t eobValue;
100 0 : eob_shuffled = _mm_shuffle_epi32(*eob, 0xe);
101 0 : *eob = _mm_max_epi16(*eob, eob_shuffled);
102 0 : eob_shuffled = _mm_shufflelo_epi16(*eob, 0xe);
103 0 : *eob = _mm_max_epi16(*eob, eob_shuffled);
104 0 : eob_shuffled = _mm_shufflelo_epi16(*eob, 0x1);
105 0 : *eob = _mm_max_epi16(*eob, eob_shuffled);
106 0 : eobValue = _mm_extract_epi16(*eob, 0);
107 0 : return eobValue;
108 : }
109 :
110 0 : void av1_highbd_quantize_fp_sse4_1(
111 : const tran_low_t *coeff_ptr, intptr_t count, int skip_block,
112 : const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
113 : const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
114 : tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
115 : const int16_t *scan, const int16_t *iscan, int log_scale) {
116 : __m128i coeff[2], qcoeff[2], dequant[2], qparam[3], coeff_sign;
117 0 : __m128i eob = _mm_setzero_si128();
118 0 : const tran_low_t *src = coeff_ptr;
119 0 : tran_low_t *quanAddr = qcoeff_ptr;
120 0 : tran_low_t *dquanAddr = dqcoeff_ptr;
121 0 : const int shift = 16 - log_scale;
122 0 : const int coeff_stride = 4;
123 0 : const int quan_stride = coeff_stride;
124 : (void)skip_block;
125 : (void)zbin_ptr;
126 : (void)quant_shift_ptr;
127 : (void)scan;
128 :
129 0 : memset(quanAddr, 0, count * sizeof(quanAddr[0]));
130 0 : memset(dquanAddr, 0, count * sizeof(dquanAddr[0]));
131 :
132 0 : if (!skip_block) {
133 0 : coeff[0] = _mm_loadu_si128((__m128i const *)src);
134 :
135 0 : qparam[0] =
136 0 : _mm_set_epi32(round_ptr[1], round_ptr[1], round_ptr[1], round_ptr[0]);
137 0 : qparam[1] = _mm_set_epi64x(quant_ptr[1], quant_ptr[0]);
138 0 : qparam[2] = _mm_set_epi64x(dequant_ptr[1], dequant_ptr[0]);
139 :
140 : // DC and first 3 AC
141 0 : quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
142 : &coeff_sign);
143 :
144 : // update round/quan/dquan for AC
145 0 : qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]);
146 0 : qparam[1] = _mm_set_epi64x(quant_ptr[1], quant_ptr[1]);
147 0 : qparam[2] = _mm_set_epi64x(dequant_ptr[1], dequant_ptr[1]);
148 :
149 0 : quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
150 : log_scale, quanAddr, dquanAddr);
151 :
152 : // next 4 AC
153 0 : coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
154 0 : quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant,
155 : &coeff_sign);
156 0 : quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
157 0 : log_scale, quanAddr + quan_stride,
158 0 : dquanAddr + quan_stride);
159 :
160 0 : find_eob(quanAddr, iscan, &eob);
161 :
162 0 : count -= 8;
163 :
164 : // loop for the rest of AC
165 0 : while (count > 0) {
166 0 : src += coeff_stride << 1;
167 0 : quanAddr += quan_stride << 1;
168 0 : dquanAddr += quan_stride << 1;
169 0 : iscan += quan_stride << 1;
170 :
171 0 : coeff[0] = _mm_loadu_si128((__m128i const *)src);
172 0 : coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
173 :
174 0 : quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff,
175 : dequant, &coeff_sign);
176 0 : quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
177 : log_scale, quanAddr, dquanAddr);
178 :
179 0 : quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff,
180 : dequant, &coeff_sign);
181 0 : quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
182 0 : log_scale, quanAddr + quan_stride,
183 0 : dquanAddr + quan_stride);
184 :
185 0 : find_eob(quanAddr, iscan, &eob);
186 :
187 0 : count -= 8;
188 : }
189 0 : *eob_ptr = get_accumulated_eob(&eob);
190 : } else {
191 0 : *eob_ptr = 0;
192 : }
193 0 : }
|