Line data Source code
1 : /* Copyright (c) 2014, Cisco Systems, INC
2 : Written by XiangMingZhu WeiZhou MinPeng YanWang
3 :
4 : Redistribution and use in source and binary forms, with or without
5 : modification, are permitted provided that the following conditions
6 : are met:
7 :
8 : - Redistributions of source code must retain the above copyright
9 : notice, this list of conditions and the following disclaimer.
10 :
11 : - Redistributions in binary form must reproduce the above copyright
12 : notice, this list of conditions and the following disclaimer in the
13 : documentation and/or other materials provided with the distribution.
14 :
15 : THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
16 : ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
17 : LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
18 : A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
19 : OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 : EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 : PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 : PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
23 : LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 : NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 : SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 : */
27 :
28 : #ifdef HAVE_CONFIG_H
29 : #include "config.h"
30 : #endif
31 :
32 : #include <xmmintrin.h>
33 : #include <emmintrin.h>
34 : #include <smmintrin.h>
35 : #include "main.h"
36 : #include "celt/x86/x86cpu.h"
37 :
38 : /* Entropy constrained matrix-weighted VQ, hard-coded to 5-element vectors, for a single input data vector (SSE4.1 intrinsics version). Evaluates a cost for each of the L codebook vectors and reports the cheapest one through ind, rate_dist_Q14 and gain_Q7. If L <= 0 the loop never runs: *rate_dist_Q14 is set to silk_int32_MAX and *ind / *gain_Q7 are left unwritten. */
39 0 : void silk_VQ_WMat_EC_sse4_1(
40 : opus_int8 *ind, /* O index of best codebook vector */
41 : opus_int32 *rate_dist_Q14, /* O best weighted quant error + mu * rate */
42 : opus_int *gain_Q7, /* O sum of absolute LTP coefficients (the cb_gain_Q7 entry of the chosen vector) */
43 : const opus_int16 *in_Q14, /* I input vector to be quantized */
44 : const opus_int32 *W_Q18, /* I weighting matrix; indexed as a flat 5x5 array, only indices 0-4, 6-9, 12-14, 18-19 and 24 are read */
45 : const opus_int8 *cb_Q7, /* I codebook; consecutive vectors are LTP_ORDER entries apart */
46 : const opus_uint8 *cb_gain_Q7, /* I codebook effective gain, one entry per codebook vector */
47 : const opus_uint8 *cl_Q5, /* I code length for each codebook vector */
48 : const opus_int mu_Q9, /* I tradeoff betw. weighted error and rate */
49 : const opus_int32 max_gain_Q7, /* I maximum sum of absolute LTP coefficients */
50 : opus_int L /* I number of vectors in codebook */
51 : )
52 : {
53 : opus_int k, gain_tmp_Q7;
54 : const opus_int8 *cb_row_Q7;
55 : opus_int16 diff_Q14[ 5 ];
56 : opus_int32 sum1_Q14, sum2_Q16;
57 : /* SSE working registers used for the difference vector and the first row of W_Q18 */
58 : __m128i C_tmp1, C_tmp2, C_tmp3, C_tmp4, C_tmp5;
59 : /* Loop over codebook; the best cost starts at the maximum so any candidate with a strictly smaller cost replaces it */
60 0 : *rate_dist_Q14 = silk_int32_MAX;
61 0 : cb_row_Q7 = cb_Q7;
62 0 : for( k = 0; k < L; k++ ) {
63 0 : gain_tmp_Q7 = cb_gain_Q7[k];
64 : /* Element 0 of the quantization error in scalar code: input minus codebook entry shifted left by 7 */
65 0 : diff_Q14[ 0 ] = in_Q14[ 0 ] - silk_LSHIFT( cb_row_Q7[ 0 ], 7 );
66 : /* Elements 1..4 with SSE. The OP_CVT* macros come from celt/x86/x86cpu.h (not visible here); presumably they sign-extend four int16 / four int8 values into 32-bit lanes - confirm */
67 0 : C_tmp1 = OP_CVTEPI16_EPI32_M64( &in_Q14[ 1 ] );
68 0 : C_tmp2 = OP_CVTEPI8_EPI32_M32( &cb_row_Q7[ 1 ] );
69 0 : C_tmp2 = _mm_slli_epi32( C_tmp2, 7 ); /* same left shift by 7 as the scalar element */
70 0 : C_tmp1 = _mm_sub_epi32( C_tmp1, C_tmp2 ); /* C_tmp1 = 32-bit differences for elements 1..4 */
71 : /* Store the low 16 bits of each 32-bit lane (16-bit word positions 0, 2, 4, 6) for the scalar rows below */
72 0 : diff_Q14[ 1 ] = _mm_extract_epi16( C_tmp1, 0 );
73 0 : diff_Q14[ 2 ] = _mm_extract_epi16( C_tmp1, 2 );
74 0 : diff_Q14[ 3 ] = _mm_extract_epi16( C_tmp1, 4 );
75 0 : diff_Q14[ 4 ] = _mm_extract_epi16( C_tmp1, 6 );
76 :
77 : /* Weighted rate: product of mu_Q9 and the code length of vector k */
78 0 : sum1_Q14 = silk_SMULBB( mu_Q9, cl_Q5[ k ] );
79 :
80 : /* Penalty for too large gain: the excess of this vector's gain over max_gain_Q7, clamped at 0, is added with a shift of 10 */
81 0 : sum1_Q14 = silk_ADD_LSHIFT32( sum1_Q14, silk_max( silk_SUB32( gain_tmp_Q7, max_gain_Q7 ), 0 ), 10 );
82 :
83 0 : silk_assert( sum1_Q14 >= 0 );
84 : /* NOTE(review): the SSE code for the first row multiplies the full 32-bit difference lanes in C_tmp1, while the remaining rows use the 16-bit values stored in diff_Q14; the two agree only when every difference fits in 16 bits - confirm against the callers' input range */
85 : /* first row of W_Q18: W_Q18[ 1..4 ] times elements 1..4 of the difference with SSE; W_Q18[ 0 ] is added in scalar code afterwards */
86 0 : C_tmp3 = _mm_loadu_si128( (__m128i *)(&W_Q18[ 1 ] ) ); /* unaligned load of W_Q18[ 1..4 ] */
87 0 : C_tmp4 = _mm_mul_epi32( C_tmp3, C_tmp1 ); /* signed 64-bit products of 32-bit lanes 0 and 2 */
88 0 : C_tmp4 = _mm_srli_si128( C_tmp4, 2 ); /* whole-register right shift by 2 bytes: 32-bit lanes 0 and 2 now hold bits 16..47 of each product */
89 :
90 0 : C_tmp1 = _mm_shuffle_epi32( C_tmp1, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* rotate 32-bit lanes: lane i takes old lane i+1, lane 3 takes old lane 0 */
91 0 : C_tmp3 = _mm_shuffle_epi32( C_tmp3, _MM_SHUFFLE( 0, 3, 2, 1 ) ); /* same rotation, so former lanes 1 and 3 are now in lanes 0 and 2 */
92 :
93 0 : C_tmp5 = _mm_mul_epi32( C_tmp3, C_tmp1 ); /* products for the former lanes 1 and 3 */
94 0 : C_tmp5 = _mm_srli_si128( C_tmp5, 2 ); /* same 2-byte shift as above */
95 :
96 0 : C_tmp5 = _mm_add_epi32( C_tmp4, C_tmp5 ); /* lane 0 = terms 1+2, lane 2 = terms 3+4 */
97 0 : C_tmp5 = _mm_slli_epi32( C_tmp5, 1 ); /* double the sums, matching the silk_LSHIFT( sum2_Q16, 1 ) of the scalar rows */
98 :
99 0 : C_tmp5 = _mm_add_epi32( C_tmp5, _mm_shuffle_epi32( C_tmp5, _MM_SHUFFLE( 0, 0, 0, 2 ) ) ); /* lane 0 += lane 2 */
100 0 : sum2_Q16 = _mm_cvtsi128_si32( C_tmp5 ); /* take lane 0 as the scalar result */
101 : /* Add the W_Q18[ 0 ] term (not doubled), then accumulate the row into the total cost weighted by diff_Q14[ 0 ] */
102 0 : sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 0 ], diff_Q14[ 0 ] );
103 0 : sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 0 ] );
104 :
105 : /* second row of W_Q18: W_Q18[ 7..9 ] terms are doubled, then the W_Q18[ 6 ] term is added undoubled */
106 0 : sum2_Q16 = silk_SMULWB( W_Q18[ 7 ], diff_Q14[ 2 ] );
107 0 : sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 8 ], diff_Q14[ 3 ] );
108 0 : sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 9 ], diff_Q14[ 4 ] );
109 0 : sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
110 0 : sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 6 ], diff_Q14[ 1 ] );
111 0 : sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 1 ] );
112 :
113 : /* third row of W_Q18: W_Q18[ 13..14 ] terms doubled, W_Q18[ 12 ] term added undoubled */
114 0 : sum2_Q16 = silk_SMULWB( W_Q18[ 13 ], diff_Q14[ 3 ] );
115 0 : sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 14 ], diff_Q14[ 4 ] );
116 0 : sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
117 0 : sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 12 ], diff_Q14[ 2 ] );
118 0 : sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 2 ] );
119 :
120 : /* fourth row of W_Q18: W_Q18[ 19 ] term doubled, W_Q18[ 18 ] term added undoubled */
121 0 : sum2_Q16 = silk_SMULWB( W_Q18[ 19 ], diff_Q14[ 4 ] );
122 0 : sum2_Q16 = silk_LSHIFT( sum2_Q16, 1 );
123 0 : sum2_Q16 = silk_SMLAWB( sum2_Q16, W_Q18[ 18 ], diff_Q14[ 3 ] );
124 0 : sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 3 ] );
125 :
126 : /* last row of W_Q18: only the W_Q18[ 24 ] term remains */
127 0 : sum2_Q16 = silk_SMULWB( W_Q18[ 24 ], diff_Q14[ 4 ] );
128 0 : sum1_Q14 = silk_SMLAWB( sum1_Q14, sum2_Q16, diff_Q14[ 4 ] );
129 :
130 0 : silk_assert( sum1_Q14 >= 0 );
131 :
132 : /* find best; the strict '<' keeps the earliest codebook vector when costs tie */
133 0 : if( sum1_Q14 < *rate_dist_Q14 ) {
134 0 : *rate_dist_Q14 = sum1_Q14;
135 0 : *ind = (opus_int8)k;
136 0 : *gain_Q7 = gain_tmp_Q7;
137 : }
138 :
139 : /* Go to next cbk vector (advance by LTP_ORDER entries) */
140 0 : cb_row_Q7 += LTP_ORDER;
141 : }
142 0 : }
|