LCOV - code coverage report
Current view: top level - media/webrtc/trunk/webrtc/modules/audio_processing/aec - aec_core_sse2.cc (source / functions) Hit Total Coverage
Test: output.info Lines: 0 385 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 15 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
       3             :  *
       4             :  *  Use of this source code is governed by a BSD-style license
       5             :  *  that can be found in the LICENSE file in the root of the source
       6             :  *  tree. An additional intellectual property rights grant can be found
       7             :  *  in the file PATENTS.  All contributing project authors may
       8             :  *  be found in the AUTHORS file in the root of the source tree.
       9             :  */
      10             : 
      11             : /*
      12             :  * The core AEC algorithm, SSE2 version of speed-critical functions.
      13             :  */
      14             : 
      15             : #include <emmintrin.h>
      16             : #include <math.h>
      17             : #include <string.h>  // memset
      18             : 
      19             : extern "C" {
      20             : #include "webrtc/common_audio/signal_processing/include/signal_processing_library.h"
      21             : }
      22             : #include "webrtc/modules/audio_processing/aec/aec_common.h"
      23             : #include "webrtc/modules/audio_processing/aec/aec_core_optimized_methods.h"
      24             : #include "webrtc/modules/audio_processing/utility/ooura_fft.h"
      25             : 
      26             : namespace webrtc {
      27             : 
      28           0 : __inline static float MulRe(float aRe, float aIm, float bRe, float bIm) {
      29           0 :   return aRe * bRe - aIm * bIm;
      30             : }
      31             : 
      32           0 : __inline static float MulIm(float aRe, float aIm, float bRe, float bIm) {
      33           0 :   return aRe * bIm + aIm * bRe;
      34             : }
      35             : 
      36           0 : static void FilterFarSSE2(int num_partitions,
      37             :                           int x_fft_buf_block_pos,
      38             :                           float x_fft_buf[2]
      39             :                                          [kExtendedNumPartitions * PART_LEN1],
      40             :                           float h_fft_buf[2]
      41             :                                          [kExtendedNumPartitions * PART_LEN1],
      42             :                           float y_fft[2][PART_LEN1]) {
      43             :   int i;
      44           0 :   for (i = 0; i < num_partitions; i++) {
      45             :     int j;
      46           0 :     int xPos = (i + x_fft_buf_block_pos) * PART_LEN1;
      47           0 :     int pos = i * PART_LEN1;
      48             :     // Check for wrap
      49           0 :     if (i + x_fft_buf_block_pos >= num_partitions) {
      50           0 :       xPos -= num_partitions * (PART_LEN1);
      51             :     }
      52             : 
      53             :     // vectorized code (four at once)
      54           0 :     for (j = 0; j + 3 < PART_LEN1; j += 4) {
      55           0 :       const __m128 x_fft_buf_re = _mm_loadu_ps(&x_fft_buf[0][xPos + j]);
      56           0 :       const __m128 x_fft_buf_im = _mm_loadu_ps(&x_fft_buf[1][xPos + j]);
      57           0 :       const __m128 h_fft_buf_re = _mm_loadu_ps(&h_fft_buf[0][pos + j]);
      58           0 :       const __m128 h_fft_buf_im = _mm_loadu_ps(&h_fft_buf[1][pos + j]);
      59           0 :       const __m128 y_fft_re = _mm_loadu_ps(&y_fft[0][j]);
      60           0 :       const __m128 y_fft_im = _mm_loadu_ps(&y_fft[1][j]);
      61           0 :       const __m128 a = _mm_mul_ps(x_fft_buf_re, h_fft_buf_re);
      62           0 :       const __m128 b = _mm_mul_ps(x_fft_buf_im, h_fft_buf_im);
      63           0 :       const __m128 c = _mm_mul_ps(x_fft_buf_re, h_fft_buf_im);
      64           0 :       const __m128 d = _mm_mul_ps(x_fft_buf_im, h_fft_buf_re);
      65           0 :       const __m128 e = _mm_sub_ps(a, b);
      66           0 :       const __m128 f = _mm_add_ps(c, d);
      67           0 :       const __m128 g = _mm_add_ps(y_fft_re, e);
      68           0 :       const __m128 h = _mm_add_ps(y_fft_im, f);
      69           0 :       _mm_storeu_ps(&y_fft[0][j], g);
      70           0 :       _mm_storeu_ps(&y_fft[1][j], h);
      71             :     }
      72             :     // scalar code for the remaining items.
      73           0 :     for (; j < PART_LEN1; j++) {
      74           0 :       y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j],
      75           0 :                            h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]);
      76           0 :       y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j],
      77           0 :                            h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]);
      78             :     }
      79             :   }
      80           0 : }
      81             : 
      82           0 : static void ScaleErrorSignalSSE2(float mu,
      83             :                                  float error_threshold,
      84             :                                  float x_pow[PART_LEN1],
      85             :                                  float ef[2][PART_LEN1]) {
      86           0 :   const __m128 k1e_10f = _mm_set1_ps(1e-10f);
      87           0 :   const __m128 kMu = _mm_set1_ps(mu);
      88           0 :   const __m128 kThresh = _mm_set1_ps(error_threshold);
      89             : 
      90             :   int i;
      91             :   // vectorized code (four at once)
      92           0 :   for (i = 0; i + 3 < PART_LEN1; i += 4) {
      93           0 :     const __m128 x_pow_local = _mm_loadu_ps(&x_pow[i]);
      94           0 :     const __m128 ef_re_base = _mm_loadu_ps(&ef[0][i]);
      95           0 :     const __m128 ef_im_base = _mm_loadu_ps(&ef[1][i]);
      96             : 
      97           0 :     const __m128 xPowPlus = _mm_add_ps(x_pow_local, k1e_10f);
      98           0 :     __m128 ef_re = _mm_div_ps(ef_re_base, xPowPlus);
      99           0 :     __m128 ef_im = _mm_div_ps(ef_im_base, xPowPlus);
     100           0 :     const __m128 ef_re2 = _mm_mul_ps(ef_re, ef_re);
     101           0 :     const __m128 ef_im2 = _mm_mul_ps(ef_im, ef_im);
     102           0 :     const __m128 ef_sum2 = _mm_add_ps(ef_re2, ef_im2);
     103           0 :     const __m128 absEf = _mm_sqrt_ps(ef_sum2);
     104           0 :     const __m128 bigger = _mm_cmpgt_ps(absEf, kThresh);
     105           0 :     __m128 absEfPlus = _mm_add_ps(absEf, k1e_10f);
     106           0 :     const __m128 absEfInv = _mm_div_ps(kThresh, absEfPlus);
     107           0 :     __m128 ef_re_if = _mm_mul_ps(ef_re, absEfInv);
     108           0 :     __m128 ef_im_if = _mm_mul_ps(ef_im, absEfInv);
     109           0 :     ef_re_if = _mm_and_ps(bigger, ef_re_if);
     110           0 :     ef_im_if = _mm_and_ps(bigger, ef_im_if);
     111           0 :     ef_re = _mm_andnot_ps(bigger, ef_re);
     112           0 :     ef_im = _mm_andnot_ps(bigger, ef_im);
     113           0 :     ef_re = _mm_or_ps(ef_re, ef_re_if);
     114           0 :     ef_im = _mm_or_ps(ef_im, ef_im_if);
     115           0 :     ef_re = _mm_mul_ps(ef_re, kMu);
     116           0 :     ef_im = _mm_mul_ps(ef_im, kMu);
     117             : 
     118           0 :     _mm_storeu_ps(&ef[0][i], ef_re);
     119           0 :     _mm_storeu_ps(&ef[1][i], ef_im);
     120             :   }
     121             :   // scalar code for the remaining items.
     122             :   {
     123           0 :     for (; i < (PART_LEN1); i++) {
     124             :       float abs_ef;
     125           0 :       ef[0][i] /= (x_pow[i] + 1e-10f);
     126           0 :       ef[1][i] /= (x_pow[i] + 1e-10f);
     127           0 :       abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);
     128             : 
     129           0 :       if (abs_ef > error_threshold) {
     130           0 :         abs_ef = error_threshold / (abs_ef + 1e-10f);
     131           0 :         ef[0][i] *= abs_ef;
     132           0 :         ef[1][i] *= abs_ef;
     133             :       }
     134             : 
     135             :       // Stepsize factor
     136           0 :       ef[0][i] *= mu;
     137           0 :       ef[1][i] *= mu;
     138             :     }
     139             :   }
     140           0 : }
     141             : 
     142           0 : static void FilterAdaptationSSE2(
     143             :     const OouraFft& ooura_fft,
     144             :     int num_partitions,
     145             :     int x_fft_buf_block_pos,
     146             :     float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
     147             :     float e_fft[2][PART_LEN1],
     148             :     float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) {
     149             :   float fft[PART_LEN2];
     150             :   int i, j;
     151           0 :   for (i = 0; i < num_partitions; i++) {
     152           0 :     int xPos = (i + x_fft_buf_block_pos) * (PART_LEN1);
     153           0 :     int pos = i * PART_LEN1;
     154             :     // Check for wrap
     155           0 :     if (i + x_fft_buf_block_pos >= num_partitions) {
     156           0 :       xPos -= num_partitions * PART_LEN1;
     157             :     }
     158             : 
     159             :     // Process the whole array...
     160           0 :     for (j = 0; j < PART_LEN; j += 4) {
     161             :       // Load x_fft_buf and e_fft.
     162           0 :       const __m128 x_fft_buf_re = _mm_loadu_ps(&x_fft_buf[0][xPos + j]);
     163           0 :       const __m128 x_fft_buf_im = _mm_loadu_ps(&x_fft_buf[1][xPos + j]);
     164           0 :       const __m128 e_fft_re = _mm_loadu_ps(&e_fft[0][j]);
     165           0 :       const __m128 e_fft_im = _mm_loadu_ps(&e_fft[1][j]);
     166             :       // Calculate the product of conjugate(x_fft_buf) by e_fft.
     167             :       //   re(conjugate(a) * b) = aRe * bRe + aIm * bIm
     168             :       //   im(conjugate(a) * b)=  aRe * bIm - aIm * bRe
     169           0 :       const __m128 a = _mm_mul_ps(x_fft_buf_re, e_fft_re);
     170           0 :       const __m128 b = _mm_mul_ps(x_fft_buf_im, e_fft_im);
     171           0 :       const __m128 c = _mm_mul_ps(x_fft_buf_re, e_fft_im);
     172           0 :       const __m128 d = _mm_mul_ps(x_fft_buf_im, e_fft_re);
     173           0 :       const __m128 e = _mm_add_ps(a, b);
     174           0 :       const __m128 f = _mm_sub_ps(c, d);
     175             :       // Interleave real and imaginary parts.
     176           0 :       const __m128 g = _mm_unpacklo_ps(e, f);
     177           0 :       const __m128 h = _mm_unpackhi_ps(e, f);
     178             :       // Store
     179           0 :       _mm_storeu_ps(&fft[2 * j + 0], g);
     180           0 :       _mm_storeu_ps(&fft[2 * j + 4], h);
     181             :     }
     182             :     // ... and fixup the first imaginary entry.
     183           0 :     fft[1] =
     184           0 :         MulRe(x_fft_buf[0][xPos + PART_LEN], -x_fft_buf[1][xPos + PART_LEN],
     185           0 :               e_fft[0][PART_LEN], e_fft[1][PART_LEN]);
     186             : 
     187           0 :     ooura_fft.InverseFft(fft);
     188           0 :     memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);
     189             : 
     190             :     // fft scaling
     191             :     {
     192           0 :       float scale = 2.0f / PART_LEN2;
     193           0 :       const __m128 scale_ps = _mm_load_ps1(&scale);
     194           0 :       for (j = 0; j < PART_LEN; j += 4) {
     195           0 :         const __m128 fft_ps = _mm_loadu_ps(&fft[j]);
     196           0 :         const __m128 fft_scale = _mm_mul_ps(fft_ps, scale_ps);
     197           0 :         _mm_storeu_ps(&fft[j], fft_scale);
     198             :       }
     199             :     }
     200           0 :     ooura_fft.Fft(fft);
     201             : 
     202             :     {
     203           0 :       float wt1 = h_fft_buf[1][pos];
     204           0 :       h_fft_buf[0][pos + PART_LEN] += fft[1];
     205           0 :       for (j = 0; j < PART_LEN; j += 4) {
     206           0 :         __m128 wtBuf_re = _mm_loadu_ps(&h_fft_buf[0][pos + j]);
     207           0 :         __m128 wtBuf_im = _mm_loadu_ps(&h_fft_buf[1][pos + j]);
     208           0 :         const __m128 fft0 = _mm_loadu_ps(&fft[2 * j + 0]);
     209           0 :         const __m128 fft4 = _mm_loadu_ps(&fft[2 * j + 4]);
     210             :         const __m128 fft_re =
     211           0 :             _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(2, 0, 2, 0));
     212             :         const __m128 fft_im =
     213           0 :             _mm_shuffle_ps(fft0, fft4, _MM_SHUFFLE(3, 1, 3, 1));
     214           0 :         wtBuf_re = _mm_add_ps(wtBuf_re, fft_re);
     215           0 :         wtBuf_im = _mm_add_ps(wtBuf_im, fft_im);
     216           0 :         _mm_storeu_ps(&h_fft_buf[0][pos + j], wtBuf_re);
     217           0 :         _mm_storeu_ps(&h_fft_buf[1][pos + j], wtBuf_im);
     218             :       }
     219           0 :       h_fft_buf[1][pos] = wt1;
     220             :     }
     221             :   }
     222           0 : }
     223             : 
     224           0 : static __m128 mm_pow_ps(__m128 a, __m128 b) {
     225             :   // a^b = exp2(b * log2(a))
     226             :   //   exp2(x) and log2(x) are calculated using polynomial approximations.
     227             :   __m128 log2_a, b_log2_a, a_exp_b;
     228             : 
     229             :   // Calculate log2(x), x = a.
     230             :   {
     231             :     // To calculate log2(x), we decompose x like this:
     232             :     //   x = y * 2^n
     233             :     //     n is an integer
     234             :     //     y is in the [1.0, 2.0) range
     235             :     //
     236             :     //   log2(x) = log2(y) + n
     237             :     //     n       can be evaluated by playing with float representation.
     238             :     //     log2(y) in a small range can be approximated, this code uses an order
     239             :     //             five polynomial approximation. The coefficients have been
     240             :     //             estimated with the Remez algorithm and the resulting
     241             :     //             polynomial has a maximum relative error of 0.00086%.
     242             : 
     243             :     // Compute n.
     244             :     //    This is done by masking the exponent, shifting it into the top bit of
     245             :     //    the mantissa, putting eight into the biased exponent (to shift/
     246             :     //    compensate the fact that the exponent has been shifted in the top/
     247             :     //    fractional part and finally getting rid of the implicit leading one
     248             :     //    from the mantissa by substracting it out.
     249             :     static const ALIGN16_BEG int float_exponent_mask[4] ALIGN16_END = {
     250             :         0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000};
     251             :     static const ALIGN16_BEG int eight_biased_exponent[4] ALIGN16_END = {
     252             :         0x43800000, 0x43800000, 0x43800000, 0x43800000};
     253             :     static const ALIGN16_BEG int implicit_leading_one[4] ALIGN16_END = {
     254             :         0x43BF8000, 0x43BF8000, 0x43BF8000, 0x43BF8000};
     255             :     static const int shift_exponent_into_top_mantissa = 8;
     256             :     const __m128 two_n =
     257           0 :         _mm_and_ps(a, *(reinterpret_cast<const __m128*>(float_exponent_mask)));
     258           0 :     const __m128 n_1 = _mm_castsi128_ps(_mm_srli_epi32(
     259           0 :         _mm_castps_si128(two_n), shift_exponent_into_top_mantissa));
     260             :     const __m128 n_0 =
     261           0 :       _mm_or_ps(n_1, *(reinterpret_cast<const __m128*>(eight_biased_exponent)));
     262             :     const __m128 n =
     263           0 :       _mm_sub_ps(n_0, *(reinterpret_cast<const __m128*>(implicit_leading_one)));
     264             : 
     265             :     // Compute y.
     266             :     static const ALIGN16_BEG int mantissa_mask[4] ALIGN16_END = {
     267             :         0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF};
     268             :     static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END = {
     269             :         0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000};
     270             :     const __m128 mantissa =
     271           0 :         _mm_and_ps(a, *(reinterpret_cast<const __m128*>(mantissa_mask)));
     272             :     const __m128 y =
     273           0 :         _mm_or_ps(mantissa,
     274           0 :                *(reinterpret_cast<const __m128*>(zero_biased_exponent_is_one)));
     275             : 
     276             :     // Approximate log2(y) ~= (y - 1) * pol5(y).
     277             :     //    pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
     278             :     static const ALIGN16_BEG float ALIGN16_END C5[4] = {
     279             :         -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f};
     280             :     static const ALIGN16_BEG float ALIGN16_END C4[4] = {
     281             :         3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f};
     282             :     static const ALIGN16_BEG float ALIGN16_END C3[4] = {
     283             :         -1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f};
     284             :     static const ALIGN16_BEG float ALIGN16_END C2[4] = {2.5988452f, 2.5988452f,
     285             :                                                         2.5988452f, 2.5988452f};
     286             :     static const ALIGN16_BEG float ALIGN16_END C1[4] = {
     287             :         -3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f};
     288             :     static const ALIGN16_BEG float ALIGN16_END C0[4] = {3.1157899f, 3.1157899f,
     289             :                                                         3.1157899f, 3.1157899f};
     290             :     const __m128 pol5_y_0 =
     291           0 :         _mm_mul_ps(y, *(reinterpret_cast<const __m128*>(C5)));
     292             :     const __m128 pol5_y_1 =
     293           0 :         _mm_add_ps(pol5_y_0, *(reinterpret_cast<const __m128*>(C4)));
     294           0 :     const __m128 pol5_y_2 = _mm_mul_ps(pol5_y_1, y);
     295             :     const __m128 pol5_y_3 =
     296           0 :         _mm_add_ps(pol5_y_2, *(reinterpret_cast<const __m128*>(C3)));
     297           0 :     const __m128 pol5_y_4 = _mm_mul_ps(pol5_y_3, y);
     298             :     const __m128 pol5_y_5 =
     299           0 :         _mm_add_ps(pol5_y_4, *(reinterpret_cast<const __m128*>(C2)));
     300           0 :     const __m128 pol5_y_6 = _mm_mul_ps(pol5_y_5, y);
     301             :     const __m128 pol5_y_7 =
     302           0 :         _mm_add_ps(pol5_y_6, *(reinterpret_cast<const __m128*>(C1)));
     303           0 :     const __m128 pol5_y_8 = _mm_mul_ps(pol5_y_7, y);
     304             :     const __m128 pol5_y =
     305           0 :         _mm_add_ps(pol5_y_8, *(reinterpret_cast<const __m128*>(C0)));
     306             :     const __m128 y_minus_one =
     307           0 :         _mm_sub_ps(y,
     308           0 :                *(reinterpret_cast<const __m128*>(zero_biased_exponent_is_one)));
     309           0 :     const __m128 log2_y = _mm_mul_ps(y_minus_one, pol5_y);
     310             : 
     311             :     // Combine parts.
     312           0 :     log2_a = _mm_add_ps(n, log2_y);
     313             :   }
     314             : 
     315             :   // b * log2(a)
     316           0 :   b_log2_a = _mm_mul_ps(b, log2_a);
     317             : 
     318             :   // Calculate exp2(x), x = b * log2(a).
     319             :   {
     320             :     // To calculate 2^x, we decompose x like this:
     321             :     //   x = n + y
     322             :     //     n is an integer, the value of x - 0.5 rounded down, therefore
     323             :     //     y is in the [0.5, 1.5) range
     324             :     //
     325             :     //   2^x = 2^n * 2^y
     326             :     //     2^n can be evaluated by playing with float representation.
     327             :     //     2^y in a small range can be approximated, this code uses an order two
     328             :     //         polynomial approximation. The coefficients have been estimated
     329             :     //         with the Remez algorithm and the resulting polynomial has a
     330             :     //         maximum relative error of 0.17%.
     331             : 
     332             :     // To avoid over/underflow, we reduce the range of input to ]-127, 129].
     333             :     static const ALIGN16_BEG float max_input[4] ALIGN16_END = {129.f, 129.f,
     334             :                                                                129.f, 129.f};
     335             :     static const ALIGN16_BEG float min_input[4] ALIGN16_END = {
     336             :         -126.99999f, -126.99999f, -126.99999f, -126.99999f};
     337             :     const __m128 x_min =
     338           0 :         _mm_min_ps(b_log2_a, *(reinterpret_cast<const __m128*>(max_input)));
     339             :     const __m128 x_max =
     340           0 :         _mm_max_ps(x_min, *(reinterpret_cast<const __m128*>(min_input)));
     341             :     // Compute n.
     342             :     static const ALIGN16_BEG float half[4] ALIGN16_END = {0.5f, 0.5f, 0.5f,
     343             :                                                           0.5f};
     344             :     const __m128 x_minus_half =
     345           0 :         _mm_sub_ps(x_max, *(reinterpret_cast<const __m128*>(half)));
     346           0 :     const __m128i x_minus_half_floor = _mm_cvtps_epi32(x_minus_half);
     347             :     // Compute 2^n.
     348             :     static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END = {
     349             :         127, 127, 127, 127};
     350             :     static const int float_exponent_shift = 23;
     351             :     const __m128i two_n_exponent =
     352           0 :         _mm_add_epi32(x_minus_half_floor,
     353           0 :                       *(reinterpret_cast<const __m128i*>(float_exponent_bias)));
     354             :     const __m128 two_n =
     355           0 :         _mm_castsi128_ps(_mm_slli_epi32(two_n_exponent, float_exponent_shift));
     356             :     // Compute y.
     357           0 :     const __m128 y = _mm_sub_ps(x_max, _mm_cvtepi32_ps(x_minus_half_floor));
     358             :     // Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
     359             :     static const ALIGN16_BEG float C2[4] ALIGN16_END = {
     360             :         3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f};
     361             :     static const ALIGN16_BEG float C1[4] ALIGN16_END = {
     362             :         6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f};
     363             :     static const ALIGN16_BEG float C0[4] ALIGN16_END = {1.0017247f, 1.0017247f,
     364             :                                                         1.0017247f, 1.0017247f};
     365             :     const __m128 exp2_y_0 =
     366           0 :         _mm_mul_ps(y, *(reinterpret_cast<const __m128*>(C2)));
     367             :     const __m128 exp2_y_1 =
     368           0 :         _mm_add_ps(exp2_y_0, *(reinterpret_cast<const __m128*>(C1)));
     369           0 :     const __m128 exp2_y_2 = _mm_mul_ps(exp2_y_1, y);
     370             :     const __m128 exp2_y =
     371           0 :         _mm_add_ps(exp2_y_2, *(reinterpret_cast<const __m128*>(C0)));
     372             : 
     373             :     // Combine parts.
     374           0 :     a_exp_b = _mm_mul_ps(exp2_y, two_n);
     375             :   }
     376           0 :   return a_exp_b;
     377             : }
     378             : 
     379           0 : static void OverdriveSSE2(float overdrive_scaling,
     380             :                           float hNlFb,
     381             :                           float hNl[PART_LEN1]) {
     382             :   int i;
     383           0 :   const __m128 vec_hNlFb = _mm_set1_ps(hNlFb);
     384           0 :   const __m128 vec_one = _mm_set1_ps(1.0f);
     385           0 :   const __m128 vec_overdrive_scaling = _mm_set1_ps(overdrive_scaling);
     386             :   // vectorized code (four at once)
     387           0 :   for (i = 0; i + 3 < PART_LEN1; i += 4) {
     388             :     // Weight subbands
     389           0 :     __m128 vec_hNl = _mm_loadu_ps(&hNl[i]);
     390           0 :     const __m128 vec_weightCurve = _mm_loadu_ps(&WebRtcAec_weightCurve[i]);
     391           0 :     const __m128 bigger = _mm_cmpgt_ps(vec_hNl, vec_hNlFb);
     392           0 :     const __m128 vec_weightCurve_hNlFb = _mm_mul_ps(vec_weightCurve, vec_hNlFb);
     393           0 :     const __m128 vec_one_weightCurve = _mm_sub_ps(vec_one, vec_weightCurve);
     394             :     const __m128 vec_one_weightCurve_hNl =
     395           0 :         _mm_mul_ps(vec_one_weightCurve, vec_hNl);
     396           0 :     const __m128 vec_if0 = _mm_andnot_ps(bigger, vec_hNl);
     397           0 :     const __m128 vec_if1 = _mm_and_ps(
     398           0 :         bigger, _mm_add_ps(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl));
     399           0 :     vec_hNl = _mm_or_ps(vec_if0, vec_if1);
     400             : 
     401             :     const __m128 vec_overDriveCurve =
     402           0 :         _mm_loadu_ps(&WebRtcAec_overDriveCurve[i]);
     403             :     const __m128 vec_overDriveSm_overDriveCurve =
     404           0 :         _mm_mul_ps(vec_overdrive_scaling, vec_overDriveCurve);
     405           0 :     vec_hNl = mm_pow_ps(vec_hNl, vec_overDriveSm_overDriveCurve);
     406           0 :     _mm_storeu_ps(&hNl[i], vec_hNl);
     407             :   }
     408             :   // scalar code for the remaining items.
     409           0 :   for (; i < PART_LEN1; i++) {
     410             :     // Weight subbands
     411           0 :     if (hNl[i] > hNlFb) {
     412           0 :       hNl[i] = WebRtcAec_weightCurve[i] * hNlFb +
     413           0 :                (1 - WebRtcAec_weightCurve[i]) * hNl[i];
     414             :     }
     415           0 :     hNl[i] = powf(hNl[i], overdrive_scaling * WebRtcAec_overDriveCurve[i]);
     416             :   }
     417           0 : }
     418             : 
     419           0 : static void SuppressSSE2(const float hNl[PART_LEN1], float efw[2][PART_LEN1]) {
     420             :   int i;
     421           0 :   const __m128 vec_minus_one = _mm_set1_ps(-1.0f);
     422             :   // vectorized code (four at once)
     423           0 :   for (i = 0; i + 3 < PART_LEN1; i += 4) {
     424             :     // Suppress error signal
     425           0 :     __m128 vec_hNl = _mm_loadu_ps(&hNl[i]);
     426           0 :     __m128 vec_efw_re = _mm_loadu_ps(&efw[0][i]);
     427           0 :     __m128 vec_efw_im = _mm_loadu_ps(&efw[1][i]);
     428           0 :     vec_efw_re = _mm_mul_ps(vec_efw_re, vec_hNl);
     429           0 :     vec_efw_im = _mm_mul_ps(vec_efw_im, vec_hNl);
     430             : 
     431             :     // Ooura fft returns incorrect sign on imaginary component. It matters
     432             :     // here because we are making an additive change with comfort noise.
     433           0 :     vec_efw_im = _mm_mul_ps(vec_efw_im, vec_minus_one);
     434           0 :     _mm_storeu_ps(&efw[0][i], vec_efw_re);
     435           0 :     _mm_storeu_ps(&efw[1][i], vec_efw_im);
     436             :   }
     437             :   // scalar code for the remaining items.
     438           0 :   for (; i < PART_LEN1; i++) {
     439             :     // Suppress error signal
     440           0 :     efw[0][i] *= hNl[i];
     441           0 :     efw[1][i] *= hNl[i];
     442             : 
     443             :     // Ooura fft returns incorrect sign on imaginary component. It matters
     444             :     // here because we are making an additive change with comfort noise.
     445           0 :     efw[1][i] *= -1;
     446             :   }
     447           0 : }
     448             : 
     449           0 : __inline static void _mm_add_ps_4x1(__m128 sum, float* dst) {
     450             :   // A+B C+D
     451           0 :   sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(0, 0, 3, 2)));
     452             :   // A+B+C+D A+B+C+D
     453           0 :   sum = _mm_add_ps(sum, _mm_shuffle_ps(sum, sum, _MM_SHUFFLE(1, 1, 1, 1)));
     454             :   _mm_store_ss(dst, sum);
     455           0 : }
     456             : 
     457           0 : static int PartitionDelaySSE2(
     458             :     int num_partitions,
     459             :     float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) {
     460             :   // Measures the energy in each filter partition and returns the partition with
     461             :   // highest energy.
     462             :   // TODO(bjornv): Spread computational cost by computing one partition per
     463             :   // block?
     464           0 :   float wfEnMax = 0;
     465             :   int i;
     466           0 :   int delay = 0;
     467             : 
     468           0 :   for (i = 0; i < num_partitions; i++) {
     469             :     int j;
     470           0 :     int pos = i * PART_LEN1;
     471           0 :     float wfEn = 0;
     472           0 :     __m128 vec_wfEn = _mm_set1_ps(0.0f);
     473             :     // vectorized code (four at once)
     474           0 :     for (j = 0; j + 3 < PART_LEN1; j += 4) {
     475           0 :       const __m128 vec_wfBuf0 = _mm_loadu_ps(&h_fft_buf[0][pos + j]);
     476           0 :       const __m128 vec_wfBuf1 = _mm_loadu_ps(&h_fft_buf[1][pos + j]);
     477           0 :       vec_wfEn = _mm_add_ps(vec_wfEn, _mm_mul_ps(vec_wfBuf0, vec_wfBuf0));
     478           0 :       vec_wfEn = _mm_add_ps(vec_wfEn, _mm_mul_ps(vec_wfBuf1, vec_wfBuf1));
     479             :     }
     480           0 :     _mm_add_ps_4x1(vec_wfEn, &wfEn);
     481             : 
     482             :     // scalar code for the remaining items.
     483           0 :     for (; j < PART_LEN1; j++) {
     484           0 :       wfEn += h_fft_buf[0][pos + j] * h_fft_buf[0][pos + j] +
     485           0 :               h_fft_buf[1][pos + j] * h_fft_buf[1][pos + j];
     486             :     }
     487             : 
     488           0 :     if (wfEn > wfEnMax) {
     489           0 :       wfEnMax = wfEn;
     490           0 :       delay = i;
     491             :     }
     492             :   }
     493           0 :   return delay;
     494             : }
     495             : 
     496             : // Updates the following smoothed  Power Spectral Densities (PSD):
     497             : //  - sd  : near-end
     498             : //  - se  : residual echo
     499             : //  - sx  : far-end
     500             : //  - sde : cross-PSD of near-end and residual echo
     501             : //  - sxd : cross-PSD of near-end and far-end
     502             : //
     503             : // In addition to updating the PSDs, also the filter diverge state is determined
     504             : // upon actions are taken.
     505           0 : static void UpdateCoherenceSpectraSSE2(int mult,
     506             :                                        bool extended_filter_enabled,
     507             :                                        float efw[2][PART_LEN1],
     508             :                                        float dfw[2][PART_LEN1],
     509             :                                        float xfw[2][PART_LEN1],
     510             :                                        CoherenceState* coherence_state,
     511             :                                        short* filter_divergence_state,
     512             :                                        int* extreme_filter_divergence) {
     513             :   // Power estimate smoothing coefficients.
     514             :   const float* ptrGCoh =
     515             :       extended_filter_enabled
     516           0 :           ? WebRtcAec_kExtendedSmoothingCoefficients[mult - 1]
     517           0 :           : WebRtcAec_kNormalSmoothingCoefficients[mult - 1];
     518             :   int i;
     519           0 :   float sdSum = 0, seSum = 0;
     520           0 :   const __m128 vec_15 = _mm_set1_ps(WebRtcAec_kMinFarendPSD);
     521           0 :   const __m128 vec_GCoh0 = _mm_set1_ps(ptrGCoh[0]);
     522           0 :   const __m128 vec_GCoh1 = _mm_set1_ps(ptrGCoh[1]);
     523           0 :   __m128 vec_sdSum = _mm_set1_ps(0.0f);
     524           0 :   __m128 vec_seSum = _mm_set1_ps(0.0f);
     525             : 
     526           0 :   for (i = 0; i + 3 < PART_LEN1; i += 4) {
     527           0 :     const __m128 vec_dfw0 = _mm_loadu_ps(&dfw[0][i]);
     528           0 :     const __m128 vec_dfw1 = _mm_loadu_ps(&dfw[1][i]);
     529           0 :     const __m128 vec_efw0 = _mm_loadu_ps(&efw[0][i]);
     530           0 :     const __m128 vec_efw1 = _mm_loadu_ps(&efw[1][i]);
     531           0 :     const __m128 vec_xfw0 = _mm_loadu_ps(&xfw[0][i]);
     532           0 :     const __m128 vec_xfw1 = _mm_loadu_ps(&xfw[1][i]);
     533             :     __m128 vec_sd =
     534           0 :         _mm_mul_ps(_mm_loadu_ps(&coherence_state->sd[i]), vec_GCoh0);
     535             :     __m128 vec_se =
     536           0 :         _mm_mul_ps(_mm_loadu_ps(&coherence_state->se[i]), vec_GCoh0);
     537             :     __m128 vec_sx =
     538           0 :         _mm_mul_ps(_mm_loadu_ps(&coherence_state->sx[i]), vec_GCoh0);
     539           0 :     __m128 vec_dfw_sumsq = _mm_mul_ps(vec_dfw0, vec_dfw0);
     540           0 :     __m128 vec_efw_sumsq = _mm_mul_ps(vec_efw0, vec_efw0);
     541           0 :     __m128 vec_xfw_sumsq = _mm_mul_ps(vec_xfw0, vec_xfw0);
     542           0 :     vec_dfw_sumsq = _mm_add_ps(vec_dfw_sumsq, _mm_mul_ps(vec_dfw1, vec_dfw1));
     543           0 :     vec_efw_sumsq = _mm_add_ps(vec_efw_sumsq, _mm_mul_ps(vec_efw1, vec_efw1));
     544           0 :     vec_xfw_sumsq = _mm_add_ps(vec_xfw_sumsq, _mm_mul_ps(vec_xfw1, vec_xfw1));
     545           0 :     vec_xfw_sumsq = _mm_max_ps(vec_xfw_sumsq, vec_15);
     546           0 :     vec_sd = _mm_add_ps(vec_sd, _mm_mul_ps(vec_dfw_sumsq, vec_GCoh1));
     547           0 :     vec_se = _mm_add_ps(vec_se, _mm_mul_ps(vec_efw_sumsq, vec_GCoh1));
     548           0 :     vec_sx = _mm_add_ps(vec_sx, _mm_mul_ps(vec_xfw_sumsq, vec_GCoh1));
     549           0 :     _mm_storeu_ps(&coherence_state->sd[i], vec_sd);
     550           0 :     _mm_storeu_ps(&coherence_state->se[i], vec_se);
     551           0 :     _mm_storeu_ps(&coherence_state->sx[i], vec_sx);
     552             : 
     553             :     {
     554           0 :       const __m128 vec_3210 = _mm_loadu_ps(&coherence_state->sde[i][0]);
     555           0 :       const __m128 vec_7654 = _mm_loadu_ps(&coherence_state->sde[i + 2][0]);
     556             :       __m128 vec_a =
     557           0 :           _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(2, 0, 2, 0));
     558             :       __m128 vec_b =
     559           0 :           _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(3, 1, 3, 1));
     560           0 :       __m128 vec_dfwefw0011 = _mm_mul_ps(vec_dfw0, vec_efw0);
     561           0 :       __m128 vec_dfwefw0110 = _mm_mul_ps(vec_dfw0, vec_efw1);
     562           0 :       vec_a = _mm_mul_ps(vec_a, vec_GCoh0);
     563           0 :       vec_b = _mm_mul_ps(vec_b, vec_GCoh0);
     564             :       vec_dfwefw0011 =
     565           0 :           _mm_add_ps(vec_dfwefw0011, _mm_mul_ps(vec_dfw1, vec_efw1));
     566             :       vec_dfwefw0110 =
     567           0 :           _mm_sub_ps(vec_dfwefw0110, _mm_mul_ps(vec_dfw1, vec_efw0));
     568           0 :       vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwefw0011, vec_GCoh1));
     569           0 :       vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwefw0110, vec_GCoh1));
     570           0 :       _mm_storeu_ps(&coherence_state->sde[i][0], _mm_unpacklo_ps(vec_a, vec_b));
     571           0 :       _mm_storeu_ps(&coherence_state->sde[i + 2][0],
     572             :                     _mm_unpackhi_ps(vec_a, vec_b));
     573             :     }
     574             : 
     575             :     {
     576           0 :       const __m128 vec_3210 = _mm_loadu_ps(&coherence_state->sxd[i][0]);
     577           0 :       const __m128 vec_7654 = _mm_loadu_ps(&coherence_state->sxd[i + 2][0]);
     578             :       __m128 vec_a =
     579           0 :           _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(2, 0, 2, 0));
     580             :       __m128 vec_b =
     581           0 :           _mm_shuffle_ps(vec_3210, vec_7654, _MM_SHUFFLE(3, 1, 3, 1));
     582           0 :       __m128 vec_dfwxfw0011 = _mm_mul_ps(vec_dfw0, vec_xfw0);
     583           0 :       __m128 vec_dfwxfw0110 = _mm_mul_ps(vec_dfw0, vec_xfw1);
     584           0 :       vec_a = _mm_mul_ps(vec_a, vec_GCoh0);
     585           0 :       vec_b = _mm_mul_ps(vec_b, vec_GCoh0);
     586             :       vec_dfwxfw0011 =
     587           0 :           _mm_add_ps(vec_dfwxfw0011, _mm_mul_ps(vec_dfw1, vec_xfw1));
     588             :       vec_dfwxfw0110 =
     589           0 :           _mm_sub_ps(vec_dfwxfw0110, _mm_mul_ps(vec_dfw1, vec_xfw0));
     590           0 :       vec_a = _mm_add_ps(vec_a, _mm_mul_ps(vec_dfwxfw0011, vec_GCoh1));
     591           0 :       vec_b = _mm_add_ps(vec_b, _mm_mul_ps(vec_dfwxfw0110, vec_GCoh1));
     592           0 :       _mm_storeu_ps(&coherence_state->sxd[i][0], _mm_unpacklo_ps(vec_a, vec_b));
     593           0 :       _mm_storeu_ps(&coherence_state->sxd[i + 2][0],
     594             :                     _mm_unpackhi_ps(vec_a, vec_b));
     595             :     }
     596             : 
     597           0 :     vec_sdSum = _mm_add_ps(vec_sdSum, vec_sd);
     598           0 :     vec_seSum = _mm_add_ps(vec_seSum, vec_se);
     599             :   }
     600             : 
     601           0 :   _mm_add_ps_4x1(vec_sdSum, &sdSum);
     602           0 :   _mm_add_ps_4x1(vec_seSum, &seSum);
     603             : 
     604           0 :   for (; i < PART_LEN1; i++) {
     605           0 :     coherence_state->sd[i] =
     606           0 :         ptrGCoh[0] * coherence_state->sd[i] +
     607           0 :         ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]);
     608           0 :     coherence_state->se[i] =
     609           0 :         ptrGCoh[0] * coherence_state->se[i] +
     610           0 :         ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]);
     611             :     // We threshold here to protect against the ill-effects of a zero farend.
     612             :     // The threshold is not arbitrarily chosen, but balances protection and
     613             :     // adverse interaction with the algorithm's tuning.
     614             :     // TODO(bjornv): investigate further why this is so sensitive.
     615           0 :     coherence_state->sx[i] =
     616           0 :         ptrGCoh[0] * coherence_state->sx[i] +
     617           0 :         ptrGCoh[1] *
     618           0 :             WEBRTC_SPL_MAX(xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i],
     619             :                            WebRtcAec_kMinFarendPSD);
     620             : 
     621           0 :     coherence_state->sde[i][0] =
     622           0 :         ptrGCoh[0] * coherence_state->sde[i][0] +
     623           0 :         ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]);
     624           0 :     coherence_state->sde[i][1] =
     625           0 :         ptrGCoh[0] * coherence_state->sde[i][1] +
     626           0 :         ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]);
     627             : 
     628           0 :     coherence_state->sxd[i][0] =
     629           0 :         ptrGCoh[0] * coherence_state->sxd[i][0] +
     630           0 :         ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]);
     631           0 :     coherence_state->sxd[i][1] =
     632           0 :         ptrGCoh[0] * coherence_state->sxd[i][1] +
     633           0 :         ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]);
     634             : 
     635           0 :     sdSum += coherence_state->sd[i];
     636           0 :     seSum += coherence_state->se[i];
     637             :   }
     638             : 
     639             :   // Divergent filter safeguard update.
     640           0 :   *filter_divergence_state =
     641           0 :       (*filter_divergence_state ? 1.05f : 1.0f) * seSum > sdSum;
     642             : 
     643             :   // Signal extreme filter divergence if the error is significantly larger
     644             :   // than the nearend (13 dB).
     645           0 :   *extreme_filter_divergence = (seSum > (19.95f * sdSum));
     646           0 : }
     647             : 
     648             : // Window time domain data to be used by the fft.
     649           0 : static void WindowDataSSE2(float* x_windowed, const float* x) {
     650             :   int i;
     651           0 :   for (i = 0; i < PART_LEN; i += 4) {
     652           0 :     const __m128 vec_Buf1 = _mm_loadu_ps(&x[i]);
     653           0 :     const __m128 vec_Buf2 = _mm_loadu_ps(&x[PART_LEN + i]);
     654           0 :     const __m128 vec_sqrtHanning = _mm_load_ps(&WebRtcAec_sqrtHanning[i]);
     655             :     // A B C D
     656             :     __m128 vec_sqrtHanning_rev =
     657           0 :         _mm_loadu_ps(&WebRtcAec_sqrtHanning[PART_LEN - i - 3]);
     658             :     // D C B A
     659             :     vec_sqrtHanning_rev = _mm_shuffle_ps(
     660           0 :         vec_sqrtHanning_rev, vec_sqrtHanning_rev, _MM_SHUFFLE(0, 1, 2, 3));
     661           0 :     _mm_storeu_ps(&x_windowed[i], _mm_mul_ps(vec_Buf1, vec_sqrtHanning));
     662           0 :     _mm_storeu_ps(&x_windowed[PART_LEN + i],
     663             :                   _mm_mul_ps(vec_Buf2, vec_sqrtHanning_rev));
     664             :   }
     665           0 : }
     666             : 
     667             : // Puts fft output data into a complex valued array.
     668           0 : static void StoreAsComplexSSE2(const float* data,
     669             :                                float data_complex[2][PART_LEN1]) {
     670             :   int i;
     671           0 :   for (i = 0; i < PART_LEN; i += 4) {
     672           0 :     const __m128 vec_fft0 = _mm_loadu_ps(&data[2 * i]);
     673           0 :     const __m128 vec_fft4 = _mm_loadu_ps(&data[2 * i + 4]);
     674             :     const __m128 vec_a =
     675           0 :         _mm_shuffle_ps(vec_fft0, vec_fft4, _MM_SHUFFLE(2, 0, 2, 0));
     676             :     const __m128 vec_b =
     677           0 :         _mm_shuffle_ps(vec_fft0, vec_fft4, _MM_SHUFFLE(3, 1, 3, 1));
     678           0 :     _mm_storeu_ps(&data_complex[0][i], vec_a);
     679           0 :     _mm_storeu_ps(&data_complex[1][i], vec_b);
     680             :   }
     681             :   // fix beginning/end values
     682           0 :   data_complex[1][0] = 0;
     683           0 :   data_complex[1][PART_LEN] = 0;
     684           0 :   data_complex[0][0] = data[0];
     685           0 :   data_complex[0][PART_LEN] = data[1];
     686           0 : }
     687             : 
     688           0 : static void ComputeCoherenceSSE2(const CoherenceState* coherence_state,
     689             :                                  float* cohde,
     690             :                                  float* cohxd) {
     691             :   int i;
     692             : 
     693             :   {
     694           0 :     const __m128 vec_1eminus10 = _mm_set1_ps(1e-10f);
     695             : 
     696             :     // Subband coherence
     697           0 :     for (i = 0; i + 3 < PART_LEN1; i += 4) {
     698           0 :       const __m128 vec_sd = _mm_loadu_ps(&coherence_state->sd[i]);
     699           0 :       const __m128 vec_se = _mm_loadu_ps(&coherence_state->se[i]);
     700           0 :       const __m128 vec_sx = _mm_loadu_ps(&coherence_state->sx[i]);
     701             :       const __m128 vec_sdse =
     702           0 :           _mm_add_ps(vec_1eminus10, _mm_mul_ps(vec_sd, vec_se));
     703             :       const __m128 vec_sdsx =
     704           0 :           _mm_add_ps(vec_1eminus10, _mm_mul_ps(vec_sd, vec_sx));
     705           0 :       const __m128 vec_sde_3210 = _mm_loadu_ps(&coherence_state->sde[i][0]);
     706           0 :       const __m128 vec_sde_7654 = _mm_loadu_ps(&coherence_state->sde[i + 2][0]);
     707           0 :       const __m128 vec_sxd_3210 = _mm_loadu_ps(&coherence_state->sxd[i][0]);
     708           0 :       const __m128 vec_sxd_7654 = _mm_loadu_ps(&coherence_state->sxd[i + 2][0]);
     709             :       const __m128 vec_sde_0 =
     710           0 :           _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, _MM_SHUFFLE(2, 0, 2, 0));
     711             :       const __m128 vec_sde_1 =
     712           0 :           _mm_shuffle_ps(vec_sde_3210, vec_sde_7654, _MM_SHUFFLE(3, 1, 3, 1));
     713             :       const __m128 vec_sxd_0 =
     714           0 :           _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, _MM_SHUFFLE(2, 0, 2, 0));
     715             :       const __m128 vec_sxd_1 =
     716           0 :           _mm_shuffle_ps(vec_sxd_3210, vec_sxd_7654, _MM_SHUFFLE(3, 1, 3, 1));
     717           0 :       __m128 vec_cohde = _mm_mul_ps(vec_sde_0, vec_sde_0);
     718           0 :       __m128 vec_cohxd = _mm_mul_ps(vec_sxd_0, vec_sxd_0);
     719           0 :       vec_cohde = _mm_add_ps(vec_cohde, _mm_mul_ps(vec_sde_1, vec_sde_1));
     720           0 :       vec_cohde = _mm_div_ps(vec_cohde, vec_sdse);
     721           0 :       vec_cohxd = _mm_add_ps(vec_cohxd, _mm_mul_ps(vec_sxd_1, vec_sxd_1));
     722           0 :       vec_cohxd = _mm_div_ps(vec_cohxd, vec_sdsx);
     723           0 :       _mm_storeu_ps(&cohde[i], vec_cohde);
     724           0 :       _mm_storeu_ps(&cohxd[i], vec_cohxd);
     725             :     }
     726             : 
     727             :     // scalar code for the remaining items.
     728           0 :     for (; i < PART_LEN1; i++) {
     729           0 :       cohde[i] = (coherence_state->sde[i][0] * coherence_state->sde[i][0] +
     730           0 :                   coherence_state->sde[i][1] * coherence_state->sde[i][1]) /
     731           0 :                  (coherence_state->sd[i] * coherence_state->se[i] + 1e-10f);
     732           0 :       cohxd[i] = (coherence_state->sxd[i][0] * coherence_state->sxd[i][0] +
     733           0 :                   coherence_state->sxd[i][1] * coherence_state->sxd[i][1]) /
     734           0 :                  (coherence_state->sx[i] * coherence_state->sd[i] + 1e-10f);
     735             :     }
     736             :   }
     737           0 : }
     738             : 
     739           0 : void WebRtcAec_InitAec_SSE2(void) {
     740           0 :   WebRtcAec_FilterFar = FilterFarSSE2;
     741           0 :   WebRtcAec_ScaleErrorSignal = ScaleErrorSignalSSE2;
     742           0 :   WebRtcAec_FilterAdaptation = FilterAdaptationSSE2;
     743           0 :   WebRtcAec_Overdrive = OverdriveSSE2;
     744           0 :   WebRtcAec_Suppress = SuppressSSE2;
     745           0 :   WebRtcAec_ComputeCoherence = ComputeCoherenceSSE2;
     746           0 :   WebRtcAec_UpdateCoherenceSpectra = UpdateCoherenceSpectraSSE2;
     747           0 :   WebRtcAec_StoreAsComplex = StoreAsComplexSSE2;
     748           0 :   WebRtcAec_PartitionDelay = PartitionDelaySSE2;
     749           0 :   WebRtcAec_WindowData = WindowDataSSE2;
     750           0 : }
     751             : }  // namespace webrtc

Generated by: LCOV version 1.13