LCOV - code coverage report
Current view: top level - media/webrtc/trunk/webrtc/modules/audio_processing/utility - ooura_fft_sse2.cc (source / functions) Hit Total Coverage
Test: output.info Lines: 0 269 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 4 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  *  Copyright (c) 2011 The WebRTC project authors. All Rights Reserved.
       3             :  *
       4             :  *  Use of this source code is governed by a BSD-style license
       5             :  *  that can be found in the LICENSE file in the root of the source
       6             :  *  tree. An additional intellectual property rights grant can be found
       7             :  *  in the file PATENTS.  All contributing project authors may
       8             :  *  be found in the AUTHORS file in the root of the source tree.
       9             :  */
      10             : 
      11             : #include "webrtc/modules/audio_processing/utility/ooura_fft.h"
      12             : 
      13             : #include <emmintrin.h>
      14             : 
      15             : #include "webrtc/modules/audio_processing/utility/ooura_fft_tables_common.h"
      16             : #include "webrtc/modules/audio_processing/utility/ooura_fft_tables_neon_sse2.h"
      17             : 
      18             : namespace webrtc {
      19             : 
      20             : #if defined(WEBRTC_ARCH_X86_FAMILY)
      21             : 
      22             : namespace {
                       // File-local fallbacks for the two SSE2 cast intrinsics used further down.
                       // They are only compiled when _MSC_VER is defined and below 1500; every
                       // other toolchain is expected to get them from <emmintrin.h>.
      23             : // These intrinsics were unavailable before VS 2008.
      24             : // TODO(andrew): move to a common file.
      25             : #if defined(_MSC_VER) && _MSC_VER < 1500
                       // Returns the 128 bits of |a| viewed as a float vector. The bits are
                       // re-read through a __m128 pointer; no integer-to-float conversion happens.
      26             : static __inline __m128 _mm_castsi128_ps(__m128i a) {
      27             :   return *(__m128*)&a;
      28             : }
                       // Returns the 128 bits of |a| viewed as an integer vector; the inverse of
                       // the cast above, again a pure reinterpretation of the same storage.
      29             : static __inline __m128i _mm_castps_si128(__m128 a) {
      30             :   return *(__m128i*)&a;
      31             : }
      32             : #endif
      33             : 
      34             : }  // namespace
      35             : 
      36           0 : void cft1st_128_SSE2(float* a) {
                         // Runs one in-place butterfly pass over the 128 floats of |a|, 16 floats
                         // per iteration (j = 0, 16, ..., 112). k2 advances by 4 per iteration and
                         // selects the four-float slice of each rdft_wk* table used for that group.
                         // |a| is accessed with unaligned loads/stores. The tables and k_swap_sign
                         // are read with _mm_load_ps (the aligned load), so they are expected to be
                         // 16-byte aligned; their definitions are not in this file.
                         // NOTE(review): k_swap_sign presumably negates one float of each adjacent
                         // pair - confirm against its definition.
      37           0 :   const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
      38             :   int j, k2;
      39             : 
      40           0 :   for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
      41           0 :     __m128 a00v = _mm_loadu_ps(&a[j + 0]);
      42           0 :     __m128 a04v = _mm_loadu_ps(&a[j + 4]);
      43           0 :     __m128 a08v = _mm_loadu_ps(&a[j + 8]);
      44           0 :     __m128 a12v = _mm_loadu_ps(&a[j + 12]);
                           // Regroup so every vector holds two pairs of adjacent floats, one pair
                           // from a[j+0..7] and the matching pair from a[j+8..15]:
                           //   a01v = a[j+0], a[j+1], a[j+8],  a[j+9]
                           //   a23v = a[j+2], a[j+3], a[j+10], a[j+11]
                           //   a45v = a[j+4], a[j+5], a[j+12], a[j+13]
                           //   a67v = a[j+6], a[j+7], a[j+14], a[j+15]
      45           0 :     __m128 a01v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1, 0));
      46           0 :     __m128 a23v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3, 2));
      47           0 :     __m128 a45v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(1, 0, 1, 0));
      48           0 :     __m128 a67v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(3, 2, 3, 2));
      49             : 
                           // Twiddle factors for this group of 16 floats.
      50           0 :     const __m128 wk1rv = _mm_load_ps(&rdft_wk1r[k2]);
      51           0 :     const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2]);
      52           0 :     const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2]);
      53           0 :     const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2]);
      54           0 :     const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2]);
      55           0 :     const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2]);
                           // Sums and differences of the regrouped inputs.
      56           0 :     __m128 x0v = _mm_add_ps(a01v, a23v);
      57           0 :     const __m128 x1v = _mm_sub_ps(a01v, a23v);
      58           0 :     const __m128 x2v = _mm_add_ps(a45v, a67v);
      59           0 :     const __m128 x3v = _mm_sub_ps(a45v, a67v);
      60             :     __m128 x0w;
                           // First output needs no twiddle. x0v is then reused for x0 - x2, and
                           // x0w is the same value with the two floats of each pair swapped.
      61           0 :     a01v = _mm_add_ps(x0v, x2v);
      62           0 :     x0v = _mm_sub_ps(x0v, x2v);
      63           0 :     x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
      64             :     {
                             // (x0 - x2) combined with the wk2 twiddles.
                             // NOTE(review): the sign pattern that turns "r * v + i * swapped(v)"
                             // into a complex multiply is presumably baked into the tables - confirm.
      65           0 :       const __m128 a45_0v = _mm_mul_ps(wk2rv, x0v);
      66           0 :       const __m128 a45_1v = _mm_mul_ps(wk2iv, x0w);
      67           0 :       a45v = _mm_add_ps(a45_0v, a45_1v);
      68             :     }
      69             :     {
      70             :       __m128 a23_0v, a23_1v;
                             // x3s is x3 with each pair swapped and scaled by k_swap_sign.
      71           0 :       const __m128 x3w = _mm_shuffle_ps(x3v, x3v, _MM_SHUFFLE(2, 3, 0, 1));
      72           0 :       const __m128 x3s = _mm_mul_ps(mm_swap_sign, x3w);
                             // (x1 + x3s) combined with the wk1 twiddles.
      73           0 :       x0v = _mm_add_ps(x1v, x3s);
      74           0 :       x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
      75           0 :       a23_0v = _mm_mul_ps(wk1rv, x0v);
      76           0 :       a23_1v = _mm_mul_ps(wk1iv, x0w);
      77           0 :       a23v = _mm_add_ps(a23_0v, a23_1v);
      78             : 
                             // Leave (x1 - x3s) and its pair-swapped copy in x0v/x0w for the wk3
                             // step in the next scope.
      79           0 :       x0v = _mm_sub_ps(x1v, x3s);
      80           0 :       x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
      81             :     }
      82             :     {
                             // (x1 - x3s) combined with the wk3 twiddles.
      83           0 :       const __m128 a67_0v = _mm_mul_ps(wk3rv, x0v);
      84           0 :       const __m128 a67_1v = _mm_mul_ps(wk3iv, x0w);
      85           0 :       a67v = _mm_add_ps(a67_0v, a67_1v);
      86             :     }
      87             : 
                           // Undo the regrouping: the low halves of the results go back to
                           // a[j+0..7] and the high halves to a[j+8..15].
      88           0 :     a00v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(1, 0, 1, 0));
      89           0 :     a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1, 0));
      90           0 :     a08v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3, 2));
      91           0 :     a12v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3, 2));
      92           0 :     _mm_storeu_ps(&a[j + 0], a00v);
      93           0 :     _mm_storeu_ps(&a[j + 4], a04v);
      94           0 :     _mm_storeu_ps(&a[j + 8], a08v);
      95           0 :     _mm_storeu_ps(&a[j + 12], a12v);
      96             :   }
      97           0 : }
      98             : 
      99           0 : void cftmdl_128_SSE2(float* a) {
                         // In-place butterfly pass over the 128 floats of |a|, done in two parts:
                         // the first loop touches a[0..63], the second a[64..127]. Every iteration
                         // works on one pair of adjacent floats from each of eight sub-blocks
                         // located at offsets 0, 8, 16, ..., 56 from j0, and each vector carries
                         // two such pairs (offset X in the low half, offset X + 32 in the high
                         // half). The first part only uses the cftmdl_wk1r constants; the second
                         // part applies the rdft_wk1/wk2/wk3 tables read at index 4.
                         // NOTE(review): k_swap_sign and all twiddle tables are defined elsewhere;
                         // they are read with the aligned load, so 16-byte alignment is expected.
     100           0 :   const int l = 8;
     101           0 :   const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
     102             :   int j0;
     103             : 
     104           0 :   __m128 wk1rv = _mm_load_ps(cftmdl_wk1r);
                         // Part 1: j0 = 0, 2, 4, 6.
     105           0 :   for (j0 = 0; j0 < l; j0 += 2) {
                           // 64-bit loads fetch two adjacent floats each; the shuffles then pack
                           // a[j0+X], a[j0+X+1], a[j0+X+32], a[j0+X+33] into one vector.
     106           0 :     const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
     107           0 :     const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
     108           0 :     const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
     109           0 :     const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
     110             :     const __m128 a_00_32 =
     111           0 :         _mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32),
     112             :                        _MM_SHUFFLE(1, 0, 1, 0));
     113             :     const __m128 a_08_40 =
     114           0 :         _mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40),
     115             :                        _MM_SHUFFLE(1, 0, 1, 0));
     116           0 :     __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
     117           0 :     const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
     118             : 
     119           0 :     const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
     120           0 :     const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
     121           0 :     const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
     122           0 :     const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
     123             :     const __m128 a_16_48 =
     124           0 :         _mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48),
     125             :                        _MM_SHUFFLE(1, 0, 1, 0));
     126             :     const __m128 a_24_56 =
     127           0 :         _mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56),
     128             :                        _MM_SHUFFLE(1, 0, 1, 0));
     129           0 :     const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
     130           0 :     const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);
     131             : 
     132           0 :     const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
     133           0 :     const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
     134             : 
                           // x3 with the two floats of each pair swapped, then scaled by
                           // k_swap_sign, feeds both x1 + x3' and x1 - x3'.
     135           0 :     const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32(
     136           0 :         _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
     137           0 :     const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
     138           0 :     const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
     139           0 :     const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
     140             : 
                           // The high halves of x1_x3_add / x1_x3_sub are not stored directly.
                           // yy0 broadcasts their lane 2 (add, add, sub, sub), yy1 their lane 3,
                           // and yy4 = wk1rv * (yy0 + k_swap_sign * yy1) is what gets written.
     141             :     const __m128 yy0 =
     142           0 :         _mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(2, 2, 2, 2));
     143             :     const __m128 yy1 =
     144           0 :         _mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(3, 3, 3, 3));
     145           0 :     const __m128 yy2 = _mm_mul_ps(mm_swap_sign, yy1);
     146           0 :     const __m128 yy3 = _mm_add_ps(yy0, yy2);
     147           0 :     const __m128 yy4 = _mm_mul_ps(wk1rv, yy3);
     148             : 
                           // xx0: low half -> a[j0+0..1], high half -> a[j0+32..33].
     149           0 :     _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx0));
     150           0 :     _mm_storel_epi64(
     151           0 :         (__m128i*)&a[j0 + 32],
     152           0 :         _mm_shuffle_epi32(_mm_castps_si128(xx0), _MM_SHUFFLE(3, 2, 3, 2)));
     153             : 
                           // xx1: low half -> a[j0+16..17]; the high half is written to
                           // a[j0+48..49] with its two floats swapped (lane 3 first), and the
                           // first of them is then negated by the scalar statement below.
     154           0 :     _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx1));
     155           0 :     _mm_storel_epi64(
     156           0 :         (__m128i*)&a[j0 + 48],
     157           0 :         _mm_shuffle_epi32(_mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 2, 3)));
     158           0 :     a[j0 + 48] = -a[j0 + 48];
     159             : 
                           // Low halves of x1 + x3' and x1 - x3' go out untouched.
     160           0 :     _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(x1_x3_add));
     161           0 :     _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(x1_x3_sub));
     162             : 
                           // yy4: lanes 0,1 -> a[j0+40..41]; lanes 3,2 (swapped) -> a[j0+56..57].
     163           0 :     _mm_storel_epi64((__m128i*)&a[j0 + 40], _mm_castps_si128(yy4));
     164           0 :     _mm_storel_epi64(
     165           0 :         (__m128i*)&a[j0 + 56],
     166           0 :         _mm_shuffle_epi32(_mm_castps_si128(yy4), _MM_SHUFFLE(2, 3, 2, 3)));
     167             :   }
     168             : 
     169             :   {
                           // Part 2: j0 = 64, 66, 68, 70. k2 evaluates to 4, so every twiddle
                           // vector below is the four floats starting at index 4 of its table.
     170           0 :     int k = 64;
     171           0 :     int k1 = 2;
     172           0 :     int k2 = 2 * k1;
     173           0 :     const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2 + 0]);
     174           0 :     const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2 + 0]);
     175           0 :     const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2 + 0]);
     176           0 :     const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2 + 0]);
     177           0 :     const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2 + 0]);
                           // wk1rv is reused: it no longer holds the cftmdl_wk1r constants.
     178           0 :     wk1rv = _mm_load_ps(&rdft_wk1r[k2 + 0]);
     179           0 :     for (j0 = k; j0 < l + k; j0 += 2) {
                             // Same load-and-pack scheme as in part 1.
     180           0 :       const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
     181           0 :       const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
     182           0 :       const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
     183           0 :       const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
     184             :       const __m128 a_00_32 =
     185           0 :           _mm_shuffle_ps(_mm_castsi128_ps(a_00), _mm_castsi128_ps(a_32),
     186             :                          _MM_SHUFFLE(1, 0, 1, 0));
     187             :       const __m128 a_08_40 =
     188           0 :           _mm_shuffle_ps(_mm_castsi128_ps(a_08), _mm_castsi128_ps(a_40),
     189             :                          _MM_SHUFFLE(1, 0, 1, 0));
     190           0 :       __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
     191           0 :       const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);
     192             : 
     193           0 :       const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
     194           0 :       const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
     195           0 :       const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
     196           0 :       const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
     197             :       const __m128 a_16_48 =
     198           0 :           _mm_shuffle_ps(_mm_castsi128_ps(a_16), _mm_castsi128_ps(a_48),
     199             :                          _MM_SHUFFLE(1, 0, 1, 0));
     200             :       const __m128 a_24_56 =
     201           0 :           _mm_shuffle_ps(_mm_castsi128_ps(a_24), _mm_castsi128_ps(a_56),
     202             :                          _MM_SHUFFLE(1, 0, 1, 0));
     203           0 :       const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
     204           0 :       const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);
     205             : 
                             // xx = x0 + x2 is stored as is; xx4 is (x0 - x2) combined with the
                             // wk2 twiddles as r * v + i * pair_swapped(v).
     206           0 :       const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
     207           0 :       const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
     208           0 :       const __m128 xx2 = _mm_mul_ps(xx1, wk2rv);
     209           0 :       const __m128 xx3 = _mm_mul_ps(
     210           0 :           wk2iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(xx1),
     211           0 :                                                     _MM_SHUFFLE(2, 3, 0, 1))));
     212           0 :       const __m128 xx4 = _mm_add_ps(xx2, xx3);
     213             : 
     214           0 :       const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32(
     215           0 :           _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
     216           0 :       const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
     217           0 :       const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
     218           0 :       const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
     219             : 
                             // xx12: (x1 + x3') combined with the wk1 twiddles.
     220           0 :       const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv);
     221           0 :       const __m128 xx11 = _mm_mul_ps(
     222           0 :           wk1iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add),
     223           0 :                                                     _MM_SHUFFLE(2, 3, 0, 1))));
     224           0 :       const __m128 xx12 = _mm_add_ps(xx10, xx11);
     225             : 
                             // xx22: (x1 - x3') combined with the wk3 twiddles.
     226           0 :       const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv);
     227           0 :       const __m128 xx21 = _mm_mul_ps(
     228           0 :           wk3iv, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub),
     229           0 :                                                     _MM_SHUFFLE(2, 3, 0, 1))));
     230           0 :       const __m128 xx22 = _mm_add_ps(xx20, xx21);
     231             : 
                             // Each result goes out as two 64-bit stores: the low half to offset
                             // X and the high half (lanes 2,3, order kept) to offset X + 32.
     232           0 :       _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx));
     233           0 :       _mm_storel_epi64(
     234           0 :           (__m128i*)&a[j0 + 32],
     235           0 :           _mm_shuffle_epi32(_mm_castps_si128(xx), _MM_SHUFFLE(3, 2, 3, 2)));
     236             : 
     237           0 :       _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4));
     238           0 :       _mm_storel_epi64(
     239           0 :           (__m128i*)&a[j0 + 48],
     240           0 :           _mm_shuffle_epi32(_mm_castps_si128(xx4), _MM_SHUFFLE(3, 2, 3, 2)));
     241             : 
     242           0 :       _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12));
     243           0 :       _mm_storel_epi64(
     244           0 :           (__m128i*)&a[j0 + 40],
     245           0 :           _mm_shuffle_epi32(_mm_castps_si128(xx12), _MM_SHUFFLE(3, 2, 3, 2)));
     246             : 
     247           0 :       _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22));
     248           0 :       _mm_storel_epi64(
     249           0 :           (__m128i*)&a[j0 + 56],
     250           0 :           _mm_shuffle_epi32(_mm_castps_si128(xx22), _MM_SHUFFLE(3, 2, 3, 2)));
     251             :     }
     252             :   }
     253           0 : }
     254             : 
     255           0 : void rftfsub_128_SSE2(float* a) {
                         // In-place pass that combines each pair a[j2], a[j2 + 1] (j2 = 2, 4, ...,
                         // 62) with its mirror pair a[128 - j2], a[129 - j2], using weights taken
                         // from rdft_w starting at offset 32: wkr = 0.5 - c[32 - j1], wki = c[j1].
                         // The SSE2 loop handles four pairs per iteration (j2 = 2, 10, ..., 50);
                         // the scalar loop at the end finishes j2 = 58, 60, 62. a[0], a[1], a[64]
                         // and a[65] are not touched here.
     256           0 :   const float* c = rdft_w + 32;
     257             :   int j1, j2, k1, k2;
     258             :   float wkr, wki, xr, xi, yr, yi;
     259             : 
                         // Four copies of 0.5, aligned so that the aligned load below is valid.
     260             :   static const ALIGN16_BEG float ALIGN16_END k_half[4] = {0.5f, 0.5f, 0.5f,
     261             :                                                           0.5f};
     262           0 :   const __m128 mm_half = _mm_load_ps(k_half);
     263             : 
     264             :   // Vectorized code (four at once).
     265             :   //    Note: the commented numbers are the indexes for the first iteration of the loop.
     266           0 :   for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
     267             :     // Load 'wk'.
     268           0 :     const __m128 c_j1 = _mm_loadu_ps(&c[j1]);       //  1,  2,  3,  4,
     269           0 :     const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]);  // 28, 29, 30, 31,
     270           0 :     const __m128 wkrt = _mm_sub_ps(mm_half, c_k1);  // 28, 29, 30, 31,
                           // Reverse the lanes so lane i pairs c[32 - (j1 + i)] with c[j1 + i].
     271             :     const __m128 wkr_ =
     272           0 :         _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3));  // 31, 30, 29, 28,
     273           0 :     const __m128 wki_ = c_j1;                                 //  1,  2,  3,  4,
     274             :     // Load and shuffle 'a'.
     275           0 :     const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]);    //   2,   3,   4,   5,
     276           0 :     const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]);    //   6,   7,   8,   9,
     277           0 :     const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]);  // 120, 121, 122, 123,
     278           0 :     const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]);  // 124, 125, 126, 127,
                           // De-interleave: *_p0 collects the even-index floats, *_p1 the odd-index
                           // ones. The mirror side is gathered in descending order so lanes line up.
     279           0 :     const __m128 a_j2_p0 = _mm_shuffle_ps(
     280             :         a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0));  //   2,   4,   6,   8,
     281           0 :     const __m128 a_j2_p1 = _mm_shuffle_ps(
     282             :         a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1));  //   3,   5,   7,   9,
     283           0 :     const __m128 a_k2_p0 = _mm_shuffle_ps(
     284             :         a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2));  // 126, 124, 122, 120,
     285           0 :     const __m128 a_k2_p1 = _mm_shuffle_ps(
     286             :         a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3));  // 127, 125, 123, 121,
     287             :     // Calculate 'x'.
     288           0 :     const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
     289             :     // 2-126, 4-124, 6-122, 8-120,
     290           0 :     const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
     291             :     // 3-127, 5-125, 7-123, 9-121,
     292             :     // Calculate product into 'y'.
     293             :     //    yr = wkr * xr - wki * xi;
     294             :     //    yi = wkr * xi + wki * xr;
     295           0 :     const __m128 a_ = _mm_mul_ps(wkr_, xr_);
     296           0 :     const __m128 b_ = _mm_mul_ps(wki_, xi_);
     297           0 :     const __m128 c_ = _mm_mul_ps(wkr_, xi_);
     298           0 :     const __m128 d_ = _mm_mul_ps(wki_, xr_);
     299           0 :     const __m128 yr_ = _mm_sub_ps(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
     300           0 :     const __m128 yi_ = _mm_add_ps(c_, d_);  // 3-127, 5-125, 7-123, 9-121,
     301             :                                             // Update 'a'.
     302             :                                             //    a[j2 + 0] -= yr;
     303             :                                             //    a[j2 + 1] -= yi;
     304             :                                             //    a[k2 + 0] += yr;
     305             :     //    a[k2 + 1] -= yi;
     306           0 :     const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_);  //   2,   4,   6,   8,
     307           0 :     const __m128 a_j2_p1n = _mm_sub_ps(a_j2_p1, yi_);  //   3,   5,   7,   9,
     308           0 :     const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_);  // 126, 124, 122, 120,
     309           0 :     const __m128 a_k2_p1n = _mm_sub_ps(a_k2_p1, yi_);  // 127, 125, 123, 121,
     310             :     // Shuffle in right order and store.
     311           0 :     const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
     312             :     //   2,   3,   4,   5,
     313           0 :     const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
     314             :     //   6,   7,   8,   9,
     315           0 :     const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
     316             :     // 122, 123, 120, 121,
     317           0 :     const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
     318             :     // 126, 127, 124, 125,
                           // Swap the 64-bit halves to restore ascending memory order.
     319           0 :     const __m128 a_k2_0n = _mm_shuffle_ps(
     320             :         a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2));  // 120, 121, 122, 123,
     321           0 :     const __m128 a_k2_4n = _mm_shuffle_ps(
     322             :         a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2));  // 124, 125, 126, 127,
     323           0 :     _mm_storeu_ps(&a[0 + j2], a_j2_0n);
     324           0 :     _mm_storeu_ps(&a[4 + j2], a_j2_4n);
     325           0 :     _mm_storeu_ps(&a[122 - j2], a_k2_0n);
     326           0 :     _mm_storeu_ps(&a[126 - j2], a_k2_4n);
     327             :   }
     328             :   // Scalar code for the remaining items.
                         // j1/j2 carry over from the vector loop, so this continues at j2 = 58.
     329           0 :   for (; j2 < 64; j1 += 1, j2 += 2) {
     330           0 :     k2 = 128 - j2;
     331           0 :     k1 = 32 - j1;
     332           0 :     wkr = 0.5f - c[k1];
     333           0 :     wki = c[j1];
     334           0 :     xr = a[j2 + 0] - a[k2 + 0];
     335           0 :     xi = a[j2 + 1] + a[k2 + 1];
     336           0 :     yr = wkr * xr - wki * xi;
     337           0 :     yi = wkr * xi + wki * xr;
     338           0 :     a[j2 + 0] -= yr;
     339           0 :     a[j2 + 1] -= yi;
     340           0 :     a[k2 + 0] += yr;
     341           0 :     a[k2 + 1] -= yi;
     342             :   }
     343           0 : }
     344             : 
     345           0 : void rftbsub_128_SSE2(float* a) {
                         // In-place pass with the same pairing and weights as rftfsub_128_SSE2
                         // (pair a[j2], a[j2 + 1] against a[128 - j2], a[129 - j2]; wkr = 0.5 -
                         // c[32 - j1], wki = c[j1]), but with the opposite signs in the product
                         // (yr = wkr * xr + wki * xi, yi = wkr * xi - wki * xr) and with both odd
                         // outputs written as yi - a[...]. In addition a[1] is negated before the
                         // loops and a[65] after them. The SSE2 loop covers j2 = 2, 10, ..., 50 and
                         // the scalar loop finishes j2 = 58, 60, 62.
     346           0 :   const float* c = rdft_w + 32;
     347             :   int j1, j2, k1, k2;
     348             :   float wkr, wki, xr, xi, yr, yi;
     349             : 
                         // Four copies of 0.5, aligned so that the aligned load below is valid.
     350             :   static const ALIGN16_BEG float ALIGN16_END k_half[4] = {0.5f, 0.5f, 0.5f,
     351             :                                                           0.5f};
     352           0 :   const __m128 mm_half = _mm_load_ps(k_half);
     353             : 
     354           0 :   a[1] = -a[1];
     355             :   // Vectorized code (four at once).
     356             :   //    Note: the commented numbers are the indexes for the first iteration of the loop.
     357           0 :   for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
     358             :     // Load 'wk'.
     359           0 :     const __m128 c_j1 = _mm_loadu_ps(&c[j1]);       //  1,  2,  3,  4,
     360           0 :     const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]);  // 28, 29, 30, 31,
     361           0 :     const __m128 wkrt = _mm_sub_ps(mm_half, c_k1);  // 28, 29, 30, 31,
                           // Reverse the lanes so lane i pairs c[32 - (j1 + i)] with c[j1 + i].
     362             :     const __m128 wkr_ =
     363           0 :         _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3));  // 31, 30, 29, 28,
     364           0 :     const __m128 wki_ = c_j1;                                 //  1,  2,  3,  4,
     365             :     // Load and shuffle 'a'.
     366           0 :     const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]);    //   2,   3,   4,   5,
     367           0 :     const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]);    //   6,   7,   8,   9,
     368           0 :     const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]);  // 120, 121, 122, 123,
     369           0 :     const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]);  // 124, 125, 126, 127,
                           // De-interleave: *_p0 collects the even-index floats, *_p1 the odd-index
                           // ones. The mirror side is gathered in descending order so lanes line up.
     370           0 :     const __m128 a_j2_p0 = _mm_shuffle_ps(
     371             :         a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2, 0));  //   2,   4,   6,   8,
     372           0 :     const __m128 a_j2_p1 = _mm_shuffle_ps(
     373             :         a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3, 1));  //   3,   5,   7,   9,
     374           0 :     const __m128 a_k2_p0 = _mm_shuffle_ps(
     375             :         a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0, 2));  // 126, 124, 122, 120,
     376           0 :     const __m128 a_k2_p1 = _mm_shuffle_ps(
     377             :         a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1, 3));  // 127, 125, 123, 121,
     378             :     // Calculate 'x'.
     379           0 :     const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
     380             :     // 2-126, 4-124, 6-122, 8-120,
     381           0 :     const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
     382             :     // 3-127, 5-125, 7-123, 9-121,
     383             :     // Calculate product into 'y'.
     384             :     //    yr = wkr * xr + wki * xi;
     385             :     //    yi = wkr * xi - wki * xr;
     386           0 :     const __m128 a_ = _mm_mul_ps(wkr_, xr_);
     387           0 :     const __m128 b_ = _mm_mul_ps(wki_, xi_);
     388           0 :     const __m128 c_ = _mm_mul_ps(wkr_, xi_);
     389           0 :     const __m128 d_ = _mm_mul_ps(wki_, xr_);
     390           0 :     const __m128 yr_ = _mm_add_ps(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
     391           0 :     const __m128 yi_ = _mm_sub_ps(c_, d_);  // 3-127, 5-125, 7-123, 9-121,
     392             :                                             // Update 'a'.
     393             :                                             //    a[j2 + 0] = a[j2 + 0] - yr;
     394             :                                             //    a[j2 + 1] = yi - a[j2 + 1];
     395             :                                             //    a[k2 + 0] = yr + a[k2 + 0];
     396             :     //    a[k2 + 1] = yi - a[k2 + 1];
     397           0 :     const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_);  //   2,   4,   6,   8,
     398           0 :     const __m128 a_j2_p1n = _mm_sub_ps(yi_, a_j2_p1);  //   3,   5,   7,   9,
     399           0 :     const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_);  // 126, 124, 122, 120,
     400           0 :     const __m128 a_k2_p1n = _mm_sub_ps(yi_, a_k2_p1);  // 127, 125, 123, 121,
     401             :     // Shuffle in right order and store.
     402           0 :     const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
     403             :     //   2,   3,   4,   5,
     404           0 :     const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
     405             :     //   6,   7,   8,   9,
     406           0 :     const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
     407             :     // 122, 123, 120, 121,
     408           0 :     const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
     409             :     // 126, 127, 124, 125,
                           // Swap the 64-bit halves to restore ascending memory order.
     410           0 :     const __m128 a_k2_0n = _mm_shuffle_ps(
     411             :         a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3, 2));  // 120, 121, 122, 123,
     412           0 :     const __m128 a_k2_4n = _mm_shuffle_ps(
     413             :         a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3, 2));  // 124, 125, 126, 127,
     414           0 :     _mm_storeu_ps(&a[0 + j2], a_j2_0n);
     415           0 :     _mm_storeu_ps(&a[4 + j2], a_j2_4n);
     416           0 :     _mm_storeu_ps(&a[122 - j2], a_k2_0n);
     417           0 :     _mm_storeu_ps(&a[126 - j2], a_k2_4n);
     418             :   }
     419             :   // Scalar code for the remaining items.
                         // j1/j2 carry over from the vector loop, so this continues at j2 = 58.
     420           0 :   for (; j2 < 64; j1 += 1, j2 += 2) {
     421           0 :     k2 = 128 - j2;
     422           0 :     k1 = 32 - j1;
     423           0 :     wkr = 0.5f - c[k1];
     424           0 :     wki = c[j1];
     425           0 :     xr = a[j2 + 0] - a[k2 + 0];
     426           0 :     xi = a[j2 + 1] + a[k2 + 1];
     427           0 :     yr = wkr * xr + wki * xi;
     428           0 :     yi = wkr * xi - wki * xr;
     429           0 :     a[j2 + 0] = a[j2 + 0] - yr;
     430           0 :     a[j2 + 1] = yi - a[j2 + 1];
     431           0 :     a[k2 + 0] = yr + a[k2 + 0];
     432           0 :     a[k2 + 1] = yi - a[k2 + 1];
     433             :   }
     434           0 :   a[65] = -a[65];
     435           0 : }
     436             : #endif
     437             : 
     438             : }  // namespace webrtc

Generated by: LCOV version 1.13