LCOV - code coverage report
Current view: top level - media/libopus/celt/x86 - vq_sse2.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 94 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 1 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* Copyright (c) 2007-2008 CSIRO
       2             :    Copyright (c) 2007-2009 Xiph.Org Foundation
       3             :    Copyright (c) 2007-2016 Jean-Marc Valin */
       4             : /*
       5             :    Redistribution and use in source and binary forms, with or without
       6             :    modification, are permitted provided that the following conditions
       7             :    are met:
       8             : 
       9             :    - Redistributions of source code must retain the above copyright
      10             :    notice, this list of conditions and the following disclaimer.
      11             : 
      12             :    - Redistributions in binary form must reproduce the above copyright
      13             :    notice, this list of conditions and the following disclaimer in the
      14             :    documentation and/or other materials provided with the distribution.
      15             : 
      16             :    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
      17             :    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
      18             :    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
      19             :    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
      20             :    OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
      21             :    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
      22             :    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
      23             :    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
      24             :    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
      25             :    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
      26             :    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
      27             : */
      28             : 
      29             : #ifdef HAVE_CONFIG_H
      30             : #include "config.h"
      31             : #endif
      32             : 
      33             : #include <xmmintrin.h>
      34             : #include <emmintrin.h>
      35             : #include "celt_lpc.h"
      36             : #include "stack_alloc.h"
      37             : #include "mathops.h"
      38             : #include "vq.h"
      39             : #include "x86cpu.h"
      40             : 
      41             : 
      42             : #ifndef FIXED_POINT
      43             : 
      44           0 : opus_val16 op_pvq_search_sse2(celt_norm *_X, int *iy, int K, int N, int arch)
      45             : {
      46             :    int i, j;
      47             :    int pulsesLeft;
      48             :    float xy, yy;
      49             :    VARDECL(celt_norm, y);
      50             :    VARDECL(celt_norm, X);
      51             :    VARDECL(float, signy);
      52             :    __m128 signmask;
      53             :    __m128 sums;
      54             :    __m128i fours;
      55             :    SAVE_STACK;
      56             : 
      57             :    (void)arch;
      58             :    /* All bits set to zero, except for the sign bit. */
      59           0 :    signmask = _mm_set_ps1(-0.f);
      60           0 :    fours = _mm_set_epi32(4, 4, 4, 4);
      61           0 :    ALLOC(y, N+3, celt_norm);
      62           0 :    ALLOC(X, N+3, celt_norm);
      63           0 :    ALLOC(signy, N+3, float);
      64             : 
      65           0 :    OPUS_COPY(X, _X, N);
      66           0 :    X[N] = X[N+1] = X[N+2] = 0;
      67           0 :    sums = _mm_setzero_ps();
      68           0 :    for (j=0;j<N;j+=4)
      69             :    {
      70             :       __m128 x4, s4;
      71           0 :       x4 = _mm_loadu_ps(&X[j]);
      72           0 :       s4 = _mm_cmplt_ps(x4, _mm_setzero_ps());
      73             :       /* Get rid of the sign */
      74           0 :       x4 = _mm_andnot_ps(signmask, x4);
      75           0 :       sums = _mm_add_ps(sums, x4);
      76             :       /* Clear y and iy in case we don't do the projection. */
      77           0 :       _mm_storeu_ps(&y[j], _mm_setzero_ps());
      78           0 :       _mm_storeu_si128((__m128i*)&iy[j], _mm_setzero_si128());
      79           0 :       _mm_storeu_ps(&X[j], x4);
      80           0 :       _mm_storeu_ps(&signy[j], s4);
      81             :    }
      82           0 :    sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(1, 0, 3, 2)));
      83           0 :    sums = _mm_add_ps(sums, _mm_shuffle_ps(sums, sums, _MM_SHUFFLE(2, 3, 0, 1)));
      84             : 
      85           0 :    xy = yy = 0;
      86             : 
      87           0 :    pulsesLeft = K;
      88             : 
      89             :    /* Do a pre-search by projecting on the pyramid */
      90           0 :    if (K > (N>>1))
      91             :    {
      92             :       __m128i pulses_sum;
      93             :       __m128 yy4, xy4;
      94             :       __m128 rcp4;
      95           0 :       opus_val32 sum = _mm_cvtss_f32(sums);
      96             :       /* If X is too small, just replace it with a pulse at 0 */
      97             :       /* Prevents infinities and NaNs from causing too many pulses
      98             :          to be allocated. 64 is an approximation of infinity here. */
      99           0 :       if (!(sum > EPSILON && sum < 64))
     100             :       {
     101           0 :          X[0] = QCONST16(1.f,14);
     102           0 :          j=1; do
     103           0 :             X[j]=0;
     104           0 :          while (++j<N);
     105           0 :          sums = _mm_set_ps1(1.f);
     106             :       }
     107             :       /* Using K+e with e < 1 guarantees we cannot get more than K pulses. */
     108           0 :       rcp4 = _mm_mul_ps(_mm_set_ps1((float)(K+.8)), _mm_rcp_ps(sums));
     109           0 :       xy4 = yy4 = _mm_setzero_ps();
     110           0 :       pulses_sum = _mm_setzero_si128();
     111           0 :       for (j=0;j<N;j+=4)
     112             :       {
     113             :          __m128 rx4, x4, y4;
     114             :          __m128i iy4;
     115           0 :          x4 = _mm_loadu_ps(&X[j]);
     116           0 :          rx4 = _mm_mul_ps(x4, rcp4);
     117           0 :          iy4 = _mm_cvttps_epi32(rx4);
     118           0 :          pulses_sum = _mm_add_epi32(pulses_sum, iy4);
     119           0 :          _mm_storeu_si128((__m128i*)&iy[j], iy4);
     120           0 :          y4 = _mm_cvtepi32_ps(iy4);
     121           0 :          xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
     122           0 :          yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
     123             :          /* double the y[] vector so we don't have to do it in the search loop. */
     124           0 :          _mm_storeu_ps(&y[j], _mm_add_ps(y4, y4));
     125             :       }
     126           0 :       pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(1, 0, 3, 2)));
     127           0 :       pulses_sum = _mm_add_epi32(pulses_sum, _mm_shuffle_epi32(pulses_sum, _MM_SHUFFLE(2, 3, 0, 1)));
     128           0 :       pulsesLeft -= _mm_cvtsi128_si32(pulses_sum);
     129           0 :       xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(1, 0, 3, 2)));
     130           0 :       xy4 = _mm_add_ps(xy4, _mm_shuffle_ps(xy4, xy4, _MM_SHUFFLE(2, 3, 0, 1)));
     131           0 :       xy = _mm_cvtss_f32(xy4);
     132           0 :       yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(1, 0, 3, 2)));
     133           0 :       yy4 = _mm_add_ps(yy4, _mm_shuffle_ps(yy4, yy4, _MM_SHUFFLE(2, 3, 0, 1)));
     134           0 :       yy = _mm_cvtss_f32(yy4);
     135             :    }
     136           0 :    X[N] = X[N+1] = X[N+2] = -100;
     137           0 :    y[N] = y[N+1] = y[N+2] = 100;
     138           0 :    celt_assert2(pulsesLeft>=0, "Allocated too many pulses in the quick pass");
     139             : 
     140             :    /* This should never happen, but just in case it does (e.g. on silence)
     141             :       we fill the first bin with pulses. */
     142           0 :    if (pulsesLeft > N+3)
     143             :    {
     144           0 :       opus_val16 tmp = (opus_val16)pulsesLeft;
     145           0 :       yy = MAC16_16(yy, tmp, tmp);
     146           0 :       yy = MAC16_16(yy, tmp, y[0]);
     147           0 :       iy[0] += pulsesLeft;
     148           0 :       pulsesLeft=0;
     149             :    }
     150             : 
     151           0 :    for (i=0;i<pulsesLeft;i++)
     152             :    {
     153             :       int best_id;
     154             :       __m128 xy4, yy4;
     155             :       __m128 max, max2;
     156             :       __m128i count;
     157             :       __m128i pos;
     158             :       /* The squared magnitude term gets added anyway, so we might as well
     159             :          add it outside the loop */
     160           0 :       yy = ADD16(yy, 1);
     161           0 :       xy4 = _mm_load1_ps(&xy);
     162           0 :       yy4 = _mm_load1_ps(&yy);
     163           0 :       max = _mm_setzero_ps();
     164           0 :       pos = _mm_setzero_si128();
     165           0 :       count = _mm_set_epi32(3, 2, 1, 0);
     166           0 :       for (j=0;j<N;j+=4)
     167             :       {
     168             :          __m128 x4, y4, r4;
     169           0 :          x4 = _mm_loadu_ps(&X[j]);
     170           0 :          y4 = _mm_loadu_ps(&y[j]);
     171           0 :          x4 = _mm_add_ps(x4, xy4);
     172           0 :          y4 = _mm_add_ps(y4, yy4);
     173           0 :          y4 = _mm_rsqrt_ps(y4);
     174           0 :          r4 = _mm_mul_ps(x4, y4);
     175             :          /* Update the index of the max. */
     176           0 :          pos = _mm_max_epi16(pos, _mm_and_si128(count, _mm_castps_si128(_mm_cmpgt_ps(r4, max))));
     177             :          /* Update the max. */
     178           0 :          max = _mm_max_ps(max, r4);
     179             :          /* Update the indices (+4) */
     180           0 :          count = _mm_add_epi32(count, fours);
     181             :       }
     182             :       /* Horizontal max */
     183           0 :       max2 = _mm_max_ps(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 0, 3, 2)));
     184           0 :       max2 = _mm_max_ps(max2, _mm_shuffle_ps(max2, max2, _MM_SHUFFLE(2, 3, 0, 1)));
     185             :       /* Now that max2 contains the max at all positions, look at which value(s) of the
     186             :          partial max is equal to the global max. */
     187           0 :       pos = _mm_and_si128(pos, _mm_castps_si128(_mm_cmpeq_ps(max, max2)));
     188           0 :       pos = _mm_max_epi16(pos, _mm_unpackhi_epi64(pos, pos));
     189           0 :       pos = _mm_max_epi16(pos, _mm_shufflelo_epi16(pos, _MM_SHUFFLE(1, 0, 3, 2)));
     190           0 :       best_id = _mm_cvtsi128_si32(pos);
     191             : 
     192             :       /* Updating the sums of the new pulse(s) */
     193           0 :       xy = ADD32(xy, EXTEND32(X[best_id]));
     194             :       /* We're multiplying y[j] by two so we don't have to do it here */
     195           0 :       yy = ADD16(yy, y[best_id]);
     196             : 
     197             :       /* Only now that we've made the final choice, update y/iy */
     198             :       /* Multiplying y[j] by 2 so we don't have to do it everywhere else */
     199           0 :       y[best_id] += 2;
     200           0 :       iy[best_id]++;
     201             :    }
     202             : 
     203             :    /* Put the original sign back */
     204           0 :    for (j=0;j<N;j+=4)
     205             :    {
     206             :       __m128i y4;
     207             :       __m128i s4;
     208           0 :       y4 = _mm_loadu_si128((__m128i*)&iy[j]);
     209           0 :       s4 = _mm_castps_si128(_mm_loadu_ps(&signy[j]));
     210           0 :       y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4);
     211           0 :       _mm_storeu_si128((__m128i*)&iy[j], y4);
     212             :    }
     213             :    RESTORE_STACK;
     214           0 :    return yy;
     215             : }
     216             : 
     217             : #endif

Generated by: LCOV version 1.13