LCOV - code coverage report
Current view: top level - media/libopus/celt/x86 - pitch_sse.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 67 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 4 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* Copyright (c) 2014, Cisco Systems, INC
       2             :    Written by XiangMingZhu WeiZhou MinPeng YanWang
       3             : 
       4             :    Redistribution and use in source and binary forms, with or without
       5             :    modification, are permitted provided that the following conditions
       6             :    are met:
       7             : 
       8             :    - Redistributions of source code must retain the above copyright
       9             :    notice, this list of conditions and the following disclaimer.
      10             : 
      11             :    - Redistributions in binary form must reproduce the above copyright
      12             :    notice, this list of conditions and the following disclaimer in the
      13             :    documentation and/or other materials provided with the distribution.
      14             : 
      15             :    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
      16             :    ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
      17             :    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
      18             :    A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
      19             :    OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
      20             :    EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
      21             :    PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
      22             :    PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
      23             :    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
      24             :    NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
      25             :    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
      26             : */
      27             : 
      28             : #ifdef HAVE_CONFIG_H
      29             : #include "config.h"
      30             : #endif
      31             : 
      32             : #include "macros.h"
      33             : #include "celt_lpc.h"
      34             : #include "stack_alloc.h"
      35             : #include "mathops.h"
      36             : #include "pitch.h"
      37             : 
      38             : #if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
      39             : 
      40             : #include <xmmintrin.h>
      41             : #include "arch.h"
      42             : 
      43           0 : void xcorr_kernel_sse(const opus_val16 *x, const opus_val16 *y, opus_val32 sum[4], int len)
      44             : {
      45             :    int j;
      46             :    __m128 xsum1, xsum2;
      47           0 :    xsum1 = _mm_loadu_ps(sum);
      48           0 :    xsum2 = _mm_setzero_ps();
      49             : 
      50           0 :    for (j = 0; j < len-3; j += 4)
      51             :    {
      52           0 :       __m128 x0 = _mm_loadu_ps(x+j);
      53           0 :       __m128 yj = _mm_loadu_ps(y+j);
      54           0 :       __m128 y3 = _mm_loadu_ps(y+j+3);
      55             : 
      56           0 :       xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x00),yj));
      57           0 :       xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0x55),
      58           0 :                                           _mm_shuffle_ps(yj,y3,0x49)));
      59           0 :       xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xaa),
      60           0 :                                           _mm_shuffle_ps(yj,y3,0x9e)));
      61           0 :       xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_shuffle_ps(x0,x0,0xff),y3));
      62             :    }
      63           0 :    if (j < len)
      64             :    {
      65           0 :       xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
      66           0 :       if (++j < len)
      67             :       {
      68           0 :          xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
      69           0 :          if (++j < len)
      70             :          {
      71           0 :             xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(_mm_load1_ps(x+j),_mm_loadu_ps(y+j)));
      72             :          }
      73             :       }
      74             :    }
      75           0 :    _mm_storeu_ps(sum,_mm_add_ps(xsum1,xsum2));
      76           0 : }
      77             : 
      78             : 
      79           0 : void dual_inner_prod_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
      80             :       int N, opus_val32 *xy1, opus_val32 *xy2)
      81             : {
      82             :    int i;
      83             :    __m128 xsum1, xsum2;
      84           0 :    xsum1 = _mm_setzero_ps();
      85           0 :    xsum2 = _mm_setzero_ps();
      86           0 :    for (i=0;i<N-3;i+=4)
      87             :    {
      88           0 :       __m128 xi = _mm_loadu_ps(x+i);
      89           0 :       __m128 y1i = _mm_loadu_ps(y01+i);
      90           0 :       __m128 y2i = _mm_loadu_ps(y02+i);
      91           0 :       xsum1 = _mm_add_ps(xsum1,_mm_mul_ps(xi, y1i));
      92           0 :       xsum2 = _mm_add_ps(xsum2,_mm_mul_ps(xi, y2i));
      93             :    }
      94             :    /* Horizontal sum */
      95           0 :    xsum1 = _mm_add_ps(xsum1, _mm_movehl_ps(xsum1, xsum1));
      96           0 :    xsum1 = _mm_add_ss(xsum1, _mm_shuffle_ps(xsum1, xsum1, 0x55));
      97             :    _mm_store_ss(xy1, xsum1);
      98           0 :    xsum2 = _mm_add_ps(xsum2, _mm_movehl_ps(xsum2, xsum2));
      99           0 :    xsum2 = _mm_add_ss(xsum2, _mm_shuffle_ps(xsum2, xsum2, 0x55));
     100             :    _mm_store_ss(xy2, xsum2);
     101           0 :    for (;i<N;i++)
     102             :    {
     103           0 :       *xy1 = MAC16_16(*xy1, x[i], y01[i]);
     104           0 :       *xy2 = MAC16_16(*xy2, x[i], y02[i]);
     105             :    }
     106           0 : }
     107             : 
     108           0 : opus_val32 celt_inner_prod_sse(const opus_val16 *x, const opus_val16 *y,
     109             :       int N)
     110             : {
     111             :    int i;
     112             :    float xy;
     113             :    __m128 sum;
     114           0 :    sum = _mm_setzero_ps();
     115             :    /* FIXME: We should probably go 8-way and use 2 sums. */
     116           0 :    for (i=0;i<N-3;i+=4)
     117             :    {
     118           0 :       __m128 xi = _mm_loadu_ps(x+i);
     119           0 :       __m128 yi = _mm_loadu_ps(y+i);
     120           0 :       sum = _mm_add_ps(sum,_mm_mul_ps(xi, yi));
     121             :    }
     122             :    /* Horizontal sum */
     123           0 :    sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
     124           0 :    sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
     125             :    _mm_store_ss(&xy, sum);
     126           0 :    for (;i<N;i++)
     127             :    {
     128           0 :       xy = MAC16_16(xy, x[i], y[i]);
     129             :    }
     130           0 :    return xy;
     131             : }
     132             : 
     133           0 : void comb_filter_const_sse(opus_val32 *y, opus_val32 *x, int T, int N,
     134             :       opus_val16 g10, opus_val16 g11, opus_val16 g12)
     135             : {
     136             :    int i;
     137             :    __m128 x0v;
     138             :    __m128 g10v, g11v, g12v;
     139           0 :    g10v = _mm_load1_ps(&g10);
     140           0 :    g11v = _mm_load1_ps(&g11);
     141           0 :    g12v = _mm_load1_ps(&g12);
     142           0 :    x0v = _mm_loadu_ps(&x[-T-2]);
     143           0 :    for (i=0;i<N-3;i+=4)
     144             :    {
     145             :       __m128 yi, yi2, x1v, x2v, x3v, x4v;
     146           0 :       const opus_val32 *xp = &x[i-T-2];
     147           0 :       yi = _mm_loadu_ps(x+i);
     148           0 :       x4v = _mm_loadu_ps(xp+4);
     149             : #if 0
     150             :       /* Slower version with all loads */
     151             :       x1v = _mm_loadu_ps(xp+1);
     152             :       x2v = _mm_loadu_ps(xp+2);
     153             :       x3v = _mm_loadu_ps(xp+3);
     154             : #else
     155           0 :       x2v = _mm_shuffle_ps(x0v, x4v, 0x4e);
     156           0 :       x1v = _mm_shuffle_ps(x0v, x2v, 0x99);
     157           0 :       x3v = _mm_shuffle_ps(x2v, x4v, 0x99);
     158             : #endif
     159             : 
     160           0 :       yi = _mm_add_ps(yi, _mm_mul_ps(g10v,x2v));
     161             : #if 0 /* Set to 1 to make it bit-exact with the non-SSE version */
     162             :       yi = _mm_add_ps(yi, _mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)));
     163             :       yi = _mm_add_ps(yi, _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
     164             : #else
     165             :       /* Use partial sums */
     166           0 :       yi2 = _mm_add_ps(_mm_mul_ps(g11v,_mm_add_ps(x3v,x1v)),
     167             :                        _mm_mul_ps(g12v,_mm_add_ps(x4v,x0v)));
     168           0 :       yi = _mm_add_ps(yi, yi2);
     169             : #endif
     170           0 :       x0v=x4v;
     171           0 :       _mm_storeu_ps(y+i, yi);
     172             :    }
     173             : #ifdef CUSTOM_MODES
     174             :    for (;i<N;i++)
     175             :    {
     176             :       y[i] = x[i]
     177             :                + MULT16_32_Q15(g10,x[i-T])
     178             :                + MULT16_32_Q15(g11,ADD32(x[i-T+1],x[i-T-1]))
     179             :                + MULT16_32_Q15(g12,ADD32(x[i-T+2],x[i-T-2]));
     180             :    }
     181             : #endif
     182           0 : }
     183             : 
     184             : 
     185             : #endif

Generated by: LCOV version 1.13