LCOV - code coverage report
Current view: top level - media/libvpx/libvpx/vpx_dsp/x86 - fwd_txfm_sse2.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 154 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 4 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
       3             :  *
       4             :  *  Use of this source code is governed by a BSD-style license
       5             :  *  that can be found in the LICENSE file in the root of the source
       6             :  *  tree. An additional intellectual property rights grant can be found
       7             :  *  in the file PATENTS.  All contributing project authors may
       8             :  *  be found in the AUTHORS file in the root of the source tree.
       9             :  */
      10             : 
      11             : #include <emmintrin.h>  // SSE2
      12             : 
      13             : #include "./vpx_config.h"
      14             : #include "./vpx_dsp_rtcd.h"
      15             : #include "vpx_dsp/vpx_dsp_common.h"
      16             : #include "vpx_dsp/x86/fwd_txfm_sse2.h"
      17             : 
      18           0 : void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
                         // Adds up the 4x4 block of int16_t values found at input + r * stride
                         // (r = 0..3, four values per row) and stores twice that total in
                         // output[0]. No other element of output is written.
      19             :   __m128i in0, in1;
      20             :   __m128i tmp;
      21           0 :   const __m128i zero = _mm_setzero_si128();
                         // Each 64-bit load fetches one row of four values. Rows 0 and 3 end up
                         // packed together in in0, rows 1 and 2 in in1.
      22           0 :   in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
      23           0 :   in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
      24           0 :   in1 = _mm_unpacklo_epi64(
      25           0 :       in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
      26           0 :   in0 = _mm_unpacklo_epi64(
      27           0 :       in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
      28             : 
                         // 16-bit lane-wise add: the low four lanes hold row0 + row1, the high
                         // four lanes hold row3 + row2.
      29           0 :   tmp = _mm_add_epi16(in0, in1);
                         // Sign-extend to 32 bits: interleaving with zero puts each 16-bit value
                         // in the upper half of a 32-bit lane, and the arithmetic shift right by
                         // 16 brings it back down with its sign replicated.
      30           0 :   in0 = _mm_unpacklo_epi16(zero, tmp);
      31           0 :   in1 = _mm_unpackhi_epi16(zero, tmp);
      32           0 :   in0 = _mm_srai_epi32(in0, 16);
      33           0 :   in1 = _mm_srai_epi32(in1, 16);
      34             : 
                         // Horizontal reduction of the four 32-bit partial sums: interleave with
                         // zero, add, then fold the upper 64 bits onto the lower 64 bits so that
                         // 32-bit lane 0 holds the total of all 16 inputs.
      35           0 :   tmp = _mm_add_epi32(in0, in1);
      36           0 :   in0 = _mm_unpacklo_epi32(tmp, zero);
      37           0 :   in1 = _mm_unpackhi_epi32(tmp, zero);
      38             : 
      39           0 :   tmp = _mm_add_epi32(in0, in1);
      40           0 :   in0 = _mm_srli_si128(tmp, 8);
      41             : 
      42           0 :   in1 = _mm_add_epi32(tmp, in0);
      43           0 :   in0 = _mm_slli_epi32(in1, 1);  // left shift by one doubles the total
      44           0 :   output[0] = (tran_low_t)_mm_cvtsi128_si32(in0);  // lane 0 only
      45           0 : }
      46             : 
      47           0 : void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
                         // Adds up the 8x8 block of int16_t values found at input + r * stride
                         // (r = 0..7, eight values per row) and stores the unscaled total in
                         // output[0]. No other element of output is written.
                         //
                         // Rows are read with _mm_load_si128, the aligned 128-bit load, so every
                         // row address must be 16-byte aligned.
      48           0 :   __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
      49           0 :   __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
      50           0 :   __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
      51           0 :   __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
      52             :   __m128i u0, u1, sum;
      53             : 
                         // Pairwise 16-bit sums of rows 0..3; rows 4..7 are loaded before these
                         // partial sums are combined.
                         // NOTE(review): all row sums are accumulated in 16-bit lanes (eight
                         // values per lane) before widening; presumably the input range is small
                         // enough that this cannot wrap - confirm against callers.
      54           0 :   u0 = _mm_add_epi16(in0, in1);
      55           0 :   u1 = _mm_add_epi16(in2, in3);
      56             : 
      57           0 :   in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
      58           0 :   in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
      59           0 :   in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
      60           0 :   in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
      61             : 
      62           0 :   sum = _mm_add_epi16(u0, u1);
      63             : 
      64           0 :   in0 = _mm_add_epi16(in0, in1);
      65           0 :   in2 = _mm_add_epi16(in2, in3);
      66           0 :   sum = _mm_add_epi16(sum, in0);
      67             : 
      68           0 :   u0 = _mm_setzero_si128();  // u0 is reused as the all-zero vector from here on
      69           0 :   sum = _mm_add_epi16(sum, in2);
      70             : 
                         // Sign-extend the eight 16-bit column sums to 32 bits: each value is
                         // placed in the upper half of a 32-bit lane, then shifted right
                         // arithmetically by 16.
      71           0 :   in0 = _mm_unpacklo_epi16(u0, sum);
      72           0 :   in1 = _mm_unpackhi_epi16(u0, sum);
      73           0 :   in0 = _mm_srai_epi32(in0, 16);
      74           0 :   in1 = _mm_srai_epi32(in1, 16);
      75             : 
                         // Horizontal reduction: after these adds, 32-bit lane 0 holds the total
                         // of all 64 inputs.
      76           0 :   sum = _mm_add_epi32(in0, in1);
      77           0 :   in0 = _mm_unpacklo_epi32(sum, u0);
      78           0 :   in1 = _mm_unpackhi_epi32(sum, u0);
      79             : 
      80           0 :   sum = _mm_add_epi32(in0, in1);
      81           0 :   in0 = _mm_srli_si128(sum, 8);
      82             : 
      83           0 :   in1 = _mm_add_epi32(sum, in0);
      84           0 :   output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);  // lane 0 only, no scaling
      85           0 : }
      86             : 
      87           0 : void vpx_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
      88             :                           int stride) {
                         // Adds up the 16x16 block of int16_t values found at input + r * stride
                         // (r = 0..15, sixteen values per row), shifts the total right
                         // arithmetically by one bit and stores the result in output[0]. No other
                         // element of output is written.
                         //
                         // Rows are read with _mm_load_si128, the aligned 128-bit load, so every
                         // address loaded from must be 16-byte aligned.
                         // NOTE(review): the running sum is kept in 16-bit lanes (32 values per
                         // lane) until the final widening; presumably the input range is small
                         // enough that this cannot wrap - confirm against callers.
      89             :   __m128i in0, in1, in2, in3;
      90             :   __m128i u0, u1;
      91           0 :   __m128i sum = _mm_setzero_si128();
      92             :   int i;
      93             : 
                         // Each iteration consumes eight rows (two 8-lane loads per row, at
                         // column offsets 0 and 8) and then advances input by 8 * stride, so the
                         // two iterations cover all sixteen rows.
      94           0 :   for (i = 0; i < 2; ++i) {
      95           0 :     in0 = _mm_load_si128((const __m128i *)(input + 0 * stride + 0));
      96           0 :     in1 = _mm_load_si128((const __m128i *)(input + 0 * stride + 8));
      97           0 :     in2 = _mm_load_si128((const __m128i *)(input + 1 * stride + 0));
      98           0 :     in3 = _mm_load_si128((const __m128i *)(input + 1 * stride + 8));
      99             : 
                           // u0 / u1 hold the per-row sums of the two rows just loaded; u1 is
                           // folded into sum only after the next pair of rows has been loaded.
     100           0 :     u0 = _mm_add_epi16(in0, in1);
     101           0 :     u1 = _mm_add_epi16(in2, in3);
     102           0 :     sum = _mm_add_epi16(sum, u0);
     103             : 
     104           0 :     in0 = _mm_load_si128((const __m128i *)(input + 2 * stride + 0));
     105           0 :     in1 = _mm_load_si128((const __m128i *)(input + 2 * stride + 8));
     106           0 :     in2 = _mm_load_si128((const __m128i *)(input + 3 * stride + 0));
     107           0 :     in3 = _mm_load_si128((const __m128i *)(input + 3 * stride + 8));
     108             : 
     109           0 :     sum = _mm_add_epi16(sum, u1);
     110           0 :     u0 = _mm_add_epi16(in0, in1);
     111           0 :     u1 = _mm_add_epi16(in2, in3);
     112           0 :     sum = _mm_add_epi16(sum, u0);
     113             : 
     114           0 :     in0 = _mm_load_si128((const __m128i *)(input + 4 * stride + 0));
     115           0 :     in1 = _mm_load_si128((const __m128i *)(input + 4 * stride + 8));
     116           0 :     in2 = _mm_load_si128((const __m128i *)(input + 5 * stride + 0));
     117           0 :     in3 = _mm_load_si128((const __m128i *)(input + 5 * stride + 8));
     118             : 
     119           0 :     sum = _mm_add_epi16(sum, u1);
     120           0 :     u0 = _mm_add_epi16(in0, in1);
     121           0 :     u1 = _mm_add_epi16(in2, in3);
     122           0 :     sum = _mm_add_epi16(sum, u0);
     123             : 
     124           0 :     in0 = _mm_load_si128((const __m128i *)(input + 6 * stride + 0));
     125           0 :     in1 = _mm_load_si128((const __m128i *)(input + 6 * stride + 8));
     126           0 :     in2 = _mm_load_si128((const __m128i *)(input + 7 * stride + 0));
     127           0 :     in3 = _mm_load_si128((const __m128i *)(input + 7 * stride + 8));
     128             : 
     129           0 :     sum = _mm_add_epi16(sum, u1);
     130           0 :     u0 = _mm_add_epi16(in0, in1);
     131           0 :     u1 = _mm_add_epi16(in2, in3);
     132           0 :     sum = _mm_add_epi16(sum, u0);
     133             : 
     134           0 :     sum = _mm_add_epi16(sum, u1);
     135           0 :     input += 8 * stride;
     136             :   }
     137             : 
                         // u0 is reused as the all-zero vector. Each 16-bit lane of sum is
                         // sign-extended to 32 bits by placing it in the upper half of a 32-bit
                         // lane and shifting right arithmetically by 16.
     138           0 :   u0 = _mm_setzero_si128();
     139           0 :   in0 = _mm_unpacklo_epi16(u0, sum);
     140           0 :   in1 = _mm_unpackhi_epi16(u0, sum);
     141           0 :   in0 = _mm_srai_epi32(in0, 16);
     142           0 :   in1 = _mm_srai_epi32(in1, 16);
     143             : 
                         // Horizontal reduction: after these adds, 32-bit lane 0 holds the total
                         // of all 256 inputs.
     144           0 :   sum = _mm_add_epi32(in0, in1);
     145           0 :   in0 = _mm_unpacklo_epi32(sum, u0);
     146           0 :   in1 = _mm_unpackhi_epi32(sum, u0);
     147             : 
     148           0 :   sum = _mm_add_epi32(in0, in1);
     149           0 :   in0 = _mm_srli_si128(sum, 8);
     150             : 
     151           0 :   in1 = _mm_add_epi32(sum, in0);
     152           0 :   in1 = _mm_srai_epi32(in1, 1);  // arithmetic shift: total >> 1
     153           0 :   output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);  // lane 0 only
     154           0 : }
     155             : 
     156           0 : void vpx_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
     157             :                           int stride) {
                         // Adds up the 32x32 block of int16_t values found at input + r * stride
                         // (r = 0..31, thirty-two values per row), shifts the total right
                         // arithmetically by three bits and stores the result in output[0]. No
                         // other element of output is written.
                         //
                         // Rows are read with _mm_load_si128, the aligned 128-bit load, so every
                         // address loaded from must be 16-byte aligned.
                         // NOTE(review): the running sum is kept in 16-bit lanes (128 values per
                         // lane) until the final widening; presumably the input range is small
                         // enough that this cannot wrap - confirm against callers.
     158             :   __m128i in0, in1, in2, in3;
     159             :   __m128i u0, u1;
     160           0 :   __m128i sum = _mm_setzero_si128();
     161             :   int i;
     162             : 
                         // Each iteration consumes four rows. A row is read as four 8-lane loads
                         // (column offsets 0, 8, 16, 24) and input is advanced by stride right
                         // after each row is loaded, so the eight iterations cover all 32 rows.
     163           0 :   for (i = 0; i < 8; ++i) {
     164           0 :     in0 = _mm_load_si128((const __m128i *)(input + 0));
     165           0 :     in1 = _mm_load_si128((const __m128i *)(input + 8));
     166           0 :     in2 = _mm_load_si128((const __m128i *)(input + 16));
     167           0 :     in3 = _mm_load_si128((const __m128i *)(input + 24));
     168             : 
     169           0 :     input += stride;
                           // u0 / u1 hold the two half-row sums of the row just loaded; u1 is
                           // folded into sum only after the next row has been loaded.
     170           0 :     u0 = _mm_add_epi16(in0, in1);
     171           0 :     u1 = _mm_add_epi16(in2, in3);
     172           0 :     sum = _mm_add_epi16(sum, u0);
     173             : 
     174           0 :     in0 = _mm_load_si128((const __m128i *)(input + 0));
     175           0 :     in1 = _mm_load_si128((const __m128i *)(input + 8));
     176           0 :     in2 = _mm_load_si128((const __m128i *)(input + 16));
     177           0 :     in3 = _mm_load_si128((const __m128i *)(input + 24));
     178             : 
     179           0 :     input += stride;
     180           0 :     sum = _mm_add_epi16(sum, u1);
     181           0 :     u0 = _mm_add_epi16(in0, in1);
     182           0 :     u1 = _mm_add_epi16(in2, in3);
     183           0 :     sum = _mm_add_epi16(sum, u0);
     184             : 
     185           0 :     in0 = _mm_load_si128((const __m128i *)(input + 0));
     186           0 :     in1 = _mm_load_si128((const __m128i *)(input + 8));
     187           0 :     in2 = _mm_load_si128((const __m128i *)(input + 16));
     188           0 :     in3 = _mm_load_si128((const __m128i *)(input + 24));
     189             : 
     190           0 :     input += stride;
     191           0 :     sum = _mm_add_epi16(sum, u1);
     192           0 :     u0 = _mm_add_epi16(in0, in1);
     193           0 :     u1 = _mm_add_epi16(in2, in3);
     194           0 :     sum = _mm_add_epi16(sum, u0);
     195             : 
     196           0 :     in0 = _mm_load_si128((const __m128i *)(input + 0));
     197           0 :     in1 = _mm_load_si128((const __m128i *)(input + 8));
     198           0 :     in2 = _mm_load_si128((const __m128i *)(input + 16));
     199           0 :     in3 = _mm_load_si128((const __m128i *)(input + 24));
     200             : 
     201           0 :     input += stride;
     202           0 :     sum = _mm_add_epi16(sum, u1);
     203           0 :     u0 = _mm_add_epi16(in0, in1);
     204           0 :     u1 = _mm_add_epi16(in2, in3);
     205           0 :     sum = _mm_add_epi16(sum, u0);
     206             : 
     207           0 :     sum = _mm_add_epi16(sum, u1);
     208             :   }
     209             : 
                         // u0 is reused as the all-zero vector. Each 16-bit lane of sum is
                         // sign-extended to 32 bits by placing it in the upper half of a 32-bit
                         // lane and shifting right arithmetically by 16.
     210           0 :   u0 = _mm_setzero_si128();
     211           0 :   in0 = _mm_unpacklo_epi16(u0, sum);
     212           0 :   in1 = _mm_unpackhi_epi16(u0, sum);
     213           0 :   in0 = _mm_srai_epi32(in0, 16);
     214           0 :   in1 = _mm_srai_epi32(in1, 16);
     215             : 
                         // Horizontal reduction: after these adds, 32-bit lane 0 holds the total
                         // of all 1024 inputs.
     216           0 :   sum = _mm_add_epi32(in0, in1);
     217           0 :   in0 = _mm_unpacklo_epi32(sum, u0);
     218           0 :   in1 = _mm_unpackhi_epi32(sum, u0);
     219             : 
     220           0 :   sum = _mm_add_epi32(in0, in1);
     221           0 :   in0 = _mm_srli_si128(sum, 8);
     222             : 
     223           0 :   in1 = _mm_add_epi32(sum, in0);
     224           0 :   in1 = _mm_srai_epi32(in1, 3);  // arithmetic shift: total >> 3
     225           0 :   output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);  // lane 0 only
     226           0 : }
     227             : 
     228             : #define DCT_HIGH_BIT_DEPTH 0
                       // The implementation headers below are included several times, each time
                       // with a different set of macro values. The FDCT*_2D macros carry the
                       // function names to use for that pass.
                       // NOTE(review): the headers are not visible here; presumably each
                       // inclusion defines functions with the names given by these macros -
                       // confirm in fwd_txfm_impl_sse2.h and fwd_dct32x32_impl_sse2.h.
                       //
                       // Pass 1: DCT_HIGH_BIT_DEPTH == 0, 4x4 / 8x8 / 16x16 names.
     229             : #define FDCT4x4_2D vpx_fdct4x4_sse2
     230             : #define FDCT8x8_2D vpx_fdct8x8_sse2
     231             : #define FDCT16x16_2D vpx_fdct16x16_sse2
     232             : #include "vpx_dsp/x86/fwd_txfm_impl_sse2.h"
     233             : #undef FDCT4x4_2D
     234             : #undef FDCT8x8_2D
     235             : #undef FDCT16x16_2D
     236             : 
                       // 32x32, "_rd" name, with FDCT32x32_HIGH_PRECISION set to 0.
     237             : #define FDCT32x32_2D vpx_fdct32x32_rd_sse2
     238             : #define FDCT32x32_HIGH_PRECISION 0
     239             : #include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"
     240             : #undef FDCT32x32_2D
     241             : #undef FDCT32x32_HIGH_PRECISION
     242             : 
                       // 32x32 again, this time with FDCT32x32_HIGH_PRECISION set to 1.
     243             : #define FDCT32x32_2D vpx_fdct32x32_sse2
     244             : #define FDCT32x32_HIGH_PRECISION 1
     245             : #include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
     246             : #undef FDCT32x32_2D
     247             : #undef FDCT32x32_HIGH_PRECISION
     248             : #undef DCT_HIGH_BIT_DEPTH
     249             : 
                       // When CONFIG_VP9_HIGHBITDEPTH is enabled, the same three inclusions are
                       // repeated with DCT_HIGH_BIT_DEPTH == 1 and vpx_highbd_* names.
     250             : #if CONFIG_VP9_HIGHBITDEPTH
     251             : #define DCT_HIGH_BIT_DEPTH 1
     252             : #define FDCT4x4_2D vpx_highbd_fdct4x4_sse2
     253             : #define FDCT8x8_2D vpx_highbd_fdct8x8_sse2
     254             : #define FDCT16x16_2D vpx_highbd_fdct16x16_sse2
     255             : #include "vpx_dsp/x86/fwd_txfm_impl_sse2.h"  // NOLINT
     256             : #undef FDCT4x4_2D
     257             : #undef FDCT8x8_2D
     258             : #undef FDCT16x16_2D
     259             : 
     260             : #define FDCT32x32_2D vpx_highbd_fdct32x32_rd_sse2
     261             : #define FDCT32x32_HIGH_PRECISION 0
     262             : #include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
     263             : #undef FDCT32x32_2D
     264             : #undef FDCT32x32_HIGH_PRECISION
     265             : 
     266             : #define FDCT32x32_2D vpx_highbd_fdct32x32_sse2
     267             : #define FDCT32x32_HIGH_PRECISION 1
     268             : #include "vpx_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
     269             : #undef FDCT32x32_2D
     270             : #undef FDCT32x32_HIGH_PRECISION
     271             : #undef DCT_HIGH_BIT_DEPTH
     272             : #endif  // CONFIG_VP9_HIGHBITDEPTH

Generated by: LCOV version 1.13