LCOV - code coverage report
Current view: top level - third_party/aom/aom_dsp/x86 - fwd_txfm_sse2.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 154 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 4 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <emmintrin.h>  // SSE2
      13             : 
      14             : #include "./aom_config.h"
      15             : #include "./aom_dsp_rtcd.h"
      16             : #include "aom_dsp/aom_dsp_common.h"
      17             : #include "aom_dsp/x86/fwd_txfm_sse2.h"
      18             : 
      19           0 : void aom_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
                         // DC-only 4x4 forward transform: adds up the 16 int16 samples of a 4x4
                         // block and stores twice that sum in output[0].
                         //   input  - top-left sample of the block.
                         //   output - only element 0 is written.
                         //   stride - distance between consecutive rows, in int16 elements.
      20             :   __m128i in0, in1;
      21             :   __m128i tmp;
      22           0 :   const __m128i zero = _mm_setzero_si128();
                         // Each row is 4 samples (64 bits). Rows 0 and 3 end up sharing in0,
                         // rows 1 and 2 share in1.
      23           0 :   in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
      24           0 :   in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
      25           0 :   in1 = _mm_unpacklo_epi64(
      26           0 :       in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
      27           0 :   in0 = _mm_unpacklo_epi64(
      28           0 :       in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
      29             : 
                         // Eight 16-bit partial sums. Interleaving with zero places each value in
                         // the upper half of a 32-bit lane; the arithmetic shift by 16 then
                         // brings it back down sign-extended.
      30           0 :   tmp = _mm_add_epi16(in0, in1);
      31           0 :   in0 = _mm_unpacklo_epi16(zero, tmp);
      32           0 :   in1 = _mm_unpackhi_epi16(zero, tmp);
      33           0 :   in0 = _mm_srai_epi32(in0, 16);
      34           0 :   in1 = _mm_srai_epi32(in1, 16);
      35             : 
                         // Reduce 8 values to 4, then spread them with zeros so that the next
                         // add pairs lanes (0,2) and (1,3).
      36           0 :   tmp = _mm_add_epi32(in0, in1);
      37           0 :   in0 = _mm_unpacklo_epi32(tmp, zero);
      38           0 :   in1 = _mm_unpackhi_epi32(tmp, zero);
      39             : 
                         // The two remaining partial sums sit in 32-bit lanes 0 and 2; the
                         // 8-byte register shift lines lane 2 up with lane 0.
      40           0 :   tmp = _mm_add_epi32(in0, in1);
      41           0 :   in0 = _mm_srli_si128(tmp, 8);
      42             : 
                         // Lane 0 now holds the full sum; it is doubled before being stored.
      43           0 :   in1 = _mm_add_epi32(tmp, in0);
      44           0 :   in0 = _mm_slli_epi32(in1, 1);
      45           0 :   output[0] = (tran_low_t)_mm_cvtsi128_si32(in0);
      46           0 : }
      47             : 
      48           0 : void aom_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
                         // DC-only 8x8 forward transform: adds up the 64 int16 samples of an 8x8
                         // block and stores the sum, unscaled, in output[0].
                         //   input  - top-left sample of the block.
                         //   output - only element 0 is written.
                         //   stride - distance between consecutive rows, in int16 elements.
                         // Rows are read with _mm_load_si128, which requires every row address
                         // to be 16-byte aligned.
      49           0 :   __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
      50           0 :   __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
      51           0 :   __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
      52           0 :   __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
      53             :   __m128i u0, u1, sum;
      54             : 
                         // Rows 0..3 are summed pairwise in 16-bit lanes.
      55           0 :   u0 = _mm_add_epi16(in0, in1);
      56           0 :   u1 = _mm_add_epi16(in2, in3);
      57             : 
                         // Rows 4..7 reuse the same registers.
      58           0 :   in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
      59           0 :   in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
      60           0 :   in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
      61           0 :   in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));
      62             : 
      63           0 :   sum = _mm_add_epi16(u0, u1);
      64             : 
      65           0 :   in0 = _mm_add_epi16(in0, in1);
      66           0 :   in2 = _mm_add_epi16(in2, in3);
      67           0 :   sum = _mm_add_epi16(sum, in0);
      68             : 
                         // From here on u0 is the all-zero register used for widening.
      69           0 :   u0 = _mm_setzero_si128();
      70           0 :   sum = _mm_add_epi16(sum, in2);
      71             : 
                         // Sign-extend the eight 16-bit column sums to 32 bits: the value is
                         // placed in the upper half of each 32-bit lane, then shifted down
                         // arithmetically.
      72           0 :   in0 = _mm_unpacklo_epi16(u0, sum);
      73           0 :   in1 = _mm_unpackhi_epi16(u0, sum);
      74           0 :   in0 = _mm_srai_epi32(in0, 16);
      75           0 :   in1 = _mm_srai_epi32(in1, 16);
      76             : 
                         // Horizontal reduction of the 32-bit values down to lane 0.
      77           0 :   sum = _mm_add_epi32(in0, in1);
      78           0 :   in0 = _mm_unpacklo_epi32(sum, u0);
      79           0 :   in1 = _mm_unpackhi_epi32(sum, u0);
      80             : 
      81           0 :   sum = _mm_add_epi32(in0, in1);
      82           0 :   in0 = _mm_srli_si128(sum, 8);
      83             : 
      84           0 :   in1 = _mm_add_epi32(sum, in0);
      85           0 :   output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
      86           0 : }
      87             : 
      88           0 : void aom_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output,
      89             :                           int stride) {
                         // DC-only 16x16 forward transform: adds up the 256 int16 samples of a
                         // 16x16 block and stores (sum >> 1), arithmetic shift, in output[0].
                         //   input  - top-left sample of the block.
                         //   output - only element 0 is written.
                         //   stride - distance between consecutive rows, in int16 elements.
                         // Rows are read with _mm_load_si128, which requires every load address
                         // to be 16-byte aligned.
      90             :   __m128i in0, in1, in2, in3;
      91             :   __m128i u0, u1;
      92           0 :   __m128i sum = _mm_setzero_si128();
      93             :   int i;
      94             : 
                         // Each pass handles 8 rows of 16 samples (two 8-sample loads per row);
                         // two passes cover the 16 rows. sum accumulates in 16-bit lanes, so
                         // each lane ends up holding the total of 32 samples.
      95           0 :   for (i = 0; i < 2; ++i) {
      96           0 :     in0 = _mm_load_si128((const __m128i *)(input + 0 * stride + 0));
      97           0 :     in1 = _mm_load_si128((const __m128i *)(input + 0 * stride + 8));
      98           0 :     in2 = _mm_load_si128((const __m128i *)(input + 1 * stride + 0));
      99           0 :     in3 = _mm_load_si128((const __m128i *)(input + 1 * stride + 8));
     100             : 
                           // u0 holds one row folded to 8 lanes, u1 the next row; u1 is only
                           // added to sum after the following pair of rows has been loaded.
     101           0 :     u0 = _mm_add_epi16(in0, in1);
     102           0 :     u1 = _mm_add_epi16(in2, in3);
     103           0 :     sum = _mm_add_epi16(sum, u0);
     104             : 
     105           0 :     in0 = _mm_load_si128((const __m128i *)(input + 2 * stride + 0));
     106           0 :     in1 = _mm_load_si128((const __m128i *)(input + 2 * stride + 8));
     107           0 :     in2 = _mm_load_si128((const __m128i *)(input + 3 * stride + 0));
     108           0 :     in3 = _mm_load_si128((const __m128i *)(input + 3 * stride + 8));
     109             : 
     110           0 :     sum = _mm_add_epi16(sum, u1);
     111           0 :     u0 = _mm_add_epi16(in0, in1);
     112           0 :     u1 = _mm_add_epi16(in2, in3);
     113           0 :     sum = _mm_add_epi16(sum, u0);
     114             : 
     115           0 :     in0 = _mm_load_si128((const __m128i *)(input + 4 * stride + 0));
     116           0 :     in1 = _mm_load_si128((const __m128i *)(input + 4 * stride + 8));
     117           0 :     in2 = _mm_load_si128((const __m128i *)(input + 5 * stride + 0));
     118           0 :     in3 = _mm_load_si128((const __m128i *)(input + 5 * stride + 8));
     119             : 
     120           0 :     sum = _mm_add_epi16(sum, u1);
     121           0 :     u0 = _mm_add_epi16(in0, in1);
     122           0 :     u1 = _mm_add_epi16(in2, in3);
     123           0 :     sum = _mm_add_epi16(sum, u0);
     124             : 
     125           0 :     in0 = _mm_load_si128((const __m128i *)(input + 6 * stride + 0));
     126           0 :     in1 = _mm_load_si128((const __m128i *)(input + 6 * stride + 8));
     127           0 :     in2 = _mm_load_si128((const __m128i *)(input + 7 * stride + 0));
     128           0 :     in3 = _mm_load_si128((const __m128i *)(input + 7 * stride + 8));
     129             : 
     130           0 :     sum = _mm_add_epi16(sum, u1);
     131           0 :     u0 = _mm_add_epi16(in0, in1);
     132           0 :     u1 = _mm_add_epi16(in2, in3);
     133           0 :     sum = _mm_add_epi16(sum, u0);
     134             : 
     135           0 :     sum = _mm_add_epi16(sum, u1);
                           // Move on to the next group of 8 rows.
     136           0 :     input += 8 * stride;
     137             :   }
     138             : 
                         // Sign-extend the eight 16-bit lane totals to 32 bits (value placed in
                         // the upper half of each 32-bit lane, then shifted down arithmetically).
     139           0 :   u0 = _mm_setzero_si128();
     140           0 :   in0 = _mm_unpacklo_epi16(u0, sum);
     141           0 :   in1 = _mm_unpackhi_epi16(u0, sum);
     142           0 :   in0 = _mm_srai_epi32(in0, 16);
     143           0 :   in1 = _mm_srai_epi32(in1, 16);
     144             : 
                         // Horizontal reduction of the 32-bit values down to lane 0.
     145           0 :   sum = _mm_add_epi32(in0, in1);
     146           0 :   in0 = _mm_unpacklo_epi32(sum, u0);
     147           0 :   in1 = _mm_unpackhi_epi32(sum, u0);
     148             : 
     149           0 :   sum = _mm_add_epi32(in0, in1);
     150           0 :   in0 = _mm_srli_si128(sum, 8);
     151             : 
                         // Lane 0 is the full sum; it is halved (arithmetic shift) before the
                         // store.
     152           0 :   in1 = _mm_add_epi32(sum, in0);
     153           0 :   in1 = _mm_srai_epi32(in1, 1);
     154           0 :   output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
     155           0 : }
     156             : 
     157           0 : void aom_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output,
     158             :                           int stride) {
                         // DC-only 32x32 forward transform: adds up the 1024 int16 samples of a
                         // 32x32 block and stores (sum >> 3), arithmetic shift, in output[0].
                         //   input  - top-left sample of the block.
                         //   output - only element 0 is written.
                         //   stride - distance between consecutive rows, in int16 elements.
                         // Rows are read with _mm_load_si128, which requires every load address
                         // to be 16-byte aligned.
                         // NOTE(review): the running total is kept in wrapping 16-bit lanes and
                         // each lane accumulates 128 samples, so the result is only exact while
                         // those lane totals fit in int16 (e.g. |sample| <= 255) - confirm the
                         // input range guaranteed by callers.
     159             :   __m128i in0, in1, in2, in3;
     160             :   __m128i u0, u1;
     161           0 :   __m128i sum = _mm_setzero_si128();
     162             :   int i;
     163             : 
                         // Each pass handles 4 rows of 32 samples (four 8-sample loads per row);
                         // eight passes cover the 32 rows. input is advanced by one row right
                         // after that row has been loaded.
     164           0 :   for (i = 0; i < 8; ++i) {
     165           0 :     in0 = _mm_load_si128((const __m128i *)(input + 0));
     166           0 :     in1 = _mm_load_si128((const __m128i *)(input + 8));
     167           0 :     in2 = _mm_load_si128((const __m128i *)(input + 16));
     168           0 :     in3 = _mm_load_si128((const __m128i *)(input + 24));
     169             : 
                           // u0 folds the left half of the row, u1 the right half; u1 is only
                           // added to sum after the next row has been loaded.
     170           0 :     input += stride;
     171           0 :     u0 = _mm_add_epi16(in0, in1);
     172           0 :     u1 = _mm_add_epi16(in2, in3);
     173           0 :     sum = _mm_add_epi16(sum, u0);
     174             : 
     175           0 :     in0 = _mm_load_si128((const __m128i *)(input + 0));
     176           0 :     in1 = _mm_load_si128((const __m128i *)(input + 8));
     177           0 :     in2 = _mm_load_si128((const __m128i *)(input + 16));
     178           0 :     in3 = _mm_load_si128((const __m128i *)(input + 24));
     179             : 
     180           0 :     input += stride;
     181           0 :     sum = _mm_add_epi16(sum, u1);
     182           0 :     u0 = _mm_add_epi16(in0, in1);
     183           0 :     u1 = _mm_add_epi16(in2, in3);
     184           0 :     sum = _mm_add_epi16(sum, u0);
     185             : 
     186           0 :     in0 = _mm_load_si128((const __m128i *)(input + 0));
     187           0 :     in1 = _mm_load_si128((const __m128i *)(input + 8));
     188           0 :     in2 = _mm_load_si128((const __m128i *)(input + 16));
     189           0 :     in3 = _mm_load_si128((const __m128i *)(input + 24));
     190             : 
     191           0 :     input += stride;
     192           0 :     sum = _mm_add_epi16(sum, u1);
     193           0 :     u0 = _mm_add_epi16(in0, in1);
     194           0 :     u1 = _mm_add_epi16(in2, in3);
     195           0 :     sum = _mm_add_epi16(sum, u0);
     196             : 
     197           0 :     in0 = _mm_load_si128((const __m128i *)(input + 0));
     198           0 :     in1 = _mm_load_si128((const __m128i *)(input + 8));
     199           0 :     in2 = _mm_load_si128((const __m128i *)(input + 16));
     200           0 :     in3 = _mm_load_si128((const __m128i *)(input + 24));
     201             : 
     202           0 :     input += stride;
     203           0 :     sum = _mm_add_epi16(sum, u1);
     204           0 :     u0 = _mm_add_epi16(in0, in1);
     205           0 :     u1 = _mm_add_epi16(in2, in3);
     206           0 :     sum = _mm_add_epi16(sum, u0);
     207             : 
     208           0 :     sum = _mm_add_epi16(sum, u1);
     209             :   }
     210             : 
                         // Sign-extend the eight 16-bit lane totals to 32 bits (value placed in
                         // the upper half of each 32-bit lane, then shifted down arithmetically).
     211           0 :   u0 = _mm_setzero_si128();
     212           0 :   in0 = _mm_unpacklo_epi16(u0, sum);
     213           0 :   in1 = _mm_unpackhi_epi16(u0, sum);
     214           0 :   in0 = _mm_srai_epi32(in0, 16);
     215           0 :   in1 = _mm_srai_epi32(in1, 16);
     216             : 
                         // Horizontal reduction of the 32-bit values down to lane 0.
     217           0 :   sum = _mm_add_epi32(in0, in1);
     218           0 :   in0 = _mm_unpacklo_epi32(sum, u0);
     219           0 :   in1 = _mm_unpackhi_epi32(sum, u0);
     220             : 
     221           0 :   sum = _mm_add_epi32(in0, in1);
     222           0 :   in0 = _mm_srli_si128(sum, 8);
     223             : 
                         // Lane 0 is the full sum; it is divided by 8 (arithmetic shift) before
                         // the store.
     224           0 :   in1 = _mm_add_epi32(sum, in0);
     225           0 :   in1 = _mm_srai_epi32(in1, 3);
     226           0 :   output[0] = (tran_low_t)_mm_cvtsi128_si32(in1);
     227           0 : }
     228             : 
     229             : #define DCT_HIGH_BIT_DEPTH 0
                       // Low-bitdepth section: DCT_HIGH_BIT_DEPTH stays 0 until the #undef at
                       // the end of this section. The FDCT*_2D macros carry aom_fdct*_sse2 names
                       // into the implementation headers included below.
                       // NOTE(review): presumably each header defines functions under those
                       // names - the headers are not visible here, confirm there.
     230             : #define FDCT4x4_2D aom_fdct4x4_sse2
     231             : #define FDCT8x8_2D aom_fdct8x8_sse2
     232             : #define FDCT16x16_2D aom_fdct16x16_sse2
     233             : #include "aom_dsp/x86/fwd_txfm_impl_sse2.h"
     234             : #undef FDCT4x4_2D
     235             : #undef FDCT8x8_2D
     236             : #undef FDCT16x16_2D
     237             : 
                       // The 32x32 implementation header is included twice: first with
                       // FDCT32x32_HIGH_PRECISION 0 under the name aom_fdct32x32_rd_sse2 ...
     238             : #define FDCT32x32_2D aom_fdct32x32_rd_sse2
     239             : #define FDCT32x32_HIGH_PRECISION 0
     240             : #include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h"
     241             : #undef FDCT32x32_2D
     242             : #undef FDCT32x32_HIGH_PRECISION
     243             : 
                       // ... then with FDCT32x32_HIGH_PRECISION 1 under the name
                       // aom_fdct32x32_sse2.
     244             : #define FDCT32x32_2D aom_fdct32x32_sse2
     245             : #define FDCT32x32_HIGH_PRECISION 1
     246             : #include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
     247             : #undef FDCT32x32_2D
     248             : #undef FDCT32x32_HIGH_PRECISION
     249             : #undef DCT_HIGH_BIT_DEPTH
     250             : 
     251             : #if CONFIG_HIGHBITDEPTH
                       // High-bitdepth section, compiled only when CONFIG_HIGHBITDEPTH is
                       // non-zero. It repeats the same include sequence as the low-bitdepth
                       // section with DCT_HIGH_BIT_DEPTH set to 1 and aom_highbd_* names.
                       // NOTE(review): presumably each header defines functions under those
                       // names - the headers are not visible here, confirm there.
     252             : #define DCT_HIGH_BIT_DEPTH 1
     253             : #define FDCT4x4_2D aom_highbd_fdct4x4_sse2
     254             : #define FDCT8x8_2D aom_highbd_fdct8x8_sse2
     255             : #define FDCT16x16_2D aom_highbd_fdct16x16_sse2
     256             : #include "aom_dsp/x86/fwd_txfm_impl_sse2.h"  // NOLINT
     257             : #undef FDCT4x4_2D
     258             : #undef FDCT8x8_2D
     259             : #undef FDCT16x16_2D
     260             : 
                       // 32x32 with FDCT32x32_HIGH_PRECISION 0 (name ends in _rd_sse2).
     261             : #define FDCT32x32_2D aom_highbd_fdct32x32_rd_sse2
     262             : #define FDCT32x32_HIGH_PRECISION 0
     263             : #include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
     264             : #undef FDCT32x32_2D
     265             : #undef FDCT32x32_HIGH_PRECISION
     266             : 
                       // 32x32 with FDCT32x32_HIGH_PRECISION 1.
     267             : #define FDCT32x32_2D aom_highbd_fdct32x32_sse2
     268             : #define FDCT32x32_HIGH_PRECISION 1
     269             : #include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
     270             : #undef FDCT32x32_2D
     271             : #undef FDCT32x32_HIGH_PRECISION
     272             : #undef DCT_HIGH_BIT_DEPTH
     273             : #endif  // CONFIG_HIGHBITDEPTH

Generated by: LCOV version 1.13