LCOV - code coverage report
Current view: top level - third_party/aom/aom_dsp/x86 - blend_sse4.h (source / functions) Hit Total Coverage
Test: output.info Lines: 0 60 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 6 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #ifndef AOM_DSP_X86_BLEND_SSE4_H_
      13             : #define AOM_DSP_X86_BLEND_SSE4_H_
      14             : 
      15             : #include "aom_dsp/blend.h"
      16             : #include "aom_dsp/x86/synonyms.h"
      17             : 
      18             : //////////////////////////////////////////////////////////////////////////////
      19             : // Common kernels
      20             : //////////////////////////////////////////////////////////////////////////////
      21             : // blend_4: zero-extends 8-bit samples to 16 bits and returns src0 * m0 + src1 * m1, rounded.
      22           0 : static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1,
      23             :                               const __m128i v_m0_w, const __m128i v_m1_w) {
      24           0 :   const __m128i v_s0_b = xx_loadl_32(src0);
      25           0 :   const __m128i v_s1_b = xx_loadl_32(src1);
      26           0 :   const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
      27           0 :   const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
      28             :   // Weight each source by its mask; _mm_mullo_epi16 keeps the low 16 bits of each product.
      29           0 :   const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
      30           0 :   const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
      31             :   // Lane-wise 16-bit sum of the two weighted sources.
      32           0 :   const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
      33             :   // NOTE(review): xx_roundn_epu16 is defined elsewhere; presumably a rounding right shift.
      34           0 :   const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
      35             : 
      36           0 :   return v_res_w;
      37             : }
      38             : // blend_8: same arithmetic as blend_4, but the sources are read with xx_loadl_64.
      39           0 : static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
      40             :                               const __m128i v_m0_w, const __m128i v_m1_w) {
      41           0 :   const __m128i v_s0_b = xx_loadl_64(src0);
      42           0 :   const __m128i v_s1_b = xx_loadl_64(src1);
      43           0 :   const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
      44           0 :   const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
      45             :   // Weight each source by its mask; _mm_mullo_epi16 keeps the low 16 bits of each product.
      46           0 :   const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
      47           0 :   const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
      48             :   // Lane-wise 16-bit sum of the two weighted sources.
      49           0 :   const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
      50             :   // NOTE(review): xx_roundn_epu16 is defined elsewhere; presumably a rounding right shift.
      51           0 :   const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
      52             : 
      53           0 :   return v_res_w;
      54             : }
      55             : // High-bitdepth builds only: blend_unit_fn matches the signature of the uint16_t blend kernels.
      56             : #if CONFIG_HIGHBITDEPTH
      57             : typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
      58             :                                  const __m128i v_m0_w, const __m128i v_m1_w);
      59             : // blend_4_b10: blends uint16_t samples entirely in 16-bit lanes: src0 * m0 + src1 * m1, rounded.
      60           0 : static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1,
      61             :                                   const __m128i v_m0_w, const __m128i v_m1_w) {
      62           0 :   const __m128i v_s0_w = xx_loadl_64(src0);
      63           0 :   const __m128i v_s1_w = xx_loadl_64(src1);
      64             :   // Weight each source by its mask; _mm_mullo_epi16 keeps the low 16 bits of each product.
      65           0 :   const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
      66           0 :   const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
      67             :   // NOTE(review): 16-bit sum; the _b10 name suggests inputs small enough not to wrap -- confirm.
      68           0 :   const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
      69             :   // NOTE(review): xx_roundn_epu16 is defined elsewhere; presumably a rounding right shift.
      70           0 :   const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
      71             : 
      72           0 :   return v_res_w;
      73             : }
      74             : // blend_8_b10: same arithmetic as blend_4_b10, but the sources are read with xx_loadu_128.
      75           0 : static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1,
      76             :                                   const __m128i v_m0_w, const __m128i v_m1_w) {
      77           0 :   const __m128i v_s0_w = xx_loadu_128(src0);
      78           0 :   const __m128i v_s1_w = xx_loadu_128(src1);
      79             :   // Weight each source by its mask; _mm_mullo_epi16 keeps the low 16 bits of each product.
      80           0 :   const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
      81           0 :   const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
      82             :   // NOTE(review): 16-bit sum; the _b10 name suggests inputs small enough not to wrap -- confirm.
      83           0 :   const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
      84             :   // NOTE(review): xx_roundn_epu16 is defined elsewhere; presumably a rounding right shift.
      85           0 :   const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
      86             : 
      87           0 :   return v_res_w;
      88             : }
      89             : // blend_4_b12: blends uint16_t samples using 32-bit intermediate sums (_mm_madd_epi16).
      90           0 : static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1,
      91             :                                   const __m128i v_m0_w, const __m128i v_m1_w) {
      92           0 :   const __m128i v_s0_w = xx_loadl_64(src0);
      93           0 :   const __m128i v_s1_w = xx_loadl_64(src1);
      94             : 
      95             :   // Interleave the low lanes so weights form (m0, m1) pairs and samples form (s0, s1) pairs.
      96           0 :   const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
      97           0 :   const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
      98             : 
      99             :   // Multiply-add adjacent pairs: each 32-bit lane becomes s0 * m0 + s1 * m1.
     100           0 :   const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
     101             : 
     102             :   // Scale: logical right shift by one bit less than AOM_BLEND_A64_ROUND_BITS.
     103           0 :   const __m128i v_ssum_d =
     104             :       _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1);
     105             : 
     106             :   // Pack to 16 bits with signed saturation; both halves of the result hold the same 4 values.
     107           0 :   const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
     108             : 
     109             :   // Round. NOTE(review): xx_round_epu16 is defined elsewhere; presumably a rounding shift by 1.
     110           0 :   const __m128i v_res_w = xx_round_epu16(v_pssum_d);
     111             : 
     112           0 :   return v_res_w;
     113             : }
     114             : // blend_8_b12: 8-lane form of blend_4_b12; low and high lane halves are processed separately.
     115           0 : static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
     116             :                                   const __m128i v_m0_w, const __m128i v_m1_w) {
     117           0 :   const __m128i v_s0_w = xx_loadu_128(src0);
     118           0 :   const __m128i v_s1_w = xx_loadu_128(src1);
     119             : 
     120             :   // Interleave into (m0, m1) and (s0, s1) pairs: "l" holds lanes 0-3, "h" holds lanes 4-7.
     121           0 :   const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
     122           0 :   const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
     123           0 :   const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
     124           0 :   const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
     125             : 
     126             :   // Multiply-add adjacent pairs: each 32-bit lane becomes s0 * m0 + s1 * m1.
     127           0 :   const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
     128           0 :   const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
     129             : 
     130             :   // Scale: logical right shift by one bit less than AOM_BLEND_A64_ROUND_BITS.
     131           0 :   const __m128i v_ssuml_d =
     132             :       _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1);
     133           0 :   const __m128i v_ssumh_d =
     134             :       _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1);
     135             : 
     136             :   // Pack to 16 bits with signed saturation, low half first, restoring the original lane order.
     137           0 :   const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
     138             : 
     139             :   // Round. NOTE(review): xx_round_epu16 is defined elsewhere; presumably a rounding shift by 1.
     140           0 :   const __m128i v_res_w = xx_round_epu16(v_pssum_d);
     141             : 
     142           0 :   return v_res_w;
     143             : }
     144             : #endif  // CONFIG_HIGHBITDEPTH
     145             : 
     146             : #endif  // AOM_DSP_X86_BLEND_SSE4_H_

Generated by: LCOV version 1.13