LCOV - code coverage report
Current view: top level - third_party/aom/av1/encoder/x86 - error_intrin_avx2.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 27 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 1 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <immintrin.h>  // AVX2
      13             : 
      14             : #include "./av1_rtcd.h"
      15             : #include "aom/aom_integer.h"
      16             : 
      17           0 : int64_t av1_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff,
      18             :                              intptr_t block_size, int64_t *ssz) {
      19             :   __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg;
      20             :   __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
      21             :   __m256i sse_reg_64hi, ssz_reg_64hi;
      22             :   __m128i sse_reg128, ssz_reg128;
      23             :   int64_t sse;
      24             :   int i;
      25           0 :   const __m256i zero_reg = _mm256_set1_epi16(0);
      26             : 
      27             :   // init sse and ssz registers to zero
      28           0 :   sse_reg = _mm256_set1_epi16(0);
      29           0 :   ssz_reg = _mm256_set1_epi16(0);
      30             : 
      31           0 :   for (i = 0; i < block_size; i += 16) {
      32             :     // load 32 bytes from coeff and dqcoeff
      33           0 :     coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i));
      34           0 :     dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i));
      35             :     // dqcoeff - coeff
      36           0 :     dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg);
      37             :     // madd (dqcoeff - coeff)
      38           0 :     dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg);
      39             :     // madd coeff
      40           0 :     coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg);
      41             :     // expand each double word of madd (dqcoeff - coeff) to quad word
      42           0 :     exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg);
      43           0 :     exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg);
      44             :     // expand each double word of madd (coeff) to quad word
      45           0 :     exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg);
      46           0 :     exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg);
      47             :     // add each quad word of madd (dqcoeff - coeff) and madd (coeff)
      48           0 :     sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo);
      49           0 :     ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo);
      50           0 :     sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi);
      51           0 :     ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi);
      52             :   }
      53             :   // shift the upper 64 bits of each 128-bit lane down to the lower 64 bits
      54           0 :   sse_reg_64hi = _mm256_srli_si256(sse_reg, 8);
      55           0 :   ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8);
      56             :   // add the upper 64 bits to the lower 64 bits of each lane
      57           0 :   sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi);
      58           0 :   ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi);
      59             : 
      60             :   // sum the two 128-bit lanes of the 256-bit register as 64-bit words
      61           0 :   sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg),
      62           0 :                              _mm256_extractf128_si256(sse_reg, 1));
      63             : 
      64           0 :   ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg),
      65           0 :                              _mm256_extractf128_si256(ssz_reg, 1));
      66             : 
      67             :   // store the results
      68             :   _mm_storel_epi64((__m128i *)(&sse), sse_reg128);
      69             : 
      70             :   _mm_storel_epi64((__m128i *)(ssz), ssz_reg128);
      71             :   _mm256_zeroupper();
      72           0 :   return sse;
      73             : }

Generated by: LCOV version 1.13