LCOV - code coverage report
Current view: top level - media/libvpx/libvpx/vpx_dsp/x86 - sad4d_avx2.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 89 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 2 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
       3             :  *
       4             :  *  Use of this source code is governed by a BSD-style license
       5             :  *  that can be found in the LICENSE file in the root of the source
       6             :  *  tree. An additional intellectual property rights grant can be found
       7             :  *  in the file PATENTS.  All contributing project authors may
       8             :  *  be found in the AUTHORS file in the root of the source tree.
       9             :  */
      10             : #include <immintrin.h>  // AVX2
      11             : #include "./vpx_dsp_rtcd.h"
      12             : #include "vpx/vpx_integer.h"
      13             : 
      14           0 : void vpx_sad32x32x4d_avx2(const uint8_t *src, int src_stride,  // src: 32 rows of 32 bytes, rows src_stride bytes apart
      15             :                           const uint8_t *const ref[4], int ref_stride,  // ref[0..3]: four blocks compared against src, all stepped by ref_stride
      16             :                           uint32_t res[4]) {  // res[i] receives the sum of absolute differences between src and ref[i]
      17             :   __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
      18             :   __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;  // running per-reference accumulators
      19             :   __m256i sum_mlow, sum_mhigh;
      20             :   int i;
      21             :   const uint8_t *ref0, *ref1, *ref2, *ref3;  // row cursors into the four reference blocks
      22             : 
      23           0 :   ref0 = ref[0];
      24           0 :   ref1 = ref[1];
      25           0 :   ref2 = ref[2];
      26           0 :   ref3 = ref[3];
      27           0 :   sum_ref0 = _mm256_set1_epi16(0);  // all accumulators start at zero
      28           0 :   sum_ref1 = _mm256_set1_epi16(0);
      29           0 :   sum_ref2 = _mm256_set1_epi16(0);
      30           0 :   sum_ref3 = _mm256_set1_epi16(0);
      31           0 :   for (i = 0; i < 32; i++) {  // one iteration per row
      32             :     // load one 32-byte row of src and of each of the four refs (unaligned loads)
      33           0 :     src_reg = _mm256_loadu_si256((const __m256i *)src);
      34           0 :     ref0_reg = _mm256_loadu_si256((const __m256i *)ref0);
      35           0 :     ref1_reg = _mm256_loadu_si256((const __m256i *)ref1);
      36           0 :     ref2_reg = _mm256_loadu_si256((const __m256i *)ref2);
      37           0 :     ref3_reg = _mm256_loadu_si256((const __m256i *)ref3);
      38             :     // absolute differences of every ref-i against src; each 64-bit element gets the sum over its own 8 bytes
      39           0 :     ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
      40           0 :     ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
      41           0 :     ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
      42           0 :     ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
      43             :     // accumulate this row's partial sums into the running total of every ref-i
      44           0 :     sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
      45           0 :     sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
      46           0 :     sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
      47           0 :     sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
      48             : 
      49           0 :     src += src_stride;  // advance every cursor to its next row
      50           0 :     ref0 += ref_stride;
      51           0 :     ref1 += ref_stride;
      52           0 :     ref2 += ref_stride;
      53           0 :     ref3 += ref_stride;
      54             :   }
      55             :   {
      56             :     __m128i sum;
      57             :     // each 64-bit element of sum_ref-i holds its partial sum in the low 4 bytes;
      58             :     // the upper 4 bytes of that element are zero.
      59             :     // shift sum_ref1 and sum_ref3 left by 4 bytes (per 128-bit lane) so their sums land in those zeroed upper halves
      60           0 :     sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
      61           0 :     sum_ref3 = _mm256_slli_si256(sum_ref3, 4);
      62             : 
      63             :     // interleave: sum_ref0 now carries the ref0/ref1 partial sums, sum_ref2 the ref2/ref3 partial sums
      64           0 :     sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
      65           0 :     sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);
      66             : 
      67             :     // gather the low 64-bit halves (sum_mlow) and the high 64-bit halves (sum_mhigh) of each 128-bit lane, in ref0..ref3 order
      68           0 :     sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
      69           0 :     sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);
      70             : 
      71             :     // add the partial sums from the low 64 bits to those from the high 64 bits
      72           0 :     sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);
      73             : 
      74             :     // add the low 128-bit lane to the high 128-bit lane, leaving four final 32-bit totals
      75           0 :     sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
      76           0 :                         _mm256_extractf128_si256(sum_mlow, 1));
      77             : 
      78             :     _mm_storeu_si128((__m128i *)(res), sum);  // unaligned 16-byte store fills res[0..3]
      79             :   }
      80           0 : }
      81             : 
      82           0 : void vpx_sad64x64x4d_avx2(const uint8_t *src, int src_stride,  // src: 64 rows of 64 bytes, rows src_stride bytes apart
      83             :                           const uint8_t *const ref[4], int ref_stride,  // ref[0..3]: four blocks compared against src, all stepped by ref_stride
      84             :                           uint32_t res[4]) {  // res[i] receives the sum of absolute differences between src and ref[i]
      85             :   __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg;  // *_reg holds bytes 0..31 of a row, *next_reg bytes 32..63
      86             :   __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg;
      87             :   __m256i ref3_reg, ref3next_reg;
      88             :   __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;  // running per-reference accumulators
      89             :   __m256i sum_mlow, sum_mhigh;
      90             :   int i;
      91             :   const uint8_t *ref0, *ref1, *ref2, *ref3;  // row cursors into the four reference blocks
      92             : 
      93           0 :   ref0 = ref[0];
      94           0 :   ref1 = ref[1];
      95           0 :   ref2 = ref[2];
      96           0 :   ref3 = ref[3];
      97           0 :   sum_ref0 = _mm256_set1_epi16(0);  // all accumulators start at zero
      98           0 :   sum_ref1 = _mm256_set1_epi16(0);
      99           0 :   sum_ref2 = _mm256_set1_epi16(0);
     100           0 :   sum_ref3 = _mm256_set1_epi16(0);
     101           0 :   for (i = 0; i < 64; i++) {  // one iteration per row
     102             :     // load one 64-byte row of src and of every ref as two unaligned 32-byte halves
     103           0 :     src_reg = _mm256_loadu_si256((const __m256i *)src);
     104           0 :     srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32));
     105           0 :     ref0_reg = _mm256_loadu_si256((const __m256i *)ref0);
     106           0 :     ref0next_reg = _mm256_loadu_si256((const __m256i *)(ref0 + 32));
     107           0 :     ref1_reg = _mm256_loadu_si256((const __m256i *)ref1);
     108           0 :     ref1next_reg = _mm256_loadu_si256((const __m256i *)(ref1 + 32));
     109           0 :     ref2_reg = _mm256_loadu_si256((const __m256i *)ref2);
     110           0 :     ref2next_reg = _mm256_loadu_si256((const __m256i *)(ref2 + 32));
     111           0 :     ref3_reg = _mm256_loadu_si256((const __m256i *)ref3);
     112           0 :     ref3next_reg = _mm256_loadu_si256((const __m256i *)(ref3 + 32));
     113             :     // absolute differences of every ref-i half against the matching src half; each 64-bit element gets the sum over its own 8 bytes
     114           0 :     ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
     115           0 :     ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
     116           0 :     ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
     117           0 :     ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
     118           0 :     ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg);
     119           0 :     ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg);
     120           0 :     ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg);
     121           0 :     ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg);
     122             : 
     123             :     // accumulate both halves of this row into the running total of every ref-i
     124           0 :     sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
     125           0 :     sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
     126           0 :     sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
     127           0 :     sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
     128           0 :     sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg);
     129           0 :     sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg);
     130           0 :     sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg);
     131           0 :     sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg);
     132           0 :     src += src_stride;  // advance every cursor to its next row
     133           0 :     ref0 += ref_stride;
     134           0 :     ref1 += ref_stride;
     135           0 :     ref2 += ref_stride;
     136           0 :     ref3 += ref_stride;
     137             :   }
     138             :   {
     139             :     __m128i sum;
     140             : 
     141             :     // each 64-bit element of sum_ref-i holds its partial sum in the low 4 bytes;
     142             :     // the upper 4 bytes of that element are zero.
     143             :     // shift sum_ref1 and sum_ref3 left by 4 bytes (per 128-bit lane) so their sums land in those zeroed upper halves
     144           0 :     sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
     145           0 :     sum_ref3 = _mm256_slli_si256(sum_ref3, 4);
     146             : 
     147             :     // interleave: sum_ref0 now carries the ref0/ref1 partial sums, sum_ref2 the ref2/ref3 partial sums
     148           0 :     sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
     149           0 :     sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);
     150             : 
     151             :     // gather the low 64-bit halves (sum_mlow) and the high 64-bit halves (sum_mhigh) of each 128-bit lane, in ref0..ref3 order
     152           0 :     sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
     153           0 :     sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);
     154             : 
     155             :     // add the partial sums from the low 64 bits to those from the high 64 bits
     156           0 :     sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);
     157             : 
     158             :     // add the low 128-bit lane to the high 128-bit lane, leaving four final 32-bit totals
     159           0 :     sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
     160           0 :                         _mm256_extractf128_si256(sum_mlow, 1));
     161             : 
     162             :     _mm_storeu_si128((__m128i *)(res), sum);  // unaligned 16-byte store fills res[0..3]
     163             :   }
     164           0 : }

Generated by: LCOV version 1.13