LCOV - code coverage report
Current view: top level - third_party/aom/aom_dsp/x86 - sad4d_avx2.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 124 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 4 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : #include <immintrin.h>  // AVX2
      12             : #include "./aom_dsp_rtcd.h"
      13             : #include "aom/aom_integer.h"
      14             : 
      15           0 : void aom_sad32x32x4d_avx2(const uint8_t *src, int src_stride,
      16             :                           const uint8_t *const ref[4], int ref_stride,
      17             :                           uint32_t res[4]) {
      18             :   __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
      19             :   __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
      20             :   __m256i sum_mlow, sum_mhigh;
      21             :   int i;
      22             :   const uint8_t *ref0, *ref1, *ref2, *ref3;
      23             : 
      24           0 :   ref0 = ref[0];
      25           0 :   ref1 = ref[1];
      26           0 :   ref2 = ref[2];
      27           0 :   ref3 = ref[3];
      28           0 :   sum_ref0 = _mm256_set1_epi16(0);
      29           0 :   sum_ref1 = _mm256_set1_epi16(0);
      30           0 :   sum_ref2 = _mm256_set1_epi16(0);
      31           0 :   sum_ref3 = _mm256_set1_epi16(0);
      32           0 :   for (i = 0; i < 32; i++) {
      33             :     // load src and all refs
      34           0 :     src_reg = _mm256_loadu_si256((const __m256i *)src);
      35           0 :     ref0_reg = _mm256_loadu_si256((const __m256i *)ref0);
      36           0 :     ref1_reg = _mm256_loadu_si256((const __m256i *)ref1);
      37           0 :     ref2_reg = _mm256_loadu_si256((const __m256i *)ref2);
      38           0 :     ref3_reg = _mm256_loadu_si256((const __m256i *)ref3);
      39             :     // sum of the absolute differences between every ref-i to src
      40           0 :     ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
      41           0 :     ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
      42           0 :     ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
      43           0 :     ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
      44             :     // sum every ref-i
      45           0 :     sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
      46           0 :     sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
      47           0 :     sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
      48           0 :     sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
      49             : 
      50           0 :     src += src_stride;
      51           0 :     ref0 += ref_stride;
      52           0 :     ref1 += ref_stride;
      53           0 :     ref2 += ref_stride;
      54           0 :     ref3 += ref_stride;
      55             :   }
      56             :   {
      57             :     __m128i sum;
      58             :     // in sum_ref-i the result is saved in the first 4 bytes
      59             :     // the other 4 bytes are zeroed.
      60             :     // sum_ref1 and sum_ref3 are shifted left by 4 bytes
      61           0 :     sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
      62           0 :     sum_ref3 = _mm256_slli_si256(sum_ref3, 4);
      63             : 
      64             :     // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
      65           0 :     sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
      66           0 :     sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);
      67             : 
      68             :     // merge every 64 bit from each sum_ref-i
      69           0 :     sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
      70           0 :     sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);
      71             : 
      72             :     // add the low 64 bit to the high 64 bit
      73           0 :     sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);
      74             : 
      75             :     // add the low 128 bit to the high 128 bit
      76           0 :     sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
      77           0 :                         _mm256_extractf128_si256(sum_mlow, 1));
      78             : 
      79             :     _mm_storeu_si128((__m128i *)(res), sum);
      80             :   }
      81             :   _mm256_zeroupper();
      82           0 : }
      83             : 
      84           0 : void aom_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
      85             :                           const uint8_t *const ref[4], int ref_stride,
      86             :                           uint32_t res[4]) {
      87             :   __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg;
      88             :   __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg;
      89             :   __m256i ref3_reg, ref3next_reg;
      90             :   __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
      91             :   __m256i sum_mlow, sum_mhigh;
      92             :   int i;
      93             :   const uint8_t *ref0, *ref1, *ref2, *ref3;
      94             : 
      95           0 :   ref0 = ref[0];
      96           0 :   ref1 = ref[1];
      97           0 :   ref2 = ref[2];
      98           0 :   ref3 = ref[3];
      99           0 :   sum_ref0 = _mm256_set1_epi16(0);
     100           0 :   sum_ref1 = _mm256_set1_epi16(0);
     101           0 :   sum_ref2 = _mm256_set1_epi16(0);
     102           0 :   sum_ref3 = _mm256_set1_epi16(0);
     103           0 :   for (i = 0; i < 64; i++) {
     104             :     // load 64 bytes from src and all refs
     105           0 :     src_reg = _mm256_loadu_si256((const __m256i *)src);
     106           0 :     srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32));
     107           0 :     ref0_reg = _mm256_loadu_si256((const __m256i *)ref0);
     108           0 :     ref0next_reg = _mm256_loadu_si256((const __m256i *)(ref0 + 32));
     109           0 :     ref1_reg = _mm256_loadu_si256((const __m256i *)ref1);
     110           0 :     ref1next_reg = _mm256_loadu_si256((const __m256i *)(ref1 + 32));
     111           0 :     ref2_reg = _mm256_loadu_si256((const __m256i *)ref2);
     112           0 :     ref2next_reg = _mm256_loadu_si256((const __m256i *)(ref2 + 32));
     113           0 :     ref3_reg = _mm256_loadu_si256((const __m256i *)ref3);
     114           0 :     ref3next_reg = _mm256_loadu_si256((const __m256i *)(ref3 + 32));
     115             :     // sum of the absolute differences between every ref-i to src
     116           0 :     ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
     117           0 :     ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
     118           0 :     ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
     119           0 :     ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
     120           0 :     ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg);
     121           0 :     ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg);
     122           0 :     ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg);
     123           0 :     ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg);
     124             : 
     125             :     // sum every ref-i
     126           0 :     sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
     127           0 :     sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
     128           0 :     sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
     129           0 :     sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
     130           0 :     sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg);
     131           0 :     sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg);
     132           0 :     sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg);
     133           0 :     sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg);
     134           0 :     src += src_stride;
     135           0 :     ref0 += ref_stride;
     136           0 :     ref1 += ref_stride;
     137           0 :     ref2 += ref_stride;
     138           0 :     ref3 += ref_stride;
     139             :   }
     140             :   {
     141             :     __m128i sum;
     142             : 
     143             :     // in sum_ref-i the result is saved in the first 4 bytes
     144             :     // the other 4 bytes are zeroed.
     145             :     // sum_ref1 and sum_ref3 are shifted left by 4 bytes
     146           0 :     sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
     147           0 :     sum_ref3 = _mm256_slli_si256(sum_ref3, 4);
     148             : 
     149             :     // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
     150           0 :     sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
     151           0 :     sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);
     152             : 
     153             :     // merge every 64 bit from each sum_ref-i
     154           0 :     sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
     155           0 :     sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);
     156             : 
     157             :     // add the low 64 bit to the high 64 bit
     158           0 :     sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);
     159             : 
     160             :     // add the low 128 bit to the high 128 bit
     161           0 :     sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
     162           0 :                         _mm256_extractf128_si256(sum_mlow, 1));
     163             : 
     164             :     _mm_storeu_si128((__m128i *)(res), sum);
     165             :   }
     166             :   _mm256_zeroupper();
     167           0 : }
     168             : 
     169           0 : void aom_sad32x64x4d_avx2(const uint8_t *src, int src_stride,
     170             :                           const uint8_t *const ref[4], int ref_stride,
     171             :                           uint32_t res[4]) {
     172             :   const uint8_t *rf[4];
     173             :   uint32_t sum0[4];
     174             :   uint32_t sum1[4];
     175             : 
     176           0 :   rf[0] = ref[0];
     177           0 :   rf[1] = ref[1];
     178           0 :   rf[2] = ref[2];
     179           0 :   rf[3] = ref[3];
     180           0 :   aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum0);
     181           0 :   src += src_stride << 5;
     182           0 :   rf[0] += ref_stride << 5;
     183           0 :   rf[1] += ref_stride << 5;
     184           0 :   rf[2] += ref_stride << 5;
     185           0 :   rf[3] += ref_stride << 5;
     186           0 :   aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum1);
     187           0 :   res[0] = sum0[0] + sum1[0];
     188           0 :   res[1] = sum0[1] + sum1[1];
     189           0 :   res[2] = sum0[2] + sum1[2];
     190           0 :   res[3] = sum0[3] + sum1[3];
     191           0 : }
     192             : 
     193           0 : void aom_sad64x32x4d_avx2(const uint8_t *src, int src_stride,
     194             :                           const uint8_t *const ref[4], int ref_stride,
     195             :                           uint32_t res[4]) {
     196             :   const uint8_t *rf[4];
     197             :   uint32_t sum0[4];
     198             :   uint32_t sum1[4];
     199           0 :   unsigned int half_width = 32;
     200             : 
     201           0 :   rf[0] = ref[0];
     202           0 :   rf[1] = ref[1];
     203           0 :   rf[2] = ref[2];
     204           0 :   rf[3] = ref[3];
     205           0 :   aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum0);
     206           0 :   src += half_width;
     207           0 :   rf[0] += half_width;
     208           0 :   rf[1] += half_width;
     209           0 :   rf[2] += half_width;
     210           0 :   rf[3] += half_width;
     211           0 :   aom_sad32x32x4d_avx2(src, src_stride, rf, ref_stride, sum1);
     212           0 :   res[0] = sum0[0] + sum1[0];
     213           0 :   res[1] = sum0[1] + sum1[1];
     214           0 :   res[2] = sum0[2] + sum1[2];
     215           0 :   res[3] = sum0[3] + sum1[3];
     216           0 : }

Generated by: LCOV version 1.13