LCOV - code coverage report
Current view: top level - third_party/aom/av1/encoder/x86 - wedge_utils_sse2.c (source / functions)
Test: output.info
Date: 2017-07-14 16:53:18

                 Hit    Total    Coverage
Lines:             0      147       0.0 %
Functions:         0        4       0.0 %

Legend: Lines: hit | not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <assert.h>
      13             : #include <immintrin.h>
      14             : 
      15             : #include "aom_dsp/x86/synonyms.h"
      16             : 
      17             : #include "aom/aom_integer.h"
      18             : 
      19             : #include "av1/common/reconinter.h"
      20             : 
      21             : #define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
      22             : 
      23             : /**
      24             :  * See av1_wedge_sse_from_residuals_c
      25             :  */
      26           0 : uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d,
      27             :                                            const uint8_t *m, int N) {
      28           0 :   int n = -N;
      29           0 :   int n8 = n + 8;
      30             : 
      31             :   uint64_t csse;
      32             : 
      33           0 :   const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE);
      34           0 :   const __m128i v_zext_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
      35             : 
      36           0 :   __m128i v_acc0_q = _mm_setzero_si128();
      37             : 
      38           0 :   assert(N % 64 == 0);
      39             : 
      40           0 :   r1 += N;
      41           0 :   d += N;
      42           0 :   m += N;
      43             : 
      44             :   do {
      45           0 :     const __m128i v_r0_w = xx_load_128(r1 + n);
      46           0 :     const __m128i v_r1_w = xx_load_128(r1 + n8);
      47           0 :     const __m128i v_d0_w = xx_load_128(d + n);
      48           0 :     const __m128i v_d1_w = xx_load_128(d + n8);
      49           0 :     const __m128i v_m01_b = xx_load_128(m + n);
      50             : 
      51           0 :     const __m128i v_rd0l_w = _mm_unpacklo_epi16(v_d0_w, v_r0_w);
      52           0 :     const __m128i v_rd0h_w = _mm_unpackhi_epi16(v_d0_w, v_r0_w);
      53           0 :     const __m128i v_rd1l_w = _mm_unpacklo_epi16(v_d1_w, v_r1_w);
      54           0 :     const __m128i v_rd1h_w = _mm_unpackhi_epi16(v_d1_w, v_r1_w);
      55           0 :     const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
      56           0 :     const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
      57             : 
      58           0 :     const __m128i v_m0l_w = _mm_unpacklo_epi16(v_m0_w, v_mask_max_w);
      59           0 :     const __m128i v_m0h_w = _mm_unpackhi_epi16(v_m0_w, v_mask_max_w);
      60           0 :     const __m128i v_m1l_w = _mm_unpacklo_epi16(v_m1_w, v_mask_max_w);
      61           0 :     const __m128i v_m1h_w = _mm_unpackhi_epi16(v_m1_w, v_mask_max_w);
      62             : 
      63           0 :     const __m128i v_t0l_d = _mm_madd_epi16(v_rd0l_w, v_m0l_w);
      64           0 :     const __m128i v_t0h_d = _mm_madd_epi16(v_rd0h_w, v_m0h_w);
      65           0 :     const __m128i v_t1l_d = _mm_madd_epi16(v_rd1l_w, v_m1l_w);
      66           0 :     const __m128i v_t1h_d = _mm_madd_epi16(v_rd1h_w, v_m1h_w);
      67             : 
      68           0 :     const __m128i v_t0_w = _mm_packs_epi32(v_t0l_d, v_t0h_d);
      69           0 :     const __m128i v_t1_w = _mm_packs_epi32(v_t1l_d, v_t1h_d);
      70             : 
      71           0 :     const __m128i v_sq0_d = _mm_madd_epi16(v_t0_w, v_t0_w);
      72           0 :     const __m128i v_sq1_d = _mm_madd_epi16(v_t1_w, v_t1_w);
      73             : 
      74           0 :     const __m128i v_sum0_q = _mm_add_epi64(_mm_and_si128(v_sq0_d, v_zext_q),
      75             :                                            _mm_srli_epi64(v_sq0_d, 32));
      76           0 :     const __m128i v_sum1_q = _mm_add_epi64(_mm_and_si128(v_sq1_d, v_zext_q),
      77             :                                            _mm_srli_epi64(v_sq1_d, 32));
      78             : 
      79           0 :     v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum0_q);
      80           0 :     v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum1_q);
      81             : 
      82           0 :     n8 += 16;
      83           0 :     n += 16;
      84           0 :   } while (n);
      85             : 
      86           0 :   v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));
      87             : 
      88             : #if ARCH_X86_64
      89           0 :   csse = (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
      90             : #else
      91             :   xx_storel_64(&csse, v_acc0_q);
      92             : #endif
      93             : 
      94           0 :   return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
      95             : }
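
For reference, a minimal scalar sketch of the sum this SSE2 kernel vectorizes. It is assumed to be equivalent to, not copied from, the av1_wedge_sse_from_residuals_c reference named above, and it leans on the clamp() and ROUND_POWER_OF_TWO() helpers from aom_dsp/aom_dsp_common.h:

    static uint64_t wedge_sse_from_residuals_scalar(const int16_t *r1,
                                                    const int16_t *d,
                                                    const uint8_t *m, int N) {
      uint64_t csse = 0;
      for (int i = 0; i < N; ++i) {
        // Mask-blended residual scaled by MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS);
        // this is what each _mm_madd_epi16 lane in the loop above computes.
        int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i];
        // Mirror the signed 16-bit saturation performed by _mm_packs_epi32.
        t = clamp(t, INT16_MIN, INT16_MAX);
        csse += (uint64_t)((int64_t)t * t);
      }
      // Undo the MAX_MASK_VALUE^2 scaling with rounding, as the kernel above does.
      return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
    }
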
      96             : 
      97             : /**
      98             :  * See av1_wedge_sign_from_residuals_c
      99             :  */
     100           0 : int av1_wedge_sign_from_residuals_sse2(const int16_t *ds, const uint8_t *m,
     101             :                                        int N, int64_t limit) {
     102             :   int64_t acc;
     103             : 
     104             :   __m128i v_sign_d;
     105           0 :   __m128i v_acc0_d = _mm_setzero_si128();
     106           0 :   __m128i v_acc1_d = _mm_setzero_si128();
     107             :   __m128i v_acc_q;
     108             : 
      109             :   // Input size is limited to 8192 by the use of 32-bit accumulators and m
      110             :   // being in [0, 64]. Overflow could occur for larger sizes, though it is
      111             :   // practically impossible on real video input.
     112           0 :   assert(N < 8192);
     113           0 :   assert(N % 64 == 0);
     114             : 
     115             :   do {
     116           0 :     const __m128i v_m01_b = xx_load_128(m);
     117           0 :     const __m128i v_m23_b = xx_load_128(m + 16);
     118           0 :     const __m128i v_m45_b = xx_load_128(m + 32);
     119           0 :     const __m128i v_m67_b = xx_load_128(m + 48);
     120             : 
     121           0 :     const __m128i v_d0_w = xx_load_128(ds);
     122           0 :     const __m128i v_d1_w = xx_load_128(ds + 8);
     123           0 :     const __m128i v_d2_w = xx_load_128(ds + 16);
     124           0 :     const __m128i v_d3_w = xx_load_128(ds + 24);
     125           0 :     const __m128i v_d4_w = xx_load_128(ds + 32);
     126           0 :     const __m128i v_d5_w = xx_load_128(ds + 40);
     127           0 :     const __m128i v_d6_w = xx_load_128(ds + 48);
     128           0 :     const __m128i v_d7_w = xx_load_128(ds + 56);
     129             : 
     130           0 :     const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
     131           0 :     const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
     132           0 :     const __m128i v_m2_w = _mm_unpacklo_epi8(v_m23_b, _mm_setzero_si128());
     133           0 :     const __m128i v_m3_w = _mm_unpackhi_epi8(v_m23_b, _mm_setzero_si128());
     134           0 :     const __m128i v_m4_w = _mm_unpacklo_epi8(v_m45_b, _mm_setzero_si128());
     135           0 :     const __m128i v_m5_w = _mm_unpackhi_epi8(v_m45_b, _mm_setzero_si128());
     136           0 :     const __m128i v_m6_w = _mm_unpacklo_epi8(v_m67_b, _mm_setzero_si128());
     137           0 :     const __m128i v_m7_w = _mm_unpackhi_epi8(v_m67_b, _mm_setzero_si128());
     138             : 
     139           0 :     const __m128i v_p0_d = _mm_madd_epi16(v_d0_w, v_m0_w);
     140           0 :     const __m128i v_p1_d = _mm_madd_epi16(v_d1_w, v_m1_w);
     141           0 :     const __m128i v_p2_d = _mm_madd_epi16(v_d2_w, v_m2_w);
     142           0 :     const __m128i v_p3_d = _mm_madd_epi16(v_d3_w, v_m3_w);
     143           0 :     const __m128i v_p4_d = _mm_madd_epi16(v_d4_w, v_m4_w);
     144           0 :     const __m128i v_p5_d = _mm_madd_epi16(v_d5_w, v_m5_w);
     145           0 :     const __m128i v_p6_d = _mm_madd_epi16(v_d6_w, v_m6_w);
     146           0 :     const __m128i v_p7_d = _mm_madd_epi16(v_d7_w, v_m7_w);
     147             : 
     148           0 :     const __m128i v_p01_d = _mm_add_epi32(v_p0_d, v_p1_d);
     149           0 :     const __m128i v_p23_d = _mm_add_epi32(v_p2_d, v_p3_d);
     150           0 :     const __m128i v_p45_d = _mm_add_epi32(v_p4_d, v_p5_d);
     151           0 :     const __m128i v_p67_d = _mm_add_epi32(v_p6_d, v_p7_d);
     152             : 
     153           0 :     const __m128i v_p0123_d = _mm_add_epi32(v_p01_d, v_p23_d);
     154           0 :     const __m128i v_p4567_d = _mm_add_epi32(v_p45_d, v_p67_d);
     155             : 
     156           0 :     v_acc0_d = _mm_add_epi32(v_acc0_d, v_p0123_d);
     157           0 :     v_acc1_d = _mm_add_epi32(v_acc1_d, v_p4567_d);
     158             : 
     159           0 :     ds += 64;
     160           0 :     m += 64;
     161             : 
     162           0 :     N -= 64;
     163           0 :   } while (N);
     164             : 
     165           0 :   v_sign_d = _mm_cmplt_epi32(v_acc0_d, _mm_setzero_si128());
     166           0 :   v_acc0_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc0_d, v_sign_d),
     167             :                            _mm_unpackhi_epi32(v_acc0_d, v_sign_d));
     168             : 
     169           0 :   v_sign_d = _mm_cmplt_epi32(v_acc1_d, _mm_setzero_si128());
     170           0 :   v_acc1_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc1_d, v_sign_d),
     171             :                            _mm_unpackhi_epi32(v_acc1_d, v_sign_d));
     172             : 
     173           0 :   v_acc_q = _mm_add_epi64(v_acc0_d, v_acc1_d);
     174             : 
     175           0 :   v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
     176             : 
     177             : #if ARCH_X86_64
     178           0 :   acc = (uint64_t)_mm_cvtsi128_si64(v_acc_q);
     179             : #else
     180             :   xx_storel_64(&acc, v_acc_q);
     181             : #endif
     182             : 
     183           0 :   return acc > limit;
     184             : }
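
The 32-bit lane accumulators stay safe for N < 8192: each _mm_madd_epi16 lane holds two products of magnitude at most 64 * 32768 (about 2^21), eight such products are folded into an accumulator lane per 64-sample iteration (under 2^24), and at most 127 iterations (N <= 8128) keep the total within a signed 32-bit lane. For reference, a minimal scalar sketch of the decision itself, assumed equivalent to (not copied from) the av1_wedge_sign_from_residuals_c reference named above:

    static int wedge_sign_from_residuals_scalar(const int16_t *ds,
                                                const uint8_t *m, int N,
                                                int64_t limit) {
      int64_t acc = 0;
      // Mask-weighted sum of the difference-of-residual values.
      for (int i = 0; i < N; ++i) acc += (int64_t)m[i] * ds[i];
      return acc > limit;
    }
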
     185             : 
      186             : // Negate lanes of v_v_w where v_mask_w is all ones; zero-mask lanes pass through
     187           0 : static INLINE __m128i negm_epi16(__m128i v_v_w, __m128i v_mask_w) {
     188           0 :   return _mm_sub_epi16(_mm_xor_si128(v_v_w, v_mask_w), v_mask_w);
     189             : }
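
The helper relies on a standard two's-complement identity rather than anything specific to this file: for a lane where the mask is all ones, (v ^ 0xffff) - 0xffff equals ~v + 1, which is -v; for a zero mask lane, (v ^ 0) - 0 leaves v unchanged.
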
     190             : 
     191             : /**
      192             :  * See av1_wedge_compute_delta_squares_c
     193             :  */
     194           0 : void av1_wedge_compute_delta_squares_sse2(int16_t *d, const int16_t *a,
     195             :                                           const int16_t *b, int N) {
     196           0 :   const __m128i v_neg_w =
     197             :       _mm_set_epi16(0xffff, 0, 0xffff, 0, 0xffff, 0, 0xffff, 0);
     198             : 
     199           0 :   assert(N % 64 == 0);
     200             : 
     201             :   do {
     202           0 :     const __m128i v_a0_w = xx_load_128(a);
     203           0 :     const __m128i v_b0_w = xx_load_128(b);
     204           0 :     const __m128i v_a1_w = xx_load_128(a + 8);
     205           0 :     const __m128i v_b1_w = xx_load_128(b + 8);
     206           0 :     const __m128i v_a2_w = xx_load_128(a + 16);
     207           0 :     const __m128i v_b2_w = xx_load_128(b + 16);
     208           0 :     const __m128i v_a3_w = xx_load_128(a + 24);
     209           0 :     const __m128i v_b3_w = xx_load_128(b + 24);
     210             : 
     211           0 :     const __m128i v_ab0l_w = _mm_unpacklo_epi16(v_a0_w, v_b0_w);
     212           0 :     const __m128i v_ab0h_w = _mm_unpackhi_epi16(v_a0_w, v_b0_w);
     213           0 :     const __m128i v_ab1l_w = _mm_unpacklo_epi16(v_a1_w, v_b1_w);
     214           0 :     const __m128i v_ab1h_w = _mm_unpackhi_epi16(v_a1_w, v_b1_w);
     215           0 :     const __m128i v_ab2l_w = _mm_unpacklo_epi16(v_a2_w, v_b2_w);
     216           0 :     const __m128i v_ab2h_w = _mm_unpackhi_epi16(v_a2_w, v_b2_w);
     217           0 :     const __m128i v_ab3l_w = _mm_unpacklo_epi16(v_a3_w, v_b3_w);
     218           0 :     const __m128i v_ab3h_w = _mm_unpackhi_epi16(v_a3_w, v_b3_w);
     219             : 
      220             :     // Negate the b word of each (a, b) pair so the madd below gives a*a - b*b
     221           0 :     const __m128i v_abl0n_w = negm_epi16(v_ab0l_w, v_neg_w);
     222           0 :     const __m128i v_abh0n_w = negm_epi16(v_ab0h_w, v_neg_w);
     223           0 :     const __m128i v_abl1n_w = negm_epi16(v_ab1l_w, v_neg_w);
     224           0 :     const __m128i v_abh1n_w = negm_epi16(v_ab1h_w, v_neg_w);
     225           0 :     const __m128i v_abl2n_w = negm_epi16(v_ab2l_w, v_neg_w);
     226           0 :     const __m128i v_abh2n_w = negm_epi16(v_ab2h_w, v_neg_w);
     227           0 :     const __m128i v_abl3n_w = negm_epi16(v_ab3l_w, v_neg_w);
     228           0 :     const __m128i v_abh3n_w = negm_epi16(v_ab3h_w, v_neg_w);
     229             : 
     230           0 :     const __m128i v_r0l_w = _mm_madd_epi16(v_ab0l_w, v_abl0n_w);
     231           0 :     const __m128i v_r0h_w = _mm_madd_epi16(v_ab0h_w, v_abh0n_w);
     232           0 :     const __m128i v_r1l_w = _mm_madd_epi16(v_ab1l_w, v_abl1n_w);
     233           0 :     const __m128i v_r1h_w = _mm_madd_epi16(v_ab1h_w, v_abh1n_w);
     234           0 :     const __m128i v_r2l_w = _mm_madd_epi16(v_ab2l_w, v_abl2n_w);
     235           0 :     const __m128i v_r2h_w = _mm_madd_epi16(v_ab2h_w, v_abh2n_w);
     236           0 :     const __m128i v_r3l_w = _mm_madd_epi16(v_ab3l_w, v_abl3n_w);
     237           0 :     const __m128i v_r3h_w = _mm_madd_epi16(v_ab3h_w, v_abh3n_w);
     238             : 
     239           0 :     const __m128i v_r0_w = _mm_packs_epi32(v_r0l_w, v_r0h_w);
     240           0 :     const __m128i v_r1_w = _mm_packs_epi32(v_r1l_w, v_r1h_w);
     241           0 :     const __m128i v_r2_w = _mm_packs_epi32(v_r2l_w, v_r2h_w);
     242           0 :     const __m128i v_r3_w = _mm_packs_epi32(v_r3l_w, v_r3h_w);
     243             : 
     244           0 :     xx_store_128(d, v_r0_w);
     245           0 :     xx_store_128(d + 8, v_r1_w);
     246           0 :     xx_store_128(d + 16, v_r2_w);
     247           0 :     xx_store_128(d + 24, v_r3_w);
     248             : 
     249           0 :     a += 32;
     250           0 :     b += 32;
     251           0 :     d += 32;
     252           0 :     N -= 32;
     253           0 :   } while (N);
     254           0 : }
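
For reference, a minimal scalar sketch of the per-element output the loop above produces, assumed equivalent to (not copied from) the av1_wedge_compute_delta_squares_c reference named above; clamp() is the helper from aom_dsp/aom_dsp_common.h:

    static void wedge_compute_delta_squares_scalar(int16_t *d, const int16_t *a,
                                                   const int16_t *b, int N) {
      for (int i = 0; i < N; ++i) {
        // Difference of squares: _mm_madd_epi16 on the (a, b) / (a, -b) pairs
        // above computes the same value, and _mm_packs_epi32 saturates it.
        d[i] = (int16_t)clamp(a[i] * a[i] - b[i] * b[i], INT16_MIN, INT16_MAX);
      }
    }
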

Generated by: LCOV version 1.13