LCOV - code coverage report
Current view: top level - third_party/aom/aom_dsp/x86 - quantize_sse2.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 134 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 3 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <emmintrin.h>
      13             : #include <xmmintrin.h>
      14             : 
      15             : #include "./aom_dsp_rtcd.h"
      16             : #include "aom/aom_integer.h"
      17             : 
      18           0 : static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
      19             : #if CONFIG_HIGHBITDEPTH
      20           0 :   return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
      21           0 :                         (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
      22           0 :                         (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
      23           0 :                         (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
      24             : #else
      25             :   return _mm_load_si128((const __m128i *)coeff_ptr);
      26             : #endif
      27             : }
      28             : 
      29           0 : static INLINE void store_coefficients(__m128i coeff_vals,
      30             :                                       tran_low_t *coeff_ptr) {
      31             : #if CONFIG_HIGHBITDEPTH
      32           0 :   __m128i one = _mm_set1_epi16(1);
      33           0 :   __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
      34           0 :   __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
      35           0 :   __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
      36           0 :   __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
      37             :   _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
      38           0 :   _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
      39             : #else
      40             :   _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals);
      41             : #endif
      42           0 : }
      43             : 
      44           0 : void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
      45             :                          int skip_block, const int16_t *zbin_ptr,
      46             :                          const int16_t *round_ptr, const int16_t *quant_ptr,
      47             :                          const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
      48             :                          tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
      49             :                          uint16_t *eob_ptr, const int16_t *scan_ptr,
      50             :                          const int16_t *iscan_ptr) {
      51             :   __m128i zero;
      52             :   (void)scan_ptr;
      53             : 
      54           0 :   coeff_ptr += n_coeffs;
      55           0 :   iscan_ptr += n_coeffs;
      56           0 :   qcoeff_ptr += n_coeffs;
      57           0 :   dqcoeff_ptr += n_coeffs;
      58           0 :   n_coeffs = -n_coeffs;
      59           0 :   zero = _mm_setzero_si128();
      60           0 :   if (!skip_block) {
      61             :     __m128i eob;
      62             :     __m128i zbin;
      63             :     __m128i round, quant, dequant, shift;
      64             :     {
      65             :       __m128i coeff0, coeff1;
      66             : 
      67             :       // Setup global values
      68             :       {
      69             :         __m128i pw_1;
      70           0 :         zbin = _mm_load_si128((const __m128i *)zbin_ptr);
      71           0 :         round = _mm_load_si128((const __m128i *)round_ptr);
      72           0 :         quant = _mm_load_si128((const __m128i *)quant_ptr);
      73           0 :         pw_1 = _mm_set1_epi16(1);
      74           0 :         zbin = _mm_sub_epi16(zbin, pw_1);
      75           0 :         dequant = _mm_load_si128((const __m128i *)dequant_ptr);
      76           0 :         shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
      77             :       }
      78             : 
      79             :       {
      80             :         __m128i coeff0_sign, coeff1_sign;
      81             :         __m128i qcoeff0, qcoeff1;
      82             :         __m128i qtmp0, qtmp1;
      83             :         __m128i cmp_mask0, cmp_mask1;
      84             :         // Do DC and first 15 AC
      85           0 :         coeff0 = load_coefficients(coeff_ptr + n_coeffs);
      86           0 :         coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);
      87             : 
      88             :         // Poor man's sign extract
      89           0 :         coeff0_sign = _mm_srai_epi16(coeff0, 15);
      90           0 :         coeff1_sign = _mm_srai_epi16(coeff1, 15);
      91           0 :         qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
      92           0 :         qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
      93           0 :         qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
      94           0 :         qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
      95             : 
      96           0 :         cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
      97           0 :         zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
      98           0 :         cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
      99           0 :         qcoeff0 = _mm_adds_epi16(qcoeff0, round);
     100           0 :         round = _mm_unpackhi_epi64(round, round);
     101           0 :         qcoeff1 = _mm_adds_epi16(qcoeff1, round);
     102           0 :         qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
     103           0 :         quant = _mm_unpackhi_epi64(quant, quant);
     104           0 :         qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
     105           0 :         qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
     106           0 :         qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
     107           0 :         qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
     108           0 :         shift = _mm_unpackhi_epi64(shift, shift);
     109           0 :         qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
     110             : 
     111             :         // Reinsert signs
     112           0 :         qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
     113           0 :         qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
     114           0 :         qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
     115           0 :         qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
     116             : 
     117             :         // Mask out zbin threshold coeffs
     118           0 :         qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
     119           0 :         qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
     120             : 
     121           0 :         store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
     122           0 :         store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);
     123             : 
     124           0 :         coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
     125           0 :         dequant = _mm_unpackhi_epi64(dequant, dequant);
     126           0 :         coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
     127             : 
     128           0 :         store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
     129           0 :         store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
     130             :       }
     131             : 
     132             :       {
     133             :         // Scan for eob
     134             :         __m128i zero_coeff0, zero_coeff1;
     135             :         __m128i nzero_coeff0, nzero_coeff1;
     136             :         __m128i iscan0, iscan1;
     137             :         __m128i eob1;
     138           0 :         zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
     139           0 :         zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
     140           0 :         nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
     141           0 :         nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
     142           0 :         iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
     143           0 :         iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
     144             :         // Add one to convert from indices to counts
     145           0 :         iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
     146           0 :         iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
     147           0 :         eob = _mm_and_si128(iscan0, nzero_coeff0);
     148           0 :         eob1 = _mm_and_si128(iscan1, nzero_coeff1);
     149           0 :         eob = _mm_max_epi16(eob, eob1);
     150             :       }
     151           0 :       n_coeffs += 8 * 2;
     152             :     }
     153             : 
     154             :     // AC only loop
     155           0 :     while (n_coeffs < 0) {
     156             :       __m128i coeff0, coeff1;
     157             :       {
     158             :         __m128i coeff0_sign, coeff1_sign;
     159             :         __m128i qcoeff0, qcoeff1;
     160             :         __m128i qtmp0, qtmp1;
     161             :         __m128i cmp_mask0, cmp_mask1;
     162             : 
     163           0 :         coeff0 = load_coefficients(coeff_ptr + n_coeffs);
     164           0 :         coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8);
     165             : 
     166             :         // Poor man's sign extract
     167           0 :         coeff0_sign = _mm_srai_epi16(coeff0, 15);
     168           0 :         coeff1_sign = _mm_srai_epi16(coeff1, 15);
     169           0 :         qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
     170           0 :         qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
     171           0 :         qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
     172           0 :         qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
     173             : 
     174           0 :         cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
     175           0 :         cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
     176           0 :         qcoeff0 = _mm_adds_epi16(qcoeff0, round);
     177           0 :         qcoeff1 = _mm_adds_epi16(qcoeff1, round);
     178           0 :         qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
     179           0 :         qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
     180           0 :         qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
     181           0 :         qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
     182           0 :         qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
     183           0 :         qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
     184             : 
     185             :         // Reinsert signs
     186           0 :         qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
     187           0 :         qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
     188           0 :         qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
     189           0 :         qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
     190             : 
     191             :         // Mask out zbin threshold coeffs
     192           0 :         qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
     193           0 :         qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
     194             : 
     195           0 :         store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs);
     196           0 :         store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8);
     197             : 
     198           0 :         coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
     199           0 :         coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
     200             : 
     201           0 :         store_coefficients(coeff0, dqcoeff_ptr + n_coeffs);
     202           0 :         store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8);
     203             :       }
     204             : 
     205             :       {
     206             :         // Scan for eob
     207             :         __m128i zero_coeff0, zero_coeff1;
     208             :         __m128i nzero_coeff0, nzero_coeff1;
     209             :         __m128i iscan0, iscan1;
     210             :         __m128i eob0, eob1;
     211           0 :         zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
     212           0 :         zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
     213           0 :         nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
     214           0 :         nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
     215           0 :         iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
     216           0 :         iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
     217             :         // Add one to convert from indices to counts
     218           0 :         iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
     219           0 :         iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
     220           0 :         eob0 = _mm_and_si128(iscan0, nzero_coeff0);
     221           0 :         eob1 = _mm_and_si128(iscan1, nzero_coeff1);
     222           0 :         eob0 = _mm_max_epi16(eob0, eob1);
     223           0 :         eob = _mm_max_epi16(eob, eob0);
     224             :       }
     225           0 :       n_coeffs += 8 * 2;
     226             :     }
     227             : 
     228             :     // Accumulate EOB
     229             :     {
     230             :       __m128i eob_shuffled;
     231           0 :       eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
     232           0 :       eob = _mm_max_epi16(eob, eob_shuffled);
     233           0 :       eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
     234           0 :       eob = _mm_max_epi16(eob, eob_shuffled);
     235           0 :       eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
     236           0 :       eob = _mm_max_epi16(eob, eob_shuffled);
     237           0 :       *eob_ptr = _mm_extract_epi16(eob, 1);
     238             :     }
     239             :   } else {
     240             :     do {
     241           0 :       store_coefficients(zero, dqcoeff_ptr + n_coeffs);
     242           0 :       store_coefficients(zero, dqcoeff_ptr + n_coeffs + 8);
     243           0 :       store_coefficients(zero, qcoeff_ptr + n_coeffs);
     244           0 :       store_coefficients(zero, qcoeff_ptr + n_coeffs + 8);
     245           0 :       n_coeffs += 8 * 2;
     246           0 :     } while (n_coeffs < 0);
     247           0 :     *eob_ptr = 0;
     248             :   }
     249           0 : }

Generated by: LCOV version 1.13