LCOV - code coverage report
Current view: top level - third_party/aom/av1/encoder/x86 - av1_highbd_quantize_sse4.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 105 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 5 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <smmintrin.h>
      13             : #include <stdint.h>
      14             : 
      15             : #include "./av1_rtcd.h"
      16             : #include "aom_dsp/aom_dsp_common.h"
      17             : 
      18             : // Coefficient quantization phase 1
      19             : // param[0-2] : rounding/quan/dequan constants
      20           0 : static INLINE void quantize_coeff_phase1(__m128i *coeff, const __m128i *param,
      21             :                                          const int shift, const int scale,
      22             :                                          __m128i *qcoeff, __m128i *dquan,
      23             :                                          __m128i *sign) {
      24           0 :   const __m128i zero = _mm_setzero_si128();
      25           0 :   const __m128i one = _mm_set1_epi32(1);
      26             : 
      27           0 :   *sign = _mm_cmplt_epi32(*coeff, zero);
      28           0 :   *sign = _mm_or_si128(*sign, one);
      29           0 :   *coeff = _mm_abs_epi32(*coeff);
      30             : 
      31           0 :   qcoeff[0] = _mm_add_epi32(*coeff, param[0]);
      32           0 :   qcoeff[1] = _mm_unpackhi_epi32(qcoeff[0], zero);
      33           0 :   qcoeff[0] = _mm_unpacklo_epi32(qcoeff[0], zero);
      34             : 
      35           0 :   qcoeff[0] = _mm_mul_epi32(qcoeff[0], param[1]);
      36           0 :   qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift);
      37           0 :   dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]);
      38           0 :   dquan[0] = _mm_srli_epi64(dquan[0], scale);
      39           0 : }
      40             : 
      41             : // Coefficient quantization phase 2
      42           0 : static INLINE void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan,
      43             :                                          const __m128i *sign,
      44             :                                          const __m128i *param, const int shift,
      45             :                                          const int scale, tran_low_t *qAddr,
      46             :                                          tran_low_t *dqAddr) {
      47           0 :   __m128i mask0L = _mm_set_epi32(-1, -1, 0, 0);
      48           0 :   __m128i mask0H = _mm_set_epi32(0, 0, -1, -1);
      49             : 
      50           0 :   qcoeff[1] = _mm_mul_epi32(qcoeff[1], param[1]);
      51           0 :   qcoeff[1] = _mm_srli_epi64(qcoeff[1], shift);
      52           0 :   dquan[1] = _mm_mul_epi32(qcoeff[1], param[2]);
      53           0 :   dquan[1] = _mm_srli_epi64(dquan[1], scale);
      54             : 
      55             :   // combine L&H
      56           0 :   qcoeff[0] = _mm_shuffle_epi32(qcoeff[0], 0xd8);
      57           0 :   qcoeff[1] = _mm_shuffle_epi32(qcoeff[1], 0x8d);
      58             : 
      59           0 :   qcoeff[0] = _mm_and_si128(qcoeff[0], mask0H);
      60           0 :   qcoeff[1] = _mm_and_si128(qcoeff[1], mask0L);
      61             : 
      62           0 :   dquan[0] = _mm_shuffle_epi32(dquan[0], 0xd8);
      63           0 :   dquan[1] = _mm_shuffle_epi32(dquan[1], 0x8d);
      64             : 
      65           0 :   dquan[0] = _mm_and_si128(dquan[0], mask0H);
      66           0 :   dquan[1] = _mm_and_si128(dquan[1], mask0L);
      67             : 
      68           0 :   qcoeff[0] = _mm_or_si128(qcoeff[0], qcoeff[1]);
      69           0 :   dquan[0] = _mm_or_si128(dquan[0], dquan[1]);
      70             : 
      71           0 :   qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign);
      72           0 :   dquan[0] = _mm_sign_epi32(dquan[0], *sign);
      73             : 
      74           0 :   _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]);
      75           0 :   _mm_storeu_si128((__m128i *)dqAddr, dquan[0]);
      76           0 : }
      77             : 
      78           0 : static INLINE void find_eob(tran_low_t *qcoeff_ptr, const int16_t *iscan,
      79             :                             __m128i *eob) {
      80           0 :   const __m128i zero = _mm_setzero_si128();
      81             :   __m128i mask, iscanIdx;
      82           0 :   const __m128i q0 = _mm_loadu_si128((__m128i const *)qcoeff_ptr);
      83           0 :   const __m128i q1 = _mm_loadu_si128((__m128i const *)(qcoeff_ptr + 4));
      84           0 :   __m128i nz_flag0 = _mm_cmpeq_epi32(q0, zero);
      85           0 :   __m128i nz_flag1 = _mm_cmpeq_epi32(q1, zero);
      86             : 
      87           0 :   nz_flag0 = _mm_cmpeq_epi32(nz_flag0, zero);
      88           0 :   nz_flag1 = _mm_cmpeq_epi32(nz_flag1, zero);
      89             : 
      90           0 :   mask = _mm_packs_epi32(nz_flag0, nz_flag1);
      91           0 :   iscanIdx = _mm_loadu_si128((__m128i const *)iscan);
      92           0 :   iscanIdx = _mm_sub_epi16(iscanIdx, mask);
      93           0 :   iscanIdx = _mm_and_si128(iscanIdx, mask);
      94           0 :   *eob = _mm_max_epi16(*eob, iscanIdx);
      95           0 : }
      96             : 
      97           0 : static INLINE uint16_t get_accumulated_eob(__m128i *eob) {
      98             :   __m128i eob_shuffled;
      99             :   uint16_t eobValue;
     100           0 :   eob_shuffled = _mm_shuffle_epi32(*eob, 0xe);
     101           0 :   *eob = _mm_max_epi16(*eob, eob_shuffled);
     102           0 :   eob_shuffled = _mm_shufflelo_epi16(*eob, 0xe);
     103           0 :   *eob = _mm_max_epi16(*eob, eob_shuffled);
     104           0 :   eob_shuffled = _mm_shufflelo_epi16(*eob, 0x1);
     105           0 :   *eob = _mm_max_epi16(*eob, eob_shuffled);
     106           0 :   eobValue = _mm_extract_epi16(*eob, 0);
     107           0 :   return eobValue;
     108             : }
     109             : 
     110           0 : void av1_highbd_quantize_fp_sse4_1(
     111             :     const tran_low_t *coeff_ptr, intptr_t count, int skip_block,
     112             :     const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
     113             :     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     114             :     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     115             :     const int16_t *scan, const int16_t *iscan, int log_scale) {
     116             :   __m128i coeff[2], qcoeff[2], dequant[2], qparam[3], coeff_sign;
     117           0 :   __m128i eob = _mm_setzero_si128();
     118           0 :   const tran_low_t *src = coeff_ptr;
     119           0 :   tran_low_t *quanAddr = qcoeff_ptr;
     120           0 :   tran_low_t *dquanAddr = dqcoeff_ptr;
     121           0 :   const int shift = 16 - log_scale;
     122           0 :   const int coeff_stride = 4;
     123           0 :   const int quan_stride = coeff_stride;
     124             :   (void)skip_block;
     125             :   (void)zbin_ptr;
     126             :   (void)quant_shift_ptr;
     127             :   (void)scan;
     128             : 
     129           0 :   memset(quanAddr, 0, count * sizeof(quanAddr[0]));
     130           0 :   memset(dquanAddr, 0, count * sizeof(dquanAddr[0]));
     131             : 
     132           0 :   if (!skip_block) {
     133           0 :     coeff[0] = _mm_loadu_si128((__m128i const *)src);
     134             : 
     135           0 :     qparam[0] =
     136           0 :         _mm_set_epi32(round_ptr[1], round_ptr[1], round_ptr[1], round_ptr[0]);
     137           0 :     qparam[1] = _mm_set_epi64x(quant_ptr[1], quant_ptr[0]);
     138           0 :     qparam[2] = _mm_set_epi64x(dequant_ptr[1], dequant_ptr[0]);
     139             : 
     140             :     // DC and first 3 AC
     141           0 :     quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
     142             :                           &coeff_sign);
     143             : 
     144             :     // update round/quan/dquan for AC
     145           0 :     qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]);
     146           0 :     qparam[1] = _mm_set_epi64x(quant_ptr[1], quant_ptr[1]);
     147           0 :     qparam[2] = _mm_set_epi64x(dequant_ptr[1], dequant_ptr[1]);
     148             : 
     149           0 :     quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
     150             :                           log_scale, quanAddr, dquanAddr);
     151             : 
     152             :     // next 4 AC
     153           0 :     coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
     154           0 :     quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant,
     155             :                           &coeff_sign);
     156           0 :     quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
     157           0 :                           log_scale, quanAddr + quan_stride,
     158           0 :                           dquanAddr + quan_stride);
     159             : 
     160           0 :     find_eob(quanAddr, iscan, &eob);
     161             : 
     162           0 :     count -= 8;
     163             : 
     164             :     // loop for the rest of AC
     165           0 :     while (count > 0) {
     166           0 :       src += coeff_stride << 1;
     167           0 :       quanAddr += quan_stride << 1;
     168           0 :       dquanAddr += quan_stride << 1;
     169           0 :       iscan += quan_stride << 1;
     170             : 
     171           0 :       coeff[0] = _mm_loadu_si128((__m128i const *)src);
     172           0 :       coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
     173             : 
     174           0 :       quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff,
     175             :                             dequant, &coeff_sign);
     176           0 :       quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
     177             :                             log_scale, quanAddr, dquanAddr);
     178             : 
     179           0 :       quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff,
     180             :                             dequant, &coeff_sign);
     181           0 :       quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
     182           0 :                             log_scale, quanAddr + quan_stride,
     183           0 :                             dquanAddr + quan_stride);
     184             : 
     185           0 :       find_eob(quanAddr, iscan, &eob);
     186             : 
     187           0 :       count -= 8;
     188             :     }
     189           0 :     *eob_ptr = get_accumulated_eob(&eob);
     190             :   } else {
     191           0 :     *eob_ptr = 0;
     192             :   }
     193           0 : }

Generated by: LCOV version 1.13