LCOV - code coverage report
Current view: top level - third_party/aom/aom_dsp/x86 - highbd_variance_sse4.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 87 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 10 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <smmintrin.h> /* SSE4.1 */
      13             : 
      14             : #include "./aom_config.h"
      15             : #include "./aom_dsp_rtcd.h"
      16             : 
      17             : #include "aom_dsp/variance.h"
      18             : #include "aom_dsp/aom_filter.h"
      19             : 
      20           0 : static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride,
      21             :                                          const uint8_t *b8, int b_stride,
      22             :                                          uint64_t *sse, int64_t *sum) {
                         // Accumulates, over a 4x4 block of 16-bit samples, the sum of the
                         // differences (a - b) into *sum and the sum of the squared differences
                         // into *sse.
                         //
                         // a8, b8:             passed through CONVERT_TO_SHORTPTR (defined
                         //                     elsewhere) to obtain uint16_t sample pointers.
                         // a_stride, b_stride: row pitch, applied as an element offset to the
                         //                     uint16_t pointers.
                         // sse, sum:           outputs; both are always written.
      23             :   __m128i u0, u1, u2, u3;
      24             :   __m128i s0, s1, s2, s3;
      25             :   __m128i t0, t1, x0, y0;
      26             :   __m128i a0, a1, a2, a3;
      27             :   __m128i b0, b1, b2, b3;
                         // Vector of 16-bit ones: madd against it adds adjacent 16-bit lanes
                         // into 32-bit lanes without scaling them.
      28           0 :   __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1);
      29             : 
      30           0 :   uint16_t *a = CONVERT_TO_SHORTPTR(a8);
      31           0 :   uint16_t *b = CONVERT_TO_SHORTPTR(b8);
      32             : 
                         // Each load reads 64 bits, i.e. one row of four 16-bit samples.
      33           0 :   a0 = _mm_loadl_epi64((__m128i const *)(a + 0 * a_stride));
      34           0 :   a1 = _mm_loadl_epi64((__m128i const *)(a + 1 * a_stride));
      35           0 :   a2 = _mm_loadl_epi64((__m128i const *)(a + 2 * a_stride));
      36           0 :   a3 = _mm_loadl_epi64((__m128i const *)(a + 3 * a_stride));
      37             : 
      38           0 :   b0 = _mm_loadl_epi64((__m128i const *)(b + 0 * b_stride));
      39           0 :   b1 = _mm_loadl_epi64((__m128i const *)(b + 1 * b_stride));
      40           0 :   b2 = _mm_loadl_epi64((__m128i const *)(b + 2 * b_stride));
      41           0 :   b3 = _mm_loadl_epi64((__m128i const *)(b + 3 * b_stride));
      42             : 
                         // Interleave row pairs so that two rows (eight samples) fill one
                         // register. a and b are interleaved identically, so lanes still match.
      43           0 :   u0 = _mm_unpacklo_epi16(a0, a1);
      44           0 :   u1 = _mm_unpacklo_epi16(a2, a3);
      45           0 :   u2 = _mm_unpacklo_epi16(b0, b1);
      46           0 :   u3 = _mm_unpacklo_epi16(b2, b3);
      47             : 
                         // Per-sample differences in 16-bit lanes (rows 0-1 in s0, rows 2-3 in
                         // s1). NOTE(review): assumes sample values are small enough that the
                         // difference fits in int16_t (e.g. <= 12-bit data) -- confirm.
      48           0 :   s0 = _mm_sub_epi16(u0, u2);
      49           0 :   s1 = _mm_sub_epi16(u1, u3);
      50             : 
                         // Sum of differences: pairwise add to 32-bit lanes, then three
                         // horizontal adds leave the block total in every lane of y0.
      51           0 :   t0 = _mm_madd_epi16(s0, k_one_epi16);
      52           0 :   t1 = _mm_madd_epi16(s1, k_one_epi16);
      53             : 
      54           0 :   s2 = _mm_hadd_epi32(t0, t1);
      55           0 :   s3 = _mm_hadd_epi32(s2, s2);
      56           0 :   y0 = _mm_hadd_epi32(s3, s3);
      57             : 
                         // Sum of squared differences: madd of s with itself squares each lane
                         // and adds adjacent pairs; the same reduction leaves the total in x0.
      58           0 :   t0 = _mm_madd_epi16(s0, s0);
      59           0 :   t1 = _mm_madd_epi16(s1, s1);
      60             : 
      61           0 :   s2 = _mm_hadd_epi32(t0, t1);
      62           0 :   s3 = _mm_hadd_epi32(s2, s2);
      63           0 :   x0 = _mm_hadd_epi32(s3, s3);
      64             : 
                         // _mm_extract_epi32 yields a 32-bit int that is then widened.
                         // NOTE(review): the squared total is assumed to stay below 2^31 so the
                         // widening to uint64_t does not sign-extend -- confirm for the
                         // supported bit depths.
      65           0 :   *sse = (uint64_t)_mm_extract_epi32(x0, 0);
      66           0 :   *sum = (int64_t)_mm_extract_epi32(y0, 0);
      67           0 : }
      68             : 
      69           0 : uint32_t aom_highbd_8_variance4x4_sse4_1(const uint8_t *a, int a_stride,
      70             :                                          const uint8_t *b, int b_stride,
      71             :                                          uint32_t *sse) {
                         // 4x4 variance between blocks a and b with no scaling of the raw
                         // statistics. Stores the sum of squared differences (truncated to 32
                         // bits) in *sse and returns sse - sum^2 / 16, clamped to be >= 0.
      72             :   int64_t sum, diff;
      73             :   uint64_t local_sse;
      74             : 
      75           0 :   variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
      76           0 :   *sse = (uint32_t)local_sse;
      77             : 
                         // >> 4 divides by 16, the number of samples in a 4x4 block.
      78           0 :   diff = (int64_t)*sse - ((sum * sum) >> 4);
      79           0 :   return (diff >= 0) ? (uint32_t)diff : 0;
      80             : }
      81             : 
      82           0 : uint32_t aom_highbd_10_variance4x4_sse4_1(const uint8_t *a, int a_stride,
      83             :                                           const uint8_t *b, int b_stride,
      84             :                                           uint32_t *sse) {
                         // 4x4 variance between blocks a and b with the raw statistics scaled
                         // down before use. Stores the scaled sum of squared differences in
                         // *sse and returns sse - sum^2 / 16, clamped to be >= 0.
      85             :   int64_t sum, diff;
      86             :   uint64_t local_sse;
      87             : 
      88           0 :   variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
                         // ROUND_POWER_OF_TWO is defined elsewhere; presumably a rounding right
                         // shift. The shifts (sse by 4, sum by 2) look like a normalization of
                         // 10-bit statistics to an 8-bit scale -- TODO confirm.
      89           0 :   *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4);
      90           0 :   sum = ROUND_POWER_OF_TWO(sum, 2);
      91             : 
                         // >> 4 divides by 16, the number of samples in a 4x4 block.
      92           0 :   diff = (int64_t)*sse - ((sum * sum) >> 4);
      93           0 :   return (diff >= 0) ? (uint32_t)diff : 0;
      94             : }
      95             : 
      96           0 : uint32_t aom_highbd_12_variance4x4_sse4_1(const uint8_t *a, int a_stride,
      97             :                                           const uint8_t *b, int b_stride,
      98             :                                           uint32_t *sse) {
                         // 4x4 variance between blocks a and b with the raw statistics scaled
                         // down before use. Stores the scaled sum of squared differences in
                         // *sse and returns sse - sum^2 / 16, clamped to be >= 0.
      99             :   int64_t sum, diff;
     100             :   uint64_t local_sse;
     101             : 
     102           0 :   variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
                         // ROUND_POWER_OF_TWO is defined elsewhere; presumably a rounding right
                         // shift. The shifts (sse by 8, sum by 4) look like a normalization of
                         // 12-bit statistics to an 8-bit scale -- TODO confirm.
     103           0 :   *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8);
     104           0 :   sum = ROUND_POWER_OF_TWO(sum, 4);
     105             : 
                         // >> 4 divides by 16, the number of samples in a 4x4 block.
     106           0 :   diff = (int64_t)*sse - ((sum * sum) >> 4);
     107           0 :   return diff >= 0 ? (uint32_t)diff : 0;
     108             : }
     109             : 
     110             : // Sub-pixel
     111           0 : uint32_t aom_highbd_8_sub_pixel_variance4x4_sse4_1(
     112             :     const uint8_t *src, int src_stride, int xoffset, int yoffset,
     113             :     const uint8_t *dst, int dst_stride, uint32_t *sse) {
                         // Filters src in two passes into a 4x4 temporary, then returns the
                         // result of aom_highbd_8_variance4x4 between that temporary and dst
                         // (which also fills *sse).
                         // xoffset / yoffset index bilinear_filters_2t to pick the filter for
                         // the first / second pass respectively.
                         // fdata3 holds the first-pass output: (4 + 1) rows of 4 samples.
     114             :   uint16_t fdata3[(4 + 1) * 4];
     115             :   uint16_t temp2[4 * 4];
     116             : 
                         // Both filter helpers are defined elsewhere; presumably a horizontal
                         // pass followed by a vertical pass, the extra row feeding the
                         // vertical taps -- TODO confirm.
     117           0 :   aom_highbd_var_filter_block2d_bil_first_pass(
     118           0 :       src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
     119           0 :   aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
     120           0 :                                                 bilinear_filters_2t[yoffset]);
     121             : 
                         // The variance call uses the un-suffixed name, not the _sse4_1
                         // function in this file; how that name resolves is not visible here.
     122           0 :   return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, dst_stride,
     123             :                                   sse);
     124             : }
     125             : 
     126           0 : uint32_t aom_highbd_10_sub_pixel_variance4x4_sse4_1(
     127             :     const uint8_t *src, int src_stride, int xoffset, int yoffset,
     128             :     const uint8_t *dst, int dst_stride, uint32_t *sse) {
                         // Filters src in two passes into a 4x4 temporary, then returns the
                         // result of aom_highbd_10_variance4x4 between that temporary and dst
                         // (which also fills *sse).
                         // xoffset / yoffset index bilinear_filters_2t to pick the filter for
                         // the first / second pass respectively.
                         // fdata3 holds the first-pass output: (4 + 1) rows of 4 samples.
     129             :   uint16_t fdata3[(4 + 1) * 4];
     130             :   uint16_t temp2[4 * 4];
     131             : 
                         // Both filter helpers are defined elsewhere; presumably a horizontal
                         // pass followed by a vertical pass, the extra row feeding the
                         // vertical taps -- TODO confirm.
     132           0 :   aom_highbd_var_filter_block2d_bil_first_pass(
     133           0 :       src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
     134           0 :   aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
     135           0 :                                                 bilinear_filters_2t[yoffset]);
     136             : 
                         // The variance call uses the un-suffixed name, not the _sse4_1
                         // function in this file; how that name resolves is not visible here.
     137           0 :   return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
     138             :                                    dst_stride, sse);
     139             : }
     140             : 
     141           0 : uint32_t aom_highbd_12_sub_pixel_variance4x4_sse4_1(
     142             :     const uint8_t *src, int src_stride, int xoffset, int yoffset,
     143             :     const uint8_t *dst, int dst_stride, uint32_t *sse) {
                         // Filters src in two passes into a 4x4 temporary, then returns the
                         // result of aom_highbd_12_variance4x4 between that temporary and dst
                         // (which also fills *sse).
                         // xoffset / yoffset index bilinear_filters_2t to pick the filter for
                         // the first / second pass respectively.
                         // fdata3 holds the first-pass output: (4 + 1) rows of 4 samples.
     144             :   uint16_t fdata3[(4 + 1) * 4];
     145             :   uint16_t temp2[4 * 4];
     146             : 
                         // Both filter helpers are defined elsewhere; presumably a horizontal
                         // pass followed by a vertical pass, the extra row feeding the
                         // vertical taps -- TODO confirm.
     147           0 :   aom_highbd_var_filter_block2d_bil_first_pass(
     148           0 :       src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
     149           0 :   aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
     150           0 :                                                 bilinear_filters_2t[yoffset]);
     151             : 
                         // The variance call uses the un-suffixed name, not the _sse4_1
                         // function in this file; how that name resolves is not visible here.
     152           0 :   return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
     153             :                                    dst_stride, sse);
     154             : }
     155             : 
     156             : // Sub-pixel average
     157             : 
     158           0 : uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1(
     159             :     const uint8_t *src, int src_stride, int xoffset, int yoffset,
     160             :     const uint8_t *dst, int dst_stride, uint32_t *sse,
     161             :     const uint8_t *second_pred) {
                         // Filters src in two passes into a 4x4 temporary (temp2), combines it
                         // with second_pred into temp3 via aom_highbd_comp_avg_pred, then
                         // returns the result of aom_highbd_8_variance4x4 between temp3 and
                         // dst (which also fills *sse).
                         // xoffset / yoffset index bilinear_filters_2t to pick the filter for
                         // the first / second pass respectively.
                         // fdata3 holds the first-pass output: (4 + 1) rows of 4 samples.
     162             :   uint16_t fdata3[(4 + 1) * 4];
     163             :   uint16_t temp2[4 * 4];
                         // temp3 is declared with 16-byte alignment.
     164             :   DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
     165             : 
                         // Both filter helpers are defined elsewhere; presumably a horizontal
                         // pass followed by a vertical pass, the extra row feeding the
                         // vertical taps -- TODO confirm.
     166           0 :   aom_highbd_var_filter_block2d_bil_first_pass(
     167           0 :       src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
     168           0 :   aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
     169           0 :                                                 bilinear_filters_2t[yoffset]);
     170             : 
                         // Defined elsewhere; presumably averages second_pred with the
                         // filtered block sample by sample -- TODO confirm.
     171           0 :   aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
     172             :                            4);
     173             : 
                         // The variance call uses the un-suffixed name, not the _sse4_1
                         // function in this file; how that name resolves is not visible here.
     174           0 :   return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride,
     175             :                                   sse);
     176             : }
     177             : 
     178           0 : uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1(
     179             :     const uint8_t *src, int src_stride, int xoffset, int yoffset,
     180             :     const uint8_t *dst, int dst_stride, uint32_t *sse,
     181             :     const uint8_t *second_pred) {
                         // Filters src in two passes into a 4x4 temporary (temp2), combines it
                         // with second_pred into temp3 via aom_highbd_comp_avg_pred, then
                         // returns the result of aom_highbd_10_variance4x4 between temp3 and
                         // dst (which also fills *sse).
                         // xoffset / yoffset index bilinear_filters_2t to pick the filter for
                         // the first / second pass respectively.
                         // fdata3 holds the first-pass output: (4 + 1) rows of 4 samples.
     182             :   uint16_t fdata3[(4 + 1) * 4];
     183             :   uint16_t temp2[4 * 4];
                         // temp3 is declared with 16-byte alignment.
     184             :   DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
     185             : 
                         // Both filter helpers are defined elsewhere; presumably a horizontal
                         // pass followed by a vertical pass, the extra row feeding the
                         // vertical taps -- TODO confirm.
     186           0 :   aom_highbd_var_filter_block2d_bil_first_pass(
     187           0 :       src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
     188           0 :   aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
     189           0 :                                                 bilinear_filters_2t[yoffset]);
     190             : 
                         // Defined elsewhere; presumably averages second_pred with the
                         // filtered block sample by sample -- TODO confirm.
     191           0 :   aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
     192             :                            4);
     193             : 
                         // The variance call uses the un-suffixed name, not the _sse4_1
                         // function in this file; how that name resolves is not visible here.
     194           0 :   return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
     195             :                                    dst_stride, sse);
     196             : }
     197             : 
     198           0 : uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1(
     199             :     const uint8_t *src, int src_stride, int xoffset, int yoffset,
     200             :     const uint8_t *dst, int dst_stride, uint32_t *sse,
     201             :     const uint8_t *second_pred) {
                         // Filters src in two passes into a 4x4 temporary (temp2), combines it
                         // with second_pred into temp3 via aom_highbd_comp_avg_pred, then
                         // returns the result of aom_highbd_12_variance4x4 between temp3 and
                         // dst (which also fills *sse).
                         // xoffset / yoffset index bilinear_filters_2t to pick the filter for
                         // the first / second pass respectively.
                         // fdata3 holds the first-pass output: (4 + 1) rows of 4 samples.
     202             :   uint16_t fdata3[(4 + 1) * 4];
     203             :   uint16_t temp2[4 * 4];
                         // temp3 is declared with 16-byte alignment.
     204             :   DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
     205             : 
                         // Both filter helpers are defined elsewhere; presumably a horizontal
                         // pass followed by a vertical pass, the extra row feeding the
                         // vertical taps -- TODO confirm.
     206           0 :   aom_highbd_var_filter_block2d_bil_first_pass(
     207           0 :       src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
     208           0 :   aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
     209           0 :                                                 bilinear_filters_2t[yoffset]);
     210             : 
                         // Defined elsewhere; presumably averages second_pred with the
                         // filtered block sample by sample -- TODO confirm.
     211           0 :   aom_highbd_comp_avg_pred(temp3, second_pred, 4, 4, CONVERT_TO_BYTEPTR(temp2),
     212             :                            4);
     213             : 
                         // The variance call uses the un-suffixed name, not the _sse4_1
                         // function in this file; how that name resolves is not visible here.
     214           0 :   return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
     215             :                                    dst_stride, sse);
     216             : }

Generated by: LCOV version 1.13