LCOV - code coverage report
Current view: top level - third_party/aom/aom_dsp/x86 - avg_intrin_sse2.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 280 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 8 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <emmintrin.h>
      13             : 
      14             : #include "aom_dsp/x86/synonyms.h"
      15             : 
      16             : #include "./aom_dsp_rtcd.h"
      17             : #include "aom_ports/mem.h"
      18             : // Finds the min and max of |s[i] - d[i]| over an 8x8 byte block (row strides p and dp) and stores them in *min / *max.
      19           0 : void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
      20             :                          int *min, int *max) {
      21             :   __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
      22           0 :   u0 = _mm_setzero_si128();  // zero vector: widens bytes to 16 bits and serves as the 0 in (0 - diff)
      23             :   // Row 0: load 8 bytes from each block, zero-extend to 16-bit lanes, then abs = max(diff, -diff)
      24           0 :   s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
      25           0 :   d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
      26           0 :   diff = _mm_subs_epi16(s0, d0);
      27           0 :   negdiff = _mm_subs_epi16(u0, diff);
      28           0 :   absdiff0 = _mm_max_epi16(diff, negdiff);
      29             :   // Row 1: same computation; rows 0 and 1 seed the per-lane running max and min
      30           0 :   s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
      31           0 :   d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
      32           0 :   diff = _mm_subs_epi16(s0, d0);
      33           0 :   negdiff = _mm_subs_epi16(u0, diff);
      34           0 :   absdiff = _mm_max_epi16(diff, negdiff);
      35           0 :   maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
      36           0 :   minabsdiff = _mm_min_epi16(absdiff0, absdiff);
      37             :   // Row 2: rows 2..7 each fold their absolute differences into the running max/min
      38           0 :   s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
      39           0 :   d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
      40           0 :   diff = _mm_subs_epi16(s0, d0);
      41           0 :   negdiff = _mm_subs_epi16(u0, diff);
      42           0 :   absdiff = _mm_max_epi16(diff, negdiff);
      43           0 :   maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
      44           0 :   minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
      45             :   // Row 3
      46           0 :   s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
      47           0 :   d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
      48           0 :   diff = _mm_subs_epi16(s0, d0);
      49           0 :   negdiff = _mm_subs_epi16(u0, diff);
      50           0 :   absdiff = _mm_max_epi16(diff, negdiff);
      51           0 :   maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
      52           0 :   minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
      53             :   // Row 4
      54           0 :   s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
      55           0 :   d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
      56           0 :   diff = _mm_subs_epi16(s0, d0);
      57           0 :   negdiff = _mm_subs_epi16(u0, diff);
      58           0 :   absdiff = _mm_max_epi16(diff, negdiff);
      59           0 :   maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
      60           0 :   minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
      61             :   // Row 5
      62           0 :   s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
      63           0 :   d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
      64           0 :   diff = _mm_subs_epi16(s0, d0);
      65           0 :   negdiff = _mm_subs_epi16(u0, diff);
      66           0 :   absdiff = _mm_max_epi16(diff, negdiff);
      67           0 :   maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
      68           0 :   minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
      69             :   // Row 6
      70           0 :   s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
      71           0 :   d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
      72           0 :   diff = _mm_subs_epi16(s0, d0);
      73           0 :   negdiff = _mm_subs_epi16(u0, diff);
      74           0 :   absdiff = _mm_max_epi16(diff, negdiff);
      75           0 :   maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
      76           0 :   minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
      77             :   // Row 7
      78           0 :   s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
      79           0 :   d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
      80           0 :   diff = _mm_subs_epi16(s0, d0);
      81           0 :   negdiff = _mm_subs_epi16(u0, diff);
      82           0 :   absdiff = _mm_max_epi16(diff, negdiff);
      83           0 :   maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
      84           0 :   minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
      85             : // Horizontal reduction: fold the 8 lanes onto lane 0 (8 -> 4 -> 2 -> 1) by shifting and taking the max
      86           0 :   maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
      87           0 :   maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
      88           0 :   maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
      89           0 :   *max = _mm_extract_epi16(maxabsdiff, 0);
      90             : // Same folding with min; the shifts pull zeros into the upper lanes, so only lane 0 is extracted
      91           0 :   minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
      92           0 :   minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
      93           0 :   minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
      94           0 :   *min = _mm_extract_epi16(minabsdiff, 0);
      95           0 : }
      96             : // In-place 8-point add/subtract butterfly (three stages, 16-bit lanes) across in[0..7]; when iter == 0 the result is also transposed.
      97           0 : static void hadamard_col8_sse2(__m128i *in, int iter) {
      98           0 :   __m128i a0 = in[0];
      99           0 :   __m128i a1 = in[1];
     100           0 :   __m128i a2 = in[2];
     101           0 :   __m128i a3 = in[3];
     102           0 :   __m128i a4 = in[4];
     103           0 :   __m128i a5 = in[5];
     104           0 :   __m128i a6 = in[6];
     105           0 :   __m128i a7 = in[7];
     106             : // Stage 1: sum and difference of adjacent vector pairs (0,1) (2,3) (4,5) (6,7)
     107           0 :   __m128i b0 = _mm_add_epi16(a0, a1);
     108           0 :   __m128i b1 = _mm_sub_epi16(a0, a1);
     109           0 :   __m128i b2 = _mm_add_epi16(a2, a3);
     110           0 :   __m128i b3 = _mm_sub_epi16(a2, a3);
     111           0 :   __m128i b4 = _mm_add_epi16(a4, a5);
     112           0 :   __m128i b5 = _mm_sub_epi16(a4, a5);
     113           0 :   __m128i b6 = _mm_add_epi16(a6, a7);
     114           0 :   __m128i b7 = _mm_sub_epi16(a6, a7);
     115             : // Stage 2: sum and difference of stage-1 results two apart
     116           0 :   a0 = _mm_add_epi16(b0, b2);
     117           0 :   a1 = _mm_add_epi16(b1, b3);
     118           0 :   a2 = _mm_sub_epi16(b0, b2);
     119           0 :   a3 = _mm_sub_epi16(b1, b3);
     120           0 :   a4 = _mm_add_epi16(b4, b6);
     121           0 :   a5 = _mm_add_epi16(b5, b7);
     122           0 :   a6 = _mm_sub_epi16(b4, b6);
     123           0 :   a7 = _mm_sub_epi16(b5, b7);
     124             : // Stage 3 pairs results four apart; both branches use the output slot order 0,7,3,4,2,6,1,5
     125           0 :   if (iter == 0) {
     126           0 :     b0 = _mm_add_epi16(a0, a4);
     127           0 :     b7 = _mm_add_epi16(a1, a5);
     128           0 :     b3 = _mm_add_epi16(a2, a6);
     129           0 :     b4 = _mm_add_epi16(a3, a7);
     130           0 :     b2 = _mm_sub_epi16(a0, a4);
     131           0 :     b6 = _mm_sub_epi16(a1, a5);
     132           0 :     b1 = _mm_sub_epi16(a2, a6);
     133           0 :     b5 = _mm_sub_epi16(a3, a7);
     134             : // 8x8 transpose of the 16-bit lanes, step 1: interleave 16-bit elements of vector pairs
     135           0 :     a0 = _mm_unpacklo_epi16(b0, b1);
     136           0 :     a1 = _mm_unpacklo_epi16(b2, b3);
     137           0 :     a2 = _mm_unpackhi_epi16(b0, b1);
     138           0 :     a3 = _mm_unpackhi_epi16(b2, b3);
     139           0 :     a4 = _mm_unpacklo_epi16(b4, b5);
     140           0 :     a5 = _mm_unpacklo_epi16(b6, b7);
     141           0 :     a6 = _mm_unpackhi_epi16(b4, b5);
     142           0 :     a7 = _mm_unpackhi_epi16(b6, b7);
     143             : // Transpose step 2: interleave 32-bit groups
     144           0 :     b0 = _mm_unpacklo_epi32(a0, a1);
     145           0 :     b1 = _mm_unpacklo_epi32(a4, a5);
     146           0 :     b2 = _mm_unpackhi_epi32(a0, a1);
     147           0 :     b3 = _mm_unpackhi_epi32(a4, a5);
     148           0 :     b4 = _mm_unpacklo_epi32(a2, a3);
     149           0 :     b5 = _mm_unpacklo_epi32(a6, a7);
     150           0 :     b6 = _mm_unpackhi_epi32(a2, a3);
     151           0 :     b7 = _mm_unpackhi_epi32(a6, a7);
     152             : // Transpose step 3: interleave 64-bit halves; in[k] ends up holding element k of each stage-3 vector
     153           0 :     in[0] = _mm_unpacklo_epi64(b0, b1);
     154           0 :     in[1] = _mm_unpackhi_epi64(b0, b1);
     155           0 :     in[2] = _mm_unpacklo_epi64(b2, b3);
     156           0 :     in[3] = _mm_unpackhi_epi64(b2, b3);
     157           0 :     in[4] = _mm_unpacklo_epi64(b4, b5);
     158           0 :     in[5] = _mm_unpackhi_epi64(b4, b5);
     159           0 :     in[6] = _mm_unpacklo_epi64(b6, b7);
     160           0 :     in[7] = _mm_unpackhi_epi64(b6, b7);
     161             :   } else {  // no transpose: stage-3 outputs are written straight into their slots
     162           0 :     in[0] = _mm_add_epi16(a0, a4);
     163           0 :     in[7] = _mm_add_epi16(a1, a5);
     164           0 :     in[3] = _mm_add_epi16(a2, a6);
     165           0 :     in[4] = _mm_add_epi16(a3, a7);
     166           0 :     in[2] = _mm_sub_epi16(a0, a4);
     167           0 :     in[6] = _mm_sub_epi16(a1, a5);
     168           0 :     in[1] = _mm_sub_epi16(a2, a6);
     169           0 :     in[5] = _mm_sub_epi16(a3, a7);
     170             :   }
     171           0 : }
     172             : // 8x8 transform of src_diff (row stride src_stride, counted in int16_t elements) into 64 contiguous values at coeff.
     173           0 : void aom_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
     174             :                            int16_t *coeff) {
     175             :   __m128i src[8];  // one vector per input row (8 x int16_t)
     176           0 :   src[0] = _mm_load_si128((const __m128i *)src_diff);  // aligned load: each row must be 16-byte aligned
     177           0 :   src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
     178           0 :   src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
     179           0 :   src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
     180           0 :   src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
     181           0 :   src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
     182           0 :   src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
     183           0 :   src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
     184             : // Pass 0 runs the butterfly and transposes; pass 1 runs the butterfly again on the transposed data
     185           0 :   hadamard_col8_sse2(src, 0);
     186           0 :   hadamard_col8_sse2(src, 1);
     187             : // Write the eight result vectors back to back, 8 coefficients each (aligned stores)
     188           0 :   _mm_store_si128((__m128i *)coeff, src[0]);
     189           0 :   coeff += 8;
     190           0 :   _mm_store_si128((__m128i *)coeff, src[1]);
     191           0 :   coeff += 8;
     192           0 :   _mm_store_si128((__m128i *)coeff, src[2]);
     193           0 :   coeff += 8;
     194           0 :   _mm_store_si128((__m128i *)coeff, src[3]);
     195           0 :   coeff += 8;
     196           0 :   _mm_store_si128((__m128i *)coeff, src[4]);
     197           0 :   coeff += 8;
     198           0 :   _mm_store_si128((__m128i *)coeff, src[5]);
     199           0 :   coeff += 8;
     200           0 :   _mm_store_si128((__m128i *)coeff, src[6]);
     201           0 :   coeff += 8;
     202           0 :   _mm_store_si128((__m128i *)coeff, src[7]);
     203           0 : }
     204             : // 16x16 transform: runs the 8x8 transform on each quadrant, then combines the four 64-coefficient sub-blocks in coeff (256 values total).
     205           0 : void aom_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
     206             :                              int16_t *coeff) {
     207             :   int idx;
     208           0 :   for (idx = 0; idx < 4; ++idx) {
     209           0 :     int16_t const *src_ptr =
     210           0 :         src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;  // bit 1 of idx selects the row half, bit 0 the column half
     211           0 :     aom_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
     212             :   }
     213             : // Combine pass: each iteration takes 8 coefficients at the same offset from all four sub-blocks
     214           0 :   for (idx = 0; idx < 64; idx += 8) {
     215           0 :     __m128i coeff0 = _mm_load_si128((const __m128i *)coeff);
     216           0 :     __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64));
     217           0 :     __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128));
     218           0 :     __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192));
     219             : // First add/subtract stage: pairs sub-block 0 with 1 and sub-block 2 with 3
     220           0 :     __m128i b0 = _mm_add_epi16(coeff0, coeff1);
     221           0 :     __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
     222           0 :     __m128i b2 = _mm_add_epi16(coeff2, coeff3);
     223           0 :     __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
     224             : // Arithmetic shift right by 1 between the stages (presumably to limit growth in 16 bits -- confirm)
     225           0 :     b0 = _mm_srai_epi16(b0, 1);
     226           0 :     b1 = _mm_srai_epi16(b1, 1);
     227           0 :     b2 = _mm_srai_epi16(b2, 1);
     228           0 :     b3 = _mm_srai_epi16(b3, 1);
     229             : // Second stage: sums go back to sub-blocks 0 and 1, differences to sub-blocks 2 and 3
     230           0 :     coeff0 = _mm_add_epi16(b0, b2);
     231           0 :     coeff1 = _mm_add_epi16(b1, b3);
     232             :     _mm_store_si128((__m128i *)coeff, coeff0);
     233           0 :     _mm_store_si128((__m128i *)(coeff + 64), coeff1);
     234             : 
     235           0 :     coeff2 = _mm_sub_epi16(b0, b2);
     236           0 :     coeff3 = _mm_sub_epi16(b1, b3);
     237           0 :     _mm_store_si128((__m128i *)(coeff + 128), coeff2);
     238           0 :     _mm_store_si128((__m128i *)(coeff + 192), coeff3);
     239             : 
     240           0 :     coeff += 8;
     241             :   }
     242           0 : }
     243             : // Returns the sum of absolute values of the coefficients, read 8 at a time with aligned loads until i reaches length.
     244           0 : int aom_satd_sse2(const int16_t *coeff, int length) {
     245             :   int i;
     246           0 :   const __m128i zero = _mm_setzero_si128();
     247           0 :   __m128i accum = zero;  // four 32-bit partial sums
     248             : // Whole groups of 8 are always consumed, so a length that is not a multiple of 8 reads past coeff[length - 1]
     249           0 :   for (i = 0; i < length; i += 8) {
     250           0 :     const __m128i src_line = _mm_load_si128((const __m128i *)coeff);
     251           0 :     const __m128i inv = _mm_sub_epi16(zero, src_line);  // -src_line
     252           0 :     const __m128i abs = _mm_max_epi16(src_line, inv);  // abs(src_line)
     253           0 :     const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);  // widen lanes 0-3 to 32 bits
     254           0 :     const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);  // widen lanes 4-7 to 32 bits
     255           0 :     const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
     256           0 :     accum = _mm_add_epi32(accum, sum);
     257           0 :     coeff += 8;
     258             :   }
     259             : 
     260             :   {  // cascading summation of accum: fold 4 lanes -> 2 -> 1 so lane 0 holds the total
     261           0 :     __m128i hi = _mm_srli_si128(accum, 8);
     262           0 :     accum = _mm_add_epi32(accum, hi);
     263           0 :     hi = _mm_srli_epi64(accum, 32);
     264           0 :     accum = _mm_add_epi32(accum, hi);
     265             :   }
     266             : // The result is the low 32-bit lane
     267           0 :   return _mm_cvtsi128_si32(accum);
     268             : }
     269             : // Sums each of 16 adjacent byte columns of ref over the rows (saturating 16-bit adds), right-shifts the sums, and writes the 16 results to hbuf.
     270           0 : void aom_int_pro_row_sse2(int16_t *hbuf, uint8_t const *ref, int ref_stride,
     271             :                           int height) {
     272             :   int idx;
     273           0 :   __m128i zero = _mm_setzero_si128();
     274           0 :   __m128i src_line = _mm_loadu_si128((const __m128i *)ref);  // first row, unaligned load
     275           0 :   __m128i s0 = _mm_unpacklo_epi8(src_line, zero);  // columns 0-7 widened to 16 bits
     276           0 :   __m128i s1 = _mm_unpackhi_epi8(src_line, zero);  // columns 8-15 widened to 16 bits
     277             :   __m128i t0, t1;
     278           0 :   int height_1 = height - 1;
     279           0 :   ref += ref_stride;
     280             : // Middle rows, two per iteration. NOTE(review): first + loop + last covers exactly height rows only when height is even -- confirm callers
     281           0 :   for (idx = 1; idx < height_1; idx += 2) {
     282           0 :     src_line = _mm_loadu_si128((const __m128i *)ref);
     283           0 :     t0 = _mm_unpacklo_epi8(src_line, zero);
     284           0 :     t1 = _mm_unpackhi_epi8(src_line, zero);
     285           0 :     s0 = _mm_adds_epu16(s0, t0);
     286           0 :     s1 = _mm_adds_epu16(s1, t1);
     287           0 :     ref += ref_stride;
     288             : 
     289           0 :     src_line = _mm_loadu_si128((const __m128i *)ref);
     290           0 :     t0 = _mm_unpacklo_epi8(src_line, zero);
     291           0 :     t1 = _mm_unpackhi_epi8(src_line, zero);
     292           0 :     s0 = _mm_adds_epu16(s0, t0);
     293           0 :     s1 = _mm_adds_epu16(s1, t1);
     294           0 :     ref += ref_stride;
     295             :   }
     296             : // Last row is added outside the loop
     297           0 :   src_line = _mm_loadu_si128((const __m128i *)ref);
     298           0 :   t0 = _mm_unpacklo_epi8(src_line, zero);
     299           0 :   t1 = _mm_unpackhi_epi8(src_line, zero);
     300           0 :   s0 = _mm_adds_epu16(s0, t0);
     301           0 :   s1 = _mm_adds_epu16(s1, t1);
     302             : // Scale the column sums: >> 5 for height 64, >> 4 for height 32, >> 3 for any other height
     303           0 :   if (height == 64) {
     304           0 :     s0 = _mm_srai_epi16(s0, 5);
     305           0 :     s1 = _mm_srai_epi16(s1, 5);
     306           0 :   } else if (height == 32) {
     307           0 :     s0 = _mm_srai_epi16(s0, 4);
     308           0 :     s1 = _mm_srai_epi16(s1, 4);
     309             :   } else {
     310           0 :     s0 = _mm_srai_epi16(s0, 3);
     311           0 :     s1 = _mm_srai_epi16(s1, 3);
     312             :   }
     313             : // hbuf receives columns 0-7 followed by columns 8-15 (unaligned stores)
     314             :   _mm_storeu_si128((__m128i *)hbuf, s0);
     315           0 :   hbuf += 8;
     316             :   _mm_storeu_si128((__m128i *)hbuf, s1);
     317           0 : }
     318             : // Returns the sum of the bytes starting at ref, taken 16 per step with aligned loads, as the low 16 bits of the total.
     319           0 : int16_t aom_int_pro_col_sse2(uint8_t const *ref, int width) {
     320           0 :   __m128i zero = _mm_setzero_si128();
     321           0 :   __m128i src_line = _mm_load_si128((const __m128i *)ref);
     322           0 :   __m128i s0 = _mm_sad_epu8(src_line, zero);  // SAD against zero: byte sum of each 8-byte half
     323             :   __m128i s1;
     324             :   int i;
     325             : // The first 16 bytes are always summed above; the loop adds further 16-byte groups while i < width
     326           0 :   for (i = 16; i < width; i += 16) {
     327           0 :     ref += 16;
     328           0 :     src_line = _mm_load_si128((const __m128i *)ref);
     329           0 :     s1 = _mm_sad_epu8(src_line, zero);
     330           0 :     s0 = _mm_adds_epu16(s0, s1);
     331             :   }
     332             : // Add the upper 64-bit half sum onto the lower one
     333           0 :   s1 = _mm_srli_si128(s0, 8);
     334           0 :   s0 = _mm_adds_epu16(s0, s1);
     335             : // The result is 16-bit lane 0
     336           0 :   return _mm_extract_epi16(s0, 0);
     337             : }
     338             : // Over 4 << bwl int16_t elements, returns sum(diff^2) - ((sum(diff))^2 >> (bwl + 2)) where diff = ref - src (saturating subtract).
     339           0 : int aom_vector_var_sse2(int16_t const *ref, int16_t const *src, int bwl) {
     340             :   int idx;
     341           0 :   int width = 4 << bwl;
     342             :   int16_t mean;  // despite the name, this receives the 16-bit sum of the differences
     343           0 :   __m128i v0 = _mm_loadu_si128((const __m128i *)ref);  // ref is read with unaligned loads
     344           0 :   __m128i v1 = _mm_load_si128((const __m128i *)src);  // src is read with aligned loads
     345           0 :   __m128i diff = _mm_subs_epi16(v0, v1);
     346           0 :   __m128i sum = diff;  // eight 16-bit running sums of diff
     347           0 :   __m128i sse = _mm_madd_epi16(diff, diff);  // four 32-bit running sums of diff^2
     348             : // The first 8 elements are handled above; the loop covers the remaining groups of 8
     349           0 :   ref += 8;
     350           0 :   src += 8;
     351             : 
     352           0 :   for (idx = 8; idx < width; idx += 8) {
     353           0 :     v0 = _mm_loadu_si128((const __m128i *)ref);
     354           0 :     v1 = _mm_load_si128((const __m128i *)src);
     355           0 :     diff = _mm_subs_epi16(v0, v1);
     356             : 
     357           0 :     sum = _mm_add_epi16(sum, diff);
     358           0 :     v0 = _mm_madd_epi16(diff, diff);  // v0 is reused as scratch for the squared differences
     359           0 :     sse = _mm_add_epi32(sse, v0);
     360             : 
     361           0 :     ref += 8;
     362           0 :     src += 8;
     363             :   }
     364             : // Horizontal add of the eight 16-bit sums into lane 0 (8 -> 4 -> 2 -> 1)
     365           0 :   v0 = _mm_srli_si128(sum, 8);
     366           0 :   sum = _mm_add_epi16(sum, v0);
     367           0 :   v0 = _mm_srli_epi64(sum, 32);
     368           0 :   sum = _mm_add_epi16(sum, v0);
     369           0 :   v0 = _mm_srli_epi32(sum, 16);
     370           0 :   sum = _mm_add_epi16(sum, v0);
     371             : // Horizontal add of the four 32-bit squared-difference sums into lane 0
     372           0 :   v1 = _mm_srli_si128(sse, 8);
     373           0 :   sse = _mm_add_epi32(sse, v1);
     374           0 :   v1 = _mm_srli_epi64(sse, 32);
     375           0 :   sse = _mm_add_epi32(sse, v1);
     376             : 
     377           0 :   mean = _mm_extract_epi16(sum, 0);
     378             : // width == 4 << bwl == 1 << (bwl + 2), so the shift divides the squared sum by the element count
     379           0 :   return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
     380             : }

Generated by: LCOV version 1.13