LCOV - code coverage report
Current view: top level - third_party/aom/aom_dsp/x86 - highbd_subtract_sse2.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 235 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 18 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <assert.h>
      13             : #include <emmintrin.h>
      14             : #include <stddef.h>
      15             : 
      16             : #include "./aom_config.h"
      17             : #include "./aom_dsp_rtcd.h"
      18             : 
      19             : typedef void (*SubtractWxHFuncType)(int16_t *diff, ptrdiff_t diff_stride,
      20             :                                     const uint16_t *src, ptrdiff_t src_stride,
      21             :                                     const uint16_t *pred,
      22             :                                     ptrdiff_t pred_stride);
      23             : 
      24           0 : static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride,
      25             :                          const uint16_t *src, ptrdiff_t src_stride,
      26             :                          const uint16_t *pred, ptrdiff_t pred_stride) {
      27             :   __m128i u0, u1, u2, u3;
      28             :   __m128i v0, v1, v2, v3;
      29             :   __m128i x0, x1, x2, x3;
      30           0 :   int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride);
      31             : 
      32           0 :   u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
      33           0 :   u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
      34           0 :   u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
      35           0 :   u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
      36             : 
      37           0 :   v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
      38           0 :   v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
      39           0 :   v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
      40           0 :   v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
      41             : 
      42           0 :   x0 = _mm_sub_epi16(u0, v0);
      43           0 :   x1 = _mm_sub_epi16(u1, v1);
      44           0 :   x2 = _mm_sub_epi16(u2, v2);
      45           0 :   x3 = _mm_sub_epi16(u3, v3);
      46             : 
      47             :   _mm_storel_epi64((__m128i *)store_diff, x0);
      48           0 :   store_diff = (int64_t *)(diff + 1 * diff_stride);
      49             :   _mm_storel_epi64((__m128i *)store_diff, x1);
      50           0 :   store_diff = (int64_t *)(diff + 2 * diff_stride);
      51             :   _mm_storel_epi64((__m128i *)store_diff, x2);
      52           0 :   store_diff = (int64_t *)(diff + 3 * diff_stride);
      53             :   _mm_storel_epi64((__m128i *)store_diff, x3);
      54           0 : }
      55             : 
      56           0 : static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride,
      57             :                          const uint16_t *src, ptrdiff_t src_stride,
      58             :                          const uint16_t *pred, ptrdiff_t pred_stride) {
      59             :   __m128i u0, u1, u2, u3, u4, u5, u6, u7;
      60             :   __m128i v0, v1, v2, v3, v4, v5, v6, v7;
      61             :   __m128i x0, x1, x2, x3, x4, x5, x6, x7;
      62           0 :   int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride);
      63             : 
      64           0 :   u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
      65           0 :   u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
      66           0 :   u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
      67           0 :   u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
      68           0 :   u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
      69           0 :   u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
      70           0 :   u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
      71           0 :   u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
      72             : 
      73           0 :   v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
      74           0 :   v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
      75           0 :   v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
      76           0 :   v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
      77           0 :   v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride));
      78           0 :   v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride));
      79           0 :   v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride));
      80           0 :   v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride));
      81             : 
      82           0 :   x0 = _mm_sub_epi16(u0, v0);
      83           0 :   x1 = _mm_sub_epi16(u1, v1);
      84           0 :   x2 = _mm_sub_epi16(u2, v2);
      85           0 :   x3 = _mm_sub_epi16(u3, v3);
      86           0 :   x4 = _mm_sub_epi16(u4, v4);
      87           0 :   x5 = _mm_sub_epi16(u5, v5);
      88           0 :   x6 = _mm_sub_epi16(u6, v6);
      89           0 :   x7 = _mm_sub_epi16(u7, v7);
      90             : 
      91             :   _mm_storel_epi64((__m128i *)store_diff, x0);
      92           0 :   store_diff = (int64_t *)(diff + 1 * diff_stride);
      93             :   _mm_storel_epi64((__m128i *)store_diff, x1);
      94           0 :   store_diff = (int64_t *)(diff + 2 * diff_stride);
      95             :   _mm_storel_epi64((__m128i *)store_diff, x2);
      96           0 :   store_diff = (int64_t *)(diff + 3 * diff_stride);
      97             :   _mm_storel_epi64((__m128i *)store_diff, x3);
      98           0 :   store_diff = (int64_t *)(diff + 4 * diff_stride);
      99             :   _mm_storel_epi64((__m128i *)store_diff, x4);
     100           0 :   store_diff = (int64_t *)(diff + 5 * diff_stride);
     101             :   _mm_storel_epi64((__m128i *)store_diff, x5);
     102           0 :   store_diff = (int64_t *)(diff + 6 * diff_stride);
     103             :   _mm_storel_epi64((__m128i *)store_diff, x6);
     104           0 :   store_diff = (int64_t *)(diff + 7 * diff_stride);
     105             :   _mm_storel_epi64((__m128i *)store_diff, x7);
     106           0 : }
     107             : 
     108           0 : static void subtract_8x4(int16_t *diff, ptrdiff_t diff_stride,
     109             :                          const uint16_t *src, ptrdiff_t src_stride,
     110             :                          const uint16_t *pred, ptrdiff_t pred_stride) {
     111             :   __m128i u0, u1, u2, u3;
     112             :   __m128i v0, v1, v2, v3;
     113             :   __m128i x0, x1, x2, x3;
     114             : 
     115           0 :   u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
     116           0 :   u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
     117           0 :   u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
     118           0 :   u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
     119             : 
     120           0 :   v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
     121           0 :   v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
     122           0 :   v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
     123           0 :   v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
     124             : 
     125           0 :   x0 = _mm_sub_epi16(u0, v0);
     126           0 :   x1 = _mm_sub_epi16(u1, v1);
     127           0 :   x2 = _mm_sub_epi16(u2, v2);
     128           0 :   x3 = _mm_sub_epi16(u3, v3);
     129             : 
     130             :   _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0);
     131           0 :   _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1);
     132           0 :   _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2);
     133           0 :   _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3);
     134           0 : }
     135             : 
     136           0 : static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride,
     137             :                          const uint16_t *src, ptrdiff_t src_stride,
     138             :                          const uint16_t *pred, ptrdiff_t pred_stride) {
     139             :   __m128i u0, u1, u2, u3, u4, u5, u6, u7;
     140             :   __m128i v0, v1, v2, v3, v4, v5, v6, v7;
     141             :   __m128i x0, x1, x2, x3, x4, x5, x6, x7;
     142             : 
     143           0 :   u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
     144           0 :   u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
     145           0 :   u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
     146           0 :   u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
     147           0 :   u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
     148           0 :   u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
     149           0 :   u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
     150           0 :   u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
     151             : 
     152           0 :   v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
     153           0 :   v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
     154           0 :   v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
     155           0 :   v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
     156           0 :   v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride));
     157           0 :   v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride));
     158           0 :   v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride));
     159           0 :   v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride));
     160             : 
     161           0 :   x0 = _mm_sub_epi16(u0, v0);
     162           0 :   x1 = _mm_sub_epi16(u1, v1);
     163           0 :   x2 = _mm_sub_epi16(u2, v2);
     164           0 :   x3 = _mm_sub_epi16(u3, v3);
     165           0 :   x4 = _mm_sub_epi16(u4, v4);
     166           0 :   x5 = _mm_sub_epi16(u5, v5);
     167           0 :   x6 = _mm_sub_epi16(u6, v6);
     168           0 :   x7 = _mm_sub_epi16(u7, v7);
     169             : 
     170             :   _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0);
     171           0 :   _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1);
     172           0 :   _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2);
     173           0 :   _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3);
     174           0 :   _mm_storeu_si128((__m128i *)(diff + 4 * diff_stride), x4);
     175           0 :   _mm_storeu_si128((__m128i *)(diff + 5 * diff_stride), x5);
     176           0 :   _mm_storeu_si128((__m128i *)(diff + 6 * diff_stride), x6);
     177           0 :   _mm_storeu_si128((__m128i *)(diff + 7 * diff_stride), x7);
     178           0 : }
     179             : 
     180           0 : static void subtract_8x16(int16_t *diff, ptrdiff_t diff_stride,
     181             :                           const uint16_t *src, ptrdiff_t src_stride,
     182             :                           const uint16_t *pred, ptrdiff_t pred_stride) {
     183           0 :   subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
     184           0 :   diff += diff_stride << 3;
     185           0 :   src += src_stride << 3;
     186           0 :   pred += pred_stride << 3;
     187           0 :   subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
     188           0 : }
     189             : 
     190           0 : static void subtract_16x8(int16_t *diff, ptrdiff_t diff_stride,
     191             :                           const uint16_t *src, ptrdiff_t src_stride,
     192             :                           const uint16_t *pred, ptrdiff_t pred_stride) {
     193           0 :   subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
     194           0 :   diff += 8;
     195           0 :   src += 8;
     196           0 :   pred += 8;
     197           0 :   subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
     198           0 : }
     199             : 
     200           0 : static void subtract_16x16(int16_t *diff, ptrdiff_t diff_stride,
     201             :                            const uint16_t *src, ptrdiff_t src_stride,
     202             :                            const uint16_t *pred, ptrdiff_t pred_stride) {
     203           0 :   subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride);
     204           0 :   diff += diff_stride << 3;
     205           0 :   src += src_stride << 3;
     206           0 :   pred += pred_stride << 3;
     207           0 :   subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride);
     208           0 : }
     209             : 
     210           0 : static void subtract_16x32(int16_t *diff, ptrdiff_t diff_stride,
     211             :                            const uint16_t *src, ptrdiff_t src_stride,
     212             :                            const uint16_t *pred, ptrdiff_t pred_stride) {
     213           0 :   subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
     214           0 :   diff += diff_stride << 4;
     215           0 :   src += src_stride << 4;
     216           0 :   pred += pred_stride << 4;
     217           0 :   subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
     218           0 : }
     219             : 
     220           0 : static void subtract_32x16(int16_t *diff, ptrdiff_t diff_stride,
     221             :                            const uint16_t *src, ptrdiff_t src_stride,
     222             :                            const uint16_t *pred, ptrdiff_t pred_stride) {
     223           0 :   subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
     224           0 :   diff += 16;
     225           0 :   src += 16;
     226           0 :   pred += 16;
     227           0 :   subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
     228           0 : }
     229             : 
     230           0 : static void subtract_32x32(int16_t *diff, ptrdiff_t diff_stride,
     231             :                            const uint16_t *src, ptrdiff_t src_stride,
     232             :                            const uint16_t *pred, ptrdiff_t pred_stride) {
     233           0 :   subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride);
     234           0 :   diff += diff_stride << 4;
     235           0 :   src += src_stride << 4;
     236           0 :   pred += pred_stride << 4;
     237           0 :   subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride);
     238           0 : }
     239             : 
     240           0 : static void subtract_32x64(int16_t *diff, ptrdiff_t diff_stride,
     241             :                            const uint16_t *src, ptrdiff_t src_stride,
     242             :                            const uint16_t *pred, ptrdiff_t pred_stride) {
     243           0 :   subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
     244           0 :   diff += diff_stride << 5;
     245           0 :   src += src_stride << 5;
     246           0 :   pred += pred_stride << 5;
     247           0 :   subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
     248           0 : }
     249             : 
     250           0 : static void subtract_64x32(int16_t *diff, ptrdiff_t diff_stride,
     251             :                            const uint16_t *src, ptrdiff_t src_stride,
     252             :                            const uint16_t *pred, ptrdiff_t pred_stride) {
     253           0 :   subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
     254           0 :   diff += 32;
     255           0 :   src += 32;
     256           0 :   pred += 32;
     257           0 :   subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
     258           0 : }
     259             : 
     260           0 : static void subtract_64x64(int16_t *diff, ptrdiff_t diff_stride,
     261             :                            const uint16_t *src, ptrdiff_t src_stride,
     262             :                            const uint16_t *pred, ptrdiff_t pred_stride) {
     263           0 :   subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride);
     264           0 :   diff += diff_stride << 5;
     265           0 :   src += src_stride << 5;
     266           0 :   pred += pred_stride << 5;
     267           0 :   subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride);
     268           0 : }
     269             : 
     270           0 : static void subtract_64x128(int16_t *diff, ptrdiff_t diff_stride,
     271             :                             const uint16_t *src, ptrdiff_t src_stride,
     272             :                             const uint16_t *pred, ptrdiff_t pred_stride) {
     273           0 :   subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
     274           0 :   diff += diff_stride << 6;
     275           0 :   src += src_stride << 6;
     276           0 :   pred += pred_stride << 6;
     277           0 :   subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
     278           0 : }
     279             : 
     280           0 : static void subtract_128x64(int16_t *diff, ptrdiff_t diff_stride,
     281             :                             const uint16_t *src, ptrdiff_t src_stride,
     282             :                             const uint16_t *pred, ptrdiff_t pred_stride) {
     283           0 :   subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
     284           0 :   diff += 64;
     285           0 :   src += 64;
     286           0 :   pred += 64;
     287           0 :   subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
     288           0 : }
     289             : 
     290           0 : static void subtract_128x128(int16_t *diff, ptrdiff_t diff_stride,
     291             :                              const uint16_t *src, ptrdiff_t src_stride,
     292             :                              const uint16_t *pred, ptrdiff_t pred_stride) {
     293           0 :   subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride);
     294           0 :   diff += diff_stride << 6;
     295           0 :   src += src_stride << 6;
     296           0 :   pred += pred_stride << 6;
     297           0 :   subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride);
     298           0 : }
     299             : 
     300           0 : static SubtractWxHFuncType getSubtractFunc(int rows, int cols) {
     301           0 :   SubtractWxHFuncType ret_func_ptr = NULL;
     302           0 :   if (rows == 4) {
     303           0 :     if (cols == 4) {
     304           0 :       ret_func_ptr = subtract_4x4;
     305           0 :     } else if (cols == 8) {
     306           0 :       ret_func_ptr = subtract_8x4;
     307             :     }
     308           0 :   } else if (rows == 8) {
     309           0 :     if (cols == 4) {
     310           0 :       ret_func_ptr = subtract_4x8;
     311           0 :     } else if (cols == 8) {
     312           0 :       ret_func_ptr = subtract_8x8;
     313           0 :     } else if (cols == 16) {
     314           0 :       ret_func_ptr = subtract_16x8;
     315             :     }
     316           0 :   } else if (rows == 16) {
     317           0 :     if (cols == 8) {
     318           0 :       ret_func_ptr = subtract_8x16;
     319           0 :     } else if (cols == 16) {
     320           0 :       ret_func_ptr = subtract_16x16;
     321           0 :     } else if (cols == 32) {
     322           0 :       ret_func_ptr = subtract_32x16;
     323             :     }
     324           0 :   } else if (rows == 32) {
     325           0 :     if (cols == 16) {
     326           0 :       ret_func_ptr = subtract_16x32;
     327           0 :     } else if (cols == 32) {
     328           0 :       ret_func_ptr = subtract_32x32;
     329           0 :     } else if (cols == 64) {
     330           0 :       ret_func_ptr = subtract_64x32;
     331             :     }
     332           0 :   } else if (rows == 64) {
     333           0 :     if (cols == 32) {
     334           0 :       ret_func_ptr = subtract_32x64;
     335           0 :     } else if (cols == 64) {
     336           0 :       ret_func_ptr = subtract_64x64;
     337           0 :     } else if (cols == 128) {
     338           0 :       ret_func_ptr = subtract_128x64;
     339             :     }
     340           0 :   } else if (rows == 128) {
     341           0 :     if (cols == 64) {
     342           0 :       ret_func_ptr = subtract_64x128;
     343           0 :     } else if (cols == 128) {
     344           0 :       ret_func_ptr = subtract_128x128;
     345             :     }
     346             :   }
     347           0 :   if (!ret_func_ptr) {
     348           0 :     assert(0);
     349             :   }
     350           0 :   return ret_func_ptr;
     351             : }
     352             : 
     353           0 : void aom_highbd_subtract_block_sse2(int rows, int cols, int16_t *diff,
     354             :                                     ptrdiff_t diff_stride, const uint8_t *src8,
     355             :                                     ptrdiff_t src_stride, const uint8_t *pred8,
     356             :                                     ptrdiff_t pred_stride, int bd) {
     357           0 :   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
     358           0 :   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
     359             :   SubtractWxHFuncType func;
     360             :   (void)bd;
     361             : 
     362           0 :   func = getSubtractFunc(rows, cols);
     363           0 :   func(diff, diff_stride, src, src_stride, pred, pred_stride);
     364           0 : }

Generated by: LCOV version 1.13