LCOV - code coverage report
Current view: top level - third_party/aom/av1/common/x86 - av1_txfm1d_sse4.h (source / functions) Hit Total Coverage
Test: output.info Lines: 0 30 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 4 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : #ifndef AV1_TXMF1D_SSE2_H_
       2             : #define AV1_TXMF1D_SSE2_H_
       3             : 
       4             : #include <smmintrin.h>
       5             : #include "av1/common/av1_txfm.h"
       6             : 
       7             : #ifdef __cplusplus
       8             : extern "C" {
       9             : #endif
      10             : 
      11             : void av1_fdct4_new_sse4_1(const __m128i *input, __m128i *output,
      12             :                           const int8_t *cos_bit, const int8_t *stage_range);
      13             : void av1_fdct8_new_sse4_1(const __m128i *input, __m128i *output,
      14             :                           const int8_t *cos_bit, const int8_t *stage_range);
      15             : void av1_fdct16_new_sse4_1(const __m128i *input, __m128i *output,
      16             :                            const int8_t *cos_bit, const int8_t *stage_range);
      17             : void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
      18             :                            const int8_t *cos_bit, const int8_t *stage_range);
      19             : void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
      20             :                            const int8_t *cos_bit, const int8_t *stage_range);
      21             : 
      22             : void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
      23             :                            const int8_t *cos_bit, const int8_t *stage_range);
      24             : void av1_fadst8_new_sse4_1(const __m128i *input, __m128i *output,
      25             :                            const int8_t *cos_bit, const int8_t *stage_range);
      26             : void av1_fadst16_new_sse4_1(const __m128i *input, __m128i *output,
      27             :                             const int8_t *cos_bit, const int8_t *stage_range);
      28             : void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output,
      29             :                             const int8_t *cos_bit, const int8_t *stage_range);
      30             : 
      31             : void av1_idct4_new_sse4_1(const __m128i *input, __m128i *output,
      32             :                           const int8_t *cos_bit, const int8_t *stage_range);
      33             : void av1_idct8_new_sse4_1(const __m128i *input, __m128i *output,
      34             :                           const int8_t *cos_bit, const int8_t *stage_range);
      35             : void av1_idct16_new_sse4_1(const __m128i *input, __m128i *output,
      36             :                            const int8_t *cos_bit, const int8_t *stage_range);
      37             : void av1_idct32_new_sse4_1(const __m128i *input, __m128i *output,
      38             :                            const int8_t *cos_bit, const int8_t *stage_range);
      39             : void av1_idct64_new_sse4_1(const __m128i *input, __m128i *output,
      40             :                            const int8_t *cos_bit, const int8_t *stage_range);
      41             : 
      42             : void av1_iadst4_new_sse4_1(const __m128i *input, __m128i *output,
      43             :                            const int8_t *cos_bit, const int8_t *stage_range);
      44             : void av1_iadst8_new_sse4_1(const __m128i *input, __m128i *output,
      45             :                            const int8_t *cos_bit, const int8_t *stage_range);
      46             : void av1_iadst16_new_sse4_1(const __m128i *input, __m128i *output,
      47             :                             const int8_t *cos_bit, const int8_t *stage_range);
      48             : void av1_iadst32_new_sse4_1(const __m128i *input, __m128i *output,
      49             :                             const int8_t *cos_bit, const int8_t *stage_range);
      50             : 
      51           0 : static INLINE void transpose_32_4x4(int stride, const __m128i *input,
      52             :                                     __m128i *output) {
      53           0 :   __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]);
      54           0 :   __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]);
      55           0 :   __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]);
      56           0 :   __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]);
      57             : 
      58           0 :   output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2);
      59           0 :   output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2);
      60           0 :   output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3);
      61           0 :   output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3);
      62           0 : }
      63             : 
      64             : // The entire input block can be represented by a grid of 4x4 blocks.
      65             : // Each 4x4 block can be represented by 4 vertical __m128i.
      66             : // We first transpose each 4x4 block internally,
      67             : // then transpose the grid.
      68           0 : static INLINE void transpose_32(int txfm_size, const __m128i *input,
      69             :                                 __m128i *output) {
      70           0 :   const int num_per_128 = 4;
      71           0 :   const int row_size = txfm_size;
      72           0 :   const int col_size = txfm_size / num_per_128;
      73             :   int r, c;
      74             : 
      75             :   // transpose each 4x4 block internally
      76           0 :   for (r = 0; r < row_size; r += 4) {
      77           0 :     for (c = 0; c < col_size; c++) {
      78           0 :       transpose_32_4x4(col_size, &input[r * col_size + c],
      79           0 :                        &output[c * 4 * col_size + r / 4]);
      80             :     }
      81             :   }
      82           0 : }
      83             : 
      84           0 : static INLINE __m128i round_shift_32_sse4_1(__m128i vec, int bit) {
      85             :   __m128i tmp, round;
      86           0 :   round = _mm_set1_epi32(1 << (bit - 1));
      87           0 :   tmp = _mm_add_epi32(vec, round);
      88           0 :   return _mm_srai_epi32(tmp, bit);
      89             : }
      90             : 
      91           0 : static INLINE void round_shift_array_32_sse4_1(__m128i *input, __m128i *output,
      92             :                                                const int size, const int bit) {
      93           0 :   if (bit > 0) {
      94             :     int i;
      95           0 :     for (i = 0; i < size; i++) {
      96           0 :       output[i] = round_shift_32_sse4_1(input[i], bit);
      97             :     }
      98             :   } else {
      99             :     int i;
     100           0 :     for (i = 0; i < size; i++) {
     101           0 :       output[i] = _mm_slli_epi32(input[i], -bit);
     102             :     }
     103             :   }
     104           0 : }
     105             : 
     106             : // out0 = in0*w0 + in1*w1
     107             : // out1 = -in1*w0 + in0*w1
     108             : #define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \
     109             :   do {                                                         \
     110             :     __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0;          \
     111             :     ww0 = _mm_set1_epi32(w0);                                  \
     112             :     ww1 = _mm_set1_epi32(w1);                                  \
     113             :     in0_w0 = _mm_mullo_epi32(in0, ww0);                        \
     114             :     in1_w1 = _mm_mullo_epi32(in1, ww1);                        \
     115             :     out0 = _mm_add_epi32(in0_w0, in1_w1);                      \
     116             :     out0 = round_shift_32_sse4_1(out0, bit);                   \
     117             :     in0_w1 = _mm_mullo_epi32(in0, ww1);                        \
     118             :     in1_w0 = _mm_mullo_epi32(in1, ww0);                        \
     119             :     out1 = _mm_sub_epi32(in0_w1, in1_w0);                      \
     120             :     out1 = round_shift_32_sse4_1(out1, bit);                   \
     121             :   } while (0)
     122             : 
     123             : // out0 = in0*w0 + in1*w1
     124             : // out1 = in1*w0 - in0*w1
     125             : #define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \
     126             :   do {                                                         \
     127             :     __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0;          \
     128             :     ww0 = _mm_set1_epi32(w0);                                  \
     129             :     ww1 = _mm_set1_epi32(w1);                                  \
     130             :     in0_w0 = _mm_mullo_epi32(in0, ww0);                        \
     131             :     in1_w1 = _mm_mullo_epi32(in1, ww1);                        \
     132             :     out0 = _mm_add_epi32(in0_w0, in1_w1);                      \
     133             :     out0 = round_shift_32_sse4_1(out0, bit);                   \
     134             :     in0_w1 = _mm_mullo_epi32(in0, ww1);                        \
     135             :     in1_w0 = _mm_mullo_epi32(in1, ww0);                        \
     136             :     out1 = _mm_sub_epi32(in1_w0, in0_w1);                      \
     137             :     out1 = round_shift_32_sse4_1(out1, bit);                   \
     138             :   } while (0)
     139             : 
     140             : #ifdef __cplusplus
     141             : }
     142             : #endif
     143             : 
     144             : #endif  // AV1_TXMF1D_SSE2_H_

Generated by: LCOV version 1.13