/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  // SSE2

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/emmintrin_compat.h"

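// Per-byte absolute difference: unsigned saturating subtraction clamps the
// negative direction to zero, so at most one of the two one-sided
// differences is nonzero (both are zero when a == b) and OR-ing them
// yields |a - b| in every byte lane.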
static INLINE __m128i abs_diff(__m128i a, __m128i b) {
  return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
}

// filter_mask and hev_mask
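// Operates on registers that pack two 8-pixel rows, one per 64-bit half
// (the name "q1p1" means q1 in the high half, p1 in the low half), so each
// 128-bit operation evaluates both sides of the edge at once. Expects
// q1p1, q0p0, p1p0, q1q0, p3p2, p2p1, q3q2, q2q1, zero, thresh and limit
// in scope; `limit` holds blimit in its low half and limit in its high
// half, matching the unpacklo_epi64(mask, flat) lane layout below. The
// results are written to `mask` and `hev`.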
#define FILTER_HEV_MASK                                                       \
  do {                                                                        \
    /* abs(q1 - q0), abs(p1 - p0) */                                          \
    __m128i flat = abs_diff(q1p1, q0p0);                                      \
    /* abs(p1 - q1), abs(p0 - q0) */                                          \
    const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);                        \
    __m128i abs_p0q0, abs_p1q1, work;                                         \
                                                                              \
    /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */       \
    hev =                                                                     \
        _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
    hev = _mm_cmpgt_epi16(hev, thresh);                                       \
    hev = _mm_packs_epi16(hev, hev);                                          \
                                                                              \
    /* const int8_t mask = filter_mask(*limit, *blimit, */                    \
    /*                                 p3, p2, p1, p0, q0, q1, q2, q3); */    \
    abs_p0q0 =                                                                \
        _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */     \
    abs_p1q1 =                                                                \
        _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */     \
    abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);                                   \
    abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */    \
    /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */                                 \
    mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);                                 \
    /* abs(p3 - p2), abs(p2 - p1) */                                          \
    work = abs_diff(p3p2, p2p1);                                              \
    flat = _mm_max_epu8(work, flat);                                          \
    /* abs(q3 - q2), abs(q2 - q1) */                                          \
    work = abs_diff(q3q2, q2q1);                                              \
    flat = _mm_max_epu8(work, flat);                                          \
    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));                       \
    mask = _mm_unpacklo_epi64(mask, flat);                                    \
    mask = _mm_subs_epu8(mask, limit);                                        \
    mask = _mm_cmpeq_epi8(mask, zero);                                        \
    mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));                      \
  } while (0)

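// Applies the filter4 step to both edges at once. Pixels are biased from
// unsigned to signed range by XOR-ing with 0x80, and signed_char_clamp()
// falls out of the saturating 8-bit adds/subtracts. The signed ">> 3" is
// done by duplicating each byte into a 16-bit word (x * 257) and shifting
// that word right arithmetically by 11; the ROUND_POWER_OF_TWO(x, 1) step
// likewise uses an arithmetic shift by 9 after the same byte duplication.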
#define FILTER4                                                             \
  do {                                                                      \
    const __m128i t3t4 =                                                    \
        _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);       \
    const __m128i t80 = _mm_set1_epi8(0x80);                                \
    __m128i filter, filter2filter1, work;                                   \
                                                                            \
    ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */                         \
    qs1qs0 = _mm_xor_si128(q1q0, t80);                                      \
                                                                            \
    /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */               \
    work = _mm_subs_epi8(ps1ps0, qs1qs0);                                   \
    filter = _mm_and_si128(_mm_srli_si128(work, 8), hev);                   \
    /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */      \
    filter = _mm_subs_epi8(filter, work);                                   \
    filter = _mm_subs_epi8(filter, work);                                   \
    filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */           \
    filter = _mm_and_si128(filter, mask); /* & mask */                      \
    filter = _mm_unpacklo_epi64(filter, filter);                            \
                                                                            \
    /* filter1 = signed_char_clamp(filter + 4) >> 3; */                     \
    /* filter2 = signed_char_clamp(filter + 3) >> 3; */                     \
    filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */   \
    filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1);             \
    filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1);     \
    filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */         \
    filter = _mm_srai_epi16(filter, 11);                 /* >> 3 */         \
    filter2filter1 = _mm_packs_epi16(filter2filter1, filter);               \
                                                                            \
    /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */                   \
    filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */                   \
    filter = _mm_unpacklo_epi8(filter, filter);                             \
    filter = _mm_srai_epi16(filter, 9); /* round */                         \
    filter = _mm_packs_epi16(filter, filter);                               \
    filter = _mm_andnot_si128(hev, filter);                                 \
                                                                            \
    hev = _mm_unpackhi_epi64(filter2filter1, filter);                       \
    filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter);            \
                                                                            \
    /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \
    qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1);                         \
    /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \
    ps1ps0 = _mm_adds_epi8(ps1ps0, hev);                                    \
    qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */                       \
    ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */                       \
  } while (0)

void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
                               const uint8_t *_blimit, const uint8_t *_limit,
                               const uint8_t *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
                         _mm_loadl_epi64((const __m128i *)_limit));
  const __m128i thresh =
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
  __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
  __m128i mask, hev;

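  // Gather two 8-pixel rows per register; e.g. q1p1 holds the p1 row in
  // its low half and the q1 row in its high half.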
  p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
                            _mm_loadl_epi64((__m128i *)(s - 4 * p)));
  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 0 * p)));
  q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);

  FILTER_HEV_MASK;
  FILTER4;

  _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0));  // *op1
  _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);               // *op0
  _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);               // *oq0
  _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0));  // *oq1
}

void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
                             const uint8_t *_blimit, const uint8_t *_limit,
                             const uint8_t *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
                         _mm_loadl_epi64((const __m128i *)_limit));
  const __m128i thresh =
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
  __m128i x0, x1, x2, x3;
  __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
  __m128i mask, hev;

  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)),
                           _mm_loadl_epi64((__m128i *)(s + 1 * p - 4)));

  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)),
                         _mm_loadl_epi64((__m128i *)(s + 3 * p - 4)));

  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
  x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)),
                         _mm_loadl_epi64((__m128i *)(s + 5 * p - 4)));

  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
  x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)),
                         _mm_loadl_epi64((__m128i *)(s + 7 * p - 4)));

  // Transpose 8x8
  // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
  p1p0 = _mm_unpacklo_epi16(q1q0, x1);
  // 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
  x0 = _mm_unpacklo_epi16(x2, x3);
  // 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
  p3p2 = _mm_unpacklo_epi32(p1p0, x0);
  // 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
  p1p0 = _mm_unpackhi_epi32(p1p0, x0);
  p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8));  // swap lo and high
  p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8));  // swap lo and high

  // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
  q1q0 = _mm_unpackhi_epi16(q1q0, x1);
  // 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
  x2 = _mm_unpackhi_epi16(x2, x3);
  // 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
  q3q2 = _mm_unpackhi_epi32(q1q0, x2);
  // 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
  q1q0 = _mm_unpacklo_epi32(q1q0, x2);

  q0p0 = _mm_unpacklo_epi64(p1p0, q1q0);
  q1p1 = _mm_unpackhi_epi64(p1p0, q1q0);
  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);

  FILTER_HEV_MASK;
  FILTER4;

  // Transpose 8x4 to 4x8
  // qs1qs0: 20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37
  // ps1ps0: 10 11 12 13 14 15 16 17  00 01 02 03 04 05 06 07
  // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
  ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8));
  // 10 30 11 31 12 32 13 33  14 34 15 35 16 36 17 37
  x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0);
  // 00 20 01 21 02 22 03 23  04 24 05 25 06 26 07 27
  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0);
  // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
  qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0);
  // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);

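  // Each 32-bit store writes the four filtered pixels (p1 p0 q0 q1) of one
  // row, starting two pixels to the left of the vertical edge.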
  *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
  *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
  *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
  *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0);

  *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
  *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
  *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
  *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
}

void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,
                                const unsigned char *_blimit,
                                const unsigned char *_limit,
                                const unsigned char *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
  __m128i mask, hev, flat, flat2;
  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
  __m128i abs_p1p0;

  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
  q4p4 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
  q3p3 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
  q2p2 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
  q1p1 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
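  // 78 == _MM_SHUFFLE(1, 0, 3, 2): swap the two 64-bit halves so the p and
  // q rows trade places.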
  p1q1 = _mm_shuffle_epi32(q1p1, 78);
  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
  q0p0 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
  p0q0 = _mm_shuffle_epi32(q0p0, 78);

  {
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
    abs_p1p0 = abs_diff(q1p1, q0p0);
    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
    fe = _mm_set1_epi8(0xfe);
    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    abs_p0q0 = abs_diff(q0p0, p0q0);
    abs_p1q1 = abs_diff(q1p1, p1q1);
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(abs_p1p0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;

    work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i t1 = _mm_set1_epi16(0x1);
    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
    __m128i qs0 = _mm_xor_si128(p0q0, t80);
    __m128i qs1 = _mm_xor_si128(p1q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;

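    // Same 4-tap arithmetic as FILTER4 above: the saturating 8-bit adds
    // implement signed_char_clamp(), and interleaving zero into the low
    // byte followed by an arithmetic shift by 11 gives the signed ">> 3".
    // filter2 (p side) and -filter1 (q side) are then packed so a single
    // _mm_adds_epi8 applies both corrections at once.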
    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, qs0ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (vpx_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 0xB);
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 0xB);

    // Filter1 >> 3
    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);

    // filt >> 1
    filt = _mm_adds_epi16(filter1, t1);
    filt = _mm_srai_epi16(filt, 1);
    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
                            filt);
    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
    // loopfilter done

    {
      __m128i work;
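      // flat is set when every |p1..p3 - p0| and |q1..q3 - q0| is <= 1;
      // flat2 below extends the same test to p4..p7 and q4..q7 for the
      // wide filter.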
      flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
      flat = _mm_max_epu8(abs_p1p0, flat);
      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
      q5p5 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));

      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
      q6p6 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));
      flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));

      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
      q7p7 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));
      work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i four = _mm_set1_epi16(4);
      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
      __m128i pixelFilter_p, pixelFilter_q;
      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
      q7_16 = _mm_unpackhi_epi8(q7p7, zero);

      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
                                    _mm_add_epi16(p4_16, p3_16));
      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
                                    _mm_add_epi16(q4_16, q3_16));

      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
      pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
      pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
      pixelFilter_p =
          _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
      pixetFilter_p2p1p0 = _mm_add_epi16(
          four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);

      flat_q0p0 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(p7_16, p7_16);
      sum_q7 = _mm_add_epi16(q7_16, q7_16);
      sum_p3 = _mm_add_epi16(p3_16, p3_16);
      sum_q3 = _mm_add_epi16(q3_16, q3_16);

      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
      flat_q1p1 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
      sum_q3 = _mm_add_epi16(sum_q3, q3_16);

      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);

      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
      flat_q2p2 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

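    // 68 == _MM_SHUFFLE(1, 0, 1, 0): broadcast the low 64 bits of each mask
    // to both halves so it selects the p and q lanes alike.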
    flat = _mm_shuffle_epi32(flat, 68);
    flat2 = _mm_shuffle_epi32(flat2, 68);

    q2p2 = _mm_andnot_si128(flat, q2p2);
    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat_q2p2);

    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);

    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);

    q6p6 = _mm_andnot_si128(flat2, q6p6);
    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));

    q5p5 = _mm_andnot_si128(flat2, q5p5);
    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));

    q4p4 = _mm_andnot_si128(flat2, q4p4);
    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));

    q3p3 = _mm_andnot_si128(flat2, q3p3);
    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));

    q2p2 = _mm_andnot_si128(flat2, q2p2);
    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));

    q1p1 = _mm_andnot_si128(flat2, q1p1);
    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));

    q0p0 = _mm_andnot_si128(flat2, q0p0);
    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
  }
}

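// Slide a running box-filter sum one tap: add the two samples entering the
// window and subtract the two leaving it.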
static INLINE __m128i filter_add2_sub2(const __m128i *const total,
                                       const __m128i *const a1,
                                       const __m128i *const a2,
                                       const __m128i *const s1,
                                       const __m128i *const s2) {
  __m128i x = _mm_add_epi16(*a1, *total);
  x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
  return x;
}

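// Finish a filter8 tap: shift the rounded 16-bit sums down by 3, pack to
// bytes, and blend with the narrower filter's output using the SSE2 select
// idiom (flat & f8) | (~flat & other_filt). filter16_mask below is the
// same blend with a shift by 4 for the wide filter.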
static INLINE __m128i filter8_mask(const __m128i *const flat,
                                   const __m128i *const other_filt,
                                   const __m128i *const f8_lo,
                                   const __m128i *const f8_hi) {
  const __m128i f8 =
      _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3));
  const __m128i result = _mm_and_si128(*flat, f8);
  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
}

static INLINE __m128i filter16_mask(const __m128i *const flat,
                                    const __m128i *const other_filt,
                                    const __m128i *const f_lo,
                                    const __m128i *const f_hi) {
  const __m128i f =
      _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4));
  const __m128i result = _mm_and_si128(*flat, f);
  return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
}

void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p,
                                     const unsigned char *_blimit,
                                     const unsigned char *_limit,
                                     const unsigned char *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
  __m128i mask, hev, flat, flat2;
  __m128i p7, p6, p5;
  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
  __m128i q5, q6, q7;

  __m128i op2, op1, op0, oq0, oq1, oq2;

  __m128i max_abs_p1p0q1q0;

  p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
  p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
  p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
  p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
  q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
  q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
  q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
  q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));

  {
    const __m128i abs_p1p0 = abs_diff(p1, p0);
    const __m128i abs_q1q0 = abs_diff(q1, q0);
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(zero, zero);
    __m128i abs_p0q0 = abs_diff(p0, q0);
    __m128i abs_p1q1 = abs_diff(p1, q1);
    __m128i work;
    max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  {
    __m128i work;
    work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
    flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
    work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
    flat = _mm_max_epu8(work, flat);
    work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
    flat = _mm_subs_epu8(flat, one);
    flat = _mm_cmpeq_epi8(flat, zero);
    flat = _mm_and_si128(flat, mask);
    flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
    flat2 = _mm_max_epu8(work, flat2);
    work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
    flat2 = _mm_max_epu8(work, flat2);
    work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0));
    flat2 = _mm_max_epu8(work, flat2);
    flat2 = _mm_subs_epu8(flat2, one);
    flat2 = _mm_cmpeq_epi8(flat2, zero);
    flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
  }

  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // filter4
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);
    const __m128i ff = _mm_cmpeq_epi8(t4, t4);

    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    op1 = _mm_xor_si128(p1, t80);
    op0 = _mm_xor_si128(p0, t80);
    oq0 = _mm_xor_si128(q0, t80);
    oq1 = _mm_xor_si128(q1, t80);

    hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
    filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);

    work_a = _mm_subs_epi8(oq0, op0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (vpx_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);
    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    // Filter1 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);
    oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);

    // Filter2 >> 3
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);
    op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);

    // filt >> 1
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);
    filt = _mm_andnot_si128(hev, filt);
    op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
    oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
    // loopfilter done

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // filter8
    {
      const __m128i four = _mm_set1_epi16(4);
      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);

      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
      __m128i f8_lo, f8_hi;

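      // 7-tap sum for op2: 3 * p3 + 2 * p2 + p1 + p0 + q0, plus 4 for
      // rounding. Later taps reuse this sum via filter_add2_sub2().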
      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
                            _mm_add_epi16(p3_lo, p2_lo));
      f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
                            _mm_add_epi16(p2_lo, p1_lo));
      f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);

      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
                            _mm_add_epi16(p3_hi, p2_hi));
      f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
                            _mm_add_epi16(p2_hi, p1_hi));
      f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);

      op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);

      f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
      op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);

      f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
      op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);

      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
      oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);

      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
      oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);

      f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
      f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
      oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // wide flat calculations
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero);
      const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
      const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
      const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
      const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
      const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
      const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
      const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
      const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
      const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
      const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
      const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
      const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
      const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
      const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
      const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero);

      const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero);
      const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
      const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
      const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
      const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
      const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
      const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
      const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
      const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
      const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
      const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
      const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
      const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
      const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
      const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
      const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero);

      __m128i f_lo;
      __m128i f_hi;

     837           0 :       f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo);  // p7 * 7
     838           0 :       f_lo =
     839           0 :           _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), _mm_add_epi16(p4_lo, f_lo));
     840           0 :       f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
     841             :                            _mm_add_epi16(p2_lo, p1_lo));
     842           0 :       f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
     843           0 :       f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo);
     844             : 
     845           0 :       f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi);  // p7 * 7
     846           0 :       f_hi =
     847           0 :           _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), _mm_add_epi16(p4_hi, f_hi));
     848           0 :       f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
     849             :                            _mm_add_epi16(p2_hi, p1_hi));
     850           0 :       f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
     851           0 :       f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);
     852             : 
     853           0 :       p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
     854           0 :       _mm_storeu_si128((__m128i *)(s - 7 * p), p6);
     855             : 
     856           0 :       f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
     857           0 :       f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
     858           0 :       p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
     859           0 :       _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
     860             : 
     861           0 :       f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
     862           0 :       f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
     863           0 :       p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
     864           0 :       _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
     865             : 
     866           0 :       f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
     867           0 :       f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
     868           0 :       p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
     869           0 :       _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
     870             : 
     871           0 :       f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
     872           0 :       f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
     873           0 :       op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
     874           0 :       _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
     875             : 
     876           0 :       f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
     877           0 :       f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
     878           0 :       op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
     879           0 :       _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
     880             : 
     881           0 :       f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
     882           0 :       f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
     883           0 :       op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
     884           0 :       _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
     885             : 
     886           0 :       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
     887           0 :       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
     888           0 :       oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
     889           0 :       _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
     890             : 
     891           0 :       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
     892           0 :       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
     893           0 :       oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
     894           0 :       _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
     895             : 
     896           0 :       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
     897           0 :       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
     898           0 :       oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
     899           0 :       _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
     900             : 
     901           0 :       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
     902           0 :       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
     903           0 :       q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
     904           0 :       _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
     905             : 
     906           0 :       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
     907           0 :       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
     908           0 :       q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
     909           0 :       _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
     910             : 
     911           0 :       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
     912           0 :       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
     913           0 :       q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
     914           0 :       _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
     915             : 
     916           0 :       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
     917           0 :       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
     918           0 :       q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
     919           0 :       _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
     920             :     }
     921             :     // wide flat
     922             :     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     923             :   }
     924           0 : }
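
The wide ("flat2") pass above is the 15-tap filter16 smoothing, computed the same way: one 16-bit running sum per half-register is seeded with p7 * 7 + p6 * 2 + p5 + ... + q0 plus the rounding 8, then slid one output at a time with filter_add2_sub2(). A scalar model of the arithmetic (a sketch; filter16_mask() additionally blends each result with the unfiltered pixel under the flat2 mask, which is omitted here):

  #include <stdint.h>
  /* pix[0..15] = p7..p0, q0..q7; out[1..14] receive p6'..q6'. */
  static void wide_flat_model(const uint8_t pix[16], uint8_t out[16]) {
    int sum = 7 * pix[0] + 2 * pix[1] + 8;   /* p7 * 7 + p6 * 2 + rounding */
    int k;
    for (k = 2; k <= 8; ++k) sum += pix[k];  /* + p5 .. q0 */
    for (k = 1; k <= 14; ++k) {
      out[k] = (uint8_t)(sum >> 4);
      /* Slide the window: take in the next edge sample and the new
       * double-weighted centre, drop the old centre and the sample that
       * falls out on the left -- exactly the filter_add2_sub2() calls. */
      sum += pix[k + 8 < 15 ? k + 8 : 15] + pix[k + 1];
      sum -= pix[k] + pix[k - 7 > 0 ? k - 7 : 0];
    }
  }
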
     925             : 
     926           0 : void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
     927             :                                const unsigned char *_blimit,
     928             :                                const unsigned char *_limit,
     929             :                                const unsigned char *_thresh) {
     930             :   DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
     931             :   DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
     932             :   DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
     933             :   DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
     934             :   DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
     935             :   DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
     936           0 :   const __m128i zero = _mm_set1_epi16(0);
     937           0 :   const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
     938           0 :   const __m128i limit = _mm_load_si128((const __m128i *)_limit);
     939           0 :   const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
     940             :   __m128i mask, hev, flat;
     941             :   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
     942             :   __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
     943             : 
     944           0 :   q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
     945           0 :                             _mm_loadl_epi64((__m128i *)(s + 3 * p)));
     946           0 :   q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
     947           0 :                             _mm_loadl_epi64((__m128i *)(s + 2 * p)));
     948           0 :   q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
     949           0 :                             _mm_loadl_epi64((__m128i *)(s + 1 * p)));
     950           0 :   q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
     951             :                             _mm_loadl_epi64((__m128i *)(s - 0 * p)));
     952           0 :   p1q1 = _mm_shuffle_epi32(q1p1, 78);
     953           0 :   p0q0 = _mm_shuffle_epi32(q0p0, 78);
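
Each register above carries a p-row in its low eight bytes and the mirrored q-row in its high eight bytes (the names read high|low), so both sides of the edge are processed together. The shuffle immediate 78 swaps the two 64-bit halves, producing the crossed q|p ordering the difference terms below need. The same idiom, spelled out (SWAP_HALVES is a made-up name):

  #include <emmintrin.h>
  /* 78 == _MM_SHUFFLE(1, 0, 3, 2): pick dwords 2,3,0,1, i.e. swap the
   * two 64-bit halves of the register. */
  #define SWAP_HALVES(x) _mm_shuffle_epi32((x), _MM_SHUFFLE(1, 0, 3, 2))
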
     954             : 
     955             :   {
     956             :     // filter_mask and hev_mask
     957           0 :     const __m128i one = _mm_set1_epi8(1);
     958           0 :     const __m128i fe = _mm_set1_epi8(0xfe);
     959           0 :     const __m128i ff = _mm_cmpeq_epi8(fe, fe);
     960             :     __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
     961           0 :     abs_p1p0 = abs_diff(q1p1, q0p0);
     962           0 :     abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
     963             : 
     964           0 :     abs_p0q0 = abs_diff(q0p0, p0q0);
     965           0 :     abs_p1q1 = abs_diff(q1p1, p1q1);
     966           0 :     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
     967           0 :     hev = _mm_subs_epu8(flat, thresh);
     968           0 :     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
     969             : 
     970           0 :     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
     971           0 :     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
     972           0 :     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
     973           0 :     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     974             :     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
     975           0 :     mask = _mm_max_epu8(abs_p1p0, mask);
     976             :     // mask |= (abs(p1 - p0) > limit) * -1;
     977             :     // mask |= (abs(q1 - q0) > limit) * -1;
     978             : 
     979           0 :     work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
     980           0 :     mask = _mm_max_epu8(work, mask);
     981           0 :     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
     982           0 :     mask = _mm_subs_epu8(mask, limit);
     983           0 :     mask = _mm_cmpeq_epi8(mask, zero);
     984             : 
     985             :     // flat_mask4
     986             : 
     987           0 :     flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
     988           0 :     flat = _mm_max_epu8(abs_p1p0, flat);
     989           0 :     flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
     990           0 :     flat = _mm_subs_epu8(flat, one);
     991           0 :     flat = _mm_cmpeq_epi8(flat, zero);
     992           0 :     flat = _mm_and_si128(flat, mask);
     993             :   }
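
This block renders the scalar filter_mask()/flat_mask4() logic without unsigned-byte compares: "x > limit" becomes "subs_epu8(x, limit) != 0", the compare against zero is inverted by XORing with all-ones, and because the p and q halves share one register, a single _mm_srli_si128(.., 8) folds both halves into one max. hev marks pixels whose inner step |p1 - p0| or |q1 - q0| exceeds thresh; those take the sharper adjustment later. Per pixel, the mask means the following (this mirrors the reference filter_mask() in vpx_dsp/loopfilter.c):

  #include <stdint.h>
  #include <stdlib.h> /* abs */
  static int8_t filter_mask_model(uint8_t limit, uint8_t blimit, uint8_t p3,
                                  uint8_t p2, uint8_t p1, uint8_t p0,
                                  uint8_t q0, uint8_t q1, uint8_t q2,
                                  uint8_t q3) {
    int8_t mask = 0;
    mask |= (abs(p3 - p2) > limit) * -1;
    mask |= (abs(p2 - p1) > limit) * -1;
    mask |= (abs(p1 - p0) > limit) * -1;
    mask |= (abs(q1 - q0) > limit) * -1;
    mask |= (abs(q2 - q1) > limit) * -1;
    mask |= (abs(q3 - q2) > limit) * -1;
    mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    return ~mask; /* 0xff when the edge may be filtered */
  }

flat likewise ends up 0xff only where every second and third neighbour differs from p0/q0 by at most one, restricted to pixels where mask is already set.
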
     994             : 
     995             :   {
     996           0 :     const __m128i four = _mm_set1_epi16(4);
     997           0 :     unsigned char *src = s;
     998             :     {
     999             :       __m128i workp_a, workp_b, workp_shft;
    1000           0 :       p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
    1001           0 :       p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
    1002           0 :       p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
    1003           0 :       p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
    1004           0 :       q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
    1005           0 :       q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
    1006           0 :       q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
    1007           0 :       q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
    1008             : 
    1009           0 :       workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
    1010           0 :       workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
    1011           0 :       workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
    1012           0 :       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
    1013           0 :       _mm_storel_epi64((__m128i *)&flat_op2[0],
    1014             :                        _mm_packus_epi16(workp_shft, workp_shft));
    1015             : 
    1016           0 :       workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
    1017           0 :       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
    1018           0 :       _mm_storel_epi64((__m128i *)&flat_op1[0],
    1019             :                        _mm_packus_epi16(workp_shft, workp_shft));
    1020             : 
    1021           0 :       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
    1022           0 :       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
    1023           0 :       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
    1024           0 :       _mm_storel_epi64((__m128i *)&flat_op0[0],
    1025             :                        _mm_packus_epi16(workp_shft, workp_shft));
    1026             : 
    1027           0 :       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
    1028           0 :       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
    1029           0 :       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
    1030           0 :       _mm_storel_epi64((__m128i *)&flat_oq0[0],
    1031             :                        _mm_packus_epi16(workp_shft, workp_shft));
    1032             : 
    1033           0 :       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
    1034           0 :       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
    1035           0 :       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
    1036           0 :       _mm_storel_epi64((__m128i *)&flat_oq1[0],
    1037             :                        _mm_packus_epi16(workp_shft, workp_shft));
    1038             : 
    1039           0 :       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
    1040           0 :       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
    1041           0 :       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
    1042           0 :       _mm_storel_epi64((__m128i *)&flat_oq2[0],
    1043             :                        _mm_packus_epi16(workp_shft, workp_shft));
    1044             :     }
    1045             :   }
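
Here the filter8 taps are accumulated in two 16-bit halves: workp_a holds the part of each 7-tap numerator that the next output can reuse, workp_b the part that gets replaced, so most outputs cost one add/sub pair per accumulator. The packed results are parked in the aligned flat_op*/flat_oq* scratch arrays for the blend in the "lp filter" block below. The same arithmetic with scalar ints (a sketch):

  #include <stdint.h>
  static void filter8_two_acc_model(int p3, int p2, int p1, int p0, int q0,
                                    int q1, int q2, int q3, uint8_t out[6]) {
    int a = p3 + p3 + p2 + p1 + 4 + p0;  /* workp_a, +4 rounding */
    int b = q0 + p2 + p3;                /* workp_b */
    out[0] = (uint8_t)((a + b) >> 3);    /* flat_op2 */
    b = q0 + q1 + p1;           out[1] = (uint8_t)((a + b) >> 3); /* flat_op1 */
    a += q2 - p3; b += p0 - p1; out[2] = (uint8_t)((a + b) >> 3); /* flat_op0 */
    a += q3 - p3; b += q0 - p0; out[3] = (uint8_t)((a + b) >> 3); /* flat_oq0 */
    a += q3 - p2; b += q1 - q0; out[4] = (uint8_t)((a + b) >> 3); /* flat_oq1 */
    a += q3 - p1; b += q2 - q1; out[5] = (uint8_t)((a + b) >> 3); /* flat_oq2 */
  }
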
    1046             :   // lp filter
    1047             :   {
    1048           0 :     const __m128i t4 = _mm_set1_epi8(4);
    1049           0 :     const __m128i t3 = _mm_set1_epi8(3);
    1050           0 :     const __m128i t80 = _mm_set1_epi8(0x80);
    1051           0 :     const __m128i t1 = _mm_set1_epi8(0x1);
    1052           0 :     const __m128i ps1 =
    1053           0 :         _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80);
    1054           0 :     const __m128i ps0 =
    1055           0 :         _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), t80);
    1056           0 :     const __m128i qs0 =
    1057           0 :         _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), t80);
    1058           0 :     const __m128i qs1 =
    1059           0 :         _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), t80);
    1060             :     __m128i filt;
    1061             :     __m128i work_a;
    1062             :     __m128i filter1, filter2;
    1063             : 
    1064           0 :     filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    1065           0 :     work_a = _mm_subs_epi8(qs0, ps0);
    1066           0 :     filt = _mm_adds_epi8(filt, work_a);
    1067           0 :     filt = _mm_adds_epi8(filt, work_a);
    1068           0 :     filt = _mm_adds_epi8(filt, work_a);
    1069             :     // (vpx_filter + 3 * (qs0 - ps0)) & mask
    1070           0 :     filt = _mm_and_si128(filt, mask);
    1071             : 
    1072           0 :     filter1 = _mm_adds_epi8(filt, t4);
    1073           0 :     filter2 = _mm_adds_epi8(filt, t3);
    1074             : 
    1075             :     // Filter1 >> 3
    1076           0 :     filter1 = _mm_unpacklo_epi8(zero, filter1);
    1077           0 :     filter1 = _mm_srai_epi16(filter1, 11);
    1078           0 :     filter1 = _mm_packs_epi16(filter1, filter1);
    1079             : 
    1080             :     // Filter2 >> 3
    1081           0 :     filter2 = _mm_unpacklo_epi8(zero, filter2);
    1082           0 :     filter2 = _mm_srai_epi16(filter2, 11);
    1083           0 :     filter2 = _mm_packs_epi16(filter2, zero);
    1084             : 
    1085             :     // filt >> 1
    1086           0 :     filt = _mm_adds_epi8(filter1, t1);
    1087           0 :     filt = _mm_unpacklo_epi8(zero, filt);
    1088           0 :     filt = _mm_srai_epi16(filt, 9);
    1089           0 :     filt = _mm_packs_epi16(filt, zero);
    1090             : 
    1091           0 :     filt = _mm_andnot_si128(hev, filt);
    1092             : 
    1093           0 :     work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    1094           0 :     q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
    1095           0 :     work_a = _mm_andnot_si128(flat, work_a);
    1096           0 :     q0 = _mm_and_si128(flat, q0);
    1097           0 :     q0 = _mm_or_si128(work_a, q0);
    1098             : 
    1099           0 :     work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    1100           0 :     q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
    1101           0 :     work_a = _mm_andnot_si128(flat, work_a);
    1102           0 :     q1 = _mm_and_si128(flat, q1);
    1103           0 :     q1 = _mm_or_si128(work_a, q1);
    1104             : 
    1105           0 :     work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
    1106           0 :     q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
    1107           0 :     work_a = _mm_andnot_si128(flat, work_a);
    1108           0 :     q2 = _mm_and_si128(flat, q2);
    1109           0 :     q2 = _mm_or_si128(work_a, q2);
    1110             : 
    1111           0 :     work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    1112           0 :     p0 = _mm_loadl_epi64((__m128i *)flat_op0);
    1113           0 :     work_a = _mm_andnot_si128(flat, work_a);
    1114           0 :     p0 = _mm_and_si128(flat, p0);
    1115           0 :     p0 = _mm_or_si128(work_a, p0);
    1116             : 
    1117           0 :     work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    1118           0 :     p1 = _mm_loadl_epi64((__m128i *)flat_op1);
    1119           0 :     work_a = _mm_andnot_si128(flat, work_a);
    1120           0 :     p1 = _mm_and_si128(flat, p1);
    1121           0 :     p1 = _mm_or_si128(work_a, p1);
    1122             : 
    1123           0 :     work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
    1124           0 :     p2 = _mm_loadl_epi64((__m128i *)flat_op2);
    1125           0 :     work_a = _mm_andnot_si128(flat, work_a);
    1126           0 :     p2 = _mm_and_si128(flat, p2);
    1127           0 :     p2 = _mm_or_si128(work_a, p2);
    1128             : 
    1129           0 :     _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
    1130           0 :     _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
    1131           0 :     _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
    1132             :     _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
    1133           0 :     _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
    1134           0 :     _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
    1135             :   }
    1136           0 : }
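
The Filter1/Filter2 arithmetic above needs a signed per-byte shift, which SSE2 does not provide. This single-register variant synthesizes it by unpacking each byte into the high half of a 16-bit lane (forming x << 8): an arithmetic 16-bit shift by 11 then equals x >> 3 with the sign preserved (and by 9 equals x >> 1), and _mm_packs_epi16 saturates back down to bytes. Factored out (a sketch; the helper name is made up):

  #include <emmintrin.h>
  /* Arithmetic >> 3 on the low eight signed bytes of x. */
  static __m128i srai3_epi8_low8(__m128i x) {
    __m128i w = _mm_unpacklo_epi8(_mm_setzero_si128(), x); /* lanes = x << 8 */
    w = _mm_srai_epi16(w, 3 + 8);                          /* = x >> 3, signed */
    return _mm_packs_epi16(w, w);                          /* back to bytes */
  }
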
    1137             : 
    1138           0 : void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
    1139             :                                     const uint8_t *_limit0,
    1140             :                                     const uint8_t *_thresh0,
    1141             :                                     const uint8_t *_blimit1,
    1142             :                                     const uint8_t *_limit1,
    1143             :                                     const uint8_t *_thresh1) {
    1144             :   DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
    1145             :   DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
    1146             :   DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
    1147             :   DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
    1148             :   DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
    1149             :   DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
    1150           0 :   const __m128i zero = _mm_set1_epi16(0);
    1151           0 :   const __m128i blimit =
    1152           0 :       _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
    1153             :                          _mm_load_si128((const __m128i *)_blimit1));
    1154           0 :   const __m128i limit =
    1155           0 :       _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
    1156             :                          _mm_load_si128((const __m128i *)_limit1));
    1157           0 :   const __m128i thresh =
    1158           0 :       _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
    1159             :                          _mm_load_si128((const __m128i *)_thresh1));
    1160             : 
    1161             :   __m128i mask, hev, flat;
    1162             :   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
    1163             : 
    1164           0 :   p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
    1165           0 :   p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
    1166           0 :   p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
    1167           0 :   p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
    1168           0 :   q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
    1169           0 :   q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
    1170           0 :   q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
    1171           0 :   q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
    1172             :   {
    1173           0 :     const __m128i abs_p1p0 =
    1174           0 :         _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
    1175           0 :     const __m128i abs_q1q0 =
    1176           0 :         _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
    1177           0 :     const __m128i one = _mm_set1_epi8(1);
    1178           0 :     const __m128i fe = _mm_set1_epi8(0xfe);
    1179           0 :     const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    1180           0 :     __m128i abs_p0q0 =
    1181           0 :         _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
    1182           0 :     __m128i abs_p1q1 =
    1183           0 :         _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
    1184             :     __m128i work;
    1185             : 
    1186             :     // filter_mask and hev_mask
    1187           0 :     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    1188           0 :     hev = _mm_subs_epu8(flat, thresh);
    1189           0 :     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
    1190             : 
    1191           0 :     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    1192           0 :     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    1193           0 :     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    1194           0 :     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    1195             :     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    1196           0 :     mask = _mm_max_epu8(flat, mask);
    1197             :     // mask |= (abs(p1 - p0) > limit) * -1;
    1198             :     // mask |= (abs(q1 - q0) > limit) * -1;
    1199           0 :     work = _mm_max_epu8(
    1200             :         _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
    1201             :         _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
    1202           0 :     mask = _mm_max_epu8(work, mask);
    1203           0 :     work = _mm_max_epu8(
    1204             :         _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
    1205             :         _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
    1206           0 :     mask = _mm_max_epu8(work, mask);
    1207           0 :     mask = _mm_subs_epu8(mask, limit);
    1208           0 :     mask = _mm_cmpeq_epi8(mask, zero);
    1209             : 
    1210             :     // flat_mask4
    1211           0 :     work = _mm_max_epu8(
    1212             :         _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
    1213             :         _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
    1214           0 :     flat = _mm_max_epu8(work, flat);
    1215           0 :     work = _mm_max_epu8(
    1216             :         _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
    1217             :         _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
    1218           0 :     flat = _mm_max_epu8(work, flat);
    1219           0 :     flat = _mm_subs_epu8(flat, one);
    1220           0 :     flat = _mm_cmpeq_epi8(flat, zero);
    1221           0 :     flat = _mm_and_si128(flat, mask);
    1222             :   }
    1223             :   {
    1224           0 :     const __m128i four = _mm_set1_epi16(4);
    1225           0 :     unsigned char *src = s;
    1226           0 :     int i = 0;
    1227             : 
    1228             :     do {
    1229             :       __m128i workp_a, workp_b, workp_shft;
    1230           0 :       p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
    1231           0 :       p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
    1232           0 :       p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
    1233           0 :       p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
    1234           0 :       q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
    1235           0 :       q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
    1236           0 :       q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
    1237           0 :       q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
    1238             : 
    1239           0 :       workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
    1240           0 :       workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
    1241           0 :       workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
    1242           0 :       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
    1243           0 :       _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
    1244             :                        _mm_packus_epi16(workp_shft, workp_shft));
    1245             : 
    1246           0 :       workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
    1247           0 :       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
    1248           0 :       _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
    1249             :                        _mm_packus_epi16(workp_shft, workp_shft));
    1250             : 
    1251           0 :       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
    1252           0 :       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
    1253           0 :       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
    1254           0 :       _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
    1255             :                        _mm_packus_epi16(workp_shft, workp_shft));
    1256             : 
    1257           0 :       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
    1258           0 :       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
    1259           0 :       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
    1260           0 :       _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
    1261             :                        _mm_packus_epi16(workp_shft, workp_shft));
    1262             : 
    1263           0 :       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
    1264           0 :       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
    1265           0 :       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
    1266           0 :       _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
    1267             :                        _mm_packus_epi16(workp_shft, workp_shft));
    1268             : 
    1269           0 :       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
    1270           0 :       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
    1271           0 :       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
    1272           0 :       _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
    1273             :                        _mm_packus_epi16(workp_shft, workp_shft));
    1274             : 
    1275           0 :       src += 8;
    1276           0 :     } while (++i < 2);
    1277             :   }
    1278             :   // lp filter
    1279             :   {
    1280           0 :     const __m128i t4 = _mm_set1_epi8(4);
    1281           0 :     const __m128i t3 = _mm_set1_epi8(3);
    1282           0 :     const __m128i t80 = _mm_set1_epi8(0x80);
    1283           0 :     const __m128i te0 = _mm_set1_epi8(0xe0);
    1284           0 :     const __m128i t1f = _mm_set1_epi8(0x1f);
    1285           0 :     const __m128i t1 = _mm_set1_epi8(0x1);
    1286           0 :     const __m128i t7f = _mm_set1_epi8(0x7f);
    1287             : 
    1288           0 :     const __m128i ps1 =
    1289           0 :         _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
    1290           0 :     const __m128i ps0 =
    1291           0 :         _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
    1292           0 :     const __m128i qs0 =
    1293           0 :         _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
    1294           0 :     const __m128i qs1 =
    1295           0 :         _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
    1296             :     __m128i filt;
    1297             :     __m128i work_a;
    1298             :     __m128i filter1, filter2;
    1299             : 
    1300           0 :     filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    1301           0 :     work_a = _mm_subs_epi8(qs0, ps0);
    1302           0 :     filt = _mm_adds_epi8(filt, work_a);
    1303           0 :     filt = _mm_adds_epi8(filt, work_a);
    1304           0 :     filt = _mm_adds_epi8(filt, work_a);
    1305             :     // (vpx_filter + 3 * (qs0 - ps0)) & mask
    1306           0 :     filt = _mm_and_si128(filt, mask);
    1307             : 
    1308           0 :     filter1 = _mm_adds_epi8(filt, t4);
    1309           0 :     filter2 = _mm_adds_epi8(filt, t3);
    1310             : 
    1311             :     // Filter1 >> 3
    1312           0 :     work_a = _mm_cmpgt_epi8(zero, filter1);
    1313           0 :     filter1 = _mm_srli_epi16(filter1, 3);
    1314           0 :     work_a = _mm_and_si128(work_a, te0);
    1315           0 :     filter1 = _mm_and_si128(filter1, t1f);
    1316           0 :     filter1 = _mm_or_si128(filter1, work_a);
    1317             : 
    1318             :     // Filter2 >> 3
    1319           0 :     work_a = _mm_cmpgt_epi8(zero, filter2);
    1320           0 :     filter2 = _mm_srli_epi16(filter2, 3);
    1321           0 :     work_a = _mm_and_si128(work_a, te0);
    1322           0 :     filter2 = _mm_and_si128(filter2, t1f);
    1323           0 :     filter2 = _mm_or_si128(filter2, work_a);
    1324             : 
    1325             :     // filt >> 1
    1326           0 :     filt = _mm_adds_epi8(filter1, t1);
    1327           0 :     work_a = _mm_cmpgt_epi8(zero, filt);
    1328           0 :     filt = _mm_srli_epi16(filt, 1);
    1329           0 :     work_a = _mm_and_si128(work_a, t80);
    1330           0 :     filt = _mm_and_si128(filt, t7f);
    1331           0 :     filt = _mm_or_si128(filt, work_a);
    1332             : 
    1333           0 :     filt = _mm_andnot_si128(hev, filt);
    1334             : 
    1335           0 :     work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    1336           0 :     q0 = _mm_load_si128((__m128i *)flat_oq0);
    1337           0 :     work_a = _mm_andnot_si128(flat, work_a);
    1338           0 :     q0 = _mm_and_si128(flat, q0);
    1339           0 :     q0 = _mm_or_si128(work_a, q0);
    1340             : 
    1341           0 :     work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    1342           0 :     q1 = _mm_load_si128((__m128i *)flat_oq1);
    1343           0 :     work_a = _mm_andnot_si128(flat, work_a);
    1344           0 :     q1 = _mm_and_si128(flat, q1);
    1345           0 :     q1 = _mm_or_si128(work_a, q1);
    1346             : 
    1347           0 :     work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
    1348           0 :     q2 = _mm_load_si128((__m128i *)flat_oq2);
    1349           0 :     work_a = _mm_andnot_si128(flat, work_a);
    1350           0 :     q2 = _mm_and_si128(flat, q2);
    1351           0 :     q2 = _mm_or_si128(work_a, q2);
    1352             : 
    1353           0 :     work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    1354           0 :     p0 = _mm_load_si128((__m128i *)flat_op0);
    1355           0 :     work_a = _mm_andnot_si128(flat, work_a);
    1356           0 :     p0 = _mm_and_si128(flat, p0);
    1357           0 :     p0 = _mm_or_si128(work_a, p0);
    1358             : 
    1359           0 :     work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    1360           0 :     p1 = _mm_load_si128((__m128i *)flat_op1);
    1361           0 :     work_a = _mm_andnot_si128(flat, work_a);
    1362           0 :     p1 = _mm_and_si128(flat, p1);
    1363           0 :     p1 = _mm_or_si128(work_a, p1);
    1364             : 
    1365           0 :     work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
    1366           0 :     p2 = _mm_load_si128((__m128i *)flat_op2);
    1367           0 :     work_a = _mm_andnot_si128(flat, work_a);
    1368           0 :     p2 = _mm_and_si128(flat, p2);
    1369           0 :     p2 = _mm_or_si128(work_a, p2);
    1370             : 
    1371           0 :     _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
    1372           0 :     _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
    1373           0 :     _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
    1374             :     _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
    1375           0 :     _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
    1376           0 :     _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
    1377             :   }
    1378           0 : }
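
This dual variant filters sixteen pixels per register, so it cannot spare half the register to widen bytes; the signed shift is instead emulated in place with a logical 16-bit shift plus a sign fix-up. t1f (0x1f) clears the bits that leak in from the neighbouring byte, and te0 (0xe0) re-inserts the three sign bits wherever _mm_cmpgt_epi8 found the lane negative. As a reusable sketch (hypothetical helper name):

  #include <emmintrin.h>
  /* Arithmetic >> 3 on all sixteen signed bytes of x. */
  static __m128i srai3_epi8(__m128i x) {
    const __m128i neg = _mm_cmpgt_epi8(_mm_setzero_si128(), x); /* 0xff if x < 0 */
    __m128i r = _mm_srli_epi16(x, 3);           /* logical shift, 16-bit lanes */
    r = _mm_and_si128(r, _mm_set1_epi8(0x1f));  /* drop the neighbour's bits */
    return _mm_or_si128(r, _mm_and_si128(neg, _mm_set1_epi8((char)0xe0)));
  }
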
    1379             : 
    1380           0 : void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
    1381             :                                     const unsigned char *_blimit0,
    1382             :                                     const unsigned char *_limit0,
    1383             :                                     const unsigned char *_thresh0,
    1384             :                                     const unsigned char *_blimit1,
    1385             :                                     const unsigned char *_limit1,
    1386             :                                     const unsigned char *_thresh1) {
    1387           0 :   const __m128i blimit =
    1388           0 :       _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
    1389             :                          _mm_load_si128((const __m128i *)_blimit1));
    1390           0 :   const __m128i limit =
    1391           0 :       _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
    1392             :                          _mm_load_si128((const __m128i *)_limit1));
    1393           0 :   const __m128i thresh =
    1394           0 :       _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
    1395             :                          _mm_load_si128((const __m128i *)_thresh1));
    1396           0 :   const __m128i zero = _mm_set1_epi16(0);
    1397             :   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
    1398             :   __m128i mask, hev, flat;
    1399             : 
    1400           0 :   p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
    1401           0 :   p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
    1402           0 :   p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
    1403           0 :   p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
    1404           0 :   q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
    1405           0 :   q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
    1406           0 :   q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
    1407           0 :   q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
    1408             : 
    1409             :   // filter_mask and hev_mask
    1410             :   {
    1411           0 :     const __m128i abs_p1p0 =
    1412           0 :         _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
    1413           0 :     const __m128i abs_q1q0 =
    1414           0 :         _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
    1415           0 :     const __m128i fe = _mm_set1_epi8(0xfe);
    1416           0 :     const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    1417           0 :     __m128i abs_p0q0 =
    1418           0 :         _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
    1419           0 :     __m128i abs_p1q1 =
    1420           0 :         _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
    1421             :     __m128i work;
    1422             : 
    1423           0 :     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    1424           0 :     hev = _mm_subs_epu8(flat, thresh);
    1425           0 :     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
    1426             : 
    1427           0 :     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    1428           0 :     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    1429           0 :     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    1430           0 :     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    1431             :     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    1432           0 :     mask = _mm_max_epu8(flat, mask);
    1433             :     // mask |= (abs(p1 - p0) > limit) * -1;
    1434             :     // mask |= (abs(q1 - q0) > limit) * -1;
    1435           0 :     work = _mm_max_epu8(
    1436             :         _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
    1437             :         _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
    1438           0 :     mask = _mm_max_epu8(work, mask);
    1439           0 :     work = _mm_max_epu8(
    1440             :         _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
    1441             :         _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
    1442           0 :     mask = _mm_max_epu8(work, mask);
    1443           0 :     mask = _mm_subs_epu8(mask, limit);
    1444           0 :     mask = _mm_cmpeq_epi8(mask, zero);
    1445             :   }
    1446             : 
    1447             :   // filter4
    1448             :   {
    1449           0 :     const __m128i t4 = _mm_set1_epi8(4);
    1450           0 :     const __m128i t3 = _mm_set1_epi8(3);
    1451           0 :     const __m128i t80 = _mm_set1_epi8(0x80);
    1452           0 :     const __m128i te0 = _mm_set1_epi8(0xe0);
    1453           0 :     const __m128i t1f = _mm_set1_epi8(0x1f);
    1454           0 :     const __m128i t1 = _mm_set1_epi8(0x1);
    1455           0 :     const __m128i t7f = _mm_set1_epi8(0x7f);
    1456             : 
    1457           0 :     const __m128i ps1 =
    1458           0 :         _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
    1459           0 :     const __m128i ps0 =
    1460           0 :         _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
    1461           0 :     const __m128i qs0 =
    1462           0 :         _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
    1463           0 :     const __m128i qs1 =
    1464           0 :         _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
    1465             :     __m128i filt;
    1466             :     __m128i work_a;
    1467             :     __m128i filter1, filter2;
    1468             : 
    1469           0 :     filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    1470           0 :     work_a = _mm_subs_epi8(qs0, ps0);
    1471           0 :     filt = _mm_adds_epi8(filt, work_a);
    1472           0 :     filt = _mm_adds_epi8(filt, work_a);
    1473           0 :     filt = _mm_adds_epi8(filt, work_a);
    1474             :     // (vpx_filter + 3 * (qs0 - ps0)) & mask
    1475           0 :     filt = _mm_and_si128(filt, mask);
    1476             : 
    1477           0 :     filter1 = _mm_adds_epi8(filt, t4);
    1478           0 :     filter2 = _mm_adds_epi8(filt, t3);
    1479             : 
    1480             :     // Filter1 >> 3
    1481           0 :     work_a = _mm_cmpgt_epi8(zero, filter1);
    1482           0 :     filter1 = _mm_srli_epi16(filter1, 3);
    1483           0 :     work_a = _mm_and_si128(work_a, te0);
    1484           0 :     filter1 = _mm_and_si128(filter1, t1f);
    1485           0 :     filter1 = _mm_or_si128(filter1, work_a);
    1486             : 
    1487             :     // Filter2 >> 3
    1488           0 :     work_a = _mm_cmpgt_epi8(zero, filter2);
    1489           0 :     filter2 = _mm_srli_epi16(filter2, 3);
    1490           0 :     work_a = _mm_and_si128(work_a, te0);
    1491           0 :     filter2 = _mm_and_si128(filter2, t1f);
    1492           0 :     filter2 = _mm_or_si128(filter2, work_a);
    1493             : 
    1494             :     // filt >> 1
    1495           0 :     filt = _mm_adds_epi8(filter1, t1);
    1496           0 :     work_a = _mm_cmpgt_epi8(zero, filt);
    1497           0 :     filt = _mm_srli_epi16(filt, 1);
    1498           0 :     work_a = _mm_and_si128(work_a, t80);
    1499           0 :     filt = _mm_and_si128(filt, t7f);
    1500           0 :     filt = _mm_or_si128(filt, work_a);
    1501             : 
    1502           0 :     filt = _mm_andnot_si128(hev, filt);
    1503             : 
    1504           0 :     q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    1505           0 :     q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    1506           0 :     p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    1507           0 :     p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    1508             : 
    1509           0 :     _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
    1510           0 :     _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
    1511             :     _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
    1512           0 :     _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
    1513             :   }
    1514           0 : }
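
This is the narrow filter4 path on two 8-pixel rows at once. The XOR with t80 (0x80) re-biases unsigned pixels to signed bytes so every step can use saturating signed arithmetic; results are XORed back at the end. Per pixel, the computation mirrors the reference filter4() in vpx_dsp/loopfilter.c (a sketch; clamp8 stands in for its signed_char_clamp helper):

  #include <stdint.h>
  static int8_t clamp8(int v) {
    return (int8_t)(v < -128 ? -128 : v > 127 ? 127 : v);
  }
  /* mask/hev are 0 or -1, as produced by the mask block above. */
  static void filter4_model(int8_t mask, int8_t hev, uint8_t *op1,
                            uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
    const int8_t ps1 = (int8_t)(*op1 ^ 0x80), ps0 = (int8_t)(*op0 ^ 0x80);
    const int8_t qs0 = (int8_t)(*oq0 ^ 0x80), qs1 = (int8_t)(*oq1 ^ 0x80);
    int8_t filter = clamp8(ps1 - qs1) & hev;      /* only where hev is set */
    int8_t filter1, filter2;
    filter = clamp8(filter + 3 * (qs0 - ps0)) & mask;
    filter1 = (int8_t)(clamp8(filter + 4) >> 3);  /* Filter1 */
    filter2 = (int8_t)(clamp8(filter + 3) >> 3);  /* Filter2 */
    *oq0 = (uint8_t)(clamp8(qs0 - filter1) ^ 0x80);
    *op0 = (uint8_t)(clamp8(ps0 + filter2) ^ 0x80);
    filter = (int8_t)(((filter1 + 1) >> 1) & ~hev); /* outer taps skip hev */
    *oq1 = (uint8_t)(clamp8(qs1 - filter) ^ 0x80);
    *op1 = (uint8_t)(clamp8(ps1 + filter) ^ 0x80);
  }
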
    1515             : 
    1516           0 : static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
    1517             :                                  int in_p, unsigned char *out, int out_p) {
    1518             :   __m128i x0, x1, x2, x3, x4, x5, x6, x7;
    1519             :   __m128i x8, x9, x10, x11, x12, x13, x14, x15;
    1520             : 
     1521             :   // 2-way interleave, with the unpacks hoisted in between the loads
    1522           0 :   x0 = _mm_loadl_epi64((__m128i *)in0);           // 1
    1523           0 :   x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));  // 3
    1524           0 :   x0 = _mm_unpacklo_epi8(x0, x1);                 // 1
    1525             : 
    1526           0 :   x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));  // 5
    1527           0 :   x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p));  // 7
    1528           0 :   x1 = _mm_unpacklo_epi8(x2, x3);                     // 2
    1529             : 
    1530           0 :   x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p));  // 9
    1531           0 :   x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p));  // 11
    1532           0 :   x2 = _mm_unpacklo_epi8(x4, x5);                     // 3
    1533             : 
    1534           0 :   x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p));  // 13
    1535           0 :   x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p));  // 15
    1536           0 :   x3 = _mm_unpacklo_epi8(x6, x7);                     // 4
    1537           0 :   x4 = _mm_unpacklo_epi16(x0, x1);                    // 9
    1538             : 
    1539           0 :   x8 = _mm_loadl_epi64((__m128i *)in1);           // 2
    1540           0 :   x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));  // 4
    1541           0 :   x8 = _mm_unpacklo_epi8(x8, x9);                 // 5
    1542           0 :   x5 = _mm_unpacklo_epi16(x2, x3);                // 10
    1543             : 
    1544           0 :   x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));  // 6
    1545           0 :   x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p));  // 8
    1546           0 :   x9 = _mm_unpacklo_epi8(x10, x11);                    // 6
    1547             : 
    1548           0 :   x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p));  // 10
    1549           0 :   x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p));  // 12
    1550           0 :   x10 = _mm_unpacklo_epi8(x12, x13);                   // 7
    1551           0 :   x12 = _mm_unpacklo_epi16(x8, x9);                    // 11
    1552             : 
    1553           0 :   x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p));  // 14
    1554           0 :   x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p));  // 16
    1555           0 :   x11 = _mm_unpacklo_epi8(x14, x15);                   // 8
    1556           0 :   x13 = _mm_unpacklo_epi16(x10, x11);                  // 12
    1557             : 
    1558           0 :   x6 = _mm_unpacklo_epi32(x4, x5);     // 13
    1559           0 :   x7 = _mm_unpackhi_epi32(x4, x5);     // 14
    1560           0 :   x14 = _mm_unpacklo_epi32(x12, x13);  // 15
    1561           0 :   x15 = _mm_unpackhi_epi32(x12, x13);  // 16
    1562             : 
    1563             :   // Store first 4-line result
    1564           0 :   _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
    1565           0 :   _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
    1566           0 :   _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
    1567           0 :   _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));
    1568             : 
    1569           0 :   x4 = _mm_unpackhi_epi16(x0, x1);
    1570           0 :   x5 = _mm_unpackhi_epi16(x2, x3);
    1571           0 :   x12 = _mm_unpackhi_epi16(x8, x9);
    1572           0 :   x13 = _mm_unpackhi_epi16(x10, x11);
    1573             : 
    1574           0 :   x6 = _mm_unpacklo_epi32(x4, x5);
    1575           0 :   x7 = _mm_unpackhi_epi32(x4, x5);
    1576           0 :   x14 = _mm_unpacklo_epi32(x12, x13);
    1577           0 :   x15 = _mm_unpackhi_epi32(x12, x13);
    1578             : 
    1579             :   // Store second 4-line result
    1580           0 :   _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
    1581           0 :   _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
    1582           0 :   _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
    1583           0 :   _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
    1584           0 : }
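
transpose8x16() is the standard log2-stage byte transpose: interleave bytes, then 16-bit pairs, then 32-bit quads, and finally combine 64-bit halves; loads from the two source blocks are hoisted in between the unpacks (the trailing numeric comments appear to record that intended schedule). What the routine computes, in scalar form (a sketch):

  /* Two 8x8 tiles, one from in0 and one from in1, land transposed side
   * by side in an 8-row x 16-column destination. */
  static void transpose8x16_model(const unsigned char *in0,
                                  const unsigned char *in1, int in_p,
                                  unsigned char *out, int out_p) {
    int r, c;
    for (r = 0; r < 8; ++r) {
      for (c = 0; c < 8; ++c) {
        out[c * out_p + r] = in0[r * in_p + c];     /* left half  */
        out[c * out_p + 8 + r] = in1[r * in_p + c]; /* right half */
      }
    }
  }
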
    1585             : 
    1586           0 : static INLINE void transpose(unsigned char *src[], int in_p,
    1587             :                              unsigned char *dst[], int out_p,
    1588             :                              int num_8x8_to_transpose) {
    1589           0 :   int idx8x8 = 0;
    1590             :   __m128i x0, x1, x2, x3, x4, x5, x6, x7;
    1591             :   do {
    1592           0 :     unsigned char *in = src[idx8x8];
    1593           0 :     unsigned char *out = dst[idx8x8];
    1594             : 
    1595           0 :     x0 =
    1596             :         _mm_loadl_epi64((__m128i *)(in + 0 * in_p));  // 00 01 02 03 04 05 06 07
    1597           0 :     x1 =
    1598           0 :         _mm_loadl_epi64((__m128i *)(in + 1 * in_p));  // 10 11 12 13 14 15 16 17
    1599             :     // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    1600           0 :     x0 = _mm_unpacklo_epi8(x0, x1);
    1601             : 
    1602           0 :     x2 =
    1603           0 :         _mm_loadl_epi64((__m128i *)(in + 2 * in_p));  // 20 21 22 23 24 25 26 27
    1604           0 :     x3 =
    1605           0 :         _mm_loadl_epi64((__m128i *)(in + 3 * in_p));  // 30 31 32 33 34 35 36 37
    1606             :     // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    1607           0 :     x1 = _mm_unpacklo_epi8(x2, x3);
    1608             : 
    1609           0 :     x4 =
    1610           0 :         _mm_loadl_epi64((__m128i *)(in + 4 * in_p));  // 40 41 42 43 44 45 46 47
    1611           0 :     x5 =
    1612           0 :         _mm_loadl_epi64((__m128i *)(in + 5 * in_p));  // 50 51 52 53 54 55 56 57
    1613             :     // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
    1614           0 :     x2 = _mm_unpacklo_epi8(x4, x5);
    1615             : 
    1616           0 :     x6 =
    1617           0 :         _mm_loadl_epi64((__m128i *)(in + 6 * in_p));  // 60 61 62 63 64 65 66 67
    1618           0 :     x7 =
    1619           0 :         _mm_loadl_epi64((__m128i *)(in + 7 * in_p));  // 70 71 72 73 74 75 76 77
    1620             :     // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
    1621           0 :     x3 = _mm_unpacklo_epi8(x6, x7);
    1622             : 
    1623             :     // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    1624           0 :     x4 = _mm_unpacklo_epi16(x0, x1);
    1625             :     // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
    1626           0 :     x5 = _mm_unpacklo_epi16(x2, x3);
    1627             :     // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
    1628           0 :     x6 = _mm_unpacklo_epi32(x4, x5);
    1629           0 :     _mm_storel_pd((double *)(out + 0 * out_p),
    1630             :                   _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
    1631           0 :     _mm_storeh_pd((double *)(out + 1 * out_p),
    1632             :                   _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
    1633             :     // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
    1634           0 :     x7 = _mm_unpackhi_epi32(x4, x5);
    1635           0 :     _mm_storel_pd((double *)(out + 2 * out_p),
    1636             :                   _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
    1637           0 :     _mm_storeh_pd((double *)(out + 3 * out_p),
    1638             :                   _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73
    1639             : 
    1640             :     // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
    1641           0 :     x4 = _mm_unpackhi_epi16(x0, x1);
    1642             :     // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
    1643           0 :     x5 = _mm_unpackhi_epi16(x2, x3);
    1644             :     // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
    1645           0 :     x6 = _mm_unpacklo_epi32(x4, x5);
    1646           0 :     _mm_storel_pd((double *)(out + 4 * out_p),
    1647             :                   _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
    1648           0 :     _mm_storeh_pd((double *)(out + 5 * out_p),
    1649             :                   _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
    1650             :     // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
    1651           0 :     x7 = _mm_unpackhi_epi32(x4, x5);
    1652             : 
    1653           0 :     _mm_storel_pd((double *)(out + 6 * out_p),
    1654             :                   _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
    1655           0 :     _mm_storeh_pd((double *)(out + 7 * out_p),
    1656             :                   _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
    1657           0 :   } while (++idx8x8 < num_8x8_to_transpose);
    1658           0 : }
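
transpose() runs the same interleave cascade per 8x8 tile but skips the final 64-bit unpack stage: _mm_storel_pd and _mm_storeh_pd write the low and high eight bytes of each 32-bit-interleaved register directly to two destination rows. Scalar equivalent of one iteration (a sketch):

  static void transpose8x8_model(const unsigned char *in, int in_p,
                                 unsigned char *out, int out_p) {
    int r, c;
    for (r = 0; r < 8; ++r)
      for (c = 0; c < 8; ++c) out[c * out_p + r] = in[r * in_p + c];
  }
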
    1659             : 
    1660           0 : void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
    1661             :                                   const uint8_t *limit0, const uint8_t *thresh0,
    1662             :                                   const uint8_t *blimit1, const uint8_t *limit1,
    1663             :                                   const uint8_t *thresh1) {
    1664             :   DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
    1665             :   unsigned char *src[2];
    1666             :   unsigned char *dst[2];
    1667             : 
    1668             :   // Transpose 8x16
    1669           0 :   transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
    1670             : 
    1671             :   // Loop filtering
    1672           0 :   vpx_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
    1673             :                                  blimit1, limit1, thresh1);
    1674           0 :   src[0] = t_dst;
    1675           0 :   src[1] = t_dst + 8;
    1676           0 :   dst[0] = s - 4;
    1677           0 :   dst[1] = s - 4 + p * 8;
    1678             : 
    1679             :   // Transpose back
    1680           0 :   transpose(src, 16, dst, p, 2);
    1681           0 : }
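
This wrapper, like the remaining vertical variants below, reduces to three steps: transpose the columns straddling the edge into rows of an on-stack buffer, run the matching horizontal kernel on that buffer, and transpose back. In outline (a sketch; lpf_row_fn is a hypothetical stand-in for the horizontal kernels, which also take the blimit/limit/thresh vectors):

  typedef void (*lpf_row_fn)(unsigned char *s, int pitch);
  static void vertical_edge_model(unsigned char *s, int p,
                                  lpf_row_fn apply_horizontal) {
    unsigned char t[8 * 8];
    int r, c;
    for (r = 0; r < 8; ++r)                /* columns -> rows */
      for (c = 0; c < 8; ++c) t[c * 8 + r] = s[r * p + c - 4];
    apply_horizontal(t + 4 * 8, 8);        /* the edge is horizontal now */
    for (r = 0; r < 8; ++r)                /* rows -> columns */
      for (c = 0; c < 8; ++c) s[r * p + c - 4] = t[c * 8 + r];
  }
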
    1682             : 
    1683           0 : void vpx_lpf_vertical_8_sse2(unsigned char *s, int p,
    1684             :                              const unsigned char *blimit,
    1685             :                              const unsigned char *limit,
    1686             :                              const unsigned char *thresh) {
    1687             :   DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]);
    1688             :   unsigned char *src[1];
    1689             :   unsigned char *dst[1];
    1690             : 
    1691             :   // Transpose 8x8
    1692           0 :   src[0] = s - 4;
    1693           0 :   dst[0] = t_dst;
    1694             : 
    1695           0 :   transpose(src, p, dst, 8, 1);
    1696             : 
    1697             :   // Loop filtering
    1698           0 :   vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh);
    1699             : 
    1700           0 :   src[0] = t_dst;
    1701           0 :   dst[0] = s - 4;
    1702             : 
    1703             :   // Transpose back
    1704           0 :   transpose(src, 8, dst, p, 1);
    1705           0 : }
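The single-edge variant uses one 8x8 block, so the q0 row sits at
t_dst + 4 * 8 with pitch 8. A hedged usage sketch (frame, stride, and
coordinates are hypothetical; the thresholds follow the same pointer
convention as the wrapper itself):

    #include <stdint.h>
    #include "./vpx_dsp_rtcd.h"

    /* Filter the vertical edge between columns x - 1 and x over rows
     * y .. y + 7. s must point at the first q-side pixel (column x);
     * the wrapper itself backs up 4 columns to reach p3. */
    static void filter_left_edge_8(uint8_t *frame, int stride, int x,
                                   int y, const uint8_t *blimit,
                                   const uint8_t *limit,
                                   const uint8_t *thresh) {
      vpx_lpf_vertical_8_sse2(frame + y * stride + x, stride,
                              blimit, limit, thresh);
    }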
    1706             : 
    1707           0 : void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
    1708             :                                   const uint8_t *limit0, const uint8_t *thresh0,
    1709             :                                   const uint8_t *blimit1, const uint8_t *limit1,
    1710             :                                   const uint8_t *thresh1) {
    1711             :   DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
    1712             :   unsigned char *src[2];
    1713             :   unsigned char *dst[2];
    1714             : 
    1715             :   // Transpose the 16x8 region around the edge into an 8x16 buffer
    1716           0 :   transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
    1717             : 
    1718             :   // Loop filtering
    1719           0 :   vpx_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
    1720             :                                  blimit1, limit1, thresh1);
    1721           0 :   src[0] = t_dst;
    1722           0 :   src[1] = t_dst + 8;
    1723             : 
    1724           0 :   dst[0] = s - 4;
    1725           0 :   dst[1] = s - 4 + p * 8;
    1726             : 
    1727             :   // Transpose back
    1728           0 :   transpose(src, 16, dst, p, 2);
    1729           0 : }
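As in the 4-tap case, the dual 8 wrapper gathers both 8-row groups
into one 8x16 buffer so a single pass of the dual horizontal kernel
can apply an independent (blimit, limit, thresh) triple to each half.
One plausible way a dual kernel folds two byte thresholds into one
16-lane vector, lanes 0-7 for the first edge and lanes 8-15 for the
second (a sketch; the packing used by this file's horizontal kernels
may differ):

    #include <emmintrin.h>
    #include <stdint.h>

    /* Broadcast each threshold byte across 8 lanes, then concatenate
     * the two 64-bit halves: lanes 0..7 carry *thresh0 and lanes 8..15
     * carry *thresh1, matching the buffer halves above. */
    static __m128i pack_dual_thresh(const uint8_t *thresh0,
                                    const uint8_t *thresh1) {
      const __m128i t0 = _mm_set1_epi8((char)*thresh0);
      const __m128i t1 = _mm_set1_epi8((char)*thresh1);
      return _mm_unpacklo_epi64(t0, t1);
    }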
    1730             : 
    1731           0 : void vpx_lpf_vertical_16_sse2(unsigned char *s, int p,
    1732             :                               const unsigned char *blimit,
    1733             :                               const unsigned char *limit,
    1734             :                               const unsigned char *thresh) {
    1735             :   DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]);
    1736             :   unsigned char *src[2];
    1737             :   unsigned char *dst[2];
    1738             : 
    1739           0 :   src[0] = s - 8;
    1740           0 :   src[1] = s;
    1741           0 :   dst[0] = t_dst;
    1742           0 :   dst[1] = t_dst + 8 * 8;
    1743             : 
    1744             :   // Transpose the 8x16 region around the edge into a 16x8 buffer
    1745           0 :   transpose(src, p, dst, 8, 2);
    1746             : 
    1747             :   // Loop filtering
    1748           0 :   vpx_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh);
    1749             : 
    1750           0 :   src[0] = t_dst;
    1751           0 :   src[1] = t_dst + 8 * 8;
    1752           0 :   dst[0] = s - 8;
    1753           0 :   dst[1] = s;
    1754             : 
    1755             :   // Transpose back
    1756           0 :   transpose(src, 8, dst, p, 2);
    1757           0 : }
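The wide filter reads 8 pixels on each side of the edge, so this
wrapper transposes the 8 rows x 16 columns spanning s - 8 .. s + 7
into a 16 x 8 buffer: the p-side columns become rows 0..7 and the
q-side columns rows 8..15, which puts q0 at t_dst + 8 * 8. A scalar
sketch of the same shape, reusing the hypothetical transpose_8x8_c
reference from above:

    #include <stdint.h>
    #include "./vpx_dsp_rtcd.h"

    static void lpf_vertical_16_shape_c(uint8_t *s, int p,
                                        const uint8_t *blimit,
                                        const uint8_t *limit,
                                        const uint8_t *thresh) {
      uint8_t t_dst[8 * 16];
      transpose_8x8_c(s - 8, p, t_dst, 8);      /* p7..p0 -> rows 0..7  */
      transpose_8x8_c(s, p, t_dst + 8 * 8, 8);  /* q0..q7 -> rows 8..15 */
      vpx_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh);
      transpose_8x8_c(t_dst, 8, s - 8, p);      /* restore the p side   */
      transpose_8x8_c(t_dst + 8 * 8, 8, s, p);  /* restore the q side   */
    }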
    1758             : 
    1759           0 : void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
    1760             :                                    const uint8_t *blimit, const uint8_t *limit,
    1761             :                                    const uint8_t *thresh) {
    1762             :   DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
    1763             : 
    1764             :   // Transpose 16x16
    1765           0 :   transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
    1766           0 :   transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
    1767             : 
    1768             :   // Loop filtering
    1769           0 :   vpx_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh);
    1770             : 
    1771             :   // Transpose back
    1772           0 :   transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
    1773           0 :   transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
    1774           0 : }
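The dual wide filter needs the full 16 x 16 neighborhood, hence the
256-byte buffer: two transpose8x16 calls gather it, the kernel filters
at row 8 (t_dst + 8 * 16, the q0 row), and two more calls restore it.
Transposing a square block is its own inverse, which is why the gather
and restore stages are the same operation with source and destination
swapped. A scalar reference (hypothetical helper) makes that explicit:

    #include <stdint.h>

    /* One 16x16 byte transpose: out[j][i] = in[i][j]. Applying it
     * twice returns the original block, so the wrapper can reuse the
     * identical 8x16 transpose routine in both directions. */
    static void transpose_16x16_c(const uint8_t *in, int in_p,
                                  uint8_t *out, int out_p) {
      int i, j;
      for (i = 0; i < 16; ++i)
        for (j = 0; j < 16; ++j)
          out[j * out_p + i] = in[i * in_p + j];
    }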

Generated by: LCOV version 1.13