LCOV - code coverage report
Current view: top level - third_party/aom/aom_dsp/x86 - loopfilter_sse2.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 1085 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 18 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <emmintrin.h>  // SSE2
      13             : 
      14             : #include "./aom_dsp_rtcd.h"
      15             : #include "aom_ports/mem.h"
      16             : #include "aom_ports/emmintrin_compat.h"
      17             : 
      18           0 : static INLINE __m128i abs_diff(__m128i a, __m128i b) {
      19           0 :   return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
      20             : }
      21             : 
#if CONFIG_PARALLEL_DEBLOCKING
// filter_mask and hev_mask (4-tap variant: only p1/p0/q0/q1 are examined).
//
// Expects in scope: q1p1, q0p0, p1p0, q1q0 (rows packed into 64-bit lanes),
// zero, thresh (16-bit expanded hev threshold), and 'limit' which holds
// blimit in its low 8 bytes and limit in its high 8 bytes (see the callers'
// _mm_unpacklo_epi64 of the two loads).  Writes:
//   hev  - all-ones bytes where max(|p1-p0|, |q1-q0|) > thresh
//   mask - all-ones bytes where both the blimit test
//          (|p0-q0|*2 + |p1-q1|/2 <= blimit) and the limit test pass
#define FILTER_HEV_MASK4                                                      \
  do {                                                                        \
    /* (abs(q1 - q0), abs(p1 - p0) */                                         \
    __m128i flat = abs_diff(q1p1, q0p0);                                      \
    /* abs(p1 - q1), abs(p0 - q0) */                                          \
    const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);                        \
    __m128i abs_p0q0, abs_p1q1;                                               \
                                                                              \
    /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */       \
    hev =                                                                     \
        _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
    hev = _mm_cmpgt_epi16(hev, thresh);                                       \
    hev = _mm_packs_epi16(hev, hev);                                          \
                                                                              \
    /* const int8_t mask = filter_mask2(*limit, *blimit, */                   \
    /*                                  p1, p0, q0, q1); */                   \
    abs_p0q0 =                                                                \
        _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */     \
    abs_p1q1 =                                                                \
        _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */     \
    abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);                                   \
    abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */    \
    /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */                                 \
    mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);                                 \
    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));                       \
    /* low 8 bytes: blimit sum; high 8 bytes: max neighbour difference */     \
    mask = _mm_unpacklo_epi64(mask, flat);                                    \
    mask = _mm_subs_epu8(mask, limit);                                        \
    mask = _mm_cmpeq_epi8(mask, zero);                                        \
    /* combine the two 64-bit half-tests into one 8-byte mask */              \
    mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));                      \
  } while (0)
#endif  // CONFIG_PARALLEL_DEBLOCKING
      55             : 
// filter_mask and hev_mask (8-tap decision: p3..p0 and q0..q3 examined).
//
// Same contract as FILTER_HEV_MASK4, but additionally folds the outer-tap
// differences |p3-p2|, |p2-p1|, |q3-q2|, |q2-q1| into the limit test, so it
// also needs p3p2, p2p1, q3q2, q2q1 in scope.  'limit' holds blimit in its
// low 8 bytes and limit in its high 8 bytes.  Writes 'hev' and 'mask' as
// byte masks (all-ones where filtering is allowed / hev is set).
#define FILTER_HEV_MASK                                                       \
  do {                                                                        \
    /* (abs(q1 - q0), abs(p1 - p0) */                                         \
    __m128i flat = abs_diff(q1p1, q0p0);                                      \
    /* abs(p1 - q1), abs(p0 - q0) */                                          \
    const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);                        \
    __m128i abs_p0q0, abs_p1q1, work;                                         \
                                                                              \
    /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */       \
    hev =                                                                     \
        _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
    hev = _mm_cmpgt_epi16(hev, thresh);                                       \
    hev = _mm_packs_epi16(hev, hev);                                          \
                                                                              \
    /* const int8_t mask = filter_mask(*limit, *blimit, */                    \
    /*                                 p3, p2, p1, p0, q0, q1, q2, q3); */    \
    abs_p0q0 =                                                                \
        _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */     \
    abs_p1q1 =                                                                \
        _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */     \
    abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);                                   \
    abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */    \
    /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */                                 \
    mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);                                 \
    /* abs(p3 - p2), abs(p2 - p1) */                                          \
    work = abs_diff(p3p2, p2p1);                                              \
    flat = _mm_max_epu8(work, flat);                                          \
    /* abs(q3 - q2), abs(q2 - q1) */                                          \
    work = abs_diff(q3q2, q2q1);                                              \
    flat = _mm_max_epu8(work, flat);                                          \
    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));                       \
    /* low 8 bytes: blimit sum; high 8 bytes: max neighbour difference */     \
    mask = _mm_unpacklo_epi64(mask, flat);                                    \
    mask = _mm_subs_epu8(mask, limit);                                        \
    mask = _mm_cmpeq_epi8(mask, zero);                                        \
    /* combine the two 64-bit half-tests into one 8-byte mask */              \
    mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));                      \
  } while (0)
      93             : 
// 4-tap filter application.
//
// Expects in scope: p1p0, q1q0 (unsigned pixel rows packed in 64-bit lanes),
// hev and mask (byte masks from FILTER_HEV_MASK[4]), and ff (all-ones).
// Writes the filtered, re-offset results into ps1ps0 and qs1qs0 (p1/p0 and
// q1/q0 respectively, back in unsigned pixel domain).  NOTE: clobbers 'hev'
// (reused near the end to carry the p-side adjustment).  The >> 11 / >> 9
// shift amounts are the scalar >> 3 / >> 1 shifts plus 8, because each byte
// is first duplicated into a 16-bit lane to get an arithmetic byte shift.
#define FILTER4                                                             \
  do {                                                                      \
    const __m128i t3t4 =                                                    \
        _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);       \
    const __m128i t80 = _mm_set1_epi8(0x80);                                \
    __m128i filter, filter2filter1, work;                                   \
                                                                            \
    ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */                         \
    qs1qs0 = _mm_xor_si128(q1q0, t80);                                      \
                                                                            \
    /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */               \
    work = _mm_subs_epi8(ps1ps0, qs1qs0);                                   \
    filter = _mm_and_si128(_mm_srli_si128(work, 8), hev);                   \
    /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */      \
    filter = _mm_subs_epi8(filter, work);                                   \
    filter = _mm_subs_epi8(filter, work);                                   \
    filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */           \
    filter = _mm_and_si128(filter, mask); /* & mask */                      \
    filter = _mm_unpacklo_epi64(filter, filter);                            \
                                                                            \
    /* filter1 = signed_char_clamp(filter + 4) >> 3; */                     \
    /* filter2 = signed_char_clamp(filter + 3) >> 3; */                     \
    filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */   \
    filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1);             \
    filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1);     \
    filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */         \
    filter = _mm_srai_epi16(filter, 11);                 /* >> 3 */         \
    filter2filter1 = _mm_packs_epi16(filter2filter1, filter);               \
                                                                            \
    /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */                   \
    filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */                   \
    filter = _mm_unpacklo_epi8(filter, filter);                             \
    filter = _mm_srai_epi16(filter, 9); /* round */                         \
    filter = _mm_packs_epi16(filter, filter);                               \
    filter = _mm_andnot_si128(hev, filter);                                 \
                                                                            \
    hev = _mm_unpackhi_epi64(filter2filter1, filter);                       \
    filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter);            \
                                                                            \
    /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \
    qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1);                         \
    /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \
    ps1ps0 = _mm_adds_epi8(ps1ps0, hev);                                    \
    qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */                       \
    ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */                       \
  } while (0)
     140             : 
// 4-tap loop filter across a horizontal block edge, 8 pixels at a time.
// Conditionally smooths the two rows above the edge (p1, p0) and the two
// rows below it (q0, q1).  's' points at the first pixel of the row just
// below the edge; 'p' is the stride in bytes.  _blimit bounds the combined
// cross-edge difference |p0-q0|*2 + |p1-q1|/2, _limit bounds each
// neighbouring-pixel difference, and _thresh selects the high-edge-variance
// path (see FILTER_HEV_MASK / FILTER4).
void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
                               const uint8_t *_blimit, const uint8_t *_limit,
                               const uint8_t *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  // blimit in the low 8 bytes, limit in the high 8 bytes; the mask macros
  // test both lanes with a single _mm_subs_epu8 against this register.
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
                         _mm_loadl_epi64((const __m128i *)_limit));
  const __m128i thresh =
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
#if !CONFIG_PARALLEL_DEBLOCKING
  __m128i p3p2, p2p1, q3q2, q2q1;
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0;
  __m128i mask, hev;
  // Each register packs two 8-pixel rows into its two 64-bit lanes,
  // e.g. q1p1 = (low: row at s - 2p (p1), high: row at s + 1p (q1)).
#if !CONFIG_PARALLEL_DEBLOCKING
  p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
                            _mm_loadl_epi64((__m128i *)(s - 4 * p)));
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 0 * p)));
#if !CONFIG_PARALLEL_DEBLOCKING
  q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
#if !CONFIG_PARALLEL_DEBLOCKING
  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
#endif  // !CONFIG_PARALLEL_DEBLOCKING
#if !CONFIG_PARALLEL_DEBLOCKING
  FILTER_HEV_MASK;
#else   // CONFIG_PARALLEL_DEBLOCKING
  FILTER_HEV_MASK4;
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  FILTER4;

  // Store the filtered rows back: the high lane holds p1/q1, the low lane
  // holds p0/q0.
  _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0));  // *op1
  _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);               // *op0
  _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);               // *oq0
  _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0));  // *oq1
}
     186             : 
// 4-tap loop filter across a vertical block edge, 8 rows at a time.
// 's' points at the first pixel to the right of the edge in the top row;
// 'p' is the row stride in bytes.  Loads 8x8 pixels straddling the edge,
// transposes them so columns become SIMD rows, runs the same
// FILTER_HEV_MASK / FILTER4 pipeline as the horizontal filter, then
// transposes the four filtered columns back and stores them with 4-byte
// writes.  Threshold semantics match aom_lpf_horizontal_4_sse2.
void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
                             const uint8_t *_blimit, const uint8_t *_limit,
                             const uint8_t *_thresh) {
  const __m128i zero = _mm_set1_epi16(0);
  // blimit in the low 8 bytes, limit in the high 8 bytes (see the macros).
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
                         _mm_loadl_epi64((const __m128i *)_limit));
  const __m128i thresh =
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
  __m128i x0, x1, x2, x3;
#if !CONFIG_PARALLEL_DEBLOCKING
  __m128i p3p2, p2p1, q3q2, q2q1;
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0;
  __m128i mask, hev;

  // q1q0 and p1p0 are used as scratch during the load/transpose below and
  // are overwritten with their real contents before the filter runs.
  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)),
                           _mm_loadl_epi64((__m128i *)(s + 1 * p - 4)));

  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)),
                         _mm_loadl_epi64((__m128i *)(s + 3 * p - 4)));

  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
  x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)),
                         _mm_loadl_epi64((__m128i *)(s + 5 * p - 4)));

  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
  x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)),
                         _mm_loadl_epi64((__m128i *)(s + 7 * p - 4)));

  // Transpose 8x8
  // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
  p1p0 = _mm_unpacklo_epi16(q1q0, x1);
  // 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
  x0 = _mm_unpacklo_epi16(x2, x3);
#if !CONFIG_PARALLEL_DEBLOCKING
  // 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
  p3p2 = _mm_unpacklo_epi32(p1p0, x0);
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  // 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
  p1p0 = _mm_unpackhi_epi32(p1p0, x0);
#if !CONFIG_PARALLEL_DEBLOCKING
  p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8));  // swap lo and high
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8));  // swap lo and high

  // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
  q1q0 = _mm_unpackhi_epi16(q1q0, x1);
  // 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
  x2 = _mm_unpackhi_epi16(x2, x3);
#if !CONFIG_PARALLEL_DEBLOCKING
  // 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
  q3q2 = _mm_unpackhi_epi32(q1q0, x2);
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  // 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
  q1q0 = _mm_unpacklo_epi32(q1q0, x2);

  q0p0 = _mm_unpacklo_epi64(p1p0, q1q0);
  q1p1 = _mm_unpackhi_epi64(p1p0, q1q0);
  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
#if !CONFIG_PARALLEL_DEBLOCKING
  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
#endif  // !CONFIG_PARALLEL_DEBLOCKING
#if !CONFIG_PARALLEL_DEBLOCKING
  FILTER_HEV_MASK;
#else   // CONFIG_PARALLEL_DEBLOCKING
  FILTER_HEV_MASK4;
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  FILTER4;

  // Transpose 8x4 to 4x8
  // qs1qs0: 20 21 22 23 24 25 26 27  30 31 32 33 34 34 36 37
  // ps1ps0: 10 11 12 13 14 15 16 17  00 01 02 03 04 05 06 07
  // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
  ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8));
  // 10 30 11 31 12 32 13 33  14 34 15 35 16 36 17 37
  x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0);
  // 00 20 01 21 02 22 03 23  04 24 05 25 06 26 07 27
  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0);
  // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
  qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0);
  // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
  ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);

  // Write the four filtered columns (p1 p0 q0 q1) back one row at a time.
  *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
  *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
  *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
  ps1ps0 = _mm_srli_si128(ps1ps0, 4);
  *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0);

  *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
  *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
  *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
  qs1qs0 = _mm_srli_si128(qs1qs0, 4);
  *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
}
     291             : 
     292           0 : void aom_lpf_horizontal_edge_8_sse2(unsigned char *s, int p,
     293             :                                     const unsigned char *_blimit,
     294             :                                     const unsigned char *_limit,
     295             :                                     const unsigned char *_thresh) {
     296           0 :   const __m128i zero = _mm_set1_epi16(0);
     297           0 :   const __m128i one = _mm_set1_epi8(1);
     298           0 :   const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
     299           0 :   const __m128i limit = _mm_load_si128((const __m128i *)_limit);
     300           0 :   const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
     301             :   __m128i mask, hev, flat, flat2;
     302             :   __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
     303             :   __m128i abs_p1p0;
     304             : 
     305           0 :   q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
     306           0 :   q4p4 = _mm_castps_si128(
     307           0 :       _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
     308           0 :   q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
     309           0 :   q3p3 = _mm_castps_si128(
     310           0 :       _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
     311           0 :   q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
     312           0 :   q2p2 = _mm_castps_si128(
     313           0 :       _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
     314           0 :   q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
     315           0 :   q1p1 = _mm_castps_si128(
     316           0 :       _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
     317           0 :   p1q1 = _mm_shuffle_epi32(q1p1, 78);
     318           0 :   q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
     319           0 :   q0p0 = _mm_castps_si128(
     320             :       _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
     321           0 :   p0q0 = _mm_shuffle_epi32(q0p0, 78);
     322             : 
     323             :   {
     324             :     __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
     325           0 :     abs_p1p0 = abs_diff(q1p1, q0p0);
     326           0 :     abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
     327           0 :     fe = _mm_set1_epi8(0xfe);
     328           0 :     ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
     329           0 :     abs_p0q0 = abs_diff(q0p0, p0q0);
     330           0 :     abs_p1q1 = abs_diff(q1p1, p1q1);
     331           0 :     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
     332           0 :     hev = _mm_subs_epu8(flat, thresh);
     333           0 :     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
     334             : 
     335           0 :     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
     336           0 :     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
     337           0 :     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
     338           0 :     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     339             :     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
     340           0 :     mask = _mm_max_epu8(abs_p1p0, mask);
     341             :     // mask |= (abs(p1 - p0) > limit) * -1;
     342             :     // mask |= (abs(q1 - q0) > limit) * -1;
     343             : 
     344           0 :     work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
     345           0 :     mask = _mm_max_epu8(work, mask);
     346           0 :     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
     347           0 :     mask = _mm_subs_epu8(mask, limit);
     348           0 :     mask = _mm_cmpeq_epi8(mask, zero);
     349             :   }
     350             : 
     351             :   // lp filter
     352             :   {
     353           0 :     const __m128i t4 = _mm_set1_epi8(4);
     354           0 :     const __m128i t3 = _mm_set1_epi8(3);
     355           0 :     const __m128i t80 = _mm_set1_epi8(0x80);
     356           0 :     const __m128i t1 = _mm_set1_epi16(0x1);
     357           0 :     __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
     358           0 :     __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
     359           0 :     __m128i qs0 = _mm_xor_si128(p0q0, t80);
     360           0 :     __m128i qs1 = _mm_xor_si128(p1q1, t80);
     361             :     __m128i filt;
     362             :     __m128i work_a;
     363             :     __m128i filter1, filter2;
     364             :     __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
     365             :     __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
     366             : 
     367           0 :     filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
     368           0 :     work_a = _mm_subs_epi8(qs0, qs0ps0);
     369           0 :     filt = _mm_adds_epi8(filt, work_a);
     370           0 :     filt = _mm_adds_epi8(filt, work_a);
     371           0 :     filt = _mm_adds_epi8(filt, work_a);
     372             :     // (aom_filter + 3 * (qs0 - ps0)) & mask
     373           0 :     filt = _mm_and_si128(filt, mask);
     374             : 
     375           0 :     filter1 = _mm_adds_epi8(filt, t4);
     376           0 :     filter2 = _mm_adds_epi8(filt, t3);
     377             : 
     378           0 :     filter1 = _mm_unpacklo_epi8(zero, filter1);
     379           0 :     filter1 = _mm_srai_epi16(filter1, 0xB);
     380           0 :     filter2 = _mm_unpacklo_epi8(zero, filter2);
     381           0 :     filter2 = _mm_srai_epi16(filter2, 0xB);
     382             : 
     383             :     // Filter1 >> 3
     384           0 :     filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
     385           0 :     qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
     386             : 
     387             :     // filt >> 1
     388           0 :     filt = _mm_adds_epi16(filter1, t1);
     389           0 :     filt = _mm_srai_epi16(filt, 1);
     390           0 :     filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
     391             :                             filt);
     392           0 :     filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
     393           0 :     qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
     394             :     // loopfilter done
     395             : 
     396             :     {
     397             :       __m128i work;
     398           0 :       flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
     399           0 :       flat = _mm_max_epu8(abs_p1p0, flat);
     400           0 :       flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
     401           0 :       flat = _mm_subs_epu8(flat, one);
     402           0 :       flat = _mm_cmpeq_epi8(flat, zero);
     403           0 :       flat = _mm_and_si128(flat, mask);
     404             : 
     405           0 :       q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
     406           0 :       q5p5 = _mm_castps_si128(
     407           0 :           _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));
     408             : 
     409           0 :       q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
     410           0 :       q6p6 = _mm_castps_si128(
     411           0 :           _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));
     412           0 :       flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));
     413             : 
     414           0 :       q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
     415           0 :       q7p7 = _mm_castps_si128(
     416           0 :           _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));
     417           0 :       work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
     418           0 :       flat2 = _mm_max_epu8(work, flat2);
     419           0 :       flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
     420           0 :       flat2 = _mm_subs_epu8(flat2, one);
     421           0 :       flat2 = _mm_cmpeq_epi8(flat2, zero);
     422           0 :       flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
     423             :     }
     424             : 
     425             :     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     426             :     // flat and wide flat calculations
     427             :     {
     428           0 :       const __m128i eight = _mm_set1_epi16(8);
     429           0 :       const __m128i four = _mm_set1_epi16(4);
     430             :       __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
     431             :       __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
     432             :       __m128i pixelFilter_p, pixelFilter_q;
     433             :       __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
     434             :       __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
     435             : 
     436           0 :       p7_16 = _mm_unpacklo_epi8(q7p7, zero);
     437           0 :       p6_16 = _mm_unpacklo_epi8(q6p6, zero);
     438           0 :       p5_16 = _mm_unpacklo_epi8(q5p5, zero);
     439           0 :       p4_16 = _mm_unpacklo_epi8(q4p4, zero);
     440           0 :       p3_16 = _mm_unpacklo_epi8(q3p3, zero);
     441           0 :       p2_16 = _mm_unpacklo_epi8(q2p2, zero);
     442           0 :       p1_16 = _mm_unpacklo_epi8(q1p1, zero);
     443           0 :       p0_16 = _mm_unpacklo_epi8(q0p0, zero);
     444           0 :       q0_16 = _mm_unpackhi_epi8(q0p0, zero);
     445           0 :       q1_16 = _mm_unpackhi_epi8(q1p1, zero);
     446           0 :       q2_16 = _mm_unpackhi_epi8(q2p2, zero);
     447           0 :       q3_16 = _mm_unpackhi_epi8(q3p3, zero);
     448           0 :       q4_16 = _mm_unpackhi_epi8(q4p4, zero);
     449           0 :       q5_16 = _mm_unpackhi_epi8(q5p5, zero);
     450           0 :       q6_16 = _mm_unpackhi_epi8(q6p6, zero);
     451           0 :       q7_16 = _mm_unpackhi_epi8(q7p7, zero);
     452             : 
     453           0 :       pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
     454             :                                     _mm_add_epi16(p4_16, p3_16));
     455           0 :       pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
     456             :                                     _mm_add_epi16(q4_16, q3_16));
     457             : 
     458           0 :       pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
     459           0 :       pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
     460             : 
     461           0 :       pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
     462           0 :       pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
     463           0 :       pixelFilter_p =
     464           0 :           _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
     465           0 :       pixetFilter_p2p1p0 = _mm_add_epi16(
     466             :           four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
     467           0 :       res_p = _mm_srli_epi16(
     468             :           _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
     469           0 :       res_q = _mm_srli_epi16(
     470             :           _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
     471           0 :       flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
     472           0 :       res_p = _mm_srli_epi16(
     473             :           _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
     474           0 :       res_q = _mm_srli_epi16(
     475             :           _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
     476             : 
     477           0 :       flat_q0p0 = _mm_packus_epi16(res_p, res_q);
     478             : 
     479           0 :       sum_p7 = _mm_add_epi16(p7_16, p7_16);
     480           0 :       sum_q7 = _mm_add_epi16(q7_16, q7_16);
     481           0 :       sum_p3 = _mm_add_epi16(p3_16, p3_16);
     482           0 :       sum_q3 = _mm_add_epi16(q3_16, q3_16);
     483             : 
     484           0 :       pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
     485           0 :       pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
     486           0 :       res_p = _mm_srli_epi16(
     487             :           _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
     488           0 :       res_q = _mm_srli_epi16(
     489             :           _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
     490           0 :       flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
     491             : 
     492           0 :       pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
     493           0 :       pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
     494           0 :       res_p = _mm_srli_epi16(
     495             :           _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
     496           0 :       res_q = _mm_srli_epi16(
     497             :           _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
     498           0 :       flat_q1p1 = _mm_packus_epi16(res_p, res_q);
     499             : 
     500           0 :       sum_p7 = _mm_add_epi16(sum_p7, p7_16);
     501           0 :       sum_q7 = _mm_add_epi16(sum_q7, q7_16);
     502           0 :       sum_p3 = _mm_add_epi16(sum_p3, p3_16);
     503           0 :       sum_q3 = _mm_add_epi16(sum_q3, q3_16);
     504             : 
     505           0 :       pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
     506           0 :       pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
     507           0 :       res_p = _mm_srli_epi16(
     508             :           _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
     509           0 :       res_q = _mm_srli_epi16(
     510             :           _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
     511           0 :       flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
     512             : 
     513           0 :       pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
     514           0 :       pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
     515             : 
     516           0 :       res_p = _mm_srli_epi16(
     517             :           _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
     518           0 :       res_q = _mm_srli_epi16(
     519             :           _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
     520           0 :       flat_q2p2 = _mm_packus_epi16(res_p, res_q);
     521             : 
     522           0 :       sum_p7 = _mm_add_epi16(sum_p7, p7_16);
     523           0 :       sum_q7 = _mm_add_epi16(sum_q7, q7_16);
     524           0 :       pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
     525           0 :       pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
     526           0 :       res_p = _mm_srli_epi16(
     527             :           _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
     528           0 :       res_q = _mm_srli_epi16(
     529             :           _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
     530           0 :       flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
     531             : 
     532           0 :       sum_p7 = _mm_add_epi16(sum_p7, p7_16);
     533           0 :       sum_q7 = _mm_add_epi16(sum_q7, q7_16);
     534           0 :       pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
     535           0 :       pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
     536           0 :       res_p = _mm_srli_epi16(
     537             :           _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
     538           0 :       res_q = _mm_srli_epi16(
     539             :           _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
     540           0 :       flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
     541             : 
     542           0 :       sum_p7 = _mm_add_epi16(sum_p7, p7_16);
     543           0 :       sum_q7 = _mm_add_epi16(sum_q7, q7_16);
     544           0 :       pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
     545           0 :       pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
     546           0 :       res_p = _mm_srli_epi16(
     547             :           _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
     548           0 :       res_q = _mm_srli_epi16(
     549             :           _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
     550           0 :       flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
     551             : 
     552           0 :       sum_p7 = _mm_add_epi16(sum_p7, p7_16);
     553           0 :       sum_q7 = _mm_add_epi16(sum_q7, q7_16);
     554           0 :       pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
     555           0 :       pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
     556           0 :       res_p = _mm_srli_epi16(
     557             :           _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
     558           0 :       res_q = _mm_srli_epi16(
     559             :           _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
     560           0 :       flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
     561             :     }
     562             :     // wide flat
     563             :     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     564             : 
     565           0 :     flat = _mm_shuffle_epi32(flat, 68);
     566           0 :     flat2 = _mm_shuffle_epi32(flat2, 68);
     567             : 
     568           0 :     q2p2 = _mm_andnot_si128(flat, q2p2);
     569           0 :     flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
     570           0 :     q2p2 = _mm_or_si128(q2p2, flat_q2p2);
     571             : 
     572           0 :     qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
     573           0 :     flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
     574           0 :     q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
     575             : 
     576           0 :     qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
     577           0 :     flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
     578           0 :     q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
     579             : 
     580           0 :     q6p6 = _mm_andnot_si128(flat2, q6p6);
     581           0 :     flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
     582           0 :     q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
     583           0 :     _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
     584           0 :     _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));
     585             : 
     586           0 :     q5p5 = _mm_andnot_si128(flat2, q5p5);
     587           0 :     flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
     588           0 :     q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
     589           0 :     _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
     590           0 :     _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
     591             : 
     592           0 :     q4p4 = _mm_andnot_si128(flat2, q4p4);
     593           0 :     flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
     594           0 :     q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
     595           0 :     _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
     596           0 :     _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
     597             : 
     598           0 :     q3p3 = _mm_andnot_si128(flat2, q3p3);
     599           0 :     flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
     600           0 :     q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
     601           0 :     _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
     602           0 :     _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
     603             : 
     604           0 :     q2p2 = _mm_andnot_si128(flat2, q2p2);
     605           0 :     flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
     606           0 :     q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
     607           0 :     _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
     608           0 :     _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
     609             : 
     610           0 :     q1p1 = _mm_andnot_si128(flat2, q1p1);
     611           0 :     flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
     612           0 :     q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
     613           0 :     _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
     614           0 :     _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
     615             : 
     616           0 :     q0p0 = _mm_andnot_si128(flat2, q0p0);
     617           0 :     flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
     618           0 :     q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
     619           0 :     _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
     620           0 :     _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
     621             :   }
     622           0 : }
     623             : 
     624           0 : static INLINE __m128i filter_add2_sub2(const __m128i *const total,
     625             :                                        const __m128i *const a1,
     626             :                                        const __m128i *const a2,
     627             :                                        const __m128i *const s1,
     628             :                                        const __m128i *const s2) {
     629           0 :   __m128i x = _mm_add_epi16(*a1, *total);
     630           0 :   x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
     631           0 :   return x;
     632             : }
     633             : 
     634           0 : static INLINE __m128i filter8_mask(const __m128i *const flat,
     635             :                                    const __m128i *const other_filt,
     636             :                                    const __m128i *const f8_lo,
     637             :                                    const __m128i *const f8_hi) {
     638           0 :   const __m128i f8 =
     639           0 :       _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3));
     640           0 :   const __m128i result = _mm_and_si128(*flat, f8);
     641           0 :   return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
     642             : }
     643             : 
     644           0 : static INLINE __m128i filter16_mask(const __m128i *const flat,
     645             :                                     const __m128i *const other_filt,
     646             :                                     const __m128i *const f_lo,
     647             :                                     const __m128i *const f_hi) {
     648           0 :   const __m128i f =
     649           0 :       _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4));
     650           0 :   const __m128i result = _mm_and_si128(*flat, f);
     651           0 :   return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
     652             : }
     653             : 
     654           0 : void aom_lpf_horizontal_edge_16_sse2(unsigned char *s, int p,
     655             :                                      const unsigned char *_blimit,
     656             :                                      const unsigned char *_limit,
     657             :                                      const unsigned char *_thresh) {
     658           0 :   const __m128i zero = _mm_set1_epi16(0);
     659           0 :   const __m128i one = _mm_set1_epi8(1);
     660           0 :   const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
     661           0 :   const __m128i limit = _mm_load_si128((const __m128i *)_limit);
     662           0 :   const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
     663             :   __m128i mask, hev, flat, flat2;
     664             :   __m128i p7, p6, p5;
     665             :   __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
     666             :   __m128i q5, q6, q7;
     667             : 
     668             :   __m128i op2, op1, op0, oq0, oq1, oq2;
     669             : 
     670             :   __m128i max_abs_p1p0q1q0;
     671             : 
     672           0 :   p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
     673           0 :   p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
     674           0 :   p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
     675           0 :   p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
     676           0 :   p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
     677           0 :   p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
     678           0 :   p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
     679           0 :   p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
     680           0 :   q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
     681           0 :   q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
     682           0 :   q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
     683           0 :   q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
     684           0 :   q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
     685           0 :   q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
     686           0 :   q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
     687           0 :   q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
     688             : 
     689             :   {
     690           0 :     const __m128i abs_p1p0 = abs_diff(p1, p0);
     691           0 :     const __m128i abs_q1q0 = abs_diff(q1, q0);
     692           0 :     const __m128i fe = _mm_set1_epi8(0xfe);
     693           0 :     const __m128i ff = _mm_cmpeq_epi8(zero, zero);
     694           0 :     __m128i abs_p0q0 = abs_diff(p0, q0);
     695           0 :     __m128i abs_p1q1 = abs_diff(p1, q1);
     696             :     __m128i work;
     697           0 :     max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
     698             : 
     699           0 :     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
     700           0 :     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
     701           0 :     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
     702           0 :     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     703             :     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
     704           0 :     mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
     705             :     // mask |= (abs(p1 - p0) > limit) * -1;
     706             :     // mask |= (abs(q1 - q0) > limit) * -1;
     707           0 :     work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
     708           0 :     mask = _mm_max_epu8(work, mask);
     709           0 :     work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
     710           0 :     mask = _mm_max_epu8(work, mask);
     711           0 :     mask = _mm_subs_epu8(mask, limit);
     712           0 :     mask = _mm_cmpeq_epi8(mask, zero);
     713             :   }
     714             : 
     715             :   {
     716             :     __m128i work;
     717           0 :     work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
     718           0 :     flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
     719           0 :     work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
     720           0 :     flat = _mm_max_epu8(work, flat);
     721           0 :     work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
     722           0 :     flat = _mm_subs_epu8(flat, one);
     723           0 :     flat = _mm_cmpeq_epi8(flat, zero);
     724           0 :     flat = _mm_and_si128(flat, mask);
     725           0 :     flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
     726           0 :     flat2 = _mm_max_epu8(work, flat2);
     727           0 :     work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
     728           0 :     flat2 = _mm_max_epu8(work, flat2);
     729           0 :     work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0));
     730           0 :     flat2 = _mm_max_epu8(work, flat2);
     731           0 :     flat2 = _mm_subs_epu8(flat2, one);
     732           0 :     flat2 = _mm_cmpeq_epi8(flat2, zero);
     733           0 :     flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
     734             :   }
     735             : 
     736             :   // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     737             :   // filter4
     738             :   {
     739           0 :     const __m128i t4 = _mm_set1_epi8(4);
     740           0 :     const __m128i t3 = _mm_set1_epi8(3);
     741           0 :     const __m128i t80 = _mm_set1_epi8(0x80);
     742           0 :     const __m128i te0 = _mm_set1_epi8(0xe0);
     743           0 :     const __m128i t1f = _mm_set1_epi8(0x1f);
     744           0 :     const __m128i t1 = _mm_set1_epi8(0x1);
     745           0 :     const __m128i t7f = _mm_set1_epi8(0x7f);
     746           0 :     const __m128i ff = _mm_cmpeq_epi8(t4, t4);
     747             : 
     748             :     __m128i filt;
     749             :     __m128i work_a;
     750             :     __m128i filter1, filter2;
     751             : 
     752           0 :     op1 = _mm_xor_si128(p1, t80);
     753           0 :     op0 = _mm_xor_si128(p0, t80);
     754           0 :     oq0 = _mm_xor_si128(q0, t80);
     755           0 :     oq1 = _mm_xor_si128(q1, t80);
     756             : 
     757           0 :     hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
     758           0 :     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
     759           0 :     filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
     760             : 
     761           0 :     work_a = _mm_subs_epi8(oq0, op0);
     762           0 :     filt = _mm_adds_epi8(filt, work_a);
     763           0 :     filt = _mm_adds_epi8(filt, work_a);
     764           0 :     filt = _mm_adds_epi8(filt, work_a);
     765             :     // (aom_filter + 3 * (qs0 - ps0)) & mask
     766           0 :     filt = _mm_and_si128(filt, mask);
     767           0 :     filter1 = _mm_adds_epi8(filt, t4);
     768           0 :     filter2 = _mm_adds_epi8(filt, t3);
     769             : 
     770             :     // Filter1 >> 3
     771           0 :     work_a = _mm_cmpgt_epi8(zero, filter1);
     772           0 :     filter1 = _mm_srli_epi16(filter1, 3);
     773           0 :     work_a = _mm_and_si128(work_a, te0);
     774           0 :     filter1 = _mm_and_si128(filter1, t1f);
     775           0 :     filter1 = _mm_or_si128(filter1, work_a);
     776           0 :     oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
     777             : 
     778             :     // Filter2 >> 3
     779           0 :     work_a = _mm_cmpgt_epi8(zero, filter2);
     780           0 :     filter2 = _mm_srli_epi16(filter2, 3);
     781           0 :     work_a = _mm_and_si128(work_a, te0);
     782           0 :     filter2 = _mm_and_si128(filter2, t1f);
     783           0 :     filter2 = _mm_or_si128(filter2, work_a);
     784           0 :     op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
     785             : 
     786             :     // filt >> 1
     787           0 :     filt = _mm_adds_epi8(filter1, t1);
     788           0 :     work_a = _mm_cmpgt_epi8(zero, filt);
     789           0 :     filt = _mm_srli_epi16(filt, 1);
     790           0 :     work_a = _mm_and_si128(work_a, t80);
     791           0 :     filt = _mm_and_si128(filt, t7f);
     792           0 :     filt = _mm_or_si128(filt, work_a);
     793           0 :     filt = _mm_andnot_si128(hev, filt);
     794           0 :     op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
     795           0 :     oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
     796             :     // loopfilter done
     797             : 
     798             :     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     799             :     // filter8
     800             :     {
     801           0 :       const __m128i four = _mm_set1_epi16(4);
     802           0 :       const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
     803           0 :       const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
     804           0 :       const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
     805           0 :       const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
     806           0 :       const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
     807           0 :       const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
     808           0 :       const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
     809           0 :       const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
     810             : 
     811           0 :       const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
     812           0 :       const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
     813           0 :       const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
     814           0 :       const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
     815           0 :       const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
     816           0 :       const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
     817           0 :       const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
     818           0 :       const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
     819             :       __m128i f8_lo, f8_hi;
     820             : 
     821           0 :       f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
     822             :                             _mm_add_epi16(p3_lo, p2_lo));
     823           0 :       f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
     824             :                             _mm_add_epi16(p2_lo, p1_lo));
     825           0 :       f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
     826             : 
     827           0 :       f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
     828             :                             _mm_add_epi16(p3_hi, p2_hi));
     829           0 :       f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
     830             :                             _mm_add_epi16(p2_hi, p1_hi));
     831           0 :       f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
     832             : 
     833           0 :       op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
     834             : 
     835           0 :       f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
     836           0 :       f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
     837           0 :       op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
     838             : 
     839           0 :       f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
     840           0 :       f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
     841           0 :       op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
     842             : 
     843           0 :       f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
     844           0 :       f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
     845           0 :       oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
     846             : 
     847           0 :       f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
     848           0 :       f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
     849           0 :       oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
     850             : 
     851           0 :       f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
     852           0 :       f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
     853           0 :       oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
     854             :     }
     855             : 
     856             :     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     857             :     // wide flat calculations
     858             :     {
     859           0 :       const __m128i eight = _mm_set1_epi16(8);
     860           0 :       const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero);
     861           0 :       const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
     862           0 :       const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
     863           0 :       const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
     864           0 :       const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
     865           0 :       const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
     866           0 :       const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
     867           0 :       const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
     868           0 :       const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
     869           0 :       const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
     870           0 :       const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
     871           0 :       const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
     872           0 :       const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
     873           0 :       const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
     874           0 :       const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
     875           0 :       const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero);
     876             : 
     877           0 :       const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero);
     878           0 :       const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
     879           0 :       const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
     880           0 :       const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
     881           0 :       const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
     882           0 :       const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
     883           0 :       const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
     884           0 :       const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
     885           0 :       const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
     886           0 :       const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
     887           0 :       const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
     888           0 :       const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
     889           0 :       const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
     890           0 :       const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
     891           0 :       const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
     892           0 :       const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero);
     893             : 
     894             :       __m128i f_lo;
     895             :       __m128i f_hi;
     896             : 
     897           0 :       f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo);  // p7 * 7
     898           0 :       f_lo =
     899           0 :           _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), _mm_add_epi16(p4_lo, f_lo));
     900           0 :       f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
     901             :                            _mm_add_epi16(p2_lo, p1_lo));
     902           0 :       f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
     903           0 :       f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo);
     904             : 
     905           0 :       f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi);  // p7 * 7
     906           0 :       f_hi =
     907           0 :           _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), _mm_add_epi16(p4_hi, f_hi));
     908           0 :       f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
     909             :                            _mm_add_epi16(p2_hi, p1_hi));
     910           0 :       f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
     911           0 :       f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);
     912             : 
     913           0 :       p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
     914           0 :       _mm_storeu_si128((__m128i *)(s - 7 * p), p6);
     915             : 
     916           0 :       f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
     917           0 :       f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
     918           0 :       p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
     919           0 :       _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
     920             : 
     921           0 :       f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
     922           0 :       f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
     923           0 :       p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
     924           0 :       _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
     925             : 
     926           0 :       f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
     927           0 :       f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
     928           0 :       p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
     929           0 :       _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
     930             : 
     931           0 :       f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
     932           0 :       f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
     933           0 :       op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
     934           0 :       _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
     935             : 
     936           0 :       f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
     937           0 :       f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
     938           0 :       op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
     939           0 :       _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
     940             : 
     941           0 :       f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
     942           0 :       f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
     943           0 :       op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
     944           0 :       _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
     945             : 
     946           0 :       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
     947           0 :       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
     948           0 :       oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
     949           0 :       _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
     950             : 
     951           0 :       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
     952           0 :       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
     953           0 :       oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
     954           0 :       _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
     955             : 
     956           0 :       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
     957           0 :       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
     958           0 :       oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
     959           0 :       _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
     960             : 
     961           0 :       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
     962           0 :       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
     963           0 :       q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
     964           0 :       _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
     965             : 
     966           0 :       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
     967           0 :       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
     968           0 :       q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
     969           0 :       _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
     970             : 
     971           0 :       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
     972           0 :       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
     973           0 :       q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
     974           0 :       _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
     975             : 
     976           0 :       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
     977           0 :       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
     978           0 :       q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
     979           0 :       _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
     980             :     }
     981             :     // wide flat
     982             :     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     983             :   }
     984           0 : }
     985             : 
     986           0 : void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
     987             :                                const unsigned char *_blimit,
     988             :                                const unsigned char *_limit,
     989             :                                const unsigned char *_thresh) {
     990             :   DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
     991             :   DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
     992             :   DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
     993             :   DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
     994             :   DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
     995             :   DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
     996           0 :   const __m128i zero = _mm_set1_epi16(0);
     997           0 :   const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
     998           0 :   const __m128i limit = _mm_load_si128((const __m128i *)_limit);
     999           0 :   const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
    1000             :   __m128i mask, hev, flat;
    1001             :   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
    1002             :   __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
    1003             : 
    1004           0 :   q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
    1005           0 :                             _mm_loadl_epi64((__m128i *)(s + 3 * p)));
    1006           0 :   q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
    1007           0 :                             _mm_loadl_epi64((__m128i *)(s + 2 * p)));
    1008           0 :   q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
    1009           0 :                             _mm_loadl_epi64((__m128i *)(s + 1 * p)));
    1010           0 :   q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
    1011             :                             _mm_loadl_epi64((__m128i *)(s - 0 * p)));
    1012           0 :   p1q1 = _mm_shuffle_epi32(q1p1, 78);
    1013           0 :   p0q0 = _mm_shuffle_epi32(q0p0, 78);
    1014             : 
    1015             :   {
    1016             :     // filter_mask and hev_mask
    1017           0 :     const __m128i one = _mm_set1_epi8(1);
    1018           0 :     const __m128i fe = _mm_set1_epi8(0xfe);
    1019           0 :     const __m128i ff = _mm_cmpeq_epi8(fe, fe);
    1020             :     __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
    1021           0 :     abs_p1p0 = abs_diff(q1p1, q0p0);
    1022           0 :     abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
    1023             : 
    1024           0 :     abs_p0q0 = abs_diff(q0p0, p0q0);
    1025           0 :     abs_p1q1 = abs_diff(q1p1, p1q1);
    1026           0 :     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    1027           0 :     hev = _mm_subs_epu8(flat, thresh);
    1028           0 :     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
    1029             : 
    1030           0 :     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    1031           0 :     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    1032           0 :     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    1033           0 :     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    1034             :     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    1035           0 :     mask = _mm_max_epu8(abs_p1p0, mask);
    1036             :     // mask |= (abs(p1 - p0) > limit) * -1;
    1037             :     // mask |= (abs(q1 - q0) > limit) * -1;
    1038             : 
    1039           0 :     work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
    1040           0 :     mask = _mm_max_epu8(work, mask);
    1041           0 :     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    1042           0 :     mask = _mm_subs_epu8(mask, limit);
    1043           0 :     mask = _mm_cmpeq_epi8(mask, zero);
    1044             : 
    1045             :     // flat_mask4
    1046             : 
    1047           0 :     flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
    1048           0 :     flat = _mm_max_epu8(abs_p1p0, flat);
    1049           0 :     flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
    1050           0 :     flat = _mm_subs_epu8(flat, one);
    1051           0 :     flat = _mm_cmpeq_epi8(flat, zero);
    1052           0 :     flat = _mm_and_si128(flat, mask);
    1053             :   }
    1054             : 
    1055             :   {
    1056           0 :     const __m128i four = _mm_set1_epi16(4);
    1057           0 :     unsigned char *src = s;
    1058             :     {
    1059             :       __m128i workp_a, workp_b, workp_shft;
    1060           0 :       p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
    1061           0 :       p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
    1062           0 :       p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
    1063           0 :       p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
    1064           0 :       q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
    1065           0 :       q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
    1066           0 :       q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
    1067           0 :       q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
    1068             : 
    1069           0 :       workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
    1070           0 :       workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
    1071           0 :       workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
    1072           0 :       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
    1073           0 :       _mm_storel_epi64((__m128i *)&flat_op2[0],
    1074             :                        _mm_packus_epi16(workp_shft, workp_shft));
    1075             : 
    1076           0 :       workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
    1077           0 :       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
    1078           0 :       _mm_storel_epi64((__m128i *)&flat_op1[0],
    1079             :                        _mm_packus_epi16(workp_shft, workp_shft));
    1080             : 
    1081           0 :       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
    1082           0 :       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
    1083           0 :       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
    1084           0 :       _mm_storel_epi64((__m128i *)&flat_op0[0],
    1085             :                        _mm_packus_epi16(workp_shft, workp_shft));
    1086             : 
    1087           0 :       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
    1088           0 :       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
    1089           0 :       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
    1090           0 :       _mm_storel_epi64((__m128i *)&flat_oq0[0],
    1091             :                        _mm_packus_epi16(workp_shft, workp_shft));
    1092             : 
    1093           0 :       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
    1094           0 :       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
    1095           0 :       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
    1096           0 :       _mm_storel_epi64((__m128i *)&flat_oq1[0],
    1097             :                        _mm_packus_epi16(workp_shft, workp_shft));
    1098             : 
    1099           0 :       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
    1100           0 :       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
    1101           0 :       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
    1102           0 :       _mm_storel_epi64((__m128i *)&flat_oq2[0],
    1103             :                        _mm_packus_epi16(workp_shft, workp_shft));
    1104             :     }
    1105             :   }
    1106             :   // lp filter
    1107             :   {
    1108           0 :     const __m128i t4 = _mm_set1_epi8(4);
    1109           0 :     const __m128i t3 = _mm_set1_epi8(3);
    1110           0 :     const __m128i t80 = _mm_set1_epi8(0x80);
    1111           0 :     const __m128i t1 = _mm_set1_epi8(0x1);
    1112           0 :     const __m128i ps1 =
    1113           0 :         _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80);
    1114           0 :     const __m128i ps0 =
    1115           0 :         _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), t80);
    1116           0 :     const __m128i qs0 =
    1117           0 :         _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), t80);
    1118           0 :     const __m128i qs1 =
    1119           0 :         _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), t80);
    1120             :     __m128i filt;
    1121             :     __m128i work_a;
    1122             :     __m128i filter1, filter2;
    1123             : 
    1124           0 :     filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    1125           0 :     work_a = _mm_subs_epi8(qs0, ps0);
    1126           0 :     filt = _mm_adds_epi8(filt, work_a);
    1127           0 :     filt = _mm_adds_epi8(filt, work_a);
    1128           0 :     filt = _mm_adds_epi8(filt, work_a);
    1129             :     // (aom_filter + 3 * (qs0 - ps0)) & mask
    1130           0 :     filt = _mm_and_si128(filt, mask);
    1131             : 
    1132           0 :     filter1 = _mm_adds_epi8(filt, t4);
    1133           0 :     filter2 = _mm_adds_epi8(filt, t3);
    1134             : 
    1135             :     // Filter1 >> 3
    1136           0 :     filter1 = _mm_unpacklo_epi8(zero, filter1);
    1137           0 :     filter1 = _mm_srai_epi16(filter1, 11);
    1138           0 :     filter1 = _mm_packs_epi16(filter1, filter1);
    1139             : 
    1140             :     // Filter2 >> 3
    1141           0 :     filter2 = _mm_unpacklo_epi8(zero, filter2);
    1142           0 :     filter2 = _mm_srai_epi16(filter2, 11);
    1143           0 :     filter2 = _mm_packs_epi16(filter2, zero);
    1144             : 
    1145             :     // filt >> 1
    1146           0 :     filt = _mm_adds_epi8(filter1, t1);
    1147           0 :     filt = _mm_unpacklo_epi8(zero, filt);
    1148           0 :     filt = _mm_srai_epi16(filt, 9);
    1149           0 :     filt = _mm_packs_epi16(filt, zero);
    1150             : 
    1151           0 :     filt = _mm_andnot_si128(hev, filt);
    1152             : 
    1153           0 :     work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    1154           0 :     q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
    1155           0 :     work_a = _mm_andnot_si128(flat, work_a);
    1156           0 :     q0 = _mm_and_si128(flat, q0);
    1157           0 :     q0 = _mm_or_si128(work_a, q0);
    1158             : 
    1159           0 :     work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    1160           0 :     q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
    1161           0 :     work_a = _mm_andnot_si128(flat, work_a);
    1162           0 :     q1 = _mm_and_si128(flat, q1);
    1163           0 :     q1 = _mm_or_si128(work_a, q1);
    1164             : 
    1165           0 :     work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
    1166           0 :     q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
    1167           0 :     work_a = _mm_andnot_si128(flat, work_a);
    1168           0 :     q2 = _mm_and_si128(flat, q2);
    1169           0 :     q2 = _mm_or_si128(work_a, q2);
    1170             : 
    1171           0 :     work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    1172           0 :     p0 = _mm_loadl_epi64((__m128i *)flat_op0);
    1173           0 :     work_a = _mm_andnot_si128(flat, work_a);
    1174           0 :     p0 = _mm_and_si128(flat, p0);
    1175           0 :     p0 = _mm_or_si128(work_a, p0);
    1176             : 
    1177           0 :     work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    1178           0 :     p1 = _mm_loadl_epi64((__m128i *)flat_op1);
    1179           0 :     work_a = _mm_andnot_si128(flat, work_a);
    1180           0 :     p1 = _mm_and_si128(flat, p1);
    1181           0 :     p1 = _mm_or_si128(work_a, p1);
    1182             : 
    1183           0 :     work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
    1184           0 :     p2 = _mm_loadl_epi64((__m128i *)flat_op2);
    1185           0 :     work_a = _mm_andnot_si128(flat, work_a);
    1186           0 :     p2 = _mm_and_si128(flat, p2);
    1187           0 :     p2 = _mm_or_si128(work_a, p2);
    1188             : 
    1189           0 :     _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
    1190           0 :     _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
    1191           0 :     _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
    1192             :     _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
    1193           0 :     _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
    1194           0 :     _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
    1195             :   }
    1196           0 : }
    1197             : 
    1198           0 : void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
    1199             :                                     const uint8_t *_limit0,
    1200             :                                     const uint8_t *_thresh0,
    1201             :                                     const uint8_t *_blimit1,
    1202             :                                     const uint8_t *_limit1,
    1203             :                                     const uint8_t *_thresh1) {
    1204             :   DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
    1205             :   DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
    1206             :   DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
    1207             :   DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
    1208             :   DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
    1209             :   DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
    1210           0 :   const __m128i zero = _mm_set1_epi16(0);
    1211           0 :   const __m128i blimit =
    1212           0 :       _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
    1213             :                          _mm_load_si128((const __m128i *)_blimit1));
    1214           0 :   const __m128i limit =
    1215           0 :       _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
    1216             :                          _mm_load_si128((const __m128i *)_limit1));
    1217           0 :   const __m128i thresh =
    1218           0 :       _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
    1219             :                          _mm_load_si128((const __m128i *)_thresh1));
    1220             : 
    1221             :   __m128i mask, hev, flat;
    1222             :   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
    1223             : 
    1224           0 :   p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
    1225           0 :   p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
    1226           0 :   p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
    1227           0 :   p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
    1228           0 :   q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
    1229           0 :   q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
    1230           0 :   q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
    1231           0 :   q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
    1232             :   {
    1233           0 :     const __m128i abs_p1p0 =
    1234           0 :         _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
    1235           0 :     const __m128i abs_q1q0 =
    1236           0 :         _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
    1237           0 :     const __m128i one = _mm_set1_epi8(1);
    1238           0 :     const __m128i fe = _mm_set1_epi8(0xfe);
    1239           0 :     const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    1240           0 :     __m128i abs_p0q0 =
    1241           0 :         _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
    1242           0 :     __m128i abs_p1q1 =
    1243           0 :         _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
    1244             :     __m128i work;
    1245             : 
    1246             :     // filter_mask and hev_mask
    1247           0 :     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    1248           0 :     hev = _mm_subs_epu8(flat, thresh);
    1249           0 :     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
    1250             : 
    1251           0 :     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    1252           0 :     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    1253           0 :     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    1254           0 :     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    1255             :     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    1256           0 :     mask = _mm_max_epu8(flat, mask);
    1257             :     // mask |= (abs(p1 - p0) > limit) * -1;
    1258             :     // mask |= (abs(q1 - q0) > limit) * -1;
    1259           0 :     work = _mm_max_epu8(
    1260             :         _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
    1261             :         _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
    1262           0 :     mask = _mm_max_epu8(work, mask);
    1263           0 :     work = _mm_max_epu8(
    1264             :         _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
    1265             :         _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
    1266           0 :     mask = _mm_max_epu8(work, mask);
    1267           0 :     mask = _mm_subs_epu8(mask, limit);
    1268           0 :     mask = _mm_cmpeq_epi8(mask, zero);
    1269             : 
    1270             :     // flat_mask4
    1271           0 :     work = _mm_max_epu8(
    1272             :         _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
    1273             :         _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
    1274           0 :     flat = _mm_max_epu8(work, flat);
    1275           0 :     work = _mm_max_epu8(
    1276             :         _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
    1277             :         _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
    1278           0 :     flat = _mm_max_epu8(work, flat);
    1279           0 :     flat = _mm_subs_epu8(flat, one);
    1280           0 :     flat = _mm_cmpeq_epi8(flat, zero);
    1281           0 :     flat = _mm_and_si128(flat, mask);
    1282             :   }
    1283             :   {
    1284           0 :     const __m128i four = _mm_set1_epi16(4);
    1285           0 :     unsigned char *src = s;
    1286           0 :     int i = 0;
    1287             : 
    1288             :     do {
    1289             :       __m128i workp_a, workp_b, workp_shft;
    1290           0 :       p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
    1291           0 :       p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
    1292           0 :       p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
    1293           0 :       p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
    1294           0 :       q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
    1295           0 :       q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
    1296           0 :       q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
    1297           0 :       q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
    1298             : 
    1299           0 :       workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
    1300           0 :       workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
    1301           0 :       workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
    1302           0 :       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
    1303           0 :       _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
    1304             :                        _mm_packus_epi16(workp_shft, workp_shft));
    1305             : 
    1306           0 :       workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
    1307           0 :       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
    1308           0 :       _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
    1309             :                        _mm_packus_epi16(workp_shft, workp_shft));
    1310             : 
    1311           0 :       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
    1312           0 :       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
    1313           0 :       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
    1314           0 :       _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
    1315             :                        _mm_packus_epi16(workp_shft, workp_shft));
    1316             : 
    1317           0 :       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
    1318           0 :       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
    1319           0 :       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
    1320           0 :       _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
    1321             :                        _mm_packus_epi16(workp_shft, workp_shft));
    1322             : 
    1323           0 :       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
    1324           0 :       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
    1325           0 :       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
    1326           0 :       _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
    1327             :                        _mm_packus_epi16(workp_shft, workp_shft));
    1328             : 
    1329           0 :       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
    1330           0 :       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
    1331           0 :       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
    1332           0 :       _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
    1333             :                        _mm_packus_epi16(workp_shft, workp_shft));
    1334             : 
    1335           0 :       src += 8;
    1336           0 :     } while (++i < 2);
    1337             :   }
    1338             :   // lp filter
    1339             :   {
    1340           0 :     const __m128i t4 = _mm_set1_epi8(4);
    1341           0 :     const __m128i t3 = _mm_set1_epi8(3);
    1342           0 :     const __m128i t80 = _mm_set1_epi8(0x80);
    1343           0 :     const __m128i te0 = _mm_set1_epi8(0xe0);
    1344           0 :     const __m128i t1f = _mm_set1_epi8(0x1f);
    1345           0 :     const __m128i t1 = _mm_set1_epi8(0x1);
    1346           0 :     const __m128i t7f = _mm_set1_epi8(0x7f);
    1347             : 
    1348           0 :     const __m128i ps1 =
    1349           0 :         _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
    1350           0 :     const __m128i ps0 =
    1351           0 :         _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
    1352           0 :     const __m128i qs0 =
    1353           0 :         _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
    1354           0 :     const __m128i qs1 =
    1355           0 :         _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
    1356             :     __m128i filt;
    1357             :     __m128i work_a;
    1358             :     __m128i filter1, filter2;
    1359             : 
    1360           0 :     filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    1361           0 :     work_a = _mm_subs_epi8(qs0, ps0);
    1362           0 :     filt = _mm_adds_epi8(filt, work_a);
    1363           0 :     filt = _mm_adds_epi8(filt, work_a);
    1364           0 :     filt = _mm_adds_epi8(filt, work_a);
    1365             :     // (aom_filter + 3 * (qs0 - ps0)) & mask
    1366           0 :     filt = _mm_and_si128(filt, mask);
    1367             : 
    1368           0 :     filter1 = _mm_adds_epi8(filt, t4);
    1369           0 :     filter2 = _mm_adds_epi8(filt, t3);
    1370             : 
    1371             :     // Filter1 >> 3
    1372           0 :     work_a = _mm_cmpgt_epi8(zero, filter1);
    1373           0 :     filter1 = _mm_srli_epi16(filter1, 3);
    1374           0 :     work_a = _mm_and_si128(work_a, te0);
    1375           0 :     filter1 = _mm_and_si128(filter1, t1f);
    1376           0 :     filter1 = _mm_or_si128(filter1, work_a);
    1377             : 
    1378             :     // Filter2 >> 3
    1379           0 :     work_a = _mm_cmpgt_epi8(zero, filter2);
    1380           0 :     filter2 = _mm_srli_epi16(filter2, 3);
    1381           0 :     work_a = _mm_and_si128(work_a, te0);
    1382           0 :     filter2 = _mm_and_si128(filter2, t1f);
    1383           0 :     filter2 = _mm_or_si128(filter2, work_a);
    1384             : 
    1385             :     // filt >> 1
    1386           0 :     filt = _mm_adds_epi8(filter1, t1);
    1387           0 :     work_a = _mm_cmpgt_epi8(zero, filt);
    1388           0 :     filt = _mm_srli_epi16(filt, 1);
    1389           0 :     work_a = _mm_and_si128(work_a, t80);
    1390           0 :     filt = _mm_and_si128(filt, t7f);
    1391           0 :     filt = _mm_or_si128(filt, work_a);
    1392             : 
    1393           0 :     filt = _mm_andnot_si128(hev, filt);
    1394             : 
    1395           0 :     work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    1396           0 :     q0 = _mm_load_si128((__m128i *)flat_oq0);
    1397           0 :     work_a = _mm_andnot_si128(flat, work_a);
    1398           0 :     q0 = _mm_and_si128(flat, q0);
    1399           0 :     q0 = _mm_or_si128(work_a, q0);
    1400             : 
    1401           0 :     work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    1402           0 :     q1 = _mm_load_si128((__m128i *)flat_oq1);
    1403           0 :     work_a = _mm_andnot_si128(flat, work_a);
    1404           0 :     q1 = _mm_and_si128(flat, q1);
    1405           0 :     q1 = _mm_or_si128(work_a, q1);
    1406             : 
    1407           0 :     work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
    1408           0 :     q2 = _mm_load_si128((__m128i *)flat_oq2);
    1409           0 :     work_a = _mm_andnot_si128(flat, work_a);
    1410           0 :     q2 = _mm_and_si128(flat, q2);
    1411           0 :     q2 = _mm_or_si128(work_a, q2);
    1412             : 
    1413           0 :     work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    1414           0 :     p0 = _mm_load_si128((__m128i *)flat_op0);
    1415           0 :     work_a = _mm_andnot_si128(flat, work_a);
    1416           0 :     p0 = _mm_and_si128(flat, p0);
    1417           0 :     p0 = _mm_or_si128(work_a, p0);
    1418             : 
    1419           0 :     work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    1420           0 :     p1 = _mm_load_si128((__m128i *)flat_op1);
    1421           0 :     work_a = _mm_andnot_si128(flat, work_a);
    1422           0 :     p1 = _mm_and_si128(flat, p1);
    1423           0 :     p1 = _mm_or_si128(work_a, p1);
    1424             : 
    1425           0 :     work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
    1426           0 :     p2 = _mm_load_si128((__m128i *)flat_op2);
    1427           0 :     work_a = _mm_andnot_si128(flat, work_a);
    1428           0 :     p2 = _mm_and_si128(flat, p2);
    1429           0 :     p2 = _mm_or_si128(work_a, p2);
    1430             : 
    1431           0 :     _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
    1432           0 :     _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
    1433           0 :     _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
    1434             :     _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
    1435           0 :     _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
    1436           0 :     _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
    1437             :   }
    1438           0 : }
    1439             : 
// Applies the 4-tap loop filter across one horizontal edge for two adjacent
// 8-pixel-wide blocks simultaneously (16 pixels per SIMD op).  Each
// threshold vector carries block 0's value in its low 8 lanes and block 1's
// in the high 8 lanes, so the two blocks can use independent parameters.
//   s: pointer to the first row below the edge (q0); p: row stride in bytes.
void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
                                    const unsigned char *_blimit0,
                                    const unsigned char *_limit0,
                                    const unsigned char *_thresh0,
                                    const unsigned char *_blimit1,
                                    const unsigned char *_limit1,
                                    const unsigned char *_thresh1) {
  // Splice the per-block thresholds into single 16-lane vectors
  // (low 64 bits = block 0, high 64 bits = block 1).
  const __m128i blimit =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
                         _mm_load_si128((const __m128i *)_blimit1));
  const __m128i limit =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
                         _mm_load_si128((const __m128i *)_limit1));
  const __m128i thresh =
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
                         _mm_load_si128((const __m128i *)_thresh1));
  const __m128i zero = _mm_set1_epi16(0);
#if !CONFIG_PARALLEL_DEBLOCKING
  __m128i p3, p2, q2, q3;
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  __m128i p1, p0, q0, q1;
  __m128i mask, hev, flat;
  // Load the rows on either side of the edge.  The parallel-deblocking
  // configuration only looks at two pixels on each side; the legacy path
  // also folds p3/p2/q2/q3 into the filter mask.
#if !CONFIG_PARALLEL_DEBLOCKING
  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
#if !CONFIG_PARALLEL_DEBLOCKING
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  // filter_mask and hev_mask
  {
    // Byte-wise |a - b| via saturating subtract in both directions.
    const __m128i abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 =
        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);  // all-ones
    __m128i abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
#if !CONFIG_PARALLEL_DEBLOCKING
    __m128i work;
#endif  // !CONFIG_PARALLEL_DEBLOCKING
    // hev = (max(|p1-p0|, |q1-q0|) > thresh) ? 0xff : 0 per lane.
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    // mask = (|p0-q0|*2 + |p1-q1|/2 > blimit).  The & 0xfe before the
    // 16-bit shift keeps bytes from bleeding into their neighbors.
    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
#if !CONFIG_PARALLEL_DEBLOCKING
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
#endif  // !CONFIG_PARALLEL_DEBLOCKING
    // Final mask: 0xff in lanes where every tracked difference <= limit.
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // filter4
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    // XOR with 0x80 converts unsigned pixels to signed range for the
    // saturating signed arithmetic below.
    const __m128i ps1 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
    const __m128i ps0 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
    const __m128i qs0 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
    const __m128i qs1 =
        _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;

    // filt = clamp(ps1 - qs1) & hev, then add 3 * (qs0 - ps0).
    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    // (aom_filter + 3 * (qs0 - ps0)) & mask
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    // Filter1 >> 3 — emulate a signed byte shift: do an unsigned 16-bit
    // shift, mask to 5 bits, then OR the sign-extension bits (0xe0) back
    // into lanes that were negative.
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);

    // Filter2 >> 3 — same signed-shift emulation.
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);

    // filt >> 1 — rounded half of (filter1 + 1), again with signed-shift
    // emulation; only applied to non-hev lanes.
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);

    filt = _mm_andnot_si128(hev, filt);

    // Apply the adjustments and convert back to unsigned pixel range.
    q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
    q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
    p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);

    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
  }
}
    1583             : 
// Transposes two vertically stacked 8x8 byte tiles into one 8x16 block:
// rows 0-7 are read from in0 (stride in_p) and rows 8-15 from in1, and the
// result is written as 8 rows of 16 bytes at out (stride out_p).  Output
// row j holds column j of the in0 tile in its low 8 bytes and column j of
// the in1 tile in its high 8 bytes.  The trailing "// N" comments record
// the intended instruction-scheduling order (loads hoisted between unpacks).
static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
                                 int in_p, unsigned char *out, int out_p) {
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  __m128i x8, x9, x10, x11, x12, x13, x14, x15;

  // 2-way interleave w/hoisting of unpacks
  x0 = _mm_loadl_epi64((__m128i *)in0);           // 1
  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));  // 3
  x0 = _mm_unpacklo_epi8(x0, x1);                 // 1

  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));  // 5
  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p));  // 7
  x1 = _mm_unpacklo_epi8(x2, x3);                     // 2

  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p));  // 9
  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p));  // 11
  x2 = _mm_unpacklo_epi8(x4, x5);                     // 3

  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p));  // 13
  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p));  // 15
  x3 = _mm_unpacklo_epi8(x6, x7);                     // 4
  x4 = _mm_unpacklo_epi16(x0, x1);                    // 9

  x8 = _mm_loadl_epi64((__m128i *)in1);           // 2
  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));  // 4
  x8 = _mm_unpacklo_epi8(x8, x9);                 // 5
  x5 = _mm_unpacklo_epi16(x2, x3);                // 10

  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));  // 6
  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p));  // 8
  x9 = _mm_unpacklo_epi8(x10, x11);                    // 6

  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p));  // 10
  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p));  // 12
  x10 = _mm_unpacklo_epi8(x12, x13);                   // 7
  x12 = _mm_unpacklo_epi16(x8, x9);                    // 11

  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p));  // 14
  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p));  // 16
  x11 = _mm_unpacklo_epi8(x14, x15);                   // 8
  x13 = _mm_unpacklo_epi16(x10, x11);                  // 12

  x6 = _mm_unpacklo_epi32(x4, x5);     // 13
  x7 = _mm_unpackhi_epi32(x4, x5);     // 14
  x14 = _mm_unpacklo_epi32(x12, x13);  // 15
  x15 = _mm_unpackhi_epi32(x12, x13);  // 16

  // Store first 4-line result
  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));

  // Second pass over the high halves of the 16-bit interleaves yields
  // output columns 4-7.
  x4 = _mm_unpackhi_epi16(x0, x1);
  x5 = _mm_unpackhi_epi16(x2, x3);
  x12 = _mm_unpackhi_epi16(x8, x9);
  x13 = _mm_unpackhi_epi16(x10, x11);

  x6 = _mm_unpacklo_epi32(x4, x5);
  x7 = _mm_unpackhi_epi32(x4, x5);
  x14 = _mm_unpacklo_epi32(x12, x13);
  x15 = _mm_unpackhi_epi32(x12, x13);

  // Store second 4-line result
  _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
  _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
  _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
}
    1653             : 
#if CONFIG_PARALLEL_DEBLOCKING
// Assembly-style shorthand for the SSE2 intrinsics used below.
#define movq(p) _mm_loadl_epi64((const __m128i *)(p))
#define punpcklbw(r0, r1) _mm_unpacklo_epi8(r0, r1)
#define punpcklwd(r0, r1) _mm_unpacklo_epi16(r0, r1)
#define punpckhwd(r0, r1) _mm_unpackhi_epi16(r0, r1)
#define movd(p, r) *((uint32_t *)(p)) = _mm_cvtsi128_si32(r)
#define pshufd(r, imm) _mm_shuffle_epi32(r, imm)
// Shuffle immediate that rotates the four dwords right by one position,
// bringing the next dword into lane 0 for the following movd.
enum { ROTATE_DWORD_RIGHT = 0x39 };
// Transposes a 4-row x 16-column block of bytes at pSrc into a
// 16-row x 4-column block at pDst.  Each loop iteration consumes 8 source
// columns (8 bytes per row) and emits 8 destination rows of 4 bytes.
static INLINE void transpose16x4(uint8_t *pDst, const ptrdiff_t dstStride,
                                 const uint8_t *pSrc,
                                 const ptrdiff_t srcStride) {
  for (uint32_t idx = 0; idx < 2; idx += 1) {
    __m128i r0, r1, r2, r3;
    // load data
    r0 = movq(pSrc);
    r1 = movq(pSrc + srcStride);
    r2 = movq(pSrc + srcStride * 2);
    r3 = movq(pSrc + srcStride * 3);
    // transpose
    r0 = punpcklbw(r0, r1);
    r2 = punpcklbw(r2, r3);
    r1 = punpckhwd(r0, r2);
    r0 = punpcklwd(r0, r2);
    // store data: each dword of r0/r1 is one 4-byte destination row;
    // rotate it into lane 0 before each movd.
    movd(pDst, r0);
    r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
    movd(pDst + dstStride, r0);
    r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
    movd(pDst + dstStride * 2, r0);
    r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
    movd(pDst + dstStride * 3, r0);
    movd(pDst + dstStride * 4, r1);
    r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
    movd(pDst + dstStride * 5, r1);
    r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
    movd(pDst + dstStride * 6, r1);
    r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
    movd(pDst + dstStride * 7, r1);
    // advance the pointers
    pDst += dstStride * 8;
    pSrc += 8;
  }
}

#endif  // CONFIG_PARALLEL_DEBLOCKING
// Transposes num_8x8_to_transpose independent 8x8 byte tiles.  Tile i is
// read from src[i] (row stride in_p) and written transposed to dst[i]
// (row stride out_p).  The inline comments track byte positions as
// "row-digit column-digit" (e.g. 01 = row 0, column 1).
static INLINE void transpose(unsigned char *src[], int in_p,
                             unsigned char *dst[], int out_p,
                             int num_8x8_to_transpose) {
  int idx8x8 = 0;
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  do {
    unsigned char *in = src[idx8x8];
    unsigned char *out = dst[idx8x8];

    x0 =
        _mm_loadl_epi64((__m128i *)(in + 0 * in_p));  // 00 01 02 03 04 05 06 07
    x1 =
        _mm_loadl_epi64((__m128i *)(in + 1 * in_p));  // 10 11 12 13 14 15 16 17
    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    x0 = _mm_unpacklo_epi8(x0, x1);

    x2 =
        _mm_loadl_epi64((__m128i *)(in + 2 * in_p));  // 20 21 22 23 24 25 26 27
    x3 =
        _mm_loadl_epi64((__m128i *)(in + 3 * in_p));  // 30 31 32 33 34 35 36 37
    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    x1 = _mm_unpacklo_epi8(x2, x3);

    x4 =
        _mm_loadl_epi64((__m128i *)(in + 4 * in_p));  // 40 41 42 43 44 45 46 47
    x5 =
        _mm_loadl_epi64((__m128i *)(in + 5 * in_p));  // 50 51 52 53 54 55 56 57
    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
    x2 = _mm_unpacklo_epi8(x4, x5);

    x6 =
        _mm_loadl_epi64((__m128i *)(in + 6 * in_p));  // 60 61 62 63 64 65 66 67
    x7 =
        _mm_loadl_epi64((__m128i *)(in + 7 * in_p));  // 70 71 72 73 74 75 76 77
    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
    x3 = _mm_unpacklo_epi8(x6, x7);

    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    x4 = _mm_unpacklo_epi16(x0, x1);
    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
    x5 = _mm_unpacklo_epi16(x2, x3);
    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
    x6 = _mm_unpacklo_epi32(x4, x5);
    // storel/storeh write the low/high 8 bytes of the register as one
    // transposed output row each.
    _mm_storel_pd((double *)(out + 0 * out_p),
                  _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
    _mm_storeh_pd((double *)(out + 1 * out_p),
                  _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
    x7 = _mm_unpackhi_epi32(x4, x5);
    _mm_storel_pd((double *)(out + 2 * out_p),
                  _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
    _mm_storeh_pd((double *)(out + 3 * out_p),
                  _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73

    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
    x4 = _mm_unpackhi_epi16(x0, x1);
    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
    x5 = _mm_unpackhi_epi16(x2, x3);
    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
    x6 = _mm_unpacklo_epi32(x4, x5);
    _mm_storel_pd((double *)(out + 4 * out_p),
                  _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
    _mm_storeh_pd((double *)(out + 5 * out_p),
                  _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
    x7 = _mm_unpackhi_epi32(x4, x5);

    _mm_storel_pd((double *)(out + 6 * out_p),
                  _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
    _mm_storeh_pd((double *)(out + 7 * out_p),
                  _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
  } while (++idx8x8 < num_8x8_to_transpose);
}
    1772             : 
// Vertical-edge counterpart of aom_lpf_horizontal_4_dual_sse2: filters the
// vertical edge at column s for two vertically adjacent 8-row blocks by
// transposing the neighborhood into a scratch buffer, running the dual
// horizontal filter, and transposing the modified pixels back.
void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *limit0, const uint8_t *thresh0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
#if !CONFIG_PARALLEL_DEBLOCKING
  unsigned char *src[2];
  unsigned char *dst[2];
#endif  // !CONFIG_PARALLEL_DEBLOCKING
  // Transpose 8x16
  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

  // Loop filtering
  aom_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
                                 blimit1, limit1, thresh1);
#if !CONFIG_PARALLEL_DEBLOCKING
  src[0] = t_dst;
  src[1] = t_dst + 8;
  dst[0] = s - 4;
  dst[1] = s - 4 + p * 8;

  // Transpose back
  transpose(src, 16, dst, p, 2);
#else  // CONFIG_PARALLEL_DEBLOCKING
  // The parallel-deblocking 4-tap filter only modifies p1..q1 (4 pixels),
  // so only a 16x4 strip needs to be written back.
  transpose16x4(s - 2, p, t_dst + 16 * 2, 16);
#endif  // !CONFIG_PARALLEL_DEBLOCKING
}
    1800             : 
    1801           0 : void aom_lpf_vertical_8_sse2(unsigned char *s, int p,
    1802             :                              const unsigned char *blimit,
    1803             :                              const unsigned char *limit,
    1804             :                              const unsigned char *thresh) {
    1805             :   DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]);
    1806             :   unsigned char *src[1];
    1807             :   unsigned char *dst[1];
    1808             : 
    1809             :   // Transpose 8x8
    1810           0 :   src[0] = s - 4;
    1811           0 :   dst[0] = t_dst;
    1812             : 
    1813           0 :   transpose(src, p, dst, 8, 1);
    1814             : 
    1815             :   // Loop filtering
    1816           0 :   aom_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh);
    1817             : 
    1818           0 :   src[0] = t_dst;
    1819           0 :   dst[0] = s - 4;
    1820             : 
    1821             :   // Transpose back
    1822           0 :   transpose(src, 8, dst, p, 1);
    1823           0 : }
    1824             : 
    1825           0 : void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
    1826             :                                   const uint8_t *limit0, const uint8_t *thresh0,
    1827             :                                   const uint8_t *blimit1, const uint8_t *limit1,
    1828             :                                   const uint8_t *thresh1) {
    1829             :   DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
    1830             :   unsigned char *src[2];
    1831             :   unsigned char *dst[2];
    1832             : 
    1833             :   // Transpose 8x16
    1834           0 :   transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
    1835             : 
    1836             :   // Loop filtering
    1837           0 :   aom_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
    1838             :                                  blimit1, limit1, thresh1);
    1839           0 :   src[0] = t_dst;
    1840           0 :   src[1] = t_dst + 8;
    1841             : 
    1842           0 :   dst[0] = s - 4;
    1843           0 :   dst[1] = s - 4 + p * 8;
    1844             : 
    1845             :   // Transpose back
    1846           0 :   transpose(src, 16, dst, p, 2);
    1847           0 : }
    1848             : 
    1849           0 : void aom_lpf_vertical_16_sse2(unsigned char *s, int p,
    1850             :                               const unsigned char *blimit,
    1851             :                               const unsigned char *limit,
    1852             :                               const unsigned char *thresh) {
    1853             :   DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]);
    1854             :   unsigned char *src[2];
    1855             :   unsigned char *dst[2];
    1856             : 
    1857           0 :   src[0] = s - 8;
    1858           0 :   src[1] = s;
    1859           0 :   dst[0] = t_dst;
    1860           0 :   dst[1] = t_dst + 8 * 8;
    1861             : 
    1862             :   // Transpose 16x8
    1863           0 :   transpose(src, p, dst, 8, 2);
    1864             : 
    1865             :   // Loop filtering
    1866           0 :   aom_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh);
    1867             : 
    1868           0 :   src[0] = t_dst;
    1869           0 :   src[1] = t_dst + 8 * 8;
    1870           0 :   dst[0] = s - 8;
    1871           0 :   dst[1] = s;
    1872             : 
    1873             :   // Transpose back
    1874           0 :   transpose(src, 8, dst, p, 2);
    1875           0 : }
    1876             : 
    1877           0 : void aom_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
    1878             :                                    const uint8_t *blimit, const uint8_t *limit,
    1879             :                                    const uint8_t *thresh) {
    1880             :   DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
    1881             : 
    1882             :   // Transpose 16x16
    1883           0 :   transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
    1884           0 :   transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
    1885             : 
    1886             :   // Loop filtering
    1887           0 :   aom_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh);
    1888             : 
    1889             :   // Transpose back
    1890           0 :   transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
    1891           0 :   transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
    1892           0 : }

Generated by: LCOV version 1.13