LCOV - code coverage report
Current view: top level - third_party/aom/aom_dsp/x86 - highbd_loopfilter_sse2.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 635 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 15 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
       3             :  *
       4             :  * This source code is subject to the terms of the BSD 2 Clause License and
       5             :  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
       6             :  * was not distributed with this source code in the LICENSE file, you can
       7             :  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
       8             :  * Media Patent License 1.0 was not distributed with this source code in the
       9             :  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
      10             :  */
      11             : 
      12             : #include <emmintrin.h>  // SSE2
      13             : 
      14             : #include "./aom_dsp_rtcd.h"
      15             : #include "aom_ports/mem.h"
      16             : #include "aom_ports/emmintrin_compat.h"
      17             : 
      18           0 : static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
      19             :   __m128i ubounded;
      20             :   __m128i lbounded;
      21             :   __m128i retval;
      22             : 
      23           0 :   const __m128i zero = _mm_set1_epi16(0);
      24           0 :   const __m128i one = _mm_set1_epi16(1);
      25             :   __m128i t80, max, min;
      26             : 
      27           0 :   if (bd == 8) {
      28           0 :     t80 = _mm_set1_epi16(0x80);
      29           0 :     max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 8), one), t80);
      30           0 :   } else if (bd == 10) {
      31           0 :     t80 = _mm_set1_epi16(0x200);
      32           0 :     max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 10), one), t80);
      33             :   } else {  // bd == 12
      34           0 :     t80 = _mm_set1_epi16(0x800);
      35           0 :     max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 12), one), t80);
      36             :   }
      37             : 
      38           0 :   min = _mm_subs_epi16(zero, t80);
      39             : 
      40           0 :   ubounded = _mm_cmpgt_epi16(value, max);
      41           0 :   lbounded = _mm_cmplt_epi16(value, min);
      42           0 :   retval = _mm_andnot_si128(_mm_or_si128(ubounded, lbounded), value);
      43           0 :   ubounded = _mm_and_si128(ubounded, max);
      44           0 :   lbounded = _mm_and_si128(lbounded, min);
      45           0 :   retval = _mm_or_si128(retval, ubounded);
      46           0 :   retval = _mm_or_si128(retval, lbounded);
      47           0 :   return retval;
      48             : }
      49             : 
      50             : // TODO(debargha, peter): Break up large functions into smaller ones
      51             : // in this file.
// Apply the widest high bit-depth loop filter across a horizontal edge,
// 8 pixels wide (one __m128i of uint16_t per row).
//
//   s        points at the first row below the edge (q0); rows above are
//            p0..p7 at s - 1*p .. s - 8*p, rows below are q0..q7.
//   p        row stride in uint16_t units.
//   _blimit, _limit, _thresh
//            8-bit threshold vectors; scaled up to the working bit depth.
//   bd       bit depth: 8, 10 or 12.
//
// Reads rows p7..q7 and writes back rows p6..q6.  Three nested decisions
// select, per pixel column, between the 4-tap output (filter4), the 7-tap
// flat output (filter8) and the 15-tap wide-flat output (filter16).
void aom_highbd_lpf_horizontal_edge_8_sse2(uint16_t *s, int p,
                                           const uint8_t *_blimit,
                                           const uint8_t *_limit,
                                           const uint8_t *_thresh, int bd) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi16(1);
  __m128i blimit, limit, thresh;
  __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
  __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0;
  __m128i ps1, qs1, ps0, qs0;
  __m128i abs_p0q0, abs_p1q1, ffff, work;
  __m128i filt, work_a, filter1, filter2;
  __m128i flat2_q6, flat2_p6, flat2_q5, flat2_p5, flat2_q4, flat2_p4;
  __m128i flat2_q3, flat2_p3, flat2_q2, flat2_p2, flat2_q1, flat2_p1;
  __m128i flat2_q0, flat2_p0;
  __m128i flat_q2, flat_p2, flat_q1, flat_p1, flat_q0, flat_p0;
  __m128i pixelFilter_p, pixelFilter_q;
  __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;  // (sic) upstream spelling
  __m128i sum_p7, sum_q7, sum_p3, sum_q3;
  __m128i t4, t3, t80, t1;
  __m128i eight, four;

  // Widen the 8-bit thresholds to 16 bits and scale them by the extra bit
  // depth: << 2 for 10-bit, << 4 for 12-bit.
  if (bd == 8) {
    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
  } else if (bd == 10) {
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
  } else {  // bd == 12
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
  }

  // Load the inner taps p4..q4 needed by the filter/flat masks.
  q4 = _mm_load_si128((__m128i *)(s + 4 * p));
  p4 = _mm_load_si128((__m128i *)(s - 5 * p));
  q3 = _mm_load_si128((__m128i *)(s + 3 * p));
  p3 = _mm_load_si128((__m128i *)(s - 4 * p));
  q2 = _mm_load_si128((__m128i *)(s + 2 * p));
  p2 = _mm_load_si128((__m128i *)(s - 3 * p));
  q1 = _mm_load_si128((__m128i *)(s + 1 * p));
  p1 = _mm_load_si128((__m128i *)(s - 2 * p));
  q0 = _mm_load_si128((__m128i *)(s + 0 * p));
  p0 = _mm_load_si128((__m128i *)(s - 1 * p));

  //  highbd_filter_mask
  // |a - b| computed as subs_epu16(a,b) | subs_epu16(b,a) (one side is 0).
  abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
  abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));

  ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);  // all-ones constant

  abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
  abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));

  //  highbd_hev_mask (in C code this is actually called from highbd_filter4)
  // hev lane is all-ones where max(|p1-p0|, |q1-q0|) > thresh.
  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
  hev = _mm_subs_epu16(flat, thresh);
  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);

  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);  // abs(p0 - q0) * 2
  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);         // abs(p1 - q1) / 2
  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
  // Where the blimit test failed, force mask past limit so the final
  // comparison below yields 0 (do not filter).
  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)),
      _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)));
  mask = _mm_max_epi16(work, mask);
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
      _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)));
  mask = _mm_max_epi16(work, mask);
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
      _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
  mask = _mm_max_epi16(work, mask);

  mask = _mm_subs_epu16(mask, limit);
  mask = _mm_cmpeq_epi16(mask, zero);  // return ~mask

  // lp filter
  // highbd_filter4
  t4 = _mm_set1_epi16(4);
  t3 = _mm_set1_epi16(3);
  // t80 = 2^(bd-1): converts unbiased pixels to "signed char"-style values.
  if (bd == 8)
    t80 = _mm_set1_epi16(0x80);
  else if (bd == 10)
    t80 = _mm_set1_epi16(0x200);
  else  // bd == 12
    t80 = _mm_set1_epi16(0x800);

  t1 = _mm_set1_epi16(0x1);

  ps1 = _mm_subs_epi16(p1, t80);
  qs1 = _mm_subs_epi16(q1, t80);
  ps0 = _mm_subs_epi16(p0, t80);
  qs0 = _mm_subs_epi16(q0, t80);

  // filt = clamp(ps1 - qs1) & hev, then filt = clamp(filt + 3*(qs0 - ps0)).
  filt = _mm_and_si128(signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd),
                       hev);
  work_a = _mm_subs_epi16(qs0, ps0);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
  filt = _mm_and_si128(filt, mask);
  filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
  filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);

  // Filter1 >> 3
  filter1 = _mm_srai_epi16(filter1, 0x3);
  filter2 = _mm_srai_epi16(filter2, 0x3);

  // Re-bias back to pixel domain after adjusting p0/q0.
  qs0 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
  ps0 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
  // p1/q1 only move where hev is off (edge is not "high variance").
  filt = _mm_adds_epi16(filter1, t1);
  filt = _mm_srai_epi16(filt, 1);
  filt = _mm_andnot_si128(hev, filt);
  qs1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
                       t80);
  ps1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
                       t80);

  // end highbd_filter4
  // loopfilter done

  // highbd_flat_mask4
  flat = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)),
      _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)));
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)),
      _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3)));
  flat = _mm_max_epi16(work, flat);
  work = _mm_max_epi16(abs_p1p0, abs_q1q0);
  flat = _mm_max_epi16(work, flat);

  // Flatness threshold is 1, scaled to the bit depth like the others.
  if (bd == 8)
    flat = _mm_subs_epu16(flat, one);
  else if (bd == 10)
    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
  else  // bd == 12
    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));

  flat = _mm_cmpeq_epi16(flat, zero);
  // end flat_mask4

  // flat & mask = flat && mask (as used in filter8)
  // (because, in both vars, each block of 16 either all 1s or all 0s)
  flat = _mm_and_si128(flat, mask);

  // Load the outer taps needed only for the wide (flat2) path.
  p5 = _mm_load_si128((__m128i *)(s - 6 * p));
  q5 = _mm_load_si128((__m128i *)(s + 5 * p));
  p6 = _mm_load_si128((__m128i *)(s - 7 * p));
  q6 = _mm_load_si128((__m128i *)(s + 6 * p));
  p7 = _mm_load_si128((__m128i *)(s - 8 * p));
  q7 = _mm_load_si128((__m128i *)(s + 7 * p));

  // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7
  // but referred to as p0-p4 & q0-q4 in fn)
  flat2 = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p4, p0), _mm_subs_epu16(p0, p4)),
      _mm_or_si128(_mm_subs_epu16(q4, q0), _mm_subs_epu16(q0, q4)));

  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p5, p0), _mm_subs_epu16(p0, p5)),
      _mm_or_si128(_mm_subs_epu16(q5, q0), _mm_subs_epu16(q0, q5)));
  flat2 = _mm_max_epi16(work, flat2);

  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p6, p0), _mm_subs_epu16(p0, p6)),
      _mm_or_si128(_mm_subs_epu16(q6, q0), _mm_subs_epu16(q0, q6)));
  flat2 = _mm_max_epi16(work, flat2);

  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p7, p0), _mm_subs_epu16(p0, p7)),
      _mm_or_si128(_mm_subs_epu16(q7, q0), _mm_subs_epu16(q0, q7)));
  flat2 = _mm_max_epi16(work, flat2);

  if (bd == 8)
    flat2 = _mm_subs_epu16(flat2, one);
  else if (bd == 10)
    flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 2));
  else  // bd == 12
    flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 4));

  flat2 = _mm_cmpeq_epi16(flat2, zero);
  flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
  // end highbd_flat_mask5

  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // flat and wide flat calculations
  // The wide filter outputs are rolling averages: start from the full tap
  // sum (+8 for rounding, >> 4) and then slide the window one tap per step
  // by subtracting the departing tap and adding the arriving one.  The
  // narrow (flat) outputs do the same over p3..q3 (+4, >> 3).  NOTE: the
  // subtracts below mutate pixelFilter_p/q and pixetFilter_* in place, so
  // statement order is load-bearing.
  eight = _mm_set1_epi16(8);
  four = _mm_set1_epi16(4);

  pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5), _mm_add_epi16(p4, p3));
  pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5), _mm_add_epi16(q4, q3));

  pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1));
  pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

  pixetFilter_q2q1q0 = _mm_add_epi16(q0, _mm_add_epi16(q2, q1));
  pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
  pixelFilter_p =
      _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
  pixetFilter_p2p1p0 = _mm_add_epi16(
      four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
  flat2_p0 =
      _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7, p0)), 4);
  flat2_q0 =
      _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7, q0)), 4);
  flat_p0 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3, p0)), 3);
  flat_q0 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3, q0)), 3);

  // sum_p7/sum_q7 accumulate k copies of the edge tap (p7/q7) as the
  // window slides outward; likewise sum_p3/sum_q3 for the narrow filter.
  sum_p7 = _mm_add_epi16(p7, p7);
  sum_q7 = _mm_add_epi16(q7, q7);
  sum_p3 = _mm_add_epi16(p3, p3);
  sum_q3 = _mm_add_epi16(q3, q3);

  pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6);
  flat2_p1 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1)), 4);
  flat2_q1 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1)), 4);

  pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2);
  pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2);
  flat_p1 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1)), 3);
  flat_q1 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1)), 3);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  sum_p3 = _mm_add_epi16(sum_p3, p3);
  sum_q3 = _mm_add_epi16(sum_q3, q3);

  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5);
  flat2_p2 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2)), 4);
  flat2_q2 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2)), 4);

  pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1);
  pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1);
  flat_p2 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2)), 3);
  flat_q2 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2)), 3);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4);
  flat2_p3 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3)), 4);
  flat2_q3 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3)), 4);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3);
  flat2_p4 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4)), 4);
  flat2_q4 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4)), 4);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2);
  flat2_p5 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5)), 4);
  flat2_q5 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5)), 4);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1);
  flat2_p6 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6)), 4);
  flat2_q6 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6)), 4);

  //  wide flat
  //  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

  //  highbd_filter8
  // Per-lane select: result = flat ? flat_* : original (andnot/and/or).
  p2 = _mm_andnot_si128(flat, p2);
  //  p2 remains unchanged if !(flat && mask)
  flat_p2 = _mm_and_si128(flat, flat_p2);
  //  when (flat && mask)
  p2 = _mm_or_si128(p2, flat_p2);  // full list of p2 values
  q2 = _mm_andnot_si128(flat, q2);
  flat_q2 = _mm_and_si128(flat, flat_q2);
  q2 = _mm_or_si128(q2, flat_q2);  // full list of q2 values

  ps1 = _mm_andnot_si128(flat, ps1);
  //  p1 takes the value assigned to it in filter4 if !(flat && mask)
  flat_p1 = _mm_and_si128(flat, flat_p1);
  //  when (flat && mask)
  p1 = _mm_or_si128(ps1, flat_p1);  // full list of p1 values
  qs1 = _mm_andnot_si128(flat, qs1);
  flat_q1 = _mm_and_si128(flat, flat_q1);
  q1 = _mm_or_si128(qs1, flat_q1);  // full list of q1 values

  ps0 = _mm_andnot_si128(flat, ps0);
  //  p0 takes the value assigned to it in filter4 if !(flat && mask)
  flat_p0 = _mm_and_si128(flat, flat_p0);
  //  when (flat && mask)
  p0 = _mm_or_si128(ps0, flat_p0);  // full list of p0 values
  qs0 = _mm_andnot_si128(flat, qs0);
  flat_q0 = _mm_and_si128(flat, flat_q0);
  q0 = _mm_or_si128(qs0, flat_q0);  // full list of q0 values
  // end highbd_filter8

  // highbd_filter16
  // Same select pattern, now against flat2; results are stored as we go.
  p6 = _mm_andnot_si128(flat2, p6);
  //  p6 remains unchanged if !(flat2 && flat && mask)
  flat2_p6 = _mm_and_si128(flat2, flat2_p6);
  //  get values for when (flat2 && flat && mask)
  p6 = _mm_or_si128(p6, flat2_p6);  // full list of p6 values
  q6 = _mm_andnot_si128(flat2, q6);
  //  q6 remains unchanged if !(flat2 && flat && mask)
  flat2_q6 = _mm_and_si128(flat2, flat2_q6);
  //  get values for when (flat2 && flat && mask)
  q6 = _mm_or_si128(q6, flat2_q6);  // full list of q6 values
  _mm_store_si128((__m128i *)(s - 7 * p), p6);
  _mm_store_si128((__m128i *)(s + 6 * p), q6);

  p5 = _mm_andnot_si128(flat2, p5);
  //  p5 remains unchanged if !(flat2 && flat && mask)
  flat2_p5 = _mm_and_si128(flat2, flat2_p5);
  //  get values for when (flat2 && flat && mask)
  p5 = _mm_or_si128(p5, flat2_p5);
  //  full list of p5 values
  q5 = _mm_andnot_si128(flat2, q5);
  //  q5 remains unchanged if !(flat2 && flat && mask)
  flat2_q5 = _mm_and_si128(flat2, flat2_q5);
  //  get values for when (flat2 && flat && mask)
  q5 = _mm_or_si128(q5, flat2_q5);
  //  full list of q5 values
  _mm_store_si128((__m128i *)(s - 6 * p), p5);
  _mm_store_si128((__m128i *)(s + 5 * p), q5);

  p4 = _mm_andnot_si128(flat2, p4);
  //  p4 remains unchanged if !(flat2 && flat && mask)
  flat2_p4 = _mm_and_si128(flat2, flat2_p4);
  //  get values for when (flat2 && flat && mask)
  p4 = _mm_or_si128(p4, flat2_p4);  // full list of p4 values
  q4 = _mm_andnot_si128(flat2, q4);
  //  q4 remains unchanged if !(flat2 && flat && mask)
  flat2_q4 = _mm_and_si128(flat2, flat2_q4);
  //  get values for when (flat2 && flat && mask)
  q4 = _mm_or_si128(q4, flat2_q4);  // full list of q4 values
  _mm_store_si128((__m128i *)(s - 5 * p), p4);
  _mm_store_si128((__m128i *)(s + 4 * p), q4);

  p3 = _mm_andnot_si128(flat2, p3);
  //  p3 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_p3 = _mm_and_si128(flat2, flat2_p3);
  //  get values for when (flat2 && flat && mask)
  p3 = _mm_or_si128(p3, flat2_p3);  // full list of p3 values
  q3 = _mm_andnot_si128(flat2, q3);
  //  q3 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_q3 = _mm_and_si128(flat2, flat2_q3);
  //  get values for when (flat2 && flat && mask)
  q3 = _mm_or_si128(q3, flat2_q3);  // full list of q3 values
  _mm_store_si128((__m128i *)(s - 4 * p), p3);
  _mm_store_si128((__m128i *)(s + 3 * p), q3);

  p2 = _mm_andnot_si128(flat2, p2);
  //  p2 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_p2 = _mm_and_si128(flat2, flat2_p2);
  //  get values for when (flat2 && flat && mask)
  p2 = _mm_or_si128(p2, flat2_p2);
  //  full list of p2 values
  q2 = _mm_andnot_si128(flat2, q2);
  //  q2 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_q2 = _mm_and_si128(flat2, flat2_q2);
  //  get values for when (flat2 && flat && mask)
  q2 = _mm_or_si128(q2, flat2_q2);  // full list of q2 values
  _mm_store_si128((__m128i *)(s - 3 * p), p2);
  _mm_store_si128((__m128i *)(s + 2 * p), q2);

  p1 = _mm_andnot_si128(flat2, p1);
  //  p1 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_p1 = _mm_and_si128(flat2, flat2_p1);
  //  get values for when (flat2 && flat && mask)
  p1 = _mm_or_si128(p1, flat2_p1);  // full list of p1 values
  q1 = _mm_andnot_si128(flat2, q1);
  //  q1 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_q1 = _mm_and_si128(flat2, flat2_q1);
  //  get values for when (flat2 && flat && mask)
  q1 = _mm_or_si128(q1, flat2_q1);  // full list of q1 values
  _mm_store_si128((__m128i *)(s - 2 * p), p1);
  _mm_store_si128((__m128i *)(s + 1 * p), q1);

  p0 = _mm_andnot_si128(flat2, p0);
  //  p0 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_p0 = _mm_and_si128(flat2, flat2_p0);
  //  get values for when (flat2 && flat && mask)
  p0 = _mm_or_si128(p0, flat2_p0);  // full list of p0 values
  q0 = _mm_andnot_si128(flat2, q0);
  //  q0 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_q0 = _mm_and_si128(flat2, flat2_q0);
  //  get values for when (flat2 && flat && mask)
  q0 = _mm_or_si128(q0, flat2_q0);  // full list of q0 values
  _mm_store_si128((__m128i *)(s - 1 * p), p0);
  _mm_store_si128((__m128i *)(s - 0 * p), q0);
}
     478             : 
     479           0 : void aom_highbd_lpf_horizontal_edge_16_sse2(uint16_t *s, int p,
     480             :                                             const uint8_t *_blimit,
     481             :                                             const uint8_t *_limit,
     482             :                                             const uint8_t *_thresh, int bd) {
     483           0 :   aom_highbd_lpf_horizontal_edge_8_sse2(s, p, _blimit, _limit, _thresh, bd);
     484           0 :   aom_highbd_lpf_horizontal_edge_8_sse2(s + 8, p, _blimit, _limit, _thresh, bd);
     485           0 : }
     486             : 
// High-bitdepth horizontal "filter8" loop filter over 8 consecutive pixels.
// s points at the first row on the q side of the edge; p is the stride in
// uint16_t units.  _blimit/_limit/_thresh hold 8-bit thresholds that are
// scaled up by (bd - 8) bits for 10/12-bit content.
// NOTE(review): rows at s - 4*p .. s + 3*p are first read with aligned
// _mm_load_si128, but s + 2*p and s - 3*p are re-read below with
// _mm_loadu_si128, and the final rows are written with aligned stores --
// this mix assumes the rows are 16-byte aligned; confirm against callers.
void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
                                      const uint8_t *_blimit,
                                      const uint8_t *_limit,
                                      const uint8_t *_thresh, int bd) {
  // Scratch rows for the 7-tap "flat" filter outputs (op2..oq2); only the
  // first 8 lanes of each are used by this 8-wide function.
  DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
  const __m128i zero = _mm_set1_epi16(0);
  __m128i blimit, limit, thresh;
  __m128i mask, hev, flat;
  // Load the four rows on each side of the edge (p3..p0 above, q0..q3 below).
  __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p));
  __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p));
  __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p));
  __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p));
  __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p));
  __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p));
  __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p));
  __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p));
  const __m128i one = _mm_set1_epi16(1);
  // All-ones vector, used to invert compare results.
  const __m128i ffff = _mm_cmpeq_epi16(one, one);
  __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
  // Rounding constant for the flat filter (same value as t4 below; kept as
  // a separate named constant for the ROUND_POWER_OF_TWO add).
  const __m128i four = _mm_set1_epi16(4);
  __m128i workp_a, workp_b, workp_shft;

  const __m128i t4 = _mm_set1_epi16(4);
  const __m128i t3 = _mm_set1_epi16(3);
  __m128i t80;
  const __m128i t1 = _mm_set1_epi16(0x1);
  __m128i ps1, ps0, qs0, qs1;
  __m128i filt;
  __m128i work_a;
  __m128i filter1, filter2;

  // Expand the 8-bit thresholds to 16-bit lanes and shift them left by
  // (bd - 8) so they compare against bd-bit pixels.  t80 is the signed-range
  // bias, 1 << (bd - 1).
  if (bd == 8) {
    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
    t80 = _mm_set1_epi16(0x80);
  } else if (bd == 10) {
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
    t80 = _mm_set1_epi16(0x200);
  } else {  // bd == 12
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
    t80 = _mm_set1_epi16(0x800);
  }

  // Bias the four inner rows into signed range for the 4-tap filter math.
  ps1 = _mm_subs_epi16(p1, t80);
  ps0 = _mm_subs_epi16(p0, t80);
  qs0 = _mm_subs_epi16(q0, t80);
  qs1 = _mm_subs_epi16(q1, t80);

  // filter_mask and hev_mask
  // Absolute differences via saturating-subtract in both directions.
  abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
  abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));

  abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
  abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
  // hev: "high edge variance" -- inner-difference exceeds thresh.
  hev = _mm_subs_epu16(flat, thresh);
  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);

  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
  // So taking maximums continues to work:
  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
  mask = _mm_max_epi16(abs_p1p0, mask);
  // mask |= (abs(p1 - p0) > limit) * -1;
  mask = _mm_max_epi16(abs_q1q0, mask);
  // mask |= (abs(q1 - q0) > limit) * -1;

  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
      _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)));
  mask = _mm_max_epi16(work, mask);
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
      _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
  mask = _mm_max_epi16(work, mask);
  // mask is all-ones where every accumulated difference is <= limit.
  mask = _mm_subs_epu16(mask, limit);
  mask = _mm_cmpeq_epi16(mask, zero);

  // flat_mask4
  // flat becomes all-ones where the neighborhood is smooth enough (all
  // differences from p0/q0 within 1 << (bd - 8)) to use the 7-tap filter.
  flat = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)),
      _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)));
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)),
      _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3)));
  flat = _mm_max_epi16(work, flat);
  flat = _mm_max_epi16(abs_p1p0, flat);
  flat = _mm_max_epi16(abs_q1q0, flat);

  // The flat threshold scales with bit depth: 1 << (bd - 8).
  if (bd == 8)
    flat = _mm_subs_epu16(flat, one);
  else if (bd == 10)
    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
  else  // bd == 12
    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));

  flat = _mm_cmpeq_epi16(flat, zero);
  flat = _mm_and_si128(flat, mask);  // flat & mask

  // 7-tap flat filter: each store below is one output row, computed as a
  // sliding weighted average with ROUND_POWER_OF_TWO(sum, 3).
  // Added before shift for rounding part of ROUND_POWER_OF_TWO

  workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
  workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
  workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_op2[0], workp_shft);

  workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_op1[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_op0[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_oq0[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_oq1[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft);

  // lp filter
  // Standard 4-tap filter on the signed-biased samples.
  filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
  filt = _mm_and_si128(filt, hev);
  work_a = _mm_subs_epi16(qs0, ps0);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  // (aom_filter + 3 * (qs0 - ps0)) & mask
  filt = signed_char_clamp_bd_sse2(filt, bd);
  filt = _mm_and_si128(filt, mask);

  filter1 = _mm_adds_epi16(filt, t4);
  filter2 = _mm_adds_epi16(filt, t3);

  // Filter1 >> 3
  filter1 = signed_char_clamp_bd_sse2(filter1, bd);
  filter1 = _mm_srai_epi16(filter1, 3);

  // Filter2 >> 3
  filter2 = signed_char_clamp_bd_sse2(filter2, bd);
  filter2 = _mm_srai_epi16(filter2, 3);

  // filt >> 1
  filt = _mm_adds_epi16(filter1, t1);
  filt = _mm_srai_epi16(filt, 1);
  // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
  filt = _mm_andnot_si128(hev, filt);

  // Blend: where flat is set take the 7-tap result, otherwise the 4-tap
  // result re-biased back to unsigned range.
  work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd);
  work_a = _mm_adds_epi16(work_a, t80);
  q0 = _mm_load_si128((__m128i *)flat_oq0);
  work_a = _mm_andnot_si128(flat, work_a);
  q0 = _mm_and_si128(flat, q0);
  q0 = _mm_or_si128(work_a, q0);

  work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd);
  work_a = _mm_adds_epi16(work_a, t80);
  q1 = _mm_load_si128((__m128i *)flat_oq1);
  work_a = _mm_andnot_si128(flat, work_a);
  q1 = _mm_and_si128(flat, q1);
  q1 = _mm_or_si128(work_a, q1);

  // q2/p2 are only modified by the flat filter; re-read the originals.
  work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q2 = _mm_load_si128((__m128i *)flat_oq2);
  work_a = _mm_andnot_si128(flat, work_a);
  q2 = _mm_and_si128(flat, q2);
  q2 = _mm_or_si128(work_a, q2);

  work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd);
  work_a = _mm_adds_epi16(work_a, t80);
  p0 = _mm_load_si128((__m128i *)flat_op0);
  work_a = _mm_andnot_si128(flat, work_a);
  p0 = _mm_and_si128(flat, p0);
  p0 = _mm_or_si128(work_a, p0);

  work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd);
  work_a = _mm_adds_epi16(work_a, t80);
  p1 = _mm_load_si128((__m128i *)flat_op1);
  work_a = _mm_andnot_si128(flat, work_a);
  p1 = _mm_and_si128(flat, p1);
  p1 = _mm_or_si128(work_a, p1);

  work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p2 = _mm_load_si128((__m128i *)flat_op2);
  work_a = _mm_andnot_si128(flat, work_a);
  p2 = _mm_and_si128(flat, p2);
  p2 = _mm_or_si128(work_a, p2);

  // Write the six filtered rows back.
  _mm_store_si128((__m128i *)(s - 3 * p), p2);
  _mm_store_si128((__m128i *)(s - 2 * p), p1);
  _mm_store_si128((__m128i *)(s - 1 * p), p0);
  _mm_store_si128((__m128i *)(s + 0 * p), q0);
  _mm_store_si128((__m128i *)(s + 1 * p), q1);
  _mm_store_si128((__m128i *)(s + 2 * p), q2);
}
     712             : 
     713           0 : void aom_highbd_lpf_horizontal_8_dual_sse2(
     714             :     uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
     715             :     const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
     716             :     const uint8_t *_thresh1, int bd) {
     717           0 :   aom_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
     718           0 :   aom_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
     719           0 : }
     720             : 
// High-bitdepth horizontal "filter4" loop filter over 8 consecutive pixels.
// Only p1/p0/q0/q1 are modified; p3/p2/q2/q3 are read for the filter mask.
// All row accesses use unaligned loads/stores, so s needs no alignment.
void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
                                      const uint8_t *_blimit,
                                      const uint8_t *_limit,
                                      const uint8_t *_thresh, int bd) {
  const __m128i zero = _mm_set1_epi16(0);
  __m128i blimit, limit, thresh;
  __m128i mask, hev, flat;
  // Four rows above (p3..p0) and four below (q0..q3) the edge.
  __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
  // Absolute differences via saturating subtraction in both directions.
  const __m128i abs_p1p0 =
      _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
  const __m128i abs_q1q0 =
      _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
  const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
  const __m128i one = _mm_set1_epi16(1);
  __m128i abs_p0q0 =
      _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
  __m128i abs_p1q1 =
      _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
  __m128i work;
  const __m128i t4 = _mm_set1_epi16(4);
  const __m128i t3 = _mm_set1_epi16(3);
  // Bit-depth-dependent constants: t80 is the signed-range bias
  // (1 << (bd - 1)); tff80/tffe0/t1f/t7f implement sign-preserving right
  // shifts on bd-bit values (mask + OR of the propagated sign bits).
  __m128i t80;
  __m128i tff80;
  __m128i tffe0;
  __m128i t1f;
  // equivalent to shifting 0x1f left by bitdepth - 8
  // and setting new bits to 1
  const __m128i t1 = _mm_set1_epi16(0x1);
  __m128i t7f;
  // equivalent to shifting 0x7f left by bitdepth - 8
  // and setting new bits to 1
  __m128i ps1, ps0, qs0, qs1;
  __m128i filt;
  __m128i work_a;
  __m128i filter1, filter2;

  // Expand the 8-bit thresholds to 16 bits and scale by (bd - 8).
  if (bd == 8) {
    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
    t80 = _mm_set1_epi16(0x80);
    tff80 = _mm_set1_epi16(0xff80);
    tffe0 = _mm_set1_epi16(0xffe0);
    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8);
    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8);
  } else if (bd == 10) {
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2);
    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2);
    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2);
    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6);
    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6);
  } else {  // bd == 12
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4);
    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4);
    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4);
    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4);
    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4);
  }

  // Bias the inner rows into signed range (re-loads the same rows as above).
  ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
  ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
  qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
  qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);

  // filter_mask and hev_mask
  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
  // hev: inner-pair difference exceeds thresh ("high edge variance").
  hev = _mm_subs_epu16(flat, thresh);
  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);

  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
  // So taking maximums continues to work:
  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
  mask = _mm_max_epi16(flat, mask);
  // mask |= (abs(p1 - p0) > limit) * -1;
  // mask |= (abs(q1 - q0) > limit) * -1;
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
      _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)));
  mask = _mm_max_epi16(work, mask);
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)),
      _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
  mask = _mm_max_epi16(work, mask);
  // mask is all-ones where every accumulated difference is <= limit.
  mask = _mm_subs_epu16(mask, limit);
  mask = _mm_cmpeq_epi16(mask, zero);

  // filter4
  filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
  filt = _mm_and_si128(filt, hev);
  work_a = _mm_subs_epi16(qs0, ps0);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);

  // (aom_filter + 3 * (qs0 - ps0)) & mask
  filt = _mm_and_si128(filt, mask);

  filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
  filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);

  // Filter1 >> 3
  // Arithmetic shift emulated with a logical shift plus sign restoration,
  // because _mm_srai_epi16 would shift in the 16-bit sign, not the bd-bit one.
  work_a = _mm_cmpgt_epi16(zero, filter1);  // get the values that are <0
  filter1 = _mm_srli_epi16(filter1, 3);
  work_a = _mm_and_si128(work_a, tffe0);    // sign bits for the values < 0
  filter1 = _mm_and_si128(filter1, t1f);    // clamp the range
  filter1 = _mm_or_si128(filter1, work_a);  // reinsert the sign bits

  // Filter2 >> 3
  work_a = _mm_cmpgt_epi16(zero, filter2);
  filter2 = _mm_srli_epi16(filter2, 3);
  work_a = _mm_and_si128(work_a, tffe0);
  filter2 = _mm_and_si128(filter2, t1f);
  filter2 = _mm_or_si128(filter2, work_a);

  // filt >> 1
  // ROUND_POWER_OF_TWO(filter1, 1), same sign-preserving shift trick.
  filt = _mm_adds_epi16(filter1, t1);
  work_a = _mm_cmpgt_epi16(zero, filt);
  filt = _mm_srli_epi16(filt, 1);
  work_a = _mm_and_si128(work_a, tff80);
  filt = _mm_and_si128(filt, t7f);
  filt = _mm_or_si128(filt, work_a);

  filt = _mm_andnot_si128(hev, filt);

  // Apply the adjustments and re-bias back to unsigned range.
  q0 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
  q1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
                      t80);
  p0 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
  p1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
                      t80);

  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
}
     882             : 
     883           0 : void aom_highbd_lpf_horizontal_4_dual_sse2(
     884             :     uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
     885             :     const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
     886             :     const uint8_t *_thresh1, int bd) {
     887           0 :   aom_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
     888           0 :   aom_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
     889           0 : }
     890             : 
// Transpose num_8x8_to_transpose 8x8 tiles of uint16_t pixels.  src[i]/dst[i]
// point at tile i's top-left element; in_p/out_p are strides in uint16_t
// units.  Uses unaligned loads/stores throughout, so no alignment is
// required.  The lane comments use "rc" notation: row r, column c.
static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[],
                                    int out_p, int num_8x8_to_transpose) {
  int idx8x8 = 0;
  __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7;
  do {
    uint16_t *in = src[idx8x8];
    uint16_t *out = dst[idx8x8];

    // Load the eight input rows.
    p0 =
        _mm_loadu_si128((__m128i *)(in + 0 * in_p));  // 00 01 02 03 04 05 06 07
    p1 =
        _mm_loadu_si128((__m128i *)(in + 1 * in_p));  // 10 11 12 13 14 15 16 17
    p2 =
        _mm_loadu_si128((__m128i *)(in + 2 * in_p));  // 20 21 22 23 24 25 26 27
    p3 =
        _mm_loadu_si128((__m128i *)(in + 3 * in_p));  // 30 31 32 33 34 35 36 37
    p4 =
        _mm_loadu_si128((__m128i *)(in + 4 * in_p));  // 40 41 42 43 44 45 46 47
    p5 =
        _mm_loadu_si128((__m128i *)(in + 5 * in_p));  // 50 51 52 53 54 55 56 57
    p6 =
        _mm_loadu_si128((__m128i *)(in + 6 * in_p));  // 60 61 62 63 64 65 66 67
    p7 =
        _mm_loadu_si128((__m128i *)(in + 7 * in_p));  // 70 71 72 73 74 75 76 77
    // Stage 1/2: interleave 16- then 32-bit lanes to gather output columns.
    // 00 10 01 11 02 12 03 13
    x0 = _mm_unpacklo_epi16(p0, p1);
    // 20 30 21 31 22 32 23 33
    x1 = _mm_unpacklo_epi16(p2, p3);
    // 40 50 41 51 42 52 43 53
    x2 = _mm_unpacklo_epi16(p4, p5);
    // 60 70 61 71 62 72 63 73
    x3 = _mm_unpacklo_epi16(p6, p7);
    // 00 10 20 30 01 11 21 31
    x4 = _mm_unpacklo_epi32(x0, x1);
    // 40 50 60 70 41 51 61 71
    x5 = _mm_unpacklo_epi32(x2, x3);
    // 00 10 20 30 40 50 60 70
    x6 = _mm_unpacklo_epi64(x4, x5);
    // 01 11 21 31 41 51 61 71
    x7 = _mm_unpackhi_epi64(x4, x5);

    _mm_storeu_si128((__m128i *)(out + 0 * out_p), x6);
    // 00 10 20 30 40 50 60 70
    _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7);
    // 01 11 21 31 41 51 61 71

    // Output rows 2-3 from the high 32-bit halves of stage 1.
    // 02 12 22 32 03 13 23 33
    x4 = _mm_unpackhi_epi32(x0, x1);
    // 42 52 62 72 43 53 63 73
    x5 = _mm_unpackhi_epi32(x2, x3);
    // 02 12 22 32 42 52 62 72
    x6 = _mm_unpacklo_epi64(x4, x5);
    // 03 13 23 33 43 53 63 73
    x7 = _mm_unpackhi_epi64(x4, x5);

    _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6);
    // 02 12 22 32 42 52 62 72
    _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7);
    // 03 13 23 33 43 53 63 73

    // Repeat for input columns 4-7 using the high 16-bit halves.
    // 04 14 05 15 06 16 07 17
    x0 = _mm_unpackhi_epi16(p0, p1);
    // 24 34 25 35 26 36 27 37
    x1 = _mm_unpackhi_epi16(p2, p3);
    // 44 54 45 55 46 56 47 57
    x2 = _mm_unpackhi_epi16(p4, p5);
    // 64 74 65 75 66 76 67 77
    x3 = _mm_unpackhi_epi16(p6, p7);
    // 04 14 24 34 05 15 25 35
    x4 = _mm_unpacklo_epi32(x0, x1);
    // 44 54 64 74 45 55 65 75
    x5 = _mm_unpacklo_epi32(x2, x3);
    // 04 14 24 34 44 54 64 74
    x6 = _mm_unpacklo_epi64(x4, x5);
    // 05 15 25 35 45 55 65 75
    x7 = _mm_unpackhi_epi64(x4, x5);

    _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6);
    // 04 14 24 34 44 54 64 74
    _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7);
    // 05 15 25 35 45 55 65 75

    // 06 16 26 36 07 17 27 37
    x4 = _mm_unpackhi_epi32(x0, x1);
    // 46 56 66 76 47 57 67 77
    x5 = _mm_unpackhi_epi32(x2, x3);
    // 06 16 26 36 46 56 66 76
    x6 = _mm_unpacklo_epi64(x4, x5);
    // 07 17 27 37 47 57 67 77
    x7 = _mm_unpackhi_epi64(x4, x5);

    _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6);
    // 06 16 26 36 46 56 66 76
    _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7);
    // 07 17 27 37 47 57 67 77
  } while (++idx8x8 < num_8x8_to_transpose);
}
     988             : 
     989           0 : static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p,
     990             :                                         uint16_t *out, int out_p) {
     991             :   uint16_t *src0[1];
     992             :   uint16_t *src1[1];
     993             :   uint16_t *dest0[1];
     994             :   uint16_t *dest1[1];
     995           0 :   src0[0] = in0;
     996           0 :   src1[0] = in1;
     997           0 :   dest0[0] = out;
     998           0 :   dest1[0] = out + 8;
     999           0 :   highbd_transpose(src0, in_p, dest0, out_p, 1);
    1000           0 :   highbd_transpose(src1, in_p, dest1, out_p, 1);
    1001           0 : }
    1002             : 
    1003           0 : void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
    1004             :                                     const uint8_t *limit, const uint8_t *thresh,
    1005             :                                     int bd) {
    1006             :   DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
    1007             :   uint16_t *src[1];
    1008             :   uint16_t *dst[1];
    1009             : 
    1010             :   // Transpose 8x8
    1011           0 :   src[0] = s - 4;
    1012           0 :   dst[0] = t_dst;
    1013             : 
    1014           0 :   highbd_transpose(src, p, dst, 8, 1);
    1015             : 
    1016             :   // Loop filtering
    1017           0 :   aom_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
    1018             : 
    1019           0 :   src[0] = t_dst;
    1020           0 :   dst[0] = s - 4;
    1021             : 
    1022             :   // Transpose back
    1023           0 :   highbd_transpose(src, 8, dst, p, 1);
    1024           0 : }
    1025             : 
    1026           0 : void aom_highbd_lpf_vertical_4_dual_sse2(
    1027             :     uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
    1028             :     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    1029             :     const uint8_t *thresh1, int bd) {
    1030             :   DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
    1031             :   uint16_t *src[2];
    1032             :   uint16_t *dst[2];
    1033             : 
    1034             :   // Transpose 8x16
    1035           0 :   highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
    1036             : 
    1037             :   // Loop filtering
    1038           0 :   aom_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
    1039             :                                         thresh0, blimit1, limit1, thresh1, bd);
    1040           0 :   src[0] = t_dst;
    1041           0 :   src[1] = t_dst + 8;
    1042           0 :   dst[0] = s - 4;
    1043           0 :   dst[1] = s - 4 + p * 8;
    1044             : 
    1045             :   // Transpose back
    1046           0 :   highbd_transpose(src, 16, dst, p, 2);
    1047           0 : }
    1048             : 
    1049           0 : void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
    1050             :                                     const uint8_t *limit, const uint8_t *thresh,
    1051             :                                     int bd) {
    1052             :   DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
    1053             :   uint16_t *src[1];
    1054             :   uint16_t *dst[1];
    1055             : 
    1056             :   // Transpose 8x8
    1057           0 :   src[0] = s - 4;
    1058           0 :   dst[0] = t_dst;
    1059             : 
    1060           0 :   highbd_transpose(src, p, dst, 8, 1);
    1061             : 
    1062             :   // Loop filtering
    1063           0 :   aom_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);
    1064             : 
    1065           0 :   src[0] = t_dst;
    1066           0 :   dst[0] = s - 4;
    1067             : 
    1068             :   // Transpose back
    1069           0 :   highbd_transpose(src, 8, dst, p, 1);
    1070           0 : }
    1071             : 
    1072           0 : void aom_highbd_lpf_vertical_8_dual_sse2(
    1073             :     uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
    1074             :     const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    1075             :     const uint8_t *thresh1, int bd) {
    1076             :   DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
    1077             :   uint16_t *src[2];
    1078             :   uint16_t *dst[2];
    1079             : 
    1080             :   // Transpose 8x16
    1081           0 :   highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
    1082             : 
    1083             :   // Loop filtering
    1084           0 :   aom_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
    1085             :                                         thresh0, blimit1, limit1, thresh1, bd);
    1086           0 :   src[0] = t_dst;
    1087           0 :   src[1] = t_dst + 8;
    1088             : 
    1089           0 :   dst[0] = s - 4;
    1090           0 :   dst[1] = s - 4 + p * 8;
    1091             : 
    1092             :   // Transpose back
    1093           0 :   highbd_transpose(src, 16, dst, p, 2);
    1094           0 : }
    1095             : 
    1096           0 : void aom_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
    1097             :                                      const uint8_t *limit,
    1098             :                                      const uint8_t *thresh, int bd) {
    1099             :   DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]);
    1100             :   uint16_t *src[2];
    1101             :   uint16_t *dst[2];
    1102             : 
    1103           0 :   src[0] = s - 8;
    1104           0 :   src[1] = s;
    1105           0 :   dst[0] = t_dst;
    1106           0 :   dst[1] = t_dst + 8 * 8;
    1107             : 
    1108             :   // Transpose 16x8
    1109           0 :   highbd_transpose(src, p, dst, 8, 2);
    1110             : 
    1111             :   // Loop filtering
    1112           0 :   aom_highbd_lpf_horizontal_edge_8_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh,
    1113             :                                         bd);
    1114           0 :   src[0] = t_dst;
    1115           0 :   src[1] = t_dst + 8 * 8;
    1116           0 :   dst[0] = s - 8;
    1117           0 :   dst[1] = s;
    1118             : 
    1119             :   // Transpose back
    1120           0 :   highbd_transpose(src, 8, dst, p, 2);
    1121           0 : }
    1122             : 
    1123           0 : void aom_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p,
    1124             :                                           const uint8_t *blimit,
    1125             :                                           const uint8_t *limit,
    1126             :                                           const uint8_t *thresh, int bd) {
    1127             :   DECLARE_ALIGNED(16, uint16_t, t_dst[256]);
    1128             : 
    1129             :   //  Transpose 16x16
    1130           0 :   highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
    1131           0 :   highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
    1132             : 
    1133             :   //  Loop filtering
    1134           0 :   aom_highbd_lpf_horizontal_edge_16_sse2(t_dst + 8 * 16, 16, blimit, limit,
    1135             :                                          thresh, bd);
    1136             : 
    1137             :   //  Transpose back
    1138           0 :   highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
    1139           0 :   highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
    1140           0 : }

Generated by: LCOV version 1.13