LCOV code coverage report - media/libyuv/libyuv/source/scale_gcc.cc
Test: output.info, 2017-07-14 16:53:18 - Lines: 0/102 (0.0 %), Functions: 0/30 (0.0 %)

/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && \
    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 = {0,   1,   3,   4,   5,   7,   8,   9,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 = {3,   4,   5,   7,   8,   9,   11,  12,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 = {5,   7,   8,   9,   11,  12,  13,  15,
                       128, 128, 128, 128, 128, 128, 128, 128};

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 = {0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10};

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static uvec8 kShuf11 = {2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13};

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 = {5,  6,  6,  7,  8,  9,  9,  10,
                        10, 11, 12, 13, 13, 14, 14, 15};

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 = {3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2};

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 = {1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1};

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 = {2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3};

// Rounding constant added before the >>2 in the 3/4 box scalers.
static vec16 kRound34 = {2, 2, 2, 2, 2, 2, 2, 2};

static uvec8 kShuf38a = {0,   3,   6,   8,   11,  14,  128, 128,
                         128, 128, 128, 128, 128, 128, 128, 128};

static uvec8 kShuf38b = {128, 128, 128, 128, 128, 128, 0,   3,
                         6,   8,   11,  14,  128, 128, 128, 128};

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc = {0,   1,   6,   7,   12,  13,  128, 128,
                        128, 128, 128, 128, 128, 128, 128, 128};

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 = {128, 128, 128, 128, 128, 128, 0,   1,
                         6,   7,   12,  13,  128, 128, 128, 128};

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 = {65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9,
                            65536 / 9, 65536 / 6, 0,         0};

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 = {0,  128, 3,  128, 6,   128, 8,   128,
                         11, 128, 14, 128, 128, 128, 128, 128};

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 = {1,  128, 4,  128, 7,   128, 9,   128,
                         12, 128, 15, 128, 128, 128, 128, 128};

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 = {2,  128, 5,   128, 128, 128, 10,  128,
                         13, 128, 128, 128, 128, 128, 128, 128};

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 = {65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3,
                           65536 / 3, 65536 / 2, 0,         0};
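
// The two kScale* tables above are 16-bit fixed-point reciprocals consumed by
// pmulhuw, which keeps only the high 16 bits of an unsigned 16x16 multiply.
// Multiplying a box sum by 65536/N and taking the high half therefore
// approximates sum/N without an integer divide.  The sketch below is a
// hypothetical scalar model of that step (not part of upstream libyuv),
// assuming the sum already fits in 16 bits.
static uint16 FixedPointDivideSketch(uint16 sum, uint16 reciprocal) {
  // reciprocal is 65536/N, e.g. 65536/9 for a full 3x3 box.
  return (uint16)(((uint32)sum * reciprocal) >> 16);  // ~ sum / N
}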

// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt

void ScaleRowDown2_SSSE3(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}
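
// ScaleRowDown2_SSSE3 above halves a row by point sampling: psrlw $0x8 keeps
// the high (odd-indexed) byte of every 16-bit pair and packuswb repacks the
// survivors.  A hypothetical scalar equivalent, for illustration only:
static void ScaleRowDown2_Sketch(const uint8* src_ptr,
                                 uint8* dst_ptr,
                                 int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[2 * x + 1];  // keep the odd pixel of each pair
  }
}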

void ScaleRowDown2Linear_SSSE3(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr,
                               int dst_width) {
  (void)src_stride;
  asm volatile (
    "pcmpeqb    %%xmm4,%%xmm4                  \n"
    "psrlw      $0xf,%%xmm4                    \n"
    "packuswb   %%xmm4,%%xmm4                  \n"
    "pxor       %%xmm5,%%xmm5                  \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10, 0) ",%%xmm1  \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pmaddubsw  %%xmm4,%%xmm0                  \n"
    "pmaddubsw  %%xmm4,%%xmm1                  \n"
    "pavgw      %%xmm5,%%xmm0                  \n"
    "pavgw      %%xmm5,%%xmm1                  \n"
    "packuswb   %%xmm1,%%xmm0                  \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}
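
// ScaleRowDown2Linear_SSSE3 builds an all-ones byte vector in xmm4, so
// pmaddubsw sums each horizontal pair into a 16-bit word and pavgw against
// zero performs a rounded halving.  A hypothetical scalar model (not the
// upstream C fallback):
static void ScaleRowDown2Linear_Sketch(const uint8* src_ptr,
                                       uint8* dst_ptr,
                                       int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[2 * x] + src_ptr[2 * x + 1] + 1) >> 1);
  }
}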

void ScaleRowDown2Box_SSSE3(const uint8* src_ptr,
                            ptrdiff_t src_stride,
                            uint8* dst_ptr,
                            int dst_width) {
  asm volatile (
    "pcmpeqb    %%xmm4,%%xmm4                  \n"
    "psrlw      $0xf,%%xmm4                    \n"
    "packuswb   %%xmm4,%%xmm4                  \n"
    "pxor       %%xmm5,%%xmm5                  \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pmaddubsw  %%xmm4,%%xmm0                  \n"
    "pmaddubsw  %%xmm4,%%xmm1                  \n"
    "pmaddubsw  %%xmm4,%%xmm2                  \n"
    "pmaddubsw  %%xmm4,%%xmm3                  \n"
    "paddw      %%xmm2,%%xmm0                  \n"
    "paddw      %%xmm3,%%xmm1                  \n"
    "psrlw      $0x1,%%xmm0                    \n"
    "psrlw      $0x1,%%xmm1                    \n"
    "pavgw      %%xmm5,%%xmm0                  \n"
    "pavgw      %%xmm5,%%xmm1                  \n"
    "packuswb   %%xmm1,%%xmm0                  \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
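
// ScaleRowDown2Box_SSSE3 averages a 2x2 block: the pair sums of two rows are
// added, halved rounding down (psrlw $1), then halved again rounding up
// (pavgw against zero).  A hypothetical scalar model of exactly that
// arithmetic, for illustration only:
static void ScaleRowDown2Box_Sketch(const uint8* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint8* dst_ptr,
                                    int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    int sum = s[2 * x] + s[2 * x + 1] + t[2 * x] + t[2 * x + 1];
    dst_ptr[x] = (uint8)(((sum >> 1) + 1) >> 1);  // nearly (sum + 2) >> 2
  }
}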

#ifdef HAS_SCALEROWDOWN2_AVX2
void ScaleRowDown2_AVX2(const uint8* src_ptr,
                        ptrdiff_t src_stride,
                        uint8* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x20,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1"
  );
}
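
// The AVX2 variants mirror the SSSE3 ones at twice the width, but vpackuswb
// packs each 128-bit lane independently, so the four 64-bit quarters come out
// holding pixel groups 0-7, 16-23, 8-15, 24-31.  vpermq $0xd8 (selector
// 0,2,1,3) swaps the middle two quarters to restore linear order before the
// store.  A hypothetical model of that permute on an array of four 64-bit
// lanes, for illustration only:
static void Permq0xd8_Sketch(const uint64 in[4], uint64 out[4]) {
  out[0] = in[0];  // selector bits 1:0 = 0
  out[1] = in[2];  // selector bits 3:2 = 2
  out[2] = in[1];  // selector bits 5:4 = 1
  out[3] = in[3];  // selector bits 7:6 = 3
}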

void ScaleRowDown2Linear_AVX2(const uint8* src_ptr,
                              ptrdiff_t src_stride,
                              uint8* dst_ptr,
                              int dst_width) {
  (void)src_stride;
  asm volatile (
    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"

    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20, 0) ",%%ymm1 \n"
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"
    "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x20,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}

void ScaleRowDown2Box_AVX2(const uint8* src_ptr,
                           ptrdiff_t src_stride,
                           uint8* dst_ptr,
                           int dst_width) {
  asm volatile (
    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"
    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"

    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)          //  vmovdqu  (%0,%3,1),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)          //  vmovdqu  0x20(%0,%3,1),%%ymm3
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
    "vpsrlw     $0x1,%%ymm0,%%ymm0             \n"
    "vpsrlw     $0x1,%%ymm1,%%ymm1             \n"
    "vpavgw     %%ymm5,%%ymm0,%%ymm0           \n"
    "vpavgw     %%ymm5,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x20,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_SCALEROWDOWN2_AVX2

void ScaleRowDown4_SSSE3(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr,
                         int dst_width) {
  (void)src_stride;
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrld     $0x18,%%xmm5                    \n"
    "pslld     $0x10,%%xmm5                    \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}
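
// ScaleRowDown4_SSSE3 builds the mask 0x00FF0000 in each dword (pcmpeqb,
// psrld $0x18, pslld $0x10), so the pand/packuswb sequence ends up keeping
// byte 2 of every group of four source pixels.  A hypothetical scalar
// equivalent, for illustration only:
static void ScaleRowDown4_Sketch(const uint8* src_ptr,
                                 uint8* dst_ptr,
                                 int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[4 * x + 2];  // keep pixel 2 of each group of 4
  }
}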

void ScaleRowDown4Box_SSSE3(const uint8* src_ptr,
                            ptrdiff_t src_stride,
                            uint8* dst_ptr,
                            int dst_width) {
  intptr_t stridex3;
  asm volatile (
    "pcmpeqb    %%xmm4,%%xmm4                  \n"
    "psrlw      $0xf,%%xmm4                    \n"
    "movdqa     %%xmm4,%%xmm5                  \n"
    "packuswb   %%xmm4,%%xmm4                  \n"
    "psllw      $0x3,%%xmm5                    \n"
    "lea       " MEMLEA4(0x00,4,4,2) ",%3      \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
    "pmaddubsw  %%xmm4,%%xmm0                  \n"
    "pmaddubsw  %%xmm4,%%xmm1                  \n"
    "pmaddubsw  %%xmm4,%%xmm2                  \n"
    "pmaddubsw  %%xmm4,%%xmm3                  \n"
    "paddw      %%xmm2,%%xmm0                  \n"
    "paddw      %%xmm3,%%xmm1                  \n"
    MEMOPREG(movdqu,0x00,0,4,2,xmm2)           //  movdqu  (%0,%4,2),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,2,xmm3)           //  movdqu  0x10(%0,%4,2),%%xmm3
    "pmaddubsw  %%xmm4,%%xmm2                  \n"
    "pmaddubsw  %%xmm4,%%xmm3                  \n"
    "paddw      %%xmm2,%%xmm0                  \n"
    "paddw      %%xmm3,%%xmm1                  \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu  (%0,%3,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu  0x10(%0,%3,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pmaddubsw  %%xmm4,%%xmm2                  \n"
    "pmaddubsw  %%xmm4,%%xmm3                  \n"
    "paddw      %%xmm2,%%xmm0                  \n"
    "paddw      %%xmm3,%%xmm1                  \n"
    "phaddw     %%xmm1,%%xmm0                  \n"
    "paddw      %%xmm5,%%xmm0                  \n"
    "psrlw      $0x4,%%xmm0                    \n"
    "packuswb   %%xmm0,%%xmm0                  \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(dst_width),   // %2
    "=&r"(stridex3)    // %3
  : "r"((intptr_t)(src_stride))    // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
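
// ScaleRowDown4Box_SSSE3 sums a 4x4 block across four rows (stride, 2*stride
// and the precomputed stridex3), folds the pair sums together with phaddw,
// adds the bias of 8 held in xmm5 and shifts right by 4.  A hypothetical
// scalar model of that arithmetic, for illustration only:
static void ScaleRowDown4Box_Sketch(const uint8* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint8* dst_ptr,
                                    int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    int sum = 0;
    int r, c;
    for (r = 0; r < 4; ++r) {
      for (c = 0; c < 4; ++c) {
        sum += src_ptr[r * src_stride + 4 * x + c];
      }
    }
    dst_ptr[x] = (uint8)((sum + 8) >> 4);  // rounded average of 16 pixels
  }
}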

#ifdef HAS_SCALEROWDOWN4_AVX2
void ScaleRowDown4_AVX2(const uint8* src_ptr,
                        ptrdiff_t src_stride,
                        uint8* dst_ptr,
                        int dst_width) {
  (void)src_stride;
  asm volatile (
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
    "vpsrld     $0x18,%%ymm5,%%ymm5            \n"
    "vpslld     $0x10,%%ymm5,%%ymm5            \n"
    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
    "vpand      %%ymm5,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%xmm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x10,1) ",%1          \n"
    "sub        $0x10,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}

void ScaleRowDown4Box_AVX2(const uint8* src_ptr,
                           ptrdiff_t src_stride,
                           uint8* dst_ptr,
                           int dst_width) {
  asm volatile (
    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
    "vpsrlw     $0xf,%%ymm4,%%ymm4             \n"
    "vpsllw     $0x3,%%ymm4,%%ymm5             \n"
    "vpackuswb  %%ymm4,%%ymm4,%%ymm4           \n"

    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    MEMOPREG(vmovdqu,0x00,0,3,1,ymm2)          //  vmovdqu  (%0,%3,1),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,3,1,ymm3)          //  vmovdqu  0x20(%0,%3,1),%%ymm3
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
    MEMOPREG(vmovdqu,0x00,0,3,2,ymm2)          //  vmovdqu  (%0,%3,2),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,3,2,ymm3)          //  vmovdqu  0x20(%0,%3,2),%%ymm3
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
    MEMOPREG(vmovdqu,0x00,0,4,1,ymm2)          //  vmovdqu  (%0,%4,1),%%ymm2
    MEMOPREG(vmovdqu,0x20,0,4,1,ymm3)          //  vmovdqu  0x20(%0,%4,1),%%ymm3
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "vpaddw     %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm3,%%ymm1,%%ymm1           \n"
    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"
    "vpsrlw     $0x4,%%ymm0,%%ymm0             \n"
    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%xmm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x10,1) ",%1          \n"
    "sub        $0x10,%2                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "r"((intptr_t)(src_stride * 3))   // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_SCALEROWDOWN4_AVX2

void ScaleRowDown34_SSSE3(const uint8* src_ptr,
                          ptrdiff_t src_stride,
                          uint8* dst_ptr,
                          int dst_width) {
  (void)src_stride;
  asm volatile(
      "movdqa    %0,%%xmm3                       \n"
      "movdqa    %1,%%xmm4                       \n"
      "movdqa    %2,%%xmm5                       \n"
      :
      : "m"(kShuf0),  // %0
        "m"(kShuf1),  // %1
        "m"(kShuf2)   // %2
      );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm2   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "palignr   $0x8,%%xmm0,%%xmm1              \n"
    "pshufb    %%xmm3,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "pshufb    %%xmm5,%%xmm2                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
    "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
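
// ScaleRowDown34_SSSE3 turns every 32 source pixels into 24 by shuffling with
// kShuf0/kShuf1/kShuf2, which together select indices 0, 1 and 3 of each group
// of four (pixel 2 is dropped).  A hypothetical scalar equivalent, for
// illustration only; dst_width is assumed to be a multiple of 3:
static void ScaleRowDown34_Sketch(const uint8* src_ptr,
                                  uint8* dst_ptr,
                                  int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[1];
    dst_ptr[x + 2] = src_ptr[3];
    src_ptr += 4;
  }
}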

void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa    %0,%%xmm2                       \n"  // kShuf01
      "movdqa    %1,%%xmm3                       \n"  // kShuf11
      "movdqa    %2,%%xmm4                       \n"  // kShuf21
      :
      : "m"(kShuf01),  // %0
        "m"(kShuf11),  // %1
        "m"(kShuf21)   // %2
      );
  asm volatile(
      "movdqa    %0,%%xmm5                       \n"  // kMadd01
      "movdqa    %1,%%xmm0                       \n"  // kMadd11
      "movdqa    %2,%%xmm1                       \n"  // kRound34
      :
      : "m"(kMadd01),  // %0
        "m"(kMadd11),  // %1
        "m"(kRound34)  // %2
      );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS(1) "         \n"
    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"
    "pmaddubsw %%xmm0,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3),%%xmm7
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm4,%%xmm6                   \n"
    "pmaddubsw %4,%%xmm6                       \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "m"(kMadd21)     // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa    %0,%%xmm2                       \n"  // kShuf01
      "movdqa    %1,%%xmm3                       \n"  // kShuf11
      "movdqa    %2,%%xmm4                       \n"  // kShuf21
      :
      : "m"(kShuf01),  // %0
        "m"(kShuf11),  // %1
        "m"(kShuf21)   // %2
      );
  asm volatile(
      "movdqa    %0,%%xmm5                       \n"  // kMadd01
      "movdqa    %1,%%xmm0                       \n"  // kMadd11
      "movdqa    %2,%%xmm1                       \n"  // kRound34
      :
      : "m"(kMadd01),  // %0
        "m"(kMadd11),  // %1
        "m"(kRound34)  // %2
      );

  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm7)           //  movdqu  (%0,%3,1),%%xmm7
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS(1) "         \n"
    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm6    \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)            //  movdqu  0x8(%0,%3,1),%%xmm7
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"
    "pmaddubsw %%xmm0,%%xmm6                   \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x8,1) "    \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    MEMOPREG(movdqu,0x10,0,3,1,xmm7)           //  movdqu  0x10(%0,%3,1),%%xmm7
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm6,%%xmm7                   \n"
    "pavgb     %%xmm7,%%xmm6                   \n"
    "pshufb    %%xmm4,%%xmm6                   \n"
    "pmaddubsw %4,%%xmm6                       \n"
    "paddsw    %%xmm1,%%xmm6                   \n"
    "psrlw     $0x2,%%xmm6                     \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movq      %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x18,%2                        \n"
    "jg        1b                              \n"
    : "+r"(src_ptr),   // %0
      "+r"(dst_ptr),   // %1
      "+r"(dst_width)  // %2
    : "r"((intptr_t)(src_stride)),  // %3
      "m"(kMadd21)     // %4
    : "memory", "cc", NACL_R14
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
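
// The two _Box variants above first blend the current row with the row at
// src_stride (a 1:1 average via one pavgb in the _1 version, roughly a 3:1
// weighting via the double pavgb in the _0 version), then apply the
// horizontal 3/4 filter encoded by kShuf01/kMadd01 and friends: weights
// 3:1, 2:2 and 1:3 across each group of four pixels, +2 rounding, >>2.
// A hypothetical scalar model of the horizontal step only, for illustration;
// dst_width is assumed to be a multiple of 3:
static void ScaleRowDown34Filter_Sketch(const uint8* blended_row,
                                        uint8* dst_ptr,
                                        int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    uint8 s0 = blended_row[0];
    uint8 s1 = blended_row[1];
    uint8 s2 = blended_row[2];
    uint8 s3 = blended_row[3];
    dst_ptr[x + 0] = (uint8)((3 * s0 + 1 * s1 + 2) >> 2);
    dst_ptr[x + 1] = (uint8)((2 * s1 + 2 * s2 + 2) >> 2);
    dst_ptr[x + 2] = (uint8)((1 * s2 + 3 * s3 + 2) >> 2);
    blended_row += 4;
  }
}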

void ScaleRowDown38_SSSE3(const uint8* src_ptr,
                          ptrdiff_t src_stride,
                          uint8* dst_ptr,
                          int dst_width) {
  (void)src_stride;
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"
    "movdqa    %4,%%xmm5                       \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "movhlps   %%xmm0,%%xmm1                   \n"
    "movd      %%xmm1," MEMACCESS2(0x8,1) "    \n"
    "lea       " MEMLEA(0xc,1) ",%1            \n"
    "sub       $0xc,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "m"(kShuf38a),   // %3
    "m"(kShuf38b)    // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm4", "xmm5"
  );
}
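
// ScaleRowDown38_SSSE3 produces 12 output pixels from every 32 source pixels
// (a 3/8 ratio).  kShuf38a/kShuf38b select source offsets 0, 3 and 6 from each
// group of eight, and paddusb merges the two shuffled halves.  A hypothetical
// scalar equivalent, for illustration only; dst_width is assumed to be a
// multiple of 3:
static void ScaleRowDown38_Sketch(const uint8* src_ptr,
                                  uint8* dst_ptr,
                                  int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[3];
    dst_ptr[x + 2] = src_ptr[6];
    src_ptr += 8;
  }
}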

void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa    %0,%%xmm2                       \n"
      "movdqa    %1,%%xmm3                       \n"
      "movdqa    %2,%%xmm4                       \n"
      "movdqa    %3,%%xmm5                       \n"
      :
      : "m"(kShufAb0),  // %0
        "m"(kShufAb1),  // %1
        "m"(kShufAb2),  // %2
        "m"(kScaleAb2)  // %3
      );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm1)           //  movdqu  (%0,%3,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "pavgb     %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pshufb    %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm6                   \n"
    "pshufb    %%xmm3,%%xmm6                   \n"
    "paddusw   %%xmm6,%%xmm1                   \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "paddusw   %%xmm0,%%xmm1                   \n"
    "pmulhuw   %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movd      %%xmm1," MEMACCESS(1) "         \n"
    "psrlq     $0x10,%%xmm1                    \n"
    "movd      %%xmm1," MEMACCESS2(0x2,1) "    \n"
    "lea       " MEMLEA(0x6,1) ",%1            \n"
    "sub       $0x6,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(dst_width)    // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}

void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr,
                                int dst_width) {
  asm volatile(
      "movdqa    %0,%%xmm2                       \n"
      "movdqa    %1,%%xmm3                       \n"
      "movdqa    %2,%%xmm4                       \n"
      "pxor      %%xmm5,%%xmm5                   \n"
      :
      : "m"(kShufAc),    // %0
        "m"(kShufAc3),   // %1
        "m"(kScaleAc33)  // %2
      );
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm6)           //  movdqu  (%0,%3,1),%%xmm6
    "movhlps   %%xmm0,%%xmm1                   \n"
    "movhlps   %%xmm6,%%xmm7                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm6                   \n"
    "punpcklbw %%xmm5,%%xmm7                   \n"
    "paddusw   %%xmm6,%%xmm0                   \n"
    "paddusw   %%xmm7,%%xmm1                   \n"
    MEMOPREG(movdqu,0x00,0,3,2,xmm6)           //  movdqu  (%0,%3,2),%%xmm6
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movhlps   %%xmm6,%%xmm7                   \n"
    "punpcklbw %%xmm5,%%xmm6                   \n"
    "punpcklbw %%xmm5,%%xmm7                   \n"
    "paddusw   %%xmm6,%%xmm0                   \n"
    "paddusw   %%xmm7,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm6                   \n"
    "psrldq    $0x2,%%xmm0                     \n"
    "paddusw   %%xmm0,%%xmm6                   \n"
    "psrldq    $0x2,%%xmm0                     \n"
    "paddusw   %%xmm0,%%xmm6                   \n"
    "pshufb    %%xmm2,%%xmm6                   \n"
    "movdqa    %%xmm1,%%xmm7                   \n"
    "psrldq    $0x2,%%xmm1                     \n"
    "paddusw   %%xmm1,%%xmm7                   \n"
    "psrldq    $0x2,%%xmm1                     \n"
    "paddusw   %%xmm1,%%xmm7                   \n"
    "pshufb    %%xmm3,%%xmm7                   \n"
    "paddusw   %%xmm7,%%xmm6                   \n"
    "pmulhuw   %%xmm4,%%xmm6                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "movd      %%xmm6," MEMACCESS(1) "         \n"
    "psrlq     $0x10,%%xmm6                    \n"
    "movd      %%xmm6," MEMACCESS2(0x2,1) "    \n"
    "lea       " MEMLEA(0x6,1) ",%1            \n"
    "sub       $0x6,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width)   // %2
  : "r"((intptr_t)(src_stride))   // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}

// Reads 16xN bytes and produces 16 shorts at a time.
void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  asm volatile (
    "pxor      %%xmm5,%%xmm5                   \n"

    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_ptr += 16
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm1   \n"
    "movdqa    %%xmm3,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "punpckhbw %%xmm5,%%xmm3                   \n"
    "paddusw   %%xmm2,%%xmm0                   \n"
    "paddusw   %%xmm3,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(src_width)    // %2
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
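
// ScaleAddRow_SSE2 widens 16 source bytes to 16-bit words (punpcklbw /
// punpckhbw against zero) and accumulates them into dst_ptr with unsigned
// saturating adds (paddusw).  A hypothetical scalar model of one row's
// accumulation, for illustration only:
static void ScaleAddRow_Sketch(const uint8* src_ptr,
                               uint16* dst_ptr,
                               int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    uint32 sum = (uint32)dst_ptr[x] + src_ptr[x];
    dst_ptr[x] = (uint16)(sum > 65535 ? 65535 : sum);  // paddusw saturates
  }
}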

#ifdef HAS_SCALEADDROW_AVX2
// Reads 32 bytes and accumulates to 32 shorts at a time.
void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) {
  asm volatile (
    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"

    LABELALIGN
  "1:                                          \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm3        \n"
    "lea        " MEMLEA(0x20,0) ",%0          \n"  // src_ptr += 32
    "vpermq     $0xd8,%%ymm3,%%ymm3            \n"
    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
    "vpaddusw   " MEMACCESS(1) ",%%ymm2,%%ymm0 \n"
    "vpaddusw   " MEMACCESS2(0x20,1) ",%%ymm3,%%ymm1 \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(src_width)    // %2
  :
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_SCALEADDROW_AVX2

// Constant for making pixels signed to avoid pmaddubsw
// saturation.
static uvec8 kFsub80 = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                        0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};

// Constant for making pixels unsigned and adding .5 for rounding.
static uvec16 kFadd40 = {0x4040, 0x4040, 0x4040, 0x4040,
                         0x4040, 0x4040, 0x4040, 0x4040};
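
// In ScaleFilterCols_SSSE3 below, each output pixel is a bilinear blend
// ((128 - f) * p0 + f * p1 + 64) >> 7 with f in 0..127.  pmaddubsw multiplies
// an unsigned operand (the weights) by a signed one (the pixels), so kFsub80
// re-biases pixels into signed-byte range; kFadd40 (0x4000 + 0x40) then adds
// the 128 * 128 bias back plus the rounding term before the shift.  A
// hypothetical scalar model of one blended pixel, for illustration only:
static uint8 FilterColSketch(uint8 p0, uint8 p1, int f /* 0..127 */) {
  int biased = (128 - f) * (p0 - 128) + f * (p1 - 128);  // after kFsub80
  return (uint8)((biased + 0x4040) >> 7);                // kFadd40, then >>7
}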
     860             : 
     861             : // Bilinear column filtering. SSSE3 version.
     862           0 : void ScaleFilterCols_SSSE3(uint8* dst_ptr,
     863             :                            const uint8* src_ptr,
     864             :                            int dst_width,
     865             :                            int x,
     866             :                            int dx) {
     867             :   intptr_t x0, x1, temp_pixel;
     868             :   asm volatile (
     869             :     "movd      %6,%%xmm2                       \n"
     870             :     "movd      %7,%%xmm3                       \n"
     871             :     "movl      $0x04040000,%k2                 \n"
     872             :     "movd      %k2,%%xmm5                      \n"
     873             :     "pcmpeqb   %%xmm6,%%xmm6                   \n"
     874             :     "psrlw     $0x9,%%xmm6                     \n"  // 0x007f007f
     875             :     "pcmpeqb   %%xmm7,%%xmm7                   \n"
     876             :     "psrlw     $15,%%xmm7                      \n"  // 0x00010001
     877             : 
     878             :     "pextrw    $0x1,%%xmm2,%k3                 \n"
     879             :     "subl      $0x2,%5                         \n"
     880             :     "jl        29f                             \n"
     881             :     "movdqa    %%xmm2,%%xmm0                   \n"
     882             :     "paddd     %%xmm3,%%xmm0                   \n"
     883             :     "punpckldq %%xmm0,%%xmm2                   \n"
     884             :     "punpckldq %%xmm3,%%xmm3                   \n"
     885             :     "paddd     %%xmm3,%%xmm3                   \n"
     886             :     "pextrw    $0x3,%%xmm2,%k4                 \n"
     887             : 
     888             :     LABELALIGN
     889             :   "2:                                          \n"
     890             :     "movdqa    %%xmm2,%%xmm1                   \n"
     891             :     "paddd     %%xmm3,%%xmm2                   \n"
     892             :     MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
     893             :     "movd      %k2,%%xmm0                      \n"
     894             :     "psrlw     $0x9,%%xmm1                     \n"
     895             :     MEMOPARG(movzwl,0x00,1,4,1,k2)             //  movzwl  (%1,%4,1),%k2
     896             :     "movd      %k2,%%xmm4                      \n"
     897             :     "pshufb    %%xmm5,%%xmm1                   \n"
     898             :     "punpcklwd %%xmm4,%%xmm0                   \n"
     899             :     "psubb     %8,%%xmm0                       \n"  // make pixels signed.
     900             :     "pxor      %%xmm6,%%xmm1                   \n"  // 128 - f = (f ^ 127) + 1
     901             :     "paddusb   %%xmm7,%%xmm1                   \n"
     902             :     "pmaddubsw %%xmm0,%%xmm1                   \n"
     903             :     "pextrw    $0x1,%%xmm2,%k3                 \n"
     904             :     "pextrw    $0x3,%%xmm2,%k4                 \n"
     905             :     "paddw     %9,%%xmm1                       \n"  // make pixels unsigned.
     906             :     "psrlw     $0x7,%%xmm1                     \n"
     907             :     "packuswb  %%xmm1,%%xmm1                   \n"
     908             :     "movd      %%xmm1,%k2                      \n"
     909             :     "mov       %w2," MEMACCESS(0) "            \n"
     910             :     "lea       " MEMLEA(0x2,0) ",%0            \n"
     911             :     "subl      $0x2,%5                         \n"
     912             :     "jge       2b                              \n"
     913             : 
     914             :     LABELALIGN
     915             :   "29:                                         \n"
     916             :     "addl      $0x1,%5                         \n"
     917             :     "jl        99f                             \n"
     918             :     MEMOPARG(movzwl,0x00,1,3,1,k2)             //  movzwl  (%1,%3,1),%k2
     919             :     "movd      %k2,%%xmm0                      \n"
     920             :     "psrlw     $0x9,%%xmm2                     \n"
     921             :     "pshufb    %%xmm5,%%xmm2                   \n"
     922             :     "psubb     %8,%%xmm0                       \n"  // make pixels signed.
     923             :     "pxor      %%xmm6,%%xmm2                   \n"
     924             :     "paddusb   %%xmm7,%%xmm2                   \n"
     925             :     "pmaddubsw %%xmm0,%%xmm2                   \n"
     926             :     "paddw     %9,%%xmm2                       \n"  // make pixels unsigned.
     927             :     "psrlw     $0x7,%%xmm2                     \n"
     928             :     "packuswb  %%xmm2,%%xmm2                   \n"
     929             :     "movd      %%xmm2,%k2                      \n"
     930             :     "mov       %b2," MEMACCESS(0) "            \n"
     931             :   "99:                                         \n"
     932             :   : "+r"(dst_ptr),      // %0
     933             :     "+r"(src_ptr),      // %1
     934             :     "=&a"(temp_pixel),  // %2
     935             :     "=&r"(x0),          // %3
     936             :     "=&r"(x1),          // %4
     937             : #if defined(__x86_64__)
     938             :     "+rm"(dst_width)    // %5
     939             : #else
     940             :     "+m"(dst_width)    // %5
     941             : #endif
     942             :   : "rm"(x),            // %6
     943             :     "rm"(dx),           // %7
     944             : #if defined(__x86_64__)
     945             :     "x"(kFsub80),       // %8
     946             :     "x"(kFadd40)        // %9
     947             : #else
     948             :     "m"(kFsub80),       // %8
     949             :     "m"(kFadd40)        // %9
     950             : #endif
     951             :   : "memory", "cc", NACL_R14
     952             :     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
     953           0 :   );
     954           0 : }
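// Illustrative scalar sketch (not part of libyuv; the helper name is
// hypothetical) of the bilinear column filter above. x and dx are 16.16
// fixed point; the SSSE3 path keeps a 7-bit fraction, so each output byte
// is (a * (128 - f) + b * f + 64) >> 7.
static void ScaleFilterCols_Reference(uint8* dst_ptr,
                                      const uint8* src_ptr,
                                      int dst_width,
                                      int x,
                                      int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;          // integer source position
    int f = (x >> 9) & 0x7f;   // 7-bit fraction, as in the asm above
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[j] = (uint8)((a * (128 - f) + b * f + 64) >> 7);
    x += dx;
  }
}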
     955             : 
     956             : // Reads 16 pixels, duplicates them and writes 32 pixels.
     957             : // No alignment requirement: movdqu is used for the loads and stores.
     958           0 : void ScaleColsUp2_SSE2(uint8* dst_ptr,
     959             :                        const uint8* src_ptr,
     960             :                        int dst_width,
     961             :                        int x,
     962             :                        int dx) {
     963             :   (void)x;
     964             :   (void)dx;
     965             :   asm volatile (
     966             :     LABELALIGN
     967             :   "1:                                          \n"
     968             :     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
     969             :     "lea       " MEMLEA(0x10,1) ",%1           \n"
     970             :     "movdqa    %%xmm0,%%xmm1                   \n"
     971             :     "punpcklbw %%xmm0,%%xmm0                   \n"
     972             :     "punpckhbw %%xmm1,%%xmm1                   \n"
     973             :     "movdqu    %%xmm0," MEMACCESS(0) "         \n"
     974             :     "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
     975             :     "lea       " MEMLEA(0x20,0) ",%0           \n"
     976             :     "sub       $0x20,%2                        \n"
     977             :     "jg        1b                              \n"
     978             : 
     979             :   : "+r"(dst_ptr),     // %0
     980             :     "+r"(src_ptr),     // %1
     981             :     "+r"(dst_width)    // %2
     982             :   :: "memory", "cc", "xmm0", "xmm1"
     983           0 :   );
     984           0 : }
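// Illustrative scalar sketch (not part of libyuv; hypothetical name):
// 2x point upsample, each source byte written twice.
static void ScaleColsUp2_Reference(uint8* dst_ptr,
                                   const uint8* src_ptr,
                                   int dst_width) {
  int j;
  for (j = 0; j < dst_width; j += 2) {
    dst_ptr[j] = dst_ptr[j + 1] = src_ptr[j >> 1];
  }
}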
     985             : 
     986           0 : void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
     987             :                             ptrdiff_t src_stride,
     988             :                             uint8* dst_argb,
     989             :                             int dst_width) {
     990             :   (void)src_stride;
     991             :   asm volatile (
     992             :     LABELALIGN
     993             :   "1:                                          \n"
     994             :     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
     995             :     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
     996             :     "lea       " MEMLEA(0x20,0) ",%0           \n"
     997             :     "shufps    $0xdd,%%xmm1,%%xmm0             \n"
     998             :     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
     999             :     "lea       " MEMLEA(0x10,1) ",%1           \n"
    1000             :     "sub       $0x4,%2                         \n"
    1001             :     "jg        1b                              \n"
    1002             :   : "+r"(src_argb),  // %0
    1003             :     "+r"(dst_argb),  // %1
    1004             :     "+r"(dst_width)  // %2
    1005             :   :: "memory", "cc", "xmm0", "xmm1"
    1006           0 :   );
    1007           0 : }
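// Illustrative scalar sketch (not part of libyuv; hypothetical name):
// point-sampled 2:1 ARGB downscale. shufps $0xdd keeps the odd pixels,
// so dst[i] = src[2 * i + 1], with each ARGB pixel handled as 32 bits.
static void ScaleARGBRowDown2_Reference(const uint8* src_argb,
                                        uint8* dst_argb,
                                        int dst_width) {
  const uint32* src = (const uint32*)src_argb;
  uint32* dst = (uint32*)dst_argb;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src[2 * x + 1];
  }
}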
    1008             : 
    1009           0 : void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
    1010             :                                   ptrdiff_t src_stride,
    1011             :                                   uint8* dst_argb,
    1012             :                                   int dst_width) {
    1013             :   (void)src_stride;
    1014             :   asm volatile (
    1015             :     LABELALIGN
    1016             :   "1:                                          \n"
    1017             :     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    1018             :     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    1019             :     "lea       " MEMLEA(0x20,0) ",%0           \n"
    1020             :     "movdqa    %%xmm0,%%xmm2                   \n"
    1021             :     "shufps    $0x88,%%xmm1,%%xmm0             \n"
    1022             :     "shufps    $0xdd,%%xmm1,%%xmm2             \n"
    1023             :     "pavgb     %%xmm2,%%xmm0                   \n"
    1024             :     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    1025             :     "lea       " MEMLEA(0x10,1) ",%1           \n"
    1026             :     "sub       $0x4,%2                         \n"
    1027             :     "jg        1b                              \n"
    1028             :   : "+r"(src_argb),  // %0
    1029             :     "+r"(dst_argb),  // %1
    1030             :     "+r"(dst_width)  // %2
    1031             :   :: "memory", "cc", "xmm0", "xmm1", "xmm2"
    1032           0 :   );
    1033           0 : }
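// Illustrative scalar sketch (not part of libyuv; hypothetical name):
// horizontal 2:1 average. Each output pixel is the rounded per-channel
// average of two adjacent source pixels, matching pavgb rounding.
static void ScaleARGBRowDown2Linear_Reference(const uint8* src_argb,
                                              uint8* dst_argb,
                                              int dst_width) {
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    for (c = 0; c < 4; ++c) {  // B, G, R, A
      dst_argb[4 * x + c] =
          (uint8)((src_argb[8 * x + c] + src_argb[8 * x + c + 4] + 1) >> 1);
    }
  }
}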
    1034             : 
    1035           0 : void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
    1036             :                                ptrdiff_t src_stride,
    1037             :                                uint8* dst_argb,
    1038             :                                int dst_width) {
    1039             :   asm volatile (
    1040             :     LABELALIGN
    1041             :   "1:                                          \n"
    1042             :     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    1043             :     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    1044             :     MEMOPREG(movdqu,0x00,0,3,1,xmm2)           //  movdqu   (%0,%3,1),%%xmm2
    1045             :     MEMOPREG(movdqu,0x10,0,3,1,xmm3)           //  movdqu   0x10(%0,%3,1),%%xmm3
    1046             :     "lea       " MEMLEA(0x20,0) ",%0           \n"
    1047             :     "pavgb     %%xmm2,%%xmm0                   \n"
    1048             :     "pavgb     %%xmm3,%%xmm1                   \n"
    1049             :     "movdqa    %%xmm0,%%xmm2                   \n"
    1050             :     "shufps    $0x88,%%xmm1,%%xmm0             \n"
    1051             :     "shufps    $0xdd,%%xmm1,%%xmm2             \n"
    1052             :     "pavgb     %%xmm2,%%xmm0                   \n"
    1053             :     "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    1054             :     "lea       " MEMLEA(0x10,1) ",%1           \n"
    1055             :     "sub       $0x4,%2                         \n"
    1056             :     "jg        1b                              \n"
    1057             :   : "+r"(src_argb),   // %0
    1058             :     "+r"(dst_argb),   // %1
    1059             :     "+r"(dst_width)   // %2
    1060             :   : "r"((intptr_t)(src_stride))   // %3
    1061             :   : "memory", "cc", NACL_R14
    1062             :     "xmm0", "xmm1", "xmm2", "xmm3"
    1063           0 :   );
    1064           0 : }
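// Illustrative scalar sketch (not part of libyuv; hypothetical name):
// 2x2 box filter. The SSE2 code averages the two rows first, then the two
// columns, each step with pavgb rounding, so this mirrors that ordering
// (it can differ by one LSB from an exact 4-sample average).
static void ScaleARGBRowDown2Box_Reference(const uint8* src_argb,
                                           ptrdiff_t src_stride,
                                           uint8* dst_argb,
                                           int dst_width) {
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    for (c = 0; c < 4; ++c) {
      int i = 8 * x + c;
      int t0 = (src_argb[i] + src_argb[i + src_stride] + 1) >> 1;
      int t1 = (src_argb[i + 4] + src_argb[i + 4 + src_stride] + 1) >> 1;
      dst_argb[4 * x + c] = (uint8)((t0 + t1 + 1) >> 1);
    }
  }
}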
    1065             : 
    1066             : // Reads 4 ARGB pixels at a time, stepping src_stepx pixels between reads.
    1067             : // No alignment requirement: movdqu is used for the destination store.
    1068           0 : void ScaleARGBRowDownEven_SSE2(const uint8* src_argb,
    1069             :                                ptrdiff_t src_stride,
    1070             :                                int src_stepx,
    1071             :                                uint8* dst_argb,
    1072             :                                int dst_width) {
    1073           0 :   intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
    1074             :   intptr_t src_stepx_x12;
    1075             :   (void)src_stride;
    1076             :   asm volatile (
    1077             :     "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
    1078             :     "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
    1079             :     LABELALIGN
    1080             :   "1:                                          \n"
    1081             :     "movd      " MEMACCESS(0) ",%%xmm0         \n"
    1082             :     MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
    1083             :     "punpckldq %%xmm1,%%xmm0                   \n"
    1084             :     MEMOPREG(movd,0x00,0,1,2,xmm2)             //  movd      (%0,%1,2),%%xmm2
    1085             :     MEMOPREG(movd,0x00,0,4,1,xmm3)             //  movd      (%0,%4,1),%%xmm3
    1086             :     "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
    1087             :     "punpckldq %%xmm3,%%xmm2                   \n"
    1088             :     "punpcklqdq %%xmm2,%%xmm0                  \n"
    1089             :     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    1090             :     "lea       " MEMLEA(0x10,2) ",%2           \n"
    1091             :     "sub       $0x4,%3                         \n"
    1092             :     "jg        1b                              \n"
    1093             :   : "+r"(src_argb),       // %0
    1094             :     "+r"(src_stepx_x4),   // %1
    1095             :     "+r"(dst_argb),       // %2
    1096             :     "+r"(dst_width),      // %3
    1097             :     "=&r"(src_stepx_x12)  // %4
    1098             :   :: "memory", "cc", NACL_R14
    1099             :     "xmm0", "xmm1", "xmm2", "xmm3"
    1100           0 :   );
    1101           0 : }
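// Illustrative scalar sketch (not part of libyuv; hypothetical name):
// point-sample every src_stepx'th ARGB pixel.
static void ScaleARGBRowDownEven_Reference(const uint8* src_argb,
                                           int src_stepx,
                                           uint8* dst_argb,
                                           int dst_width) {
  const uint32* src = (const uint32*)src_argb;
  uint32* dst = (uint32*)dst_argb;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = src[x * src_stepx];
  }
}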
    1102             : 
    1103             : // Blends four 2x2 to 4x1.
    1104             : // No alignment requirement: movdqu is used for the destination store.
    1105           0 : void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
    1106             :                                   ptrdiff_t src_stride,
    1107             :                                   int src_stepx,
    1108             :                                   uint8* dst_argb,
    1109             :                                   int dst_width) {
    1110           0 :   intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
    1111             :   intptr_t src_stepx_x12;
    1112           0 :   intptr_t row1 = (intptr_t)(src_stride);
    1113             :   asm volatile (
    1114             :     "lea       " MEMLEA3(0x00,1,4) ",%1        \n"
    1115             :     "lea       " MEMLEA4(0x00,1,1,2) ",%4      \n"
    1116             :     "lea       " MEMLEA4(0x00,0,5,1) ",%5      \n"
    1117             : 
    1118             :     LABELALIGN
    1119             :   "1:                                          \n"
    1120             :     "movq      " MEMACCESS(0) ",%%xmm0         \n"
    1121             :     MEMOPREG(movhps,0x00,0,1,1,xmm0)           //  movhps    (%0,%1,1),%%xmm0
    1122             :     MEMOPREG(movq,0x00,0,1,2,xmm1)             //  movq      (%0,%1,2),%%xmm1
    1123             :     MEMOPREG(movhps,0x00,0,4,1,xmm1)           //  movhps    (%0,%4,1),%%xmm1
    1124             :     "lea       " MEMLEA4(0x00,0,1,4) ",%0      \n"
    1125             :     "movq      " MEMACCESS(5) ",%%xmm2         \n"
    1126             :     MEMOPREG(movhps,0x00,5,1,1,xmm2)           //  movhps    (%5,%1,1),%%xmm2
    1127             :     MEMOPREG(movq,0x00,5,1,2,xmm3)             //  movq      (%5,%1,2),%%xmm3
    1128             :     MEMOPREG(movhps,0x00,5,4,1,xmm3)           //  movhps    (%5,%4,1),%%xmm3
    1129             :     "lea       " MEMLEA4(0x00,5,1,4) ",%5      \n"
    1130             :     "pavgb     %%xmm2,%%xmm0                   \n"
    1131             :     "pavgb     %%xmm3,%%xmm1                   \n"
    1132             :     "movdqa    %%xmm0,%%xmm2                   \n"
    1133             :     "shufps    $0x88,%%xmm1,%%xmm0             \n"
    1134             :     "shufps    $0xdd,%%xmm1,%%xmm2             \n"
    1135             :     "pavgb     %%xmm2,%%xmm0                   \n"
    1136             :     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    1137             :     "lea       " MEMLEA(0x10,2) ",%2           \n"
    1138             :     "sub       $0x4,%3                         \n"
    1139             :     "jg        1b                              \n"
    1140             :   : "+r"(src_argb),        // %0
    1141             :     "+r"(src_stepx_x4),    // %1
    1142             :     "+r"(dst_argb),        // %2
    1143             :     "+rm"(dst_width),      // %3
    1144             :     "=&r"(src_stepx_x12),  // %4
    1145             :     "+r"(row1)             // %5
    1146             :   :: "memory", "cc", NACL_R14
    1147             :     "xmm0", "xmm1", "xmm2", "xmm3"
    1148           0 :   );
    1149           0 : }
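// Illustrative scalar sketch (not part of libyuv; hypothetical name):
// 2x2 box filter sampled every src_stepx pixels, averaging rows and then
// columns with pavgb-style rounding, like the asm above.
static void ScaleARGBRowDownEvenBox_Reference(const uint8* src_argb,
                                              ptrdiff_t src_stride,
                                              int src_stepx,
                                              uint8* dst_argb,
                                              int dst_width) {
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    const uint8* p = src_argb + (intptr_t)x * src_stepx * 4;
    for (c = 0; c < 4; ++c) {
      int t0 = (p[c] + p[c + src_stride] + 1) >> 1;
      int t1 = (p[c + 4] + p[c + 4 + src_stride] + 1) >> 1;
      dst_argb[4 * x + c] = (uint8)((t0 + t1 + 1) >> 1);
    }
  }
}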
    1150             : 
    1151           0 : void ScaleARGBCols_SSE2(uint8* dst_argb,
    1152             :                         const uint8* src_argb,
    1153             :                         int dst_width,
    1154             :                         int x,
    1155             :                         int dx) {
    1156             :   intptr_t x0, x1;
    1157             :   asm volatile (
    1158             :     "movd      %5,%%xmm2                       \n"
    1159             :     "movd      %6,%%xmm3                       \n"
    1160             :     "pshufd    $0x0,%%xmm2,%%xmm2              \n"
    1161             :     "pshufd    $0x11,%%xmm3,%%xmm0             \n"
    1162             :     "paddd     %%xmm0,%%xmm2                   \n"
    1163             :     "paddd     %%xmm3,%%xmm3                   \n"
    1164             :     "pshufd    $0x5,%%xmm3,%%xmm0              \n"
    1165             :     "paddd     %%xmm0,%%xmm2                   \n"
    1166             :     "paddd     %%xmm3,%%xmm3                   \n"
    1167             :     "pshufd    $0x0,%%xmm3,%%xmm3              \n"
    1168             :     "pextrw    $0x1,%%xmm2,%k0                 \n"
    1169             :     "pextrw    $0x3,%%xmm2,%k1                 \n"
    1170             :     "cmp       $0x0,%4                         \n"
    1171             :     "jl        99f                             \n"
    1172             :     "sub       $0x4,%4                         \n"
    1173             :     "jl        49f                             \n"
    1174             : 
    1175             :     LABELALIGN
    1176             :   "40:                                         \n"
    1177             :     MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
    1178             :     MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
    1179             :     "pextrw    $0x5,%%xmm2,%k0                 \n"
    1180             :     "pextrw    $0x7,%%xmm2,%k1                 \n"
    1181             :     "paddd     %%xmm3,%%xmm2                   \n"
    1182             :     "punpckldq %%xmm1,%%xmm0                   \n"
    1183             :     MEMOPREG(movd,0x00,3,0,4,xmm1)             //  movd      (%3,%0,4),%%xmm1
    1184             :     MEMOPREG(movd,0x00,3,1,4,xmm4)             //  movd      (%3,%1,4),%%xmm4
    1185             :     "pextrw    $0x1,%%xmm2,%k0                 \n"
    1186             :     "pextrw    $0x3,%%xmm2,%k1                 \n"
    1187             :     "punpckldq %%xmm4,%%xmm1                   \n"
    1188             :     "punpcklqdq %%xmm1,%%xmm0                  \n"
    1189             :     "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    1190             :     "lea       " MEMLEA(0x10,2) ",%2           \n"
    1191             :     "sub       $0x4,%4                         \n"
    1192             :     "jge       40b                             \n"
    1193             : 
    1194             :   "49:                                         \n"
    1195             :     "test      $0x2,%4                         \n"
    1196             :     "je        29f                             \n"
    1197             :     MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
    1198             :     MEMOPREG(movd,0x00,3,1,4,xmm1)             //  movd      (%3,%1,4),%%xmm1
    1199             :     "pextrw    $0x5,%%xmm2,%k0                 \n"
    1200             :     "punpckldq %%xmm1,%%xmm0                   \n"
    1201             :     "movq      %%xmm0," MEMACCESS(2) "         \n"
    1202             :     "lea       " MEMLEA(0x8,2) ",%2            \n"
    1203             :   "29:                                         \n"
    1204             :     "test      $0x1,%4                         \n"
    1205             :     "je        99f                             \n"
    1206             :     MEMOPREG(movd,0x00,3,0,4,xmm0)             //  movd      (%3,%0,4),%%xmm0
    1207             :     "movd      %%xmm0," MEMACCESS(2) "         \n"
    1208             :   "99:                                         \n"
    1209             :   : "=&a"(x0),         // %0
    1210             :     "=&d"(x1),         // %1
    1211             :     "+r"(dst_argb),    // %2
    1212             :     "+r"(src_argb),    // %3
    1213             :     "+r"(dst_width)    // %4
    1214             :   : "rm"(x),           // %5
    1215             :     "rm"(dx)           // %6
    1216             :   : "memory", "cc", NACL_R14
    1217             :     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
    1218           0 :   );
    1219           0 : }
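// Illustrative scalar sketch (not part of libyuv; hypothetical name):
// nearest-neighbor ARGB column sampling. x and dx are 16.16 fixed point;
// the integer part of x selects the source pixel.
static void ScaleARGBCols_Reference(uint8* dst_argb,
                                    const uint8* src_argb,
                                    int dst_width,
                                    int x,
                                    int dx) {
  const uint32* src = (const uint32*)src_argb;
  uint32* dst = (uint32*)dst_argb;
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];
    x += dx;
  }
}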
    1220             : 
    1221             : // Reads 4 pixels, duplicates them and writes 8 pixels.
    1222             : // No alignment requirement: movdqu is used for the loads and stores.
    1223           0 : void ScaleARGBColsUp2_SSE2(uint8* dst_argb,
    1224             :                            const uint8* src_argb,
    1225             :                            int dst_width,
    1226             :                            int x,
    1227             :                            int dx) {
    1228             :   (void)x;
    1229             :   (void)dx;
    1230             :   asm volatile (
    1231             :     LABELALIGN
    1232             :   "1:                                          \n"
    1233             :     "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    1234             :     "lea       " MEMLEA(0x10,1) ",%1           \n"
    1235             :     "movdqa    %%xmm0,%%xmm1                   \n"
    1236             :     "punpckldq %%xmm0,%%xmm0                   \n"
    1237             :     "punpckhdq %%xmm1,%%xmm1                   \n"
    1238             :     "movdqu    %%xmm0," MEMACCESS(0) "         \n"
    1239             :     "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    1240             :     "lea       " MEMLEA(0x20,0) ",%0           \n"
    1241             :     "sub       $0x8,%2                         \n"
    1242             :     "jg        1b                              \n"
    1243             : 
    1244             :   : "+r"(dst_argb),    // %0
    1245             :     "+r"(src_argb),    // %1
    1246             :     "+r"(dst_width)    // %2
    1247             :   :: "memory", "cc", NACL_R14
    1248             :     "xmm0", "xmm1"
    1249           0 :   );
    1250           0 : }
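// Illustrative scalar sketch (not part of libyuv; hypothetical name):
// 2x ARGB point upsample, each source pixel written twice.
static void ScaleARGBColsUp2_Reference(uint8* dst_argb,
                                       const uint8* src_argb,
                                       int dst_width) {
  const uint32* src = (const uint32*)src_argb;
  uint32* dst = (uint32*)dst_argb;
  int j;
  for (j = 0; j < dst_width; j += 2) {
    dst[j] = dst[j + 1] = src[j >> 1];
  }
}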
    1251             : 
    1252             : // Shuffle table for arranging 2 pixels into pairs for pmaddubsw
    1253             : static uvec8 kShuffleColARGB = {
    1254             :     0u, 4u,  1u, 5u,  2u,  6u,  3u,  7u,  // bbggrraa 1st pixel
    1255             :     8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
    1256             : };
    1257             : 
    1258             : // Shuffle table for duplicating 2 fractions into 8 bytes each
    1259             : static uvec8 kShuffleFractions = {
    1260             :     0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
    1261             : };
    1262             : 
    1263             : // Bilinear column filtering for ARGB pixels. SSSE3 version
    1264           0 : void ScaleARGBFilterCols_SSSE3(uint8* dst_argb,
    1265             :                                const uint8* src_argb,
    1266             :                                int dst_width,
    1267             :                                int x,
    1268             :                                int dx) {
    1269             :   intptr_t x0, x1;
    1270             :   asm volatile(
    1271             :       "movdqa    %0,%%xmm4                       \n"
    1272             :       "movdqa    %1,%%xmm5                       \n"
    1273             :       :
    1274             :       : "m"(kShuffleColARGB),   // %0
    1275             :         "m"(kShuffleFractions)  // %1
    1276           0 :       : "xmm4", "xmm5");
    1277             : 
    1278             :   asm volatile (
    1279             :     "movd      %5,%%xmm2                       \n"
    1280             :     "movd      %6,%%xmm3                       \n"
    1281             :     "pcmpeqb   %%xmm6,%%xmm6                   \n"
    1282             :     "psrlw     $0x9,%%xmm6                     \n"
    1283             :     "pextrw    $0x1,%%xmm2,%k3                 \n"
    1284             :     "sub       $0x2,%2                         \n"
    1285             :     "jl        29f                             \n"
    1286             :     "movdqa    %%xmm2,%%xmm0                   \n"
    1287             :     "paddd     %%xmm3,%%xmm0                   \n"
    1288             :     "punpckldq %%xmm0,%%xmm2                   \n"
    1289             :     "punpckldq %%xmm3,%%xmm3                   \n"
    1290             :     "paddd     %%xmm3,%%xmm3                   \n"
    1291             :     "pextrw    $0x3,%%xmm2,%k4                 \n"
    1292             : 
    1293             :     LABELALIGN
    1294             :   "2:                                          \n"
    1295             :     "movdqa    %%xmm2,%%xmm1                   \n"
    1296             :     "paddd     %%xmm3,%%xmm2                   \n"
    1297             :     MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
    1298             :     "psrlw     $0x9,%%xmm1                     \n"
    1299             :     MEMOPREG(movhps,0x00,1,4,4,xmm0)           //  movhps    (%1,%4,4),%%xmm0
    1300             :     "pshufb    %%xmm5,%%xmm1                   \n"
    1301             :     "pshufb    %%xmm4,%%xmm0                   \n"
    1302             :     "pxor      %%xmm6,%%xmm1                   \n"
    1303             :     "pmaddubsw %%xmm1,%%xmm0                   \n"
    1304             :     "psrlw     $0x7,%%xmm0                     \n"
    1305             :     "pextrw    $0x1,%%xmm2,%k3                 \n"
    1306             :     "pextrw    $0x3,%%xmm2,%k4                 \n"
    1307             :     "packuswb  %%xmm0,%%xmm0                   \n"
    1308             :     "movq      %%xmm0," MEMACCESS(0) "         \n"
    1309             :     "lea       " MEMLEA(0x8,0) ",%0            \n"
    1310             :     "sub       $0x2,%2                         \n"
    1311             :     "jge       2b                              \n"
    1312             : 
    1313             :     LABELALIGN
    1314             :   "29:                                         \n"
    1315             :     "add       $0x1,%2                         \n"
    1316             :     "jl        99f                             \n"
    1317             :     "psrlw     $0x9,%%xmm2                     \n"
    1318             :     MEMOPREG(movq,0x00,1,3,4,xmm0)             //  movq      (%1,%3,4),%%xmm0
    1319             :     "pshufb    %%xmm5,%%xmm2                   \n"
    1320             :     "pshufb    %%xmm4,%%xmm0                   \n"
    1321             :     "pxor      %%xmm6,%%xmm2                   \n"
    1322             :     "pmaddubsw %%xmm2,%%xmm0                   \n"
    1323             :     "psrlw     $0x7,%%xmm0                     \n"
    1324             :     "packuswb  %%xmm0,%%xmm0                   \n"
    1325             :     "movd      %%xmm0," MEMACCESS(0) "         \n"
    1326             : 
    1327             :     LABELALIGN
    1328             :   "99:                                         \n"
    1329             :   : "+r"(dst_argb),    // %0
    1330             :     "+r"(src_argb),    // %1
    1331             :     "+rm"(dst_width),  // %2
    1332             :     "=&r"(x0),         // %3
    1333             :     "=&r"(x1)          // %4
    1334             :   : "rm"(x),           // %5
    1335             :     "rm"(dx)           // %6
    1336             :   : "memory", "cc", NACL_R14
    1337             :     "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
    1338           0 :   );
    1339           0 : }
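// Illustrative scalar sketch (not part of libyuv; hypothetical name) of the
// ARGB bilinear column filter above. x and dx are 16.16 fixed point; this
// SSSE3 path keeps a 7-bit fraction and blends each channel as
// (a * (127 - f) + b * f) >> 7 without a rounding term.
static void ScaleARGBFilterCols_Reference(uint8* dst_argb,
                                          const uint8* src_argb,
                                          int dst_width,
                                          int x,
                                          int dx) {
  int j, c;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;
    int f = (x >> 9) & 0x7f;
    const uint8* a = src_argb + 4 * xi;  // left source pixel
    const uint8* b = a + 4;              // right source pixel
    for (c = 0; c < 4; ++c) {
      dst_argb[4 * j + c] = (uint8)((a[c] * (127 - f) + b[c] * f) >> 7);
    }
    x += dx;
  }
}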
    1340             : 
    1341             : // Divide num by div and return as 16.16 fixed point result.
    1342           0 : int FixedDiv_X86(int num, int div) {
    1343             :   asm volatile(
    1344             :       "cdq                                       \n"
    1345             :       "shld      $0x10,%%eax,%%edx               \n"
    1346             :       "shl       $0x10,%%eax                     \n"
    1347             :       "idiv      %1                              \n"
    1348             :       "mov       %0, %%eax                       \n"
    1349             :       : "+a"(num)  // %0
    1350             :       : "c"(div)   // %1
    1351           0 :       : "memory", "cc", "edx");
    1352           0 :   return num;
    1353             : }
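// Illustrative C equivalent (not part of libyuv; hypothetical name) of the
// fixed-point divide above: (num << 16) / div in 16.16 format. For example,
// num = 1, div = 2 yields 0x8000, i.e. 0.5.
static int FixedDiv_Reference(int num, int div) {
  return (int)(((int64)(num) << 16) / div);
}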
    1354             : 
    1355             : // Divide num - 1 by div - 1 and return as 16.16 fixed point result.
    1356           0 : int FixedDiv1_X86(int num, int div) {
    1357             :   asm volatile(
    1358             :       "cdq                                       \n"
    1359             :       "shld      $0x10,%%eax,%%edx               \n"
    1360             :       "shl       $0x10,%%eax                     \n"
    1361             :       "sub       $0x10001,%%eax                  \n"
    1362             :       "sbb       $0x0,%%edx                      \n"
    1363             :       "sub       $0x1,%1                         \n"
    1364             :       "idiv      %1                              \n"
    1365             :       "mov       %0, %%eax                       \n"
    1366             :       : "+a"(num)  // %0
    1367             :       : "c"(div)   // %1
    1368           0 :       : "memory", "cc", "edx");
    1369           0 :   return num;
    1370             : }
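// Illustrative C equivalent (not part of libyuv; hypothetical name) of
// FixedDiv1_X86, matching the asm: ((num << 16) - 0x10001) / (div - 1).
static int FixedDiv1_Reference(int num, int div) {
  return (int)((((int64)(num) << 16) - 0x00010001) / (div - 1));
}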
    1371             : 
    1372             : #endif  // defined(__x86_64__) || defined(__i386__)
    1373             : 
    1374             : #ifdef __cplusplus
    1375             : }  // extern "C"
    1376             : }  // namespace libyuv
    1377             : #endif

Generated by: LCOV version 1.13