LCOV - code coverage report
Current view: top level - media/libtheora/lib/x86 - mmxidct.c (source / functions) Hit Total Coverage
Test: output.info Lines: 0 24 0.0 %
Date: 2017-07-14 16:53:18 Functions: 0 3 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /********************************************************************
       2             :  *                                                                  *
       3             :  * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
       4             :  * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
       5             :  * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
       6             :  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
       7             :  *                                                                  *
       8             :  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
       9             :  * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
      10             :  *                                                                  *
      11             :  ********************************************************************
      12             : 
      13             :   function:
      14             :     last mod: $Id: mmxidct.c 17446 2010-09-23 20:06:20Z tterribe $
      15             : 
      16             :  ********************************************************************/
      17             : 
      18             : /*MMX acceleration of Theora's iDCT.
      19             :   Originally written by Rudolf Marek, based on code from On2's VP3.*/
      20             : #include "x86int.h"
      21             : #include "../dct.h"
      22             : 
      23             : #if defined(OC_X86_ASM)
      24             : 
      25             : /*These are offsets into a table of constants (none is defined in this file).*/
      26             : /*7 rows of cosines, in order: pi/16 * (1 ... 7).*/
      27             : #define OC_COSINE_OFFSET (0)
      28             : /*A row of 8's.*/
      29             : #define OC_EIGHT_OFFSET  (56)
      30             : /*NOTE(review): neither macro is referenced in this file; the asm below reads
      31             :    operand c at literal offsets 0x00..0x70 instead. Confirm they are needed.*/
      32             : /*Opening stage shared by OC_ROW_IDCT and OC_COLUMN_IDCT: it loads from _x
      33             :    and spills intermediates to I(1) and I(2) of _y. 38 cycles*/
      34             : #define OC_IDCT_BEGIN(_y,_x) \
      35             :   "#OC_IDCT_BEGIN\n\t" \
      36             :   "movq "OC_I(3,_x)",%%mm2\n\t" \
      37             :   "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
      38             :   "movq %%mm2,%%mm4\n\t" \
      39             :   "movq "OC_J(5,_x)",%%mm7\n\t" \
      40             :   "pmulhw %%mm6,%%mm4\n\t" \
      41             :   "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
      42             :   "pmulhw %%mm7,%%mm6\n\t" \
      43             :   "movq %%mm1,%%mm5\n\t" \
      44             :   "pmulhw %%mm2,%%mm1\n\t" \
      45             :   "movq "OC_I(1,_x)",%%mm3\n\t" \
      46             :   "pmulhw %%mm7,%%mm5\n\t" \
      47             :   "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
      48             :   "paddw %%mm2,%%mm4\n\t" \
      49             :   "paddw %%mm7,%%mm6\n\t" \
      50             :   "paddw %%mm1,%%mm2\n\t" \
      51             :   "movq "OC_J(7,_x)",%%mm1\n\t" \
      52             :   "paddw %%mm5,%%mm7\n\t" \
      53             :   "movq %%mm0,%%mm5\n\t" \
      54             :   "pmulhw %%mm3,%%mm0\n\t" \
      55             :   "paddw %%mm7,%%mm4\n\t" \
      56             :   "pmulhw %%mm1,%%mm5\n\t" \
      57             :   "movq "OC_MEM_OFFS(0x70,c)",%%mm7\n\t" \
      58             :   "psubw %%mm2,%%mm6\n\t" \
      59             :   "paddw %%mm3,%%mm0\n\t" \
      60             :   "pmulhw %%mm7,%%mm3\n\t" \
      61             :   "movq "OC_I(2,_x)",%%mm2\n\t" \
      62             :   "pmulhw %%mm1,%%mm7\n\t" \
      63             :   "paddw %%mm1,%%mm5\n\t" \
      64             :   "movq %%mm2,%%mm1\n\t" \
      65             :   "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm2\n\t" \
      66             :   "psubw %%mm5,%%mm3\n\t" \
      67             :   "movq "OC_J(6,_x)",%%mm5\n\t" \
      68             :   "paddw %%mm7,%%mm0\n\t" \
      69             :   "movq %%mm5,%%mm7\n\t" \
      70             :   "psubw %%mm4,%%mm0\n\t" \
      71             :   "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
      72             :   "paddw %%mm1,%%mm2\n\t" \
      73             :   "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
      74             :   "paddw %%mm4,%%mm4\n\t" \
      75             :   "paddw %%mm0,%%mm4\n\t" \
      76             :   "psubw %%mm6,%%mm3\n\t" \
      77             :   "paddw %%mm7,%%mm5\n\t" \
      78             :   "paddw %%mm6,%%mm6\n\t" \
      79             :   "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
      80             :   "paddw %%mm3,%%mm6\n\t" \
      81             :   "movq %%mm4,"OC_I(1,_y)"\n\t" \
      82             :   "psubw %%mm5,%%mm1\n\t" \
      83             :   "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
      84             :   "movq %%mm3,%%mm5\n\t" \
      85             :   "pmulhw %%mm4,%%mm3\n\t" \
      86             :   "paddw %%mm2,%%mm7\n\t" \
      87             :   "movq %%mm6,"OC_I(2,_y)"\n\t" \
      88             :   "movq %%mm0,%%mm2\n\t" \
      89             :   "movq "OC_I(0,_x)",%%mm6\n\t" \
      90             :   "pmulhw %%mm4,%%mm0\n\t" \
      91             :   "paddw %%mm3,%%mm5\n\t" \
      92             :   "movq "OC_J(4,_x)",%%mm3\n\t" \
      93             :   "psubw %%mm1,%%mm5\n\t" \
      94             :   "paddw %%mm0,%%mm2\n\t" \
      95             :   "psubw %%mm3,%%mm6\n\t" \
      96             :   "movq %%mm6,%%mm0\n\t" \
      97             :   "pmulhw %%mm4,%%mm6\n\t" \
      98             :   "paddw %%mm3,%%mm3\n\t" \
      99             :   "paddw %%mm1,%%mm1\n\t" \
     100             :   "paddw %%mm0,%%mm3\n\t" \
     101             :   "paddw %%mm5,%%mm1\n\t" \
     102             :   "pmulhw %%mm3,%%mm4\n\t" \
     103             :   "paddw %%mm0,%%mm6\n\t" \
     104             :   "psubw %%mm2,%%mm6\n\t" \
     105             :   "paddw %%mm2,%%mm2\n\t" \
     106             :   "movq "OC_I(1,_y)",%%mm0\n\t" \
     107             :   "paddw %%mm6,%%mm2\n\t" \
     108             :   "paddw %%mm3,%%mm4\n\t" \
     109             :   "psubw %%mm1,%%mm2\n\t" \
     110             :   "#end OC_IDCT_BEGIN\n\t" \
     111             : 
     112             : /*38+8=46 cycles.*/
     113             : #define OC_ROW_IDCT(_y,_x) \
     114             :   "#OC_ROW_IDCT\n" \
     115             :   OC_IDCT_BEGIN(_y,_x) \
     116             :   /*r3=D'*/ \
     117             :   "movq "OC_I(2,_y)",%%mm3\n\t" \
     118             :   /*r4=E'=E-G*/ \
     119             :   "psubw %%mm7,%%mm4\n\t" \
     120             :   /*r1=H'+H'*/ \
     121             :   "paddw %%mm1,%%mm1\n\t" \
     122             :   /*r7=G+G*/ \
     123             :   "paddw %%mm7,%%mm7\n\t" \
     124             :   /*r1=R1=A''+H'*/ \
     125             :   "paddw %%mm2,%%mm1\n\t" \
     126             :   /*r7=G'=E+G*/ \
     127             :   "paddw %%mm4,%%mm7\n\t" \
     128             :   /*r4=R4=E'-D'*/ \
     129             :   "psubw %%mm3,%%mm4\n\t" \
     130             :   "paddw %%mm3,%%mm3\n\t" \
     131             :   /*r6=R6=F'-B''*/ \
     132             :   "psubw %%mm5,%%mm6\n\t" \
     133             :   "paddw %%mm5,%%mm5\n\t" \
     134             :   /*r3=R3=E'+D'*/ \
     135             :   "paddw %%mm4,%%mm3\n\t" \
     136             :   /*r5=R5=F'+B''*/ \
     137             :   "paddw %%mm6,%%mm5\n\t" \
     138             :   /*r7=R7=G'-C'*/ \
     139             :   "psubw %%mm0,%%mm7\n\t" \
     140             :   "paddw %%mm0,%%mm0\n\t" \
     141             :   /*Save R1.*/ \
     142             :   "movq %%mm1,"OC_I(1,_y)"\n\t" \
     143             :   /*r0=R0=G'+C'*/ \
     144             :   "paddw %%mm7,%%mm0\n\t" \
     145             :   "#end OC_ROW_IDCT\n\t" \
     146             : 
     147             : /*The following macro does two 4x4 transposes in place.
     148             :   At entry, we assume:
     149             :     r0 = a3 a2 a1 a0
     150             :   I(1) = b3 b2 b1 b0
     151             :     r2 = c3 c2 c1 c0
     152             :     r3 = d3 d2 d1 d0
     153             : 
     154             :     r4 = e3 e2 e1 e0
     155             :     r5 = f3 f2 f1 f0
     156             :     r6 = g3 g2 g1 g0
     157             :     r7 = h3 h2 h1 h0
     158             : 
     159             :   At exit, we have:
     160             :   I(0) = d0 c0 b0 a0
     161             :   I(1) = d1 c1 b1 a1
     162             :   I(2) = d2 c2 b2 a2
     163             :   I(3) = d3 c3 b3 a3
     164             : 
     165             :   J(4) = h0 g0 f0 e0
     166             :   J(5) = h1 g1 f1 e1
     167             :   J(6) = h2 g2 f2 e2
     168             :   J(7) = h3 g3 f3 e3
     169             : 
     170             :   I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
     171             :   J(4) J(5) J(6) J(7) is the transpose of r4  r5  r6 r7.
     172             : 
     173             :   Since r1 is free at entry, we calculate the Js first.*/
     174             : /*r0 is parked in I(0) while the Js are built, then reloaded. 19 cycles.*/
     175             : #define OC_TRANSPOSE(_y) \
     176             :   "#OC_TRANSPOSE\n\t" \
     177             :   "movq %%mm4,%%mm1\n\t" \
     178             :   "punpcklwd %%mm5,%%mm4\n\t" \
     179             :   "movq %%mm0,"OC_I(0,_y)"\n\t" \
     180             :   "punpckhwd %%mm5,%%mm1\n\t" \
     181             :   "movq %%mm6,%%mm0\n\t" \
     182             :   "punpcklwd %%mm7,%%mm6\n\t" \
     183             :   "movq %%mm4,%%mm5\n\t" \
     184             :   "punpckldq %%mm6,%%mm4\n\t" \
     185             :   "punpckhdq %%mm6,%%mm5\n\t" \
     186             :   "movq %%mm1,%%mm6\n\t" \
     187             :   "movq %%mm4,"OC_J(4,_y)"\n\t" \
     188             :   "punpckhwd %%mm7,%%mm0\n\t" \
     189             :   "movq %%mm5,"OC_J(5,_y)"\n\t" \
     190             :   "punpckhdq %%mm0,%%mm6\n\t" \
     191             :   "movq "OC_I(0,_y)",%%mm4\n\t" \
     192             :   "punpckldq %%mm0,%%mm1\n\t" \
     193             :   "movq "OC_I(1,_y)",%%mm5\n\t" \
     194             :   "movq %%mm4,%%mm0\n\t" \
     195             :   "movq %%mm6,"OC_J(7,_y)"\n\t" \
     196             :   "punpcklwd %%mm5,%%mm0\n\t" \
     197             :   "movq %%mm1,"OC_J(6,_y)"\n\t" \
     198             :   "punpckhwd %%mm5,%%mm4\n\t" \
     199             :   "movq %%mm2,%%mm5\n\t" \
     200             :   "punpcklwd %%mm3,%%mm2\n\t" \
     201             :   "movq %%mm0,%%mm1\n\t" \
     202             :   "punpckldq %%mm2,%%mm0\n\t" \
     203             :   "punpckhdq %%mm2,%%mm1\n\t" \
     204             :   "movq %%mm4,%%mm2\n\t" \
     205             :   "movq %%mm0,"OC_I(0,_y)"\n\t" \
     206             :   "punpckhwd %%mm3,%%mm5\n\t" \
     207             :   "movq %%mm1,"OC_I(1,_y)"\n\t" \
     208             :   "punpckhdq %%mm5,%%mm4\n\t" \
     209             :   "punpckldq %%mm5,%%mm2\n\t" \
     210             :   "movq %%mm4,"OC_I(3,_y)"\n\t" \
     211             :   "movq %%mm2,"OC_I(2,_y)"\n\t" \
     212             :   "#end OC_TRANSPOSE\n\t" \
     213             : 
     214             : /*Column pass: adds the constants at c+0x00, then shifts right by 4. 38+19=57 cycles.*/
     215             : #define OC_COLUMN_IDCT(_y) \
     216             :   "#OC_COLUMN_IDCT\n" \
     217             :   OC_IDCT_BEGIN(_y,_y) \
     218             :   "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
     219             :   /*r1=H'+H'*/ \
     220             :   "paddw %%mm1,%%mm1\n\t" \
     221             :   /*r1=R1=A''+H'*/ \
     222             :   "paddw %%mm2,%%mm1\n\t" \
     223             :   /*r2=NR2*/ \
     224             :   "psraw $4,%%mm2\n\t" \
     225             :   /*r4=E'=E-G*/ \
     226             :   "psubw %%mm7,%%mm4\n\t" \
     227             :   /*r1=NR1*/ \
     228             :   "psraw $4,%%mm1\n\t" \
     229             :   /*r3=D'*/ \
     230             :   "movq "OC_I(2,_y)",%%mm3\n\t" \
     231             :   /*r7=G+G*/ \
     232             :   "paddw %%mm7,%%mm7\n\t" \
     233             :   /*Store NR2 at I(2).*/ \
     234             :   "movq %%mm2,"OC_I(2,_y)"\n\t" \
     235             :   /*r7=G'=E+G*/ \
     236             :   "paddw %%mm4,%%mm7\n\t" \
     237             :   /*Store NR1 at I(1).*/ \
     238             :   "movq %%mm1,"OC_I(1,_y)"\n\t" \
     239             :   /*r4=R4=E'-D'*/ \
     240             :   "psubw %%mm3,%%mm4\n\t" \
     241             :   "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
     242             :   /*r3=D'+D'*/ \
     243             :   "paddw %%mm3,%%mm3\n\t" \
     244             :   /*r3=R3=E'+D'*/ \
     245             :   "paddw %%mm4,%%mm3\n\t" \
     246             :   /*r4=NR4*/ \
     247             :   "psraw $4,%%mm4\n\t" \
     248             :   /*r6=R6=F'-B''*/ \
     249             :   "psubw %%mm5,%%mm6\n\t" \
     250             :   /*r3=NR3*/ \
     251             :   "psraw $4,%%mm3\n\t" \
     252             :   "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
     253             :   /*r5=B''+B''*/ \
     254             :   "paddw %%mm5,%%mm5\n\t" \
     255             :   /*r5=R5=F'+B''*/ \
     256             :   "paddw %%mm6,%%mm5\n\t" \
     257             :   /*r6=NR6*/ \
     258             :   "psraw $4,%%mm6\n\t" \
     259             :   /*Store NR4 at J(4).*/ \
     260             :   "movq %%mm4,"OC_J(4,_y)"\n\t" \
     261             :   /*r5=NR5*/ \
     262             :   "psraw $4,%%mm5\n\t" \
     263             :   /*Store NR3 at I(3).*/ \
     264             :   "movq %%mm3,"OC_I(3,_y)"\n\t" \
     265             :   /*r7=R7=G'-C'*/ \
     266             :   "psubw %%mm0,%%mm7\n\t" \
     267             :   "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
     268             :   /*r0=C'+C'*/ \
     269             :   "paddw %%mm0,%%mm0\n\t" \
     270             :   /*r0=R0=G'+C'*/ \
     271             :   "paddw %%mm7,%%mm0\n\t" \
     272             :   /*r7=NR7*/ \
     273             :   "psraw $4,%%mm7\n\t" \
     274             :   /*Store NR6 at J(6).*/ \
     275             :   "movq %%mm6,"OC_J(6,_y)"\n\t" \
     276             :   /*r0=NR0*/ \
     277             :   "psraw $4,%%mm0\n\t" \
     278             :   /*Store NR5 at J(5).*/ \
     279             :   "movq %%mm5,"OC_J(5,_y)"\n\t" \
     280             :   /*Store NR7 at J(7).*/ \
     281             :   "movq %%mm7,"OC_J(7,_y)"\n\t" \
     282             :   /*Store NR0 at I(0).*/ \
     283             :   "movq %%mm0,"OC_I(0,_y)"\n\t" \
     284             :   "#end OC_COLUMN_IDCT\n\t" \
     285             : 
     286           0 : static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
     287             :   /*This routine accepts an 8x8 matrix, but in partially transposed form.
     288             :     Every 4x4 block is transposed.*/
     289           0 :   __asm__ __volatile__(
     290             : #define OC_I(_k,_y)   OC_MEM_OFFS((_k)*16,_y)
     291             : #define OC_J(_k,_y)   OC_MEM_OFFS(((_k)-4)*16+8,_y)
     292             :     OC_ROW_IDCT(y,x)
     293             :     OC_TRANSPOSE(y)
     294             : #undef  OC_I
     295             : #undef  OC_J
     296             : #define OC_I(_k,_y)   OC_MEM_OFFS((_k)*16+64,_y)
     297             : #define OC_J(_k,_y)   OC_MEM_OFFS(((_k)-4)*16+72,_y)
     298             :     OC_ROW_IDCT(y,x)
     299             :     OC_TRANSPOSE(y)
     300             : #undef  OC_I
     301             : #undef  OC_J
     302             : #define OC_I(_k,_y)   OC_MEM_OFFS((_k)*16,_y)
     303             : #define OC_J(_k,_y)   OC_I(_k,_y)
     304             :     OC_COLUMN_IDCT(y)
     305             : #undef  OC_I
     306             : #undef  OC_J
     307             : #define OC_I(_k,_y)   OC_MEM_OFFS((_k)*16+8,_y)
     308             : #define OC_J(_k,_y)   OC_I(_k,_y)
     309             :     OC_COLUMN_IDCT(y)
     310             : #undef  OC_I
     311             : #undef  OC_J
     312           0 :     :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
     313           0 :     :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
     314           0 :      [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
     315             :   );
     316           0 :   if(_x!=_y){/*Not in place: clear _x, 16 coefficients per iteration.*/
     317             :     int i;
     318           0 :     __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
     319           0 :     for(i=0;i<4;i++){
     320           0 :       __asm__ __volatile__(
     321             :         "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
     322             :         "movq %%mm0,"OC_MEM_OFFS(0x08,x)"\n\t"
     323             :         "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
     324             :         "movq %%mm0,"OC_MEM_OFFS(0x18,x)"\n\t"
     325           0 :         :[x]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_x+16*i,16)
     326             :       );
     327             :     }
     328             :   }
     329           0 : }
     330             : 
     331             : /*Variant of OC_IDCT_BEGIN that loads only I(0)..I(3) of _x. 25 cycles.*/
     332             : #define OC_IDCT_BEGIN_10(_y,_x) \
     333             :  "#OC_IDCT_BEGIN_10\n\t" \
     334             :  "movq "OC_I(3,_x)",%%mm2\n\t" \
     335             :  "nop\n\t" \
     336             :  "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
     337             :  "movq %%mm2,%%mm4\n\t" \
     338             :  "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
     339             :  "pmulhw %%mm6,%%mm4\n\t" \
     340             :  "movq "OC_I(1,_x)",%%mm3\n\t" \
     341             :  "pmulhw %%mm2,%%mm1\n\t" \
     342             :  "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
     343             :  "paddw %%mm2,%%mm4\n\t" \
     344             :  "pxor %%mm6,%%mm6\n\t" \
     345             :  "paddw %%mm1,%%mm2\n\t" \
     346             :  "movq "OC_I(2,_x)",%%mm5\n\t" \
     347             :  "pmulhw %%mm3,%%mm0\n\t" \
     348             :  "movq %%mm5,%%mm1\n\t" \
     349             :  "paddw %%mm3,%%mm0\n\t" \
     350             :  "pmulhw "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
     351             :  "psubw %%mm2,%%mm6\n\t" \
     352             :  "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
     353             :  "psubw %%mm4,%%mm0\n\t" \
     354             :  "movq "OC_I(2,_x)",%%mm7\n\t" \
     355             :  "paddw %%mm4,%%mm4\n\t" \
     356             :  "paddw %%mm5,%%mm7\n\t" \
     357             :  "paddw %%mm0,%%mm4\n\t" \
     358             :  "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
     359             :  "psubw %%mm6,%%mm3\n\t" \
     360             :  "movq %%mm4,"OC_I(1,_y)"\n\t" \
     361             :  "paddw %%mm6,%%mm6\n\t" \
     362             :  "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
     363             :  "paddw %%mm3,%%mm6\n\t" \
     364             :  "movq %%mm3,%%mm5\n\t" \
     365             :  "pmulhw %%mm4,%%mm3\n\t" \
     366             :  "movq %%mm6,"OC_I(2,_y)"\n\t" \
     367             :  "movq %%mm0,%%mm2\n\t" \
     368             :  "movq "OC_I(0,_x)",%%mm6\n\t" \
     369             :  "pmulhw %%mm4,%%mm0\n\t" \
     370             :  "paddw %%mm3,%%mm5\n\t" \
     371             :  "paddw %%mm0,%%mm2\n\t" \
     372             :  "psubw %%mm1,%%mm5\n\t" \
     373             :  "pmulhw %%mm4,%%mm6\n\t" \
     374             :  "paddw "OC_I(0,_x)",%%mm6\n\t" \
     375             :  "paddw %%mm1,%%mm1\n\t" \
     376             :  "movq %%mm6,%%mm4\n\t" \
     377             :  "paddw %%mm5,%%mm1\n\t" \
     378             :  "psubw %%mm2,%%mm6\n\t" \
     379             :  "paddw %%mm2,%%mm2\n\t" \
     380             :  "movq "OC_I(1,_y)",%%mm0\n\t" \
     381             :  "paddw %%mm6,%%mm2\n\t" \
     382             :  "psubw %%mm1,%%mm2\n\t" \
     383             :  "nop\n\t" \
     384             :  "#end OC_IDCT_BEGIN_10\n\t" \
     385             : 
     386             : /*Row pass on OC_IDCT_BEGIN_10; same tail as OC_ROW_IDCT. 25+8=33 cycles.*/
     387             : #define OC_ROW_IDCT_10(_y,_x) \
     388             :  "#OC_ROW_IDCT_10\n\t" \
     389             :  OC_IDCT_BEGIN_10(_y,_x) \
     390             :  /*r3=D'*/ \
     391             :  "movq "OC_I(2,_y)",%%mm3\n\t" \
     392             :  /*r4=E'=E-G*/ \
     393             :  "psubw %%mm7,%%mm4\n\t" \
     394             :  /*r1=H'+H'*/ \
     395             :  "paddw %%mm1,%%mm1\n\t" \
     396             :  /*r7=G+G*/ \
     397             :  "paddw %%mm7,%%mm7\n\t" \
     398             :  /*r1=R1=A''+H'*/ \
     399             :  "paddw %%mm2,%%mm1\n\t" \
     400             :  /*r7=G'=E+G*/ \
     401             :  "paddw %%mm4,%%mm7\n\t" \
     402             :  /*r4=R4=E'-D'*/ \
     403             :  "psubw %%mm3,%%mm4\n\t" \
     404             :  "paddw %%mm3,%%mm3\n\t" \
     405             :  /*r6=R6=F'-B''*/ \
     406             :  "psubw %%mm5,%%mm6\n\t" \
     407             :  "paddw %%mm5,%%mm5\n\t" \
     408             :  /*r3=R3=E'+D'*/ \
     409             :  "paddw %%mm4,%%mm3\n\t" \
     410             :  /*r5=R5=F'+B''*/ \
     411             :  "paddw %%mm6,%%mm5\n\t" \
     412             :  /*r7=R7=G'-C'*/ \
     413             :  "psubw %%mm0,%%mm7\n\t" \
     414             :  "paddw %%mm0,%%mm0\n\t" \
     415             :  /*Save R1.*/ \
     416             :  "movq %%mm1,"OC_I(1,_y)"\n\t" \
     417             :  /*r0=R0=G'+C'*/ \
     418             :  "paddw %%mm7,%%mm0\n\t" \
     419             :  "#end OC_ROW_IDCT_10\n\t" \
     420             : 
     421             : /*25+19=44 cycles.*/
     422             : #define OC_COLUMN_IDCT_10(_y) \
     423             :  "#OC_COLUMN_IDCT_10\n\t" \
     424             :  OC_IDCT_BEGIN_10(_y,_y) \
     425             :  "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
     426             :  /*r1=H'+H'*/ \
     427             :  "paddw %%mm1,%%mm1\n\t" \
     428             :  /*r1=R1=A''+H'*/ \
     429             :  "paddw %%mm2,%%mm1\n\t" \
     430             :  /*r2=NR2*/ \
     431             :  "psraw $4,%%mm2\n\t" \
     432             :  /*r4=E'=E-G*/ \
     433             :  "psubw %%mm7,%%mm4\n\t" \
     434             :  /*r1=NR1*/ \
     435             :  "psraw $4,%%mm1\n\t" \
     436             :  /*r3=D'*/ \
     437             :  "movq "OC_I(2,_y)",%%mm3\n\t" \
     438             :  /*r7=G+G*/ \
     439             :  "paddw %%mm7,%%mm7\n\t" \
     440             :  /*Store NR2 at I(2).*/ \
     441             :  "movq %%mm2,"OC_I(2,_y)"\n\t" \
     442             :  /*r7=G'=E+G*/ \
     443             :  "paddw %%mm4,%%mm7\n\t" \
     444             :  /*Store NR1 at I(1).*/ \
     445             :  "movq %%mm1,"OC_I(1,_y)"\n\t" \
     446             :  /*r4=R4=E'-D'*/ \
     447             :  "psubw %%mm3,%%mm4\n\t" \
     448             :  "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
     449             :  /*r3=D'+D'*/ \
     450             :  "paddw %%mm3,%%mm3\n\t" \
     451             :  /*r3=R3=E'+D'*/ \
     452             :  "paddw %%mm4,%%mm3\n\t" \
     453             :  /*r4=NR4*/ \
     454             :  "psraw $4,%%mm4\n\t" \
     455             :  /*r6=R6=F'-B''*/ \
     456             :  "psubw %%mm5,%%mm6\n\t" \
     457             :  /*r3=NR3*/ \
     458             :  "psraw $4,%%mm3\n\t" \
     459             :  "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
     460             :  /*r5=B''+B''*/ \
     461             :  "paddw %%mm5,%%mm5\n\t" \
     462             :  /*r5=R5=F'+B''*/ \
     463             :  "paddw %%mm6,%%mm5\n\t" \
     464             :  /*r6=NR6*/ \
     465             :  "psraw $4,%%mm6\n\t" \
     466             :  /*Store NR4 at J(4).*/ \
     467             :  "movq %%mm4,"OC_J(4,_y)"\n\t" \
     468             :  /*r5=NR5*/ \
     469             :  "psraw $4,%%mm5\n\t" \
     470             :  /*Store NR3 at I(3).*/ \
     471             :  "movq %%mm3,"OC_I(3,_y)"\n\t" \
     472             :  /*r7=R7=G'-C'*/ \
     473             :  "psubw %%mm0,%%mm7\n\t" \
     474             :  "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
     475             :  /*r0=C'+C'*/ \
     476             :  "paddw %%mm0,%%mm0\n\t" \
     477             :  /*r0=R0=G'+C'*/ \
     478             :  "paddw %%mm7,%%mm0\n\t" \
     479             :  /*r7=NR7*/ \
     480             :  "psraw $4,%%mm7\n\t" \
     481             :  /*Store NR6 at J(6).*/ \
     482             :  "movq %%mm6,"OC_J(6,_y)"\n\t" \
     483             :  /*r0=NR0*/ \
     484             :  "psraw $4,%%mm0\n\t" \
     485             :  /*Store NR5 at J(5).*/ \
     486             :  "movq %%mm5,"OC_J(5,_y)"\n\t" \
     487             :  /*Store NR7 at J(7).*/ \
     488             :  "movq %%mm7,"OC_J(7,_y)"\n\t" \
     489             :  /*Store NR0 at I(0).*/ \
     490             :  "movq %%mm0,"OC_I(0,_y)"\n\t" \
     491             :  "#end OC_COLUMN_IDCT_10\n\t" \
     492             : 
     493           0 : static void oc_idct8x8_10_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
     494           0 :   __asm__ __volatile__(
     495             : #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
     496             : #define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y)
     497             :     /*Done with dequant, descramble, and partial transpose.
     498             :       Now do the iDCT itself.*/
     499             :     OC_ROW_IDCT_10(y,x)
     500             :     OC_TRANSPOSE(y)
     501             : #undef  OC_I
     502             : #undef  OC_J
     503             : #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
     504             : #define OC_J(_k,_y) OC_I(_k,_y)
     505             :     OC_COLUMN_IDCT_10(y)
     506             : #undef  OC_I
     507             : #undef  OC_J
     508             : #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y)
     509             : #define OC_J(_k,_y) OC_I(_k,_y)
     510             :     OC_COLUMN_IDCT_10(y)
     511             : #undef  OC_I
     512             : #undef  OC_J
     513           0 :     :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
     514           0 :     :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
     515           0 :      [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
     516             :   );
     517           0 :   if(_x!=_y){/*Not in place: clear only the quadwords at 0x00,0x10,0x20,0x30.*/
     518           0 :     __asm__ __volatile__(
     519             :       "pxor %%mm0,%%mm0\n\t"
     520             :       "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
     521             :       "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
     522             :       "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
     523             :       "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
     524           0 :       :[x]"+m"OC_ARRAY_OPERAND(ogg_int16_t,_x,28)
     525             :     );
     526             :   }
     527           0 : }
     528             : 
     529             : /*Performs an inverse 8x8 Type-II DCT transform.
     530             :   The input is assumed to be scaled by a factor of 4 relative to the
     531             :    orthonormal version of the transform.*/
     532           0 : void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
     533             :   /*_last_zzi is subtly different from an actual count of the number of
     534             :      coefficients we decoded for this block.
     535             :     It contains the value of zzi BEFORE the final token in the block was
     536             :      decoded.
     537             :     In most cases this is an EOB token (the continuation of an EOB run from a
     538             :      previous block counts), and so this is the same as the coefficient count.
     539             :     However, in the case that the last token was NOT an EOB token, but filled
     540             :      the block up with exactly 64 coefficients, _last_zzi will be less than 64.
     541             :     Provided the last token was not a pure zero run, the minimum value it can
     542             :      be is 46, and so that doesn't affect any of the cases in this routine.
     543             :     However, if the last token WAS a pure zero run of length 63, then _last_zzi
     544             :      will be 1 while the number of coefficients decoded is 64.
     545             :     Thus, we will trigger the following special case, where the real
     546             :      coefficient count would not.
     547             :     Note also that a zero run of length 64 will give _last_zzi a value of 0,
     548             :      but we still process the DC coefficient, which might have a non-zero value
     549             :      due to DC prediction.
     550             :     Although convoluted, this is arguably the correct behavior: it allows us to
     551             :      use a smaller transform when the block ends with a long zero run instead
     552             :      of a normal EOB token.
     553             :     It could be smarter... multiple separate zero runs at the end of a block
     554             :      will fool it, but an encoder that generates these really deserves what it
     555             :      gets.
     556             :     Needless to say we inherited this approach from VP3.*/
     557             :   /*Then perform the iDCT.*/
     558           0 :   if(_last_zzi<=10)oc_idct8x8_10_mmx(_y,_x);
     559           0 :   else oc_idct8x8_slow_mmx(_y,_x);
     560           0 : }
     561             : 
     562             : #endif

Generated by: LCOV version 1.13