mirror of
				https://github.com/nyanmisaka/ffmpeg-rockchip.git
				synced 2025-10-31 20:42:49 +08:00 
			
		
		
		
	
		
			
				
	
	
		
			890 lines
		
	
	
		
			38 KiB
		
	
	
	
		
			NASM
		
	
	
	
	
	
			
		
		
	
	
			890 lines
		
	
	
		
			38 KiB
		
	
	
	
		
			NASM
		
	
	
	
	
	
| ;
 | |
| ; Simple IDCT MMX
 | |
| ;
 | |
| ; Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
 | |
| ;
 | |
| ; Conversion from gcc syntax to x264asm syntax with minimal modifications
 | |
| ; by James Darnley <jdarnley@obe.tv>.
 | |
| ;
 | |
| ; This file is part of FFmpeg.
 | |
| ;
 | |
| ; FFmpeg is free software; you can redistribute it and/or
 | |
| ; modify it under the terms of the GNU Lesser General Public
 | |
| ; License as published by the Free Software Foundation; either
 | |
| ; version 2.1 of the License, or (at your option) any later version.
 | |
| ;
 | |
| ; FFmpeg is distributed in the hope that it will be useful,
 | |
| ; but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
| ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | |
| ; Lesser General Public License for more details.
 | |
| ;
 | |
| ; You should have received a copy of the GNU Lesser General Public
 | |
| ; License along with FFmpeg; if not, write to the Free Software
 | |
| ; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 | |
| ;/
 | |
| 
 | |
| %include "libavutil/x86/x86util.asm"
 | |
| 
 | |
| SECTION_RODATA
 | |
| 
 | |
| cextern pb_80
 | |
| 
 | |
| wm1010: dw 0, 0xffff, 0, 0xffff ; word mask: clears words 0 and 2, keeps words 1 and 3 (ANDed with the first input quadword in DC_COND_IDCT)
 | |
| d40000: dd 4 << 16, 0 ; bias added to the low dword in the DC-only path of DC_COND_IDCT
 | |
| 
 | |
| ; 23170.475006
 | |
| ; 22725.260826
 | |
| ; 21406.727617
 | |
| ; 19265.545870
 | |
| ; 16384.000000
 | |
| ; 12872.826198
 | |
| ; 8866.956905
 | |
| ; 4520.335430
 | |
| 
 | |
| %define C0 23170 ; cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 | |
| %define C1 22725 ; cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 | |
| %define C2 21407 ; cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 | |
| %define C3 19266 ; cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 | |
| %define C4 16383 ; cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5 (note: minus, unlike the others)
 | |
| %define C5 12873 ; cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 | |
| %define C6 8867  ; cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 | |
| %define C7 4520  ; cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 | |
| 
 | |
| %define ROW_SHIFT 11 ; used below to build the rounding words at the start of coeffs
 | |
| %define COL_SHIFT 20 ; 6 (NOTE(review): COL_SHIFT is not referenced in the visible part of this file; meaning of the "6" unconfirmed)
 | |
| 
 | |
| coeffs: ; constant table; the macros below address it by byte offset (coeffs + N)
 | |
|     dw 1 << (ROW_SHIFT - 1), 0 ; coeffs + 0: two dwords of 1 << (ROW_SHIFT - 1), added as bias in Z_COND_IDCT
 | |
|     dw 1 << (ROW_SHIFT - 1), 0
 | |
|     dw 1 << (ROW_SHIFT - 1), 1 ; coeffs + 8: same bias with an extra 1 in the high word of the low dword, added in DC_COND_IDCT
 | |
|     dw 1 << (ROW_SHIFT - 1), 0
 | |
| 
 | |
|     dw C4,  C4,  C4,  C4 ; coeffs + 16
 | |
|     dw C4, -C4,  C4, -C4 ; coeffs + 24
 | |
| 
 | |
|     dw C2,  C6,  C2,  C6 ; coeffs + 32
 | |
|     dw C6, -C2,  C6, -C2 ; coeffs + 40
 | |
| 
 | |
|     dw C1,  C3,  C1,  C3 ; coeffs + 48
 | |
|     dw C5,  C7,  C5,  C7 ; coeffs + 56
 | |
| 
 | |
|     dw C3, -C7,  C3, -C7 ; coeffs + 64
 | |
|     dw -C1, -C5, -C1, -C5 ; coeffs + 72
 | |
| 
 | |
|     dw C5, -C1,  C5, -C1 ; coeffs + 80
 | |
|     dw C7,  C3,  C7,  C3 ; coeffs + 88
 | |
| 
 | |
|     dw C7, -C5,  C7, -C5 ; coeffs + 96
 | |
|     dw C3, -C1,  C3, -C1 ; coeffs + 104
 | |
| 
 | |
| SECTION .text
 | |
| 
 | |
| %macro DC_COND_IDCT 7 ; %1-%4: byte offsets from blockq of the four input quadwords; %5: destination address; %6: not referenced in this body; %7: right-shift count. Uses mm0-mm7 and t0d; takes a shortcut when only r0/R0 are nonzero.
 | |
|     movq            mm0, [blockq + %1]  ; R4     R0      r4      r0
 | |
|     movq            mm1, [blockq + %2]  ; R6     R2      r6      r2
 | |
|     movq            mm2, [blockq + %3]  ; R3     R1      r3      r1
 | |
|     movq            mm3, [blockq + %4]  ; R7     R5      r7      r5
 | |
|     movq            mm4, [wm1010]       ; mask that clears the r0/R0 words
 | |
|     pand            mm4, mm0            ; R4     0       r4      0
 | |
|     por             mm4, mm1            ; OR in every other input word
 | |
|     por             mm4, mm2
 | |
|     por             mm4, mm3
 | |
|     packssdw        mm4, mm4            ; saturating pack: low 32 bits are zero only if all 64 bits were zero
 | |
|     movd            t0d, mm4
 | |
|     or              t0d, t0d            ; ZF set when everything except r0/R0 is zero
 | |
|     jz              %%1                 ; DC-only shortcut
 | |
|     movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
 | |
|     pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
 | |
|     movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
 | |
|     pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
 | |
|     movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
 | |
|     pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
 | |
|     movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
 | |
|     pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
 | |
|     movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
 | |
|     pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
 | |
|     paddd           mm4, [coeffs + 8]   ; add the bias dwords stored at coeffs + 8
 | |
|     movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
 | |
|     paddd           mm4, mm5            ; A0             a0
 | |
|     psubd           mm6, mm5            ; A3             a3
 | |
|     movq            mm5, [coeffs + 56]  ; C7     C5      C7      C5
 | |
|     pmaddwd         mm5, mm3            ; C7R7+C5R5      C7r7+C5r5
 | |
|     paddd           mm0, [coeffs + 8]   ; same bias on the -C4R4+C4R0 term
 | |
|     paddd           mm1, mm0            ; A1             a1
 | |
|     paddd           mm0, mm0            ; doubled so that the next subtraction of A1 yields A2
 | |
|     psubd           mm0, mm1            ; A2             a2
 | |
|     pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
 | |
|     paddd           mm7, mm5            ; B0             b0
 | |
|     movq            mm5, [coeffs + 72]  ; -C5    -C1     -C5     -C1
 | |
|     pmaddwd         mm5, mm3            ; -C5R7-C1R5     -C5r7-C1r5
 | |
|     paddd           mm7, mm4            ; A0+B0          a0+b0
 | |
|     paddd           mm4, mm4            ; 2A0            2a0
 | |
|     psubd           mm4, mm7            ; A0-B0          a0-b0
 | |
|     paddd           mm5, mm2            ; B1             b1
 | |
|     psrad           mm7, %7             ; scale results down by the shift argument
 | |
|     psrad           mm4, %7
 | |
|     movq            mm2, mm1            ; A1             a1
 | |
|     paddd           mm1, mm5            ; A1+B1          a1+b1
 | |
|     psubd           mm2, mm5            ; A1-B1          a1-b1
 | |
|     psrad           mm1, %7
 | |
|     psrad           mm2, %7
 | |
|     packssdw        mm7, mm1            ; A1+B1  a1+b1   A0+B0   a0+b0
 | |
|     packssdw        mm2, mm4            ; A0-B0  a0-b0   A1-B1   a1-b1
 | |
|     movq           [%5], mm7            ; store the A0+B0 / A1+B1 results
 | |
|     movq            mm1, [blockq + %3]  ; R3     R1      r3      r1
 | |
|     movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
 | |
|     movq      [24 + %5], mm2            ; store the A1-B1 / A0-B0 results
 | |
|     pmaddwd         mm4, mm1            ; -C1R3+C5R1     -C1r3+C5r1
 | |
|     movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
 | |
|     pmaddwd         mm1, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
 | |
|     pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
 | |
|     movq            mm2, mm0            ; A2             a2
 | |
|     pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
 | |
|     paddd           mm4, mm7            ; B2             b2
 | |
|     paddd           mm2, mm4            ; A2+B2          a2+b2
 | |
|     psubd           mm0, mm4            ; A2-B2          a2-b2
 | |
|     psrad           mm2, %7
 | |
|     psrad           mm0, %7
 | |
|     movq            mm4, mm6            ; A3             a3
 | |
|     paddd           mm3, mm1            ; B3             b3
 | |
|     paddd           mm6, mm3            ; A3+B3          a3+b3
 | |
|     psubd           mm4, mm3            ; A3-B3          a3-b3
 | |
|     psrad           mm6, %7
 | |
|     packssdw        mm2, mm6            ; A3+B3  a3+b3   A2+B2   a2+b2
 | |
|     movq       [8 + %5], mm2            ; store the A2+B2 / A3+B3 results
 | |
|     psrad           mm4, %7
 | |
|     packssdw        mm4, mm0            ; A2-B2  a2-b2   A3-B3   a3-b3
 | |
|     movq      [16 + %5], mm4            ; store the A3-B3 / A2-B2 results
 | |
|     jmp             %%2                 ; skip the DC-only path
 | |
| %%1: ; DC-only path: mm0 still holds the first input quadword and only r0/R0 are nonzero
 | |
|     pslld           mm0, 16             ; move r0/R0 into the high word of each dword
 | |
|     paddd           mm0, [d40000]       ; add 4 << 16 to the low dword
 | |
|     psrad           mm0, 13             ; net effect: value << 3 in each dword
 | |
|     packssdw        mm0, mm0            ; replicate the two results into both halves
 | |
|     movq           [%5], mm0            ; all four output quadwords get the same value
 | |
|     movq       [8 + %5], mm0
 | |
|     movq      [16 + %5], mm0
 | |
|     movq      [24 + %5], mm0
 | |
| %%2: ; common exit
 | |
| %endmacro
 | |
| 
 | |
| %macro Z_COND_IDCT 8 ; %1-%4: byte offsets from blockq of the four input quadwords; %5: destination address; %6: not referenced in this body; %7: right-shift count; %8: label jumped to (nothing stored) when all inputs are zero. Uses mm0-mm7 and t0d.
 | |
|     movq            mm0, [blockq + %1]  ; R4     R0      r4      r0
 | |
|     movq            mm1, [blockq + %2]  ; R6     R2      r6      r2
 | |
|     movq            mm2, [blockq + %3]  ; R3     R1      r3      r1
 | |
|     movq            mm3, [blockq + %4]  ; R7     R5      r7      r5
 | |
|     movq            mm4, mm0            ; OR all four input quadwords together
 | |
|     por             mm4, mm1
 | |
|     por             mm4, mm2
 | |
|     por             mm4, mm3
 | |
|     packssdw        mm4, mm4            ; saturating pack: low 32 bits are zero only if all 64 bits were zero
 | |
|     movd            t0d, mm4
 | |
|     or              t0d, t0d            ; ZF set when every input word is zero
 | |
|     jz               %8                 ; all-zero input: leave through the caller-supplied label
 | |
|     movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
 | |
|     pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
 | |
|     movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
 | |
|     pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
 | |
|     movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
 | |
|     pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
 | |
|     movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
 | |
|     pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
 | |
|     movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
 | |
|     pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
 | |
|     paddd           mm4, [coeffs]       ; add bias 1 << (ROW_SHIFT - 1) to each dword
 | |
|     movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
 | |
|     paddd           mm4, mm5            ; A0             a0
 | |
|     psubd           mm6, mm5            ; A3             a3
 | |
|     movq            mm5, [coeffs + 56]  ; C7     C5      C7      C5
 | |
|     pmaddwd         mm5, mm3            ; C7R7+C5R5      C7r7+C5r5
 | |
|     paddd           mm0, [coeffs]       ; same bias on the -C4R4+C4R0 term
 | |
|     paddd           mm1, mm0            ; A1             a1
 | |
|     paddd           mm0, mm0            ; doubled so that the next subtraction of A1 yields A2
 | |
|     psubd           mm0, mm1            ; A2             a2
 | |
|     pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
 | |
|     paddd           mm7, mm5            ; B0             b0
 | |
|     movq            mm5, [coeffs + 72]  ; -C5    -C1     -C5     -C1
 | |
|     pmaddwd         mm5, mm3            ; -C5R7-C1R5     -C5r7-C1r5
 | |
|     paddd           mm7, mm4            ; A0+B0          a0+b0
 | |
|     paddd           mm4, mm4            ; 2A0            2a0
 | |
|     psubd           mm4, mm7            ; A0-B0          a0-b0
 | |
|     paddd           mm5, mm2            ; B1             b1
 | |
|     psrad           mm7, %7             ; scale results down by the shift argument
 | |
|     psrad           mm4, %7
 | |
|     movq            mm2, mm1            ; A1             a1
 | |
|     paddd           mm1, mm5            ; A1+B1          a1+b1
 | |
|     psubd           mm2, mm5            ; A1-B1          a1-b1
 | |
|     psrad           mm1, %7
 | |
|     psrad           mm2, %7
 | |
|     packssdw        mm7, mm1            ; A1+B1  a1+b1   A0+B0   a0+b0
 | |
|     packssdw        mm2, mm4            ; A0-B0  a0-b0   A1-B1   a1-b1
 | |
|     movq           [%5], mm7            ; store the A0+B0 / A1+B1 results
 | |
|     movq            mm1, [blockq + %3]  ; R3     R1      r3      r1
 | |
|     movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
 | |
|     movq      [24 + %5], mm2            ; store the A1-B1 / A0-B0 results
 | |
|     pmaddwd         mm4, mm1            ; -C1R3+C5R1     -C1r3+C5r1
 | |
|     movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
 | |
|     pmaddwd         mm1, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
 | |
|     pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
 | |
|     movq            mm2, mm0            ; A2             a2
 | |
|     pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
 | |
|     paddd           mm4, mm7            ; B2             b2
 | |
|     paddd           mm2, mm4            ; A2+B2          a2+b2
 | |
|     psubd           mm0, mm4            ; A2-B2          a2-b2
 | |
|     psrad           mm2, %7
 | |
|     psrad           mm0, %7
 | |
|     movq            mm4, mm6            ; A3             a3
 | |
|     paddd           mm3, mm1            ; B3             b3
 | |
|     paddd           mm6, mm3            ; A3+B3          a3+b3
 | |
|     psubd           mm4, mm3            ; A3-B3          a3-b3
 | |
|     psrad           mm6, %7
 | |
|     packssdw        mm2, mm6            ; A3+B3  a3+b3   A2+B2   a2+b2
 | |
|     movq       [8 + %5], mm2            ; store the A2+B2 / A3+B3 results
 | |
|     psrad           mm4, %7
 | |
|     packssdw        mm4, mm0            ; A2-B2  a2-b2   A3-B3   a3-b3
 | |
|     movq      [16 + %5], mm4            ; store the A3-B3 / A2-B2 results
 | |
| %endmacro
 | |
| 
 | |
| %macro IDCT1 6 ; %1-%4: memory operands of the four input quadwords (all four are read); %5: destination base address; %6: right-shift count. Writes eight 32-bit results at %5 + 0, 16, ..., 112. Uses mm0-mm7; no rounding bias is added here.
 | |
|     movq            mm0, %1             ; R4     R0      r4      r0
 | |
|     movq            mm1, %2             ; R6     R2      r6      r2
 | |
|     movq            mm2, %3             ; R3     R1      r3      r1
 | |
|     movq            mm3, %4             ; R7     R5      r7      r5
 | |
|     movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
 | |
|     pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
 | |
|     movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
 | |
|     pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
 | |
|     movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
 | |
|     pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
 | |
|     movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
 | |
|     pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
 | |
|     movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
 | |
|     movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
 | |
|     pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
 | |
|     paddd           mm4, mm5            ; A0             a0
 | |
|     psubd           mm6, mm5            ; A3             a3
 | |
|     movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
 | |
|     paddd           mm0, mm1            ; A1             a1
 | |
|     psubd           mm5, mm1            ; A2             a2
 | |
|     movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
 | |
|     pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
 | |
|     pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
 | |
|     paddd           mm7, mm1            ; B0             b0
 | |
|     movq            mm1, [coeffs + 72]  ; -C5    -C1     -C5     -C1
 | |
|     pmaddwd         mm1, mm3            ; -C5R7-C1R5     -C5r7-C1r5
 | |
|     paddd           mm7, mm4            ; A0+B0          a0+b0
 | |
|     paddd           mm4, mm4            ; 2A0            2a0
 | |
|     psubd           mm4, mm7            ; A0-B0          a0-b0
 | |
|     paddd           mm1, mm2            ; B1             b1
 | |
|     psrad           mm7, %6             ; scale results down by the shift argument
 | |
|     psrad           mm4, %6
 | |
|     movq            mm2, mm0            ; A1             a1
 | |
|     paddd           mm0, mm1            ; A1+B1          a1+b1
 | |
|     psubd           mm2, mm1            ; A1-B1          a1-b1
 | |
|     psrad           mm0, %6
 | |
|     psrad           mm2, %6
 | |
|     packssdw        mm7, mm7            ; A0+B0  a0+b0
 | |
|     movd           [%5], mm7            ; output 0 (two packed words)
 | |
|     packssdw        mm0, mm0            ; A1+B1  a1+b1
 | |
|     movd      [16 + %5], mm0            ; output 1
 | |
|     packssdw        mm2, mm2            ; A1-B1  a1-b1
 | |
|     movd      [96 + %5], mm2            ; output 6
 | |
|     packssdw        mm4, mm4            ; A0-B0  a0-b0
 | |
|     movd     [112 + %5], mm4            ; output 7
 | |
|     movq            mm0, %3             ; R3     R1      r3      r1
 | |
|     movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
 | |
|     pmaddwd         mm4, mm0            ; -C1R3+C5R1     -C1r3+C5r1
 | |
|     movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
 | |
|     pmaddwd         mm0, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
 | |
|     pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
 | |
|     movq            mm2, mm5            ; A2             a2
 | |
|     pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
 | |
|     paddd           mm4, mm7            ; B2             b2
 | |
|     paddd           mm2, mm4            ; A2+B2          a2+b2
 | |
|     psubd           mm5, mm4            ; A2-B2          a2-b2
 | |
|     psrad           mm2, %6
 | |
|     psrad           mm5, %6
 | |
|     movq            mm4, mm6            ; A3             a3
 | |
|     paddd           mm3, mm0            ; B3             b3
 | |
|     paddd           mm6, mm3            ; A3+B3          a3+b3
 | |
|     psubd           mm4, mm3            ; A3-B3          a3-b3
 | |
|     psrad           mm6, %6
 | |
|     psrad           mm4, %6
 | |
|     packssdw        mm2, mm2            ; A2+B2  a2+b2
 | |
|     packssdw        mm6, mm6            ; A3+B3  a3+b3
 | |
|     movd      [32 + %5], mm2            ; output 2
 | |
|     packssdw        mm4, mm4            ; A3-B3  a3-b3
 | |
|     packssdw        mm5, mm5            ; A2-B2  a2-b2
 | |
|     movd      [48 + %5], mm6            ; output 3
 | |
|     movd      [64 + %5], mm4            ; output 4
 | |
|     movd      [80 + %5], mm5            ; output 5
 | |
| %endmacro
 | |
| 
 | |
| %macro IDCT2 6 ; Same output layout as IDCT1, but %3 (R3/R1) is never read, so its terms are left out of B0..B3. %1, %2, %4: input memory operands; %5: destination base address; %6: right-shift count. Uses mm0-mm7.
 | |
|     movq            mm0, %1             ; R4     R0      r4      r0
 | |
|     movq            mm1, %2             ; R6     R2      r6      r2
 | |
|     movq            mm3, %4             ; R7     R5      r7      r5
 | |
|     movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
 | |
|     pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
 | |
|     movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
 | |
|     pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
 | |
|     movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
 | |
|     pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
 | |
|     movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
 | |
|     pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
 | |
|     movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
 | |
|     paddd           mm4, mm5            ; A0             a0
 | |
|     psubd           mm6, mm5            ; A3             a3
 | |
|     movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
 | |
|     paddd           mm0, mm1            ; A1             a1
 | |
|     psubd           mm5, mm1            ; A2             a2
 | |
|     movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
 | |
|     pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
 | |
|     movq            mm7, [coeffs + 72]  ; -C5    -C1     -C5     -C1
 | |
|     pmaddwd         mm7, mm3            ; -C5R7-C1R5     -C5r7-C1r5
 | |
|     paddd           mm1, mm4            ; A0+B0          a0+b0
 | |
|     paddd           mm4, mm4            ; 2A0            2a0
 | |
|     psubd           mm4, mm1            ; A0-B0          a0-b0
 | |
|     psrad           mm1, %6             ; scale results down by the shift argument
 | |
|     psrad           mm4, %6
 | |
|     movq            mm2, mm0            ; A1             a1
 | |
|     paddd           mm0, mm7            ; A1+B1          a1+b1
 | |
|     psubd           mm2, mm7            ; A1-B1          a1-b1
 | |
|     psrad           mm0, %6
 | |
|     psrad           mm2, %6
 | |
|     packssdw        mm1, mm1            ; A0+B0  a0+b0
 | |
|     movd           [%5], mm1            ; output 0 (two packed words)
 | |
|     packssdw        mm0, mm0            ; A1+B1  a1+b1
 | |
|     movd      [16 + %5], mm0            ; output 1
 | |
|     packssdw        mm2, mm2            ; A1-B1  a1-b1
 | |
|     movd      [96 + %5], mm2            ; output 6
 | |
|     packssdw        mm4, mm4            ; A0-B0  a0-b0
 | |
|     movd     [112 + %5], mm4            ; output 7
 | |
|     movq            mm1, [coeffs + 88]  ; C3     C7      C3      C7
 | |
|     pmaddwd         mm1, mm3            ; C3R7+C7R5      C3r7+C7r5
 | |
|     movq            mm2, mm5            ; A2             a2
 | |
|     pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
 | |
|     paddd           mm2, mm1            ; A2+B2          a2+b2
 | |
|     psubd           mm5, mm1            ; A2-B2          a2-b2
 | |
|     psrad           mm2, %6
 | |
|     psrad           mm5, %6
 | |
|     movq            mm1, mm6            ; A3             a3
 | |
|     paddd           mm6, mm3            ; A3+B3          a3+b3
 | |
|     psubd           mm1, mm3            ; A3-B3          a3-b3
 | |
|     psrad           mm6, %6
 | |
|     psrad           mm1, %6
 | |
|     packssdw        mm2, mm2            ; A2+B2  a2+b2
 | |
|     packssdw        mm6, mm6            ; A3+B3  a3+b3
 | |
|     movd      [32 + %5], mm2            ; output 2
 | |
|     packssdw        mm1, mm1            ; A3-B3  a3-b3
 | |
|     packssdw        mm5, mm5            ; A2-B2  a2-b2
 | |
|     movd      [48 + %5], mm6            ; output 3
 | |
|     movd      [64 + %5], mm1            ; output 4
 | |
|     movd      [80 + %5], mm5            ; output 5
 | |
| %endmacro
 | |
| 
 | |
| %macro IDCT3 6 ; Same output layout as IDCT1, but only %1 (R4/R0) and %4 (R7/R5) are read; %2 and %3 are never referenced, so their terms are left out. %5: destination base address; %6: right-shift count. Uses mm0-mm7.
 | |
|     movq            mm0, %1             ; R4     R0      r4      r0
 | |
|     movq            mm3, %4             ; R7     R5      r7      r5
 | |
|     movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
 | |
|     pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
 | |
|     movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
 | |
|     pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
 | |
|     movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
 | |
|     movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
 | |
|     movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
 | |
|     pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
 | |
|     movq            mm7, [coeffs + 72]  ; -C5    -C1     -C5     -C1
 | |
|     pmaddwd         mm7, mm3            ; -C5R7-C1R5     -C5r7-C1r5
 | |
|     paddd           mm1, mm4            ; A0+B0          a0+b0
 | |
|     paddd           mm4, mm4            ; 2A0            2a0
 | |
|     psubd           mm4, mm1            ; A0-B0          a0-b0
 | |
|     psrad           mm1, %6             ; scale results down by the shift argument
 | |
|     psrad           mm4, %6
 | |
|     movq            mm2, mm0            ; A1             a1
 | |
|     paddd           mm0, mm7            ; A1+B1          a1+b1
 | |
|     psubd           mm2, mm7            ; A1-B1          a1-b1
 | |
|     psrad           mm0, %6
 | |
|     psrad           mm2, %6
 | |
|     packssdw        mm1, mm1            ; A0+B0  a0+b0
 | |
|     movd           [%5], mm1            ; output 0 (two packed words)
 | |
|     packssdw        mm0, mm0            ; A1+B1  a1+b1
 | |
|     movd      [16 + %5], mm0            ; output 1
 | |
|     packssdw        mm2, mm2            ; A1-B1  a1-b1
 | |
|     movd      [96 + %5], mm2            ; output 6
 | |
|     packssdw        mm4, mm4            ; A0-B0  a0-b0
 | |
|     movd     [112 + %5], mm4            ; output 7
 | |
|     movq            mm1, [coeffs + 88]  ; C3     C7      C3      C7
 | |
|     pmaddwd         mm1, mm3            ; C3R7+C7R5      C3r7+C7r5
 | |
|     movq            mm2, mm5            ; A2             a2
 | |
|     pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
 | |
|     paddd           mm2, mm1            ; A2+B2          a2+b2
 | |
|     psubd           mm5, mm1            ; A2-B2          a2-b2
 | |
|     psrad           mm2, %6
 | |
|     psrad           mm5, %6
 | |
|     movq            mm1, mm6            ; A3             a3
 | |
|     paddd           mm6, mm3            ; A3+B3          a3+b3
 | |
|     psubd           mm1, mm3            ; A3-B3          a3-b3
 | |
|     psrad           mm6, %6
 | |
|     psrad           mm1, %6
 | |
|     packssdw        mm2, mm2            ; A2+B2  a2+b2
 | |
|     packssdw        mm6, mm6            ; A3+B3  a3+b3
 | |
|     movd      [32 + %5], mm2            ; output 2
 | |
|     packssdw        mm1, mm1            ; A3-B3  a3-b3
 | |
|     packssdw        mm5, mm5            ; A2-B2  a2-b2
 | |
|     movd      [48 + %5], mm6            ; output 3
 | |
|     movd      [64 + %5], mm1            ; output 4
 | |
|     movd      [80 + %5], mm5            ; output 5
 | |
| %endmacro
 | |
| 
 | |
| %macro IDCT4 6 ; Same output layout as IDCT1, but %2 (R6/R2) is never read, so A0..A3 contain only the C4 terms. %1, %3, %4: input memory operands; %5: destination base address; %6: right-shift count. Uses mm0-mm7.
 | |
|     movq            mm0, %1             ; R4     R0      r4      r0
 | |
|     movq            mm2, %3             ; R3     R1      r3      r1
 | |
|     movq            mm3, %4             ; R7     R5      r7      r5
 | |
|     movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
 | |
|     pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
 | |
|     movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
 | |
|     pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
 | |
|     movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
 | |
|     movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
 | |
|     pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
 | |
|     movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
 | |
|     movq            mm1, [coeffs + 56]  ; C7     C5      C7      C5
 | |
|     pmaddwd         mm1, mm3            ; C7R7+C5R5      C7r7+C5r5
 | |
|     pmaddwd         mm2, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
 | |
|     paddd           mm7, mm1            ; B0             b0
 | |
|     movq            mm1, [coeffs + 72]  ; -C5    -C1     -C5     -C1
 | |
|     pmaddwd         mm1, mm3            ; -C5R7-C1R5     -C5r7-C1r5
 | |
|     paddd           mm7, mm4            ; A0+B0          a0+b0
 | |
|     paddd           mm4, mm4            ; 2A0            2a0
 | |
|     psubd           mm4, mm7            ; A0-B0          a0-b0
 | |
|     paddd           mm1, mm2            ; B1             b1
 | |
|     psrad           mm7, %6             ; scale results down by the shift argument
 | |
|     psrad           mm4, %6
 | |
|     movq            mm2, mm0            ; A1             a1
 | |
|     paddd           mm0, mm1            ; A1+B1          a1+b1
 | |
|     psubd           mm2, mm1            ; A1-B1          a1-b1
 | |
|     psrad           mm0, %6
 | |
|     psrad           mm2, %6
 | |
|     packssdw        mm7, mm7            ; A0+B0  a0+b0
 | |
|     movd           [%5], mm7            ; output 0 (two packed words)
 | |
|     packssdw        mm0, mm0            ; A1+B1  a1+b1
 | |
|     movd      [16 + %5], mm0            ; output 1
 | |
|     packssdw        mm2, mm2            ; A1-B1  a1-b1
 | |
|     movd      [96 + %5], mm2            ; output 6
 | |
|     packssdw        mm4, mm4            ; A0-B0  a0-b0
 | |
|     movd     [112 + %5], mm4            ; output 7
 | |
|     movq            mm0, %3             ; R3     R1      r3      r1
 | |
|     movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
 | |
|     pmaddwd         mm4, mm0            ; -C1R3+C5R1     -C1r3+C5r1
 | |
|     movq            mm7, [coeffs + 88]  ; C3     C7      C3      C7
 | |
|     pmaddwd         mm0, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
 | |
|     pmaddwd         mm7, mm3            ; C3R7+C7R5      C3r7+C7r5
 | |
|     movq            mm2, mm5            ; A2             a2
 | |
|     pmaddwd         mm3, [coeffs + 104] ; -C1R7+C3R5     -C1r7+C3r5
 | |
|     paddd           mm4, mm7            ; B2             b2
 | |
|     paddd           mm2, mm4            ; A2+B2          a2+b2
 | |
|     psubd           mm5, mm4            ; A2-B2          a2-b2
 | |
|     psrad           mm2, %6
 | |
|     psrad           mm5, %6
 | |
|     movq            mm4, mm6            ; A3             a3
 | |
|     paddd           mm3, mm0            ; B3             b3
 | |
|     paddd           mm6, mm3            ; A3+B3          a3+b3
 | |
|     psubd           mm4, mm3            ; A3-B3          a3-b3
 | |
|     psrad           mm6, %6
 | |
|     psrad           mm4, %6
 | |
|     packssdw        mm2, mm2            ; A2+B2  a2+b2
 | |
|     packssdw        mm6, mm6            ; A3+B3  a3+b3
 | |
|     movd      [32 + %5], mm2            ; output 2
 | |
|     packssdw        mm4, mm4            ; A3-B3  a3-b3
 | |
|     packssdw        mm5, mm5            ; A2-B2  a2-b2
 | |
|     movd      [48 + %5], mm6            ; output 3
 | |
|     movd      [64 + %5], mm4            ; output 4
 | |
|     movd      [80 + %5], mm5            ; output 5
 | |
| %endmacro
 | |
| 
 | |
| %macro IDCT5 6 ; Same output layout as IDCT1, but only %1 (R4/R0) and %3 (R3/R1) are read; %2 and %4 are never referenced, so their terms are left out. %5: destination base address; %6: right-shift count. mm0-mm7 are all written.
 | |
|     movq            mm0, %1             ; R4     R0      r4      r0
 | |
|     movq            mm2, %3             ; R3     R1      r3      r1
 | |
|     movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
 | |
|     pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
 | |
|     movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
 | |
|     pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
 | |
|     movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
 | |
|     movq            mm7, [coeffs + 48]  ; C3     C1      C3      C1
 | |
|     pmaddwd         mm7, mm2            ; C3R3+C1R1      C3r3+C1r1
 | |
|     movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
 | |
|     movq            mm3, [coeffs + 64]  ; -C7    C3      -C7     C3
 | |
|     pmaddwd         mm3, mm2            ; -C7R3+C3R1     -C7r3+C3r1
 | |
|     paddd           mm7, mm4            ; A0+B0          a0+b0
 | |
|     paddd           mm4, mm4            ; 2A0            2a0
 | |
|     psubd           mm4, mm7            ; A0-B0          a0-b0
 | |
|     psrad           mm7, %6             ; scale results down by the shift argument
 | |
|     psrad           mm4, %6
 | |
|     movq            mm1, mm0            ; A1             a1
 | |
|     paddd           mm0, mm3            ; A1+B1          a1+b1
 | |
|     psubd           mm1, mm3            ; A1-B1          a1-b1
 | |
|     psrad           mm0, %6
 | |
|     psrad           mm1, %6
 | |
|     packssdw        mm7, mm7            ; A0+B0  a0+b0
 | |
|     movd           [%5], mm7            ; output 0 (two packed words)
 | |
|     packssdw        mm0, mm0            ; A1+B1  a1+b1
 | |
|     movd      [16 + %5], mm0            ; output 1
 | |
|     packssdw        mm1, mm1            ; A1-B1  a1-b1
 | |
|     movd      [96 + %5], mm1            ; output 6
 | |
|     packssdw        mm4, mm4            ; A0-B0  a0-b0
 | |
|     movd     [112 + %5], mm4            ; output 7
 | |
|     movq            mm4, [coeffs + 80]  ; -C1    C5      -C1     C5
 | |
|     pmaddwd         mm4, mm2            ; -C1R3+C5R1     -C1r3+C5r1
 | |
|     pmaddwd         mm2, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
 | |
|     movq            mm1, mm5            ; A2             a2
 | |
|     paddd           mm1, mm4            ; A2+B2          a2+b2
 | |
|     psubd           mm5, mm4            ; A2-B2          a2-b2
 | |
|     psrad           mm1, %6
 | |
|     psrad           mm5, %6
 | |
|     movq            mm4, mm6            ; A3             a3
 | |
|     paddd           mm6, mm2            ; A3+B3          a3+b3
 | |
|     psubd           mm4, mm2            ; A3-B3          a3-b3
 | |
|     psrad           mm6, %6
 | |
|     psrad           mm4, %6
 | |
|     packssdw        mm1, mm1            ; A2+B2  a2+b2
 | |
|     packssdw        mm6, mm6            ; A3+B3  a3+b3
 | |
|     movd      [32 + %5], mm1            ; output 2
 | |
|     packssdw        mm4, mm4            ; A3-B3  a3-b3
 | |
|     packssdw        mm5, mm5            ; A2-B2  a2-b2
 | |
|     movd      [48 + %5], mm6            ; output 3
 | |
|     movd      [64 + %5], mm4            ; output 4
 | |
|     movd      [80 + %5], mm5            ; output 5
 | |
| %endmacro
 | |
| 
 | |
| %macro IDCT6 6 ; %1, %2: addresses (bracketed here, unlike IDCT1-5) of the R4/R0 and R6/R2 inputs, each read at +0 and +8; %3, %4: not referenced in this body; %5: destination base address; %6: right-shift count. Only A0..A3 are computed (no B terms), so each result is stored with movq to two mirrored offsets. Uses mm0-mm7.
 | |
|     movq            mm0, [%1]           ; R4     R0      r4      r0
 | |
|     movq            mm1, [%2]           ; R6     R2      r6      r2
 | |
|     movq            mm4, [coeffs + 16]  ; C4     C4      C4      C4
 | |
|     pmaddwd         mm4, mm0            ; C4R4+C4R0      C4r4+C4r0
 | |
|     movq            mm5, [coeffs + 24]  ; -C4    C4      -C4     C4
 | |
|     pmaddwd         mm0, mm5            ; -C4R4+C4R0     -C4r4+C4r0
 | |
|     movq            mm5, [coeffs + 32]  ; C6     C2      C6      C2
 | |
|     pmaddwd         mm5, mm1            ; C6R6+C2R2      C6r6+C2r2
 | |
|     movq            mm6, [coeffs + 40]  ; -C2    C6      -C2     C6
 | |
|     pmaddwd         mm1, mm6            ; -C2R6+C6R2     -C2r6+C6r2
 | |
|     movq            mm6, mm4            ; C4R4+C4R0      C4r4+C4r0
 | |
|     paddd           mm4, mm5            ; A0             a0
 | |
|     psubd           mm6, mm5            ; A3             a3
 | |
|     movq            mm5, mm0            ; -C4R4+C4R0     -C4r4+C4r0
 | |
|     paddd           mm0, mm1            ; A1             a1
 | |
|     psubd           mm5, mm1            ; A2             a2
 | |
|     movq            mm2, [8 + %1]       ; R4     R0      r4      r0   (second input quadword, at +8)
 | |
|     movq            mm3, [8 + %2]       ; R6     R2      r6      r2   (second input quadword, at +8)
 | |
|     movq            mm1, [coeffs + 16]  ; C4     C4      C4      C4
 | |
|     pmaddwd         mm1, mm2            ; C4R4+C4R0      C4r4+C4r0
 | |
|     movq            mm7, [coeffs + 24]  ; -C4    C4      -C4     C4
 | |
|     pmaddwd         mm2, mm7            ; -C4R4+C4R0     -C4r4+C4r0
 | |
|     movq            mm7, [coeffs + 32]  ; C6     C2      C6      C2
 | |
|     pmaddwd         mm7, mm3            ; C6R6+C2R2      C6r6+C2r2
 | |
|     pmaddwd         mm3, [coeffs + 40]  ; -C2R6+C6R2     -C2r6+C6r2
 | |
|     paddd           mm7, mm1            ; A0             a0
 | |
|     paddd           mm1, mm1            ; 2C0            2c0
 | |
|     psubd           mm1, mm7            ; A3             a3
 | |
|     paddd           mm3, mm2            ; A1             a1
 | |
|     paddd           mm2, mm2            ; 2C1            2c1
 | |
|     psubd           mm2, mm3            ; A2             a2
 | |
|     psrad           mm4, %6             ; scale results down by the shift argument
 | |
|     psrad           mm7, %6
 | |
|     psrad           mm3, %6
 | |
|     packssdw        mm4, mm7            ; A0     a0
 | |
|     movq           [%5], mm4            ; A0 result, also written to +112 below
 | |
|     psrad           mm0, %6
 | |
|     packssdw        mm0, mm3            ; A1     a1
 | |
|     movq      [16 + %5], mm0            ; A1 result, stored at +16 and +96
 | |
|     movq      [96 + %5], mm0
 | |
|     movq     [112 + %5], mm4            ; A0 result again
 | |
|     psrad           mm5, %6
 | |
|     psrad           mm6, %6
 | |
|     psrad           mm2, %6
 | |
|     packssdw        mm5, mm2            ; A2     a2
 | |
|     movq      [32 + %5], mm5            ; A2 result, also written to +80 below
 | |
|     psrad           mm1, %6
 | |
|     packssdw        mm6, mm1            ; A3     a3
 | |
|     movq      [48 + %5], mm6            ; A3 result, stored at +48 and +64
 | |
|     movq      [64 + %5], mm6
 | |
|     movq      [80 + %5], mm5            ; A2 result again
 | |
| %endmacro
 | |
| 
 | |
;-----------------------------------------------------------------------
; IDCT7 src04, src26, src13, unused, dst, shift
;
; Second-pass variant that reads three of the four inputs: %1 (annotated
; R4/R0), %2 (R6/R2) and %3 (R3/R1).  %4 is accepted but never referenced.
;
; Two columns are handled per invocation.  Each output row is stored as
; one 32-bit word (two int16) with movd.
;
;   %1-%3   input operands, passed WITH brackets (used directly by movq)
;   %5      destination address; output row k is stored at [%5 + 16*k]
;   %6      arithmetic right shift applied before packing to int16
;
; Clobbers mm0-mm7.
;-----------------------------------------------------------------------
%macro IDCT7 6
    movq            mm0, %1             ; R4     R0      r4      r0
    movq            mm1, %2             ; R6     R2      r6      r2
    movq            mm2, %3             ; R3     R1      r3      r1

    ; ---- even part: A0..A3 -------------------------------------------
    movq            mm4, mm0
    pmaddwd         mm4, [coeffs + 16]  ; C4R4+C4R0      C4r4+C4r0
    pmaddwd         mm0, [coeffs + 24]  ; -C4R4+C4R0     -C4r4+C4r0
    movq            mm5, mm1
    pmaddwd         mm5, [coeffs + 32]  ; C6R6+C2R2      C6r6+C2r2
    pmaddwd         mm1, [coeffs + 40]  ; -C2R6+C6R2     -C2r6+C6r2
    movq            mm6, mm4
    paddd           mm4, mm5            ; A0             a0
    psubd           mm6, mm5            ; A3             a3
    movq            mm5, mm0
    paddd           mm0, mm1            ; A1             a1
    psubd           mm5, mm1            ; A2             a2

    ; ---- rows 0 and 7: A0 +/- B0 -------------------------------------
    movq            mm7, mm2
    pmaddwd         mm7, [coeffs + 48]  ; C3R3+C1R1      C3r3+C1r1
    paddd           mm7, mm4            ; A0+B0          a0+b0
    paddd           mm4, mm4            ; 2A0            2a0
    psubd           mm4, mm7            ; A0-B0          a0-b0
    psrad           mm7, %6
    psrad           mm4, %6
    packssdw        mm7, mm7            ; A0+B0  a0+b0
    packssdw        mm4, mm4            ; A0-B0  a0-b0
    movd           [%5], mm7
    movd     [112 + %5], mm4

    ; ---- rows 1 and 6: A1 +/- B1 -------------------------------------
    movq            mm1, mm2
    pmaddwd         mm1, [coeffs + 64]  ; -C7R3+C3R1     -C7r3+C3r1
    movq            mm3, mm0            ; A1             a1
    paddd           mm0, mm1            ; A1+B1          a1+b1
    psubd           mm3, mm1            ; A1-B1          a1-b1
    psrad           mm0, %6
    psrad           mm3, %6
    packssdw        mm0, mm0            ; A1+B1  a1+b1
    packssdw        mm3, mm3            ; A1-B1  a1-b1
    movd      [16 + %5], mm0
    movd      [96 + %5], mm3

    ; ---- rows 2 and 5: A2 +/- B2 -------------------------------------
    movq            mm4, mm2
    pmaddwd         mm4, [coeffs + 80]  ; -C1R3+C5R1     -C1r3+C5r1
    movq            mm3, mm5            ; A2             a2
    paddd           mm3, mm4            ; A2+B2          a2+b2
    psubd           mm5, mm4            ; A2-B2          a2-b2
    psrad           mm3, %6
    psrad           mm5, %6
    packssdw        mm3, mm3            ; A2+B2  a2+b2
    packssdw        mm5, mm5            ; A2-B2  a2-b2
    movd      [32 + %5], mm3
    movd      [80 + %5], mm5

    ; ---- rows 3 and 4: A3 +/- B3 -------------------------------------
    pmaddwd         mm2, [coeffs + 96]  ; -C5R3+C7R1     -C5r3+C7r1
    movq            mm4, mm6            ; A3             a3
    paddd           mm6, mm2            ; A3+B3          a3+b3
    psubd           mm4, mm2            ; A3-B3          a3-b3
    psrad           mm6, %6
    psrad           mm4, %6
    packssdw        mm6, mm6            ; A3+B3  a3+b3
    packssdw        mm4, mm4            ; A3-B3  a3-b3
    movd      [48 + %5], mm6
    movd      [64 + %5], mm4
%endmacro
 | |
| 
 | |
;-----------------------------------------------------------------------
; IDCT8 src04, unused, unused, unused, dst, shift
;
; Cheapest second-pass variant: only %1 (annotated R4/R0) is read.
; %2-%4 are accepted so that the signature matches the other IDCTn
; macros, but are never referenced.
;
; Four columns are handled per invocation (words at [%1] and [8 + %1]).
; Only two distinct output rows exist:
;   sum  = C4*R0 + C4*R4   -> rows 0, 3, 4, 7
;   diff = C4*R0 - C4*R4   -> rows 1, 2, 5, 6
;
;   %1      input address, passed WITHOUT brackets
;   %5      destination address; output row k is stored at [%5 + 16*k]
;   %6      arithmetic right shift applied before packing to int16
;
; Clobbers mm0, mm1, mm2, mm4.
;
; The previous version also loaded [coeffs + 32] into mm7 without ever
; using it; that dead load has been removed.
;-----------------------------------------------------------------------
%macro IDCT8 6
    movq            mm0, [%1]           ; R4     R0      r4      r0
    movq            mm2, [8 + %1]       ; R4     R0      r4      r0
    movq            mm4, mm0
    movq            mm1, mm2
    pmaddwd         mm4, [coeffs + 16]  ; C4R4+C4R0      C4r4+C4r0
    pmaddwd         mm1, [coeffs + 16]  ; C4R4+C4R0      C4r4+C4r0
    pmaddwd         mm0, [coeffs + 24]  ; -C4R4+C4R0     -C4r4+C4r0
    pmaddwd         mm2, [coeffs + 24]  ; -C4R4+C4R0     -C4r4+C4r0

    psrad           mm4, %6
    psrad           mm1, %6
    packssdw        mm4, mm1            ; sum,  columns 0-3
    psrad           mm0, %6
    psrad           mm2, %6
    packssdw        mm0, mm2            ; diff, columns 0-3

    movq           [%5], mm4            ; row 0 = sum
    movq      [16 + %5], mm0            ; row 1 = diff
    movq      [32 + %5], mm0            ; row 2 = diff
    movq      [48 + %5], mm4            ; row 3 = sum
    movq      [64 + %5], mm4            ; row 4 = sum
    movq      [80 + %5], mm0            ; row 5 = diff
    movq      [96 + %5], mm0            ; row 6 = diff
    movq     [112 + %5], mm4            ; row 7 = sum
%endmacro
 | |
| 
 | |
;-----------------------------------------------------------------------
; IDCT
;
; Complete two-pass transform of the 8x8 int16 block at blockq, using
; the 128 bytes at rsp+0 .. rsp+127 as intermediate storage.
;
; Pass 1 runs DC_COND_IDCT / Z_COND_IDCT once per 32-byte group of the
; input (source offsets 0, 32, 64, 96), writing to the matching 32-byte
; slot of the stack buffer with a shift of 11.
; NOTE(review): both macros are defined outside this section.  The last
; argument of Z_COND_IDCT is a branch target - presumably taken when the
; group is entirely zero; confirm against its definition.
;
; Pass 2 runs one of IDCT1..IDCT8 with a shift of 20, reading the stack
; buffer and writing back to blockq.  The branches taken in pass 1
; decide which variant runs.  Labels are named after the variant they
; lead to:
;
;   fall through ..................... IDCT1
;   %%try_idct2 ...................... IDCT2, or on to idct3 / idct6
;   %%try_idct3 ...................... IDCT3, or on to idct8
;   %%try_idct4 ...................... IDCT4, or on to idct5
;   %%use_idct5 / 6 / 7 / 8 .......... that variant, unconditionally
;
; IDCT6 and IDCT8 handle four columns per call (two calls); the other
; variants are invoked four times with the destination stepped by 4 bytes.
;-----------------------------------------------------------------------
%macro IDCT 0
    ; ---- pass 1 ----------------------------------------------------------
    DC_COND_IDCT  0,   8,  16,  24, rsp +  0, null, 11
    Z_COND_IDCT  32,  40,  48,  56, rsp + 32, null, 11, %%try_idct2
    Z_COND_IDCT  64,  72,  80,  88, rsp + 64, null, 11, %%try_idct4
    Z_COND_IDCT  96, 104, 112, 120, rsp + 96, null, 11, %%use_idct7

    ; ---- pass 2, no branch taken ------------------------------------------
    IDCT1 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    IDCT1 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    IDCT1 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    IDCT1 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    jmp %%done

    ; ---- branch taken on the group at offset 32 ---------------------------
    ALIGN 16
%%try_idct2:
    Z_COND_IDCT 64,  72,  80,  88, rsp + 64, null, 11, %%try_idct3
    Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%use_idct6

    IDCT2 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    IDCT2 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    IDCT2 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    IDCT2 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    jmp %%done

    ; ---- branches taken on the groups at offsets 32 and 64 ----------------
    ALIGN 16
%%try_idct3:
    Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%use_idct8

    IDCT3 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    IDCT3 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    IDCT3 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    IDCT3 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    jmp %%done

    ; ---- branch taken on the group at offset 64 only ----------------------
    ALIGN 16
%%try_idct4:
    Z_COND_IDCT 96, 104, 112, 120, rsp + 96, null, 11, %%use_idct5

    IDCT4 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    IDCT4 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    IDCT4 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    IDCT4 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    jmp %%done

    ; ---- branches taken on the groups at offsets 64 and 96 ----------------
    ALIGN 16
%%use_idct5:
    IDCT5 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    IDCT5 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    IDCT5 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    IDCT5 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    jmp %%done

    ; ---- branches taken on the groups at offsets 32 and 96 ----------------
    ; IDCT6 takes bare addresses and covers four columns per call.
    ALIGN 16
%%use_idct6:
    IDCT6 rsp +  0, rsp + 64, rsp + 32, rsp +  96, blockq +  0, 20
    IDCT6 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq +  8, 20
    jmp %%done

    ; ---- branch taken on the group at offset 96 only ----------------------
    ALIGN 16
%%use_idct7:
    IDCT7 [rsp +  0], [rsp + 64], [rsp + 32], [rsp +  96], blockq +  0, 20
    IDCT7 [rsp +  8], [rsp + 72], [rsp + 40], [rsp + 104], blockq +  4, 20
    IDCT7 [rsp + 16], [rsp + 80], [rsp + 48], [rsp + 112], blockq +  8, 20
    IDCT7 [rsp + 24], [rsp + 88], [rsp + 56], [rsp + 120], blockq + 12, 20
    jmp %%done

    ; ---- branches taken on all of the groups at offsets 32, 64 and 96 -----
    ; IDCT8 takes bare addresses and covers four columns per call.
    ALIGN 16
%%use_idct8:
    IDCT8 rsp +  0, rsp + 64, rsp + 32, rsp +  96, blockq +  0, 20
    IDCT8 rsp + 16, rsp + 80, rsp + 48, rsp + 112, blockq +  8, 20

%%done:
%endmacro
 | |
| 
 | |
;-----------------------------------------------------------------------
; PUT_PIXELS_CLAMPED_HALF block_offset
;
; Converts four rows (64 bytes of int16 starting at blockq + %1) to
; unsigned 8-bit with saturation and stores them as four 8-pixel rows
; at pixelsq, pixelsq + lsizeq, pixelsq + 2*lsizeq and
; pixelsq + lsize3q.
;
;   %1        byte offset into the block (the callers pass 0 and 64)
;   blockq    source coefficients; accessed with mova / packuswb memory
;             operands, so it must meet their alignment requirement
;   lsize3q   must already hold 3 * lsizeq
;
; Clobbers m0-m3 with 8-byte registers, m0-m1 with 16-byte registers.
;-----------------------------------------------------------------------
%macro PUT_PIXELS_CLAMPED_HALF 1
%if mmsize == 8
    ; 8-byte registers: a row is two registers wide, one output row each
    mova        m0, [blockq + %1 + mmsize*0]
    mova        m1, [blockq + %1 + mmsize*2]
    mova        m2, [blockq + %1 + mmsize*4]
    mova        m3, [blockq + %1 + mmsize*6]
    packuswb    m0, [blockq + %1 + mmsize*1]
    packuswb    m1, [blockq + %1 + mmsize*3]
    packuswb    m2, [blockq + %1 + mmsize*5]
    packuswb    m3, [blockq + %1 + mmsize*7]
    movq        [pixelsq], m0
    movq        [lsizeq + pixelsq], m1
    movq        [2*lsizeq + pixelsq], m2
    movq        [lsize3q + pixelsq], m3
%else
    ; 16-byte registers: each pack yields two output rows (low / high half)
    mova        m0, [blockq + %1 + mmsize*0]
    mova        m1, [blockq + %1 + mmsize*2]
    packuswb    m0, [blockq + %1 + mmsize*1]
    packuswb    m1, [blockq + %1 + mmsize*3]
    movq        [pixelsq], m0
    movhps      [lsizeq + pixelsq], m0
    movq        [2*lsizeq + pixelsq], m1
    movhps      [lsize3q + pixelsq], m1
%endif
%endmacro
 | |
| 
 | |
;-----------------------------------------------------------------------
; ADD_PIXELS_CLAMPED block_offset
;
; Adds two rows of int16 residual (32 bytes at blockq + %1) to the two
; 8-pixel rows at pixelsq and pixelsq + lsizeq, saturating the result
; to unsigned 8-bit, and writes the rows back in place.
;
;   %1   byte offset into the block (the callers pass 0, 32, 64, 96)
;   m4   MUST be zero on entry; it is the high-byte source when the
;        pixels are widened from 8 to 16 bits
;
; Both pixel rows are loaded before either one is stored.
; Clobbers m0, m1 (plus m5, m6 with 8-byte registers).
;-----------------------------------------------------------------------
%macro ADD_PIXELS_CLAMPED 1
%if mmsize == 8
    ; 8-byte registers: each pixel row widens into a low and a high half
    movq        m0, [pixelsq]
    movq        m5, [pixelsq+lsizeq]
    mova        m1, m0
    mova        m6, m5
    punpcklbw   m0, m4                  ; row 0, pixels 0-3 as words
    punpckhbw   m1, m4                  ; row 0, pixels 4-7 as words
    punpcklbw   m5, m4                  ; row 1, pixels 0-3 as words
    punpckhbw   m6, m4                  ; row 1, pixels 4-7 as words
    paddsw      m0, [blockq+mmsize*0+%1]
    paddsw      m1, [blockq+mmsize*1+%1]
    paddsw      m5, [blockq+mmsize*2+%1]
    paddsw      m6, [blockq+mmsize*3+%1]
    packuswb    m0, m1
    packuswb    m5, m6
    movq        [pixelsq], m0
    movq        [pixelsq+lsizeq], m5
%else
    ; 16-byte registers: one register holds a whole widened row
    movq        m0, [pixelsq]
    movq        m1, [pixelsq+lsizeq]
    punpcklbw   m0, m4                  ; row 0 as eight words
    punpcklbw   m1, m4                  ; row 1 as eight words
    paddsw      m0, [blockq+mmsize*0+%1]
    paddsw      m1, [blockq+mmsize*1+%1]
    packuswb    m0, m1                  ; low half = row 0, high half = row 1
    movq        [pixelsq], m0
    movhps      [pixelsq+lsizeq], m0
%endif
%endmacro
 | |
| 
 | |
INIT_MMX mmx

;-----------------------------------------------------------------------
; simple_idct(block)
;
; In-place inverse DCT of the 8x8 int16 block at blockq.
; cglobal declaration: 1 argument, 2 general-purpose registers
; (block, t0), 8 vector registers, 128 bytes of stack.  The IDCT macro
; uses rsp+0 .. rsp+127 as its intermediate buffer.
;-----------------------------------------------------------------------
cglobal simple_idct, 1, 2, 8, 128, block, t0
    IDCT
    RET
 | |
| 
 | |
;-----------------------------------------------------------------------
; simple_idct_put(pixels, lsize, block)
;
; Inverse DCT of the block at blockq, then stores the result as 8 rows
; of 8 unsigned 8-bit pixels (saturated) at pixelsq with a row stride of
; lsizeq bytes.
; cglobal declaration: 3 arguments, 5 general-purpose registers
; (pixels, lsize, block, lsize3, t0), 8 vector registers, 128 bytes of
; stack used by IDCT.
;-----------------------------------------------------------------------
cglobal simple_idct_put, 3, 5, 8, 128, pixels, lsize, block, lsize3, t0
    IDCT

    lea     lsize3q, [lsizeq*3]             ; stride * 3, offset of the 4th row
    PUT_PIXELS_CLAMPED_HALF 0               ; rows 0-3 (block bytes 0..63)
    lea     pixelsq, [pixelsq+lsizeq*4]     ; advance four rows
    PUT_PIXELS_CLAMPED_HALF 64              ; rows 4-7 (block bytes 64..127)
    RET
 | |
| 
 | |
;-----------------------------------------------------------------------
; simple_idct_add(pixels, lsize, block)
;
; Inverse DCT of the block at blockq, then adds the result to the 8x8
; unsigned 8-bit pixels at pixelsq (row stride lsizeq bytes) with
; saturation, two rows per ADD_PIXELS_CLAMPED invocation.
; cglobal declaration: 3 arguments, 4 general-purpose registers
; (pixels, lsize, block, t0), 8 vector registers, 128 bytes of stack
; used by IDCT.
;-----------------------------------------------------------------------
cglobal simple_idct_add, 3, 4, 8, 128, pixels, lsize, block, t0
    IDCT

    pxor    m4, m4                          ; zero register required by ADD_PIXELS_CLAMPED
    ADD_PIXELS_CLAMPED 0                    ; rows 0-1
    lea     pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 32                   ; rows 2-3
    lea     pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 64                   ; rows 4-5
    lea     pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 96                   ; rows 6-7
    RET
 | |
| 
 | |
INIT_XMM sse2

;-----------------------------------------------------------------------
; simple_idct_put(pixels, lsize, block) - 16-byte register build
;
; Same contract as the 8-byte register build: inverse DCT of the block
; at blockq, then 8 rows of 8 saturated unsigned 8-bit pixels are stored
; at pixelsq with a row stride of lsizeq bytes.  Here mmsize is 16, so
; PUT_PIXELS_CLAMPED_HALF takes its two-rows-per-register path.
; cglobal declaration: 3 arguments, 5 general-purpose registers
; (pixels, lsize, block, lsize3, t0), 8 vector registers, 128 bytes of
; stack used by IDCT.
;-----------------------------------------------------------------------
cglobal simple_idct_put, 3, 5, 8, 128, pixels, lsize, block, lsize3, t0
    IDCT

    lea     lsize3q, [lsizeq*3]             ; stride * 3, offset of the 4th row
    PUT_PIXELS_CLAMPED_HALF 0               ; rows 0-3 (block bytes 0..63)
    lea     pixelsq, [pixelsq+lsizeq*4]     ; advance four rows
    PUT_PIXELS_CLAMPED_HALF 64              ; rows 4-7 (block bytes 64..127)
    RET
 | |
| 
 | |
;-----------------------------------------------------------------------
; simple_idct_add(pixels, lsize, block)
;
; Inverse DCT of the block at blockq, then adds the result to the 8x8
; unsigned 8-bit pixels at pixelsq (row stride lsizeq bytes) with
; saturation, two rows per ADD_PIXELS_CLAMPED invocation.
; cglobal declaration: 3 arguments, 4 general-purpose registers
; (pixels, lsize, block, t0), 8 vector registers, 128 bytes of stack
; used by IDCT.
;-----------------------------------------------------------------------
cglobal simple_idct_add, 3, 4, 8, 128, pixels, lsize, block, t0
    IDCT

    pxor    m4, m4                          ; zero register required by ADD_PIXELS_CLAMPED
    ADD_PIXELS_CLAMPED 0                    ; rows 0-1
    lea     pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 32                   ; rows 2-3
    lea     pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 64                   ; rows 4-5
    lea     pixelsq, [pixelsq+lsizeq*2]
    ADD_PIXELS_CLAMPED 96                   ; rows 6-7
    RET
 | 
