Mirror of https://github.com/nyanmisaka/ffmpeg-rockchip.git · synced 2025-10-31 12:36:41 +08:00

Commit bbe95f7353
From x86inc:

> On AMD cpus <=K10, an ordinary ret is slow if it immediately follows either
> a branch or a branch target. So switch to a 2-byte form of ret in that case.
> We can automatically detect "follows a branch", but not a branch target.
> (SSSE3 is a sufficient condition to know that your cpu doesn't have this problem.)

x86inc can automatically determine whether to use REP_RET rather than RET in most of these cases, so the impact is minimal. Additionally, a few REP_RETs were used unnecessarily, despite the return being nowhere near a branch.

The only CPUs affected were AMD K10s, made between 2007 and 2011, i.e. 16 and 12 years ago, respectively. In the future, everyone involved with x86inc should consider dropping REP_RETs altogether.
731 lines · 20 KiB · NASM
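As a rough sketch of the pattern the commit message above describes (illustrative only; the label and operands are hypothetical and not taken from x86inc or from this file), the 2-byte return looks like this in NASM syntax:

    ; hypothetical SIMD loop epilogue
    .next:
        add   lenq, mmsize
        jl    .next
        rep ret              ; REP_RET: 2-byte return, avoids the K10 stall
                             ; when a ret is itself a branch target

With K10 no longer a concern, such epilogues can simply use x86inc's RET macro, which is what every loop in the file below now ends with.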
		
	
	
	
	
	
;******************************************************************************
;* Copyright (c) 2012 Michael Niedermayer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32
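; Conversion scale factors: flt2pm31 = 2^-31 (int32 -> float), flt2p31 = 2^31
; (float -> int32), flt2p15 = 2^15 (float -> int16).  word_unpack_shuf is a
; pshufb mask that separates even and odd 16-bit words (used by the ssse3
; 2-channel int16 unpack).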
flt2pm31: times 8 dd 4.6566129e-10
flt2p31 : times 8 dd 2147483648.0
flt2p15 : times 8 dd 32768.0

word_unpack_shuf : db  0, 1, 4, 5, 8, 9,12,13, 2, 3, 6, 7,10,11,14,15

SECTION .text

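; PACK_2CH: interleave two planar source channels into one interleaved
; destination, converting the sample format on the fly via the %6 (convert)
; and %7 (init) macro arguments.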
;to, from, a/u, log2_outsize, log2_insize, convert macro, init macro
%macro PACK_2CH 5-7
cglobal pack_2ch_%2_to_%1_%3, 3, 4, 6, dst, src, len, src2
    mov src2q   , [srcq+gprsize]
    mov srcq    , [srcq]
    mov dstq    , [dstq]
%ifidn %3, a
    test dstq, mmsize-1
        jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
    test srcq, mmsize-1
        jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
    test src2q, mmsize-1
        jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
%else
pack_2ch_%2_to_%1_u_int %+ SUFFIX:
%endif
    lea     srcq , [srcq  + (1<<%5)*lenq]
    lea     src2q, [src2q + (1<<%5)*lenq]
    lea     dstq , [dstq  + (2<<%4)*lenq]
    neg     lenq
    %7 m0,m1,m2,m3,m4,m5
.next:
%if %4 >= %5
    mov%3     m0, [         srcq +(1<<%5)*lenq]
    mova      m1, m0
    mov%3     m2, [         src2q+(1<<%5)*lenq]
%if %5 == 1
    punpcklwd m0, m2
    punpckhwd m1, m2
%else
    punpckldq m0, m2
    punpckhdq m1, m2
%endif
    %6 m0,m1,m2,m3,m4,m5
%else
    mov%3     m0, [         srcq +(1<<%5)*lenq]
    mov%3     m1, [mmsize + srcq +(1<<%5)*lenq]
    mov%3     m2, [         src2q+(1<<%5)*lenq]
    mov%3     m3, [mmsize + src2q+(1<<%5)*lenq]
    %6 m0,m1,m2,m3,m4,m5
    mova      m2, m0
    punpcklwd m0, m1
    punpckhwd m2, m1
    SWAP 1,2
%endif
    mov%3 [           dstq+(2<<%4)*lenq], m0
    mov%3 [  mmsize + dstq+(2<<%4)*lenq], m1
%if %4 > %5
    mov%3 [2*mmsize + dstq+(2<<%4)*lenq], m2
    mov%3 [3*mmsize + dstq+(2<<%4)*lenq], m3
    add lenq, 4*mmsize/(2<<%4)
%else
    add lenq, 2*mmsize/(2<<%4)
%endif
        jl .next
    RET
%endmacro

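; UNPACK_2CH: split a 2-channel interleaved source into two planar
; destinations, converting the sample format via the %6/%7 macro arguments.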
%macro UNPACK_2CH 5-7
cglobal unpack_2ch_%2_to_%1_%3, 3, 4, 7, dst, src, len, dst2
    mov dst2q   , [dstq+gprsize]
    mov srcq    , [srcq]
    mov dstq    , [dstq]
%ifidn %3, a
    test dstq, mmsize-1
        jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX
    test srcq, mmsize-1
        jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX
    test dst2q, mmsize-1
        jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX
%else
unpack_2ch_%2_to_%1_u_int %+ SUFFIX:
%endif
    lea     srcq , [srcq  + (2<<%5)*lenq]
    lea     dstq , [dstq  + (1<<%4)*lenq]
    lea     dst2q, [dst2q + (1<<%4)*lenq]
    neg     lenq
    %7 m0,m1,m2,m3,m4,m5
    mova      m6, [word_unpack_shuf]
.next:
    mov%3     m0, [           srcq +(2<<%5)*lenq]
    mov%3     m2, [  mmsize + srcq +(2<<%5)*lenq]
%if %5 == 1
%ifidn SUFFIX, _ssse3
    pshufb    m0, m6
    mova      m1, m0
    pshufb    m2, m6
    punpcklqdq m0,m2
    punpckhqdq m1,m2
%else
    mova      m1, m0
    punpcklwd m0,m2
    punpckhwd m1,m2

    mova      m2, m0
    punpcklwd m0,m1
    punpckhwd m2,m1

    mova      m1, m0
    punpcklwd m0,m2
    punpckhwd m1,m2
%endif
%else
    mova      m1, m0
    shufps    m0, m2, 10001000b
    shufps    m1, m2, 11011101b
%endif
%if %4 < %5
    mov%3     m2, [2*mmsize + srcq +(2<<%5)*lenq]
    mova      m3, m2
    mov%3     m4, [3*mmsize + srcq +(2<<%5)*lenq]
    shufps    m2, m4, 10001000b
    shufps    m3, m4, 11011101b
    SWAP 1,2
%endif
    %6 m0,m1,m2,m3,m4,m5
    mov%3 [           dstq+(1<<%4)*lenq], m0
%if %4 > %5
    mov%3 [          dst2q+(1<<%4)*lenq], m2
    mov%3 [ mmsize +  dstq+(1<<%4)*lenq], m1
    mov%3 [ mmsize + dst2q+(1<<%4)*lenq], m3
    add lenq, 2*mmsize/(1<<%4)
%else
    mov%3 [          dst2q+(1<<%4)*lenq], m1
    add lenq, mmsize/(1<<%4)
%endif
        jl .next
    RET
%endmacro

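; CONV: plain sample-format conversion of one packed buffer (%2 -> %1).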
%macro CONV 5-7
cglobal %2_to_%1_%3, 3, 3, 6, dst, src, len
    mov srcq    , [srcq]
    mov dstq    , [dstq]
%ifidn %3, a
    test dstq, mmsize-1
        jne %2_to_%1_u_int %+ SUFFIX
    test srcq, mmsize-1
        jne %2_to_%1_u_int %+ SUFFIX
%else
%2_to_%1_u_int %+ SUFFIX:
%endif
    lea     srcq , [srcq  + (1<<%5)*lenq]
    lea     dstq , [dstq  + (1<<%4)*lenq]
    neg     lenq
    %7 m0,m1,m2,m3,m4,m5
.next:
    mov%3     m0, [           srcq +(1<<%5)*lenq]
    mov%3     m1, [  mmsize + srcq +(1<<%5)*lenq]
%if %4 < %5
    mov%3     m2, [2*mmsize + srcq +(1<<%5)*lenq]
    mov%3     m3, [3*mmsize + srcq +(1<<%5)*lenq]
%endif
    %6 m0,m1,m2,m3,m4,m5
    mov%3 [           dstq+(1<<%4)*lenq], m0
    mov%3 [  mmsize + dstq+(1<<%4)*lenq], m1
%if %4 > %5
    mov%3 [2*mmsize + dstq+(1<<%4)*lenq], m2
    mov%3 [3*mmsize + dstq+(1<<%4)*lenq], m3
    add lenq, 4*mmsize/(1<<%4)
%else
    add lenq, 2*mmsize/(1<<%4)
%endif
        jl .next
%if mmsize == 8
    emms
    RET
%else
    RET
%endif
%endmacro

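; PACK_6CH: interleave six planar source channels into one 6-channel
; interleaved destination, with optional sample-format conversion (%7/%8).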
%macro PACK_6CH 8
cglobal pack_6ch_%2_to_%1_%3, 2, 8, %6, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
    mov     lend, r2d
%else
    %define lend dword r2m
%endif
    mov    src1q, [srcq+1*gprsize]
    mov    src2q, [srcq+2*gprsize]
    mov    src3q, [srcq+3*gprsize]
    mov    src4q, [srcq+4*gprsize]
    mov    src5q, [srcq+5*gprsize]
    mov     srcq, [srcq]
    mov     dstq, [dstq]
%ifidn %3, a
    test dstq, mmsize-1
        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test srcq, mmsize-1
        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test src1q, mmsize-1
        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test src2q, mmsize-1
        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test src3q, mmsize-1
        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test src4q, mmsize-1
        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test src5q, mmsize-1
        jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
%else
pack_6ch_%2_to_%1_u_int %+ SUFFIX:
%endif
    sub    src1q, srcq
    sub    src2q, srcq
    sub    src3q, srcq
    sub    src4q, srcq
    sub    src5q, srcq
    %8 x,x,x,x,m7,x
.loop:
    mov%3     m0, [srcq      ]
    mov%3     m1, [srcq+src1q]
    mov%3     m2, [srcq+src2q]
    mov%3     m3, [srcq+src3q]
    mov%3     m4, [srcq+src4q]
    mov%3     m5, [srcq+src5q]
%if cpuflag(sse)
    SBUTTERFLYPS 0, 1, 6
    SBUTTERFLYPS 2, 3, 6
    SBUTTERFLYPS 4, 5, 6

%if cpuflag(avx)
    blendps   m6, m4, m0, 1100b
%else
    movaps    m6, m4
    shufps    m4, m0, q3210
    SWAP 4,6
%endif
    movlhps   m0, m2
    movhlps   m4, m2
%if cpuflag(avx)
    blendps   m2, m5, m1, 1100b
%else
    movaps    m2, m5
    shufps    m5, m1, q3210
    SWAP 2,5
%endif
    movlhps   m1, m3
    movhlps   m5, m3

    %7 m0,m6,x,x,m7,m3
    %7 m4,m1,x,x,m7,m3
    %7 m2,m5,x,x,m7,m3

    mov %+ %3 %+ ps [dstq   ], m0
    mov %+ %3 %+ ps [dstq+16], m6
    mov %+ %3 %+ ps [dstq+32], m4
    mov %+ %3 %+ ps [dstq+48], m1
    mov %+ %3 %+ ps [dstq+64], m2
    mov %+ %3 %+ ps [dstq+80], m5
%else ; mmx
    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6

    movq   [dstq   ], m0
    movq   [dstq+ 8], m2
    movq   [dstq+16], m4
    movq   [dstq+24], m1
    movq   [dstq+32], m3
    movq   [dstq+40], m5
%endif
    add      srcq, mmsize
    add      dstq, mmsize*6
    sub      lend, mmsize/4
    jg .loop
%if mmsize == 8
    emms
    RET
%else
    RET
%endif
%endmacro

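; UNPACK_6CH: split one 6-channel interleaved source into six planar
; destinations, with optional sample-format conversion (%7/%8).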
%macro UNPACK_6CH 8
cglobal unpack_6ch_%2_to_%1_%3, 2, 8, %6, dst, src, dst1, dst2, dst3, dst4, dst5, len
%if ARCH_X86_64
    mov     lend, r2d
%else
    %define lend dword r2m
%endif
    mov    dst1q, [dstq+1*gprsize]
    mov    dst2q, [dstq+2*gprsize]
    mov    dst3q, [dstq+3*gprsize]
    mov    dst4q, [dstq+4*gprsize]
    mov    dst5q, [dstq+5*gprsize]
    mov     dstq, [dstq]
    mov     srcq, [srcq]
%ifidn %3, a
    test dstq, mmsize-1
        jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
    test srcq, mmsize-1
        jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
    test dst1q, mmsize-1
        jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
    test dst2q, mmsize-1
        jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
    test dst3q, mmsize-1
        jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
    test dst4q, mmsize-1
        jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
    test dst5q, mmsize-1
        jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
%else
unpack_6ch_%2_to_%1_u_int %+ SUFFIX:
%endif
    sub    dst1q, dstq
    sub    dst2q, dstq
    sub    dst3q, dstq
    sub    dst4q, dstq
    sub    dst5q, dstq
    %8 x,x,x,x,m7,x
.loop:
    mov%3     m0, [srcq   ]
    mov%3     m1, [srcq+16]
    mov%3     m2, [srcq+32]
    mov%3     m3, [srcq+48]
    mov%3     m4, [srcq+64]
    mov%3     m5, [srcq+80]

    SBUTTERFLYPS 0, 3, 6
    SBUTTERFLYPS 1, 4, 6
    SBUTTERFLYPS 2, 5, 6
    SBUTTERFLYPS 0, 4, 6
    SBUTTERFLYPS 3, 2, 6
    SBUTTERFLYPS 1, 5, 6
    SWAP 1, 4
    SWAP 2, 3

    %7 m0,m1,x,x,m7,m6
    %7 m2,m3,x,x,m7,m6
    %7 m4,m5,x,x,m7,m6

    mov %+ %3 %+ ps [dstq      ], m0
    mov %+ %3 %+ ps [dstq+dst1q], m1
    mov %+ %3 %+ ps [dstq+dst2q], m2
    mov %+ %3 %+ ps [dstq+dst3q], m3
    mov %+ %3 %+ ps [dstq+dst4q], m4
    mov %+ %3 %+ ps [dstq+dst5q], m5

    add      srcq, mmsize*6
    add      dstq, mmsize
    sub      lend, mmsize/4
    jg .loop
    RET
%endmacro

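; PACK_8CH_GPRS: number of general-purpose registers available to the
; 8-channel pack (x86-32 has too few, so some source offsets are kept on the
; stack).  PACK_8CH interleaves eight planar source channels into one
; 8-channel interleaved destination, with optional conversion (%7/%8).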
%define PACK_8CH_GPRS (10 * ARCH_X86_64) + ((6 + HAVE_ALIGNED_STACK) * ARCH_X86_32)

%macro PACK_8CH 8
cglobal pack_8ch_%2_to_%1_%3, 2, PACK_8CH_GPRS, %6, ARCH_X86_32*48, dst, src, len, src1, src2, src3, src4, src5, src6, src7
    mov     dstq, [dstq]
%if ARCH_X86_32
    DEFINE_ARGS dst, src, src2, src3, src4, src5, src6
    %define lend dword r2m
    %define src1q r0q
    %define src1m dword [rsp+32]
%if HAVE_ALIGNED_STACK == 0
    DEFINE_ARGS dst, src, src2, src3, src5, src6
    %define src4q r0q
    %define src4m dword [rsp+36]
%endif
    %define src7q r0q
    %define src7m dword [rsp+40]
    mov     dstm, dstq
%endif
    mov    src7q, [srcq+7*gprsize]
    mov    src6q, [srcq+6*gprsize]
%if ARCH_X86_32
    mov    src7m, src7q
%endif
    mov    src5q, [srcq+5*gprsize]
    mov    src4q, [srcq+4*gprsize]
    mov    src3q, [srcq+3*gprsize]
%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
    mov    src4m, src4q
%endif
    mov    src2q, [srcq+2*gprsize]
    mov    src1q, [srcq+1*gprsize]
    mov     srcq, [srcq]
%ifidn %3, a
%if ARCH_X86_32
    test dstmp, mmsize-1
%else
    test dstq, mmsize-1
%endif
        jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test srcq, mmsize-1
        jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test src1q, mmsize-1
        jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test src2q, mmsize-1
        jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test src3q, mmsize-1
        jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
    test src4m, mmsize-1
%else
    test src4q, mmsize-1
%endif
        jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test src5q, mmsize-1
        jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test src6q, mmsize-1
        jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
%if ARCH_X86_32
    test src7m, mmsize-1
%else
    test src7q, mmsize-1
%endif
        jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
%else
pack_8ch_%2_to_%1_u_int %+ SUFFIX:
%endif
    sub    src1q, srcq
    sub    src2q, srcq
    sub    src3q, srcq
%if ARCH_X86_64 || HAVE_ALIGNED_STACK
    sub    src4q, srcq
%else
    sub    src4m, srcq
%endif
    sub    src5q, srcq
    sub    src6q, srcq
%if ARCH_X86_64
    sub    src7q, srcq
%else
    mov src1m, src1q
    sub src7m, srcq
%endif

%if ARCH_X86_64
    %8 x,x,x,x,m9,x
%elifidn %1, int32
    %define m9 [flt2p31]
%else
    %define m9 [flt2pm31]
%endif

.loop:
    mov%3     m0, [srcq      ]
    mov%3     m1, [srcq+src1q]
    mov%3     m2, [srcq+src2q]
%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
    mov    src4q, src4m
%endif
    mov%3     m3, [srcq+src3q]
    mov%3     m4, [srcq+src4q]
    mov%3     m5, [srcq+src5q]
%if ARCH_X86_32
    mov    src7q, src7m
%endif
    mov%3     m6, [srcq+src6q]
    mov%3     m7, [srcq+src7q]

%if ARCH_X86_64
    TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8

    %7 m0,m1,x,x,m9,m8
    %7 m2,m3,x,x,m9,m8
    %7 m4,m5,x,x,m9,m8
    %7 m6,m7,x,x,m9,m8

    mov%3 [dstq], m0
%else
    mov     dstq, dstm

    TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, [rsp], [rsp+16], 1

    %7 m0,m1,x,x,m9,m2
    mova     m2, [rsp]
    mov%3   [dstq], m0
    %7 m2,m3,x,x,m9,m0
    %7 m4,m5,x,x,m9,m0
    %7 m6,m7,x,x,m9,m0

%endif

    mov%3 [dstq+16],  m1
    mov%3 [dstq+32],  m2
    mov%3 [dstq+48],  m3
    mov%3 [dstq+64],  m4
    mov%3 [dstq+80],  m5
    mov%3 [dstq+96],  m6
    mov%3 [dstq+112], m7

    add      srcq, mmsize
    add      dstq, mmsize*8
%if ARCH_X86_32
    mov      dstm, dstq
    mov      src1q, src1m
%endif
    sub      lend, mmsize/4
    jg .loop
    RET
%endmacro

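; Per-register conversion kernels plugged into the macros above: the *_INIT
; variants load the required scale constant, the *_N variants convert the
; vector registers in place.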
%macro INT16_TO_INT32_N 6
    pxor      m2, m2
    pxor      m3, m3
    punpcklwd m2, m1
    punpckhwd m3, m1
    SWAP 4,0
    pxor      m0, m0
    pxor      m1, m1
    punpcklwd m0, m4
    punpckhwd m1, m4
%endmacro

%macro INT32_TO_INT16_N 6
    psrad     m0, 16
    psrad     m1, 16
    psrad     m2, 16
    psrad     m3, 16
    packssdw  m0, m1
    packssdw  m2, m3
    SWAP 1,2
%endmacro

%macro INT32_TO_FLOAT_INIT 6
    mova      %5, [flt2pm31]
%endmacro
%macro INT32_TO_FLOAT_N 6
    cvtdq2ps  %1, %1
    cvtdq2ps  %2, %2
    mulps %1, %1, %5
    mulps %2, %2, %5
%endmacro

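; FLOAT_TO_INT32_N scales by 2^31 and converts; cvtps2dq saturates lanes that
; are >= 2^31 to 0x80000000, and the cmpps (not-less-than) / paddd pair then
; turns exactly those lanes into 0x7fffffff, i.e. clamps them to INT32_MAX.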
%macro FLOAT_TO_INT32_INIT 6
    mova      %5, [flt2p31]
%endmacro
%macro FLOAT_TO_INT32_N 6
    mulps %1, %5
    mulps %2, %5
    cvtps2dq  %6, %1
    cmpps %1, %1, %5, 5
    paddd %1, %6
    cvtps2dq  %6, %2
    cmpps %2, %2, %5, 5
    paddd %2, %6
%endmacro

%macro INT16_TO_FLOAT_INIT 6
    mova      m5, [flt2pm31]
%endmacro
%macro INT16_TO_FLOAT_N 6
    INT16_TO_INT32_N %1,%2,%3,%4,%5,%6
    cvtdq2ps  m0, m0
    cvtdq2ps  m1, m1
    cvtdq2ps  m2, m2
    cvtdq2ps  m3, m3
    mulps m0, m0, m5
    mulps m1, m1, m5
    mulps m2, m2, m5
    mulps m3, m3, m5
%endmacro

%macro FLOAT_TO_INT16_INIT 6
    mova      m5, [flt2p15]
%endmacro
%macro FLOAT_TO_INT16_N 6
    mulps m0, m5
    mulps m1, m5
    mulps m2, m5
    mulps m3, m5
    cvtps2dq  m0, m0
    cvtps2dq  m1, m1
    packssdw  m0, m1
    cvtps2dq  m1, m2
    cvtps2dq  m3, m3
    packssdw  m1, m3
%endmacro

%macro NOP_N 0-6
%endmacro

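; Instantiate the exported conversion functions for each instruction set.
; NOP_N is passed where no per-element conversion (or no init) is needed;
; e.g. the first PACK_2CH invocation below generates the SSE2 int16->int16
; 2-channel pack.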
INIT_XMM sse
PACK_6CH float, float, u, 2, 2, 7, NOP_N, NOP_N
PACK_6CH float, float, a, 2, 2, 7, NOP_N, NOP_N

UNPACK_6CH float, float, u, 2, 2, 7, NOP_N, NOP_N
UNPACK_6CH float, float, a, 2, 2, 7, NOP_N, NOP_N

INIT_XMM sse2
CONV int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
CONV int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
CONV int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
CONV int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

PACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
PACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
PACK_2CH int32, int32, u, 2, 2, NOP_N, NOP_N
PACK_2CH int32, int32, a, 2, 2, NOP_N, NOP_N
PACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
PACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
PACK_2CH int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
PACK_2CH int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
UNPACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
UNPACK_2CH int32, int32, u, 2, 2, NOP_N, NOP_N
UNPACK_2CH int32, int32, a, 2, 2, NOP_N, NOP_N
UNPACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
UNPACK_2CH int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

CONV float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
CONV float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
CONV int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
CONV int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
CONV float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
CONV float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
CONV int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
CONV int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT

PACK_2CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_2CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_2CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_2CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
PACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
PACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
PACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT

UNPACK_2CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_2CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_2CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_2CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
UNPACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
UNPACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT

PACK_6CH float, int32, u, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH float, int32, a, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH int32, float, u, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_6CH int32, float, a, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

UNPACK_6CH float, int32, u, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_6CH float, int32, a, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_6CH int32, float, u, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_6CH int32, float, a, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

PACK_8CH float, float, u, 2, 2, 9, NOP_N, NOP_N
PACK_8CH float, float, a, 2, 2, 9, NOP_N, NOP_N

PACK_8CH float, int32, u, 2, 2, 10, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_8CH float, int32, a, 2, 2, 10, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_8CH int32, float, u, 2, 2, 10, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_8CH int32, float, a, 2, 2, 10, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

INIT_XMM ssse3
UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
UNPACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
UNPACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PACK_6CH float, float, u, 2, 2, 8, NOP_N, NOP_N
PACK_6CH float, float, a, 2, 2, 8, NOP_N, NOP_N

UNPACK_6CH float, float, u, 2, 2, 8, NOP_N, NOP_N
UNPACK_6CH float, float, a, 2, 2, 8, NOP_N, NOP_N

PACK_6CH float, int32, u, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH float, int32, a, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH int32, float, u, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_6CH int32, float, a, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

UNPACK_6CH float, int32, u, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_6CH float, int32, a, 2, 2, 8, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_6CH int32, float, u, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_6CH int32, float, a, 2, 2, 8, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

PACK_8CH float, float, u, 2, 2, 9, NOP_N, NOP_N
PACK_8CH float, float, a, 2, 2, 9, NOP_N, NOP_N

PACK_8CH float, int32, u, 2, 2, 10, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_8CH float, int32, a, 2, 2, 10, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_8CH int32, float, u, 2, 2, 10, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_8CH int32, float, a, 2, 2, 10, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

INIT_YMM avx
CONV float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
CONV float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
%endif

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
CONV int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
CONV int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
%endif