mirror of
				https://github.com/nyanmisaka/ffmpeg-rockchip.git
				synced 2025-10-31 20:42:49 +08:00 
			
		
		
		
	x86/tx_float: fully support 128bit regs in LOAD64_LUT
The gather path didn't support 128-bit registers. Using the gather path with 128-bit registers is not faster on Zen 3, but support is added for completeness.
This commit is contained in:
		| @@ -107,19 +107,19 @@ SECTION .text | ||||
| ; %7 - temporary register (for avx only, enables vgatherdpd (AVX2) if FMA3 is set) | ||||
| %macro LOAD64_LUT 5-7 | ||||
| %if %0 > 6 && cpuflag(avx2) | ||||
|     pcmpeqd %6, %6 ; pcmpeqq has a 0.5 throughput on Zen 3, this has 0.25 | ||||
|     movapd xmm%7, [%3 + %4] ; float mov since vgatherdpd is a float instruction | ||||
|     vgatherdpd %1, [%2 + xmm%7*8], %6 ; must use separate registers for args | ||||
|     pcmpeqd %7, %7 ; pcmpeqq has a 0.5 throughput on Zen 3, this has 0.25 | ||||
|     movupd xmm%6, [%3 + %4] ; float mov since vgatherdpd is a float instruction | ||||
|     vgatherdpd %1, [%2 + xmm%6*8], %7 ; must use separate registers for args | ||||
| %else | ||||
|     mov      %5d, [%3 + %4 + 0] | ||||
|     movsd  xmm%1, [%2 + %5q*8] | ||||
| %if mmsize == 32 | ||||
| %if sizeof%1 > 16 && %0 > 5 | ||||
|     mov      %5d, [%3 + %4 + 8] | ||||
|     movsd  xmm%6, [%2 + %5q*8] | ||||
| %endif | ||||
|     mov      %5d, [%3 + %4 + 4] | ||||
|     movhps xmm%1, [%2 + %5q*8] | ||||
| %if mmsize == 32 | ||||
| %if sizeof%1 > 16 && %0 > 5 | ||||
|     mov      %5d, [%3 + %4 + 12] | ||||
|     movhps xmm%6, [%2 + %5q*8] | ||||
|     vinsertf128 %1, %1, xmm%6, 1 | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Lynne
					Lynne