mirror of
				https://github.com/nyanmisaka/ffmpeg-rockchip.git
				synced 2025-10-31 20:42:49 +08:00 
			
		
		
		
	 1a35fffaf2
			
		
	
	1a35fffaf2
	
	
	
		
			
			if taken from stack, they may have garbage in the upper bits otherwise. Also, there are only 8 arguments, so don't attempt to load 11. Fixes SIGSEGV crashes in some targets. Reviewed-by: durandal_1707 Signed-off-by: James Almer <jamrial@gmail.com>
		
			
				
	
	
		
			698 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			NASM
		
	
	
	
	
	
			
		
		
	
	
			698 lines
		
	
	
		
			14 KiB
		
	
	
	
		
			NASM
		
	
	
	
	
	
;******************************************************************************
;* x86-optimized functions for the CFHD decoder
;* Copyright (c) 2020 Paul B Mahol
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Word-pair coefficients for pmaddwd. The filters interleave two rows/columns
; into (a, b) word pairs, so each dword lane of the product is c0*a + c1*b.
factor_p1_n1:  times 4 dw  1, -1        ; a - b
factor_n1_p1:  times 4 dw -1,  1        ; b - a
factor_p11_n4: times 4 dw 11, -4        ; 11*a - 4*b (edge taps)
factor_p5_p4:  times 4 dw  5,  4        ; 5*a + 4*b  (edge taps)

pd_4:    times 4 dd 4                   ; rounding bias added before the >> 3
pw_1:    times 8 dw 1                   ; pmaddwd by all-ones: a + b
pw_0:    times 8 dw 0                   ; lower bound handed to CLIPW
pw_1023: times 8 dw 1023                ; upper bound for the clip10 variant
pw_4095: times 8 dw 4095                ; upper bound for the clip12 variant

SECTION .text

;------------------------------------------------------------------------------
; CFHD_HORIZ_STORE_EDGE offset, clip_max
;
; Helper for CFHD_HORIZ_FILTER: stores the scalar edge sample held in tempd as
; a 16-bit word at [outputq + offset]. When clip_max is non-zero the value is
; first clamped to [0, clip_max] via CLIPW, which clobbers m0.
;------------------------------------------------------------------------------
%macro CFHD_HORIZ_STORE_EDGE 2
%if %2
    movd          xm0, tempd
    CLIPW          m0, [pw_0], [pw_%2]
    pextrw      tempd, xm0, 0
%endif
    mov  word [outputq + %1], tempw
%endmacro

;------------------------------------------------------------------------------
; CFHD_HORIZ_FILTER clip_max
;
; Emits one horizontal filter function for the current INIT_* target:
;   clip_max == 0    -> cfhd_horiz_filter(output, ostride, low, lwidth,
;                                         high, hwidth, width, height)
;                       processes `height` rows, no clipping
;   clip_max == 1023 -> cfhd_horiz_filter_clip10(output, low, high, width, ...)
;   clip_max == 4095 -> cfhd_horiz_filter_clip12(output, low, high, width, ...)
;                       process a single row, results clamped to [0, clip_max]
; The clip variants ask cglobal to load five arguments; the fifth lands in the
; x register and is overwritten before it is ever read.
;
; width and the strides are doubled on entry, turning counts of 16-bit samples
; into byte offsets. The shifts are done on the 32-bit registers so the upper
; halves of the 64-bit registers are cleared as well.
;
; Per row, with n = width (in samples) and signed 16-bit low/high inputs:
;   out[0]    = (((11*low[0]   - 4*low[1]   + low[2]   + 4) >> 3) + high[0])   >> 1
;   out[1]    = ((( 5*low[0]   + 4*low[1]   - low[2]   + 4) >> 3) - high[0])   >> 1
;   out[2i]   = (((low[i-1] - low[i+1] + 4) >> 3) + low[i] + high[i]) >> 1
;   out[2i+1] = (((low[i+1] - low[i-1] + 4) >> 3) + low[i] - high[i]) >> 1
;   out[2n-2] = ((( 5*low[n-1] + 4*low[n-2] - low[n-3] + 4) >> 3) + high[n-1]) >> 1
;   out[2n-1] = (((11*low[n-1] - 4*low[n-2] + low[n-3] + 4) >> 3) - high[n-1]) >> 1
; The first and last pairs are computed with scalar code. The middle pairs are
; computed by the vector loop, which starts at i = 1, handles mmsize/2 values
; of i per iteration and packs its results with signed saturation. The final
; two samples are always (re)written by the scalar tail after the loop.
;------------------------------------------------------------------------------
%macro CFHD_HORIZ_FILTER 1
%if %1 == 1023
cglobal cfhd_horiz_filter_clip10, 5, 6, 8 + 4 * ARCH_X86_64, output, low, high, width, x, temp
    shl        widthd, 1
%define ostrideq widthq
%define lwidthq  widthq
%define hwidthq  widthq
%elif %1 == 4095
cglobal cfhd_horiz_filter_clip12, 5, 6, 8 + 4 * ARCH_X86_64, output, low, high, width, x, temp
    shl        widthd, 1
%define ostrideq widthq
%define lwidthq  widthq
%define hwidthq  widthq
%else
%if ARCH_X86_64
cglobal cfhd_horiz_filter, 8, 11, 12, output, ostride, low, lwidth, high, hwidth, width, height, x, y, temp
    shl  ostrided, 1
    shl   lwidthd, 1
    shl   hwidthd, 1
    shl    widthd, 1

    ; y counts rows from -height up to 0
    mov        yd, heightd
    neg        yq
%else
    ; x86-32: the registers named x, y and temp initially hold ostride, lwidth
    ; and hwidth (argument slots 1, 3 and 5). They are doubled, written back
    ; to their stack slots and then reused as scratch registers.
cglobal cfhd_horiz_filter, 7, 7, 8, output, x, low, y, high, temp, width, height
    shl        xd, 1
    shl        yd, 1
    shl     tempd, 1
    shl    widthd, 1

    mov       xmp, xq
    mov       ymp, yq
    mov    tempmp, tempq

    ; y counts rows from -height (eighth argument, read from the stack) up to 0
    mov        yd, r7m
    neg        yq

%define ostrideq xm
%define lwidthq  ym
%define hwidthq  tempm
%endif
%endif

%if ARCH_X86_64
    ; enough vector registers to keep the constants resident
    mova       m8, [factor_p1_n1]
    mova       m9, [factor_n1_p1]
    mova      m10, [pw_1]
    mova      m11, [pd_4]
%endif

%if %1 == 0
.looph:
%endif
    ; ---- left edge, out[0]: ((11*low[0] - 4*low[1] + low[2] + 4) >> 3 + high[0]) >> 1
    movsx          xq, word [lowq]
    imul           xq, 11

    movsx       tempq, word [lowq + 2]
    imul        tempq, -4
    add         tempq, xq

    movsx          xq, word [lowq + 4]
    add         tempq, xq
    add         tempq, 4
    sar         tempq, 3

    movsx          xq, word [highq]
    add         tempq, xq
    sar         tempq, 1

    CFHD_HORIZ_STORE_EDGE 0, %1

    ; ---- left edge, out[1]: ((5*low[0] + 4*low[1] - low[2] + 4) >> 3 - high[0]) >> 1
    movsx          xq, word [lowq]
    imul           xq, 5

    movsx       tempq, word [lowq + 2]
    imul        tempq, 4
    add         tempq, xq

    movsx          xq, word [lowq + 4]
    sub         tempq, xq
    add         tempq, 4
    sar         tempq, 3

    movsx          xq, word [highq]
    sub         tempq, xq
    sar         tempq, 1

    CFHD_HORIZ_STORE_EDGE 2, %1

    ; x = byte offset into low/high for the vector loop; flags are dead here
    xor            xd, xd

.loop:
    ; m4/m5 = interleaved (low[i-1], low[i+1]) word pairs, low/high halves
    movu           m4, [lowq + xq]
    movu           m1, [lowq + xq + 4]

    mova           m5, m4
    punpcklwd      m4, m1
    punpckhwd      m5, m1

    mova           m6, m4
    mova           m7, m5

    ; m4/m5 = low[i-1] - low[i+1] + 4, m6/m7 = low[i+1] - low[i-1] + 4
%if ARCH_X86_64
    pmaddwd        m4, m8
    pmaddwd        m5, m8
    pmaddwd        m6, m9
    pmaddwd        m7, m9

    paddd          m4, m11
    paddd          m5, m11
    paddd          m6, m11
    paddd          m7, m11
%else
    pmaddwd        m4, [factor_p1_n1]
    pmaddwd        m5, [factor_p1_n1]
    pmaddwd        m6, [factor_n1_p1]
    pmaddwd        m7, [factor_n1_p1]

    paddd          m4, [pd_4]
    paddd          m5, [pd_4]
    paddd          m6, [pd_4]
    paddd          m7, [pd_4]
%endif

    psrad          m4, 3
    psrad          m5, 3
    psrad          m6, 3
    psrad          m7, 3

    ; m2/m0 = interleaved (low[i], high[i]) word pairs, low/high halves
    movu           m2, [lowq + xq + 2]
    movu           m3, [highq + xq + 2]

    mova           m0, m2
    punpcklwd      m2, m3
    punpckhwd      m0, m3

    mova           m1, m2
    mova           m3, m0

    ; m2/m0 = low[i] + high[i], m1/m3 = low[i] - high[i]
%if ARCH_X86_64
    pmaddwd        m2, m10
    pmaddwd        m0, m10
    pmaddwd        m1, m8
    pmaddwd        m3, m8
%else
    pmaddwd        m2, [pw_1]
    pmaddwd        m0, [pw_1]
    pmaddwd        m1, [factor_p1_n1]
    pmaddwd        m3, [factor_p1_n1]
%endif

    paddd          m2, m4
    paddd          m0, m5
    paddd          m1, m6
    paddd          m3, m7

    psrad          m2, 1
    psrad          m0, 1
    psrad          m1, 1
    psrad          m3, 1

    ; m2 = even outputs, m1 = odd outputs (saturated to int16)
    packssdw       m2, m0
    packssdw       m1, m3

    ; interleave even/odd back into output order
    mova           m0, m2
    punpcklwd      m2, m1
    punpckhwd      m0, m1

%if %1
    CLIPW          m2, [pw_0], [pw_%1]
    CLIPW          m0, [pw_0], [pw_%1]
%endif

    ; two output samples per input sample, and the loop starts at i = 1,
    ; hence the x*2 scaling and the 4-byte bias
    movu  [outputq + xq * 2 + 4], m2
    movu  [outputq + xq * 2 + mmsize + 4], m0

    add            xq, mmsize
    cmp            xq, widthq
    jl .loop

    ; point low/high/output just past the end of the row so the right edge
    ; can be addressed with negative offsets
    add          lowq, widthq
    add         highq, widthq
    add       outputq, widthq
    add       outputq, widthq

    ; ---- right edge, out[2n-2]: ((5*low[n-1] + 4*low[n-2] - low[n-3] + 4) >> 3 + high[n-1]) >> 1
    movsx          xq, word [lowq - 2]
    imul           xq, 5

    movsx       tempq, word [lowq - 4]
    imul        tempq, 4
    add         tempq, xq

    movsx          xq, word [lowq - 6]
    sub         tempq, xq
    add         tempq, 4
    sar         tempq, 3

    movsx          xq, word [highq - 2]
    add         tempq, xq
    sar         tempq, 1

    CFHD_HORIZ_STORE_EDGE -4, %1

    ; ---- right edge, out[2n-1]: ((11*low[n-1] - 4*low[n-2] + low[n-3] + 4) >> 3 - high[n-1]) >> 1
    movsx          xq, word [lowq - 2]
    imul           xq, 11

    movsx       tempq, word [lowq - 4]
    imul        tempq, -4
    add         tempq, xq

    movsx          xq, word [lowq - 6]
    add         tempq, xq
    add         tempq, 4
    sar         tempq, 3

    movsx          xq, word [highq - 2]
    sub         tempq, xq
    sar         tempq, 1

    CFHD_HORIZ_STORE_EDGE -2, %1

%if %1 == 0
    ; rewind to the start of the row ...
    sub          lowq, widthq
    sub         highq, widthq
    sub       outputq, widthq
    sub       outputq, widthq

    ; ... then step to the next one; output advances by two (doubled) strides
    add          lowq, lwidthq
    add         highq, hwidthq
    add       outputq, ostrideq
    add       outputq, ostrideq
    add            yq, 1
    jl .looph
%endif

    RET
%endmacro

; unclipped multi-row variant: cfhd_horiz_filter
INIT_XMM sse2
CFHD_HORIZ_FILTER 0

; single-row variant clamped to [0, 1023]: cfhd_horiz_filter_clip10
INIT_XMM sse2
CFHD_HORIZ_FILTER 1023

; single-row variant clamped to [0, 4095]: cfhd_horiz_filter_clip12
INIT_XMM sse2
CFHD_HORIZ_FILTER 4095

;------------------------------------------------------------------------------
; cfhd_vert_filter(output, ostride, low, lwidth, high, hwidth, width, height)
;
; Vertical counterpart of the horizontal filter: the same taps are applied
; down each column, mmsize bytes (mmsize/2 16-bit samples) of columns at a
; time. width and the three strides are doubled on entry (samples -> bytes)
; using 32-bit shifts, which also clears the upper register halves.
;
; For every column, with h = height input rows and rows addressed through the
; respective stride:
;   out[0]    = (((11*low[0]   - 4*low[1]   + low[2]   + 4) >> 3) + high[0])   >> 1
;   out[1]    = ((( 5*low[0]   + 4*low[1]   - low[2]   + 4) >> 3) - high[0])   >> 1
;   out[2y]   = (((low[y-1] - low[y+1] + 4) >> 3) + low[y] + high[y]) >> 1
;   out[2y+1] = (((low[y+1] - low[y-1] + 4) >> 3) + low[y] - high[y]) >> 1
;   out[2h-2] = ((( 5*low[h-1] + 4*low[h-2] - low[h-3] + 4) >> 3) + high[h-1]) >> 1
;   out[2h-1] = (((11*low[h-1] - 4*low[h-2] + low[h-3] + 4) >> 3) - high[h-1]) >> 1
; The middle rows are handled by .looph, which starts at y = 1 and always runs
; at least once. All results are packed to int16 with signed saturation.
;
; Registers: x = byte offset of the current column group, y = input row,
; pos = scratch byte offset. Single rows are sign-extended to dwords by
; unpacking them into the high words of a zeroed register followed by an
; arithmetic shift right by 16.
;------------------------------------------------------------------------------
INIT_XMM sse2
%if ARCH_X86_64
cglobal cfhd_vert_filter, 8, 11, 14, output, ostride, low, lwidth, high, hwidth, width, height, x, y, pos
    shl        ostrided, 1
    shl         lwidthd, 1
    shl         hwidthd, 1
    shl          widthd, 1

    ; height - 1 = index of the last input row, used as the .looph bound
    dec   heightd

    mova       m8, [factor_p1_n1]
    mova       m9, [factor_n1_p1]
    mova      m10, [pw_1]
    mova      m11, [pd_4]
    mova      m12, [factor_p11_n4]
    mova      m13, [factor_p5_p4]
%else
    ; x86-32: the registers named x, y and pos initially hold ostride, lwidth
    ; and hwidth (argument slots 1, 3 and 5). They are doubled, written back
    ; to their stack slots and then reused as scratch registers.
cglobal cfhd_vert_filter, 7, 7, 8, output, x, low, y, high, pos, width, height
    shl        xd, 1
    shl        yd, 1
    shl      posd, 1
    shl    widthd, 1

    mov       xmp, xq
    mov       ymp, yq
    mov     posmp, posq

    ; height - 1 is parked in the width argument's stack slot; the doubled
    ; width itself stays in its register
    mov        xq, r7m
    dec        xq
    mov   widthmp, xq

%define ostrideq xm
%define lwidthq  ym
%define hwidthq  posm
%define heightq  widthm

%endif

    xor        xq, xq
.loopw:
    xor        yq, yq

    ; ---- top edge, out[0]: ((11*low[0] - 4*low[1] + low[2] + 4) >> 3 + high[0]) >> 1
    mov      posq, xq
    movu       m0, [lowq + posq]
    add      posq, lwidthq
    movu       m1, [lowq + posq]
    mova       m2, m0
    punpcklwd  m0, m1
    punpckhwd  m2, m1

%if ARCH_X86_64
    pmaddwd    m0, m12
    pmaddwd    m2, m12
%else
    pmaddwd    m0, [factor_p11_n4]
    pmaddwd    m2, [factor_p11_n4]
%endif

    ; m4/m3 = low[2] sign-extended to dwords
    pxor       m4, m4
    add      posq, lwidthq
    movu       m1, [lowq + posq]
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    paddd      m0, m4
    paddd      m2, m3

    ; rounding bias: use the preloaded register where one is available,
    ; matching the bottom-edge code
%if ARCH_X86_64
    paddd      m0, m11
    paddd      m2, m11
%else
    paddd      m0, [pd_4]
    paddd      m2, [pd_4]
%endif

    psrad      m0, 3
    psrad      m2, 3

    ; m4/m3 = high[0] sign-extended to dwords
    mov      posq, xq
    pxor       m4, m4
    movu       m1, [highq + posq]
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    paddd      m0, m4
    paddd      m2, m3

    psrad      m0, 1
    psrad      m2, 1

    packssdw   m0, m2

    movu    [outputq + posq], m0

    ; ---- top edge, out[1]: ((5*low[0] + 4*low[1] - low[2] + 4) >> 3 - high[0]) >> 1
    movu       m0, [lowq + posq]
    add      posq, lwidthq
    movu       m1, [lowq + posq]
    mova       m2, m0
    punpcklwd  m0, m1
    punpckhwd  m2, m1

%if ARCH_X86_64
    pmaddwd    m0, m13
    pmaddwd    m2, m13
%else
    pmaddwd    m0, [factor_p5_p4]
    pmaddwd    m2, [factor_p5_p4]
%endif

    pxor       m4, m4
    add      posq, lwidthq
    movu       m1, [lowq + posq]
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    psubd      m0, m4
    psubd      m2, m3

%if ARCH_X86_64
    paddd      m0, m11
    paddd      m2, m11
%else
    paddd      m0, [pd_4]
    paddd      m2, [pd_4]
%endif

    psrad      m0, 3
    psrad      m2, 3

    mov      posq, xq
    pxor       m4, m4
    movu       m1, [highq + posq]
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    psubd      m0, m4
    psubd      m2, m3

    psrad      m0, 1
    psrad      m2, 1

    packssdw   m0, m2

    add      posq, ostrideq
    movu    [outputq + posq], m0

    ; ---- middle rows, y = 1 .. (height - 1) - 1
    add        yq, 1
.looph:
    ; pos -> low row y-1
    mov      posq, lwidthq
    imul     posq, yq
    sub      posq, lwidthq
    add      posq, xq

    movu       m4, [lowq + posq]

    ; pos -> low row y+1
    add      posq, lwidthq
    add      posq, lwidthq
    movu       m1, [lowq + posq]

    mova       m5, m4
    punpcklwd  m4, m1
    punpckhwd  m5, m1

    mova       m6, m4
    mova       m7, m5

    ; m4/m5 = low[y-1] - low[y+1] + 4, m6/m7 = low[y+1] - low[y-1] + 4
%if ARCH_X86_64
    pmaddwd    m4, m8
    pmaddwd    m5, m8
    pmaddwd    m6, m9
    pmaddwd    m7, m9

    paddd      m4, m11
    paddd      m5, m11
    paddd      m6, m11
    paddd      m7, m11
%else
    pmaddwd    m4, [factor_p1_n1]
    pmaddwd    m5, [factor_p1_n1]
    pmaddwd    m6, [factor_n1_p1]
    pmaddwd    m7, [factor_n1_p1]

    paddd      m4, [pd_4]
    paddd      m5, [pd_4]
    paddd      m6, [pd_4]
    paddd      m7, [pd_4]
%endif

    psrad      m4, 3
    psrad      m5, 3
    psrad      m6, 3
    psrad      m7, 3

    ; m0 = low row y
    sub      posq, lwidthq
    movu       m0, [lowq + posq]

    ; m1 = high row y
    mov      posq, hwidthq
    imul     posq, yq
    add      posq, xq
    movu       m1, [highq + posq]

    mova       m2, m0
    punpcklwd  m0, m1
    punpckhwd  m2, m1

    mova       m1, m0
    mova       m3, m2

    ; m0/m2 = low[y] + high[y], m1/m3 = low[y] - high[y]
%if ARCH_X86_64
    pmaddwd    m0, m10
    pmaddwd    m2, m10
    pmaddwd    m1, m8
    pmaddwd    m3, m8
%else
    pmaddwd    m0, [pw_1]
    pmaddwd    m2, [pw_1]
    pmaddwd    m1, [factor_p1_n1]
    pmaddwd    m3, [factor_p1_n1]
%endif

    paddd      m0, m4
    paddd      m2, m5
    paddd      m1, m6
    paddd      m3, m7

    psrad      m0, 1
    psrad      m2, 1
    psrad      m1, 1
    psrad      m3, 1

    packssdw   m0, m2
    packssdw   m1, m3

    ; pos -> output row 2*y; row 2*y + 1 follows one stride later
    mov      posq, ostrideq
    imul     posq, 2
    imul     posq, yq
    add      posq, xq

    movu    [outputq + posq], m0
    add      posq, ostrideq
    movu    [outputq + posq], m1

    add        yq, 1
    cmp        yq, heightq
    jl .looph

    ; ---- bottom edge (y = last input row), out[2y]:
    ;      ((5*low[y] + 4*low[y-1] - low[y-2] + 4) >> 3 + high[y]) >> 1
    mov      posq, lwidthq
    imul     posq, yq
    add      posq, xq
    movu       m0, [lowq + posq]
    sub      posq, lwidthq
    movu       m1, [lowq + posq]
    mova       m2, m0
    punpcklwd  m0, m1
    punpckhwd  m2, m1

%if ARCH_X86_64
    pmaddwd    m0, m13
    pmaddwd    m2, m13
%else
    pmaddwd    m0, [factor_p5_p4]
    pmaddwd    m2, [factor_p5_p4]
%endif

    pxor       m4, m4
    sub      posq, lwidthq
    movu       m1, [lowq + posq]
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    psubd      m0, m4
    psubd      m2, m3

%if ARCH_X86_64
    paddd      m0, m11
    paddd      m2, m11
%else
    paddd      m0, [pd_4]
    paddd      m2, [pd_4]
%endif

    psrad      m0, 3
    psrad      m2, 3

    mov      posq, hwidthq
    imul     posq, yq
    add      posq, xq
    pxor       m4, m4
    movu       m1, [highq + posq]
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    paddd      m0, m4
    paddd      m2, m3

    psrad      m0, 1
    psrad      m2, 1

    packssdw   m0, m2

    mov      posq, ostrideq
    imul     posq, 2
    imul     posq, yq
    add      posq, xq
    movu    [outputq + posq], m0

    ; ---- bottom edge, out[2y+1]:
    ;      ((11*low[y] - 4*low[y-1] + low[y-2] + 4) >> 3 - high[y]) >> 1
    mov      posq, lwidthq
    imul     posq, yq
    add      posq, xq
    movu       m0, [lowq + posq]
    sub      posq, lwidthq
    movu       m1, [lowq + posq]
    mova       m2, m0
    punpcklwd  m0, m1
    punpckhwd  m2, m1

%if ARCH_X86_64
    pmaddwd    m0, m12
    pmaddwd    m2, m12
%else
    pmaddwd    m0, [factor_p11_n4]
    pmaddwd    m2, [factor_p11_n4]
%endif

    pxor       m4, m4
    sub      posq, lwidthq
    movu       m1, [lowq + posq]
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    paddd      m0, m4
    paddd      m2, m3

%if ARCH_X86_64
    paddd      m0, m11
    paddd      m2, m11
%else
    paddd      m0, [pd_4]
    paddd      m2, [pd_4]
%endif

    psrad      m0, 3
    psrad      m2, 3

    mov      posq, hwidthq
    imul     posq, yq
    add      posq, xq
    pxor       m4, m4
    movu       m1, [highq + posq]
    mova       m3, m4
    punpcklwd  m4, m1
    punpckhwd  m3, m1

    psrad      m4, 16
    psrad      m3, 16

    psubd      m0, m4
    psubd      m2, m3

    psrad      m0, 1
    psrad      m2, 1

    packssdw   m0, m2

    mov      posq, ostrideq
    imul     posq, 2
    imul     posq, yq
    add      posq, ostrideq
    add      posq, xq
    movu    [outputq + posq], m0

    ; next group of mmsize/2 columns
    add        xq, mmsize
    cmp        xq, widthq
    jl .loopw
    RET