mirror of
				https://github.com/nyanmisaka/ffmpeg-rockchip.git
				synced 2025-10-31 12:36:41 +08:00 
			
		
		
		
	 42c1cc35b7
			
		
	
	42c1cc35b7
	
	
	
		
			
			The previous implementation targeted DTS Coherent Acoustics, which only
requires mdct_bits == 6. This relatively small size lent itself to
unrolling the loops a small number of times, and encoding offsets
calculated at assembly time within the load/store instructions of each
iteration.
In the more general case (codecs such as AAC and AC3) much larger arrays
are used - mdct_bits == [8, 9, 11]. The old method does not scale for
these cases, so more integer registers are used with non-unrolled versions
of the loops (and with some stack spillage). The postrotation filter loop
is still unrolled by a factor of 2 to permit the double-buffering of some
VFP registers to facilitate overlap of neighbouring iterations.
I benchmarked the result by measuring the number of gperftools samples
that hit anywhere in the AAC decoder (starting from aac_decode_frame())
or specifically in ff_imdct_half_c / ff_imdct_half_vfp, for the same
example AAC stream:
                  Before          After
                  Mean   StdDev   Mean   StdDev  Confidence  Change
aac_decode_frame  2368.1 35.8     2117.2 35.3    100.0%      +11.8%
ff_imdct_half_*   457.5  22.4     251.2  16.2    100.0%      +82.1%
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
		
	
		
			
				
	
	
		
			348 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			348 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
| /*
 | |
|  * Copyright (c) 2013 RISC OS Open Ltd
 | |
|  * Author: Ben Avison <bavison@riscosopen.org>
 | |
|  *
 | |
|  * This file is part of FFmpeg.
 | |
|  *
 | |
|  * FFmpeg is free software; you can redistribute it and/or
 | |
|  * modify it under the terms of the GNU Lesser General Public
 | |
|  * License as published by the Free Software Foundation; either
 | |
|  * version 2.1 of the License, or (at your option) any later version.
 | |
|  *
 | |
|  * FFmpeg is distributed in the hope that it will be useful,
 | |
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | |
|  * Lesser General Public License for more details.
 | |
|  *
 | |
|  * You should have received a copy of the GNU Lesser General Public
 | |
|  * License along with FFmpeg; if not, write to the Free Software
 | |
|  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 | |
|  */
 | |
| 
 | |
| #include "libavutil/arm/asm.S"
 | |
| 
 | |
/*
 * Symbolic names for the core registers used in this file.
 *
 * CONTEXT, ORIGOUT and IN are a1-a3, which is where the three arguments of
 * ff_imdct_half_vfp(s, output, input) arrive.
 *
 * Some names deliberately share one physical register, so only one name of
 * each group can hold a live value at any time:
 *   a2: ORIGOUT and J0
 *   v5: OLDFPSCR and REVTAB_HI
 *   v6: IN_HI and OUT_HI
 */
| CONTEXT .req    a1
 | |
| ORIGOUT .req    a2
 | |
| IN      .req    a3
 | |
| OUT     .req    v1
 | |
| REVTAB  .req    v2
 | |
| TCOS    .req    v3
 | |
| TSIN    .req    v4
 | |
| OLDFPSCR .req   v5
 | |
/* J0-J3: scratch registers that receive the REVTAB indices and are then
 * turned into store addresses inside OUT by the pre-rotation macros. */
| J0      .req    a2
 | |
| J1      .req    a4
 | |
| J2      .req    ip
 | |
| J3      .req    lr
 | |
/* *_HI: second pointer of each pair used by the rolled (non-unrolled)
 * loops; the function body initialises each one above its partner. */
| REVTAB_HI .req  v5
 | |
| IN_HI   .req    v6
 | |
| OUT_HI  .req    v6
 | |
| TCOS_HI .req    sl
 | |
| TSIN_HI .req    fp
 | |
| 
 | |
/*
 * prerotation_innerloop
 *
 * One step of the unrolled pre-rotation. All memory offsets are computed at
 * assembly time from the symbols k and n4, which the invoking code must
 * have .set beforehand; k is advanced by 2 at the end of each expansion.
 *
 * Reads:  two TCOS and two TSIN entries at index k and at index n4 - k - 2,
 *         and eight input floats (four near IN + 8*k bytes, four near
 *         IN + 8*(n4 - k - 2) bytes).
 * Writes: four (first, second) float pairs into OUT, each pair at
 *         OUT + 8 * index, the four indices being taken from REVTAB.
 *
 * With the length-4 short-vector FPSCR setting that ff_imdct_half_vfp
 * installs before using this macro, every instruction tagged
 * "vector operation" acts on four consecutive registers, giving
 *   s8..s11  = s0..s3 * s16..s19 - s4..s7 * s20..s23
 *   s12..s15 = s0..s3 * s20..s23 + s4..s7 * s16..s19
 *
 * Clobbers J0-J3 (a2, a4, ip, lr), s0-s15 and s16-s23.
 */
| .macro prerotation_innerloop
 | |
|  .set trig_lo, k
 | |
|  .set trig_hi, n4 - k - 2
 | |
|  .set in_lo, trig_lo * 2
 | |
|  .set in_hi, trig_hi * 2
 | |
/* Table and input loads, interleaved with the first multiplies. */
|         vldr    d8, [TCOS, #trig_lo*4]          @ s16,s17
 | |
|         vldr    d9, [TCOS, #trig_hi*4]          @ s18,s19
 | |
|         vldr    s0, [IN, #in_hi*4 + 12]
 | |
|         vldr    s1, [IN, #in_hi*4 + 4]
 | |
|         vldr    s2, [IN, #in_lo*4 + 12]
 | |
|         vldr    s3, [IN, #in_lo*4 + 4]
 | |
|         vmul.f  s8, s0, s16                     @ vector operation
 | |
|         vldr    d10, [TSIN, #trig_lo*4]         @ s20,s21
 | |
|         vldr    d11, [TSIN, #trig_hi*4]         @ s22,s23
 | |
|         vldr    s4, [IN, #in_lo*4]
 | |
|         vldr    s5, [IN, #in_lo*4 + 8]
 | |
|         vldr    s6, [IN, #in_hi*4]
 | |
|         vldr    s7, [IN, #in_hi*4 + 8]
 | |
/* Each 32-bit REVTAB load carries two halfword indices: the low one stays
 * in J0/J2 (masked), the high one is shifted down into J1/J3. */
|         ldr     J0, [REVTAB, #trig_lo*2]
 | |
|         vmul.f  s12, s0, s20                    @ vector operation
 | |
|         ldr     J2, [REVTAB, #trig_hi*2]
 | |
|         mov     J1, J0, lsr #16
 | |
|         and     J0, J0, #255                    @ halfword value will be < n4
 | |
|         vmls.f  s8, s4, s20                     @ vector operation
 | |
|         mov     J3, J2, lsr #16
 | |
|         and     J2, J2, #255                    @ halfword value will be < n4
 | |
/* Turn the indices into addresses of 8-byte slots inside OUT. */
|         add     J0, OUT, J0, lsl #3
 | |
|         vmla.f  s12, s4, s16                    @ vector operation
 | |
|         add     J1, OUT, J1, lsl #3
 | |
|         add     J2, OUT, J2, lsl #3
 | |
|         add     J3, OUT, J3, lsl #3
 | |
/* Scatter the results: s8..s11 to the first word of each slot, s12..s15 to
 * the second word. */
|         vstr    s8, [J0]
 | |
|         vstr    s9, [J1]
 | |
|         vstr    s10, [J2]
 | |
|         vstr    s11, [J3]
 | |
|         vstr    s12, [J0, #4]
 | |
|         vstr    s13, [J1, #4]
 | |
|         vstr    s14, [J2, #4]
 | |
|         vstr    s15, [J3, #4]
 | |
|  .set k, k + 2
 | |
| .endm
 | |
| 
 | |
/*
 * prerotation_innerloop_rolled
 *
 * Loop-body form of the pre-rotation step: the same arithmetic and the same
 * scatter into OUT as prerotation_innerloop, but every address comes from a
 * pointer register that is updated as a side effect, so the macro can be
 * expanded once and branched back to.
 *
 * Pointer updates per expansion:
 *   TCOS, TSIN        advance by two floats (vldmia with writeback)
 *   TCOS_HI, TSIN_HI  retreat by two floats (vldmdb with writeback)
 *   IN                advances by 16 bytes
 *   IN_HI             retreats by 16 bytes
 *   REVTAB            advances by two halfwords
 *   REVTAB_HI         retreats by two halfwords
 *
 * Clobbers J0-J3 (a2, a4, ip, lr), s0-s15 and s16-s23. The instructions
 * tagged "vector operation" rely on the length-4 short-vector FPSCR setting
 * installed by ff_imdct_half_vfp.
 */
| .macro prerotation_innerloop_rolled
 | |
|         vldmia  TCOS!, {s16,s17}
 | |
|         vldmdb  TCOS_HI!, {s18,s19}
 | |
|         vldr    s0, [IN_HI, #-4]
 | |
|         vldr    s1, [IN_HI, #-12]
 | |
|         vldr    s2, [IN, #12]
 | |
|         vldr    s3, [IN, #4]
 | |
|         vmul.f  s8, s0, s16                     @ vector operation
 | |
|         vldmia  TSIN!, {s20,s21}
 | |
|         vldmdb  TSIN_HI!, {s22,s23}
 | |
|         vldr    s4, [IN]
 | |
|         vldr    s5, [IN, #8]
 | |
|         vldr    s6, [IN_HI, #-16]
 | |
|         vldr    s7, [IN_HI, #-8]
 | |
|         vmul.f  s12, s0, s20                    @ vector operation
 | |
/* All eight inputs are now in registers, so both input pointers can move. */
|         add     IN, IN, #16
 | |
|         sub     IN_HI, IN_HI, #16
 | |
/* Indices from the low end are read upwards into J0 then J1; indices from
 * the high end are read downwards into J3 then J2. */
|         ldrh    J0, [REVTAB], #2
 | |
|         ldrh    J1, [REVTAB], #2
 | |
|         vmls.f  s8, s4, s20                     @ vector operation
 | |
|         ldrh    J3, [REVTAB_HI, #-2]!
 | |
|         ldrh    J2, [REVTAB_HI, #-2]!
 | |
/* Turn the indices into addresses of 8-byte slots inside OUT. */
|         add     J0, OUT, J0, lsl #3
 | |
|         vmla.f  s12, s4, s16                    @ vector operation
 | |
|         add     J1, OUT, J1, lsl #3
 | |
|         add     J2, OUT, J2, lsl #3
 | |
|         add     J3, OUT, J3, lsl #3
 | |
/* s8..s11 go to the first word of each slot, s12..s15 to the second. */
|         vstr    s8, [J0]
 | |
|         vstr    s9, [J1]
 | |
|         vstr    s10, [J2]
 | |
|         vstr    s11, [J3]
 | |
|         vstr    s12, [J0, #4]
 | |
|         vstr    s13, [J1, #4]
 | |
|         vstr    s14, [J2, #4]
 | |
|         vstr    s15, [J3, #4]
 | |
| .endm
 | |
| 
 | |
/*
 * postrotation_innerloop tail, head
 *
 * One step of the unrolled, software-pipelined post-rotation. Each argument
 * is a flag: any non-empty text enables that half.
 *
 *   head  starts the step for index k: loads TSIN and TCOS entries at
 *         n8 - k - 2 and n8 + k, loads eight floats from OUT, and issues the
 *         two multiplies by TSIN. Only when head is given is k advanced by 2.
 *   tail  finishes the step begun by the previous expansion (index k - 2):
 *         applies the TCOS terms and stores the eight results back to OUT.
 *
 * The two halves are interleaved instruction by instruction; the tail
 * arithmetic is placed ahead of the head loads that overwrite s0-s7.
 *
 * TCOS values are double-buffered: when (k & 2) == 0 the head loads s20-s23
 * and the tail consumes s24 onwards, otherwise the roles are swapped, so the
 * tail always reads what the previous expansion's head loaded.
 *
 * Requires the symbols k and n8 to be .set by the invoking code, and the
 * length-4 short-vector FPSCR setting installed by ff_imdct_half_vfp for
 * the instructions tagged "vector operation".
 */
| .macro postrotation_innerloop tail, head
 | |
|  .set trig_lo_head, n8 - k - 2
 | |
|  .set trig_hi_head, n8 + k
 | |
|  .set out_lo_head, trig_lo_head * 2
 | |
|  .set out_hi_head, trig_hi_head * 2
 | |
|  .set trig_lo_tail, n8 - (k - 2) - 2
 | |
|  .set trig_hi_tail, n8 + (k - 2)
 | |
|  .set out_lo_tail, trig_lo_tail * 2
 | |
|  .set out_hi_tail, trig_hi_tail * 2
 | |
/* Pick the TCOS register bank for this expansion (aliases are released
 * again before .endm). */
|  .if (k & 2) == 0
 | |
|   TCOS_D0_HEAD .req d10 @ s20,s21
 | |
|   TCOS_D1_HEAD .req d11 @ s22,s23
 | |
|   TCOS_S0_TAIL .req s24
 | |
|  .else
 | |
|   TCOS_D0_HEAD .req d12 @ s24,s25
 | |
|   TCOS_D1_HEAD .req d13 @ s26,s27
 | |
|   TCOS_S0_TAIL .req s20
 | |
|  .endif
 | |
/* tail: s8..s11 -= s0..s3 * TCOS */
|  .ifnc "\tail",""
 | |
|         vmls.f  s8, s0, TCOS_S0_TAIL        @ vector operation
 | |
|  .endif
 | |
|  .ifnc "\head",""
 | |
|         vldr    d8, [TSIN, #trig_lo_head*4] @ s16,s17
 | |
|         vldr    d9, [TSIN, #trig_hi_head*4] @ s18,s19
 | |
|         vldr    TCOS_D0_HEAD, [TCOS, #trig_lo_head*4]
 | |
|  .endif
 | |
/* tail: s12..s15 += s4..s7 * TCOS; last use of the old s0-s7 contents */
|  .ifnc "\tail",""
 | |
|         vmla.f  s12, s4, TCOS_S0_TAIL       @ vector operation
 | |
|  .endif
 | |
/* head: even-offset floats go to s0-s3, odd-offset floats to s4-s7 */
|  .ifnc "\head",""
 | |
|         vldr    s0, [OUT, #out_lo_head*4]
 | |
|         vldr    s1, [OUT, #out_lo_head*4 + 8]
 | |
|         vldr    s2, [OUT, #out_hi_head*4]
 | |
|         vldr    s3, [OUT, #out_hi_head*4 + 8]
 | |
|         vldr    s4, [OUT, #out_lo_head*4 + 4]
 | |
|         vldr    s5, [OUT, #out_lo_head*4 + 12]
 | |
|         vldr    s6, [OUT, #out_hi_head*4 + 4]
 | |
|         vldr    s7, [OUT, #out_hi_head*4 + 12]
 | |
|  .endif
 | |
/* tail: s8..s11 are stored before the head multiply below overwrites them */
|  .ifnc "\tail",""
 | |
|         vstr    s8, [OUT, #out_lo_tail*4]
 | |
|         vstr    s9, [OUT, #out_lo_tail*4 + 8]
 | |
|         vstr    s10, [OUT, #out_hi_tail*4]
 | |
|         vstr    s11, [OUT, #out_hi_tail*4 + 8]
 | |
|  .endif
 | |
|  .ifnc "\head",""
 | |
|         vmul.f  s8, s4, s16                 @ vector operation
 | |
|  .endif
 | |
/* tail: s12..s15 go to the odd offsets in mirrored order (s12 to the last
 * slot of the high group, s15 to the first odd slot of the low group) */
|  .ifnc "\tail",""
 | |
|         vstr    s12, [OUT, #out_hi_tail*4 + 12]
 | |
|         vstr    s13, [OUT, #out_hi_tail*4 + 4]
 | |
|         vstr    s14, [OUT, #out_lo_tail*4 + 12]
 | |
|         vstr    s15, [OUT, #out_lo_tail*4 + 4]
 | |
|  .endif
 | |
|  .ifnc "\head",""
 | |
|         vmul.f  s12, s0, s16                @ vector operation
 | |
|         vldr    TCOS_D1_HEAD, [TCOS, #trig_hi_head*4]
 | |
|  .endif
 | |
|  .unreq TCOS_D0_HEAD
 | |
|  .unreq TCOS_D1_HEAD
 | |
|  .unreq TCOS_S0_TAIL
 | |
|  .ifnc "\head",""
 | |
|   .set k, k + 2
 | |
|  .endif
 | |
| .endm
 | |
| 
 | |
/*
 * postrotation_innerloop_rolled tail, head, tcos_s0_head, tcos_s1_head,
 *                               tcos_s2_head, tcos_s3_head, tcos_s0_tail,
 *                               out_offset_head, out_offset_tail
 *
 * Loop-body form of the software-pipelined post-rotation step. It has the
 * same head/tail split as postrotation_innerloop, but the table addresses
 * come from pointer registers updated by writeback and the register bank is
 * chosen by the invoker instead of by k.
 *
 *   tail, head        flags; any non-empty text enables that half
 *   tcos_s0..s3_head  four single-precision registers the head fills with
 *                     TCOS values (s0/s1 from TCOS upwards, s2/s3 from
 *                     TCOS_HI downwards)
 *   tcos_s0_tail      first register of the TCOS group the tail multiplies
 *                     by, i.e. the group filled by the previous head
 *   out_offset_head   byte offset the head adds to OUT and subtracts from
 *                     OUT_HI when loading
 *   out_offset_tail   the same, for the tail stores
 *
 * Per head, TSIN and TCOS advance by two floats and TSIN_HI and TCOS_HI
 * retreat by two floats. OUT and OUT_HI are not modified here. The
 * instructions tagged "vector operation" rely on the length-4 short-vector
 * FPSCR setting installed by ff_imdct_half_vfp.
 */
| .macro postrotation_innerloop_rolled tail, head, tcos_s0_head, tcos_s1_head, tcos_s2_head, tcos_s3_head, tcos_s0_tail, out_offset_head, out_offset_tail
 | |
/* tail: s8..s11 -= s0..s3 * TCOS */
|  .ifnc "\tail",""
 | |
|         vmls.f  s8, s0, \tcos_s0_tail       @ vector operation
 | |
|  .endif
 | |
|  .ifnc "\head",""
 | |
|         vldmia  TSIN!, {s16,s17}
 | |
|         vldmdb  TSIN_HI!, {s18,s19}
 | |
|         vldmia  TCOS!, {\tcos_s0_head,\tcos_s1_head}
 | |
|  .endif
 | |
/* tail: s12..s15 += s4..s7 * TCOS; last use of the old s0-s7 contents */
|  .ifnc "\tail",""
 | |
|         vmla.f  s12, s4, \tcos_s0_tail      @ vector operation
 | |
|  .endif
 | |
/* head: even-offset floats go to s0-s3, odd-offset floats to s4-s7 */
|  .ifnc "\head",""
 | |
|         vldr    s0, [OUT, #+\out_offset_head+0]
 | |
|         vldr    s1, [OUT, #+\out_offset_head+8]
 | |
|         vldr    s2, [OUT_HI, #-\out_offset_head-16]
 | |
|         vldr    s3, [OUT_HI, #-\out_offset_head-8]
 | |
|         vldr    s4, [OUT, #+\out_offset_head+4]
 | |
|         vldr    s5, [OUT, #+\out_offset_head+12]
 | |
|         vldr    s6, [OUT_HI, #-\out_offset_head-12]
 | |
|         vldr    s7, [OUT_HI, #-\out_offset_head-4]
 | |
|  .endif
 | |
/* tail: s8..s11 are stored before the head multiply below overwrites them */
|  .ifnc "\tail",""
 | |
|         vstr    s8, [OUT, #+\out_offset_tail+0]
 | |
|         vstr    s9, [OUT, #+\out_offset_tail+8]
 | |
|         vstr    s10, [OUT_HI, #-\out_offset_tail-16]
 | |
|         vstr    s11, [OUT_HI, #-\out_offset_tail-8]
 | |
|  .endif
 | |
|  .ifnc "\head",""
 | |
|         vmul.f  s8, s4, s16                 @ vector operation
 | |
|  .endif
 | |
/* tail: s12..s15 go to the odd offsets in mirrored order (s12 nearest to
 * OUT_HI, s15 nearest to OUT) */
|  .ifnc "\tail",""
 | |
|         vstr    s12, [OUT_HI, #-\out_offset_tail-4]
 | |
|         vstr    s13, [OUT_HI, #-\out_offset_tail-12]
 | |
|         vstr    s14, [OUT, #+\out_offset_tail+12]
 | |
|         vstr    s15, [OUT, #+\out_offset_tail+4]
 | |
|  .endif
 | |
|  .ifnc "\head",""
 | |
|         vmul.f  s12, s0, s16                @ vector operation
 | |
|         vldmdb  TCOS_HI!, {\tcos_s2_head,\tcos_s3_head}
 | |
|  .endif
 | |
| .endm
 | |
| 
 | |
| 
 | |
/* void ff_imdct_half_vfp(FFTContext *s,
 *                        FFTSample *output,
 *                        const FFTSample *input)
 *
 * Runs three stages on the buffers supplied by the caller:
 *   1. pre-rotation: input samples are combined with the TCOS/TSIN tables
 *      and scattered into output at the indices read from REVTAB;
 *   2. an in-place FFT on output;
 *   3. post-rotation: output is rotated in place using TSIN/TCOS again.
 *
 * Two code paths are selected by mdct_bits (word 5 of the context):
 *   - mdct_bits == 6: fully unrolled loops with assemble-time offsets and a
 *     direct call to ff_fft16_vfp;
 *   - any other value: rolled loops driven by pointer pairs that walk
 *     towards each other, with the FFT reached through the function
 *     pointer held in word 9 of the context.
 *
 * While rotating, FPSCR is set to 0x03030000 (RunFast, short vectors of
 * length 4, stride 1). The FPSCR value found on entry is put back before
 * each call to the FFT and before returning.
 *
 * Every callee-saved core register that is used, and s16-s27, are saved on
 * entry and restored on exit. Both paths keep sp 8-byte aligned at the
 * point where the FFT is called.
 */
function ff_imdct_half_vfp, export=1
        ldr     ip, [CONTEXT, #5*4]         @ mdct_bits
        teq     ip, #6
        bne     10f

        @ ---- mdct_bits == 6: fully unrolled path ----
 .set n, 1<<6
 .set n2, n/2
 .set n4, n/4
 .set n8, n/8

        push    {v1-v5,lr}                  @ 6 words + 12 words of VFP below: sp stays 8-byte aligned
        vpush   {s16-s27}
        fmrx    OLDFPSCR, FPSCR
        ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, lr
        mov     OUT, ORIGOUT                @ a2 is reused as J0 by the pre-rotation
        ldr     REVTAB, [CONTEXT, #2*4]
        ldr     TCOS, [CONTEXT, #6*4]
        ldr     TSIN, [CONTEXT, #7*4]

 .set k, 0
 .rept n8/2
        prerotation_innerloop
 .endr

        fmxr    FPSCR, OLDFPSCR             @ leave short-vector mode before calling out
        mov     a1, OUT
        bl      X(ff_fft16_vfp)
        ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, lr

        @ Software pipeline: one head-only step, n8/2 - 1 combined steps,
        @ then one tail-only step to drain the last results.
 .set k, 0
        postrotation_innerloop , head
 .rept n8/2 - 1
        postrotation_innerloop tail, head
 .endr
        postrotation_innerloop tail

        fmxr    FPSCR, OLDFPSCR
        vpop    {s16-s27}
        pop     {v1-v5,pc}

        @ ---- any other mdct_bits: rolled path ----
10:
        @ Nine registers need saving here. a4 is pushed as a tenth word,
        @ purely as padding: 36 + 48 + 8 bytes would otherwise leave sp
        @ 4 bytes off 8-byte alignment at the fft_calc call below.
        push    {a4,v1-v6,sl,fp,lr}
        vpush   {s16-s27}
        fmrx    OLDFPSCR, FPSCR
        ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
        fmxr    FPSCR, lr
        mov     lr, #1
        mov     OUT, ORIGOUT                @ a2 is reused as J0 by the pre-rotation
        ldr     REVTAB, [CONTEXT, #2*4]
        ldr     TCOS, [CONTEXT, #6*4]
        ldr     TSIN, [CONTEXT, #7*4]
        mov     lr, lr, lsl ip              @ lr = 1 << mdct_bits

        @ v5 (OLDFPSCR) doubles as REVTAB_HI inside the loop, and CONTEXT
        @ must survive the FFT call, so both are parked on the stack.
        push    {CONTEXT,OLDFPSCR}
        add     IN_HI, IN, lr, lsl #1
        add     REVTAB_HI, REVTAB, lr, lsr #1
        add     TCOS_HI, TCOS, lr
        add     TSIN_HI, TSIN, lr
        @ IN walks up and IN_HI walks down by 16 bytes per pass; stop when they meet.
0:      prerotation_innerloop_rolled
        teq     IN, IN_HI
        bne     0b
        ldmia   sp, {CONTEXT,OLDFPSCR}      @ reload without popping; popped after the call

        mov     ORIGOUT, OUT                @ second argument for fft_calc
        fmxr    FPSCR, OLDFPSCR             @ leave short-vector mode before calling out
        ldr     ip, [CONTEXT, #9*4]
        blx     ip                          @ s->fft_calc(s, output)

        pop     {CONTEXT,OLDFPSCR}
        ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
        ldr     ip, [CONTEXT, #5*4]         @ mdct_bits
        fmxr    FPSCR, lr
        mov     lr, #1
        mov     lr, lr, lsl ip              @ lr = 1 << mdct_bits
        sub     TCOS, TCOS, lr, lsr #1      @ rewind the advance made by the pre-rotation loop
        sub     TSIN, TSIN, lr, lsr #1
        add     OUT_HI, OUT, lr, lsl #1
        add     TCOS_HI, TCOS, lr
        add     TSIN_HI, TSIN, lr
        @ Software pipeline, two steps per pass with alternating TCOS banks
        @ (s20-s23 and s24-s27). Entry is a head-only step that jumps into
        @ the middle of the loop; exit is a tail-only step.
        postrotation_innerloop_rolled , head, s20, s21, s22, s23,, 0
        b       1f
0:      add     OUT, OUT, #32
        sub     OUT_HI, OUT_HI, #32
        @ OUT has just moved on by 32, so data loaded at offset 16 by the
        @ previous head is stored back at offset -16.
        postrotation_innerloop_rolled tail, head, s20, s21, s22, s23, s24, 0, -16
1:      postrotation_innerloop_rolled tail, head, s24, s25, s26, s27, s20, 16, 0
        teq     TSIN, TSIN_HI               @ table pointers meet when all entries are consumed
        bne     0b
        postrotation_innerloop_rolled tail,,,,,, s24,, 16

        fmxr    FPSCR, OLDFPSCR
        vpop    {s16-s27}
        pop     {a4,v1-v6,sl,fp,pc}         @ a4 receives the padding word; its value is not used
endfunc
 | |
| 
 | |
/* Release every register alias created with .req at the top of this file,
 * so that none of these names stays bound past this point. */
|         .unreq  CONTEXT
 | |
|         .unreq  ORIGOUT
 | |
|         .unreq  IN
 | |
|         .unreq  OUT
 | |
|         .unreq  REVTAB
 | |
|         .unreq  TCOS
 | |
|         .unreq  TSIN
 | |
|         .unreq  OLDFPSCR
 | |
|         .unreq  J0
 | |
|         .unreq  J1
 | |
|         .unreq  J2
 | |
|         .unreq  J3
 | |
|         .unreq  REVTAB_HI
 | |
|         .unreq  IN_HI
 | |
|         .unreq  OUT_HI
 | |
|         .unreq  TCOS_HI
 | |
|         .unreq  TSIN_HI
 |