mirror of
				https://github.com/nyanmisaka/ffmpeg-rockchip.git
				synced 2025-10-30 20:16:42 +08:00 
			
		
		
		
	 23f250d2bc
			
		
	
	23f250d2bc
	
	
	
		
			
			* qatar/master: arm: Add VFP-accelerated version of qmf_32_subbands Merged-by: Michael Niedermayer <michaelni@gmx.at>
		
			
				
	
	
		
			494 lines
		
	
	
		
			16 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			494 lines
		
	
	
		
			16 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
| /*
 | |
|  * Copyright (c) 2013 RISC OS Open Ltd
 | |
|  * Author: Ben Avison <bavison@riscosopen.org>
 | |
|  *
 | |
|  * This file is part of FFmpeg.
 | |
|  *
 | |
|  * FFmpeg is free software; you can redistribute it and/or
 | |
|  * modify it under the terms of the GNU Lesser General Public
 | |
|  * License as published by the Free Software Foundation; either
 | |
|  * version 2.1 of the License, or (at your option) any later version.
 | |
|  *
 | |
|  * FFmpeg is distributed in the hope that it will be useful,
 | |
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | |
|  * Lesser General Public License for more details.
 | |
|  *
 | |
|  * You should have received a copy of the GNU Lesser General Public
 | |
|  * License along with FFmpeg; if not, write to the Free Software
 | |
|  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 | |
|  */
 | |
| 
 | |
| #include "libavutil/arm/asm.S"
 | |
| 
 | |
| POUT          .req    a1   @ output pointer; written with writeback by the vstmia in inner_loop
 | |
| PIN           .req    a2   @ input pointer; samples are read at word offsets 0 down to -7
 | |
| PCOEF         .req    a3   @ coefficient pointer; moved by 4*JMAX words per group of 4 outputs
 | |
| DECIFACTOR    .req    a4   @ decifactor argument; only compared against 32 on entry
 | |
| OLDFPSCR      .req    a4   @ same register as DECIFACTOR: holds the caller FPSCR once the test is done
 | |
| COUNTER       .req    ip   @ loop counter for the groups of 4 outputs
 | |
| 
 | |
| SCALE32       .req    s28  @ use vector of 4 in place of 9th scalar when decifactor=32 / JMAX=8
 | |
| SCALE64       .req    s0   @ spare register in scalar bank when decifactor=64 / JMAX=4
 | |
| IN0           .req    s4   @ IN0-IN3 = in[0] .. in[-3], loaded for both decifactors
 | |
| IN1           .req    s5
 | |
| IN2           .req    s6
 | |
| IN3           .req    s7
 | |
| IN4           .req    s0   @ IN4-IN7 = in[-4] .. in[-7], loaded only when decifactor=32 (s0 is SCALE64 otherwise)
 | |
| IN5           .req    s1
 | |
| IN6           .req    s2
 | |
| IN7           .req    s3
 | |
| COEF0         .req    s8   @ coefficient elements
 | |
| COEF1         .req    s9
 | |
| COEF2         .req    s10
 | |
| COEF3         .req    s11
 | |
| COEF4         .req    s12
 | |
| COEF5         .req    s13
 | |
| COEF6         .req    s14
 | |
| COEF7         .req    s15
 | |
| ACCUM0        .req    s16  @ double-buffered multiply-accumulate results
 | |
| ACCUM4        .req    s20  @ second partial sum (odd-numbered taps); added to ACCUM0 in the tail
 | |
| POST0         .req    s24  @ do long-latency post-multiply in this vector in parallel
 | |
| POST1         .req    s25
 | |
| POST2         .req    s26
 | |
| POST3         .req    s27
 | |
| 
 | |
| 
 | |
| .macro inner_loop  decifactor, dir, tail, head   @ non-empty head: start the next 4 outputs; non-empty tail: finish and store the previous 4
 | |
|  .ifc "\dir","up"
 | |
|   .set X, 0                    @ X: byte offset of the first coefficient load
 | |
|   .set Y, 4                    @ Y: byte step between coefficients (+4 = ascending addresses)
 | |
|  .else
 | |
|   .set X, 4*JMAX*4 - 4         @ any dir other than up: start at the last word of the 4*JMAX-word group
 | |
|   .set Y, -4                   @ ... and walk through it backwards
 | |
|  .endif
 | |
|  .ifnc "\head",""
 | |
|         vldr    COEF0, [PCOEF, #X + (0*JMAX + 0) * Y]   @ index (k*JMAX + j): k = output 0-3 within the group, j = input tap
 | |
|         vldr    COEF1, [PCOEF, #X + (1*JMAX + 0) * Y]
 | |
|         vldr    COEF2, [PCOEF, #X + (2*JMAX + 0) * Y]
 | |
|         vldr    COEF3, [PCOEF, #X + (3*JMAX + 0) * Y]
 | |
|  .endif
 | |
|  .ifnc "\tail",""
 | |
|         vadd.f  POST0, ACCUM0, ACCUM4   @ vector operation: combine the two partial sums of the previous group
 | |
|  .endif
 | |
|  .ifnc "\head",""
 | |
|         vmul.f  ACCUM0, COEF0, IN0      @ vector = vector * scalar
 | |
|         vldr    COEF4, [PCOEF, #X + (0*JMAX + 1) * Y]
 | |
|         vldr    COEF5, [PCOEF, #X + (1*JMAX + 1) * Y]
 | |
|         vldr    COEF6, [PCOEF, #X + (2*JMAX + 1) * Y]
 | |
|  .endif
 | |
|  .ifnc "\tail",""
 | |
|         vmul.f  POST0, POST0, SCALE\decifactor  @ vector operation (SCALE may be scalar)
 | |
|  .endif
 | |
|  .ifnc "\head",""
 | |
|         vldr    COEF7, [PCOEF, #X + (3*JMAX + 1) * Y]
 | |
|    .ifc "\tail",""
 | |
|         vmul.f  ACCUM4, COEF4, IN1      @ vector operation (position used when there is no tail work)
 | |
|    .endif
 | |
|         vldr    COEF0, [PCOEF, #X + (0*JMAX + 2) * Y]
 | |
|         vldr    COEF1, [PCOEF, #X + (1*JMAX + 2) * Y]
 | |
|    .ifnc "\tail",""
 | |
|         vmul.f  ACCUM4, COEF4, IN1      @ vector operation (same multiply, placed two loads later when tail work is present)
 | |
|    .endif
 | |
|         vldr    COEF2, [PCOEF, #X + (2*JMAX + 2) * Y]
 | |
|         vldr    COEF3, [PCOEF, #X + (3*JMAX + 2) * Y]
 | |
|  .endif
 | |
|  .ifnc "\tail",""
 | |
|         vstmia  POUT!, {POST0-POST3}    @ store the 4 finished outputs and advance POUT past them
 | |
|  .endif
 | |
|  .ifnc "\head",""
 | |
|         vmla.f  ACCUM0, COEF0, IN2      @ vector = vector * scalar
 | |
|         vldr    COEF4, [PCOEF, #X + (0*JMAX + 3) * Y]
 | |
|         vldr    COEF5, [PCOEF, #X + (1*JMAX + 3) * Y]
 | |
|         vldr    COEF6, [PCOEF, #X + (2*JMAX + 3) * Y]
 | |
|         vldr    COEF7, [PCOEF, #X + (3*JMAX + 3) * Y]
 | |
|         vmla.f  ACCUM4, COEF4, IN3      @ vector = vector * scalar
 | |
|   .if \decifactor == 32
 | |
|         vldr    COEF0, [PCOEF, #X + (0*JMAX + 4) * Y]   @ taps 4-7 are only processed for decifactor=32 (JMAX=8)
 | |
|         vldr    COEF1, [PCOEF, #X + (1*JMAX + 4) * Y]
 | |
|         vldr    COEF2, [PCOEF, #X + (2*JMAX + 4) * Y]
 | |
|         vldr    COEF3, [PCOEF, #X + (3*JMAX + 4) * Y]
 | |
|         vmla.f  ACCUM0, COEF0, IN4      @ vector = vector * scalar
 | |
|         vldr    COEF4, [PCOEF, #X + (0*JMAX + 5) * Y]
 | |
|         vldr    COEF5, [PCOEF, #X + (1*JMAX + 5) * Y]
 | |
|         vldr    COEF6, [PCOEF, #X + (2*JMAX + 5) * Y]
 | |
|         vldr    COEF7, [PCOEF, #X + (3*JMAX + 5) * Y]
 | |
|         vmla.f  ACCUM4, COEF4, IN5      @ vector = vector * scalar
 | |
|         vldr    COEF0, [PCOEF, #X + (0*JMAX + 6) * Y]
 | |
|         vldr    COEF1, [PCOEF, #X + (1*JMAX + 6) * Y]
 | |
|         vldr    COEF2, [PCOEF, #X + (2*JMAX + 6) * Y]
 | |
|         vldr    COEF3, [PCOEF, #X + (3*JMAX + 6) * Y]
 | |
|         vmla.f  ACCUM0, COEF0, IN6      @ vector = vector * scalar
 | |
|         vldr    COEF4, [PCOEF, #X + (0*JMAX + 7) * Y]
 | |
|         vldr    COEF5, [PCOEF, #X + (1*JMAX + 7) * Y]
 | |
|         vldr    COEF6, [PCOEF, #X + (2*JMAX + 7) * Y]
 | |
|         vldr    COEF7, [PCOEF, #X + (3*JMAX + 7) * Y]
 | |
|         vmla.f  ACCUM4, COEF4, IN7      @ vector = vector * scalar
 | |
|   .endif
 | |
|  .endif
 | |
| .endm
 | |
| 
 | |
| .macro dca_lfe_fir  decifactor   @ remainder of ff_dca_lfe_fir_vfp for one decifactor (invoked with 64 and 32); ends by returning
 | |
|  .if \decifactor == 32
 | |
|   .set JMAX, 8                          @ 8 input taps (IN0-IN7) per output
 | |
|         vpush   {s16-s31}               @ preserve the ACCUM, POST and SCALE32 vectors for the caller
 | |
|         vmov    SCALE32, s0             @ duplicate scalar across vector
 | |
|         vldr    IN4, [PIN, #-4*4]       @ IN4 is s0, so scale had to be copied out first
 | |
|         vldr    IN5, [PIN, #-5*4]
 | |
|         vldr    IN6, [PIN, #-6*4]
 | |
|         vldr    IN7, [PIN, #-7*4]
 | |
|  .else
 | |
|   .set JMAX, 4                          @ 4 input taps (IN0-IN3) per output
 | |
|         vpush   {s16-s27}               @ scale stays in s0 (SCALE64), so s28-s31 are not touched
 | |
|  .endif
 | |
| 
 | |
|         mov     COUNTER, #\decifactor/4 - 1   @ decifactor/4 groups of 4 outputs; first head and last tail sit outside the loop
 | |
|         inner_loop  \decifactor, up,, head
 | |
| 1:      add     PCOEF, PCOEF, #4*JMAX*4 @ next group of 4*JMAX coefficients
 | |
|         subs    COUNTER, COUNTER, #1    @ flags are consumed by the bne below; inner_loop holds only VFP instructions
 | |
|         inner_loop  \decifactor, up, tail, head
 | |
|         bne     1b
 | |
|         inner_loop  \decifactor, up, tail
 | |
| 
 | |
|         mov     COUNTER, #\decifactor/4 - 1   @ second pass: PCOEF is left on the last group and is walked back down
 | |
|         inner_loop  \decifactor, down,, head
 | |
| 1:      sub     PCOEF, PCOEF, #4*JMAX*4 @ previous group of 4*JMAX coefficients
 | |
|         subs    COUNTER, COUNTER, #1
 | |
|         inner_loop  \decifactor, down, tail, head
 | |
|         bne     1b
 | |
|         inner_loop  \decifactor, down, tail
 | |
| 
 | |
|  .if \decifactor == 32
 | |
|         vpop    {s16-s31}               @ must mirror the vpush at the top of this branch
 | |
|  .else
 | |
|         vpop    {s16-s27}
 | |
|  .endif
 | |
|         fmxr    FPSCR, OLDFPSCR         @ restore the FPSCR saved on function entry
 | |
|         bx      lr
 | |
| .endm
 | |
| 
 | |
| 
 | |
| /* void ff_dca_lfe_fir_vfp(float *out, const float *in, const float *coefs,
 | |
|  *                         int decifactor, float scale)
 | |
|  *
 | |
|  * Writes 2 * decifactor floats to out: one pass walking the coefficient
 | |
|  * groups forwards, then one pass walking them backwards. Each output is
 | |
|  * scale times a sum of JMAX coefficient * input products, where JMAX is 8
 | |
|  * for decifactor 32 and 4 otherwise, using in[0] down to in[1 - JMAX].
 | |
|  * Any decifactor other than 32 takes the decifactor=64 path.
 | |
|  * The caller's FPSCR is saved on entry and restored before returning.
 | |
|  */
 | |
| function ff_dca_lfe_fir_vfp, export=1
 | |
|         teq     DECIFACTOR, #32         @ test first: a4 is about to be reused as OLDFPSCR
 | |
|         fmrx    OLDFPSCR, FPSCR         @ save the caller FPSCR
 | |
|         ldr     ip, =0x03030000         @ RunFast mode, short vectors of length 4, stride 1
 | |
|         fmxr    FPSCR, ip
 | |
| NOVFP   vldr    s0, [sp]                @ NOVFP lines (softfp, going by the stack notes later in this file): scale is the first stack word
 | |
|         vldr    IN0, [PIN, #-0*4]
 | |
|         vldr    IN1, [PIN, #-1*4]
 | |
|         vldr    IN2, [PIN, #-2*4]
 | |
|         vldr    IN3, [PIN, #-3*4]
 | |
|         beq     32f                     @ flags still come from the teq above
 | |
| 64:     dca_lfe_fir  64
 | |
|  .ltorg                                 @ literal pool for the ldr ip, = above; reached only as data since the 64 path ends in bx lr
 | |
| 32:     dca_lfe_fir  32
 | |
| endfunc
 | |
| 
 | |
|         .unreq  POUT                    @ release every alias used by ff_dca_lfe_fir_vfp; OLDFPSCR is bound to another register below
 | |
|         .unreq  PIN
 | |
|         .unreq  PCOEF
 | |
|         .unreq  DECIFACTOR
 | |
|         .unreq  OLDFPSCR
 | |
|         .unreq  COUNTER
 | |
| 
 | |
|         .unreq  SCALE32
 | |
|         .unreq  SCALE64
 | |
|         .unreq  IN0
 | |
|         .unreq  IN1
 | |
|         .unreq  IN2
 | |
|         .unreq  IN3
 | |
|         .unreq  IN4
 | |
|         .unreq  IN5
 | |
|         .unreq  IN6
 | |
|         .unreq  IN7
 | |
|         .unreq  COEF0
 | |
|         .unreq  COEF1
 | |
|         .unreq  COEF2
 | |
|         .unreq  COEF3
 | |
|         .unreq  COEF4
 | |
|         .unreq  COEF5
 | |
|         .unreq  COEF6
 | |
|         .unreq  COEF7
 | |
|         .unreq  ACCUM0
 | |
|         .unreq  ACCUM4
 | |
|         .unreq  POST0
 | |
|         .unreq  POST1
 | |
|         .unreq  POST2
 | |
|         .unreq  POST3
 | |
| 
 | |
| 
 | |
| IN      .req    a1   @ samples_in; advanced by 4 source rows per pass of the transpose loop
 | |
| SBACT   .req    a2   @ sb_act; only read once, to seed COUNT
 | |
| OLDFPSCR .req   a3   @ caller FPSCR; the incoming a3 value (synth) is never read by this function
 | |
| IMDCT   .req    a4   @ imdct on entry; the body reloads it from its saved stack copy rather than using this alias
 | |
| WINDOW  .req    v1   @ window argument, loaded from [fp, #3*4]
 | |
| OUT     .req    v2   @ samples_out argument, loaded from [fp, #4*4]; advanced 32 floats per callee call
 | |
| BUF     .req    v3   @ cursor into the stack-allocated 8 x 32 float transposition buffer
 | |
| SCALEINT .req   v4 @ only used in softfp case
 | |
| COUNT   .req    v5   @ packed two-field counter during the transpose, then a plain call counter
 | |
| 
 | |
| SCALE   .req    s0   @ scale in the hardfp case; saved on the stack and reloaded before each callee call
 | |
| 
 | |
| /* Stack layout differs in softfp and hardfp cases:
 | |
|  *
 | |
|  * hardfp
 | |
|  *      fp -> 6 arg words saved by caller
 | |
|  *            a3,a4,v1-v3,v5,fp,lr on entry (a3 just to pad to 8 bytes)
 | |
|  *            s16-s23 on entry
 | |
|  *            align 16
 | |
|  *     buf -> 8*32*4 bytes buffer
 | |
|  *            s0 on entry
 | |
|  *      sp -> 3 arg words for callee
 | |
|  *
 | |
|  * softfp
 | |
|  *      fp -> 7 arg words saved by caller
 | |
|  *            a4,v1-v5,fp,lr on entry
 | |
|  *            s16-s23 on entry
 | |
|  *            align 16
 | |
|  *     buf -> 8*32*4 bytes buffer
 | |
|  *      sp -> 4 arg words for callee
 | |
|  */
 | |
| 
 | |
| /* void ff_dca_qmf_32_subbands_vfp(float samples_in[32][8], int sb_act,
 | |
|  *                                 SynthFilterContext *synth, FFTContext *imdct,
 | |
|  *                                 float (*synth_buf_ptr)[512],
 | |
|  *                                 int *synth_buf_offset, float (*synth_buf2)[32],
 | |
|  *                                 const float (*window)[512], float *samples_out,
 | |
|  *                                 float (*raXin)[32], float scale);
 | |
|  *
 | |
|  * Transposes the first sb_act rows of samples_in into an 8 x 32 float
 | |
|  * buffer on the stack, negating source rows 0 and 3 of every group of 4 and
 | |
|  * zero-filling the columns for rows sb_act..31. It then calls
 | |
|  * ff_synth_filter_float_vfp 8 times, once per buffer row, passing imdct,
 | |
|  * synth_buf_ptr, synth_buf_offset, synth_buf2, window, the current output
 | |
|  * pointer, the buffer row and scale, and advancing the output pointer by
 | |
|  * 32 floats after each call. The synth and raXin arguments are not read.
 | |
|  * The caller's FPSCR is restored before the first call.
 | |
|  */
 | |
| function ff_dca_qmf_32_subbands_vfp, export=1
 | |
| VFP     push    {a3-a4,v1-v3,v5,fp,lr}
 | |
| NOVFP   push    {a4,v1-v5,fp,lr}
 | |
|         add     fp, sp, #8*4        @ 8 words were pushed either way: fp = sp on entry = first stacked argument
 | |
|         vpush   {s16-s23}           @ s16-s23 are used as transpose scratch below
 | |
|         @ The buffer pointed at by raXin isn't big enough for us to do a
 | |
|         @ complete matrix transposition as we want to, so allocate an
 | |
|         @ alternative buffer from the stack. Align to 4 words for speed.
 | |
|         sub     BUF, sp, #8*32*4    @ 8 rows of 32 floats
 | |
|         bic     BUF, BUF, #15       @ round down to a 16-byte boundary
 | |
|         mov     sp, BUF
 | |
|         ldr     lr, =0x03330000     @ RunFast mode, short vectors of length 4, stride 2
 | |
|         fmrx    OLDFPSCR, FPSCR     @ save the caller FPSCR
 | |
|         fmxr    FPSCR, lr
 | |
|         @ COUNT is used to count down 2 things at once:
 | |
|         @ bits 0-4 are the number of word pairs remaining in the output row
 | |
|         @ bits 5-31 are the number of words to copy (with possible negation)
 | |
|         @   from the source matrix before we start zeroing the remainder
 | |
|         mov     COUNT, #(-4 << 5) + 16          @ word field pre-biased by -4, 16 word pairs per output row
 | |
|         adds    COUNT, COUNT, SBACT, lsl #5     @ word field = sb_act - 4
 | |
|         bmi     2f                              @ fewer than 4 source rows: skip the 4-row loop
 | |
| 1:                                              @ each pass transposes 4 source rows of 8 floats
 | |
|         vldr    s8,  [IN, #(0*8+0)*4]           @ row 0 goes to even registers, row 1 to odd ones
 | |
|         vldr    s10, [IN, #(0*8+1)*4]
 | |
|         vldr    s12, [IN, #(0*8+2)*4]
 | |
|         vldr    s14, [IN, #(0*8+3)*4]
 | |
|         vldr    s16, [IN, #(0*8+4)*4]
 | |
|         vldr    s18, [IN, #(0*8+5)*4]
 | |
|         vldr    s20, [IN, #(0*8+6)*4]
 | |
|         vldr    s22, [IN, #(0*8+7)*4]
 | |
|         vneg.f  s8, s8                          @ short vector, stride 2: negates s8,s10,s12,s14 (row 0, columns 0-3)
 | |
|         vldr    s9,  [IN, #(1*8+0)*4]
 | |
|         vldr    s11, [IN, #(1*8+1)*4]
 | |
|         vldr    s13, [IN, #(1*8+2)*4]
 | |
|         vldr    s15, [IN, #(1*8+3)*4]
 | |
|         vneg.f  s16, s16                        @ negates s16,s18,s20,s22 (row 0, columns 4-7)
 | |
|         vldr    s17, [IN, #(1*8+4)*4]
 | |
|         vldr    s19, [IN, #(1*8+5)*4]
 | |
|         vldr    s21, [IN, #(1*8+6)*4]
 | |
|         vldr    s23, [IN, #(1*8+7)*4]
 | |
|         vstr    d4,  [BUF, #(0*32+0)*4]         @ d4 = s8:s9 = column 0 of rows 0 and 1, stored into buffer row 0
 | |
|         vstr    d5,  [BUF, #(1*32+0)*4]
 | |
|         vstr    d6,  [BUF, #(2*32+0)*4]
 | |
|         vstr    d7,  [BUF, #(3*32+0)*4]
 | |
|         vstr    d8,  [BUF, #(4*32+0)*4]
 | |
|         vstr    d9,  [BUF, #(5*32+0)*4]
 | |
|         vstr    d10, [BUF, #(6*32+0)*4]
 | |
|         vstr    d11, [BUF, #(7*32+0)*4]
 | |
|         vldr    s9,  [IN, #(3*8+0)*4]           @ second half: row 3 goes to odd registers, row 2 to even ones
 | |
|         vldr    s11, [IN, #(3*8+1)*4]
 | |
|         vldr    s13, [IN, #(3*8+2)*4]
 | |
|         vldr    s15, [IN, #(3*8+3)*4]
 | |
|         vldr    s17, [IN, #(3*8+4)*4]
 | |
|         vldr    s19, [IN, #(3*8+5)*4]
 | |
|         vldr    s21, [IN, #(3*8+6)*4]
 | |
|         vldr    s23, [IN, #(3*8+7)*4]
 | |
|         vneg.f  s9, s9                          @ negates s9,s11,s13,s15 (row 3, columns 0-3)
 | |
|         vldr    s8,  [IN, #(2*8+0)*4]
 | |
|         vldr    s10, [IN, #(2*8+1)*4]
 | |
|         vldr    s12, [IN, #(2*8+2)*4]
 | |
|         vldr    s14, [IN, #(2*8+3)*4]
 | |
|         vneg.f  s17, s17                        @ negates s17,s19,s21,s23 (row 3, columns 4-7)
 | |
|         vldr    s16, [IN, #(2*8+4)*4]
 | |
|         vldr    s18, [IN, #(2*8+5)*4]
 | |
|         vldr    s20, [IN, #(2*8+6)*4]
 | |
|         vldr    s22, [IN, #(2*8+7)*4]
 | |
|         vstr    d4,  [BUF, #(0*32+2)*4]         @ rows 2 and 3 land two words further along each buffer row
 | |
|         vstr    d5,  [BUF, #(1*32+2)*4]
 | |
|         vstr    d6,  [BUF, #(2*32+2)*4]
 | |
|         vstr    d7,  [BUF, #(3*32+2)*4]
 | |
|         vstr    d8,  [BUF, #(4*32+2)*4]
 | |
|         vstr    d9,  [BUF, #(5*32+2)*4]
 | |
|         vstr    d10, [BUF, #(6*32+2)*4]
 | |
|         vstr    d11, [BUF, #(7*32+2)*4]
 | |
|         add     IN, IN, #4*8*4                  @ next 4 source rows
 | |
|         add     BUF, BUF, #4*4                  @ 4 words further along every buffer row
 | |
|         subs    COUNT, COUNT, #(4 << 5) + 2     @ 4 fewer words to copy, 2 fewer word pairs left in the row
 | |
|         bpl     1b                              @ loop while at least 4 more source rows remain
 | |
| 2:      @ Now deal with trailing < 4 samples
 | |
|         adds    COUNT, COUNT, #3 << 5           @ word field = remaining rows - 1
 | |
|         bmi     4f  @ sb_act was a multiple of 4
 | |
|         bics    lr, COUNT, #0x1F                @ lr = word field alone; zero means exactly 1 row is left
 | |
|         bne     3f
 | |
|         @ sb_act was n*4+1
 | |
|         vldr    s8,  [IN, #(0*8+0)*4]
 | |
|         vldr    s10, [IN, #(0*8+1)*4]
 | |
|         vldr    s12, [IN, #(0*8+2)*4]
 | |
|         vldr    s14, [IN, #(0*8+3)*4]
 | |
|         vldr    s16, [IN, #(0*8+4)*4]
 | |
|         vldr    s18, [IN, #(0*8+5)*4]
 | |
|         vldr    s20, [IN, #(0*8+6)*4]
 | |
|         vldr    s22, [IN, #(0*8+7)*4]
 | |
|         vneg.f  s8, s8
 | |
|         vldr    s9,  zero                       @ pair the single row with zeros
 | |
|         vldr    s11, zero
 | |
|         vldr    s13, zero
 | |
|         vldr    s15, zero
 | |
|         vneg.f  s16, s16
 | |
|         vldr    s17, zero
 | |
|         vldr    s19, zero
 | |
|         vldr    s21, zero
 | |
|         vldr    s23, zero
 | |
|         vstr    d4,  [BUF, #(0*32+0)*4]
 | |
|         vstr    d5,  [BUF, #(1*32+0)*4]
 | |
|         vstr    d6,  [BUF, #(2*32+0)*4]
 | |
|         vstr    d7,  [BUF, #(3*32+0)*4]
 | |
|         vstr    d8,  [BUF, #(4*32+0)*4]
 | |
|         vstr    d9,  [BUF, #(5*32+0)*4]
 | |
|         vstr    d10, [BUF, #(6*32+0)*4]
 | |
|         vstr    d11, [BUF, #(7*32+0)*4]
 | |
|         add     BUF, BUF, #2*4
 | |
|         sub     COUNT, COUNT, #1                @ one word pair written
 | |
|         b       4f
 | |
| 3:      @ sb_act was n*4+2 or n*4+3, so do the first 2
 | |
|         vldr    s8,  [IN, #(0*8+0)*4]
 | |
|         vldr    s10, [IN, #(0*8+1)*4]
 | |
|         vldr    s12, [IN, #(0*8+2)*4]
 | |
|         vldr    s14, [IN, #(0*8+3)*4]
 | |
|         vldr    s16, [IN, #(0*8+4)*4]
 | |
|         vldr    s18, [IN, #(0*8+5)*4]
 | |
|         vldr    s20, [IN, #(0*8+6)*4]
 | |
|         vldr    s22, [IN, #(0*8+7)*4]
 | |
|         vneg.f  s8, s8
 | |
|         vldr    s9,  [IN, #(1*8+0)*4]
 | |
|         vldr    s11, [IN, #(1*8+1)*4]
 | |
|         vldr    s13, [IN, #(1*8+2)*4]
 | |
|         vldr    s15, [IN, #(1*8+3)*4]
 | |
|         vneg.f  s16, s16
 | |
|         vldr    s17, [IN, #(1*8+4)*4]
 | |
|         vldr    s19, [IN, #(1*8+5)*4]
 | |
|         vldr    s21, [IN, #(1*8+6)*4]
 | |
|         vldr    s23, [IN, #(1*8+7)*4]
 | |
|         vstr    d4,  [BUF, #(0*32+0)*4]
 | |
|         vstr    d5,  [BUF, #(1*32+0)*4]
 | |
|         vstr    d6,  [BUF, #(2*32+0)*4]
 | |
|         vstr    d7,  [BUF, #(3*32+0)*4]
 | |
|         vstr    d8,  [BUF, #(4*32+0)*4]
 | |
|         vstr    d9,  [BUF, #(5*32+0)*4]
 | |
|         vstr    d10, [BUF, #(6*32+0)*4]
 | |
|         vstr    d11, [BUF, #(7*32+0)*4]
 | |
|         add     BUF, BUF, #2*4
 | |
|         sub     COUNT, COUNT, #(2 << 5) + 1     @ 2 rows and 1 word pair consumed
 | |
|         bics    lr, COUNT, #0x1F                @ word field is now zero only if a third row is left
 | |
|         bne     4f
 | |
|         @ sb_act was n*4+3
 | |
|         vldr    s8,  [IN, #(2*8+0)*4]           @ row 2 is stored without negation, as in the main loop
 | |
|         vldr    s10, [IN, #(2*8+1)*4]
 | |
|         vldr    s12, [IN, #(2*8+2)*4]
 | |
|         vldr    s14, [IN, #(2*8+3)*4]
 | |
|         vldr    s16, [IN, #(2*8+4)*4]
 | |
|         vldr    s18, [IN, #(2*8+5)*4]
 | |
|         vldr    s20, [IN, #(2*8+6)*4]
 | |
|         vldr    s22, [IN, #(2*8+7)*4]
 | |
|         vldr    s9,  zero
 | |
|         vldr    s11, zero
 | |
|         vldr    s13, zero
 | |
|         vldr    s15, zero
 | |
|         vldr    s17, zero
 | |
|         vldr    s19, zero
 | |
|         vldr    s21, zero
 | |
|         vldr    s23, zero
 | |
|         vstr    d4,  [BUF, #(0*32+0)*4]
 | |
|         vstr    d5,  [BUF, #(1*32+0)*4]
 | |
|         vstr    d6,  [BUF, #(2*32+0)*4]
 | |
|         vstr    d7,  [BUF, #(3*32+0)*4]
 | |
|         vstr    d8,  [BUF, #(4*32+0)*4]
 | |
|         vstr    d9,  [BUF, #(5*32+0)*4]
 | |
|         vstr    d10, [BUF, #(6*32+0)*4]
 | |
|         vstr    d11, [BUF, #(7*32+0)*4]
 | |
|         add     BUF, BUF, #2*4
 | |
|         sub     COUNT, COUNT, #1                @ one word pair written
 | |
| 4:      @ Now fill the remainder with 0
 | |
|         vldr    s8, zero                        @ d4 = s8:s9 = a pair of zero words
 | |
|         vldr    s9, zero
 | |
|         ands    COUNT, COUNT, #0x1F             @ keep only the word-pair count; Z set if the rows are already full
 | |
|         beq     6f
 | |
| 5:      vstr    d4, [BUF, #(0*32+0)*4]
 | |
|         vstr    d4, [BUF, #(1*32+0)*4]
 | |
|         vstr    d4, [BUF, #(2*32+0)*4]
 | |
|         vstr    d4, [BUF, #(3*32+0)*4]
 | |
|         vstr    d4, [BUF, #(4*32+0)*4]
 | |
|         vstr    d4, [BUF, #(5*32+0)*4]
 | |
|         vstr    d4, [BUF, #(6*32+0)*4]
 | |
|         vstr    d4, [BUF, #(7*32+0)*4]
 | |
|         add     BUF, BUF, #2*4
 | |
|         subs    COUNT, COUNT, #1
 | |
|         bne     5b
 | |
| 6:                                              @ transposition finished
 | |
|         fmxr    FPSCR, OLDFPSCR                 @ restore the caller FPSCR before calling out
 | |
|         ldr     WINDOW, [fp, #3*4]
 | |
|         ldr     OUT, [fp, #4*4]
 | |
|         sub     BUF, BUF, #32*4                 @ BUF moved on by 16 word pairs above; rewind to the buffer start
 | |
| NOVFP   ldr     SCALEINT, [fp, #6*4]
 | |
|         mov     COUNT, #8                       @ one callee call per buffer row
 | |
| VFP     vpush   {SCALE}                         @ hardfp: keep scale on the stack across the calls
 | |
| VFP     sub     sp, sp, #3*4                    @ room for the 3 stacked callee arguments
 | |
| NOVFP   sub     sp, sp, #4*4                    @ softfp: 4 stacked callee arguments (scale included)
 | |
| 7:                                              @ per-row call loop
 | |
| VFP     ldr     a1, [fp, #-7*4]     @ imdct
 | |
| NOVFP   ldr     a1, [fp, #-8*4]     @ imdct (a4 is the lowest saved word in this variant)
 | |
|         ldmia   fp, {a2-a4}                     @ first 3 stacked arguments of this function become callee a2-a4
 | |
| VFP     stmia   sp, {WINDOW, OUT, BUF}
 | |
| NOVFP   stmia   sp, {WINDOW, OUT, BUF, SCALEINT}
 | |
| VFP     vldr    SCALE, [sp, #3*4]               @ reload the scale pushed above, just past the 3 argument words
 | |
|         bl      ff_synth_filter_float_vfp
 | |
|         add     OUT, OUT, #32*4                 @ next 32 output floats
 | |
|         add     BUF, BUF, #32*4                 @ next row of the transposed buffer
 | |
|         subs    COUNT, COUNT, #1
 | |
|         bne     7b
 | |
| 
 | |
| A       sub     sp, fp, #(8+8)*4                @ sp = fp - (8 saved core words + 8 saved VFP words); A/T prefixes are defined outside this file (presumably ARM vs Thumb builds)
 | |
| T       sub     fp, fp, #(8+8)*4                @ same adjustment done in two steps through fp
 | |
| T       mov     sp, fp
 | |
|         vpop    {s16-s23}
 | |
| VFP     pop     {a3-a4,v1-v3,v5,fp,pc}
 | |
| NOVFP   pop     {a4,v1-v5,fp,pc}
 | |
| endfunc
 | |
| 
 | |
|         .unreq  IN                      @ release every alias used by ff_dca_qmf_32_subbands_vfp
 | |
|         .unreq  SBACT
 | |
|         .unreq  OLDFPSCR
 | |
|         .unreq  IMDCT
 | |
|         .unreq  WINDOW
 | |
|         .unreq  OUT
 | |
|         .unreq  BUF
 | |
|         .unreq  SCALEINT
 | |
|         .unreq  COUNT
 | |
| 
 | |
|         .unreq  SCALE
 | |
| 
 | |
|         .align 2                        @ word-align the constant below
 | |
| zero:   .word   0                       @ all-zero word, i.e. 0.0f; source of the vldr sN, zero loads in ff_dca_qmf_32_subbands_vfp
 |