mirror of
				https://github.com/nyanmisaka/ffmpeg-rockchip.git
				synced 2025-10-25 09:51:06 +08:00 
			
		
		
		
	 cb416a7d79
			
		
	
	cb416a7d79
	
	
	
		
			
			* commit '4c81613df499ba81d64ea102b38d0c6686cc304c': arm: mlpdsp: handle pic offset calculation in a macro Merged-by: Michael Niedermayer <michaelni@gmx.at>
		
			
				
	
	
		
			663 lines
		
	
	
		
			19 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			663 lines
		
	
	
		
			19 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
| /*
 | |
|  * Copyright (c) 2014 RISC OS Open Ltd
 | |
|  * Author: Ben Avison <bavison@riscosopen.org>
 | |
|  *
 | |
|  * This file is part of FFmpeg.
 | |
|  *
 | |
|  * FFmpeg is free software; you can redistribute it and/or
 | |
|  * modify it under the terms of the GNU Lesser General Public
 | |
|  * License as published by the Free Software Foundation; either
 | |
|  * version 2.1 of the License, or (at your option) any later version.
 | |
|  *
 | |
|  * FFmpeg is distributed in the hope that it will be useful,
 | |
|  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
|  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | |
|  * Lesser General Public License for more details.
 | |
|  *
 | |
|  * You should have received a copy of the GNU Lesser General Public
 | |
|  * License along with FFmpeg; if not, write to the Free Software
 | |
|  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 | |
|  */
 | |
| 
 | |
| #include "libavutil/arm/asm.S"
 | |
| 
 | |
| #define MAX_CHANNELS        8
 | |
| #define MAX_FIR_ORDER       8
 | |
| #define MAX_IIR_ORDER       4
 | |
| #define MAX_RATEFACTOR      4
 | |
| #define MAX_BLOCKSIZE       (40 * MAX_RATEFACTOR)
 | |
| 
 | |
| PST     .req    a1   // state pointer; pre-decremented by one word per output sample
 | |
| PCO     .req    a2   // coefficient pointer; reused for MASK or SHIFT once coefficients are preloaded
 | |
| AC0     .req    a3   // accumulator low word (a3 carries firorder until the tap dispatch has run)
 | |
| AC1     .req    a4   // accumulator high word (a4 carries iirorder until the tap dispatch has run)
 | |
| CO0     .req    v1   // CO0-CO3: coefficient registers
 | |
| CO1     .req    v2
 | |
| CO2     .req    v3
 | |
| CO3     .req    v4
 | |
| ST0     .req    v5   // ST0-ST3: state value registers; ST0 holds filter_shift on entry
 | |
| ST1     .req    v6   // holds mask on entry
 | |
| ST2     .req    sl   // also used as a temporary when post-processing the accumulator
 | |
| ST3     .req    fp   // also used as a temporary when post-processing the accumulator
 | |
| I       .req    ip   // loop counter, loaded from the blocksize argument
 | |
| PSAMP   .req    lr   // sample buffer pointer, advanced by MAX_CHANNELS words per sample
 | |
| 
 | |
| 
 | |
| .macro branch_pic_label first, remainder:vararg   // Emit one position-independent jump-table entry per argument
 | |
| A       .word           \first   - 4   // presumably the ARM-mode variant (A/T prefixes come from asm.S): word offset, biased by 4 because the add-to-pc placed before the table reads pc 4 bytes past the table start
 | |
| T       .hword          (\first) / 2   // presumably the Thumb variant: tbh expects offsets in halfword units
 | |
| .ifnb   \remainder   // more entries left in the argument list?
 | |
|         branch_pic_label \remainder   // recurse on the rest of the list
 | |
| .endif
 | |
| .endm
 | |
| 
 | |
| // Some macros that do loads/multiplies where the register number is determined
 | |
| // from an assembly-time expression. Boy is GNU assembler's syntax ugly...
 | |
| 
 | |
| .macro load  group, index, base, offset   // Load one word into register <group><index>, where index is an assembly-time expression
 | |
|        .altmacro   // altmacro mode enables the %(expr) evaluation used on the next line
 | |
|        load_ \group, %(\index), \base, \offset   // pass the evaluated index on as a plain number
 | |
|        .noaltmacro
 | |
| .endm
 | |
| 
 | |
| .macro load_ group, index, base, offset   // Helper for load: index is already a literal number here
 | |
|         ldr     \group\index, [\base, #\offset]   // group and index are pasted together, e.g. CO + 2 names CO2
 | |
| .endm
 | |
| 
 | |
| .macro loadd  group, index, base, offset   // Load two consecutive words into registers <group><index> and <group><index+1>
 | |
|        .altmacro   // altmacro mode enables the %(expr) evaluation used on the next line
 | |
|        loadd_ \group, %(\index), %(\index+1), \base, \offset   // index+1 is not wrapped, so callers must pass an index below 3
 | |
|        .noaltmacro
 | |
| .endm
 | |
| 
 | |
| .macro loadd_ group, index0, index1, base, offset   // Helper for loadd: both indices are already literal numbers
 | |
| A .if \offset >= 256   // A-prefixed lines are presumably ARM-mode only (prefix defined in asm.S): offset is beyond what the ARM ldrd encoding can hold
 | |
| A       ldr     \group\index0, [\base, #\offset]   // fall back to two single-word loads
 | |
| A       ldr     \group\index1, [\base, #(\offset) + 4]
 | |
| A .else
 | |
|         ldrd    \group\index0, \group\index1, [\base, #\offset]   // one doubleword load fills both registers
 | |
| A .endif
 | |
| .endm
 | |
| 
 | |
| .macro multiply  index, accumulate, long   // Multiply CO<index> by ST<index>; index, accumulate and long are assembly-time expressions
 | |
|         .altmacro   // altmacro mode enables the %(expr) evaluation used on the next line
 | |
|         multiply_ %(\index), \accumulate, \long
 | |
|         .noaltmacro
 | |
| .endm
 | |
| 
 | |
| .macro multiply_  index, accumulate, long   // Helper for multiply: picks one of four multiply instructions
 | |
|  .if \long   // 64-bit result wanted in the AC1:AC0 pair
 | |
|   .if \accumulate
 | |
|         smlal   AC0, AC1, CO\index, ST\index   // AC1:AC0 += CO * ST (signed 64-bit)
 | |
|   .else
 | |
|         smull   AC0, AC1, CO\index, ST\index   // AC1:AC0 = CO * ST (signed 64-bit), starting a new sum
 | |
|   .endif
 | |
|  .else   // 32-bit result in AC0 only; AC1 is left untouched
 | |
|   .if \accumulate
 | |
|         mla     AC0, CO\index, ST\index, AC0   // AC0 += CO * ST
 | |
|   .else
 | |
|         mul     AC0, CO\index, ST\index   // AC0 = CO * ST, starting a new sum
 | |
|   .endif
 | |
|  .endif
 | |
| .endm
 | |
| 
 | |
| // A macro to update the load register number and load offsets
 | |
| 
 | |
| .macro inc  howmany   // Advance the assembly-time load bookkeeping by howmany taps
 | |
|   .set LOAD_REG, (LOAD_REG + \howmany) & 3   // register index wraps within the four CO/ST registers
 | |
|   .set OFFSET_CO, OFFSET_CO + 4 * \howmany   // one 32-bit word per tap
 | |
|   .set OFFSET_ST, OFFSET_ST + 4 * \howmany
 | |
|   .if FIR_REMAIN > 0   // FIR taps are consumed first
 | |
|     .set FIR_REMAIN, FIR_REMAIN - \howmany
 | |
|     .if FIR_REMAIN == 0   // FIR taps exhausted: jump both offsets to the start of the IIR data
 | |
|       .set OFFSET_CO, 4 * MAX_FIR_ORDER
 | |
|       .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
 | |
|     .endif
 | |
|   .elseif IIR_REMAIN > 0   // otherwise the taps just loaded were IIR taps
 | |
|     .set IIR_REMAIN, IIR_REMAIN - \howmany
 | |
|   .endif
 | |
| .endm
 | |
| 
 | |
| // Macro to implement the inner loop for one specific combination of parameters
 | |
| 
 | |
| .macro implement_filter  mask_minus1, shift_0, shift_8, iir_taps, fir_taps   // Emit the complete sample loop for one parameter combination; the flags say whether mask is -1 and whether the shift is 0 or 8
 | |
|   .set TOTAL_TAPS, \iir_taps + \fir_taps
 | |
| 
 | |
|   // Deal with register allocation...
 | |
|   .set DEFINED_SHIFT, 0   // DEFINED_x: alias x was created and must be released at the end
 | |
|   .set DEFINED_MASK, 0
 | |
|   .set SHUFFLE_SHIFT, 0   // SHUFFLE_x: x must be copied out of ST0/ST1 into its new register
 | |
|   .set SHUFFLE_MASK, 0
 | |
|   .set SPILL_SHIFT, 0   // SPILL_x: no free register, so x is reloaded from the stack every iteration
 | |
|   .set SPILL_MASK, 0
 | |
|   .if TOTAL_TAPS == 0
 | |
|     // Little register pressure in this case - just keep MASK where it was
 | |
|     .if !\mask_minus1
 | |
|       MASK .req ST1
 | |
|       .set DEFINED_MASK, 1
 | |
|     .endif
 | |
|   .else
 | |
|     .if \shift_0
 | |
|       .if !\mask_minus1
 | |
|         // AC1 is unused with shift 0
 | |
|         MASK .req AC1
 | |
|         .set DEFINED_MASK, 1
 | |
|         .set SHUFFLE_MASK, 1
 | |
|       .endif
 | |
|     .elseif \shift_8
 | |
|       .if !\mask_minus1
 | |
|         .if TOTAL_TAPS <= 4
 | |
|         // All coefficients are preloaded (so pointer not needed)
 | |
|           MASK .req PCO
 | |
|           .set DEFINED_MASK, 1
 | |
|           .set SHUFFLE_MASK, 1
 | |
|         .else
 | |
|           .set SPILL_MASK, 1
 | |
|         .endif
 | |
|       .endif
 | |
|     .else // shift not 0 or 8
 | |
|       .if TOTAL_TAPS <= 3
 | |
|         // All coefficients are preloaded, and at least one CO register is unused
 | |
|         .if \fir_taps & 1   // odd FIR count: preloading starts at CO1, so CO0 stays free
 | |
|           SHIFT .req CO0
 | |
|           .set DEFINED_SHIFT, 1
 | |
|           .set SHUFFLE_SHIFT, 1
 | |
|         .else   // even FIR count: preloading starts at CO0, so CO3 stays free
 | |
|           SHIFT .req CO3
 | |
|           .set DEFINED_SHIFT, 1
 | |
|           .set SHUFFLE_SHIFT, 1
 | |
|         .endif
 | |
|         .if !\mask_minus1
 | |
|           MASK .req PCO
 | |
|           .set DEFINED_MASK, 1
 | |
|           .set SHUFFLE_MASK, 1
 | |
|         .endif
 | |
|       .elseif TOTAL_TAPS == 4
 | |
|         // All coefficients are preloaded
 | |
|         SHIFT .req PCO
 | |
|         .set DEFINED_SHIFT, 1
 | |
|         .set SHUFFLE_SHIFT, 1
 | |
|         .if !\mask_minus1
 | |
|           .set SPILL_MASK, 1
 | |
|         .endif
 | |
|       .else
 | |
|         .set SPILL_SHIFT, 1
 | |
|         .if !\mask_minus1
 | |
|           .set SPILL_MASK, 1
 | |
|         .endif
 | |
|       .endif
 | |
|     .endif
 | |
|   .endif
 | |
|   .if SPILL_SHIFT   // spilled values are reloaded into ST0/ST1 after the multiplies have consumed the state registers
 | |
|     SHIFT .req ST0
 | |
|     .set DEFINED_SHIFT, 1
 | |
|   .endif
 | |
|   .if SPILL_MASK
 | |
|     MASK .req ST1
 | |
|     .set DEFINED_MASK, 1
 | |
|   .endif
 | |
| 
 | |
|         // Preload coefficients if possible
 | |
|   .if TOTAL_TAPS <= 4   // four CO registers are enough to hold every coefficient
 | |
|     .set OFFSET_CO, 0
 | |
|     .if \fir_taps & 1   // same starting register rule as the in-loop state loads below, so CO<n> pairs with ST<n>
 | |
|       .set LOAD_REG, 1
 | |
|     .else
 | |
|       .set LOAD_REG, 0
 | |
|     .endif
 | |
|     .rept \fir_taps   // FIR coefficients start at offset 0 from PCO
 | |
|         load    CO, LOAD_REG, PCO, OFFSET_CO
 | |
|       .set LOAD_REG, (LOAD_REG + 1) & 3
 | |
|       .set OFFSET_CO, OFFSET_CO + 4
 | |
|     .endr
 | |
|     .set OFFSET_CO, 4 * MAX_FIR_ORDER   // IIR coefficients follow MAX_FIR_ORDER words of FIR coefficients
 | |
|     .rept \iir_taps
 | |
|         load    CO, LOAD_REG, PCO, OFFSET_CO
 | |
|       .set LOAD_REG, (LOAD_REG + 1) & 3
 | |
|       .set OFFSET_CO, OFFSET_CO + 4
 | |
|     .endr
 | |
|   .endif
 | |
| 
 | |
|         // Move mask/shift to final positions if necessary
 | |
|         // Need to do this after preloading, because in some cases we
 | |
|         // reuse the coefficient pointer register
 | |
|   .if SHUFFLE_SHIFT
 | |
|         mov     SHIFT, ST0   // filter_shift arrives in ST0
 | |
|   .endif
 | |
|   .if SHUFFLE_MASK
 | |
|         mov     MASK, ST1   // mask arrives in ST1
 | |
|   .endif
 | |
| 
 | |
|         // Begin loop
 | |
| 01:
 | |
|   .if TOTAL_TAPS == 0
 | |
|         // Things simplify a lot in this case
 | |
|         // In fact this could be pipelined further if it's worth it...
 | |
|         ldr     ST0, [PSAMP]   // fetch the input sample
 | |
|         subs    I, I, #1   // count down; the flags survive until the bne below
 | |
|     .if !\mask_minus1
 | |
|         and     ST0, ST0, MASK
 | |
|     .endif
 | |
|         str     ST0, [PST, #-4]!   // step PST back one word and store into the first state area
 | |
|         str     ST0, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]   // same value into the second state area
 | |
|         str     ST0, [PSAMP], #4 * MAX_CHANNELS   // write the result back and step to the next sample of this channel
 | |
|         bne     01b
 | |
|   .else
 | |
|     .if \fir_taps & 1   // choose the first register so that LOAD_REG is 2 (even) once the initial single loads are done
 | |
|       .set LOAD_REG, 1
 | |
|     .else
 | |
|       .set LOAD_REG, 0
 | |
|     .endif
 | |
|     .set LOAD_BANK, 0   // 0: next paired load fetches coefficients, 1: next paired load fetches state
 | |
|     .set FIR_REMAIN, \fir_taps
 | |
|     .set IIR_REMAIN, \iir_taps
 | |
|     .if FIR_REMAIN == 0 // only IIR terms
 | |
|       .set OFFSET_CO, 4 * MAX_FIR_ORDER
 | |
|       .set OFFSET_ST, 4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)
 | |
|     .else
 | |
|       .set OFFSET_CO, 0
 | |
|       .set OFFSET_ST, 0
 | |
|     .endif
 | |
|     .set MUL_REG, LOAD_REG   // multiplies walk the registers in the same order as the loads
 | |
|     .set COUNTER, 0
 | |
|     .rept TOTAL_TAPS + 2   // two extra steps because the multiplies trail the loads by two steps
 | |
|         // Do load(s)
 | |
|      .if FIR_REMAIN != 0 || IIR_REMAIN != 0   // anything left to load?
 | |
|       .if COUNTER == 0   // first step: always a single-word load
 | |
|        .if TOTAL_TAPS > 4   // coefficients were not preloaded, so fetch them inside the loop
 | |
|         load    CO, LOAD_REG, PCO, OFFSET_CO
 | |
|        .endif
 | |
|         load    ST, LOAD_REG, PST, OFFSET_ST
 | |
|         inc     1
 | |
|       .elseif COUNTER == 1 && (\fir_taps & 1) == 0   // even FIR count: a second single load is needed to reach an even register
 | |
|        .if TOTAL_TAPS > 4
 | |
|         load    CO, LOAD_REG, PCO, OFFSET_CO
 | |
|        .endif
 | |
|         load    ST, LOAD_REG, PST, OFFSET_ST
 | |
|         inc     1
 | |
|       .elseif LOAD_BANK == 0   // coefficient half of a paired load; bookkeeping advances on the state half
 | |
|        .if TOTAL_TAPS > 4
 | |
|         .if FIR_REMAIN == 0 && IIR_REMAIN == 1   // a lone final IIR tap gets a single load
 | |
|         load    CO, LOAD_REG, PCO, OFFSET_CO
 | |
|         .else
 | |
|         loadd   CO, LOAD_REG, PCO, OFFSET_CO
 | |
|         .endif
 | |
|        .endif
 | |
|        .set LOAD_BANK, 1
 | |
|       .else   // state half of a paired load
 | |
|        .if FIR_REMAIN == 0 && IIR_REMAIN == 1
 | |
|         load    ST, LOAD_REG, PST, OFFSET_ST
 | |
|         inc     1
 | |
|        .else
 | |
|         loadd   ST, LOAD_REG, PST, OFFSET_ST
 | |
|         inc     2
 | |
|        .endif
 | |
|        .set LOAD_BANK, 0
 | |
|       .endif
 | |
|      .endif
 | |
| 
 | |
|         // Do interleaved multiplies, slightly delayed
 | |
|      .if COUNTER >= 2
 | |
|         multiply MUL_REG, COUNTER > 2, !\shift_0   // first product overwrites the accumulator, later ones add; 64-bit unless the shift is 0
 | |
|       .set MUL_REG, (MUL_REG + 1) & 3
 | |
|      .endif
 | |
|      .set COUNTER, COUNTER + 1
 | |
|     .endr
 | |
| 
 | |
|         // Post-process the result of the multiplies
 | |
|     .if SPILL_SHIFT
 | |
|         ldr     SHIFT, [sp, #9*4 + 0*4]   // first stacked argument, located above the 9 registers pushed on entry
 | |
|     .endif
 | |
|     .if SPILL_MASK
 | |
|         ldr     MASK, [sp, #9*4 + 1*4]   // second stacked argument
 | |
|     .endif
 | |
|         ldr     ST2, [PSAMP]   // fetch the input sample
 | |
|         subs    I, I, #1   // count down; nothing below alters the flags before the bne
 | |
|     .if \shift_8
 | |
|         mov     AC0, AC0, lsr #8   // AC0 = low 32 bits of (AC1:AC0 >> 8)
 | |
|         orr     AC0, AC0, AC1, lsl #24
 | |
|     .elseif !\shift_0
 | |
|         rsb     ST3, SHIFT, #32   // ST3 = 32 - SHIFT
 | |
|         mov     AC0, AC0, lsr SHIFT   // AC0 = low 32 bits of (AC1:AC0 >> SHIFT), completed by the orr below
 | |
| A       orr     AC0, AC0, AC1, lsl ST3
 | |
| T       mov     AC1, AC1, lsl ST3   // the T-prefixed (presumably Thumb) variant does the register shift in a separate instruction
 | |
| T       orr     AC0, AC0, AC1
 | |
|     .endif
 | |
|     .if \mask_minus1
 | |
|         add     ST3, ST2, AC0   // result = sample + accumulator; ST2 keeps the unmodified sample
 | |
|     .else
 | |
|         add     ST2, ST2, AC0
 | |
|         and     ST3, ST2, MASK   // result = (sample + accumulator) & mask
 | |
|         sub     ST2, ST3, AC0   // ST2 = result - accumulator
 | |
|     .endif
 | |
|         str     ST3, [PST, #-4]!   // step PST back one word and store the result into the first state area
 | |
|         str     ST2, [PST, #4 * (MAX_BLOCKSIZE + MAX_FIR_ORDER)]   // result minus accumulator into the second state area
 | |
|         str     ST3, [PSAMP], #4 * MAX_CHANNELS   // write the result back and step to the next sample of this channel
 | |
|         bne     01b
 | |
|   .endif
 | |
|         b       99f   // leave through label 99, which the code using this macro must provide
 | |
| 
 | |
|   .if DEFINED_SHIFT   // release the aliases so the next expansion can bind them differently
 | |
|     .unreq SHIFT
 | |
|   .endif
 | |
|   .if DEFINED_MASK
 | |
|     .unreq MASK
 | |
|   .endif
 | |
| .endm
 | |
| 
 | |
| .macro switch_on_fir_taps  mask_minus1, shift_0, shift_8, iir_taps   // Jump-table dispatch on the run-time FIR order in a3, for a fixed IIR tap count
 | |
| A       ldr     CO0, [pc, a3, lsl #2]   // firorder is in range 0-(8-iir_taps)
 | |
| A       add     pc,  pc,  CO0   // A-prefixed lines are presumably the ARM variant: jump by the offset fetched from the table
 | |
| T       tbh     [pc, a3, lsl #1]   // T-prefixed line is presumably the Thumb variant: table branch through halfword offsets
 | |
| 0:
 | |
|         branch_pic_label (70f - 0b), (71f - 0b), (72f - 0b), (73f - 0b)   // entries are offsets from label 0, so the table is position independent
 | |
|         branch_pic_label (74f - 0b)
 | |
|  .if \iir_taps <= 3   // only emit entries for which fir_taps + iir_taps is at most 8
 | |
|         branch_pic_label (75f - 0b)
 | |
|   .if \iir_taps <= 2
 | |
|         branch_pic_label (76f - 0b)
 | |
|    .if \iir_taps <= 1
 | |
|         branch_pic_label (77f - 0b)
 | |
|     .if \iir_taps == 0
 | |
|         branch_pic_label (78f - 0b)
 | |
|     .endif
 | |
|    .endif
 | |
|   .endif
 | |
|  .endif
 | |
| 70:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 0   // label 7n holds the loop specialised for n FIR taps
 | |
| 71:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 1
 | |
| 72:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 2
 | |
| 73:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 3
 | |
| 74:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 4
 | |
|  .if \iir_taps <= 3   // same limits as the table entries above
 | |
| 75:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 5
 | |
|   .if \iir_taps <= 2
 | |
| 76:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 6
 | |
|    .if \iir_taps <= 1
 | |
| 77:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 7
 | |
|     .if \iir_taps == 0
 | |
| 78:     implement_filter  \mask_minus1, \shift_0, \shift_8, \iir_taps, 8
 | |
|     .endif
 | |
|    .endif
 | |
|   .endif
 | |
|  .endif
 | |
| .endm
 | |
| 
 | |
| .macro switch_on_iir_taps  mask_minus1, shift_0, shift_8   // Jump-table dispatch on the run-time IIR order in a4; each target dispatches further on the FIR order
 | |
| A       ldr     CO0, [pc, a4, lsl #2]   // iirorder is in range 0-4
 | |
| A       add     pc,  pc,  CO0   // A-prefixed lines are presumably the ARM variant: jump by the offset fetched from the table
 | |
| T       tbh     [pc, a4, lsl #1]   // T-prefixed line is presumably the Thumb variant: table branch through halfword offsets
 | |
| 0:
 | |
|         branch_pic_label (60f - 0b), (61f - 0b), (62f - 0b), (63f - 0b)   // entries are offsets from label 0, so the table is position independent
 | |
|         branch_pic_label (64f - 0b)
 | |
| 60:     switch_on_fir_taps  \mask_minus1, \shift_0, \shift_8, 0   // label 6n handles n IIR taps
 | |
| 61:     switch_on_fir_taps  \mask_minus1, \shift_0, \shift_8, 1
 | |
| 62:     switch_on_fir_taps  \mask_minus1, \shift_0, \shift_8, 2
 | |
| 63:     switch_on_fir_taps  \mask_minus1, \shift_0, \shift_8, 3
 | |
| 64:     switch_on_fir_taps  \mask_minus1, \shift_0, \shift_8, 4
 | |
| .endm
 | |
| 
 | |
| /* void ff_mlp_filter_channel_arm(int32_t *state, const int32_t *coeff,
 | |
|  *                                int firorder, int iirorder,
 | |
|  *                                unsigned int filter_shift, int32_t mask,
 | |
|  *                                int blocksize, int32_t *sample_buffer);
 | |
|  */
 | |
| function ff_mlp_filter_channel_arm, export=1   // Entry point: classifies mask and shift, then dispatches to a loop specialised for the tap counts
 | |
|         push    {v1-fp,lr}   // 9 registers saved, hence the 9*4 offset to the stacked arguments
 | |
|         add     v1, sp, #9*4 // point at arguments on stack
 | |
|         ldm     v1, {ST0,ST1,I,PSAMP}   // ST0 = filter_shift, ST1 = mask, I = blocksize, PSAMP = sample_buffer
 | |
|         cmp     ST1, #-1   // a mask of -1 makes the masking step a no-op
 | |
|         bne     30f   // any other mask: use the variants that apply it
 | |
|         movs    ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8
 | |
|         bne     20f   // low three bits of the shift are non-zero: general shift
 | |
|         bcs     10f   // bit 3 was shifted out into carry: shift is 8
 | |
|         switch_on_iir_taps 1, 1, 0   // mask -1, shift 0
 | |
| 10:     switch_on_iir_taps 1, 0, 1   // mask -1, shift 8
 | |
| 20:     switch_on_iir_taps 1, 0, 0   // mask -1, general shift
 | |
| 30:     movs    ST2, ST0, lsl #29 // shift is in range 0-15; we want to special-case 0 and 8
 | |
|         bne     50f
 | |
|         bcs     40f
 | |
|         switch_on_iir_taps 0, 1, 0   // masked, shift 0
 | |
| 40:     switch_on_iir_taps 0, 0, 1   // masked, shift 8
 | |
| 50:     switch_on_iir_taps 0, 0, 0   // masked, general shift
 | |
| 99:     pop     {v1-fp,pc}   // common exit reached from every generated loop
 | |
| endfunc
 | |
| 
 | |
|         .unreq  PST   // release the filter aliases; several of these names are bound again for the rematrix code that follows
 | |
|         .unreq  PCO
 | |
|         .unreq  AC0
 | |
|         .unreq  AC1
 | |
|         .unreq  CO0
 | |
|         .unreq  CO1
 | |
|         .unreq  CO2
 | |
|         .unreq  CO3
 | |
|         .unreq  ST0
 | |
|         .unreq  ST1
 | |
|         .unreq  ST2
 | |
|         .unreq  ST3
 | |
|         .unreq  I
 | |
|         .unreq  PSAMP
 | |
| 
 | |
| /********************************************************************/
 | |
| 
 | |
| PSA     .req    a1 // samples
 | |
| PCO     .req    a2 // coeffs
 | |
| PBL     .req    a3 // bypassed_lsbs
 | |
| INDEX   .req    a4   // packed loop state (layout described below); a4 holds noise_buffer until it has been pushed
 | |
| CO0     .req    v1   // CO0-CO3: coefficient registers
 | |
| CO1     .req    v2
 | |
| CO2     .req    v3
 | |
| CO3     .req    v4
 | |
| SA0     .req    v5   // SA0-SA3: sample registers
 | |
| SA1     .req    v6
 | |
| SA2     .req    sl
 | |
| SA3     .req    fp
 | |
| AC0     .req    ip   // accumulator low word
 | |
| AC1     .req    lr   // accumulator high word
 | |
| NOISE   .req    SA0   // the next four names reuse sample registers once the products are under way
 | |
| LSB     .req    SA1   // byte fetched from bypassed_lsbs
 | |
| DCH     .req    SA2 // dest_ch
 | |
| MASK    .req    SA3
 | |
| 
 | |
|     // INDEX is used as follows:
 | |
|     // bits 0..6   index2 (values up to 17, but wider so that we can
 | |
|     //               add to index field without needing to mask)
 | |
|     // bits 7..14  i (values up to 160)
 | |
|     // bit 15      underflow detect for i
 | |
|     // bits 25..31 (if access_unit_size_pow2 == 128)  \ index
 | |
|     // bits 26..31 (if access_unit_size_pow2 == 64)   /
 | |
| 
 | |
| .macro implement_rematrix  shift, index_mask, mask_minus1, maxchan   // Emit the complete per-sample rematrix loop for one parameter combination; it multiplies maxchan + 1 coefficient/sample pairs
 | |
|     .if \maxchan == 1
 | |
|         // We can just leave the coefficients in registers in this case
 | |
|         ldrd    CO0, CO1, [PCO]
 | |
|     .endif
 | |
| 1:
 | |
|     .if \maxchan == 1
 | |
|         ldrd    SA0, SA1, [PSA]
 | |
|         smull   AC0, AC1, CO0, SA0   // the second product is the smlal shared by all variants below
 | |
|     .elseif \maxchan == 5
 | |
|         ldr     CO0, [PCO, #0]   // loads and multiplies are interleaved; CO0/CO1 and SA0/SA1 are refilled for the last pair
 | |
|         ldr     SA0, [PSA, #0]
 | |
|         ldr     CO1, [PCO, #4]
 | |
|         ldr     SA1, [PSA, #4]
 | |
|         ldrd    CO2, CO3, [PCO, #8]
 | |
|         smull   AC0, AC1, CO0, SA0
 | |
|         ldrd    SA2, SA3, [PSA, #8]
 | |
|         smlal   AC0, AC1, CO1, SA1
 | |
|         ldrd    CO0, CO1, [PCO, #16]
 | |
|         smlal   AC0, AC1, CO2, SA2
 | |
|         ldrd    SA0, SA1, [PSA, #16]
 | |
|         smlal   AC0, AC1, CO3, SA3
 | |
|         smlal   AC0, AC1, CO0, SA0
 | |
|     .else // \maxchan == 7
 | |
|         ldr     CO2, [PCO, #0]   // starts in CO2/SA2 so that the last pair again lands in CO0/CO1 and SA0/SA1
 | |
|         ldr     SA2, [PSA, #0]
 | |
|         ldr     CO3, [PCO, #4]
 | |
|         ldr     SA3, [PSA, #4]
 | |
|         ldrd    CO0, CO1, [PCO, #8]
 | |
|         smull   AC0, AC1, CO2, SA2
 | |
|         ldrd    SA0, SA1, [PSA, #8]
 | |
|         smlal   AC0, AC1, CO3, SA3
 | |
|         ldrd    CO2, CO3, [PCO, #16]
 | |
|         smlal   AC0, AC1, CO0, SA0
 | |
|         ldrd    SA2, SA3, [PSA, #16]
 | |
|         smlal   AC0, AC1, CO1, SA1
 | |
|         ldrd    CO0, CO1, [PCO, #24]
 | |
|         smlal   AC0, AC1, CO2, SA2
 | |
|         ldrd    SA0, SA1, [PSA, #24]
 | |
|         smlal   AC0, AC1, CO3, SA3
 | |
|         smlal   AC0, AC1, CO0, SA0
 | |
|     .endif
 | |
|         ldm     sp, {NOISE, DCH, MASK}   // reload the three values pushed by ff_mlp_rematrix_channel_arm; this overwrites SA0, SA2 and SA3
 | |
|         smlal   AC0, AC1, CO1, SA1   // final product; SA1 is still valid because the ldm above leaves it alone
 | |
|     .if \shift != 0   // noise is only mixed in for a non-zero matrix_noise_shift
 | |
|       .if \index_mask == 63
 | |
|         add     NOISE, NOISE, INDEX, lsr #32-6   // noise pointer + index, where index sits in the top 6 bits of INDEX
 | |
|         ldrb    LSB, [PBL], #MAX_CHANNELS   // fetch this sample's bypassed LSB byte and step to the next sample
 | |
|         ldrsb   NOISE, [NOISE]   // signed noise byte
 | |
|         add     INDEX, INDEX, INDEX, lsl #32-6   // index += index2; only the top 6 bits change, so it wraps modulo 64
 | |
|       .else // \index_mask == 127
 | |
|         add     NOISE, NOISE, INDEX, lsr #32-7   // noise pointer + index, where index sits in the top 7 bits of INDEX
 | |
|         ldrb    LSB, [PBL], #MAX_CHANNELS   // fetch this sample's bypassed LSB byte and step to the next sample
 | |
|         ldrsb   NOISE, [NOISE]   // signed noise byte
 | |
|         add     INDEX, INDEX, INDEX, lsl #32-7   // index += index2; only the top 7 bits change, so it wraps modulo 128
 | |
|       .endif
 | |
|         sub     INDEX, INDEX, #1<<7   // decrement i; bit 15 becomes set once i underflows
 | |
|         adds    AC0, AC0, NOISE, lsl #\shift + 7   // accumulator += noise << (shift + 7), low word
 | |
|         adc     AC1, AC1, NOISE, asr #31   // high word: carry plus the sign extension of the noise value
 | |
|     .else
 | |
|         ldrb    LSB, [PBL], #MAX_CHANNELS   // fetch this sample's bypassed LSB byte and step to the next sample
 | |
|         sub     INDEX, INDEX, #1<<7   // decrement i; bit 15 becomes set once i underflows
 | |
|     .endif
 | |
|         add     PSA, PSA, #MAX_CHANNELS*4   // step to the next sample early; DCH compensates for this in the store below
 | |
|         mov     AC0, AC0, lsr #14   // AC0 = low 32 bits of (AC1:AC0 >> 14)
 | |
|         orr     AC0, AC0, AC1, lsl #18
 | |
|     .if !\mask_minus1
 | |
|         and     AC0, AC0, MASK
 | |
|     .endif
 | |
|         add     AC0, AC0, LSB   // add the bypassed LSB byte
 | |
|         tst     INDEX, #1<<15   // has i underflowed?
 | |
|         str     AC0, [PSA, DCH, lsl #2]  // DCH is precompensated for the early increment of PSA
 | |
|         beq     1b   // not yet: process the next sample
 | |
|         b       98f   // done: leave through label 98, which the code using this macro must provide
 | |
| .endm
 | |
| 
 | |
| .macro switch_on_maxchan  shift, index_mask, mask_minus1   // Select the loop variant for the run-time maxchan value held in v4
 | |
|         cmp     v4, #5
 | |
|         blo     51f   // below 5: the maxchan == 1 variant
 | |
|         beq     50f   // exactly 5
 | |
|         implement_rematrix  \shift, \index_mask, \mask_minus1, 7   // fall through: above 5, handled as maxchan == 7
 | |
| 50:     implement_rematrix  \shift, \index_mask, \mask_minus1, 5
 | |
| 51:     implement_rematrix  \shift, \index_mask, \mask_minus1, 1
 | |
| .endm
 | |
| 
 | |
| .macro switch_on_mask  shift, index_mask   // Select between the masked and unmasked variants from the run-time mask held in sl
 | |
|         cmp     sl, #-1   // a mask of -1 makes the masking step a no-op
 | |
|         bne     40f
 | |
|         switch_on_maxchan  \shift, \index_mask, 1   // variants that skip the masking step
 | |
| 40:     switch_on_maxchan  \shift, \index_mask, 0   // variants that apply the mask
 | |
| .endm
 | |
| 
 | |
| .macro switch_on_au_size  shift   // Place index in the top bits of INDEX according to access_unit_size_pow2 (held in v6), then dispatch on mask
 | |
|   .if \shift == 0
 | |
|         switch_on_mask  \shift, undefined   // implement_rematrix never examines index_mask when shift is 0, so a placeholder is passed
 | |
|   .else
 | |
|         teq     v6, #64
 | |
|         bne     30f   // not 64: take the 128 path
 | |
|         orr     INDEX, INDEX, v1, lsl #32-6   // v1 holds the index argument; keep its low 6 bits in the top of INDEX
 | |
|         switch_on_mask  \shift, 63
 | |
| 30:     orr     INDEX, INDEX, v1, lsl #32-7   // keep the low 7 bits of index in the top of INDEX
 | |
|         switch_on_mask  \shift, 127
 | |
|   .endif
 | |
| .endm
 | |
| 
 | |
| /* void ff_mlp_rematrix_channel_arm(int32_t *samples,
 | |
|  *                                  const int32_t *coeffs,
 | |
|  *                                  const uint8_t *bypassed_lsbs,
 | |
|  *                                  const int8_t *noise_buffer,
 | |
|  *                                  int index,
 | |
|  *                                  unsigned int dest_ch,
 | |
|  *                                  uint16_t blockpos,
 | |
|  *                                  unsigned int maxchan,
 | |
|  *                                  int matrix_noise_shift,
 | |
|  *                                  int access_unit_size_pow2,
 | |
|  *                                  int32_t mask);
 | |
|  */
 | |
| function ff_mlp_rematrix_channel_arm, export=1   // Entry point: validates the parameters, packs the loop state into INDEX and dispatches to a specialised loop
 | |
|         push    {v1-fp,lr}   // 9 registers saved, hence the 9*4 offset to the stacked arguments
 | |
|         add     v1, sp, #9*4 // point at arguments on stack
 | |
|         ldm     v1, {v1-sl}   // v1 = index, v2 = dest_ch, v3 = blockpos, v4 = maxchan, v5 = matrix_noise_shift, v6 = access_unit_size_pow2, sl = mask
 | |
|         teq     v4, #1   // only maxchan values 1, 5 and 7 have specialised loops
 | |
|         itt     ne
 | |
|         teqne   v4, #5
 | |
|         teqne   v4, #7
 | |
|         bne     99f
 | |
|         teq     v6, #64   // only access unit sizes 64 and 128 are supported
 | |
|         it      ne
 | |
|         teqne   v6, #128
 | |
|         bne     99f
 | |
|         sub     v2, v2, #MAX_CHANNELS   // precompensate dest_ch for the loop advancing the sample pointer before it stores
 | |
|         push    {a4,v2,sl}          // initialise NOISE,DCH,MASK; make sp dword-aligned
 | |
|         movs    INDEX, v3, lsl #7   // i = blockpos, placed at bit 7 and above
 | |
|         beq     98f                 // just in case, do nothing if blockpos = 0
 | |
|         subs    INDEX, INDEX, #1<<7 // offset by 1 so we borrow at the right time
 | |
|         adc     lr, v1, v1          // calculate index2 (C was set by preceding subs)
 | |
|         orr     INDEX, INDEX, lr   // index2 goes into the low bits of INDEX
 | |
|         // Switch on matrix_noise_shift: values 0 and 1 are
 | |
|         // disproportionately common so do those in a form the branch
 | |
|         // predictor can accelerate. Values can only go up to 15.
 | |
|         cmp     v5, #1
 | |
|         beq     11f
 | |
|         blo     10f
 | |
| A       ldr     v5,  [pc,  v5,  lsl #2]   // A-prefixed lines are presumably the ARM variant: fetch the table offset for this shift
 | |
| A       add     pc,  pc,  v5
 | |
| T       tbh     [pc, v5, lsl #1]   // T-prefixed line is presumably the Thumb variant: table branch through halfword offsets
 | |
| 0:
 | |
|         branch_pic_label          0,          0, (12f - 0b), (13f - 0b)   // slots 0 and 1 are placeholders; those shifts were branched to directly above
 | |
|         branch_pic_label (14f - 0b), (15f - 0b), (16f - 0b), (17f - 0b)
 | |
|         branch_pic_label (18f - 0b), (19f - 0b), (20f - 0b), (21f - 0b)
 | |
|         branch_pic_label (22f - 0b), (23f - 0b), (24f - 0b), (25f - 0b)
 | |
| 10:     switch_on_au_size  0   // label 10 + n holds the code for matrix_noise_shift == n
 | |
| 11:     switch_on_au_size  1
 | |
| 12:     switch_on_au_size  2
 | |
| 13:     switch_on_au_size  3
 | |
| 14:     switch_on_au_size  4
 | |
| 15:     switch_on_au_size  5
 | |
| 16:     switch_on_au_size  6
 | |
| 17:     switch_on_au_size  7
 | |
| 18:     switch_on_au_size  8
 | |
| 19:     switch_on_au_size  9
 | |
| 20:     switch_on_au_size  10
 | |
| 21:     switch_on_au_size  11
 | |
| 22:     switch_on_au_size  12
 | |
| 23:     switch_on_au_size  13
 | |
| 24:     switch_on_au_size  14
 | |
| 25:     switch_on_au_size  15
 | |
| 
 | |
| 98:     add     sp, sp, #3*4   // drop the three words pushed for NOISE, DCH and MASK
 | |
|         pop     {v1-fp,pc}
 | |
| 99:     // Can't handle these parameters, drop back to C
 | |
|         pop     {v1-fp,lr}   // restore everything, including lr, so the C function returns straight to our caller
 | |
|         b       X(ff_mlp_rematrix_channel)
 | |
| endfunc
 | |
| 
 | |
|         .unreq  PSA   // release every alias bound for the rematrix code
 | |
|         .unreq  PCO
 | |
|         .unreq  PBL
 | |
|         .unreq  INDEX
 | |
|         .unreq  CO0
 | |
|         .unreq  CO1
 | |
|         .unreq  CO2
 | |
|         .unreq  CO3
 | |
|         .unreq  SA0
 | |
|         .unreq  SA1
 | |
|         .unreq  SA2
 | |
|         .unreq  SA3
 | |
|         .unreq  AC0
 | |
|         .unreq  AC1
 | |
|         .unreq  NOISE
 | |
|         .unreq  LSB
 | |
|         .unreq  DCH
 | |
|         .unreq  MASK
 |