arm: vp9lpf: Implement the mix2_44 function with one single filter pass

For this case, with 8 inputs but only changing 4 of them, we can fit
all 16 input pixels into a q register, and still have enough temporary
registers for doing the loop filter.

The wd=8 filters would require too many temporary registers for
processing all 16 pixels at once though.

Before:                          Cortex A7      A8     A9     A53
vp9_loop_filter_mix2_v_44_16_neon:   289.7   256.2  237.5   181.2
After:
vp9_loop_filter_mix2_v_44_16_neon:   221.2   150.5  177.7   138.0

This is cherrypicked from libav commit
575e31e931.

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
Martin Storsjö
2017-01-14 13:22:30 +02:00
parent f32690a298
commit a88db8b9a0
2 changed files with 195 additions and 3 deletions

View File

@@ -195,6 +195,8 @@ define_loop_filters(8, 8);
define_loop_filters(16, 8);
define_loop_filters(16, 16);
define_loop_filters(44, 16);
#define lf_mix_fn(dir, wd1, wd2, stridea) \
static void loop_filter_##dir##_##wd1##wd2##_16_neon(uint8_t *dst, \
ptrdiff_t stride, \
@@ -208,7 +210,6 @@ static void loop_filter_##dir##_##wd1##wd2##_16_neon(uint8_t *dst,
lf_mix_fn(h, wd1, wd2, stride) \
lf_mix_fn(v, wd1, wd2, sizeof(uint8_t))
lf_mix_fns(4, 4)
lf_mix_fns(4, 8)
lf_mix_fns(8, 4)
lf_mix_fns(8, 8)
@@ -228,8 +229,8 @@ static av_cold void vp9dsp_loopfilter_init_arm(VP9DSPContext *dsp)
dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_neon;
dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_neon;
dsp->loop_filter_mix2[0][0][0] = loop_filter_h_44_16_neon;
dsp->loop_filter_mix2[0][0][1] = loop_filter_v_44_16_neon;
dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_neon;
dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_neon;
dsp->loop_filter_mix2[0][1][0] = loop_filter_h_48_16_neon;
dsp->loop_filter_mix2[0][1][1] = loop_filter_v_48_16_neon;
dsp->loop_filter_mix2[1][0][0] = loop_filter_h_84_16_neon;