Mirror of https://github.com/nyanmisaka/ffmpeg-rockchip.git (synced 2025-10-31 20:42:49 +08:00).
			
		
		
		
	Add x86 SIMD (MMX/SSE/SSE2) full-pixel motion-compensation (MC) functions for VP9.
Decoding time of ped1080p.webm goes from 11.3 sec to 11.1 sec.
This commit is contained in:
		| @@ -219,3 +219,60 @@ filter_v_fn avg | |||||||
| INIT_XMM ssse3 | INIT_XMM ssse3 | ||||||
| filter_v_fn put | filter_v_fn put | ||||||
| filter_v_fn avg | filter_v_fn avg | ||||||
|  |  | ||||||
|  | %macro fpel_fn 6 | ||||||
|  | %if %2 == 4 | ||||||
|  | %define %%srcfn movh | ||||||
|  | %define %%dstfn movh | ||||||
|  | %else | ||||||
|  | %define %%srcfn movu | ||||||
|  | %define %%dstfn mova | ||||||
|  | %endif | ||||||
|  |  | ||||||
|  | %if %2 <= 16 | ||||||
|  | cglobal %1%2, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3 | ||||||
|  |     lea  sstride3q, [sstrideq*3] | ||||||
|  |     lea  dstride3q, [dstrideq*3] | ||||||
|  | %else | ||||||
|  | cglobal %1%2, 5, 5, 4, dst, dstride, src, sstride, h | ||||||
|  | %endif | ||||||
|  | .loop: | ||||||
|  |     %%srcfn     m0, [srcq] | ||||||
|  |     %%srcfn     m1, [srcq+s%3] | ||||||
|  |     %%srcfn     m2, [srcq+s%4] | ||||||
|  |     %%srcfn     m3, [srcq+s%5] | ||||||
|  |     lea       srcq, [srcq+sstrideq*%6] | ||||||
|  | %ifidn %1, avg | ||||||
|  |     pavgb       m0, [dstq] | ||||||
|  |     pavgb       m1, [dstq+d%3] | ||||||
|  |     pavgb       m2, [dstq+d%4] | ||||||
|  |     pavgb       m3, [dstq+d%5] | ||||||
|  | %endif | ||||||
|  |     %%dstfn [dstq], m0 | ||||||
|  |     %%dstfn [dstq+d%3], m1 | ||||||
|  |     %%dstfn [dstq+d%4], m2 | ||||||
|  |     %%dstfn [dstq+d%5], m3 | ||||||
|  |     lea       dstq, [dstq+dstrideq*%6] | ||||||
|  |     sub         hd, %6 | ||||||
|  |     jnz .loop | ||||||
|  |     RET | ||||||
|  | %endmacro | ||||||
|  |  | ||||||
|  | %define d16 16 | ||||||
|  | %define s16 16 | ||||||
|  | INIT_MMX mmx | ||||||
|  | fpel_fn put, 4,  strideq, strideq*2, stride3q, 4 | ||||||
|  | fpel_fn put, 8,  strideq, strideq*2, stride3q, 4 | ||||||
|  | INIT_MMX sse | ||||||
|  | fpel_fn avg, 4,  strideq, strideq*2, stride3q, 4 | ||||||
|  | fpel_fn avg, 8,  strideq, strideq*2, stride3q, 4 | ||||||
|  | INIT_XMM sse | ||||||
|  | fpel_fn put, 16, strideq, strideq*2, stride3q, 4 | ||||||
|  | fpel_fn put, 32, mmsize,  strideq,   strideq+mmsize, 2 | ||||||
|  | fpel_fn put, 64, mmsize,  mmsize*2,  mmsize*3, 1 | ||||||
|  | INIT_XMM sse2 | ||||||
|  | fpel_fn avg, 16, strideq, strideq*2, stride3q, 4 | ||||||
|  | fpel_fn avg, 32, mmsize,  strideq,   strideq+mmsize, 2 | ||||||
|  | fpel_fn avg, 64, mmsize,  mmsize*2,  mmsize*3, 1 | ||||||
|  | %undef s16 | ||||||
|  | %undef d16 | ||||||
|   | |||||||
| @@ -27,6 +27,22 @@ | |||||||
|  |  | ||||||
| #if HAVE_YASM | #if HAVE_YASM | ||||||
|  |  | ||||||
|  | #define fpel_func(avg, sz, opt) \ | ||||||
|  | void ff_##avg##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ | ||||||
|  |                           const uint8_t *src, ptrdiff_t src_stride, \ | ||||||
|  |                           int h, int mx, int my) | ||||||
|  | fpel_func(put,  4, mmx); | ||||||
|  | fpel_func(put,  8, mmx); | ||||||
|  | fpel_func(put, 16, sse); | ||||||
|  | fpel_func(put, 32, sse); | ||||||
|  | fpel_func(put, 64, sse); | ||||||
|  | fpel_func(avg,  4, sse); | ||||||
|  | fpel_func(avg,  8, sse); | ||||||
|  | fpel_func(avg, 16, sse2); | ||||||
|  | fpel_func(avg, 32, sse2); | ||||||
|  | fpel_func(avg, 64, sse2); | ||||||
|  | #undef fpel_func | ||||||
|  |  | ||||||
| #define mc_func(avg, sz, dir, opt) \ | #define mc_func(avg, sz, dir, opt) \ | ||||||
| void ff_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ | void ff_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ | ||||||
|                                              const uint8_t *src, ptrdiff_t src_stride, \ |                                              const uint8_t *src, ptrdiff_t src_stride, \ | ||||||
| @@ -141,6 +157,13 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp) | |||||||
| #if HAVE_YASM | #if HAVE_YASM | ||||||
|     int cpu_flags = av_get_cpu_flags(); |     int cpu_flags = av_get_cpu_flags(); | ||||||
|  |  | ||||||
|  | #define init_fpel(idx1, idx2, sz, type, opt) \ | ||||||
|  |     dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \ | ||||||
|  |     dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \ | ||||||
|  |     dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \ | ||||||
|  |     dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_##type##sz##_##opt | ||||||
|  |  | ||||||
|  |  | ||||||
| #define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, opt) \ | #define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, opt) \ | ||||||
|     dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = type##_8tap_smooth_##sz##dir##_##opt; \ |     dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = type##_8tap_smooth_##sz##dir##_##opt; \ | ||||||
|     dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_##opt; \ |     dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_##opt; \ | ||||||
| @@ -158,11 +181,31 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp) | |||||||
|     init_subpel2(idx, 0, 1, v, type, opt); \ |     init_subpel2(idx, 0, 1, v, type, opt); \ | ||||||
|     init_subpel2(idx, 1, 0, h, type, opt) |     init_subpel2(idx, 1, 0, h, type, opt) | ||||||
|  |  | ||||||
|  |     if (cpu_flags & AV_CPU_FLAG_MMX) { | ||||||
|  |         init_fpel(4, 0,  4, put, mmx); | ||||||
|  |         init_fpel(3, 0,  8, put, mmx); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     if (cpu_flags & AV_CPU_FLAG_SSE) { | ||||||
|  |         init_fpel(2, 0, 16, put, sse); | ||||||
|  |         init_fpel(1, 0, 32, put, sse); | ||||||
|  |         init_fpel(0, 0, 64, put, sse); | ||||||
|  |         init_fpel(4, 1,  4, avg, sse); | ||||||
|  |         init_fpel(3, 1,  8, avg, sse); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     if (cpu_flags & AV_CPU_FLAG_SSE2) { | ||||||
|  |         init_fpel(2, 1, 16, avg, sse2); | ||||||
|  |         init_fpel(1, 1, 32, avg, sse2); | ||||||
|  |         init_fpel(0, 1, 64, avg, sse2); | ||||||
|  |     } | ||||||
|  |  | ||||||
|     if (cpu_flags & AV_CPU_FLAG_SSSE3) { |     if (cpu_flags & AV_CPU_FLAG_SSSE3) { | ||||||
|         init_subpel3(0, put, ssse3); |         init_subpel3(0, put, ssse3); | ||||||
|         init_subpel3(1, avg, ssse3); |         init_subpel3(1, avg, ssse3); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  | #undef init_fpel | ||||||
| #undef init_subpel1 | #undef init_subpel1 | ||||||
| #undef init_subpel2 | #undef init_subpel2 | ||||||
| #undef init_subpel3 | #undef init_subpel3 | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Ronald S. Bultje
					Ronald S. Bultje