mirror of
				https://github.com/nyanmisaka/ffmpeg-rockchip.git
				synced 2025-10-31 12:36:41 +08:00 
			
		
		
		
	Full-pixel MC functions.
Decoding time of ped1080p.webm drops from 11.3 sec to 11.1 sec.
This commit is contained in:
		| @@ -219,3 +219,60 @@ filter_v_fn avg | ||||
| INIT_XMM ssse3 | ||||
| filter_v_fn put | ||||
| filter_v_fn avg | ||||
|  | ||||
| ; fpel_fn <put|avg>, <width>, <off1>, <off2>, <off3>, <step> | ||||
| ; Emits a function named %1%2 that loops over h, moving four vectors per | ||||
| ; iteration from src to dst. The avg variant first averages each loaded | ||||
| ; vector with the bytes already at dst (pavgb) before storing. | ||||
| ;   %1      put or avg | ||||
| ;   %2      block width; selects the move instructions and register count | ||||
| ;   %3..%5  offsets of vectors 1..3, pasted after an "s" (source) or "d" | ||||
| ;           (destination) prefix, e.g. strideq -> sstrideq / dstrideq | ||||
| ;   %6      strides to advance src/dst per iteration, and amount taken off h | ||||
| %macro fpel_fn 6 | ||||
| ; Width 4 uses movh for both load and store; every other width loads with | ||||
| ; movu and stores with mova (presumably x86inc's unaligned/aligned moves, | ||||
| ; which would require dst to be aligned - TODO confirm). | ||||
| %if %2 == 4 | ||||
| %define %%srcfn movh | ||||
| %define %%dstfn movh | ||||
| %else | ||||
| %define %%srcfn movu | ||||
| %define %%dstfn mova | ||||
| %endif | ||||
|  | ||||
| ; Widths up to 16 request two extra registers holding 3*stride for src and | ||||
| ; dst; wider blocks only need the five argument registers. | ||||
| %if %2 <= 16 | ||||
| cglobal %1%2, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3 | ||||
|     lea  sstride3q, [sstrideq*3] | ||||
|     lea  dstride3q, [dstrideq*3] | ||||
| %else | ||||
| cglobal %1%2, 5, 5, 4, dst, dstride, src, sstride, h | ||||
| %endif | ||||
| .loop: | ||||
| ; Load four source vectors, then step src forward by %6 strides. | ||||
|     %%srcfn     m0, [srcq] | ||||
|     %%srcfn     m1, [srcq+s%3] | ||||
|     %%srcfn     m2, [srcq+s%4] | ||||
|     %%srcfn     m3, [srcq+s%5] | ||||
|     lea       srcq, [srcq+sstrideq*%6] | ||||
| ; avg only: blend with the current destination contents. | ||||
| %ifidn %1, avg | ||||
|     pavgb       m0, [dstq] | ||||
|     pavgb       m1, [dstq+d%3] | ||||
|     pavgb       m2, [dstq+d%4] | ||||
|     pavgb       m3, [dstq+d%5] | ||||
| %endif | ||||
| ; Store the four vectors, then step dst forward by %6 strides. | ||||
|     %%dstfn [dstq], m0 | ||||
|     %%dstfn [dstq+d%3], m1 | ||||
|     %%dstfn [dstq+d%4], m2 | ||||
|     %%dstfn [dstq+d%5], m3 | ||||
|     lea       dstq, [dstq+dstrideq*%6] | ||||
| ; The loop exits only when h reaches exactly zero, so h is assumed to be a | ||||
| ; positive multiple of %6 - TODO confirm against callers. | ||||
|     sub         hd, %6 | ||||
|     jnz .loop | ||||
|     RET | ||||
| %endmacro | ||||
|  | ||||
| ; Instantiate put/avg for widths 4..64. The last argument is the number of | ||||
| ; strides consumed per loop iteration: 4 for widths 4-16 (offsets 0, stride, | ||||
| ; 2*stride, 3*stride), 2 for width 32 and 1 for width 64, where the mmsize | ||||
| ; offsets select further vectors within the same row. | ||||
| ; d16/s16 are temporary defines, presumably so that d%3/s%3 resolve to a | ||||
| ; plain 16 when the offset argument is mmsize (expected to expand to 16 | ||||
| ; under INIT_XMM) - TODO confirm. | ||||
| %define d16 16 | ||||
| %define s16 16 | ||||
| INIT_MMX mmx | ||||
| fpel_fn put, 4,  strideq, strideq*2, stride3q, 4 | ||||
| fpel_fn put, 8,  strideq, strideq*2, stride3q, 4 | ||||
| ; NOTE(review): avg on MMX registers is built under "sse", presumably | ||||
| ; because pavgb on MMX registers is not part of baseline MMX - confirm. | ||||
| INIT_MMX sse | ||||
| fpel_fn avg, 4,  strideq, strideq*2, stride3q, 4 | ||||
| fpel_fn avg, 8,  strideq, strideq*2, stride3q, 4 | ||||
| INIT_XMM sse | ||||
| fpel_fn put, 16, strideq, strideq*2, stride3q, 4 | ||||
| fpel_fn put, 32, mmsize,  strideq,   strideq+mmsize, 2 | ||||
| fpel_fn put, 64, mmsize,  mmsize*2,  mmsize*3, 1 | ||||
| ; NOTE(review): avg on XMM registers is built under "sse2", presumably | ||||
| ; because pavgb on XMM registers needs SSE2 - confirm. | ||||
| INIT_XMM sse2 | ||||
| fpel_fn avg, 16, strideq, strideq*2, stride3q, 4 | ||||
| fpel_fn avg, 32, mmsize,  strideq,   strideq+mmsize, 2 | ||||
| fpel_fn avg, 64, mmsize,  mmsize*2,  mmsize*3, 1 | ||||
| %undef s16 | ||||
| %undef d16 | ||||
|   | ||||
| @@ -27,6 +27,22 @@ | ||||
|  | ||||
| #if HAVE_YASM | ||||
| /* Prototypes for the full-pixel put/avg functions: fpel_func(avg, sz, opt) declares ff_<avg><sz>_<opt>(dst, dst_stride, src, src_stride, h, mx, my). NOTE(review): mx and my are presumably unused for full-pixel copies; the assembly shown in this commit only names five arguments - confirm. */ | ||||
| #define fpel_func(avg, sz, opt) \ | ||||
| void ff_##avg##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ | ||||
|                           const uint8_t *src, ptrdiff_t src_stride, \ | ||||
|                           int h, int mx, int my) | ||||
| fpel_func(put,  4, mmx); | ||||
| fpel_func(put,  8, mmx); | ||||
| fpel_func(put, 16, sse); | ||||
| fpel_func(put, 32, sse); | ||||
| fpel_func(put, 64, sse); | ||||
| fpel_func(avg,  4, sse); | ||||
| fpel_func(avg,  8, sse); | ||||
| fpel_func(avg, 16, sse2); | ||||
| fpel_func(avg, 32, sse2); | ||||
| fpel_func(avg, 64, sse2); | ||||
| #undef fpel_func | ||||
|  | ||||
| #define mc_func(avg, sz, dir, opt) \ | ||||
| void ff_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ | ||||
|                                              const uint8_t *src, ptrdiff_t src_stride, \ | ||||
| @@ -141,6 +157,13 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp) | ||||
| #if HAVE_YASM | ||||
|     int cpu_flags = av_get_cpu_flags(); | ||||
| /* init_fpel(idx1, idx2, sz, type, opt): store ff_<type><sz>_<opt> in the [0][0] slot of dsp->mc[idx1][filter][idx2] for all four filter types (smooth, regular, sharp, bilinear), so the same function is installed whichever filter is selected. Presumably [0][0] is the entry with no sub-pixel offset - TODO confirm. */ | ||||
| #define init_fpel(idx1, idx2, sz, type, opt) \ | ||||
|     dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \ | ||||
|     dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \ | ||||
|     dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \ | ||||
|     dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_##type##sz##_##opt | ||||
|  | ||||
|  | ||||
| #define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, opt) \ | ||||
|     dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = type##_8tap_smooth_##sz##dir##_##opt; \ | ||||
|     dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_##opt; \ | ||||
| @@ -158,11 +181,31 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp) | ||||
|     init_subpel2(idx, 0, 1, v, type, opt); \ | ||||
|     init_subpel2(idx, 1, 0, h, type, opt) | ||||
| /* Install functions according to the detected CPU flags. In the init_fpel calls below, idx1 4/3/2/1/0 is paired with widths 4/8/16/32/64 and idx2 0/1 with put/avg. MMX: 4- and 8-wide put. */ | ||||
|     if (cpu_flags & AV_CPU_FLAG_MMX) { | ||||
|         init_fpel(4, 0,  4, put, mmx); | ||||
|         init_fpel(3, 0,  8, put, mmx); | ||||
|     } | ||||
| /* SSE: 16/32/64-wide put and 4/8-wide avg. */ | ||||
|     if (cpu_flags & AV_CPU_FLAG_SSE) { | ||||
|         init_fpel(2, 0, 16, put, sse); | ||||
|         init_fpel(1, 0, 32, put, sse); | ||||
|         init_fpel(0, 0, 64, put, sse); | ||||
|         init_fpel(4, 1,  4, avg, sse); | ||||
|         init_fpel(3, 1,  8, avg, sse); | ||||
|     } | ||||
| /* SSE2: 16/32/64-wide avg. */ | ||||
|     if (cpu_flags & AV_CPU_FLAG_SSE2) { | ||||
|         init_fpel(2, 1, 16, avg, sse2); | ||||
|         init_fpel(1, 1, 32, avg, sse2); | ||||
|         init_fpel(0, 1, 64, avg, sse2); | ||||
|     } | ||||
| /* SSSE3: put (index 0) and avg (index 1) entries set up through init_subpel3, whose full definition is outside this excerpt. */ | ||||
|     if (cpu_flags & AV_CPU_FLAG_SSSE3) { | ||||
|         init_subpel3(0, put, ssse3); | ||||
|         init_subpel3(1, avg, ssse3); | ||||
|     } | ||||
| /* Drop the helper macros once the table has been filled. */ | ||||
| #undef init_fpel | ||||
| #undef init_subpel1 | ||||
| #undef init_subpel2 | ||||
| #undef init_subpel3 | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Ronald S. Bultje
					Ronald S. Bultje