mirror of
				https://github.com/nyanmisaka/ffmpeg-rockchip.git
				synced 2025-11-01 04:53:04 +08:00 
			
		
		
		
	aarch64: h264qpel: Do vertical filtering without transposing
This gives rather big speedups on these functions:

Before:
put_h264_qpel_8_mc01_8_neon: 241.0 131.5 138.7
put_h264_qpel_8_mc02_8_neon: 214.7 121.2 127.5
put_h264_qpel_8_mc03_8_neon: 242.5 131.2 135.7
put_h264_qpel_8_mc11_8_neon: 421.2 218.7 251.0
put_h264_qpel_8_mc12_8_neon: 878.0 509.5 537.5
put_h264_qpel_8_mc13_8_neon: 423.7 217.0 252.0
put_h264_qpel_8_mc21_8_neon: 858.2 479.5 514.0
put_h264_qpel_8_mc22_8_neon: 649.7 385.2 403.0
put_h264_qpel_8_mc23_8_neon: 860.2 476.5 517.7
put_h264_qpel_8_mc31_8_neon: 437.2 219.5 252.5
put_h264_qpel_8_mc32_8_neon: 892.5 510.5 546.0
put_h264_qpel_8_mc33_8_neon: 438.2 218.5 257.0
put_h264_qpel_16_mc01_8_neon: 944.2 509.7 546.7
put_h264_qpel_16_mc02_8_neon: 878.7 469.5 509.7
put_h264_qpel_16_mc03_8_neon: 945.7 510.7 557.0
put_h264_qpel_16_mc11_8_neon: 1663.2 858.5 979.5
put_h264_qpel_16_mc12_8_neon: 3510.2 2027.7 2112.7
put_h264_qpel_16_mc13_8_neon: 1664.7 857.5 980.5
put_h264_qpel_16_mc21_8_neon: 3366.2 1928.5 2030.5
put_h264_qpel_16_mc22_8_neon: 2584.7 1514.7 1590.2
put_h264_qpel_16_mc23_8_neon: 3367.7 1927.7 2035.0
put_h264_qpel_16_mc31_8_neon: 1716.7 849.7 997.0
put_h264_qpel_16_mc32_8_neon: 3564.0 2044.2 3835.2
put_h264_qpel_16_mc33_8_neon: 1717.7 863.0 989.5

After:
put_h264_qpel_8_mc01_8_neon: 136.0 73.7 76.0
put_h264_qpel_8_mc02_8_neon: 108.7 65.0 64.0
put_h264_qpel_8_mc03_8_neon: 137.5 72.7 73.0
put_h264_qpel_8_mc11_8_neon: 316.2 159.0 188.5
put_h264_qpel_8_mc12_8_neon: 653.0 375.5 384.7
put_h264_qpel_8_mc13_8_neon: 318.7 165.5 189.5
put_h264_qpel_8_mc21_8_neon: 739.2 385.7 432.5
put_h264_qpel_8_mc22_8_neon: 530.7 295.5 309.5
put_h264_qpel_8_mc23_8_neon: 741.2 393.7 421.0
put_h264_qpel_8_mc31_8_neon: 332.2 162.5 190.0
put_h264_qpel_8_mc32_8_neon: 667.5 378.2 390.5
put_h264_qpel_8_mc33_8_neon: 332.7 166.5 195.5
put_h264_qpel_16_mc01_8_neon: 524.2 285.2 294.0
put_h264_qpel_16_mc02_8_neon: 454.7 252.2 250.2
put_h264_qpel_16_mc03_8_neon: 525.7 286.0 283.0
put_h264_qpel_16_mc11_8_neon: 1243.2 630.7 726.7
put_h264_qpel_16_mc12_8_neon: 2610.2 1479.7 1481.2
put_h264_qpel_16_mc13_8_neon: 1250.5 631.7 727.7
put_h264_qpel_16_mc21_8_neon: 2890.2 1571.2 1679.7
put_h264_qpel_16_mc22_8_neon: 2108.7 1177.5 1223.5
put_h264_qpel_16_mc23_8_neon: 2891.7 1578.7 1667.7
put_h264_qpel_16_mc31_8_neon: 1296.7 630.5 752.5
put_h264_qpel_16_mc32_8_neon: 2664.0 1483.2 1503.5
put_h264_qpel_16_mc33_8_neon: 1297.7 632.5 747.2

I.e. overall a 20%-60% reduction in runtime of these functions.

Signed-off-by: Martin Storsjö <martin@martin.st>
This commit is contained in:
		| @@ -58,6 +58,24 @@ | |||||||
|   .endif |   .endif | ||||||
| .endm | .endm | ||||||
|  |  | ||||||
|  | //trashes v0-v4 | ||||||
|  | .macro  lowpass_8_v     r0,  r1,  r2,  r3,  r4,  r5,  r6,  d0,  d1,  narrow=1 | ||||||
|  |         uaddl           v2.8H,      \r2\().8B, \r3\().8B | ||||||
|  |         uaddl           v0.8H,      \r3\().8B, \r4\().8B | ||||||
|  |         uaddl           v4.8H,      \r1\().8B, \r4\().8B | ||||||
|  |         uaddl           v1.8H,      \r2\().8B, \r5\().8B | ||||||
|  |         uaddl           \d0\().8H,  \r0\().8B, \r5\().8B | ||||||
|  |         uaddl           \d1\().8H,  \r1\().8B, \r6\().8B | ||||||
|  |         mla             \d0\().8H,  v2.8H,     v6.H[1] | ||||||
|  |         mls             \d0\().8H,  v4.8H,     v6.H[0] | ||||||
|  |         mla             \d1\().8H,  v0.8H,     v6.H[1] | ||||||
|  |         mls             \d1\().8H,  v1.8H,     v6.H[0] | ||||||
|  |   .if \narrow | ||||||
|  |         sqrshrun        \d0\().8B,  \d0\().8H, #5 | ||||||
|  |         sqrshrun        \d1\().8B,  \d1\().8H, #5 | ||||||
|  |   .endif | ||||||
|  | .endm | ||||||
|  |  | ||||||
| //trashes v0-v5, v7, v30-v31 | //trashes v0-v5, v7, v30-v31 | ||||||
| .macro  lowpass_8H      r0,  r1 | .macro  lowpass_8H      r0,  r1 | ||||||
|         ext             v0.16B,     \r0\().16B, \r0\().16B, #2 |         ext             v0.16B,     \r0\().16B, \r0\().16B, #2 | ||||||
| @@ -100,18 +118,13 @@ | |||||||
| .endm | .endm | ||||||
|  |  | ||||||
| // trashed v0-v7 | // trashed v0-v7 | ||||||
| .macro  lowpass_8.16    r0,  r1,  r2 | .macro  lowpass_8.16    r0,  r1,  r2,  r3,  r4,  r5 | ||||||
|         ext             v1.16B,     \r0\().16B, \r1\().16B, #4 |         saddl           v5.4S,      \r2\().4H,  \r3\().4H | ||||||
|         ext             v0.16B,     \r0\().16B, \r1\().16B, #6 |         saddl2          v1.4S,      \r2\().8H,  \r3\().8H | ||||||
|         saddl           v5.4S,      v1.4H,      v0.4H |         saddl           v6.4S,      \r1\().4H,  \r4\().4H | ||||||
|         ext             v2.16B,     \r0\().16B, \r1\().16B, #2 |         saddl2          v2.4S,      \r1\().8H,  \r4\().8H | ||||||
|         saddl2          v1.4S,      v1.8H,      v0.8H |         saddl           v0.4S,      \r0\().4H,  \r5\().4H | ||||||
|         ext             v3.16B,     \r0\().16B, \r1\().16B, #8 |         saddl2          v4.4S,      \r0\().8H,  \r5\().8H | ||||||
|         saddl           v6.4S,      v2.4H,      v3.4H |  | ||||||
|         ext             \r1\().16B, \r0\().16B, \r1\().16B, #10 |  | ||||||
|         saddl2          v2.4S,      v2.8H,      v3.8H |  | ||||||
|         saddl           v0.4S,      \r0\().4H,  \r1\().4H |  | ||||||
|         saddl2          v4.4S,      \r0\().8H,  \r1\().8H |  | ||||||
|  |  | ||||||
|         shl             v3.4S,  v5.4S,  #4 |         shl             v3.4S,  v5.4S,  #4 | ||||||
|         shl             v5.4S,  v5.4S,  #2 |         shl             v5.4S,  v5.4S,  #2 | ||||||
| @@ -134,7 +147,7 @@ | |||||||
|         rshrn           v5.4H,  v5.4S,  #10 |         rshrn           v5.4H,  v5.4S,  #10 | ||||||
|         rshrn2          v5.8H,  v1.4S,  #10 |         rshrn2          v5.8H,  v1.4S,  #10 | ||||||
|  |  | ||||||
|         sqxtun          \r2\().8B,  v5.8H |         sqxtun          \r0\().8B,  v5.8H | ||||||
| .endm | .endm | ||||||
|  |  | ||||||
| function put_h264_qpel16_h_lowpass_neon_packed | function put_h264_qpel16_h_lowpass_neon_packed | ||||||
| @@ -258,27 +271,23 @@ endfunc | |||||||
|  |  | ||||||
| function \type\()_h264_qpel8_v_lowpass_neon | function \type\()_h264_qpel8_v_lowpass_neon | ||||||
|         ld1             {v16.8B}, [x1], x3 |         ld1             {v16.8B}, [x1], x3 | ||||||
|         ld1             {v18.8B}, [x1], x3 |  | ||||||
|         ld1             {v20.8B}, [x1], x3 |  | ||||||
|         ld1             {v22.8B}, [x1], x3 |  | ||||||
|         ld1             {v24.8B}, [x1], x3 |  | ||||||
|         ld1             {v26.8B}, [x1], x3 |  | ||||||
|         ld1             {v28.8B}, [x1], x3 |  | ||||||
|         ld1             {v30.8B}, [x1], x3 |  | ||||||
|         ld1             {v17.8B}, [x1], x3 |         ld1             {v17.8B}, [x1], x3 | ||||||
|  |         ld1             {v18.8B}, [x1], x3 | ||||||
|         ld1             {v19.8B}, [x1], x3 |         ld1             {v19.8B}, [x1], x3 | ||||||
|  |         ld1             {v20.8B}, [x1], x3 | ||||||
|         ld1             {v21.8B}, [x1], x3 |         ld1             {v21.8B}, [x1], x3 | ||||||
|  |         ld1             {v22.8B}, [x1], x3 | ||||||
|         ld1             {v23.8B}, [x1], x3 |         ld1             {v23.8B}, [x1], x3 | ||||||
|         ld1             {v25.8B}, [x1] |         ld1             {v24.8B}, [x1], x3 | ||||||
|  |         ld1             {v25.8B}, [x1], x3 | ||||||
|         transpose_8x8B  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1 |         ld1             {v26.8B}, [x1], x3 | ||||||
|         transpose_8x8B  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1 |         ld1             {v27.8B}, [x1], x3 | ||||||
|         lowpass_8       v16, v17, v18, v19, v16, v17 |         ld1             {v28.8B}, [x1] | ||||||
|         lowpass_8       v20, v21, v22, v23, v18, v19 |  | ||||||
|         lowpass_8       v24, v25, v26, v27, v20, v21 |  | ||||||
|         lowpass_8       v28, v29, v30, v31, v22, v23 |  | ||||||
|         transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1 |  | ||||||
|  |  | ||||||
|  |         lowpass_8_v     v16, v17, v18, v19, v20, v21, v22, v16, v17 | ||||||
|  |         lowpass_8_v     v18, v19, v20, v21, v22, v23, v24, v18, v19 | ||||||
|  |         lowpass_8_v     v20, v21, v22, v23, v24, v25, v26, v20, v21 | ||||||
|  |         lowpass_8_v     v22, v23, v24, v25, v26, v27, v28, v22, v23 | ||||||
|   .ifc \type,avg |   .ifc \type,avg | ||||||
|         ld1             {v24.8B},  [x0], x2 |         ld1             {v24.8B},  [x0], x2 | ||||||
|         ld1             {v25.8B}, [x0], x2 |         ld1             {v25.8B}, [x0], x2 | ||||||
| @@ -335,26 +344,23 @@ endfunc | |||||||
|  |  | ||||||
| function \type\()_h264_qpel8_v_lowpass_l2_neon | function \type\()_h264_qpel8_v_lowpass_l2_neon | ||||||
|         ld1             {v16.8B}, [x1], x3 |         ld1             {v16.8B}, [x1], x3 | ||||||
|         ld1             {v18.8B}, [x1], x3 |  | ||||||
|         ld1             {v20.8B}, [x1], x3 |  | ||||||
|         ld1             {v22.8B}, [x1], x3 |  | ||||||
|         ld1             {v24.8B}, [x1], x3 |  | ||||||
|         ld1             {v26.8B}, [x1], x3 |  | ||||||
|         ld1             {v28.8B}, [x1], x3 |  | ||||||
|         ld1             {v30.8B}, [x1], x3 |  | ||||||
|         ld1             {v17.8B}, [x1], x3 |         ld1             {v17.8B}, [x1], x3 | ||||||
|  |         ld1             {v18.8B}, [x1], x3 | ||||||
|         ld1             {v19.8B}, [x1], x3 |         ld1             {v19.8B}, [x1], x3 | ||||||
|  |         ld1             {v20.8B}, [x1], x3 | ||||||
|         ld1             {v21.8B}, [x1], x3 |         ld1             {v21.8B}, [x1], x3 | ||||||
|  |         ld1             {v22.8B}, [x1], x3 | ||||||
|         ld1             {v23.8B}, [x1], x3 |         ld1             {v23.8B}, [x1], x3 | ||||||
|         ld1             {v25.8B}, [x1] |         ld1             {v24.8B}, [x1], x3 | ||||||
|  |         ld1             {v25.8B}, [x1], x3 | ||||||
|  |         ld1             {v26.8B}, [x1], x3 | ||||||
|  |         ld1             {v27.8B}, [x1], x3 | ||||||
|  |         ld1             {v28.8B}, [x1] | ||||||
|  |  | ||||||
|         transpose_8x8B  v16, v18, v20, v22, v24, v26, v28, v30, v0,  v1 |         lowpass_8_v     v16, v17, v18, v19, v20, v21, v22, v16, v17 | ||||||
|         transpose_8x8B  v17, v19, v21, v23, v25, v27, v29, v31, v0,  v1 |         lowpass_8_v     v18, v19, v20, v21, v22, v23, v24, v18, v19 | ||||||
|         lowpass_8       v16, v17, v18, v19, v16, v17 |         lowpass_8_v     v20, v21, v22, v23, v24, v25, v26, v20, v21 | ||||||
|         lowpass_8       v20, v21, v22, v23, v18, v19 |         lowpass_8_v     v22, v23, v24, v25, v26, v27, v28, v22, v23 | ||||||
|         lowpass_8       v24, v25, v26, v27, v20, v21 |  | ||||||
|         lowpass_8       v28, v29, v30, v31, v22, v23 |  | ||||||
|         transpose_8x8B  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1 |  | ||||||
|  |  | ||||||
|         ld1             {v24.8B},  [x12], x2 |         ld1             {v24.8B},  [x12], x2 | ||||||
|         ld1             {v25.8B},  [x12], x2 |         ld1             {v25.8B},  [x12], x2 | ||||||
| @@ -432,22 +438,17 @@ function put_h264_qpel8_hv_lowpass_neon_top | |||||||
|         lowpass_8H      v26, v27 |         lowpass_8H      v26, v27 | ||||||
|         lowpass_8H      v28, v29 |         lowpass_8H      v28, v29 | ||||||
|  |  | ||||||
|         transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1 |         lowpass_8.16    v16, v17, v18, v19, v20, v21 | ||||||
|         transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v0,  v1 |         lowpass_8.16    v17, v18, v19, v20, v21, v22 | ||||||
|  |  | ||||||
|         lowpass_8.16    v16, v24, v16 |         lowpass_8.16    v18, v19, v20, v21, v22, v23 | ||||||
|         lowpass_8.16    v17, v25, v17 |         lowpass_8.16    v19, v20, v21, v22, v23, v24 | ||||||
|  |  | ||||||
|         lowpass_8.16    v18, v26, v18 |         lowpass_8.16    v20, v21, v22, v23, v24, v25 | ||||||
|         lowpass_8.16    v19, v27, v19 |         lowpass_8.16    v21, v22, v23, v24, v25, v26 | ||||||
|  |  | ||||||
|         lowpass_8.16    v20, v28, v20 |         lowpass_8.16    v22, v23, v24, v25, v26, v27 | ||||||
|         lowpass_8.16    v21, v29, v21 |         lowpass_8.16    v23, v24, v25, v26, v27, v28 | ||||||
|  |  | ||||||
|         lowpass_8.16    v22, v30, v22 |  | ||||||
|         lowpass_8.16    v23, v31, v23 |  | ||||||
|  |  | ||||||
|         transpose_8x8B v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1 |  | ||||||
|  |  | ||||||
|         ret |         ret | ||||||
| endfunc | endfunc | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Martin Storsjö
					Martin Storsjö