chen 
							
						 
					 
					
						
						
							
						
						a2f5cc54f8 
					 
					
						
						
							
							moe preprocess op support 160 experts and fused_moe triton kernel name add K ( #3121 )  
						
						
						
						
					 
					
						2025-08-01 10:46:20 +08:00 
						 
				 
			
				
					
						
							
							
								Yuan Xiaolan 
							
						 
					 
					
						
						
							
						
						5f56d289a7 
					 
					
						
						
							
							fix is_permuted ( #3098 )  
						
						... 
						
						
						
						Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com > 
						
						
					 
					
						2025-07-31 19:58:05 +08:00 
						 
				 
			
				
					
						
							
							
								RAM 
							
						 
					 
					
						
						
							
						
						d850660872 
					 
					
						
						
							
							[Executor] Refactor GetBlockShapeAndSplitKVBlock Kernel ( #2989 )  
						
						... 
						
						
						
						* reset decoder_block_shape_q buffer
* refactor GetBlockShapeAndSplitKVBlock Kernel and cudagraph padding batch
* update decode_max_tile_size
* fix pre-commit
* update block_multihead_attn_backend
* update flas attn backend
* update MLA Attention
* update XPU Attention
* update gcu,iluvatar model runner
* Update MTP
* fix MTP bug 
						
						
					 
					
						2025-07-31 00:09:31 +08:00 
						 
				 
			
				
					
						
							
							
								bukejiyu 
							
						 
					 
					
						
						
							
						
						db698bda01 
					 
					
						
						
							
							qwen loader ( #3057 )  
						
						
						
						
					 
					
						2025-07-30 19:09:38 +08:00 
						 
				 
			
				
					
						
							
							
								zhink 
							
						 
					 
					
						
						
							
						
						d89b6dd43f 
					 
					
						
						
							
							adapter qwen3 moe attr for init ( #3066 )  
						
						... 
						
						
						
						adapter qwen3 moe attr for init 
						
						
					 
					
						2025-07-30 16:49:28 +08:00 
						 
				 
			
				
					
						
							
							
								bukejiyu 
							
						 
					 
					
						
						
							
						
						8e203666d9 
					 
					
						
						
							
							w4a8 offline ( #3074 )  
						
						... 
						
						
						
						* w4a8 offline
* update
* update
* update 
						
						
					 
					
						2025-07-30 16:33:30 +08:00 
						 
				 
			
				
					
						
							
							
								Yuan Xiaolan 
							
						 
					 
					
						
						
							
						
						35935da9e5 
					 
					
						
						
							
							support W4A8 EPLB ( #3075 )  
						
						
						
						
					 
					
						2025-07-30 14:34:12 +08:00 
						 
				 
			
				
					
						
							
							
								Zero Rains 
							
						 
					 
					
						
						
							
						
						b2f9a42d87 
					 
					
						
						
							
							[Feature] Support repetition early stop ( #3024 )  
						
						... 
						
						
						
						* support repetition early stop and support user to set the parameter
* remove log
* fix codestyle
* add the early_stop_config to rollout_config
* update config and EarlyStopper class
* fix the bug for triton
* modify the stop method
* update description
* modify the usage for stop_flags
---------
Co-authored-by: Yuanle Liu <yuanlehome@163.com > 
						
						
					 
					
						2025-07-29 22:42:54 +08:00 
						 
				 
			
				
					
						
							
							
								Yuan Xiaolan 
							
						 
					 
					
						
						
							
						
						3214fb5393 
					 
					
						
						
							
							support model loading for w4a8 offline quant ( #3064 )  
						
						... 
						
						
						
						支持W4A8 EP 对离线量化权重的load 
						
						
					 
					
						2025-07-29 21:54:37 +08:00 
						 
				 
			
				
					
						
							
							
								Longzhi Wang 
							
						 
					 
					
						
						
							
						
						be0a0f2bb2 
					 
					
						
						
							
							fix arguement error in ep when pd ( #3060 )  
						
						
						
						
					 
					
						2025-07-29 17:17:24 +08:00 
						 
				 
			
				
					
						
							
							
								YuanRisheng 
							
						 
					 
					
						
						
							
						
						502ee92a0a 
					 
					
						
						
							
							Unify server-side and model-side Config (Part3)  ( #3047 )  
						
						... 
						
						
						
						* merge model config
* fix arch
* fix rl 
						
						
					 
					
						2025-07-29 17:07:44 +08:00 
						 
				 
			
				
					
						
							
							
								Longzhi Wang 
							
						 
					 
					
						
						
							
						
						907d561523 
					 
					
						
						
							
							fix ep when paddle version mismatch ( #3056 )  
						
						
						
						
					 
					
						2025-07-29 15:06:49 +08:00 
						 
				 
			
				
					
						
							
							
								JYChen 
							
						 
					 
					
						
						
							
						
						dafe02a7b9 
					 
					
						
						
							
							[stop sequence] support stop sequence ( #3025 )  
						
						... 
						
						
						
						* stop seqs in multi-ends
* unittest for gpu stop op
* kernel tid==0 
						
						
					 
					
						2025-07-29 14:17:37 +08:00 
						 
				 
			
				
					
						
							
							
								Yuan Xiaolan 
							
						 
					 
					
						
						
							
						
						b1d787a272 
					 
					
						
						
							
							[fix] w4a8 model loading and hadamard config ( #3013 )  
						
						
						
						
					 
					
						2025-07-28 18:17:59 +08:00 
						 
				 
			
				
					
						
							
							
								K11OntheBoat 
							
						 
					 
					
						
						
							
						
						83048bbe55 
					 
					
						
						
							
							[Feature] Deepseekv3 supports cudagraph ( #3041 )  
						
						... 
						
						
						
						Co-authored-by: K11OntheBoat <“ruianmaidanglao@163.com ”> 
						
						
					 
					
						2025-07-28 17:12:54 +08:00 
						 
				 
			
				
					
						
							
							
								AIbin 
							
						 
					 
					
						
						
							
						
						ec52d39e68 
					 
					
						
						
							
							【Inference Optimize】Update wint2 weight n-dim reorder ( #3042 )  
						
						
						
						
					 
					
						2025-07-28 16:31:56 +08:00 
						 
				 
			
				
					
						
							
							
								YuanRisheng 
							
						 
					 
					
						
						
							
						
						bddf403576 
					 
					
						
						
							
							Unify server-side and model-side Config (Part2) ( #3035 )  
						
						... 
						
						
						
						* merge speculative and graph opt conifg
* add attr 
						
						
					 
					
						2025-07-28 15:31:48 +08:00 
						 
				 
			
				
					
						
							
							
								Longzhi Wang 
							
						 
					 
					
						
						
							
						
						247010d298 
					 
					
						
						
							
							fix arguement error ( #3030 )  
						
						
						
						
					 
					
						2025-07-28 11:03:29 +08:00 
						 
				 
			
				
					
						
							
							
								YuanRisheng 
							
						 
					 
					
						
						
							
						
						6ccc10ad47 
					 
					
						
						
							
							Unify server-side and model-side Config (Part1) ( #3018 )  
						
						... 
						
						
						
						* move cache config
* fix mtp 
						
						
					 
					
						2025-07-28 10:51:52 +08:00 
						 
				 
			
				
					
						
							
							
								Longzhi Wang 
							
						 
					 
					
						
						
							
						
						0700c90caa 
					 
					
						
						
							
							[Feat] support mixed ep ( #2969 )  
						
						... 
						
						
	
		
			
	 
	
	
		
	
	
		
			
				
	Deploy GitHub Pages / deploy (push) Has been cancelled 
				
			 
		
		
	 
 
	 
						
						* Support mixed ep
* fix comment
* fix comment
* update mixep
* fix conflict
* fix typo
* update
* fix typo
* fix code style
* fix conflict 
						
						
					 
					
						2025-07-25 15:29:30 +08:00 
						 
				 
			
				
					
						
							
							
								chen 
							
						 
					 
					
						
						
							
						
						332154f504 
					 
					
						
						
							
							[feature] Support FA2 ( #3009 )  
						
						
						
						
					 
					
						2025-07-25 14:09:00 +08:00 
						 
				 
			
				
					
						
							
							
								xiaoxiaohehe001 
							
						 
					 
					
						
						
							
						
						2970b00dfa 
					 
					
						
						
							
							[Feature] Support_eplb ( #2997 )  
						
						... 
						
						
	
		
			
	 
	
	
		
	
	
		
			
				
	Deploy GitHub Pages / deploy (push) Has been cancelled 
				
			 
		
		
	 
 
	 
						
						* [Feature] support_eplb
* [Feature] support_eplb
* [Fix] fix mm ep 
						
						
					 
					
						2025-07-24 20:22:45 +08:00 
						 
				 
			
				
					
						
							
							
								littledgg 
							
						 
					 
					
						
						
							
						
						f37d00e856 
					 
					
						
						
							
							[Model] Provide clearer error for missing KV cache quantization scales ( #3007 )  
						
						
						
						
					 
					
						2025-07-24 20:15:00 +08:00 
						 
				 
			
				
					
						
							
							
								EnflameGCU 
							
						 
					 
					
						
						
							
						
						c40df1802e 
					 
					
						
						
							
							[GCU] Update to develop ( #2988 )  
						
						
						
						
					 
					
						2025-07-24 19:30:52 +08:00 
						 
				 
			
				
					
						
							
							
								Zero Rains 
							
						 
					 
					
						
						
							
						
						0fb37ab7e4 
					 
					
						
						
							
							update flake8 version to support pre-commit in python3.12 ( #3000 )  
						
						... 
						
						
						
						* update flake8 version to support pre-commit in python3.12
* polish code 
						
						
					 
					
						2025-07-24 01:43:31 -07:00 
						 
				 
			
				
					
						
							
							
								xiaoxiaohehe001 
							
						 
					 
					
						
						
							
						
						2c0ff068e2 
					 
					
						
						
							
							[Fix] fix mm ep empty run ( #2999 )  
						
						
						
						
					 
					
						2025-07-24 14:15:55 +08:00 
						 
				 
			
				
					
						
							
							
								lizhenyun01 
							
						 
					 
					
						
						
							
						
						29c3292f02 
					 
					
						
						
							
							support c4 attn && fix cache  
						
						
						
						
					 
					
						2025-07-24 12:00:52 +08:00 
						 
				 
			
				
					
						
							
							
								lizexu123 
							
						 
					 
					
						
						
							
						
						832d25334a 
					 
					
						
						
							
							[Code Simplification] fix init_distributed_environment() ( #2982 )  
						
						
						
						
					 
					
						2025-07-24 11:43:28 +08:00 
						 
				 
			
				
					
						
							
							
								bukejiyu 
							
						 
					 
					
						
						
							
						
						bfeb664ab8 
					 
					
						
						
							
							update ( #2978 )  
						
						
	
		
			
	 
	
	
		
	
	
		
			
				
	Deploy GitHub Pages / deploy (push) Has been cancelled 
				
			 
		
		
	 
 
	 
						
						
					 
					
						2025-07-24 00:16:42 +08:00 
						 
				 
			
				
					
						
							
							
								chenjian 
							
						 
					 
					
						
						
							
						
						85a78d695d 
					 
					
						
						
							
							[Feature] Support block scheduler v1 for FD ( #2928 )  
						
						... 
						
						
						
						* Support FD block scheduler v1
* Support FD block scheduler v1
* Support FD block scheduler v1
* Fix according to copilot review
* Fix according to review
* Remove is_dummy
* Fix bug when real_bsz=1
* Fix infer first token cost time
---------
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com > 
						
						
					 
					
						2025-07-23 20:31:31 +08:00 
						 
				 
			
				
					
						
							
							
								chen 
							
						 
					 
					
						
						
							
						
						172e69fe17 
					 
					
						
						
							
							FA3 fix bug ( #2987 )  
						
						
						
						
					 
					
						2025-07-23 19:07:43 +08:00 
						 
				 
			
				
					
						
							
							
								lizexu123 
							
						 
					 
					
						
						
							
						
						9b22b8d2c3 
					 
					
						
						
							
							delete max-len ( #2959 )  
						
						
						
						
					 
					
						2025-07-23 15:11:39 +08:00 
						 
				 
			
				
					
						
							
							
								chen 
							
						 
					 
					
						
						
							
						
						ad202272ed 
					 
					
						
						
							
							【Infer】Improve the performance block_wise_fp8 of triton_moe_backend ( #2942 )  
						
						
						
						
					 
					
						2025-07-23 13:02:50 +08:00 
						 
				 
			
				
					
						
							
							
								lizhenyun01 
							
						 
					 
					
						
						
							
						
						e51f018577 
					 
					
						
						
							
							support chunk_prefill in fa3  
						
						
						
						
					 
					
						2025-07-23 12:19:20 +08:00 
						 
				 
			
				
					
						
							
							
								Ryan 
							
						 
					 
					
						
						
							
						
						95b5af24db 
					 
					
						
						
							
							[SOT] Add sot warmup (NVIDIA GPU Only) ( #2929 )  
						
						... 
						
						
	
		
			
	 
	
	
		
	
	
		
			
				
	Deploy GitHub Pages / deploy (push) Has been cancelled 
				
			 
		
		
	 
 
	 
						
						* add sot warmup
* fix code style
* change batch_size list
* add param to config
* rm free_list settings && set sot_warmup_sizes
* finish debug with dynamic dims by type annotations
* add profile_run guard
* rm sth useless 
						
						
					 
					
						2025-07-22 21:36:14 +08:00 
						 
				 
			
				
					
						
							
							
								K11OntheBoat 
							
						 
					 
					
						
						
							
						
						93bb68aa71 
					 
					
						
						
							
							[Feature] Marlin MoE backend supports DeepseekV3 ( #2962 )  
						
						... 
						
						
						
						Co-authored-by: K11OntheBoat <“ruianmaidanglao@163.com ”> 
						
						
					 
					
						2025-07-22 18:11:15 +08:00 
						 
				 
			
				
					
						
							
							
								Nyakku Shigure 
							
						 
					 
					
						
						
							
						
						48e6a0ca26 
					 
					
						
						
							
							[SOT] Mark dynamic dims by type annotations ( #2771 )  
						
						... 
						
						
	
		
			
	 
	
	
		
	
	
		
			
				
	Deploy GitHub Pages / deploy (push) Has been cancelled 
				
			 
		
		
	 
 
	 
						
						* [SOT] Mark dynamic dims by type annotations
* fix conflict of forward_meta
* mark more attn backend
* fix missing annotated and add env SOT_SPECIALIZED_DIM_NUMBERS
* auto infer implicit 0 dim dynamic dim
* revert manual marked dims
* revert missing update
* auto infer can use unsafe code in warmup stage
* check -> type_match
* fix codestyle
* restore blank line
* empty commit
* add need_warmup nonlocal;
* add doc for resolver
* add missing type hints
* unquote "ForwardMeta" 
						
						
					 
					
						2025-07-22 00:23:52 -07:00 
						 
				 
			
				
					
						
							
							
								lifulll 
							
						 
					 
					
						
						
							
						
						2c6a9e887e 
					 
					
						
						
							
							native top_p_sampling ( #2901 )  
						
						
						
						
					 
					
						2025-07-22 14:09:59 +08:00 
						 
				 
			
				
					
						
							
							
								K11OntheBoat 
							
						 
					 
					
						
						
							
						
						8020927f50 
					 
					
						
						
							
							[BugFix] Rename attention params of deepseekv3 ( #2939 )  
						
						... 
						
						
						
						Co-authored-by: K11OntheBoat <“ruianmaidanglao@163.com ”> 
						
						
					 
					
						2025-07-22 14:01:30 +08:00 
						 
				 
			
				
					
						
							
							
								zhink 
							
						 
					 
					
						
						
							
						
						0262ef7eb3 
					 
					
						
						
							
							custom all reduce support cuda graph ( #2938 )  
						
						... 
						
						
	
		
			
	 
	
	
		
	
	
		
			
				
	Deploy GitHub Pages / deploy (push) Has been cancelled 
				
			 
		
		
	 
 
	 
						
						* Support enabling cuda graph and custom all reduce at the same time, and fix the overwritten custom all reduce flag
* rename communication_op to communication 
						
						
					 
					
						2025-07-21 22:52:03 +08:00 
						 
				 
			
				
					
						
							
							
								周周周 
							
						 
					 
					
						
						
							
						
						ff4569f135 
					 
					
						
						
							
							remove some code in ep.py ( #2947 )  
						
						
						
						
					 
					
						2025-07-21 22:44:57 +08:00 
						 
				 
			
				
					
						
							
							
								Yuanle Liu 
							
						 
					 
					
						
						
							
						
						2f74e93d7e 
					 
					
						
						
							
							use dist.all_reduce(min) to sync num_blocks_local ( #2933 )  
						
						... 
						
						
						
						* pre-commit all files check
* reduce min num_blocks_local
* fix nranks=1
* pre-commit when commit-msg 
						
						
					 
					
						2025-07-21 01:23:36 -07:00 
						 
				 
			
				
					
						
							
							
								lizexu123 
							
						 
					 
					
						
						
							
						
						67990e0572 
					 
					
						
						
							
							[Feature] support min_p_sampling ( #2872 )  
						
						... 
						
						
	
		
			
	 
	
	
		
	
	
		
			
				
	Deploy GitHub Pages / deploy (push) Has been cancelled 
				
			 
		
		
	 
 
	 
						
						* Fastdeploy support min_p
* add test_min_p
* fix
* min_p_sampling
* update
* delete vl_gpu_model_runner.py
* fix
* Align usage of min_p with vLLM
* fix
* modified unit test
* fix test_min_sampling
* pre-commit all files
* fix
* fix
* fix
* fix xpu_model_runner.py 
						
						
					 
					
						2025-07-20 23:17:59 -07:00 
						 
				 
			
				
					
						
							
							
								周周周 
							
						 
					 
					
						
						
							
						
						8c5407d9e4 
					 
					
						
						
							
							remove cum_offsets from ForwardMeta ( #2925 )  
						
						
	
		
			
	 
	
	
		
	
	
		
			
				
	Deploy GitHub Pages / deploy (push) Has been cancelled 
				
			 
		
		
	 
 
	 
						
						
					 
					
						2025-07-19 23:57:27 +08:00 
						 
				 
			
				
					
						
							
							
								Zero Rains 
							
						 
					 
					
						
						
							
						
						25698d56d1 
					 
					
						
						
							
							polish code with new pre-commit rule ( #2923 )  
						
						
						
						
					 
					
						2025-07-19 23:19:27 +08:00 
						 
				 
			
				
					
						
							
							
								ming1753 
							
						 
					 
					
						
						
							
						
						5328daa333 
					 
					
						
						
							
							[Bug Fix] fix ep config bug ( #2920 )  
						
						
						
						
					 
					
						2025-07-18 19:12:56 +08:00 
						 
				 
			
				
					
						
							
							
								xiaoxiaohehe001 
							
						 
					 
					
						
						
							
						
						a42fc3f40b 
					 
					
						
						
							
							[Feature] Support 45tVL EP FP8 Infer. ( #2909 )  
						
						... 
						
						
						
						* support_mm_ep_fp8
* support_mm_ep 
						
						
					 
					
						2025-07-18 17:57:15 +08:00 
						 
				 
			
				
					
						
							
							
								gaoziyuan 
							
						 
					 
					
						
						
							
						
						6efad14b95 
					 
					
						
						
							
							support vl ori_vacab_size ( #2900 )  
						
						
						
						
					 
					
						2025-07-18 16:26:14 +08:00 
						 
				 
			
				
					
						
							
							
								周周周 
							
						 
					 
					
						
						
							
						
						d306944f4f 
					 
					
						
						
							
							remove cum_offsets from get_block_shape_and_split_kv_block ( #2913 )  
						
						... 
						
						
						
						* remove padding_offsets from get_padding_offset.cu
* remove padding_offsets from get_padding_offset.cu
* remove padding_offsets from get_padding_offset.cu
* remove cum_offsets from get_block_shape_and_split_kv_block
* remove cum_offsets from get_block_shape_and_split_kv_block 
						
						
					 
					
						2025-07-18 16:13:32 +08:00 
						 
				 
			
				
					
						
							
							
								周周周 
							
						 
					 
					
						
						
							
						
						ddb10ac509 
					 
					
						
						
							
							[Inference, rename] remove padding_offsets from atten use batch_id_per_token ( #2880 )  
						
						... 
						
						
						
						* remove padding_offsets from atten 
						
						
					 
					
						2025-07-17 18:41:31 +08:00