From 17a27170bc82773878c0196682bd8a0778b061ce Mon Sep 17 00:00:00 2001 From: co63oc Date: Mon, 15 Sep 2025 18:33:30 +0800 Subject: [PATCH] fix typos (#4093) --- .../xpu_ops/test/test_block_attn_prefix_cache.py | 8 ++++---- custom_ops/xpu_ops/test/test_moe_ep_combine.py | 2 +- custom_ops/xpu_ops/test/test_moe_ep_dispatch.py | 2 +- custom_ops/xpu_ops/test/test_weight_only_linear.py | 2 +- docs/features/plas_attention.md | 4 ++-- docs/get_started/installation/iluvatar_gpu.md | 2 +- docs/zh/features/plas_attention.md | 4 ++-- fastdeploy/engine/sched/resource_manager_v1.py | 4 ++-- fastdeploy/plugins/model_runner/__init__.py | 2 +- fastdeploy/worker/worker_process.py | 2 +- tests/entrypoints/openai/test_max_streaming_tokens.py | 4 ++-- .../test_cuda_graph_dynamic_subgraph.py | 2 +- tests/graph_optimization/test_cuda_graph_recapture.py | 10 +++++----- .../graph_optimization/test_cuda_graph_spec_decode.py | 2 +- tests/model_loader/test_model_cache.py | 2 +- tests/model_loader/utils.py | 2 +- tests/output/test_get_save_output_v1.py | 2 +- 17 files changed, 28 insertions(+), 28 deletions(-) diff --git a/custom_ops/xpu_ops/test/test_block_attn_prefix_cache.py b/custom_ops/xpu_ops/test/test_block_attn_prefix_cache.py index 1a607e192..000c0a359 100644 --- a/custom_ops/xpu_ops/test/test_block_attn_prefix_cache.py +++ b/custom_ops/xpu_ops/test/test_block_attn_prefix_cache.py @@ -308,7 +308,7 @@ assert np.allclose( ), f"C16 prefix cache != No prefix cache,\n attn_out[hit_prefix_len:]: {attn_out_np},\nattn_out_prefix_cache: {attn_out_prefix_cache_np}" -print("\n-- C8 per channle prefix cache test --") +print("\n-- C8 per channel prefix cache test --") print( "attn_out_C8[hit_prefix_len:]'s mean:", attn_out_C8[hit_prefix_len:].mean().item(), @@ -318,9 +318,9 @@ attn_out_C8_prefix_cache_np = attn_out_C8_prefix_cache.astype("float32").numpy() attn_out_C8_np = attn_out_C8[hit_prefix_len:].astype("float32").numpy() assert np.allclose( attn_out_C8_prefix_cache_np, attn_out_C8_np, rtol=1e-1, atol=1e-2 -), f"C8 per channle prefix cache != No prefix cache,\n attn_out_C8[hit_prefix_len:]: {attn_out_C8_np},\nattn_out_C8_prefix_cache: {attn_out_C8_prefix_cache_np}" +), f"C8 per channel prefix cache != No prefix cache,\n attn_out_C8[hit_prefix_len:]: {attn_out_C8_np},\nattn_out_C8_prefix_cache: {attn_out_C8_prefix_cache_np}" -print("\n-- C8 per channle zp prefix cache test --") +print("\n-- C8 per channel zp prefix cache test --") print( "attn_out_C8_zp[hit_prefix_len:]'s mean:", attn_out_C8_zp[hit_prefix_len:].mean().item(), @@ -333,4 +333,4 @@ attn_out_C8_zp_prefix_cache_np = attn_out_C8_zp_prefix_cache.astype("float32").n attn_out_C8_zp_np = attn_out_C8_zp[hit_prefix_len:].astype("float32").numpy() assert np.allclose( attn_out_C8_zp_prefix_cache_np, attn_out_C8_zp_np, rtol=1e-1, atol=1e-2 -), f"C8 per channle zp prefix cache != No prefix cache,\n attn_out_C8_zp[hit_prefix_len:]: {attn_out_C8_zp_np},\nattn_out_C8_zp_prefix_cache: {attn_out_C8_zp_prefix_cache_np}" +), f"C8 per channel zp prefix cache != No prefix cache,\n attn_out_C8_zp[hit_prefix_len:]: {attn_out_C8_zp_np},\nattn_out_C8_zp_prefix_cache: {attn_out_C8_zp_prefix_cache_np}" diff --git a/custom_ops/xpu_ops/test/test_moe_ep_combine.py b/custom_ops/xpu_ops/test/test_moe_ep_combine.py index b71e05dae..535ecba67 100644 --- a/custom_ops/xpu_ops/test/test_moe_ep_combine.py +++ b/custom_ops/xpu_ops/test/test_moe_ep_combine.py @@ -80,7 +80,7 @@ combined_out_pd = ep_moe_expert_combine( moe_index_pd.shape[1], ) -# comparation +# comparison # print("moe_index:\n", moe_index) # print("moe_weights:\n", moe_weights) # print("combined_out_np:\n", combined_out_np) diff --git a/custom_ops/xpu_ops/test/test_moe_ep_dispatch.py b/custom_ops/xpu_ops/test/test_moe_ep_dispatch.py index 9b38bb34e..f6f67662d 100644 --- a/custom_ops/xpu_ops/test/test_moe_ep_dispatch.py +++ b/custom_ops/xpu_ops/test/test_moe_ep_dispatch.py @@ -117,7 +117,7 @@ print(f"cumsum_idx:\n{cumsum_idx}") "weight_only_int8", ) -# comparation +# comparison permute_input_xpu = permute_input_xpu.astype("float32").numpy() permute_indices_per_token_xpu = permute_indices_per_token_xpu.numpy() recv_num_tokens_per_expert_list_cumsum_xpu = recv_num_tokens_per_expert_list_cumsum_xpu.numpy() diff --git a/custom_ops/xpu_ops/test/test_weight_only_linear.py b/custom_ops/xpu_ops/test/test_weight_only_linear.py index fe3993e12..651f1305d 100644 --- a/custom_ops/xpu_ops/test/test_weight_only_linear.py +++ b/custom_ops/xpu_ops/test/test_weight_only_linear.py @@ -130,7 +130,7 @@ out_pd = weight_only_linear(x_pd, qw_pd, wscale_pd, None, weight_dtype, -1, -1) print(f"out_pd:\n{out_pd}") print(f"out_np:\n{out_np}") -# comparation +# comparison print(f"out_pd, mean={out_pd.mean()}, std={out_pd.std()}") print(f"out_np, mean={out_np.mean()}, std={out_np.std()}") sum_diff = np.sum(np.abs(out_pd.astype("float32").numpy() - out_np.astype("float32"))) diff --git a/docs/features/plas_attention.md b/docs/features/plas_attention.md index 8384de3b5..dfd85e676 100644 --- a/docs/features/plas_attention.md +++ b/docs/features/plas_attention.md @@ -32,7 +32,7 @@ During sparse attention computation, each query token may dynamically select dif To optimize performance in both the prefill and decode stages, we design a special joint strategy to adapt to their respective characteristics: -* **Prefill Toke Union**: We observe that adjacent query tokens tend to select similar key blocks. Leveraging this locality, we take the union of the key blocks selected by consecutive 128 query tokens and jointly compute sparse attention for these tokens. +* **Prefill Token Union**: We observe that adjacent query tokens tend to select similar key blocks. Leveraging this locality, we take the union of the key blocks selected by consecutive 128 query tokens and jointly compute sparse attention for these tokens. * **Decode Head Union**: Given the widespread adoption of GQA in modern models, we find that different heads within the same group often select overlapping key blocks. Thus, we combine the key blocks selected by all query heads within a group into a unified set and jointly calculate sparse attention. This way also reduces memory access overhead and further improves decoding efficiency. * **Top-K Selection**: Conventional top-k algorithms based on sorting or direct calls to the cub library introduce significant runtime overhead. To mitigate this, we implemented an approximate top-k selection algorithm using binary search, which significantly reduces latency while maintaining accuracy, ultimately achieving significantly improved performance. @@ -132,7 +132,7 @@ We selected a subset (longbook_sum_eng) from InfiniteBench as the performance ev QPS Decode Speed (token/s) Time to First token(s) - Time per Ouput Token(ms) + Time per Output Token(ms) End-to-End Latency(s) Mean Input
Length
Mean Output Length diff --git a/docs/get_started/installation/iluvatar_gpu.md b/docs/get_started/installation/iluvatar_gpu.md index 9b4c96f00..e5fd46d96 100644 --- a/docs/get_started/installation/iluvatar_gpu.md +++ b/docs/get_started/installation/iluvatar_gpu.md @@ -33,7 +33,7 @@ docker exec -it paddle_infer bash pip3 install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/ pip3 install paddle-iluvatar-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ ``` -For latest paddle verion on iluvatar. Refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/) +For latest paddle version on iluvatar. Refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/) ### Install or build FastDeploy ```bash diff --git a/docs/zh/features/plas_attention.md b/docs/zh/features/plas_attention.md index 09a98e6f4..f415f49b1 100644 --- a/docs/zh/features/plas_attention.md +++ b/docs/zh/features/plas_attention.md @@ -34,7 +34,7 @@ 为了优化预填充和解码阶段的性能,我们设计了一种特殊的联合策略来适应各自的特点: -* **Prefill Toke Union**: 我们观察到相邻的查询标记倾向于选择相似的关键块。利用这种局部性,我们取连续 128 个查询标记选择的关键块的并集,并联合计算这些标记的稀疏注意力机制。 +* **Prefill Token Union**: 我们观察到相邻的查询标记倾向于选择相似的关键块。利用这种局部性,我们取连续 128 个查询标记选择的关键块的并集,并联合计算这些标记的稀疏注意力机制。 * **Decode Head Union**: 鉴于 GQA 在现代模型中的广泛应用,我们发现同一组内的不同查询头经常选择重叠的关键块。因此,我们将同一组内所有查询头选择的关键块合并为一个统一的集合,并联合计算稀疏注意力机制。这种方式也减少了内存访问开销,并进一步提高了解码效率。 @@ -136,7 +136,7 @@ QPS Decode Speed (token/s) Time to First token(s) - Time per Ouput Token(ms) + Time per Output Token(ms) End-to-End Latency(s) Mean Input
Length
Mean Output Length diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py index a6c7f355d..f1a663d35 100644 --- a/fastdeploy/engine/sched/resource_manager_v1.py +++ b/fastdeploy/engine/sched/resource_manager_v1.py @@ -455,7 +455,7 @@ class ResourceManagerV1(ResourceManager): # schedule when extend block tables is needed for req in self.running: num_prefill_blocks = req.need_prefill_tokens // self.config.cache_config.block_size - # alocate + # allocate if req.use_extend_tables and req.request_id not in self.using_extend_tables_req_id: llm_logger.info( f"req {req.request_id} at batch id {req.idx} with num_prefill_blocks {num_prefill_blocks} is going to enable extend tables" @@ -488,7 +488,7 @@ class ResourceManagerV1(ResourceManager): <= self.config.cache_config.prealloc_dec_block_slot_num_threshold ): llm_logger.info( - f"req {req.request_id} is going to alocate more extend tables because allocated_slots {self.allocated_slots(req)} and prealloc_dec_block_slot_num_threshold {self.config.cache_config.prealloc_dec_block_slot_num_threshold} req.num_total_tokens {req.num_total_tokens}" + f"req {req.request_id} is going to allocate more extend tables because allocated_slots {self.allocated_slots(req)} and prealloc_dec_block_slot_num_threshold {self.config.cache_config.prealloc_dec_block_slot_num_threshold} req.num_total_tokens {req.num_total_tokens}" ) if self.cache_manager.can_allocate_gpu_blocks(self.config.cache_config.enc_dec_block_num): req.extend_block_tables.extend( diff --git a/fastdeploy/plugins/model_runner/__init__.py b/fastdeploy/plugins/model_runner/__init__.py index 19ce33ce8..6741e331a 100644 --- a/fastdeploy/plugins/model_runner/__init__.py +++ b/fastdeploy/plugins/model_runner/__init__.py @@ -16,7 +16,7 @@ from fastdeploy.plugins.utils import load_plugins_by_group -# use for modle runner +# use for model runner PLUGINS_GROUP = "fastdeploy.model_runner_plugins" diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index 3cf6fe928..6befe5fbd 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -576,7 +576,7 @@ def parse_args(): "--moba_attention_config", type=json.loads, default=None, - help="Configation of moba attention.", + help="Configuration of moba attention.", ) parser.add_argument( "--guided_decoding_backend", diff --git a/tests/entrypoints/openai/test_max_streaming_tokens.py b/tests/entrypoints/openai/test_max_streaming_tokens.py index fe48be8c4..0b474d332 100644 --- a/tests/entrypoints/openai/test_max_streaming_tokens.py +++ b/tests/entrypoints/openai/test_max_streaming_tokens.py @@ -181,7 +181,7 @@ class TestMaxStreamingResponseTokens(IsolatedAsyncioTestCase): chunk_dict = json.loads(json_part) parsed_chunks.append(chunk_dict) except json.JSONDecodeError as e: - self.fail(f"Cannot parser {i+1} chunck, JSON: {e}\n origin string: {repr(chunk_str)}") + self.fail(f"Cannot parser {i+1} chunk, JSON: {e}\n origin string: {repr(chunk_str)}") else: self.fail(f"{i+1} chunk is unexcepted 'data: JSON\\n\\n': {repr(chunk_str)}") for chunk_dict in parsed_chunks: @@ -260,7 +260,7 @@ class TestMaxStreamingResponseTokens(IsolatedAsyncioTestCase): chunk_dict = json.loads(json_part) parsed_chunks.append(chunk_dict) except json.JSONDecodeError as e: - self.fail(f"Cannot parser {i+1} chunck, JSON: {e}\n origin string: {repr(chunk_str)}") + self.fail(f"Cannot parser {i+1} chunk, JSON: {e}\n origin string: {repr(chunk_str)}") else: self.fail(f"{i+1} chunk is unexcepted 'data: JSON\\n\\n': {repr(chunk_str)}") self.assertEqual(len(parsed_chunks), 1) diff --git a/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py b/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py index b6e74753b..4143fcfd6 100644 --- a/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py +++ b/tests/graph_optimization/test_cuda_graph_dynamic_subgraph.py @@ -89,7 +89,7 @@ class TestCase1SubLayer3(paddle.nn.Layer): class TestModel1(paddle.nn.Layer): - """Tast Model""" + """Test Model""" def __init__(self, fd_config: FDConfig, **kwargs): super().__init__() diff --git a/tests/graph_optimization/test_cuda_graph_recapture.py b/tests/graph_optimization/test_cuda_graph_recapture.py index 126d231fe..6b9fb5de5 100644 --- a/tests/graph_optimization/test_cuda_graph_recapture.py +++ b/tests/graph_optimization/test_cuda_graph_recapture.py @@ -36,7 +36,7 @@ class TestCase1SubLayer1(paddle.nn.Layer): class TestModel1(paddle.nn.Layer): - """Tast Model""" + """Test Model""" def __init__(self, fd_config: FDConfig, **kwargs): super().__init__() @@ -123,9 +123,9 @@ class TestCUDAGrpahRecapture(unittest.TestCase): assert (output1 == self.output_correct).all() # Destroy - print_gpu_memory_use(0, "before destory") + print_gpu_memory_use(0, "before destroy") self.test_model1.clear_grpah_opt_backend() - print_gpu_memory_use(0, "after destory") + print_gpu_memory_use(0, "after destroy") def recapture_and_replay(self, input_tensor1, forward_meta1): """ """ @@ -139,9 +139,9 @@ class TestCUDAGrpahRecapture(unittest.TestCase): assert (output2 == self.output_correct).all() # Destroy - print_gpu_memory_use(0, "before destory") + print_gpu_memory_use(0, "before destroy") self.test_model1.clear_grpah_opt_backend() - print_gpu_memory_use(0, "after destory") + print_gpu_memory_use(0, "after destroy") if __name__ == "__main__": diff --git a/tests/graph_optimization/test_cuda_graph_spec_decode.py b/tests/graph_optimization/test_cuda_graph_spec_decode.py index f4a95cead..9162d7173 100644 --- a/tests/graph_optimization/test_cuda_graph_spec_decode.py +++ b/tests/graph_optimization/test_cuda_graph_spec_decode.py @@ -51,7 +51,7 @@ class TestCase1SubLayer1(paddle.nn.Layer): class TestModel1(paddle.nn.Layer): - """Tast Model""" + """Test Model""" def __init__(self, fd_config: FDConfig, **kwargs): super().__init__() diff --git a/tests/model_loader/test_model_cache.py b/tests/model_loader/test_model_cache.py index 342c901af..e48a136b6 100644 --- a/tests/model_loader/test_model_cache.py +++ b/tests/model_loader/test_model_cache.py @@ -123,6 +123,6 @@ def test_model_cache( check_tokens_id_and_text_close( outputs_0_lst=fd_outputs_v1, outputs_1_lst=fd_outputs_v1_with_cache, - name_0="default_v1 laoder", + name_0="default_v1 loader", name_1="default_v1 loader using cache", ) diff --git a/tests/model_loader/utils.py b/tests/model_loader/utils.py index e625f9065..3c405c3e7 100644 --- a/tests/model_loader/utils.py +++ b/tests/model_loader/utils.py @@ -100,7 +100,7 @@ def form_model_get_output_topp0( fd_outputs = fd_model.generate_topp0(prompts, max_tokens=max_tokens) result_queue.put(fd_outputs) except Exception: - print(f"Failed using {load_choices} laoder to load model from {model_path}.") + print(f"Failed using {load_choices} loader to load model from {model_path}.") traceback.print_exc() pytest.fail(f"Failed to initialize LLM model from {model_path}") diff --git a/tests/output/test_get_save_output_v1.py b/tests/output/test_get_save_output_v1.py index 29a47be46..ddfd944ee 100644 --- a/tests/output/test_get_save_output_v1.py +++ b/tests/output/test_get_save_output_v1.py @@ -28,7 +28,7 @@ FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333)) MAX_WAIT_SECONDS = 60 os.environ["LD_LIBRARY_PATH"] = "/usr/local/nccl/" -# enbale get_save_output_v1 +# enable get_save_output_v1 os.environ["FD_USE_GET_SAVE_OUTPUT_V1"] = "1"