diff --git a/benchmarks/benchmark_mtp.py b/benchmarks/benchmark_mtp.py
index 2698a553b..a28cc7b12 100644
--- a/benchmarks/benchmark_mtp.py
+++ b/benchmarks/benchmark_mtp.py
@@ -98,7 +98,7 @@ def main(args):
         raise ValueError("--max_concurrency should be same length as --s_itl_base_model")

     for max_concurrency, s_itl in zip(args.max_concurrency, args.s_itl_base_model):
-        # Wramup
+        # Warmup
         print("Starting warmup...")
         with open(os.devnull, "w") as f:
             with contextlib.redirect_stdout(f):
diff --git a/custom_ops/gpu_ops/custom_all_reduce/all_reduce.cuh b/custom_ops/gpu_ops/custom_all_reduce/all_reduce.cuh
index 2dd52871a..341dbf5b5 100644
--- a/custom_ops/gpu_ops/custom_all_reduce/all_reduce.cuh
+++ b/custom_ops/gpu_ops/custom_all_reduce/all_reduce.cuh
@@ -303,7 +303,7 @@ class CustomAllreduce {
   bool full_nvlink_;

   RankSignals sg_;
-  // Stores an map from a pointer to its peer pointters from all ranks.
+  // Stores a map from a pointer to its peer pointers from all ranks.
   std::unordered_map<void*, RankData*> buffers_;
   Signal* self_sg_;

diff --git a/docs/get_started/installation/nvidia_gpu.md b/docs/get_started/installation/nvidia_gpu.md
index 706162151..5eeb39121 100644
--- a/docs/get_started/installation/nvidia_gpu.md
+++ b/docs/get_started/installation/nvidia_gpu.md
@@ -10,7 +10,7 @@ The following installation methods are available when your environment meets the

 ## 1. Pre-built Docker Installation (Recommended)

-**Notice**: The pre-built image only supports SM80/90 GPU(e.g. H800/A800),if you are deploying on SM86/89GPU(L40/4090/L20), please reinstall ```fastdpeloy-gpu``` after you create the container.
+**Notice**: The pre-built image only supports SM80/90 GPUs (e.g. H800/A800). If you are deploying on SM86/89 GPUs (L40/4090/L20), please reinstall ```fastdeploy-gpu``` after you create the container.

 ```shell
 docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12.6:2.2.0
diff --git a/docs/usage/code_overview.md b/docs/usage/code_overview.md
index 506a51680..8008fc677 100644
--- a/docs/usage/code_overview.md
+++ b/docs/usage/code_overview.md
@@ -20,6 +20,6 @@ Below is an overview of the FastDeploy code structure and functionality organize
   - ```platforms```: Platform-specific modules for underlying hardware support.
   - ```scheduler```: Request scheduling module for large models.
   - ```metrics```: Core component for collecting, managing, and exporting Prometheus metrics, tracking key runtime performance data (e.g., request latency, resource utilization, successful request counts).
-  - ```splitwise```: Modules related to PD disaggragation deployment.
+  - ```splitwise```: Modules related to PD disaggregation deployment.
 - ```scripts```/```tools```: Utility scripts for FastDeploy operations (e.g., compilation, unit testing, code style fixes).
 - ```test```: Code for unit testing and validation.
diff --git a/docs/usage/log.md b/docs/usage/log.md
index 60e658a5b..6f7312365 100644
--- a/docs/usage/log.md
+++ b/docs/usage/log.md
@@ -30,7 +30,7 @@ By default, logs are stored in the `log` directory under the execution path. To
 * `cache_transfer_manager.log` : Logs startup parameters and received request information.
 * `launch_cache_manager.log` : Records cache transfer startup parameters and error messages.

-## PD Disaggragation Logs
+## PD Disaggregation Logs

 * `cache_messager.log` : Logs transmission protocols and messages used by the P instance.
 * `splitwise_connector.log` : Records data received from P/D instances and connection establishment details.
diff --git a/mkdocs.yml b/mkdocs.yml
index e6356be74..714a6dc32 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -1,4 +1,4 @@
-site_name: 'FastDeploy : Large Language Model Deployement'
+site_name: 'FastDeploy : Large Language Model Deployment'
 repo_url: https://github.com/PaddlePaddle/FastDeploy
 repo_name: FastDeploy

@@ -36,7 +36,7 @@ plugins:
         - locale: en
           default: true
           name: English
-          site_name: 'FastDeploy: Large Language Model Deployement'
+          site_name: 'FastDeploy: Large Language Model Deployment'
           build: true
           link: /FastDeploy/
         - locale: zh
@@ -59,7 +59,7 @@ plugins:
             ERNIE-4.5-VL-424B-A47B: ERNIE-4.5-VL-424B-A47B快速部署
             Quick Deployment For QWEN: Qwen3-0.6b快速部署
             Online Serving: 在线服务
-            OpenAI-Compitable API Server: 兼容 OpenAI 协议的服务化部署
+            OpenAI-Compatible API Server: 兼容 OpenAI 协议的服务化部署
             Monitor Metrics: 监控Metrics
             Scheduler: 调度器
             Graceful Shutdown: 服务优雅关闭
@@ -114,7 +114,7 @@ nav:
     - ERNIE-4.5-VL-424B-A47B: get_started/ernie-4.5-vl.md
     - Quick Deployment For QWEN: get_started/quick_start_qwen.md
   - Online Serving:
-    - OpenAI-Compitable API Server: online_serving/README.md
+    - OpenAI-Compatible API Server: online_serving/README.md
     - Monitor Metrics: online_serving/metrics.md
     - Scheduler: online_serving/scheduler.md
     - Graceful Shutdown: online_serving/graceful_shutdown_service.md
diff --git a/tests/ce/server/test_return_token_ids.py b/tests/ce/server/test_return_token_ids.py
index 941c21714..ccaf496af 100644
--- a/tests/ce/server/test_return_token_ids.py
+++ b/tests/ce/server/test_return_token_ids.py
@@ -9,12 +9,7 @@ Checking for /v1/completions parameters

 import json

-from core import (
-    TEMPLATE,
-    URL,
-    build_request_payload,
-    send_request,
-)
+from core import TEMPLATE, URL, build_request_payload, send_request

 COMPLETIONS_URL = URL.replace("/v1/chat/completions", "/v1/completions")

@@ -29,9 +24,9 @@ def test_completion_stream_text_after_process_raw_prediction():
         "stream": True,
         "stream_options": {"include_usage": True, "continuous_usage_stats": True},
         "max_tokens": 50,
-        "return_token_ids": True
+        "return_token_ids": True,
     }
-
+
     payload = build_request_payload(TEMPLATE, data)
     resp = send_request(COMPLETIONS_URL, payload, stream=True)
     for line in resp.iter_lines(decode_unicode=True):
@@ -39,7 +34,7 @@ def test_completion_stream_text_after_process_raw_prediction():
             break
         if line.strip() == "" or not line.startswith("data: "):
             continue
-        line = line[len("data: "):]
+        line = line[len("data: ") :]
         response_data = json.loads(line)

         choice = response_data["choices"][0]
@@ -51,21 +46,16 @@ def test_completion_stream_text_after_process_raw_prediction():
         reasoning_content = choice["reasoning_content"]
         text = choice["text"]
         assert reasoning_content or text in raw_prediction, "raw_prediction取值结果不正确"
-        if "finish_reason" in line.strip() :
+        if "finish_reason" in line.strip():
             break

-
-def test_completion_text_after_process_raw_predictio_return_tokrn_ids():
+
+def test_completion_text_after_process_raw_prediction_return_token_ids():
     """
     /v1/completions接口,非流式接口
     返回属性"text_after_process"和"reasoning_content"
     """
-    data = {
-        "stream": False,
-        "prompt": "你是谁",
-        "max_tokens": 50,
-        "return_token_ids": True
-    }
+    data = {"stream": False, "prompt": "你是谁", "max_tokens": 50, "return_token_ids": True}

     payload = build_request_payload(TEMPLATE, data)
     resp = send_request(COMPLETIONS_URL, payload).json()
@@ -80,14 +70,10 @@ def test_completion_text_after_process_raw_prediction():
     """
-    /v1/completions接口,无return_tokrn_ids参数
+    /v1/completions接口,无return_token_ids参数
     非流式接口中,无return token ids
     属性"text_after_process"和"reasoning_content"值为null
     """
-    data = {
-        "stream": False,
-        "prompt": "你是谁",
-        "max_tokens": 50
-    }
+    data = {"stream": False, "prompt": "你是谁", "max_tokens": 50}

     payload = build_request_payload(TEMPLATE, data)
     resp = send_request(COMPLETIONS_URL, payload).json()

@@ -108,17 +94,17 @@ def test_stream_text_after_process_raw_prediction():
         "stream": True,
         "stream_options": {"include_usage": True, "continuous_usage_stats": True},
         "max_tokens": 50,
-        "return_token_ids": True
+        "return_token_ids": True,
     }

     payload = build_request_payload(TEMPLATE, data)
     resp = send_request(URL, payload, stream=True)
     for line in resp.iter_lines(decode_unicode=True):
-        if line.strip() == "data: [DONE]" :
+        if line.strip() == "data: [DONE]":
             break
         if line.strip() == "" or not line.startswith("data: "):
             continue
-        line = line[len("data: "):]
+        line = line[len("data: ") :]
         response_data = json.loads(line)

         choice = response_data["choices"][0]
@@ -130,11 +116,11 @@ def test_stream_text_after_process_raw_prediction():
         reasoning_content = choice["delta"]["reasoning_content"]
         content = choice["delta"]["content"]
         assert reasoning_content or content in raw_prediction, "raw_prediction取值结果不正确"
-        if "finish_reason" in line.strip() :
+        if "finish_reason" in line.strip():
             break

-
-def test_text_after_process_raw_prediction_return_tokrn_ids():
+
+def test_text_after_process_raw_prediction_return_token_ids():
     """
     /v1/chat/completions接口,非流式接口
     返回属性"text_after_process"和"reasoning_content"
@@ -161,7 +147,7 @@ def test_text_after_process_raw_prediction_return_tokrn_ids():

 def test_text_after_process_raw_prediction():
     """
-    /v1/chat/completions接口,无return_tokrn_ids参数
+    /v1/chat/completions接口,无return_token_ids参数
     无return token ids
     属性"text_after_process"和"reasoning_content"值为null
     """
     data = {
@@ -179,5 +165,3 @@ def test_text_after_process_raw_prediction():

     raw_prediction = resp["choices"][0]["message"]["raw_prediction"]
     assert raw_prediction is None, "raw_prediction取值结果不正确"
-
-
diff --git a/tests/ci_use/EB_Lite_with_adapter/zmq_client.py b/tests/ci_use/EB_Lite_with_adapter/zmq_client.py
index db811d04a..349f32bc3 100644
--- a/tests/ci_use/EB_Lite_with_adapter/zmq_client.py
+++ b/tests/ci_use/EB_Lite_with_adapter/zmq_client.py
@@ -50,7 +50,7 @@ class LLMReqClient:
                 if self.need_exit:
                     break
             except Exception as e:
-                print(f"zmq client occured error {e} type: {type(e)} frames: {frames}")
+                print(f"zmq client occurred error {e} type: {type(e)} frames: {frames}")

     def start(self, result_queue):
         threading.Thread(target=self.consume_results, args=(result_queue,), daemon=True).start()
@@ -118,4 +118,4 @@ class LLMControlClient:
                     self.result[task_id] = result["result"]
                     self.task_event[task_id].set()
             except Exception as e:
-                print(f"zmq client occured error {e} type: {type(e)} frames: {frames}")
+                print(f"zmq client occurred error {e} type: {type(e)} frames: {frames}")
diff --git a/tests/distributed/custom_all_reduce.py b/tests/distributed/custom_all_reduce.py
index ccc984d3d..651b69244 100644
--- a/tests/distributed/custom_all_reduce.py
+++ b/tests/distributed/custom_all_reduce.py
@@ -54,14 +54,14 @@ class Test(unittest.TestCase):
         fa = CustomAllreduce(model_parallel_group)

         for m, n in mns:
-            data_cusom_ar = paddle.rand([m, n], dtype="bfloat16")
-            data_paddle = data_cusom_ar.clone()
-            if fa.should_custom_ar(data_cusom_ar):
-                fa.custom_all_reduce(data_cusom_ar)
+            data_custom_ar = paddle.rand([m, n], dtype="bfloat16")
+            data_paddle = data_custom_ar.clone()
+            if fa.should_custom_ar(data_custom_ar):
+                fa.custom_all_reduce(data_custom_ar)
             dist.all_reduce(data_paddle)
             if dist.get_rank() == 0:
                 np.testing.assert_allclose(
-                    data_cusom_ar.numpy(),
+                    data_custom_ar.numpy(),
                     data_paddle.numpy(),
                     rtol=1e-04,
                     atol=1e-04,
diff --git a/tools/deep_gemm_pre-compile/pre_compile.py b/tools/deep_gemm_pre-compile/pre_compile.py
index 55cdca3a9..fd4bfdb37 100644
--- a/tools/deep_gemm_pre-compile/pre_compile.py
+++ b/tools/deep_gemm_pre-compile/pre_compile.py
@@ -158,7 +158,7 @@ def pre_compile_from_config(config_file: str, num_threads: int, expert_parallel:

     pbar.close()

-    logger.info(f"Total compliation time: {time() - start_time:.2f} seconds")
+    logger.info(f"Total compilation time: {time() - start_time:.2f} seconds")


 def main(args):