Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-12-24 13:28:13 +08:00.
@@ -98,7 +98,7 @@ def main(args):
 raise ValueError("--max_concurrency should be same length as --s_itl_base_model")

 for max_concurrency, s_itl in zip(args.max_concurrency, args.s_itl_base_model):
-# Wramup
+# Warmup
 print("Starting warmup...")
 with open(os.devnull, "w") as f:
 with contextlib.redirect_stdout(f):
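As an aside, the warmup lines in the hunk above silence benchmark output by redirecting stdout to /dev/null. A minimal standalone sketch of that pattern (the `run_benchmark` helper is a hypothetical placeholder, not part of FastDeploy):

```python
import contextlib
import os


def run_benchmark(concurrency: int) -> None:
    # Hypothetical stand-in for the real warmup/benchmark call.
    print(f"running warmup with concurrency={concurrency}")


# Send stdout to /dev/null so warmup noise does not pollute the benchmark logs.
with open(os.devnull, "w") as f:
    with contextlib.redirect_stdout(f):
        run_benchmark(concurrency=1)

print("Warmup finished; stdout is restored outside the context managers.")
```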
@@ -303,7 +303,7 @@ class CustomAllreduce {
 bool full_nvlink_;

 RankSignals sg_;
-// Stores an map from a pointer to its peer pointters from all ranks.
+// Stores an map from a pointer to its peer pointers from all ranks.
 std::unordered_map<void*, RankData*> buffers_;
 Signal* self_sg_;

@@ -10,7 +10,7 @@ The following installation methods are available when your environment meets the

 ## 1. Pre-built Docker Installation (Recommended)

-**Notice**: The pre-built image only supports SM80/90 GPU(e.g. H800/A800),if you are deploying on SM86/89GPU(L40/4090/L20), please reinstall ```fastdpeloy-gpu``` after you create the container.
+**Notice**: The pre-built image only supports SM80/90 GPU(e.g. H800/A800),if you are deploying on SM86/89GPU(L40/4090/L20), please reinstall ```fastdeploy-gpu``` after you create the container.

 ```shell
 docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12.6:2.2.0
@@ -20,6 +20,6 @@ Below is an overview of the FastDeploy code structure and functionality organize
 - ```platforms```: Platform-specific modules for underlying hardware support.
 - ```scheduler```: Request scheduling module for large models.
 - ```metrics```: Core component for collecting, managing, and exporting Prometheus metrics, tracking key runtime performance data (e.g., request latency, resource utilization, successful request counts).
-- ```splitwise```: Modules related to PD disaggragation deployment.
+- ```splitwise```: Modules related to PD disaggregation deployment.
 - ```scripts```/```tools```: Utility scripts for FastDeploy operations (e.g., compilation, unit testing, code style fixes).
 - ```test```: Code for unit testing and validation.
@@ -30,7 +30,7 @@ By default, logs are stored in the `log` directory under the execution path. To
 * `cache_transfer_manager.log` : Logs startup parameters and received request information.
 * `launch_cache_manager.log` : Records cache transfer startup parameters and error messages.

-## PD Disaggragation Logs
+## PD Disaggregation Logs
 * `cache_messager.log` : Logs transmission protocols and messages used by the P instance.
 * `splitwise_connector.log` : Records data received from P/D instances and connection establishment details.

@@ -1,4 +1,4 @@
-site_name: 'FastDeploy : Large Language Model Deployement'
+site_name: 'FastDeploy : Large Language Model Deployment'
 repo_url: https://github.com/PaddlePaddle/FastDeploy
 repo_name: FastDeploy

@@ -36,7 +36,7 @@ plugins:
 - locale: en
 default: true
 name: English
-site_name: 'FastDeploy: Large Language Model Deployement'
+site_name: 'FastDeploy: Large Language Model Deployment'
 build: true
 link: /FastDeploy/
 - locale: zh
@@ -59,7 +59,7 @@ plugins:
 ERNIE-4.5-VL-424B-A47B: ERNIE-4.5-VL-424B-A47B快速部署
 Quick Deployment For QWEN: Qwen3-0.6b快速部署
 Online Serving: 在线服务
-OpenAI-Compitable API Server: 兼容 OpenAI 协议的服务化部署
+OpenAI-Compatible API Server: 兼容 OpenAI 协议的服务化部署
 Monitor Metrics: 监控Metrics
 Scheduler: 调度器
 Graceful Shutdown: 服务优雅关闭
@@ -114,7 +114,7 @@ nav:
 - ERNIE-4.5-VL-424B-A47B: get_started/ernie-4.5-vl.md
 - Quick Deployment For QWEN: get_started/quick_start_qwen.md
 - Online Serving:
-- OpenAI-Compitable API Server: online_serving/README.md
+- OpenAI-Compatible API Server: online_serving/README.md
 - Monitor Metrics: online_serving/metrics.md
 - Scheduler: online_serving/scheduler.md
 - Graceful Shutdown: online_serving/graceful_shutdown_service.md
@@ -9,12 +9,7 @@ Checking for /v1/completions parameters

 import json

-from core import (
-TEMPLATE,
-URL,
-build_request_payload,
-send_request,
-)
+from core import TEMPLATE, URL, build_request_payload, send_request

 COMPLETIONS_URL = URL.replace("/v1/chat/completions", "/v1/completions")

@@ -29,7 +24,7 @@ def test_completion_stream_text_after_process_raw_prediction():
 "stream": True,
 "stream_options": {"include_usage": True, "continuous_usage_stats": True},
 "max_tokens": 50,
-"return_token_ids": True
+"return_token_ids": True,
 }

 payload = build_request_payload(TEMPLATE, data)
@@ -39,7 +34,7 @@ def test_completion_stream_text_after_process_raw_prediction():
 break
 if line.strip() == "" or not line.startswith("data: "):
 continue
-line = line[len("data: "):]
+line = line[len("data: ") :]
 response_data = json.loads(line)

 choice = response_data["choices"][0]
@@ -51,21 +46,16 @@ def test_completion_stream_text_after_process_raw_prediction():
 reasoning_content = choice["reasoning_content"]
 text = choice["text"]
 assert reasoning_content or text in raw_prediction, "raw_prediction取值结果不正确"
-if "finish_reason" in line.strip() :
+if "finish_reason" in line.strip():
 break


-def test_completion_text_after_process_raw_predictio_return_tokrn_ids():
+def test_completion_text_after_process_raw_predictio_return_token_ids():
 """
 /v1/completions接口,非流式接口
 返回属性"text_after_process"和"reasoning_content"
 """
-data = {
-"stream": False,
-"prompt": "你是谁",
-"max_tokens": 50,
-"return_token_ids": True
-}
+data = {"stream": False, "prompt": "你是谁", "max_tokens": 50, "return_token_ids": True}
 payload = build_request_payload(TEMPLATE, data)
 resp = send_request(COMPLETIONS_URL, payload).json()

@@ -80,14 +70,10 @@ def test_completion_text_after_process_raw_predictio_return_tokrn_ids():

 def test_completion_text_after_process_raw_prediction():
 """
-/v1/completions接口,无return_tokrn_ids参数
+/v1/completions接口,无return_token_ids参数
 非流式接口中,无return token ids 属性"text_after_process"和"reasoning_content"值为null
 """
-data = {
-"stream": False,
-"prompt": "你是谁",
-"max_tokens": 50
-}
+data = {"stream": False, "prompt": "你是谁", "max_tokens": 50}
 payload = build_request_payload(TEMPLATE, data)
 resp = send_request(COMPLETIONS_URL, payload).json()

@@ -108,17 +94,17 @@ def test_stream_text_after_process_raw_prediction():
 "stream": True,
 "stream_options": {"include_usage": True, "continuous_usage_stats": True},
 "max_tokens": 50,
-"return_token_ids": True
+"return_token_ids": True,
 }

 payload = build_request_payload(TEMPLATE, data)
 resp = send_request(URL, payload, stream=True)
 for line in resp.iter_lines(decode_unicode=True):
-if line.strip() == "data: [DONE]" :
+if line.strip() == "data: [DONE]":
 break
 if line.strip() == "" or not line.startswith("data: "):
 continue
-line = line[len("data: "):]
+line = line[len("data: ") :]
 response_data = json.loads(line)

 choice = response_data["choices"][0]
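The streaming test above consumes server-sent events line by line: skip blanks, stop at `data: [DONE]`, strip the `data: ` prefix, and parse the remainder as JSON. A minimal sketch of that loop, using a hypothetical list of lines in place of a live `resp.iter_lines()`:

```python
import json

# Hypothetical sample of what resp.iter_lines(decode_unicode=True) might yield.
lines = [
    'data: {"choices": [{"delta": {"content": "Hello"}}]}',
    "",
    'data: {"choices": [{"delta": {"content": " world"}, "finish_reason": "stop"}]}',
    "data: [DONE]",
]

for line in lines:
    if line.strip() == "data: [DONE]":
        break
    if line.strip() == "" or not line.startswith("data: "):
        continue
    chunk = json.loads(line[len("data: ") :])
    choice = chunk["choices"][0]
    print(choice["delta"].get("content"))
    if "finish_reason" in line:
        break
```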
@@ -130,11 +116,11 @@ def test_stream_text_after_process_raw_prediction():
 reasoning_content = choice["delta"]["reasoning_content"]
 content = choice["delta"]["content"]
 assert reasoning_content or content in raw_prediction, "raw_prediction取值结果不正确"
-if "finish_reason" in line.strip() :
+if "finish_reason" in line.strip():
 break


-def test_text_after_process_raw_prediction_return_tokrn_ids():
+def test_text_after_process_raw_prediction_return_token_ids():
 """
 /v1/chat/completions接口,非流式接口
 返回属性"text_after_process"和"reasoning_content"
@@ -161,7 +147,7 @@ def test_text_after_process_raw_prediction_return_tokrn_ids():

 def test_text_after_process_raw_prediction():
 """
-/v1/chat/completions接口,无return_tokrn_ids参数
+/v1/chat/completions接口,无return_token_ids参数
 无return token ids 属性"text_after_process"和"reasoning_content"值为null
 """
 data = {
@@ -179,5 +165,3 @@ def test_text_after_process_raw_prediction():

 raw_prediction = resp["choices"][0]["message"]["raw_prediction"]
 assert raw_prediction is None, "raw_prediction取值结果不正确"
-
-
@@ -50,7 +50,7 @@ class LLMReqClient:
 if self.need_exit:
 break
 except Exception as e:
-print(f"zmq client occured error {e} type: {type(e)} frames: {frames}")
+print(f"zmq client occurred error {e} type: {type(e)} frames: {frames}")

 def start(self, result_queue):
 threading.Thread(target=self.consume_results, args=(result_queue,), daemon=True).start()
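The `start` method in the hunk above runs result consumption on a daemon thread. A minimal sketch of that pattern using a plain `queue.Queue` in place of the real ZMQ socket (the payload shape and `need_exit` flag are hypothetical):

```python
import queue
import threading

done = threading.Event()


def consume_results(result_queue: queue.Queue) -> None:
    # Hypothetical consumer loop mirroring the daemon-thread pattern above.
    while True:
        result = result_queue.get()
        if result.get("need_exit"):
            break
        print(f"got result: {result}")
    done.set()


result_queue: queue.Queue = queue.Queue()
threading.Thread(target=consume_results, args=(result_queue,), daemon=True).start()

result_queue.put({"task_id": "demo", "result": 42})
result_queue.put({"need_exit": True})
done.wait(timeout=5)  # let the daemon thread drain the queue before the process exits
```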
@@ -118,4 +118,4 @@ class LLMControlClient:
 self.result[task_id] = result["result"]
 self.task_event[task_id].set()
 except Exception as e:
-print(f"zmq client occured error {e} type: {type(e)} frames: {frames}")
+print(f"zmq client occurred error {e} type: {type(e)} frames: {frames}")
@@ -54,14 +54,14 @@ class Test(unittest.TestCase):
 fa = CustomAllreduce(model_parallel_group)

 for m, n in mns:
-data_cusom_ar = paddle.rand([m, n], dtype="bfloat16")
-data_paddle = data_cusom_ar.clone()
-if fa.should_custom_ar(data_cusom_ar):
-fa.custom_all_reduce(data_cusom_ar)
+data_custom_ar = paddle.rand([m, n], dtype="bfloat16")
+data_paddle = data_custom_ar.clone()
+if fa.should_custom_ar(data_custom_ar):
+fa.custom_all_reduce(data_custom_ar)
 dist.all_reduce(data_paddle)
 if dist.get_rank() == 0:
 np.testing.assert_allclose(
-data_cusom_ar.numpy(),
+data_custom_ar.numpy(),
 data_paddle.numpy(),
 rtol=1e-04,
 atol=1e-04,
@@ -158,7 +158,7 @@ def pre_compile_from_config(config_file: str, num_threads: int, expert_parallel:

 pbar.close()

-logger.info(f"Total compliation time: {time() - start_time:.2f} seconds")
+logger.info(f"Total compilation time: {time() - start_time:.2f} seconds")


 def main(args):