diff --git a/.flake8 b/.flake8
new file mode 100644
index 000000000..869c57d3e
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,7 @@
+[flake8]
+ignore = E203, E402, E501, E731, E741, W503, W605, E722
+max-line-length = 119
+
+# E402: module level import not at top of file
+per-file-ignores =
+ __init__.py:F401,F403,E402
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index cc4574e91..518b15eb9 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -2,7 +2,7 @@ name: CI
on:
pull_request:
- branches:
+ branches:
- develop
- 'release/*'
workflow_dispatch:
@@ -86,4 +86,4 @@ jobs:
git config --global --add safe.directory /workspace/FastDeploy
cd FastDeploy
bash scripts/run_ci.sh
- "
\ No newline at end of file
+ "
diff --git a/.github/workflows/ci_xpu.yml b/.github/workflows/ci_xpu.yml
index e87136704..7bb267fd2 100644
--- a/.github/workflows/ci_xpu.yml
+++ b/.github/workflows/ci_xpu.yml
@@ -2,7 +2,7 @@ name: CI_XPU
on:
pull_request:
- branches:
+ branches:
- develop
- 'release/*'
workflow_dispatch:
@@ -63,7 +63,7 @@ jobs:
if [[ "$last_char" =~ [0-3] ]]; then
gpu_id="$last_char"
else
- gpu_id="0"
+ gpu_id="0"
fi
FD_API_PORT=$((9180 + gpu_id * 100))
FD_ENGINE_QUEUE_PORT=$((9150 + gpu_id * 100))
@@ -84,4 +84,4 @@ jobs:
git config --global --add safe.directory /workspace/FastDeploy
cd FastDeploy
bash scripts/run_ci_xpu.sh
- "
\ No newline at end of file
+ "
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index c7a2d150e..ce8942933 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -5,12 +5,27 @@ default_stages:
- pre-commit # Run locally
# - manual # Run in CI
repos:
+- repo: https://github.com/psf/black.git
+ rev: 22.8.0
+ hooks:
+ - id: black
+ files: \.(py|pyi)$
+ additional_dependencies: [toml]
+# Import sorting
+- repo: https://github.com/PyCQA/isort
+ rev: 5.11.5
+ hooks:
+ - id: isort
+- repo: https://github.com/PyCQA/flake8
+ rev: 4.0.1
+ hooks:
+ - id: flake8
# Linting
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.7
hooks:
- id: ruff
- args: [--output-format, github, --fix, --line-length=120]
+ args: [--output-format, github, --fix, --line-length=120, --config, pyproject.toml]
# # Spell check
# - repo: https://github.com/codespell-project/codespell
# rev: v2.4.1
@@ -18,17 +33,13 @@ repos:
# - id: codespell
# additional_dependencies: ['tomli']
# args: ['--toml', 'pyproject.toml']
-# Import sorting
-- repo: https://github.com/PyCQA/isort
- rev: 6.0.1
- hooks:
- - id: isort
+
# markdown
- repo: https://github.com/jackdewinter/pymarkdown
rev: v0.9.29
hooks:
- id: pymarkdown
- args: [fix]
+ args: ["-d", "MD029,MD031", fix]
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:
diff --git a/README.md b/README.md
index f0dbde14a..fd94d27c5 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@
-
+
@@ -17,8 +17,8 @@
|
Quick Start
|
- Supported Models
-
+ Supported Models
+
--------------------------------------------------------------------------------
diff --git a/benchmarks/README.md b/benchmarks/README.md
index aa9858ced..85a0a6f41 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -131,4 +131,4 @@ python benchmarks/benchmark_mtp.py \
--s_itl-base-model: decode latency of the base model; it can be obtained with the benchmarking tool above and corresponds one-to-one with batch-size
--dataset-name: dataset class to use; set to "EBChat" to read a dumped FD-format dataset
--dataset-path: path to the test dataset
-```
\ No newline at end of file
+```
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index aacc94fab..c83b725ec 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -29,13 +29,13 @@ from typing import Optional
import aiohttp
from tqdm.asyncio import tqdm
-
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
@dataclass
class RequestFuncInput:
"""Input for requesting LLMs via API"""
+
no: int
prompt: str
history_QA: Optional[dict]
@@ -55,6 +55,7 @@ class RequestFuncInput:
@dataclass
class RequestFuncOutput:
"""Output for requesting LLMs via API"""
+
no: int = 0
generated_text: str = ""
reasoning_content: str = ""
@@ -66,7 +67,7 @@ class RequestFuncOutput:
itl: list = field(default_factory=list) # list of inter-token latencies
tpot: float = 0.0 # avg next-token latencies
prompt_len: int = 0
- prompt_tokens: int = 0 # number of input tokens reported by the inference side
+ prompt_tokens: int = 0  # number of input tokens reported by the inference side
error: str = ""
@@ -76,12 +77,9 @@ async def async_request_eb_openai_chat_completions(
) -> RequestFuncOutput:
"""Request an LLM using EB OpenAI"""
api_url = request_func_input.api_url
- assert api_url.endswith(
- ("completions", "profile")
- ), "OpenAI Chat Completions API URL must end with 'completions'."
+ assert api_url.endswith(("completions", "profile")), "OpenAI Chat Completions API URL must end with 'completions'."
- async with aiohttp.ClientSession(trust_env=True,
- timeout=AIOHTTP_TIMEOUT) as session:
+ async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
content = [{"type": "text", "text": request_func_input.prompt}]
if request_func_input.multi_modal_content:
content.append(request_func_input.multi_modal_content)
@@ -91,7 +89,7 @@ async def async_request_eb_openai_chat_completions(
"stream": True,
"stream_options": {
"include_usage": True,
- "continuous_usage_stats": True
+ "continuous_usage_stats": True,
},
}
# Hyperparameters are passed in via yaml
@@ -99,8 +97,8 @@ async def async_request_eb_openai_chat_completions(
if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos
-
- print("payload:{}".format(json.dumps(payload, ensure_ascii=False)))
+
+ print(f"payload:{json.dumps(payload, ensure_ascii=False)}")
headers = {
"Content-Type": "application/json",
@@ -115,16 +113,14 @@ async def async_request_eb_openai_chat_completions(
st = time.perf_counter()
most_recent_timestamp = st
try:
- async with session.post(url=api_url, json=payload,
- headers=headers) as response:
+ async with session.post(url=api_url, json=payload, headers=headers) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
- chunk = chunk_bytes.decode("utf-8").removeprefix(
- "data: ")
+ chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
if chunk != "[DONE]":
# print("####chunk:", chunk, type(chunk))
timestamp = time.perf_counter()
@@ -138,22 +134,20 @@ async def async_request_eb_openai_chat_completions(
ttft = timestamp - st
output.ttft = ttft
# cached_tokens
- output.prompt_len = data["usage"].get("prompt_tokens_details", {}).get("cached_tokens", 0)
-
+ output.prompt_len = (
+ data["usage"].get("prompt_tokens_details", {}).get("cached_tokens", 0)
+ )
# Decoding phase
else:
- output.itl.append(timestamp -
- most_recent_timestamp)
+ output.itl.append(timestamp - most_recent_timestamp)
output.generated_text += content or ""
output.reasoning_content += reason_content or ""
output.arrival_time.append(choices[0].get("arrival_time", timestamp))
elif usage := data.get("usage", {}):
- output.output_tokens = usage.get(
- "completion_tokens", 0)
- output.prompt_tokens = usage.get(
- "prompt_tokens", 0)
+ output.output_tokens = usage.get("completion_tokens", 0)
+ output.prompt_tokens = usage.get("prompt_tokens", 0)
most_recent_timestamp = timestamp
@@ -166,7 +160,12 @@ async def async_request_eb_openai_chat_completions(
output.latency = most_recent_timestamp - st
else:
error_text = await response.text()
- print("####error response:", error_text, "####payload:", payload)
+ print(
+ "####error response:",
+ error_text,
+ "####payload:",
+ payload,
+ )
output.error = error_text or ""
output.success = False
except Exception:
@@ -194,15 +193,14 @@ async def async_request_eb_openai_completions(
("completions", "profile")
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
- async with aiohttp.ClientSession(trust_env=True,
- timeout=AIOHTTP_TIMEOUT) as session:
+ async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"model": request_func_input.model,
"prompt": request_func_input.prompt,
"stream": True,
"stream_options": {
"include_usage": True,
- "continuous_usage_stats": True
+ "continuous_usage_stats": True,
},
}
# Hyperparameters are passed in via yaml
@@ -210,12 +208,12 @@ async def async_request_eb_openai_completions(
if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos
-
+
print("payload:", json.dumps(payload, ensure_ascii=False))
headers = {
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
- "Content-Type": "application/json"
+ "Content-Type": "application/json",
}
output = RequestFuncOutput()
@@ -227,8 +225,7 @@ async def async_request_eb_openai_completions(
st = time.perf_counter()
most_recent_timestamp = st
try:
- async with session.post(url=api_url, json=payload,
- headers=headers) as response:
+ async with session.post(url=api_url, json=payload, headers=headers) as response:
if response.status == 200:
first_chunk_received = False
async for chunk_bytes in response.content:
@@ -236,8 +233,7 @@ async def async_request_eb_openai_completions(
if not chunk_bytes:
continue
- chunk = chunk_bytes.decode("utf-8").removeprefix(
- "data: ")
+ chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
if chunk != "[DONE]":
# print("####chunk:", chunk, chunk.usage)
timestamp = time.perf_counter()
@@ -250,7 +246,7 @@ async def async_request_eb_openai_completions(
# Note that text could be empty here
# e.g. for special tokens
text = choices[0].get("text")
-
+
# First token
if not first_chunk_received:
first_chunk_received = True
@@ -259,26 +255,23 @@ async def async_request_eb_openai_completions(
# Decoding phase
else:
- output.itl.append(timestamp -
- most_recent_timestamp)
-
+ output.itl.append(timestamp - most_recent_timestamp)
+
generated_text += text or ""
most_recent_timestamp = timestamp
output.arrival_time.append(choices[0].get("arrival_time", timestamp))
elif usage := data.get("usage"):
- output.prompt_tokens = usage.get(
- "prompt_tokens")
- output.output_tokens = usage.get(
- "completion_tokens")
+ output.prompt_tokens = usage.get("prompt_tokens")
+ output.output_tokens = usage.get("completion_tokens")
if first_chunk_received:
output.success = True
else:
output.success = False
output.error = (
- "Never received a valid chunk to calculate TTFT."
- "This response will be marked as failed!")
-
+ "Never received a valid chunk to calculate TTFT." "This response will be marked as failed!"
+ )
+
output.generated_text = generated_text
output.latency = most_recent_timestamp - st
@@ -294,8 +287,8 @@ async def async_request_eb_openai_completions(
output.success = False
exc_info = sys.exc_info()
output.error = "".join(traceback.format_exception(*exc_info))
-
- print("final_output:{}".format(output))
+
+ print(f"final_output:{output}")
if pbar:
pbar.update(1)
@@ -310,8 +303,7 @@ async def async_request_tgi(
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
- async with aiohttp.ClientSession(trust_env=True,
- timeout=AIOHTTP_TIMEOUT) as session:
+ async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
params = {
"max_new_tokens": request_func_input.output_len,
"do_sample": True,
@@ -358,8 +350,7 @@ async def async_request_tgi(
# Decoding phase
else:
- output.itl.append(timestamp -
- most_recent_timestamp)
+ output.itl.append(timestamp - most_recent_timestamp)
most_recent_timestamp = timestamp
output.arrival_time.append(data["arrival_time"])
@@ -388,8 +379,7 @@ async def async_request_trt_llm(
api_url = request_func_input.api_url
assert api_url.endswith("generate_stream")
- async with aiohttp.ClientSession(trust_env=True,
- timeout=AIOHTTP_TIMEOUT) as session:
+ async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"accumulate_tokens": True,
"text_input": request_func_input.prompt,
@@ -414,8 +404,7 @@ async def async_request_trt_llm(
if not chunk_bytes:
continue
- chunk = chunk_bytes.decode("utf-8").removeprefix(
- "data:")
+ chunk = chunk_bytes.decode("utf-8").removeprefix("data:")
data = json.loads(chunk)
output.generated_text += data["text_output"]
@@ -427,8 +416,7 @@ async def async_request_trt_llm(
# Decoding phase
else:
- output.itl.append(timestamp -
- most_recent_timestamp)
+ output.itl.append(timestamp - most_recent_timestamp)
most_recent_timestamp = timestamp
@@ -453,8 +441,7 @@ async def async_request_deepspeed_mii(
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
"""Request an LLM using Deepspeed MII"""
- async with aiohttp.ClientSession(trust_env=True,
- timeout=AIOHTTP_TIMEOUT) as session:
+ async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
payload = {
"prompt": request_func_input.prompt,
@@ -472,19 +459,16 @@ async def async_request_deepspeed_mii(
st = time.perf_counter()
try:
- async with session.post(url=request_func_input.api_url,
- json=payload) as response:
+ async with session.post(url=request_func_input.api_url, json=payload) as response:
if response.status == 200:
parsed_resp = await response.json()
output.latency = time.perf_counter() - st
if "choices" in parsed_resp:
- output.generated_text = parsed_resp["choices"][0][
- "text"]
+ output.generated_text = parsed_resp["choices"][0]["text"]
elif "text" in parsed_resp:
output.generated_text = parsed_resp["text"][0]
else:
- output.error = ("Unexpected response format: "
- "neither 'choices' nor 'text' found")
+ output.error = "Unexpected response format: " "neither 'choices' nor 'text' found"
output.success = False
output.success = True
else:
@@ -510,26 +494,22 @@ async def async_request_openai_completions(
("completions", "profile")
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
- async with aiohttp.ClientSession(trust_env=True,
- timeout=AIOHTTP_TIMEOUT) as session:
+ async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
payload = {
- "model": request_func_input.model_name \
- if request_func_input.model_name else request_func_input.model,
+ "model": (request_func_input.model_name if request_func_input.model_name else request_func_input.model),
"prompt": request_func_input.prompt,
# "temperature": 0.0,
"max_tokens": request_func_input.output_len,
"logprobs": request_func_input.logprobs,
"stream": True,
- #"stream_options": {
+ # "stream_options": {
# "include_usage": True,
- #},
+ # },
}
if request_func_input.ignore_eos:
payload["ignore_eos"] = request_func_input.ignore_eos
- headers = {
- "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
- }
+ headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
@@ -538,8 +518,7 @@ async def async_request_openai_completions(
st = time.perf_counter()
most_recent_timestamp = st
try:
- async with session.post(url=api_url, json=payload,
- headers=headers) as response:
+ async with session.post(url=api_url, json=payload, headers=headers) as response:
if response.status == 200:
first_chunk_received = False
async for chunk_bytes in response.content:
@@ -547,8 +526,7 @@ async def async_request_openai_completions(
if not chunk_bytes:
continue
- chunk = chunk_bytes.decode("utf-8").removeprefix(
- "data: ")
+ chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
if chunk != "[DONE]":
# print("####chunk:", chunk, type(chunk))
data = json.loads(chunk)
@@ -569,21 +547,19 @@ async def async_request_openai_completions(
# Decoding phase
else:
- output.itl.append(timestamp -
- most_recent_timestamp)
+ output.itl.append(timestamp - most_recent_timestamp)
most_recent_timestamp = timestamp
generated_text += text or ""
elif usage := data.get("usage"):
- output.output_tokens = usage.get(
- "completion_tokens")
+ output.output_tokens = usage.get("completion_tokens")
if first_chunk_received:
output.success = True
else:
output.success = False
output.error = (
- "Never received a valid chunk to calculate TTFT."
- "This response will be marked as failed!")
+ "Never received a valid chunk to calculate TTFT." "This response will be marked as failed!"
+ )
output.generated_text = generated_text
output.latency = most_recent_timestamp - st
else:
@@ -606,25 +582,24 @@ async def async_request_openai_audio(
"""Request an LLM using OpenAI"""
# Lazy import without PlaceholderModule to avoid vllm dep.
import soundfile
+
api_url = request_func_input.api_url
assert api_url.endswith(
- ("transcriptions", "translations"
- )), "OpenAI Chat Completions API URL must end with 'transcriptions' "
+ ("transcriptions", "translations")
+ ), "OpenAI Chat Completions API URL must end with 'transcriptions' "
"or `translations`."
- async with aiohttp.ClientSession(trust_env=True,
- timeout=AIOHTTP_TIMEOUT) as session:
+ async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
content = [{"type": "text", "text": request_func_input.prompt}]
payload = {
- "model": request_func_input.model_name \
- if request_func_input.model_name else request_func_input.model,
+ "model": (request_func_input.model_name if request_func_input.model_name else request_func_input.model),
"temperature": 0.0,
"max_completion_tokens": request_func_input.output_len,
"stream": True,
"language": "en",
# Flattened due to multipart/form-data
"stream_include_usage": True,
- "stream_continuous_usage_stats": True
+ "stream_continuous_usage_stats": True,
}
if request_func_input.extra_body:
payload.update(request_func_input.extra_body)
@@ -639,9 +614,9 @@ async def async_request_openai_audio(
buffer.seek(0)
return buffer
- with to_bytes(*request_func_input.multi_modal_content['audio']) as f:
+ with to_bytes(*request_func_input.multi_modal_content["audio"]) as f:
form = aiohttp.FormData()
- form.add_field('file', f, content_type='audio/wav')
+ form.add_field("file", f, content_type="audio/wav")
for key, value in payload.items():
form.add_field(key, str(value))
@@ -653,24 +628,20 @@ async def async_request_openai_audio(
st = time.perf_counter()
most_recent_timestamp = st
try:
- async with session.post(url=api_url,
- data=form,
- headers=headers) as response:
+ async with session.post(url=api_url, data=form, headers=headers) as response:
if response.status == 200:
async for chunk_bytes in response.content:
chunk_bytes = chunk_bytes.strip()
if not chunk_bytes:
continue
- chunk = chunk_bytes.decode("utf-8").removeprefix(
- "data: ")
+ chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
if chunk != "[DONE]":
timestamp = time.perf_counter()
data = json.loads(chunk)
if choices := data.get("choices"):
- content = choices[0]["delta"].get(
- "content")
+ content = choices[0]["delta"].get("content")
# First token
if ttft == 0.0:
ttft = timestamp - st
@@ -678,13 +649,11 @@ async def async_request_openai_audio(
# Decoding phase
else:
- output.itl.append(
- timestamp - most_recent_timestamp)
+ output.itl.append(timestamp - most_recent_timestamp)
generated_text += content or ""
elif usage := data.get("usage"):
- output.output_tokens = usage.get(
- "completion_tokens")
+ output.output_tokens = usage.get("completion_tokens")
most_recent_timestamp = timestamp
@@ -718,8 +687,11 @@ ASYNC_REQUEST_FUNCS = {
}
OPENAI_COMPATIBLE_BACKENDS = [
- k for k, v in ASYNC_REQUEST_FUNCS.items()
- if v in (async_request_openai_completions,
- async_request_eb_openai_chat_completions)
+ k
+ for k, v in ASYNC_REQUEST_FUNCS.items()
+ if v
+ in (
+ async_request_openai_completions,
+ async_request_eb_openai_chat_completions,
+ )
]
-
diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
index 59ab4b454..551f0c9d5 100644
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -26,9 +26,9 @@ from abc import ABC, abstractmethod
from collections.abc import Mapping
from dataclasses import dataclass
from io import BytesIO
-from typing import Any, Callable, Optional, Union
-from PIL import Image
+from typing import Any, Optional, Union
+from PIL import Image
logger = logging.getLogger(__name__)
@@ -38,6 +38,7 @@ class SampleRequest:
"""
Represents a single inference request for benchmarking.
"""
+
no: int
prompt: Union[str, Any]
history_QA: Union[str, Any]
@@ -48,6 +49,7 @@ class SampleRequest:
class BenchmarkDataset(ABC):
"""BenchmarkDataset"""
+
DEFAULT_SEED = 0
IS_MULTIMODAL = False
@@ -68,8 +70,7 @@ class BenchmarkDataset(ABC):
self.dataset_path = dataset_path
# Set the random seed, ensuring that a None value is replaced with the
# default seed.
- self.random_seed = (random_seed
- if random_seed is not None else self.DEFAULT_SEED)
+ self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED
self.data = None
self.hyperparameter_path = hyperparameter_path
self.hyperparameters = {}
@@ -85,8 +86,7 @@ class BenchmarkDataset(ABC):
NotImplementedError: If a subclass does not implement this method.
"""
# TODO (jenniferzhao): add support for downloading data
- raise NotImplementedError(
- "load_data must be implemented in subclasses.")
+ raise NotImplementedError("load_data must be implemented in subclasses.")
@abstractmethod
def sample(self, num_requests: int) -> list[SampleRequest]:
@@ -105,8 +105,7 @@ class BenchmarkDataset(ABC):
"""
raise NotImplementedError("sample must be implemented in subclasses.")
- def maybe_oversample_requests(self, requests: list[SampleRequest],
- num_requests: int) -> None:
+ def maybe_oversample_requests(self, requests: list[SampleRequest], num_requests: int) -> None:
"""
Oversamples the list of requests if its size is less than the desired
number.
@@ -117,11 +116,9 @@ class BenchmarkDataset(ABC):
"""
if len(requests) < num_requests:
random.seed(self.random_seed)
- additional = random.choices(requests,
- k=num_requests - len(requests))
+ additional = random.choices(requests, k=num_requests - len(requests))
requests.extend(additional)
- logger.info("Oversampled requests to reach %d total samples.",
- num_requests)
+ logger.info("Oversampled requests to reach %d total samples.", num_requests)
def is_valid_sequence(
@@ -141,14 +138,12 @@ def is_valid_sequence(
"""
# Check for invalid conditions
prompt_too_short = prompt_len < min_len
- output_too_short = (not skip_min_output_len_check) and (output_len
- < min_len)
+ output_too_short = (not skip_min_output_len_check) and (output_len < min_len)
prompt_too_long = prompt_len > max_prompt_len
combined_too_long = (prompt_len + output_len) > max_total_len
# Return True if none of the invalid conditions are met
- return not (prompt_too_short or output_too_short or prompt_too_long
- or combined_too_long)
+ return not (prompt_too_short or output_too_short or prompt_too_long or combined_too_long)
def process_image(image: Any) -> Mapping[str, Any]:
@@ -171,28 +166,25 @@ def process_image(image: Any) -> Mapping[str, Any]:
Raises:
ValueError: If the input is not a supported type.
"""
- if isinstance(image, dict) and 'bytes' in image:
- image = Image.open(BytesIO(image['bytes']))
+ if isinstance(image, dict) and "bytes" in image:
+ image = Image.open(BytesIO(image["bytes"]))
if isinstance(image, Image.Image):
image = image.convert("RGB")
with io.BytesIO() as image_data:
image.save(image_data, format="JPEG")
- image_base64 = base64.b64encode(
- image_data.getvalue()).decode("utf-8")
+ image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
return {
"type": "image_url",
- "image_url": {
- "url": f"data:image/jpeg;base64,{image_base64}"
- },
+ "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
}
if isinstance(image, str):
- image_url = (image if image.startswith(
- ("http://", "file://")) else f"file://{image}")
+ image_url = image if image.startswith(("http://", "file://")) else f"file://{image}"
return {"type": "image_url", "image_url": {"url": image_url}}
- raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
- " or str or dictionary with raw image bytes.")
+ raise ValueError(
+ f"Invalid image input {image}. Must be a PIL.Image.Image" " or str or dictionary with raw image bytes."
+ )
class EBDataset(BenchmarkDataset):
@@ -243,8 +235,7 @@ class EBDataset(BenchmarkDataset):
new_output_len = int(entry["max_dec_len"])
if enable_multimodal_chat:
- prompt = self.apply_multimodal_chat_transformation(
- prompt, None)
+ prompt = self.apply_multimodal_chat_transformation(prompt, None)
samples.append(
SampleRequest(
no=cnt,
@@ -252,17 +243,20 @@ class EBDataset(BenchmarkDataset):
prompt_len=self.prompt_len,
history_QA=[],
expected_output_len=new_output_len,
- ))
+ )
+ )
cnt += 1
self.maybe_oversample_requests(samples, num_requests)
return samples
+
class EBChatDataset(BenchmarkDataset):
"""
Implements the ShareGPT dataset. Loads data from a JSON file and generates
sample requests based on conversation turns.
"""
+
prompt_len: int
def __init__(self, **kwargs) -> None:
@@ -296,8 +290,7 @@ class EBChatDataset(BenchmarkDataset):
new_output_len = int(entry.get("max_tokens", 12288))
if enable_multimodal_chat:
- prompt = self.apply_multimodal_chat_transformation(
- prompt, None)
+ prompt = self.apply_multimodal_chat_transformation(prompt, None)
samples.append(
SampleRequest(
no=cnt,
@@ -306,9 +299,9 @@ class EBChatDataset(BenchmarkDataset):
prompt_len=0,
history_QA=history_QA,
expected_output_len=new_output_len,
- ))
+ )
+ )
cnt += 1
self.maybe_oversample_requests(samples, num_requests)
return samples
-
diff --git a/benchmarks/benchmark_mtp.py b/benchmarks/benchmark_mtp.py
index 65c2392a1..2698a553b 100644
--- a/benchmarks/benchmark_mtp.py
+++ b/benchmarks/benchmark_mtp.py
@@ -18,28 +18,16 @@ import argparse
import asyncio
import contextlib
import os
-import signal
-import socket
-import subprocess
-import time
from typing import Union
-import openai
-import yaml
-from benchmark_dataset import EBChatDataset, EBDataset, SampleRequest
+from benchmark_dataset import EBChatDataset, EBDataset
from benchmark_serving import benchmark
-def prepare_input_requests(
- num_prompts: int, dataset_name: str, dataset_path: str
-) -> Union[EBDataset, EBChatDataset]:
+def prepare_input_requests(num_prompts: int, dataset_name: str, dataset_path: str) -> Union[EBDataset, EBChatDataset]:
dataset_mapping = {
- "EB": lambda: EBDataset(dataset_path=dataset_path).sample(
- num_requests=num_prompts
- ),
- "EBChat": lambda: EBChatDataset(dataset_path=dataset_path).sample(
- num_requests=num_prompts
- ),
+ "EB": lambda: EBDataset(dataset_path=dataset_path).sample(num_requests=num_prompts),
+ "EBChat": lambda: EBChatDataset(dataset_path=dataset_path).sample(num_requests=num_prompts),
}
try:
@@ -104,24 +92,27 @@ def calculate_speedup(acceptance_rate, draft_token_step, t_ori, t_mtp):
def main(args):
base_url = f"http://{args.host}:{args.port}"
- input_requests = prepare_input_requests(
- args.num_prompts, args.dataset_name, args.dataset_path
- )
+ input_requests = prepare_input_requests(args.num_prompts, args.dataset_name, args.dataset_path)
if len(args.max_concurrency) != len(args.s_itl_base_model):
- raise ValueError(f"--max_concurrency should be same length as --s_itl_base_model")
+ raise ValueError("--max_concurrency should be same length as --s_itl_base_model")
for max_concurrency, s_itl in zip(args.max_concurrency, args.s_itl_base_model):
# Warmup
print("Starting warmup...")
with open(os.devnull, "w") as f:
with contextlib.redirect_stdout(f):
- send_one_batch(base_url, max_concurrency, input_requests[0:max_concurrency], True)
+ send_one_batch(
+ base_url,
+ max_concurrency,
+ input_requests[0:max_concurrency],
+ True,
+ )
# Benchmark
record = send_one_batch(base_url, max_concurrency, input_requests, False)
- metric_header = f"Speed up"
+ metric_header = "Speed up"
print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
for draft_token_step in args.draft_token_steps:
speedup = calculate_speedup(
@@ -130,11 +121,7 @@ def main(args):
s_itl,
record["mean_s_itl_ms"],
)
- print(
- "{:<40} {:<10.2f}".format(
- f"Speed up on {draft_token_step} steps draft", speedup
- )
- )
+ print("{:<40} {:<10.2f}".format(f"Speed up on {draft_token_step} steps draft", speedup))
print("=" * 50)
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index e015117b3..25825061a 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -25,22 +25,23 @@ import os
import random
import time
import warnings
-import yaml
+from argparse import ArgumentParser as FlexibleArgumentParser
from collections.abc import AsyncGenerator, Iterable
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Optional
import numpy as np
-from backend_request_func import (ASYNC_REQUEST_FUNCS,
- OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput,
- RequestFuncOutput)
-from tqdm.asyncio import tqdm
-
-from argparse import ArgumentParser as FlexibleArgumentParser
-
-from benchmark_dataset import (SampleRequest, EBDataset, EBChatDataset)
+import yaml
+from backend_request_func import (
+ ASYNC_REQUEST_FUNCS,
+ OPENAI_COMPATIBLE_BACKENDS,
+ RequestFuncInput,
+ RequestFuncOutput,
+)
+from benchmark_dataset import EBChatDataset, EBDataset, SampleRequest
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
+from tqdm.asyncio import tqdm
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@@ -48,6 +49,7 @@ MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@dataclass
class BenchmarkMetrics:
"""Class containing all metrics that are used in this script"""
+
completed: int
total_input: int
total_output: int
@@ -130,8 +132,7 @@ async def get_request(
input_requests: Iterable[SampleRequest] = iter(input_requests)
# Calculate scale parameter theta to maintain the desired request_rate.
- assert burstiness > 0, (
- f"A positive burstiness factor is expected, but given {burstiness}.")
+ assert burstiness > 0, f"A positive burstiness factor is expected, but given {burstiness}."
theta = 1.0 / (request_rate * burstiness)
for request in input_requests:
@@ -157,7 +158,7 @@ def calculate_metrics(
) -> tuple[BenchmarkMetrics, list[int]]:
"""Calculates various performance metrics based on the inputs and outputs."""
input_lens: list[int] = []
- infer_input_lens: list[int] = [] # number of input tokens on the inference side
+ infer_input_lens: list[int] = []  # number of input tokens on the inference side
actual_output_lens: list[int] = []
total_input = 0
completed = 0
@@ -208,8 +209,9 @@ def calculate_metrics(
s_e2els.append(outputs[i].arrival_time[-1])
# Exclude the first token when computing decode speed
if len(outputs[i].arrival_time) > 2:
- s_decodes.append((outputs[i].output_tokens - 1) /
- (outputs[i].arrival_time[-1] - outputs[i].arrival_time[1]))
+ s_decodes.append(
+ (outputs[i].output_tokens - 1) / (outputs[i].arrival_time[-1] - outputs[i].arrival_time[1])
+ )
else:
print("len(outputs[i].arrival_time) <= 2")
completed += 1
@@ -224,16 +226,13 @@ def calculate_metrics(
if "ttft" in goodput_config_dict:
valid_metrics.append(ttfts)
- slo_values.append(goodput_config_dict["ttft"] /
- MILLISECONDS_TO_SECONDS_CONVERSION)
+ slo_values.append(goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION)
if "tpot" in goodput_config_dict:
valid_metrics.append(all_tpots)
- slo_values.append(goodput_config_dict["tpot"] /
- MILLISECONDS_TO_SECONDS_CONVERSION)
+ slo_values.append(goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION)
if "e2el" in goodput_config_dict:
valid_metrics.append(e2els)
- slo_values.append(goodput_config_dict["e2el"] /
- MILLISECONDS_TO_SECONDS_CONVERSION)
+ slo_values.append(goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION)
for req_metric in zip(*valid_metrics):
is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
@@ -242,9 +241,9 @@ def calculate_metrics(
if completed == 0:
warnings.warn(
- "All requests failed. This is likely due to a misconfiguration "
- "on the benchmark arguments.",
- stacklevel=2)
+ "All requests failed. This is likely due to a misconfiguration " "on the benchmark arguments.",
+ stacklevel=2,
+ )
metrics = BenchmarkMetrics(
completed=completed,
total_input=total_input,
@@ -253,64 +252,50 @@ def calculate_metrics(
request_goodput=good_completed / dur_s,
output_throughput=sum(actual_output_lens) / dur_s,
total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
- mean_s_decode=np.mean(s_decodes or 0) *
- 1, # ttfts is empty if streaming is not supported by backend
+ mean_s_decode=np.mean(s_decodes or 0) * 1, # ttfts is empty if streaming is not supported by backend
std_s_decode=np.std(s_decodes or 0) * 1,
median_s_decode=np.median(s_decodes or 0) * 1,
- percentiles_s_decode=[(p, np.percentile(s_decodes or 0, p) * 1)
- for p in selected_percentiles],
- mean_ttft_ms=np.mean(ttfts or 0) *
- 1000, # ttfts is empty if streaming is not supported by backend
+ percentiles_s_decode=[(p, np.percentile(s_decodes or 0, p) * 1) for p in selected_percentiles],
+ mean_ttft_ms=np.mean(ttfts or 0) * 1000, # ttfts is empty if streaming is not supported by backend
std_ttft_ms=np.std(ttfts or 0) * 1000,
median_ttft_ms=np.median(ttfts or 0) * 1000,
- percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000)
- for p in selected_percentiles],
- mean_s_ttft_ms=np.mean(s_ttfts or 0) *
- 1000, # ttfts is empty if streaming is not supported by backend
+ percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles],
+ mean_s_ttft_ms=np.mean(s_ttfts or 0) * 1000, # ttfts is empty if streaming is not supported by backend
std_s_ttft_ms=np.std(s_ttfts or 0) * 1000,
median_s_ttft_ms=np.median(s_ttfts or 0) * 1000,
- percentiles_s_ttft_ms=[(p, np.percentile(s_ttfts or 0, p) * 1000)
- for p in selected_percentiles],
+ percentiles_s_ttft_ms=[(p, np.percentile(s_ttfts or 0, p) * 1000) for p in selected_percentiles],
mean_tpot_ms=np.mean(tpots or 0) * 1000,
std_tpot_ms=np.std(tpots or 0) * 1000,
median_tpot_ms=np.median(tpots or 0) * 1000,
- percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000)
- for p in selected_percentiles],
+ percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles],
mean_itl_ms=np.mean(itls or 0) * 1000,
std_itl_ms=np.std(itls or 0) * 1000,
median_itl_ms=np.median(itls or 0) * 1000,
- percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
- for p in selected_percentiles],
+ percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles],
mean_s_itl_ms=np.mean(s_itls or 0) * 1000,
std_s_itl_ms=np.std(s_itls or 0) * 1000,
median_s_itl_ms=np.median(s_itls or 0) * 1000,
- percentiles_s_itl_ms=[(p, np.percentile(s_itls or 0, p) * 1000)
- for p in selected_percentiles],
+ percentiles_s_itl_ms=[(p, np.percentile(s_itls or 0, p) * 1000) for p in selected_percentiles],
mean_e2el_ms=np.mean(e2els or 0) * 1000,
std_e2el_ms=np.std(e2els or 0) * 1000,
median_e2el_ms=np.median(e2els or 0) * 1000,
- percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
- for p in selected_percentiles],
+ percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles],
mean_s_e2el_ms=np.mean(s_e2els or 0) * 1000,
std_s_e2el_ms=np.std(s_e2els or 0) * 1000,
median_s_e2el_ms=np.median(s_e2els or 0) * 1000,
- percentiles_s_e2el_ms=[(p, np.percentile(s_e2els or 0, p) * 1000)
- for p in selected_percentiles],
+ percentiles_s_e2el_ms=[(p, np.percentile(s_e2els or 0, p) * 1000) for p in selected_percentiles],
mean_input_len=np.mean(input_lens or 0) * 1,
std_input_len=np.std(input_lens or 0) * 1,
median_input_len=np.median(input_lens or 0) * 1,
- percentiles_input_len=[(p, np.percentile(input_lens or 0, p))
- for p in selected_percentiles],
+ percentiles_input_len=[(p, np.percentile(input_lens or 0, p)) for p in selected_percentiles],
mean_s_input_len=np.mean(infer_input_lens or 0) * 1,
std_s_input_len=np.std(infer_input_lens or 0) * 1,
median_s_input_len=np.median(infer_input_lens or 0) * 1,
- percentiles_s_input_len=[(p, np.percentile(infer_input_lens or 0, p))
- for p in selected_percentiles],
+ percentiles_s_input_len=[(p, np.percentile(infer_input_lens or 0, p)) for p in selected_percentiles],
mean_output_len=np.mean(actual_output_lens or 0) * 1,
std_output_len=np.std(actual_output_lens or 0) * 1,
median_output_len=np.median(actual_output_lens or 0) * 1,
- percentiles_output_len=[(p, np.percentile(actual_output_lens or 0, p))
- for p in selected_percentiles],
+ percentiles_output_len=[(p, np.percentile(actual_output_lens or 0, p)) for p in selected_percentiles],
)
return metrics, actual_output_lens
@@ -344,9 +329,11 @@ async def benchmark(
raise ValueError(f"Unknown backend: {backend}")
print("Starting initial single prompt test run...")
- test_prompt, test_output_len, test_no = \
- input_requests[0].prompt, \
- input_requests[0].expected_output_len, input_requests[0].no
+ test_prompt, test_output_len, test_no = (
+ input_requests[0].prompt,
+ input_requests[0].expected_output_len,
+ input_requests[0].no,
+ )
test_history_QA = input_requests[0].history_QA
test_input = RequestFuncInput(
@@ -373,27 +360,28 @@ async def benchmark(
if not test_output.success:
raise ValueError(
"Initial test run failed - Please make sure benchmark arguments "
- f"are correctly specified. Error: {test_output.error}")
+ f"are correctly specified. Error: {test_output.error}"
+ )
else:
print("Initial test run completed. Starting main benchmark run...")
if lora_modules:
# For each input request, choose a LoRA module at random.
- lora_modules = iter(
- [random.choice(lora_modules) \
- for _ in range(len(input_requests))])
+ lora_modules = iter([random.choice(lora_modules) for _ in range(len(input_requests))])
if profile:
print("Starting profiler...")
- profile_input = RequestFuncInput(model=model_id,
- model_name=model_name,
- prompt=test_prompt,
- no=test_no,
- api_url=base_url + "/start_profile",
- output_len=test_output_len,
- logprobs=logprobs,
- ignore_eos=ignore_eos,
- extra_body=extra_body)
+ profile_input = RequestFuncInput(
+ model=model_id,
+ model_name=model_name,
+ prompt=test_prompt,
+ no=test_no,
+ api_url=base_url + "/start_profile",
+ output_len=test_output_len,
+ logprobs=logprobs,
+ ignore_eos=ignore_eos,
+ extra_body=extra_body,
+ )
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
print("Profiler started")
@@ -413,21 +401,22 @@ async def benchmark(
# and it will simplify the code in limited_request_func.
# semaphore = (asyncio.Semaphore(max_concurrency)
# if max_concurrency else contextlib.nullcontext())
- semaphore = (asyncio.Semaphore(max_concurrency)
- if max_concurrency else None)
+ semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
async def limited_request_func(request_func_input, pbar):
if semaphore is None:
- return await request_func(request_func_input=request_func_input,
- pbar=pbar)
+ return await request_func(request_func_input=request_func_input, pbar=pbar)
async with semaphore:
- return await request_func(request_func_input=request_func_input,
- pbar=pbar)
+ return await request_func(request_func_input=request_func_input, pbar=pbar)
benchmark_start_time = time.perf_counter()
tasks: list[asyncio.Task] = []
async for request in get_request(input_requests, request_rate, burstiness):
- prompt, output_len, no = request.prompt, request.expected_output_len, request.no
+ prompt, output_len, no = (
+ request.prompt,
+ request.expected_output_len,
+ request.no,
+ )
history_QA = request.history_QA
req_model_id, req_model_name = model_id, model_name
@@ -435,22 +424,21 @@ async def benchmark(
req_lora_module = next(lora_modules)
req_model_id, req_model_name = req_lora_module, req_lora_module
- request_func_input = RequestFuncInput(model=req_model_id,
- model_name=req_model_name,
- prompt=prompt,
- no=no,
- prompt_len=0,
- history_QA=history_QA,
- hyper_parameters=hyper_parameters,
- api_url=api_url,
- output_len=output_len,
- logprobs=logprobs,
- ignore_eos=ignore_eos,
- extra_body=extra_body)
- tasks.append(
- asyncio.create_task(
- limited_request_func(request_func_input=request_func_input,
- pbar=pbar)))
+ request_func_input = RequestFuncInput(
+ model=req_model_id,
+ model_name=req_model_name,
+ prompt=prompt,
+ no=no,
+ prompt_len=0,
+ history_QA=history_QA,
+ hyper_parameters=hyper_parameters,
+ api_url=api_url,
+ output_len=output_len,
+ logprobs=logprobs,
+ ignore_eos=ignore_eos,
+ extra_body=extra_body,
+ )
+ tasks.append(asyncio.create_task(limited_request_func(request_func_input=request_func_input, pbar=pbar)))
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
if profile:
@@ -473,7 +461,6 @@ async def benchmark(
benchmark_duration = time.perf_counter() - benchmark_start_time
print("benchmark_duration:", benchmark_duration)
-
metrics, actual_output_lens = calculate_metrics(
input_requests=input_requests,
outputs=outputs,
@@ -483,22 +470,16 @@ async def benchmark(
goodput_config_dict=goodput_config_dict,
)
- print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
+ print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
- print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
- benchmark_duration))
+ print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
- print("{:<40} {:<10}".format("Total generated tokens:",
- metrics.total_output))
- print("{:<40} {:<10.3f}".format("Request throughput (req/s):",
- metrics.request_throughput))
+ print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
+ print("{:<40} {:<10.3f}".format("Request throughput (req/s):", metrics.request_throughput))
if goodput_config_dict:
- print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
- metrics.request_goodput))
- print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
- metrics.output_throughput))
- print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
- metrics.total_token_throughput))
+ print("{:<40} {:<10.2f}".format("Request goodput (req/s):", metrics.request_goodput))
+ print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", metrics.output_throughput))
+ print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", metrics.total_token_throughput))
result = {
"duration": benchmark_duration,
@@ -506,8 +487,7 @@ async def benchmark(
"total_input_tokens": metrics.total_input,
"total_output_tokens": metrics.total_output,
"request_throughput": metrics.request_throughput,
- "request_goodput:":
- metrics.request_goodput if goodput_config_dict else None,
+ "request_goodput:": (metrics.request_goodput if goodput_config_dict else None),
"output_throughput": metrics.output_throughput,
"total_token_throughput": metrics.total_token_throughput,
"input_lens": [output.prompt_len for output in outputs],
@@ -533,24 +513,25 @@ async def benchmark(
# metric.
if metric_attribute_name not in selected_percentile_metrics:
return
- print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
- print("{:<40} {:<10.2f}".format(
- f"Mean {metric_name} (ms):",
- getattr(metrics, f"mean_{metric_attribute_name}_ms")))
- print("{:<40} {:<10.2f}".format(
- f"Median {metric_name} (ms):",
- getattr(metrics, f"median_{metric_attribute_name}_ms")))
- result[f"mean_{metric_attribute_name}_ms"] = getattr(
- metrics, f"mean_{metric_attribute_name}_ms")
- result[f"median_{metric_attribute_name}_ms"] = getattr(
- metrics, f"median_{metric_attribute_name}_ms")
- result[f"std_{metric_attribute_name}_ms"] = getattr(
- metrics, f"std_{metric_attribute_name}_ms")
- for p, value in getattr(metrics,
- f"percentiles_{metric_attribute_name}_ms"):
+ print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
+ print(
+ "{:<40} {:<10.2f}".format(
+ f"Mean {metric_name} (ms):",
+ getattr(metrics, f"mean_{metric_attribute_name}_ms"),
+ )
+ )
+ print(
+ "{:<40} {:<10.2f}".format(
+ f"Median {metric_name} (ms):",
+ getattr(metrics, f"median_{metric_attribute_name}_ms"),
+ )
+ )
+ result[f"mean_{metric_attribute_name}_ms"] = getattr(metrics, f"mean_{metric_attribute_name}_ms")
+ result[f"median_{metric_attribute_name}_ms"] = getattr(metrics, f"median_{metric_attribute_name}_ms")
+ result[f"std_{metric_attribute_name}_ms"] = getattr(metrics, f"std_{metric_attribute_name}_ms")
+ for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"):
p_word = str(int(p)) if int(p) == p else str(p)
- print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):",
- value))
+ print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
result[f"p{p_word}_{metric_attribute_name}_ms"] = value
def process_one_length(
@@ -565,31 +546,31 @@ async def benchmark(
# metric.
if metric_attribute_name not in selected_percentile_metrics:
return
- print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
- print("{:<40} {:<10.2f}".format(
- f"Mean {metric_name}:",
- getattr(metrics, f"mean_{metric_attribute_name}")))
- print("{:<40} {:<10.2f}".format(
- f"Median {metric_name}:",
- getattr(metrics, f"median_{metric_attribute_name}")))
- result[f"mean_{metric_attribute_name}"] = getattr(
- metrics, f"mean_{metric_attribute_name}")
- result[f"median_{metric_attribute_name}"] = getattr(
- metrics, f"median_{metric_attribute_name}")
- result[f"std_{metric_attribute_name}"] = getattr(
- metrics, f"std_{metric_attribute_name}")
- for p, value in getattr(metrics,
- f"percentiles_{metric_attribute_name}"):
+ print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
+ print(
+ "{:<40} {:<10.2f}".format(
+ f"Mean {metric_name}:",
+ getattr(metrics, f"mean_{metric_attribute_name}"),
+ )
+ )
+ print(
+ "{:<40} {:<10.2f}".format(
+ f"Median {metric_name}:",
+ getattr(metrics, f"median_{metric_attribute_name}"),
+ )
+ )
+ result[f"mean_{metric_attribute_name}"] = getattr(metrics, f"mean_{metric_attribute_name}")
+ result[f"median_{metric_attribute_name}"] = getattr(metrics, f"median_{metric_attribute_name}")
+ result[f"std_{metric_attribute_name}"] = getattr(metrics, f"std_{metric_attribute_name}")
+ for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}"):
p_word = str(int(p)) if int(p) == p else str(p)
- print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name}:",
- value))
+ print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name}:", value))
result[f"p{p_word}_{metric_attribute_name}"] = value
process_one_length("s_decode", "Decode", "解码速度(tok/s)")
process_one_metric("ttft", "TTFT", "Time to First Token")
process_one_metric("s_ttft", "S_TTFT", "Infer Time to First Token")
- process_one_metric("tpot", "TPOT",
- "Time per Output Token (excl. 1st token)")
+ process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
process_one_metric("itl", "ITL", "Inter-token Latency")
process_one_metric("s_itl", "S_ITL", "Infer Inter-token Latency")
process_one_metric("e2el", "E2EL", "End-to-end Latency")
@@ -612,44 +593,37 @@ def benchmark_metrics(
):
"""Benchmark metrics statistics,generate benchmark result"""
outputs = []
- case_no_list = []
with open(result_file) as f:
for line in f.readlines():
if "RequestFuncOutput" in line:
start = line.find("RequestFuncOutput")
end = line.rfind(")")
- para_str = line[start:end + 1]
+ para_str = line[start : end + 1]
output = eval(para_str)
outputs.append(output)
-
+
input_requests = [[]] * len(outputs)
goodput_config_dict = check_goodput_args(args)
-
+
metrics, actual_output_lens = calculate_metrics(
input_requests=input_requests,
outputs=outputs,
- dur_s=benchmark_duration,
+ dur_s=benchmark_duration,
selected_percentiles=selected_percentiles,
goodput_config_dict=goodput_config_dict,
)
- print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
+ print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
- print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
- benchmark_duration))
+ print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
- print("{:<40} {:<10}".format("Total generated tokens:",
- metrics.total_output))
- print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
- metrics.request_throughput))
+ print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
+ print("{:<40} {:<10.2f}".format("Request throughput (req/s):", metrics.request_throughput))
if goodput_config_dict:
- print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
- metrics.request_goodput))
- print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
- metrics.output_throughput))
- print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
- metrics.total_token_throughput))
+ print("{:<40} {:<10.2f}".format("Request goodput (req/s):", metrics.request_goodput))
+ print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", metrics.output_throughput))
+ print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", metrics.total_token_throughput))
result = {
"duration": benchmark_duration,
@@ -657,8 +631,7 @@ def benchmark_metrics(
"total_input_tokens": metrics.total_input,
"total_output_tokens": metrics.total_output,
"request_throughput": metrics.request_throughput,
- "request_goodput:":
- metrics.request_goodput if goodput_config_dict else None,
+ "request_goodput:": (metrics.request_goodput if goodput_config_dict else None),
"output_throughput": metrics.output_throughput,
"total_token_throughput": metrics.total_token_throughput,
"input_lens": [output.prompt_len for output in outputs],
@@ -682,24 +655,25 @@ def benchmark_metrics(
# metric.
if metric_attribute_name not in selected_percentile_metrics:
return
- print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
- print("{:<40} {:<10.2f}".format(
- f"Mean {metric_name} (ms):",
- getattr(metrics, f"mean_{metric_attribute_name}_ms")))
- print("{:<40} {:<10.2f}".format(
- f"Median {metric_name} (ms):",
- getattr(metrics, f"median_{metric_attribute_name}_ms")))
- result[f"mean_{metric_attribute_name}_ms"] = getattr(
- metrics, f"mean_{metric_attribute_name}_ms")
- result[f"median_{metric_attribute_name}_ms"] = getattr(
- metrics, f"median_{metric_attribute_name}_ms")
- result[f"std_{metric_attribute_name}_ms"] = getattr(
- metrics, f"std_{metric_attribute_name}_ms")
- for p, value in getattr(metrics,
- f"percentiles_{metric_attribute_name}_ms"):
+ print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
+ print(
+ "{:<40} {:<10.2f}".format(
+ f"Mean {metric_name} (ms):",
+ getattr(metrics, f"mean_{metric_attribute_name}_ms"),
+ )
+ )
+ print(
+ "{:<40} {:<10.2f}".format(
+ f"Median {metric_name} (ms):",
+ getattr(metrics, f"median_{metric_attribute_name}_ms"),
+ )
+ )
+ result[f"mean_{metric_attribute_name}_ms"] = getattr(metrics, f"mean_{metric_attribute_name}_ms")
+ result[f"median_{metric_attribute_name}_ms"] = getattr(metrics, f"median_{metric_attribute_name}_ms")
+ result[f"std_{metric_attribute_name}_ms"] = getattr(metrics, f"std_{metric_attribute_name}_ms")
+ for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"):
p_word = str(int(p)) if int(p) == p else str(p)
- print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):",
- value))
+ print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
result[f"p{p_word}_{metric_attribute_name}_ms"] = value
def process_one_length(
@@ -714,31 +688,31 @@ def benchmark_metrics(
# metric.
if metric_attribute_name not in selected_percentile_metrics:
return
- print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
- print("{:<40} {:<10.2f}".format(
- f"Mean {metric_name}:",
- getattr(metrics, f"mean_{metric_attribute_name}")))
- print("{:<40} {:<10.2f}".format(
- f"Median {metric_name}:",
- getattr(metrics, f"median_{metric_attribute_name}")))
- result[f"mean_{metric_attribute_name}"] = getattr(
- metrics, f"mean_{metric_attribute_name}")
- result[f"median_{metric_attribute_name}"] = getattr(
- metrics, f"median_{metric_attribute_name}")
- result[f"std_{metric_attribute_name}"] = getattr(
- metrics, f"std_{metric_attribute_name}")
- for p, value in getattr(metrics,
- f"percentiles_{metric_attribute_name}"):
+ print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
+ print(
+ "{:<40} {:<10.2f}".format(
+ f"Mean {metric_name}:",
+ getattr(metrics, f"mean_{metric_attribute_name}"),
+ )
+ )
+ print(
+ "{:<40} {:<10.2f}".format(
+ f"Median {metric_name}:",
+ getattr(metrics, f"median_{metric_attribute_name}"),
+ )
+ )
+ result[f"mean_{metric_attribute_name}"] = getattr(metrics, f"mean_{metric_attribute_name}")
+ result[f"median_{metric_attribute_name}"] = getattr(metrics, f"median_{metric_attribute_name}")
+ result[f"std_{metric_attribute_name}"] = getattr(metrics, f"std_{metric_attribute_name}")
+ for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}"):
p_word = str(int(p)) if int(p) == p else str(p)
- print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name}:",
- value))
+ print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name}:", value))
result[f"p{p_word}_{metric_attribute_name}"] = value
process_one_length("s_decode", "Decode", "解码速度(tok/s)")
process_one_metric("ttft", "TTFT", "Time to First Token")
process_one_metric("s_ttft", "S_TTFT", "Infer Time to First Token")
- process_one_metric("tpot", "TPOT",
- "Time per Output Token (excl. 1st token)")
+ process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
process_one_metric("itl", "ITL", "Inter-token Latency")
process_one_metric("s_itl", "S_ITL", "Infer Inter-token Latency")
process_one_metric("e2el", "E2EL", "End-to-end Latency")
@@ -764,12 +738,14 @@ def check_goodput_args(args):
raise ValueError(
f"Invalid metric name found, {slo_name}: {slo_val}. "
"The service level objective name should be one of "
- f"{str(VALID_NAMES)}. ")
+ f"{VALID_NAMES!s}. "
+ )
if slo_val < 0:
raise ValueError(
f"Invalid value found, {slo_name}: {slo_val}. "
"The service level objective value should be "
- "non-negative.")
+ "non-negative."
+ )
return goodput_config_dict
@@ -783,32 +759,37 @@ def parse_goodput(slo_pairs):
except ValueError as err:
raise argparse.ArgumentTypeError(
"Invalid format found for service level objectives. "
- "Specify service level objectives for goodput as \"KEY:VALUE\" "
+ 'Specify service level objectives for goodput as "KEY:VALUE" '
"pairs, where the key is a metric name, and the value is a "
- "number in milliseconds.") from err
+ "number in milliseconds."
+ ) from err
return goodput_config_dict
-def save_to_pytorch_benchmark_format(args: argparse.Namespace,
- results: dict[str, Any],
- file_name: str) -> None:
+def save_to_pytorch_benchmark_format(args: argparse.Namespace, results: dict[str, Any], file_name: str) -> None:
"""Save the benchmarking results to PyTorch Benchmark Format JSON file"""
metrics = [
- "median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms",
- "mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms",
- "median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms"
+ "median_ttft_ms",
+ "mean_ttft_ms",
+ "std_ttft_ms",
+ "p99_ttft_ms",
+ "mean_tpot_ms",
+ "median_tpot_ms",
+ "std_tpot_ms",
+ "p99_tpot_ms",
+ "median_itl_ms",
+ "mean_itl_ms",
+ "std_itl_ms",
+ "p99_itl_ms",
]
# These raw data might be useful, but they are rather big. They can be added
# later if needed
ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
pt_records = convert_to_pytorch_benchmark_format(
args=args,
- metrics={k: [results[k]]
- for k in metrics},
- extra_info={
- k: results[k]
- for k in results if k not in metrics and k not in ignored_metrics
- })
+ metrics={k: [results[k]] for k in metrics},
+ extra_info={k: results[k] for k in results if k not in metrics and k not in ignored_metrics},
+ )
if pt_records:
# Don't use json suffix here as we don't want CI to pick it up
pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json"
@@ -825,7 +806,6 @@ def main(args: argparse.Namespace):
model_id = args.model
model_name = args.served_model_name
tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
- tokenizer_mode = args.tokenizer_mode
if args.base_url is not None:
api_url = f"{args.base_url}{args.endpoint}"
@@ -835,23 +815,17 @@ def main(args: argparse.Namespace):
base_url = f"http://{args.host}:{args.port}"
if args.dataset_name is None:
- raise ValueError(
- "Please specify '--dataset-name' and the corresponding "
- "'--dataset-path' if required.")
+ raise ValueError("Please specify '--dataset-name' and the corresponding " "'--dataset-path' if required.")
# For datasets that follow a similar structure, use a mapping.
dataset_mapping = {
- "EB":
- lambda: EBDataset(random_seed=args.seed,
- dataset_path=args.dataset_path).sample(
- num_requests=args.num_prompts,
- output_len=args.sharegpt_output_len,
+ "EB": lambda: EBDataset(random_seed=args.seed, dataset_path=args.dataset_path).sample(
+ num_requests=args.num_prompts,
+ output_len=args.sharegpt_output_len,
),
- "EBChat":
- lambda: EBChatDataset(random_seed=args.seed,
- dataset_path=args.dataset_path).sample(
- num_requests=args.num_prompts,
- output_len=args.sharegpt_output_len,
+ "EBChat": lambda: EBChatDataset(random_seed=args.seed, dataset_path=args.dataset_path).sample(
+ num_requests=args.num_prompts,
+ output_len=args.sharegpt_output_len,
),
}
@@ -869,15 +843,14 @@ def main(args: argparse.Namespace):
"top_p": args.top_p,
"top_k": args.top_k,
"min_p": args.min_p,
- "temperature": args.temperature
- }.items() if v is not None
+ "temperature": args.temperature,
+ }.items()
+ if v is not None
}
# Sampling parameters are only supported by openai-compatible backend.
if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
- raise ValueError(
- "Sampling parameters are only supported by openai-compatible "
- "backends.")
+ raise ValueError("Sampling parameters are only supported by openai-compatible " "backends.")
if "temperature" not in sampling_params:
sampling_params["temperature"] = 0.0 # Default to greedy decoding.
@@ -908,16 +881,15 @@ def main(args: argparse.Namespace):
disable_tqdm=args.disable_tqdm,
profile=args.profile,
selected_percentile_metrics=args.percentile_metrics.split(","),
- selected_percentiles=[
- float(p) for p in args.metric_percentiles.split(",")
- ],
+ selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
ignore_eos=args.ignore_eos,
goodput_config_dict=goodput_config_dict,
max_concurrency=args.max_concurrency,
lora_modules=args.lora_modules,
extra_body=sampling_params,
- ))
-
+ )
+ )
+
# benchmark_result = benchmark_metrics(
# benchmark_duration=3600,
# result_file="your result file",
@@ -947,22 +919,23 @@ def main(args: argparse.Namespace):
kvstring = item.split("=")
result_json[kvstring[0].strip()] = kvstring[1].strip()
else:
- raise ValueError(
- "Invalid metadata format. Please use KEY=VALUE format."
- )
+ raise ValueError("Invalid metadata format. Please use KEY=VALUE format.")
if not args.save_detailed:
# Remove fields with too many data points
for field in [
- "input_lens", "output_lens", "ttfts", "itls",
- "generated_texts", "errors"
+ "input_lens",
+ "output_lens",
+ "ttfts",
+ "itls",
+ "generated_texts",
+ "errors",
]:
if field in result_json:
del result_json[field]
# Traffic
- result_json["request_rate"] = (args.request_rate if args.request_rate
- < float("inf") else "inf")
+ result_json["request_rate"] = args.request_rate if args.request_rate < float("inf") else "inf"
result_json["burstiness"] = args.burstiness
result_json["max_concurrency"] = args.max_concurrency
@@ -971,21 +944,19 @@ def main(args: argparse.Namespace):
# Save to file
base_model_id = model_id.split("/")[-1]
- max_concurrency_str = (f"-concurrency{args.max_concurrency}"
- if args.max_concurrency is not None else "")
- file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" #noqa
+ max_concurrency_str = f"-concurrency{args.max_concurrency}" if args.max_concurrency is not None else ""
+ file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"
if args.result_filename:
file_name = args.result_filename
if args.result_dir:
file_name = os.path.join(args.result_dir, file_name)
- with open(file_name, "w", encoding='utf-8') as outfile:
+ with open(file_name, "w", encoding="utf-8") as outfile:
json.dump(result_json, outfile)
save_to_pytorch_benchmark_format(args, result_json, file_name)
if __name__ == "__main__":
- parser = FlexibleArgumentParser(
- description="Benchmark the online serving throughput.")
+ parser = FlexibleArgumentParser(description="Benchmark the online serving throughput.")
parser.add_argument(
"--backend",
type=str,
@@ -1011,18 +982,29 @@ if __name__ == "__main__":
"--dataset-name",
type=str,
default="sharegpt",
- choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "EB", "EBChat"],
+ choices=[
+ "sharegpt",
+ "burstgpt",
+ "sonnet",
+ "random",
+ "hf",
+ "EB",
+ "EBChat",
+ ],
help="Name of the dataset to benchmark on.",
)
- parser.add_argument("--dataset-path",
- type=str,
- default=None,
- help="Path to the sharegpt/sonnet dataset. "
- "Or the huggingface dataset ID if using HF dataset.")
- parser.add_argument("--hyperparameter-path",
- type=str,
- default=None,
- help="Path to the hyperparameter. ")
+ parser.add_argument(
+ "--dataset-path",
+ type=str,
+ default=None,
+ help="Path to the sharegpt/sonnet dataset. " "Or the huggingface dataset ID if using HF dataset.",
+ )
+ parser.add_argument(
+ "--hyperparameter-path",
+ type=str,
+ default=None,
+ help="Path to the hyperparameter. ",
+ )
parser.add_argument(
"--max-concurrency",
type=int,
@@ -1034,7 +1016,8 @@ if __name__ == "__main__":
"initiated, this argument will control how many are actually allowed "
"to execute at a time. This means that when used in combination, the "
"actual request rate may be lower than specified with --request-rate, "
- "if the server is not processing requests fast enough to keep up.")
+ "if the server is not processing requests fast enough to keep up.",
+ )
parser.add_argument(
"--model",
@@ -1045,7 +1028,7 @@ if __name__ == "__main__":
parser.add_argument(
"--tokenizer",
type=str,
- help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
+ help="Name or path of the tokenizer, if not using the default tokenizer.",
)
parser.add_argument("--use-beam-search", action="store_true")
parser.add_argument(
@@ -1058,11 +1041,13 @@ if __name__ == "__main__":
"--logprobs",
type=int,
default=None,
- help=("Number of logprobs-per-token to compute & return as part of "
- "the request. If unspecified, then either (1) if beam search "
- "is disabled, no logprobs are computed & a single dummy "
- "logprob is returned for each token; or (2) if beam search "
- "is enabled 1 logprob per token is computed"),
+ help=(
+ "Number of logprobs-per-token to compute & return as part of "
+ "the request. If unspecified, then either (1) if beam search "
+ "is disabled, no logprobs are computed & a single dummy "
+ "logprob is returned for each token; or (2) if beam search "
+ "is enabled 1 logprob per token is computed"
+ ),
)
parser.add_argument(
"--request-rate",
@@ -1099,8 +1084,7 @@ if __name__ == "__main__":
parser.add_argument(
"--profile",
action="store_true",
- help="Use Torch Profiler. The endpoint must be launched with "
- "VLLM_TORCH_PROFILER_DIR to enable profiler.",
+ help="Use Torch Profiler. The endpoint must be launched with " "VLLM_TORCH_PROFILER_DIR to enable profiler.",
)
parser.add_argument(
"--save-result",
@@ -1141,35 +1125,38 @@ if __name__ == "__main__":
"--ignore-eos",
action="store_true",
help="Set ignore_eos flag when sending the benchmark request."
- "Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
+ "Warning: ignore_eos is not supported in deepspeed_mii and tgi.",
+ )
parser.add_argument(
"--percentile-metrics",
type=str,
default="ttft,tpot,itl",
help="Comma-separated list of selected metrics to report percentils. "
"This argument specifies the metrics to report percentiles. "
- "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
- "Default value is \"ttft,tpot,itl\".")
+ 'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
+ 'Default value is "ttft,tpot,itl".',
+ )
parser.add_argument(
"--metric-percentiles",
type=str,
default="99",
help="Comma-separated list of percentiles for selected metrics. "
- "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
- "Default value is \"99\". "
- "Use \"--percentile-metrics\" to select metrics.",
+ 'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". '
+ 'Default value is "99". '
+ 'Use "--percentile-metrics" to select metrics.',
)
parser.add_argument(
"--goodput",
nargs="+",
required=False,
- help="Specify service level objectives for goodput as \"KEY:VALUE\" "
+ help='Specify service level objectives for goodput as "KEY:VALUE" '
"pairs, where the key is a metric name, and the value is in "
- "milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
+ 'milliseconds. Multiple "KEY:VALUE" pairs can be provided, '
"separated by spaces. Allowed request level metric names are "
- "\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
+ '"ttft", "tpot", "e2el". For more context on the definition of '
"goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
- "and the blog: https://hao-ai-lab.github.io/blogs/distserve")
+ "and the blog: https://hao-ai-lab.github.io/blogs/distserve",
+ )
# group for dataset specific arguments
sonnet_group = parser.add_argument_group("sonnet dataset options")
@@ -1197,8 +1184,8 @@ if __name__ == "__main__":
"--sharegpt-output-len",
type=int,
default=None,
- help="Output length for each request. Overrides the output length "
- "from the ShareGPT dataset.")
+ help="Output length for each request. Overrides the output length " "from the ShareGPT dataset.",
+ )
random_group = parser.add_argument_group("random dataset options")
random_group.add_argument(
@@ -1226,29 +1213,24 @@ if __name__ == "__main__":
"--random-prefix-len",
type=int,
default=0,
- help=("Number of fixed prefix tokens before the random context "
- "in a request. "
- "The total input length is the sum of `random-prefix-len` and "
- "a random "
- "context length sampled from [input_len * (1 - range_ratio), "
- "input_len * (1 + range_ratio)]."),
+ help=(
+ "Number of fixed prefix tokens before the random context "
+ "in a request. "
+ "The total input length is the sum of `random-prefix-len` and "
+ "a random "
+ "context length sampled from [input_len * (1 - range_ratio), "
+ "input_len * (1 + range_ratio)]."
+ ),
)
hf_group = parser.add_argument_group("hf dataset options")
- hf_group.add_argument("--hf-subset",
- type=str,
- default=None,
- help="Subset of the HF dataset.")
- hf_group.add_argument("--hf-split",
- type=str,
- default=None,
- help="Split of the HF dataset.")
+ hf_group.add_argument("--hf-subset", type=str, default=None, help="Subset of the HF dataset.")
+ hf_group.add_argument("--hf-split", type=str, default=None, help="Split of the HF dataset.")
hf_group.add_argument(
"--hf-output-len",
type=int,
default=None,
- help="Output length for each request. Overrides the output lengths "
- "from the sampled HF dataset.",
+ help="Output length for each request. Overrides the output lengths " "from the sampled HF dataset.",
)
sampling_group = parser.add_argument_group("sampling parameters")
@@ -1256,54 +1238,59 @@ if __name__ == "__main__":
"--top-p",
type=float,
default=None,
- help="Top-p sampling parameter. Only has effect on openai-compatible "
- "backends.")
+ help="Top-p sampling parameter. Only has effect on openai-compatible " "backends.",
+ )
sampling_group.add_argument(
"--top-k",
type=int,
default=None,
- help="Top-k sampling parameter. Only has effect on openai-compatible "
- "backends.")
+ help="Top-k sampling parameter. Only has effect on openai-compatible " "backends.",
+ )
sampling_group.add_argument(
"--min-p",
type=float,
default=None,
- help="Min-p sampling parameter. Only has effect on openai-compatible "
- "backends.")
+ help="Min-p sampling parameter. Only has effect on openai-compatible " "backends.",
+ )
sampling_group.add_argument(
"--temperature",
type=float,
default=None,
help="Temperature sampling parameter. Only has effect on "
"openai-compatible backends. If not specified, default to greedy "
- "decoding (i.e. temperature==0.0).")
+ "decoding (i.e. temperature==0.0).",
+ )
parser.add_argument(
- '--tokenizer-mode',
+ "--tokenizer-mode",
type=str,
default="auto",
- choices=['auto', 'slow', 'mistral', 'custom'],
+ choices=["auto", "slow", "mistral", "custom"],
help='The tokenizer mode.\n\n* "auto" will use the '
'fast tokenizer if available.\n* "slow" will '
- 'always use the slow tokenizer. \n* '
+ "always use the slow tokenizer. \n* "
'"mistral" will always use the `mistral_common` tokenizer. \n*'
- '"custom" will use --tokenizer to select the preregistered tokenizer.')
+ '"custom" will use --tokenizer to select the preregistered tokenizer.',
+ )
- parser.add_argument("--served-model-name",
- type=str,
- default=None,
- help="The model name used in the API. "
- "If not specified, the model name will be the "
- "same as the ``--model`` argument. ")
+ parser.add_argument(
+ "--served-model-name",
+ type=str,
+ default=None,
+ help="The model name used in the API. "
+ "If not specified, the model name will be the "
+ "same as the ``--model`` argument. ",
+ )
- parser.add_argument("--lora-modules",
- nargs='+',
- default=None,
- help="A subset of LoRA module names passed in when "
- "launching the server. For each request, the "
- "script chooses a LoRA module at random.")
+ parser.add_argument(
+ "--lora-modules",
+ nargs="+",
+ default=None,
+ help="A subset of LoRA module names passed in when "
+ "launching the server. For each request, the "
+ "script chooses a LoRA module at random.",
+ )
args = parser.parse_args()
main(args)
-
diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py
index 6c149bf5f..4eba58a3b 100644
--- a/benchmarks/benchmark_utils.py
+++ b/benchmarks/benchmark_utils.py
@@ -24,9 +24,11 @@ import os
from typing import Any
-def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
- metrics: dict[str, list],
- extra_info: dict[str, Any]) -> list:
+def convert_to_pytorch_benchmark_format(
+ args: argparse.Namespace,
+ metrics: dict[str, list],
+ extra_info: dict[str, Any],
+) -> list:
"""
Save the benchmark results in the format used by PyTorch OSS benchmark with
on metric per record
@@ -54,12 +56,10 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
},
}
- tp = record["benchmark"]["extra_info"]["args"].get(
- "tensor_parallel_size")
+ tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size")
# Save tensor_parallel_size parameter if it's part of the metadata
if not tp and "tensor_parallel_size" in extra_info:
- record["benchmark"]["extra_info"]["args"][
- "tensor_parallel_size"] = extra_info["tensor_parallel_size"]
+ record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = extra_info["tensor_parallel_size"]
records.append(record)
@@ -68,6 +68,7 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
class InfEncoder(json.JSONEncoder):
"""InfEncoder"""
+
def clear_inf(self, o: Any):
"""clear_inf"""
if isinstance(o, dict):
@@ -87,4 +88,3 @@ def write_to_json(filename: str, records: list) -> None:
"""write_to_json"""
with open(filename, "w") as f:
json.dump(records, f, cls=InfEncoder)
-
diff --git a/benchmarks/quick_benchmark.py b/benchmarks/quick_benchmark.py
index 7a2dbd877..899a14c54 100644
--- a/benchmarks/quick_benchmark.py
+++ b/benchmarks/quick_benchmark.py
@@ -25,32 +25,32 @@ import os
import random
import time
import warnings
-import yaml
-import requests
-import copy
+from argparse import ArgumentParser as FlexibleArgumentParser
from collections.abc import AsyncGenerator, Iterable
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Optional
import numpy as np
-from backend_request_func import (ASYNC_REQUEST_FUNCS,
- OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput,
- RequestFuncOutput)
+import requests
+import yaml
+from backend_request_func import (
+ ASYNC_REQUEST_FUNCS,
+ OPENAI_COMPATIBLE_BACKENDS,
+ RequestFuncInput,
+ RequestFuncOutput,
+)
+from benchmark_dataset import EBChatDataset, EBDataset, SampleRequest
+from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from tqdm.asyncio import tqdm
-from argparse import ArgumentParser as FlexibleArgumentParser
-
-from benchmark_dataset import (SampleRequest, EBDataset, EBChatDataset)
-from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
-
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
-
@dataclass
class BenchmarkMetrics:
"""Class containing all metrics that are used in this script"""
+
completed: int
total_input: int
total_output: int
@@ -133,8 +133,7 @@ async def get_request(
input_requests: Iterable[SampleRequest] = iter(input_requests)
# Calculate scale parameter theta to maintain the desired request_rate.
- assert burstiness > 0, (
- f"A positive burstiness factor is expected, but given {burstiness}.")
+ assert burstiness > 0, f"A positive burstiness factor is expected, but given {burstiness}."
theta = 1.0 / (request_rate * burstiness)
for request in input_requests:
@@ -160,7 +159,7 @@ def calculate_metrics(
) -> tuple[BenchmarkMetrics, list[int]]:
"""Calculates various performance metrics based on the inputs and outputs."""
input_lens: list[int] = []
- infer_input_lens: list[int] = [] # 推理侧输入token数
+ infer_input_lens: list[int] = [] # 推理侧输入token数
actual_output_lens: list[int] = []
total_input = 0
completed = 0
@@ -210,8 +209,9 @@ def calculate_metrics(
s_e2els.append(outputs[i].arrival_time[-1])
# 解码速度去掉首token
if len(outputs[i].arrival_time) > 2:
- s_decodes.append((outputs[i].output_tokens - 1) /
- (outputs[i].arrival_time[-1] - outputs[i].arrival_time[1]))
+ s_decodes.append(
+ (outputs[i].output_tokens - 1) / (outputs[i].arrival_time[-1] - outputs[i].arrival_time[1])
+ )
completed += 1
else:
actual_output_lens.append(0)
@@ -224,16 +224,13 @@ def calculate_metrics(
if "ttft" in goodput_config_dict:
valid_metrics.append(ttfts)
- slo_values.append(goodput_config_dict["ttft"] /
- MILLISECONDS_TO_SECONDS_CONVERSION)
+ slo_values.append(goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION)
if "tpot" in goodput_config_dict:
valid_metrics.append(all_tpots)
- slo_values.append(goodput_config_dict["tpot"] /
- MILLISECONDS_TO_SECONDS_CONVERSION)
+ slo_values.append(goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION)
if "e2el" in goodput_config_dict:
valid_metrics.append(e2els)
- slo_values.append(goodput_config_dict["e2el"] /
- MILLISECONDS_TO_SECONDS_CONVERSION)
+ slo_values.append(goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION)
for req_metric in zip(*valid_metrics):
is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
@@ -242,9 +239,9 @@ def calculate_metrics(
if completed == 0:
warnings.warn(
- "All requests failed. This is likely due to a misconfiguration "
- "on the benchmark arguments.",
- stacklevel=2)
+ "All requests failed. This is likely due to a misconfiguration " "on the benchmark arguments.",
+ stacklevel=2,
+ )
metrics = BenchmarkMetrics(
completed=completed,
total_input=total_input,
@@ -253,64 +250,50 @@ def calculate_metrics(
request_goodput=good_completed / dur_s,
output_throughput=sum(actual_output_lens) / dur_s,
total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
- mean_s_decode=np.mean(s_decodes or 0) *
- 1, # ttfts is empty if streaming is not supported by backend
+ mean_s_decode=np.mean(s_decodes or 0) * 1, # ttfts is empty if streaming is not supported by backend
std_s_decode=np.std(s_decodes or 0) * 1,
median_s_decode=np.median(s_decodes or 0) * 1,
- percentiles_s_decode=[(p, np.percentile(s_decodes or 0, p) * 1)
- for p in selected_percentiles],
- mean_ttft_ms=np.mean(ttfts or 0) *
- 1000, # ttfts is empty if streaming is not supported by backend
+ percentiles_s_decode=[(p, np.percentile(s_decodes or 0, p) * 1) for p in selected_percentiles],
+ mean_ttft_ms=np.mean(ttfts or 0) * 1000, # ttfts is empty if streaming is not supported by backend
std_ttft_ms=np.std(ttfts or 0) * 1000,
median_ttft_ms=np.median(ttfts or 0) * 1000,
- percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000)
- for p in selected_percentiles],
- mean_s_ttft_ms=np.mean(s_ttfts or 0) *
- 1000, # ttfts is empty if streaming is not supported by backend
+ percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles],
+ mean_s_ttft_ms=np.mean(s_ttfts or 0) * 1000, # ttfts is empty if streaming is not supported by backend
std_s_ttft_ms=np.std(s_ttfts or 0) * 1000,
median_s_ttft_ms=np.median(s_ttfts or 0) * 1000,
- percentiles_s_ttft_ms=[(p, np.percentile(s_ttfts or 0, p) * 1000)
- for p in selected_percentiles],
+ percentiles_s_ttft_ms=[(p, np.percentile(s_ttfts or 0, p) * 1000) for p in selected_percentiles],
mean_tpot_ms=np.mean(tpots or 0) * 1000,
std_tpot_ms=np.std(tpots or 0) * 1000,
median_tpot_ms=np.median(tpots or 0) * 1000,
- percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000)
- for p in selected_percentiles],
+ percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles],
mean_itl_ms=np.mean(itls or 0) * 1000,
std_itl_ms=np.std(itls or 0) * 1000,
median_itl_ms=np.median(itls or 0) * 1000,
- percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
- for p in selected_percentiles],
+ percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles],
mean_s_itl_ms=np.mean(s_itls or 0) * 1000,
std_s_itl_ms=np.std(s_itls or 0) * 1000,
median_s_itl_ms=np.median(s_itls or 0) * 1000,
- percentiles_s_itl_ms=[(p, np.percentile(s_itls or 0, p) * 1000)
- for p in selected_percentiles],
+ percentiles_s_itl_ms=[(p, np.percentile(s_itls or 0, p) * 1000) for p in selected_percentiles],
mean_e2el_ms=np.mean(e2els or 0) * 1000,
std_e2el_ms=np.std(e2els or 0) * 1000,
median_e2el_ms=np.median(e2els or 0) * 1000,
- percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
- for p in selected_percentiles],
+ percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles],
mean_s_e2el_ms=np.mean(s_e2els or 0) * 1000,
std_s_e2el_ms=np.std(s_e2els or 0) * 1000,
median_s_e2el_ms=np.median(s_e2els or 0) * 1000,
- percentiles_s_e2el_ms=[(p, np.percentile(s_e2els or 0, p) * 1000)
- for p in selected_percentiles],
+ percentiles_s_e2el_ms=[(p, np.percentile(s_e2els or 0, p) * 1000) for p in selected_percentiles],
mean_input_len=np.mean(input_lens or 0) * 1,
std_input_len=np.std(input_lens or 0) * 1,
median_input_len=np.median(input_lens or 0) * 1,
- percentiles_input_len=[(p, np.percentile(input_lens or 0, p))
- for p in selected_percentiles],
+ percentiles_input_len=[(p, np.percentile(input_lens or 0, p)) for p in selected_percentiles],
mean_s_input_len=np.mean(infer_input_lens or 0) * 1,
std_s_input_len=np.std(infer_input_lens or 0) * 1,
median_s_input_len=np.median(infer_input_lens or 0) * 1,
- percentiles_s_input_len=[(p, np.percentile(infer_input_lens or 0, p))
- for p in selected_percentiles],
+ percentiles_s_input_len=[(p, np.percentile(infer_input_lens or 0, p)) for p in selected_percentiles],
mean_output_len=np.mean(actual_output_lens or 0) * 1,
std_output_len=np.std(actual_output_lens or 0) * 1,
median_output_len=np.median(actual_output_lens or 0) * 1,
- percentiles_output_len=[(p, np.percentile(actual_output_lens or 0, p))
- for p in selected_percentiles],
+ percentiles_output_len=[(p, np.percentile(actual_output_lens or 0, p)) for p in selected_percentiles],
)
return metrics, actual_output_lens
@@ -351,20 +334,22 @@ async def benchmark(
if lora_modules:
# For each input request, choose a LoRA module at random.
- lora_modules = iter(
- [random.choice(lora_modules) \
- for _ in range(len(input_requests))])
+ lora_modules = iter([random.choice(lora_modules) for _ in range(len(input_requests))])
if profile:
print("Starting profiler...")
- profile_input = RequestFuncInput(model=model_id,
- model_name=model_name,
- prompt=test_prompt,
- api_url=base_url + "/start_profile",
- output_len=test_output_len,
- logprobs=logprobs,
- ignore_eos=ignore_eos,
- extra_body=extra_body)
+ test_prompt = None
+ test_output_len = None
+ profile_input = RequestFuncInput(
+ model=model_id,
+ model_name=model_name,
+ prompt=test_prompt,
+ api_url=base_url + "/start_profile",
+ output_len=test_output_len,
+ logprobs=logprobs,
+ ignore_eos=ignore_eos,
+ extra_body=extra_body,
+ )
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
print("Profiler started")
@@ -384,19 +369,16 @@ async def benchmark(
# and it will simplify the code in limited_request_func.
# semaphore = (asyncio.Semaphore(max_concurrency)
# if max_concurrency else contextlib.nullcontext())
- semaphore = (asyncio.Semaphore(max_concurrency)
- if max_concurrency else None)
+ semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
async def limited_request_func(request_func_input, pbar):
if semaphore is None:
- return await request_func(request_func_input=request_func_input,
- pbar=pbar)
+ return await request_func(request_func_input=request_func_input, pbar=pbar)
async with semaphore:
- return await request_func(request_func_input=request_func_input,
- pbar=pbar)
+ return await request_func(request_func_input=request_func_input, pbar=pbar)
benchmark_start_time = time.perf_counter()
-
+
print(f"开始时间:{datetime.now()}")
tasks: list[asyncio.Task] = []
async for request in get_request(input_requests, request_rate, burstiness):
@@ -409,25 +391,26 @@ async def benchmark(
req_lora_module = next(lora_modules)
req_model_id, req_model_name = req_lora_module, req_lora_module
- request_func_input = RequestFuncInput(model=req_model_id,
- model_name=req_model_name,
- prompt=prompt,
- prompt_len=0,
- history_QA=history_QA,
- hyper_parameters=hyper_parameters,
- api_url=api_url,
- output_len=output_len,
- logprobs=logprobs,
- ignore_eos=ignore_eos,
- extra_body=extra_body)
- tasks.append(
- asyncio.create_task(
- limited_request_func(request_func_input=request_func_input,
- pbar=pbar)))
+ request_func_input = RequestFuncInput(
+ model=req_model_id,
+ model_name=req_model_name,
+ prompt=prompt,
+ prompt_len=0,
+ history_QA=history_QA,
+ hyper_parameters=hyper_parameters,
+ api_url=api_url,
+ output_len=output_len,
+ logprobs=logprobs,
+ ignore_eos=ignore_eos,
+ extra_body=extra_body,
+ )
+ tasks.append(asyncio.create_task(limited_request_func(request_func_input=request_func_input, pbar=pbar)))
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
print(f"完成时间:{datetime.now()}")
if profile:
print("Stopping profiler...")
+ test_output_len = None
profile_input = RequestFuncInput(
model=model_id,
prompt=test_prompt,
@@ -454,22 +437,16 @@ async def benchmark(
)
print("Benchmark complete!!!")
- print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
+ print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
- print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
- benchmark_duration))
+ print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
- print("{:<40} {:<10}".format("Total generated tokens:",
- metrics.total_output))
- print("{:<40} {:<10.3f}".format("Request throughput (req/s):",
- metrics.request_throughput))
+ print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
+ print("{:<40} {:<10.3f}".format("Request throughput (req/s):", metrics.request_throughput))
if goodput_config_dict:
- print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
- metrics.request_goodput))
- print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
- metrics.output_throughput))
- print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
- metrics.total_token_throughput))
+ print("{:<40} {:<10.2f}".format("Request goodput (req/s):", metrics.request_goodput))
+ print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", metrics.output_throughput))
+ print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", metrics.total_token_throughput))
result = {
"duration": benchmark_duration,
@@ -477,8 +454,7 @@ async def benchmark(
"total_input_tokens": metrics.total_input,
"total_output_tokens": metrics.total_output,
"request_throughput": metrics.request_throughput,
- "request_goodput:":
- metrics.request_goodput if goodput_config_dict else None,
+ "request_goodput:": (metrics.request_goodput if goodput_config_dict else None),
"output_throughput": metrics.output_throughput,
"total_token_throughput": metrics.total_token_throughput,
"input_lens": [output.prompt_len for output in outputs],
@@ -491,7 +467,6 @@ async def benchmark(
"reasoning_contents": [output.reasoning_content for output in outputs],
"errors": [output.error for output in outputs],
}
- quick_result = copy.deepcopy(result)
def process_one_metric(
# E.g., "ttft"
@@ -505,24 +480,25 @@ async def benchmark(
# metric.
if metric_attribute_name not in selected_percentile_metrics:
return
- print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
- print("{:<40} {:<10.2f}".format(
- f"Mean {metric_name} (ms):",
- getattr(metrics, f"mean_{metric_attribute_name}_ms")))
- print("{:<40} {:<10.2f}".format(
- f"Median {metric_name} (ms):",
- getattr(metrics, f"median_{metric_attribute_name}_ms")))
- result[f"mean_{metric_attribute_name}_ms"] = getattr(
- metrics, f"mean_{metric_attribute_name}_ms")
- result[f"median_{metric_attribute_name}_ms"] = getattr(
- metrics, f"median_{metric_attribute_name}_ms")
- result[f"std_{metric_attribute_name}_ms"] = getattr(
- metrics, f"std_{metric_attribute_name}_ms")
- for p, value in getattr(metrics,
- f"percentiles_{metric_attribute_name}_ms"):
+ print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
+ print(
+ "{:<40} {:<10.2f}".format(
+ f"Mean {metric_name} (ms):",
+ getattr(metrics, f"mean_{metric_attribute_name}_ms"),
+ )
+ )
+ print(
+ "{:<40} {:<10.2f}".format(
+ f"Median {metric_name} (ms):",
+ getattr(metrics, f"median_{metric_attribute_name}_ms"),
+ )
+ )
+ result[f"mean_{metric_attribute_name}_ms"] = getattr(metrics, f"mean_{metric_attribute_name}_ms")
+ result[f"median_{metric_attribute_name}_ms"] = getattr(metrics, f"median_{metric_attribute_name}_ms")
+ result[f"std_{metric_attribute_name}_ms"] = getattr(metrics, f"std_{metric_attribute_name}_ms")
+ for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"):
p_word = str(int(p)) if int(p) == p else str(p)
- print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):",
- value))
+ print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
result[f"p{p_word}_{metric_attribute_name}_ms"] = value
def process_one_length(
@@ -537,31 +513,31 @@ async def benchmark(
# metric.
if metric_attribute_name not in selected_percentile_metrics:
return
- print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
- print("{:<40} {:<10.2f}".format(
- f"Mean {metric_name}:",
- getattr(metrics, f"mean_{metric_attribute_name}")))
- print("{:<40} {:<10.2f}".format(
- f"Median {metric_name}:",
- getattr(metrics, f"median_{metric_attribute_name}")))
- result[f"mean_{metric_attribute_name}"] = getattr(
- metrics, f"mean_{metric_attribute_name}")
- result[f"median_{metric_attribute_name}"] = getattr(
- metrics, f"median_{metric_attribute_name}")
- result[f"std_{metric_attribute_name}"] = getattr(
- metrics, f"std_{metric_attribute_name}")
- for p, value in getattr(metrics,
- f"percentiles_{metric_attribute_name}"):
+ print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
+ print(
+ "{:<40} {:<10.2f}".format(
+ f"Mean {metric_name}:",
+ getattr(metrics, f"mean_{metric_attribute_name}"),
+ )
+ )
+ print(
+ "{:<40} {:<10.2f}".format(
+ f"Median {metric_name}:",
+ getattr(metrics, f"median_{metric_attribute_name}"),
+ )
+ )
+ result[f"mean_{metric_attribute_name}"] = getattr(metrics, f"mean_{metric_attribute_name}")
+ result[f"median_{metric_attribute_name}"] = getattr(metrics, f"median_{metric_attribute_name}")
+ result[f"std_{metric_attribute_name}"] = getattr(metrics, f"std_{metric_attribute_name}")
+ for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}"):
p_word = str(int(p)) if int(p) == p else str(p)
- print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name}:",
- value))
+ print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name}:", value))
result[f"p{p_word}_{metric_attribute_name}"] = value
process_one_length("s_decode", "Decode", "解码速度(tok/s)")
process_one_metric("ttft", "TTFT", "Time to First Token")
process_one_metric("s_ttft", "S_TTFT", "Infer Time to First Token")
- process_one_metric("tpot", "TPOT",
- "Time per Output Token (excl. 1st token)")
+ process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
process_one_metric("itl", "ITL", "Inter-token Latency")
process_one_metric("s_itl", "S_ITL", "Infer Inter-token Latency")
process_one_metric("e2el", "E2EL", "End-to-end Latency")
@@ -581,6 +557,7 @@ def quick_summary(quick_result, selected_percentile_metrics, metrics):
"""
快速评估
"""
+
def process_quick_metric(
metric_attribute_name: str,
metric_name: str,
@@ -588,7 +565,7 @@ def quick_summary(quick_result, selected_percentile_metrics, metrics):
):
if metric_attribute_name not in selected_percentile_metrics:
return
- print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
+ print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
mean_value = getattr(metrics, f"mean_{metric_attribute_name}_ms")
print("{:<40} {:<10.2f}".format(f"Mean {metric_name} (ms):", mean_value))
quick_result[f"mean_{metric_attribute_name}_ms"] = mean_value
@@ -600,17 +577,17 @@ def quick_summary(quick_result, selected_percentile_metrics, metrics):
):
if metric_attribute_name not in selected_percentile_metrics:
return
- print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-'))
+ print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
mean_value = getattr(metrics, f"mean_{metric_attribute_name}")
print("{:<40} {:<10.2f}".format(f"Mean {metric_name}:", mean_value))
quick_result[f"mean_{metric_attribute_name}"] = mean_value
+
print("\n\n\n")
- print("{s:{c}^{n}}".format(s=' Benchmark Quick Summary ', n=50, c='='))
+ print("{s:{c}^{n}}".format(s=" Benchmark Quick Summary ", n=50, c="="))
process_quick_length("s_decode", "Decode", "解码速度(tok/s)")
process_quick_metric("ttft", "TTFT", "Time to First Token")
process_quick_metric("s_ttft", "S_TTFT", "Infer Time to First Token")
- process_quick_metric("tpot", "TPOT",
- "Time per Output Token (excl. 1st token)")
+ process_quick_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
process_quick_metric("itl", "ITL", "Inter-token Latency")
process_quick_metric("s_itl", "S_ITL", "Infer Inter-token Latency")
process_quick_metric("e2el", "E2EL", "End-to-end Latency")
@@ -633,12 +610,14 @@ def check_goodput_args(args):
raise ValueError(
f"Invalid metric name found, {slo_name}: {slo_val}. "
"The service level objective name should be one of "
- f"{str(VALID_NAMES)}. ")
+ f"{VALID_NAMES!s}. "
+ )
if slo_val < 0:
raise ValueError(
f"Invalid value found, {slo_name}: {slo_val}. "
"The service level objective value should be "
- "non-negative.")
+ "non-negative."
+ )
return goodput_config_dict
@@ -652,37 +631,43 @@ def parse_goodput(slo_pairs):
except ValueError as err:
raise argparse.ArgumentTypeError(
"Invalid format found for service level objectives. "
- "Specify service level objectives for goodput as \"KEY:VALUE\" "
+ 'Specify service level objectives for goodput as "KEY:VALUE" '
"pairs, where the key is a metric name, and the value is a "
- "number in milliseconds.") from err
+ "number in milliseconds."
+ ) from err
return goodput_config_dict
-def save_to_pytorch_benchmark_format(args: argparse.Namespace,
- results: dict[str, Any],
- file_name: str) -> None:
+def save_to_pytorch_benchmark_format(args: argparse.Namespace, results: dict[str, Any], file_name: str) -> None:
"""Save the benchmarking results to PyTorch Benchmark Format JSON file"""
metrics = [
- "median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms",
- "mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms",
- "median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms"
+ "median_ttft_ms",
+ "mean_ttft_ms",
+ "std_ttft_ms",
+ "p99_ttft_ms",
+ "mean_tpot_ms",
+ "median_tpot_ms",
+ "std_tpot_ms",
+ "p99_tpot_ms",
+ "median_itl_ms",
+ "mean_itl_ms",
+ "std_itl_ms",
+ "p99_itl_ms",
]
# These raw data might be useful, but they are rather big. They can be added
# later if needed
ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
pt_records = convert_to_pytorch_benchmark_format(
args=args,
- metrics={k: [results[k]]
- for k in metrics},
- extra_info={
- k: results[k]
- for k in results if k not in metrics and k not in ignored_metrics
- })
+ metrics={k: [results[k]] for k in metrics},
+ extra_info={k: results[k] for k in results if k not in metrics and k not in ignored_metrics},
+ )
if pt_records:
# Don't use json suffix here as we don't want CI to pick it up
pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json"
write_to_json(pt_file, pt_records)
+
def check_health(api_base_url: str) -> bool:
health_url = api_base_url.rstrip("/") + "/health"
try:
@@ -697,6 +682,7 @@ def check_health(api_base_url: str) -> bool:
print(f"[HEALTH] Failed to connect to {health_url}: {e}")
return False
+
def main(args: argparse.Namespace):
"""Main entry point"""
print(args)
@@ -707,7 +693,6 @@ def main(args: argparse.Namespace):
model_id = args.model
model_name = args.served_model_name
tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
- tokenizer_mode = args.tokenizer_mode
if args.base_url is not None:
api_url = f"{args.base_url}{args.endpoint}"
@@ -717,23 +702,17 @@ def main(args: argparse.Namespace):
base_url = f"http://{args.host}:{args.port}"
if args.dataset_name is None:
- raise ValueError(
- "Please specify '--dataset-name' and the corresponding "
- "'--dataset-path' if required.")
+ raise ValueError("Please specify '--dataset-name' and the corresponding " "'--dataset-path' if required.")
# For datasets that follow a similar structure, use a mapping.
dataset_mapping = {
- "EB":
- lambda: EBDataset(random_seed=args.seed,
- dataset_path=args.dataset_path).sample(
- num_requests=args.num_prompts,
- output_len=args.sharegpt_output_len,
+ "EB": lambda: EBDataset(random_seed=args.seed, dataset_path=args.dataset_path).sample(
+ num_requests=args.num_prompts,
+ output_len=args.sharegpt_output_len,
),
- "EBChat":
- lambda: EBChatDataset(random_seed=args.seed,
- dataset_path=args.dataset_path).sample(
- num_requests=args.num_prompts,
- output_len=args.sharegpt_output_len,
+ "EBChat": lambda: EBChatDataset(random_seed=args.seed, dataset_path=args.dataset_path).sample(
+ num_requests=args.num_prompts,
+ output_len=args.sharegpt_output_len,
),
}
@@ -751,15 +730,14 @@ def main(args: argparse.Namespace):
"top_p": args.top_p,
"top_k": args.top_k,
"min_p": args.min_p,
- "temperature": args.temperature
- }.items() if v is not None
+ "temperature": args.temperature,
+ }.items()
+ if v is not None
}
# Sampling parameters are only supported by openai-compatible backend.
if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS:
- raise ValueError(
- "Sampling parameters are only supported by openai-compatible "
- "backends.")
+ raise ValueError("Sampling parameters are only supported by openai-compatible " "backends.")
if "temperature" not in sampling_params:
sampling_params["temperature"] = 0.0 # Default to greedy decoding.
@@ -790,15 +768,14 @@ def main(args: argparse.Namespace):
disable_tqdm=args.disable_tqdm,
profile=args.profile,
selected_percentile_metrics=args.percentile_metrics.split(","),
- selected_percentiles=[
- float(p) for p in args.metric_percentiles.split(",")
- ],
+ selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
ignore_eos=args.ignore_eos,
goodput_config_dict=goodput_config_dict,
max_concurrency=args.max_concurrency,
lora_modules=args.lora_modules,
extra_body=sampling_params,
- ))
+ )
+ )
# Save config and results to json
if args.save_result:
@@ -819,22 +796,23 @@ def main(args: argparse.Namespace):
kvstring = item.split("=")
result_json[kvstring[0].strip()] = kvstring[1].strip()
else:
- raise ValueError(
- "Invalid metadata format. Please use KEY=VALUE format."
- )
+ raise ValueError("Invalid metadata format. Please use KEY=VALUE format.")
if not args.save_detailed:
# Remove fields with too many data points
for field in [
- "input_lens", "output_lens", "ttfts", "itls",
- "generated_texts", "errors"
+ "input_lens",
+ "output_lens",
+ "ttfts",
+ "itls",
+ "generated_texts",
+ "errors",
]:
if field in result_json:
del result_json[field]
# Traffic
- result_json["request_rate"] = (args.request_rate if args.request_rate
- < float("inf") else "inf")
+ result_json["request_rate"] = args.request_rate if args.request_rate < float("inf") else "inf"
result_json["burstiness"] = args.burstiness
result_json["max_concurrency"] = args.max_concurrency
@@ -843,21 +821,19 @@ def main(args: argparse.Namespace):
# Save to file
base_model_id = model_id.split("/")[-1]
- max_concurrency_str = (f"-concurrency{args.max_concurrency}"
- if args.max_concurrency is not None else "")
- file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" #noqa
+ max_concurrency_str = f"-concurrency{args.max_concurrency}" if args.max_concurrency is not None else ""
+ file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"
if args.result_filename:
file_name = args.result_filename
if args.result_dir:
file_name = os.path.join(args.result_dir, file_name)
- with open(file_name, "w", encoding='utf-8') as outfile:
+ with open(file_name, "w", encoding="utf-8") as outfile:
json.dump(result_json, outfile)
save_to_pytorch_benchmark_format(args, result_json, file_name)
if __name__ == "__main__":
- parser = FlexibleArgumentParser(
- description="Benchmark the online serving throughput.")
+ parser = FlexibleArgumentParser(description="Benchmark the online serving throughput.")
parser.add_argument(
"--backend",
type=str,
@@ -883,18 +859,29 @@ if __name__ == "__main__":
"--dataset-name",
type=str,
default="sharegpt",
- choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "EB", "EBChat"],
+ choices=[
+ "sharegpt",
+ "burstgpt",
+ "sonnet",
+ "random",
+ "hf",
+ "EB",
+ "EBChat",
+ ],
help="Name of the dataset to benchmark on.",
)
- parser.add_argument("--dataset-path",
- type=str,
- default=None,
- help="Path to the sharegpt/sonnet dataset. "
- "Or the huggingface dataset ID if using HF dataset.")
- parser.add_argument("--hyperparameter-path",
- type=str,
- default=None,
- help="Path to the hyperparameter. ")
+ parser.add_argument(
+ "--dataset-path",
+ type=str,
+ default=None,
+ help="Path to the sharegpt/sonnet dataset. " "Or the huggingface dataset ID if using HF dataset.",
+ )
+ parser.add_argument(
+ "--hyperparameter-path",
+ type=str,
+ default=None,
+ help="Path to the hyperparameter. ",
+ )
parser.add_argument(
"--max-concurrency",
type=int,
@@ -906,7 +893,8 @@ if __name__ == "__main__":
"initiated, this argument will control how many are actually allowed "
"to execute at a time. This means that when used in combination, the "
"actual request rate may be lower than specified with --request-rate, "
- "if the server is not processing requests fast enough to keep up.")
+ "if the server is not processing requests fast enough to keep up.",
+ )
parser.add_argument(
"--model",
@@ -917,7 +905,7 @@ if __name__ == "__main__":
parser.add_argument(
"--tokenizer",
type=str,
- help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
+ help="Name or path of the tokenizer, if not using the default tokenizer.",
)
parser.add_argument("--use-beam-search", action="store_true")
parser.add_argument(
@@ -930,11 +918,13 @@ if __name__ == "__main__":
"--logprobs",
type=int,
default=None,
- help=("Number of logprobs-per-token to compute & return as part of "
- "the request. If unspecified, then either (1) if beam search "
- "is disabled, no logprobs are computed & a single dummy "
- "logprob is returned for each token; or (2) if beam search "
- "is enabled 1 logprob per token is computed"),
+ help=(
+ "Number of logprobs-per-token to compute & return as part of "
+ "the request. If unspecified, then either (1) if beam search "
+ "is disabled, no logprobs are computed & a single dummy "
+ "logprob is returned for each token; or (2) if beam search "
+ "is enabled 1 logprob per token is computed"
+ ),
)
parser.add_argument(
"--request-rate",
@@ -971,8 +961,7 @@ if __name__ == "__main__":
parser.add_argument(
"--profile",
action="store_true",
- help="Use Torch Profiler. The endpoint must be launched with "
- "VLLM_TORCH_PROFILER_DIR to enable profiler.",
+ help="Use Torch Profiler. The endpoint must be launched with " "VLLM_TORCH_PROFILER_DIR to enable profiler.",
)
parser.add_argument(
"--save-result",
@@ -1013,35 +1002,38 @@ if __name__ == "__main__":
"--ignore-eos",
action="store_true",
help="Set ignore_eos flag when sending the benchmark request."
- "Warning: ignore_eos is not supported in deepspeed_mii and tgi.")
+ "Warning: ignore_eos is not supported in deepspeed_mii and tgi.",
+ )
parser.add_argument(
"--percentile-metrics",
type=str,
default="ttft,tpot,itl",
help="Comma-separated list of selected metrics to report percentils. "
"This argument specifies the metrics to report percentiles. "
- "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
- "Default value is \"ttft,tpot,itl\".")
+ 'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
+ 'Default value is "ttft,tpot,itl".',
+ )
parser.add_argument(
"--metric-percentiles",
type=str,
default="99",
help="Comma-separated list of percentiles for selected metrics. "
- "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". "
- "Default value is \"99\". "
- "Use \"--percentile-metrics\" to select metrics.",
+ 'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". '
+ 'Default value is "99". '
+ 'Use "--percentile-metrics" to select metrics.',
)
parser.add_argument(
"--goodput",
nargs="+",
required=False,
- help="Specify service level objectives for goodput as \"KEY:VALUE\" "
+ help='Specify service level objectives for goodput as "KEY:VALUE" '
"pairs, where the key is a metric name, and the value is in "
- "milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
+ 'milliseconds. Multiple "KEY:VALUE" pairs can be provided, '
"separated by spaces. Allowed request level metric names are "
- "\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
+ '"ttft", "tpot", "e2el". For more context on the definition of '
"goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
- "and the blog: https://hao-ai-lab.github.io/blogs/distserve")
+ "and the blog: https://hao-ai-lab.github.io/blogs/distserve",
+ )
# group for dataset specific arguments
sonnet_group = parser.add_argument_group("sonnet dataset options")
@@ -1069,8 +1061,8 @@ if __name__ == "__main__":
"--sharegpt-output-len",
type=int,
default=None,
- help="Output length for each request. Overrides the output length "
- "from the ShareGPT dataset.")
+ help="Output length for each request. Overrides the output length " "from the ShareGPT dataset.",
+ )
random_group = parser.add_argument_group("random dataset options")
random_group.add_argument(
@@ -1098,29 +1090,24 @@ if __name__ == "__main__":
"--random-prefix-len",
type=int,
default=0,
- help=("Number of fixed prefix tokens before the random context "
- "in a request. "
- "The total input length is the sum of `random-prefix-len` and "
- "a random "
- "context length sampled from [input_len * (1 - range_ratio), "
- "input_len * (1 + range_ratio)]."),
+ help=(
+ "Number of fixed prefix tokens before the random context "
+ "in a request. "
+ "The total input length is the sum of `random-prefix-len` and "
+ "a random "
+ "context length sampled from [input_len * (1 - range_ratio), "
+ "input_len * (1 + range_ratio)]."
+ ),
)
hf_group = parser.add_argument_group("hf dataset options")
- hf_group.add_argument("--hf-subset",
- type=str,
- default=None,
- help="Subset of the HF dataset.")
- hf_group.add_argument("--hf-split",
- type=str,
- default=None,
- help="Split of the HF dataset.")
+ hf_group.add_argument("--hf-subset", type=str, default=None, help="Subset of the HF dataset.")
+ hf_group.add_argument("--hf-split", type=str, default=None, help="Split of the HF dataset.")
hf_group.add_argument(
"--hf-output-len",
type=int,
default=None,
- help="Output length for each request. Overrides the output lengths "
- "from the sampled HF dataset.",
+ help="Output length for each request. Overrides the output lengths " "from the sampled HF dataset.",
)
sampling_group = parser.add_argument_group("sampling parameters")
@@ -1128,52 +1115,58 @@ if __name__ == "__main__":
"--top-p",
type=float,
default=None,
- help="Top-p sampling parameter. Only has effect on openai-compatible "
- "backends.")
+ help="Top-p sampling parameter. Only has effect on openai-compatible " "backends.",
+ )
sampling_group.add_argument(
"--top-k",
type=int,
default=None,
- help="Top-k sampling parameter. Only has effect on openai-compatible "
- "backends.")
+ help="Top-k sampling parameter. Only has effect on openai-compatible " "backends.",
+ )
sampling_group.add_argument(
"--min-p",
type=float,
default=None,
- help="Min-p sampling parameter. Only has effect on openai-compatible "
- "backends.")
+ help="Min-p sampling parameter. Only has effect on openai-compatible " "backends.",
+ )
sampling_group.add_argument(
"--temperature",
type=float,
default=None,
help="Temperature sampling parameter. Only has effect on "
"openai-compatible backends. If not specified, default to greedy "
- "decoding (i.e. temperature==0.0).")
+ "decoding (i.e. temperature==0.0).",
+ )
parser.add_argument(
- '--tokenizer-mode',
+ "--tokenizer-mode",
type=str,
default="auto",
- choices=['auto', 'slow', 'mistral', 'custom'],
+ choices=["auto", "slow", "mistral", "custom"],
help='The tokenizer mode.\n\n* "auto" will use the '
'fast tokenizer if available.\n* "slow" will '
- 'always use the slow tokenizer. \n* '
+ "always use the slow tokenizer. \n* "
'"mistral" will always use the `mistral_common` tokenizer. \n*'
- '"custom" will use --tokenizer to select the preregistered tokenizer.')
+ '"custom" will use --tokenizer to select the preregistered tokenizer.',
+ )
- parser.add_argument("--served-model-name",
- type=str,
- default=None,
- help="The model name used in the API. "
- "If not specified, the model name will be the "
- "same as the ``--model`` argument. ")
+ parser.add_argument(
+ "--served-model-name",
+ type=str,
+ default=None,
+ help="The model name used in the API. "
+ "If not specified, the model name will be the "
+ "same as the ``--model`` argument. ",
+ )
- parser.add_argument("--lora-modules",
- nargs='+',
- default=None,
- help="A subset of LoRA module names passed in when "
- "launching the server. For each request, the "
- "script chooses a LoRA module at random.")
+ parser.add_argument(
+ "--lora-modules",
+ nargs="+",
+ default=None,
+ help="A subset of LoRA module names passed in when "
+ "launching the server. For each request, the "
+ "script chooses a LoRA module at random.",
+ )
args = parser.parse_args()
diff --git a/benchmarks/yaml/eb45-21B-vl-128k-wint4-h800-tp1.yaml b/benchmarks/yaml/eb45-21B-vl-128k-wint4-h800-tp1.yaml
index db8a20b86..ffa5ceac3 100644
--- a/benchmarks/yaml/eb45-21B-vl-128k-wint4-h800-tp1.yaml
+++ b/benchmarks/yaml/eb45-21B-vl-128k-wint4-h800-tp1.yaml
@@ -7,4 +7,4 @@ tensor_parallel_size: 1
enable_chunked_prefill: True
max_num_batched_tokens: 384
quantization: wint4
-reasoning_parser: ernie-45-vl
\ No newline at end of file
+reasoning_parser: ernie-45-vl
diff --git a/benchmarks/yaml/eb45-32k-w4a8c8-tp4_decode.yaml b/benchmarks/yaml/eb45-32k-w4a8c8-tp4_decode.yaml
index 957f59d2a..985ef7a34 100644
--- a/benchmarks/yaml/eb45-32k-w4a8c8-tp4_decode.yaml
+++ b/benchmarks/yaml/eb45-32k-w4a8c8-tp4_decode.yaml
@@ -12,4 +12,4 @@ rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"
max_num_batched_tokens: 384
max_num_partial_prefills: 3
-max_long_partial_prefills: 3
\ No newline at end of file
+max_long_partial_prefills: 3
diff --git a/benchmarks/yaml/eb45-32k-w4a8c8-tp4_prefill.yaml b/benchmarks/yaml/eb45-32k-w4a8c8-tp4_prefill.yaml
index c1466160d..2831838fd 100644
--- a/benchmarks/yaml/eb45-32k-w4a8c8-tp4_prefill.yaml
+++ b/benchmarks/yaml/eb45-32k-w4a8c8-tp4_prefill.yaml
@@ -9,4 +9,4 @@ cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
-pd_comm_port: "2333"
\ No newline at end of file
+pd_comm_port: "2333"
diff --git a/benchmarks/yaml/eb45-32k-wint4-h800-dp8_prefill.yaml b/benchmarks/yaml/eb45-32k-wint4-h800-dp8_prefill.yaml
index e6d0fa6e0..b7c26ac39 100644
--- a/benchmarks/yaml/eb45-32k-wint4-h800-dp8_prefill.yaml
+++ b/benchmarks/yaml/eb45-32k-wint4-h800-dp8_prefill.yaml
@@ -10,4 +10,4 @@ engine_worker_queue_port: 6677
num_gpu_blocks_override: 1024
cache_transfer_protocol: "rdma"
rdma_comm_ports: "7671,7672,7673,7674,7675,7676,7677,7678"
-pd_comm_port: "2334"
\ No newline at end of file
+pd_comm_port: "2334"
diff --git a/benchmarks/yaml/eb45-32k-wint4-mtp-tp4-decode.yaml b/benchmarks/yaml/eb45-32k-wint4-mtp-tp4-decode.yaml
index e239cea89..401cd61be 100644
--- a/benchmarks/yaml/eb45-32k-wint4-mtp-tp4-decode.yaml
+++ b/benchmarks/yaml/eb45-32k-wint4-mtp-tp4-decode.yaml
@@ -10,4 +10,4 @@ splitwise_role: decode
engine_worker_queue_port: 6678
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7671,7672,7673,7674"
-pd_comm_port: "2334"
\ No newline at end of file
+pd_comm_port: "2334"
diff --git a/benchmarks/yaml/eb45-32k-wint4-mtp-tp4-prefill.yaml b/benchmarks/yaml/eb45-32k-wint4-mtp-tp4-prefill.yaml
index 6d759c843..a4e9ca7af 100644
--- a/benchmarks/yaml/eb45-32k-wint4-mtp-tp4-prefill.yaml
+++ b/benchmarks/yaml/eb45-32k-wint4-mtp-tp4-prefill.yaml
@@ -9,4 +9,4 @@ cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
-pd_comm_port: "2333"
\ No newline at end of file
+pd_comm_port: "2333"
diff --git a/benchmarks/yaml/eb45-32k-wint4-tp4_decode.yaml b/benchmarks/yaml/eb45-32k-wint4-tp4_decode.yaml
index 957f59d2a..985ef7a34 100644
--- a/benchmarks/yaml/eb45-32k-wint4-tp4_decode.yaml
+++ b/benchmarks/yaml/eb45-32k-wint4-tp4_decode.yaml
@@ -12,4 +12,4 @@ rdma_comm_ports: "7671,7672,7673,7674"
pd_comm_port: "2334"
max_num_batched_tokens: 384
max_num_partial_prefills: 3
-max_long_partial_prefills: 3
\ No newline at end of file
+max_long_partial_prefills: 3
diff --git a/benchmarks/yaml/eb45-32k-wint4-tp4_prefill.yaml b/benchmarks/yaml/eb45-32k-wint4-tp4_prefill.yaml
index c1466160d..2831838fd 100644
--- a/benchmarks/yaml/eb45-32k-wint4-tp4_prefill.yaml
+++ b/benchmarks/yaml/eb45-32k-wint4-tp4_prefill.yaml
@@ -9,4 +9,4 @@ cache_queue_port: 55664
engine_worker_queue_port: 6677
cache_transfer_protocol: "rdma,ipc"
rdma_comm_ports: "7675,7676,7677,7678"
-pd_comm_port: "2333"
\ No newline at end of file
+pd_comm_port: "2333"
diff --git a/benchmarks/yaml/qwen3moe235b-32k-wint4-h800-tp4.yaml b/benchmarks/yaml/qwen3moe235b-32k-wint4-h800-tp4.yaml
index 7a127995e..8e4c5717c 100644
--- a/benchmarks/yaml/qwen3moe235b-32k-wint4-h800-tp4.yaml
+++ b/benchmarks/yaml/qwen3moe235b-32k-wint4-h800-tp4.yaml
@@ -3,4 +3,4 @@ max_num_seqs: 75
gpu_memory_utilization: 0.85
kv_cache_ratio: 0.75
quantization: wint4
-tensor_parallel_size: 4
\ No newline at end of file
+tensor_parallel_size: 4
diff --git a/benchmarks/yaml/qwen3moe235b-32k-wint8-h800-tp4.yaml b/benchmarks/yaml/qwen3moe235b-32k-wint8-h800-tp4.yaml
index 4d6cff601..8531d311e 100644
--- a/benchmarks/yaml/qwen3moe235b-32k-wint8-h800-tp4.yaml
+++ b/benchmarks/yaml/qwen3moe235b-32k-wint8-h800-tp4.yaml
@@ -3,4 +3,4 @@ max_num_seqs: 25
gpu_memory_utilization: 0.9
kv_cache_ratio: 0.75
quantization: wint8
-tensor_parallel_size: 4
\ No newline at end of file
+tensor_parallel_size: 4
diff --git a/benchmarks/yaml/request_yaml/quick_benchmark.yaml b/benchmarks/yaml/request_yaml/quick_benchmark.yaml
index c7e608c80..2af93c8f1 100644
--- a/benchmarks/yaml/request_yaml/quick_benchmark.yaml
+++ b/benchmarks/yaml/request_yaml/quick_benchmark.yaml
@@ -1,3 +1,3 @@
metadata:
min_tokens: 32
-max_tokens: 33
\ No newline at end of file
+max_tokens: 33
diff --git a/benchmarks/yaml/request_yaml/qwen2-32k.yaml b/benchmarks/yaml/request_yaml/qwen2-32k.yaml
index 464277942..8227a373d 100644
--- a/benchmarks/yaml/request_yaml/qwen2-32k.yaml
+++ b/benchmarks/yaml/request_yaml/qwen2-32k.yaml
@@ -5,4 +5,4 @@ metadata:
max_tokens: 12288
repetition_penalty: 1.05
frequency_penalty: 0
-presence_penalty: 0
\ No newline at end of file
+presence_penalty: 0
diff --git a/benchmarks/yaml/request_yaml/qwen3-32k.yaml b/benchmarks/yaml/request_yaml/qwen3-32k.yaml
index 8f1fc1fd7..b00f2aa26 100644
--- a/benchmarks/yaml/request_yaml/qwen3-32k.yaml
+++ b/benchmarks/yaml/request_yaml/qwen3-32k.yaml
@@ -5,4 +5,4 @@ metadata:
max_tokens: 12288
repetition_penalty: 1.0
frequency_penalty: 0
-presence_penalty: 1.5
\ No newline at end of file
+presence_penalty: 1.5
diff --git a/benchmarks/yaml/request_yaml/vLLM_default.yaml b/benchmarks/yaml/request_yaml/vLLM_default.yaml
index 4be43ad1b..a6385823b 100644
--- a/benchmarks/yaml/request_yaml/vLLM_default.yaml
+++ b/benchmarks/yaml/request_yaml/vLLM_default.yaml
@@ -8,4 +8,4 @@ frequency_penalty: 0
presence_penalty: 0
skip_special_tokens: false
chat_template_kwargs:
- enable_thinking: true
\ No newline at end of file
+ enable_thinking: true
diff --git a/benchmarks/yaml/x1-32k-wint8-p800-tp8.yaml b/benchmarks/yaml/x1-32k-wint8-p800-tp8.yaml
index 376177602..220db3068 100644
--- a/benchmarks/yaml/x1-32k-wint8-p800-tp8.yaml
+++ b/benchmarks/yaml/x1-32k-wint8-p800-tp8.yaml
@@ -3,4 +3,4 @@ max_num_seqs: 64
gpu_memory_utilization: 0.9
tensor_parallel_size: 8
quantization: wint8
-reasoning_parser: ernie-x1
\ No newline at end of file
+reasoning_parser: ernie-x1
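The YAML files touched above are flat benchmark serving configs (keys such as max_num_seqs, gpu_memory_utilization, kv_cache_ratio, quantization, tensor_parallel_size, and, for the disaggregated prefill/decode variants, cache_transfer_protocol, rdma_comm_ports and pd_comm_port). As a minimal sketch of how such a file can be consumed, the snippet below reads one config and turns it into CLI-style flags; the flag naming convention and the chosen file are assumptions for illustration, not the project's actual launcher.

```python
# Minimal sketch: read one of the benchmark YAML configs above and turn it into
# "--key value" flags. The flag naming and the example path are assumptions for
# illustration only; they are not taken from the repository's launch scripts.
import shlex
import yaml  # pip install pyyaml


def yaml_to_flags(path: str) -> str:
    """Convert a flat benchmark config (key: value pairs) into CLI-style flags."""
    with open(path) as f:
        cfg = yaml.safe_load(f) or {}
    parts = []
    for key, value in cfg.items():
        flag = "--" + key.replace("_", "-")
        parts.append(f"{flag} {shlex.quote(str(value))}")
    return " ".join(parts)


if __name__ == "__main__":
    # Hypothetical usage with one of the configs touched in this diff.
    print(yaml_to_flags("benchmarks/yaml/qwen3moe235b-32k-wint4-h800-tp4.yaml"))
```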
diff --git a/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.h b/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.h
index c25f68211..b3fe75b2c 100644
--- a/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.h
+++ b/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.h
@@ -40,4 +40,4 @@ void DecoderWriteCacheWithRoPEKernel(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
- paddle::Tensor* value_cache_out);
\ No newline at end of file
+ paddle::Tensor* value_cache_out);
diff --git a/custom_ops/gpu_ops/append_attn/gqa_rope_write_cache.cu b/custom_ops/gpu_ops/append_attn/gqa_rope_write_cache.cu
index f63f36a6b..2cba8d547 100644
--- a/custom_ops/gpu_ops/append_attn/gqa_rope_write_cache.cu
+++ b/custom_ops/gpu_ops/append_attn/gqa_rope_write_cache.cu
@@ -216,7 +216,7 @@ __global__ void append_dequant_cache_kv_c8(
uint32_t k_smem_offset_r = smem_t::get_permuted_offset(
wid * 16 + 8 * (tid / 16) + tid % 8, (tid % 16) / 8);
-
+
uint32_t k_read_idx = (wid * 4 + tid / 8) * HEAD_DIM +
tid % 8 * num_elems_per_128b();
@@ -330,7 +330,7 @@ __global__ void append_dequant_cache_kv_c8(
v_tile_ptr0[8 * kv_t_stride] = frag_dq_T[2] * cache_v_scale;
v_tile_ptr0[9 * kv_t_stride] = frag_dq_T[3] * cache_v_scale;
-
+
      convert_c8(frag_dq_T + 4, v_frag[2 * i + 1]); // 4 uint8/fp8 -> 4 T
#ifdef C8_DEBUG
if (tid == 0 && wid == 0 && tile_idx == 0 && kv_head_idx == 0) {
@@ -373,14 +373,14 @@ void AppendDequantCache(
paddle::Tensor *k_out,
paddle::Tensor *v_out,
const cudaStream_t& stream
-) {
+) {
using NV_TYPE = typename cascade_attn_type_traits::type;
if (cache_quant_type == "cache_int8" || cache_quant_type == "cache_fp8") {
constexpr int NUM_WARPS = 4;
int block_num = cache_num_blocks_x.data()[0];
dim3 grids(block_num, 1, kv_num_heads);
dim3 blocks(32, NUM_WARPS);
-
+
const uint32_t smem_size = BLOCK_SIZE * HEAD_DIM * sizeof(uint8_t) * 2;
auto kernel_func = append_dequant_cache_kv_c8;
diff --git a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh
index ed8952ad5..936d88e87 100644
--- a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh
+++ b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh
@@ -41,7 +41,7 @@ __global__ void append_clear_cache_int8_block(
const int wid = tid / 32;
const int lane_id = tid % 32;
const int token_id = blockIdx.x;
-
+
const int bid = batch_id_per_token[token_id];
const int start_token_idx = cu_seqlens_q[bid];
@@ -115,7 +115,7 @@ __global__ void append_clear_cache_int4_block(
const int wid = tid / 32;
const int lane_id = tid % 32;
const int token_id = blockIdx.x;
-
+
const int bid = batch_id_per_token[token_id];
const int start_token_idx = cu_seqlens_q[bid];
@@ -484,7 +484,7 @@ __global__ void append_speculate_cache_int8_rope_kernel(
const int wid = tid / 32;
const int lane_id = tid % 32;
const int token_id = blockIdx.x;
-
+
const int bid = batch_id_per_token[token_id];
const int start_token_idx = cu_seqlens_q[bid];
@@ -716,7 +716,7 @@ __global__ void append_speculate_cache_int8_neox_rope_kernel(
const int wid = tid / 32;
const int lane_id = tid % 32;
const int token_id = blockIdx.x;
-
+
const int bid = batch_id_per_token[token_id];
const int start_token_idx = cu_seqlens_q[bid];
@@ -1097,7 +1097,7 @@ __global__ void append_speculate_cache_int4_rope_kernel(
const int lane_id = tid % 32;
const int token_id = blockIdx.x;
-
+
const int bid = batch_id_per_token[token_id];
const int start_token_idx = cu_seqlens_q[bid];
@@ -1403,7 +1403,7 @@ __global__ void append_speculate_cache_int4_neox_rope_kernel(
const int lane_id = tid % 32;
const int token_id = blockIdx.x;
-
+
const int bid = batch_id_per_token[token_id];
const int start_token_idx = cu_seqlens_q[bid];
@@ -1792,4 +1792,4 @@ __global__ void append_speculate_cache_int4_neox_rope_kernel(
(uint_quant_value2 << 4) | (uint_quant_value1 & 0x0F);
}
}
-}
\ No newline at end of file
+}
diff --git a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.cu b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.cu
index b7c533a38..fb6a24fef 100644
--- a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.cu
+++ b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.cu
@@ -582,4 +582,4 @@ SpeculateWriteCacheWithRoPEKernel(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
- paddle::Tensor* value_cache_out);
\ No newline at end of file
+ paddle::Tensor* value_cache_out);
diff --git a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.h b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.h
index bb192f5a9..40ab34e05 100644
--- a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.h
+++ b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.h
@@ -39,4 +39,4 @@ void SpeculateWriteCacheWithRoPEKernel(
cudaStream_t& stream,
paddle::Tensor* qkv_out,
paddle::Tensor* key_cache_out,
- paddle::Tensor* value_cache_out);
\ No newline at end of file
+ paddle::Tensor* value_cache_out);
diff --git a/custom_ops/gpu_ops/common/cudaUtils.h b/custom_ops/gpu_ops/common/cudaUtils.h
index 2a2abfffb..9bbd1f6e8 100644
--- a/custom_ops/gpu_ops/common/cudaUtils.h
+++ b/custom_ops/gpu_ops/common/cudaUtils.h
@@ -30,4 +30,4 @@ inline int getSMVersion()
return sm_major * 10 + sm_minor;
}
-}
\ No newline at end of file
+}
diff --git a/custom_ops/gpu_ops/cutlass_extensions/epilogue_helpers.h b/custom_ops/gpu_ops/cutlass_extensions/epilogue_helpers.h
index 31fc95b81..6ed5b9b92 100644
--- a/custom_ops/gpu_ops/cutlass_extensions/epilogue_helpers.h
+++ b/custom_ops/gpu_ops/cutlass_extensions/epilogue_helpers.h
@@ -136,4 +136,4 @@ struct Epilogue;
};
-} // namespace cutlass_extensions
\ No newline at end of file
+} // namespace cutlass_extensions
diff --git a/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/collective_builder.hpp b/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/collective_builder.hpp
index 7d25428b5..d327eb18a 100644
--- a/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/collective_builder.hpp
+++ b/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/collective_builder.hpp
@@ -1,11 +1,11 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-//
+//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
-//
+//
// http://www.apache.org/licenses/LICENSE-2.0
-//
+//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
diff --git a/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/fp8_accumulation.hpp b/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/fp8_accumulation.hpp
index d4dd7d3a8..0a530e5c1 100644
--- a/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/fp8_accumulation.hpp
+++ b/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/fp8_accumulation.hpp
@@ -1,11 +1,11 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-//
+//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
-//
+//
// http://www.apache.org/licenses/LICENSE-2.0
-//
+//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -54,7 +54,7 @@
///////////////////////////////////FP8 Accumulation///////////////////////////
//////////////////////////////////////////////////////////////////////////////
/// This class provides API to promote (add) or scale (multiply_add) the results
-/// from the tensor core accumulators to the main accumulators when the number
+/// from the tensor core accumulators to the main accumulators when the number
/// of MMAs reaches the max number of MMA interval specified by user, after that
/// the tensor core accumulators are zeroed.
//////////////////////////////////////////////////////////////////////////////
@@ -64,7 +64,7 @@ namespace cutlass::gemm::collective {
template <
class EngineAccum,
class LayoutAccum>
-struct GmmaFP8AccumulationWithScale {
+struct GmmaFP8AccumulationWithScale {
using TensorAccum = cute::Tensor;
using ElementAccumulator = typename EngineAccum::value_type;
@@ -78,7 +78,7 @@ private:
uint32_t accum_promotion_interval_; // defines the max num of executed MMAs after which accum should be promoted.
uint32_t mma_count_per_mainloop_iteration_; // num of MMAs per k_tile of mainloop
uint32_t mma_count_; // current executed MMAs
- uint32_t reset_accum_flag_; // accum needs to be zeroed or not.
+ uint32_t reset_accum_flag_; // accum needs to be zeroed or not.
// promote or `add` the partial accumulators to main accumulator (FADD).
CUTLASS_DEVICE
@@ -116,11 +116,11 @@ public:
TensorAccum &accum,
uint32_t accum_promotion_interval,
uint32_t mma_count_per_mainloop_iteration)
- : accum_(accum),
+ : accum_(accum),
accum_promotion_interval_(accum_promotion_interval),
mma_count_per_mainloop_iteration_(mma_count_per_mainloop_iteration),
- mma_count_(0),
- reset_accum_flag_(0)
+ mma_count_(0),
+ reset_accum_flag_(0)
{
accum_temp_ = cute::make_fragment_like(accum);
}
@@ -129,14 +129,14 @@ public:
// Methods (Common)
//
- CUTLASS_DEVICE
+ CUTLASS_DEVICE
TensorAccum& operator()() {
return accum_temp_;
}
/// prepare the MMA accumulators when initialization or zeroing is required.
CUTLASS_DEVICE
- bool prepare_if_needed() {
+ bool prepare_if_needed() {
return reset_accum_flag_;
}
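The comment block in fp8_accumulation.hpp describes the idea behind GmmaFP8AccumulationWithScale: partial results collect in a temporary accumulator and are promoted (added, optionally scaled) into the main accumulator once a fixed number of MMAs has executed, after which the temporary accumulator is zeroed. The numpy model below illustrates that interval-based promotion only; it is a sketch of the described behaviour, not the CUTLASS implementation.

```python
# Illustrative model (not the CUTLASS code) of interval-based accumulator
# promotion: partials accumulate in `temp` and are folded into `main` every
# `promotion_interval` MMAs, after which `temp` is zeroed.
import numpy as np


class IntervalAccumulator:
    def __init__(self, shape, promotion_interval):
        self.main = np.zeros(shape, dtype=np.float32)  # main accumulator
        self.temp = np.zeros(shape, dtype=np.float32)  # partial (tensor-core-style) accumulator
        self.promotion_interval = promotion_interval
        self.mma_count = 0

    def mma(self, partial, scale=1.0):
        """Accumulate one MMA result; promote into `main` when the interval is reached."""
        self.temp += partial
        self.mma_count += 1
        if self.mma_count % self.promotion_interval == 0:
            self.main += scale * self.temp  # promote (add) or scale (multiply_add)
            self.temp[:] = 0.0              # zero the partial accumulator

    def finalize(self, scale=1.0):
        """Promote any residue left in the partial accumulator (cf. scale_residue_if_needed)."""
        self.main += scale * self.temp
        self.temp[:] = 0.0
        return self.main


if __name__ == "__main__":
    acc = IntervalAccumulator((4, 4), promotion_interval=4)
    for _ in range(10):
        acc.mma(np.ones((4, 4)), scale=0.5)
    result = acc.finalize(scale=0.5)  # every entry equals 10 * 0.5 = 5.0
```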
diff --git a/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp b/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
index bd25a9004..be1f9747e 100644
--- a/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
+++ b/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp
@@ -1,11 +1,11 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-//
+//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
-//
+//
// http://www.apache.org/licenses/LICENSE-2.0
-//
+//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -137,7 +137,7 @@ struct CollectiveMma<
using PipelineParams = typename MainloopPipeline::Params;
// Two threads per CTA are producers (1 for operand tile and 32 for scales)
- static constexpr int NumProducerThreadEvents = 33;
+ static constexpr int NumProducerThreadEvents = 33;
static constexpr int ScaleGranularityM = ScaleGranularityM_ == 0 ? size<0>(TileShape{}) : ScaleGranularityM_;
static constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM;
@@ -161,11 +161,11 @@ struct CollectiveMma<
SmemLayoutAtomB{},
make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int{}),
cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{}));
-
- // Block scaling gmem-to-smem copy atom
+
+ // Block scaling gmem-to-smem copy atom
using SmemBlockScalingCopyAtomA = Copy_Atom, ElementBlockScale>;
using SmemBlockScalingCopyAtomB = Copy_Atom, ElementBlockScale>;
-
+
// Block scaling smem layout
using SmemLayoutScaleA = Layout, Int>>;
using SmemLayoutScaleB = Layout>, Stride<_1>>; // `ScaleNsPerTile` is always 1.
@@ -202,7 +202,7 @@ struct CollectiveMma<
StrideA dA;
ElementB const* ptr_B;
StrideB dB;
- ElementBlockScale const* ptr_scale_A;
+ ElementBlockScale const* ptr_scale_A;
ElementBlockScale const* ptr_scale_B;
};
@@ -228,7 +228,7 @@ struct CollectiveMma<
uint32_t tma_transaction_bytes_mk = TmaTransactionBytesMK;
uint32_t tma_transaction_bytes_nk = TmaTransactionBytesNK;
// Block scaling factors for A and B
- ElementBlockScale const* ptr_scale_A;
+ ElementBlockScale const* ptr_scale_A;
ElementBlockScale const* ptr_scale_B;
};
@@ -285,7 +285,7 @@ struct CollectiveMma<
constexpr int tma_alignment_bits = 128;
auto problem_shape_MNKL = append<4>(problem_shape, 1);
auto [M,N,K,L] = problem_shape_MNKL;
-
+
bool implementable = true;
constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits::value;
implementable = implementable && cutlass::detail::check_alignment(cute::make_shape(M,K,L), StrideA{});
@@ -346,7 +346,7 @@ struct CollectiveMma<
auto scaleB_shape = make_shape(tN, tK, L); // (n,k,l)
auto scaleB_layout = make_ordered_layout(scaleB_shape, Step<_1, _0, _2>{});
- // Note that mScaleA_mkl and mScaleB_nkl are already blocked tiled in the `m` host and
+ // Note that mScaleA_mkl and mScaleB_nkl are already blocked tiled in the `m` host and
// gScaleA_mkl and gScaleB_nkl in `g` global memory are same as mScaleA_mkl and mScaleB_nkl.
Tensor mScaleA_mkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_A), scaleA_layout); // (scale_m,k,l)
Tensor mScaleB_nkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_B), scaleB_layout); // (n,k,l)
@@ -406,26 +406,26 @@ struct CollectiveMma<
Tensor cScaleA_mkl = make_identity_tensor(mScaleA_mkl.shape());
- Tensor gScaleA = local_tile(
- mScaleA_mkl, make_tile(Int{}),
+ Tensor gScaleA = local_tile(
+ mScaleA_mkl, make_tile(Int{}),
make_coord(m_coord,_,l_coord)); // (ScaleMsPerTile,k,1)
- Tensor cScaleA = local_tile(
- cScaleA_mkl, make_tile(Int{}),
+ Tensor cScaleA = local_tile(
+ cScaleA_mkl, make_tile(Int{}),
make_coord(m_coord,_,l_coord));
Tensor gScaleB = mScaleB_nkl(n_coord,_,l_coord); // (1,k,1)
// TODO: test `scale_copy_a` with `ScaleMsPerTile` < 128
- TiledCopy scale_copy_a = make_tiled_copy(SmemBlockScalingCopyAtomA{},
+ TiledCopy scale_copy_a = make_tiled_copy(SmemBlockScalingCopyAtomA{},
Layout>{}, Layout>{}); // (1,1,1)
- TiledCopy scale_copy_b = make_tiled_copy(SmemBlockScalingCopyAtomB{},
+ TiledCopy scale_copy_b = make_tiled_copy(SmemBlockScalingCopyAtomB{},
Layout>{}, Layout>{}); // (1,1,1)
ThrCopy thr_scale_copy_a = scale_copy_a.get_slice(threadIdx.x);
ThrCopy thr_scale_copy_b = scale_copy_b.get_slice(threadIdx.x);
-
+
Tensor tAgA_ScaleA = thr_scale_copy_a.partition_S(gScaleA);
Tensor tAcA_ScaleA = thr_scale_copy_a.partition_S(cScaleA);
Tensor tAsA_ScaleA = thr_scale_copy_a.partition_D(sScaleA);
-
+
Tensor tBgB_ScaleB = thr_scale_copy_b.partition_S(gScaleB);
Tensor tBsB_ScaleB = thr_scale_copy_b.partition_D(sScaleB);
@@ -455,7 +455,7 @@ struct CollectiveMma<
}
}
- // Allocate predicate tensors for a_scales (since we can't guarantee that
+ // Allocate predicate tensors for a_scales (since we can't guarantee that
// all scales are valid, since we could have a partial tiles along M)
Tensor tApA_ScaleA = make_tensor(shape(tAsA_ScaleA(_,_,0)));
#pragma unroll
@@ -536,7 +536,7 @@ struct CollectiveMma<
Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE)
Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE)
-
+
// Block scaling
Tensor sScaleAViewAsC = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()),
Layout<
@@ -548,17 +548,17 @@ struct CollectiveMma<
//
// Define C accumulators and A/B partitioning
//
-
+
// Layout of warp group to thread mapping
- static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and
+ static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and
stride<0>(typename TiledMma::BLayout{}) == 0 and
size<0>(typename TiledMma::ALayout{}) == NumThreadsPerWarpGroup and
- size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup,
+ size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup,
"Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup;
- Layout warp_group_thread_layout = make_layout(Int{},
+ Layout warp_group_thread_layout = make_layout(Int{},
Int{});
int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0);
@@ -590,7 +590,7 @@ struct CollectiveMma<
// We release buffers to producer warps(dma load) with some mmas in flight
PipelineState smem_pipe_release = smem_pipe_read;
-
+
// Per block scale values for operand A and B
using RegLayoutScaleAViewAsC = decltype(make_layout_like(tCsScaleAViewAsC(_, _, _, 0).layout())); // `make_layout_like` makes a compact layout.
@@ -618,7 +618,7 @@ struct CollectiveMma<
}
int read_stage = smem_pipe_read.index();
-
+
// Load per block scale values from shared memory to registers.
scale_b = sScaleB[read_stage];
CUTLASS_PRAGMA_UNROLL
@@ -668,7 +668,7 @@ struct CollectiveMma<
int read_stage = smem_pipe_read.index();
- // Load per block scale values from shared memory to registers (at most twice per block along M and exactly once per block along N)
+ // Load per block scale values from shared memory to registers (at most twice per block along M and exactly once per block along N)
scale_b = sScaleB[read_stage];
CUTLASS_PRAGMA_UNROLL
for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) {
@@ -712,7 +712,7 @@ struct CollectiveMma<
++smem_pipe_read;
++smem_pipe_release;
}
-
+
accumulation.scale_residue_if_needed(tCrScaleAViewAsC);
warpgroup_fence_operand(accumulation());
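The mainloop above applies block-wise FP8 scaling: operand A carries one scale per ScaleGranularityM rows of the M tile per k-block, operand B carries a single scale per k-block (ScaleNsPerTile is always 1), and each k-block's partial product is scaled before being promoted into the accumulator. The numpy reference below captures only that math; the tile sizes and granularities are illustrative assumptions, not the kernel's actual configuration.

```python
# Reference (numpy) for block-wise scaled FP8 GEMM as described above: one scale
# per `granularity_m` rows of A per k-block, one scalar scale for B per k-block.
import numpy as np


def blockwise_scaled_gemm(a_q, b_q, scale_a, scale_b, block_k, granularity_m):
    """a_q: (M, K) quantized A, b_q: (K, N) quantized B,
    scale_a: (M // granularity_m, K // block_k), scale_b: (K // block_k,)."""
    m, _ = a_q.shape
    n = b_q.shape[1]
    acc = np.zeros((m, n), dtype=np.float32)
    for kb in range(a_q.shape[1] // block_k):
        a_blk = a_q[:, kb * block_k:(kb + 1) * block_k].astype(np.float32)
        b_blk = b_q[kb * block_k:(kb + 1) * block_k, :].astype(np.float32)
        # one scale per `granularity_m` rows of A, one scalar scale for this B block
        row_scale = np.repeat(scale_a[:, kb], granularity_m)[:, None]
        acc += (row_scale * scale_b[kb]) * (a_blk @ b_blk)
    return acc
```

Scaling the product's rows by `row_scale * scale_b[kb]` is equivalent to scaling A's rows and B's block before the multiply, which is the promotion the warp-specialized mainloop performs per k-block.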
diff --git a/custom_ops/gpu_ops/cutlass_extensions/gemm/dispatch_policy.hpp b/custom_ops/gpu_ops/cutlass_extensions/gemm/dispatch_policy.hpp
index ca0acd826..f4cf0bf42 100644
--- a/custom_ops/gpu_ops/cutlass_extensions/gemm/dispatch_policy.hpp
+++ b/custom_ops/gpu_ops/cutlass_extensions/gemm/dispatch_policy.hpp
@@ -1,11 +1,11 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-//
+//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
-//
+//
// http://www.apache.org/licenses/LICENSE-2.0
-//
+//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -50,4 +50,4 @@ struct MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8
//////////////////////////////////////////////////////////////////////////////
-} // namespace cutlass::gemm
\ No newline at end of file
+} // namespace cutlass::gemm
diff --git a/custom_ops/gpu_ops/cutlass_extensions/gemm/kernel/gemm_moe_problem_visitor.h b/custom_ops/gpu_ops/cutlass_extensions/gemm/kernel/gemm_moe_problem_visitor.h
index 2cc91d611..5bce307a2 100644
--- a/custom_ops/gpu_ops/cutlass_extensions/gemm/kernel/gemm_moe_problem_visitor.h
+++ b/custom_ops/gpu_ops/cutlass_extensions/gemm/kernel/gemm_moe_problem_visitor.h
@@ -90,4 +90,4 @@ struct GemmMoeProblemVisitor
} // namespace gemm
} // namespace cutlass
-/////////////////////////////////////////////////////////////////////////////////////////////////
\ No newline at end of file
+/////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_mma_multistage.h b/custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_mma_multistage.h
index 38fdcf9fe..9531b01a7 100644
--- a/custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_mma_multistage.h
+++ b/custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_mma_multistage.h
@@ -90,7 +90,7 @@ template <
SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone,
/// Used for partial specialization
typename Enable = bool>
-class Wint2xMmaMultistage :
+class Wint2xMmaMultistage :
public Wint2xMmaBase {
public:
///< Base class
diff --git a/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/fuse_block_gemm_act_template_3x.h b/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/fuse_block_gemm_act_template_3x.h
index 113ea5bf6..1a5b838b8 100644
--- a/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/fuse_block_gemm_act_template_3x.h
+++ b/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/fuse_block_gemm_act_template_3x.h
@@ -57,7 +57,7 @@ bool dispatch_fuse_block_gemm_c3x(GemmEpilogueAllParams params){
hasbias,
ElementD,
void>;
-
+
constexpr int ScaleMsPerTile = size<0>(TileShape{});
constexpr int ScaleGranularityM = size<0>(TileShape{}) / ScaleMsPerTile;
@@ -161,7 +161,7 @@ bool dispatch_fuse_block_gemm_c3x(GemmEpilogueAllParams params){
arguments.scheduler.decomposition_mode = DecompositionMode::StreamK;
arguments.scheduler.reduction_mode = ReductionMode::Nondeterministic;
}
-
+
Gemm gemm_op;
diff --git a/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/fuse_dual_gemm_act_template_3x.h b/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/fuse_dual_gemm_act_template_3x.h
index 943921e14..632cdc296 100644
--- a/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/fuse_dual_gemm_act_template_3x.h
+++ b/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/fuse_dual_gemm_act_template_3x.h
@@ -170,4 +170,4 @@ bool dispatch_dual_gemm_act_sm90(DualGemmEpilogueAllParams params) {
return false;
}
return true;
-}
\ No newline at end of file
+}
diff --git a/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/fuse_gemm_act_template_3x.h b/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/fuse_gemm_act_template_3x.h
index 819463175..c47015107 100644
--- a/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/fuse_gemm_act_template_3x.h
+++ b/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/fuse_gemm_act_template_3x.h
@@ -148,4 +148,4 @@ bool dispatch_fuse_gemm_act_sm90(GemmEpilogueAllParams params) {
return false;
}
return true;
-}
\ No newline at end of file
+}
diff --git a/custom_ops/gpu_ops/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h b/custom_ops/gpu_ops/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h
index bf65242d5..6b1ab209e 100644
--- a/custom_ops/gpu_ops/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h
+++ b/custom_ops/gpu_ops/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h
@@ -54,7 +54,7 @@ public:
virtual size_t getWorkspaceSize(int const m, int const n, int const k) = 0;
virtual std::vector getConfigs(int k) const = 0;
-
+
protected:
static constexpr int SPLIT_K_LIMIT = 7;
static constexpr int MIN_M_TILE = 16;
diff --git a/custom_ops/gpu_ops/extract_text_token_output.cu b/custom_ops/gpu_ops/extract_text_token_output.cu
index 292c67078..ff04a813e 100644
--- a/custom_ops/gpu_ops/extract_text_token_output.cu
+++ b/custom_ops/gpu_ops/extract_text_token_output.cu
@@ -93,8 +93,8 @@ std::vector ExtractTextTokenOutputInferDtype(const paddle::Dat
PD_BUILD_STATIC_OP(extract_text_token_output)
.Inputs({"max_seq_len",
- "max_seq_len_index",
- "mm_token_num_len",
+ "max_seq_len_index",
+ "mm_token_num_len",
"seq_lens_this_time",
"cu_seqlens_q",
"score_text"})
diff --git a/custom_ops/gpu_ops/fp8_gemm_with_cutlass/fp8_fp8_half_cuda_core_gemm.cu b/custom_ops/gpu_ops/fp8_gemm_with_cutlass/fp8_fp8_half_cuda_core_gemm.cu
index 06295cd62..3e1ce299a 100644
--- a/custom_ops/gpu_ops/fp8_gemm_with_cutlass/fp8_fp8_half_cuda_core_gemm.cu
+++ b/custom_ops/gpu_ops/fp8_gemm_with_cutlass/fp8_fp8_half_cuda_core_gemm.cu
@@ -105,7 +105,7 @@ __global__ void cudaCoreGemm(InputType const* __restrict__ act,
}
}
}
-
+
__syncthreads();
for (int32_t ii = tid; ii < TILE_M * TILE_N; ii += BLOCK_SIZE) {
int32_t mid = ii / TILE_N, nid = ii % TILE_N;
@@ -188,4 +188,4 @@ bool cuda_core_gemm_launcher(GemmParams const& params) {
template bool cuda_core_gemm_launcher<__nv_fp8_e4m3, __nv_bfloat16>(GemmParams const&);
template bool cuda_core_gemm_launcher<__nv_fp8_e4m3, half>(GemmParams const&);
template bool cuda_core_gemm_launcher<__nv_fp8_e5m2, __nv_bfloat16>(GemmParams const&);
-template bool cuda_core_gemm_launcher<__nv_fp8_e5m2, half>(GemmParams const&);
\ No newline at end of file
+template bool cuda_core_gemm_launcher<__nv_fp8_e5m2, half>(GemmParams const&);
diff --git a/custom_ops/gpu_ops/get_mm_split_fuse.cc b/custom_ops/gpu_ops/get_mm_split_fuse.cc
index 7a69d26f2..3d70258d0 100644
--- a/custom_ops/gpu_ops/get_mm_split_fuse.cc
+++ b/custom_ops/gpu_ops/get_mm_split_fuse.cc
@@ -61,7 +61,7 @@ std::vector GetMmSplitFuse(const paddle::Tensor& task_input_ids,
st_idx += cur_st_len;
}
}
-
+
while (idx < seq_lens_origin) {
idx = idx + split_fuse_text_size;
if (idx >= seq_lens_origin) {
@@ -116,7 +116,7 @@ std::vector GetMmSplitFuse(const paddle::Tensor& task_input_ids,
while (ib < img_total && cur_img_len < chunk_image_token_number) {
int token_times = 4;
cur_img_len += (grid_thw_cpu[ib * 3 + 1] * grid_thw_cpu[ib * 3 + 2]) / token_times;
- ib ++;
+ ib ++;
chunk_image_number ++;
}
image_chunk_selections_vector.emplace_back(chunk_image_number);
diff --git a/custom_ops/gpu_ops/ipc_sent_key_value_cache_by_remote_ptr.cu b/custom_ops/gpu_ops/ipc_sent_key_value_cache_by_remote_ptr.cu
index 34fc2c16f..21effd59c 100644
--- a/custom_ops/gpu_ops/ipc_sent_key_value_cache_by_remote_ptr.cu
+++ b/custom_ops/gpu_ops/ipc_sent_key_value_cache_by_remote_ptr.cu
@@ -88,7 +88,7 @@ void sent_key_value_by_remote_ptr(
#ifdef DEBUG_IPC_SENT
std::cout<<"remote_key_tensor_sent_ptr:"<<(int64_t)remote_key_tensor_sent_ptr
<<" local_key_tensor_sent_ptr:"<<(int64_t)local_key_tensor_sent_ptr
- <<" local_device_id:" << local_device_id
+ <<" local_device_id:" << local_device_id
<<" remote_device_id:" << remote_device_id
<<" block_idx_stride:" << block_idx_stride
<<" block_size_byte:" << block_size_byte
@@ -107,25 +107,25 @@ void sent_key_value_by_remote_ptr(
#endif
#ifndef DEBUG_IPC_SENT_SYNC_AND_PRINT
cudaMemcpyPeerAsync(
- reinterpret_cast(remote_key_tensor_sent_ptr),
- remote_device_id,
- reinterpret_cast(local_key_tensor_sent_ptr),
- local_device_id,
- block_size_byte,
+ reinterpret_cast(remote_key_tensor_sent_ptr),
+ remote_device_id,
+ reinterpret_cast(local_key_tensor_sent_ptr),
+ local_device_id,
+ block_size_byte,
stream);
#endif
#ifdef DEBUG_IPC_SENT_SYNC_AND_PRINT
cudaMemcpyPeer(
- reinterpret_cast(remote_key_tensor_sent_ptr),
- remote_device_id,
- reinterpret_cast(local_key_tensor_sent_ptr),
- local_device_id,
+ reinterpret_cast(remote_key_tensor_sent_ptr),
+ remote_device_id,
+ reinterpret_cast(local_key_tensor_sent_ptr),
+ local_device_id,
block_size_byte);
#endif
cudaError_t err = cudaGetLastError();
if ( err != cudaSuccess )
{
- printf("CUDA Error: %s\n", cudaGetErrorString(err));
+ printf("CUDA Error: %s\n", cudaGetErrorString(err));
}
#ifdef DEBUG_IPC_SENT_SYNC_AND_PRINT
cudaDeviceSynchronize();
@@ -140,7 +140,7 @@ void sent_key_value_by_remote_ptr(
#ifdef DEBUG_IPC_SENT
std::cout<<"remote_value_tensor_sent_ptr:"<<(int64_t)remote_value_tensor_sent_ptr
<<" local_value_tensor_sent_ptr:"<<(int64_t)local_value_tensor_sent_ptr
- <<" local_device_id:" << local_device_id
+ <<" local_device_id:" << local_device_id
<<" remote_device_id:" << remote_device_id
<<" block_idx_stride:" << block_idx_stride
<<" block_size_byte:" << block_size_byte
@@ -159,26 +159,26 @@ void sent_key_value_by_remote_ptr(
#endif
#ifndef DEBUG_IPC_SENT_SYNC_AND_PRINT
cudaMemcpyPeerAsync(
- reinterpret_cast(remote_value_tensor_sent_ptr),
- remote_device_id,
- reinterpret_cast(local_value_tensor_sent_ptr),
- local_device_id,
- block_size_byte,
+ reinterpret_cast(remote_value_tensor_sent_ptr),
+ remote_device_id,
+ reinterpret_cast(local_value_tensor_sent_ptr),
+ local_device_id,
+ block_size_byte,
stream);
#endif
#ifdef DEBUG_IPC_SENT_SYNC_AND_PRINT
cudaMemcpyPeer(
- reinterpret_cast(remote_value_tensor_sent_ptr),
- remote_device_id,
- reinterpret_cast(local_value_tensor_sent_ptr),
- local_device_id,
+ reinterpret_cast(remote_value_tensor_sent_ptr),
+ remote_device_id,
+ reinterpret_cast(local_value_tensor_sent_ptr),
+ local_device_id,
block_size_byte);
cudaDeviceSynchronize();
#endif
err = cudaGetLastError();
if ( err != cudaSuccess )
{
- printf("CUDA Error: %s\n", cudaGetErrorString(err));
+ printf("CUDA Error: %s\n", cudaGetErrorString(err));
}
#ifdef DEBUG_IPC_SENT_SYNC_AND_PRINT
PrintMatrix(reinterpret_cast(remote_value_tensor_sent_ptr),
@@ -316,11 +316,11 @@ void SentKeyValueByRemotePtrBlockSync(const paddle::Tensor& local_key_tensor,
cudaStream_t cuda_stream = (cudaStream_t)cuda_stream_raw;
cudaStreamSynchronize(cuda_stream);
}
-
+
PD_BUILD_STATIC_OP(ipc_sent_key_value_cache_by_remote_ptr)
.Inputs({"local_key_tensor", "local_value_tensor", "local_block_ids", "remote_block_ids", "remote_key_tensor", "remote_value_tensor"})
- .Attrs({ "block_num: int",
- "local_device_id: int",
+ .Attrs({ "block_num: int",
+ "local_device_id: int",
"remote_device_id: int",
"cuda_stream_raw: int64_t"})
.Outputs({"local_key_tensor_out", "local_value_tensor_out"})
@@ -332,4 +332,4 @@ PD_BUILD_STATIC_OP(ipc_sent_key_value_cache_by_remote_ptr_block_sync)
.Attrs({"cuda_stream_raw: int64_t"})
.Outputs({"local_key_tensor_out", "local_value_tensor_out"})
.SetInplaceMap({{"local_key_tensor", "local_key_tensor_out"},{"local_value_tensor","local_value_tensor_out"}})
- .SetKernelFn(PD_KERNEL(SentKeyValueByRemotePtrBlockSync));
\ No newline at end of file
+ .SetKernelFn(PD_KERNEL(SentKeyValueByRemotePtrBlockSync));
diff --git a/custom_ops/gpu_ops/moe/deepgemm_preprocess.cu b/custom_ops/gpu_ops/moe/deepgemm_preprocess.cu
index 64d8c3866..c963bb12e 100644
--- a/custom_ops/gpu_ops/moe/deepgemm_preprocess.cu
+++ b/custom_ops/gpu_ops/moe/deepgemm_preprocess.cu
@@ -57,5 +57,3 @@ paddle::Tensor count_tokens_per_expert_func(const paddle::Tensor &topk_ids,
num_experts);
return token_nums_per_expert;
}
-
-
diff --git a/custom_ops/gpu_ops/moe/fast_hardamard_kernel.cu b/custom_ops/gpu_ops/moe/fast_hardamard_kernel.cu
index 42476a293..66d9f72fe 100644
--- a/custom_ops/gpu_ops/moe/fast_hardamard_kernel.cu
+++ b/custom_ops/gpu_ops/moe/fast_hardamard_kernel.cu
@@ -737,7 +737,7 @@ void MoeFastHardamardWrapper(const T *x_data,
bool FLAGS_hardamard_use_diagonal_block_matrix = true;
static const char* FLAGS_hardamard_moe_block_size = std::getenv("FLAGS_hardamard_moe_block_size");
- static const int32_t hardamard_moe_block_size = FLAGS_hardamard_moe_block_size != nullptr ?
+ static const int32_t hardamard_moe_block_size = FLAGS_hardamard_moe_block_size != nullptr ?
stoi(std::string(FLAGS_hardamard_moe_block_size)) : 512;
constexpr int kThreads = 128;
if (FLAGS_hardamard_use_diagonal_block_matrix) {
diff --git a/custom_ops/gpu_ops/moe/fused_moe_imp_op.h b/custom_ops/gpu_ops/moe/fused_moe_imp_op.h
index 1078ae218..254f80e67 100644
--- a/custom_ops/gpu_ops/moe/fused_moe_imp_op.h
+++ b/custom_ops/gpu_ops/moe/fused_moe_imp_op.h
@@ -124,4 +124,4 @@ class CubKeyValueSorter {
int num_bits_;
};
-} // namespace phi
\ No newline at end of file
+} // namespace phi
diff --git a/custom_ops/gpu_ops/moe/fused_moe_op.h b/custom_ops/gpu_ops/moe/fused_moe_op.h
index f46e1523c..09d705d41 100644
--- a/custom_ops/gpu_ops/moe/fused_moe_op.h
+++ b/custom_ops/gpu_ops/moe/fused_moe_op.h
@@ -360,10 +360,10 @@ __launch_bounds__(TPB) __global__ void moe_softmax_top_k_fused(const T* input,
normalizing_factor = 1.f / Z;
}
__syncthreads();
-
+
T val = T(threadDataExp * normalizing_factor);
- // top_k
+ // top_k
using cub_kvp = cub::KeyValuePair;
using BlockReduceP = cub::BlockReduce;
__shared__ typename BlockReduceP::TempStorage tmpStorageP;
@@ -374,10 +374,10 @@ __launch_bounds__(TPB) __global__ void moe_softmax_top_k_fused(const T* input,
for (int k_idx = 0; k_idx < k; ++k_idx) {
thread_kvp.key = 0;
thread_kvp.value = T(-1.f); // This is OK because inputs are probabilities
-
+
if (threadIdx.x < num_experts) {
cub_kvp inp_kvp;
- int expert = threadIdx.x;
+ int expert = threadIdx.x;
inp_kvp.key = expert;
inp_kvp.value = bias ? val + bias[expert] : val;
@@ -518,12 +518,12 @@ __launch_bounds__(TPB) __global__ void moe_softmax_top_k_normed_fused(const T* i
if (threadIdx.x == 0) {
normalizing_factor = 1.f / Z;
}
-
+
__syncthreads();
-
+
T val = T(threadDataExp * normalizing_factor);
- // top_k
+ // top_k
using cub_kvp = cub::KeyValuePair;
using BlockReduceP = cub::BlockReduce;
__shared__ typename BlockReduceP::TempStorage tmpStorageP;
@@ -541,7 +541,7 @@ __launch_bounds__(TPB) __global__ void moe_softmax_top_k_normed_fused(const T* i
if (threadIdx.x < num_experts) {
cub_kvp inp_kvp;
- int expert = threadIdx.x;
+ int expert = threadIdx.x;
inp_kvp.key = expert;
inp_kvp.value = bias ? val + bias[expert] : val;
@@ -1065,7 +1065,7 @@ __global__ void initialize_moe_routing_kernel(
const T* unpermuted_input,
OutT* permuted_output,
const int* expanded_dest_row_to_expanded_source_row,
- const int *expert_idx_per_token,
+ const int *expert_idx_per_token,
const float *w4a8_in_scale,
int* expanded_source_row_to_expanded_dest_row,
const int64_t num_rows,
@@ -1088,7 +1088,7 @@ __global__ void initialize_moe_routing_kernel(
expanded_source_row_to_expanded_dest_row[expanded_source_row] =
expanded_dest_row;
}
-
+
if (expanded_dest_row < active_rows) {
const int expert_idx = expert_idx_per_token[expanded_dest_row];
@@ -1130,7 +1130,7 @@ static void run(
const T* unpermuted_input,
OutT* permuted_output,
const int* expanded_dest_row_to_expanded_source_row,
- const int *expert_idx_per_token,
+ const int *expert_idx_per_token,
const float *w4a8_in_scale,
int* expanded_source_row_to_expanded_dest_row,
const int64_t num_rows,
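The fused kernels above first softmax the gate logits per token, then run a top-k reduction in which an optional correction bias is added to each expert's probability before selection. The numpy reference below mirrors that order of operations at a high level; whether the bias is folded into the stored routing weight is not visible in these hunks, so this sketch returns the plain softmax probability and should be read as an assumption, not a drop-in replacement.

```python
# Reference (numpy) for fused softmax + (bias-corrected) top-k expert routing.
import numpy as np


def softmax_topk(gating_logits, k, bias=None):
    """gating_logits: (tokens, experts); returns (topk_weight, topk_idx)."""
    z = gating_logits - gating_logits.max(axis=-1, keepdims=True)
    probs = np.exp(z) / np.exp(z).sum(axis=-1, keepdims=True)
    # The optional gating_correction_bias only influences which experts win.
    scores = probs + bias if bias is not None else probs
    topk_idx = np.argsort(-scores, axis=-1)[:, :k]              # selected experts
    topk_weight = np.take_along_axis(probs, topk_idx, axis=-1)  # softmax weight (bias excluded here)
    return topk_weight, topk_idx
```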
diff --git a/custom_ops/gpu_ops/moe/moe_deepgemm_permute.cu b/custom_ops/gpu_ops/moe/moe_deepgemm_permute.cu
index 9b4182c7d..ec44a5bfc 100644
--- a/custom_ops/gpu_ops/moe/moe_deepgemm_permute.cu
+++ b/custom_ops/gpu_ops/moe/moe_deepgemm_permute.cu
@@ -17,7 +17,7 @@
// topk warps
template
__global__ void MoEDeepGEMMPermuteKernel(T* out, int* token_nums_per_expert, int* permute_indices_per_token, const T* x, const int64_t* topk_idx, const int token_num, const int topk, const int num_vecs, const int hidden, const int max_tokens_per_expert) {
-
+
AlignedVector in_vec;
const int bid = blockIdx.x;
@@ -32,7 +32,7 @@ __global__ void MoEDeepGEMMPermuteKernel(T* out, int* token_nums_per_expert, int
}
tgt_expert_token = __shfl_sync(0xFFFFFFFF, tgt_expert_token, 0);
-
+
for (int hidden_vec_id = tid; hidden_vec_id < num_vecs; hidden_vec_id += 32) {
Load(x + token_idx * hidden + hidden_vec_id * VecSize, &in_vec);
Store(in_vec, out + tgt_expert_id * max_tokens_per_expert * hidden + tgt_expert_token * hidden + hidden_vec_id * VecSize);
@@ -81,7 +81,7 @@ std::vector MoEDeepGEMMPermuteDispatch(
permute_indices_per_token.data(),
reinterpret_cast(x.data()),
topk_idx.data(),
- token_num, topk, num_vecs,
+ token_num, topk, num_vecs,
hidden, max_tokens_per_expert
);
@@ -112,4 +112,4 @@ PD_BUILD_STATIC_OP(moe_deepgemm_permute)
.Inputs({"x", "topk_idx"})
.Outputs({"permute_output", "token_nums_per_expert", "permute_indices_per_token"})
.Attrs({"num_experts: int", "max_tokens_per_expert: int"})
- .SetKernelFn(PD_KERNEL(MoEDeepGEMMPermute));
\ No newline at end of file
+ .SetKernelFn(PD_KERNEL(MoEDeepGEMMPermute));
diff --git a/custom_ops/gpu_ops/moe/moe_dispatch.cu b/custom_ops/gpu_ops/moe/moe_dispatch.cu
index dedd5fbdd..7ae20e0ae 100644
--- a/custom_ops/gpu_ops/moe/moe_dispatch.cu
+++ b/custom_ops/gpu_ops/moe/moe_dispatch.cu
@@ -232,12 +232,12 @@ MoeExpertDispatchInferDtype(const paddle::DataType &input_dtype,
/**
* @brief Mixture of Experts (MoE) Expert Dispatch Operator
- *
+ *
* This operator performs the following key functions:
* 1. Computes top-k experts for each input token based on gating scores
* 2. Permutes input tokens according to their selected experts for efficient expert processing
* 3. Computes prefix sums of tokens per expert for group_gemm optimization
- *
+ *
* Inputs:
* - input: The input tensor to be routed to experts
* Shape: [total_tokens, hidden_size]
@@ -246,7 +246,7 @@ MoeExpertDispatchInferDtype(const paddle::DataType &input_dtype,
* Shape: [total_tokens, expert_num]
* dtype: must be float32
* - gating_correction_bias: Optional bias term for gating correction (expert_num)
- *
+ *
* Outputs:
* - permute_input: Permuted input tensor organized by expert
* Shape: [moe_topk * total_tokens, hidden_size]
@@ -263,7 +263,7 @@ MoeExpertDispatchInferDtype(const paddle::DataType &input_dtype,
* - top_k_indices: Indices of selected top-k experts for each token
* Shape: [total_tokens, moe_topk]
* dtype: int32
- *
+ *
* Attributes:
* - moe_topk: Number of experts to select for each token (k value in top-k routing)
* - group_moe: Whether to perform group softmax within the operator
@@ -272,7 +272,7 @@ MoeExpertDispatchInferDtype(const paddle::DataType &input_dtype,
* - topk_only_mode: Operation mode selector
* (true: only performs topk selection without softmax,
* false: performs full softmax+topk computation)
- *
+ *
* Note:
* - The operator requires 2D input format [total_tokens, hidden_size]
* - For optimal performance, expert_num should be a power of 2 when possible
@@ -283,7 +283,7 @@ PD_BUILD_STATIC_OP(moe_expert_dispatch)
paddle::Optional("gating_correction_bias"),
paddle::Optional("w4a8_in_scale")})
.Outputs({"permute_input", "tokens_expert_prefix_sum",
- "permute_indices_per_token", "topk_weight", "topk_idx",
+ "permute_indices_per_token", "topk_weight", "topk_idx",
"expert_idx_per_token"})
.Attrs({"moe_topk:int", "group_moe:bool", "topk_only_mode:bool"})
.SetKernelFn(PD_KERNEL(MoeExpertDispatch))
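The operator documentation above describes three steps: select top-k experts per token, permute (replicate and reorder) tokens so rows routed to the same expert are contiguous, and compute a prefix sum of tokens per expert for the grouped GEMM. The sketch below reproduces those steps in numpy; variable names are illustrative and the layout details of the real permute_indices_per_token output are not taken from the hunk.

```python
# Sketch (numpy) of the dispatch step documented above: expand each token k ways,
# group expanded rows by expert, and build the tokens-per-expert prefix sum.
import numpy as np


def dispatch_tokens(x, topk_idx, num_experts):
    """x: (tokens, hidden); topk_idx: (tokens, k) expert ids per token."""
    tokens, k = topk_idx.shape
    flat_expert = topk_idx.reshape(-1)               # expert of each expanded row
    order = np.argsort(flat_expert, kind="stable")   # group expanded rows by expert
    permute_input = x[order // k]                    # expanded row i*k + j originates from token i
    tokens_per_expert = np.bincount(flat_expert, minlength=num_experts)
    tokens_expert_prefix_sum = np.cumsum(tokens_per_expert)
    return permute_input, tokens_expert_prefix_sum, order
```

With this layout, expert `e` processes the contiguous slice `permute_input[prefix[e-1]:prefix[e]]` (taking `prefix[-1]` as 0), which is what the prefix sum is for.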
diff --git a/custom_ops/gpu_ops/moe/moe_redundant_topk_select.cu b/custom_ops/gpu_ops/moe/moe_redundant_topk_select.cu
index ba939ec2d..a53cb0a95 100644
--- a/custom_ops/gpu_ops/moe/moe_redundant_topk_select.cu
+++ b/custom_ops/gpu_ops/moe/moe_redundant_topk_select.cu
@@ -263,4 +263,4 @@ PD_BUILD_OP(moe_redundant_topk_select)
.SetInplaceMap({{"tokens_per_expert_stats_list", "tokens_per_expert_stats_list_out"}})
.SetKernelFn(PD_KERNEL(MoERedundantTopKSelectKernel))
.SetInferShapeFn(PD_INFER_SHAPE(MoERedundantTopKSelectKernelInferShape))
- .SetInferDtypeFn(PD_INFER_DTYPE(MoERedundantTopKSelectKernelInferDtype));
\ No newline at end of file
+ .SetInferDtypeFn(PD_INFER_DTYPE(MoERedundantTopKSelectKernelInferDtype));
diff --git a/custom_ops/gpu_ops/moe/moe_wna16_marlin_utils/kernel_fp16_ku4b8.cu b/custom_ops/gpu_ops/moe/moe_wna16_marlin_utils/kernel_fp16_ku4b8.cu
index 68d756a1a..b45f36947 100644
--- a/custom_ops/gpu_ops/moe/moe_wna16_marlin_utils/kernel_fp16_ku4b8.cu
+++ b/custom_ops/gpu_ops/moe/moe_wna16_marlin_utils/kernel_fp16_ku4b8.cu
@@ -106,4 +106,4 @@ template __global__ void Marlin( MARLIN_KERNEL_PARAMS );
-}
\ No newline at end of file
+}
diff --git a/custom_ops/gpu_ops/msg_utils.h b/custom_ops/gpu_ops/msg_utils.h
index e3ca0f646..ff46ccb00 100644
--- a/custom_ops/gpu_ops/msg_utils.h
+++ b/custom_ops/gpu_ops/msg_utils.h
@@ -36,4 +36,4 @@ struct msgdata {
struct msgdatakv {
long mtype;
int mtext[MAX_BSZ * 3 + 2]; // encoder_count, layer_id, bid- pair
-};
\ No newline at end of file
+};
diff --git a/custom_ops/gpu_ops/read_ids.py b/custom_ops/gpu_ops/read_ids.py
index 560c9758e..d84c54b4d 100644
--- a/custom_ops/gpu_ops/read_ids.py
+++ b/custom_ops/gpu_ops/read_ids.py
@@ -14,9 +14,10 @@
"""read_ids"""
import os
-import numpy as np
import struct
+import numpy as np
+
def deserialize_from_file(fp):
"""deserialize from file"""
diff --git a/custom_ops/gpu_ops/read_temp_ids.py b/custom_ops/gpu_ops/read_temp_ids.py
index 65c49a719..585bd900c 100644
--- a/custom_ops/gpu_ops/read_temp_ids.py
+++ b/custom_ops/gpu_ops/read_temp_ids.py
@@ -13,9 +13,10 @@
# limitations under the License.
"""read temp_ids from file"""
import os
-import numpy as np
import struct
+import numpy as np
+
def deserialize_from_file(fp):
"""
diff --git a/custom_ops/gpu_ops/remote_cache_kv_ipc.cc b/custom_ops/gpu_ops/remote_cache_kv_ipc.cc
index edbacd5d6..f1f53513b 100644
--- a/custom_ops/gpu_ops/remote_cache_kv_ipc.cc
+++ b/custom_ops/gpu_ops/remote_cache_kv_ipc.cc
@@ -15,7 +15,7 @@
#include "remote_cache_kv_ipc.h"
RemoteCacheKvIpc::save_cache_kv_complete_signal_layerwise_meta_data RemoteCacheKvIpc::kv_complete_signal_meta_data;
-RemoteCacheKvIpc::save_cache_kv_complete_signal_layerwise_meta_data_per_query
+RemoteCacheKvIpc::save_cache_kv_complete_signal_layerwise_meta_data_per_query
RemoteCacheKvIpc::kv_complete_signal_meta_data_per_query;
void* RemoteCacheKvIpc::kv_complete_signal_identity_ptr = nullptr;
bool RemoteCacheKvIpc::kv_complete_signal_shmem_opened = false;
@@ -118,4 +118,3 @@ void CUDART_CB RemoteCacheKvIpc::save_cache_kv_complete_signal_layerwise_per_que
RemoteCacheKvIpc::kv_complete_signal_meta_data_per_query.send_signal();
// std::printf("#### save_cache_kv_complete_signal_layerwise_per_query);
}
-
diff --git a/custom_ops/gpu_ops/remote_cache_kv_ipc.h b/custom_ops/gpu_ops/remote_cache_kv_ipc.h
index 4694e0b39..3c09af1e4 100644
--- a/custom_ops/gpu_ops/remote_cache_kv_ipc.h
+++ b/custom_ops/gpu_ops/remote_cache_kv_ipc.h
@@ -71,7 +71,7 @@ struct RemoteCacheKvIpc {
}
}
msg_sed.mtext[0] = encoder_count;
-
+
if (!inited) {
// just init once
const int msg_id = 1024 + rank;
@@ -90,7 +90,7 @@ struct RemoteCacheKvIpc {
assert(layer_id_ <= num_layers_);
}
};
-
+
static RemoteCacheKvIpc::save_cache_kv_complete_signal_layerwise_meta_data kv_complete_signal_meta_data;
static RemoteCacheKvIpc::save_cache_kv_complete_signal_layerwise_meta_data_per_query kv_complete_signal_meta_data_per_query;
static void* kv_complete_signal_identity_ptr;
diff --git a/custom_ops/gpu_ops/scaled_gemm_f8_i4_f16_weight_quantize.cu b/custom_ops/gpu_ops/scaled_gemm_f8_i4_f16_weight_quantize.cu
index 3e30db4a3..88b985b45 100644
--- a/custom_ops/gpu_ops/scaled_gemm_f8_i4_f16_weight_quantize.cu
+++ b/custom_ops/gpu_ops/scaled_gemm_f8_i4_f16_weight_quantize.cu
@@ -125,7 +125,7 @@ void group_wise_scale(ScaleT* scale,
}
}
-std::vector Fp8Int4WeightQuantizeKernel(const paddle::Tensor &input,
+std::vector Fp8Int4WeightQuantizeKernel(const paddle::Tensor &input,
int groupsize,
std::string scale_dtype) {
auto input_cpu = input.copy_to(paddle::CPUPlace(), false);
@@ -139,47 +139,47 @@ std::vector Fp8Int4WeightQuantizeKernel(const paddle::Tensor &in
if (groupsize > 0) {
scale = paddle::full({shape[0] / groupsize * shape[1]}, 1.0, paddle::DataType::BFLOAT16, paddle::CPUPlace());
group_wise_scale(scale.data(), input_cpu.data(), k, n, 7.0f, groupsize);
- group_wise_quant(packed_int4.data(),
- input_cpu.data(),
- scale.data(),
- k,
+ group_wise_quant(packed_int4.data(),
+ input_cpu.data(),
+ scale.data(),
+ k,
n,
groupsize);
} else {
scale = paddle::full({shape[1]}, 1.0, paddle::DataType::BFLOAT16, paddle::CPUPlace());
per_channel_scale(scale.data(), input_cpu.data(), k, n, 7.0f);
- per_channel_quant(packed_int4.data(),
- input_cpu.data(),
- scale.data(),
- k,
+ per_channel_quant(packed_int4.data(),
+ input_cpu.data(),
+ scale.data(),
+ k,
n);
}
} else if (scale_dtype == "float16") {
if (groupsize > 0) {
- scale = paddle::full({shape[0] / groupsize * shape[1]}, 1.0, paddle::DataType::FLOAT16, paddle::CPUPlace());
+ scale = paddle::full({shape[0] / groupsize * shape[1]}, 1.0, paddle::DataType::FLOAT16, paddle::CPUPlace());
group_wise_scale(scale.data(), input_cpu.data(), k, n, 7.0f, groupsize);
- group_wise_quant(packed_int4.data(),
- input_cpu.data(),
- scale.data(),
- k,
+ group_wise_quant(packed_int4.data(),
+ input_cpu.data(),
+ scale.data(),
+ k,
n,
groupsize);
} else {
- scale = paddle::full({shape[1]}, 1.0, paddle::DataType::FLOAT16, paddle::CPUPlace());
+ scale = paddle::full({shape[1]}, 1.0, paddle::DataType::FLOAT16, paddle::CPUPlace());
per_channel_scale(scale.data(), input_cpu.data(), k, n, 7.0f);
- per_channel_quant(packed_int4.data(),
- input_cpu.data(),
- scale.data(),
- k,
+ per_channel_quant(packed_int4.data(),
+ input_cpu.data(),
+ scale.data(),
+ k,
n);
}
}
auto out = paddle::full({shape[1] / 2, shape[0]}, 0, paddle::DataType::INT8, paddle::CPUPlace());
preprocess_weights_for_mixed_gemm(
- out.data(),
- packed_int4.data(),
- {k, n},
+ out.data(),
+ packed_int4.data(),
+ {k, n},
kernels::cutlass_kernels::QuantType::W4_AFP8,
false);
return {out, scale};
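The per-channel path above computes a scale of max(|w|)/7 per output channel, quantizes weights to signed 4-bit values, and packs two 4-bit values per int8 before handing the result to preprocess_weights_for_mixed_gemm. The numpy reference below follows that recipe; the nibble order and the column-major flattening are assumptions for illustration, since the exact packing done by the preprocessing helper is not shown in this hunk.

```python
# Reference (numpy) for the per-channel int4 quantization path above:
# scale = max(|w|) / 7 per column, round to [-7, 7], pack two nibbles per int8.
import numpy as np


def per_channel_int4_quant(w):
    """w: (k, n) float weights; returns (packed uint8->int8 nibbles, scale of shape (n,))."""
    scale = np.abs(w).max(axis=0) / 7.0                      # one scale per output channel
    q = np.clip(np.rint(w / scale), -7, 7).astype(np.int8)   # signed 4-bit range
    flat = q.T.reshape(-1).astype(np.uint8)                  # column-major flatten (assumed order)
    packed = ((flat[1::2] & 0x0F) << 4) | (flat[0::2] & 0x0F)  # low nibble first (assumed)
    return packed.astype(np.int8), scale
```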
diff --git a/custom_ops/gpu_ops/share_external_data.cu b/custom_ops/gpu_ops/share_external_data.cu
index 194a66795..8b204ccc3 100644
--- a/custom_ops/gpu_ops/share_external_data.cu
+++ b/custom_ops/gpu_ops/share_external_data.cu
@@ -1,11 +1,11 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-//
+//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
-//
+//
// http://www.apache.org/licenses/LICENSE-2.0
-//
+//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -27,7 +27,7 @@
std::vector ShareExternalData(paddle::Tensor& input,
const std::string shm_name,
- const std::vector& shape) {
+ const std::vector& shape) {
volatile shmStruct *shm = NULL;
sharedMemoryInfo info;
if (sharedMemoryOpen(shm_name.c_str(), sizeof(shmStruct), &info) != 0) {
@@ -62,4 +62,4 @@ PD_BUILD_STATIC_OP(share_external_data)
.Inputs({"input"})
.Outputs({"output"})
.Attrs({"shm_name: std::string", "shape: std::vector"})
- .SetKernelFn(PD_KERNEL(ShareExternalData));
\ No newline at end of file
+ .SetKernelFn(PD_KERNEL(ShareExternalData));
diff --git a/custom_ops/gpu_ops/speculate_decoding/draft_model/eagle_get_base_model_hidden_states.cu b/custom_ops/gpu_ops/speculate_decoding/draft_model/eagle_get_base_model_hidden_states.cu
index dcc9337f0..97d900319 100644
--- a/custom_ops/gpu_ops/speculate_decoding/draft_model/eagle_get_base_model_hidden_states.cu
+++ b/custom_ops/gpu_ops/speculate_decoding/draft_model/eagle_get_base_model_hidden_states.cu
@@ -19,7 +19,7 @@
// #define DEBUG_EAGLE_KERNEL
__global__ void ComputeOrderKernel(
- const int* seq_lens_this_time,
+ const int* seq_lens_this_time,
const int* seq_lens_encoder,
const int* base_model_seq_lens_this_time,
const int* base_model_seq_lens_encoder,
@@ -47,7 +47,7 @@ __global__ void ComputeOrderKernel(
printf("batch %d: cur_seq_lens_encoder > 0 \n", i);
#endif
for (int j = 0; j < cur_seq_lens_encoder; j++) {
- position_map[in_offset++] = out_offset++;
+ position_map[in_offset++] = out_offset++;
}
// 2. base model encoder. Base step=0
} else if (cur_base_model_seq_lens_encoder != 0) {
@@ -69,13 +69,13 @@ __global__ void ComputeOrderKernel(
in_offset += cur_base_model_seq_lens_this_time;
} else /*Accept all draft tokens*/ {
#ifdef DEBUG_EAGLE_KERNEL
- printf("batch %d: accept_num > actual_draft_token_num \n", i);
+ printf("batch %d: accept_num > actual_draft_token_num \n", i);
#endif
position_map[in_offset + accept_num - 2] = out_offset++;
position_map[in_offset + accept_num - 1] = out_offset++;
in_offset += cur_base_model_seq_lens_this_time;
}
- }
+ }
}
output_token_num[0] = out_offset;
#ifdef DEBUG_EAGLE_KERNEL
@@ -208,7 +208,7 @@ std::vector EagleGetHiddenStates(
}
case paddle::DataType::BFLOAT16: {
return DispatchDtype(
- input,
+ input,
seq_lens_this_time,
seq_lens_encoder,
seq_lens_decoder,
diff --git a/custom_ops/gpu_ops/speculate_decoding/draft_model/eagle_get_self_hidden_states.cu b/custom_ops/gpu_ops/speculate_decoding/draft_model/eagle_get_self_hidden_states.cu
index f440c43c6..878926f3b 100644
--- a/custom_ops/gpu_ops/speculate_decoding/draft_model/eagle_get_self_hidden_states.cu
+++ b/custom_ops/gpu_ops/speculate_decoding/draft_model/eagle_get_self_hidden_states.cu
@@ -72,7 +72,7 @@ __global__ void computeOrderKernel(
output_token_num[0] = out_offset;
#ifdef DEBUG_EAGLE_KERNEL
printf("position map output_token_num%d:\n", output_token_num[0]);
- for (int i = 0; i < output_token_num[0]; i++) {
+ for (int i = 0; i < output_token_num[0]; i++) {
printf("%d ", src_map[i]);
}
printf("\n");
@@ -187,4 +187,4 @@ PD_BUILD_STATIC_OP(eagle_get_self_hidden_states)
"seq_lens_this_time",
"step_idx"})
.Outputs({"out"})
- .SetKernelFn(PD_KERNEL(EagleGetSelfHiddenStates));
\ No newline at end of file
+ .SetKernelFn(PD_KERNEL(EagleGetSelfHiddenStates));
diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_rebuild_append_padding.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_rebuild_append_padding.cu
index 48c24a0e0..d4937116c 100644
--- a/custom_ops/gpu_ops/speculate_decoding/speculate_rebuild_append_padding.cu
+++ b/custom_ops/gpu_ops/speculate_decoding/speculate_rebuild_append_padding.cu
@@ -26,7 +26,7 @@ __global__ void RebuildAppendPaddingKernel(
const int seq_len,
const int dim_embed,
const size_t elem_nums) {
- using LoadT = AlignedVector;
+ using LoadT = AlignedVector;
LoadT src_vec;
const int64_t global_idx = blockDim.x * blockIdx.x + threadIdx.x;
for (int64_t i = global_idx * VecSize; i < elem_nums; i += gridDim.x * blockDim.x * VecSize) {
@@ -42,7 +42,7 @@ __global__ void RebuildAppendPaddingKernel(
const int input_token_id = ori_token_id - cum_offset[bi] + seq_id;
const int bias_idx = i % dim_embed;
-
+
Load(&full_hidden_states[input_token_id * dim_embed + bias_idx], &src_vec);
Store(src_vec, &out[i]);
}
@@ -78,14 +78,14 @@ std::vector DispatchDtype(
GetNumBlocks(pack_num, &grid_size);
RebuildAppendPaddingKernel<<>>(
- reinterpret_cast(out.data()),
- reinterpret_cast(full_hidden_states.data()),
- cum_offsets.data(),
- seq_len_encoder.data(),
- seq_len_decoder.data(),
- output_padding_offset.data(),
- max_seq_len,
- dim_embed,
+ reinterpret_cast(out.data()),
+ reinterpret_cast(full_hidden_states.data()),
+ cum_offsets.data(),
+ seq_len_encoder.data(),
+ seq_len_decoder.data(),
+ output_padding_offset.data(),
+ max_seq_len,
+ dim_embed,
elem_nums);
return {out};
}
@@ -99,7 +99,7 @@ std::vector RebuildAppendPadding(
const paddle::Tensor& output_padding_offset,
const int max_seq_len) {
-
+
switch (full_hidden_states.dtype()) {
case paddle::DataType::BFLOAT16:
return DispatchDtype(
@@ -137,7 +137,7 @@ std::vector RebuildAppendPaddingInferDtype(
PD_BUILD_STATIC_OP(speculate_rebuild_append_padding)
- .Inputs({"full_hidden_states",
+ .Inputs({"full_hidden_states",
"cum_offsets",
"seq_len_encoder",
"seq_len_decoder",
@@ -146,4 +146,4 @@ PD_BUILD_STATIC_OP(speculate_rebuild_append_padding)
.Outputs({"out"})
.SetKernelFn(PD_KERNEL(RebuildAppendPadding))
.SetInferShapeFn(PD_INFER_SHAPE(RebuildAppendPaddingInferShape))
- .SetInferDtypeFn(PD_INFER_DTYPE(RebuildAppendPaddingInferDtype));
\ No newline at end of file
+ .SetInferDtypeFn(PD_INFER_DTYPE(RebuildAppendPaddingInferDtype));
diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_step_reschedule.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_step_reschedule.cu
index bd18bdd6b..baf1da9e1 100644
--- a/custom_ops/gpu_ops/speculate_decoding/speculate_step_reschedule.cu
+++ b/custom_ops/gpu_ops/speculate_decoding/speculate_step_reschedule.cu
@@ -93,7 +93,7 @@ __global__ void speculate_free_and_reschedule(bool *stop_flags,
used_list_len[tid] = 0;
}
} else if (seq_lens_this_time[tid] != 0 && max_possible_block_idx < block_num_per_seq &&
- block_table_now[(seq_lens_decoder[tid] + max_draft_tokens +
+ block_table_now[(seq_lens_decoder[tid] + max_draft_tokens +
1) /
block_size] == -1) {
        // Count the positions and total number of blocks that need to be allocated
@@ -347,7 +347,7 @@ PD_BUILD_STATIC_OP(speculate_step_reschedule)
"next_tokens",
"first_token_ids",
"accept_num"})
- .Attrs({"block_size: int",
+ .Attrs({"block_size: int",
"encoder_decoder_block_num: int",
"max_draft_tokens: int"})
.Outputs({"stop_flags_out",
diff --git a/custom_ops/gpu_ops/step_system_cache.cu b/custom_ops/gpu_ops/step_system_cache.cu
index a432110af..4b236bd80 100644
--- a/custom_ops/gpu_ops/step_system_cache.cu
+++ b/custom_ops/gpu_ops/step_system_cache.cu
@@ -60,7 +60,7 @@ __global__ void recover_block_system_cache(int *recover_block_list, // [bsz]
const int ori_free_list_len_tid0 = atomicSub(free_list_len, decoder_used_len);
ori_free_list_len = ori_free_list_len_tid0;
#ifdef DEBUG_STEP
- printf("seq_id: %d, ori_seq_len_encoder: %d, step_idx_now: %d, seq_len: %d, ori_free_list_len_tid0: %d, ori_free_list_len: %d\n",
+ printf("seq_id: %d, ori_seq_len_encoder: %d, step_idx_now: %d, seq_len: %d, ori_free_list_len_tid0: %d, ori_free_list_len: %d\n",
recover_id, ori_seq_len_encoder, step_idx_now, seq_len, ori_free_list_len_tid0, ori_free_list_len);
#endif
}
@@ -95,7 +95,7 @@ void StepSystemCache(const paddle::Tensor& stop_flags,
const paddle::Tensor& recover_lens,
const paddle::Tensor& need_block_list,
const paddle::Tensor& need_block_len,
- const paddle::Tensor& used_list_len,
+ const paddle::Tensor& used_list_len,
const paddle::Tensor& free_list,
const paddle::Tensor& free_list_len,
const paddle::Tensor& input_ids,
@@ -178,7 +178,7 @@ void StepSystemCache(const paddle::Tensor& stop_flags,
}
PD_BUILD_STATIC_OP(step_system_cache)
- .Inputs({"stop_flags",
+ .Inputs({"stop_flags",
"seq_lens_this_time",
"ori_seq_lens_encoder",
"ori_seq_lens_decoder",
diff --git a/custom_ops/gpu_ops/swap_cache.cu b/custom_ops/gpu_ops/swap_cache.cu
index 6ccdaab43..a25d08886 100644
--- a/custom_ops/gpu_ops/swap_cache.cu
+++ b/custom_ops/gpu_ops/swap_cache.cu
@@ -68,26 +68,26 @@ void SwapCache(const paddle::Tensor& cache_gpu, // gpu
switch (cache_gpu.dtype()) {
case paddle::DataType::BFLOAT16:
return SwapCacheImpl(
- cache_gpu,
- cache_cpu_ptr,
+ cache_gpu,
+ cache_cpu_ptr,
max_block_num_cpu,
- swap_block_ids_gpu,
+ swap_block_ids_gpu,
swap_block_ids_cpu,
mode);
case paddle::DataType::FLOAT16:
return SwapCacheImpl(
- cache_gpu,
- cache_cpu_ptr,
+ cache_gpu,
+ cache_cpu_ptr,
max_block_num_cpu,
- swap_block_ids_gpu,
+ swap_block_ids_gpu,
swap_block_ids_cpu,
mode);
case paddle::DataType::UINT8:
return SwapCacheImpl(
- cache_gpu,
- cache_cpu_ptr,
+ cache_gpu,
+ cache_cpu_ptr,
max_block_num_cpu,
- swap_block_ids_gpu,
+ swap_block_ids_gpu,
swap_block_ids_cpu,
mode);
default:
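SwapCache above dispatches by cache dtype and copies whole KV blocks between the device cache and a host buffer, pairing the i-th entry of swap_block_ids_gpu with the i-th entry of swap_block_ids_cpu. The hunk only shows the dtype dispatch, so the meaning of `mode` below (0: offload to host, nonzero: reload to device) is an assumption used purely to illustrate the block pairing.

```python
# Sketch (numpy stand-in) of block swapping between a device cache and a host
# cache; the direction encoding of `mode` is an assumption, not from the hunk.
import numpy as np


def swap_cache_blocks(cache_gpu, cache_cpu, gpu_ids, cpu_ids, mode):
    """cache_*: (num_blocks, block_size, ...) arrays standing in for the real buffers."""
    for gpu_block, cpu_block in zip(gpu_ids, cpu_ids):
        if mode == 0:   # offload: device block -> host block (assumed)
            cache_cpu[cpu_block] = cache_gpu[gpu_block]
        else:           # reload: host block -> device block (assumed)
            cache_gpu[gpu_block] = cache_cpu[cpu_block]
```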
diff --git a/custom_ops/gpu_ops/text_image_gather_scatter.cu b/custom_ops/gpu_ops/text_image_gather_scatter.cu
index 6bcd92263..09fc07f96 100644
--- a/custom_ops/gpu_ops/text_image_gather_scatter.cu
+++ b/custom_ops/gpu_ops/text_image_gather_scatter.cu
@@ -47,7 +47,7 @@ inline cudaError_t GetGridSize(int64_t n, int block_size, int num_waves, int* nu
template
__global__ void text_image_scatter_kernel(
- T* input_ptr,
+ T* input_ptr,
T* text_gather_ptr,
T* image_gather_ptr,
int32_t* token_type_ids,
@@ -72,8 +72,8 @@ __global__ void text_image_scatter_kernel(
int32_t token_type_ids_num = token_type_ids[token_idx];
int64_t input_load_offset = token_idx * hidden_size + hidden_offset;
-
- Load(input_ptr + input_load_offset, &input_ptr_vec);
+
+ Load(input_ptr + input_load_offset, &input_ptr_vec);
#pragma unroll
for(int vi = 0; vi < VecSize; ++vi) {
text_imgaes_vec[vi] = input_ptr_vec[vi];
@@ -92,7 +92,7 @@ __global__ void text_image_scatter_kernel(
template
__global__ void text_image_gather_kernel(
- T* output_ptr,
+ T* output_ptr,
T* text_gather_ptr,
T* image_gather_ptr,
int32_t* token_type_ids,
@@ -131,8 +131,8 @@ __global__ void text_image_gather_kernel(
}
int64_t input_load_offset = token_idx * hidden_size + hidden_offset;
-
- Store(output_ptr_vec, output_ptr + input_load_offset);
+
+ Store(output_ptr_vec, output_ptr + input_load_offset);
}
}
@@ -159,7 +159,7 @@ void LaunchTextImageGatherScatter(
const int64_t tot_element_num = token_num * hidden_size;
int64_t tot_pack_num = (tot_element_num + VecSize - 1) / VecSize;
-
+
const int block_size = 128;
int grid_index = (token_num + block_size - 1) / block_size;
constexpr int32_t kNumWaves = 16;
@@ -170,8 +170,8 @@ void LaunchTextImageGatherScatter(
if (is_scatter) {
text_image_scatter_kernel<<>>(
reinterpret_cast(input.data()),
- reinterpret_cast(text_input.data()),
- reinterpret_cast(image_input.data()),
+ reinterpret_cast(text_input.data()),
+ reinterpret_cast(image_input.data()),
reinterpret_cast(token_type_ids.data()),
reinterpret_cast(text_index.data()),
reinterpret_cast(image_index.data()),
@@ -181,8 +181,8 @@ void LaunchTextImageGatherScatter(
} else {
text_image_gather_kernel<<>>(
reinterpret_cast(input.data()),
- reinterpret_cast(text_input.data()),
- reinterpret_cast(image_input.data()),
+ reinterpret_cast(text_input.data()),
+ reinterpret_cast(image_input.data()),
reinterpret_cast(token_type_ids.data()),
reinterpret_cast(text_index.data()),
reinterpret_cast(image_index.data()),
@@ -216,8 +216,8 @@ void TextImageGatherScatter(
PD_BUILD_STATIC_OP(text_image_gather_scatter)
.Inputs({"input",
- "text_input",
- "image_input",
+ "text_input",
+ "image_input",
"token_type_ids",
"text_index",
"image_index"})
@@ -229,5 +229,5 @@ PD_BUILD_STATIC_OP(text_image_gather_scatter)
.SetInplaceMap({{"text_input", "text_input_out"},
{"image_input", "image_input_out"},
{"text_index", "text_index_out"},
- {"image_index", "image_index_out"}})
+ {"image_index", "image_index_out"}})
.SetKernelFn(PD_KERNEL(TextImageGatherScatter));
diff --git a/custom_ops/gpu_ops/text_image_index_out.cu b/custom_ops/gpu_ops/text_image_index_out.cu
index 4140e2742..b6d8941d6 100644
--- a/custom_ops/gpu_ops/text_image_index_out.cu
+++ b/custom_ops/gpu_ops/text_image_index_out.cu
@@ -16,7 +16,7 @@
template
__global__ void text_image_index_out_kernel(
- int32_t* token_type_ids,
+ int32_t* token_type_ids,
int32_t* text_index,
int32_t* image_index,
const int64_t token_num
@@ -25,7 +25,7 @@ __global__ void text_image_index_out_kernel(
if (global_thread_idx >= 1) return;
int text_count = 0;
int images_count = 0;
-
+
for (int i = 0; i < token_num; ++i) {
// printf(" %d %d %d %d \n", text_index[i], text_count, images_count, i);
if (token_type_ids[i] == 0) {
@@ -60,5 +60,5 @@ PD_BUILD_STATIC_OP(text_image_index_out)
.Outputs({"text_index_out",
"image_index_out"})
.SetInplaceMap({{"text_index", "text_index_out"},
- {"image_index", "image_index_out"}})
+ {"image_index", "image_index_out"}})
.SetKernelFn(PD_KERNEL(TextImageIndexOut));
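Before the formatting-only hunks continue, a brief orientation on what this op computes may help: a Python sketch of the text/image index computation, inferred from the counting loop visible in the hunk above. Treating the unused modality slot as 0 is an assumption for illustration, not something the diff states.

def text_image_index_out_sketch(token_type_ids):
    """Position of each token within its own modality stream (0 = text, 1 = image)."""
    text_index, image_index = [], []
    text_count = image_count = 0
    for t in token_type_ids:
        if t == 0:
            text_index.append(text_count)
            image_index.append(0)   # assumption: image slot left untouched for text tokens
            text_count += 1
        else:
            text_index.append(0)    # assumption: text slot left untouched for image tokens
            image_index.append(image_count)
            image_count += 1
    return text_index, image_index

# text_image_index_out_sketch([0, 0, 1, 0, 1]) -> ([0, 1, 0, 2, 0], [0, 0, 0, 0, 1])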
diff --git a/custom_ops/gpu_ops/tune_cublaslt_gemm.cu b/custom_ops/gpu_ops/tune_cublaslt_gemm.cu
index fab6976bc..428d56364 100644
--- a/custom_ops/gpu_ops/tune_cublaslt_gemm.cu
+++ b/custom_ops/gpu_ops/tune_cublaslt_gemm.cu
@@ -810,4 +810,4 @@ PD_BUILD_STATIC_OP(tune_cublaslt_gemm)
"is_test: bool",
"is_read_from_file: bool",
"path: std::string"})
- .SetKernelFn(PD_KERNEL(TuneCublasltGemm));
\ No newline at end of file
+ .SetKernelFn(PD_KERNEL(TuneCublasltGemm));
diff --git a/custom_ops/gpu_ops/update_inputs_beam.cu b/custom_ops/gpu_ops/update_inputs_beam.cu
index 74d4c2b53..aea374661 100644
--- a/custom_ops/gpu_ops/update_inputs_beam.cu
+++ b/custom_ops/gpu_ops/update_inputs_beam.cu
@@ -33,7 +33,7 @@ __global__ void update_inputs_beam_kernel(
if (block_idx == 0) {
seq_lens_this_time[thread_idx] = seq_lens_this_time[bsz_index];
seq_lens_encoder[thread_idx] = seq_lens_encoder[bsz_index];
- }
+ }
if (block_idx < seq_len) {
input_ids[thread_idx * seq_len + block_idx] = input_ids[bsz_index * seq_len + block_idx];
}
@@ -74,8 +74,8 @@ void UpdateInputesBeam(
PD_BUILD_STATIC_OP(update_inputs_beam)
.Inputs({"beam_width",
- "seq_lens_this_time",
- "seq_lens_encoder",
+ "seq_lens_this_time",
+ "seq_lens_encoder",
"input_ids",
"logits"})
.Outputs({"seq_lens_this_time_out",
@@ -86,4 +86,4 @@ PD_BUILD_STATIC_OP(update_inputs_beam)
{"seq_lens_encoder", "seq_lens_encoder_out"},
{"input_ids", "input_ids_out"},
{"logits", "logits_out"}})
- .SetKernelFn(PD_KERNEL(UpdateInputesBeam));
\ No newline at end of file
+ .SetKernelFn(PD_KERNEL(UpdateInputesBeam));
diff --git a/custom_ops/setup_ops.py b/custom_ops/setup_ops.py
index c002beeb6..de49ab4ea 100644
--- a/custom_ops/setup_ops.py
+++ b/custom_ops/setup_ops.py
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-""" setup for FastDeploy custom ops """
+"""setup for FastDeploy custom ops"""
import importlib
import json
import os
@@ -41,8 +41,7 @@ ROOT_DIR = Path(__file__).parent.parent
# cannot import envs directly because it depends on fastdeploy,
# which is not installed yet
-envs = load_module_from_path('envs',
- os.path.join(ROOT_DIR, 'fastdeploy', 'envs.py'))
+envs = load_module_from_path("envs", os.path.join(ROOT_DIR, "fastdeploy", "envs.py"))
archs = json.loads(envs.FD_BUILDING_ARCS)
use_bf16 = envs.FD_CPU_USE_BF16 == "True"
@@ -143,8 +142,7 @@ def get_nvcc_version():
"""
Get cuda version of nvcc.
"""
- nvcc_output = subprocess.check_output(["nvcc", "--version"],
- universal_newlines=True)
+ nvcc_output = subprocess.check_output(["nvcc", "--version"], universal_newlines=True)
output = nvcc_output.split()
release_idx = output.index("release") + 1
nvcc_cuda_version = float(output[release_idx].split(",")[0])
@@ -160,13 +158,19 @@ def get_gencode_flags(archs):
for cc_val in cc_s:
if cc_val == 90:
arch_code = "90a"
- flags += ["-gencode", f"arch=compute_{arch_code},code=sm_{arch_code}"]
- elif cc_val == 100: # Assuming 100 is the code for Blackwell SM10.x
+ flags += [
+ "-gencode",
+ f"arch=compute_{arch_code},code=sm_{arch_code}",
+ ]
+ elif cc_val == 100: # Assuming 100 is the code for Blackwell SM10.x
# Per NVIDIA dev blog, for CUTLASS and architecture-specific features on CC 10.0, use '100a'
# https://developer.nvidia.com/blog/nvidia-blackwell-and-nvidia-cuda-12-9-introduce-family-specific-architecture-features/
# "The CUTLASS build instructions specify using the a flag when building for devices of CC 9.0 and 10.0"
arch_code = "100a"
- flags += ["-gencode", f"arch=compute_{arch_code},code=sm_{arch_code}"]
+ flags += [
+ "-gencode",
+ f"arch=compute_{arch_code},code=sm_{arch_code}",
+ ]
else:
flags += ["-gencode", f"arch=compute_{cc_val},code=sm_{cc_val}"]
return flags
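For readers skimming the reformat above: a minimal sketch of how get_gencode_flags maps compute capabilities to NVCC -gencode flags, assuming the same "90a"/"100a" arch codes shown in the hunk; illustrative only, not part of the patch.

def gencode_flags_sketch(cc_values):
    flags = []
    for cc in cc_values:
        if cc == 90:
            arch = "90a"    # Hopper: arch-specific CUTLASS features need sm_90a
        elif cc == 100:
            arch = "100a"   # Blackwell: "100a" per the NVIDIA blog cited in the hunk
        else:
            arch = str(cc)
        flags += ["-gencode", f"arch=compute_{arch},code=sm_{arch}"]
    return flags

# gencode_flags_sketch([80, 90]) ->
# ['-gencode', 'arch=compute_80,code=sm_80', '-gencode', 'arch=compute_90a,code=sm_90a']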
@@ -194,7 +198,7 @@ if paddle.is_compiled_with_rocm():
clone_git_repo("v3.11.3", "https://bgithub.xyz/nlohmann/json.git", json_dir)
if not os.listdir(json_dir):
raise ValueError("Git clone nlohmann_json failed!")
- sources=[
+ sources = [
"gpu_ops/set_value_by_flags.cu",
"gpu_ops/token_penalty_multi_scores.cu",
"gpu_ops/stop_generation.cu",
@@ -302,8 +306,7 @@ elif paddle.is_compiled_with_cuda():
if not os.path.exists(cutlass_dir) or not os.listdir(cutlass_dir):
if not os.path.exists(cutlass_dir):
os.makedirs(cutlass_dir)
- clone_git_repo("v3.8.0", "https://github.com/NVIDIA/cutlass.git",
- cutlass_dir)
+ clone_git_repo("v3.8.0", "https://github.com/NVIDIA/cutlass.git", cutlass_dir)
if not os.listdir(cutlass_dir):
raise ValueError("Git clone cutlass failed!")
@@ -312,8 +315,7 @@ elif paddle.is_compiled_with_cuda():
if not os.path.exists(deep_gemm_dir) or not os.listdir(deep_gemm_dir):
if not os.path.exists(deep_gemm_dir):
os.makedirs(deep_gemm_dir)
- clone_git_repo("main", "https://github.com/deepseek-ai/DeepGEMM.git",
- deep_gemm_dir)
+ clone_git_repo("main", "https://github.com/deepseek-ai/DeepGEMM.git", deep_gemm_dir)
if not os.listdir(deep_gemm_dir):
raise ValueError("Git clone DeepGEMM failed!")
cur_path = os.path.dirname(os.path.abspath(__file__))
@@ -347,15 +349,13 @@ elif paddle.is_compiled_with_cuda():
try:
shutil.copytree(src_dir, dst_dir)
except Exception as e:
- raise RuntimeError(
- f"Failed to copy from {src_dir} to {dst_dir}: {e}")
+ raise RuntimeError(f"Failed to copy from {src_dir} to {dst_dir}: {e}")
json_dir = "third_party/nlohmann_json"
if not os.path.exists(json_dir) or not os.listdir(json_dir):
if not os.path.exists(json_dir):
os.makedirs(json_dir)
- clone_git_repo("v3.11.3", "https://github.com/nlohmann/json.git",
- json_dir)
+ clone_git_repo("v3.11.3", "https://github.com/nlohmann/json.git", json_dir)
if not os.listdir(json_dir):
raise ValueError("Git clone nlohmann_json failed!")
@@ -372,7 +372,7 @@ elif paddle.is_compiled_with_cuda():
"-Ithird_party/nlohmann_json/include",
]
nvcc_version = get_nvcc_version()
- print(f'nvcc_version = {nvcc_version}')
+ print(f"nvcc_version = {nvcc_version}")
if nvcc_version >= 12.0:
sources += ["gpu_ops/sample_kernels/air_top_p_sampling.cu"]
cc = max(get_sm_version(archs))
@@ -414,31 +414,24 @@ elif paddle.is_compiled_with_cuda():
# Running generate fp8 gemm codes.
# Common for SM89, SM90, SM100 (Blackwell)
nvcc_compile_args += ["-DENABLE_FP8"]
- nvcc_compile_args += [
- "-Igpu_ops/cutlass_kernels/fp8_gemm_fused/autogen"
- ]
+ nvcc_compile_args += ["-Igpu_ops/cutlass_kernels/fp8_gemm_fused/autogen"]
# This script seems general enough for different SM versions, specific templates are chosen by CUTLASS.
os.system("python utils/auto_gen_visitor_fp8_gemm_fused_kernels.py")
- if cc >= 90: # Hopper and newer
+ if cc >= 90: # Hopper and newer
# SM90 (Hopper) specific auto-generation and flags
- if cc == 90: # Only for SM90
+ if cc == 90: # Only for SM90
nvcc_compile_args += [
# The gencode for 90a is added in get_gencode_flags now
# "-gencode",
# "arch=compute_90a,code=compute_90a",
"-O3",
- "-DNDEBUG", # NDEBUG is common, consider moving if not specific to 90a
+ "-DNDEBUG", # NDEBUG is common, consider moving if not specific to 90a
]
print("SM90: Running SM90-specific FP8 kernel auto-generation.")
- os.system(
- "python utils/auto_gen_fp8_fp8_gemm_fused_kernels_sm90.py")
- os.system(
- "python utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels_sm90.py"
- )
- os.system(
- "python utils/auto_gen_fp8_fp8_block_gemm_fused_kernels_sm90.py"
- )
+ os.system("python utils/auto_gen_fp8_fp8_gemm_fused_kernels_sm90.py")
+ os.system("python utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels_sm90.py")
+ os.system("python utils/auto_gen_fp8_fp8_block_gemm_fused_kernels_sm90.py")
nvcc_compile_args += [
"-DENABLE_SCALED_MM_SM90=1",
@@ -450,14 +443,14 @@ elif paddle.is_compiled_with_cuda():
"gpu_ops/cutlass_kernels/w8a8/c3x/scaled_mm_sm90_int8.cu",
"gpu_ops/cutlass_kernels/w8a8/c3x/scaled_mm_azp_sm90_int8.cu",
]
- elif cc == 100 and nvcc_version >= 12.9: # Blackwell SM100 specifics
+ elif cc == 100 and nvcc_version >= 12.9: # Blackwell SM100 specifics
print("SM100 (Blackwell): Applying SM100 configurations.")
nvcc_compile_args += [
# The gencode for 100a is added in get_gencode_flags
# "-gencode",
# "arch=compute_100a,code=compute_100a",
- "-O3", # Common optimization flag
- "-DNDEBUG", # Common debug flag
+ "-O3", # Common optimization flag
+ "-DNDEBUG", # Common debug flag
# Potentially add -DENABLE_SM100_FEATURES if specific macros are identified
]
# Placeholder for SM100-specific kernel auto-generation scripts
@@ -469,18 +462,16 @@ elif paddle.is_compiled_with_cuda():
# Add SM100 specific sources if any, e.g., for new hardware intrinsics
# sources += ["gpu_ops/cutlass_kernels/w8a8/c4x_sm100.cu"] # Example
- pass # No SM100 specific sources identified yet beyond what CUTLASS handles
- else: # For cc >= 89 but not 90 or 100 (e.g. SM89)
+ pass # No SM100 specific sources identified yet beyond what CUTLASS handles
+ else: # For cc >= 89 but not 90 or 100 (e.g. SM89)
print(f"SM{cc}: Running generic FP8 kernel auto-generation.")
os.system("python utils/auto_gen_fp8_fp8_gemm_fused_kernels.py")
- os.system(
- "python utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels.py")
+ os.system("python utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels.py")
- else: # For cc == 89 (Ada)
+ else: # For cc == 89 (Ada)
print("SM89: Running generic FP8 kernel auto-generation.")
os.system("python utils/auto_gen_fp8_fp8_gemm_fused_kernels.py")
- os.system(
- "python utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels.py")
+ os.system("python utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels.py")
# Common FP8 sources for SM89+
sources += [
@@ -493,7 +484,7 @@ elif paddle.is_compiled_with_cuda():
"gpu_ops/scaled_gemm_f8_i4_f16_weight_quantize.cu",
"gpu_ops/cutlass_kernels/cutlass_heuristic.cu",
"gpu_ops/cutlass_kernels/cutlass_preprocessors.cu",
- "gpu_ops/fused_hadamard_quant_fp8.cu"
+ "gpu_ops/fused_hadamard_quant_fp8.cu",
]
sources += find_end_files(fp8_auto_gen_directory, ".cu")
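Taken together, the setup_ops.py hunks above reformat the per-architecture FP8 code-generation dispatch. A condensed sketch of that control flow follows; the script names are copied from the diff, while the surrounding build logic is simplified and assumed.

import os

def run_fp8_autogen_sketch(cc, nvcc_version):
    # Common generator for all FP8-capable archs (cc >= 89).
    os.system("python utils/auto_gen_visitor_fp8_gemm_fused_kernels.py")
    if cc >= 90:
        if cc == 90:  # Hopper
            os.system("python utils/auto_gen_fp8_fp8_gemm_fused_kernels_sm90.py")
            os.system("python utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels_sm90.py")
            os.system("python utils/auto_gen_fp8_fp8_block_gemm_fused_kernels_sm90.py")
        elif cc == 100 and nvcc_version >= 12.9:  # Blackwell: no extra generators yet
            pass
        else:  # other cc >= 90 variants fall back to the generic generators
            os.system("python utils/auto_gen_fp8_fp8_gemm_fused_kernels.py")
            os.system("python utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels.py")
    else:  # cc == 89 (Ada)
        os.system("python utils/auto_gen_fp8_fp8_gemm_fused_kernels.py")
        os.system("python utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels.py")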
diff --git a/custom_ops/setup_ops_base.py b/custom_ops/setup_ops_base.py
index d05b1d39e..2386fee19 100644
--- a/custom_ops/setup_ops_base.py
+++ b/custom_ops/setup_ops_base.py
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-""" setup for FASTDEPLOY base ops """
+"""setup for FASTDEPLOY base ops"""
from paddle.utils.cpp_extension import CppExtension, setup
@@ -27,7 +27,8 @@ setup(
"cpu_ops/rebuild_padding.cc",
],
extra_compile_args=[
- "-DPy_LIMITED_API=0x03090000", "-DPADDLE_ON_INFERENCE"
+ "-DPy_LIMITED_API=0x03090000",
+ "-DPADDLE_ON_INFERENCE",
],
),
)
diff --git a/custom_ops/setup_ops_cpu.py b/custom_ops/setup_ops_cpu.py
index 9990d2f58..6e6083e72 100644
--- a/custom_ops/setup_ops_cpu.py
+++ b/custom_ops/setup_ops_cpu.py
@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-""" setup for FASTDEPLOY custom cpu ops """
+"""setup for FASTDEPLOY custom cpu ops"""
import os
import subprocess
import tarfile
@@ -26,8 +26,7 @@ ROOT_DIR = Path(__file__).parent.parent
# which is not installed yet
from .setup_ops import load_module_from_path
-envs = load_module_from_path('envs',
- os.path.join(ROOT_DIR, 'fastdeploy', 'envs.py'))
+envs = load_module_from_path("envs", os.path.join(ROOT_DIR, "fastdeploy", "envs.py"))
BUILDING_ARCS = []
use_bf16 = envs.FD_CPU_USE_BF16 == "True"
diff --git a/custom_ops/utils/auto_gen_fp8_fp8_block_gemm_fused_kernels_sm90.py b/custom_ops/utils/auto_gen_fp8_fp8_block_gemm_fused_kernels_sm90.py
index 53fae917a..0e9e755be 100644
--- a/custom_ops/utils/auto_gen_fp8_fp8_block_gemm_fused_kernels_sm90.py
+++ b/custom_ops/utils/auto_gen_fp8_fp8_block_gemm_fused_kernels_sm90.py
@@ -48,17 +48,26 @@ def get_candidate_configs(sm):
candidate_configs = list()
hasbias = ("false", "true")
- KernelSchedule = (
- "KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<1>", )
- EpilogueSchedule = ("TmaWarpSpecializedCooperative", )
+ KernelSchedule = ("KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<1>",)
+ EpilogueSchedule = ("TmaWarpSpecializedCooperative",)
TileSchedule = ("PersistentScheduler", "StreamKScheduler")
for act_tag in [
("noact", "Identity"),
- # ("relu", "ReLu"),
- # ("gelu", "GELU"),
+ # ("relu", "ReLu"),
+ # ("gelu", "GELU"),
]:
- candidate_configs.extend([(hasbias, act_tag, tiles, KernelSchedule,
- EpilogueSchedule, TileSchedule)])
+ candidate_configs.extend(
+ [
+ (
+ hasbias,
+ act_tag,
+ tiles,
+ KernelSchedule,
+ EpilogueSchedule,
+ TileSchedule,
+ )
+ ]
+ )
return candidate_configs
@@ -66,16 +75,13 @@ def get_shape_str(tile_shape):
"""
return tile_shape string.
"""
- blocks, clusters = [
- s.replace(" ", "").strip("<>").split(",") for s in tile_shape
- ]
+ blocks, clusters = [s.replace(" ", "").strip("<>").split(",") for s in tile_shape]
blocks = [elem.strip("_") for elem in blocks]
clusters = [elem.strip("_") for elem in clusters]
return blocks, clusters
-def check_config_valid(tile_shape, kernel_schedule, epilogue_schedule,
- tile_schedule):
+def check_config_valid(tile_shape, kernel_schedule, epilogue_schedule, tile_schedule):
"""
check the cutlass config valid.
"""
@@ -304,13 +310,10 @@ def SubstituteTemplate(template, values_base):
SubstituteTemplate
"""
values = copy.deepcopy(values_base)
- if values.get("KernelSchedule"
- ) is not None and "Auto" in values["KernelSchedule"]:
+ if values.get("KernelSchedule") is not None and "Auto" in values["KernelSchedule"]:
values["KernelSchedule"] = "collective::" + values["KernelSchedule"]
- if values.get("EpilogueSchedule"
- ) is not None and "Auto" in values["EpilogueSchedule"]:
- values[
- "EpilogueSchedule"] = "collective::" + values["EpilogueSchedule"]
+ if values.get("EpilogueSchedule") is not None and "Auto" in values["EpilogueSchedule"]:
+ values["EpilogueSchedule"] = "collective::" + values["EpilogueSchedule"]
text = template
changed = True
while changed:
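The SubstituteTemplate hunk above shows only the head of the fixpoint loop; a hedged sketch of its shape follows. The "${key}" placeholder syntax is an assumption for illustration; only the "Auto" -> "collective::" prefixing and the loop-until-stable structure are visible in the diff.

import copy
import re

def substitute_template_sketch(template, values_base):
    values = copy.deepcopy(values_base)
    for key in ("KernelSchedule", "EpilogueSchedule"):
        if values.get(key) is not None and "Auto" in values[key]:
            values[key] = "collective::" + values[key]
    text = template
    changed = True
    while changed:  # re-scan until a full pass replaces nothing (placeholders may nest)
        changed = False
        for key, value in values.items():
            regex = r"\$\{%s\}" % key  # assumed placeholder form
            new_text = re.sub(regex, str(value), text)
            if new_text != text:
                changed = True
            text = new_text
    return text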
@@ -329,8 +332,7 @@ def parse_args():
parse_args
"""
parser = argparse.ArgumentParser(
- description=
- "The argument for generating the generic_mixed_gemm_kernelLauncher instance."
+ description="The argument for generating the generic_mixed_gemm_kernelLauncher instance."
)
parser.add_argument(
"--cuda_arch",
@@ -346,15 +348,15 @@ def parse_args():
# generate source .cu
def generate_source_cu(
- inputs_type: (str),
- outputs_type: (str),
- hasbiases: (str),
- act_tag: (str),
- tiles: (str),
- KernelSchedule: (str),
- EpilogueSchedule: (str),
- TileSchedule: (str),
- sm: str,
+ inputs_type: str,
+ outputs_type: str,
+ hasbiases: str,
+ act_tag: str,
+ tiles: str,
+ KernelSchedule: str,
+ EpilogueSchedule: str,
+ TileSchedule: str,
+ sm: str,
):
"""
generate_source_cu
@@ -369,8 +371,11 @@ def generate_source_cu(
for epilogue_schedule in EpilogueSchedule:
for tile_schedule in TileSchedule:
if not check_config_valid(
- tile_config, kernel_schedule,
- epilogue_schedule, tile_schedule):
+ tile_config,
+ kernel_schedule,
+ epilogue_schedule,
+ tile_schedule,
+ ):
continue
value_dict = {
"input_type": input_type,
@@ -385,30 +390,32 @@ def generate_source_cu(
"SM": sm,
"sm": sm[-2:],
}
- all_code += SubstituteTemplate(
- GemmDeclare, value_dict)
+ all_code += SubstituteTemplate(GemmDeclare, value_dict)
return all_code
# generate gemm launch .cu
def generate_launch_gemm_cus(
- generate_dir: (str), inputs_type: (str), outputs_type: (str),
- fuse_gemm_configs: tuple, sm: str):
+ generate_dir: str,
+ inputs_type: str,
+ outputs_type: str,
+ fuse_gemm_configs: tuple,
+ sm: str,
+):
"""
generate_launch_gemm_cus
"""
act_tags = [single_config[1] for single_config in fuse_gemm_configs]
single_config = fuse_gemm_configs[0]
- hasbiases: (str) = single_config[0]
- tiles: (str) = single_config[2]
- KernelSchedule: (str) = single_config[3]
- EpilogueSchedule: (str) = single_config[4]
- TileSchedule: (str) = single_config[5]
+ hasbiases: str = single_config[0]
+ tiles: str = single_config[2]
+ KernelSchedule: str = single_config[3]
+ EpilogueSchedule: str = single_config[4]
+ TileSchedule: str = single_config[5]
code_map = {}
- head_path = os.path.join(generate_dir,
- f"launch_block_gemm_kernel_sm{sm[-2:]}.h")
+ head_path = os.path.join(generate_dir, f"launch_block_gemm_kernel_sm{sm[-2:]}.h")
head_all_code = LaunchGemmHead
for tile_config in tiles:
blocks, clusters = get_shape_str(tile_config)
@@ -418,19 +425,19 @@ def generate_launch_gemm_cus(
for epilogue_schedule in EpilogueSchedule:
gemm_config_str_2 = gemm_config_str_1 + f"_{epilogue_schedule}"
for tile_schedule in TileSchedule:
- if not check_config_valid(tile_config, kernel_schedule,
- epilogue_schedule,
- tile_schedule):
+ if not check_config_valid(
+ tile_config,
+ kernel_schedule,
+ epilogue_schedule,
+ tile_schedule,
+ ):
continue
gemm_config_str = gemm_config_str_2 + f"_{tile_schedule}"
value_dict = {
- "sm":
- sm[-2:],
- "gemm_config":
- gemm_config_str.replace("<", "").replace(">", ""),
+ "sm": sm[-2:],
+ "gemm_config": gemm_config_str.replace("<", "").replace(">", ""),
}
- head_all_code += SubstituteTemplate(
- LaunchGemmDeclare, value_dict)
+ head_all_code += SubstituteTemplate(LaunchGemmDeclare, value_dict)
os.makedirs(generate_dir, exist_ok=True)
with open(head_path, "w") as f:
f.write(head_all_code)
@@ -444,19 +451,19 @@ def generate_launch_gemm_cus(
for epilogue_schedule in EpilogueSchedule:
gemm_config_str_2 = gemm_config_str_1 + f"_{epilogue_schedule}"
for tile_schedule in TileSchedule:
- if not check_config_valid(tile_shape, kernel_schedule,
- epilogue_schedule,
- tile_schedule):
+ if not check_config_valid(
+ tile_shape,
+ kernel_schedule,
+ epilogue_schedule,
+ tile_schedule,
+ ):
continue
gemm_config_str = gemm_config_str_2 + f"_{tile_schedule}"
value_dict = {
- "sm":
- sm[-2:],
- "gemm_config":
- gemm_config_str.replace("<", "").replace(">", ""),
+ "sm": sm[-2:],
+ "gemm_config": gemm_config_str.replace("<", "").replace(">", ""),
}
- source_all_code = SubstituteTemplate(
- LaunchGemmPart0, value_dict)
+ source_all_code = SubstituteTemplate(LaunchGemmPart0, value_dict)
type_id = 0
for input_type in inputs_type:
for output_type in outputs_type:
@@ -476,16 +483,14 @@ def generate_launch_gemm_cus(
"SM": sm,
"sm": sm[-2:],
}
- source_all_code += SubstituteTemplate(
- LaunchGemmPart1, value_dict)
+ source_all_code += SubstituteTemplate(LaunchGemmPart1, value_dict)
type_id += 1
source_all_code += LaunchGemmPart2
- gemm_config_str = gemm_config_str.replace("<", "").replace(
- ">", "")
+ gemm_config_str = gemm_config_str.replace("<", "").replace(">", "")
code_map[gemm_config_str] = source_all_code
source_path = os.path.join(
generate_dir,
- f"launch_block_gemm_kernel_sm{sm[-2:]}_{gemm_config_str}.cu"
+ f"launch_block_gemm_kernel_sm{sm[-2:]}_{gemm_config_str}.cu",
)
with open(source_path, "w") as f:
f.write(source_all_code)
@@ -495,19 +500,18 @@ def generate_launch_gemm_cus(
# generate fp8_fp8_gemm_scale_bias_act_sm90.cu
-def generate_dispatch_gemm_cu(inputs_type: (str), outputs_type: (str),
- fuse_gemm_configs: tuple, sm: str):
+def generate_dispatch_gemm_cu(inputs_type: str, outputs_type: str, fuse_gemm_configs: tuple, sm: str):
"""
generate_dispatch_gemm_cu
"""
act_tags = [single_config[1] for single_config in fuse_gemm_configs]
single_config = fuse_gemm_configs[0]
- hasbiases: (str) = single_config[0]
- tiles: (str) = single_config[2]
- KernelSchedule: (str) = single_config[3]
- EpilogueSchedule: (str) = single_config[4]
- TileSchedule: (str) = single_config[5]
+ hasbiases: str = single_config[0]
+ tiles: str = single_config[2]
+ KernelSchedule: str = single_config[3]
+ EpilogueSchedule: str = single_config[4]
+ TileSchedule: str = single_config[5]
all_code = SubstituteTemplate(code_part0, {"sm": sm[-2:]})
type_id = 0
for input_type in inputs_type:
@@ -530,9 +534,12 @@ def generate_dispatch_gemm_cu(inputs_type: (str), outputs_type: (str),
for kernel_schedule in KernelSchedule:
for epilogue_schedule in EpilogueSchedule:
for tile_schedule in TileSchedule:
- if not check_config_valid(tile_shape, kernel_schedule,
- epilogue_schedule,
- tile_schedule):
+ if not check_config_valid(
+ tile_shape,
+ kernel_schedule,
+ epilogue_schedule,
+ tile_schedule,
+ ):
continue
value_dict = {
"TileShape": tile_shape[0],
@@ -554,18 +561,18 @@ def generate_dispatch_gemm_cu(inputs_type: (str), outputs_type: (str),
for epilogue_schedule in EpilogueSchedule:
gemm_config_str_2 = gemm_config_str_1 + f"_{epilogue_schedule}"
for tile_schedule in TileSchedule:
- if not check_config_valid(tile_shape, kernel_schedule,
- epilogue_schedule,
- tile_schedule):
+ if not check_config_valid(
+ tile_shape,
+ kernel_schedule,
+ epilogue_schedule,
+ tile_schedule,
+ ):
continue
gemm_config_str = gemm_config_str_2 + f"_{tile_schedule}"
value_dict = {
- "sm":
- sm[-2:],
- "tile_id":
- str(tile_id),
- "gemm_config":
- gemm_config_str.replace("<", "").replace(">", ""),
+ "sm": sm[-2:],
+ "tile_id": str(tile_id),
+ "gemm_config": gemm_config_str.replace("<", "").replace(">", ""),
}
all_code += SubstituteTemplate(code_part5, value_dict)
tile_id += 1
@@ -610,12 +617,17 @@ if __name__ == "__main__":
f.close()
# Compile parallelization
generate_launch_gemm_cus(
- "gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen", inputs_type,
- outputs_type, fuse_gemm_configs, sm_dict[sm])
+ "gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen",
+ inputs_type,
+ outputs_type,
+ fuse_gemm_configs,
+ sm_dict[sm],
+ )
# hard code for act_tag
- file_name = (f"gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen/"
- f"fp8_fp8_block_gemm_scale_bias_act_sm{sm}.cu")
+ file_name = (
+ f"gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen/" f"fp8_fp8_block_gemm_scale_bias_act_sm{sm}.cu"
+ )
all_code = generate_dispatch_gemm_cu(
inputs_type,
outputs_type,
diff --git a/custom_ops/utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels.py b/custom_ops/utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels.py
index bf319d2f9..105ed5bac 100644
--- a/custom_ops/utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels.py
+++ b/custom_ops/utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels.py
@@ -24,27 +24,28 @@ def get_candidate_tiles():
"""
base_configs = [("<64, 64, 64>", "<32, 32, 64>", "<16, 8, 32>")]
- base_configs.extend([
- ("<16, 32, 64>", "<16, 32, 64>", "<16, 8, 32>"),
- ("<16, 64, 64>", "<16, 32, 64>", "<16, 8, 32>"),
- ("<32, 128, 64>", "<32, 32, 64>", "<16, 8, 32>"),
- ("<64, 128, 64>", "<32, 64, 64>", "<16, 8, 32>"),
- ("<64, 64, 128>", "<32, 64, 64>", "<16, 8, 32>"),
- ("<64, 128, 64>", "<64, 32, 64>", "<16, 8, 32>"),
- ("<128, 64, 64>", "<64, 32, 64>", "<16, 8, 32>"),
- ("<128, 128, 64>", "<64, 32, 64>", "<16, 8, 32>"),
- ("<128, 128, 64>", "<64, 64, 64>", "<16, 8, 32>"),
- ("<128, 128, 64>", "<128, 32, 64>", "<16, 8, 32>"),
- ("<128, 256, 64>", "<64, 64, 64>", "<16, 8, 32>"),
- ("<256, 128, 64>", "<64, 64, 64>", "<16, 8, 32>"),
- ("<16, 256, 128>", "<16, 64, 128>", "<16, 8, 32>"),
- ])
+ base_configs.extend(
+ [
+ ("<16, 32, 64>", "<16, 32, 64>", "<16, 8, 32>"),
+ ("<16, 64, 64>", "<16, 32, 64>", "<16, 8, 32>"),
+ ("<32, 128, 64>", "<32, 32, 64>", "<16, 8, 32>"),
+ ("<64, 128, 64>", "<32, 64, 64>", "<16, 8, 32>"),
+ ("<64, 64, 128>", "<32, 64, 64>", "<16, 8, 32>"),
+ ("<64, 128, 64>", "<64, 32, 64>", "<16, 8, 32>"),
+ ("<128, 64, 64>", "<64, 32, 64>", "<16, 8, 32>"),
+ ("<128, 128, 64>", "<64, 32, 64>", "<16, 8, 32>"),
+ ("<128, 128, 64>", "<64, 64, 64>", "<16, 8, 32>"),
+ ("<128, 128, 64>", "<128, 32, 64>", "<16, 8, 32>"),
+ ("<128, 256, 64>", "<64, 64, 64>", "<16, 8, 32>"),
+ ("<256, 128, 64>", "<64, 64, 64>", "<16, 8, 32>"),
+ ("<16, 256, 128>", "<16, 64, 128>", "<16, 8, 32>"),
+ ]
+ )
return base_configs
-def get_dual_gemm_candidate_configs(sm, min_split_k, max_split_k, min_stages,
- max_stages):
+def get_dual_gemm_candidate_configs(sm, min_split_k, max_split_k, min_stages, max_stages):
"""
get_dual_gemm_candidate_configs returns a list of candidate configs for the dual_gemm_fused_kernel.
"""
@@ -299,8 +300,7 @@ def check_min_split_k(value):
"""
ivalue = int(value)
if ivalue > 1:
- raise argparse.ArgumentTypeError(
- "Dual gemm split_k mode is not support.")
+ raise argparse.ArgumentTypeError("Dual gemm split_k mode is not support.")
return ivalue
@@ -310,8 +310,7 @@ def check_max_split_k(value):
"""
ivalue = int(value)
if ivalue > 1:
- raise argparse.ArgumentTypeError(
- "Dual gemm split_k mode is not support..")
+ raise argparse.ArgumentTypeError("Dual gemm split_k mode is not support..")
return ivalue
@@ -320,8 +319,7 @@ def parse_args():
parse_args
"""
parser = argparse.ArgumentParser(
- description=
- "The argument for generating the generic_mixed_gemm_kernelLauncher instance."
+ description="The argument for generating the generic_mixed_gemm_kernelLauncher instance."
)
parser.add_argument(
"--cuda_arch",
@@ -421,8 +419,7 @@ def generate_dual_gemm_source_cu(
"hasbias": hasbias,
"SM": sm,
}
- all_code += SubstituteTemplate(
- GemmSplitKDeclare, value_dict)
+ all_code += SubstituteTemplate(GemmSplitKDeclare, value_dict)
all_code += CommonTail
return all_code
@@ -449,12 +446,12 @@ def generate_launch_dual_gemm_cus(
head_path = os.path.join(generate_dir, "launch_dual_gemm_kernel.h")
head_all_code = LaunchGemmHead
for tile in tiles:
- blocks, warps, mmas = [
- s.replace(" ", "").strip("<>").split(",") for s in tile
- ]
- gemm_config = (f"block{blocks[0]}x{blocks[1]}x{blocks[2]}_"
- f"warp{warps[0]}x{warps[1]}x{warps[2]}_"
- f"mma{mmas[0]}x{mmas[1]}x{mmas[2]}")
+ blocks, warps, mmas = [s.replace(" ", "").strip("<>").split(",") for s in tile]
+ gemm_config = (
+ f"block{blocks[0]}x{blocks[1]}x{blocks[2]}_"
+ f"warp{warps[0]}x{warps[1]}x{warps[2]}_"
+ f"mma{mmas[0]}x{mmas[1]}x{mmas[2]}"
+ )
for stage in stages:
gemm_config_str = gemm_config + f"_stage{stage}"
value_dict = {
@@ -467,12 +464,12 @@ def generate_launch_dual_gemm_cus(
f.close()
for tile in tiles:
- blocks, warps, mmas = [
- s.replace(" ", "").strip("<>").split(",") for s in tile
- ]
- gemm_config = (f"block{blocks[0]}x{blocks[1]}x{blocks[2]}_"
- f"warp{warps[0]}x{warps[1]}x{warps[2]}_"
- f"mma{mmas[0]}x{mmas[1]}x{mmas[2]}")
+ blocks, warps, mmas = [s.replace(" ", "").strip("<>").split(",") for s in tile]
+ gemm_config = (
+ f"block{blocks[0]}x{blocks[1]}x{blocks[2]}_"
+ f"warp{warps[0]}x{warps[1]}x{warps[2]}_"
+ f"mma{mmas[0]}x{mmas[1]}x{mmas[2]}"
+ )
for stage in stages:
gemm_config_str = gemm_config + f"_stage{stage}"
value_dict = {
@@ -498,16 +495,14 @@ def generate_launch_dual_gemm_cus(
"num_stages": str(stage),
"SM": sm,
}
- source_all_code += SubstituteTemplate(
- LaunchGemmPart1, value_dict)
+ source_all_code += SubstituteTemplate(LaunchGemmPart1, value_dict)
# split_k_code += SubstituteTemplate(LaunchGemmPart3, value_dict)
type_id += 1
source_all_code += LaunchGemmPart2
# source_all_code += split_k_code
# source_all_code += LaunchGemmPart4
code_map[gemm_config_str] = source_all_code
- source_path = os.path.join(
- generate_dir, f"launch_dual_gemm_kernel_{gemm_config_str}.cu")
+ source_path = os.path.join(generate_dir, f"launch_dual_gemm_kernel_{gemm_config_str}.cu")
with open(source_path, "w") as f:
f.write(source_all_code)
f.close()
@@ -566,12 +561,12 @@ def generate_dispatch_dual_gemm_cu(
tile_id = 0
for tile in tiles:
- blocks, warps, mmas = [
- s.replace(" ", "").strip("<>").split(",") for s in tile
- ]
- gemm_config = (f"block{blocks[0]}x{blocks[1]}x{blocks[2]}_"
- f"warp{warps[0]}x{warps[1]}x{warps[2]}_"
- f"mma{mmas[0]}x{mmas[1]}x{mmas[2]}")
+ blocks, warps, mmas = [s.replace(" ", "").strip("<>").split(",") for s in tile]
+ gemm_config = (
+ f"block{blocks[0]}x{blocks[1]}x{blocks[2]}_"
+ f"warp{warps[0]}x{warps[1]}x{warps[2]}_"
+ f"mma{mmas[0]}x{mmas[1]}x{mmas[2]}"
+ )
for stage in stages:
gemm_config_str = gemm_config + f"_stage{stage}"
value_dict = {
@@ -580,10 +575,12 @@ def generate_dispatch_dual_gemm_cu(
}
all_code += SubstituteTemplate(code_part5, value_dict)
tile_id += 1
- value_dict.update({
- "min_split_k": str(min_split_k),
- "max_split_k": str(max_split_k),
- })
+ value_dict.update(
+ {
+ "min_split_k": str(min_split_k),
+ "max_split_k": str(max_split_k),
+ }
+ )
all_code += SubstituteTemplate(code_part6, value_dict)
return all_code
@@ -602,8 +599,7 @@ if __name__ == "__main__":
for sm in archs:
if sm == "89":
- fuse_gemm_configs = get_dual_gemm_candidate_configs(
- sm, min_split_k, max_split_k, min_stages, max_stages)
+ fuse_gemm_configs = get_dual_gemm_candidate_configs(sm, min_split_k, max_split_k, min_stages, max_stages)
for fuse_gemm_config in fuse_gemm_configs:
file_name = (
f"gpu_ops/cutlass_kernels/fp8_gemm_fused/"
diff --git a/custom_ops/utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels_sm90.py b/custom_ops/utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels_sm90.py
index 018e4eead..b2ef38f40 100644
--- a/custom_ops/utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels_sm90.py
+++ b/custom_ops/utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels_sm90.py
@@ -19,8 +19,7 @@ import re
def get_candidate_tiles():
- """
- """
+ """ """
cta_shape = [
("<_64, _16, _128>"),
("<_64, _32, _128>"),
@@ -45,8 +44,7 @@ def get_candidate_tiles():
def get_dual_gemm_candidate_configs(sm):
- """
- """
+ """ """
tiles = get_candidate_tiles()
candidate_configs = list()
@@ -64,35 +62,27 @@ def get_dual_gemm_candidate_configs(sm):
("swiglu", "SiLu"),
("geglu", "GELU"),
]:
- candidate_configs.extend([(hasbias, act_tag, tiles, KernelSchedule,
- EpilogueSchedule)])
+ candidate_configs.extend([(hasbias, act_tag, tiles, KernelSchedule, EpilogueSchedule)])
return candidate_configs
def get_shape_str(tile_shape):
- """
- """
- blocks, clusters = [
- s.replace(" ", "").strip("<>").split(",") for s in tile_shape
- ]
+ """ """
+ blocks, clusters = [s.replace(" ", "").strip("<>").split(",") for s in tile_shape]
blocks = [elem.strip("_") for elem in blocks]
clusters = [elem.strip("_") for elem in clusters]
return blocks, clusters
def check_config_valid(tile_shape, kernel_schedule, epilogue_schedule):
- """
- """
+ """ """
blocks, clusters = get_shape_str(tile_shape)
- if int(
- blocks[0]
- ) < 128 and kernel_schedule == "KernelTmaWarpSpecializedCooperativeFP8FastAccum":
+ if int(blocks[0]) < 128 and kernel_schedule == "KernelTmaWarpSpecializedCooperativeFP8FastAccum":
return False
if "Cooperative" in kernel_schedule and "Cooperative" not in epilogue_schedule:
return False
- if tile_shape[
- 0] == "<_128, _128, _128>" and kernel_schedule == "KernelTmaWarpSpecializedPingpongFP8FastAccum":
+ if tile_shape[0] == "<_128, _128, _128>" and kernel_schedule == "KernelTmaWarpSpecializedPingpongFP8FastAccum":
return False
return True
@@ -302,8 +292,7 @@ bool fp8_fp8_dual_gemm_scale_bias_act(DualGemmEpilogueAllParams params) {
def SubstituteTemplate(template, values):
- """
- """
+ """ """
text = template
changed = True
while changed:
@@ -318,10 +307,8 @@ def SubstituteTemplate(template, values):
def parse_args():
- """
- """
- parser = argparse.ArgumentParser(
- description="auto generate the fp8_fp8_dual_gemm_fused_kernels_sm90.")
+ """ """
+ parser = argparse.ArgumentParser(description="auto generate the fp8_fp8_dual_gemm_fused_kernels_sm90.")
parser.add_argument(
"--cuda_arch",
type=str,
@@ -336,17 +323,16 @@ def parse_args():
# generate source .cu
def generate_dual_gemm_source_cu(
- inputs_type: (str),
- biases_type: (str),
- hasbiases: (str),
- act_tag: (str),
- tiles: (str),
- KernelSchedule: (str),
- EpilogueSchedule: (str),
- sm: str,
+ inputs_type: str,
+ biases_type: str,
+ hasbiases: str,
+ act_tag: str,
+ tiles: str,
+ KernelSchedule: str,
+ EpilogueSchedule: str,
+ sm: str,
):
- """
- """
+ """ """
all_code = CommonHead
for input_type in inputs_type:
for bias_type in biases_type:
@@ -354,9 +340,7 @@ def generate_dual_gemm_source_cu(
for tile_config in tiles:
for kernel_schedule in KernelSchedule:
for epilogue_schedule in EpilogueSchedule:
- if not check_config_valid(tile_config,
- kernel_schedule,
- epilogue_schedule):
+ if not check_config_valid(tile_config, kernel_schedule, epilogue_schedule):
continue
value_dict = {
"input_type": input_type,
@@ -370,28 +354,29 @@ def generate_dual_gemm_source_cu(
"SM": sm,
"sm": sm[-2:],
}
- all_code += SubstituteTemplate(
- GemmDeclare, value_dict)
+ all_code += SubstituteTemplate(GemmDeclare, value_dict)
return all_code
# generate gemm launch .cu
def generate_launch_dual_gemm_cus(
- generate_dir: (str), inputs_type: (str), biases_type: (str),
- fuse_gemm_configs: tuple, sm: str):
- """
- """
+ generate_dir: str,
+ inputs_type: str,
+ biases_type: str,
+ fuse_gemm_configs: tuple,
+ sm: str,
+):
+ """ """
act_tags = [single_config[1] for single_config in fuse_gemm_configs]
single_config = fuse_gemm_configs[0]
- hasbiases: (str) = single_config[0]
- tiles: (str) = single_config[2]
- KernelSchedule: (str) = single_config[3]
- EpilogueSchedule: (str) = single_config[4]
+ hasbiases: str = single_config[0]
+ tiles: str = single_config[2]
+ KernelSchedule: str = single_config[3]
+ EpilogueSchedule: str = single_config[4]
code_map = {}
- head_path = os.path.join(generate_dir,
- f"launch_dual_gemm_kernel_sm{sm[-2:]}.h")
+ head_path = os.path.join(generate_dir, f"launch_dual_gemm_kernel_sm{sm[-2:]}.h")
head_all_code = LaunchGemmHead
for tile_config in tiles:
blocks, clusters = get_shape_str(tile_config)
@@ -401,16 +386,14 @@ def generate_launch_dual_gemm_cus(
for kernel_schedule in KernelSchedule:
gemm_config_str_1 = gemm_config_str_0 + f"_{kernel_schedule}"
for epilogue_schedule in EpilogueSchedule:
- if not check_config_valid(tile_config, kernel_schedule,
- epilogue_schedule):
+ if not check_config_valid(tile_config, kernel_schedule, epilogue_schedule):
continue
gemm_config_str = gemm_config_str_1 + f"_{epilogue_schedule}"
value_dict = {
"sm": sm[-2:],
"gemm_config": gemm_config_str,
}
- head_all_code += SubstituteTemplate(LaunchGemmDeclare,
- value_dict)
+ head_all_code += SubstituteTemplate(LaunchGemmDeclare, value_dict)
os.makedirs(generate_dir, exist_ok=True)
with open(head_path, "w") as f:
f.write(head_all_code)
@@ -422,16 +405,14 @@ def generate_launch_dual_gemm_cus(
for kernel_schedule in KernelSchedule:
gemm_config_str_1 = gemm_config_str_0 + f"_{kernel_schedule}"
for epilogue_schedule in EpilogueSchedule:
- if not check_config_valid(tile_shape, kernel_schedule,
- epilogue_schedule):
+ if not check_config_valid(tile_shape, kernel_schedule, epilogue_schedule):
continue
gemm_config_str = gemm_config_str_1 + f"_{epilogue_schedule}"
value_dict = {
"sm": sm[-2:],
"gemm_config": gemm_config_str,
}
- source_all_code = SubstituteTemplate(LaunchGemmPart0,
- value_dict)
+ source_all_code = SubstituteTemplate(LaunchGemmPart0, value_dict)
type_id = 0
for input_type in inputs_type:
for bias_type in biases_type:
@@ -450,14 +431,13 @@ def generate_launch_dual_gemm_cus(
"SM": sm,
"sm": sm[-2:],
}
- source_all_code += SubstituteTemplate(
- LaunchGemmPart1, value_dict)
+ source_all_code += SubstituteTemplate(LaunchGemmPart1, value_dict)
type_id += 1
source_all_code += LaunchGemmPart2
code_map[gemm_config_str] = source_all_code
source_path = os.path.join(
generate_dir,
- f"launch_dual_gemm_kernel_sm{sm[-2:]}_{gemm_config_str}.cu"
+ f"launch_dual_gemm_kernel_sm{sm[-2:]}_{gemm_config_str}.cu",
)
with open(source_path, "w") as f:
f.write(source_all_code)
@@ -467,16 +447,14 @@ def generate_launch_dual_gemm_cus(
# generate fp8_fp8_gemm_scale_bias_act.cu
-def generate_dispatch_dual_gemm_cu(inputs_type: (str), biases_type: (str),
- fuse_gemm_configs: tuple, sm: str):
- """
- """
+def generate_dispatch_dual_gemm_cu(inputs_type: str, biases_type: str, fuse_gemm_configs: tuple, sm: str):
+ """ """
act_tags = [single_config[1] for single_config in fuse_gemm_configs]
single_config = fuse_gemm_configs[0]
- hasbiases: (str) = single_config[0]
- tiles: (str) = single_config[2]
- KernelSchedule: (str) = single_config[3]
- EpilogueSchedule: (str) = single_config[4]
+ hasbiases: str = single_config[0]
+ tiles: str = single_config[2]
+ KernelSchedule: str = single_config[3]
+ EpilogueSchedule: str = single_config[4]
all_code = SubstituteTemplate(code_part0, {"sm": sm[-2:]})
type_id = 0
@@ -500,8 +478,7 @@ def generate_dispatch_dual_gemm_cu(inputs_type: (str), biases_type: (str),
for tile_shape in tiles:
for kernel_schedule in KernelSchedule:
for epilogue_schedule in EpilogueSchedule:
- if not check_config_valid(tile_shape, kernel_schedule,
- epilogue_schedule):
+ if not check_config_valid(tile_shape, kernel_schedule, epilogue_schedule):
continue
value_dict = {
"TileShape": tile_shape[0],
@@ -520,8 +497,7 @@ def generate_dispatch_dual_gemm_cu(inputs_type: (str), biases_type: (str),
for kernel_schedule in KernelSchedule:
gemm_config_str_1 = gemm_config_str_0 + f"_{kernel_schedule}"
for epilogue_schedule in EpilogueSchedule:
- if not check_config_valid(tile_shape, kernel_schedule,
- epilogue_schedule):
+ if not check_config_valid(tile_shape, kernel_schedule, epilogue_schedule):
continue
gemm_config_str = gemm_config_str_1 + f"_{epilogue_schedule}"
value_dict = {
@@ -570,12 +546,15 @@ if __name__ == "__main__":
f.close()
# Compile parallelization
generate_launch_dual_gemm_cus(
- "gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen", inputs_type,
- biases_type, fuse_gemm_configs, sm_dict[sm])
+ "gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen",
+ inputs_type,
+ biases_type,
+ fuse_gemm_configs,
+ sm_dict[sm],
+ )
# hard code for act_tag
file_name = (
- f"gpu_ops/cutlass_kernels/fp8_gemm_fused/"
- f"autogen/fp8_fp8_dual_gemm_scale_bias_act_sm{sm}.cu"
+ f"gpu_ops/cutlass_kernels/fp8_gemm_fused/" f"autogen/fp8_fp8_dual_gemm_scale_bias_act_sm{sm}.cu"
)
all_code = generate_dispatch_dual_gemm_cu(
inputs_type,
diff --git a/custom_ops/utils/auto_gen_fp8_fp8_gemm_fused_kernels.py b/custom_ops/utils/auto_gen_fp8_fp8_gemm_fused_kernels.py
index cb2e93a03..14f147afc 100644
--- a/custom_ops/utils/auto_gen_fp8_fp8_gemm_fused_kernels.py
+++ b/custom_ops/utils/auto_gen_fp8_fp8_gemm_fused_kernels.py
@@ -31,25 +31,26 @@ def get_candidate_tiles():
"""
base_configs = [("<64, 64, 64>", "<32, 32, 64>", "<16, 8, 32>")]
- base_configs.extend([
- ("<32, 128, 64>", "<32, 32, 64>", "<16, 8, 32>"),
- ("<64, 128, 64>", "<32, 64, 64>", "<16, 8, 32>"),
- ("<64, 64, 128>", "<32, 64, 64>", "<16, 8, 32>"),
- ("<64, 128, 64>", "<64, 32, 64>", "<16, 8, 32>"),
- ("<128, 64, 64>", "<64, 32, 64>", "<16, 8, 32>"),
- ("<128, 128, 64>", "<64, 32, 64>", "<16, 8, 32>"),
- ("<128, 128, 64>", "<64, 64, 64>", "<16, 8, 32>"),
- ("<128, 128, 64>", "<128, 32, 64>", "<16, 8, 32>"),
- ("<128, 256, 64>", "<64, 64, 64>", "<16, 8, 32>"),
- ("<256, 128, 64>", "<64, 64, 64>", "<16, 8, 32>"),
- ("<16, 256, 128>", "<16, 64, 128>", "<16, 8, 32>"),
- ])
+ base_configs.extend(
+ [
+ ("<32, 128, 64>", "<32, 32, 64>", "<16, 8, 32>"),
+ ("<64, 128, 64>", "<32, 64, 64>", "<16, 8, 32>"),
+ ("<64, 64, 128>", "<32, 64, 64>", "<16, 8, 32>"),
+ ("<64, 128, 64>", "<64, 32, 64>", "<16, 8, 32>"),
+ ("<128, 64, 64>", "<64, 32, 64>", "<16, 8, 32>"),
+ ("<128, 128, 64>", "<64, 32, 64>", "<16, 8, 32>"),
+ ("<128, 128, 64>", "<64, 64, 64>", "<16, 8, 32>"),
+ ("<128, 128, 64>", "<128, 32, 64>", "<16, 8, 32>"),
+ ("<128, 256, 64>", "<64, 64, 64>", "<16, 8, 32>"),
+ ("<256, 128, 64>", "<64, 64, 64>", "<16, 8, 32>"),
+ ("<16, 256, 128>", "<16, 64, 128>", "<16, 8, 32>"),
+ ]
+ )
return base_configs
-def get_candidate_configs(sm, min_split_k, max_split_k, min_stages,
- max_stages):
+def get_candidate_configs(sm, min_split_k, max_split_k, min_stages, max_stages):
"""
Get the list of candidate gemm operator configurations.
Get the list of candidate gemm operator configurations.
@@ -353,8 +354,7 @@ def parse_args():
Parse command-line arguments.
"""
parser = argparse.ArgumentParser(
- description=
- "The argument for generating the generic_mixed_gemm_kernelLauncher instance."
+ description="The argument for generating the generic_mixed_gemm_kernelLauncher instance."
)
parser.add_argument(
"--cuda_arch",
@@ -448,8 +448,7 @@ def generate_source_cu(
"hasbias": hasbias,
"SM": sm,
}
- all_code += SubstituteTemplate(GemmSplitKDeclare,
- value_dict)
+ all_code += SubstituteTemplate(GemmSplitKDeclare, value_dict)
all_code += CommonTail
return all_code
@@ -473,9 +472,7 @@ def generate_launch_gemm_cus(
head_path = os.path.join(generate_dir, "launch_gemm_kernel.h")
head_all_code = LaunchGemmHead
for tile in tiles:
- blocks, warps, mmas = [
- s.replace(" ", "").strip("<>").split(",") for s in tile
- ]
+ blocks, warps, mmas = [s.replace(" ", "").strip("<>").split(",") for s in tile]
gemm_config = f"block{blocks[0]}x{blocks[1]}x{blocks[2]}_warp{warps[0]}x{warps[1]}x{warps[2]}_mma{mmas[0]}x{mmas[1]}x{mmas[2]}"
for stage in stages:
gemm_config_str = gemm_config + f"_stage{stage}"
@@ -489,9 +486,7 @@ def generate_launch_gemm_cus(
f.close()
for tile in tiles:
- blocks, warps, mmas = [
- s.replace(" ", "").strip("<>").split(",") for s in tile
- ]
+ blocks, warps, mmas = [s.replace(" ", "").strip("<>").split(",") for s in tile]
gemm_config = f"block{blocks[0]}x{blocks[1]}x{blocks[2]}_warp{warps[0]}x{warps[1]}x{warps[2]}_mma{mmas[0]}x{mmas[1]}x{mmas[2]}"
for stage in stages:
gemm_config_str = gemm_config + f"_stage{stage}"
@@ -517,17 +512,14 @@ def generate_launch_gemm_cus(
"num_stages": str(stage),
"SM": sm,
}
- source_all_code += SubstituteTemplate(
- LaunchGemmPart1, value_dict)
- split_k_code += SubstituteTemplate(
- LaunchGemmPart3, value_dict)
+ source_all_code += SubstituteTemplate(LaunchGemmPart1, value_dict)
+ split_k_code += SubstituteTemplate(LaunchGemmPart3, value_dict)
type_id += 1
source_all_code += LaunchGemmPart2
source_all_code += split_k_code
source_all_code += LaunchGemmPart4
code_map[gemm_config_str] = source_all_code
- source_path = os.path.join(
- generate_dir, f"launch_gemm_kernel_{gemm_config_str}.cu")
+ source_path = os.path.join(generate_dir, f"launch_gemm_kernel_{gemm_config_str}.cu")
with open(source_path, "w") as f:
f.write(source_all_code)
f.close()
@@ -581,9 +573,7 @@ def generate_dispatch_gemm_cu(
all_code += code_part4
tile_id = 0
for tile in tiles:
- blocks, warps, mmas = [
- s.replace(" ", "").strip("<>").split(",") for s in tile
- ]
+ blocks, warps, mmas = [s.replace(" ", "").strip("<>").split(",") for s in tile]
gemm_config = f"block{blocks[0]}x{blocks[1]}x{blocks[2]}_warp{warps[0]}x{warps[1]}x{warps[2]}_mma{mmas[0]}x{mmas[1]}x{mmas[2]}"
for stage in stages:
gemm_config_str = gemm_config + f"_stage{stage}"
@@ -593,10 +583,12 @@ def generate_dispatch_gemm_cu(
}
all_code += SubstituteTemplate(code_part5, value_dict)
tile_id += 1
- value_dict.update({
- "min_split_k": str(min_split_k),
- "max_split_k": str(max_split_k),
- })
+ value_dict.update(
+ {
+ "min_split_k": str(min_split_k),
+ "max_split_k": str(max_split_k),
+ }
+ )
all_code += SubstituteTemplate(code_part6, value_dict)
return all_code
@@ -614,9 +606,7 @@ if __name__ == "__main__":
for sm in archs:
if sm == "89":
- fuse_gemm_configs = get_candidate_configs(sm, min_split_k,
- max_split_k, min_stages,
- max_stages)
+ fuse_gemm_configs = get_candidate_configs(sm, min_split_k, max_split_k, min_stages, max_stages)
for fuse_gemm_config in fuse_gemm_configs:
file_name = f"gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen/generic_gemm_kernel_sm{sm}_{fuse_gemm_config[3][0]}.cu"
all_code = generate_source_cu(
@@ -654,9 +644,7 @@ if __name__ == "__main__":
# hard code for act_tag
- file_name = (
- "gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen/fp8_fp8_gemm_scale_bias_act.cu"
- )
+ file_name = "gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen/fp8_fp8_gemm_scale_bias_act.cu"
all_code = generate_dispatch_gemm_cu(
inputs_type,
outputs_type,
diff --git a/custom_ops/utils/auto_gen_fp8_fp8_gemm_fused_kernels_sm90.py b/custom_ops/utils/auto_gen_fp8_fp8_gemm_fused_kernels_sm90.py
index 2268fa3a4..6c9efea21 100644
--- a/custom_ops/utils/auto_gen_fp8_fp8_gemm_fused_kernels_sm90.py
+++ b/custom_ops/utils/auto_gen_fp8_fp8_gemm_fused_kernels_sm90.py
@@ -20,44 +20,44 @@ import re
def get_candidate_tiles():
- """
- """
+ """ """
base_configs = [
("<_64, _64, _128>", "<_1, _8, _1>"),
("<_64, _128, _128>", "<_2, _1, _1>"),
("<_128, _128, _128>", "<_2, _1, _1>"),
]
- base_configs.extend([
- ("<_64, _64, _128>", "<_1, _1, _1>"),
- ("<_64, _64, _128>", "<_1, _2, _1>"),
- ("<_64, _64, _128>", "<_2, _1, _1>"),
- ("<_64, _64, _64>", "<_1, _1, _1>"),
- ("<_64, _64, _64>", "<_1, _2, _1>"),
- ("<_64, _64, _64>", "<_2, _1, _1>"),
- ("<_64, _128, _128>", "<_1, _2, _1>"),
- ("<_64, _128, _128>", "<_1, _1, _1>"),
- ("<_128, _128, _64>", "<_2, _1, _1>"),
- ("<_256, _128, _128>", "<_1, _2, _1>"),
- ("<_256, _128, _128>", "<_1, _1, _1>"),
- # The following configurations are rarely selected in Qwen2-7B-model.
- # ("<_256, _128, _128>", "<_4, _1, _1>"),
- # ("<_256, _128, _128>", "<_1, _4, _1>"),
- # ("<_256, _128, _128>", "<_2, _4, _1>"),
- # ("<_128, _128, _256>", "<_1, _2, _1>"),
- # ("<_128, _128, _128>", "<_4, _1, _1>"),
- # ("<_128, _128, _128>", "<_2, _4, _1>"),
- # ("<_128, _128, _128>", "<_1, _2, _1>"),
- # ("<_128, _128, _128>", "<_1, _1, _1>"),
- # ("<_128, _128, _128>", "<_1, _4, _1>"),
- # ("<_128, _128, _64>", "<_2, _2, _1>"),
- ])
+ base_configs.extend(
+ [
+ ("<_64, _64, _128>", "<_1, _1, _1>"),
+ ("<_64, _64, _128>", "<_1, _2, _1>"),
+ ("<_64, _64, _128>", "<_2, _1, _1>"),
+ ("<_64, _64, _64>", "<_1, _1, _1>"),
+ ("<_64, _64, _64>", "<_1, _2, _1>"),
+ ("<_64, _64, _64>", "<_2, _1, _1>"),
+ ("<_64, _128, _128>", "<_1, _2, _1>"),
+ ("<_64, _128, _128>", "<_1, _1, _1>"),
+ ("<_128, _128, _64>", "<_2, _1, _1>"),
+ ("<_256, _128, _128>", "<_1, _2, _1>"),
+ ("<_256, _128, _128>", "<_1, _1, _1>"),
+ # The following configurations are rarely selected in Qwen2-7B-model.
+ # ("<_256, _128, _128>", "<_4, _1, _1>"),
+ # ("<_256, _128, _128>", "<_1, _4, _1>"),
+ # ("<_256, _128, _128>", "<_2, _4, _1>"),
+ # ("<_128, _128, _256>", "<_1, _2, _1>"),
+ # ("<_128, _128, _128>", "<_4, _1, _1>"),
+ # ("<_128, _128, _128>", "<_2, _4, _1>"),
+ # ("<_128, _128, _128>", "<_1, _2, _1>"),
+ # ("<_128, _128, _128>", "<_1, _1, _1>"),
+ # ("<_128, _128, _128>", "<_1, _4, _1>"),
+ # ("<_128, _128, _64>", "<_2, _2, _1>"),
+ ]
+ )
return base_configs
def get_candidate_configs(sm):
- """
- """
+ """ """
tiles = get_candidate_tiles()
candidate_configs = list()
@@ -73,36 +73,31 @@ def get_candidate_configs(sm):
("relu", "ReLu"),
("gelu", "GELU"),
]:
- candidate_configs.extend([(hasbias, act_tag, tiles, KernelSchedule,
- EpilogueSchedule)])
+ candidate_configs.extend([(hasbias, act_tag, tiles, KernelSchedule, EpilogueSchedule)])
return candidate_configs
def get_shape_str(tile_shape):
- """
- """
- blocks, clusters = [
- s.replace(" ", "").strip("<>").split(",") for s in tile_shape
- ]
+ """ """
+ blocks, clusters = [s.replace(" ", "").strip("<>").split(",") for s in tile_shape]
blocks = [elem.strip("_") for elem in blocks]
clusters = [elem.strip("_") for elem in clusters]
return blocks, clusters
def check_config_valid(tile_shape, kernel_schedule, epilogue_schedule):
- """
- """
+ """ """
blocks, clusters = get_shape_str(tile_shape)
- if int(
- blocks[0]
- ) < 128 and kernel_schedule == "KernelTmaWarpSpecializedCooperativeFP8FastAccum":
+ if int(blocks[0]) < 128 and kernel_schedule == "KernelTmaWarpSpecializedCooperativeFP8FastAccum":
return False
if "Cooperative" in kernel_schedule and "Cooperative" not in epilogue_schedule:
return False
- if (tile_shape[0] == "<_256, _128, _128>"
- and "Cooperative" not in kernel_schedule
- and "Cooperative" not in epilogue_schedule):
+ if (
+ tile_shape[0] == "<_256, _128, _128>"
+ and "Cooperative" not in kernel_schedule
+ and "Cooperative" not in epilogue_schedule
+ ):
return False
return True
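check_config_valid acts as a filter over the Cartesian product of tiles, kernel schedules, and epilogue schedules that the nested loops in this script enumerate. A compact sketch of that enumerate-and-prune step, with the pruning rules restated inline from the hunk above:

from itertools import product

def valid_sm90_configs_sketch(tiles, kernel_schedules, epilogue_schedules):
    """Yield only the (tile, kernel, epilogue) combinations the generator keeps."""
    for tile, ks, es in product(tiles, kernel_schedules, epilogue_schedules):
        blocks = [e.strip("_") for e in tile[0].replace(" ", "").strip("<>").split(",")]
        if int(blocks[0]) < 128 and ks == "KernelTmaWarpSpecializedCooperativeFP8FastAccum":
            continue
        if "Cooperative" in ks and "Cooperative" not in es:
            continue
        if tile[0] == "<_256, _128, _128>" and "Cooperative" not in ks and "Cooperative" not in es:
            continue
        yield tile, ks, es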
@@ -321,16 +316,12 @@ bool fp8_fp8_gemm_scale_bias_act(GemmEpilogueAllParams params) {
def SubstituteTemplate(template, values_base):
- """
- """
+ """ """
values = copy.deepcopy(values_base)
- if values.get("KernelSchedule"
- ) is not None and "Auto" in values["KernelSchedule"]:
+ if values.get("KernelSchedule") is not None and "Auto" in values["KernelSchedule"]:
values["KernelSchedule"] = "collective::" + values["KernelSchedule"]
- if values.get("EpilogueSchedule"
- ) is not None and "Auto" in values["EpilogueSchedule"]:
- values[
- "EpilogueSchedule"] = "collective::" + values["EpilogueSchedule"]
+ if values.get("EpilogueSchedule") is not None and "Auto" in values["EpilogueSchedule"]:
+ values["EpilogueSchedule"] = "collective::" + values["EpilogueSchedule"]
text = template
changed = True
while changed:
@@ -345,10 +336,8 @@ def SubstituteTemplate(template, values_base):
def parse_args():
- """
- """
- parser = argparse.ArgumentParser(
- description="auto generate fp8_fp8_gemm_fused_kernels_sm90.")
+ """ """
+ parser = argparse.ArgumentParser(description="auto generate fp8_fp8_gemm_fused_kernels_sm90.")
parser.add_argument(
"--cuda_arch",
type=str,
@@ -363,17 +352,16 @@ def parse_args():
# generate source .cu
def generate_source_cu(
- inputs_type: (str),
- outputs_type: (str),
- hasbiases: (str),
- act_tag: (str),
- tiles: (str),
- KernelSchedule: (str),
- EpilogueSchedule: (str),
- sm: str,
+ inputs_type: str,
+ outputs_type: str,
+ hasbiases: str,
+ act_tag: str,
+ tiles: str,
+ KernelSchedule: str,
+ EpilogueSchedule: str,
+ sm: str,
):
- """
- """
+ """ """
all_code = CommonHead
for input_type in inputs_type:
@@ -382,9 +370,7 @@ def generate_source_cu(
for tile_config in tiles:
for kernel_schedule in KernelSchedule:
for epilogue_schedule in EpilogueSchedule:
- if not check_config_valid(tile_config,
- kernel_schedule,
- epilogue_schedule):
+ if not check_config_valid(tile_config, kernel_schedule, epilogue_schedule):
continue
value_dict = {
"input_type": input_type,
@@ -398,25 +384,27 @@ def generate_source_cu(
"SM": sm,
"sm": sm[-2:],
}
- all_code += SubstituteTemplate(
- GemmDeclare, value_dict)
+ all_code += SubstituteTemplate(GemmDeclare, value_dict)
return all_code
# generate gemm launch .cu
def generate_launch_gemm_cus(
- generate_dir: (str), inputs_type: (str), outputs_type: (str),
- fuse_gemm_configs: tuple, sm: str):
- """
- """
+ generate_dir: str,
+ inputs_type: str,
+ outputs_type: str,
+ fuse_gemm_configs: tuple,
+ sm: str,
+):
+ """ """
act_tags = [single_config[1] for single_config in fuse_gemm_configs]
single_config = fuse_gemm_configs[0]
- hasbiases: (str) = single_config[0]
- tiles: (str) = single_config[2]
- KernelSchedule: (str) = single_config[3]
- EpilogueSchedule: (str) = single_config[4]
+ hasbiases: str = single_config[0]
+ tiles: str = single_config[2]
+ KernelSchedule: str = single_config[3]
+ EpilogueSchedule: str = single_config[4]
code_map = {}
head_path = os.path.join(generate_dir, f"launch_gemm_kernel_sm{sm[-2:]}.h")
head_all_code = LaunchGemmHead
@@ -426,16 +414,14 @@ def generate_launch_gemm_cus(
for kernel_schedule in KernelSchedule:
gemm_config_str_1 = gemm_config_str_0 + f"_{kernel_schedule}"
for epilogue_schedule in EpilogueSchedule:
- if not check_config_valid(tile_config, kernel_schedule,
- epilogue_schedule):
+ if not check_config_valid(tile_config, kernel_schedule, epilogue_schedule):
continue
gemm_config_str = gemm_config_str_1 + f"_{epilogue_schedule}"
value_dict = {
"sm": sm[-2:],
"gemm_config": gemm_config_str,
}
- head_all_code += SubstituteTemplate(LaunchGemmDeclare,
- value_dict)
+ head_all_code += SubstituteTemplate(LaunchGemmDeclare, value_dict)
os.makedirs(generate_dir, exist_ok=True)
with open(head_path, "w") as f:
f.write(head_all_code)
@@ -447,16 +433,14 @@ def generate_launch_gemm_cus(
for kernel_schedule in KernelSchedule:
gemm_config_str_1 = gemm_config_str_0 + f"_{kernel_schedule}"
for epilogue_schedule in EpilogueSchedule:
- if not check_config_valid(tile_shape, kernel_schedule,
- epilogue_schedule):
+ if not check_config_valid(tile_shape, kernel_schedule, epilogue_schedule):
continue
gemm_config_str = gemm_config_str_1 + f"_{epilogue_schedule}"
value_dict = {
"sm": sm[-2:],
"gemm_config": gemm_config_str,
}
- source_all_code = SubstituteTemplate(LaunchGemmPart0,
- value_dict)
+ source_all_code = SubstituteTemplate(LaunchGemmPart0, value_dict)
type_id = 0
for input_type in inputs_type:
for output_type in outputs_type:
@@ -475,14 +459,14 @@ def generate_launch_gemm_cus(
"SM": sm,
"sm": sm[-2:],
}
- source_all_code += SubstituteTemplate(
- LaunchGemmPart1, value_dict)
+ source_all_code += SubstituteTemplate(LaunchGemmPart1, value_dict)
type_id += 1
source_all_code += LaunchGemmPart2
code_map[gemm_config_str] = source_all_code
source_path = os.path.join(
generate_dir,
- f"launch_gemm_kernel_sm{sm[-2:]}_{gemm_config_str}.cu")
+ f"launch_gemm_kernel_sm{sm[-2:]}_{gemm_config_str}.cu",
+ )
with open(source_path, "w") as f:
f.write(source_all_code)
f.close()
@@ -491,17 +475,15 @@ def generate_launch_gemm_cus(
# generate fp8_fp8_gemm_scale_bias_act_sm90.cu
-def generate_dispatch_gemm_cu(inputs_type: (str), outputs_type: (str),
- fuse_gemm_configs: tuple, sm: str):
- """
- """
+def generate_dispatch_gemm_cu(inputs_type: str, outputs_type: str, fuse_gemm_configs: tuple, sm: str):
+ """ """
act_tags = [single_config[1] for single_config in fuse_gemm_configs]
single_config = fuse_gemm_configs[0]
- hasbiases: (str) = single_config[0]
- tiles: (str) = single_config[2]
- KernelSchedule: (str) = single_config[3]
- EpilogueSchedule: (str) = single_config[4]
+ hasbiases: str = single_config[0]
+ tiles: str = single_config[2]
+ KernelSchedule: str = single_config[3]
+ EpilogueSchedule: str = single_config[4]
all_code = SubstituteTemplate(code_part0, {"sm": sm[-2:]})
type_id = 0
@@ -524,8 +506,7 @@ def generate_dispatch_gemm_cu(inputs_type: (str), outputs_type: (str),
for tile_shape in tiles:
for kernel_schedule in KernelSchedule:
for epilogue_schedule in EpilogueSchedule:
- if not check_config_valid(tile_shape, kernel_schedule,
- epilogue_schedule):
+ if not check_config_valid(tile_shape, kernel_schedule, epilogue_schedule):
continue
value_dict = {
"TileShape": tile_shape[0],
@@ -544,8 +525,7 @@ def generate_dispatch_gemm_cu(inputs_type: (str), outputs_type: (str),
for kernel_schedule in KernelSchedule:
gemm_config_str_1 = gemm_config_str_0 + f"_{kernel_schedule}"
for epilogue_schedule in EpilogueSchedule:
- if not check_config_valid(tile_shape, kernel_schedule,
- epilogue_schedule):
+ if not check_config_valid(tile_shape, kernel_schedule, epilogue_schedule):
continue
gemm_config_str = gemm_config_str_1 + f"_{epilogue_schedule}"
value_dict = {
@@ -576,7 +556,8 @@ if __name__ == "__main__":
for fuse_gemm_config in fuse_gemm_configs:
file_name = (
f"gpu_ops/cutlass_kernels/fp8_gemm_fused/"
- f"autogen/generic_gemm_kernel_sm{sm}_{fuse_gemm_config[1][0]}.cu")
+ f"autogen/generic_gemm_kernel_sm{sm}_{fuse_gemm_config[1][0]}.cu"
+ )
all_code = generate_source_cu(
inputs_type,
outputs_type,
@@ -594,8 +575,12 @@ if __name__ == "__main__":
f.close()
# Compile parallelization
generate_launch_gemm_cus(
- "gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen", inputs_type,
- outputs_type, fuse_gemm_configs, sm_dict[sm])
+ "gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen",
+ inputs_type,
+ outputs_type,
+ fuse_gemm_configs,
+ sm_dict[sm],
+ )
# hard code for act_tag
file_name = f"gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen/fp8_fp8_gemm_scale_bias_act_sm{sm}.cu"
diff --git a/custom_ops/utils/auto_gen_visitor_fp8_gemm_fused_kernels.py b/custom_ops/utils/auto_gen_visitor_fp8_gemm_fused_kernels.py
index f234f7290..d9a53f87a 100644
--- a/custom_ops/utils/auto_gen_visitor_fp8_gemm_fused_kernels.py
+++ b/custom_ops/utils/auto_gen_visitor_fp8_gemm_fused_kernels.py
@@ -30,22 +30,24 @@ def get_candidate_tiles():
"""
base_configs = [("<64, 64, 64>", "<32, 32, 64>", "<16, 8, 32>")]
- base_configs.extend([
- ("<16, 32, 64>", "<16, 32, 64>", "<16, 8, 32>"),
- ("<16, 64, 64>", "<16, 32, 64>", "<16, 8, 32>"),
- ("<32, 128, 64>", "<32, 32, 64>", "<16, 8, 32>"),
- ("<64, 128, 64>", "<32, 64, 64>", "<16, 8, 32>"),
- ("<64, 64, 128>", "<32, 64, 64>", "<16, 8, 32>"),
- ("<64, 128, 64>", "<64, 32, 64>", "<16, 8, 32>"),
- ("<128, 64, 64>", "<64, 32, 64>", "<16, 8, 32>"),
- ("<128, 128, 64>", "<64, 32, 64>", "<16, 8, 32>"),
- ("<128, 128, 64>", "<64, 64, 64>", "<16, 8, 32>"),
- ("<128, 128, 64>", "<128, 32, 64>", "<16, 8, 32>"),
- ("<128, 256, 64>", "<64, 64, 64>", "<16, 8, 32>"),
- ("<256, 128, 64>", "<64, 64, 64>", "<16, 8, 32>"),
- ("<128, 64, 128>", "<64, 32, 128>", "<16, 8, 32>"),
- ("<16, 256, 128>", "<16, 64, 128>", "<16, 8, 32>"),
- ])
+ base_configs.extend(
+ [
+ ("<16, 32, 64>", "<16, 32, 64>", "<16, 8, 32>"),
+ ("<16, 64, 64>", "<16, 32, 64>", "<16, 8, 32>"),
+ ("<32, 128, 64>", "<32, 32, 64>", "<16, 8, 32>"),
+ ("<64, 128, 64>", "<32, 64, 64>", "<16, 8, 32>"),
+ ("<64, 64, 128>", "<32, 64, 64>", "<16, 8, 32>"),
+ ("<64, 128, 64>", "<64, 32, 64>", "<16, 8, 32>"),
+ ("<128, 64, 64>", "<64, 32, 64>", "<16, 8, 32>"),
+ ("<128, 128, 64>", "<64, 32, 64>", "<16, 8, 32>"),
+ ("<128, 128, 64>", "<64, 64, 64>", "<16, 8, 32>"),
+ ("<128, 128, 64>", "<128, 32, 64>", "<16, 8, 32>"),
+ ("<128, 256, 64>", "<64, 64, 64>", "<16, 8, 32>"),
+ ("<256, 128, 64>", "<64, 64, 64>", "<16, 8, 32>"),
+ ("<128, 64, 128>", "<64, 32, 128>", "<16, 8, 32>"),
+ ("<16, 256, 128>", "<16, 64, 128>", "<16, 8, 32>"),
+ ]
+ )
return base_configs
@@ -278,8 +280,7 @@ def parse_args():
代码参数解析
"""
parser = argparse.ArgumentParser(
- description=
- "The argument for generating the generic_mixed_gemm_kernelLauncher instance."
+ description="The argument for generating the generic_mixed_gemm_kernelLauncher instance."
)
parser.add_argument(
"--cuda_arch",
@@ -370,13 +371,10 @@ def generate_launch_gemm_cus(
- dict (code_map) - 包含每个Gemm配置对应的源代码的字典,格式为{"gemm_config": source_code}。
"""
code_map = {}
- head_path = os.path.join(generate_dir,
- "launch_visitor_gemm_fused_kernel.h")
+ head_path = os.path.join(generate_dir, "launch_visitor_gemm_fused_kernel.h")
head_all_code = LaunchGemmHead
for tile in tiles:
- blocks, warps, mmas = [
- s.replace(" ", "").strip("<>").split(",") for s in tile
- ]
+ blocks, warps, mmas = [s.replace(" ", "").strip("<>").split(",") for s in tile]
gemm_config = f"block{blocks[0]}x{blocks[1]}x{blocks[2]}_warp{warps[0]}x{warps[1]}x{warps[2]}_mma{mmas[0]}x{mmas[1]}x{mmas[2]}"
for stage in stages:
gemm_config_str = gemm_config + f"_stage{stage}"
@@ -390,9 +388,7 @@ def generate_launch_gemm_cus(
f.close()
for tile in tiles:
- blocks, warps, mmas = [
- s.replace(" ", "").strip("<>").split(",") for s in tile
- ]
+ blocks, warps, mmas = [s.replace(" ", "").strip("<>").split(",") for s in tile]
gemm_config = f"block{blocks[0]}x{blocks[1]}x{blocks[2]}_warp{warps[0]}x{warps[1]}x{warps[2]}_mma{mmas[0]}x{mmas[1]}x{mmas[2]}"
for stage in stages:
gemm_config_str = gemm_config + f"_stage{stage}"
@@ -415,14 +411,14 @@ def generate_launch_gemm_cus(
"num_stages": str(stage),
"SM": sm,
}
- source_all_code += SubstituteTemplate(
- LaunchGemmPart1, value_dict)
+ source_all_code += SubstituteTemplate(LaunchGemmPart1, value_dict)
type_id += 1
source_all_code += LaunchGemmPart2
code_map[gemm_config_str] = source_all_code
source_path = os.path.join(
generate_dir,
- f"launch_visitor_gemm_fused_kernel_{gemm_config_str}.cu")
+ f"launch_visitor_gemm_fused_kernel_{gemm_config_str}.cu",
+ )
with open(source_path, "w") as f:
f.write(source_all_code)
f.close()
@@ -485,9 +481,7 @@ def generate_dispatch_gemm_cu(
all_code += code_part4
tile_id = 0
for tile in tiles:
- blocks, warps, mmas = [
- s.replace(" ", "").strip("<>").split(",") for s in tile
- ]
+ blocks, warps, mmas = [s.replace(" ", "").strip("<>").split(",") for s in tile]
gemm_config = f"block{blocks[0]}x{blocks[1]}x{blocks[2]}_warp{warps[0]}x{warps[1]}x{warps[2]}_mma{mmas[0]}x{mmas[1]}x{mmas[2]}"
for stage in stages:
gemm_config_str = gemm_config + f"_stage{stage}"
@@ -512,10 +506,11 @@ if __name__ == "__main__":
for sm in archs:
if sm == "89":
- fuse_gemm_configs = get_candidate_configs(sm, min_stages,
- max_stages)
+ fuse_gemm_configs = get_candidate_configs(sm, min_stages, max_stages)
for fuse_gemm_config in fuse_gemm_configs:
- file_name = f"gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen/generic_visitor_gemm_fused_kernel_sm{sm}.cu"
+ file_name = (
+ f"gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen/generic_visitor_gemm_fused_kernel_sm{sm}.cu"
+ )
all_code = generate_source_cu(
inputs_type,
outputs_type,
@@ -544,9 +539,7 @@ if __name__ == "__main__":
sm_dict[sm],
)
- file_name = (
- "gpu_ops/cutlass_kernels/fp8_gemm_fused/visitor_fp8_gemm_fused.cu"
- )
+ file_name = "gpu_ops/cutlass_kernels/fp8_gemm_fused/visitor_fp8_gemm_fused.cu"
all_code = generate_dispatch_gemm_cu(
inputs_type,
outputs_type,
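The naming scheme used throughout the two generators above is easy to check in isolation: each tile tuple such as `("<64, 64, 64>", "<32, 32, 64>", "<16, 8, 32>")` is stripped of its angle brackets and joined into a `gemm_config` string. A minimal standalone sketch of just that step (the helper name is illustrative, not part of the generator):

```python
# Standalone sketch of the gemm_config naming used by the kernel generators above.
def gemm_config_name(tile, stage):
    blocks, warps, mmas = [s.replace(" ", "").strip("<>").split(",") for s in tile]
    return (
        f"block{blocks[0]}x{blocks[1]}x{blocks[2]}"
        f"_warp{warps[0]}x{warps[1]}x{warps[2]}"
        f"_mma{mmas[0]}x{mmas[1]}x{mmas[2]}"
        f"_stage{stage}"
    )

# First candidate tile from get_candidate_tiles(), stage 2:
print(gemm_config_name(("<64, 64, 64>", "<32, 32, 64>", "<16, 8, 32>"), 2))
# block64x64x64_warp32x32x64_mma16x8x32_stage2
```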
diff --git a/custom_ops/xpu_ops/src/ops/block_attn.cc b/custom_ops/xpu_ops/src/ops/block_attn.cc
index c136851f4..04eb0c568 100644
--- a/custom_ops/xpu_ops/src/ops/block_attn.cc
+++ b/custom_ops/xpu_ops/src/ops/block_attn.cc
@@ -113,7 +113,7 @@ std::vector<paddle::Tensor> BlockAttnKernel(
vsl.kv_lod_vp = {
const_cast(encoder_seq_lod_cpu.data()),
enc_batch + 1, nullptr};
-
+
baidu::xpu::api::VectorParam prefix_lens_vp{
nullptr,
0,
diff --git a/custom_ops/xpu_ops/src/setup_ops.py b/custom_ops/xpu_ops/src/setup_ops.py
index 4b2bc19f4..c819cf9d9 100755
--- a/custom_ops/xpu_ops/src/setup_ops.py
+++ b/custom_ops/xpu_ops/src/setup_ops.py
@@ -30,8 +30,7 @@ current_file = Path(__file__).resolve()
base_dir = current_file.parent
-def build_plugin(CLANG_PATH, XRE_INC_DIR, XRE_LIB_DIR, XDNN_INC_DIR,
- XDNN_LIB_DIR):
+def build_plugin(CLANG_PATH, XRE_INC_DIR, XRE_LIB_DIR, XDNN_INC_DIR, XDNN_LIB_DIR):
"""
build xpu plugin
"""
@@ -49,7 +48,10 @@ def build_plugin(CLANG_PATH, XRE_INC_DIR, XRE_LIB_DIR, XDNN_INC_DIR,
# 删除指定目录
dirs_to_remove = [
- "dist", "fastdeploy_ops.egg-info", "build", "plugin/build"
+ "dist",
+ "fastdeploy_ops.egg-info",
+ "build",
+ "plugin/build",
]
for dir_name in dirs_to_remove:
if os.path.exists(dir_name):
@@ -58,8 +60,7 @@ def build_plugin(CLANG_PATH, XRE_INC_DIR, XRE_LIB_DIR, XDNN_INC_DIR,
# 在 plugin 目录中执行构建脚本
plugin_dir = "plugin"
- build_script = os.path.join(current_working_directory, plugin_dir,
- "build.sh")
+ build_script = os.path.join(current_working_directory, plugin_dir, "build.sh")
print("build_script: ", build_script)
@@ -74,14 +75,16 @@ def build_plugin(CLANG_PATH, XRE_INC_DIR, XRE_LIB_DIR, XDNN_INC_DIR,
# 执行构建脚本
try:
print("Running build script...")
- subprocess.run([build_script],
- check=True,
- cwd=os.path.join(current_working_directory, plugin_dir))
+ subprocess.run(
+ [build_script],
+ check=True,
+ cwd=os.path.join(current_working_directory, plugin_dir),
+ )
print("Build completed successfully.")
except subprocess.CalledProcessError as e:
print(f"Build failed with error: {e}")
except Exception as e:
- print(f"Unexpected error: {str(e)}")
+ print(f"Unexpected error: {e!s}")
def xpu_setup_ops():
@@ -124,17 +127,14 @@ def xpu_setup_ops():
XVLLM_PATH = os.getenv("XVLLM_PATH")
assert XVLLM_PATH is not None, "XVLLM_PATH is not set."
XVLLM_KERNEL_INC_PATH = os.path.join(XVLLM_PATH, "infer_ops", "include")
- XVLLM_KERNEL_LIB_PATH = os.path.join(XVLLM_PATH, "infer_ops", "so",
- "libapiinfer.so")
+ XVLLM_KERNEL_LIB_PATH = os.path.join(XVLLM_PATH, "infer_ops", "so", "libapiinfer.so")
XVLLM_KERNEL_LIB_DIR = os.path.join(XVLLM_PATH, "infer_ops", "so")
XVLLM_OP_INC_PATH = os.path.join(XVLLM_PATH, "xft_blocks", "include")
- XVLLM_OP_LIB_PATH = os.path.join(XVLLM_PATH, "xft_blocks", "so",
- "libxft_blocks.so")
+ XVLLM_OP_LIB_PATH = os.path.join(XVLLM_PATH, "xft_blocks", "so", "libxft_blocks.so")
XVLLM_OP_LIB_DIR = os.path.join(XVLLM_PATH, "xft_blocks", "so")
# build plugin
- build_plugin(CLANG_PATH, XRE_INC_PATH, XRE_LIB_DIR, XDNN_INC_PATH,
- XDNN_LIB_DIR)
+ build_plugin(CLANG_PATH, XRE_INC_PATH, XRE_LIB_DIR, XDNN_INC_PATH, XDNN_LIB_DIR)
ops = [
# custom ops
@@ -152,7 +152,6 @@ def xpu_setup_ops():
"./ops/block_attn.cc",
"./ops/moe_layer.cc",
"./ops/weight_quantize_xpu.cc",
-
# device manage ops
"./ops/device/get_context_gm_max_mem_demand.cc",
"./ops/device/get_free_global_memory.cc",
diff --git a/custom_ops/xpu_ops/test/python/ops/test_get_padding_offset.py b/custom_ops/xpu_ops/test/python/ops/test_get_padding_offset.py
index 35e38e478..441912a6d 100644
--- a/custom_ops/xpu_ops/test/python/ops/test_get_padding_offset.py
+++ b/custom_ops/xpu_ops/test/python/ops/test_get_padding_offset.py
@@ -29,7 +29,7 @@ for i in range(bs):
ids_len = seq_lens[i, 0]
input_ids[i, 0:ids_len] = np.random.randint(1, 10, seq_lens[i, 0], "int64")
-x_remove_padding, cum_offsets_out, padding_offset, cu_seqlens_q, cu_seqlens_k = get_padding_offset(
+(x_remove_padding, cum_offsets_out, padding_offset, cu_seqlens_q, cu_seqlens_k,) = get_padding_offset(
paddle.to_tensor(input_ids),
paddle.to_tensor(cum_offset),
paddle.to_tensor(token_num),
@@ -46,19 +46,14 @@ print("padding_offset:\n", padding_offset)
print("cu_seqlens_q:\n", cu_seqlens_q)
print("cu_seqlens_k:\n", cu_seqlens_k)
-ref_x_remove_padding = np.array([8, 7, 8, 2, 4, 5, 5, 7, 6, 1, 7, 2, 6],
- "int64")
+ref_x_remove_padding = np.array([8, 7, 8, 2, 4, 5, 5, 7, 6, 1, 7, 2, 6], "int64")
ref_cum_offsets_out = np.array([0, 6, 13], "int32")
-ref_padding_offset = np.array([0, 0, 0, 0, 6, 6, 6, 13, 13, 13, 13, 13, 13],
- "int32")
+ref_padding_offset = np.array([0, 0, 0, 0, 6, 6, 6, 13, 13, 13, 13, 13, 13], "int32")
ref_cu_seqlens_q = np.array([0, 4, 7, 13], "int32")
ref_cu_seqlens_k = np.array([0, 4, 7, 13], "int32")
-assert sum(ref_x_remove_padding -
- x_remove_padding) == 0, 'Check x_remove_padding failed.'
-assert sum(ref_cum_offsets_out -
- cum_offsets_out) == 0, 'Check cum_offsets_out failed.'
-assert sum(ref_padding_offset -
- padding_offset) == 0, 'Check padding_offset failed.'
-assert sum(ref_cu_seqlens_q - cu_seqlens_q) == 0, 'Check cu_seqlens_q failed.'
-assert sum(ref_cu_seqlens_k - cu_seqlens_k) == 0, 'Check cu_seqlens_k failed.'
+assert sum(ref_x_remove_padding - x_remove_padding) == 0, "Check x_remove_padding failed."
+assert sum(ref_cum_offsets_out - cum_offsets_out) == 0, "Check cum_offsets_out failed."
+assert sum(ref_padding_offset - padding_offset) == 0, "Check padding_offset failed."
+assert sum(ref_cu_seqlens_q - cu_seqlens_q) == 0, "Check cu_seqlens_q failed."
+assert sum(ref_cu_seqlens_k - cu_seqlens_k) == 0, "Check cu_seqlens_k failed."
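The expected arrays in this test follow directly from the per-sequence lengths; assuming the padded length used by the test is 10 (which is what the reference values imply), they can be recomputed with a few lines of NumPy:

```python
import numpy as np

# Recompute the reference arrays above from seq_lens = [4, 3, 6], assuming the padded
# length used by the test is 10 (consistent with the expected values).
seq_lens, max_len = np.array([4, 3, 6]), 10
cu_seqlens = np.concatenate([[0], np.cumsum(seq_lens)])                  # [0, 4, 7, 13]
cum_offsets = np.concatenate([[0], np.cumsum(max_len - seq_lens)[:-1]])  # [0, 6, 13]
padding_offset = np.concatenate(
    [np.full(n, i * max_len - cu_seqlens[i]) for i, n in enumerate(seq_lens)]
)
print(cu_seqlens)      # matches ref_cu_seqlens_q and ref_cu_seqlens_k
print(cum_offsets)     # matches ref_cum_offsets_out
print(padding_offset)  # matches ref_padding_offset
```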
diff --git a/custom_ops/xpu_ops/test/python/ops/test_get_token_penalty_multi_scores.py b/custom_ops/xpu_ops/test/python/ops/test_get_token_penalty_multi_scores.py
index 5bce2d352..39a05b5aa 100644
--- a/custom_ops/xpu_ops/test/python/ops/test_get_token_penalty_multi_scores.py
+++ b/custom_ops/xpu_ops/test/python/ops/test_get_token_penalty_multi_scores.py
@@ -21,10 +21,15 @@ paddle.seed(2023)
pre_ids = paddle.to_tensor(
[[1, 9, 3, 4, 5, 6, 7, -1, -1, -1], [1, 9, 7, 6, 5, 4, -1, -1, -1, -1]],
- "int64")
-logits = paddle.to_tensor([[0.1, 0.9, 0.3, 0.4, 0.5, 0.6, 0.7, 0.1, 0.1, 0.1],
- [0.1, 0.9, 0.7, 0.6, 0.5, 0.4, 0.1, 0.1, 0.1, 0.1]],
- "float32")
+ "int64",
+)
+logits = paddle.to_tensor(
+ [
+ [0.1, 0.9, 0.3, 0.4, 0.5, 0.6, 0.7, 0.1, 0.1, 0.1],
+ [0.1, 0.9, 0.7, 0.6, 0.5, 0.4, 0.1, 0.1, 0.1, 0.1],
+ ],
+ "float32",
+)
penalty_scores = paddle.to_tensor([1.0, 1.0], "float32")
frequency_scores = paddle.to_tensor([0.1, 0.1], "float32")
presence_scores = paddle.to_tensor([0.0, 0.0], "float32")
@@ -88,78 +93,536 @@ ref_logits = np.array(
)
diff_logits = np.sum(np.abs(ref_logits - logits.numpy()))
print("diff_logits\n", diff_logits)
-assert diff_logits < 1e-6, 'Check failed.'
+assert diff_logits < 1e-6, "Check failed."
pre_ids = paddle.to_tensor(
- [[
- 2, 3, 3, 5, 8, 9, 3, 9, 1, 8, 9, 2, 3, 8, 8, 9, 9, 1, 4, 2, 6, 2, 6, 8,
- 7, 2, 2, 3, 8, 1, 5, 7, 9, 2, 2, 9, 1, 4, 9, 8, 5, 8, 5, 7, 3, 6, 4, 4,
- 9, 9, 8, 5, 5, 2, 2, 9, 4, 8, 1, 9, 6, 9, 2, 2, 7, 2, 2, 9, 4, 6, 4, 6,
- 1, 4, 1, 9, 1, 8, 8, 5, 7, 9, 4, 2, 5, 1, 1, 4, 1, 5, 5, 4, 4, 2, 1, 8,
- 7, 1, 2, 9, 6, 7, 9, 6, 7, 7, 4, 9, 9, 7, 5, 1, 8, 9, 8, 8, 5, 4, 6, 4,
- 7, 5, 5, 7, 6, 9, 3, 9
- ],
- [
- 7, 8, 1, 3, 1, 7, 6, 3, 5, 3, 8, 3, 1, 9, 7, 1, 1, 9, 5, 4, 9, 6, 1,
- 9, 3, 8, 3, 9, 9, 6, 4, 2, 8, 5, 3, 1, 6, 9, 1, 3, 9, 8, 1, 7, 5, 1,
- 5, 1, 8, 7, 4, 5, 9, 8, 7, 4, 7, 3, 6, 4, 6, 6, 5, 5, 2, 9, 9, 5, 8,
- 8, 4, 8, 2, 8, 1, 3, 9, 1, 8, 5, 8, 3, 8, 8, 2, 7, 3, 7, 5, 7, 2, 6,
- 3, 5, 1, 4, 6, 1, 9, 8, 2, 2, 3, 6, 7, 6, 2, 6, 5, 1, 5, 6, 2, 1, 6,
- 4, 7, 7, 3, 8, 5, 1, 9, 1, 2, 8, 6, 8
- ]])
+ [
+ [
+ 2,
+ 3,
+ 3,
+ 5,
+ 8,
+ 9,
+ 3,
+ 9,
+ 1,
+ 8,
+ 9,
+ 2,
+ 3,
+ 8,
+ 8,
+ 9,
+ 9,
+ 1,
+ 4,
+ 2,
+ 6,
+ 2,
+ 6,
+ 8,
+ 7,
+ 2,
+ 2,
+ 3,
+ 8,
+ 1,
+ 5,
+ 7,
+ 9,
+ 2,
+ 2,
+ 9,
+ 1,
+ 4,
+ 9,
+ 8,
+ 5,
+ 8,
+ 5,
+ 7,
+ 3,
+ 6,
+ 4,
+ 4,
+ 9,
+ 9,
+ 8,
+ 5,
+ 5,
+ 2,
+ 2,
+ 9,
+ 4,
+ 8,
+ 1,
+ 9,
+ 6,
+ 9,
+ 2,
+ 2,
+ 7,
+ 2,
+ 2,
+ 9,
+ 4,
+ 6,
+ 4,
+ 6,
+ 1,
+ 4,
+ 1,
+ 9,
+ 1,
+ 8,
+ 8,
+ 5,
+ 7,
+ 9,
+ 4,
+ 2,
+ 5,
+ 1,
+ 1,
+ 4,
+ 1,
+ 5,
+ 5,
+ 4,
+ 4,
+ 2,
+ 1,
+ 8,
+ 7,
+ 1,
+ 2,
+ 9,
+ 6,
+ 7,
+ 9,
+ 6,
+ 7,
+ 7,
+ 4,
+ 9,
+ 9,
+ 7,
+ 5,
+ 1,
+ 8,
+ 9,
+ 8,
+ 8,
+ 5,
+ 4,
+ 6,
+ 4,
+ 7,
+ 5,
+ 5,
+ 7,
+ 6,
+ 9,
+ 3,
+ 9,
+ ],
+ [
+ 7,
+ 8,
+ 1,
+ 3,
+ 1,
+ 7,
+ 6,
+ 3,
+ 5,
+ 3,
+ 8,
+ 3,
+ 1,
+ 9,
+ 7,
+ 1,
+ 1,
+ 9,
+ 5,
+ 4,
+ 9,
+ 6,
+ 1,
+ 9,
+ 3,
+ 8,
+ 3,
+ 9,
+ 9,
+ 6,
+ 4,
+ 2,
+ 8,
+ 5,
+ 3,
+ 1,
+ 6,
+ 9,
+ 1,
+ 3,
+ 9,
+ 8,
+ 1,
+ 7,
+ 5,
+ 1,
+ 5,
+ 1,
+ 8,
+ 7,
+ 4,
+ 5,
+ 9,
+ 8,
+ 7,
+ 4,
+ 7,
+ 3,
+ 6,
+ 4,
+ 6,
+ 6,
+ 5,
+ 5,
+ 2,
+ 9,
+ 9,
+ 5,
+ 8,
+ 8,
+ 4,
+ 8,
+ 2,
+ 8,
+ 1,
+ 3,
+ 9,
+ 1,
+ 8,
+ 5,
+ 8,
+ 3,
+ 8,
+ 8,
+ 2,
+ 7,
+ 3,
+ 7,
+ 5,
+ 7,
+ 2,
+ 6,
+ 3,
+ 5,
+ 1,
+ 4,
+ 6,
+ 1,
+ 9,
+ 8,
+ 2,
+ 2,
+ 3,
+ 6,
+ 7,
+ 6,
+ 2,
+ 6,
+ 5,
+ 1,
+ 5,
+ 6,
+ 2,
+ 1,
+ 6,
+ 4,
+ 7,
+ 7,
+ 3,
+ 8,
+ 5,
+ 1,
+ 9,
+ 1,
+ 2,
+ 8,
+ 6,
+ 8,
+ ],
+ ]
+)
logits = paddle.to_tensor(
- [[
- 0.16274983, 0.61470598, 0.94366980, 0.82005417, 0.50752640, 0.38316748,
- 0.92648441, 0.24050158, 0.05461595, 0.42218581, 0.36270225, 0.15464807,
- 0.13614719, 0.67509544, 0.40315166, 0.10671722, 0.24832056, 0.76091218,
- 0.11598995, 0.10962527, 0.04688513, 0.81536716, 0.72259802, 0.60476679,
- 0.16701800, 0.84160781, 0.79649884, 0.78021604, 0.75329530, 0.98587888,
- 0.13421868, 0.16027625, 0.15269397, 0.06228730, 0.73856270, 0.34721911,
- 0.73683006, 0.78178608, 0.32068327, 0.79906309, 0.44214272, 0.63330448,
- 0.08016958, 0.63367140, 0.19788943, 0.55346787, 0.11142531, 0.90518415,
- 0.21236691, 0.81587470, 0.83752930, 0.70979482, 0.35684183, 0.28715104,
- 0.87162822, 0.17679396, 0.98725849, 0.76129991, 0.04090235, 0.37181064,
- 0.63317049, 0.24689502, 0.21126501, 0.57617670, 0.74346697, 0.40613672,
- 0.56907010, 0.68556929, 0.29032683, 0.17866278, 0.35165095, 0.97015840,
- 0.70785582, 0.54259878, 0.14712237, 0.90483177, 0.02094105, 0.36411613,
- 0.02495066, 0.88874054, 0.88895452, 0.86216462, 0.58062190, 0.95583254,
- 0.20553111, 0.29870346, 0.69652933, 0.36861244, 0.85316223, 0.50240189,
- 0.17566244, 0.61080140, 0.88203174, 0.98675215, 0.24344546, 0.17213407,
- 0.78160852, 0.25165486, 0.48188508, 0.82812423, 0.10199814, 0.90475923,
- 0.66907483, 0.71910626, 0.40660757, 0.59460294, 0.70212913, 0.90841550,
- 0.00329034, 0.11290466, 0.89654654, 0.69114941, 0.29473618, 0.62027222,
- 0.37333879, 0.98911142, 0.46510187, 0.65914583, 0.73022646, 0.12790845,
- 0.12817244, 0.43015456, 0.75011456, 0.43562204, 0.48086026, 0.75587070,
- 0.98481447, 0.77367836
- ],
- [
- 0.12336024, 0.74152875, 0.09191196, 0.99301219, 0.44764417,
- 0.01848883, 0.78326035, 0.99228370, 0.81447607, 0.02627683,
- 0.51033205, 0.98703283, 0.15247856, 0.77640921, 0.60799915,
- 0.87518770, 0.76818430, 0.86542630, 0.31795895, 0.04829503,
- 0.85567141, 0.30271924, 0.67515039, 0.59728831, 0.78710967,
- 0.75111693, 0.56837374, 0.49085775, 0.91510201, 0.59545547,
- 0.99482232, 0.59036905, 0.58267909, 0.28770933, 0.53237396,
- 0.95318258, 0.93987304, 0.61142951, 0.26737869, 0.52285451,
- 0.03479086, 0.61631846, 0.66777998, 0.15736090, 0.00447258,
- 0.37035006, 0.15281211, 0.95372260, 0.25963321, 0.61036694,
- 0.15020694, 0.19171195, 0.55252832, 0.00391038, 0.31052542,
- 0.96495175, 0.42586124, 0.05630261, 0.99728668, 0.01856293,
- 0.83201504, 0.10701843, 0.56434178, 0.38009524, 0.51095045,
- 0.13202040, 0.07133843, 0.75313550, 0.17111187, 0.80716974,
- 0.00172165, 0.83906764, 0.73240769, 0.85843354, 0.11042888,
- 0.07912333, 0.33689004, 0.22334915, 0.59059596, 0.52789515,
- 0.29831955, 0.39515004, 0.55602801, 0.83818001, 0.05865780,
- 0.25654668, 0.76624149, 0.35190639, 0.04158346, 0.59157544,
- 0.30779791, 0.94609004, 0.10759670, 0.65575141, 0.37828529,
- 0.29571742, 0.76361233, 0.72476572, 0.18568406, 0.85430276,
- 0.02057583, 0.76195669, 0.65507215, 0.69129735, 0.25084621,
- 0.75223947, 0.06064088, 0.20287007, 0.35887691, 0.75043523,
- 0.47575447, 0.40021798, 0.44464844, 0.67975360, 0.40443239,
- 0.71052992, 0.21782248, 0.50568426, 0.89037591, 0.06661721,
- 0.28788096, 0.70773387, 0.42428264, 0.80419677, 0.42710736,
- 0.87317258, 0.88229448, 0.79217333
- ]])
+ [
+ [
+ 0.16274983,
+ 0.61470598,
+ 0.94366980,
+ 0.82005417,
+ 0.50752640,
+ 0.38316748,
+ 0.92648441,
+ 0.24050158,
+ 0.05461595,
+ 0.42218581,
+ 0.36270225,
+ 0.15464807,
+ 0.13614719,
+ 0.67509544,
+ 0.40315166,
+ 0.10671722,
+ 0.24832056,
+ 0.76091218,
+ 0.11598995,
+ 0.10962527,
+ 0.04688513,
+ 0.81536716,
+ 0.72259802,
+ 0.60476679,
+ 0.16701800,
+ 0.84160781,
+ 0.79649884,
+ 0.78021604,
+ 0.75329530,
+ 0.98587888,
+ 0.13421868,
+ 0.16027625,
+ 0.15269397,
+ 0.06228730,
+ 0.73856270,
+ 0.34721911,
+ 0.73683006,
+ 0.78178608,
+ 0.32068327,
+ 0.79906309,
+ 0.44214272,
+ 0.63330448,
+ 0.08016958,
+ 0.63367140,
+ 0.19788943,
+ 0.55346787,
+ 0.11142531,
+ 0.90518415,
+ 0.21236691,
+ 0.81587470,
+ 0.83752930,
+ 0.70979482,
+ 0.35684183,
+ 0.28715104,
+ 0.87162822,
+ 0.17679396,
+ 0.98725849,
+ 0.76129991,
+ 0.04090235,
+ 0.37181064,
+ 0.63317049,
+ 0.24689502,
+ 0.21126501,
+ 0.57617670,
+ 0.74346697,
+ 0.40613672,
+ 0.56907010,
+ 0.68556929,
+ 0.29032683,
+ 0.17866278,
+ 0.35165095,
+ 0.97015840,
+ 0.70785582,
+ 0.54259878,
+ 0.14712237,
+ 0.90483177,
+ 0.02094105,
+ 0.36411613,
+ 0.02495066,
+ 0.88874054,
+ 0.88895452,
+ 0.86216462,
+ 0.58062190,
+ 0.95583254,
+ 0.20553111,
+ 0.29870346,
+ 0.69652933,
+ 0.36861244,
+ 0.85316223,
+ 0.50240189,
+ 0.17566244,
+ 0.61080140,
+ 0.88203174,
+ 0.98675215,
+ 0.24344546,
+ 0.17213407,
+ 0.78160852,
+ 0.25165486,
+ 0.48188508,
+ 0.82812423,
+ 0.10199814,
+ 0.90475923,
+ 0.66907483,
+ 0.71910626,
+ 0.40660757,
+ 0.59460294,
+ 0.70212913,
+ 0.90841550,
+ 0.00329034,
+ 0.11290466,
+ 0.89654654,
+ 0.69114941,
+ 0.29473618,
+ 0.62027222,
+ 0.37333879,
+ 0.98911142,
+ 0.46510187,
+ 0.65914583,
+ 0.73022646,
+ 0.12790845,
+ 0.12817244,
+ 0.43015456,
+ 0.75011456,
+ 0.43562204,
+ 0.48086026,
+ 0.75587070,
+ 0.98481447,
+ 0.77367836,
+ ],
+ [
+ 0.12336024,
+ 0.74152875,
+ 0.09191196,
+ 0.99301219,
+ 0.44764417,
+ 0.01848883,
+ 0.78326035,
+ 0.99228370,
+ 0.81447607,
+ 0.02627683,
+ 0.51033205,
+ 0.98703283,
+ 0.15247856,
+ 0.77640921,
+ 0.60799915,
+ 0.87518770,
+ 0.76818430,
+ 0.86542630,
+ 0.31795895,
+ 0.04829503,
+ 0.85567141,
+ 0.30271924,
+ 0.67515039,
+ 0.59728831,
+ 0.78710967,
+ 0.75111693,
+ 0.56837374,
+ 0.49085775,
+ 0.91510201,
+ 0.59545547,
+ 0.99482232,
+ 0.59036905,
+ 0.58267909,
+ 0.28770933,
+ 0.53237396,
+ 0.95318258,
+ 0.93987304,
+ 0.61142951,
+ 0.26737869,
+ 0.52285451,
+ 0.03479086,
+ 0.61631846,
+ 0.66777998,
+ 0.15736090,
+ 0.00447258,
+ 0.37035006,
+ 0.15281211,
+ 0.95372260,
+ 0.25963321,
+ 0.61036694,
+ 0.15020694,
+ 0.19171195,
+ 0.55252832,
+ 0.00391038,
+ 0.31052542,
+ 0.96495175,
+ 0.42586124,
+ 0.05630261,
+ 0.99728668,
+ 0.01856293,
+ 0.83201504,
+ 0.10701843,
+ 0.56434178,
+ 0.38009524,
+ 0.51095045,
+ 0.13202040,
+ 0.07133843,
+ 0.75313550,
+ 0.17111187,
+ 0.80716974,
+ 0.00172165,
+ 0.83906764,
+ 0.73240769,
+ 0.85843354,
+ 0.11042888,
+ 0.07912333,
+ 0.33689004,
+ 0.22334915,
+ 0.59059596,
+ 0.52789515,
+ 0.29831955,
+ 0.39515004,
+ 0.55602801,
+ 0.83818001,
+ 0.05865780,
+ 0.25654668,
+ 0.76624149,
+ 0.35190639,
+ 0.04158346,
+ 0.59157544,
+ 0.30779791,
+ 0.94609004,
+ 0.10759670,
+ 0.65575141,
+ 0.37828529,
+ 0.29571742,
+ 0.76361233,
+ 0.72476572,
+ 0.18568406,
+ 0.85430276,
+ 0.02057583,
+ 0.76195669,
+ 0.65507215,
+ 0.69129735,
+ 0.25084621,
+ 0.75223947,
+ 0.06064088,
+ 0.20287007,
+ 0.35887691,
+ 0.75043523,
+ 0.47575447,
+ 0.40021798,
+ 0.44464844,
+ 0.67975360,
+ 0.40443239,
+ 0.71052992,
+ 0.21782248,
+ 0.50568426,
+ 0.89037591,
+ 0.06661721,
+ 0.28788096,
+ 0.70773387,
+ 0.42428264,
+ 0.80419677,
+ 0.42710736,
+ 0.87317258,
+ 0.88229448,
+ 0.79217333,
+ ],
+ ]
+)
# pre_ids = paddle.to_tensor(np.float32(np.random.random([2, 1024])))
# logits = paddle.to_tensor(np.float32(np.random.random([2, 1024])))
penalty_scores = paddle.to_tensor([1.0, 1.0], "float32")
@@ -195,60 +658,270 @@ print("min_len\n", min_len)
print("eos_token_id\n", eos_token_id)
ref_logits = np.array(
- [[
- -10000000000., -10000000000., 1.88733959, 1.64010835, 1.01505280,
- 0.76633495, 1.85296881, 0.48100317, 0.10923190, 0.84437162, 0.72540450,
- 0.30929613, 0.27229437, 1.35019088, 0.80630332, 0.21343444, 0.49664113,
- 1.52182436, 0.23197991, 0.21925054, 0.09377026, 1.63073432, 1.44519603,
- 1.20953357, 0.33403599, 1.68321562, 1.59299767, 1.56043208, 1.50659060,
- 1.97175777, 0.26843736, 0.32055250, 0.30538794, 0.12457460, 1.47712541,
- 0.69443822, 1.47366011, 1.56357217, 0.64136654, 1.59812617, 0.88428545,
- 1.26660895, 0.16033916, 1.26734281, 0.39577886, 1.10693574, 0.22285062,
- 1.81036830, 0.42473382, 1.63174939, 1.67505860, 1.41958964, 0.71368366,
- 0.57430208, 1.74325645, 0.35358793, 1.97451699, 1.52259982, 0.08180470,
- 0.74362129, 1.26634097, 0.49379003, 0.42253003, 1.15235341, 1.48693395,
- 0.81227344, 1.13814020, 1.37113857, 0.58065367, 0.35732555, 0.70330191,
- 1.94031680, 1.41571164, 1.08519757, 0.29424474, 1.80966353, 0.04188210,
- 0.72823226, 0.04990132, 1.77748108, 1.77790904, 1.72432923, 1.16124380,
- 1.91166508, 0.41106221, 0.59740692, 1.39305866, 0.73722488, 1.70632446,
- 1.00480378, 0.35132489, 1.22160280, 1.76406348, 1.97350430, 0.48689091,
- 0.34426814, 1.56321704, 0.50330973, 0.96377015, 1.65624845, 0.20399629,
- 1.80951846, 1.33814967, 1.43821251, 0.81321514, 1.18920588, 1.40425825,
- 1.81683099, 0.00658068, 0.22580932, 1.79309309, 1.38229883, 0.58947235,
- 1.24054444, 0.74667758, 1.97822285, 0.93020374, 1.31829166, 1.46045291,
- 0.25581691, 0.25634488, 0.86030912, 1.50022912, 0.87124407, 0.96172053,
- 1.51174140, 1.96962893, 1.54735672
+ [
+ [
+ -10000000000.0,
+ -10000000000.0,
+ 1.88733959,
+ 1.64010835,
+ 1.01505280,
+ 0.76633495,
+ 1.85296881,
+ 0.48100317,
+ 0.10923190,
+ 0.84437162,
+ 0.72540450,
+ 0.30929613,
+ 0.27229437,
+ 1.35019088,
+ 0.80630332,
+ 0.21343444,
+ 0.49664113,
+ 1.52182436,
+ 0.23197991,
+ 0.21925054,
+ 0.09377026,
+ 1.63073432,
+ 1.44519603,
+ 1.20953357,
+ 0.33403599,
+ 1.68321562,
+ 1.59299767,
+ 1.56043208,
+ 1.50659060,
+ 1.97175777,
+ 0.26843736,
+ 0.32055250,
+ 0.30538794,
+ 0.12457460,
+ 1.47712541,
+ 0.69443822,
+ 1.47366011,
+ 1.56357217,
+ 0.64136654,
+ 1.59812617,
+ 0.88428545,
+ 1.26660895,
+ 0.16033916,
+ 1.26734281,
+ 0.39577886,
+ 1.10693574,
+ 0.22285062,
+ 1.81036830,
+ 0.42473382,
+ 1.63174939,
+ 1.67505860,
+ 1.41958964,
+ 0.71368366,
+ 0.57430208,
+ 1.74325645,
+ 0.35358793,
+ 1.97451699,
+ 1.52259982,
+ 0.08180470,
+ 0.74362129,
+ 1.26634097,
+ 0.49379003,
+ 0.42253003,
+ 1.15235341,
+ 1.48693395,
+ 0.81227344,
+ 1.13814020,
+ 1.37113857,
+ 0.58065367,
+ 0.35732555,
+ 0.70330191,
+ 1.94031680,
+ 1.41571164,
+ 1.08519757,
+ 0.29424474,
+ 1.80966353,
+ 0.04188210,
+ 0.72823226,
+ 0.04990132,
+ 1.77748108,
+ 1.77790904,
+ 1.72432923,
+ 1.16124380,
+ 1.91166508,
+ 0.41106221,
+ 0.59740692,
+ 1.39305866,
+ 0.73722488,
+ 1.70632446,
+ 1.00480378,
+ 0.35132489,
+ 1.22160280,
+ 1.76406348,
+ 1.97350430,
+ 0.48689091,
+ 0.34426814,
+ 1.56321704,
+ 0.50330973,
+ 0.96377015,
+ 1.65624845,
+ 0.20399629,
+ 1.80951846,
+ 1.33814967,
+ 1.43821251,
+ 0.81321514,
+ 1.18920588,
+ 1.40425825,
+ 1.81683099,
+ 0.00658068,
+ 0.22580932,
+ 1.79309309,
+ 1.38229883,
+ 0.58947235,
+ 1.24054444,
+ 0.74667758,
+ 1.97822285,
+ 0.93020374,
+ 1.31829166,
+ 1.46045291,
+ 0.25581691,
+ 0.25634488,
+ 0.86030912,
+ 1.50022912,
+ 0.87124407,
+ 0.96172053,
+ 1.51174140,
+ 1.96962893,
+ 1.54735672,
+ ],
+ [
+ -10000000000.0,
+ -10000000000.0,
+ -40000.0,
+ 3.97204876,
+ 1.79057670,
+ 0.07395532,
+ 3.13304138,
+ 3.96913481,
+ 3.25790429,
+ -40000.0,
+ 2.04132819,
+ 3.94813132,
+ 0.60991424,
+ 3.10563684,
+ 2.43199658,
+ 3.50075078,
+ 3.07273722,
+ 3.46170521,
+ 1.27183580,
+ 0.19318011,
+ 3.42268562,
+ 1.21087694,
+ 2.70060158,
+ 2.38915324,
+ 3.14843869,
+ 3.00446773,
+ 2.27349496,
+ 1.96343100,
+ 3.66040802,
+ 2.38182187,
+ 3.97928929,
+ 2.36147618,
+ 2.33071637,
+ 1.15083730,
+ 2.12949586,
+ 3.81273031,
+ 3.75949216,
+ 2.44571805,
+ 1.06951475,
+ 2.09141803,
+ 0.13916343,
+ 2.46527386,
+ 2.67111993,
+ 0.62944359,
+ 0.01789032,
+ 1.48140025,
+ 0.61124843,
+ 3.81489038,
+ 1.03853285,
+ 2.44146776,
+ 0.60082775,
+ 0.76684779,
+ 2.21011329,
+ 0.01564152,
+ 1.24210167,
+ 3.85980701,
+ 1.70344496,
+ 0.22521044,
+ 3.98914671,
+ 0.07425172,
+ 3.32806015,
+ 0.42807373,
+ 2.25736713,
+ 1.52038097,
+ 2.04380178,
+ 0.52808160,
+ 0.28535372,
+ 3.01254201,
+ 0.68444747,
+ 3.22867894,
+ 0.00688660,
+ 3.35627055,
+ 2.92963076,
+ 3.43373418,
+ 0.44171551,
+ 0.31649333,
+ 1.34756017,
+ 0.89339662,
+ 2.36238384,
+ 2.11158061,
+ 1.19327819,
+ 1.58060014,
+ 2.22411203,
+ 3.35272002,
+ 0.23463120,
+ 1.02618670,
+ 3.06496596,
+ 1.40762556,
+ 0.16633384,
+ 2.36630177,
+ 1.23119164,
+ 3.78436017,
+ 0.43038681,
+ 2.62300563,
+ 1.51314116,
+ 1.18286967,
+ 3.05444932,
+ 2.89906287,
+ 0.74273622,
+ 3.41721106,
+ 0.08230332,
+ 3.04782677,
+ 2.62028861,
+ 2.76518941,
+ 1.00338483,
+ 3.00895786,
+ 0.24256352,
+ 0.81148028,
+ 1.43550766,
+ 3.00174093,
+ 1.90301788,
+ 1.60087192,
+ 1.77859378,
+ 2.71901441,
+ 1.61772954,
+ 2.84211969,
+ 0.87128991,
+ 2.02273703,
+ 3.56150365,
+ 0.26646885,
+ 1.15152383,
+ 2.83093548,
+ 1.69713056,
+ 3.21678710,
+ 1.70842946,
+ 3.49269032,
+ 3.52917790,
+ 3.16869330,
+ ],
],
- [
- -10000000000., -10000000000., -40000., 3.97204876, 1.79057670,
- 0.07395532, 3.13304138, 3.96913481, 3.25790429, -40000., 2.04132819,
- 3.94813132, 0.60991424, 3.10563684, 2.43199658, 3.50075078,
- 3.07273722, 3.46170521, 1.27183580, 0.19318011, 3.42268562,
- 1.21087694, 2.70060158, 2.38915324, 3.14843869, 3.00446773,
- 2.27349496, 1.96343100, 3.66040802, 2.38182187, 3.97928929,
- 2.36147618, 2.33071637, 1.15083730, 2.12949586, 3.81273031,
- 3.75949216, 2.44571805, 1.06951475, 2.09141803, 0.13916343,
- 2.46527386, 2.67111993, 0.62944359, 0.01789032, 1.48140025,
- 0.61124843, 3.81489038, 1.03853285, 2.44146776, 0.60082775,
- 0.76684779, 2.21011329, 0.01564152, 1.24210167, 3.85980701,
- 1.70344496, 0.22521044, 3.98914671, 0.07425172, 3.32806015,
- 0.42807373, 2.25736713, 1.52038097, 2.04380178, 0.52808160,
- 0.28535372, 3.01254201, 0.68444747, 3.22867894, 0.00688660,
- 3.35627055, 2.92963076, 3.43373418, 0.44171551, 0.31649333,
- 1.34756017, 0.89339662, 2.36238384, 2.11158061, 1.19327819,
- 1.58060014, 2.22411203, 3.35272002, 0.23463120, 1.02618670,
- 3.06496596, 1.40762556, 0.16633384, 2.36630177, 1.23119164,
- 3.78436017, 0.43038681, 2.62300563, 1.51314116, 1.18286967,
- 3.05444932, 2.89906287, 0.74273622, 3.41721106, 0.08230332,
- 3.04782677, 2.62028861, 2.76518941, 1.00338483, 3.00895786,
- 0.24256352, 0.81148028, 1.43550766, 3.00174093, 1.90301788,
- 1.60087192, 1.77859378, 2.71901441, 1.61772954, 2.84211969,
- 0.87128991, 2.02273703, 3.56150365, 0.26646885, 1.15152383,
- 2.83093548, 1.69713056, 3.21678710, 1.70842946, 3.49269032,
- 3.52917790, 3.16869330
- ]],
"float32",
)
diff_logits = np.sum(np.abs(ref_logits - logits.numpy()))
print("diff_logits\n", diff_logits)
-assert diff_logits < 1e-6, 'Check failed.'
+assert diff_logits < 1e-6, "Check failed."
diff --git a/custom_ops/xpu_ops/test/python/ops/test_set_value_by_flags_and_idx.py b/custom_ops/xpu_ops/test/python/ops/test_set_value_by_flags_and_idx.py
index 70e4901ac..966ec5de2 100644
--- a/custom_ops/xpu_ops/test/python/ops/test_set_value_by_flags_and_idx.py
+++ b/custom_ops/xpu_ops/test/python/ops/test_set_value_by_flags_and_idx.py
@@ -21,19 +21,30 @@ paddle.seed(2023)
pre_ids_all = paddle.to_tensor(
[[1, 9, 3, 4, 5, 6, 7, -1, -1, -1], [1, 9, 7, 6, 5, 4, -1, -1, -1, -1]],
- "int64")
-input_ids = paddle.to_tensor([[1, 9, 3, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1],
- [1, 9, 7, 6, 5, 4, -1, -1, -1, -1, -1, -1, -1]],
- "int64")
+ "int64",
+)
+input_ids = paddle.to_tensor(
+ [
+ [1, 9, 3, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1],
+ [1, 9, 7, 6, 5, 4, -1, -1, -1, -1, -1, -1, -1],
+ ],
+ "int64",
+)
seq_lens_this_time = paddle.to_tensor([1, 1], "int32")
seq_lens_encoder = paddle.to_tensor([1, 1], "int32")
seq_lens_decoder = paddle.to_tensor([1, 1], "int32")
step_idx = paddle.to_tensor([1, 1], "int64")
stop_flags = paddle.to_tensor([0, 1], "bool")
print("pre_ids_all\n", pre_ids_all)
-set_value_by_flags_and_idx(pre_ids_all, input_ids, seq_lens_this_time,
- seq_lens_encoder, seq_lens_decoder, step_idx,
- stop_flags)
+set_value_by_flags_and_idx(
+ pre_ids_all,
+ input_ids,
+ seq_lens_this_time,
+ seq_lens_encoder,
+ seq_lens_decoder,
+ step_idx,
+ stop_flags,
+)
print("pre_ids_all\n", pre_ids_all)
print("input_ids\n", input_ids)
print("seq_lens_this_time\n", seq_lens_this_time)
@@ -73,4 +84,4 @@ ref_pre_ids_all = np.array(
)
diff_pre_ids_all = np.sum(np.abs(ref_pre_ids_all - pre_ids_all.numpy()))
print("diff_pre_ids_all\n", diff_pre_ids_all)
-assert diff_pre_ids_all == 0, 'Check failed.'
+assert diff_pre_ids_all == 0, "Check failed."
diff --git a/custom_ops/xpu_ops/test/python/ops/test_step.py b/custom_ops/xpu_ops/test/python/ops/test_step.py
index 5334c316c..9d9eaf7e4 100644
--- a/custom_ops/xpu_ops/test/python/ops/test_step.py
+++ b/custom_ops/xpu_ops/test/python/ops/test_step.py
@@ -41,10 +41,7 @@ step_idx = (seq_lens_decoder - ori_seq_lens_encoder).astype("int64")
max_block_num = block_bs * max_seq_len // block_size
free_list_len = int(max_block_num * (1 - block_ratio))
free_list_len = np.full([1], free_list_len, "int32")
-free_list = np.arange(max_block_num - 1,
- max_block_num - free_list_len - 1,
- -1,
- dtype="int32")
+free_list = np.arange(max_block_num - 1, max_block_num - free_list_len - 1, -1, dtype="int32")
encoder_block_lens = np.zeros([max_bs], "int32")
used_list_len = np.zeros([max_bs], "int32")
@@ -53,19 +50,15 @@ encoder_block_id = 0
for i in range(bs):
enc_block_num = (ori_seq_lens_encoder[i] + block_size - 1) // block_size
encoder_block_lens[i] = enc_block_num
- dec_block_num = (seq_lens_decoder[i] + block_size -
- 1) // block_size - enc_block_num
+ dec_block_num = (seq_lens_decoder[i] + block_size - 1) // block_size - enc_block_num
used_list_len[i] = dec_block_num
- block_tables[i, :enc_block_num] = np.arange(
- encoder_block_id, encoder_block_id + enc_block_num, 1, "int32")
+ block_tables[i, :enc_block_num] = np.arange(encoder_block_id, encoder_block_id + enc_block_num, 1, "int32")
encoder_block_id += enc_block_num
if dec_block_num > 0:
- block_tables[
- i, enc_block_num:enc_block_num +
- dec_block_num] = free_list[free_list_len[0] - 1 -
- dec_block_num:free_list_len[0] - 1]
- free_list[free_list_len[0] - 1 - dec_block_num:free_list_len[0] -
- 1] = -1
+ block_tables[i, enc_block_num : enc_block_num + dec_block_num] = free_list[
+ free_list_len[0] - 1 - dec_block_num : free_list_len[0] - 1
+ ]
+ free_list[free_list_len[0] - 1 - dec_block_num : free_list_len[0] - 1] = -1
free_list_len[0] -= dec_block_num
assert free_list_len[0] >= 0
@@ -137,13 +130,32 @@ first_token_ids = paddle.to_tensor(first_token_ids)
# print("step_idx: ", step_idx)
# print("next_tokens: ", next_tokens)
-step_paddle(stop_flags, seq_lens_this_time, ori_seq_lens_encoder,
- seq_lens_encoder, seq_lens_decoder, block_tables,
- encoder_block_lens, is_block_step, step_block_list, step_lens,
- recover_block_list, recover_lens, need_block_list, need_block_len,
- used_list_len, free_list, free_list_len, input_ids, pre_ids,
- step_idx, next_tokens, first_token_ids, block_size,
- encoder_decoder_block_num)
+step_paddle(
+ stop_flags,
+ seq_lens_this_time,
+ ori_seq_lens_encoder,
+ seq_lens_encoder,
+ seq_lens_decoder,
+ block_tables,
+ encoder_block_lens,
+ is_block_step,
+ step_block_list,
+ step_lens,
+ recover_block_list,
+ recover_lens,
+ need_block_list,
+ need_block_len,
+ used_list_len,
+ free_list,
+ free_list_len,
+ input_ids,
+ pre_ids,
+ step_idx,
+ next_tokens,
+ first_token_ids,
+ block_size,
+ encoder_decoder_block_num,
+)
print("-" * 50 + "after step op" + "-" * 50)
print("stop_flags: ", stop_flags)
diff --git a/custom_ops/xpu_ops/test/python/ops/test_stop_generation_multi_ends.py b/custom_ops/xpu_ops/test/python/ops/test_stop_generation_multi_ends.py
index cbe4c48bf..537e41f5e 100644
--- a/custom_ops/xpu_ops/test/python/ops/test_stop_generation_multi_ends.py
+++ b/custom_ops/xpu_ops/test/python/ops/test_stop_generation_multi_ends.py
@@ -30,8 +30,7 @@ end_ids = paddle.to_tensor([0, 1, 2, 3, 4, 5], "int64")
print("topk_ids\n", topk_ids)
print("next_tokens\n", next_tokens)
print("stop_flags\n", stop_flags)
-set_stop_value_multi_ends(topk_ids, stop_flags, seq_lens, end_ids, next_tokens,
- False)
+set_stop_value_multi_ends(topk_ids, stop_flags, seq_lens, end_ids, next_tokens, False)
print("topk_ids\n", topk_ids)
print("next_tokens\n", next_tokens)
print("stop_flags\n", stop_flags)
@@ -40,44 +39,220 @@ print("end_ids\n", end_ids)
ref_topk_ids = np.array(
[
- 0, 0, 2, 3, -1, 0, 0, 0, 0, 9, 10, 0, 12, 0, -1, 15, 16, 0, 18, 19, 20,
- 0, 22, 23, 0, 25, 26, 27, -1, 29, 30, 31, 0, 0, 0, -1, -1, 37, 38, 39,
- -1, -1, 0, 0, 0, 0, 46, -1, 0, 49, 50, 0, 52, 53, 0, -1, 0, 57, -1, 59,
- 60, 0, 0, 63
+ 0,
+ 0,
+ 2,
+ 3,
+ -1,
+ 0,
+ 0,
+ 0,
+ 0,
+ 9,
+ 10,
+ 0,
+ 12,
+ 0,
+ -1,
+ 15,
+ 16,
+ 0,
+ 18,
+ 19,
+ 20,
+ 0,
+ 22,
+ 23,
+ 0,
+ 25,
+ 26,
+ 27,
+ -1,
+ 29,
+ 30,
+ 31,
+ 0,
+ 0,
+ 0,
+ -1,
+ -1,
+ 37,
+ 38,
+ 39,
+ -1,
+ -1,
+ 0,
+ 0,
+ 0,
+ 0,
+ 46,
+ -1,
+ 0,
+ 49,
+ 50,
+ 0,
+ 52,
+ 53,
+ 0,
+ -1,
+ 0,
+ 57,
+ -1,
+ 59,
+ 60,
+ 0,
+ 0,
+ 63,
],
"int64",
)
ref_next_tokens = np.array(
[
- 0, 0, 2, 3, 0, 0, 0, 0, 0, 9, 10, 0, 12, 0, 0, 15, 16, 0, 18, 19, 20,
- 0, 22, 23, 0, 25, 26, 27, 0, 29, 30, 31, 0, 0, 0, 0, 0, 37, 38, 39, 0,
- 0, 0, 0, 0, 0, 46, 0, 0, 49, 50, 0, 52, 53, 0, 0, 0, 57, 0, 59, 60, 0,
- 0, 63
+ 0,
+ 0,
+ 2,
+ 3,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 9,
+ 10,
+ 0,
+ 12,
+ 0,
+ 0,
+ 15,
+ 16,
+ 0,
+ 18,
+ 19,
+ 20,
+ 0,
+ 22,
+ 23,
+ 0,
+ 25,
+ 26,
+ 27,
+ 0,
+ 29,
+ 30,
+ 31,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 37,
+ 38,
+ 39,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 46,
+ 0,
+ 0,
+ 49,
+ 50,
+ 0,
+ 52,
+ 53,
+ 0,
+ 0,
+ 0,
+ 57,
+ 0,
+ 59,
+ 60,
+ 0,
+ 0,
+ 63,
],
"int64",
)
ref_stop_flags = np.array(
[
- True, True, True, True, True, True, True, True, True, False, False,
- True, False, True, True, False, False, True, False, False, False, True,
- False, False, True, False, False, False, True, False, False, False,
- True, True, True, True, True, False, False, False, True, True, True,
- True, True, True, False, True, True, False, False, True, False, False,
- True, True, True, False, True, False, False, True, True, False
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ False,
+ False,
+ True,
+ False,
+ True,
+ True,
+ False,
+ False,
+ True,
+ False,
+ False,
+ False,
+ True,
+ False,
+ False,
+ True,
+ False,
+ False,
+ False,
+ True,
+ False,
+ False,
+ False,
+ True,
+ True,
+ True,
+ True,
+ True,
+ False,
+ False,
+ False,
+ True,
+ True,
+ True,
+ True,
+ True,
+ True,
+ False,
+ True,
+ True,
+ False,
+ False,
+ True,
+ False,
+ False,
+ True,
+ True,
+ True,
+ False,
+ True,
+ False,
+ False,
+ True,
+ True,
+ False,
],
"bool",
)
diff_topk_ids = np.sum(np.abs(ref_topk_ids - topk_ids.numpy()))
print("diff_topk_ids\n", diff_topk_ids)
-assert diff_topk_ids == 0, 'Check failed.'
+assert diff_topk_ids == 0, "Check failed."
diff_next_tokens = np.sum(np.abs(ref_next_tokens - next_tokens.numpy()))
print("diff_next_tokens\n", diff_next_tokens)
-assert diff_next_tokens == 0, 'Check failed.'
-diff_stop_flags = np.sum(
- np.abs(
- ref_stop_flags.astype(np.int32) - stop_flags.numpy().astype(np.int32)))
+assert diff_next_tokens == 0, "Check failed."
+diff_stop_flags = np.sum(np.abs(ref_stop_flags.astype(np.int32) - stop_flags.numpy().astype(np.int32)))
print("diff_stop_flags\n", diff_stop_flags)
-assert diff_stop_flags == 0, 'Check failed.'
+assert diff_stop_flags == 0, "Check failed."
# test beam_search=True
topk_ids = paddle.arange(0, bs, dtype="int64")
@@ -88,8 +263,7 @@ end_ids = paddle.to_tensor([0, 1, 2, 3, 4, 5], "int64")
print("topk_ids\n", topk_ids)
print("next_tokens\n", next_tokens)
print("stop_flags\n", stop_flags)
-set_stop_value_multi_ends(topk_ids, stop_flags, seq_lens, end_ids, next_tokens,
- True)
+set_stop_value_multi_ends(topk_ids, stop_flags, seq_lens, end_ids, next_tokens, True)
print("topk_ids\n", topk_ids)
print("next_tokens\n", next_tokens)
print("stop_flags\n", stop_flags)
@@ -98,42 +272,217 @@ print("end_ids\n", end_ids)
ref_topk_ids = np.array(
[
- 0, 1, 2, 3, 4, 0, 6, 7, -1, 9, 10, 0, -1, 13, 14, 15, 0, 17, 18, 19,
- 20, 0, 22, 23, 24, 25, -1, -1, 28, 29, 0, 0, -1, 33, 34, 35, 36, 37, 0,
- -1, 0, 41, -1, 0, 44, 45, 46, 0, 0, 49, 0, 0, 0, 53, 0, 0, 0, 0, 58,
- -1, 60, 61, -1, 63
+ 0,
+ 1,
+ 2,
+ 3,
+ 4,
+ 0,
+ 6,
+ 7,
+ -1,
+ 9,
+ 10,
+ 0,
+ -1,
+ 13,
+ 14,
+ 15,
+ 0,
+ 17,
+ 18,
+ 19,
+ 20,
+ 0,
+ 22,
+ 23,
+ 24,
+ 25,
+ -1,
+ -1,
+ 28,
+ 29,
+ 0,
+ 0,
+ -1,
+ 33,
+ 34,
+ 35,
+ 36,
+ 37,
+ 0,
+ -1,
+ 0,
+ 41,
+ -1,
+ 0,
+ 44,
+ 45,
+ 46,
+ 0,
+ 0,
+ 49,
+ 0,
+ 0,
+ 0,
+ 53,
+ 0,
+ 0,
+ 0,
+ 0,
+ 58,
+ -1,
+ 60,
+ 61,
+ -1,
+ 63,
],
"int64",
)
ref_next_tokens = np.array(
[
- 0, 1, 2, 3, 4, 0, 6, 7, 0, 9, 10, 0, 0, 13, 14, 15, 0, 17, 18, 19, 20,
- 0, 22, 23, 24, 25, 0, 0, 28, 29, 0, 0, 0, 33, 34, 35, 36, 37, 0, 0, 0,
- 41, 0, 0, 44, 45, 46, 0, 0, 49, 0, 0, 0, 53, 0, 0, 0, 0, 58, 0, 60, 61,
- 0, 63
+ 0,
+ 1,
+ 2,
+ 3,
+ 4,
+ 0,
+ 6,
+ 7,
+ 0,
+ 9,
+ 10,
+ 0,
+ 0,
+ 13,
+ 14,
+ 15,
+ 0,
+ 17,
+ 18,
+ 19,
+ 20,
+ 0,
+ 22,
+ 23,
+ 24,
+ 25,
+ 0,
+ 0,
+ 28,
+ 29,
+ 0,
+ 0,
+ 0,
+ 33,
+ 34,
+ 35,
+ 36,
+ 37,
+ 0,
+ 0,
+ 0,
+ 41,
+ 0,
+ 0,
+ 44,
+ 45,
+ 46,
+ 0,
+ 0,
+ 49,
+ 0,
+ 0,
+ 0,
+ 53,
+ 0,
+ 0,
+ 0,
+ 0,
+ 58,
+ 0,
+ 60,
+ 61,
+ 0,
+ 63,
],
"int64",
)
ref_stop_flags = np.array(
[
- False, False, False, False, False, True, False, False, True, False,
- False, True, True, False, False, False, True, False, False, False,
- False, True, False, False, False, False, True, True, False, False,
- True, True, True, False, False, False, False, False, True, True, True,
- False, True, True, False, False, False, True, True, False, True, True,
- True, False, True, True, True, True, False, True, False, False, True,
- False
+ False,
+ False,
+ False,
+ False,
+ False,
+ True,
+ False,
+ False,
+ True,
+ False,
+ False,
+ True,
+ True,
+ False,
+ False,
+ False,
+ True,
+ False,
+ False,
+ False,
+ False,
+ True,
+ False,
+ False,
+ False,
+ False,
+ True,
+ True,
+ False,
+ False,
+ True,
+ True,
+ True,
+ False,
+ False,
+ False,
+ False,
+ False,
+ True,
+ True,
+ True,
+ False,
+ True,
+ True,
+ False,
+ False,
+ False,
+ True,
+ True,
+ False,
+ True,
+ True,
+ True,
+ False,
+ True,
+ True,
+ True,
+ True,
+ False,
+ True,
+ False,
+ False,
+ True,
+ False,
],
"bool",
)
diff_topk_ids = np.sum(np.abs(ref_topk_ids - topk_ids.numpy()))
print("diff_topk_ids\n", diff_topk_ids)
-assert diff_topk_ids == 0, 'Check failed.'
+assert diff_topk_ids == 0, "Check failed."
diff_next_tokens = np.sum(np.abs(ref_next_tokens - next_tokens.numpy()))
print("diff_next_tokens\n", diff_next_tokens)
-assert diff_next_tokens == 0, 'Check failed.'
-diff_stop_flags = np.sum(
- np.abs(
- ref_stop_flags.astype(np.int32) - stop_flags.numpy().astype(np.int32)))
+assert diff_next_tokens == 0, "Check failed."
+diff_stop_flags = np.sum(np.abs(ref_stop_flags.astype(np.int32) - stop_flags.numpy().astype(np.int32)))
print("diff_stop_flags\n", diff_stop_flags)
-assert diff_stop_flags == 0, 'Check failed.'
+assert diff_stop_flags == 0, "Check failed."
diff --git a/custom_ops/xpu_ops/test/python/ops/test_update_inputs.py b/custom_ops/xpu_ops/test/python/ops/test_update_inputs.py
index d1e8e36dd..037429b22 100644
--- a/custom_ops/xpu_ops/test/python/ops/test_update_inputs.py
+++ b/custom_ops/xpu_ops/test/python/ops/test_update_inputs.py
@@ -60,9 +60,17 @@ print("stop_nums:\n", stop_nums)
print("next_tokens:\n", next_tokens)
print("is_block_step:\n", is_block_step)
-update_inputs(stop_flags, not_need_stop, seq_lens_this_time, seq_lens_encoder,
- seq_lens_decoder, input_ids, stop_nums, next_tokens,
- is_block_step)
+update_inputs(
+ stop_flags,
+ not_need_stop,
+ seq_lens_this_time,
+ seq_lens_encoder,
+ seq_lens_decoder,
+ input_ids,
+ stop_nums,
+ next_tokens,
+ is_block_step,
+)
print("-" * 50)
print("stop_flags:\n", stop_flags)
@@ -75,32 +83,269 @@ print("stop_nums:\n", stop_nums)
print("next_tokens:\n", next_tokens)
ref_not_need_stop_out = np.array([True])
-ref_seq_lens_this_time_out = np.array([
- 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1,
- 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1
-], "int32")
-ref_seq_lens_encoder_out = np.array([
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-], "int32")
-ref_seq_lens_decoder_out = np.array([
- 0, 0, 2, 0, 0, 6, 0, 8, 8, 10, 0, 12, 12, 0, 0, 0, 0, 0, 0, 0, 20, 22, 0,
- 24, 24, 0, 26, 28, 0, 0, 0, 32, 32, 0, 34, 0, 0, 38, 0, 40, 0, 0, 42, 0, 0,
- 46, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-], "int32")
-input_ids_np[:, 0] = np.array([
- 6, 5, 9, 8, 6, 2, 8, 1, 3, 1, 3, 6, 9, 8, 1, 9, 1, 8, 8, 6, 7, 6, 5, 3, 5,
- 9, 3, 6, 3, 9, 8, 8, 8, 8, 4, 8, 7, 4, 2, 3, 5, 8, 4, 2, 5, 6, 8, 9, 6, 7,
- 4, 2, 4, 6, 2, 3, 4, 9, 7, 2, 1, 8, 7, 8
-], "int64")
+ref_seq_lens_this_time_out = np.array(
+ [
+ 0,
+ 0,
+ 1,
+ 0,
+ 0,
+ 1,
+ 0,
+ 1,
+ 1,
+ 1,
+ 0,
+ 1,
+ 1,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 1,
+ 1,
+ 0,
+ 1,
+ 1,
+ 0,
+ 1,
+ 1,
+ 0,
+ 0,
+ 0,
+ 1,
+ 1,
+ 0,
+ 1,
+ 0,
+ 0,
+ 1,
+ 0,
+ 1,
+ 0,
+ 0,
+ 1,
+ 0,
+ 0,
+ 1,
+ 1,
+ 1,
+ ],
+ "int32",
+)
+ref_seq_lens_encoder_out = np.array(
+ [
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ ],
+ "int32",
+)
+ref_seq_lens_decoder_out = np.array(
+ [
+ 0,
+ 0,
+ 2,
+ 0,
+ 0,
+ 6,
+ 0,
+ 8,
+ 8,
+ 10,
+ 0,
+ 12,
+ 12,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 20,
+ 22,
+ 0,
+ 24,
+ 24,
+ 0,
+ 26,
+ 28,
+ 0,
+ 0,
+ 0,
+ 32,
+ 32,
+ 0,
+ 34,
+ 0,
+ 0,
+ 38,
+ 0,
+ 40,
+ 0,
+ 0,
+ 42,
+ 0,
+ 0,
+ 46,
+ 46,
+ 48,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ ],
+ "int32",
+)
+input_ids_np[:, 0] = np.array(
+ [
+ 6,
+ 5,
+ 9,
+ 8,
+ 6,
+ 2,
+ 8,
+ 1,
+ 3,
+ 1,
+ 3,
+ 6,
+ 9,
+ 8,
+ 1,
+ 9,
+ 1,
+ 8,
+ 8,
+ 6,
+ 7,
+ 6,
+ 5,
+ 3,
+ 5,
+ 9,
+ 3,
+ 6,
+ 3,
+ 9,
+ 8,
+ 8,
+ 8,
+ 8,
+ 4,
+ 8,
+ 7,
+ 4,
+ 2,
+ 3,
+ 5,
+ 8,
+ 4,
+ 2,
+ 5,
+ 6,
+ 8,
+ 9,
+ 6,
+ 7,
+ 4,
+ 2,
+ 4,
+ 6,
+ 2,
+ 3,
+ 4,
+ 9,
+ 7,
+ 2,
+ 1,
+ 8,
+ 7,
+ 8,
+ ],
+ "int64",
+)
-assert not_need_stop.numpy(
-) == ref_not_need_stop_out, 'Check not_need_stop failed.'
-assert np.all(seq_lens_this_time.numpy() ==
- ref_seq_lens_this_time_out), 'Check seq_lens_this_time failed.'
-assert np.all(seq_lens_encoder.numpy() ==
- ref_seq_lens_encoder_out), 'Check seq_lens_encoder failed.'
-assert np.all(seq_lens_decoder.numpy() ==
- ref_seq_lens_decoder_out), 'Check seq_lens_decoder failed.'
-assert np.all(input_ids.numpy() == input_ids_np), 'Check input_ids failed.'
+assert not_need_stop.numpy() == ref_not_need_stop_out, "Check not_need_stop failed."
+assert np.all(seq_lens_this_time.numpy() == ref_seq_lens_this_time_out), "Check seq_lens_this_time failed."
+assert np.all(seq_lens_encoder.numpy() == ref_seq_lens_encoder_out), "Check seq_lens_encoder failed."
+assert np.all(seq_lens_decoder.numpy() == ref_seq_lens_decoder_out), "Check seq_lens_decoder failed."
+assert np.all(input_ids.numpy() == input_ids_np), "Check input_ids failed."
diff --git a/custom_ops/xpu_ops/test/python/ops/test_weight_quantize_xpu.py b/custom_ops/xpu_ops/test/python/ops/test_weight_quantize_xpu.py
index e946d4069..59312c95d 100644
--- a/custom_ops/xpu_ops/test/python/ops/test_weight_quantize_xpu.py
+++ b/custom_ops/xpu_ops/test/python/ops/test_weight_quantize_xpu.py
@@ -29,16 +29,15 @@ def np_quant_weight_int4(weight_np):
weight = np.transpose(weight_np, [1, 0]) # n,k
max_value = np.max(np.abs(weight), axis=1).reshape(-1, 1) # k => k,1
quanted_weight = np_clip_and_round(weight / max_value * 7.0, 7) # n,k
- quanted_weight = (quanted_weight[:, 1::2] & 0xF) << 4 | (
- quanted_weight[:, ::2] & 0xF) # pack int4, [n,k//2]
+ quanted_weight = (quanted_weight[:, 1::2] & 0xF) << 4 | (quanted_weight[:, ::2] & 0xF) # pack int4, [n,k//2]
weight_scales = (max_value).astype(weight_np.dtype).reshape(-1)
return quanted_weight, weight_scales.astype(np.float32)
-def np_quant_weight(weight_np, algo='weight_only_int8'):
+def np_quant_weight(weight_np, algo="weight_only_int8"):
assert weight_np.dtype == np.float32
- if algo == 'weight_only_int4':
+ if algo == "weight_only_int4":
return np_quant_weight_int4(weight_np)
weight = np.transpose(weight_np, [1, 0])
@@ -56,7 +55,7 @@ def int8_to_bin_np(value):
def int8_to_bin(value):
if not -128 <= value <= 127:
raise ValueError("int8 值必须在 -128 到 127 之间")
- return format(value & 0xFF, '08b') # '08b' 表示 8 位二进制,高位补零
+ return format(value & 0xFF, "08b") # '08b' 表示 8 位二进制,高位补零
# 1) preparation
@@ -70,7 +69,7 @@ w_np = (np.random.random((k, n)).astype(np.float32) - 0.5) * 10
qw_np, wscale_np = np_quant_weight(w_np, algo)
# 3) xpu calculation
-dtype = 'float32'
+dtype = "float32"
x_pd = paddle.to_tensor(w_np, dtype=dtype)
qw_pd, wscale_pd = weight_quantize_xpu(x_pd, algo, -1, -1)
qw_pd_trans = paddle.transpose(qw_pd, [1, 0])
@@ -83,12 +82,7 @@ qw_pd_trans = paddle.transpose(qw_pd, [1, 0])
# comparison
print(f"wscale_pd, mean={wscale_pd.mean()}, std={wscale_pd.std()}")
print(f"wscale_np, mean={wscale_np.mean()}, std={wscale_np.std()}")
-print(
- f"qw_np, mean={qw_np.astype(np.float32).mean()}, std={qw_np.astype(np.float32).std()}"
-)
-print(
- f"qw_pd_trans, mean={qw_pd_trans.astype('float32').mean()}, std={qw_pd_trans.astype('float32').std()}"
-)
-sum_diff = np.sum(
- np.abs(qw_pd_trans.astype("float32").numpy() - qw_np.astype("float32")))
+print(f"qw_np, mean={qw_np.astype(np.float32).mean()}, std={qw_np.astype(np.float32).std()}")
+print(f"qw_pd_trans, mean={qw_pd_trans.astype('float32').mean()}, std={qw_pd_trans.astype('float32').std()}")
+sum_diff = np.sum(np.abs(qw_pd_trans.astype("float32").numpy() - qw_np.astype("float32")))
print(f"sum_diff: {sum_diff}")
diff --git a/docs/benchmark.md b/docs/benchmark.md
index 67f2a8c05..46283b627 100644
--- a/docs/benchmark.md
+++ b/docs/benchmark.md
@@ -37,4 +37,4 @@ python benchmark_serving.py \
--num-prompts 1 \
--max-concurrency 1 \
--save-result
-```
\ No newline at end of file
+```
diff --git a/docs/features/disaggregated.md b/docs/features/disaggregated.md
index 4fddfc84a..e5e20dcae 100644
--- a/docs/features/disaggregated.md
+++ b/docs/features/disaggregated.md
@@ -15,7 +15,7 @@ We provide two transmission methods for KV Cache, targeting intra-machine and in
Uses cudaMemcpyPeer for KV Cache transmission between two GPUs within a single machine, offering low latency and high throughput.
### Inter-machine Transmission
-For transmission between multiple machines, uses high-speed RDMA network for KV Cache transmission. We provide the `rdma_comm` high-speed transmission network library for cross-machine KV Cache transmission.
+For transmission between multiple machines, KV Cache is transferred over a high-speed RDMA network. We provide the `rdma_comm` high-speed transmission library for cross-machine KV Cache transfer.
## PD Disaggregated Scheduling

@@ -60,7 +60,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
--cache-queue-port 8187 \
--tensor-parallel-size 4 \
--quantization wint4 \
- --innode-prefill-ports 8182 \
+ --innode-prefill-ports 8182 \
--splitwise-role "decode"
```
@@ -72,7 +72,8 @@ Refer to the example code `offline_disaggregated_demo.py` in the `fastdeploy/dem
### Multi-machine Disaggregated Deployment
#### Prerequisite: Redis
-- Installation via `conda`
+* Installation via `conda`
+
```bash
# Install
conda install redis
@@ -80,7 +81,8 @@ conda install redis
nohup redis-server > redis.log 2>&1 &
```
-- Installation via `apt`
+* Installation via `apt`
+
```bash
# Install
sudo apt install redis-server -y
@@ -88,7 +90,8 @@ sudo apt install redis-server -y
sudo systemctl start redis-server
```
-- Installation via `yum`
+* Installation via `yum`
+
```bash
# Install
sudo yum install redis -y
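Both the disaggregated deployment above and the load-balance setup that follows treat a running Redis instance as a hard prerequisite. Before launching FastDeploy it can be worth confirming that the server started via `redis-server` or `systemctl` is actually reachable; a two-line check, assuming the third-party `redis` Python package (`pip install redis`) and the default port:

```python
import redis  # third-party redis-py client, not part of FastDeploy

client = redis.Redis(host="127.0.0.1", port=6379)
print("redis reachable:", client.ping())  # True once redis-server accepts connections
```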
diff --git a/docs/features/load_balance.md b/docs/features/load_balance.md
index a022470d1..78f832e7c 100644
--- a/docs/features/load_balance.md
+++ b/docs/features/load_balance.md
@@ -38,6 +38,7 @@ conda install redis
# Launch
nohup redis-server > redis.log 2>&1 &
```
+
### apt installation (Debian/Ubuntu)
```bash
@@ -57,6 +58,7 @@ sudo systemctl start redis
```
## Launching FastDeploy
+
```bash
python -m fastdeploy.entrypoints.openai.api_server \
--port 8801 \
@@ -72,6 +74,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
--scheduler-min-load_score 3 \
--scheduler-load-shards-num 1
```
+
[Scheduler Launching Parameter](../online_serving/scheduler.md)
### Deployment notes:
diff --git a/docs/features/prefix_caching.md b/docs/features/prefix_caching.md
index 1e2148135..0a58336de 100644
--- a/docs/features/prefix_caching.md
+++ b/docs/features/prefix_caching.md
@@ -36,4 +36,4 @@ python -m fastdeploy.entrypoints.openai.api_server \
Set `enable_prefix_caching=True` when launching FastDeploy. Enable CPU caching via `swap_space` based on available machine memory.
-A test example is provided: `demo/offline_prefix_caching_demo.py`
\ No newline at end of file
+A test example is provided: `demo/offline_prefix_caching_demo.py`
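A minimal offline sketch of the two switches mentioned above. It assumes the `LLM` constructor accepts `enable_prefix_caching` and `swap_space` keyword arguments mirroring the server flags; the shipped `demo/offline_prefix_caching_demo.py` remains the authoritative example.

```python
# Hypothetical offline usage; flag names mirror the server arguments and are assumed
# to be accepted by the LLM constructor.
from fastdeploy import LLM, SamplingParams

llm = LLM(
    model="/path/to/your/model",
    enable_prefix_caching=True,   # reuse KV Cache across prompts that share a prefix
    swap_space=50,                # host-memory cache tier; size it to your machine
)
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=64))
print(outputs[0].outputs.text)
```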
diff --git a/docs/features/reasoning_output.md b/docs/features/reasoning_output.md
index 78ea9de2e..5f23e65d5 100644
--- a/docs/features/reasoning_output.md
+++ b/docs/features/reasoning_output.md
@@ -18,8 +18,9 @@ Interfaces that support toggling the reasoning mode:
For reasoning models, the length of the reasoning content can be controlled via `reasoning_max_tokens`. Add `metadata={"reasoning_max_tokens": 1024}` to the request.
### Quick Start
-When launching the model service, specify the parser name using the `--reasoning-parser` argument.
+When launching the model service, specify the parser name using the `--reasoning-parser` argument.
This parser will process the model's output and extract the `reasoning_content` field.
+
```bash
python -m fastdeploy.entrypoints.openai.api_server \
--model /path/to/your/model \
@@ -29,7 +30,9 @@ python -m fastdeploy.entrypoints.openai.api_server \
--quantization wint4 \
--reasoning-parser ernie-45-vl
```
+
Next, make a request to the model that should return the reasoning content in the response.
+
```bash
curl -X POST "http://0.0.0.0:8192/v1/chat/completions" \
-H "Content-Type: application/json" \
@@ -43,10 +46,12 @@ curl -X POST "http://0.0.0.0:8192/v1/chat/completions" \
"metadata": {"enable_thinking": true}
}'
```
+
The `reasoning_content` field contains the reasoning steps to reach the final conclusion, while the `content` field holds the conclusion itself.
### Streaming chat completions
Streaming chat completions are also supported for reasoning models. The `reasoning_content` field is available in the `delta` field in `chat completion response chunks`
+
```python
from openai import OpenAI
# Set OpenAI's API key and API base to use vLLM's API server.
@@ -69,4 +74,4 @@ for chunk in chat_response:
if chunk.choices[0].delta is not None:
print(chunk.choices[0].delta, end='')
print("\n")
-```
\ No newline at end of file
+```
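For a non-streaming request that also caps the reasoning length, the `metadata` object from the curl example can be passed through the OpenAI Python client's `extra_body` parameter; a sketch assuming the same server address as above and a placeholder model name:

```python
from openai import OpenAI

client = OpenAI(base_url="http://0.0.0.0:8192/v1", api_key="EMPTY")  # server launched above
response = client.chat.completions.create(
    model="your-model-name",  # placeholder; match your deployment
    messages=[{"role": "user", "content": "Which is larger, 9.11 or 9.9?"}],
    extra_body={"metadata": {"enable_thinking": True, "reasoning_max_tokens": 1024}},
    stream=False,
)
message = response.choices[0].message
print(getattr(message, "reasoning_content", None))  # reasoning steps, as described above
print(message.content)                              # the final answer
```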
diff --git a/docs/features/speculative_decoding.md b/docs/features/speculative_decoding.md
index 0e6da2283..4093dcca5 100644
--- a/docs/features/speculative_decoding.md
+++ b/docs/features/speculative_decoding.md
@@ -10,22 +10,22 @@ This project implements an efficient **Speculative Decoding** inference framewor
- **Ngram**
-- **MTP (Multi-Token Prediction)**
- - ✅ Supported: TP Sharding
- - ✅ Supported: Shared Prefix
- - ✅ Supported: TP Sharding + PD Separation
+- **MTP (Multi-Token Prediction)**
+ - ✅ Supported: TP Sharding
+ - ✅ Supported: Shared Prefix
+ - ✅ Supported: TP Sharding + PD Separation
- ⏳ Coming Soon: EP + DP + PD Separation
- ⏳ Coming Soon: Support Chunk-prefill
- - ⏳ Coming Soon: Multi-layer MTP Layer
+ - ⏳ Coming Soon: Multi-layer MTP Layer
---
### Coming Soon
-- Draft Model
-- Eagle
-- Hydra
-- Medusa
+- Draft Model
+- Eagle
+- Hydra
+- Medusa
- ...
---
@@ -54,7 +54,7 @@ This project implements an efficient **Speculative Decoding** inference framewor
## 🚀 Using Multi-Token Prediction (MTP)
-For detailed theory, refer to:
+For detailed theory, refer to:
📄 [DeepSeek-V3 Paper](https://arxiv.org/pdf/2412.19437)
### TP Sharding Mode
@@ -147,4 +147,4 @@ python -m fastdeploy.entrypoints.openai.api_server \
--config ${path_to_FastDeploy}benchmarks/yaml/eb45t-32k-wint4-mtp-h100-tp4.yaml \
--speculative-config '{"method": "mtp", "num_speculative_tokens": 1, "model": "${mtp_model_path}"}'
-```
\ No newline at end of file
+```
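Since the `--speculative-config` value above is a JSON object embedded in a shell command, it can be convenient to build and quote it programmatically; a small helper sketch (paths are placeholders):

```python
import json
import shlex

spec_cfg = {"method": "mtp", "num_speculative_tokens": 1, "model": "/path/to/mtp_model"}
print("--speculative-config " + shlex.quote(json.dumps(spec_cfg)))
# --speculative-config '{"method": "mtp", "num_speculative_tokens": 1, "model": "/path/to/mtp_model"}'
```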
diff --git a/docs/get_started/installation/Enflame_gcu.md b/docs/get_started/installation/Enflame_gcu.md
index 844c38626..46d7f0d84 100644
--- a/docs/get_started/installation/Enflame_gcu.md
+++ b/docs/get_started/installation/Enflame_gcu.md
@@ -132,4 +132,3 @@ Upon completion, accuracy results are saved in ```result.jsonl```, e.g.:
```json
{"task": "gsm8k", "backend": "paddlepaddle", "num_gpus": 1, "latency": 13446.01, "accuracy": 0.956, "num_requests": 1319, "other": {"num_questions": 1319, "parallel": 8}}
```
-
diff --git a/docs/get_started/installation/README.md b/docs/get_started/installation/README.md
index 5fb4ab6b9..ba7042e26 100644
--- a/docs/get_started/installation/README.md
+++ b/docs/get_started/installation/README.md
@@ -6,4 +6,4 @@ FastDeploy currently supports installation on the following hardware platforms:
- [Kunlun XPU Installation](kunlunxin_xpu.md)
- [Enflame S60 GCU Installation](Enflame_gcu.md)
- [Iluvatar GPU Installation](iluvatar_gpu.md)
-- [Hygon DCU Installation](hygon_dcu.md)
\ No newline at end of file
+- [Hygon DCU Installation](hygon_dcu.md)
diff --git a/docs/get_started/installation/hygon_dcu.md b/docs/get_started/installation/hygon_dcu.md
index e5e3eea67..245ee4457 100644
--- a/docs/get_started/installation/hygon_dcu.md
+++ b/docs/get_started/installation/hygon_dcu.md
@@ -37,6 +37,7 @@ image.sourcefind.cn:5000/dcu/admin/base/custom:fastdeploy2.0.0-kylinv10-dtk25.04
```
## 2. Start service
+
```bash
export FD_ATTENTION_BACKEND="BLOCK_ATTN"
python -m fastdeploy.entrypoints.openai.api_server \
@@ -47,7 +48,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
--gpu-memory-utilization=0.8
```
-#### Send requests
+### Send requests
Send requests using either curl or Python
@@ -78,4 +79,4 @@ response = client.chat.completions.create(
stream=False,
)
print(response)
-```
\ No newline at end of file
+```
diff --git a/docs/get_started/installation/iluvatar_gpu.md b/docs/get_started/installation/iluvatar_gpu.md
index 5284d08d5..754cc7c0f 100644
--- a/docs/get_started/installation/iluvatar_gpu.md
+++ b/docs/get_started/installation/iluvatar_gpu.md
@@ -1,115 +1,120 @@
-# Run ERNIE-4.5-300B-A47B & ERNIE-4.5-21B-A3B model on iluvatar machine
-The current version of the software merely serves as a demonstration demo for the Iluvatar CoreX combined with the Fastdeploy inference framework for large models. There may be issues when running the latest ERNIE4.5 model, and we will conduct repairs and performance optimization in the future. Subsequent versions will provide customers with a more stable version.
-
-## Machine Preparation
-First, you need to prepare a machine with the following configurations:
-
-| CPU | Memory | Card | Hard Disk|
-| :---: | :---: | :---: | :---: |
-| x86 | 1TB| 8xBI150| 1TB|
-
-Currently, the entire model needs to be loaded into the host memory, which requires more than 600GB of host memory. This issue will be optimized in subsequent versions.
-
-## Image Preparation
-Pull the Docker image
-
-```bash
-docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
-```
-
-## Container Preparation
-1. Start Container
-```bash
-docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
-docker exec -it paddle_infer bash
-```
-/home/paddle contains the model files, *.whl packages, and scripts.
-
-2. Install packages
-
-```bash
-pip3 install paddlepaddle==3.1.0a0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
-pip3 install paddle-iluvatar-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/
-pip3 install fastdeploy_iluvatar_gpu -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simplels
-```
-
-## Prepare the inference demo script
-
-script list below:
-
-`run_demo.sh`:
-```bash
-#!/bin/bash
-export PADDLE_XCCL_BACKEND=iluvatar_gpu
-export INFERENCE_MSG_QUEUE_ID=232132
-export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
-export FD_DEBUG=1
-python3 run_demo.py
-```
-
-`run_demo.py`:
-
-```python
-from fastdeploy import LLM, SamplingParams
-
-prompts = [
- "Hello, my name is",
- "The largest ocean is",
-]
-
-# sampling parameters
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)
-
-# load the model
-llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, static_decode_blocks=0, quantization='wint8')
-
-# Perform batch inference
-outputs = llm.generate(prompts, sampling_params)
-# Note:Replace `/home/paddle/ernie-4_5-21b-a3b-bf16-paddle` in it with the path to the ERNIE model you have downloaded.
-
-for output in outputs:
- prompt = output.prompt
- generated_text = output.outputs.text
- print(prompt, generated_text)
-```
-
-## run demo
-
-```bash
-./run_demo.sh
-```
-The following logs will be printed: Loading the model took approximately 74 seconds, and running the demo took approximately 240 seconds.
-```
-/usr/local/lib/python3.10/site-packages/paddle/utils/cpp_extension/extension_utils.py:715: UserWarning: No ccache found. Please be aware that recompiling all source files may be required. You can download and install ccache from: https://github.com/ccache/ccache/blob/master/doc/INSTALL.md
- warnings.warn(warning_message)
-/usr/local/lib/python3.10/site-packages/_distutils_hack/__init__.py:31: UserWarning: Setuptools is replacing distutils. Support for replacing an already imported distutils is deprecated. In the future, this condition will fail. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml
- warnings.warn(
-[2025-07-02 11:07:42,393] [ INFO] - Loading configuration file /home/paddle/ernie-4_5-21b-a3b-bf16-paddle/generation_config.json
-/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:250: UserWarning: using greedy search strategy. However, `temperature` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or unset `temperature`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.
- warnings.warn(
-/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:255: UserWarning: using greedy search strategy. However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or unset `top_p`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.
- warnings.warn(
-INFO 2025-07-02 11:07:43,589 577964 engine.py[line:207] Waitting worker processes ready...
-Loading Weights: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:57<00:00, 1.75it/s]
-Loading Layers: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:08<00:00, 11.73it/s]
-INFO 2025-07-02 11:08:55,261 577964 engine.py[line:277] Worker processes are launched with 73.76574492454529 seconds.
-Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [03:59<00:00, 119.96s/it, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
-Hello, my name is Christopher. Today, I'm going to teach you how to draw a cute cartoon ghost. Let's get started!
- (1) First, draw a big circle for the ghost's head.
- (2) Then, add two small circles for the eyes, making sure they're not too big.
- (3) Next, draw a wide, open mouth that looks like a big "U".
- (4) After that, create the body by drawing a slightly smaller circle below the head.
- (5) Now, let's add some arms. Draw two short, curly lines on each side of the body.
- (6) Finally, give the ghost a wavy line at the bottom to represent its floating appearance.
-
-Now, let's break down each step:
-
-**Step 1: Drawing the Head**
-- Start with a big circle to form the head of the ghost. This will be the foundation of your drawing.
-
-**Step 2: Adding Eyes**
-- On the head, place two small circles for the eyes. They should be centered and not too big, to give the ghost a cute and innocent look.
-
-**Step 3: Drawing the
-The largest ocean is the Pacific Ocean, covering an area of approximately ⦠[3], The first scientific expeditions to determine the ocean's depth were the Challenger expedition (1872â1876) and the U.S. Navy Hydrographic Office survey (1877â1879). The oceanic crust is thin and irregular, consisting of upward moving magma from the mantle below, and cooling and solidifying on the surface. The shallowest parts of the ocean are called the continental shelves. Large tides are caused mainly by the alignment of the Sun, Moon, and Earth during new or full moons. The origin of the word "ocean" is not clear. The first global oceanic topography survey was completed by the Challenger expedition (1872â1876). [57] The sound speed in the ocean is primarily a function of water temperature and salinity, and varies with depth. The deep-ocean floor is mostly flat and devoid of life, with the exception of seamounts and various underwater volcanic features, including seamounts and hydrothermal vents. [73] Today, the five ocean
-```
+# Run the ERNIE-4.5-300B-A47B & ERNIE-4.5-21B-A3B models on an Iluvatar machine
+The current release is a demonstration of running large models with the FastDeploy inference framework on Iluvatar CoreX hardware. Issues may occur when running the latest ERNIE 4.5 models; fixes and performance optimizations will follow, and subsequent releases will provide a more stable version.
+
+## Machine Preparation
+First, you need to prepare a machine with the following configurations:
+
+| CPU | Memory | Card | Hard Disk|
+| :---: | :---: | :---: | :---: |
+| x86 | 1TB| 8xBI150| 1TB|
+
+Currently, the entire model must be loaded into host memory, which requires more than 600GB. This will be optimized in subsequent versions.
+
+## Image Preparation
+Pull the Docker image
+
+```bash
+docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
+```
+
+## Container Preparation
+1. Start the container
+
+```bash
+docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
+docker exec -it paddle_infer bash
+```
+
+/home/paddle contains the model files, *.whl packages, and scripts.
+
+2. Install packages
+
+```bash
+pip3 install paddlepaddle==3.1.0a0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
+pip3 install paddle-iluvatar-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/
+pip3 install fastdeploy_iluvatar_gpu -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+```
+
+## Prepare the inference demo script
+
+The scripts are listed below:
+
+`run_demo.sh`:
+
+```bash
+#!/bin/bash
+export PADDLE_XCCL_BACKEND=iluvatar_gpu
+export INFERENCE_MSG_QUEUE_ID=232132
+export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
+export FD_DEBUG=1
+python3 run_demo.py
+```
+
+`run_demo.py`:
+
+```python
+from fastdeploy import LLM, SamplingParams
+
+prompts = [
+ "Hello, my name is",
+ "The largest ocean is",
+]
+
+# sampling parameters
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)
+
+# load the model
+llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, static_decode_blocks=0, quantization='wint8')
+
+# Perform batch inference
+outputs = llm.generate(prompts, sampling_params)
+# Note: Replace `/home/paddle/ernie-4_5-21b-a3b-bf16-paddle` with the path to the ERNIE model you downloaded.
+
+for output in outputs:
+ prompt = output.prompt
+ generated_text = output.outputs.text
+ print(prompt, generated_text)
+```
+
+## Run the demo
+
+```bash
+./run_demo.sh
+```
+
+Logs similar to the following will be printed. Loading the model takes approximately 74 seconds, and running the demo takes approximately 240 seconds.
+
+```
+/usr/local/lib/python3.10/site-packages/paddle/utils/cpp_extension/extension_utils.py:715: UserWarning: No ccache found. Please be aware that recompiling all source files may be required. You can download and install ccache from: https://github.com/ccache/ccache/blob/master/doc/INSTALL.md
+ warnings.warn(warning_message)
+/usr/local/lib/python3.10/site-packages/_distutils_hack/__init__.py:31: UserWarning: Setuptools is replacing distutils. Support for replacing an already imported distutils is deprecated. In the future, this condition will fail. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml
+ warnings.warn(
+[2025-07-02 11:07:42,393] [ INFO] - Loading configuration file /home/paddle/ernie-4_5-21b-a3b-bf16-paddle/generation_config.json
+/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:250: UserWarning: using greedy search strategy. However, `temperature` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or unset `temperature`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.
+ warnings.warn(
+/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:255: UserWarning: using greedy search strategy. However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or unset `top_p`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.
+ warnings.warn(
+INFO 2025-07-02 11:07:43,589 577964 engine.py[line:207] Waitting worker processes ready...
+Loading Weights: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:57<00:00, 1.75it/s]
+Loading Layers: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:08<00:00, 11.73it/s]
+INFO 2025-07-02 11:08:55,261 577964 engine.py[line:277] Worker processes are launched with 73.76574492454529 seconds.
+Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [03:59<00:00, 119.96s/it, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
+Hello, my name is Christopher. Today, I'm going to teach you how to draw a cute cartoon ghost. Let's get started!
+ (1) First, draw a big circle for the ghost's head.
+ (2) Then, add two small circles for the eyes, making sure they're not too big.
+ (3) Next, draw a wide, open mouth that looks like a big "U".
+ (4) After that, create the body by drawing a slightly smaller circle below the head.
+ (5) Now, let's add some arms. Draw two short, curly lines on each side of the body.
+ (6) Finally, give the ghost a wavy line at the bottom to represent its floating appearance.
+
+Now, let's break down each step:
+
+**Step 1: Drawing the Head**
+- Start with a big circle to form the head of the ghost. This will be the foundation of your drawing.
+
+**Step 2: Adding Eyes**
+- On the head, place two small circles for the eyes. They should be centered and not too big, to give the ghost a cute and innocent look.
+
+**Step 3: Drawing the
+The largest ocean is the Pacific Ocean, covering an area of approximately … [3], The first scientific expeditions to determine the ocean's depth were the Challenger expedition (1872–1876) and the U.S. Navy Hydrographic Office survey (1877–1879). The oceanic crust is thin and irregular, consisting of upward moving magma from the mantle below, and cooling and solidifying on the surface. The shallowest parts of the ocean are called the continental shelves. Large tides are caused mainly by the alignment of the Sun, Moon, and Earth during new or full moons. The origin of the word "ocean" is not clear. The first global oceanic topography survey was completed by the Challenger expedition (1872–1876). [57] The sound speed in the ocean is primarily a function of water temperature and salinity, and varies with depth. The deep-ocean floor is mostly flat and devoid of life, with the exception of seamounts and various underwater volcanic features, including seamounts and hydrothermal vents. [73] Today, the five ocean
+```
diff --git a/docs/get_started/quick_start.md b/docs/get_started/quick_start.md
index 5368941a3..a9d2331ee 100644
--- a/docs/get_started/quick_start.md
+++ b/docs/get_started/quick_start.md
@@ -25,9 +25,9 @@ python -m fastdeploy.entrypoints.openai.api_server \
--max-num-seqs 32
```
-> 💡 Note: In the path specified by ```--model```, if the subdirectory corresponding to the path does not exist in the current directory, it will try to query whether AIStudio has a preset model based on the specified model name (such as ```baidu/ERNIE-4.5-0.3B-Paddle```). If it exists, it will automatically start downloading. The default download path is: ```~/xx```. For instructions and configuration on automatic model download, see [Model Download](../supported_models.md).
-```--max-model-len``` indicates the maximum number of tokens supported by the currently deployed service.
-```--max-num-seqs``` indicates the maximum number of concurrent processing supported by the currently deployed service.
+> 💡 Note: In the path specified by ```--model```, if the subdirectory corresponding to the path does not exist in the current directory, it will try to query whether AIStudio has a preset model based on the specified model name (such as ```baidu/ERNIE-4.5-0.3B-Paddle```). If it exists, it will automatically start downloading. The default download path is: ```~/xx```. For instructions and configuration on automatic model download, see [Model Download](../supported_models.md).
+```--max-model-len``` indicates the maximum number of tokens supported by the currently deployed service.
+```--max-num-seqs``` indicates the maximum number of concurrent requests (sequences) supported by the currently deployed service.
**Related Documents**
- [Service Deployment](../online_serving/README.md)
diff --git a/docs/get_started/quick_start_vl.md b/docs/get_started/quick_start_vl.md
index acd805a11..82bc609b1 100644
--- a/docs/get_started/quick_start_vl.md
+++ b/docs/get_started/quick_start_vl.md
@@ -30,10 +30,10 @@ python -m fastdeploy.entrypoints.openai.api_server \
--enable-mm
```
-> 💡 Note: In the path specified by ```--model```, if the subdirectory corresponding to the path does not exist in the current directory, it will try to query whether AIStudio has a preset model based on the specified model name (such as ```baidu/ERNIE-4.5-0.3B-Base-Paddle```). If it exists, it will automatically start downloading. The default download path is: ```~/xx```. For instructions and configuration on automatic model download, see [Model Download](../supported_models.md).
-```--max-model-len``` indicates the maximum number of tokens supported by the currently deployed service.
-```--max-num-seqs``` indicates the maximum number of concurrent processing supported by the currently deployed service.
-```--reasoning-parser``` specifies the thinking content parser.
+> 💡 Note: In the path specified by ```--model```, if the subdirectory corresponding to the path does not exist in the current directory, it will try to query whether AIStudio has a preset model based on the specified model name (such as ```baidu/ERNIE-4.5-0.3B-Base-Paddle```). If it exists, it will automatically start downloading. The default download path is: ```~/xx```. For instructions and configuration on automatic model download, see [Model Download](../supported_models.md).
+```--max-model-len``` indicates the maximum number of tokens supported by the currently deployed service.
+```--max-num-seqs``` indicates the maximum number of concurrent requests (sequences) supported by the currently deployed service.
+```--reasoning-parser``` specifies the thinking content parser.
```--enable-mm``` indicates whether to enable multi-modal support.
**Related Documents**
diff --git a/docs/online_serving/README.md b/docs/online_serving/README.md
index 2e5ff98fb..8062fe76a 100644
--- a/docs/online_serving/README.md
+++ b/docs/online_serving/README.md
@@ -36,6 +36,7 @@ curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
]
}'
```
+
Here's an example curl command demonstrating how to include the logprobs parameter in a user request:
```bash
@@ -49,6 +50,7 @@ curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
```
Here is an example of sending a user request using a Python script:
+
```python
import openai
host = "0.0.0.0"
@@ -87,10 +89,10 @@ The differences in request parameters between FastDeploy and the OpenAI protocol
- `temperature`: Optional[float] = None
- `top_p`: Optional[float] = None
- `metadata`: Optional[dict] = None (supported only in `v1/chat/completions` for configuring additional parameters, e.g., `metadata={"enable_thinking": True}`)
- - `min_tokens`: Optional[int] = 1 (minimum number of tokens generated)
- - `reasoning_max_tokens`: Optional[int] = None (maximum number of tokens for reasoning content, defaults to the same as `max_tokens`)
- - `enable_thinking`: Optional[bool] = True (whether to enable reasoning for models that support deep thinking)
- - `repetition_penalty`: Optional[float] = None (coefficient for directly penalizing repeated token generation (>1 penalizes repetition, <1 encourages repetition))
+ - `min_tokens`: Optional[int] = 1 (minimum number of tokens generated)
+ - `reasoning_max_tokens`: Optional[int] = None (maximum number of tokens for reasoning content, defaults to the same as `max_tokens`)
+ - `enable_thinking`: Optional[bool] = True (whether to enable reasoning for models that support deep thinking)
+ - `repetition_penalty`: Optional[float] = None (coefficient for directly penalizing repeated token generation (>1 penalizes repetition, <1 encourages repetition))
> Note: For multimodal models, since the reasoning chain is enabled by default, resulting in overly long outputs, `max_tokens` can be set to the model's maximum output length or the default value can be used.
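+
+As an illustration of the FastDeploy-specific parameters listed above, here is a minimal Python sketch. It assumes the standard OpenAI Python client and uses its `extra_body` argument to forward the non-OpenAI fields; the host, port, and placeholder model/key values are examples rather than prescribed values.
+
+```python
+from openai import OpenAI
+
+# Placeholder endpoint and credentials; adjust to your deployment.
+client = OpenAI(base_url="http://0.0.0.0:8188/v1", api_key="null")
+
+response = client.chat.completions.create(
+    model="default",
+    messages=[{"role": "user", "content": "Write a short poem about the sea."}],
+    temperature=0.8,
+    max_tokens=256,
+    # FastDeploy-specific fields go into the request body via extra_body.
+    extra_body={
+        "metadata": {"enable_thinking": True},
+        "min_tokens": 1,
+        "repetition_penalty": 1.05,
+    },
+)
+print(response.choices[0].message.content)
+```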
diff --git a/docs/online_serving/metrics.md b/docs/online_serving/metrics.md
index 6eee4f47d..c5c16ee81 100644
--- a/docs/online_serving/metrics.md
+++ b/docs/online_serving/metrics.md
@@ -24,4 +24,4 @@ After FastDeploy is launched, it supports continuous monitoring of the FastDeplo
## Accessing Metrics
- Access URL: `http://localhost:8000/metrics`
-- Metric Type: Prometheus format
\ No newline at end of file
+- Metric Type: Prometheus format
diff --git a/docs/online_serving/scheduler.md b/docs/online_serving/scheduler.md
index f985de05a..8ce9fa4cd 100644
--- a/docs/online_serving/scheduler.md
+++ b/docs/online_serving/scheduler.md
@@ -11,9 +11,9 @@ The Local Scheduler functions similarly to a memory manager, performing eviction
The Global Scheduler is implemented using Redis. Each node actively steals tasks from others when its GPU is idle, then pushes the execution results back to the originating node.
### PD-Separated Scheduler
-Building upon the Global Scheduler, FastDeploy introduces the **PD-Separated Scheduling Strategy**, specifically optimized for large language model inference scenarios. It decouples the inference pipeline into two distinct phases:
-- **Prefill Phase**: Builds KV cache, which is compute-intensive with high memory usage but low latency.
-- **Decode Phase**: Performs autoregressive decoding, which is sequential and time-consuming but requires less memory.
+Building upon the Global Scheduler, FastDeploy introduces the **PD-Separated Scheduling Strategy**, specifically optimized for large language model inference scenarios. It decouples the inference pipeline into two distinct phases:
+- **Prefill Phase**: Builds KV cache, which is compute-intensive with high memory usage but low latency.
+- **Decode Phase**: Performs autoregressive decoding, which is sequential and time-consuming but requires less memory.
By separating roles (prefill nodes handle request processing while decode nodes manage generation), this strategy enables finer-grained resource allocation, improving throughput and GPU utilization.
@@ -36,4 +36,4 @@ By separating roles (prefill nodes handle request processing while decode nodes
| scheduler_reader_parallel | int | No | 4 | splitwise | Number of output reader threads |
| scheduler_writer_parallel | int | No | 4 | splitwise | Number of writer threads |
| scheduler_reader_batch_size | int | No | 200 | splitwise | Batch size for fetching results from Redis |
-| scheduler_writer_batch_size | int | No | 200 | splitwise | Batch size for writing results to Redis |
\ No newline at end of file
+| scheduler_writer_batch_size | int | No | 200 | splitwise | Batch size for writing results to Redis |
diff --git a/docs/parameters.md b/docs/parameters.md
index 8c9e3cbee..c52fc9ac6 100644
--- a/docs/parameters.md
+++ b/docs/parameters.md
@@ -45,7 +45,6 @@ When using FastDeploy to deploy models (including offline inference and service
| ```enable_expert_parallel``` | `bool` | Whether to enable expert parallel |
| ```enable_logprob``` | `bool` | Whether to enable return log probabilities of the output tokens or not. If true, returns the log probabilities of each output token returned in the content of message.If logrpob is not used, this parameter can be omitted when starting |
-
## 1. Relationship between KVCache allocation, ```num_gpu_blocks_override``` and ```block_size```?
During FastDeploy inference, GPU memory is occupied by ```model weights```, ```preallocated KVCache blocks``` and ```model computation intermediate activation values```. The preallocated KVCache blocks are determined by ```num_gpu_blocks_override```, with ```block_size``` (default: 64) as its unit, meaning one block can store KVCache for 64 Tokens.
@@ -55,14 +54,14 @@ In actual inference, it's difficult for users to know how to properly configure
- Load the model, after completing model loading, record current memory usage ```total_memory_after_load``` and FastDeploy framework memory usage ```fd_memory_after_load```; note the former is actual GPU memory usage (may include other processes), the latter is memory used by FD framework itself;
- According to user-configured ```max_num_batched_tokens``` (default: ```max_model_len```), perform fake prefill computation with corresponding length input data, record current maximum FastDeploy framework memory allocation ```fd_memory_after_prefill```, thus ```model computation intermediate activation values``` can be considered as ```fd_memory_after_prefill - fd_memory_after_load```;
- - At this point, available GPU memory for KVCache allocation (taking A800 80G as example) is ```80GB * gpu_memory_utilization - total_memory_after_load - (fd_memory_after_prefill - fd_memory_after_load)```
- - Based on model KVCache precision (e.g. 8bit/16bit), calculate memory size per block, then calculate total allocatable blocks, assign to ```num_gpu_blocks_override```
+ - At this point, available GPU memory for KVCache allocation (taking A800 80G as example) is ```80GB * gpu_memory_utilization - total_memory_after_load - (fd_memory_after_prefill - fd_memory_after_load)```
+ - Based on model KVCache precision (e.g. 8bit/16bit), calculate memory size per block, then calculate total allocatable blocks, assign to ```num_gpu_blocks_override```
> In service startup logs, we can find ```Reset block num, the total_block_num:17220, prefill_kvcache_block_num:12915``` in log/fastdeploy.log, where ```total_block_num``` is the automatically calculated KVCache block count, multiply by ```block_size``` to get total cacheable Tokens.
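+
+As a hedged, worked example of the memory formula above (all numbers are assumed for illustration; the per-block byte size depends on the model's dimensions and KVCache precision):
+
+```python
+# Hypothetical sizing sketch for num_gpu_blocks_override; every value below is assumed.
+GiB = 1024**3
+gpu_total = 80 * GiB                  # A800 80GB card
+gpu_memory_utilization = 0.9
+total_memory_after_load = 50 * GiB    # GPU memory in use after the model weights are loaded
+fd_memory_after_load = 48 * GiB       # memory used by the FastDeploy framework itself
+fd_memory_after_prefill = 56 * GiB    # peak framework memory after the fake prefill
+
+activations = fd_memory_after_prefill - fd_memory_after_load
+kv_cache_memory = gpu_total * gpu_memory_utilization - total_memory_after_load - activations
+
+# One block caches KVCache for block_size (64) tokens; its byte size depends on the model's
+# layer/head dimensions and KVCache precision (8bit/16bit). Assume 2 MiB per block here.
+bytes_per_block = 2 * 1024**2
+num_gpu_blocks_override = int(kv_cache_memory // bytes_per_block)
+print(num_gpu_blocks_override)  # 7168 with these assumed numbers
+```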
## 2. Relationship between ```kv_cache_ratio```, ```block_size``` and ```max_num_seqs```?
- - FastDeploy divides KVCache between Prefill and Decode phases according to ```kv_cache_ratio```. When configuring this parameter, you can use ```kv_cache_ratio = average input Tokens / (average input + average output Tokens)```. Typically input is 3x output, so can be configured as 0.75.
- - ```max_num_seqs``` is the maximum concurrency in Decode phase, generally can be set to maximum 128, but users can also configure based on KVCache situation, e.g. output KVCache Token amount is ```decode_token_cache = total_block_num * (1 - kv_cache_ratio) * block_size```, to prevent extreme OOM situations, can configure ```max_num_seqs = decode_token_cache / average output Tokens```, not exceeding 128.
+- FastDeploy divides KVCache between the Prefill and Decode phases according to ```kv_cache_ratio```. When configuring this parameter, you can use ```kv_cache_ratio = average input Tokens / (average input + average output Tokens)```. Input is typically about 3x the output, so it can be set to 0.75.
+- ```max_num_seqs``` is the maximum concurrency in the Decode phase and can generally be set to the maximum of 128. Users can also derive it from the available KVCache: the Decode-phase token budget is ```decode_token_cache = total_block_num * (1 - kv_cache_ratio) * block_size```, so to avoid extreme OOM situations you can set ```max_num_seqs = decode_token_cache / average output Tokens```, capped at 128 (see the sketch below).
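+
+A minimal sketch applying the two formulas above, using the block count from the startup-log example and an assumed average output length:
+
+```python
+# total_block_num comes from the startup log example; the output length is assumed.
+total_block_num = 17220     # reported as total_block_num in log/fastdeploy.log
+block_size = 64             # tokens cached per block (default)
+kv_cache_ratio = 0.75       # input tokens are roughly 3x the output tokens
+avg_output_tokens = 1024    # assumed average output length
+
+decode_token_cache = total_block_num * (1 - kv_cache_ratio) * block_size
+max_num_seqs = min(128, int(decode_token_cache / avg_output_tokens))
+print(int(decode_token_cache), max_num_seqs)  # 275520 128
+```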
## 3. ```enable_chunked_prefill``` parameter description
@@ -74,24 +73,24 @@ To optimize scheduling priority for short requests, new `max_long_partial_prefil
Currently, only user configuration of the following parameters is supported:
- `use_cudagraph` : bool = False
- `graph_optimization_config` : Dict[str, Any]
- - `graph_opt_level`: int = 0
- - `use_cudagraph`: bool = False
- - `cudagraph_capture_sizes` : List[int] = None
+ - `graph_opt_level`: int = 0
+ - `use_cudagraph`: bool = False
+ - `cudagraph_capture_sizes` : List[int] = None
CudaGrpah can be enabled by setting `--use-cudagraph` or `--graph-optimization-config '{"use_cudagraph":true}'`. Using two different methods to set the use graph simultaneously may cause conflicts.
-
The `graph_opt_level` parameter within `--graph-optimization-config` is used to configure the graph optimization level, with the following available options:
- `0`: Use Dynamic compute graph, default to 0
- `1`: Use Static compute graph, during the initialization phase, Paddle API will be used to convert the dynamic image into a static image
- `2`: Base on Static compute graph, use the complier(CINN, Compiler Infrastructure for Neural Networks) of Paddle to compile and optimize
In general, static graphs have lower Kernel Launch overhead than dynamic graphs, and it is recommended to use static graphs.
-For adapted models, FastDeploy's CudaGraph * * can support both dynamic and static graphs * * simultaneously.
+For adapted models, FastDeploy's CudaGraph **can support both dynamic and static graphs** simultaneously.
When CudaGraph is enabled in the default configuration, a list of Batch Sizes that CudaGraph needs to capture will be automatically set based on the 'max_num_deqs' parameter. The logic for generating the list of Batch Sizes that need to be captured is as follows:
1. Generate a candidate list with a range of [1,1024] Batch Size.
+
```
# Batch Size [1, 2, 4, 8, 16, ... 120, 128]
candidate_capture_sizes = [1, 2, 4] + [8 * i for i in range(1, 17)]
@@ -100,24 +99,25 @@ When CudaGraph is enabled in the default configuration, a list of Batch Sizes th
# Batch Size (256, 288, ... 992, 1024]
candidate_capture_sizes += [32 * i for i in range(17, 33)]
```
+
+2. Crop the candidate list based on the user-set `max_num_seqs` to obtain the CudaGraph capture list with a range of [1, `max_num_seqs`], as sketched below.
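+
+A minimal sketch of the cropping in step 2, assuming the candidate list built in step 1 and an example user-configured `max_num_seqs`:
+
+```python
+# Illustrative subset of the step-1 candidate list (Batch Size 1..128).
+candidate_capture_sizes = [1, 2, 4] + [8 * i for i in range(1, 17)]
+
+max_num_seqs = 64  # user-configured maximum concurrency (assumed)
+
+# Step 2: keep only batch sizes within [1, max_num_seqs].
+cudagraph_capture_sizes = [bs for bs in candidate_capture_sizes if bs <= max_num_seqs]
+print(cudagraph_capture_sizes)  # [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64]
+```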
Users can also customize the batch size list that needs to be captured by CudaGraph through the parameter `cudagraph_capture_sizes` in`--graph-optimization-config`:
+
```
--graph-optimization-config '{"cudagraph_capture_sizes": [1, 3, 5, 7, 9]}'
```
-
### CudaGraph related parameters
Using CudaGraph incurs some additional memory overhead, divided into two categories in FastDeploy:
-* Additional input Buffer overhead
-* CudaGraph uses dedicated memory pool, thus holding some intermediate activation memory isolated from main framework
+- Additional input buffer overhead
+- CudaGraph uses a dedicated memory pool and therefore holds some intermediate activation memory in isolation from the main framework
FastDeploy initialization sequence first uses `gpu_memory_utilization` parameter to calculate available memory for `KVCache`, after initializing `KVCache` then uses remaining memory to initialize CudaGraph. Since CudaGraph is not enabled by default currently, using default startup parameters may encounter `Out of memory` errors, can try following solutions:
-* Lower `gpu_memory_utilization` value, reserve more memory for CudaGraph.
-* Lower `max_num_seqs` to decrease the maximum concurrency.
-* Customize the batch size list that CudaGraph needs to capture through `graph_optimization_config`, and reduce the number of captured graphs by using `cudagraph_capture_sizes`
+- Lower the `gpu_memory_utilization` value to reserve more memory for CudaGraph.
+- Lower `max_num_seqs` to decrease the maximum concurrency.
+- Customize the list of batch sizes that CudaGraph needs to capture through `graph_optimization_config`, and reduce the number of captured graphs by using `cudagraph_capture_sizes`.
- Before use, must ensure loaded model is properly decorated with ```@support_graph_optimization```.
@@ -148,5 +148,6 @@ FastDeploy initialization sequence first uses `gpu_memory_utilization` parameter
class Ernie45TModel(nn.Layer): # Note decorator is added to nn.Layer subclass
...
```
+
- When ```use_cudagraph``` is enabled, currently only supports single-GPU inference, i.e. ```tensor_parallel_size``` set to 1.
- When ```use_cudagraph``` is enabled, cannot enable ```enable_prefix_caching``` or ```enable_chunked_prefill```.
diff --git a/docs/quantization/README.md b/docs/quantization/README.md
index 96cb6c684..d564223b1 100644
--- a/docs/quantization/README.md
+++ b/docs/quantization/README.md
@@ -24,7 +24,7 @@ FastDeploy supports various quantization inference precisions including FP8, INT
## 2. Model Support List
-| Model Name | Supported Quantization Precision |
+| Model Name | Supported Quantization Precision |
|---------|---------|
| ERNIE-4.5-300B-A47B | WINT8, WINT4, Block-wise FP8, MixQuant|
@@ -43,4 +43,4 @@ Examples:
- **W4A16C16 / WInt4 / weight-only int4**: 4 defaults to INT4
- **WNF4A8C8**: NF4 refers to 4bits norm-float numerical type
- **Wfp8Afp8**: Both weights and activations are FP8 precision
-- **W4Afp8**: Weights are INT4, activations are FP8
+- **W4Afp8**: Weights are INT4, activations are FP8
diff --git a/docs/quantization/online_quantization.md b/docs/quantization/online_quantization.md
index 3e3f24df9..bf8b9a536 100644
--- a/docs/quantization/online_quantization.md
+++ b/docs/quantization/online_quantization.md
@@ -24,7 +24,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
- By specifying `--model baidu/ERNIE-4.5-300B-A47B-Paddle`, the model can be automatically downloaded from AIStudio. FastDeploy depends on Paddle format models. For more information, please refer to [Supported Model List](../supported_models.md).
- By setting `--quantization` to `wint8` or `wint4`, online INT8/INT4 quantization can be selected.
-- Deploying ERNIE-4.5-300B-A47B-Paddle WINT8 requires at least 80G * 8 cards, while WINT4 requires 80GB * 4 cards.
+- Deploying ERNIE-4.5-300B-A47B-Paddle WINT8 requires at least 8 x 80GB cards, while WINT4 requires 4 x 80GB cards.
- For more deployment tutorials, please refer to [get_started](../get_started/ernie-4.5.md).
## 2. Block-wise FP8
@@ -51,4 +51,4 @@ python -m fastdeploy.entrypoints.openai.api_server \
- By specifying `--model baidu/ERNIE-4.5-300B-A47B-Paddle`, the model can be automatically downloaded from AIStudio. FastDeploy depends on Paddle format models. For more information, please refer to [Supported Model List](../supported_models.md).
- By setting `--quantization` to `block_wise_fp8`, online Block-wise FP8 quantization can be selected.
- Deploying ERNIE-4.5-300B-A47B-Paddle Block-wise FP8 requires at least 80G * 8 cards.
-- For more deployment tutorials, please refer to [get_started](../get_started/ernie-4.5.md)
+- For more deployment tutorials, please refer to [get_started](../get_started/ernie-4.5.md)
diff --git a/docs/quantization/wint2.md b/docs/quantization/wint2.md
index cbec8aff5..82dd60609 100644
--- a/docs/quantization/wint2.md
+++ b/docs/quantization/wint2.md
@@ -59,4 +59,4 @@ On the ERNIE-4.5-300B-A47B model, comparison of WINT2 vs WINT4 performance:
|DROP|9536|91.17|89.97|
|GSM8K|1319|96.21|95.98|
|CMath|600|96.50|96.00|
-|CMMLU|11477|89.92|86.22|
\ No newline at end of file
+|CMMLU|11477|89.92|86.22|
diff --git a/docs/usage/code_overview.md b/docs/usage/code_overview.md
index fb8e70615..506a51680 100644
--- a/docs/usage/code_overview.md
+++ b/docs/usage/code_overview.md
@@ -22,4 +22,4 @@ Below is an overview of the FastDeploy code structure and functionality organize
- ```metrics```: Core component for collecting, managing, and exporting Prometheus metrics, tracking key runtime performance data (e.g., request latency, resource utilization, successful request counts).
- ```splitwise```: Modules related to PD disaggragation deployment.
- ```scripts```/```tools```: Utility scripts for FastDeploy operations (e.g., compilation, unit testing, code style fixes).
-- ```test```: Code for unit testing and validation.
\ No newline at end of file
+- ```test```: Code for unit testing and validation.
diff --git a/docs/usage/log.md b/docs/usage/log.md
index 7afa9bf6c..60e658a5b 100644
--- a/docs/usage/log.md
+++ b/docs/usage/log.md
@@ -1,6 +1,6 @@
# Log Description
-FastDeploy generates the following log files during deployment. Below is an explanation of each log's purpose.
+FastDeploy generates the following log files during deployment. Below is an explanation of each log's purpose.
By default, logs are stored in the `log` directory under the execution path. To specify a custom directory, set the environment variable `FD_LOG_DIR`.
## Inference Service Logs
diff --git a/docs/zh/features/disaggregated.md b/docs/zh/features/disaggregated.md
index c23cd75dd..ac895639c 100644
--- a/docs/zh/features/disaggregated.md
+++ b/docs/zh/features/disaggregated.md
@@ -25,13 +25,10 @@
多实例情况下,每收到一条请求需要根据不同的策略将请求分配到不同的Prefill实例和Decode实例。通过角色分离(prefill 节点负责接收并处理请求,decode节点完成后续生成),可以更细粒度地控制资源分配、提高吞吐量与 GPU 利用率。
-
## 使用说明
-
### 单机分离式部署
-
#### 在线推理服务
使用如下命令进行服务部署
@@ -63,7 +60,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
--cache-queue-port 8187 \
--tensor-parallel-size 4 \
--quantization wint4 \
- --innode-prefill-ports 8182 \
+ --innode-prefill-ports 8182 \
--splitwise-role "decode"
```
@@ -75,9 +72,9 @@ python -m fastdeploy.entrypoints.openai.api_server \
### 多机分离式部署
-
#### 前置依赖 Redis
-- 使用`conda`安装
+* 使用`conda`安装
+
```bash
# 安装
conda install redis
@@ -85,7 +82,8 @@ conda install redis
nohup redis-server > redis.log 2>&1 &
```
-- 使用`apt`安装
+* 使用`apt`安装
+
```bash
# 安装
sudo apt install redis-server -y
@@ -93,7 +91,8 @@ sudo apt install redis-server -y
sudo systemctl start redis-server
```
-- 使用`yum`安装
+* 使用`yum`安装
+
```bash
# 安装
sudo yum install redis -y
diff --git a/docs/zh/features/load_balance.md b/docs/zh/features/load_balance.md
index 6626269f6..7e1bab1df 100644
--- a/docs/zh/features/load_balance.md
+++ b/docs/zh/features/load_balance.md
@@ -23,6 +23,7 @@
### 前置依赖 Redis
- 使用`conda`安装
+
```bash
# 安装
conda install redis
@@ -31,6 +32,7 @@ nohup redis-server > redis.log 2>&1 &
```
- 使用`apt`安装
+
```bash
# 安装
sudo apt install redis-server -y
@@ -39,6 +41,7 @@ sudo systemctl start redis-server
```
- 使用`yum`安装
+
```bash
# 安装
sudo yum install redis -y
@@ -47,6 +50,7 @@ sudo systemctl start redis
```
### 启动FastDeploy
+
```bash
python -m fastdeploy.entrypoints.openai.api_server \
--port 8801 \
@@ -62,6 +66,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
--scheduler-min-load_score 3 \
--scheduler-load-shards-num 1
```
+
[启动参数说明](../online_serving/scheduler.md)
可以将上述启动命令在多个机器执行,启动多个推理实例(如果是在一个机器中启动多个推理实例,注意端口不要冲突)。
diff --git a/docs/zh/features/prefix_caching.md b/docs/zh/features/prefix_caching.md
index 3eff20b63..b6020483f 100644
--- a/docs/zh/features/prefix_caching.md
+++ b/docs/zh/features/prefix_caching.md
@@ -8,7 +8,6 @@ Prefix Caching(前缀缓存)是一种优化生成式模型推理效率的技
增量计算:对于后续请求,只需计算新增部分(如用户追加的输入)并复用缓存的中间结果,显著减少计算量。
-
## 服务化部署开启 Prefix Caching
启动服务增加以下参数 `enable-prefix-caching`,默认只开启一级缓存(GPU 缓存)。
@@ -37,4 +36,4 @@ python -m fastdeploy.entrypoints.openai.api_server \
FastDeploy 启动时设置 `enable_prefix_caching=True`,CPU Cache 根据机器内存选择开启 `swap_space`。
-提供了测试示例 `demo/offline_prefix_caching_demo.py`。
\ No newline at end of file
+提供了测试示例 `demo/offline_prefix_caching_demo.py`。
diff --git a/docs/zh/features/reasoning_output.md b/docs/zh/features/reasoning_output.md
index 1090facb6..5417f66d7 100644
--- a/docs/zh/features/reasoning_output.md
+++ b/docs/zh/features/reasoning_output.md
@@ -8,7 +8,7 @@
| baidu/ERNIE-4.5-VL-424B-A47B-Paddle | ernie-45-vl | ✓ |
| baidu/ERNIE-4.5-VL-28B-A3B-Paddle | ernie-45-vl | ✓ |
-思考模型需要指定解析器,以便于对思考内容进行解析. 通过`enable_thinking=False` 参数可以关闭模型思考模式.
+思考模型需要指定解析器,以便于对思考内容进行解析. 通过`enable_thinking=False` 参数可以关闭模型思考模式.
可以支持思考模式开关的接口:
1. OpenAI 服务中 `/v1/chat/completions` 请求.
@@ -17,10 +17,10 @@
同时在思考模型中,支持通过```reasoning_max_tokens```控制思考内容的长度,在请求中添加```metadata={"reasoning_max_tokens": 1024}```即可。
-
-### 快速使用
+## 快速使用
在启动模型服务时, 通过`--reasoning-parser`参数指定解析器名称.
该解析器会解析思考模型的输出, 提取`reasoning_content`字段.
+
```bash
python -m fastdeploy.entrypoints.openai.api_server \
--model /path/to/your/model \
@@ -30,7 +30,9 @@ python -m fastdeploy.entrypoints.openai.api_server \
--quantization wint4 \
--reasoning-parser ernie-45-vl
```
+
接下来, 向模型发送 `chat completion` 请求
+
```bash
curl -X POST "http://0.0.0.0:8192/v1/chat/completions" \
-H "Content-Type: application/json" \
@@ -45,10 +47,12 @@ curl -X POST "http://0.0.0.0:8192/v1/chat/completions" \
}'
```
+
字段`reasoning_content`包含得出最终结论的思考步骤,而`content`字段包含最终结论。
### 流式会话
在流式会话中, `reasoning_content`字段会可以在`chat completion response chunks`中的 `delta` 中获取
+
```python
from openai import OpenAI
# Set OpenAI's API key and API base to use vLLM's API server.
@@ -73,4 +77,3 @@ for chunk in chat_response:
print("\n")
```
-
diff --git a/docs/zh/features/speculative_decoding.md b/docs/zh/features/speculative_decoding.md
index 38cb02ad2..eb898e873 100644
--- a/docs/zh/features/speculative_decoding.md
+++ b/docs/zh/features/speculative_decoding.md
@@ -6,10 +6,10 @@
- **Ngram**
-- **MTP (Multi-Token Prediction)**
- - ✅ 已支持:TP 切分
- - ✅ 已支持:共享前缀
- - ✅ 已支持:单机 TP 切分 + PD 分离
+- **MTP (Multi-Token Prediction)**
+ - ✅ 已支持:TP 切分
+ - ✅ 已支持:共享前缀
+ - ✅ 已支持:单机 TP 切分 + PD 分离
- ⏳ 即将支持:EP + DP + PD 分离
- ⏳ 即将支持:兼容 Chunk Prefill
- ⏳ 即将支持:多层 MTP layer
@@ -18,10 +18,10 @@
### ⏳ 规划中
-- Draft Model
-- Eagle
-- Hydra
-- Medusa
+- Draft Model
+- Eagle
+- Hydra
+- Medusa
- ...
## ⚙️ 高效投机解码框架设计
@@ -40,7 +40,7 @@
## 🚀 使用 Multi-Token-Prediction(MTP) 解码
详见论文:[DeepSeek-V3](https://arxiv.org/pdf/2412.19437)
### TP 并行部署
-> 使用 4×H100,量化方式选择 WINT4
+> 使用 4×H100,量化方式选择 WINT4
> 配置文件:`benchmarks/yaml/eb45t-32k-wint4-mtp-h100-tp4.yaml`
```
@@ -50,13 +50,15 @@ python -m fastdeploy.entrypoints.openai.api_server \
--config ${path_to_FastDeploy}benchmarks/yaml/eb45t-32k-wint4-mtp-h100-tp4.yaml \
--speculative-config '{"method": "mtp", "num_speculative_tokens": 1, "model": "${path_to_mtp_model}"}'
```
+
### PD 分离式部署(1P1D)
-> 在8×H100上部署1P1D,P、D节点 分别使用 4×H100;量化方式选择 WINT4
-> 与常规 PD 分离部署一致,仅需替换配置文件并新增 speculative_config
+> 在8×H100上部署1P1D,P、D节点 分别使用 4×H100;量化方式选择 WINT4
+> 与常规 PD 分离部署一致,仅需替换配置文件并新增 speculative_config
详情请参考[PD分离式部署](./disaggregated.md)。
- P 节点(Prefill)
> 配置文件: `benchmarks/yaml/eb45t-32k-wint4-mtp-tp4-prefill.yaml`
+
```
export FD_LOG_DIR="log_prefill"
rm -rf ${FD_LOG_DIR}
@@ -80,9 +82,11 @@ python -m fastdeploy.entrypoints.openai.api_server \
--scheduler-password "scheduler_mtp" \
--speculative-config '{"method": "mtp", "num_speculative_tokens": 1, "model": ""${path_to_mtp_model}"}' &
```
+
- D 节点(Decode)
> 配置文件: `benchmarks/yaml/eb45t-32k-wint4-mtp-tp4-decode.yaml`
+
```
export FD_LOG_DIR="log_prefill"
rm -rf ${FD_LOG_DIR}
@@ -109,8 +113,9 @@ python -m fastdeploy.entrypoints.openai.api_server \
## 🧠 使用 Ngram 解码
该算法通过 n-gram 窗口从 prompt 和已生成的 Token 中进行匹配生成草稿 Token,适合输入和输出有很大 overlap 的场景,如代码续写、文档查询等。
-> 使用 4×H100;量化方式选择 WINT4
+> 使用 4×H100;量化方式选择 WINT4
> 配置文件:benchmarks/yaml/eb45t-32k-wint4-mtp-h100-tp4.yaml
+
```
python -m fastdeploy.entrypoints.openai.api_server \
--model ${path_to_main_model} \
diff --git a/docs/zh/get_started/installation/Enflame_gcu.md b/docs/zh/get_started/installation/Enflame_gcu.md
index f47212dc6..b71a97a8a 100644
--- a/docs/zh/get_started/installation/Enflame_gcu.md
+++ b/docs/zh/get_started/installation/Enflame_gcu.md
@@ -131,4 +131,3 @@ python -u bench_gsm8k.py --port 8188 --num-questions 1319 --num-shots 5 --parall
```json
{"task": "gsm8k", "backend": "paddlepaddle", "num_gpus": 1, "latency": 13446.01, "accuracy": 0.956, "num_requests": 1319, "other": {"num_questions": 1319, "parallel": 8}}
```
-
diff --git a/docs/zh/get_started/installation/README.md b/docs/zh/get_started/installation/README.md
index 014c092f5..80638604b 100644
--- a/docs/zh/get_started/installation/README.md
+++ b/docs/zh/get_started/installation/README.md
@@ -2,8 +2,8 @@
FastDeploy currently supports installation on the following hardware platforms:
-- [NVIDIA GPU Installation](nvidia_gpu.md)
+- [NVIDIA GPU Installation](nvidia_gpu.md)
- [Kunlunxin XPU Installation](kunlunxin_xpu.md)
- [Enflame S60 GCU Installation](Enflame_gcu.md)
- [Iluvatar GPU Installation](iluvatar_gpu.md)
-- [Hygon DCU Installation](hygon_dcu.md)
\ No newline at end of file
+- [Hygon DCU Installation](hygon_dcu.md)
diff --git a/docs/zh/get_started/installation/hygon_dcu.md b/docs/zh/get_started/installation/hygon_dcu.md
index 7408ec733..d9bdae0dd 100644
--- a/docs/zh/get_started/installation/hygon_dcu.md
+++ b/docs/zh/get_started/installation/hygon_dcu.md
@@ -37,6 +37,7 @@ image.sourcefind.cn:5000/dcu/admin/base/custom:fastdeploy2.0.0-kylinv10-dtk25.04
```
## 2. 启动服务
+
```bash
export FD_ATTENTION_BACKEND="BLOCK_ATTN"
python -m fastdeploy.entrypoints.openai.api_server \
@@ -47,7 +48,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
--gpu-memory-utilization=0.8
```
-#### 请求服务
+### 请求服务
您可以基于 OpenAI 协议,通过 curl 和 python 两种方式请求服务。
@@ -78,4 +79,4 @@ response = client.chat.completions.create(
stream=False,
)
print(response)
-```
\ No newline at end of file
+```
diff --git a/docs/zh/get_started/installation/iluvatar_gpu.md b/docs/zh/get_started/installation/iluvatar_gpu.md
index aa045c7bb..f1ab2b38d 100644
--- a/docs/zh/get_started/installation/iluvatar_gpu.md
+++ b/docs/zh/get_started/installation/iluvatar_gpu.md
@@ -1,115 +1,120 @@
-# 如何在天数机器上运行 ERNIE-4.5-300B-A47B-BF16 & ERNIE-4.5-21B-A3B
-当前版本软件只是作为天数芯片 + Fastdeploy 推理大模型的一个演示 demo,跑最新ERNIE4.5模型可能存在问题,后续进行修复和性能优化,给客户提供一个更稳定的版本。
-
-## 准备机器
-首先您需要准备以下配置的机器
-| CPU | 内存 | 天数 | 硬盘|
-|-----|------|-----|-----|
-| x86 | 1TB| 8xBI150| 1TB|
-
-目前需要将完整模型 load 到 host memory 中,需要需要大于 600GB 的 host memory,后续版本会优化。
-
-## 镜像
-从官网获取:
-
-```bash
-docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
-```
-
-## 准备容器
-1. 启动容器
-```bash
-docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
-docker exec -it paddle_infer bash
-```
-/home/paddle 为模型文件、whl包、脚本所在目录
-
-2. 安装whl包
-
-```bash
-pip3 install paddlepaddle==3.1.0a0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
-pip3 install paddle-iluvatar-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/
-pip3 install fastdeploy_iluvatar_gpu -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simplels
-```
-
-## 准备推理demo脚本
-推理 demo 路径:/home/paddle/scripts
-脚本内容如下
-
-`run_demo.sh`:
-```bash
-#!/bin/bash
-export PADDLE_XCCL_BACKEND=iluvatar_gpu
-export INFERENCE_MSG_QUEUE_ID=232132
-export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
-export FD_DEBUG=1
-python3 run_demo.py
-```
-
-run_demo.py
-
-
-```python
-from fastdeploy import LLM, SamplingParams
-
-prompts = [
- "Hello, my name is",
- "The largest ocean is",
-]
-
-# 采样参数
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)
-
-# 加载模型
-llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, static_decode_blocks=0, quantization='wint8')
-
-# 批量进行推理(llm内部基于资源情况进行请求排队、动态插入处理)
-outputs = llm.generate(prompts, sampling_params)
-# 注意将其中`/home/paddle/ernie-4_5-21b-a3b-bf16-paddle`替换为您下载的ERNIE模型的路径。
-# 输出结果
-for output in outputs:
- prompt = output.prompt
- generated_text = output.outputs.text
- print(prompt, generated_text)
-```
-
-## 运行demo
-执行
-```bash
-./run_demo.sh
-```
-会有如下 log 打印;load 模型耗时约74s,demo 运行约240s。
-```
-/usr/local/lib/python3.10/site-packages/paddle/utils/cpp_extension/extension_utils.py:715: UserWarning: No ccache found. Please be aware that recompiling all source files may be required. You can download and install ccache from: https://github.com/ccache/ccache/blob/master/doc/INSTALL.md
- warnings.warn(warning_message)
-/usr/local/lib/python3.10/site-packages/_distutils_hack/__init__.py:31: UserWarning: Setuptools is replacing distutils. Support for replacing an already imported distutils is deprecated. In the future, this condition will fail. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml
- warnings.warn(
-[2025-07-02 11:07:42,393] [ INFO] - Loading configuration file /home/paddle/ernie-4_5-21b-a3b-bf16-paddle/generation_config.json
-/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:250: UserWarning: using greedy search strategy. However, `temperature` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or unset `temperature`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.
- warnings.warn(
-/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:255: UserWarning: using greedy search strategy. However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or unset `top_p`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.
- warnings.warn(
-INFO 2025-07-02 11:07:43,589 577964 engine.py[line:207] Waitting worker processes ready...
-Loading Weights: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:57<00:00, 1.75it/s]
-Loading Layers: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:08<00:00, 11.73it/s]
-INFO 2025-07-02 11:08:55,261 577964 engine.py[line:277] Worker processes are launched with 73.76574492454529 seconds.
-Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [03:59<00:00, 119.96s/it, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
-Hello, my name is Christopher. Today, I'm going to teach you how to draw a cute cartoon ghost. Let's get started!
- (1) First, draw a big circle for the ghost's head.
- (2) Then, add two small circles for the eyes, making sure they're not too big.
- (3) Next, draw a wide, open mouth that looks like a big "U".
- (4) After that, create the body by drawing a slightly smaller circle below the head.
- (5) Now, let's add some arms. Draw two short, curly lines on each side of the body.
- (6) Finally, give the ghost a wavy line at the bottom to represent its floating appearance.
-
-Now, let's break down each step:
-
-**Step 1: Drawing the Head**
-- Start with a big circle to form the head of the ghost. This will be the foundation of your drawing.
-
-**Step 2: Adding Eyes**
-- On the head, place two small circles for the eyes. They should be centered and not too big, to give the ghost a cute and innocent look.
-
-**Step 3: Drawing the
-The largest ocean is the Pacific Ocean, covering an area of approximately ⦠[3], The first scientific expeditions to determine the ocean's depth were the Challenger expedition (1872â1876) and the U.S. Navy Hydrographic Office survey (1877â1879). The oceanic crust is thin and irregular, consisting of upward moving magma from the mantle below, and cooling and solidifying on the surface. The shallowest parts of the ocean are called the continental shelves. Large tides are caused mainly by the alignment of the Sun, Moon, and Earth during new or full moons. The origin of the word "ocean" is not clear. The first global oceanic topography survey was completed by the Challenger expedition (1872â1876). [57] The sound speed in the ocean is primarily a function of water temperature and salinity, and varies with depth. The deep-ocean floor is mostly flat and devoid of life, with the exception of seamounts and various underwater volcanic features, including seamounts and hydrothermal vents. [73] Today, the five ocean
-```
+# 如何在天数机器上运行 ERNIE-4.5-300B-A47B-BF16 & ERNIE-4.5-21B-A3B
+当前版本软件只是作为天数芯片 + Fastdeploy 推理大模型的一个演示 demo,跑最新ERNIE4.5模型可能存在问题,后续进行修复和性能优化,给客户提供一个更稳定的版本。
+
+## 准备机器
+首先您需要准备以下配置的机器
+| CPU | 内存 | 天数 | 硬盘|
+|-----|------|-----|-----|
+| x86 | 1TB| 8xBI150| 1TB|
+
+目前需要将完整模型 load 到 host memory 中,需要大于 600GB 的 host memory,后续版本会优化。
+
+## 镜像
+从官网获取:
+
+```bash
+docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
+```
+
+## 准备容器
+1. 启动容器
+
+```bash
+docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
+docker exec -it paddle_infer bash
+```
+
+/home/paddle 为模型文件、whl包、脚本所在目录
+
+2. 安装whl包
+
+```bash
+pip3 install paddlepaddle==3.1.0a0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
+pip3 install paddle-iluvatar-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/
+pip3 install fastdeploy_iluvatar_gpu -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
+```
+
+## 准备推理demo脚本
+推理 demo 路径:/home/paddle/scripts
+脚本内容如下
+
+`run_demo.sh`:
+
+```bash
+#!/bin/bash
+export PADDLE_XCCL_BACKEND=iluvatar_gpu
+export INFERENCE_MSG_QUEUE_ID=232132
+export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
+export FD_DEBUG=1
+python3 run_demo.py
+```
+
+`run_demo.py`:
+
+```python
+from fastdeploy import LLM, SamplingParams
+
+prompts = [
+ "Hello, my name is",
+ "The largest ocean is",
+]
+
+# 采样参数
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)
+
+# 加载模型
+llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, static_decode_blocks=0, quantization='wint8')
+
+# 批量进行推理(llm内部基于资源情况进行请求排队、动态插入处理)
+outputs = llm.generate(prompts, sampling_params)
+# 注意将其中`/home/paddle/ernie-4_5-21b-a3b-bf16-paddle`替换为您下载的ERNIE模型的路径。
+# 输出结果
+for output in outputs:
+ prompt = output.prompt
+ generated_text = output.outputs.text
+ print(prompt, generated_text)
+```
+
+## 运行demo
+执行
+
+```bash
+./run_demo.sh
+```
+
+会有如下 log 打印;load 模型耗时约74s,demo 运行约240s。
+
+```
+/usr/local/lib/python3.10/site-packages/paddle/utils/cpp_extension/extension_utils.py:715: UserWarning: No ccache found. Please be aware that recompiling all source files may be required. You can download and install ccache from: https://github.com/ccache/ccache/blob/master/doc/INSTALL.md
+ warnings.warn(warning_message)
+/usr/local/lib/python3.10/site-packages/_distutils_hack/__init__.py:31: UserWarning: Setuptools is replacing distutils. Support for replacing an already imported distutils is deprecated. In the future, this condition will fail. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml
+ warnings.warn(
+[2025-07-02 11:07:42,393] [ INFO] - Loading configuration file /home/paddle/ernie-4_5-21b-a3b-bf16-paddle/generation_config.json
+/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:250: UserWarning: using greedy search strategy. However, `temperature` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or unset `temperature`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.
+ warnings.warn(
+/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:255: UserWarning: using greedy search strategy. However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or unset `top_p`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.
+ warnings.warn(
+INFO 2025-07-02 11:07:43,589 577964 engine.py[line:207] Waitting worker processes ready...
+Loading Weights: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:57<00:00, 1.75it/s]
+Loading Layers: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:08<00:00, 11.73it/s]
+INFO 2025-07-02 11:08:55,261 577964 engine.py[line:277] Worker processes are launched with 73.76574492454529 seconds.
+Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [03:59<00:00, 119.96s/it, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
+Hello, my name is Christopher. Today, I'm going to teach you how to draw a cute cartoon ghost. Let's get started!
+ (1) First, draw a big circle for the ghost's head.
+ (2) Then, add two small circles for the eyes, making sure they're not too big.
+ (3) Next, draw a wide, open mouth that looks like a big "U".
+ (4) After that, create the body by drawing a slightly smaller circle below the head.
+ (5) Now, let's add some arms. Draw two short, curly lines on each side of the body.
+ (6) Finally, give the ghost a wavy line at the bottom to represent its floating appearance.
+
+Now, let's break down each step:
+
+**Step 1: Drawing the Head**
+- Start with a big circle to form the head of the ghost. This will be the foundation of your drawing.
+
+**Step 2: Adding Eyes**
+- On the head, place two small circles for the eyes. They should be centered and not too big, to give the ghost a cute and innocent look.
+
+**Step 3: Drawing the
+The largest ocean is the Pacific Ocean, covering an area of approximately … [3], The first scientific expeditions to determine the ocean's depth were the Challenger expedition (1872–1876) and the U.S. Navy Hydrographic Office survey (1877–1879). The oceanic crust is thin and irregular, consisting of upward moving magma from the mantle below, and cooling and solidifying on the surface. The shallowest parts of the ocean are called the continental shelves. Large tides are caused mainly by the alignment of the Sun, Moon, and Earth during new or full moons. The origin of the word "ocean" is not clear. The first global oceanic topography survey was completed by the Challenger expedition (1872–1876). [57] The sound speed in the ocean is primarily a function of water temperature and salinity, and varies with depth. The deep-ocean floor is mostly flat and devoid of life, with the exception of seamounts and various underwater volcanic features, including seamounts and hydrothermal vents. [73] Today, the five ocean
+```
diff --git a/docs/zh/get_started/installation/nvidia_gpu.md b/docs/zh/get_started/installation/nvidia_gpu.md
index 348e350b7..94c111fe1 100644
--- a/docs/zh/get_started/installation/nvidia_gpu.md
+++ b/docs/zh/get_started/installation/nvidia_gpu.md
@@ -21,6 +21,7 @@ docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12
## 2. 预编译Pip安装
首先安装 paddlepaddle-gpu,详细安装方式参考 [PaddlePaddle安装](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/develop/install/pip/linux-pip_en.html)
+
``` shell
python -m pip install paddlepaddle-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
```
@@ -28,6 +29,7 @@ python -m pip install paddlepaddle-gpu==3.1.0 -i https://www.paddlepaddle.org.cn
再安装 fastdeploy,**注意不要通过pypi源安装**,需要通过如下方式安装
如你的 GPU 是 SM80/90 架构(A100/H100等),按如下方式安装
+
```
# 安装稳定版本fastdeploy
python -m pip install fastdeploy-gpu -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-gpu-80_90/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
@@ -37,6 +39,7 @@ python -m pip install fastdeploy-gpu -i https://www.paddlepaddle.org.cn/packages
```
如你的 GPU 是 SM86/89 架构(4090/L20/L40等),按如下方式安装
+
```
# 安装稳定版本fastdeploy
python -m pip install fastdeploy-gpu -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-gpu-86_89/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
@@ -59,11 +62,13 @@ docker build -f dockerfiles/Dockerfile.gpu -t fastdeploy:gpu .
## 4. Wheel包源码编译
首先安装 paddlepaddle-gpu,详细安装方式参考 [PaddlePaddle安装](https://www.paddlepaddle.org.cn/)
+
``` shell
python -m pip install paddlepaddle-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
```
接着克隆源代码,编译安装
+
``` shell
git clone https://github.com/PaddlePaddle/FastDeploy
cd FastDeploy
@@ -74,11 +79,13 @@ cd FastDeploy
# 第4个参数: 编译的GPU架构
bash build.sh 1 python false [80,90]
```
+
编译后的产物在```FastDeploy/dist```目录下。
## 环境检查
在安装 FastDeploy 后,通过如下 Python 代码检查环境的可用性
+
``` python
import paddle
from paddle.jit.marker import unified
@@ -87,4 +94,5 @@ paddle.utils.run_check()
# 检查FastDeploy自定义算子编译成功与否
from fastdeploy.model_executor.ops.gpu import beam_search_softmax
```
+
如上代码执行成功,则认为环境可用。
diff --git a/docs/zh/get_started/quick_start.md b/docs/zh/get_started/quick_start.md
index 36ac0e855..46da9fa05 100644
--- a/docs/zh/get_started/quick_start.md
+++ b/docs/zh/get_started/quick_start.md
@@ -15,6 +15,7 @@
## 1. 启动服务
安装FastDeploy后,在终端执行如下命令,启动服务,其中启动命令配置方式参考[参数说明](../parameters.md)
+
```shell
python -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-0.3B-Paddle \
@@ -24,9 +25,10 @@ python -m fastdeploy.entrypoints.openai.api_server \
--max-model-len 32768 \
--max-num-seqs 32
```
->💡 注意:在 ```--model``` 指定的路径中,若当前目录下不存在该路径对应的子目录,则会尝试根据指定的模型名称(如 ```baidu/ERNIE-4.5-0.3B-Paddle```)查询AIStudio是否存在预置模型,若存在,则自动启动下载。默认的下载路径为:```~/xx```。关于模型自动下载的说明和配置参阅[模型下载](../supported_models.md)。
-```--max-model-len``` 表示当前部署的服务所支持的最长Token数量。
-```--max-num-seqs``` 表示当前部署的服务所支持的最大并发处理数量。
+
+>💡 注意:在 ```--model``` 指定的路径中,若当前目录下不存在该路径对应的子目录,则会尝试根据指定的模型名称(如 ```baidu/ERNIE-4.5-0.3B-Paddle```)查询AIStudio是否存在预置模型,若存在,则自动启动下载。默认的下载路径为:```~/xx```。关于模型自动下载的说明和配置参阅[模型下载](../supported_models.md)。
+```--max-model-len``` 表示当前部署的服务所支持的最长Token数量。
+```--max-num-seqs``` 表示当前部署的服务所支持的最大并发处理数量。
**相关文档**
@@ -36,6 +38,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
## 2. 用户发起服务请求
执行启动服务指令后,当终端打印如下信息,说明服务已经启动成功。
+
```
api_server.py[line:91] Launching metrics service at http://0.0.0.0:8181/metrics
api_server.py[line:94] Launching chat completion service at http://0.0.0.0:8180/v1/chat/completions
@@ -47,11 +50,13 @@ INFO: Uvicorn running on http://0.0.0.0:8180 (Press CTRL+C to quit)
```
FastDeploy提供服务探活接口,用以判断服务的启动状态,执行如下命令返回 ```HTTP/1.1 200 OK``` 即表示服务启动成功。
+
```shell
curl -i http://0.0.0.0:8180/health
```
通过如下命令发起服务请求
+
```shell
curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
-H "Content-Type: application/json" \
diff --git a/docs/zh/get_started/quick_start_vl.md b/docs/zh/get_started/quick_start_vl.md
index 11f9133b0..deaf3e10d 100644
--- a/docs/zh/get_started/quick_start_vl.md
+++ b/docs/zh/get_started/quick_start_vl.md
@@ -30,11 +30,11 @@ python -m fastdeploy.entrypoints.openai.api_server \
--enable-mm
```
->💡 注意:在 ```--model``` 指定的路径中,若当前目录下不存在该路径对应的子目录,则会尝试根据指定的模型名称(如 ```baidu/ERNIE-4.5-0.3B-Base-Paddle```)查询AIStudio是否存在预置模型,若存在,则自动启动下载。默认的下载路径为:```~/xx```。关于模型自动下载的说明和配置参阅[模型下载](../supported_models.md)。
-```--max-model-len``` 表示当前部署的服务所支持的最长Token数量。
-```--max-num-seqs``` 表示当前部署的服务所支持的最大并发处理数量。
-```--reasoning-parser``` 指定思考内容解析器。
-```--enable-mm``` 表示是否开启多模态支持。
+>💡 注意:在 ```--model``` 指定的路径中,若当前目录下不存在该路径对应的子目录,则会尝试根据指定的模型名称(如 ```baidu/ERNIE-4.5-0.3B-Base-Paddle```)查询AIStudio是否存在预置模型,若存在,则自动启动下载。默认的下载路径为:```~/xx```。关于模型自动下载的说明和配置参阅[模型下载](../supported_models.md)。
+```--max-model-len``` 表示当前部署的服务所支持的最长Token数量。
+```--max-num-seqs``` 表示当前部署的服务所支持的最大并发处理数量。
+```--reasoning-parser``` 指定思考内容解析器。
+```--enable-mm``` 表示是否开启多模态支持。
**相关文档**
diff --git a/docs/zh/index.md b/docs/zh/index.md
index 0e98a53b3..40417db2c 100644
--- a/docs/zh/index.md
+++ b/docs/zh/index.md
@@ -2,11 +2,11 @@
**FastDeploy** 是基于飞桨(PaddlePaddle)的大语言模型(LLM)与视觉语言模型(VLM)推理部署工具包,提供**开箱即用的生产级部署方案**,核心技术特性包括:
-🚀 **负载均衡式PD分解**:工业级解决方案,支持上下文缓存与动态实例角色切换,在保障SLO达标和吞吐量的同时优化资源利用率
-🔄 **统一KV缓存传输**:轻量级高性能传输库,支持智能NVLink/RDMA选择
-🤝 **OpenAI API服务与vLLM兼容**:单命令部署,兼容[vLLM](https://github.com/vllm-project/vllm/)接口
-🧮 **全量化格式支持**:W8A16、W8A8、W4A16、W4A8、W2A16、FP8等
-⏩ **高级加速技术**:推测解码、多令牌预测(MTP)及分块预填充
+🚀 **负载均衡式PD分解**:工业级解决方案,支持上下文缓存与动态实例角色切换,在保障SLO达标和吞吐量的同时优化资源利用率
+🔄 **统一KV缓存传输**:轻量级高性能传输库,支持智能NVLink/RDMA选择
+🤝 **OpenAI API服务与vLLM兼容**:单命令部署,兼容[vLLM](https://github.com/vllm-project/vllm/)接口
+🧮 **全量化格式支持**:W8A16、W8A8、W4A16、W4A8、W2A16、FP8等
+⏩ **高级加速技术**:推测解码、多令牌预测(MTP)及分块预填充
🖥️ **多硬件支持**:NVIDIA GPU、昆仑芯XPU、海光DCU、昇腾NPU、天数智芯GPU、燧原GCU、沐曦GPU等
## 支持模型
@@ -24,6 +24,7 @@
## 文档说明
本项目文档基于mkdocs支持编译可视化查看,参考如下命令进行编译预览,
+
```
pip install -r requirements.txt
@@ -32,4 +33,5 @@ mkdocs build
mkdocs serve
```
+
根据提示打开相应地址即可。
diff --git a/docs/zh/online_serving/README.md b/docs/zh/online_serving/README.md
index 26c985883..d2c001037 100644
--- a/docs/zh/online_serving/README.md
+++ b/docs/zh/online_serving/README.md
@@ -19,7 +19,6 @@ python -m fastdeploy.entrypoints.openai.api_server \
--enable-logprob
```
-
服务部署时的命令行更多使用方式参考[参数说明](../parameters.md)。
## 发送用户请求
@@ -51,6 +50,7 @@ curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
```
使用 Python 脚本发送用户请求示例如下:
+
```python
import openai
host = "0.0.0.0"
@@ -88,10 +88,10 @@ FastDeploy 与 OpenAI 协议的请求参数差异如下,其余请求参数会
- `temperature`: Optional[float] = None
- `top_p`: Optional[float] = None
 - `metadata`: Optional[dict] = None (仅在v1/chat/completions中支持,用于配置额外参数, 如metadata={"enable_thinking": True})
- - `min_tokens`: Optional[int] = 1 最小生成的Token个数
- - `reasoning_max_tokens`: Optional[int] = None 思考内容最大Token数,默认与max_tokens一致
- - `enable_thinking`: Optional[bool] = True 支持深度思考的模型是否打开思考
- - `repetition_penalty`: Optional[float] = None: 直接对重复生成的token进行惩罚的系数(>1时惩罚重复,<1时鼓励重复)
+ - `min_tokens`: Optional[int] = 1 最小生成的Token个数
+ - `reasoning_max_tokens`: Optional[int] = None 思考内容最大Token数,默认与max_tokens一致
+ - `enable_thinking`: Optional[bool] = True 支持深度思考的模型是否打开思考
+ - `repetition_penalty`: Optional[float] = None: 直接对重复生成的token进行惩罚的系数(>1时惩罚重复,<1时鼓励重复)
> 注: 若为多模态模型 由于思考链默认打开导致输出过长,max tokens 可以设置为模型最长输出,或使用默认值。
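+
+下面给出一个使用 openai Python 客户端透传上述 FastDeploy 扩展参数的示意(端口沿用上文示例的 8188;`model`、`api_key` 取值仅为占位,`extra_body` 是 openai 客户端用于传递非标准字段的参数,具体字段名以上文列表为准):
+
+```python
+import openai
+
+client = openai.OpenAI(base_url="http://0.0.0.0:8188/v1", api_key="null")
+
+response = client.chat.completions.create(
+    model="default",
+    messages=[{"role": "user", "content": "你好,介绍一下你自己。"}],
+    temperature=0.8,
+    extra_body={  # FastDeploy 扩展参数,通过 extra_body 透传
+        "metadata": {"enable_thinking": True},
+        "min_tokens": 1,
+        "repetition_penalty": 1.05,
+    },
+)
+print(response.choices[0].message.content)
+```
+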
@@ -103,6 +103,7 @@ FastDeploy 增加的返回字段如下:
- `reasoning_content`: 思考链的返回结果
返回参数总览:
+
```python
ChatCompletionStreamResponse:
id: str
diff --git a/docs/zh/online_serving/scheduler.md b/docs/zh/online_serving/scheduler.md
index 9f92ac0b0..afbd819ba 100644
--- a/docs/zh/online_serving/scheduler.md
+++ b/docs/zh/online_serving/scheduler.md
@@ -14,11 +14,10 @@ FastDeploy 目前支持两种调度器: **本地调度器** 和 **全局调度
基于全局调度器,FastDeploy 引入了专为大语言模型推理场景优化的 **PD 分离调度策略**。该策略将推理流程解耦为两个独立阶段:
- **Prefill 阶段** :构建 KV 缓存,该过程计算密集度高、显存占用大,但延迟低;
-- **Decode 阶段**:进行自回归解码,该过程串行执行、时延高,但显存占用低。
+- **Decode 阶段**:进行自回归解码,该过程串行执行、时延高,但显存占用低。
通过角色分离(prefill 节点负责接收并处理请求,decode节点完成后续生成),可以更细粒度地控制资源分配、提高吞吐量与 GPU 利用率。
-
## 配置参数
| 字段名 | 字段类型 | 是否必填 | 默认值 | 生效范围 | 说明 |
| ------------------------------------ | -------- | -------- | --------- |------------------------|-----------------------------------|
diff --git a/docs/zh/parameters.md b/docs/zh/parameters.md
index b6865f554..fbf57a971 100644
--- a/docs/zh/parameters.md
+++ b/docs/zh/parameters.md
@@ -2,7 +2,6 @@
在使用FastDeploy部署模型(包括离线推理、服务化部署),涉及如下参数配置,其中需要注意,在使用离线推理时,各参数配置即为如下参数名;而在使用命令行启动服务时,相应参数中的分隔符需要从```_```修改为```-```,如```max_model_len```在命令行中则为```--max-model-len```。
-
| 参数名 | 类型 | 说明 |
|:-----------------------------------|:----------| :----- |
| ```port``` | `int` | 仅服务化部署需配置,服务HTTP请求端口号,默认8000 |
@@ -44,7 +43,6 @@
| ```enable_expert_parallel``` | `bool` | 是否启用专家并行 |
| ```enable_logprob``` | `bool` | 是否启用输出token返回logprob。如果未使用 logprob,则在启动时可以省略此参数。 |
-
## 1. KVCache分配与```num_gpu_blocks_override```、```block_size```的关系?
FastDeploy在推理过程中,显存被```模型权重```、```预分配KVCache块```和```模型计算中间激活值```占用。其中预分配KVCache块由```num_gpu_blocks_override```决定,其单位为```block_size```(默认64),即一个块可以存储64个Token的KVCache。
@@ -53,14 +51,14 @@ FastDeploy在推理过程中,显存被```模型权重```、```预分配KVCache
- 加载模型,在完成模型加载后,记录当前显存占用情况```total_memory_after_load```和FastDeploy框架占用的显存值```fd_memory_after_load```; 注意前者为GPU实际被占用显存(可能有其它进程也占用),后者是FD框架本身占用显存;
- 根据用户配置的```max_num_batched_tokens```(默认为```max_model_len```),Fake相应长度的输入数据进行Prefill计算,记录当前FastDeploy框架显存最大分配值```fd_memory_after_prefill```,因此可以认为```模型计算中间激活值```为```fd_memory_after_prefill - fd_memory_after_load```;
- - 截止当前,认为GPU卡可以剩分配KVCache的显存(以A800 80G为例)为```80GB * gpu_memory_utilization - total_memory_after_load - (fd_memory_after_prefill - fd_memory_after_load)```
- - 根据模型KVCache的精度(如8bit/16bit),计算一个block占用的KVCache大小,从而计算出总共可分配的block数量,赋值给```num_gpu_blocks_override```
+ - 截至当前,认为GPU卡上剩余可分配给KVCache的显存(以A800 80G为例)为```80GB * gpu_memory_utilization - total_memory_after_load - (fd_memory_after_prefill - fd_memory_after_load)```
+ - 根据模型KVCache的精度(如8bit/16bit),计算一个block占用的KVCache大小,从而计算出总共可分配的block数量,赋值给```num_gpu_blocks_override```
> 在服务启动日志中,我们可以在log/fastdeploy.log中找到```Reset block num, the total_block_num:17220, prefill_kvcache_block_num:12915```,其中```total_block_num```即为自动计算出来的KVCache block数量,将其乘以```block_size```即可知道整个服务可以缓存多少Token的KV值。
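+
+按上述流程,block 数的粗略估算可以用如下片段示意(各项显存数值与单个 block 的字节数均为假设值,实际请以启动日志中的 ```total_block_num``` 为准):
+
+```python
+# 粗略估算可分配的 KVCache block 数(数值均为假设)
+GiB = 1024**3
+gpu_memory_utilization = 0.9
+total_memory_after_load = 50 * GiB     # 模型加载后 GPU 实际占用显存(假设值)
+activation = 6 * GiB                   # fd_memory_after_prefill - fd_memory_after_load(假设值)
+
+kv_cache_bytes = 80 * GiB * gpu_memory_utilization - total_memory_after_load - activation
+
+bytes_per_block = 2 * 1024 * 1024      # 单个 block(64 个 Token)的 KVCache 字节数(假设值)
+num_gpu_blocks_override = int(kv_cache_bytes // bytes_per_block)
+print(num_gpu_blocks_override)         # 本例为 8192
+```
+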
## 2. ```kv_cache_ratio```、```block_size```、```max_num_seqs```的关系?
- - FastDeploy里面将KVCache按照```kv_cache_ratio```分为Prefill阶段使用和Decode阶段使用,在配置这个参数时,可以按照```kv_cache_ratio = 平均输入Token数/(平均输入+平均输出Token数)```进行配置,常规情况输入是输出的3倍,因此可以配置成0.75
- - ```max_num_seqs```是Decode阶段的最大并发数,一般而言可以配置成最大值128,但用户也可以根据KVCache情况作调用,例如输出的KVCache Token量为```decode_token_cache = total_block_num * (1 - kv_cache_ratio) * block_size```,为了防止极端情况下的显存不足问题,可以配置```max_num_seqs = decode_token_cache / 平均输出Token数```,不高于128即可。
+- FastDeploy里面将KVCache按照```kv_cache_ratio```分为Prefill阶段使用和Decode阶段使用,在配置这个参数时,可以按照```kv_cache_ratio = 平均输入Token数/(平均输入+平均输出Token数)```进行配置,常规情况输入是输出的3倍,因此可以配置成0.75
+- ```max_num_seqs```是Decode阶段的最大并发数,一般而言可以配置成最大值128,但用户也可以根据KVCache情况作调整,例如输出的KVCache Token量为```decode_token_cache = total_block_num * (1 - kv_cache_ratio) * block_size```,为了防止极端情况下的显存不足问题,可以配置```max_num_seqs = decode_token_cache / 平均输出Token数```,不高于128即可。
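+
+一个简单的估算示意如下(平均输入/输出 Token 数为假设值,```total_block_num``` 沿用上文日志示例中的 17220):
+
+```python
+# kv_cache_ratio 与 max_num_seqs 的估算示例
+avg_input_tokens, avg_output_tokens = 3000, 1000   # 假设输入约为输出的 3 倍
+kv_cache_ratio = avg_input_tokens / (avg_input_tokens + avg_output_tokens)   # 0.75
+
+total_block_num, block_size = 17220, 64
+decode_token_cache = total_block_num * (1 - kv_cache_ratio) * block_size
+max_num_seqs = min(128, int(decode_token_cache / avg_output_tokens))
+print(kv_cache_ratio, max_num_seqs)   # 0.75 128
+```
+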
## 3. ```enable_chunked_prefill```参数配置说明
@@ -72,9 +70,9 @@ FastDeploy在推理过程中,显存被```模型权重```、```预分配KVCache
当前仅支持用户配置以下参数:
- `use_cudagraph` : bool = False
- `graph_optimization_config` : Dict[str, Any]
- - `graph_opt_level`: int = 0
- - `use_cudagraph`: bool = False
- - `cudagraph_capture_sizes` : List[int] = None
+ - `graph_opt_level`: int = 0
+ - `use_cudagraph`: bool = False
+ - `cudagraph_capture_sizes` : List[int] = None
可以通过设置 `--use-cudagraph` 或 `--graph-optimization-config '{"use_cudagraph":true}'` 开启 CudaGraph。
@@ -88,6 +86,7 @@ FastDeploy在推理过程中,显存被```模型权重```、```预分配KVCache
在默认配置下开启 CudaGraph 时,会根据 `max_num_seqs` 参数自动设置 CudaGraph 需要捕获的 Batch Size 列表,需要捕获的 Batch Size 的列表自动生成逻辑如下:
1. 生成一个范围为 [1,1024] Batch Size 的候选列表
+
```
# Batch Size [1, 2, 4, 8, 16, ... 120, 128]
candidate_capture_sizes = [1, 2, 4] + [8 * i for i in range(1, 17)]
@@ -96,24 +95,24 @@ FastDeploy在推理过程中,显存被```模型权重```、```预分配KVCache
# Batch Size (256, 288, ... 992, 1024]
candidate_capture_sizes += [32 * i for i in range(17, 33)]
```
+
2. 根据用户设置的 `max_num_seqs` 裁剪候选列表,得到范围为 [1, `max_num_seqs`] 的 CudaGraph 捕获列表。
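+
+裁剪过程可用如下片段示意(此处只列出部分候选区间,完整生成逻辑见上文,取 ```max_num_seqs=32``` 作为例子):
+
+```python
+# 按 max_num_seqs 裁剪 CudaGraph 需要捕获的 Batch Size 候选列表(示意)
+max_num_seqs = 32
+candidate_capture_sizes = [1, 2, 4] + [8 * i for i in range(1, 17)]   # 1~128 区间候选,更大区间此处省略
+cudagraph_capture_sizes = [bs for bs in candidate_capture_sizes if bs <= max_num_seqs]
+print(cudagraph_capture_sizes)   # [1, 2, 4, 8, 16, 24, 32]
+```
+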
用户也可以通过 `--graph-optimization-config` 中的 `cudagraph_capture_sizes` 参数自定义需要被 CudaGraph 捕获的 Batch Size 列表:
+
```
--graph-optimization-config '{"cudagraph_capture_sizes": [1, 3, 5, 7, 9]}'
```
-
### CudaGraph相关参数说明
使用 CudaGraph 会产生一些额外的显存开销,在FastDeploy中分为下面两类:
-* 额外的输入 Buffer 开销
-* CudaGraph 使用了专用的显存池,因此会持有一部分与主框架隔离的中间激活显存
+- 额外的输入 Buffer 开销
+- CudaGraph 使用了专用的显存池,因此会持有一部分与主框架隔离的中间激活显存
FastDeploy 的初始化顺序为先使用 `gpu_memory_utilization` 参数计算 `KVCache` 可用的显存,初始化完 `KVCache` 之后才会使用剩余显存初始化 CudaGraph。由于 CudaGraph 目前还不是默认开启的,因此使用默认启动参数可能会遇到 `Out Of Memory` 错误,可以尝试使用下面三种方式解决:
-* 调低 `gpu_memory_utilization` 的值,多预留一些显存给CudaGraph使用。
-* 调低 `max_num_seqs` 的值,降低最大并发数。
-* 通过 `graph_optimization_config` 自定义需要 CudaGraph 捕获的 Batch Size 列表 `cudagraph_capture_sizes`,减少捕获的图的数量
-
+- 调低 `gpu_memory_utilization` 的值,多预留一些显存给CudaGraph使用。
+- 调低 `max_num_seqs` 的值,降低最大并发数。
+- 通过 `graph_optimization_config` 自定义需要 CudaGraph 捕获的 Batch Size 列表 `cudagraph_capture_sizes`,减少捕获的图的数量
使用CudaGraph之前,需要确保加载的模型被装饰器 ```@support_graph_optimization```正确修饰。
@@ -144,5 +143,6 @@ FastDeploy 的初始化顺序为先使用 `gpu_memory_utilization` 参数计算
class Ernie45TModel(nn.Layer): # 注意 decorator 加在 nn.Layer 的子类上
...
```
+
- 当开启 ```use_cudagraph``` 时,暂时只支持单卡推理,即 ```tensor_parallel_size``` 设为1。
- 当开启 ```use_cudagraph``` 时,暂不支持开启 ```enable_prefix_caching``` 或 ```enable_chunked_prefill``` 。
diff --git a/docs/zh/quantization/README.md b/docs/zh/quantization/README.md
index 7b85c094d..77705c1e0 100644
--- a/docs/zh/quantization/README.md
+++ b/docs/zh/quantization/README.md
@@ -24,7 +24,7 @@ FastDeploy支持FP8、INT8、INT4、2-bit等多种量化推理精度,支持模
## 2. 模型支持列表
-| 模型名称 | 支持量化精度 |
+| 模型名称 | 支持量化精度 |
|---------|---------|
| ERNIE-4.5-300B-A47B | WINT8, WINT4, Block-wise FP8, MixQuant|
@@ -37,11 +37,10 @@ FastDeploy 按以下格式命名各种量化精度:
```
部分示例如下:
-
+
- **W8A8C8**:W=weights,A=activations,C=CacheKV;8默认为INT8
- **W8A8C16**:16默认为BF16,其它同上
- **W4A16C16 / WInt4 / weight-only int4**:4默认为INT4
- **WNF4A8C8**:NF4指4bit norm-float数值类型
- **Wfp8Afp8**:权重和激活均为FP8精度
- **W4Afp8**:权重为INT4, 激活为FP8
-
diff --git a/docs/zh/quantization/online_quantization.md b/docs/zh/quantization/online_quantization.md
index f487f8ac8..2e5040239 100644
--- a/docs/zh/quantization/online_quantization.md
+++ b/docs/zh/quantization/online_quantization.md
@@ -23,8 +23,8 @@ python -m fastdeploy.entrypoints.openai.api_server \
```
- 通过指定 `--model baidu/ERNIE-4.5-300B-A47B-Paddle` 可自动从AIStudio下载模型。FastDeploy依赖Paddle格式的模型,更多说明参考[支持模型列表](../supported_models.md)。
-- 通过设置 `--quantization` 为 `wint8` 或 `wint4` 选择在线 INT8/INT4 量化。
-- 部署 ERNIE-4.5-300B-A47B-Paddle WINT8 最少需要 80G * 8卡, WINT4 则需要 80GB * 4卡。
+- 通过设置 `--quantization` 为 `wint8` 或 `wint4` 选择在线 INT8/INT4 量化。
+- 部署 ERNIE-4.5-300B-A47B-Paddle WINT8 最少需要 80GB × 8 卡,WINT4 则需要 80GB × 4 卡。
- 更多部署教程请参考[get_started](../get_started/ernie-4.5.md).
## 2. Block-wise FP8
@@ -49,9 +49,6 @@ python -m fastdeploy.entrypoints.openai.api_server \
```
- 通过指定 `--model baidu/ERNIE-4.5-300B-A47B-Paddle` 可自动从AIStudio下载模型。FastDeploy依赖Paddle格式的模型,更多说明参考[支持模型列表](../supported_models.md)。
-- 通过设置 `--quantization` 为 `block_wise_fp8` 选择在线 Block-wise FP8 量化。
+- 通过设置 `--quantization` 为 `block_wise_fp8` 选择在线 Block-wise FP8 量化。
- 部署 ERNIE-4.5-300B-A47B-Paddle Block-wise FP8 最少需要 80G * 8卡。
- 更多部署教程请参考[get_started](../get_started/ernie-4.5.md)
-
-
-
diff --git a/docs/zh/quantization/wint2.md b/docs/zh/quantization/wint2.md
index 79da233e8..91c1441bf 100644
--- a/docs/zh/quantization/wint2.md
+++ b/docs/zh/quantization/wint2.md
@@ -48,7 +48,6 @@ python -m fastdeploy.entrypoints.openai.api_server \
- 更多部署教程请参考[get_started](../get_started/ernie-4.5.md);
- 更多模型说明请参考[支持模型列表](../supported_models.md)。
-
## WINT2效果
在ERNIE-4.5-300B-A47B模型上,WINT2与WINT4效果对比:
diff --git a/docs/zh/usage/code_overview.md b/docs/zh/usage/code_overview.md
index 2fda9caef..170652a5e 100644
--- a/docs/zh/usage/code_overview.md
+++ b/docs/zh/usage/code_overview.md
@@ -22,4 +22,3 @@
- ```splitwise```: 分离式部署相关模块
- ```scripts```/```tools```:FastDeploy 用于执行功能的辅助脚本,比如编译,单测执行,代码风格纠正等
- ```test```:项目单测验证使用到的代码
-
diff --git a/docs/zh/usage/log.md b/docs/zh/usage/log.md
index 5e521f1a1..c9b287523 100644
--- a/docs/zh/usage/log.md
+++ b/docs/zh/usage/log.md
@@ -19,14 +19,12 @@ FastDeploy 在部署过程中,会产生如下日志文件,各日志含义说
## 在线推理客户端日志
* `api_server.log` : 记录启动参数,及接收到的请求信息
-
## 调度器日志
* `scheduler.log` : 记录调度器的信息包含当前结点的信息,每条请求分配的信息
## 投机解码日志
* `speculate.log` : 投机解码相关信息
-
## Prefix Caching 相关日志
* `cache_queue_manager.log` : 记录启动参数,及接收到的请求信息
diff --git a/fastdeploy/__init__.py b/fastdeploy/__init__.py
index e511eb6c4..15186dfb7 100644
--- a/fastdeploy/__init__.py
+++ b/fastdeploy/__init__.py
@@ -22,14 +22,14 @@ import sys
os.environ["GLOG_minloglevel"] = "2"
# suppress log from aistudio
os.environ["AISTUDIO_LOG"] = "critical"
-from fastdeploy.utils import version
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.entrypoints.llm import LLM
-__all__ = ['LLM', 'SamplingParams']
+__all__ = ["LLM", "SamplingParams"]
try:
import use_triton_in_paddle
+
use_triton_in_paddle.make_triton_compatible_with_paddle()
except ImportError:
pass
@@ -38,13 +38,21 @@ except ImportError:
def _patch_fastsafetensors():
try:
- file_path = subprocess.check_output([
- sys.executable, "-c", "import fastsafetensors, os; \
+ file_path = (
+ subprocess.check_output(
+ [
+ sys.executable,
+ "-c",
+ "import fastsafetensors, os; \
print(os.path.join(os.path.dirname(fastsafetensors.__file__), \
- 'frameworks', '_paddle.py'))"
- ]).decode().strip()
+ 'frameworks', '_paddle.py'))",
+ ]
+ )
+ .decode()
+ .strip()
+ )
- with open(file_path, 'r') as f:
+ with open(file_path, "r") as f:
content = f.read()
if "DType.U16: DType.BF16," in content and "DType.U8: paddle.uint8," in content:
return
@@ -56,21 +64,20 @@ def _patch_fastsafetensors():
inside_block = False
for line in lines:
new_lines.append(line)
- if 'need_workaround_dtypes: Dict[DType, DType] = {' in line:
+ if "need_workaround_dtypes: Dict[DType, DType] = {" in line:
inside_block = True
- elif inside_block and '}' in line:
- new_lines.insert(-1, ' DType.U16: DType.BF16,')
+ elif inside_block and "}" in line:
+ new_lines.insert(-1, " DType.U16: DType.BF16,")
inside_block = False
modified = True
content = "\n".join(new_lines)
if "DType.I8: paddle.uint8," in content:
- content = content.replace("DType.I8: paddle.uint8,",
- "DType.U8: paddle.uint8,")
+ content = content.replace("DType.I8: paddle.uint8,", "DType.U8: paddle.uint8,")
modified = True
if modified:
- with open(file_path, 'w') as f:
+ with open(file_path, "w") as f:
f.write(content + "\n")
except Exception as e:
diff --git a/fastdeploy/cache_manager/__init__.py b/fastdeploy/cache_manager/__init__.py
index c40559bc8..f4ede9062 100644
--- a/fastdeploy/cache_manager/__init__.py
+++ b/fastdeploy/cache_manager/__init__.py
@@ -12,4 +12,4 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-"""
\ No newline at end of file
+"""
diff --git a/fastdeploy/cache_manager/cache_data.py b/fastdeploy/cache_manager/cache_data.py
index aeb58d55f..638da70bc 100644
--- a/fastdeploy/cache_manager/cache_data.py
+++ b/fastdeploy/cache_manager/cache_data.py
@@ -109,13 +109,12 @@ class BlockNode:
parent_node_id = None
return (
f"node_id {self.node_id}: depth {self.depth} hash_value {self.hash_value}"
- +
- f" shared_count {self.shared_count} is_gpu_leaf_node {self.is_gpu_leaf_node}"
- +
- f" is_cpu_leaf_node {self.is_cpu_leaf_node} block_id {self.block_id} "
- + f"has_in_gpu {self.has_in_gpu} " +
- f"cache_status {self.cache_status} parent {parent_node_id} with children number "
- + f"{len(self.children)} req_id_set {self.req_id_set}")
+ + f" shared_count {self.shared_count} is_gpu_leaf_node {self.is_gpu_leaf_node}"
+ + f" is_cpu_leaf_node {self.is_cpu_leaf_node} block_id {self.block_id} "
+ + f"has_in_gpu {self.has_in_gpu} "
+ + f"cache_status {self.cache_status} parent {parent_node_id} with children number "
+ + f"{len(self.children)} req_id_set {self.req_id_set}"
+ )
@property
def has_in_gpu(self):
@@ -141,8 +140,7 @@ class BlockNode:
"""
check if the node is a leaf node in CPU
"""
- if (self.cache_status == CacheStatus.CPU) and (len(self.children)
- == 0):
+ if (self.cache_status == CacheStatus.CPU) and (len(self.children) == 0):
return True
return False
diff --git a/fastdeploy/cache_manager/cache_messager.py b/fastdeploy/cache_manager/cache_messager.py
index 641f44bb1..f11c40690 100644
--- a/fastdeploy/cache_manager/cache_messager.py
+++ b/fastdeploy/cache_manager/cache_messager.py
@@ -21,51 +21,54 @@ import time
import numpy as np
import paddle
-from fastdeploy.cache_manager.transfer_factory import (IPCCommManager,
- RDMACommManager)
+from fastdeploy.cache_manager.transfer_factory import IPCCommManager, RDMACommManager
from fastdeploy.inter_communicator import EngineWorkerQueue, IPCSignal
from fastdeploy.utils import get_logger
logger = get_logger("cache_messager", "cache_messager.log")
-class CacheMessager(object):
+class CacheMessager:
"""
CacheMessager is used to send the cache data between the engine worker and the cache server.
"""
- def __init__(self,
- splitwise_role,
- transfer_protocol,
- pod_ip,
- engine_worker_queue_port,
- local_data_parallel_id,
- gpu_cache_kvs,
- rank,
- nranks,
- num_layers,
- gpu_id=0,
- rdma_port=None):
+ def __init__(
+ self,
+ splitwise_role,
+ transfer_protocol,
+ pod_ip,
+ engine_worker_queue_port,
+ local_data_parallel_id,
+ gpu_cache_kvs,
+ rank,
+ nranks,
+ num_layers,
+ gpu_id=0,
+ rdma_port=None,
+ ):
"""
- Initialize the CacheMessager object.
+ Initialize the CacheMessager object.
- Args:
- splitwise_role (str): splitwise_role only can be 'prefill' or 'decode'.
- transfer_protocol (str): support ipc and rdma
- engine_worker_queue_port (int): engine_worker_queue port
- gpu_cache_kvs (dict): GPU kv cache
- rank (int): current rank
- nranks (int): global rank number
- num_layers (int): model layer number
- gpu_id (int, optional): GPU ID
- rdma_port (int, optional): RDMA port
+ Args:
+ splitwise_role (str): splitwise_role only can be 'prefill' or 'decode'.
+ transfer_protocol (str): support ipc and rdma
+ engine_worker_queue_port (int): engine_worker_queue port
+ gpu_cache_kvs (dict): GPU kv cache
+ rank (int): current rank
+ nranks (int): global rank number
+ num_layers (int): model layer number
+ gpu_id (int, optional): GPU ID
+ rdma_port (int, optional): RDMA port
- Returns:
- None
+ Returns:
+ None
"""
- assert splitwise_role in ["prefill", "decode"], \
- "splitwise_role must be prefill or decode"
+ assert splitwise_role in [
+ "prefill",
+ "decode",
+ ], "splitwise_role must be prefill or decode"
self.splitwise_role = splitwise_role
self.gpu_cache_kvs = gpu_cache_kvs
self.rank = rank
@@ -76,11 +79,11 @@ class CacheMessager(object):
is_server=False,
num_client=self.nranks,
client_id=self.rank,
- local_data_parallel_id=local_data_parallel_id)
+ local_data_parallel_id=local_data_parallel_id,
+ )
transfer_protocol = transfer_protocol.split(",")
- logger.info(f"splitwise role: {splitwise_role}, {transfer_protocol}"
- f"rank: {rank}")
+ logger.info(f"splitwise role: {splitwise_role}, {transfer_protocol}" f"rank: {rank}")
# 1. initialize the cache_k_ptr_list and cache_v_ptr_list
self.num_layers = num_layers
@@ -90,10 +93,8 @@ class CacheMessager(object):
cache_v = []
self.messager = {}
for layer_idx in range(self.num_layers):
- key_cache = self.gpu_cache_kvs[
- f'key_caches_{layer_idx}_rank{self.rank}_device{gpu_id}']
- val_cache = self.gpu_cache_kvs[
- f'value_caches_{layer_idx}_rank{self.rank}_device{gpu_id}']
+ key_cache = self.gpu_cache_kvs[f"key_caches_{layer_idx}_rank{self.rank}_device{gpu_id}"]
+ val_cache = self.gpu_cache_kvs[f"value_caches_{layer_idx}_rank{self.rank}_device{gpu_id}"]
cache_k.append(key_cache)
cache_v.append(val_cache)
cache_k_ptr_list.append(key_cache.data_ptr())
@@ -109,7 +110,8 @@ class CacheMessager(object):
block_bytes *= 2
logger.info(
f"layers {num_layers} cache_shape: {cache_shape}, max_block_num: {max_block_num}, "
- f"block_bytes: {block_bytes}, dtype: {key_cache.dtype}")
+ f"block_bytes: {block_bytes}, dtype: {key_cache.dtype}"
+ )
self.block_bytes = block_bytes
# 3. initialize the messager
@@ -122,24 +124,26 @@ class CacheMessager(object):
cache_v,
)
local_device_id = int(str(cache_k[0].place)[-2])
- logger.info(
- f"done create ipc_comm with local_device_id:{local_device_id}, "
- )
+ logger.info(f"done create ipc_comm with local_device_id:{local_device_id}, ")
elif protocol == "rdma":
- logger.info(
- f"splitwise_role rdma: {self.splitwise_role}, rank: {self.rank}, gpu_id: {gpu_id}"
- )
+ logger.info(f"splitwise_role rdma: {self.splitwise_role}, rank: {self.rank}, gpu_id: {gpu_id}")
self.messager[protocol] = RDMACommManager(
- splitwise_role, rank, gpu_id, cache_k_ptr_list,
- cache_v_ptr_list, max_block_num, block_bytes, rdma_port)
+ splitwise_role,
+ rank,
+ gpu_id,
+ cache_k_ptr_list,
+ cache_v_ptr_list,
+ max_block_num,
+ block_bytes,
+ rdma_port,
+ )
self.gpu_id = gpu_id
self.cache_info = dict()
- layerwise_send_cache_thread = threading.Thread(
- target=self._prefill_layerwise_send_cache_thread)
+ layerwise_send_cache_thread = threading.Thread(target=self._prefill_layerwise_send_cache_thread)
layerwise_send_cache_thread.daemon = True
layerwise_send_cache_thread.start()
@@ -159,26 +163,30 @@ class CacheMessager(object):
array=prefilled_step_idx_data,
dtype=np.int32,
suffix=self.gpu_id,
- create=True)
+ create=True,
+ )
layer_shm_value = IPCSignal(
name=f"splitwise_complete_prefilled_layer_{self.rank}",
array=prefilled_layer_idx_data,
dtype=np.int32,
suffix=self.gpu_id,
- create=True)
+ create=True,
+ )
except:
step_shm_value = IPCSignal(
name=f"splitwise_complete_prefilled_step_{self.rank}",
array=prefilled_step_idx_data,
dtype=np.int32,
suffix=self.gpu_id,
- create=False)
+ create=False,
+ )
layer_shm_value = IPCSignal(
name=f"splitwise_complete_prefilled_layer_{self.rank}",
array=prefilled_layer_idx_data,
dtype=np.int32,
suffix=self.gpu_id,
- create=False)
+ create=False,
+ )
step_shm_value.value[0] = -1
layer_shm_value.value[0] = -1
@@ -193,21 +201,19 @@ class CacheMessager(object):
if cache_info:
logger.debug(f"cache info {cache_info}")
for info in cache_info:
- if info['request_id'] in self.cache_info:
+ if info["request_id"] in self.cache_info:
self.cache_info[info["request_id"]].update(info)
current_info = self.cache_info[info["request_id"]]
if "dest_block_ids" in current_info and "src_block_ids" in current_info:
- current_src_blocks = current_info[
- "src_block_ids"][-len(current_info["dest_block_ids"]):]
- current_info[
- "src_block_ids"] = current_src_blocks
+ current_src_blocks = current_info["src_block_ids"][
+ -len(current_info["dest_block_ids"]) :
+ ]
+ current_info["src_block_ids"] = current_src_blocks
current_info["current_layer_ids"] = 0
current_info["status"] = "init"
- logger.info(
- f"start cache_infos: {current_info}")
+ logger.info(f"start cache_infos: {current_info}")
self.cache_info[info["request_id"]] = current_info
- self.last_step_idx = min(
- self.last_step_idx, current_info['current_id'])
+ self.last_step_idx = min(self.last_step_idx, current_info["current_id"])
else:
self.cache_info[info["request_id"]] = info
prefilled_layer_idx = layer_shm_value.value[0]
@@ -223,64 +229,53 @@ class CacheMessager(object):
if not self.cache_info:
time.sleep(0.001)
continue
- logger.debug(
- f"prefilled_layer_idx: {prefilled_layer_idx}, prefilled_step_idx: {prefilled_step_idx}"
- )
+ logger.debug(f"prefilled_layer_idx: {prefilled_layer_idx}, prefilled_step_idx: {prefilled_step_idx}")
for req_id, item in list(self.cache_info.items()):
if "status" not in item:
continue
if "layer_idx" not in item:
item["layer_idx"] = 0
- if item['status'] == 'error':
+ if item["status"] == "error":
del self.cache_info[req_id]
continue
- if item['current_id'] > prefilled_step_idx:
+ if item["current_id"] > prefilled_step_idx:
continue
current_transfer_protocol = item["transfer_protocol"]
if item["transfer_protocol"] == "rdma":
- target_ip = item['ip']
- target_id = int(item['rdma_ports'][self.rank])
- status = self.messager[
- current_transfer_protocol].connect(
- target_ip, target_id)
+ target_ip = item["ip"]
+ target_id = int(item["rdma_ports"][self.rank])
+ status = self.messager[current_transfer_protocol].connect(target_ip, target_id)
if not status:
- logger.error(
- f"connect to {target_ip}:{target_id} failed")
+ logger.error(f"connect to {target_ip}:{target_id} failed")
item["status"] = "error"
self.engine_worker_queue.finish_request_barrier.wait()
if self.rank == 0:
- self.engine_worker_queue.put_finished_req([
- (item['request_id'], "connect error")
- ])
+ self.engine_worker_queue.put_finished_req([(item["request_id"], "connect error")])
continue
elif item["transfer_protocol"] == "ipc":
target_ip = "0.0.0.0"
- target_id = int(item['device_ids'][self.rank])
- src_block_ids = paddle.to_tensor(item['src_block_ids'],
- dtype='int32',
- place='cpu')
- dest_block_ids = paddle.to_tensor(item['dest_block_ids'],
- dtype='int32',
- place='cpu')
- if item['current_id'] < prefilled_step_idx:
+ target_id = int(item["device_ids"][self.rank])
+ src_block_ids = paddle.to_tensor(item["src_block_ids"], dtype="int32", place="cpu")
+ dest_block_ids = paddle.to_tensor(item["dest_block_ids"], dtype="int32", place="cpu")
+ if item["current_id"] < prefilled_step_idx:
current_layer_idx = self.num_layers
else:
current_layer_idx = prefilled_layer_idx + 1
- for layer_idx in range(item["layer_idx"],
- current_layer_idx):
+ for layer_idx in range(item["layer_idx"], current_layer_idx):
tic = time.time()
- return_code = self.messager[
- current_transfer_protocol].write_cache(
- target_ip, target_id, src_block_ids,
- dest_block_ids, layer_idx)
+ return_code = self.messager[current_transfer_protocol].write_cache(
+ target_ip,
+ target_id,
+ src_block_ids,
+ dest_block_ids,
+ layer_idx,
+ )
if return_code != 0:
item["status"] = "error"
self.engine_worker_queue.finish_request_barrier.wait()
if self.rank == 0:
- self.engine_worker_queue.put_finished_req([
- (item['request_id'], "write cache error")
- ])
+ self.engine_worker_queue.put_finished_req([(item["request_id"], "write cache error")])
logger.error(
f"write cache failed, layer_idx: {layer_idx}, "
f"req_id: {item['request_id']}, dest_ip: {target_ip}"
@@ -298,16 +293,14 @@ class CacheMessager(object):
f"block_num: {block_num}, send_cache_speed(GB/s): {round(send_cache_speed, 5)},"
f"avg_time per block(ms): {round(avg_time_per_block, 5)}"
)
- item['layer_idx'] = current_layer_idx
- if item['layer_idx'] == self.num_layers:
+ item["layer_idx"] = current_layer_idx
+ if item["layer_idx"] == self.num_layers:
if item["transfer_protocol"] == "ipc":
self.messager["ipc"].write_block_by_sync(target_id)
logger.info(f"finish write cache {item['request_id']}")
self.engine_worker_queue.finish_request_barrier.wait()
if self.rank == 0:
- self.engine_worker_queue.put_finished_req([
- (item['request_id'], "finished")
- ])
+ self.engine_worker_queue.put_finished_req([(item["request_id"], "finished")])
logger.info(f"put write cache {item['request_id']}")
del self.cache_info[req_id]
@@ -315,5 +308,4 @@ class CacheMessager(object):
self.last_layer_idx = prefilled_layer_idx
except Exception as e:
- logger.error(
- f"prefill layerwise send cache thread has exception: {e}")
+ logger.error(f"prefill layerwise send cache thread has exception: {e}")
diff --git a/fastdeploy/cache_manager/cache_metrics.py b/fastdeploy/cache_manager/cache_metrics.py
index 212b5c2dd..2f5acf36a 100644
--- a/fastdeploy/cache_manager/cache_metrics.py
+++ b/fastdeploy/cache_manager/cache_metrics.py
@@ -14,52 +14,45 @@
# limitations under the License.
"""
-
from fastdeploy.utils import get_logger
logger = get_logger("prefix_cache_manager", "prefix_cache_manager.log")
-
-
class CacheMetrics:
"""
- Cache Metrics used to record the cache hit time, token num, request num, etc.
+ Cache Metrics used to record the cache hit time, token num, request num, etc.
"""
+
def __init__(self):
- self.total_match_time = 0.0
- self.avg_match_time = 0.0
+ self.total_match_time = 0.0
+ self.avg_match_time = 0.0
self.min_match_time = 1e9
self.max_match_time = 0.0
# request level
- self.req_count = 0
- self.hit_req_count = 0
- self.hit_req_ratio = 0.0
+ self.req_count = 0
+ self.hit_req_count = 0
+ self.hit_req_ratio = 0.0
# token level
- self.total_gpu_matched_token_num = 0
+ self.total_gpu_matched_token_num = 0
self.total_cpu_matched_token_num = 0
self.matched_token_num = 0
- self.total_token_num = 0
- self.hit_token_ratio = 0.0
+ self.total_token_num = 0
+ self.hit_token_ratio = 0.0
self.cpu_hit_token_ratio = 0.0
self.gpu_hit_token_ratio = 0.0
-
def _update_history_hit_metrics(self):
"""
update hit ratio
"""
self.hit_req_ratio = self.hit_req_count / self.req_count
self.hit_token_ratio = self.matched_token_num / self.total_token_num
- self.cpu_hit_token_ratio = (
- self.total_cpu_matched_token_num / self.total_token_num
- )
- self.gpu_hit_token_ratio = (
- self.total_gpu_matched_token_num / self.total_token_num
- )
+ self.cpu_hit_token_ratio = self.total_cpu_matched_token_num / self.total_token_num
+ self.gpu_hit_token_ratio = self.total_gpu_matched_token_num / self.total_token_num
logger.info(
f"Metrics for all requests: req_count {self.req_count} hit_req_count {self.hit_req_count}"
@@ -82,31 +75,17 @@ class CacheMetrics:
"""
calculate hit metrics for current query
"""
-
- cpu_cache_match_ratio = (
- current_query_cpu_match_token_num / current_query_token_num
- )
- gpu_cache_match_ratio = (
- current_query_gpu_match_token_num / current_query_token_num
- )
- total_match_ratio = (
- cpu_cache_match_ratio + gpu_cache_match_ratio
- )
+ cpu_cache_match_ratio = current_query_cpu_match_token_num / current_query_token_num
+ gpu_cache_match_ratio = current_query_gpu_match_token_num / current_query_token_num
-
- self.total_cpu_matched_token_num += (
- current_query_cpu_match_token_num
- )
- self.total_gpu_matched_token_num += (
- current_query_gpu_match_token_num
- )
+ total_match_ratio = cpu_cache_match_ratio + gpu_cache_match_ratio
- self.matched_token_num += (
- current_query_cpu_match_token_num
- + current_query_gpu_match_token_num
- )
- self.total_token_num += current_query_token_num
+ self.total_cpu_matched_token_num += current_query_cpu_match_token_num
+ self.total_gpu_matched_token_num += current_query_gpu_match_token_num
+
+ self.matched_token_num += current_query_cpu_match_token_num + current_query_gpu_match_token_num
+ self.total_token_num += current_query_token_num
logger.info(
f"Metrics for req_id {req_id}: token_num {current_query_token_num}"
+ f" cpu_cache_match_ratio {cpu_cache_match_ratio}"
@@ -134,4 +113,4 @@ class CacheMetrics:
self.total_token_num = 0
self.hit_token_ratio = 0.0
self.cpu_hit_token_ratio = 0.0
- self.gpu_hit_token_ratio = 0.0
\ No newline at end of file
+ self.gpu_hit_token_ratio = 0.0
diff --git a/fastdeploy/cache_manager/cache_transfer_manager.py b/fastdeploy/cache_manager/cache_transfer_manager.py
index 912624512..678819723 100644
--- a/fastdeploy/cache_manager/cache_transfer_manager.py
+++ b/fastdeploy/cache_manager/cache_transfer_manager.py
@@ -26,8 +26,11 @@ import paddle
from fastdeploy.cache_manager.cache_data import CacheStatus
from fastdeploy.engine.config import SpeculativeConfig
from fastdeploy.inter_communicator import EngineCacheQueue, IPCSignal
-from fastdeploy.model_executor.ops.gpu import (cuda_host_alloc, set_data_ipc,
- swap_cache_all_layers)
+from fastdeploy.model_executor.ops.gpu import (
+ cuda_host_alloc,
+ set_data_ipc,
+ swap_cache_all_layers,
+)
from fastdeploy.utils import get_logger
@@ -36,79 +39,58 @@ def parse_args():
从命令行解析参数
"""
parser = argparse.ArgumentParser("Cache transfer manager")
- parser.add_argument("--splitwise_role",
- type=str,
- default="mixed",
- help="splitwise role, can be decode, prefill or mixed")
+ parser.add_argument(
+ "--splitwise_role",
+ type=str,
+ default="mixed",
+ help="splitwise role, can be decode, prefill or mixed",
+ )
parser.add_argument("--rank", type=int, default=0, help="current rank")
parser.add_argument("--device_id", type=int, default=0, help="device id")
- parser.add_argument("--num_layers",
- type=int,
- default=1,
- help="model num layers")
- parser.add_argument("--head_dim",
- type=int,
- default=1,
- help="model head dim")
- parser.add_argument("--kv_num_head",
- type=int,
- default=1,
- help="model kv num head")
+ parser.add_argument("--num_layers", type=int, default=1, help="model num layers")
+ parser.add_argument("--head_dim", type=int, default=1, help="model head dim")
+ parser.add_argument("--kv_num_head", type=int, default=1, help="model kv num head")
parser.add_argument("--rdma_port", type=str, default="", help="rmda port")
- parser.add_argument("--mp_num",
- type=int,
- default=1,
- help="number of model parallel")
- parser.add_argument("--protocol",
- type=str,
- default="ipc",
- help="cache transfer protocol, only surport ipc now")
- parser.add_argument("--enable_splitwise",
- type=int,
- default=0,
- help="enable splitwise ")
- parser.add_argument("--cache_queue_port",
- type=int,
- default=9923,
- help="cache queue port")
- parser.add_argument("--pod_ip",
- type=str,
- default="0.0.0.0",
- help="pod ip")
- parser.add_argument("--engine_worker_queue_port",
- type=int,
- default=9923,
- help="engine worker queue port")
- parser.add_argument("--engine_pid",
- type=str,
- default=None,
- help="engine pid")
+ parser.add_argument("--mp_num", type=int, default=1, help="number of model parallel")
+ parser.add_argument(
+ "--protocol",
+ type=str,
+ default="ipc",
+        help="cache transfer protocol, only support ipc now",
+ )
+ parser.add_argument("--enable_splitwise", type=int, default=0, help="enable splitwise ")
+ parser.add_argument("--cache_queue_port", type=int, default=9923, help="cache queue port")
+ parser.add_argument("--pod_ip", type=str, default="0.0.0.0", help="pod ip")
+ parser.add_argument(
+ "--engine_worker_queue_port",
+ type=int,
+ default=9923,
+ help="engine worker queue port",
+ )
+ parser.add_argument("--engine_pid", type=str, default=None, help="engine pid")
- parser.add_argument("--num_gpu_blocks",
- type=int,
- default=1,
- help="gpu cache block number")
- parser.add_argument("--num_cpu_blocks",
- type=int,
- default=4,
- help="cpu cache block number")
- parser.add_argument("--block_size",
- type=int,
- default=64,
- help="cache block size(tokens)")
- parser.add_argument("--bytes_per_layer_per_block",
- type=int,
- default=1024,
- help="per layer per block bytes")
- parser.add_argument("--cache_dtype",
- type=str,
- default="bfloat16",
- choices=["uint8", "bfloat16"],
- help="cache dtype")
- parser.add_argument("--speculative_config",
- type=json.loads,
- default="{}",
- help="speculative config")
+ parser.add_argument("--num_gpu_blocks", type=int, default=1, help="gpu cache block number")
+ parser.add_argument("--num_cpu_blocks", type=int, default=4, help="cpu cache block number")
+ parser.add_argument("--block_size", type=int, default=64, help="cache block size(tokens)")
+ parser.add_argument(
+ "--bytes_per_layer_per_block",
+ type=int,
+ default=1024,
+ help="per layer per block bytes",
+ )
+ parser.add_argument(
+ "--cache_dtype",
+ type=str,
+ default="bfloat16",
+ choices=["uint8", "bfloat16"],
+ help="cache dtype",
+ )
+ parser.add_argument(
+ "--speculative_config",
+ type=json.loads,
+ default="{}",
+ help="speculative config",
+ )
parser.add_argument("--local_data_parallel_id", type=int, default=0)
args = parser.parse_args()
@@ -134,14 +116,10 @@ class CacheTransferManager:
self.gpu_cache_v_tensors = []
self.speculative_config = SpeculativeConfig(**args.speculative_config)
self.num_extra_layers = self.speculative_config.num_extra_cache_layer
- self.num_extra_layer_gpu_blocks = \
- int(args.num_gpu_blocks * \
- self.speculative_config.num_gpu_block_expand_ratio)
+ self.num_extra_layer_gpu_blocks = int(args.num_gpu_blocks * self.speculative_config.num_gpu_block_expand_ratio)
- self.swap_to_cpu_thread_pool = concurrent.futures.ThreadPoolExecutor(
- max_workers=1)
- self.swap_to_gpu_thread_pool = concurrent.futures.ThreadPoolExecutor(
- max_workers=1)
+ self.swap_to_cpu_thread_pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
+ self.swap_to_gpu_thread_pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
self.transfer_task_queue = queue.Queue() # 用来接收传输任务
self.tansfer_done_queue = queue.Queue() # 用来告知任务执行完毕
self.n_ranks = args.mp_num
@@ -154,81 +132,72 @@ class CacheTransferManager:
is_server=False,
num_client=args.mp_num,
client_id=rank,
- local_data_parallel_id=args.local_data_parallel_id)
+ local_data_parallel_id=args.local_data_parallel_id,
+ )
self.num_cpu_blocks = args.num_cpu_blocks
cache_type = args.cache_dtype
for i in range(args.num_layers + self.num_extra_layers):
- num_gpu_blocks = args.num_gpu_blocks if i < args.num_layers else \
- self.num_extra_layer_gpu_blocks
+ num_gpu_blocks = args.num_gpu_blocks if i < args.num_layers else self.num_extra_layer_gpu_blocks
- self.gpu_cache_kvs["key_caches_{}_rank{}_device{}".format(
- i, rank, device)] = paddle.full(
- shape=[
- num_gpu_blocks,
- args.kv_num_head,
- args.block_size,
- args.head_dim,
- ],
- fill_value=0,
- dtype=cache_type,
- )
- self.gpu_cache_k_tensors.append(
- self.gpu_cache_kvs["key_caches_{}_rank{}_device{}".format(
- i, rank, device)])
- self.gpu_cache_kvs["value_caches_{}_rank{}_device{}".format(
- i, rank, device)] = paddle.full(
- shape=[
- num_gpu_blocks,
- args.kv_num_head,
- args.block_size,
- args.head_dim,
- ],
- fill_value=0,
- dtype=cache_type,
- )
- self.gpu_cache_v_tensors.append(
- self.gpu_cache_kvs["value_caches_{}_rank{}_device{}".format(
- i, rank, device)])
+ self.gpu_cache_kvs[f"key_caches_{i}_rank{rank}_device{device}"] = paddle.full(
+ shape=[
+ num_gpu_blocks,
+ args.kv_num_head,
+ args.block_size,
+ args.head_dim,
+ ],
+ fill_value=0,
+ dtype=cache_type,
+ )
+ self.gpu_cache_k_tensors.append(self.gpu_cache_kvs[f"key_caches_{i}_rank{rank}_device{device}"])
+ self.gpu_cache_kvs[f"value_caches_{i}_rank{rank}_device{device}"] = paddle.full(
+ shape=[
+ num_gpu_blocks,
+ args.kv_num_head,
+ args.block_size,
+ args.head_dim,
+ ],
+ fill_value=0,
+ dtype=cache_type,
+ )
+ self.gpu_cache_v_tensors.append(self.gpu_cache_kvs[f"value_caches_{i}_rank{rank}_device{device}"])
set_data_ipc(
- self.gpu_cache_kvs["key_caches_{}_rank{}_device{}".format(
- i, rank, device)],
- "key_caches_{}_rank{}.device{}".format(i, rank, device))
+ self.gpu_cache_kvs[f"key_caches_{i}_rank{rank}_device{device}"],
+ f"key_caches_{i}_rank{rank}.device{device}",
+ )
set_data_ipc(
- self.gpu_cache_kvs["value_caches_{}_rank{}_device{}".format(
- i, rank, device)],
- "value_caches_{}_rank{}.device{}".format(i, rank, device))
- cache_kv_size_byte = sum(
- [tmp.numel() * 1 for key, tmp in self.gpu_cache_kvs.items()])
+ self.gpu_cache_kvs[f"value_caches_{i}_rank{rank}_device{device}"],
+ f"value_caches_{i}_rank{rank}.device{device}",
+ )
+ cache_kv_size_byte = sum([tmp.numel() * 1 for key, tmp in self.gpu_cache_kvs.items()])
logger.info(f"device :{self.device}")
logger.info(f"cache_kv_size_byte : {cache_kv_size_byte}")
- logger.info(
- f"done init cache (full) gmem alloc : {paddle.device.cuda.memory_allocated()}"
- )
+ logger.info(f"done init cache (full) gmem alloc : {paddle.device.cuda.memory_allocated()}")
paddle.set_device("cpu")
self.k_dst_ptrs = []
self.v_dst_ptrs = []
for i in range(args.num_layers + self.num_extra_layers):
- self.cpu_cache_kvs["key_caches_{}_rank{}".format(
- i, rank)] = cuda_host_alloc(args.num_cpu_blocks *
- args.bytes_per_layer_per_block)
- self.k_dst_ptrs.append(
- self.cpu_cache_kvs["key_caches_{}_rank{}".format(i, rank)])
- self.cpu_cache_kvs["value_caches_{}_rank{}".format(
- i, rank)] = cuda_host_alloc(args.num_cpu_blocks *
- args.bytes_per_layer_per_block)
- self.v_dst_ptrs.append(
- self.cpu_cache_kvs["value_caches_{}_rank{}".format(i, rank)])
+ self.cpu_cache_kvs[f"key_caches_{i}_rank{rank}"] = cuda_host_alloc(
+ args.num_cpu_blocks * args.bytes_per_layer_per_block
+ )
+ self.k_dst_ptrs.append(self.cpu_cache_kvs[f"key_caches_{i}_rank{rank}"])
+ self.cpu_cache_kvs[f"value_caches_{i}_rank{rank}"] = cuda_host_alloc(
+ args.num_cpu_blocks * args.bytes_per_layer_per_block
+ )
+ self.v_dst_ptrs.append(self.cpu_cache_kvs[f"value_caches_{i}_rank{rank}"])
cache_ready_signal_data = np.zeros(shape=[args.mp_num], dtype=np.int32)
- self.cache_ready_signal = IPCSignal(name="cache_ready_signal",
- array=cache_ready_signal_data,
- dtype=np.int32,
- suffix=args.engine_pid,
- create=False)
+ self.cache_ready_signal = IPCSignal(
+ name="cache_ready_signal",
+ array=cache_ready_signal_data,
+ dtype=np.int32,
+ suffix=args.engine_pid,
+ create=False,
+ )
self.cache_ready_signal.value[self.rank] = 1
paddle.set_device(f"gpu:{device}")
@@ -251,9 +220,7 @@ class CacheTransferManager:
rdma_port=args.rdma_port,
)
logger.info("successfully create cache messager")
- logger.info(
- f"done init CacheMessager gmem alloc : {paddle.device.cuda.memory_allocated()}"
- )
+ logger.info(f"done init CacheMessager gmem alloc : {paddle.device.cuda.memory_allocated()}")
cache_task_broadcast_data = np.zeros(shape=[1], dtype=np.int32)
self.cache_task_broadcast_signal = IPCSignal(
@@ -261,10 +228,17 @@ class CacheTransferManager:
array=cache_task_broadcast_data,
dtype=np.int32,
suffix=args.engine_pid,
- create=False)
+ create=False,
+ )
- def _do_swap_to_cpu_task(self, swap_node_ids, gpu_block_id, cpu_block_id,
- event_type, transfer_task_id):
+ def _do_swap_to_cpu_task(
+ self,
+ swap_node_ids,
+ gpu_block_id,
+ cpu_block_id,
+ event_type,
+ transfer_task_id,
+ ):
"""
swap cache GPU->CPU
"""
@@ -282,14 +256,17 @@ class CacheTransferManager:
if self.rank == 0:
self.cache_task_queue.swap_to_cpu_barrier2.reset()
self.cache_task_queue.put_transfer_done_signal(result)
- logger.debug(
- f"_do_swap_to_cpu_task: put_transfer_done_signal {result}")
- logger.info(
- f"_do_swap_to_cpu_task: put_transfer_done_signal for transfer_task_id {transfer_task_id}"
- )
+ logger.debug(f"_do_swap_to_cpu_task: put_transfer_done_signal {result}")
+ logger.info(f"_do_swap_to_cpu_task: put_transfer_done_signal for transfer_task_id {transfer_task_id}")
- def _do_swap_to_gpu_task(self, swap_node_ids, gpu_block_id, cpu_block_id,
- event_type, transfer_task_id):
+ def _do_swap_to_gpu_task(
+ self,
+ swap_node_ids,
+ gpu_block_id,
+ cpu_block_id,
+ event_type,
+ transfer_task_id,
+ ):
"""
swap cache CPU->GPU
"""
@@ -307,11 +284,8 @@ class CacheTransferManager:
if self.rank == 0:
self.cache_task_queue.swap_to_gpu_barrier2.reset()
self.cache_task_queue.put_transfer_done_signal(result)
- logger.debug(
- f"_do_swap_to_gpu_task: put_transfer_done_signal {result}")
- logger.info(
- f"_do_swap_to_gpu_task: put_transfer_done_signal for transfer_task_id {transfer_task_id}"
- )
+ logger.debug(f"_do_swap_to_gpu_task: put_transfer_done_signal {result}")
+ logger.info(f"_do_swap_to_gpu_task: put_transfer_done_signal for transfer_task_id {transfer_task_id}")
def do_data_transfer(self):
"""
@@ -327,8 +301,7 @@ class CacheTransferManager:
if self.rank == 0:
self.cache_task_queue.barrier1.reset()
if self.cache_task_broadcast_signal.value[0] == 1:
- data, read_finish = self.cache_task_queue.get_transfer_task(
- )
+ data, read_finish = self.cache_task_queue.get_transfer_task()
logger.debug(f"transfer data: get_transfer_task {data}")
if read_finish:
self.cache_task_broadcast_signal.value[0] = 0
@@ -386,8 +359,7 @@ class CacheTransferManager:
"""
logger.debug(
f"transfer data: transfer_task_id {transfer_task_id}: swap_node_ids {swap_node_ids}"
- +
- f"task_gpu_block_id {task_gpu_block_id} task_cpu_block_id {task_cpu_block_id} event_type {event_type}"
+ + f"task_gpu_block_id {task_gpu_block_id} task_cpu_block_id {task_cpu_block_id} event_type {event_type}"
)
start_time = time.time()
try:
@@ -446,8 +418,7 @@ class CacheTransferManager:
elasped_time = end_time - start_time
logger.info(
f"transfer data: transfer_task_id {transfer_task_id} event_type {event_type}: "
- +
- f"transfer {len(gpu_block_ids)} blocks done elapsed_time {elasped_time:.4f}"
+ + f"transfer {len(gpu_block_ids)} blocks done elapsed_time {elasped_time:.4f}"
)
return (
swap_node_ids,
diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py
index 10e463cd0..e64dbb5ae 100644
--- a/fastdeploy/cache_manager/prefix_cache_manager.py
+++ b/fastdeploy/cache_manager/prefix_cache_manager.py
@@ -41,11 +41,13 @@ class PrefixCacheManager:
PrefixCacheManager is used to manage the prefix tree and the cache.
"""
- def __init__(self,
- config,
- tensor_parallel_size,
- splitwise_role="mixed",
- local_data_parallel_id=0):
+ def __init__(
+ self,
+ config,
+ tensor_parallel_size,
+ splitwise_role="mixed",
+ local_data_parallel_id=0,
+ ):
"""
initialize the PrefixCacheManager
"""
@@ -66,14 +68,12 @@ class PrefixCacheManager:
self.num_cpu_blocks = self.cache_config.num_cpu_blocks
self.gpu_free_block_list = list(range(self.num_gpu_blocks - 1, -1, -1))
if self.num_cpu_blocks > 0:
- self.cpu_free_block_list = list(
- range(self.num_cpu_blocks - 1, -1, -1))
+ self.cpu_free_block_list = list(range(self.num_cpu_blocks - 1, -1, -1))
else:
self.cpu_free_block_list = []
heapq.heapify(self.gpu_free_block_list)
heapq.heapify(self.cpu_free_block_list)
- self.node_id_pool = list(
- range(self.num_gpu_blocks + self.num_cpu_blocks))
+ self.node_id_pool = list(range(self.num_gpu_blocks + self.num_cpu_blocks))
self.radix_tree_root = BlockNode(-1, [], 0, 0, -1, 0, None, None, None)
@@ -90,7 +90,7 @@ class PrefixCacheManager:
self.task_swapping_event = {}
self.node_map = {}
- self.req_leaf_map = ({}) # {request_id: leaf node}
+ self.req_leaf_map = {} # {request_id: leaf node}
self.leaf_req_map = defaultdict(set)
self.unfilled_req_block_map = defaultdict(list)
@@ -102,14 +102,18 @@ class PrefixCacheManager:
logger.info(
f"num_gpu_blocks_server_owned {self.num_gpu_blocks} num_cpu_blocks "
- +
- f"{self.num_cpu_blocks}, bytes_per_layer_per_block {self.cache_config.bytes_per_layer_per_block}"
+ + f"{self.num_cpu_blocks}, bytes_per_layer_per_block {self.cache_config.bytes_per_layer_per_block}"
)
-
-
- def launch_cache_manager(self, cache_config, tensor_parallel_size, \
- device_ids, pod_ip, engine_worker_queue_port, pid_suffix):
+ def launch_cache_manager(
+ self,
+ cache_config,
+ tensor_parallel_size,
+ device_ids,
+ pod_ip,
+ engine_worker_queue_port,
+ pid_suffix,
+ ):
"""
launch_cache_manager function used to initialize the cache manager.
"""
@@ -120,70 +124,72 @@ class PrefixCacheManager:
array=broadcast_cache_task_flag_array,
dtype=np.int32,
suffix=pid_suffix,
- create=True)
+ create=True,
+ )
self.cache_task_queue = EngineCacheQueue(
address=(pod_ip, cache_config.cache_queue_port),
- authkey=b'cache_queue_service',
+ authkey=b"cache_queue_service",
is_server=False,
num_client=tensor_parallel_size,
client_id=0,
- local_data_parallel_id=self.local_data_parallel_id)
+ local_data_parallel_id=self.local_data_parallel_id,
+ )
current_dir_path = os.path.split(os.path.abspath(__file__))[0]
filename = "cache_transfer_manager.py"
py_path = os.path.join(current_dir_path, filename)
- if (hasattr(cache_config.model_cfg, "num_key_value_heads")
- and hasattr(cache_config.model_cfg, "num_key_value_heads")
- and cache_config.model_cfg.num_key_value_heads is not None
- and int(cache_config.model_cfg.num_key_value_heads) > 0):
- kv_num_head = int(cache_config.model_cfg.num_key_value_heads
- ) // tensor_parallel_size
+ if (
+ hasattr(cache_config.model_cfg, "num_key_value_heads")
+ and hasattr(cache_config.model_cfg, "num_key_value_heads")
+ and cache_config.model_cfg.num_key_value_heads is not None
+ and int(cache_config.model_cfg.num_key_value_heads) > 0
+ ):
+ kv_num_head = int(cache_config.model_cfg.num_key_value_heads) // tensor_parallel_size
else:
kv_num_head = cache_config.model_cfg.num_attention_heads // tensor_parallel_size
- cache_ready_signal_data = np.zeros(shape=[tensor_parallel_size],
- dtype=np.int32)
- self.cache_ready_signal = IPCSignal(name="cache_ready_signal",
- array=cache_ready_signal_data,
- dtype=np.int32,
- suffix=pid_suffix,
- create=True)
+ cache_ready_signal_data = np.zeros(shape=[tensor_parallel_size], dtype=np.int32)
+ self.cache_ready_signal = IPCSignal(
+ name="cache_ready_signal",
+ array=cache_ready_signal_data,
+ dtype=np.int32,
+ suffix=pid_suffix,
+ create=True,
+ )
log_dir = envs.FD_LOG_DIR
cache_manager_processes = []
for i in range(tensor_parallel_size):
launch_cmd = (
"FLAGS_allocator_strategy=auto_growth CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7"
- + " NCCL_MAX_NCHANNELS=1 NCCL_BUFFSIZE=0" +
- f" {sys.executable} {py_path}" +
- f" --device_id {int(device_ids[i])}" + f" --rank {i}" +
- f" --splitwise_role {self.splitwise_role}" +
- f" --num_layers {cache_config.model_cfg.num_layers}" +
- f" --head_dim {cache_config.model_cfg.head_dim}" +
- f" --kv_num_head {kv_num_head}" +
- f" --mp_num {tensor_parallel_size}" +
- f" --cache_dtype {cache_config.cache_dtype}" +
- f" --cache_queue_port {cache_config.cache_queue_port}" +
- f" --enable_splitwise {int(self.enable_splitwise)}" +
- f" --pod_ip {pod_ip}" +
- f" --engine_worker_queue_port {engine_worker_queue_port}" +
- f" --num_gpu_blocks {cache_config.total_block_num}" +
- f" --num_cpu_blocks {cache_config.num_cpu_blocks}" +
- f" --bytes_per_layer_per_block {cache_config.bytes_per_layer_per_block}"
- + f" --block_size {cache_config.block_size}" +
- f" --engine_pid {pid_suffix}" +
- f" --protocol {cache_config.cache_transfer_protocol}" +
- f" --local_data_parallel_id {self.local_data_parallel_id}" +
- f" --rdma_port {cache_config.rdma_comm_ports[i] if cache_config.rdma_comm_ports is not None else '0'}"
- +
- f" --speculative_config '{self.speculative_config.to_json_string()}'"
- +
- f" >{log_dir}/launch_cache_manager_{int(device_ids[i])}.log 2>&1"
+ + " NCCL_MAX_NCHANNELS=1 NCCL_BUFFSIZE=0"
+ + f" {sys.executable} {py_path}"
+ + f" --device_id {int(device_ids[i])}"
+ + f" --rank {i}"
+ + f" --splitwise_role {self.splitwise_role}"
+ + f" --num_layers {cache_config.model_cfg.num_layers}"
+ + f" --head_dim {cache_config.model_cfg.head_dim}"
+ + f" --kv_num_head {kv_num_head}"
+ + f" --mp_num {tensor_parallel_size}"
+ + f" --cache_dtype {cache_config.cache_dtype}"
+ + f" --cache_queue_port {cache_config.cache_queue_port}"
+ + f" --enable_splitwise {int(self.enable_splitwise)}"
+ + f" --pod_ip {pod_ip}"
+ + f" --engine_worker_queue_port {engine_worker_queue_port}"
+ + f" --num_gpu_blocks {cache_config.total_block_num}"
+ + f" --num_cpu_blocks {cache_config.num_cpu_blocks}"
+ + f" --bytes_per_layer_per_block {cache_config.bytes_per_layer_per_block}"
+ + f" --block_size {cache_config.block_size}"
+ + f" --engine_pid {pid_suffix}"
+ + f" --protocol {cache_config.cache_transfer_protocol}"
+ + f" --local_data_parallel_id {self.local_data_parallel_id}"
+ + f" --rdma_port {cache_config.rdma_comm_ports[i] if cache_config.rdma_comm_ports is not None else '0'}"
+ + f" --speculative_config '{self.speculative_config.to_json_string()}'"
+ + f" >{log_dir}/launch_cache_manager_{int(device_ids[i])}.log 2>&1"
)
logger.info(f"Launch cache transfer manager, command:{launch_cmd}")
- cache_manager_processes.append(
- subprocess.Popen(launch_cmd, shell=True, preexec_fn=os.setsid))
+ cache_manager_processes.append(subprocess.Popen(launch_cmd, shell=True, preexec_fn=os.setsid))
# 等待cache初始化完毕
logger.info("Waiting for cache transfer manager ready...")
while np.sum(self.cache_ready_signal.value) != tensor_parallel_size:
@@ -192,9 +198,7 @@ class PrefixCacheManager:
if exit_code is None:
logger.info("Launch cache transfer manager successful")
else:
- logger.info(
- "Launch cache transfer manager failed, see launch_cache_manager.log for more information"
- )
+ logger.info("Launch cache transfer manager failed, see launch_cache_manager.log for more information")
if cache_config.enable_hierarchical_cache and self.num_cpu_blocks > 0:
logger.info("Enable hierarchical cache.")
@@ -207,12 +211,10 @@ class PrefixCacheManager:
"""
self.cache_config = cache_config
self.num_gpu_blocks = cache_config.prefill_kvcache_block_num
- self.gpu_free_block_list = list(range(self.num_gpu_blocks - 1, -1,
- -1)) # 服务端管理的GPU上剩余的block id
+        self.gpu_free_block_list = list(range(self.num_gpu_blocks - 1, -1, -1))  # free GPU block ids held by the server
heapq.heapify(self.gpu_free_block_list)
- self.node_id_pool = list(
- range(self.num_gpu_blocks + self.num_cpu_blocks))
+ self.node_id_pool = list(range(self.num_gpu_blocks + self.num_cpu_blocks))
def _enable_cpu_cache(self):
"""
@@ -226,8 +228,7 @@ class PrefixCacheManager:
# port=ipc_cache_queue_port,
# )
# 开启获取传输任务结果的监听线程
- self.transfer_recv_thread = threading.Thread(
- target=self.recv_data_transfer_result)
+ self.transfer_recv_thread = threading.Thread(target=self.recv_data_transfer_result)
self.transfer_recv_thread.start()
def allocate_gpu_blocks(self, num_blocks):
@@ -237,9 +238,7 @@ class PrefixCacheManager:
assert num_blocks <= len(
self.gpu_free_block_list
), f"gpu free block num: {len(self.gpu_free_block_list)} < needed number {num_blocks}"
- allocated_block_ids = [
- heapq.heappop(self.gpu_free_block_list) for i in range(num_blocks)
- ]
+ allocated_block_ids = [heapq.heappop(self.gpu_free_block_list) for i in range(num_blocks)]
logger.info(
f"allocate_gpu_blocks: {allocated_block_ids}, len(self.gpu_free_block_list) {len(self.gpu_free_block_list)}"
)
@@ -265,9 +264,7 @@ class PrefixCacheManager:
assert num_blocks <= len(
self.cpu_free_block_list
), f"cpu free block num: {len(self.cpu_free_block_list)} < needed number {num_blocks}"
- allocated_block_ids = [
- heapq.heappop(self.cpu_free_block_list) for i in range(num_blocks)
- ]
+ allocated_block_ids = [heapq.heappop(self.cpu_free_block_list) for i in range(num_blocks)]
logger.info(
f"allocate_cpu_blocks: {allocated_block_ids}, len(self.cpu_free_block_list) {len(self.cpu_free_block_list)}"
)
@@ -307,16 +304,17 @@ class PrefixCacheManager:
"""
self.task_swapping_event[transfer_task_id] = Event()
- self.cache_task_queue.put_transfer_task((
- swap_node_ids,
- gpu_block_ids,
- cpu_block_ids,
- event_type,
- transfer_task_id,
- ))
+ self.cache_task_queue.put_transfer_task(
+ (
+ swap_node_ids,
+ gpu_block_ids,
+ cpu_block_ids,
+ event_type,
+ transfer_task_id,
+ )
+ )
if is_sync:
self.sync_swap_task(transfer_task_id)
- return
def sync_swap_task(self, transfer_task_id):
"""
@@ -325,26 +323,27 @@ class PrefixCacheManager:
self.task_swapping_event[transfer_task_id].wait()
del self.task_swapping_event[transfer_task_id]
- def _check_validity(self, req_id, match_gpu_blocks_num,
- expected_block_num):
+ def _check_validity(self, req_id, match_gpu_blocks_num, expected_block_num):
"""
check enough gpu memory to allocate cache
"""
- if expected_block_num - match_gpu_blocks_num > len(
- self.gpu_free_block_list):
+ if expected_block_num - match_gpu_blocks_num > len(self.gpu_free_block_list):
msg = (
f"request_block_ids: request block for req_id {req_id} failed. "
- +
- f"matched gpu block num: {match_gpu_blocks_num} require extra gpu block num: "
- +
- f"{expected_block_num - match_gpu_blocks_num} > free block num: {len(self.gpu_free_block_list)}"
+ + f"matched gpu block num: {match_gpu_blocks_num} require extra gpu block num: "
+ + f"{expected_block_num - match_gpu_blocks_num} > free block num: {len(self.gpu_free_block_list)}"
)
logger.info(msg)
raise Exception("Not enough GPU memory to allocate cache")
-
- def _prepare_cpu_cache(self, req_id, swap_node_ids, gpu_recv_block_ids, \
- cpu_recv_block_ids, match_cpu_block_ids):
+ def _prepare_cpu_cache(
+ self,
+ req_id,
+ swap_node_ids,
+ gpu_recv_block_ids,
+ cpu_recv_block_ids,
+ match_cpu_block_ids,
+ ):
"""
将cpu cache转移到GPU
"""
@@ -357,11 +356,8 @@ class PrefixCacheManager:
for tmp_cpu_block_id in match_cpu_block_ids:
need_transfer_task_cpu_block_ids.append(tmp_cpu_block_id)
- assert len(need_transfer_task_gpu_block_ids) == len(
- need_transfer_task_cpu_block_ids)
- logger.info(
- f"request_block_ids: req_id {req_id} issue_swap_task transfer_task_id {transfer_task_id}"
- )
+ assert len(need_transfer_task_gpu_block_ids) == len(need_transfer_task_cpu_block_ids)
+ logger.info(f"request_block_ids: req_id {req_id} issue_swap_task transfer_task_id {transfer_task_id}")
self.issue_swap_task(
transfer_task_id,
swap_node_ids,
@@ -371,8 +367,16 @@ class PrefixCacheManager:
True,
)
- def _prepare_cache(self, req_id, input_ids, block_size, \
- expected_block_num, match_gpu_block_ids, match_cpu_block_ids, match_node_ids):
+ def _prepare_cache(
+ self,
+ req_id,
+ input_ids,
+ block_size,
+ expected_block_num,
+ match_gpu_block_ids,
+ match_cpu_block_ids,
+ match_node_ids,
+ ):
"""
prepare cache for request
"""
@@ -394,26 +398,31 @@ class PrefixCacheManager:
gpu_extra_block_ids = self.allocate_gpu_blocks(gpu_extra_block_num)
if len(gpu_recv_block_ids) > 0:
- self._prepare_cpu_cache(req_id, match_node_ids, gpu_recv_block_ids, \
- cpu_recv_block_ids, match_cpu_block_ids)
+ self._prepare_cpu_cache(
+ req_id,
+ match_node_ids,
+ gpu_recv_block_ids,
+ cpu_recv_block_ids,
+ match_cpu_block_ids,
+ )
return gpu_recv_block_ids, gpu_extra_block_ids
def request_block_ids(self, task, block_size, dec_token_num, *args):
"""
- Allocate blocks for a task.
- This is a synchronous interface. If CPU-to-GPU data transfer occurs,
- it will block until synchronization completes.
- Callers requiring asynchronous behavior should invoke this via a thread pool.
+ Allocate blocks for a task.
+ This is a synchronous interface. If CPU-to-GPU data transfer occurs,
+ it will block until synchronization completes.
+ Callers requiring asynchronous behavior should invoke this via a thread pool.
- Parameters:
- - task: Task dictionary
- - block_size: Size per block (in tokens)
- - dec_token_num: Number of tokens reserved for decoding on the server side
+ Parameters:
+ - task: Task dictionary
+ - block_size: Size per block (in tokens)
+ - dec_token_num: Number of tokens reserved for decoding on the server side
- Returns:
- - common_block_ids: List of matched shared blocks
- - unique_block_ids: List of exclusively allocated blocks
+ Returns:
+ - common_block_ids: List of matched shared blocks
+ - unique_block_ids: List of exclusively allocated blocks
"""
with self.request_release_lock:
try:
@@ -423,9 +432,7 @@ class PrefixCacheManager:
self.metrics.req_count += 1
input_ids = task.prompt_token_ids
req_id = task.request_id
- logger.info(
- f"request_block_ids: start to allocate blocks for req_id {req_id}"
- )
+ logger.info(f"request_block_ids: start to allocate blocks for req_id {req_id}")
input_token_num = len(input_ids)
common_block_ids = []
unique_block_ids = []
@@ -443,34 +450,43 @@ class PrefixCacheManager:
matched_block_num = match_gpu_blocks_num + match_cpu_blocks_num
matched_token_num_in_cpu_and_gpu = gpu_match_token_num + cpu_match_token_num
# check enough gpu memory to allocate cache
- block_num = (input_token_num + block_size - 1 +
- dec_token_num) // block_size
+ block_num = (input_token_num + block_size - 1 + dec_token_num) // block_size
self._check_validity(req_id, matched_block_num, block_num)
# update matched node info
current_time = time.time()
- self._update_matched_node_info(req_id, match_block_node,
- current_time)
+ self._update_matched_node_info(req_id, match_block_node, current_time)
# 2. prepare cache
- gpu_recv_block_ids, gpu_extra_block_ids, = self._prepare_cache(req_id, \
- input_ids, block_size, block_num, match_gpu_block_ids, match_cpu_block_ids, swap_node_ids)
+ (gpu_recv_block_ids, gpu_extra_block_ids,) = self._prepare_cache(
+ req_id,
+ input_ids,
+ block_size,
+ block_num,
+ match_gpu_block_ids,
+ match_cpu_block_ids,
+ swap_node_ids,
+ )
# update matched token num
- matched_block_num = (gpu_match_token_num + cpu_match_token_num)
+ matched_block_num = gpu_match_token_num + cpu_match_token_num
common_block_ids = match_gpu_block_ids + gpu_recv_block_ids
unique_block_ids = gpu_extra_block_ids
dec_block_num = dec_token_num // block_size
- left_input_ids = input_ids[
- matched_token_num_in_cpu_and_gpu:] # 没在前缀树中的token
+            left_input_ids = input_ids[matched_token_num_in_cpu_and_gpu:]  # tokens not yet in the prefix tree
gpu_build_path_block_ids = []
gpu_build_path_block_ids = gpu_extra_block_ids
- leaf_node = self.build_path(req_id, current_time, input_ids,
- left_input_ids,
- gpu_build_path_block_ids,
- block_size, match_block_node,
- dec_block_num)
+ leaf_node = self.build_path(
+ req_id,
+ current_time,
+ input_ids,
+ left_input_ids,
+ gpu_build_path_block_ids,
+ block_size,
+ match_block_node,
+ dec_block_num,
+ )
self.req_leaf_map[req_id] = leaf_node
self.leaf_req_map[leaf_node].add(req_id)
# 3. update metrics
@@ -482,17 +498,15 @@ class PrefixCacheManager:
gpu_match_token_num,
input_token_num,
)
- hit_info[
- "gpu_cache_blocks"] = gpu_match_token_num // block_size
- hit_info[
- "cpu_cache_blocks"] = cpu_match_token_num // block_size
+ hit_info["gpu_cache_blocks"] = gpu_match_token_num // block_size
+ hit_info["cpu_cache_blocks"] = cpu_match_token_num // block_size
self.metrics._update_history_hit_metrics()
if self.metrics.req_count % 10000 == 0:
self.metrics.reset_metrics()
logger.info(
f"request_block_ids: request block for req_id {req_id}: common_block_ids "
- +
- f"{common_block_ids}, unique_block_ids {unique_block_ids}")
+ + f"{common_block_ids}, unique_block_ids {unique_block_ids}"
+ )
return common_block_ids, unique_block_ids, hit_info
except Exception as e:
logger.error(f"request_block_ids: error: {type(e)} {e}")
@@ -523,25 +537,21 @@ class PrefixCacheManager:
node.decrement_shared_count()
node = node.parent
- logger.info(
- f"release_block_ids: req_id {req_id} leaf_node {leaf_node}"
- )
+ logger.info(f"release_block_ids: req_id {req_id} leaf_node {leaf_node}")
if leaf_node == self.radix_tree_root:
- self.recycle_gpu_blocks(
- self.unfilled_req_block_map[req_id])
+ self.recycle_gpu_blocks(self.unfilled_req_block_map[req_id])
del self.unfilled_req_block_map[req_id]
return
if leaf_node in self.gpu_lru_leaf_set:
return
- if (leaf_node.shared_count == 0 and leaf_node.is_gpu_leaf_node
- and leaf_node.is_persistent is False):
+ if leaf_node.shared_count == 0 and leaf_node.is_gpu_leaf_node and leaf_node.is_persistent is False:
self.gpu_lru_leaf_set.add(leaf_node)
heapq.heappush(self.gpu_lru_leaf_heap, leaf_node)
logger.info(
- f"release_block_ids: req_id {req_id} has been finished, " +
- f"current gpu_lru_leaf_heap length {len(self.gpu_lru_leaf_heap)}"
+ f"release_block_ids: req_id {req_id} has been finished, "
+ + f"current gpu_lru_leaf_heap length {len(self.gpu_lru_leaf_heap)}"
)
return
except Exception as e:
@@ -563,8 +573,15 @@ class PrefixCacheManager:
node.reverved_dec_block_ids = []
self.recycle_gpu_blocks(node.block_id)
- def _handle_free_gpu_node_with_cpu(self, node, hash_value_input_ids_map, \
- hash_value_depth_map, need_recycle_gpu_block_ids, hash_value_gpu_block_ids_map, hash_value_swap_node_ids_map):
+ def _handle_free_gpu_node_with_cpu(
+ self,
+ node,
+ hash_value_input_ids_map,
+ hash_value_depth_map,
+ need_recycle_gpu_block_ids,
+ hash_value_gpu_block_ids_map,
+ hash_value_swap_node_ids_map,
+ ):
"""
GPU node eviction in hierarchical cache layers
"""
@@ -573,14 +590,19 @@ class PrefixCacheManager:
node.reverved_dec_block_ids = []
need_recycle_gpu_block_ids.append(node.block_id)
- hash_value_gpu_block_ids_map[node.input_hash_value].append(
- node.block_id)
- hash_value_swap_node_ids_map[node.input_hash_value].append(
- node.node_id)
+ hash_value_gpu_block_ids_map[node.input_hash_value].append(node.block_id)
+ hash_value_swap_node_ids_map[node.input_hash_value].append(node.node_id)
- def _evict_cache_async(self, future, total_gpu_free_count, \
- hash_value_gpu_block_ids_map, hash_value_block_ids_map, \
- hash_value_swap_node_ids_map, hash_value_input_ids_map, hash_value_depth_map):
+ def _evict_cache_async(
+ self,
+ future,
+ total_gpu_free_count,
+ hash_value_gpu_block_ids_map,
+ hash_value_block_ids_map,
+ hash_value_swap_node_ids_map,
+ hash_value_input_ids_map,
+ hash_value_depth_map,
+ ):
"""
evict cache async (GPU --> CPU)
"""
@@ -592,23 +614,21 @@ class PrefixCacheManager:
need_transfer_task_cpu_block_ids = []
cpu_block_ids = self.allocate_cpu_blocks(total_gpu_free_count)
for input_hash_value in hash_value_gpu_block_ids_map.keys():
- need_transfer_task_gpu_block_ids.extend(
- reversed(hash_value_gpu_block_ids_map[input_hash_value]))
+ need_transfer_task_gpu_block_ids.extend(reversed(hash_value_gpu_block_ids_map[input_hash_value]))
all_allocated_cpu_block_ids = []
for _ in reversed(hash_value_gpu_block_ids_map[input_hash_value]):
cpu_block_id_t = cpu_block_ids.pop(0)
all_allocated_cpu_block_ids.append(cpu_block_id_t)
need_transfer_task_cpu_block_ids.append(cpu_block_id_t)
- swap_node_ids.extend(
- reversed(hash_value_swap_node_ids_map[input_hash_value]))
+ swap_node_ids.extend(reversed(hash_value_swap_node_ids_map[input_hash_value]))
logger.info(
- "free_block_ids_async: issue transfer task: " +
- f"transfer_task_id {transfer_task_id}: " +
- f"swap_node_ids {swap_node_ids} need_transfer_task_gpu_block_ids "
- +
- f"{need_transfer_task_gpu_block_ids}, need_transfer_task_cpu_block_ids "
- + f"{need_transfer_task_cpu_block_ids}, CacheStatus.SWAP2CPU")
+ "free_block_ids_async: issue transfer task: "
+ + f"transfer_task_id {transfer_task_id}: "
+ + f"swap_node_ids {swap_node_ids} need_transfer_task_gpu_block_ids "
+ + f"{need_transfer_task_gpu_block_ids}, need_transfer_task_cpu_block_ids "
+ + f"{need_transfer_task_cpu_block_ids}, CacheStatus.SWAP2CPU"
+ )
self.issue_swap_task(
transfer_task_id,
swap_node_ids,
@@ -619,9 +639,8 @@ class PrefixCacheManager:
)
logger.info(
- "free_block_ids_async: after free, " +
- f"len(self.gpu_free_block_list) {len(self.gpu_free_block_list)}")
- return
+ "free_block_ids_async: after free, " + f"len(self.gpu_free_block_list) {len(self.gpu_free_block_list)}"
+ )
def free_block_ids_async(self, need_block_num):
"""
@@ -654,8 +673,10 @@ class PrefixCacheManager:
break
node = heapq.heappop(self.gpu_lru_leaf_heap)
self.gpu_lru_leaf_set.remove(node)
- if not self.cache_config.enable_hierarchical_cache or \
- self.cache_config.num_cpu_blocks < need_block_num:
+ if (
+ not self.cache_config.enable_hierarchical_cache
+ or self.cache_config.num_cpu_blocks < need_block_num
+ ):
if node.shared_count == 0 and node.is_gpu_leaf_node: # 直接回收
self._handle_free_gpu_node_without_cpu(node)
total_gpu_free_count += 1
@@ -666,12 +687,13 @@ class PrefixCacheManager:
if not node.children:
if node in self.gpu_lru_leaf_set:
continue
- if (node != self.radix_tree_root
- and node.shared_count == 0
- and node.is_gpu_leaf_node
- and node.is_persistent is False):
- heapq.heappush(self.gpu_lru_leaf_heap,
- node)
+ if (
+ node != self.radix_tree_root
+ and node.shared_count == 0
+ and node.is_gpu_leaf_node
+ and node.is_persistent is False
+ ):
+ heapq.heappush(self.gpu_lru_leaf_heap, node)
self.gpu_lru_leaf_set.add(node)
else:
continue
@@ -680,18 +702,25 @@ class PrefixCacheManager:
node.cache_status = CacheStatus.SWAP2CPU
else:
continue
- self._handle_free_gpu_node_with_cpu(node, hash_value_input_ids_map, \
- hash_value_depth_map, need_recycle_gpu_block_ids, \
- hash_value_gpu_block_ids_map, hash_value_swap_node_ids_map)
+ self._handle_free_gpu_node_with_cpu(
+ node,
+ hash_value_input_ids_map,
+ hash_value_depth_map,
+ need_recycle_gpu_block_ids,
+ hash_value_gpu_block_ids_map,
+ hash_value_swap_node_ids_map,
+ )
total_gpu_free_count += 1
node = node.parent
if node in self.gpu_lru_leaf_set:
continue
- if (node != self.radix_tree_root
- and node.shared_count == 0
- and node.is_gpu_leaf_node
- and node.is_persistent is False):
+ if (
+ node != self.radix_tree_root
+ and node.shared_count == 0
+ and node.is_gpu_leaf_node
+ and node.is_persistent is False
+ ):
heapq.heappush(self.gpu_lru_leaf_heap, node)
self.gpu_lru_leaf_set.add(node)
@@ -702,12 +731,16 @@ class PrefixCacheManager:
cpu_free_count = total_gpu_free_count
if cpu_free_count < need_block_num:
cpu_free_count = need_block_num
- cpu_free_future = self.free_cpu_executor_pool.submit(
- self.free_cpu_block_ids, cpu_free_count)
+ cpu_free_future = self.free_cpu_executor_pool.submit(self.free_cpu_block_ids, cpu_free_count)
self.gpu_free_task_future = self.free_gpu_executor_pool.submit(
- self._evict_cache_async, cpu_free_future, total_gpu_free_count, \
- hash_value_gpu_block_ids_map, hash_value_block_ids_map, \
- hash_value_swap_node_ids_map, hash_value_input_ids_map, hash_value_depth_map
+ self._evict_cache_async,
+ cpu_free_future,
+ total_gpu_free_count,
+ hash_value_gpu_block_ids_map,
+ hash_value_block_ids_map,
+ hash_value_swap_node_ids_map,
+ hash_value_input_ids_map,
+ hash_value_depth_map,
)
else:
self.gpu_free_task_future = None
@@ -717,17 +750,14 @@ class PrefixCacheManager:
def free_cpu_block_ids(self, need_block_num):
"""
- Evict CPU blocks (at least need_block_num blocks)
- Parameters:
- - need_block_num: Number of CPU blocks required to evict
+ Evict CPU blocks (at least need_block_num blocks)
+ Parameters:
+ - need_block_num: Number of CPU blocks required to evict
- Returns:
- - freed_block_num: Number of CPU blocks successfully evicted
+ Returns:
+ - freed_block_num: Number of CPU blocks successfully evicted
"""
- hash_value_input_ids_map = {}
hash_value_block_ids_map = defaultdict(list)
- hash_value_depth_map = {}
- need_recycle_cpu_block_ids = []
total_cpu_free_count = 0
with self.request_release_lock:
while True:
@@ -739,13 +769,10 @@ class PrefixCacheManager:
node = heapq.heappop(self.cpu_lru_leaf_heap)
self.cpu_lru_leaf_set.remove(node)
tmp_block_ids = []
- if (node.shared_count == 0
- and node.cache_status == CacheStatus.CPU
- and node.is_cpu_leaf_node):
+ if node.shared_count == 0 and node.cache_status == CacheStatus.CPU and node.is_cpu_leaf_node:
self.recycle_cpu_blocks(node.block_id)
- hash_value_block_ids_map[node.input_hash_value].extend(
- reversed(tmp_block_ids))
+ hash_value_block_ids_map[node.input_hash_value].extend(reversed(tmp_block_ids))
logger.info(f"free_cpu_block_ids: free node {node}")
self.node_id_pool.append(node.node_id)
@@ -759,15 +786,17 @@ class PrefixCacheManager:
if not node.children:
if node in self.cpu_lru_leaf_set:
continue
- if (node != self.radix_tree_root
- and node.shared_count == 0
- and node.is_cpu_leaf_node
- and node.cache_status == CacheStatus.CPU):
+ if (
+ node != self.radix_tree_root
+ and node.shared_count == 0
+ and node.is_cpu_leaf_node
+ and node.cache_status == CacheStatus.CPU
+ ):
heapq.heappush(self.cpu_lru_leaf_heap, node)
self.cpu_lru_leaf_set.add(node)
logger.info(
- "free_cpu_block_ids: after free, " +
- f"len(self.cpu_free_block_list) {len(self.cpu_free_block_list)}")
+ "free_cpu_block_ids: after free, " + f"len(self.cpu_free_block_list) {len(self.cpu_free_block_list)}"
+ )
return total_cpu_free_count
def cal_block_hash(self, block):
@@ -778,18 +807,18 @@ class PrefixCacheManager:
def match_block(self, req_id, input_ids, block_size):
"""
- Args:
- req_id: Task request ID
- input_ids: Input token IDs
- block_size: Size of each block
+ Args:
+ req_id: Task request ID
+ input_ids: Input token IDs
+ block_size: Size of each block
- Returns:
- match_gpu_block_ids: List of matched GPU block IDs
- match_cpu_block_ids: List of matched CPU block IDs
- swap_node_ids: List of node IDs requiring swap operations
- match_block_node: Last matched node in the path
- gpu_match_token_num: Number of tokens matched in GPU blocks
- cpu_match_token_num: Number of tokens matched in CPU blocks
+ Returns:
+ match_gpu_block_ids: List of matched GPU block IDs
+ match_cpu_block_ids: List of matched CPU block IDs
+ swap_node_ids: List of node IDs requiring swap operations
+ match_block_node: Last matched node in the path
+ gpu_match_token_num: Number of tokens matched in GPU blocks
+ cpu_match_token_num: Number of tokens matched in CPU blocks
"""
total_token_num = len(input_ids)
@@ -807,8 +836,7 @@ class PrefixCacheManager:
with self.cache_status_lock:
while match_token_num < total_token_num:
- token_block = input_ids[match_token_num:match_token_num +
- block_size]
+ token_block = input_ids[match_token_num : match_token_num + block_size]
token_num = len(token_block)
if token_num != block_size:
break
@@ -817,11 +845,11 @@ class PrefixCacheManager:
child = current_match_node.children[hash_value]
matche_nodes.append(child)
match_node_ids.append(child.node_id)
- if (child in self.gpu_lru_leaf_set):
+ if child in self.gpu_lru_leaf_set:
self.gpu_lru_leaf_set.remove(child)
self.gpu_lru_leaf_heap.remove(child)
has_modified_gpu_lru_leaf_heap = True
- elif (child in self.cpu_lru_leaf_set):
+ elif child in self.cpu_lru_leaf_set:
self.cpu_lru_leaf_set.remove(child)
self.cpu_lru_leaf_heap.remove(child)
has_modified_cpu_lru_leaf_heap = True
@@ -831,8 +859,9 @@ class PrefixCacheManager:
else:
if child.cache_status == CacheStatus.SWAP2CPU:
logger.info(
- f"match_block: req_id {req_id} matched node" +
- f" {child.node_id} which is being SWAP2CPU")
+ f"match_block: req_id {req_id} matched node"
+ + f" {child.node_id} which is being SWAP2CPU"
+ )
child.cache_status = CacheStatus.GPU
match_gpu_block_ids.append(child.block_id)
gpu_match_token_num += block_size
@@ -851,8 +880,7 @@ class PrefixCacheManager:
if has_modified_cpu_lru_leaf_heap:
heapq.heapify(self.cpu_lru_leaf_heap)
- logger.info(
- f"match_block: req_id {req_id} matched nodes: {match_node_ids}")
+ logger.info(f"match_block: req_id {req_id} matched nodes: {match_node_ids}")
return (
match_gpu_block_ids,
match_cpu_block_ids,
@@ -873,9 +901,17 @@ class PrefixCacheManager:
node.req_id_set.add(req_id)
node = node.parent
- def build_path(self, req_id, current_time, input_ids, left_input_ids,
- gpu_block_ids, block_size, last_node,
- reverved_dec_block_num):
+ def build_path(
+ self,
+ req_id,
+ current_time,
+ input_ids,
+ left_input_ids,
+ gpu_block_ids,
+ block_size,
+ last_node,
+ reverved_dec_block_num,
+ ):
"""
Build path for blocks beyond the common prefix
Parameters:
@@ -906,7 +942,7 @@ class PrefixCacheManager:
has_unfilled_block = False
for i in range(0, token_num, block_size):
- current_block = left_input_ids[i:i + block_size]
+ current_block = left_input_ids[i : i + block_size]
current_block_size = len(current_block) # 最后一个block可能没填满
if current_block_size != block_size:
has_unfilled_block = True
@@ -915,17 +951,19 @@ class PrefixCacheManager:
allocated_block_id = gpu_block_ids.pop(0)
node_id = self.node_id_pool.pop()
unique_node_ids.append(node_id)
- new_last_node = BlockNode(node_id,
- input_ids,
- input_hash_value,
- node.depth + 1,
- allocated_block_id,
- current_block_size,
- hash_value,
- current_time,
- parent=node,
- shared_count=1,
- reverved_dec_block_ids=[])
+ new_last_node = BlockNode(
+ node_id,
+ input_ids,
+ input_hash_value,
+ node.depth + 1,
+ allocated_block_id,
+ current_block_size,
+ hash_value,
+ current_time,
+ parent=node,
+ shared_count=1,
+ reverved_dec_block_ids=[],
+ )
new_last_node.req_id_set.add(req_id)
self.node_map[node_id] = new_last_node
node.children[hash_value] = new_last_node
@@ -939,46 +977,44 @@ class PrefixCacheManager:
self.unfilled_req_block_map[req_id] = reverved_dec_block_ids
else:
new_last_node.reverved_dec_block_ids.extend(reverved_dec_block_ids)
- logger.info(
- f"build_path: allocate unique node ids {unique_node_ids} for req_id {req_id}"
- )
+ logger.info(f"build_path: allocate unique node ids {unique_node_ids} for req_id {req_id}")
return new_last_node
- def _handle_swap_result(self, swap_node_id, task_gpu_block_id,
- task_cpu_block_id, event_type):
+ def _handle_swap_result(self, swap_node_id, task_gpu_block_id, task_cpu_block_id, event_type):
"""
        handle swap result
"""
if swap_node_id is None:
return
with self.cache_status_lock:
- if (event_type.value == CacheStatus.SWAP2CPU.value):
+ if event_type.value == CacheStatus.SWAP2CPU.value:
gpu_block_id = task_gpu_block_id
cpu_block_id = task_cpu_block_id
node = self.node_map[swap_node_id]
if node.cache_status.value == CacheStatus.GPU.value:
logger.info(
- f"recv_data_transfer_result: node {node.node_id} " +
- f"has been reused when SWAP2CPU, recycle cpu block id {cpu_block_id}"
+ f"recv_data_transfer_result: node {node.node_id} "
+ + f"has been reused when SWAP2CPU, recycle cpu block id {cpu_block_id}"
)
self.recycle_cpu_blocks(cpu_block_id)
else:
node.cache_status = CacheStatus.CPU
node.block_id = cpu_block_id
- if (node != self.radix_tree_root and node.shared_count == 0
- and node.is_cpu_leaf_node
- and node.cache_status == CacheStatus.CPU):
+ if (
+ node != self.radix_tree_root
+ and node.shared_count == 0
+ and node.is_cpu_leaf_node
+ and node.cache_status == CacheStatus.CPU
+ ):
if node not in self.cpu_lru_leaf_set:
heapq.heappush(self.cpu_lru_leaf_heap, node)
self.cpu_lru_leaf_set.add(node)
self.recycle_gpu_blocks(gpu_block_id)
- logger.info(
- f"recv_data_transfer_result: after SWAP2CPU, node {node}"
- )
+ logger.info(f"recv_data_transfer_result: after SWAP2CPU, node {node}")
- elif (event_type.value == CacheStatus.SWAP2GPU.value):
+ elif event_type.value == CacheStatus.SWAP2GPU.value:
gpu_block_id = task_gpu_block_id
cpu_block_id = task_cpu_block_id
@@ -987,12 +1023,12 @@ class PrefixCacheManager:
node.block_id = gpu_block_id
self.recycle_cpu_blocks(cpu_block_id)
- logger.info(
- f"recv_data_transfer_result: after SWAP2GPU, node {node}")
+ logger.info(f"recv_data_transfer_result: after SWAP2GPU, node {node}")
else:
logger.warning(
f"recv_data_transfer_result: Get unexpected event type {event_type}"
- + ", only SWAP2CPU and SWAP2GPU supported")
+ + ", only SWAP2CPU and SWAP2GPU supported"
+ )
def recv_data_transfer_result(self):
"""
@@ -1024,10 +1060,8 @@ class PrefixCacheManager:
self.task_swapping_event[transfer_task_id].set()
logger.info(
f"recv_data_transfer_result: transfer_task_id {transfer_task_id}: "
- +
- f"task_node_ids {swap_node_ids} task_gpu_block_id {task_gpu_block_id} "
- +
- f"task_cpu_block_id {task_cpu_block_id} event_type {event_type} done"
+ + f"task_node_ids {swap_node_ids} task_gpu_block_id {task_gpu_block_id} "
+ + f"task_cpu_block_id {task_cpu_block_id} event_type {event_type} done"
)
except Exception as e:
logger.warning(f"recv_data_transfer_result: error: {e}")
diff --git a/fastdeploy/cache_manager/transfer_factory/__init__.py b/fastdeploy/cache_manager/transfer_factory/__init__.py
index c5270bbdd..31298a918 100644
--- a/fastdeploy/cache_manager/transfer_factory/__init__.py
+++ b/fastdeploy/cache_manager/transfer_factory/__init__.py
@@ -13,5 +13,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
+
from .ipc_cache_transfer import IPCCommManager
from .rdma_cache_transfer import RDMACommManager
+
+__all__ = ["IPCCommManager", "RDMACommManager"]
diff --git a/fastdeploy/cache_manager/transfer_factory/ipc_cache_transfer.py b/fastdeploy/cache_manager/transfer_factory/ipc_cache_transfer.py
index 2f7bcffb5..61a4fa10b 100644
--- a/fastdeploy/cache_manager/transfer_factory/ipc_cache_transfer.py
+++ b/fastdeploy/cache_manager/transfer_factory/ipc_cache_transfer.py
@@ -14,13 +14,13 @@
# limitations under the License.
"""
-import os
-
import paddle
from fastdeploy.model_executor.ops.gpu import (
- get_data_ptr_ipc, ipc_sent_key_value_cache_by_remote_ptr,
- ipc_sent_key_value_cache_by_remote_ptr_block_sync)
+ get_data_ptr_ipc,
+ ipc_sent_key_value_cache_by_remote_ptr,
+ ipc_sent_key_value_cache_by_remote_ptr_block_sync,
+)
from fastdeploy.utils import get_logger
logger = get_logger("cache_messager", "cache_messager.log")
@@ -44,17 +44,13 @@ class IPCConnector:
self.rank_id = rank_id_
self.local_gpu_id = int(local_gpu_id_)
tmp = paddle.ones([1, 1])
- logger.info(
- f"init ipc rank{self.rank_id} with remote {self.remote_gpu_id} {self.local_gpu_id}"
- )
+ logger.info(f"init ipc rank{self.rank_id} with remote {self.remote_gpu_id} {self.local_gpu_id}")
for layer_id in range(layer_num):
key_unique_name = f"key_caches_{layer_id}_rank{self.rank_id}.device{self.remote_gpu_id}"
value_unique_name = f"value_caches_{layer_id}_rank{self.rank_id}.device{self.remote_gpu_id}"
- self.remote_key_tensor_ptr_list.append(
- get_data_ptr_ipc(tmp, key_unique_name))
- self.remote_value_tensor_ptr_list.append(
- get_data_ptr_ipc(tmp, value_unique_name))
- self.write_stream = paddle.device.Stream(f'gpu:{self.local_gpu_id}')
+ self.remote_key_tensor_ptr_list.append(get_data_ptr_ipc(tmp, key_unique_name))
+ self.remote_value_tensor_ptr_list.append(get_data_ptr_ipc(tmp, value_unique_name))
+ self.write_stream = paddle.device.Stream(f"gpu:{self.local_gpu_id}")
self.finish_event = paddle.device.Event()
@@ -64,11 +60,11 @@ class IPCCommManager:
"""
def __init__(
- self,
- rank_id_,
- gpu_idx_,
- local_key_cache_tensor_list, # tensor list
- local_value_cache_tensor_list, # tensor
+ self,
+ rank_id_,
+ gpu_idx_,
+ local_key_cache_tensor_list, # tensor list
+ local_value_cache_tensor_list, # tensor
):
self.rank_id = rank_id_
self.gpu_idx = gpu_idx_
@@ -83,14 +79,11 @@ class IPCCommManager:
"""
Connect to remote gpu.
"""
- logger.info(
- f"{self.rank_id}: connect to remote_gpu_id:{remote_gpu_id_} {self.layer_num} {self.gpu_idx}"
- )
+ logger.info(f"{self.rank_id}: connect to remote_gpu_id:{remote_gpu_id_} {self.layer_num} {self.gpu_idx}")
if self.is_connected(remote_gpu_id_):
return True
else:
- self.comm_map[remote_gpu_id_] = IPCConnector(
- self.rank_id, remote_gpu_id_, self.layer_num, self.gpu_idx)
+ self.comm_map[remote_gpu_id_] = IPCConnector(self.rank_id, remote_gpu_id_, self.layer_num, self.gpu_idx)
return True
def is_connected(self, remote_gpu_id_=0):
@@ -102,8 +95,7 @@ class IPCCommManager:
else:
return False
- def write_cache(self, ip, remote_gpu_id, local_block_ids, remote_block_ids,
- layer_idx):
+ def write_cache(self, ip, remote_gpu_id, local_block_ids, remote_block_ids, layer_idx):
"""
Connect to remote gpu and write cache.
"""
@@ -114,20 +106,26 @@ class IPCCommManager:
with paddle.device.stream_guard(comm.write_stream):
ipc_sent_key_value_cache_by_remote_ptr(
self.local_key_cache_tensor_list[layer_idx],
- self.local_value_cache_tensor_list[layer_idx], local_block_ids,
- remote_block_ids, comm.remote_key_tensor_ptr_list[layer_idx],
- comm.remote_value_tensor_ptr_list[layer_idx], block_num,
- self.gpu_idx, comm.remote_gpu_id,
- comm.write_stream.stream_base.cuda_stream)
+ self.local_value_cache_tensor_list[layer_idx],
+ local_block_ids,
+ remote_block_ids,
+ comm.remote_key_tensor_ptr_list[layer_idx],
+ comm.remote_value_tensor_ptr_list[layer_idx],
+ block_num,
+ self.gpu_idx,
+ comm.remote_gpu_id,
+ comm.write_stream.stream_base.cuda_stream,
+ )
return 0
def write_block_by_sync(self, remote_gpu_id):
"""
check finish event and wait for it
"""
- paddle.set_device(f'gpu:{self.gpu_idx}')
+ paddle.set_device(f"gpu:{self.gpu_idx}")
comm = self.comm_map[remote_gpu_id]
ipc_sent_key_value_cache_by_remote_ptr_block_sync(
- self.local_key_cache_tensor_list[0], #tensor no use
- self.local_value_cache_tensor_list[0], #tensor no use
- comm.write_stream.stream_base.cuda_stream)
+            self.local_key_cache_tensor_list[0],  # tensor not used
+            self.local_value_cache_tensor_list[0],  # tensor not used
+ comm.write_stream.stream_base.cuda_stream,
+ )
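A hedged usage sketch built only from the signatures visible in this file (`IPCCommManager.__init__`, `write_cache`, `write_block_by_sync`). The cache shapes are invented, and a peer process is assumed to have already exported matching KV tensors under the expected IPC names, so treat this as an illustration rather than a runnable test:

```python
import paddle

from fastdeploy.cache_manager.transfer_factory import IPCCommManager

layer_num = 2
key_caches = [paddle.zeros([8, 4, 64], dtype="float16") for _ in range(layer_num)]
value_caches = [paddle.zeros([8, 4, 64], dtype="float16") for _ in range(layer_num)]

comm = IPCCommManager(
    rank_id_=0,
    gpu_idx_=0,
    local_key_cache_tensor_list=key_caches,
    local_value_cache_tensor_list=value_caches,
)

# write_cache connects on demand; push blocks 0 and 1 of every layer to GPU 1,
# then block until the IPC write stream has drained.
for layer_idx in range(layer_num):
    comm.write_cache("127.0.0.1", 1, [0, 1], [0, 1], layer_idx)
comm.write_block_by_sync(1)
```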
diff --git a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/CMakeLists.txt b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/CMakeLists.txt
index c241538c8..7bed564e9 100644
--- a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/CMakeLists.txt
+++ b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/CMakeLists.txt
@@ -25,7 +25,7 @@ find_package(pybind11 CONFIG REQUIRED)
include_directories("${PROJECT_SOURCE_DIR}/include")
add_library(rdma_comm MODULE ${PROJECT_SOURCE_DIR}/src/pybind.cpp ${PROJECT_SOURCE_DIR}/src/kvcache_rdma.cpp ${PROJECT_SOURCE_DIR}/src/kvcache_connection.cpp ${PROJECT_SOURCE_DIR}/src/log.cpp)
-set_target_properties(rdma_comm PROPERTIES
+set_target_properties(rdma_comm PROPERTIES
OUTPUT_NAME "rdma_comm"
PREFIX ""
SUFFIX ".so"
diff --git a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/README.md b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/README.md
index b16ab460a..700a045fe 100644
--- a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/README.md
+++ b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/README.md
@@ -11,7 +11,7 @@ A dedicated component for transferring KV Cache between Prefill and Decode nodes
- Single Mellanox ConnectX-7 400G NIC (single port)
- Tested with BATCH_SIZE = 1538 and block size = 1K - 256K
- Single pressure thread (threads = 1)
-
+
- **Comparison Baseline**:
- Mooncake performance measured using transfer_engine_bench from example directory
- Same hardware configuration and test parameters applied to KVTransferManager
@@ -42,11 +42,13 @@ Bandwidth Saturation Capability: Under multi-threaded high-pressure scenarios, b
### Dependencies Installation
#### Python Packages
+
```bash
pip install pyzmq pybind11[global]
```
#### System Libraries (Linux)
+
```bash
# Ubuntu/Debian
sudo apt-get install -y libibverbs-dev librdmacm-dev
@@ -62,10 +64,10 @@ sudo yum install -y libibverbs-devel librdmacm-devel
#### Ampere Architecture Note
To support Ampere GPUs, enable the environment variable KVCACHE_GDRCOPY_FLUSH_ENABLE.
- What it does:
- Forces memory flushing after a GDRCopy write operation to ensure data consistency on the Ampere architecture. Here if KVCACHE_GDRCOPY_FLUSH_ENABLE is enable we trigger an RDMA read operation after the last RDMA write.
+   Forces memory flushing after a GDRCopy write operation to ensure data consistency on the Ampere architecture. When KVCACHE_GDRCOPY_FLUSH_ENABLE is enabled, an RDMA read operation is triggered after the last RDMA write.
- Why it’s needed:
When the NIC delivers a completion to the CPU, it indicates that the data has reached the GPU. However, it doesn't mean that the GPU can read that data yet. To make sure the data has gone all the way down to GPU memory and the GPU can read it, we need to perform a read.
-[NCCL Issue #683](https://github.com/NVIDIA/nccl/issues/683) |
+[NCCL Issue #683](https://github.com/NVIDIA/nccl/issues/683) |
[NCCL Issue #1702](https://github.com/NVIDIA/nccl/issues/1702)
Since the upper layer typically issues a cache arrival notification only after polling a Completion Queue Entry (CQE), this prevents the application from being notified before the data is actually written back to memory. Therefore, the potential race condition where the cache has not yet been flushed but the application assumes completion is considered a rare event in practice.
- How to enable:
@@ -75,14 +77,14 @@ To support Ampere GPUs, enable the environment variable KVCACHE_GDRCOPY_FLUSH_EN
```bash
# Build and make symbolic links for SO files
-python setup.py bdist_wheel
+python setup.py bdist_wheel
pip install dist/*.whl
```
## Environment Variables Configuration
-### RDMA Settings
+### RDMA Settings
| Variable | Default | Description |
|----------|---------|-------------|
| `KVCACHE_RDMA_GID_INDEX` | 3 | RDMA GID index |
@@ -90,25 +92,23 @@ pip install dist/*.whl
| `KVCACHE_IB_TIMEOUT` | 18 | InfiniBand communication timeout (14-31), where timeout = 4.096μs * 2^value (default 18 ≈ 1.07s).|
| `KVCACHE_RELAX_ORDERING` | false | Enable RDMA relaxed ordering to improve performance in multi-GPU scenarios. Recommended when multiple GPUs share the same NIC to mitigate TX pause issues. |
-### Network Settings
+### Network Settings
| Variable | Default | Description |
|----------|---------|-------------|
| `KVCACHE_SOCKET_IFNAME` | auto | Network interface for socket comm (e.g. "eth0") |
-### Debugging
+### Debugging
| Variable | Default | Description |
|----------|---------|-------------|
| `KVCACHE_DEBUG` | false | Enable debug logging |
| `KVCACHE_DEBUG_FILE` | - | Debug log file path |
| `KVCACHE_ERROR_FILE` | - | Error log file path |
-### Performance Tuning
+### Performance Tuning
| Variable | Default | Description |
|----------|---------|-------------|
| `KVCACHE_GDRCOPY_FLUSH_ENABLE` | false | Enable GDRCopy flush for Ampere GPUs |
-
-
# Set RDMA GID index
export KVCACHE_RDMA_GID_INDEX=3
@@ -125,7 +125,6 @@ export KVCACHE_DEBUG=1
export KVCACHE_DEBUG_FILE=/var/log/kvcache_debug.log
export KVCACHE_ERROR_FILE=/var/log/kvcache_error.log
-
## Network configurations
kvcache transfer is fully tested with RDMA over Converged Ethernet (RoCE) networks. However, it is theoretically compatible with Infiniband as well.
@@ -164,14 +163,14 @@ comm.write_cache(
**Parameter Details**:
-1. `role`:
+1. `role`:
- "prefill": Prefill node role
- "decode": Decode node role
-2. `gpu_idx`:
+2. `gpu_idx`:
- GPU device index to use
-3. `port`:
+3. `port`:
- RDMA communication port number
4. `local_key_cache`/`local_value_cache`:
@@ -216,7 +215,7 @@ comm = RDMACommunicator(
if comm.connect("192.168.1.100", "12345"):
print("Connection established")
-
+
# Write cache
comm.write_cache(
ip="192.168.1.100", # Target server IP
@@ -229,4 +228,4 @@ if comm.connect("192.168.1.100", "12345"):
## Citation
-If you use this codebase, or otherwise found our work valuable, please cite:
\ No newline at end of file
+If you use this codebase, or otherwise find our work valuable, please cite:
diff --git a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/README_CN.md b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/README_CN.md
index bed94d860..b2a2be91a 100644
--- a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/README_CN.md
+++ b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/README_CN.md
@@ -11,7 +11,7 @@
- 单张Mellanox ConnectX-7 400G网卡(单端口)
- 测试参数: BATCH_SIZE = 1538, 块大小 = 1K - 256K
- 单压力线程(threads = 1)
-
+
- **对比基准**:
- Mooncake性能使用example目录中的transfer_engine_bench测量
- KVTransferManager使用相同的硬件配置和测试参数
@@ -43,11 +43,13 @@
### 依赖安装
#### Python包
+
```bash
pip install pyzmq pybind11[global]
```
#### 系统库(Linux)
+
```bash
# Ubuntu/Debian
sudo apt-get install -y libibverbs-dev librdmacm-dev
@@ -66,7 +68,7 @@ sudo yum install -y libibverbs-devel librdmacm-devel
在GDRCopy写操作后强制内存刷新,确保Ampere架构上的数据一致性。启用后会在最后一个RDMA写操作后触发一个RDMA读操作。
- 原因:
当网卡向CPU发送完成通知时,仅表示数据已到达GPU,但不保证GPU可以立即读取该数据。为确保数据已完全写入GPU内存且可被GPU读取,需要执行读操作。
-[NCCL Issue #683](https://github.com/NVIDIA/nccl/issues/683) |
+[NCCL Issue #683](https://github.com/NVIDIA/nccl/issues/683) |
[NCCL Issue #1702](https://github.com/NVIDIA/nccl/issues/1702)
由于上层通常只在轮询完成队列条目(CQE)后发出缓存到达通知,这避免了应用在数据实际写回内存前收到通知的情况。因此,缓存未刷新但应用认为已完成这种潜在问题在实践中被认为是罕见情况。
- 启用方式:
@@ -76,7 +78,7 @@ sudo yum install -y libibverbs-devel librdmacm-devel
```bash
# 构建并创建SO文件的符号链接
-python setup.py bdist_wheel
+python setup.py bdist_wheel
pip install dist/*.whl
```
@@ -108,7 +110,6 @@ pip install dist/*.whl
|------|--------|------|
| `KVCACHE_GDRCOPY_FLUSH_ENABLE` | false | 为Ampere GPU启用GDRCopy刷新 |
-
# 设置RDMA GID索引
export KVCACHE_RDMA_GID_INDEX=3
@@ -125,7 +126,6 @@ export KVCACHE_DEBUG=1
export KVCACHE_DEBUG_FILE=/var/log/kvcache_debug.log
export KVCACHE_ERROR_FILE=/var/log/kvcache_error.log
-
## 网络配置
kvcache transfer已通过RDMA over Converged Ethernet (RoCE)网络全面测试。理论上也兼容Infiniband。
@@ -145,7 +145,7 @@ comm = RDMACommunicator(
gpu_idx, # GPU设备索引(0~7)
port, # 通信端口
local_key_cache, # 本地key缓存指针列表
- local_value_cache, # 本地value缓存指针列表
+ local_value_cache, # 本地value缓存指针列表
block_number, # 块数量
block_bytes # 每块字节数
)
@@ -159,19 +159,19 @@ comm.write_cache(
local_block_ids, # 本地缓存块ID列表,指定要传输的本地块
remote_block_ids, # 远程缓存块ID列表,指定要写入的远程块
layer_idx # 模型层索引,用于多层模型场景
-)
+)
```
**参数说明**:
-1. `role`:
+1. `role`:
- "prefill"
- "decode"
-2. `gpu_idx`:
+2. `gpu_idx`:
- 使用的GPU设备索引
-3. `port`:
+3. `port`:
- RDMA通信端口号
4. `local_key_cache`/`local_value_cache`:
@@ -216,7 +216,7 @@ comm = RDMACommunicator(
if comm.connect("192.168.1.100", "12345"):
print("连接成功")
-
+
# 写入缓存
comm.write_cache(
ip="192.168.1.100", # 目标服务器IP
@@ -229,4 +229,4 @@ if comm.connect("192.168.1.100", "12345"):
## 引用
-如果您使用此代码库,或认为我们的工作有价值,请引用:
\ No newline at end of file
+如果您使用此代码库,或认为我们的工作有价值,请引用:
diff --git a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/kvcache_connection.h b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/kvcache_connection.h
index 28877ea65..596e3b2e6 100644
--- a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/kvcache_connection.h
+++ b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/kvcache_connection.h
@@ -3,13 +3,13 @@
* @brief RDMA connection management for key-value cache
* @version 1.0.0
* @copyright Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
- *
+ *
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
- *
+ *
* http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -32,22 +32,22 @@
#include
#include
#include
-#include
-#include
-#include
-#include
-#include
-#include
#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
#include "kvcache_rdma.h"
#include "util.h"
@@ -88,8 +88,8 @@ struct QpInfo {
intBuffer[0] = htonl(lid);
intBuffer[1] = htonl(qpn);
intBuffer[2] = htonl(psn);
- memcpy(buffer + 12, gid.raw, sizeof(gid.raw));
- intBuffer[7] = htonl(static_cast(mtu));
+ memcpy(buffer + 12, gid.raw, sizeof(gid.raw));
+ intBuffer[7] = htonl(static_cast(mtu));
}
/// @brief Deserialize QP info from buffer
@@ -102,7 +102,7 @@ struct QpInfo {
mtu = static_cast(ntohl(intBuffer[7]));
}
- static const size_t size = 12 + sizeof(gid.raw) + 4;
+ static const size_t size = 12 + sizeof(gid.raw) + 4;
};
/// @brief RDMA connection context
@@ -137,13 +137,13 @@ struct Connection {
std::vector send_write_cache_key_remote_ptr_list;
std::vector send_write_cache_key_remote_rkey_list;
- std::vector send_write_cache_value_remote_ptr_list;
+ std::vector send_write_cache_value_remote_ptr_list;
std::vector send_write_cache_value_remote_rkey_list;
// For rdma read operations
std::vector read_bufs;
std::vector read_mrs;
-
+
// Work completion tracking
int wc_count;
int wc_target_count;
@@ -208,4 +208,4 @@ int setup_listening_socket(int port);
int configure_epoll(int sockfd);
std::vector get_net_ifname();
-#endif // FASTDEPLOY_KVCACHE_CONNECTION_H
\ No newline at end of file
+#endif // FASTDEPLOY_KVCACHE_CONNECTION_H
diff --git a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/kvcache_rdma.h b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/kvcache_rdma.h
index 73df757fd..de759e909 100644
--- a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/kvcache_rdma.h
+++ b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/kvcache_rdma.h
@@ -61,30 +61,30 @@ private:
uint32_t rkey, const std::string &ip,
const std::string &port);
- bool execute_rdma_writes(struct RdmaContext* ctx, int layer_idx,
- const std::vector& local_block_ids,
- bool is_key, std::vector& remote_addr,
+ bool execute_rdma_writes(struct RdmaContext* ctx, int layer_idx,
+ const std::vector& local_block_ids,
+ bool is_key, std::vector& remote_addr,
uint32_t rkey);
-
- void prepare_write_requests(struct ibv_sge* sge_list,
+
+ void prepare_write_requests(struct ibv_sge* sge_list,
struct ibv_send_wr* send_wr_list,
- int layer_idx,
+ int layer_idx,
const std::vector& local_block_ids,
- bool is_key,
- std::vector& remote_addr,
+ bool is_key,
+ std::vector& remote_addr,
uint32_t rkey);
-
- bool execute_read_verification(struct RdmaContext* ctx,
- size_t block_idx,
- uint64_t remote_addr,
+
+ bool execute_read_verification(struct RdmaContext* ctx,
+ size_t block_idx,
+ uint64_t remote_addr,
uint32_t rkey,
int layer_idx,
- const std::string& ip,
+ const std::string& ip,
const std::string& port);
-
- bool post_send_with_retry(struct RdmaContext* ctx,
- struct ibv_send_wr* wr_list,
- size_t inflight_wr,
+
+ bool post_send_with_retry(struct RdmaContext* ctx,
+ struct ibv_send_wr* wr_list,
+ size_t inflight_wr,
bool need_poll);
// Connection management
@@ -119,7 +119,7 @@ private:
std::map conn_map; // Active connections map
std::mutex mutex_; // Thread synchronization mutex
int rdma_event_channel_epoll_fd; // Epoll file descriptor
- struct ibv_pd *g_pd = NULL; // fd
+ struct ibv_pd *g_pd = NULL; // fd
int RDMACommunicator_status; // Communicator status flag
bool start_client_listener = false; // Client listener flag
};
diff --git a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/log.h b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/log.h
index 923a0316d..d0bf18ae2 100644
--- a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/log.h
+++ b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/log.h
@@ -5,13 +5,13 @@
* @brief Logging module for key-value cache system
* @version 1.0.0
* @copyright Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
- *
+ *
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
- *
+ *
* http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -43,7 +43,7 @@ typedef enum {
KV_LOG_LEVEL_ERROR = 3
} KVLogLevel;
-void debug_log(KVLogLevel level, bool enable_to_terminal, const char *filefunc,
+void debug_log(KVLogLevel level, bool enable_to_terminal, const char *filefunc,
int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6)));
/**
@@ -107,11 +107,11 @@ void debug_log(KVLogLevel level, bool enable_to_terminal, const char *filefunc,
LOGD(fmt, __VA_ARGS__); \
} while (0)
-#define LOGD_RAW(fmt, arg...) do { \
+#define LOGD_RAW(fmt, arg...) do { \
if (ENV_ENABLE_RAW("KV_IS_DEBUG_ENABLED")) { \
GET_CURRENT_TIME(); \
fprintf(stdout, "[%s][DBG][KV_CACHE][%s:%d] " \
fmt "\n", str, \
FILE_NAME(__FILE__), __LINE__, ## arg); \
} \
- } while (0)
\ No newline at end of file
+ } while (0)
diff --git a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/util.h b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/util.h
index d2149a6dc..c040b2a62 100644
--- a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/util.h
+++ b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/util.h
@@ -15,12 +15,12 @@
#include
#include
#include
-#include
+#include
#include "log.h"
#define PATH_MAX 4096 /* # chars in a path name including nul */
#define RDMA_WR_LIST_MAX_SIZE 32
-#define RDMA_SQ_MAX_SIZE 1024
+#define RDMA_SQ_MAX_SIZE 1024
#define RDMA_DEFAULT_PORT 20001
#define RDMA_TCP_CONNECT_SIZE 1024
@@ -54,19 +54,19 @@ enum class QpStatus {
inline void busid_to_int64(const char *busId, int64_t *id) {
char hexStr[17] = {0};
int hexOffset = 0;
-
+
// Filter valid hex characters
for (int i = 0; hexOffset < sizeof(hexStr) - 1 && busId[i] != '\0'; i++) {
char c = busId[i];
if (c == '.' || c == ':') continue;
-
- if ((c >= '0' && c <= '9') ||
- (c >= 'A' && c <= 'F') ||
+
+ if ((c >= '0' && c <= '9') ||
+ (c >= 'A' && c <= 'F') ||
(c >= 'a' && c <= 'f')) {
hexStr[hexOffset++] = c;
}
}
-
+
*id = strtol(hexStr, NULL, 16);
}
@@ -78,45 +78,45 @@ public:
bool is_up;
bool is_running;
bool is_loopback;
-
+
bool isUsable() const {
return is_up && is_running && !is_loopback;
}
};
-
+
static std::vector getAllInterfaces() {
std::vector interfaces;
struct ifaddrs *ifaddrs_ptr = nullptr;
-
+
if (getifaddrs(&ifaddrs_ptr) == -1) {
return interfaces;
}
-
+
for (struct ifaddrs *ifa = ifaddrs_ptr; ifa != nullptr; ifa = ifa->ifa_next) {
if (ifa->ifa_addr == nullptr) continue;
if (ifa->ifa_addr->sa_family != AF_INET) continue;
-
+
InterfaceInfo info;
info.name = ifa->ifa_name;
info.is_up = (ifa->ifa_flags & IFF_UP) != 0;
info.is_running = (ifa->ifa_flags & IFF_RUNNING) != 0;
info.is_loopback = (ifa->ifa_flags & IFF_LOOPBACK) != 0;
-
+
struct sockaddr_in* sa = (struct sockaddr_in*)ifa->ifa_addr;
char ip_str[INET_ADDRSTRLEN];
inet_ntop(AF_INET, &sa->sin_addr, ip_str, INET_ADDRSTRLEN);
info.ip = ip_str;
-
+
interfaces.push_back(info);
}
-
+
freeifaddrs(ifaddrs_ptr);
return interfaces;
}
-
+
static std::string getFirstUsableInterface() {
auto interfaces = getAllInterfaces();
-
+
for (const auto& iface : interfaces) {
if (iface.isUsable()) {
return iface.name;
@@ -124,14 +124,14 @@ public:
}
return "";
}
-
+
static void displayAllInterfaces() {
auto interfaces = getAllInterfaces();
-
+
printf("Available network interfaces:\n");
for (const auto& iface : interfaces) {
- printf(" %s: %s [%s%s%s]\n",
- iface.name.c_str(),
+ printf(" %s: %s [%s%s%s]\n",
+ iface.name.c_str(),
iface.ip.c_str(),
iface.is_up ? "UP" : "DOWN",
iface.is_running ? ",RUNNING" : "",
@@ -157,13 +157,13 @@ private:
bool relax_ordering_enabled_;
int ib_timeout_;
const char* rdma_nics_;
-
+
// Private constructor for singleton pattern
KVCacheConfig() {
// Initialize configuration from environment variables
rdma_gid_index_ = parse_int_value(
std::getenv("KVCACHE_RDMA_GID_INDEX"), 3, "KVCACHE_RDMA_GID_INDEX");
-
+
// Parse optional RDMA port override
const char* port_value = std::getenv("SET_RDMA_DEST_PORT");
has_rdma_dest_port_override_ = false; // 默认为false
@@ -177,7 +177,7 @@ private:
}
const char* env_interface = std::getenv("KVCACHE_SOCKET_IFNAME");
-
+
if (env_interface && env_interface[0] != '\0') {
socket_interface_ = env_interface;
printf("Using specified interface: %s\n", socket_interface_);
@@ -194,14 +194,14 @@ private:
}
NetworkInterfaceManager::displayAllInterfaces();
}
-
+
socket_interface_ = std::getenv("KVCACHE_SOCKET_IFNAME");
debug_file_path_ = std::getenv("KVCACHE_DEBUG_FILE");
error_file_path_ = std::getenv("KVCACHE_ERROR_FILE");
-
+
gdrcopy_flush_enabled_ = parse_bool_value(std::getenv("KVCACHE_GDRCOPY_FLUSH_ENABLE"));
verify_read_enabled_ = parse_bool_value(std::getenv("KVCACHE_VERIFY_READ"));
- debug_mode_enabled_ = parse_bool_value(std::getenv("KVCACHE_DEBUG")) ||
+ debug_mode_enabled_ = parse_bool_value(std::getenv("KVCACHE_DEBUG")) ||
parse_bool_value(std::getenv("KV_IS_DEBUG_ENABLED"));
debug_output_enabled_ = parse_bool_value(std::getenv("KVCACHE_DEBUG_OUTPUT"));
@@ -215,29 +215,29 @@ private:
rdma_nics_ = std::getenv("KVCACHE_RDMA_NICS");
}
-
+
// Helper methods
bool parse_bool_value(const char* value) {
if (!value) return false;
-
+
std::string str_value(value);
std::transform(str_value.begin(), str_value.end(), str_value.begin(), ::tolower);
-
- return (str_value == "1" || str_value == "true" ||
+
+ return (str_value == "1" || str_value == "true" ||
str_value == "on" || str_value == "yes");
}
-
+
int parse_int_value(const char* value, int default_value, const char* env_name) {
if (!value) return default_value;
-
+
try {
return std::stoi(std::string(value));
} catch (const std::invalid_argument& e) {
- fprintf(stderr, "Invalid value for %s: '%s', using default: %d\n",
+ fprintf(stderr, "Invalid value for %s: '%s', using default: %d\n",
env_name, value, default_value);
return default_value;
} catch (const std::out_of_range& e) {
- fprintf(stderr, "%s value out of range: '%s', using default: %d\n",
+ fprintf(stderr, "%s value out of range: '%s', using default: %d\n",
env_name, value, default_value);
return default_value;
}
@@ -247,7 +247,7 @@ public:
// Prevent copying and assignment
KVCacheConfig(const KVCacheConfig&) = delete;
KVCacheConfig& operator=(const KVCacheConfig&) = delete;
-
+
// Get singleton instance
static KVCacheConfig& getInstance() {
static KVCacheConfig instance;
@@ -255,14 +255,14 @@ public:
}
int get_ib_timeout() const { return ib_timeout_; }
-
+
// Configuration retrieval methods
int get_rdma_gid_index() const { return rdma_gid_index_; }
-
+
int resolve_rdma_dest_port(int default_port) const {
return has_rdma_dest_port_override_ ? rdma_dest_port_override_ : default_port;
}
-
+
int resolve_rdma_dest_port(const std::string& default_port) const {
try {
return resolve_rdma_dest_port(std::stoi(default_port));
@@ -271,45 +271,45 @@ public:
return 0;
}
}
-
+
const char* get_socket_interface() const { return socket_interface_; }
const char* get_debug_file_path() const { return debug_file_path_; }
const char* get_error_file_path() const { return error_file_path_; }
const char* get_rdma_nics() const { return rdma_nics_; }
-
+
// Feature check methods
bool is_gdrcopy_flush_enabled() const { return gdrcopy_flush_enabled_; }
bool is_verify_read_enabled() const { return verify_read_enabled_; }
bool is_debug_mode_enabled() const { return debug_mode_enabled_; }
bool is_debug_output_enabled() const { return debug_output_enabled_; }
bool is_relax_ordering_enabled() const { return relax_ordering_enabled_; }
-
+
// Display configuration
void displayConfiguration() const {
INFO("KVCache Configuration:\n");
INFO("Init KVCacheConfig RDMA GID Index: %d\n", rdma_gid_index_);
-
+
if (has_rdma_dest_port_override_) {
INFO("Init KVCacheConfig RDMA Destination Port Override: %d\n", rdma_dest_port_override_);
}
-
+
if (socket_interface_) {
INFO("Init KVCacheConfig Socket Interface: %s\n", socket_interface_);
}
-
+
INFO("Init KVCacheConfig GDRCopy Flush: %s\n", gdrcopy_flush_enabled_ ? "enabled" : "disabled");
INFO("Init KVCacheConfig Verify Read: %s\n", verify_read_enabled_ ? "enabled" : "disabled");
INFO("Init KVCacheConfig Debug Mode: %s\n", debug_mode_enabled_ ? "enabled" : "disabled");
INFO("Init KVCacheConfig Debug Output: %s\n", debug_output_enabled_ ? "enabled" : "disabled");
-
+
if (debug_file_path_) {
INFO("Init KVCacheConfig Debug File: %s\n", debug_file_path_);
}
-
+
if (error_file_path_) {
INFO("Init KVCacheConfig Error File: %s\n", error_file_path_);
}
}
};
-#endif
\ No newline at end of file
+#endif
diff --git a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/src/kvcache_connection.cpp b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/src/kvcache_connection.cpp
index 1551e7c78..d49a8271c 100644
--- a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/src/kvcache_connection.cpp
+++ b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/src/kvcache_connection.cpp
@@ -3,13 +3,13 @@
* @brief RDMA connection implementation for key-value cache
* @version 1.0.0
* @copyright Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
- *
+ *
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
- *
+ *
* http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -32,7 +32,7 @@ std::vector<IbDeviceInfo> g_ib_all_devs;
static int64_t get_ib_busid(const char *dev_name) {
char dev_path[PATH_MAX];
snprintf(dev_path, PATH_MAX, "/sys/class/infiniband/%s/device", dev_name);
-
+
char *p = realpath(dev_path, NULL);
if (p == NULL) {
WARN("Failed to get realpath for device %s: %s", dev_name, strerror(errno));
@@ -63,7 +63,7 @@ static int64_t get_ib_busid(const char *dev_name) {
/**
* @brief Parse and cache IB device information
* @return Number of IB devices found, negative on error
- *
+ *
* @note This function is thread-safe and will only parse once
*/
int parse_port_ib_info() {
@@ -448,7 +448,7 @@ bool poll_cq_with_timeout(struct RdmaContext *ctx, int timeout_seconds, int cqe_
if ((current_time.tv_sec - start_time.tv_sec) >= timeout_seconds) {
ERR("Timeout occurred after %d seconds", timeout_seconds);
free(wc_array);
- return false;
+ return false;
}
}
return true;
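
The completion-queue wait in this hunk bounds `ibv_poll_cq` busy-polling with a wall-clock deadline. A condensed sketch of that idiom (the helper name and the one-second timing granularity are illustrative, not the patch's implementation):

```cpp
#include <ctime>
#include <infiniband/verbs.h>

// Sketch: poll a CQ until `expected` completions arrive or `timeout_seconds`
// elapses. Returns false on timeout, polling error, or completion-with-error.
static bool poll_cq_bounded(struct ibv_cq* cq, int expected, int timeout_seconds) {
    struct ibv_wc wc;
    int done = 0;
    std::time_t start = std::time(nullptr);

    while (done < expected) {
        int n = ibv_poll_cq(cq, 1, &wc);
        if (n < 0) {
            return false;                      // polling error
        }
        if (n > 0) {
            if (wc.status != IBV_WC_SUCCESS) {
                return false;                  // completion reported an error
            }
            ++done;
        }
        if (std::time(nullptr) - start >= timeout_seconds) {
            return false;                      // deadline exceeded
        }
    }
    return true;
}
```
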
@@ -468,7 +468,7 @@ bool clear_qp_info(struct RdmaContext* ctx) {
success = false;
}
}
-
+
if (ctx->cq) {
if (ibv_destroy_cq(ctx->cq)) {
ERR("Failed to deallocate cq Domain.");
@@ -565,7 +565,7 @@ struct RdmaContext* create_qp(struct IbDeviceInfo* ib_dev, struct ibv_pd** g_pd)
return NULL;
}
- INFO("Successfully created QP 0x%x on device %s",
+ INFO("Successfully created QP 0x%x on device %s",
ctx->qp->qp_num, ib_dev->devName);
return ctx;
@@ -601,10 +601,10 @@ bool client_exchange_destinations(
ERR("Failed to get port info for port %d", ib_port);
return false;
}
-
+
my_dest.lid = ctx->portinfo.lid;
my_dest.mtu = ctx->portinfo.active_mtu;
-
+
// Validate LID for InfiniBand
if (ctx->portinfo.link_layer != IBV_LINK_LAYER_ETHERNET && !my_dest.lid) {
ERR("Invalid LID 0x%04x for non-Ethernet link layer", my_dest.lid);
@@ -722,24 +722,24 @@ bool server_exchange_mr(struct RdmaContext *ctx) {
auto layer_num = ctx->conn.layer_number;
auto& key_mrs = ctx->conn.write_cache_key_server_mr_list;
auto& val_mrs = ctx->conn.write_cache_value_server_mr_list;
-
+
// Verify that server memory regions are properly initialized
if (key_mrs.size() != layer_num || val_mrs.size() != layer_num) {
ERR("server write cache memory region size error");
return false;
}
-
+
// Prepare memory region information to send
std::vector<uint64_t> send_key_ptrs;
std::vector<uint32_t> send_key_rkeys;
std::vector<uint64_t> send_val_ptrs;
std::vector<uint32_t> send_val_rkeys;
-
+
send_key_ptrs.reserve(layer_num);
send_key_rkeys.reserve(layer_num);
send_val_ptrs.reserve(layer_num);
send_val_rkeys.reserve(layer_num);
-
+
// Collect memory region information from local MRs
for (int i = 0; i < layer_num; ++i) {
send_key_ptrs.push_back(reinterpret_cast<uint64_t>(key_mrs[i]->addr));
@@ -753,13 +753,13 @@ bool server_exchange_mr(struct RdmaContext *ctx) {
if (!exchange_mr_vector(ctx, send_key_rkeys, false)) return false;
if (!exchange_mr_vector(ctx, send_val_ptrs, false)) return false;
if (!exchange_mr_vector(ctx, send_val_rkeys, false)) return false;
-
+
return true;
}
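
The exchange above flattens per-layer memory-region addresses and rkeys into plain integer vectors before shipping them to the peer. A compact sketch of that collection step using the public `ibv_mr` fields (the helper name is illustrative):

```cpp
#include <cstdint>
#include <vector>
#include <infiniband/verbs.h>

// Sketch: flatten registered memory regions into (address, rkey) vectors so
// they can be sent to the remote side as plain integers.
static void collect_mr_info(const std::vector<ibv_mr*>& mrs,
                            std::vector<uint64_t>* addrs,
                            std::vector<uint32_t>* rkeys) {
    addrs->reserve(mrs.size());
    rkeys->reserve(mrs.size());
    for (const ibv_mr* mr : mrs) {
        addrs->push_back(reinterpret_cast<uint64_t>(mr->addr));  // remote virtual address
        rkeys->push_back(mr->rkey);                              // remote access key
    }
}
```
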
/**
* Send memory region information from server to client
- *
+ *
* @param ctx The RDMA context
* @param local_mr Pointer to the local memory region to be sent
* @param byte_num Size of the memory region in bytes
@@ -796,16 +796,16 @@ bool server_send_memory_region(struct RdmaContext *ctx, void *local_mr, int byte
ibv_dereg_mr(ctx->conn.send_mr);
return false;
}
-
+
// Wait for completion
struct ibv_wc wc;
ctx->conn.wc_count = 0;
ctx->conn.wc_target_count = 0;
-
+
if (!poll_cq_with_timeout(ctx, RDMA_POLL_CQE_TIMEOUT, 1)) {
return false;
}
-
+
// Deregister the memory region
ibv_dereg_mr(ctx->conn.send_mr);
return true;
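
The send path above registers a scratch buffer, posts a single signaled SEND, waits for its completion, then deregisters. A trimmed, self-contained sketch of that sequence of verbs calls (function name and timeout policy are illustrative; `pd`, `qp`, and `cq` are assumed to be already set up):

```cpp
#include <cstddef>
#include <cstdint>
#include <infiniband/verbs.h>

// Sketch: send `len` bytes from `buf` over `qp`, then busy-poll `cq` for the
// single signaled completion. Returns false on any verbs failure.
static bool send_blob(struct ibv_pd* pd, struct ibv_qp* qp, struct ibv_cq* cq,
                      void* buf, size_t len) {
    struct ibv_mr* mr = ibv_reg_mr(pd, buf, len, IBV_ACCESS_LOCAL_WRITE);
    if (!mr) return false;

    struct ibv_sge sge = {};
    sge.addr = reinterpret_cast<uint64_t>(buf);
    sge.length = static_cast<uint32_t>(len);
    sge.lkey = mr->lkey;

    struct ibv_send_wr wr = {}, *bad = nullptr;
    wr.sg_list = &sge;
    wr.num_sge = 1;
    wr.opcode = IBV_WR_SEND;
    wr.send_flags = IBV_SEND_SIGNALED;   // request a completion entry

    bool ok = (ibv_post_send(qp, &wr, &bad) == 0);

    struct ibv_wc wc;
    int n = 0;
    while (ok && (n = ibv_poll_cq(cq, 1, &wc)) == 0) {
        // Busy-wait; production code bounds this with a timeout as shown earlier.
    }
    ok = ok && n > 0 && wc.status == IBV_WC_SUCCESS;

    ibv_dereg_mr(mr);                    // always release the registration
    return ok;
}
```
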
@@ -813,7 +813,7 @@ bool server_send_memory_region(struct RdmaContext *ctx, void *local_mr, int byte
/**
* Receive memory region information on the client side
- *
+ *
* @param ctx The RDMA context
* @param remote_mr Pointer to the buffer where remote memory region info will be stored
* @param byte_num Size of the memory region in bytes
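
On the receiving side, the counterpart pre-posts a buffer to the receive queue so the peer's SEND has somewhere to land. A sketch of that posting step (function name is hypothetical; the caller is assumed to poll the CQ for the receive completion and then deregister the MR):

```cpp
#include <cstddef>
#include <cstdint>
#include <infiniband/verbs.h>

// Sketch: register `buf` for local write and post it to the QP's receive queue.
// Returns the MR on success so the caller can deregister it after completion.
static struct ibv_mr* post_recv_blob(struct ibv_pd* pd, struct ibv_qp* qp,
                                     void* buf, size_t len) {
    struct ibv_mr* mr = ibv_reg_mr(pd, buf, len, IBV_ACCESS_LOCAL_WRITE);
    if (!mr) return nullptr;

    struct ibv_sge sge = {};
    sge.addr = reinterpret_cast<uint64_t>(buf);
    sge.length = static_cast<uint32_t>(len);
    sge.lkey = mr->lkey;

    struct ibv_recv_wr wr = {}, *bad = nullptr;
    wr.sg_list = &sge;
    wr.num_sge = 1;

    if (ibv_post_recv(qp, &wr, &bad) != 0) {
        ibv_dereg_mr(mr);
        return nullptr;
    }
    return mr;  // caller deregisters once the receive completion is polled
}
```
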
@@ -863,17 +863,17 @@ bool client_receive_memory_region(struct RdmaContext *ctx, void *remote_mr, int
/**
* Sets up a listening socket on the specified port
- *
+ *
* @param port The port number to listen on
* @return The socket file descriptor on success, -1 on failure
*/
int setup_listening_socket(int port) {
int sockfd = -1;
struct addrinfo hints = {0};
-
+
// Set up hints for getaddrinfo
hints.ai_flags = AI_PASSIVE;
- hints.ai_family = AF_UNSPEC;
+ hints.ai_family = AF_UNSPEC;
hints.ai_socktype = SOCK_STREAM;
struct addrinfo *res = nullptr;
@@ -881,14 +881,14 @@ int setup_listening_socket(int port) {
// Convert port to string for getaddrinfo
std::ostringstream service;
service << port;
-
+
// Get address info for the specified port
int n = getaddrinfo(nullptr, service.str().c_str(), &hints, &res);
if (n != 0) {
ERR("getaddrinfo failed for port %d: %s", port, gai_strerror(n));
return -1;
}
-
+
// Check if a specific network interface is specified
const char *ifname = KVCacheConfig::getInstance().get_socket_interface();
// Try each address until we successfully bind to one
@@ -913,7 +913,7 @@ int setup_listening_socket(int port) {
// Enable address reuse
n = 1;
setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof(n));
-
+
// Attempt to bind to the address
if (bind(sockfd, t->ai_addr, t->ai_addrlen) == 0) {
break; // Successful bind
@@ -948,7 +948,7 @@ int setup_listening_socket(int port) {
close(sockfd);
return -1;
}
-
+
// Enable TCP keep-alive
int enable = 1;
if (setsockopt(sockfd, SOL_SOCKET, SO_KEEPALIVE, &enable, sizeof(enable)) < 0) {
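
The socket setup touched in these hunks follows the standard getaddrinfo/bind/listen recipe with SO_REUSEADDR and TCP keep-alive. A condensed, self-contained sketch of the same sequence (the function name and backlog value are illustrative, and interface binding is omitted):

```cpp
#include <cstring>
#include <string>
#include <netdb.h>
#include <sys/socket.h>
#include <unistd.h>

// Sketch: create a TCP listening socket on `port`, trying each address
// returned by getaddrinfo. Returns the fd on success, -1 on failure.
static int listen_on_port(int port) {
    struct addrinfo hints;
    std::memset(&hints, 0, sizeof(hints));
    hints.ai_flags = AI_PASSIVE;       // addresses suitable for bind()
    hints.ai_family = AF_UNSPEC;       // IPv4 or IPv6
    hints.ai_socktype = SOCK_STREAM;

    struct addrinfo* res = nullptr;
    if (getaddrinfo(nullptr, std::to_string(port).c_str(), &hints, &res) != 0) {
        return -1;
    }

    int fd = -1;
    for (struct addrinfo* t = res; t; t = t->ai_next) {
        fd = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
        if (fd < 0) continue;

        int one = 1;
        setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));  // allow fast restart
        if (bind(fd, t->ai_addr, t->ai_addrlen) == 0) break;          // bound successfully

        close(fd);
        fd = -1;
    }
    freeaddrinfo(res);
    if (fd < 0) return -1;

    int keepalive = 1;
    setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &keepalive, sizeof(keepalive));
    if (listen(fd, /*backlog=*/1) != 0) {
        close(fd);
        return -1;
    }
    return fd;
}
```
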
diff --git a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/src/kvcache_rdma.cpp b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/src/kvcache_rdma.cpp
index 16df80701..3f2d21016 100644
--- a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/src/kvcache_rdma.cpp
+++ b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/src/kvcache_rdma.cpp
@@ -3,13 +3,13 @@
* @brief RDMA-based Key-Value Cache Communication Implementation
* @version 1.0.0
* @copyright Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
- *
+ *
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
- *
+ *
* http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -34,15 +34,15 @@
/**
* @brief Construct a new RDMACommunicator object
- *
+ *
* @param role Role in distributed system ("decode" or "prefill")
* @param gpu_idx GPU device index to use
* @param port Communication port number
* @param local_key_cache Vector of local key cache pointers
- * @param local_value_cache Vector of local value cache pointers
+ * @param local_value_cache Vector of local value cache pointers
* @param block_number Number of blocks in cache
* @param block_bytes Size of each block in bytes
- *
+ *
* @throws std::runtime_error If initialization fails
*/
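
Based only on the parameters documented in this comment, constructing the communicator would look roughly like the hedged sketch below. The element type of the cache-pointer vectors and the exact argument types are assumptions, not taken from the (truncated) signature that follows; the real class header is assumed to be included.

```cpp
#include <string>
#include <vector>

// Hedged usage sketch: parameter types beyond those visible in the doc comment
// above (notably the cache-pointer vector element type) are assumptions.
void make_communicator_example(std::vector<void*>& key_cache_ptrs,
                               std::vector<void*>& value_cache_ptrs,
                               int block_number, int block_bytes) {
    std::string role = "prefill";   // or "decode", per the documented roles
    int gpu_idx = 0;                // GPU device index
    int port = 9500;                // example communication port

    // The constructor is documented to throw std::runtime_error on failure,
    // so callers may want to wrap this in a try/catch.
    RDMACommunicator comm(role, gpu_idx, port,
                          key_cache_ptrs, value_cache_ptrs,
                          block_number, block_bytes);
    (void)comm;
}
```
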
RDMACommunicator::RDMACommunicator(std::string &role, int gpu_idx,
@@ -50,16 +50,16 @@ RDMACommunicator::RDMACommunicator(std::string &role, int gpu_idx,
std::vector