diff --git a/.flake8 b/.flake8 new file mode 100644 index 000000000..869c57d3e --- /dev/null +++ b/.flake8 @@ -0,0 +1,7 @@ +[flake8] +ignore = E203, E402, E501, E731, E741, W503, W605, E722 +max-line-length = 119 + +# E402: module level import not at top of file +per-file-ignores = + __init__.py:F401,F403,E402 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index cc4574e91..518b15eb9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,7 +2,7 @@ name: CI on: pull_request: - branches: + branches: - develop - 'release/*' workflow_dispatch: @@ -86,4 +86,4 @@ jobs: git config --global --add safe.directory /workspace/FastDeploy cd FastDeploy bash scripts/run_ci.sh - " \ No newline at end of file + " diff --git a/.github/workflows/ci_xpu.yml b/.github/workflows/ci_xpu.yml index e87136704..7bb267fd2 100644 --- a/.github/workflows/ci_xpu.yml +++ b/.github/workflows/ci_xpu.yml @@ -2,7 +2,7 @@ name: CI_XPU on: pull_request: - branches: + branches: - develop - 'release/*' workflow_dispatch: @@ -63,7 +63,7 @@ jobs: if [[ "$last_char" =~ [0-3] ]]; then gpu_id="$last_char" else - gpu_id="0" + gpu_id="0" fi FD_API_PORT=$((9180 + gpu_id * 100)) FD_ENGINE_QUEUE_PORT=$((9150 + gpu_id * 100)) @@ -84,4 +84,4 @@ jobs: git config --global --add safe.directory /workspace/FastDeploy cd FastDeploy bash scripts/run_ci_xpu.sh - " \ No newline at end of file + " diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c7a2d150e..ce8942933 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,12 +5,27 @@ default_stages: - pre-commit # Run locally # - manual # Run in CI repos: +- repo: https://github.com/psf/black.git + rev: 22.8.0 + hooks: + - id: black + files: \.(py|pyi)$ + additional_dependencies: [toml] +# 自动排序 +- repo: https://github.com/PyCQA/isort + rev: 5.11.5 + hooks: + - id: isort +- repo: https://github.com/PyCQA/flake8 + rev: 4.0.1 + hooks: + - id: flake8 # 代码检查 - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.11.7 hooks: - id: ruff - args: [--output-format, github, --fix, --line-length=120] + args: [--output-format, github, --fix, --line-length=120, --config, pyproject.toml] # # 拼写检查 # - repo: https://github.com/codespell-project/codespell # rev: v2.4.1 @@ -18,17 +33,13 @@ repos: # - id: codespell # additional_dependencies: ['tomli'] # args: ['--toml', 'pyproject.toml'] -# 自动排序 -- repo: https://github.com/PyCQA/isort - rev: 6.0.1 - hooks: - - id: isort + # markdown - repo: https://github.com/jackdewinter/pymarkdown rev: v0.9.29 hooks: - id: pymarkdown - args: [fix] + args: ["-d", "MD029,MD031", fix] - repo: https://github.com/pre-commit/pre-commit-hooks rev: v5.0.0 hooks: diff --git a/README.md b/README.md index f0dbde14a..fd94d27c5 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ - +
@@ -17,8 +17,8 @@ | Quick Start | - Supported Models - + Supported Models +
-------------------------------------------------------------------------------- diff --git a/benchmarks/README.md b/benchmarks/README.md index aa9858ced..85a0a6f41 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -131,4 +131,4 @@ python benchmarks/benchmark_mtp.py \ --s_itl-base-model:主模型的解码延迟,可由上述的性能压测工具获得,与batch-size一一对应 --dataset-name:指定数据集类,指定为"EBChat"可读取转存的FD格式数据集 --dataset-path:测试数据集路径 -``` \ No newline at end of file +``` diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index aacc94fab..c83b725ec 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -29,13 +29,13 @@ from typing import Optional import aiohttp from tqdm.asyncio import tqdm - AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) @dataclass class RequestFuncInput: """Input for requesting LLMs via API""" + no: int prompt: str history_QA: Optional[dict] @@ -55,6 +55,7 @@ class RequestFuncInput: @dataclass class RequestFuncOutput: """Output for requesting LLMs via API""" + no: int = 0 generated_text: str = "" reasoning_content: str = "" @@ -66,7 +67,7 @@ class RequestFuncOutput: itl: list = field(default_factory=list) # list of inter-token latencies tpot: float = 0.0 # avg next-token latencies prompt_len: int = 0 - prompt_tokens: int = 0 # 推理侧返回输入token数 + prompt_tokens: int = 0 # 推理侧返回输入token数 error: str = "" @@ -76,12 +77,9 @@ async def async_request_eb_openai_chat_completions( ) -> RequestFuncOutput: """Request an LLM using EB OpenAI""" api_url = request_func_input.api_url - assert api_url.endswith( - ("completions", "profile") - ), "OpenAI Chat Completions API URL must end with 'completions'." + assert api_url.endswith(("completions", "profile")), "OpenAI Chat Completions API URL must end with 'completions'." 
- async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session: content = [{"type": "text", "text": request_func_input.prompt}] if request_func_input.multi_modal_content: content.append(request_func_input.multi_modal_content) @@ -91,7 +89,7 @@ async def async_request_eb_openai_chat_completions( "stream": True, "stream_options": { "include_usage": True, - "continuous_usage_stats": True + "continuous_usage_stats": True, }, } # 超参由yaml传入 @@ -99,8 +97,8 @@ async def async_request_eb_openai_chat_completions( if request_func_input.ignore_eos: payload["ignore_eos"] = request_func_input.ignore_eos - - print("payload:{}".format(json.dumps(payload, ensure_ascii=False))) + + print(f"payload:{json.dumps(payload, ensure_ascii=False)}") headers = { "Content-Type": "application/json", @@ -115,16 +113,14 @@ async def async_request_eb_openai_chat_completions( st = time.perf_counter() most_recent_timestamp = st try: - async with session.post(url=api_url, json=payload, - headers=headers) as response: + async with session.post(url=api_url, json=payload, headers=headers) as response: if response.status == 200: async for chunk_bytes in response.content: chunk_bytes = chunk_bytes.strip() if not chunk_bytes: continue - chunk = chunk_bytes.decode("utf-8").removeprefix( - "data: ") + chunk = chunk_bytes.decode("utf-8").removeprefix("data: ") if chunk != "[DONE]": # print("####chunk:", chunk, type(chunk)) timestamp = time.perf_counter() @@ -138,22 +134,20 @@ async def async_request_eb_openai_chat_completions( ttft = timestamp - st output.ttft = ttft # cached_tokens - output.prompt_len = data["usage"].get("prompt_tokens_details", {}).get("cached_tokens", 0) - + output.prompt_len = ( + data["usage"].get("prompt_tokens_details", {}).get("cached_tokens", 0) + ) # Decoding phase else: - output.itl.append(timestamp - - most_recent_timestamp) + output.itl.append(timestamp - most_recent_timestamp) output.generated_text += content or "" output.reasoning_content += reason_content or "" output.arrival_time.append(choices[0].get("arrival_time", timestamp)) elif usage := data.get("usage", {}): - output.output_tokens = usage.get( - "completion_tokens", 0) - output.prompt_tokens = usage.get( - "prompt_tokens", 0) + output.output_tokens = usage.get("completion_tokens", 0) + output.prompt_tokens = usage.get("prompt_tokens", 0) most_recent_timestamp = timestamp @@ -166,7 +160,12 @@ async def async_request_eb_openai_chat_completions( output.latency = most_recent_timestamp - st else: error_text = await response.text() - print("####error response:", error_text, "####payload:", payload) + print( + "####error response:", + error_text, + "####payload:", + payload, + ) output.error = error_text or "" output.success = False except Exception: @@ -194,15 +193,14 @@ async def async_request_eb_openai_completions( ("completions", "profile") ), "OpenAI Completions API URL must end with 'completions' or 'profile'." 
- async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session: payload = { "model": request_func_input.model, "prompt": request_func_input.prompt, "stream": True, "stream_options": { "include_usage": True, - "continuous_usage_stats": True + "continuous_usage_stats": True, }, } # 超参由yaml传入 @@ -210,12 +208,12 @@ async def async_request_eb_openai_completions( if request_func_input.ignore_eos: payload["ignore_eos"] = request_func_input.ignore_eos - + print("payload:", json.dumps(payload, ensure_ascii=False)) headers = { "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}", - "Content-Type": "application/json" + "Content-Type": "application/json", } output = RequestFuncOutput() @@ -227,8 +225,7 @@ async def async_request_eb_openai_completions( st = time.perf_counter() most_recent_timestamp = st try: - async with session.post(url=api_url, json=payload, - headers=headers) as response: + async with session.post(url=api_url, json=payload, headers=headers) as response: if response.status == 200: first_chunk_received = False async for chunk_bytes in response.content: @@ -236,8 +233,7 @@ async def async_request_eb_openai_completions( if not chunk_bytes: continue - chunk = chunk_bytes.decode("utf-8").removeprefix( - "data: ") + chunk = chunk_bytes.decode("utf-8").removeprefix("data: ") if chunk != "[DONE]": # print("####chunk:", chunk, chunk.usage) timestamp = time.perf_counter() @@ -250,7 +246,7 @@ async def async_request_eb_openai_completions( # Note that text could be empty here # e.g. for special tokens text = choices[0].get("text") - + # First token if not first_chunk_received: first_chunk_received = True @@ -259,26 +255,23 @@ async def async_request_eb_openai_completions( # Decoding phase else: - output.itl.append(timestamp - - most_recent_timestamp) - + output.itl.append(timestamp - most_recent_timestamp) + generated_text += text or "" most_recent_timestamp = timestamp output.arrival_time.append(choices[0].get("arrival_time", timestamp)) elif usage := data.get("usage"): - output.prompt_tokens = usage.get( - "prompt_tokens") - output.output_tokens = usage.get( - "completion_tokens") + output.prompt_tokens = usage.get("prompt_tokens") + output.output_tokens = usage.get("completion_tokens") if first_chunk_received: output.success = True else: output.success = False output.error = ( - "Never received a valid chunk to calculate TTFT." - "This response will be marked as failed!") - + "Never received a valid chunk to calculate TTFT." "This response will be marked as failed!" 
+ ) + output.generated_text = generated_text output.latency = most_recent_timestamp - st @@ -294,8 +287,8 @@ async def async_request_eb_openai_completions( output.success = False exc_info = sys.exc_info() output.error = "".join(traceback.format_exception(*exc_info)) - - print("final_output:{}".format(output)) + + print(f"final_output:{output}") if pbar: pbar.update(1) @@ -310,8 +303,7 @@ async def async_request_tgi( api_url = request_func_input.api_url assert api_url.endswith("generate_stream") - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session: params = { "max_new_tokens": request_func_input.output_len, "do_sample": True, @@ -358,8 +350,7 @@ async def async_request_tgi( # Decoding phase else: - output.itl.append(timestamp - - most_recent_timestamp) + output.itl.append(timestamp - most_recent_timestamp) most_recent_timestamp = timestamp output.arrival_time.append(data["arrival_time"]) @@ -388,8 +379,7 @@ async def async_request_trt_llm( api_url = request_func_input.api_url assert api_url.endswith("generate_stream") - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session: payload = { "accumulate_tokens": True, "text_input": request_func_input.prompt, @@ -414,8 +404,7 @@ async def async_request_trt_llm( if not chunk_bytes: continue - chunk = chunk_bytes.decode("utf-8").removeprefix( - "data:") + chunk = chunk_bytes.decode("utf-8").removeprefix("data:") data = json.loads(chunk) output.generated_text += data["text_output"] @@ -427,8 +416,7 @@ async def async_request_trt_llm( # Decoding phase else: - output.itl.append(timestamp - - most_recent_timestamp) + output.itl.append(timestamp - most_recent_timestamp) most_recent_timestamp = timestamp @@ -453,8 +441,7 @@ async def async_request_deepspeed_mii( pbar: Optional[tqdm] = None, ) -> RequestFuncOutput: """Request an LLM using Deepspeed MII""" - async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session: payload = { "prompt": request_func_input.prompt, @@ -472,19 +459,16 @@ async def async_request_deepspeed_mii( st = time.perf_counter() try: - async with session.post(url=request_func_input.api_url, - json=payload) as response: + async with session.post(url=request_func_input.api_url, json=payload) as response: if response.status == 200: parsed_resp = await response.json() output.latency = time.perf_counter() - st if "choices" in parsed_resp: - output.generated_text = parsed_resp["choices"][0][ - "text"] + output.generated_text = parsed_resp["choices"][0]["text"] elif "text" in parsed_resp: output.generated_text = parsed_resp["text"][0] else: - output.error = ("Unexpected response format: " - "neither 'choices' nor 'text' found") + output.error = "Unexpected response format: " "neither 'choices' nor 'text' found" output.success = False output.success = True else: @@ -510,26 +494,22 @@ async def async_request_openai_completions( ("completions", "profile") ), "OpenAI Completions API URL must end with 'completions' or 'profile'." 
- async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session: payload = { - "model": request_func_input.model_name \ - if request_func_input.model_name else request_func_input.model, + "model": (request_func_input.model_name if request_func_input.model_name else request_func_input.model), "prompt": request_func_input.prompt, # "temperature": 0.0, "max_tokens": request_func_input.output_len, "logprobs": request_func_input.logprobs, "stream": True, - #"stream_options": { + # "stream_options": { # "include_usage": True, - #}, + # }, } if request_func_input.ignore_eos: payload["ignore_eos"] = request_func_input.ignore_eos - headers = { - "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}" - } + headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} output = RequestFuncOutput() output.prompt_len = request_func_input.prompt_len @@ -538,8 +518,7 @@ async def async_request_openai_completions( st = time.perf_counter() most_recent_timestamp = st try: - async with session.post(url=api_url, json=payload, - headers=headers) as response: + async with session.post(url=api_url, json=payload, headers=headers) as response: if response.status == 200: first_chunk_received = False async for chunk_bytes in response.content: @@ -547,8 +526,7 @@ async def async_request_openai_completions( if not chunk_bytes: continue - chunk = chunk_bytes.decode("utf-8").removeprefix( - "data: ") + chunk = chunk_bytes.decode("utf-8").removeprefix("data: ") if chunk != "[DONE]": # print("####chunk:", chunk, type(chunk)) data = json.loads(chunk) @@ -569,21 +547,19 @@ async def async_request_openai_completions( # Decoding phase else: - output.itl.append(timestamp - - most_recent_timestamp) + output.itl.append(timestamp - most_recent_timestamp) most_recent_timestamp = timestamp generated_text += text or "" elif usage := data.get("usage"): - output.output_tokens = usage.get( - "completion_tokens") + output.output_tokens = usage.get("completion_tokens") if first_chunk_received: output.success = True else: output.success = False output.error = ( - "Never received a valid chunk to calculate TTFT." - "This response will be marked as failed!") + "Never received a valid chunk to calculate TTFT." "This response will be marked as failed!" + ) output.generated_text = generated_text output.latency = most_recent_timestamp - st else: @@ -606,25 +582,24 @@ async def async_request_openai_audio( """Request an LLM using OpenAI""" # Lazy import without PlaceholderModule to avoid vllm dep. import soundfile + api_url = request_func_input.api_url assert api_url.endswith( - ("transcriptions", "translations" - )), "OpenAI Chat Completions API URL must end with 'transcriptions' " + ("transcriptions", "translations") + ), "OpenAI Chat Completions API URL must end with 'transcriptions' " "or `translations`." 
- async with aiohttp.ClientSession(trust_env=True, - timeout=AIOHTTP_TIMEOUT) as session: + async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session: content = [{"type": "text", "text": request_func_input.prompt}] payload = { - "model": request_func_input.model_name \ - if request_func_input.model_name else request_func_input.model, + "model": (request_func_input.model_name if request_func_input.model_name else request_func_input.model), "temperature": 0.0, "max_completion_tokens": request_func_input.output_len, "stream": True, "language": "en", # Flattened due to multipart/form-data "stream_include_usage": True, - "stream_continuous_usage_stats": True + "stream_continuous_usage_stats": True, } if request_func_input.extra_body: payload.update(request_func_input.extra_body) @@ -639,9 +614,9 @@ async def async_request_openai_audio( buffer.seek(0) return buffer - with to_bytes(*request_func_input.multi_modal_content['audio']) as f: + with to_bytes(*request_func_input.multi_modal_content["audio"]) as f: form = aiohttp.FormData() - form.add_field('file', f, content_type='audio/wav') + form.add_field("file", f, content_type="audio/wav") for key, value in payload.items(): form.add_field(key, str(value)) @@ -653,24 +628,20 @@ async def async_request_openai_audio( st = time.perf_counter() most_recent_timestamp = st try: - async with session.post(url=api_url, - data=form, - headers=headers) as response: + async with session.post(url=api_url, data=form, headers=headers) as response: if response.status == 200: async for chunk_bytes in response.content: chunk_bytes = chunk_bytes.strip() if not chunk_bytes: continue - chunk = chunk_bytes.decode("utf-8").removeprefix( - "data: ") + chunk = chunk_bytes.decode("utf-8").removeprefix("data: ") if chunk != "[DONE]": timestamp = time.perf_counter() data = json.loads(chunk) if choices := data.get("choices"): - content = choices[0]["delta"].get( - "content") + content = choices[0]["delta"].get("content") # First token if ttft == 0.0: ttft = timestamp - st @@ -678,13 +649,11 @@ async def async_request_openai_audio( # Decoding phase else: - output.itl.append( - timestamp - most_recent_timestamp) + output.itl.append(timestamp - most_recent_timestamp) generated_text += content or "" elif usage := data.get("usage"): - output.output_tokens = usage.get( - "completion_tokens") + output.output_tokens = usage.get("completion_tokens") most_recent_timestamp = timestamp @@ -718,8 +687,11 @@ ASYNC_REQUEST_FUNCS = { } OPENAI_COMPATIBLE_BACKENDS = [ - k for k, v in ASYNC_REQUEST_FUNCS.items() - if v in (async_request_openai_completions, - async_request_eb_openai_chat_completions) + k + for k, v in ASYNC_REQUEST_FUNCS.items() + if v + in ( + async_request_openai_completions, + async_request_eb_openai_chat_completions, + ) ] - diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py index 59ab4b454..551f0c9d5 100644 --- a/benchmarks/benchmark_dataset.py +++ b/benchmarks/benchmark_dataset.py @@ -26,9 +26,9 @@ from abc import ABC, abstractmethod from collections.abc import Mapping from dataclasses import dataclass from io import BytesIO -from typing import Any, Callable, Optional, Union -from PIL import Image +from typing import Any, Optional, Union +from PIL import Image logger = logging.getLogger(__name__) @@ -38,6 +38,7 @@ class SampleRequest: """ Represents a single inference request for benchmarking. 
""" + no: int prompt: Union[str, Any] history_QA: Union[str, Any] @@ -48,6 +49,7 @@ class SampleRequest: class BenchmarkDataset(ABC): """BenchmarkDataset""" + DEFAULT_SEED = 0 IS_MULTIMODAL = False @@ -68,8 +70,7 @@ class BenchmarkDataset(ABC): self.dataset_path = dataset_path # Set the random seed, ensuring that a None value is replaced with the # default seed. - self.random_seed = (random_seed - if random_seed is not None else self.DEFAULT_SEED) + self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED self.data = None self.hyperparameter_path = hyperparameter_path self.hyperparameters = {} @@ -85,8 +86,7 @@ class BenchmarkDataset(ABC): NotImplementedError: If a subclass does not implement this method. """ # TODO (jenniferzhao): add support for downloading data - raise NotImplementedError( - "load_data must be implemented in subclasses.") + raise NotImplementedError("load_data must be implemented in subclasses.") @abstractmethod def sample(self, num_requests: int) -> list[SampleRequest]: @@ -105,8 +105,7 @@ class BenchmarkDataset(ABC): """ raise NotImplementedError("sample must be implemented in subclasses.") - def maybe_oversample_requests(self, requests: list[SampleRequest], - num_requests: int) -> None: + def maybe_oversample_requests(self, requests: list[SampleRequest], num_requests: int) -> None: """ Oversamples the list of requests if its size is less than the desired number. @@ -117,11 +116,9 @@ class BenchmarkDataset(ABC): """ if len(requests) < num_requests: random.seed(self.random_seed) - additional = random.choices(requests, - k=num_requests - len(requests)) + additional = random.choices(requests, k=num_requests - len(requests)) requests.extend(additional) - logger.info("Oversampled requests to reach %d total samples.", - num_requests) + logger.info("Oversampled requests to reach %d total samples.", num_requests) def is_valid_sequence( @@ -141,14 +138,12 @@ def is_valid_sequence( """ # Check for invalid conditions prompt_too_short = prompt_len < min_len - output_too_short = (not skip_min_output_len_check) and (output_len - < min_len) + output_too_short = (not skip_min_output_len_check) and (output_len < min_len) prompt_too_long = prompt_len > max_prompt_len combined_too_long = (prompt_len + output_len) > max_total_len # Return True if none of the invalid conditions are met - return not (prompt_too_short or output_too_short or prompt_too_long - or combined_too_long) + return not (prompt_too_short or output_too_short or prompt_too_long or combined_too_long) def process_image(image: Any) -> Mapping[str, Any]: @@ -171,28 +166,25 @@ def process_image(image: Any) -> Mapping[str, Any]: Raises: ValueError: If the input is not a supported type. 
""" - if isinstance(image, dict) and 'bytes' in image: - image = Image.open(BytesIO(image['bytes'])) + if isinstance(image, dict) and "bytes" in image: + image = Image.open(BytesIO(image["bytes"])) if isinstance(image, Image.Image): image = image.convert("RGB") with io.BytesIO() as image_data: image.save(image_data, format="JPEG") - image_base64 = base64.b64encode( - image_data.getvalue()).decode("utf-8") + image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8") return { "type": "image_url", - "image_url": { - "url": f"data:image/jpeg;base64,{image_base64}" - }, + "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}, } if isinstance(image, str): - image_url = (image if image.startswith( - ("http://", "file://")) else f"file://{image}") + image_url = image if image.startswith(("http://", "file://")) else f"file://{image}" return {"type": "image_url", "image_url": {"url": image_url}} - raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image" - " or str or dictionary with raw image bytes.") + raise ValueError( + f"Invalid image input {image}. Must be a PIL.Image.Image" " or str or dictionary with raw image bytes." + ) class EBDataset(BenchmarkDataset): @@ -243,8 +235,7 @@ class EBDataset(BenchmarkDataset): new_output_len = int(entry["max_dec_len"]) if enable_multimodal_chat: - prompt = self.apply_multimodal_chat_transformation( - prompt, None) + prompt = self.apply_multimodal_chat_transformation(prompt, None) samples.append( SampleRequest( no=cnt, @@ -252,17 +243,20 @@ class EBDataset(BenchmarkDataset): prompt_len=self.prompt_len, history_QA=[], expected_output_len=new_output_len, - )) + ) + ) cnt += 1 self.maybe_oversample_requests(samples, num_requests) return samples + class EBChatDataset(BenchmarkDataset): """ Implements the ShareGPT dataset. Loads data from a JSON file and generates sample requests based on conversation turns. 
""" + prompt_len: int def __init__(self, **kwargs) -> None: @@ -296,8 +290,7 @@ class EBChatDataset(BenchmarkDataset): new_output_len = int(entry.get("max_tokens", 12288)) if enable_multimodal_chat: - prompt = self.apply_multimodal_chat_transformation( - prompt, None) + prompt = self.apply_multimodal_chat_transformation(prompt, None) samples.append( SampleRequest( no=cnt, @@ -306,9 +299,9 @@ class EBChatDataset(BenchmarkDataset): prompt_len=0, history_QA=history_QA, expected_output_len=new_output_len, - )) + ) + ) cnt += 1 self.maybe_oversample_requests(samples, num_requests) return samples - diff --git a/benchmarks/benchmark_mtp.py b/benchmarks/benchmark_mtp.py index 65c2392a1..2698a553b 100644 --- a/benchmarks/benchmark_mtp.py +++ b/benchmarks/benchmark_mtp.py @@ -18,28 +18,16 @@ import argparse import asyncio import contextlib import os -import signal -import socket -import subprocess -import time from typing import Union -import openai -import yaml -from benchmark_dataset import EBChatDataset, EBDataset, SampleRequest +from benchmark_dataset import EBChatDataset, EBDataset from benchmark_serving import benchmark -def prepare_input_requests( - num_prompts: int, dataset_name: str, dataset_path: str -) -> Union[EBDataset, EBChatDataset]: +def prepare_input_requests(num_prompts: int, dataset_name: str, dataset_path: str) -> Union[EBDataset, EBChatDataset]: dataset_mapping = { - "EB": lambda: EBDataset(dataset_path=dataset_path).sample( - num_requests=num_prompts - ), - "EBChat": lambda: EBChatDataset(dataset_path=dataset_path).sample( - num_requests=num_prompts - ), + "EB": lambda: EBDataset(dataset_path=dataset_path).sample(num_requests=num_prompts), + "EBChat": lambda: EBChatDataset(dataset_path=dataset_path).sample(num_requests=num_prompts), } try: @@ -104,24 +92,27 @@ def calculate_speedup(acceptance_rate, draft_token_step, t_ori, t_mtp): def main(args): base_url = f"http://{args.host}:{args.port}" - input_requests = prepare_input_requests( - args.num_prompts, args.dataset_name, args.dataset_path - ) + input_requests = prepare_input_requests(args.num_prompts, args.dataset_name, args.dataset_path) if len(args.max_concurrency) != len(args.s_itl_base_model): - raise ValueError(f"--max_concurrency should be same length as --s_itl_base_model") + raise ValueError("--max_concurrency should be same length as --s_itl_base_model") for max_concurrency, s_itl in zip(args.max_concurrency, args.s_itl_base_model): # Wramup print("Starting warmup...") with open(os.devnull, "w") as f: with contextlib.redirect_stdout(f): - send_one_batch(base_url, max_concurrency, input_requests[0:max_concurrency], True) + send_one_batch( + base_url, + max_concurrency, + input_requests[0:max_concurrency], + True, + ) # Benchmark record = send_one_batch(base_url, max_concurrency, input_requests, False) - metric_header = f"Speed up" + metric_header = "Speed up" print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-")) for draft_token_step in args.draft_token_steps: speedup = calculate_speedup( @@ -130,11 +121,7 @@ def main(args): s_itl, record["mean_s_itl_ms"], ) - print( - "{:<40} {:<10.2f}".format( - f"Speed up on {draft_token_step} steps draft", speedup - ) - ) + print("{:<40} {:<10.2f}".format(f"Speed up on {draft_token_step} steps draft", speedup)) print("=" * 50) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index e015117b3..25825061a 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -25,22 +25,23 @@ import os import random import time import 
warnings -import yaml +from argparse import ArgumentParser as FlexibleArgumentParser from collections.abc import AsyncGenerator, Iterable from dataclasses import dataclass from datetime import datetime from typing import Any, Optional import numpy as np -from backend_request_func import (ASYNC_REQUEST_FUNCS, - OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput, - RequestFuncOutput) -from tqdm.asyncio import tqdm - -from argparse import ArgumentParser as FlexibleArgumentParser - -from benchmark_dataset import (SampleRequest, EBDataset, EBChatDataset) +import yaml +from backend_request_func import ( + ASYNC_REQUEST_FUNCS, + OPENAI_COMPATIBLE_BACKENDS, + RequestFuncInput, + RequestFuncOutput, +) +from benchmark_dataset import EBChatDataset, EBDataset, SampleRequest from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json +from tqdm.asyncio import tqdm MILLISECONDS_TO_SECONDS_CONVERSION = 1000 @@ -48,6 +49,7 @@ MILLISECONDS_TO_SECONDS_CONVERSION = 1000 @dataclass class BenchmarkMetrics: """Class containing all metrics that are used in this script""" + completed: int total_input: int total_output: int @@ -130,8 +132,7 @@ async def get_request( input_requests: Iterable[SampleRequest] = iter(input_requests) # Calculate scale parameter theta to maintain the desired request_rate. - assert burstiness > 0, ( - f"A positive burstiness factor is expected, but given {burstiness}.") + assert burstiness > 0, f"A positive burstiness factor is expected, but given {burstiness}." theta = 1.0 / (request_rate * burstiness) for request in input_requests: @@ -157,7 +158,7 @@ def calculate_metrics( ) -> tuple[BenchmarkMetrics, list[int]]: """Calculates various performance metrics based on the inputs and outputs.""" input_lens: list[int] = [] - infer_input_lens: list[int] = [] # 推理侧输入token数 + infer_input_lens: list[int] = [] # 推理侧输入token数 actual_output_lens: list[int] = [] total_input = 0 completed = 0 @@ -208,8 +209,9 @@ def calculate_metrics( s_e2els.append(outputs[i].arrival_time[-1]) # 解码速度去掉首token if len(outputs[i].arrival_time) > 2: - s_decodes.append((outputs[i].output_tokens - 1) / - (outputs[i].arrival_time[-1] - outputs[i].arrival_time[1])) + s_decodes.append( + (outputs[i].output_tokens - 1) / (outputs[i].arrival_time[-1] - outputs[i].arrival_time[1]) + ) else: print("len(outputs[i].arrival_time) <= 2") completed += 1 @@ -224,16 +226,13 @@ def calculate_metrics( if "ttft" in goodput_config_dict: valid_metrics.append(ttfts) - slo_values.append(goodput_config_dict["ttft"] / - MILLISECONDS_TO_SECONDS_CONVERSION) + slo_values.append(goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION) if "tpot" in goodput_config_dict: valid_metrics.append(all_tpots) - slo_values.append(goodput_config_dict["tpot"] / - MILLISECONDS_TO_SECONDS_CONVERSION) + slo_values.append(goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION) if "e2el" in goodput_config_dict: valid_metrics.append(e2els) - slo_values.append(goodput_config_dict["e2el"] / - MILLISECONDS_TO_SECONDS_CONVERSION) + slo_values.append(goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION) for req_metric in zip(*valid_metrics): is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)]) @@ -242,9 +241,9 @@ def calculate_metrics( if completed == 0: warnings.warn( - "All requests failed. This is likely due to a misconfiguration " - "on the benchmark arguments.", - stacklevel=2) + "All requests failed. 
This is likely due to a misconfiguration " "on the benchmark arguments.", + stacklevel=2, + ) metrics = BenchmarkMetrics( completed=completed, total_input=total_input, @@ -253,64 +252,50 @@ def calculate_metrics( request_goodput=good_completed / dur_s, output_throughput=sum(actual_output_lens) / dur_s, total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, - mean_s_decode=np.mean(s_decodes or 0) * - 1, # ttfts is empty if streaming is not supported by backend + mean_s_decode=np.mean(s_decodes or 0) * 1, # ttfts is empty if streaming is not supported by backend std_s_decode=np.std(s_decodes or 0) * 1, median_s_decode=np.median(s_decodes or 0) * 1, - percentiles_s_decode=[(p, np.percentile(s_decodes or 0, p) * 1) - for p in selected_percentiles], - mean_ttft_ms=np.mean(ttfts or 0) * - 1000, # ttfts is empty if streaming is not supported by backend + percentiles_s_decode=[(p, np.percentile(s_decodes or 0, p) * 1) for p in selected_percentiles], + mean_ttft_ms=np.mean(ttfts or 0) * 1000, # ttfts is empty if streaming is not supported by backend std_ttft_ms=np.std(ttfts or 0) * 1000, median_ttft_ms=np.median(ttfts or 0) * 1000, - percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000) - for p in selected_percentiles], - mean_s_ttft_ms=np.mean(s_ttfts or 0) * - 1000, # ttfts is empty if streaming is not supported by backend + percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles], + mean_s_ttft_ms=np.mean(s_ttfts or 0) * 1000, # ttfts is empty if streaming is not supported by backend std_s_ttft_ms=np.std(s_ttfts or 0) * 1000, median_s_ttft_ms=np.median(s_ttfts or 0) * 1000, - percentiles_s_ttft_ms=[(p, np.percentile(s_ttfts or 0, p) * 1000) - for p in selected_percentiles], + percentiles_s_ttft_ms=[(p, np.percentile(s_ttfts or 0, p) * 1000) for p in selected_percentiles], mean_tpot_ms=np.mean(tpots or 0) * 1000, std_tpot_ms=np.std(tpots or 0) * 1000, median_tpot_ms=np.median(tpots or 0) * 1000, - percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000) - for p in selected_percentiles], + percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles], mean_itl_ms=np.mean(itls or 0) * 1000, std_itl_ms=np.std(itls or 0) * 1000, median_itl_ms=np.median(itls or 0) * 1000, - percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) - for p in selected_percentiles], + percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles], mean_s_itl_ms=np.mean(s_itls or 0) * 1000, std_s_itl_ms=np.std(s_itls or 0) * 1000, median_s_itl_ms=np.median(s_itls or 0) * 1000, - percentiles_s_itl_ms=[(p, np.percentile(s_itls or 0, p) * 1000) - for p in selected_percentiles], + percentiles_s_itl_ms=[(p, np.percentile(s_itls or 0, p) * 1000) for p in selected_percentiles], mean_e2el_ms=np.mean(e2els or 0) * 1000, std_e2el_ms=np.std(e2els or 0) * 1000, median_e2el_ms=np.median(e2els or 0) * 1000, - percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) - for p in selected_percentiles], + percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles], mean_s_e2el_ms=np.mean(s_e2els or 0) * 1000, std_s_e2el_ms=np.std(s_e2els or 0) * 1000, median_s_e2el_ms=np.median(s_e2els or 0) * 1000, - percentiles_s_e2el_ms=[(p, np.percentile(s_e2els or 0, p) * 1000) - for p in selected_percentiles], + percentiles_s_e2el_ms=[(p, np.percentile(s_e2els or 0, p) * 1000) for p in selected_percentiles], mean_input_len=np.mean(input_lens or 0) * 1, std_input_len=np.std(input_lens or 
0) * 1, median_input_len=np.median(input_lens or 0) * 1, - percentiles_input_len=[(p, np.percentile(input_lens or 0, p)) - for p in selected_percentiles], + percentiles_input_len=[(p, np.percentile(input_lens or 0, p)) for p in selected_percentiles], mean_s_input_len=np.mean(infer_input_lens or 0) * 1, std_s_input_len=np.std(infer_input_lens or 0) * 1, median_s_input_len=np.median(infer_input_lens or 0) * 1, - percentiles_s_input_len=[(p, np.percentile(infer_input_lens or 0, p)) - for p in selected_percentiles], + percentiles_s_input_len=[(p, np.percentile(infer_input_lens or 0, p)) for p in selected_percentiles], mean_output_len=np.mean(actual_output_lens or 0) * 1, std_output_len=np.std(actual_output_lens or 0) * 1, median_output_len=np.median(actual_output_lens or 0) * 1, - percentiles_output_len=[(p, np.percentile(actual_output_lens or 0, p)) - for p in selected_percentiles], + percentiles_output_len=[(p, np.percentile(actual_output_lens or 0, p)) for p in selected_percentiles], ) return metrics, actual_output_lens @@ -344,9 +329,11 @@ async def benchmark( raise ValueError(f"Unknown backend: {backend}") print("Starting initial single prompt test run...") - test_prompt, test_output_len, test_no = \ - input_requests[0].prompt, \ - input_requests[0].expected_output_len, input_requests[0].no + test_prompt, test_output_len, test_no = ( + input_requests[0].prompt, + input_requests[0].expected_output_len, + input_requests[0].no, + ) test_history_QA = input_requests[0].history_QA test_input = RequestFuncInput( @@ -373,27 +360,28 @@ async def benchmark( if not test_output.success: raise ValueError( "Initial test run failed - Please make sure benchmark arguments " - f"are correctly specified. Error: {test_output.error}") + f"are correctly specified. Error: {test_output.error}" + ) else: print("Initial test run completed. Starting main benchmark run...") if lora_modules: # For each input request, choose a LoRA module at random. - lora_modules = iter( - [random.choice(lora_modules) \ - for _ in range(len(input_requests))]) + lora_modules = iter([random.choice(lora_modules) for _ in range(len(input_requests))]) if profile: print("Starting profiler...") - profile_input = RequestFuncInput(model=model_id, - model_name=model_name, - prompt=test_prompt, - no=test_no, - api_url=base_url + "/start_profile", - output_len=test_output_len, - logprobs=logprobs, - ignore_eos=ignore_eos, - extra_body=extra_body) + profile_input = RequestFuncInput( + model=model_id, + model_name=model_name, + prompt=test_prompt, + no=test_no, + api_url=base_url + "/start_profile", + output_len=test_output_len, + logprobs=logprobs, + ignore_eos=ignore_eos, + extra_body=extra_body, + ) profile_output = await request_func(request_func_input=profile_input) if profile_output.success: print("Profiler started") @@ -413,21 +401,22 @@ async def benchmark( # and it will simplify the code in limited_request_func. 
# semaphore = (asyncio.Semaphore(max_concurrency) # if max_concurrency else contextlib.nullcontext()) - semaphore = (asyncio.Semaphore(max_concurrency) - if max_concurrency else None) + semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None async def limited_request_func(request_func_input, pbar): if semaphore is None: - return await request_func(request_func_input=request_func_input, - pbar=pbar) + return await request_func(request_func_input=request_func_input, pbar=pbar) async with semaphore: - return await request_func(request_func_input=request_func_input, - pbar=pbar) + return await request_func(request_func_input=request_func_input, pbar=pbar) benchmark_start_time = time.perf_counter() tasks: list[asyncio.Task] = [] async for request in get_request(input_requests, request_rate, burstiness): - prompt, output_len, no = request.prompt, request.expected_output_len, request.no + prompt, output_len, no = ( + request.prompt, + request.expected_output_len, + request.no, + ) history_QA = request.history_QA req_model_id, req_model_name = model_id, model_name @@ -435,22 +424,21 @@ async def benchmark( req_lora_module = next(lora_modules) req_model_id, req_model_name = req_lora_module, req_lora_module - request_func_input = RequestFuncInput(model=req_model_id, - model_name=req_model_name, - prompt=prompt, - no=no, - prompt_len=0, - history_QA=history_QA, - hyper_parameters=hyper_parameters, - api_url=api_url, - output_len=output_len, - logprobs=logprobs, - ignore_eos=ignore_eos, - extra_body=extra_body) - tasks.append( - asyncio.create_task( - limited_request_func(request_func_input=request_func_input, - pbar=pbar))) + request_func_input = RequestFuncInput( + model=req_model_id, + model_name=req_model_name, + prompt=prompt, + no=no, + prompt_len=0, + history_QA=history_QA, + hyper_parameters=hyper_parameters, + api_url=api_url, + output_len=output_len, + logprobs=logprobs, + ignore_eos=ignore_eos, + extra_body=extra_body, + ) + tasks.append(asyncio.create_task(limited_request_func(request_func_input=request_func_input, pbar=pbar))) outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) if profile: @@ -473,7 +461,6 @@ async def benchmark( benchmark_duration = time.perf_counter() - benchmark_start_time print("benchmark_duration:", benchmark_duration) - metrics, actual_output_lens = calculate_metrics( input_requests=input_requests, outputs=outputs, @@ -483,22 +470,16 @@ async def benchmark( goodput_config_dict=goodput_config_dict, ) - print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) + print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="=")) print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) - print("{:<40} {:<10.2f}".format("Benchmark duration (s):", - benchmark_duration)) + print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) - print("{:<40} {:<10}".format("Total generated tokens:", - metrics.total_output)) - print("{:<40} {:<10.3f}".format("Request throughput (req/s):", - metrics.request_throughput)) + print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output)) + print("{:<40} {:<10.3f}".format("Request throughput (req/s):", metrics.request_throughput)) if goodput_config_dict: - print("{:<40} {:<10.2f}".format("Request goodput (req/s):", - metrics.request_goodput)) - print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", - metrics.output_throughput)) - print("{:<40} 
{:<10.2f}".format("Total Token throughput (tok/s):", - metrics.total_token_throughput)) + print("{:<40} {:<10.2f}".format("Request goodput (req/s):", metrics.request_goodput)) + print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", metrics.output_throughput)) + print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", metrics.total_token_throughput)) result = { "duration": benchmark_duration, @@ -506,8 +487,7 @@ async def benchmark( "total_input_tokens": metrics.total_input, "total_output_tokens": metrics.total_output, "request_throughput": metrics.request_throughput, - "request_goodput:": - metrics.request_goodput if goodput_config_dict else None, + "request_goodput:": (metrics.request_goodput if goodput_config_dict else None), "output_throughput": metrics.output_throughput, "total_token_throughput": metrics.total_token_throughput, "input_lens": [output.prompt_len for output in outputs], @@ -533,24 +513,25 @@ async def benchmark( # metric. if metric_attribute_name not in selected_percentile_metrics: return - print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-')) - print("{:<40} {:<10.2f}".format( - f"Mean {metric_name} (ms):", - getattr(metrics, f"mean_{metric_attribute_name}_ms"))) - print("{:<40} {:<10.2f}".format( - f"Median {metric_name} (ms):", - getattr(metrics, f"median_{metric_attribute_name}_ms"))) - result[f"mean_{metric_attribute_name}_ms"] = getattr( - metrics, f"mean_{metric_attribute_name}_ms") - result[f"median_{metric_attribute_name}_ms"] = getattr( - metrics, f"median_{metric_attribute_name}_ms") - result[f"std_{metric_attribute_name}_ms"] = getattr( - metrics, f"std_{metric_attribute_name}_ms") - for p, value in getattr(metrics, - f"percentiles_{metric_attribute_name}_ms"): + print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-")) + print( + "{:<40} {:<10.2f}".format( + f"Mean {metric_name} (ms):", + getattr(metrics, f"mean_{metric_attribute_name}_ms"), + ) + ) + print( + "{:<40} {:<10.2f}".format( + f"Median {metric_name} (ms):", + getattr(metrics, f"median_{metric_attribute_name}_ms"), + ) + ) + result[f"mean_{metric_attribute_name}_ms"] = getattr(metrics, f"mean_{metric_attribute_name}_ms") + result[f"median_{metric_attribute_name}_ms"] = getattr(metrics, f"median_{metric_attribute_name}_ms") + result[f"std_{metric_attribute_name}_ms"] = getattr(metrics, f"std_{metric_attribute_name}_ms") + for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"): p_word = str(int(p)) if int(p) == p else str(p) - print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", - value)) + print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value)) result[f"p{p_word}_{metric_attribute_name}_ms"] = value def process_one_length( @@ -565,31 +546,31 @@ async def benchmark( # metric. 
if metric_attribute_name not in selected_percentile_metrics: return - print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-')) - print("{:<40} {:<10.2f}".format( - f"Mean {metric_name}:", - getattr(metrics, f"mean_{metric_attribute_name}"))) - print("{:<40} {:<10.2f}".format( - f"Median {metric_name}:", - getattr(metrics, f"median_{metric_attribute_name}"))) - result[f"mean_{metric_attribute_name}"] = getattr( - metrics, f"mean_{metric_attribute_name}") - result[f"median_{metric_attribute_name}"] = getattr( - metrics, f"median_{metric_attribute_name}") - result[f"std_{metric_attribute_name}"] = getattr( - metrics, f"std_{metric_attribute_name}") - for p, value in getattr(metrics, - f"percentiles_{metric_attribute_name}"): + print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-")) + print( + "{:<40} {:<10.2f}".format( + f"Mean {metric_name}:", + getattr(metrics, f"mean_{metric_attribute_name}"), + ) + ) + print( + "{:<40} {:<10.2f}".format( + f"Median {metric_name}:", + getattr(metrics, f"median_{metric_attribute_name}"), + ) + ) + result[f"mean_{metric_attribute_name}"] = getattr(metrics, f"mean_{metric_attribute_name}") + result[f"median_{metric_attribute_name}"] = getattr(metrics, f"median_{metric_attribute_name}") + result[f"std_{metric_attribute_name}"] = getattr(metrics, f"std_{metric_attribute_name}") + for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}"): p_word = str(int(p)) if int(p) == p else str(p) - print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name}:", - value)) + print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name}:", value)) result[f"p{p_word}_{metric_attribute_name}"] = value process_one_length("s_decode", "Decode", "解码速度(tok/s)") process_one_metric("ttft", "TTFT", "Time to First Token") process_one_metric("s_ttft", "S_TTFT", "Infer Time to First Token") - process_one_metric("tpot", "TPOT", - "Time per Output Token (excl. 1st token)") + process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 
1st token)") process_one_metric("itl", "ITL", "Inter-token Latency") process_one_metric("s_itl", "S_ITL", "Infer Inter-token Latency") process_one_metric("e2el", "E2EL", "End-to-end Latency") @@ -612,44 +593,37 @@ def benchmark_metrics( ): """Benchmark metrics statistics,generate benchmark result""" outputs = [] - case_no_list = [] with open(result_file) as f: for line in f.readlines(): if "RequestFuncOutput" in line: start = line.find("RequestFuncOutput") end = line.rfind(")") - para_str = line[start:end + 1] + para_str = line[start : end + 1] output = eval(para_str) outputs.append(output) - + input_requests = [[]] * len(outputs) goodput_config_dict = check_goodput_args(args) - + metrics, actual_output_lens = calculate_metrics( input_requests=input_requests, outputs=outputs, - dur_s=benchmark_duration, + dur_s=benchmark_duration, selected_percentiles=selected_percentiles, goodput_config_dict=goodput_config_dict, ) - print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) + print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="=")) print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) - print("{:<40} {:<10.2f}".format("Benchmark duration (s):", - benchmark_duration)) + print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) - print("{:<40} {:<10}".format("Total generated tokens:", - metrics.total_output)) - print("{:<40} {:<10.2f}".format("Request throughput (req/s):", - metrics.request_throughput)) + print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output)) + print("{:<40} {:<10.2f}".format("Request throughput (req/s):", metrics.request_throughput)) if goodput_config_dict: - print("{:<40} {:<10.2f}".format("Request goodput (req/s):", - metrics.request_goodput)) - print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", - metrics.output_throughput)) - print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", - metrics.total_token_throughput)) + print("{:<40} {:<10.2f}".format("Request goodput (req/s):", metrics.request_goodput)) + print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", metrics.output_throughput)) + print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", metrics.total_token_throughput)) result = { "duration": benchmark_duration, @@ -657,8 +631,7 @@ def benchmark_metrics( "total_input_tokens": metrics.total_input, "total_output_tokens": metrics.total_output, "request_throughput": metrics.request_throughput, - "request_goodput:": - metrics.request_goodput if goodput_config_dict else None, + "request_goodput:": (metrics.request_goodput if goodput_config_dict else None), "output_throughput": metrics.output_throughput, "total_token_throughput": metrics.total_token_throughput, "input_lens": [output.prompt_len for output in outputs], @@ -682,24 +655,25 @@ def benchmark_metrics( # metric. 
if metric_attribute_name not in selected_percentile_metrics: return - print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-')) - print("{:<40} {:<10.2f}".format( - f"Mean {metric_name} (ms):", - getattr(metrics, f"mean_{metric_attribute_name}_ms"))) - print("{:<40} {:<10.2f}".format( - f"Median {metric_name} (ms):", - getattr(metrics, f"median_{metric_attribute_name}_ms"))) - result[f"mean_{metric_attribute_name}_ms"] = getattr( - metrics, f"mean_{metric_attribute_name}_ms") - result[f"median_{metric_attribute_name}_ms"] = getattr( - metrics, f"median_{metric_attribute_name}_ms") - result[f"std_{metric_attribute_name}_ms"] = getattr( - metrics, f"std_{metric_attribute_name}_ms") - for p, value in getattr(metrics, - f"percentiles_{metric_attribute_name}_ms"): + print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-")) + print( + "{:<40} {:<10.2f}".format( + f"Mean {metric_name} (ms):", + getattr(metrics, f"mean_{metric_attribute_name}_ms"), + ) + ) + print( + "{:<40} {:<10.2f}".format( + f"Median {metric_name} (ms):", + getattr(metrics, f"median_{metric_attribute_name}_ms"), + ) + ) + result[f"mean_{metric_attribute_name}_ms"] = getattr(metrics, f"mean_{metric_attribute_name}_ms") + result[f"median_{metric_attribute_name}_ms"] = getattr(metrics, f"median_{metric_attribute_name}_ms") + result[f"std_{metric_attribute_name}_ms"] = getattr(metrics, f"std_{metric_attribute_name}_ms") + for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"): p_word = str(int(p)) if int(p) == p else str(p) - print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", - value)) + print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value)) result[f"p{p_word}_{metric_attribute_name}_ms"] = value def process_one_length( @@ -714,31 +688,31 @@ def benchmark_metrics( # metric. 
if metric_attribute_name not in selected_percentile_metrics: return - print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-')) - print("{:<40} {:<10.2f}".format( - f"Mean {metric_name}:", - getattr(metrics, f"mean_{metric_attribute_name}"))) - print("{:<40} {:<10.2f}".format( - f"Median {metric_name}:", - getattr(metrics, f"median_{metric_attribute_name}"))) - result[f"mean_{metric_attribute_name}"] = getattr( - metrics, f"mean_{metric_attribute_name}") - result[f"median_{metric_attribute_name}"] = getattr( - metrics, f"median_{metric_attribute_name}") - result[f"std_{metric_attribute_name}"] = getattr( - metrics, f"std_{metric_attribute_name}") - for p, value in getattr(metrics, - f"percentiles_{metric_attribute_name}"): + print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-")) + print( + "{:<40} {:<10.2f}".format( + f"Mean {metric_name}:", + getattr(metrics, f"mean_{metric_attribute_name}"), + ) + ) + print( + "{:<40} {:<10.2f}".format( + f"Median {metric_name}:", + getattr(metrics, f"median_{metric_attribute_name}"), + ) + ) + result[f"mean_{metric_attribute_name}"] = getattr(metrics, f"mean_{metric_attribute_name}") + result[f"median_{metric_attribute_name}"] = getattr(metrics, f"median_{metric_attribute_name}") + result[f"std_{metric_attribute_name}"] = getattr(metrics, f"std_{metric_attribute_name}") + for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}"): p_word = str(int(p)) if int(p) == p else str(p) - print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name}:", - value)) + print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name}:", value)) result[f"p{p_word}_{metric_attribute_name}"] = value process_one_length("s_decode", "Decode", "解码速度(tok/s)") process_one_metric("ttft", "TTFT", "Time to First Token") process_one_metric("s_ttft", "S_TTFT", "Infer Time to First Token") - process_one_metric("tpot", "TPOT", - "Time per Output Token (excl. 1st token)") + process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)") process_one_metric("itl", "ITL", "Inter-token Latency") process_one_metric("s_itl", "S_ITL", "Infer Inter-token Latency") process_one_metric("e2el", "E2EL", "End-to-end Latency") @@ -764,12 +738,14 @@ def check_goodput_args(args): raise ValueError( f"Invalid metric name found, {slo_name}: {slo_val}. " "The service level objective name should be one of " - f"{str(VALID_NAMES)}. ") + f"{VALID_NAMES!s}. " + ) if slo_val < 0: raise ValueError( f"Invalid value found, {slo_name}: {slo_val}. " "The service level objective value should be " - "non-negative.") + "non-negative." + ) return goodput_config_dict @@ -783,32 +759,37 @@ def parse_goodput(slo_pairs): except ValueError as err: raise argparse.ArgumentTypeError( "Invalid format found for service level objectives. " - "Specify service level objectives for goodput as \"KEY:VALUE\" " + 'Specify service level objectives for goodput as "KEY:VALUE" ' "pairs, where the key is a metric name, and the value is a " - "number in milliseconds.") from err + "number in milliseconds." 
+ ) from err return goodput_config_dict -def save_to_pytorch_benchmark_format(args: argparse.Namespace, - results: dict[str, Any], - file_name: str) -> None: +def save_to_pytorch_benchmark_format(args: argparse.Namespace, results: dict[str, Any], file_name: str) -> None: """Save the benchmarking results to PyTorch Benchmark Format JSON file""" metrics = [ - "median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms", - "mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms", - "median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms" + "median_ttft_ms", + "mean_ttft_ms", + "std_ttft_ms", + "p99_ttft_ms", + "mean_tpot_ms", + "median_tpot_ms", + "std_tpot_ms", + "p99_tpot_ms", + "median_itl_ms", + "mean_itl_ms", + "std_itl_ms", + "p99_itl_ms", ] # These raw data might be useful, but they are rather big. They can be added # later if needed ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"] pt_records = convert_to_pytorch_benchmark_format( args=args, - metrics={k: [results[k]] - for k in metrics}, - extra_info={ - k: results[k] - for k in results if k not in metrics and k not in ignored_metrics - }) + metrics={k: [results[k]] for k in metrics}, + extra_info={k: results[k] for k in results if k not in metrics and k not in ignored_metrics}, + ) if pt_records: # Don't use json suffix here as we don't want CI to pick it up pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json" @@ -825,7 +806,6 @@ def main(args: argparse.Namespace): model_id = args.model model_name = args.served_model_name tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model - tokenizer_mode = args.tokenizer_mode if args.base_url is not None: api_url = f"{args.base_url}{args.endpoint}" @@ -835,23 +815,17 @@ def main(args: argparse.Namespace): base_url = f"http://{args.host}:{args.port}" if args.dataset_name is None: - raise ValueError( - "Please specify '--dataset-name' and the corresponding " - "'--dataset-path' if required.") + raise ValueError("Please specify '--dataset-name' and the corresponding " "'--dataset-path' if required.") # For datasets that follow a similar structure, use a mapping. dataset_mapping = { - "EB": - lambda: EBDataset(random_seed=args.seed, - dataset_path=args.dataset_path).sample( - num_requests=args.num_prompts, - output_len=args.sharegpt_output_len, + "EB": lambda: EBDataset(random_seed=args.seed, dataset_path=args.dataset_path).sample( + num_requests=args.num_prompts, + output_len=args.sharegpt_output_len, ), - "EBChat": - lambda: EBChatDataset(random_seed=args.seed, - dataset_path=args.dataset_path).sample( - num_requests=args.num_prompts, - output_len=args.sharegpt_output_len, + "EBChat": lambda: EBChatDataset(random_seed=args.seed, dataset_path=args.dataset_path).sample( + num_requests=args.num_prompts, + output_len=args.sharegpt_output_len, ), } @@ -869,15 +843,14 @@ def main(args: argparse.Namespace): "top_p": args.top_p, "top_k": args.top_k, "min_p": args.min_p, - "temperature": args.temperature - }.items() if v is not None + "temperature": args.temperature, + }.items() + if v is not None } # Sampling parameters are only supported by openai-compatible backend. if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS: - raise ValueError( - "Sampling parameters are only supported by openai-compatible " - "backends.") + raise ValueError("Sampling parameters are only supported by openai-compatible " "backends.") if "temperature" not in sampling_params: sampling_params["temperature"] = 0.0 # Default to greedy decoding. 
@@ -908,16 +881,15 @@ def main(args: argparse.Namespace): disable_tqdm=args.disable_tqdm, profile=args.profile, selected_percentile_metrics=args.percentile_metrics.split(","), - selected_percentiles=[ - float(p) for p in args.metric_percentiles.split(",") - ], + selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")], ignore_eos=args.ignore_eos, goodput_config_dict=goodput_config_dict, max_concurrency=args.max_concurrency, lora_modules=args.lora_modules, extra_body=sampling_params, - )) - + ) + ) + # benchmark_result = benchmark_metrics( # benchmark_duration=3600, # result_file="your result file", @@ -947,22 +919,23 @@ def main(args: argparse.Namespace): kvstring = item.split("=") result_json[kvstring[0].strip()] = kvstring[1].strip() else: - raise ValueError( - "Invalid metadata format. Please use KEY=VALUE format." - ) + raise ValueError("Invalid metadata format. Please use KEY=VALUE format.") if not args.save_detailed: # Remove fields with too many data points for field in [ - "input_lens", "output_lens", "ttfts", "itls", - "generated_texts", "errors" + "input_lens", + "output_lens", + "ttfts", + "itls", + "generated_texts", + "errors", ]: if field in result_json: del result_json[field] # Traffic - result_json["request_rate"] = (args.request_rate if args.request_rate - < float("inf") else "inf") + result_json["request_rate"] = args.request_rate if args.request_rate < float("inf") else "inf" result_json["burstiness"] = args.burstiness result_json["max_concurrency"] = args.max_concurrency @@ -971,21 +944,19 @@ def main(args: argparse.Namespace): # Save to file base_model_id = model_id.split("/")[-1] - max_concurrency_str = (f"-concurrency{args.max_concurrency}" - if args.max_concurrency is not None else "") - file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" #noqa + max_concurrency_str = f"-concurrency{args.max_concurrency}" if args.max_concurrency is not None else "" + file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" if args.result_filename: file_name = args.result_filename if args.result_dir: file_name = os.path.join(args.result_dir, file_name) - with open(file_name, "w", encoding='utf-8') as outfile: + with open(file_name, "w", encoding="utf-8") as outfile: json.dump(result_json, outfile) save_to_pytorch_benchmark_format(args, result_json, file_name) if __name__ == "__main__": - parser = FlexibleArgumentParser( - description="Benchmark the online serving throughput.") + parser = FlexibleArgumentParser(description="Benchmark the online serving throughput.") parser.add_argument( "--backend", type=str, @@ -1011,18 +982,29 @@ if __name__ == "__main__": "--dataset-name", type=str, default="sharegpt", - choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "EB", "EBChat"], + choices=[ + "sharegpt", + "burstgpt", + "sonnet", + "random", + "hf", + "EB", + "EBChat", + ], help="Name of the dataset to benchmark on.", ) - parser.add_argument("--dataset-path", - type=str, - default=None, - help="Path to the sharegpt/sonnet dataset. " - "Or the huggingface dataset ID if using HF dataset.") - parser.add_argument("--hyperparameter-path", - type=str, - default=None, - help="Path to the hyperparameter. ") + parser.add_argument( + "--dataset-path", + type=str, + default=None, + help="Path to the sharegpt/sonnet dataset. 
" "Or the huggingface dataset ID if using HF dataset.", + ) + parser.add_argument( + "--hyperparameter-path", + type=str, + default=None, + help="Path to the hyperparameter. ", + ) parser.add_argument( "--max-concurrency", type=int, @@ -1034,7 +1016,8 @@ if __name__ == "__main__": "initiated, this argument will control how many are actually allowed " "to execute at a time. This means that when used in combination, the " "actual request rate may be lower than specified with --request-rate, " - "if the server is not processing requests fast enough to keep up.") + "if the server is not processing requests fast enough to keep up.", + ) parser.add_argument( "--model", @@ -1045,7 +1028,7 @@ if __name__ == "__main__": parser.add_argument( "--tokenizer", type=str, - help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 + help="Name or path of the tokenizer, if not using the default tokenizer.", ) parser.add_argument("--use-beam-search", action="store_true") parser.add_argument( @@ -1058,11 +1041,13 @@ if __name__ == "__main__": "--logprobs", type=int, default=None, - help=("Number of logprobs-per-token to compute & return as part of " - "the request. If unspecified, then either (1) if beam search " - "is disabled, no logprobs are computed & a single dummy " - "logprob is returned for each token; or (2) if beam search " - "is enabled 1 logprob per token is computed"), + help=( + "Number of logprobs-per-token to compute & return as part of " + "the request. If unspecified, then either (1) if beam search " + "is disabled, no logprobs are computed & a single dummy " + "logprob is returned for each token; or (2) if beam search " + "is enabled 1 logprob per token is computed" + ), ) parser.add_argument( "--request-rate", @@ -1099,8 +1084,7 @@ if __name__ == "__main__": parser.add_argument( "--profile", action="store_true", - help="Use Torch Profiler. The endpoint must be launched with " - "VLLM_TORCH_PROFILER_DIR to enable profiler.", + help="Use Torch Profiler. The endpoint must be launched with " "VLLM_TORCH_PROFILER_DIR to enable profiler.", ) parser.add_argument( "--save-result", @@ -1141,35 +1125,38 @@ if __name__ == "__main__": "--ignore-eos", action="store_true", help="Set ignore_eos flag when sending the benchmark request." - "Warning: ignore_eos is not supported in deepspeed_mii and tgi.") + "Warning: ignore_eos is not supported in deepspeed_mii and tgi.", + ) parser.add_argument( "--percentile-metrics", type=str, default="ttft,tpot,itl", help="Comma-separated list of selected metrics to report percentils. " "This argument specifies the metrics to report percentiles. " - "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". " - "Default value is \"ttft,tpot,itl\".") + 'Allowed metric names are "ttft", "tpot", "itl", "e2el". ' + 'Default value is "ttft,tpot,itl".', + ) parser.add_argument( "--metric-percentiles", type=str, default="99", help="Comma-separated list of percentiles for selected metrics. " - "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". " - "Default value is \"99\". " - "Use \"--percentile-metrics\" to select metrics.", + 'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". ' + 'Default value is "99". 
' + 'Use "--percentile-metrics" to select metrics.', ) parser.add_argument( "--goodput", nargs="+", required=False, - help="Specify service level objectives for goodput as \"KEY:VALUE\" " + help='Specify service level objectives for goodput as "KEY:VALUE" ' "pairs, where the key is a metric name, and the value is in " - "milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, " + 'milliseconds. Multiple "KEY:VALUE" pairs can be provided, ' "separated by spaces. Allowed request level metric names are " - "\"ttft\", \"tpot\", \"e2el\". For more context on the definition of " + '"ttft", "tpot", "e2el". For more context on the definition of ' "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 " - "and the blog: https://hao-ai-lab.github.io/blogs/distserve") + "and the blog: https://hao-ai-lab.github.io/blogs/distserve", + ) # group for dataset specific arguments sonnet_group = parser.add_argument_group("sonnet dataset options") @@ -1197,8 +1184,8 @@ if __name__ == "__main__": "--sharegpt-output-len", type=int, default=None, - help="Output length for each request. Overrides the output length " - "from the ShareGPT dataset.") + help="Output length for each request. Overrides the output length " "from the ShareGPT dataset.", + ) random_group = parser.add_argument_group("random dataset options") random_group.add_argument( @@ -1226,29 +1213,24 @@ if __name__ == "__main__": "--random-prefix-len", type=int, default=0, - help=("Number of fixed prefix tokens before the random context " - "in a request. " - "The total input length is the sum of `random-prefix-len` and " - "a random " - "context length sampled from [input_len * (1 - range_ratio), " - "input_len * (1 + range_ratio)]."), + help=( + "Number of fixed prefix tokens before the random context " + "in a request. " + "The total input length is the sum of `random-prefix-len` and " + "a random " + "context length sampled from [input_len * (1 - range_ratio), " + "input_len * (1 + range_ratio)]." + ), ) hf_group = parser.add_argument_group("hf dataset options") - hf_group.add_argument("--hf-subset", - type=str, - default=None, - help="Subset of the HF dataset.") - hf_group.add_argument("--hf-split", - type=str, - default=None, - help="Split of the HF dataset.") + hf_group.add_argument("--hf-subset", type=str, default=None, help="Subset of the HF dataset.") + hf_group.add_argument("--hf-split", type=str, default=None, help="Split of the HF dataset.") hf_group.add_argument( "--hf-output-len", type=int, default=None, - help="Output length for each request. Overrides the output lengths " - "from the sampled HF dataset.", + help="Output length for each request. Overrides the output lengths " "from the sampled HF dataset.", ) sampling_group = parser.add_argument_group("sampling parameters") @@ -1256,54 +1238,59 @@ if __name__ == "__main__": "--top-p", type=float, default=None, - help="Top-p sampling parameter. Only has effect on openai-compatible " - "backends.") + help="Top-p sampling parameter. Only has effect on openai-compatible " "backends.", + ) sampling_group.add_argument( "--top-k", type=int, default=None, - help="Top-k sampling parameter. Only has effect on openai-compatible " - "backends.") + help="Top-k sampling parameter. Only has effect on openai-compatible " "backends.", + ) sampling_group.add_argument( "--min-p", type=float, default=None, - help="Min-p sampling parameter. Only has effect on openai-compatible " - "backends.") + help="Min-p sampling parameter. 
Only has effect on openai-compatible " "backends.", + ) sampling_group.add_argument( "--temperature", type=float, default=None, help="Temperature sampling parameter. Only has effect on " "openai-compatible backends. If not specified, default to greedy " - "decoding (i.e. temperature==0.0).") + "decoding (i.e. temperature==0.0).", + ) parser.add_argument( - '--tokenizer-mode', + "--tokenizer-mode", type=str, default="auto", - choices=['auto', 'slow', 'mistral', 'custom'], + choices=["auto", "slow", "mistral", "custom"], help='The tokenizer mode.\n\n* "auto" will use the ' 'fast tokenizer if available.\n* "slow" will ' - 'always use the slow tokenizer. \n* ' + "always use the slow tokenizer. \n* " '"mistral" will always use the `mistral_common` tokenizer. \n*' - '"custom" will use --tokenizer to select the preregistered tokenizer.') + '"custom" will use --tokenizer to select the preregistered tokenizer.', + ) - parser.add_argument("--served-model-name", - type=str, - default=None, - help="The model name used in the API. " - "If not specified, the model name will be the " - "same as the ``--model`` argument. ") + parser.add_argument( + "--served-model-name", + type=str, + default=None, + help="The model name used in the API. " + "If not specified, the model name will be the " + "same as the ``--model`` argument. ", + ) - parser.add_argument("--lora-modules", - nargs='+', - default=None, - help="A subset of LoRA module names passed in when " - "launching the server. For each request, the " - "script chooses a LoRA module at random.") + parser.add_argument( + "--lora-modules", + nargs="+", + default=None, + help="A subset of LoRA module names passed in when " + "launching the server. For each request, the " + "script chooses a LoRA module at random.", + ) args = parser.parse_args() main(args) - diff --git a/benchmarks/benchmark_utils.py b/benchmarks/benchmark_utils.py index 6c149bf5f..4eba58a3b 100644 --- a/benchmarks/benchmark_utils.py +++ b/benchmarks/benchmark_utils.py @@ -24,9 +24,11 @@ import os from typing import Any -def convert_to_pytorch_benchmark_format(args: argparse.Namespace, - metrics: dict[str, list], - extra_info: dict[str, Any]) -> list: +def convert_to_pytorch_benchmark_format( + args: argparse.Namespace, + metrics: dict[str, list], + extra_info: dict[str, Any], +) -> list: """ Save the benchmark results in the format used by PyTorch OSS benchmark with on metric per record @@ -54,12 +56,10 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace, }, } - tp = record["benchmark"]["extra_info"]["args"].get( - "tensor_parallel_size") + tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size") # Save tensor_parallel_size parameter if it's part of the metadata if not tp and "tensor_parallel_size" in extra_info: - record["benchmark"]["extra_info"]["args"][ - "tensor_parallel_size"] = extra_info["tensor_parallel_size"] + record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = extra_info["tensor_parallel_size"] records.append(record) @@ -68,6 +68,7 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace, class InfEncoder(json.JSONEncoder): """InfEncoder""" + def clear_inf(self, o: Any): """clear_inf""" if isinstance(o, dict): @@ -87,4 +88,3 @@ def write_to_json(filename: str, records: list) -> None: """write_to_json""" with open(filename, "w") as f: json.dump(records, f, cls=InfEncoder) - diff --git a/benchmarks/quick_benchmark.py b/benchmarks/quick_benchmark.py index 7a2dbd877..899a14c54 100644 --- a/benchmarks/quick_benchmark.py +++ 
b/benchmarks/quick_benchmark.py @@ -25,32 +25,32 @@ import os import random import time import warnings -import yaml -import requests -import copy +from argparse import ArgumentParser as FlexibleArgumentParser from collections.abc import AsyncGenerator, Iterable from dataclasses import dataclass from datetime import datetime from typing import Any, Optional import numpy as np -from backend_request_func import (ASYNC_REQUEST_FUNCS, - OPENAI_COMPATIBLE_BACKENDS, RequestFuncInput, - RequestFuncOutput) +import requests +import yaml +from backend_request_func import ( + ASYNC_REQUEST_FUNCS, + OPENAI_COMPATIBLE_BACKENDS, + RequestFuncInput, + RequestFuncOutput, +) +from benchmark_dataset import EBChatDataset, EBDataset, SampleRequest +from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json from tqdm.asyncio import tqdm -from argparse import ArgumentParser as FlexibleArgumentParser - -from benchmark_dataset import (SampleRequest, EBDataset, EBChatDataset) -from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json - MILLISECONDS_TO_SECONDS_CONVERSION = 1000 - @dataclass class BenchmarkMetrics: """Class containing all metrics that are used in this script""" + completed: int total_input: int total_output: int @@ -133,8 +133,7 @@ async def get_request( input_requests: Iterable[SampleRequest] = iter(input_requests) # Calculate scale parameter theta to maintain the desired request_rate. - assert burstiness > 0, ( - f"A positive burstiness factor is expected, but given {burstiness}.") + assert burstiness > 0, f"A positive burstiness factor is expected, but given {burstiness}." theta = 1.0 / (request_rate * burstiness) for request in input_requests: @@ -160,7 +159,7 @@ def calculate_metrics( ) -> tuple[BenchmarkMetrics, list[int]]: """Calculates various performance metrics based on the inputs and outputs.""" input_lens: list[int] = [] - infer_input_lens: list[int] = [] # 推理侧输入token数 + infer_input_lens: list[int] = [] # 推理侧输入token数 actual_output_lens: list[int] = [] total_input = 0 completed = 0 @@ -210,8 +209,9 @@ def calculate_metrics( s_e2els.append(outputs[i].arrival_time[-1]) # 解码速度去掉首token if len(outputs[i].arrival_time) > 2: - s_decodes.append((outputs[i].output_tokens - 1) / - (outputs[i].arrival_time[-1] - outputs[i].arrival_time[1])) + s_decodes.append( + (outputs[i].output_tokens - 1) / (outputs[i].arrival_time[-1] - outputs[i].arrival_time[1]) + ) completed += 1 else: actual_output_lens.append(0) @@ -224,16 +224,13 @@ def calculate_metrics( if "ttft" in goodput_config_dict: valid_metrics.append(ttfts) - slo_values.append(goodput_config_dict["ttft"] / - MILLISECONDS_TO_SECONDS_CONVERSION) + slo_values.append(goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION) if "tpot" in goodput_config_dict: valid_metrics.append(all_tpots) - slo_values.append(goodput_config_dict["tpot"] / - MILLISECONDS_TO_SECONDS_CONVERSION) + slo_values.append(goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION) if "e2el" in goodput_config_dict: valid_metrics.append(e2els) - slo_values.append(goodput_config_dict["e2el"] / - MILLISECONDS_TO_SECONDS_CONVERSION) + slo_values.append(goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION) for req_metric in zip(*valid_metrics): is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)]) @@ -242,9 +239,9 @@ def calculate_metrics( if completed == 0: warnings.warn( - "All requests failed. 
This is likely due to a misconfiguration " - "on the benchmark arguments.", - stacklevel=2) + "All requests failed. This is likely due to a misconfiguration " "on the benchmark arguments.", + stacklevel=2, + ) metrics = BenchmarkMetrics( completed=completed, total_input=total_input, @@ -253,64 +250,50 @@ def calculate_metrics( request_goodput=good_completed / dur_s, output_throughput=sum(actual_output_lens) / dur_s, total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, - mean_s_decode=np.mean(s_decodes or 0) * - 1, # ttfts is empty if streaming is not supported by backend + mean_s_decode=np.mean(s_decodes or 0) * 1, # ttfts is empty if streaming is not supported by backend std_s_decode=np.std(s_decodes or 0) * 1, median_s_decode=np.median(s_decodes or 0) * 1, - percentiles_s_decode=[(p, np.percentile(s_decodes or 0, p) * 1) - for p in selected_percentiles], - mean_ttft_ms=np.mean(ttfts or 0) * - 1000, # ttfts is empty if streaming is not supported by backend + percentiles_s_decode=[(p, np.percentile(s_decodes or 0, p) * 1) for p in selected_percentiles], + mean_ttft_ms=np.mean(ttfts or 0) * 1000, # ttfts is empty if streaming is not supported by backend std_ttft_ms=np.std(ttfts or 0) * 1000, median_ttft_ms=np.median(ttfts or 0) * 1000, - percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000) - for p in selected_percentiles], - mean_s_ttft_ms=np.mean(s_ttfts or 0) * - 1000, # ttfts is empty if streaming is not supported by backend + percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles], + mean_s_ttft_ms=np.mean(s_ttfts or 0) * 1000, # ttfts is empty if streaming is not supported by backend std_s_ttft_ms=np.std(s_ttfts or 0) * 1000, median_s_ttft_ms=np.median(s_ttfts or 0) * 1000, - percentiles_s_ttft_ms=[(p, np.percentile(s_ttfts or 0, p) * 1000) - for p in selected_percentiles], + percentiles_s_ttft_ms=[(p, np.percentile(s_ttfts or 0, p) * 1000) for p in selected_percentiles], mean_tpot_ms=np.mean(tpots or 0) * 1000, std_tpot_ms=np.std(tpots or 0) * 1000, median_tpot_ms=np.median(tpots or 0) * 1000, - percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000) - for p in selected_percentiles], + percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles], mean_itl_ms=np.mean(itls or 0) * 1000, std_itl_ms=np.std(itls or 0) * 1000, median_itl_ms=np.median(itls or 0) * 1000, - percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) - for p in selected_percentiles], + percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles], mean_s_itl_ms=np.mean(s_itls or 0) * 1000, std_s_itl_ms=np.std(s_itls or 0) * 1000, median_s_itl_ms=np.median(s_itls or 0) * 1000, - percentiles_s_itl_ms=[(p, np.percentile(s_itls or 0, p) * 1000) - for p in selected_percentiles], + percentiles_s_itl_ms=[(p, np.percentile(s_itls or 0, p) * 1000) for p in selected_percentiles], mean_e2el_ms=np.mean(e2els or 0) * 1000, std_e2el_ms=np.std(e2els or 0) * 1000, median_e2el_ms=np.median(e2els or 0) * 1000, - percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) - for p in selected_percentiles], + percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles], mean_s_e2el_ms=np.mean(s_e2els or 0) * 1000, std_s_e2el_ms=np.std(s_e2els or 0) * 1000, median_s_e2el_ms=np.median(s_e2els or 0) * 1000, - percentiles_s_e2el_ms=[(p, np.percentile(s_e2els or 0, p) * 1000) - for p in selected_percentiles], + percentiles_s_e2el_ms=[(p, np.percentile(s_e2els or 0, p) * 
1000) for p in selected_percentiles], mean_input_len=np.mean(input_lens or 0) * 1, std_input_len=np.std(input_lens or 0) * 1, median_input_len=np.median(input_lens or 0) * 1, - percentiles_input_len=[(p, np.percentile(input_lens or 0, p)) - for p in selected_percentiles], + percentiles_input_len=[(p, np.percentile(input_lens or 0, p)) for p in selected_percentiles], mean_s_input_len=np.mean(infer_input_lens or 0) * 1, std_s_input_len=np.std(infer_input_lens or 0) * 1, median_s_input_len=np.median(infer_input_lens or 0) * 1, - percentiles_s_input_len=[(p, np.percentile(infer_input_lens or 0, p)) - for p in selected_percentiles], + percentiles_s_input_len=[(p, np.percentile(infer_input_lens or 0, p)) for p in selected_percentiles], mean_output_len=np.mean(actual_output_lens or 0) * 1, std_output_len=np.std(actual_output_lens or 0) * 1, median_output_len=np.median(actual_output_lens or 0) * 1, - percentiles_output_len=[(p, np.percentile(actual_output_lens or 0, p)) - for p in selected_percentiles], + percentiles_output_len=[(p, np.percentile(actual_output_lens or 0, p)) for p in selected_percentiles], ) return metrics, actual_output_lens @@ -351,20 +334,22 @@ async def benchmark( if lora_modules: # For each input request, choose a LoRA module at random. - lora_modules = iter( - [random.choice(lora_modules) \ - for _ in range(len(input_requests))]) + lora_modules = iter([random.choice(lora_modules) for _ in range(len(input_requests))]) if profile: print("Starting profiler...") - profile_input = RequestFuncInput(model=model_id, - model_name=model_name, - prompt=test_prompt, - api_url=base_url + "/start_profile", - output_len=test_output_len, - logprobs=logprobs, - ignore_eos=ignore_eos, - extra_body=extra_body) + test_prompt = None + test_output_len = None + profile_input = RequestFuncInput( + model=model_id, + model_name=model_name, + prompt=test_prompt, + api_url=base_url + "/start_profile", + output_len=test_output_len, + logprobs=logprobs, + ignore_eos=ignore_eos, + extra_body=extra_body, + ) profile_output = await request_func(request_func_input=profile_input) if profile_output.success: print("Profiler started") @@ -384,19 +369,16 @@ async def benchmark( # and it will simplify the code in limited_request_func. 
# semaphore = (asyncio.Semaphore(max_concurrency) # if max_concurrency else contextlib.nullcontext()) - semaphore = (asyncio.Semaphore(max_concurrency) - if max_concurrency else None) + semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None async def limited_request_func(request_func_input, pbar): if semaphore is None: - return await request_func(request_func_input=request_func_input, - pbar=pbar) + return await request_func(request_func_input=request_func_input, pbar=pbar) async with semaphore: - return await request_func(request_func_input=request_func_input, - pbar=pbar) + return await request_func(request_func_input=request_func_input, pbar=pbar) benchmark_start_time = time.perf_counter() - + print(f"开始时间:{datetime.now()}") tasks: list[asyncio.Task] = [] async for request in get_request(input_requests, request_rate, burstiness): @@ -409,25 +391,26 @@ async def benchmark( req_lora_module = next(lora_modules) req_model_id, req_model_name = req_lora_module, req_lora_module - request_func_input = RequestFuncInput(model=req_model_id, - model_name=req_model_name, - prompt=prompt, - prompt_len=0, - history_QA=history_QA, - hyper_parameters=hyper_parameters, - api_url=api_url, - output_len=output_len, - logprobs=logprobs, - ignore_eos=ignore_eos, - extra_body=extra_body) - tasks.append( - asyncio.create_task( - limited_request_func(request_func_input=request_func_input, - pbar=pbar))) + request_func_input = RequestFuncInput( + model=req_model_id, + model_name=req_model_name, + prompt=prompt, + prompt_len=0, + history_QA=history_QA, + hyper_parameters=hyper_parameters, + api_url=api_url, + output_len=output_len, + logprobs=logprobs, + ignore_eos=ignore_eos, + extra_body=extra_body, + ) + tasks.append(asyncio.create_task(limited_request_func(request_func_input=request_func_input, pbar=pbar))) outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks) print(f"完成时间:{datetime.now()}") if profile: print("Stopping profiler...") + test_output_len = None + test_output_len = None profile_input = RequestFuncInput( model=model_id, prompt=test_prompt, @@ -454,22 +437,16 @@ async def benchmark( ) print("Benchmark complete!!!") - print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='=')) + print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="=")) print("{:<40} {:<10}".format("Successful requests:", metrics.completed)) - print("{:<40} {:<10.2f}".format("Benchmark duration (s):", - benchmark_duration)) + print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration)) print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input)) - print("{:<40} {:<10}".format("Total generated tokens:", - metrics.total_output)) - print("{:<40} {:<10.3f}".format("Request throughput (req/s):", - metrics.request_throughput)) + print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output)) + print("{:<40} {:<10.3f}".format("Request throughput (req/s):", metrics.request_throughput)) if goodput_config_dict: - print("{:<40} {:<10.2f}".format("Request goodput (req/s):", - metrics.request_goodput)) - print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", - metrics.output_throughput)) - print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", - metrics.total_token_throughput)) + print("{:<40} {:<10.2f}".format("Request goodput (req/s):", metrics.request_goodput)) + print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", metrics.output_throughput)) + print("{:<40} {:<10.2f}".format("Total Token throughput 
(tok/s):", metrics.total_token_throughput)) result = { "duration": benchmark_duration, @@ -477,8 +454,7 @@ async def benchmark( "total_input_tokens": metrics.total_input, "total_output_tokens": metrics.total_output, "request_throughput": metrics.request_throughput, - "request_goodput:": - metrics.request_goodput if goodput_config_dict else None, + "request_goodput:": (metrics.request_goodput if goodput_config_dict else None), "output_throughput": metrics.output_throughput, "total_token_throughput": metrics.total_token_throughput, "input_lens": [output.prompt_len for output in outputs], @@ -491,7 +467,6 @@ async def benchmark( "reasoning_contents": [output.reasoning_content for output in outputs], "errors": [output.error for output in outputs], } - quick_result = copy.deepcopy(result) def process_one_metric( # E.g., "ttft" @@ -505,24 +480,25 @@ async def benchmark( # metric. if metric_attribute_name not in selected_percentile_metrics: return - print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-')) - print("{:<40} {:<10.2f}".format( - f"Mean {metric_name} (ms):", - getattr(metrics, f"mean_{metric_attribute_name}_ms"))) - print("{:<40} {:<10.2f}".format( - f"Median {metric_name} (ms):", - getattr(metrics, f"median_{metric_attribute_name}_ms"))) - result[f"mean_{metric_attribute_name}_ms"] = getattr( - metrics, f"mean_{metric_attribute_name}_ms") - result[f"median_{metric_attribute_name}_ms"] = getattr( - metrics, f"median_{metric_attribute_name}_ms") - result[f"std_{metric_attribute_name}_ms"] = getattr( - metrics, f"std_{metric_attribute_name}_ms") - for p, value in getattr(metrics, - f"percentiles_{metric_attribute_name}_ms"): + print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-")) + print( + "{:<40} {:<10.2f}".format( + f"Mean {metric_name} (ms):", + getattr(metrics, f"mean_{metric_attribute_name}_ms"), + ) + ) + print( + "{:<40} {:<10.2f}".format( + f"Median {metric_name} (ms):", + getattr(metrics, f"median_{metric_attribute_name}_ms"), + ) + ) + result[f"mean_{metric_attribute_name}_ms"] = getattr(metrics, f"mean_{metric_attribute_name}_ms") + result[f"median_{metric_attribute_name}_ms"] = getattr(metrics, f"median_{metric_attribute_name}_ms") + result[f"std_{metric_attribute_name}_ms"] = getattr(metrics, f"std_{metric_attribute_name}_ms") + for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"): p_word = str(int(p)) if int(p) == p else str(p) - print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", - value)) + print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value)) result[f"p{p_word}_{metric_attribute_name}_ms"] = value def process_one_length( @@ -537,31 +513,31 @@ async def benchmark( # metric. 
if metric_attribute_name not in selected_percentile_metrics: return - print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-')) - print("{:<40} {:<10.2f}".format( - f"Mean {metric_name}:", - getattr(metrics, f"mean_{metric_attribute_name}"))) - print("{:<40} {:<10.2f}".format( - f"Median {metric_name}:", - getattr(metrics, f"median_{metric_attribute_name}"))) - result[f"mean_{metric_attribute_name}"] = getattr( - metrics, f"mean_{metric_attribute_name}") - result[f"median_{metric_attribute_name}"] = getattr( - metrics, f"median_{metric_attribute_name}") - result[f"std_{metric_attribute_name}"] = getattr( - metrics, f"std_{metric_attribute_name}") - for p, value in getattr(metrics, - f"percentiles_{metric_attribute_name}"): + print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-")) + print( + "{:<40} {:<10.2f}".format( + f"Mean {metric_name}:", + getattr(metrics, f"mean_{metric_attribute_name}"), + ) + ) + print( + "{:<40} {:<10.2f}".format( + f"Median {metric_name}:", + getattr(metrics, f"median_{metric_attribute_name}"), + ) + ) + result[f"mean_{metric_attribute_name}"] = getattr(metrics, f"mean_{metric_attribute_name}") + result[f"median_{metric_attribute_name}"] = getattr(metrics, f"median_{metric_attribute_name}") + result[f"std_{metric_attribute_name}"] = getattr(metrics, f"std_{metric_attribute_name}") + for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}"): p_word = str(int(p)) if int(p) == p else str(p) - print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name}:", - value)) + print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name}:", value)) result[f"p{p_word}_{metric_attribute_name}"] = value process_one_length("s_decode", "Decode", "解码速度(tok/s)") process_one_metric("ttft", "TTFT", "Time to First Token") process_one_metric("s_ttft", "S_TTFT", "Infer Time to First Token") - process_one_metric("tpot", "TPOT", - "Time per Output Token (excl. 1st token)") + process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 
1st token)") process_one_metric("itl", "ITL", "Inter-token Latency") process_one_metric("s_itl", "S_ITL", "Infer Inter-token Latency") process_one_metric("e2el", "E2EL", "End-to-end Latency") @@ -581,6 +557,7 @@ def quick_summary(quick_result, selected_percentile_metrics, metrics): """ 快速评估 """ + def process_quick_metric( metric_attribute_name: str, metric_name: str, @@ -588,7 +565,7 @@ def quick_summary(quick_result, selected_percentile_metrics, metrics): ): if metric_attribute_name not in selected_percentile_metrics: return - print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-')) + print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-")) mean_value = getattr(metrics, f"mean_{metric_attribute_name}_ms") print("{:<40} {:<10.2f}".format(f"Mean {metric_name} (ms):", mean_value)) quick_result[f"mean_{metric_attribute_name}_ms"] = mean_value @@ -600,17 +577,17 @@ def quick_summary(quick_result, selected_percentile_metrics, metrics): ): if metric_attribute_name not in selected_percentile_metrics: return - print("{s:{c}^{n}}".format(s=metric_header, n=50, c='-')) + print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-")) mean_value = getattr(metrics, f"mean_{metric_attribute_name}") print("{:<40} {:<10.2f}".format(f"Mean {metric_name}:", mean_value)) quick_result[f"mean_{metric_attribute_name}"] = mean_value + print("\n\n\n") - print("{s:{c}^{n}}".format(s=' Benchmark Quick Summary ', n=50, c='=')) + print("{s:{c}^{n}}".format(s=" Benchmark Quick Summary ", n=50, c="=")) process_quick_length("s_decode", "Decode", "解码速度(tok/s)") process_quick_metric("ttft", "TTFT", "Time to First Token") process_quick_metric("s_ttft", "S_TTFT", "Infer Time to First Token") - process_quick_metric("tpot", "TPOT", - "Time per Output Token (excl. 1st token)") + process_quick_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)") process_quick_metric("itl", "ITL", "Inter-token Latency") process_quick_metric("s_itl", "S_ITL", "Infer Inter-token Latency") process_quick_metric("e2el", "E2EL", "End-to-end Latency") @@ -633,12 +610,14 @@ def check_goodput_args(args): raise ValueError( f"Invalid metric name found, {slo_name}: {slo_val}. " "The service level objective name should be one of " - f"{str(VALID_NAMES)}. ") + f"{VALID_NAMES!s}. " + ) if slo_val < 0: raise ValueError( f"Invalid value found, {slo_name}: {slo_val}. " "The service level objective value should be " - "non-negative.") + "non-negative." + ) return goodput_config_dict @@ -652,37 +631,43 @@ def parse_goodput(slo_pairs): except ValueError as err: raise argparse.ArgumentTypeError( "Invalid format found for service level objectives. " - "Specify service level objectives for goodput as \"KEY:VALUE\" " + 'Specify service level objectives for goodput as "KEY:VALUE" ' "pairs, where the key is a metric name, and the value is a " - "number in milliseconds.") from err + "number in milliseconds." 
+ ) from err return goodput_config_dict -def save_to_pytorch_benchmark_format(args: argparse.Namespace, - results: dict[str, Any], - file_name: str) -> None: +def save_to_pytorch_benchmark_format(args: argparse.Namespace, results: dict[str, Any], file_name: str) -> None: """Save the benchmarking results to PyTorch Benchmark Format JSON file""" metrics = [ - "median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms", - "mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms", - "median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms" + "median_ttft_ms", + "mean_ttft_ms", + "std_ttft_ms", + "p99_ttft_ms", + "mean_tpot_ms", + "median_tpot_ms", + "std_tpot_ms", + "p99_tpot_ms", + "median_itl_ms", + "mean_itl_ms", + "std_itl_ms", + "p99_itl_ms", ] # These raw data might be useful, but they are rather big. They can be added # later if needed ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"] pt_records = convert_to_pytorch_benchmark_format( args=args, - metrics={k: [results[k]] - for k in metrics}, - extra_info={ - k: results[k] - for k in results if k not in metrics and k not in ignored_metrics - }) + metrics={k: [results[k]] for k in metrics}, + extra_info={k: results[k] for k in results if k not in metrics and k not in ignored_metrics}, + ) if pt_records: # Don't use json suffix here as we don't want CI to pick it up pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json" write_to_json(pt_file, pt_records) + def check_health(api_base_url: str) -> bool: health_url = api_base_url.rstrip("/") + "/health" try: @@ -697,6 +682,7 @@ def check_health(api_base_url: str) -> bool: print(f"[HEALTH] Failed to connect to {health_url}: {e}") return False + def main(args: argparse.Namespace): """Main entry point""" print(args) @@ -707,7 +693,6 @@ def main(args: argparse.Namespace): model_id = args.model model_name = args.served_model_name tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model - tokenizer_mode = args.tokenizer_mode if args.base_url is not None: api_url = f"{args.base_url}{args.endpoint}" @@ -717,23 +702,17 @@ def main(args: argparse.Namespace): base_url = f"http://{args.host}:{args.port}" if args.dataset_name is None: - raise ValueError( - "Please specify '--dataset-name' and the corresponding " - "'--dataset-path' if required.") + raise ValueError("Please specify '--dataset-name' and the corresponding " "'--dataset-path' if required.") # For datasets that follow a similar structure, use a mapping. dataset_mapping = { - "EB": - lambda: EBDataset(random_seed=args.seed, - dataset_path=args.dataset_path).sample( - num_requests=args.num_prompts, - output_len=args.sharegpt_output_len, + "EB": lambda: EBDataset(random_seed=args.seed, dataset_path=args.dataset_path).sample( + num_requests=args.num_prompts, + output_len=args.sharegpt_output_len, ), - "EBChat": - lambda: EBChatDataset(random_seed=args.seed, - dataset_path=args.dataset_path).sample( - num_requests=args.num_prompts, - output_len=args.sharegpt_output_len, + "EBChat": lambda: EBChatDataset(random_seed=args.seed, dataset_path=args.dataset_path).sample( + num_requests=args.num_prompts, + output_len=args.sharegpt_output_len, ), } @@ -751,15 +730,14 @@ def main(args: argparse.Namespace): "top_p": args.top_p, "top_k": args.top_k, "min_p": args.min_p, - "temperature": args.temperature - }.items() if v is not None + "temperature": args.temperature, + }.items() + if v is not None } # Sampling parameters are only supported by openai-compatible backend. 
if sampling_params and args.backend not in OPENAI_COMPATIBLE_BACKENDS: - raise ValueError( - "Sampling parameters are only supported by openai-compatible " - "backends.") + raise ValueError("Sampling parameters are only supported by openai-compatible " "backends.") if "temperature" not in sampling_params: sampling_params["temperature"] = 0.0 # Default to greedy decoding. @@ -790,15 +768,14 @@ def main(args: argparse.Namespace): disable_tqdm=args.disable_tqdm, profile=args.profile, selected_percentile_metrics=args.percentile_metrics.split(","), - selected_percentiles=[ - float(p) for p in args.metric_percentiles.split(",") - ], + selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")], ignore_eos=args.ignore_eos, goodput_config_dict=goodput_config_dict, max_concurrency=args.max_concurrency, lora_modules=args.lora_modules, extra_body=sampling_params, - )) + ) + ) # Save config and results to json if args.save_result: @@ -819,22 +796,23 @@ def main(args: argparse.Namespace): kvstring = item.split("=") result_json[kvstring[0].strip()] = kvstring[1].strip() else: - raise ValueError( - "Invalid metadata format. Please use KEY=VALUE format." - ) + raise ValueError("Invalid metadata format. Please use KEY=VALUE format.") if not args.save_detailed: # Remove fields with too many data points for field in [ - "input_lens", "output_lens", "ttfts", "itls", - "generated_texts", "errors" + "input_lens", + "output_lens", + "ttfts", + "itls", + "generated_texts", + "errors", ]: if field in result_json: del result_json[field] # Traffic - result_json["request_rate"] = (args.request_rate if args.request_rate - < float("inf") else "inf") + result_json["request_rate"] = args.request_rate if args.request_rate < float("inf") else "inf" result_json["burstiness"] = args.burstiness result_json["max_concurrency"] = args.max_concurrency @@ -843,21 +821,19 @@ def main(args: argparse.Namespace): # Save to file base_model_id = model_id.split("/")[-1] - max_concurrency_str = (f"-concurrency{args.max_concurrency}" - if args.max_concurrency is not None else "") - file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" #noqa + max_concurrency_str = f"-concurrency{args.max_concurrency}" if args.max_concurrency is not None else "" + file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" if args.result_filename: file_name = args.result_filename if args.result_dir: file_name = os.path.join(args.result_dir, file_name) - with open(file_name, "w", encoding='utf-8') as outfile: + with open(file_name, "w", encoding="utf-8") as outfile: json.dump(result_json, outfile) save_to_pytorch_benchmark_format(args, result_json, file_name) if __name__ == "__main__": - parser = FlexibleArgumentParser( - description="Benchmark the online serving throughput.") + parser = FlexibleArgumentParser(description="Benchmark the online serving throughput.") parser.add_argument( "--backend", type=str, @@ -883,18 +859,29 @@ if __name__ == "__main__": "--dataset-name", type=str, default="sharegpt", - choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "EB", "EBChat"], + choices=[ + "sharegpt", + "burstgpt", + "sonnet", + "random", + "hf", + "EB", + "EBChat", + ], help="Name of the dataset to benchmark on.", ) - parser.add_argument("--dataset-path", - type=str, - default=None, - help="Path to the sharegpt/sonnet dataset. 
" - "Or the huggingface dataset ID if using HF dataset.") - parser.add_argument("--hyperparameter-path", - type=str, - default=None, - help="Path to the hyperparameter. ") + parser.add_argument( + "--dataset-path", + type=str, + default=None, + help="Path to the sharegpt/sonnet dataset. " "Or the huggingface dataset ID if using HF dataset.", + ) + parser.add_argument( + "--hyperparameter-path", + type=str, + default=None, + help="Path to the hyperparameter. ", + ) parser.add_argument( "--max-concurrency", type=int, @@ -906,7 +893,8 @@ if __name__ == "__main__": "initiated, this argument will control how many are actually allowed " "to execute at a time. This means that when used in combination, the " "actual request rate may be lower than specified with --request-rate, " - "if the server is not processing requests fast enough to keep up.") + "if the server is not processing requests fast enough to keep up.", + ) parser.add_argument( "--model", @@ -917,7 +905,7 @@ if __name__ == "__main__": parser.add_argument( "--tokenizer", type=str, - help="Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501 + help="Name or path of the tokenizer, if not using the default tokenizer.", ) parser.add_argument("--use-beam-search", action="store_true") parser.add_argument( @@ -930,11 +918,13 @@ if __name__ == "__main__": "--logprobs", type=int, default=None, - help=("Number of logprobs-per-token to compute & return as part of " - "the request. If unspecified, then either (1) if beam search " - "is disabled, no logprobs are computed & a single dummy " - "logprob is returned for each token; or (2) if beam search " - "is enabled 1 logprob per token is computed"), + help=( + "Number of logprobs-per-token to compute & return as part of " + "the request. If unspecified, then either (1) if beam search " + "is disabled, no logprobs are computed & a single dummy " + "logprob is returned for each token; or (2) if beam search " + "is enabled 1 logprob per token is computed" + ), ) parser.add_argument( "--request-rate", @@ -971,8 +961,7 @@ if __name__ == "__main__": parser.add_argument( "--profile", action="store_true", - help="Use Torch Profiler. The endpoint must be launched with " - "VLLM_TORCH_PROFILER_DIR to enable profiler.", + help="Use Torch Profiler. The endpoint must be launched with " "VLLM_TORCH_PROFILER_DIR to enable profiler.", ) parser.add_argument( "--save-result", @@ -1013,35 +1002,38 @@ if __name__ == "__main__": "--ignore-eos", action="store_true", help="Set ignore_eos flag when sending the benchmark request." - "Warning: ignore_eos is not supported in deepspeed_mii and tgi.") + "Warning: ignore_eos is not supported in deepspeed_mii and tgi.", + ) parser.add_argument( "--percentile-metrics", type=str, default="ttft,tpot,itl", help="Comma-separated list of selected metrics to report percentils. " "This argument specifies the metrics to report percentiles. " - "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". " - "Default value is \"ttft,tpot,itl\".") + 'Allowed metric names are "ttft", "tpot", "itl", "e2el". ' + 'Default value is "ttft,tpot,itl".', + ) parser.add_argument( "--metric-percentiles", type=str, default="99", help="Comma-separated list of percentiles for selected metrics. " - "To report 25-th, 50-th, and 75-th percentiles, use \"25,50,75\". " - "Default value is \"99\". " - "Use \"--percentile-metrics\" to select metrics.", + 'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". ' + 'Default value is "99". 
' + 'Use "--percentile-metrics" to select metrics.', ) parser.add_argument( "--goodput", nargs="+", required=False, - help="Specify service level objectives for goodput as \"KEY:VALUE\" " + help='Specify service level objectives for goodput as "KEY:VALUE" ' "pairs, where the key is a metric name, and the value is in " - "milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, " + 'milliseconds. Multiple "KEY:VALUE" pairs can be provided, ' "separated by spaces. Allowed request level metric names are " - "\"ttft\", \"tpot\", \"e2el\". For more context on the definition of " + '"ttft", "tpot", "e2el". For more context on the definition of ' "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 " - "and the blog: https://hao-ai-lab.github.io/blogs/distserve") + "and the blog: https://hao-ai-lab.github.io/blogs/distserve", + ) # group for dataset specific arguments sonnet_group = parser.add_argument_group("sonnet dataset options") @@ -1069,8 +1061,8 @@ if __name__ == "__main__": "--sharegpt-output-len", type=int, default=None, - help="Output length for each request. Overrides the output length " - "from the ShareGPT dataset.") + help="Output length for each request. Overrides the output length " "from the ShareGPT dataset.", + ) random_group = parser.add_argument_group("random dataset options") random_group.add_argument( @@ -1098,29 +1090,24 @@ if __name__ == "__main__": "--random-prefix-len", type=int, default=0, - help=("Number of fixed prefix tokens before the random context " - "in a request. " - "The total input length is the sum of `random-prefix-len` and " - "a random " - "context length sampled from [input_len * (1 - range_ratio), " - "input_len * (1 + range_ratio)]."), + help=( + "Number of fixed prefix tokens before the random context " + "in a request. " + "The total input length is the sum of `random-prefix-len` and " + "a random " + "context length sampled from [input_len * (1 - range_ratio), " + "input_len * (1 + range_ratio)]." + ), ) hf_group = parser.add_argument_group("hf dataset options") - hf_group.add_argument("--hf-subset", - type=str, - default=None, - help="Subset of the HF dataset.") - hf_group.add_argument("--hf-split", - type=str, - default=None, - help="Split of the HF dataset.") + hf_group.add_argument("--hf-subset", type=str, default=None, help="Subset of the HF dataset.") + hf_group.add_argument("--hf-split", type=str, default=None, help="Split of the HF dataset.") hf_group.add_argument( "--hf-output-len", type=int, default=None, - help="Output length for each request. Overrides the output lengths " - "from the sampled HF dataset.", + help="Output length for each request. Overrides the output lengths " "from the sampled HF dataset.", ) sampling_group = parser.add_argument_group("sampling parameters") @@ -1128,52 +1115,58 @@ if __name__ == "__main__": "--top-p", type=float, default=None, - help="Top-p sampling parameter. Only has effect on openai-compatible " - "backends.") + help="Top-p sampling parameter. Only has effect on openai-compatible " "backends.", + ) sampling_group.add_argument( "--top-k", type=int, default=None, - help="Top-k sampling parameter. Only has effect on openai-compatible " - "backends.") + help="Top-k sampling parameter. Only has effect on openai-compatible " "backends.", + ) sampling_group.add_argument( "--min-p", type=float, default=None, - help="Min-p sampling parameter. Only has effect on openai-compatible " - "backends.") + help="Min-p sampling parameter. 
Only has effect on openai-compatible " "backends.", + ) sampling_group.add_argument( "--temperature", type=float, default=None, help="Temperature sampling parameter. Only has effect on " "openai-compatible backends. If not specified, default to greedy " - "decoding (i.e. temperature==0.0).") + "decoding (i.e. temperature==0.0).", + ) parser.add_argument( - '--tokenizer-mode', + "--tokenizer-mode", type=str, default="auto", - choices=['auto', 'slow', 'mistral', 'custom'], + choices=["auto", "slow", "mistral", "custom"], help='The tokenizer mode.\n\n* "auto" will use the ' 'fast tokenizer if available.\n* "slow" will ' - 'always use the slow tokenizer. \n* ' + "always use the slow tokenizer. \n* " '"mistral" will always use the `mistral_common` tokenizer. \n*' - '"custom" will use --tokenizer to select the preregistered tokenizer.') + '"custom" will use --tokenizer to select the preregistered tokenizer.', + ) - parser.add_argument("--served-model-name", - type=str, - default=None, - help="The model name used in the API. " - "If not specified, the model name will be the " - "same as the ``--model`` argument. ") + parser.add_argument( + "--served-model-name", + type=str, + default=None, + help="The model name used in the API. " + "If not specified, the model name will be the " + "same as the ``--model`` argument. ", + ) - parser.add_argument("--lora-modules", - nargs='+', - default=None, - help="A subset of LoRA module names passed in when " - "launching the server. For each request, the " - "script chooses a LoRA module at random.") + parser.add_argument( + "--lora-modules", + nargs="+", + default=None, + help="A subset of LoRA module names passed in when " + "launching the server. For each request, the " + "script chooses a LoRA module at random.", + ) args = parser.parse_args() diff --git a/benchmarks/yaml/eb45-21B-vl-128k-wint4-h800-tp1.yaml b/benchmarks/yaml/eb45-21B-vl-128k-wint4-h800-tp1.yaml index db8a20b86..ffa5ceac3 100644 --- a/benchmarks/yaml/eb45-21B-vl-128k-wint4-h800-tp1.yaml +++ b/benchmarks/yaml/eb45-21B-vl-128k-wint4-h800-tp1.yaml @@ -7,4 +7,4 @@ tensor_parallel_size: 1 enable_chunked_prefill: True max_num_batched_tokens: 384 quantization: wint4 -reasoning_parser: ernie-45-vl \ No newline at end of file +reasoning_parser: ernie-45-vl diff --git a/benchmarks/yaml/eb45-32k-w4a8c8-tp4_decode.yaml b/benchmarks/yaml/eb45-32k-w4a8c8-tp4_decode.yaml index 957f59d2a..985ef7a34 100644 --- a/benchmarks/yaml/eb45-32k-w4a8c8-tp4_decode.yaml +++ b/benchmarks/yaml/eb45-32k-w4a8c8-tp4_decode.yaml @@ -12,4 +12,4 @@ rdma_comm_ports: "7671,7672,7673,7674" pd_comm_port: "2334" max_num_batched_tokens: 384 max_num_partial_prefills: 3 -max_long_partial_prefills: 3 \ No newline at end of file +max_long_partial_prefills: 3 diff --git a/benchmarks/yaml/eb45-32k-w4a8c8-tp4_prefill.yaml b/benchmarks/yaml/eb45-32k-w4a8c8-tp4_prefill.yaml index c1466160d..2831838fd 100644 --- a/benchmarks/yaml/eb45-32k-w4a8c8-tp4_prefill.yaml +++ b/benchmarks/yaml/eb45-32k-w4a8c8-tp4_prefill.yaml @@ -9,4 +9,4 @@ cache_queue_port: 55664 engine_worker_queue_port: 6677 cache_transfer_protocol: "rdma,ipc" rdma_comm_ports: "7675,7676,7677,7678" -pd_comm_port: "2333" \ No newline at end of file +pd_comm_port: "2333" diff --git a/benchmarks/yaml/eb45-32k-wint4-h800-dp8_prefill.yaml b/benchmarks/yaml/eb45-32k-wint4-h800-dp8_prefill.yaml index e6d0fa6e0..b7c26ac39 100644 --- a/benchmarks/yaml/eb45-32k-wint4-h800-dp8_prefill.yaml +++ b/benchmarks/yaml/eb45-32k-wint4-h800-dp8_prefill.yaml @@ -10,4 +10,4 @@ engine_worker_queue_port: 
6677 num_gpu_blocks_override: 1024 cache_transfer_protocol: "rdma" rdma_comm_ports: "7671,7672,7673,7674,7675,7676,7677,7678" -pd_comm_port: "2334" \ No newline at end of file +pd_comm_port: "2334" diff --git a/benchmarks/yaml/eb45-32k-wint4-mtp-tp4-decode.yaml b/benchmarks/yaml/eb45-32k-wint4-mtp-tp4-decode.yaml index e239cea89..401cd61be 100644 --- a/benchmarks/yaml/eb45-32k-wint4-mtp-tp4-decode.yaml +++ b/benchmarks/yaml/eb45-32k-wint4-mtp-tp4-decode.yaml @@ -10,4 +10,4 @@ splitwise_role: decode engine_worker_queue_port: 6678 cache_transfer_protocol: "rdma,ipc" rdma_comm_ports: "7671,7672,7673,7674" -pd_comm_port: "2334" \ No newline at end of file +pd_comm_port: "2334" diff --git a/benchmarks/yaml/eb45-32k-wint4-mtp-tp4-prefill.yaml b/benchmarks/yaml/eb45-32k-wint4-mtp-tp4-prefill.yaml index 6d759c843..a4e9ca7af 100644 --- a/benchmarks/yaml/eb45-32k-wint4-mtp-tp4-prefill.yaml +++ b/benchmarks/yaml/eb45-32k-wint4-mtp-tp4-prefill.yaml @@ -9,4 +9,4 @@ cache_queue_port: 55664 engine_worker_queue_port: 6677 cache_transfer_protocol: "rdma,ipc" rdma_comm_ports: "7675,7676,7677,7678" -pd_comm_port: "2333" \ No newline at end of file +pd_comm_port: "2333" diff --git a/benchmarks/yaml/eb45-32k-wint4-tp4_decode.yaml b/benchmarks/yaml/eb45-32k-wint4-tp4_decode.yaml index 957f59d2a..985ef7a34 100644 --- a/benchmarks/yaml/eb45-32k-wint4-tp4_decode.yaml +++ b/benchmarks/yaml/eb45-32k-wint4-tp4_decode.yaml @@ -12,4 +12,4 @@ rdma_comm_ports: "7671,7672,7673,7674" pd_comm_port: "2334" max_num_batched_tokens: 384 max_num_partial_prefills: 3 -max_long_partial_prefills: 3 \ No newline at end of file +max_long_partial_prefills: 3 diff --git a/benchmarks/yaml/eb45-32k-wint4-tp4_prefill.yaml b/benchmarks/yaml/eb45-32k-wint4-tp4_prefill.yaml index c1466160d..2831838fd 100644 --- a/benchmarks/yaml/eb45-32k-wint4-tp4_prefill.yaml +++ b/benchmarks/yaml/eb45-32k-wint4-tp4_prefill.yaml @@ -9,4 +9,4 @@ cache_queue_port: 55664 engine_worker_queue_port: 6677 cache_transfer_protocol: "rdma,ipc" rdma_comm_ports: "7675,7676,7677,7678" -pd_comm_port: "2333" \ No newline at end of file +pd_comm_port: "2333" diff --git a/benchmarks/yaml/qwen3moe235b-32k-wint4-h800-tp4.yaml b/benchmarks/yaml/qwen3moe235b-32k-wint4-h800-tp4.yaml index 7a127995e..8e4c5717c 100644 --- a/benchmarks/yaml/qwen3moe235b-32k-wint4-h800-tp4.yaml +++ b/benchmarks/yaml/qwen3moe235b-32k-wint4-h800-tp4.yaml @@ -3,4 +3,4 @@ max_num_seqs: 75 gpu_memory_utilization: 0.85 kv_cache_ratio: 0.75 quantization: wint4 -tensor_parallel_size: 4 \ No newline at end of file +tensor_parallel_size: 4 diff --git a/benchmarks/yaml/qwen3moe235b-32k-wint8-h800-tp4.yaml b/benchmarks/yaml/qwen3moe235b-32k-wint8-h800-tp4.yaml index 4d6cff601..8531d311e 100644 --- a/benchmarks/yaml/qwen3moe235b-32k-wint8-h800-tp4.yaml +++ b/benchmarks/yaml/qwen3moe235b-32k-wint8-h800-tp4.yaml @@ -3,4 +3,4 @@ max_num_seqs: 25 gpu_memory_utilization: 0.9 kv_cache_ratio: 0.75 quantization: wint8 -tensor_parallel_size: 4 \ No newline at end of file +tensor_parallel_size: 4 diff --git a/benchmarks/yaml/request_yaml/quick_benchmark.yaml b/benchmarks/yaml/request_yaml/quick_benchmark.yaml index c7e608c80..2af93c8f1 100644 --- a/benchmarks/yaml/request_yaml/quick_benchmark.yaml +++ b/benchmarks/yaml/request_yaml/quick_benchmark.yaml @@ -1,3 +1,3 @@ metadata: min_tokens: 32 -max_tokens: 33 \ No newline at end of file +max_tokens: 33 diff --git a/benchmarks/yaml/request_yaml/qwen2-32k.yaml b/benchmarks/yaml/request_yaml/qwen2-32k.yaml index 464277942..8227a373d 100644 --- 
a/benchmarks/yaml/request_yaml/qwen2-32k.yaml +++ b/benchmarks/yaml/request_yaml/qwen2-32k.yaml @@ -5,4 +5,4 @@ metadata: max_tokens: 12288 repetition_penalty: 1.05 frequency_penalty: 0 -presence_penalty: 0 \ No newline at end of file +presence_penalty: 0 diff --git a/benchmarks/yaml/request_yaml/qwen3-32k.yaml b/benchmarks/yaml/request_yaml/qwen3-32k.yaml index 8f1fc1fd7..b00f2aa26 100644 --- a/benchmarks/yaml/request_yaml/qwen3-32k.yaml +++ b/benchmarks/yaml/request_yaml/qwen3-32k.yaml @@ -5,4 +5,4 @@ metadata: max_tokens: 12288 repetition_penalty: 1.0 frequency_penalty: 0 -presence_penalty: 1.5 \ No newline at end of file +presence_penalty: 1.5 diff --git a/benchmarks/yaml/request_yaml/vLLM_default.yaml b/benchmarks/yaml/request_yaml/vLLM_default.yaml index 4be43ad1b..a6385823b 100644 --- a/benchmarks/yaml/request_yaml/vLLM_default.yaml +++ b/benchmarks/yaml/request_yaml/vLLM_default.yaml @@ -8,4 +8,4 @@ frequency_penalty: 0 presence_penalty: 0 skip_special_tokens: false chat_template_kwargs: - enable_thinking: true \ No newline at end of file + enable_thinking: true diff --git a/benchmarks/yaml/x1-32k-wint8-p800-tp8.yaml b/benchmarks/yaml/x1-32k-wint8-p800-tp8.yaml index 376177602..220db3068 100644 --- a/benchmarks/yaml/x1-32k-wint8-p800-tp8.yaml +++ b/benchmarks/yaml/x1-32k-wint8-p800-tp8.yaml @@ -3,4 +3,4 @@ max_num_seqs: 64 gpu_memory_utilization: 0.9 tensor_parallel_size: 8 quantization: wint8 -reasoning_parser: ernie-x1 \ No newline at end of file +reasoning_parser: ernie-x1 diff --git a/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.h b/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.h index c25f68211..b3fe75b2c 100644 --- a/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.h +++ b/custom_ops/gpu_ops/append_attn/decoder_write_cache_with_rope_kernel.h @@ -40,4 +40,4 @@ void DecoderWriteCacheWithRoPEKernel( cudaStream_t& stream, paddle::Tensor* qkv_out, paddle::Tensor* key_cache_out, - paddle::Tensor* value_cache_out); \ No newline at end of file + paddle::Tensor* value_cache_out); diff --git a/custom_ops/gpu_ops/append_attn/gqa_rope_write_cache.cu b/custom_ops/gpu_ops/append_attn/gqa_rope_write_cache.cu index f63f36a6b..2cba8d547 100644 --- a/custom_ops/gpu_ops/append_attn/gqa_rope_write_cache.cu +++ b/custom_ops/gpu_ops/append_attn/gqa_rope_write_cache.cu @@ -216,7 +216,7 @@ __global__ void append_dequant_cache_kv_c8( uint32_t k_smem_offset_r = smem_t::get_permuted_offset( wid * 16 + 8 * (tid / 16) + tid % 8, (tid % 16) / 8); - + uint32_t k_read_idx = (wid * 4 + tid / 8) * HEAD_DIM + tid % 8 * num_elems_per_128b(); @@ -330,7 +330,7 @@ __global__ void append_dequant_cache_kv_c8( v_tile_ptr0[8 * kv_t_stride] = frag_dq_T[2] * cache_v_scale; v_tile_ptr0[9 * kv_t_stride] = frag_dq_T[3] * cache_v_scale; - + convert_c8(frag_dq_T + 4, v_frag[2 * i + 1]); // 4个uint8/fp8 -> 4个T #ifdef C8_DEBUG if (tid == 0 && wid == 0 && tile_idx == 0 && kv_head_idx == 0) { @@ -373,14 +373,14 @@ void AppendDequantCache( paddle::Tensor *k_out, paddle::Tensor *v_out, const cudaStream_t& stream -) { +) { using NV_TYPE = typename cascade_attn_type_traits::type; if (cache_quant_type == "cache_int8" || cache_quant_type == "cache_fp8") { constexpr int NUM_WARPS = 4; int block_num = cache_num_blocks_x.data()[0]; dim3 grids(block_num, 1, kv_num_heads); dim3 blocks(32, NUM_WARPS); - + const uint32_t smem_size = BLOCK_SIZE * HEAD_DIM * sizeof(uint8_t) * 2; auto kernel_func = append_dequant_cache_kv_c8; diff --git 
a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh index ed8952ad5..936d88e87 100644 --- a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh +++ b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh @@ -41,7 +41,7 @@ __global__ void append_clear_cache_int8_block( const int wid = tid / 32; const int lane_id = tid % 32; const int token_id = blockIdx.x; - + const int bid = batch_id_per_token[token_id]; const int start_token_idx = cu_seqlens_q[bid]; @@ -115,7 +115,7 @@ __global__ void append_clear_cache_int4_block( const int wid = tid / 32; const int lane_id = tid % 32; const int token_id = blockIdx.x; - + const int bid = batch_id_per_token[token_id]; const int start_token_idx = cu_seqlens_q[bid]; @@ -484,7 +484,7 @@ __global__ void append_speculate_cache_int8_rope_kernel( const int wid = tid / 32; const int lane_id = tid % 32; const int token_id = blockIdx.x; - + const int bid = batch_id_per_token[token_id]; const int start_token_idx = cu_seqlens_q[bid]; @@ -716,7 +716,7 @@ __global__ void append_speculate_cache_int8_neox_rope_kernel( const int wid = tid / 32; const int lane_id = tid % 32; const int token_id = blockIdx.x; - + const int bid = batch_id_per_token[token_id]; const int start_token_idx = cu_seqlens_q[bid]; @@ -1097,7 +1097,7 @@ __global__ void append_speculate_cache_int4_rope_kernel( const int lane_id = tid % 32; const int token_id = blockIdx.x; - + const int bid = batch_id_per_token[token_id]; const int start_token_idx = cu_seqlens_q[bid]; @@ -1403,7 +1403,7 @@ __global__ void append_speculate_cache_int4_neox_rope_kernel( const int lane_id = tid % 32; const int token_id = blockIdx.x; - + const int bid = batch_id_per_token[token_id]; const int start_token_idx = cu_seqlens_q[bid]; @@ -1792,4 +1792,4 @@ __global__ void append_speculate_cache_int4_neox_rope_kernel( (uint_quant_value2 << 4) | (uint_quant_value1 & 0x0F); } } -} \ No newline at end of file +} diff --git a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.cu b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.cu index b7c533a38..fb6a24fef 100644 --- a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.cu +++ b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.cu @@ -582,4 +582,4 @@ SpeculateWriteCacheWithRoPEKernel( cudaStream_t& stream, paddle::Tensor* qkv_out, paddle::Tensor* key_cache_out, - paddle::Tensor* value_cache_out); \ No newline at end of file + paddle::Tensor* value_cache_out); diff --git a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.h b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.h index bb192f5a9..40ab34e05 100644 --- a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.h +++ b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_kernel.h @@ -39,4 +39,4 @@ void SpeculateWriteCacheWithRoPEKernel( cudaStream_t& stream, paddle::Tensor* qkv_out, paddle::Tensor* key_cache_out, - paddle::Tensor* value_cache_out); \ No newline at end of file + paddle::Tensor* value_cache_out); diff --git a/custom_ops/gpu_ops/common/cudaUtils.h b/custom_ops/gpu_ops/common/cudaUtils.h index 2a2abfffb..9bbd1f6e8 100644 --- a/custom_ops/gpu_ops/common/cudaUtils.h +++ b/custom_ops/gpu_ops/common/cudaUtils.h @@ -30,4 +30,4 @@ inline int getSMVersion() return sm_major * 10 + sm_minor; } -} \ No newline at end of file +} diff --git 
a/custom_ops/gpu_ops/cutlass_extensions/epilogue_helpers.h b/custom_ops/gpu_ops/cutlass_extensions/epilogue_helpers.h index 31fc95b81..6ed5b9b92 100644 --- a/custom_ops/gpu_ops/cutlass_extensions/epilogue_helpers.h +++ b/custom_ops/gpu_ops/cutlass_extensions/epilogue_helpers.h @@ -136,4 +136,4 @@ struct Epilogue; }; -} // namespace cutlass_extensions \ No newline at end of file +} // namespace cutlass_extensions diff --git a/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/collective_builder.hpp b/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/collective_builder.hpp index 7d25428b5..d327eb18a 100644 --- a/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/collective_builder.hpp +++ b/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/collective_builder.hpp @@ -1,11 +1,11 @@ // Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at -// +// // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/fp8_accumulation.hpp b/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/fp8_accumulation.hpp index d4dd7d3a8..0a530e5c1 100644 --- a/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/fp8_accumulation.hpp +++ b/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/fp8_accumulation.hpp @@ -1,11 +1,11 @@ // Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at -// +// // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -54,7 +54,7 @@ ///////////////////////////////////FP8 Accumulation/////////////////////////// ////////////////////////////////////////////////////////////////////////////// /// This class provides API to promote (add) or scale (multiply_add) the results -/// from the tensor core accumulators to the main accumulators when the number +/// from the tensor core accumulators to the main accumulators when the number /// of MMAs reaches the max number of MMA interval specified by user, after that /// the tensor core accumulators are zeroed. ////////////////////////////////////////////////////////////////////////////// @@ -64,7 +64,7 @@ namespace cutlass::gemm::collective { template < class EngineAccum, class LayoutAccum> -struct GmmaFP8AccumulationWithScale { +struct GmmaFP8AccumulationWithScale { using TensorAccum = cute::Tensor; using ElementAccumulator = typename EngineAccum::value_type; @@ -78,7 +78,7 @@ private: uint32_t accum_promotion_interval_; // defines the max num of executed MMAs after which accum should be promoted. uint32_t mma_count_per_mainloop_iteration_; // num of MMAs per k_tile of mainloop uint32_t mma_count_; // current executed MMAs - uint32_t reset_accum_flag_; // accum needs to be zeroed or not. + uint32_t reset_accum_flag_; // accum needs to be zeroed or not. 
// promote or `add` the partial accumulators to main accumulator (FADD). CUTLASS_DEVICE @@ -116,11 +116,11 @@ public: TensorAccum &accum, uint32_t accum_promotion_interval, uint32_t mma_count_per_mainloop_iteration) - : accum_(accum), + : accum_(accum), accum_promotion_interval_(accum_promotion_interval), mma_count_per_mainloop_iteration_(mma_count_per_mainloop_iteration), - mma_count_(0), - reset_accum_flag_(0) + mma_count_(0), + reset_accum_flag_(0) { accum_temp_ = cute::make_fragment_like(accum); } @@ -129,14 +129,14 @@ public: // Methods (Common) // - CUTLASS_DEVICE + CUTLASS_DEVICE TensorAccum& operator()() { return accum_temp_; } /// prepare the MMA accumulators when initialization or zeroing is required. CUTLASS_DEVICE - bool prepare_if_needed() { + bool prepare_if_needed() { return reset_accum_flag_; } diff --git a/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp b/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp index bd25a9004..be1f9747e 100644 --- a/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp +++ b/custom_ops/gpu_ops/cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp @@ -1,11 +1,11 @@ // Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at -// +// // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -137,7 +137,7 @@ struct CollectiveMma< using PipelineParams = typename MainloopPipeline::Params; // Two threads per CTA are producers (1 for operand tile and 32 for scales) - static constexpr int NumProducerThreadEvents = 33; + static constexpr int NumProducerThreadEvents = 33; static constexpr int ScaleGranularityM = ScaleGranularityM_ == 0 ? size<0>(TileShape{}) : ScaleGranularityM_; static constexpr int ScaleMsPerTile = size<0>(TileShape{}) / ScaleGranularityM; @@ -161,11 +161,11 @@ struct CollectiveMma< SmemLayoutAtomB{}, make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int{}), cute::conditional_t< ::cutlass::gemm::detail::is_major<0,StrideB>(), Step<_2,_1,_3>, Step<_1,_2,_3>>{})); - - // Block scaling gmem-to-smem copy atom + + // Block scaling gmem-to-smem copy atom using SmemBlockScalingCopyAtomA = Copy_Atom, ElementBlockScale>; using SmemBlockScalingCopyAtomB = Copy_Atom, ElementBlockScale>; - + // Block scaling smem layout using SmemLayoutScaleA = Layout, Int>>; using SmemLayoutScaleB = Layout>, Stride<_1>>; // `ScaleNsPerTile` is always 1. 
@@ -202,7 +202,7 @@ struct CollectiveMma< StrideA dA; ElementB const* ptr_B; StrideB dB; - ElementBlockScale const* ptr_scale_A; + ElementBlockScale const* ptr_scale_A; ElementBlockScale const* ptr_scale_B; }; @@ -228,7 +228,7 @@ struct CollectiveMma< uint32_t tma_transaction_bytes_mk = TmaTransactionBytesMK; uint32_t tma_transaction_bytes_nk = TmaTransactionBytesNK; // Block scaling factors for A and B - ElementBlockScale const* ptr_scale_A; + ElementBlockScale const* ptr_scale_A; ElementBlockScale const* ptr_scale_B; }; @@ -285,7 +285,7 @@ struct CollectiveMma< constexpr int tma_alignment_bits = 128; auto problem_shape_MNKL = append<4>(problem_shape, 1); auto [M,N,K,L] = problem_shape_MNKL; - + bool implementable = true; constexpr int min_tma_aligned_elements_A = tma_alignment_bits / cutlass::sizeof_bits::value; implementable = implementable && cutlass::detail::check_alignment(cute::make_shape(M,K,L), StrideA{}); @@ -346,7 +346,7 @@ struct CollectiveMma< auto scaleB_shape = make_shape(tN, tK, L); // (n,k,l) auto scaleB_layout = make_ordered_layout(scaleB_shape, Step<_1, _0, _2>{}); - // Note that mScaleA_mkl and mScaleB_nkl are already blocked tiled in the `m` host and + // Note that mScaleA_mkl and mScaleB_nkl are already blocked tiled in the `m` host and // gScaleA_mkl and gScaleB_nkl in `g` global memory are same as mScaleA_mkl and mScaleB_nkl. Tensor mScaleA_mkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_A), scaleA_layout); // (scale_m,k,l) Tensor mScaleB_nkl = make_tensor(make_gmem_ptr(mainloop_params.ptr_scale_B), scaleB_layout); // (n,k,l) @@ -406,26 +406,26 @@ struct CollectiveMma< Tensor cScaleA_mkl = make_identity_tensor(mScaleA_mkl.shape()); - Tensor gScaleA = local_tile( - mScaleA_mkl, make_tile(Int{}), + Tensor gScaleA = local_tile( + mScaleA_mkl, make_tile(Int{}), make_coord(m_coord,_,l_coord)); // (ScaleMsPerTile,k,1) - Tensor cScaleA = local_tile( - cScaleA_mkl, make_tile(Int{}), + Tensor cScaleA = local_tile( + cScaleA_mkl, make_tile(Int{}), make_coord(m_coord,_,l_coord)); Tensor gScaleB = mScaleB_nkl(n_coord,_,l_coord); // (1,k,1) // TODO: test `scale_copy_a` with `ScaleMsPerTile` < 128 - TiledCopy scale_copy_a = make_tiled_copy(SmemBlockScalingCopyAtomA{}, + TiledCopy scale_copy_a = make_tiled_copy(SmemBlockScalingCopyAtomA{}, Layout>{}, Layout>{}); // (1,1,1) - TiledCopy scale_copy_b = make_tiled_copy(SmemBlockScalingCopyAtomB{}, + TiledCopy scale_copy_b = make_tiled_copy(SmemBlockScalingCopyAtomB{}, Layout>{}, Layout>{}); // (1,1,1) ThrCopy thr_scale_copy_a = scale_copy_a.get_slice(threadIdx.x); ThrCopy thr_scale_copy_b = scale_copy_b.get_slice(threadIdx.x); - + Tensor tAgA_ScaleA = thr_scale_copy_a.partition_S(gScaleA); Tensor tAcA_ScaleA = thr_scale_copy_a.partition_S(cScaleA); Tensor tAsA_ScaleA = thr_scale_copy_a.partition_D(sScaleA); - + Tensor tBgB_ScaleB = thr_scale_copy_b.partition_S(gScaleB); Tensor tBsB_ScaleB = thr_scale_copy_b.partition_D(sScaleB); @@ -455,7 +455,7 @@ struct CollectiveMma< } } - // Allocate predicate tensors for a_scales (since we can't guarantee that + // Allocate predicate tensors for a_scales (since we can't guarantee that // all scales are valid, since we could have a partial tiles along M) Tensor tApA_ScaleA = make_tensor(shape(tAsA_ScaleA(_,_,0))); #pragma unroll @@ -536,7 +536,7 @@ struct CollectiveMma< Tensor sA = make_tensor(make_smem_ptr(shared_tensors.smem_A.data()), SmemLayoutA{}); // (BLK_M,BLK_K,PIPE) Tensor sB = make_tensor(make_smem_ptr(shared_tensors.smem_B.data()), SmemLayoutB{}); // (BLK_N,BLK_K,PIPE) - + // 
Block scaling Tensor sScaleAViewAsC = make_tensor(cute::make_smem_ptr(shared_tensors.smem_scale_A.data()), Layout< @@ -548,17 +548,17 @@ struct CollectiveMma< // // Define C accumulators and A/B partitioning // - + // Layout of warp group to thread mapping - static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and + static_assert(stride<0>(typename TiledMma::ALayout{}) == 0 and stride<0>(typename TiledMma::BLayout{}) == 0 and size<0>(typename TiledMma::ALayout{}) == NumThreadsPerWarpGroup and - size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, + size<0>(typename TiledMma::BLayout{}) == NumThreadsPerWarpGroup, "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup"); constexpr int MmaWarpGroups = size(TiledMma{}) / NumThreadsPerWarpGroup; - Layout warp_group_thread_layout = make_layout(Int{}, + Layout warp_group_thread_layout = make_layout(Int{}, Int{}); int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / NumThreadsPerWarpGroup, 0); @@ -590,7 +590,7 @@ struct CollectiveMma< // We release buffers to producer warps(dma load) with some mmas in flight PipelineState smem_pipe_release = smem_pipe_read; - + // Per block scale values for operand A and B using RegLayoutScaleAViewAsC = decltype(make_layout_like(tCsScaleAViewAsC(_, _, _, 0).layout())); // `make_layout_like` makes a compact layout. @@ -618,7 +618,7 @@ struct CollectiveMma< } int read_stage = smem_pipe_read.index(); - + // Load per block scale values from shared memory to registers. scale_b = sScaleB[read_stage]; CUTLASS_PRAGMA_UNROLL @@ -668,7 +668,7 @@ struct CollectiveMma< int read_stage = smem_pipe_read.index(); - // Load per block scale values from shared memory to registers (at most twice per block along M and exactly once per block along N) + // Load per block scale values from shared memory to registers (at most twice per block along M and exactly once per block along N) scale_b = sScaleB[read_stage]; CUTLASS_PRAGMA_UNROLL for (int i = 0; i < size(RegLayoutScaleAEssential{}); i++) { @@ -712,7 +712,7 @@ struct CollectiveMma< ++smem_pipe_read; ++smem_pipe_release; } - + accumulation.scale_residue_if_needed(tCrScaleAViewAsC); warpgroup_fence_operand(accumulation()); diff --git a/custom_ops/gpu_ops/cutlass_extensions/gemm/dispatch_policy.hpp b/custom_ops/gpu_ops/cutlass_extensions/gemm/dispatch_policy.hpp index ca0acd826..f4cf0bf42 100644 --- a/custom_ops/gpu_ops/cutlass_extensions/gemm/dispatch_policy.hpp +++ b/custom_ops/gpu_ops/cutlass_extensions/gemm/dispatch_policy.hpp @@ -1,11 +1,11 @@ // Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at -// +// // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -50,4 +50,4 @@ struct MainloopSm90TmaGmmaWarpSpecializedBlockScalingSubGroupMFP8 ////////////////////////////////////////////////////////////////////////////// -} // namespace cutlass::gemm \ No newline at end of file +} // namespace cutlass::gemm diff --git a/custom_ops/gpu_ops/cutlass_extensions/gemm/kernel/gemm_moe_problem_visitor.h b/custom_ops/gpu_ops/cutlass_extensions/gemm/kernel/gemm_moe_problem_visitor.h index 2cc91d611..5bce307a2 100644 --- a/custom_ops/gpu_ops/cutlass_extensions/gemm/kernel/gemm_moe_problem_visitor.h +++ b/custom_ops/gpu_ops/cutlass_extensions/gemm/kernel/gemm_moe_problem_visitor.h @@ -90,4 +90,4 @@ struct GemmMoeProblemVisitor } // namespace gemm } // namespace cutlass -///////////////////////////////////////////////////////////////////////////////////////////////// \ No newline at end of file +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_mma_multistage.h b/custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_mma_multistage.h index 38fdcf9fe..9531b01a7 100644 --- a/custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_mma_multistage.h +++ b/custom_ops/gpu_ops/cutlass_extensions/gemm/threadblock/wint2x_mma_multistage.h @@ -90,7 +90,7 @@ template < SharedMemoryClearOption SharedMemoryClear = SharedMemoryClearOption::kNone, /// Used for partial specialization typename Enable = bool> -class Wint2xMmaMultistage : +class Wint2xMmaMultistage : public Wint2xMmaBase { public: ///< Base class diff --git a/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/fuse_block_gemm_act_template_3x.h b/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/fuse_block_gemm_act_template_3x.h index 113ea5bf6..1a5b838b8 100644 --- a/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/fuse_block_gemm_act_template_3x.h +++ b/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/fuse_block_gemm_act_template_3x.h @@ -57,7 +57,7 @@ bool dispatch_fuse_block_gemm_c3x(GemmEpilogueAllParams params){ hasbias, ElementD, void>; - + constexpr int ScaleMsPerTile = size<0>(TileShape{}); constexpr int ScaleGranularityM = size<0>(TileShape{}) / ScaleMsPerTile; @@ -161,7 +161,7 @@ bool dispatch_fuse_block_gemm_c3x(GemmEpilogueAllParams params){ arguments.scheduler.decomposition_mode = DecompositionMode::StreamK; arguments.scheduler.reduction_mode = ReductionMode::Nondeterministic; } - + Gemm gemm_op; diff --git a/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/fuse_dual_gemm_act_template_3x.h b/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/fuse_dual_gemm_act_template_3x.h index 943921e14..632cdc296 100644 --- a/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/fuse_dual_gemm_act_template_3x.h +++ b/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/fuse_dual_gemm_act_template_3x.h @@ -170,4 +170,4 @@ bool dispatch_dual_gemm_act_sm90(DualGemmEpilogueAllParams params) { return false; } return true; -} \ No newline at end of file +} diff --git a/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/fuse_gemm_act_template_3x.h b/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/fuse_gemm_act_template_3x.h index 819463175..c47015107 100644 --- a/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/fuse_gemm_act_template_3x.h +++ b/custom_ops/gpu_ops/cutlass_kernels/fp8_gemm_fused/fuse_gemm_act_template_3x.h @@ -148,4 +148,4 @@ bool dispatch_fuse_gemm_act_sm90(GemmEpilogueAllParams params) { return false; } return true; -} \ No newline at end of file +} diff --git 
a/custom_ops/gpu_ops/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h b/custom_ops/gpu_ops/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h index bf65242d5..6b1ab209e 100644 --- a/custom_ops/gpu_ops/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h +++ b/custom_ops/gpu_ops/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h @@ -54,7 +54,7 @@ public: virtual size_t getWorkspaceSize(int const m, int const n, int const k) = 0; virtual std::vector getConfigs(int k) const = 0; - + protected: static constexpr int SPLIT_K_LIMIT = 7; static constexpr int MIN_M_TILE = 16; diff --git a/custom_ops/gpu_ops/extract_text_token_output.cu b/custom_ops/gpu_ops/extract_text_token_output.cu index 292c67078..ff04a813e 100644 --- a/custom_ops/gpu_ops/extract_text_token_output.cu +++ b/custom_ops/gpu_ops/extract_text_token_output.cu @@ -93,8 +93,8 @@ std::vector ExtractTextTokenOutputInferDtype(const paddle::Dat PD_BUILD_STATIC_OP(extract_text_token_output) .Inputs({"max_seq_len", - "max_seq_len_index", - "mm_token_num_len", + "max_seq_len_index", + "mm_token_num_len", "seq_lens_this_time", "cu_seqlens_q", "score_text"}) diff --git a/custom_ops/gpu_ops/fp8_gemm_with_cutlass/fp8_fp8_half_cuda_core_gemm.cu b/custom_ops/gpu_ops/fp8_gemm_with_cutlass/fp8_fp8_half_cuda_core_gemm.cu index 06295cd62..3e1ce299a 100644 --- a/custom_ops/gpu_ops/fp8_gemm_with_cutlass/fp8_fp8_half_cuda_core_gemm.cu +++ b/custom_ops/gpu_ops/fp8_gemm_with_cutlass/fp8_fp8_half_cuda_core_gemm.cu @@ -105,7 +105,7 @@ __global__ void cudaCoreGemm(InputType const* __restrict__ act, } } } - + __syncthreads(); for (int32_t ii = tid; ii < TILE_M * TILE_N; ii += BLOCK_SIZE) { int32_t mid = ii / TILE_N, nid = ii % TILE_N; @@ -188,4 +188,4 @@ bool cuda_core_gemm_launcher(GemmParams const& params) { template bool cuda_core_gemm_launcher<__nv_fp8_e4m3, __nv_bfloat16>(GemmParams const&); template bool cuda_core_gemm_launcher<__nv_fp8_e4m3, half>(GemmParams const&); template bool cuda_core_gemm_launcher<__nv_fp8_e5m2, __nv_bfloat16>(GemmParams const&); -template bool cuda_core_gemm_launcher<__nv_fp8_e5m2, half>(GemmParams const&); \ No newline at end of file +template bool cuda_core_gemm_launcher<__nv_fp8_e5m2, half>(GemmParams const&); diff --git a/custom_ops/gpu_ops/get_mm_split_fuse.cc b/custom_ops/gpu_ops/get_mm_split_fuse.cc index 7a69d26f2..3d70258d0 100644 --- a/custom_ops/gpu_ops/get_mm_split_fuse.cc +++ b/custom_ops/gpu_ops/get_mm_split_fuse.cc @@ -61,7 +61,7 @@ std::vector GetMmSplitFuse(const paddle::Tensor& task_input_ids, st_idx += cur_st_len; } } - + while (idx < seq_lens_origin) { idx = idx + split_fuse_text_size; if (idx >= seq_lens_origin) { @@ -116,7 +116,7 @@ std::vector GetMmSplitFuse(const paddle::Tensor& task_input_ids, while (ib < img_total && cur_img_len < chunk_image_token_number) { int token_times = 4; cur_img_len += (grid_thw_cpu[ib * 3 + 1] * grid_thw_cpu[ib * 3 + 2]) / token_times; - ib ++; + ib ++; chunk_image_number ++; } image_chunk_selections_vector.emplace_back(chunk_image_number); diff --git a/custom_ops/gpu_ops/ipc_sent_key_value_cache_by_remote_ptr.cu b/custom_ops/gpu_ops/ipc_sent_key_value_cache_by_remote_ptr.cu index 34fc2c16f..21effd59c 100644 --- a/custom_ops/gpu_ops/ipc_sent_key_value_cache_by_remote_ptr.cu +++ b/custom_ops/gpu_ops/ipc_sent_key_value_cache_by_remote_ptr.cu @@ -88,7 +88,7 @@ void sent_key_value_by_remote_ptr( #ifdef DEBUG_IPC_SENT std::cout<<"remote_key_tensor_sent_ptr:"<<(int64_t)remote_key_tensor_sent_ptr <<" local_key_tensor_sent_ptr:"<<(int64_t)local_key_tensor_sent_ptr - <<" local_device_id:" << local_device_id 
+ <<" local_device_id:" << local_device_id <<" remote_device_id:" << remote_device_id <<" block_idx_stride:" << block_idx_stride <<" block_size_byte:" << block_size_byte @@ -107,25 +107,25 @@ void sent_key_value_by_remote_ptr( #endif #ifndef DEBUG_IPC_SENT_SYNC_AND_PRINT cudaMemcpyPeerAsync( - reinterpret_cast(remote_key_tensor_sent_ptr), - remote_device_id, - reinterpret_cast(local_key_tensor_sent_ptr), - local_device_id, - block_size_byte, + reinterpret_cast(remote_key_tensor_sent_ptr), + remote_device_id, + reinterpret_cast(local_key_tensor_sent_ptr), + local_device_id, + block_size_byte, stream); #endif #ifdef DEBUG_IPC_SENT_SYNC_AND_PRINT cudaMemcpyPeer( - reinterpret_cast(remote_key_tensor_sent_ptr), - remote_device_id, - reinterpret_cast(local_key_tensor_sent_ptr), - local_device_id, + reinterpret_cast(remote_key_tensor_sent_ptr), + remote_device_id, + reinterpret_cast(local_key_tensor_sent_ptr), + local_device_id, block_size_byte); #endif cudaError_t err = cudaGetLastError(); if ( err != cudaSuccess ) { - printf("CUDA Error: %s\n", cudaGetErrorString(err)); + printf("CUDA Error: %s\n", cudaGetErrorString(err)); } #ifdef DEBUG_IPC_SENT_SYNC_AND_PRINT cudaDeviceSynchronize(); @@ -140,7 +140,7 @@ void sent_key_value_by_remote_ptr( #ifdef DEBUG_IPC_SENT std::cout<<"remote_value_tensor_sent_ptr:"<<(int64_t)remote_value_tensor_sent_ptr <<" local_value_tensor_sent_ptr:"<<(int64_t)local_value_tensor_sent_ptr - <<" local_device_id:" << local_device_id + <<" local_device_id:" << local_device_id <<" remote_device_id:" << remote_device_id <<" block_idx_stride:" << block_idx_stride <<" block_size_byte:" << block_size_byte @@ -159,26 +159,26 @@ void sent_key_value_by_remote_ptr( #endif #ifndef DEBUG_IPC_SENT_SYNC_AND_PRINT cudaMemcpyPeerAsync( - reinterpret_cast(remote_value_tensor_sent_ptr), - remote_device_id, - reinterpret_cast(local_value_tensor_sent_ptr), - local_device_id, - block_size_byte, + reinterpret_cast(remote_value_tensor_sent_ptr), + remote_device_id, + reinterpret_cast(local_value_tensor_sent_ptr), + local_device_id, + block_size_byte, stream); #endif #ifdef DEBUG_IPC_SENT_SYNC_AND_PRINT cudaMemcpyPeer( - reinterpret_cast(remote_value_tensor_sent_ptr), - remote_device_id, - reinterpret_cast(local_value_tensor_sent_ptr), - local_device_id, + reinterpret_cast(remote_value_tensor_sent_ptr), + remote_device_id, + reinterpret_cast(local_value_tensor_sent_ptr), + local_device_id, block_size_byte); cudaDeviceSynchronize(); #endif err = cudaGetLastError(); if ( err != cudaSuccess ) { - printf("CUDA Error: %s\n", cudaGetErrorString(err)); + printf("CUDA Error: %s\n", cudaGetErrorString(err)); } #ifdef DEBUG_IPC_SENT_SYNC_AND_PRINT PrintMatrix(reinterpret_cast(remote_value_tensor_sent_ptr), @@ -316,11 +316,11 @@ void SentKeyValueByRemotePtrBlockSync(const paddle::Tensor& local_key_tensor, cudaStream_t cuda_stream = (cudaStream_t)cuda_stream_raw; cudaStreamSynchronize(cuda_stream); } - + PD_BUILD_STATIC_OP(ipc_sent_key_value_cache_by_remote_ptr) .Inputs({"local_key_tensor", "local_value_tensor", "local_block_ids", "remote_block_ids", "remote_key_tensor", "remote_value_tensor"}) - .Attrs({ "block_num: int", - "local_device_id: int", + .Attrs({ "block_num: int", + "local_device_id: int", "remote_device_id: int", "cuda_stream_raw: int64_t"}) .Outputs({"local_key_tensor_out", "local_value_tensor_out"}) @@ -332,4 +332,4 @@ PD_BUILD_STATIC_OP(ipc_sent_key_value_cache_by_remote_ptr_block_sync) .Attrs({"cuda_stream_raw: int64_t"}) .Outputs({"local_key_tensor_out", "local_value_tensor_out"}) 
.SetInplaceMap({{"local_key_tensor", "local_key_tensor_out"},{"local_value_tensor","local_value_tensor_out"}}) - .SetKernelFn(PD_KERNEL(SentKeyValueByRemotePtrBlockSync)); \ No newline at end of file + .SetKernelFn(PD_KERNEL(SentKeyValueByRemotePtrBlockSync)); diff --git a/custom_ops/gpu_ops/moe/deepgemm_preprocess.cu b/custom_ops/gpu_ops/moe/deepgemm_preprocess.cu index 64d8c3866..c963bb12e 100644 --- a/custom_ops/gpu_ops/moe/deepgemm_preprocess.cu +++ b/custom_ops/gpu_ops/moe/deepgemm_preprocess.cu @@ -57,5 +57,3 @@ paddle::Tensor count_tokens_per_expert_func(const paddle::Tensor &topk_ids, num_experts); return token_nums_per_expert; } - - diff --git a/custom_ops/gpu_ops/moe/fast_hardamard_kernel.cu b/custom_ops/gpu_ops/moe/fast_hardamard_kernel.cu index 42476a293..66d9f72fe 100644 --- a/custom_ops/gpu_ops/moe/fast_hardamard_kernel.cu +++ b/custom_ops/gpu_ops/moe/fast_hardamard_kernel.cu @@ -737,7 +737,7 @@ void MoeFastHardamardWrapper(const T *x_data, bool FLAGS_hardamard_use_diagonal_block_matrix = true; static const char* FLAGS_hardamard_moe_block_size = std::getenv("FLAGS_hardamard_moe_block_size"); - static const int32_t hardamard_moe_block_size = FLAGS_hardamard_moe_block_size != nullptr ? + static const int32_t hardamard_moe_block_size = FLAGS_hardamard_moe_block_size != nullptr ? stoi(std::string(FLAGS_hardamard_moe_block_size)) : 512; constexpr int kThreads = 128; if (FLAGS_hardamard_use_diagonal_block_matrix) { diff --git a/custom_ops/gpu_ops/moe/fused_moe_imp_op.h b/custom_ops/gpu_ops/moe/fused_moe_imp_op.h index 1078ae218..254f80e67 100644 --- a/custom_ops/gpu_ops/moe/fused_moe_imp_op.h +++ b/custom_ops/gpu_ops/moe/fused_moe_imp_op.h @@ -124,4 +124,4 @@ class CubKeyValueSorter { int num_bits_; }; -} // namespace phi \ No newline at end of file +} // namespace phi diff --git a/custom_ops/gpu_ops/moe/fused_moe_op.h b/custom_ops/gpu_ops/moe/fused_moe_op.h index f46e1523c..09d705d41 100644 --- a/custom_ops/gpu_ops/moe/fused_moe_op.h +++ b/custom_ops/gpu_ops/moe/fused_moe_op.h @@ -360,10 +360,10 @@ __launch_bounds__(TPB) __global__ void moe_softmax_top_k_fused(const T* input, normalizing_factor = 1.f / Z; } __syncthreads(); - + T val = T(threadDataExp * normalizing_factor); - // top_k + // top_k using cub_kvp = cub::KeyValuePair; using BlockReduceP = cub::BlockReduce; __shared__ typename BlockReduceP::TempStorage tmpStorageP; @@ -374,10 +374,10 @@ __launch_bounds__(TPB) __global__ void moe_softmax_top_k_fused(const T* input, for (int k_idx = 0; k_idx < k; ++k_idx) { thread_kvp.key = 0; thread_kvp.value = T(-1.f); // This is OK because inputs are probabilities - + if (threadIdx.x < num_experts) { cub_kvp inp_kvp; - int expert = threadIdx.x; + int expert = threadIdx.x; inp_kvp.key = expert; inp_kvp.value = bias ? val + bias[expert] : val; @@ -518,12 +518,12 @@ __launch_bounds__(TPB) __global__ void moe_softmax_top_k_normed_fused(const T* i if (threadIdx.x == 0) { normalizing_factor = 1.f / Z; } - + __syncthreads(); - + T val = T(threadDataExp * normalizing_factor); - // top_k + // top_k using cub_kvp = cub::KeyValuePair; using BlockReduceP = cub::BlockReduce; __shared__ typename BlockReduceP::TempStorage tmpStorageP; @@ -541,7 +541,7 @@ __launch_bounds__(TPB) __global__ void moe_softmax_top_k_normed_fused(const T* i if (threadIdx.x < num_experts) { cub_kvp inp_kvp; - int expert = threadIdx.x; + int expert = threadIdx.x; inp_kvp.key = expert; inp_kvp.value = bias ? 
val + bias[expert] : val; @@ -1065,7 +1065,7 @@ __global__ void initialize_moe_routing_kernel( const T* unpermuted_input, OutT* permuted_output, const int* expanded_dest_row_to_expanded_source_row, - const int *expert_idx_per_token, + const int *expert_idx_per_token, const float *w4a8_in_scale, int* expanded_source_row_to_expanded_dest_row, const int64_t num_rows, @@ -1088,7 +1088,7 @@ __global__ void initialize_moe_routing_kernel( expanded_source_row_to_expanded_dest_row[expanded_source_row] = expanded_dest_row; } - + if (expanded_dest_row < active_rows) { const int expert_idx = expert_idx_per_token[expanded_dest_row]; @@ -1130,7 +1130,7 @@ static void run( const T* unpermuted_input, OutT* permuted_output, const int* expanded_dest_row_to_expanded_source_row, - const int *expert_idx_per_token, + const int *expert_idx_per_token, const float *w4a8_in_scale, int* expanded_source_row_to_expanded_dest_row, const int64_t num_rows, diff --git a/custom_ops/gpu_ops/moe/moe_deepgemm_permute.cu b/custom_ops/gpu_ops/moe/moe_deepgemm_permute.cu index 9b4182c7d..ec44a5bfc 100644 --- a/custom_ops/gpu_ops/moe/moe_deepgemm_permute.cu +++ b/custom_ops/gpu_ops/moe/moe_deepgemm_permute.cu @@ -17,7 +17,7 @@ // topk warps template __global__ void MoEDeepGEMMPermuteKernel(T* out, int* token_nums_per_expert, int* permute_indices_per_token, const T* x, const int64_t* topk_idx, const int token_num, const int topk, const int num_vecs, const int hidden, const int max_tokens_per_expert) { - + AlignedVector in_vec; const int bid = blockIdx.x; @@ -32,7 +32,7 @@ __global__ void MoEDeepGEMMPermuteKernel(T* out, int* token_nums_per_expert, int } tgt_expert_token = __shfl_sync(0xFFFFFFFF, tgt_expert_token, 0); - + for (int hidden_vec_id = tid; hidden_vec_id < num_vecs; hidden_vec_id += 32) { Load(x + token_idx * hidden + hidden_vec_id * VecSize, &in_vec); Store(in_vec, out + tgt_expert_id * max_tokens_per_expert * hidden + tgt_expert_token * hidden + hidden_vec_id * VecSize); @@ -81,7 +81,7 @@ std::vector MoEDeepGEMMPermuteDispatch( permute_indices_per_token.data(), reinterpret_cast(x.data()), topk_idx.data(), - token_num, topk, num_vecs, + token_num, topk, num_vecs, hidden, max_tokens_per_expert ); @@ -112,4 +112,4 @@ PD_BUILD_STATIC_OP(moe_deepgemm_permute) .Inputs({"x", "topk_idx"}) .Outputs({"permute_output", "token_nums_per_expert", "permute_indices_per_token"}) .Attrs({"num_experts: int", "max_tokens_per_expert: int"}) - .SetKernelFn(PD_KERNEL(MoEDeepGEMMPermute)); \ No newline at end of file + .SetKernelFn(PD_KERNEL(MoEDeepGEMMPermute)); diff --git a/custom_ops/gpu_ops/moe/moe_dispatch.cu b/custom_ops/gpu_ops/moe/moe_dispatch.cu index dedd5fbdd..7ae20e0ae 100644 --- a/custom_ops/gpu_ops/moe/moe_dispatch.cu +++ b/custom_ops/gpu_ops/moe/moe_dispatch.cu @@ -232,12 +232,12 @@ MoeExpertDispatchInferDtype(const paddle::DataType &input_dtype, /** * @brief Mixture of Experts (MoE) Expert Dispatch Operator - * + * * This operator performs the following key functions: * 1. Computes top-k experts for each input token based on gating scores * 2. Permutes input tokens according to their selected experts for efficient expert processing * 3. 
Computes prefix sums of tokens per expert for group_gemm optimization - * + * * Inputs: * - input: The input tensor to be routed to experts * Shape: [total_tokens, hidden_size] @@ -246,7 +246,7 @@ MoeExpertDispatchInferDtype(const paddle::DataType &input_dtype, * Shape: [total_tokens, expert_num] * dtype: must be float32 * - gating_correction_bias: Optional bias term for gating correction (expert_num) - * + * * Outputs: * - permute_input: Permuted input tensor organized by expert * Shape: [moe_topk * total_tokens, hidden_size] @@ -263,7 +263,7 @@ MoeExpertDispatchInferDtype(const paddle::DataType &input_dtype, * - top_k_indices: Indices of selected top-k experts for each token * Shape: [total_tokens, moe_topk] * dtype: int32 - * + * * Attributes: * - moe_topk: Number of experts to select for each token (k value in top-k routing) * - group_moe: Whether to perform group softmax within the operator @@ -272,7 +272,7 @@ MoeExpertDispatchInferDtype(const paddle::DataType &input_dtype, * - topk_only_mode: Operation mode selector * (true: only performs topk selection without softmax, * false: performs full softmax+topk computation) - * + * * Note: * - The operator requires 2D input format [total_tokens, hidden_size] * - For optimal performance, expert_num should be a power of 2 when possible @@ -283,7 +283,7 @@ PD_BUILD_STATIC_OP(moe_expert_dispatch) paddle::Optional("gating_correction_bias"), paddle::Optional("w4a8_in_scale")}) .Outputs({"permute_input", "tokens_expert_prefix_sum", - "permute_indices_per_token", "topk_weight", "topk_idx", + "permute_indices_per_token", "topk_weight", "topk_idx", "expert_idx_per_token"}) .Attrs({"moe_topk:int", "group_moe:bool", "topk_only_mode:bool"}) .SetKernelFn(PD_KERNEL(MoeExpertDispatch)) diff --git a/custom_ops/gpu_ops/moe/moe_redundant_topk_select.cu b/custom_ops/gpu_ops/moe/moe_redundant_topk_select.cu index ba939ec2d..a53cb0a95 100644 --- a/custom_ops/gpu_ops/moe/moe_redundant_topk_select.cu +++ b/custom_ops/gpu_ops/moe/moe_redundant_topk_select.cu @@ -263,4 +263,4 @@ PD_BUILD_OP(moe_redundant_topk_select) .SetInplaceMap({{"tokens_per_expert_stats_list", "tokens_per_expert_stats_list_out"}}) .SetKernelFn(PD_KERNEL(MoERedundantTopKSelectKernel)) .SetInferShapeFn(PD_INFER_SHAPE(MoERedundantTopKSelectKernelInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(MoERedundantTopKSelectKernelInferDtype)); \ No newline at end of file + .SetInferDtypeFn(PD_INFER_DTYPE(MoERedundantTopKSelectKernelInferDtype)); diff --git a/custom_ops/gpu_ops/moe/moe_wna16_marlin_utils/kernel_fp16_ku4b8.cu b/custom_ops/gpu_ops/moe/moe_wna16_marlin_utils/kernel_fp16_ku4b8.cu index 68d756a1a..b45f36947 100644 --- a/custom_ops/gpu_ops/moe/moe_wna16_marlin_utils/kernel_fp16_ku4b8.cu +++ b/custom_ops/gpu_ops/moe/moe_wna16_marlin_utils/kernel_fp16_ku4b8.cu @@ -106,4 +106,4 @@ template __global__ void Marlin( MARLIN_KERNEL_PARAMS ); -} \ No newline at end of file +} diff --git a/custom_ops/gpu_ops/msg_utils.h b/custom_ops/gpu_ops/msg_utils.h index e3ca0f646..ff46ccb00 100644 --- a/custom_ops/gpu_ops/msg_utils.h +++ b/custom_ops/gpu_ops/msg_utils.h @@ -36,4 +36,4 @@ struct msgdata { struct msgdatakv { long mtype; int mtext[MAX_BSZ * 3 + 2]; // encoder_count, layer_id, bid- pair -}; \ No newline at end of file +}; diff --git a/custom_ops/gpu_ops/read_ids.py b/custom_ops/gpu_ops/read_ids.py index 560c9758e..d84c54b4d 100644 --- a/custom_ops/gpu_ops/read_ids.py +++ b/custom_ops/gpu_ops/read_ids.py @@ -14,9 +14,10 @@ """read_ids""" import os -import numpy as np import struct +import numpy as np + def 
deserialize_from_file(fp): """deserialize from file""" diff --git a/custom_ops/gpu_ops/read_temp_ids.py b/custom_ops/gpu_ops/read_temp_ids.py index 65c49a719..585bd900c 100644 --- a/custom_ops/gpu_ops/read_temp_ids.py +++ b/custom_ops/gpu_ops/read_temp_ids.py @@ -13,9 +13,10 @@ # limitations under the License. """read temp_ids from file""" import os -import numpy as np import struct +import numpy as np + def deserialize_from_file(fp): """ diff --git a/custom_ops/gpu_ops/remote_cache_kv_ipc.cc b/custom_ops/gpu_ops/remote_cache_kv_ipc.cc index edbacd5d6..f1f53513b 100644 --- a/custom_ops/gpu_ops/remote_cache_kv_ipc.cc +++ b/custom_ops/gpu_ops/remote_cache_kv_ipc.cc @@ -15,7 +15,7 @@ #include "remote_cache_kv_ipc.h" RemoteCacheKvIpc::save_cache_kv_complete_signal_layerwise_meta_data RemoteCacheKvIpc::kv_complete_signal_meta_data; -RemoteCacheKvIpc::save_cache_kv_complete_signal_layerwise_meta_data_per_query +RemoteCacheKvIpc::save_cache_kv_complete_signal_layerwise_meta_data_per_query RemoteCacheKvIpc::kv_complete_signal_meta_data_per_query; void* RemoteCacheKvIpc::kv_complete_signal_identity_ptr = nullptr; bool RemoteCacheKvIpc::kv_complete_signal_shmem_opened = false; @@ -118,4 +118,3 @@ void CUDART_CB RemoteCacheKvIpc::save_cache_kv_complete_signal_layerwise_per_que RemoteCacheKvIpc::kv_complete_signal_meta_data_per_query.send_signal(); // std::printf("#### save_cache_kv_complete_signal_layerwise_per_query); } - diff --git a/custom_ops/gpu_ops/remote_cache_kv_ipc.h b/custom_ops/gpu_ops/remote_cache_kv_ipc.h index 4694e0b39..3c09af1e4 100644 --- a/custom_ops/gpu_ops/remote_cache_kv_ipc.h +++ b/custom_ops/gpu_ops/remote_cache_kv_ipc.h @@ -71,7 +71,7 @@ struct RemoteCacheKvIpc { } } msg_sed.mtext[0] = encoder_count; - + if (!inited) { // just init once const int msg_id = 1024 + rank; @@ -90,7 +90,7 @@ struct RemoteCacheKvIpc { assert(layer_id_ <= num_layers_); } }; - + static RemoteCacheKvIpc::save_cache_kv_complete_signal_layerwise_meta_data kv_complete_signal_meta_data; static RemoteCacheKvIpc::save_cache_kv_complete_signal_layerwise_meta_data_per_query kv_complete_signal_meta_data_per_query; static void* kv_complete_signal_identity_ptr; diff --git a/custom_ops/gpu_ops/scaled_gemm_f8_i4_f16_weight_quantize.cu b/custom_ops/gpu_ops/scaled_gemm_f8_i4_f16_weight_quantize.cu index 3e30db4a3..88b985b45 100644 --- a/custom_ops/gpu_ops/scaled_gemm_f8_i4_f16_weight_quantize.cu +++ b/custom_ops/gpu_ops/scaled_gemm_f8_i4_f16_weight_quantize.cu @@ -125,7 +125,7 @@ void group_wise_scale(ScaleT* scale, } } -std::vector Fp8Int4WeightQuantizeKernel(const paddle::Tensor &input, +std::vector Fp8Int4WeightQuantizeKernel(const paddle::Tensor &input, int groupsize, std::string scale_dtype) { auto input_cpu = input.copy_to(paddle::CPUPlace(), false); @@ -139,47 +139,47 @@ std::vector Fp8Int4WeightQuantizeKernel(const paddle::Tensor &in if (groupsize > 0) { scale = paddle::full({shape[0] / groupsize * shape[1]}, 1.0, paddle::DataType::BFLOAT16, paddle::CPUPlace()); group_wise_scale(scale.data(), input_cpu.data(), k, n, 7.0f, groupsize); - group_wise_quant(packed_int4.data(), - input_cpu.data(), - scale.data(), - k, + group_wise_quant(packed_int4.data(), + input_cpu.data(), + scale.data(), + k, n, groupsize); } else { scale = paddle::full({shape[1]}, 1.0, paddle::DataType::BFLOAT16, paddle::CPUPlace()); per_channel_scale(scale.data(), input_cpu.data(), k, n, 7.0f); - per_channel_quant(packed_int4.data(), - input_cpu.data(), - scale.data(), - k, + per_channel_quant(packed_int4.data(), + input_cpu.data(), + 
scale.data(), + k, n); } } else if (scale_dtype == "float16") { if (groupsize > 0) { - scale = paddle::full({shape[0] / groupsize * shape[1]}, 1.0, paddle::DataType::FLOAT16, paddle::CPUPlace()); + scale = paddle::full({shape[0] / groupsize * shape[1]}, 1.0, paddle::DataType::FLOAT16, paddle::CPUPlace()); group_wise_scale(scale.data(), input_cpu.data(), k, n, 7.0f, groupsize); - group_wise_quant(packed_int4.data(), - input_cpu.data(), - scale.data(), - k, + group_wise_quant(packed_int4.data(), + input_cpu.data(), + scale.data(), + k, n, groupsize); } else { - scale = paddle::full({shape[1]}, 1.0, paddle::DataType::FLOAT16, paddle::CPUPlace()); + scale = paddle::full({shape[1]}, 1.0, paddle::DataType::FLOAT16, paddle::CPUPlace()); per_channel_scale(scale.data(), input_cpu.data(), k, n, 7.0f); - per_channel_quant(packed_int4.data(), - input_cpu.data(), - scale.data(), - k, + per_channel_quant(packed_int4.data(), + input_cpu.data(), + scale.data(), + k, n); } } auto out = paddle::full({shape[1] / 2, shape[0]}, 0, paddle::DataType::INT8, paddle::CPUPlace()); preprocess_weights_for_mixed_gemm( - out.data(), - packed_int4.data(), - {k, n}, + out.data(), + packed_int4.data(), + {k, n}, kernels::cutlass_kernels::QuantType::W4_AFP8, false); return {out, scale}; diff --git a/custom_ops/gpu_ops/share_external_data.cu b/custom_ops/gpu_ops/share_external_data.cu index 194a66795..8b204ccc3 100644 --- a/custom_ops/gpu_ops/share_external_data.cu +++ b/custom_ops/gpu_ops/share_external_data.cu @@ -1,11 +1,11 @@ // Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at -// +// // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -27,7 +27,7 @@ std::vector ShareExternalData(paddle::Tensor& input, const std::string shm_name, - const std::vector& shape) { + const std::vector& shape) { volatile shmStruct *shm = NULL; sharedMemoryInfo info; if (sharedMemoryOpen(shm_name.c_str(), sizeof(shmStruct), &info) != 0) { @@ -62,4 +62,4 @@ PD_BUILD_STATIC_OP(share_external_data) .Inputs({"input"}) .Outputs({"output"}) .Attrs({"shm_name: std::string", "shape: std::vector"}) - .SetKernelFn(PD_KERNEL(ShareExternalData)); \ No newline at end of file + .SetKernelFn(PD_KERNEL(ShareExternalData)); diff --git a/custom_ops/gpu_ops/speculate_decoding/draft_model/eagle_get_base_model_hidden_states.cu b/custom_ops/gpu_ops/speculate_decoding/draft_model/eagle_get_base_model_hidden_states.cu index dcc9337f0..97d900319 100644 --- a/custom_ops/gpu_ops/speculate_decoding/draft_model/eagle_get_base_model_hidden_states.cu +++ b/custom_ops/gpu_ops/speculate_decoding/draft_model/eagle_get_base_model_hidden_states.cu @@ -19,7 +19,7 @@ // #define DEBUG_EAGLE_KERNEL __global__ void ComputeOrderKernel( - const int* seq_lens_this_time, + const int* seq_lens_this_time, const int* seq_lens_encoder, const int* base_model_seq_lens_this_time, const int* base_model_seq_lens_encoder, @@ -47,7 +47,7 @@ __global__ void ComputeOrderKernel( printf("batch %d: cur_seq_lens_encoder > 0 \n", i); #endif for (int j = 0; j < cur_seq_lens_encoder; j++) { - position_map[in_offset++] = out_offset++; + position_map[in_offset++] = out_offset++; } // 2. 
base model encoder. Base step=0 } else if (cur_base_model_seq_lens_encoder != 0) { @@ -69,13 +69,13 @@ __global__ void ComputeOrderKernel( in_offset += cur_base_model_seq_lens_this_time; } else /*Accept all draft tokens*/ { #ifdef DEBUG_EAGLE_KERNEL - printf("batch %d: accept_num > actual_draft_token_num \n", i); + printf("batch %d: accept_num > actual_draft_token_num \n", i); #endif position_map[in_offset + accept_num - 2] = out_offset++; position_map[in_offset + accept_num - 1] = out_offset++; in_offset += cur_base_model_seq_lens_this_time; } - } + } } output_token_num[0] = out_offset; #ifdef DEBUG_EAGLE_KERNEL @@ -208,7 +208,7 @@ std::vector EagleGetHiddenStates( } case paddle::DataType::BFLOAT16: { return DispatchDtype( - input, + input, seq_lens_this_time, seq_lens_encoder, seq_lens_decoder, diff --git a/custom_ops/gpu_ops/speculate_decoding/draft_model/eagle_get_self_hidden_states.cu b/custom_ops/gpu_ops/speculate_decoding/draft_model/eagle_get_self_hidden_states.cu index f440c43c6..878926f3b 100644 --- a/custom_ops/gpu_ops/speculate_decoding/draft_model/eagle_get_self_hidden_states.cu +++ b/custom_ops/gpu_ops/speculate_decoding/draft_model/eagle_get_self_hidden_states.cu @@ -72,7 +72,7 @@ __global__ void computeOrderKernel( output_token_num[0] = out_offset; #ifdef DEBUG_EAGLE_KERNEL printf("position map output_token_num%d:\n", output_token_num[0]); - for (int i = 0; i < output_token_num[0]; i++) { + for (int i = 0; i < output_token_num[0]; i++) { printf("%d ", src_map[i]); } printf("\n"); @@ -187,4 +187,4 @@ PD_BUILD_STATIC_OP(eagle_get_self_hidden_states) "seq_lens_this_time", "step_idx"}) .Outputs({"out"}) - .SetKernelFn(PD_KERNEL(EagleGetSelfHiddenStates)); \ No newline at end of file + .SetKernelFn(PD_KERNEL(EagleGetSelfHiddenStates)); diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_rebuild_append_padding.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_rebuild_append_padding.cu index 48c24a0e0..d4937116c 100644 --- a/custom_ops/gpu_ops/speculate_decoding/speculate_rebuild_append_padding.cu +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_rebuild_append_padding.cu @@ -26,7 +26,7 @@ __global__ void RebuildAppendPaddingKernel( const int seq_len, const int dim_embed, const size_t elem_nums) { - using LoadT = AlignedVector; + using LoadT = AlignedVector; LoadT src_vec; const int64_t global_idx = blockDim.x * blockIdx.x + threadIdx.x; for (int64_t i = global_idx * VecSize; i < elem_nums; i += gridDim.x * blockDim.x * VecSize) { @@ -42,7 +42,7 @@ __global__ void RebuildAppendPaddingKernel( const int input_token_id = ori_token_id - cum_offset[bi] + seq_id; const int bias_idx = i % dim_embed; - + Load(&full_hidden_states[input_token_id * dim_embed + bias_idx], &src_vec); Store(src_vec, &out[i]); } @@ -78,14 +78,14 @@ std::vector DispatchDtype( GetNumBlocks(pack_num, &grid_size); RebuildAppendPaddingKernel<<>>( - reinterpret_cast(out.data()), - reinterpret_cast(full_hidden_states.data()), - cum_offsets.data(), - seq_len_encoder.data(), - seq_len_decoder.data(), - output_padding_offset.data(), - max_seq_len, - dim_embed, + reinterpret_cast(out.data()), + reinterpret_cast(full_hidden_states.data()), + cum_offsets.data(), + seq_len_encoder.data(), + seq_len_decoder.data(), + output_padding_offset.data(), + max_seq_len, + dim_embed, elem_nums); return {out}; } @@ -99,7 +99,7 @@ std::vector RebuildAppendPadding( const paddle::Tensor& output_padding_offset, const int max_seq_len) { - + switch (full_hidden_states.dtype()) { case paddle::DataType::BFLOAT16: return 
DispatchDtype( @@ -137,7 +137,7 @@ std::vector RebuildAppendPaddingInferDtype( PD_BUILD_STATIC_OP(speculate_rebuild_append_padding) - .Inputs({"full_hidden_states", + .Inputs({"full_hidden_states", "cum_offsets", "seq_len_encoder", "seq_len_decoder", @@ -146,4 +146,4 @@ PD_BUILD_STATIC_OP(speculate_rebuild_append_padding) .Outputs({"out"}) .SetKernelFn(PD_KERNEL(RebuildAppendPadding)) .SetInferShapeFn(PD_INFER_SHAPE(RebuildAppendPaddingInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(RebuildAppendPaddingInferDtype)); \ No newline at end of file + .SetInferDtypeFn(PD_INFER_DTYPE(RebuildAppendPaddingInferDtype)); diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_step_reschedule.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_step_reschedule.cu index bd18bdd6b..baf1da9e1 100644 --- a/custom_ops/gpu_ops/speculate_decoding/speculate_step_reschedule.cu +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_step_reschedule.cu @@ -93,7 +93,7 @@ __global__ void speculate_free_and_reschedule(bool *stop_flags, used_list_len[tid] = 0; } } else if (seq_lens_this_time[tid] != 0 && max_possible_block_idx < block_num_per_seq && - block_table_now[(seq_lens_decoder[tid] + max_draft_tokens + + block_table_now[(seq_lens_decoder[tid] + max_draft_tokens + 1) / block_size] == -1) { // 统计需要分配block的位置和总数 @@ -347,7 +347,7 @@ PD_BUILD_STATIC_OP(speculate_step_reschedule) "next_tokens", "first_token_ids", "accept_num"}) - .Attrs({"block_size: int", + .Attrs({"block_size: int", "encoder_decoder_block_num: int", "max_draft_tokens: int"}) .Outputs({"stop_flags_out", diff --git a/custom_ops/gpu_ops/step_system_cache.cu b/custom_ops/gpu_ops/step_system_cache.cu index a432110af..4b236bd80 100644 --- a/custom_ops/gpu_ops/step_system_cache.cu +++ b/custom_ops/gpu_ops/step_system_cache.cu @@ -60,7 +60,7 @@ __global__ void recover_block_system_cache(int *recover_block_list, // [bsz] const int ori_free_list_len_tid0 = atomicSub(free_list_len, decoder_used_len); ori_free_list_len = ori_free_list_len_tid0; #ifdef DEBUG_STEP - printf("seq_id: %d, ori_seq_len_encoder: %d, step_idx_now: %d, seq_len: %d, ori_free_list_len_tid0: %d, ori_free_list_len: %d\n", + printf("seq_id: %d, ori_seq_len_encoder: %d, step_idx_now: %d, seq_len: %d, ori_free_list_len_tid0: %d, ori_free_list_len: %d\n", recover_id, ori_seq_len_encoder, step_idx_now, seq_len, ori_free_list_len_tid0, ori_free_list_len); #endif } @@ -95,7 +95,7 @@ void StepSystemCache(const paddle::Tensor& stop_flags, const paddle::Tensor& recover_lens, const paddle::Tensor& need_block_list, const paddle::Tensor& need_block_len, - const paddle::Tensor& used_list_len, + const paddle::Tensor& used_list_len, const paddle::Tensor& free_list, const paddle::Tensor& free_list_len, const paddle::Tensor& input_ids, @@ -178,7 +178,7 @@ void StepSystemCache(const paddle::Tensor& stop_flags, } PD_BUILD_STATIC_OP(step_system_cache) - .Inputs({"stop_flags", + .Inputs({"stop_flags", "seq_lens_this_time", "ori_seq_lens_encoder", "ori_seq_lens_decoder", diff --git a/custom_ops/gpu_ops/swap_cache.cu b/custom_ops/gpu_ops/swap_cache.cu index 6ccdaab43..a25d08886 100644 --- a/custom_ops/gpu_ops/swap_cache.cu +++ b/custom_ops/gpu_ops/swap_cache.cu @@ -68,26 +68,26 @@ void SwapCache(const paddle::Tensor& cache_gpu, // gpu switch (cache_gpu.dtype()) { case paddle::DataType::BFLOAT16: return SwapCacheImpl( - cache_gpu, - cache_cpu_ptr, + cache_gpu, + cache_cpu_ptr, max_block_num_cpu, - swap_block_ids_gpu, + swap_block_ids_gpu, swap_block_ids_cpu, mode); case paddle::DataType::FLOAT16: return 
SwapCacheImpl( - cache_gpu, - cache_cpu_ptr, + cache_gpu, + cache_cpu_ptr, max_block_num_cpu, - swap_block_ids_gpu, + swap_block_ids_gpu, swap_block_ids_cpu, mode); case paddle::DataType::UINT8: return SwapCacheImpl( - cache_gpu, - cache_cpu_ptr, + cache_gpu, + cache_cpu_ptr, max_block_num_cpu, - swap_block_ids_gpu, + swap_block_ids_gpu, swap_block_ids_cpu, mode); default: diff --git a/custom_ops/gpu_ops/text_image_gather_scatter.cu b/custom_ops/gpu_ops/text_image_gather_scatter.cu index 6bcd92263..09fc07f96 100644 --- a/custom_ops/gpu_ops/text_image_gather_scatter.cu +++ b/custom_ops/gpu_ops/text_image_gather_scatter.cu @@ -47,7 +47,7 @@ inline cudaError_t GetGridSize(int64_t n, int block_size, int num_waves, int* nu template __global__ void text_image_scatter_kernel( - T* input_ptr, + T* input_ptr, T* text_gather_ptr, T* image_gather_ptr, int32_t* token_type_ids, @@ -72,8 +72,8 @@ __global__ void text_image_scatter_kernel( int32_t token_type_ids_num = token_type_ids[token_idx]; int64_t input_load_offset = token_idx * hidden_size + hidden_offset; - - Load(input_ptr + input_load_offset, &input_ptr_vec); + + Load(input_ptr + input_load_offset, &input_ptr_vec); #pragma unroll for(int vi = 0; vi < VecSize; ++vi) { text_imgaes_vec[vi] = input_ptr_vec[vi]; @@ -92,7 +92,7 @@ __global__ void text_image_scatter_kernel( template __global__ void text_image_gather_kernel( - T* output_ptr, + T* output_ptr, T* text_gather_ptr, T* image_gather_ptr, int32_t* token_type_ids, @@ -131,8 +131,8 @@ __global__ void text_image_gather_kernel( } int64_t input_load_offset = token_idx * hidden_size + hidden_offset; - - Store(output_ptr_vec, output_ptr + input_load_offset); + + Store(output_ptr_vec, output_ptr + input_load_offset); } } @@ -159,7 +159,7 @@ void LaunchTextImageGatherScatter( const int64_t tot_element_num = token_num * hidden_size; int64_t tot_pack_num = (tot_element_num + VecSize - 1) / VecSize; - + const int block_size = 128; int grid_index = (token_num + block_size - 1) / block_size; constexpr int32_t kNumWaves = 16; @@ -170,8 +170,8 @@ void LaunchTextImageGatherScatter( if (is_scatter) { text_image_scatter_kernel<<>>( reinterpret_cast(input.data()), - reinterpret_cast(text_input.data()), - reinterpret_cast(image_input.data()), + reinterpret_cast(text_input.data()), + reinterpret_cast(image_input.data()), reinterpret_cast(token_type_ids.data()), reinterpret_cast(text_index.data()), reinterpret_cast(image_index.data()), @@ -181,8 +181,8 @@ void LaunchTextImageGatherScatter( } else { text_image_gather_kernel<<>>( reinterpret_cast(input.data()), - reinterpret_cast(text_input.data()), - reinterpret_cast(image_input.data()), + reinterpret_cast(text_input.data()), + reinterpret_cast(image_input.data()), reinterpret_cast(token_type_ids.data()), reinterpret_cast(text_index.data()), reinterpret_cast(image_index.data()), @@ -216,8 +216,8 @@ void TextImageGatherScatter( PD_BUILD_STATIC_OP(text_image_gather_scatter) .Inputs({"input", - "text_input", - "image_input", + "text_input", + "image_input", "token_type_ids", "text_index", "image_index"}) @@ -229,5 +229,5 @@ PD_BUILD_STATIC_OP(text_image_gather_scatter) .SetInplaceMap({{"text_input", "text_input_out"}, {"image_input", "image_input_out"}, {"text_index", "text_index_out"}, - {"image_index", "image_index_out"}}) + {"image_index", "image_index_out"}}) .SetKernelFn(PD_KERNEL(TextImageGatherScatter)); diff --git a/custom_ops/gpu_ops/text_image_index_out.cu b/custom_ops/gpu_ops/text_image_index_out.cu index 4140e2742..b6d8941d6 100644 --- 
a/custom_ops/gpu_ops/text_image_index_out.cu +++ b/custom_ops/gpu_ops/text_image_index_out.cu @@ -16,7 +16,7 @@ template __global__ void text_image_index_out_kernel( - int32_t* token_type_ids, + int32_t* token_type_ids, int32_t* text_index, int32_t* image_index, const int64_t token_num @@ -25,7 +25,7 @@ __global__ void text_image_index_out_kernel( if (global_thread_idx >= 1) return; int text_count = 0; int images_count = 0; - + for (int i = 0; i < token_num; ++i) { // printf(" %d %d %d %d \n", text_index[i], text_count, images_count, i); if (token_type_ids[i] == 0) { @@ -60,5 +60,5 @@ PD_BUILD_STATIC_OP(text_image_index_out) .Outputs({"text_index_out", "image_index_out"}) .SetInplaceMap({{"text_index", "text_index_out"}, - {"image_index", "image_index_out"}}) + {"image_index", "image_index_out"}}) .SetKernelFn(PD_KERNEL(TextImageIndexOut)); diff --git a/custom_ops/gpu_ops/tune_cublaslt_gemm.cu b/custom_ops/gpu_ops/tune_cublaslt_gemm.cu index fab6976bc..428d56364 100644 --- a/custom_ops/gpu_ops/tune_cublaslt_gemm.cu +++ b/custom_ops/gpu_ops/tune_cublaslt_gemm.cu @@ -810,4 +810,4 @@ PD_BUILD_STATIC_OP(tune_cublaslt_gemm) "is_test: bool", "is_read_from_file: bool", "path: std::string"}) - .SetKernelFn(PD_KERNEL(TuneCublasltGemm)); \ No newline at end of file + .SetKernelFn(PD_KERNEL(TuneCublasltGemm)); diff --git a/custom_ops/gpu_ops/update_inputs_beam.cu b/custom_ops/gpu_ops/update_inputs_beam.cu index 74d4c2b53..aea374661 100644 --- a/custom_ops/gpu_ops/update_inputs_beam.cu +++ b/custom_ops/gpu_ops/update_inputs_beam.cu @@ -33,7 +33,7 @@ __global__ void update_inputs_beam_kernel( if (block_idx == 0) { seq_lens_this_time[thread_idx] = seq_lens_this_time[bsz_index]; seq_lens_encoder[thread_idx] = seq_lens_encoder[bsz_index]; - } + } if (block_idx < seq_len) { input_ids[thread_idx * seq_len + block_idx] = input_ids[bsz_index * seq_len + block_idx]; } @@ -74,8 +74,8 @@ void UpdateInputesBeam( PD_BUILD_STATIC_OP(update_inputs_beam) .Inputs({"beam_width", - "seq_lens_this_time", - "seq_lens_encoder", + "seq_lens_this_time", + "seq_lens_encoder", "input_ids", "logits"}) .Outputs({"seq_lens_this_time_out", @@ -86,4 +86,4 @@ PD_BUILD_STATIC_OP(update_inputs_beam) {"seq_lens_encoder", "seq_lens_encoder_out"}, {"input_ids", "input_ids_out"}, {"logits", "logits_out"}}) - .SetKernelFn(PD_KERNEL(UpdateInputesBeam)); \ No newline at end of file + .SetKernelFn(PD_KERNEL(UpdateInputesBeam)); diff --git a/custom_ops/setup_ops.py b/custom_ops/setup_ops.py index c002beeb6..de49ab4ea 100644 --- a/custom_ops/setup_ops.py +++ b/custom_ops/setup_ops.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" setup for FastDeploy custom ops """ +"""setup for FastDeploy custom ops""" import importlib import json import os @@ -41,8 +41,7 @@ ROOT_DIR = Path(__file__).parent.parent # cannot import envs directly because it depends on fastdeploy, # which is not installed yet -envs = load_module_from_path('envs', - os.path.join(ROOT_DIR, 'fastdeploy', 'envs.py')) +envs = load_module_from_path("envs", os.path.join(ROOT_DIR, "fastdeploy", "envs.py")) archs = json.loads(envs.FD_BUILDING_ARCS) use_bf16 = envs.FD_CPU_USE_BF16 == "True" @@ -143,8 +142,7 @@ def get_nvcc_version(): """ Get cuda version of nvcc. 
""" - nvcc_output = subprocess.check_output(["nvcc", "--version"], - universal_newlines=True) + nvcc_output = subprocess.check_output(["nvcc", "--version"], universal_newlines=True) output = nvcc_output.split() release_idx = output.index("release") + 1 nvcc_cuda_version = float(output[release_idx].split(",")[0]) @@ -160,13 +158,19 @@ def get_gencode_flags(archs): for cc_val in cc_s: if cc_val == 90: arch_code = "90a" - flags += ["-gencode", f"arch=compute_{arch_code},code=sm_{arch_code}"] - elif cc_val == 100: # Assuming 100 is the code for Blackwell SM10.x + flags += [ + "-gencode", + f"arch=compute_{arch_code},code=sm_{arch_code}", + ] + elif cc_val == 100: # Assuming 100 is the code for Blackwell SM10.x # Per NVIDIA dev blog, for CUTLASS and architecture-specific features on CC 10.0, use '100a' # https://developer.nvidia.com/blog/nvidia-blackwell-and-nvidia-cuda-12-9-introduce-family-specific-architecture-features/ # "The CUTLASS build instructions specify using the a flag when building for devices of CC 9.0 and 10.0" arch_code = "100a" - flags += ["-gencode", f"arch=compute_{arch_code},code=sm_{arch_code}"] + flags += [ + "-gencode", + f"arch=compute_{arch_code},code=sm_{arch_code}", + ] else: flags += ["-gencode", f"arch=compute_{cc_val},code=sm_{cc_val}"] return flags @@ -194,7 +198,7 @@ if paddle.is_compiled_with_rocm(): clone_git_repo("v3.11.3", "https://bgithub.xyz/nlohmann/json.git", json_dir) if not os.listdir(json_dir): raise ValueError("Git clone nlohmann_json failed!") - sources=[ + sources = [ "gpu_ops/set_value_by_flags.cu", "gpu_ops/token_penalty_multi_scores.cu", "gpu_ops/stop_generation.cu", @@ -302,8 +306,7 @@ elif paddle.is_compiled_with_cuda(): if not os.path.exists(cutlass_dir) or not os.listdir(cutlass_dir): if not os.path.exists(cutlass_dir): os.makedirs(cutlass_dir) - clone_git_repo("v3.8.0", "https://github.com/NVIDIA/cutlass.git", - cutlass_dir) + clone_git_repo("v3.8.0", "https://github.com/NVIDIA/cutlass.git", cutlass_dir) if not os.listdir(cutlass_dir): raise ValueError("Git clone cutlass failed!") @@ -312,8 +315,7 @@ elif paddle.is_compiled_with_cuda(): if not os.path.exists(deep_gemm_dir) or not os.listdir(deep_gemm_dir): if not os.path.exists(deep_gemm_dir): os.makedirs(deep_gemm_dir) - clone_git_repo("main", "https://github.com/deepseek-ai/DeepGEMM.git", - deep_gemm_dir) + clone_git_repo("main", "https://github.com/deepseek-ai/DeepGEMM.git", deep_gemm_dir) if not os.listdir(deep_gemm_dir): raise ValueError("Git clone DeepGEMM failed!") cur_path = os.path.dirname(os.path.abspath(__file__)) @@ -347,15 +349,13 @@ elif paddle.is_compiled_with_cuda(): try: shutil.copytree(src_dir, dst_dir) except Exception as e: - raise RuntimeError( - f"Failed to copy from {src_dir} to {dst_dir}: {e}") + raise RuntimeError(f"Failed to copy from {src_dir} to {dst_dir}: {e}") json_dir = "third_party/nlohmann_json" if not os.path.exists(json_dir) or not os.listdir(json_dir): if not os.path.exists(json_dir): os.makedirs(json_dir) - clone_git_repo("v3.11.3", "https://github.com/nlohmann/json.git", - json_dir) + clone_git_repo("v3.11.3", "https://github.com/nlohmann/json.git", json_dir) if not os.listdir(json_dir): raise ValueError("Git clone nlohmann_json failed!") @@ -372,7 +372,7 @@ elif paddle.is_compiled_with_cuda(): "-Ithird_party/nlohmann_json/include", ] nvcc_version = get_nvcc_version() - print(f'nvcc_version = {nvcc_version}') + print(f"nvcc_version = {nvcc_version}") if nvcc_version >= 12.0: sources += ["gpu_ops/sample_kernels/air_top_p_sampling.cu"] cc = 
max(get_sm_version(archs)) @@ -414,31 +414,24 @@ elif paddle.is_compiled_with_cuda(): # Running generate fp8 gemm codes. # Common for SM89, SM90, SM100 (Blackwell) nvcc_compile_args += ["-DENABLE_FP8"] - nvcc_compile_args += [ - "-Igpu_ops/cutlass_kernels/fp8_gemm_fused/autogen" - ] + nvcc_compile_args += ["-Igpu_ops/cutlass_kernels/fp8_gemm_fused/autogen"] # This script seems general enough for different SM versions, specific templates are chosen by CUTLASS. os.system("python utils/auto_gen_visitor_fp8_gemm_fused_kernels.py") - if cc >= 90: # Hopper and newer + if cc >= 90: # Hopper and newer # SM90 (Hopper) specific auto-generation and flags - if cc == 90: # Only for SM90 + if cc == 90: # Only for SM90 nvcc_compile_args += [ # The gencode for 90a is added in get_gencode_flags now # "-gencode", # "arch=compute_90a,code=compute_90a", "-O3", - "-DNDEBUG", # NDEBUG is common, consider moving if not specific to 90a + "-DNDEBUG", # NDEBUG is common, consider moving if not specific to 90a ] print("SM90: Running SM90-specific FP8 kernel auto-generation.") - os.system( - "python utils/auto_gen_fp8_fp8_gemm_fused_kernels_sm90.py") - os.system( - "python utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels_sm90.py" - ) - os.system( - "python utils/auto_gen_fp8_fp8_block_gemm_fused_kernels_sm90.py" - ) + os.system("python utils/auto_gen_fp8_fp8_gemm_fused_kernels_sm90.py") + os.system("python utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels_sm90.py") + os.system("python utils/auto_gen_fp8_fp8_block_gemm_fused_kernels_sm90.py") nvcc_compile_args += [ "-DENABLE_SCALED_MM_SM90=1", @@ -450,14 +443,14 @@ elif paddle.is_compiled_with_cuda(): "gpu_ops/cutlass_kernels/w8a8/c3x/scaled_mm_sm90_int8.cu", "gpu_ops/cutlass_kernels/w8a8/c3x/scaled_mm_azp_sm90_int8.cu", ] - elif cc == 100 and nvcc_version >= 12.9: # Blackwell SM100 specifics + elif cc == 100 and nvcc_version >= 12.9: # Blackwell SM100 specifics print("SM100 (Blackwell): Applying SM100 configurations.") nvcc_compile_args += [ # The gencode for 100a is added in get_gencode_flags # "-gencode", # "arch=compute_100a,code=compute_100a", - "-O3", # Common optimization flag - "-DNDEBUG", # Common debug flag + "-O3", # Common optimization flag + "-DNDEBUG", # Common debug flag # Potentially add -DENABLE_SM100_FEATURES if specific macros are identified ] # Placeholder for SM100-specific kernel auto-generation scripts @@ -469,18 +462,16 @@ elif paddle.is_compiled_with_cuda(): # Add SM100 specific sources if any, e.g., for new hardware intrinsics # sources += ["gpu_ops/cutlass_kernels/w8a8/c4x_sm100.cu"] # Example - pass # No SM100 specific sources identified yet beyond what CUTLASS handles - else: # For cc >= 89 but not 90 or 100 (e.g. SM89) + pass # No SM100 specific sources identified yet beyond what CUTLASS handles + else: # For cc >= 89 but not 90 or 100 (e.g. 
SM89) print(f"SM{cc}: Running generic FP8 kernel auto-generation.") os.system("python utils/auto_gen_fp8_fp8_gemm_fused_kernels.py") - os.system( - "python utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels.py") + os.system("python utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels.py") - else: # For cc == 89 (Ada) + else: # For cc == 89 (Ada) print("SM89: Running generic FP8 kernel auto-generation.") os.system("python utils/auto_gen_fp8_fp8_gemm_fused_kernels.py") - os.system( - "python utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels.py") + os.system("python utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels.py") # Common FP8 sources for SM89+ sources += [ @@ -493,7 +484,7 @@ elif paddle.is_compiled_with_cuda(): "gpu_ops/scaled_gemm_f8_i4_f16_weight_quantize.cu", "gpu_ops/cutlass_kernels/cutlass_heuristic.cu", "gpu_ops/cutlass_kernels/cutlass_preprocessors.cu", - "gpu_ops/fused_hadamard_quant_fp8.cu" + "gpu_ops/fused_hadamard_quant_fp8.cu", ] sources += find_end_files(fp8_auto_gen_directory, ".cu") diff --git a/custom_ops/setup_ops_base.py b/custom_ops/setup_ops_base.py index d05b1d39e..2386fee19 100644 --- a/custom_ops/setup_ops_base.py +++ b/custom_ops/setup_ops_base.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" setup for FASTDEPLOY base ops """ +"""setup for FASTDEPLOY base ops""" from paddle.utils.cpp_extension import CppExtension, setup @@ -27,7 +27,8 @@ setup( "cpu_ops/rebuild_padding.cc", ], extra_compile_args=[ - "-DPy_LIMITED_API=0x03090000", "-DPADDLE_ON_INFERENCE" + "-DPy_LIMITED_API=0x03090000", + "-DPADDLE_ON_INFERENCE", ], ), ) diff --git a/custom_ops/setup_ops_cpu.py b/custom_ops/setup_ops_cpu.py index 9990d2f58..6e6083e72 100644 --- a/custom_ops/setup_ops_cpu.py +++ b/custom_ops/setup_ops_cpu.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" setup for FASTDEPLOY custom cpu ops """ +"""setup for FASTDEPLOY custom cpu ops""" import os import subprocess import tarfile @@ -26,8 +26,7 @@ ROOT_DIR = Path(__file__).parent.parent # which is not installed yet from .setup_ops import load_module_from_path -envs = load_module_from_path('envs', - os.path.join(ROOT_DIR, 'fastdeploy', 'envs.py')) +envs = load_module_from_path("envs", os.path.join(ROOT_DIR, "fastdeploy", "envs.py")) BUILDING_ARCS = [] use_bf16 = envs.FD_CPU_USE_BF16 == "True" diff --git a/custom_ops/utils/auto_gen_fp8_fp8_block_gemm_fused_kernels_sm90.py b/custom_ops/utils/auto_gen_fp8_fp8_block_gemm_fused_kernels_sm90.py index 53fae917a..0e9e755be 100644 --- a/custom_ops/utils/auto_gen_fp8_fp8_block_gemm_fused_kernels_sm90.py +++ b/custom_ops/utils/auto_gen_fp8_fp8_block_gemm_fused_kernels_sm90.py @@ -48,17 +48,26 @@ def get_candidate_configs(sm): candidate_configs = list() hasbias = ("false", "true") - KernelSchedule = ( - "KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<1>", ) - EpilogueSchedule = ("TmaWarpSpecializedCooperative", ) + KernelSchedule = ("KernelTmaWarpSpecializedCooperativeFP8BlockScaledSubGroupMAccum<1>",) + EpilogueSchedule = ("TmaWarpSpecializedCooperative",) TileSchedule = ("PersistentScheduler", "StreamKScheduler") for act_tag in [ ("noact", "Identity"), - # ("relu", "ReLu"), - # ("gelu", "GELU"), + # ("relu", "ReLu"), + # ("gelu", "GELU"), ]: - candidate_configs.extend([(hasbias, act_tag, tiles, KernelSchedule, - EpilogueSchedule, TileSchedule)]) + candidate_configs.extend( + [ + ( + hasbias, + act_tag, + tiles, + KernelSchedule, + EpilogueSchedule, + TileSchedule, + ) + ] + ) return candidate_configs @@ -66,16 +75,13 @@ def get_shape_str(tile_shape): """ return tile_shape string. """ - blocks, clusters = [ - s.replace(" ", "").strip("<>").split(",") for s in tile_shape - ] + blocks, clusters = [s.replace(" ", "").strip("<>").split(",") for s in tile_shape] blocks = [elem.strip("_") for elem in blocks] clusters = [elem.strip("_") for elem in clusters] return blocks, clusters -def check_config_valid(tile_shape, kernel_schedule, epilogue_schedule, - tile_schedule): +def check_config_valid(tile_shape, kernel_schedule, epilogue_schedule, tile_schedule): """ check the cutlass config valid. """ @@ -304,13 +310,10 @@ def SubstituteTemplate(template, values_base): SubstituteTemplate """ values = copy.deepcopy(values_base) - if values.get("KernelSchedule" - ) is not None and "Auto" in values["KernelSchedule"]: + if values.get("KernelSchedule") is not None and "Auto" in values["KernelSchedule"]: values["KernelSchedule"] = "collective::" + values["KernelSchedule"] - if values.get("EpilogueSchedule" - ) is not None and "Auto" in values["EpilogueSchedule"]: - values[ - "EpilogueSchedule"] = "collective::" + values["EpilogueSchedule"] + if values.get("EpilogueSchedule") is not None and "Auto" in values["EpilogueSchedule"]: + values["EpilogueSchedule"] = "collective::" + values["EpilogueSchedule"] text = template changed = True while changed: @@ -329,8 +332,7 @@ def parse_args(): parse_args """ parser = argparse.ArgumentParser( - description= - "The argument for generating the generic_mixed_gemm_kernelLauncher instance." + description="The argument for generating the generic_mixed_gemm_kernelLauncher instance." 
) parser.add_argument( "--cuda_arch", @@ -346,15 +348,15 @@ def parse_args(): # generate source .cu def generate_source_cu( - inputs_type: (str), - outputs_type: (str), - hasbiases: (str), - act_tag: (str), - tiles: (str), - KernelSchedule: (str), - EpilogueSchedule: (str), - TileSchedule: (str), - sm: str, + inputs_type: str, + outputs_type: str, + hasbiases: str, + act_tag: str, + tiles: str, + KernelSchedule: str, + EpilogueSchedule: str, + TileSchedule: str, + sm: str, ): """ generate_source_cu @@ -369,8 +371,11 @@ def generate_source_cu( for epilogue_schedule in EpilogueSchedule: for tile_schedule in TileSchedule: if not check_config_valid( - tile_config, kernel_schedule, - epilogue_schedule, tile_schedule): + tile_config, + kernel_schedule, + epilogue_schedule, + tile_schedule, + ): continue value_dict = { "input_type": input_type, @@ -385,30 +390,32 @@ def generate_source_cu( "SM": sm, "sm": sm[-2:], } - all_code += SubstituteTemplate( - GemmDeclare, value_dict) + all_code += SubstituteTemplate(GemmDeclare, value_dict) return all_code # generate gemm launch .cu def generate_launch_gemm_cus( - generate_dir: (str), inputs_type: (str), outputs_type: (str), - fuse_gemm_configs: tuple, sm: str): + generate_dir: str, + inputs_type: str, + outputs_type: str, + fuse_gemm_configs: tuple, + sm: str, +): """ generate_launch_gemm_cus """ act_tags = [single_config[1] for single_config in fuse_gemm_configs] single_config = fuse_gemm_configs[0] - hasbiases: (str) = single_config[0] - tiles: (str) = single_config[2] - KernelSchedule: (str) = single_config[3] - EpilogueSchedule: (str) = single_config[4] - TileSchedule: (str) = single_config[5] + hasbiases: str = single_config[0] + tiles: str = single_config[2] + KernelSchedule: str = single_config[3] + EpilogueSchedule: str = single_config[4] + TileSchedule: str = single_config[5] code_map = {} - head_path = os.path.join(generate_dir, - f"launch_block_gemm_kernel_sm{sm[-2:]}.h") + head_path = os.path.join(generate_dir, f"launch_block_gemm_kernel_sm{sm[-2:]}.h") head_all_code = LaunchGemmHead for tile_config in tiles: blocks, clusters = get_shape_str(tile_config) @@ -418,19 +425,19 @@ def generate_launch_gemm_cus( for epilogue_schedule in EpilogueSchedule: gemm_config_str_2 = gemm_config_str_1 + f"_{epilogue_schedule}" for tile_schedule in TileSchedule: - if not check_config_valid(tile_config, kernel_schedule, - epilogue_schedule, - tile_schedule): + if not check_config_valid( + tile_config, + kernel_schedule, + epilogue_schedule, + tile_schedule, + ): continue gemm_config_str = gemm_config_str_2 + f"_{tile_schedule}" value_dict = { - "sm": - sm[-2:], - "gemm_config": - gemm_config_str.replace("<", "").replace(">", ""), + "sm": sm[-2:], + "gemm_config": gemm_config_str.replace("<", "").replace(">", ""), } - head_all_code += SubstituteTemplate( - LaunchGemmDeclare, value_dict) + head_all_code += SubstituteTemplate(LaunchGemmDeclare, value_dict) os.makedirs(generate_dir, exist_ok=True) with open(head_path, "w") as f: f.write(head_all_code) @@ -444,19 +451,19 @@ def generate_launch_gemm_cus( for epilogue_schedule in EpilogueSchedule: gemm_config_str_2 = gemm_config_str_1 + f"_{epilogue_schedule}" for tile_schedule in TileSchedule: - if not check_config_valid(tile_shape, kernel_schedule, - epilogue_schedule, - tile_schedule): + if not check_config_valid( + tile_shape, + kernel_schedule, + epilogue_schedule, + tile_schedule, + ): continue gemm_config_str = gemm_config_str_2 + f"_{tile_schedule}" value_dict = { - "sm": - sm[-2:], - "gemm_config": - 
gemm_config_str.replace("<", "").replace(">", ""), + "sm": sm[-2:], + "gemm_config": gemm_config_str.replace("<", "").replace(">", ""), } - source_all_code = SubstituteTemplate( - LaunchGemmPart0, value_dict) + source_all_code = SubstituteTemplate(LaunchGemmPart0, value_dict) type_id = 0 for input_type in inputs_type: for output_type in outputs_type: @@ -476,16 +483,14 @@ def generate_launch_gemm_cus( "SM": sm, "sm": sm[-2:], } - source_all_code += SubstituteTemplate( - LaunchGemmPart1, value_dict) + source_all_code += SubstituteTemplate(LaunchGemmPart1, value_dict) type_id += 1 source_all_code += LaunchGemmPart2 - gemm_config_str = gemm_config_str.replace("<", "").replace( - ">", "") + gemm_config_str = gemm_config_str.replace("<", "").replace(">", "") code_map[gemm_config_str] = source_all_code source_path = os.path.join( generate_dir, - f"launch_block_gemm_kernel_sm{sm[-2:]}_{gemm_config_str}.cu" + f"launch_block_gemm_kernel_sm{sm[-2:]}_{gemm_config_str}.cu", ) with open(source_path, "w") as f: f.write(source_all_code) @@ -495,19 +500,18 @@ def generate_launch_gemm_cus( # generate fp8_fp8_gemm_scale_bias_act_sm90.cu -def generate_dispatch_gemm_cu(inputs_type: (str), outputs_type: (str), - fuse_gemm_configs: tuple, sm: str): +def generate_dispatch_gemm_cu(inputs_type: str, outputs_type: str, fuse_gemm_configs: tuple, sm: str): """ generate_dispatch_gemm_cu """ act_tags = [single_config[1] for single_config in fuse_gemm_configs] single_config = fuse_gemm_configs[0] - hasbiases: (str) = single_config[0] - tiles: (str) = single_config[2] - KernelSchedule: (str) = single_config[3] - EpilogueSchedule: (str) = single_config[4] - TileSchedule: (str) = single_config[5] + hasbiases: str = single_config[0] + tiles: str = single_config[2] + KernelSchedule: str = single_config[3] + EpilogueSchedule: str = single_config[4] + TileSchedule: str = single_config[5] all_code = SubstituteTemplate(code_part0, {"sm": sm[-2:]}) type_id = 0 for input_type in inputs_type: @@ -530,9 +534,12 @@ def generate_dispatch_gemm_cu(inputs_type: (str), outputs_type: (str), for kernel_schedule in KernelSchedule: for epilogue_schedule in EpilogueSchedule: for tile_schedule in TileSchedule: - if not check_config_valid(tile_shape, kernel_schedule, - epilogue_schedule, - tile_schedule): + if not check_config_valid( + tile_shape, + kernel_schedule, + epilogue_schedule, + tile_schedule, + ): continue value_dict = { "TileShape": tile_shape[0], @@ -554,18 +561,18 @@ def generate_dispatch_gemm_cu(inputs_type: (str), outputs_type: (str), for epilogue_schedule in EpilogueSchedule: gemm_config_str_2 = gemm_config_str_1 + f"_{epilogue_schedule}" for tile_schedule in TileSchedule: - if not check_config_valid(tile_shape, kernel_schedule, - epilogue_schedule, - tile_schedule): + if not check_config_valid( + tile_shape, + kernel_schedule, + epilogue_schedule, + tile_schedule, + ): continue gemm_config_str = gemm_config_str_2 + f"_{tile_schedule}" value_dict = { - "sm": - sm[-2:], - "tile_id": - str(tile_id), - "gemm_config": - gemm_config_str.replace("<", "").replace(">", ""), + "sm": sm[-2:], + "tile_id": str(tile_id), + "gemm_config": gemm_config_str.replace("<", "").replace(">", ""), } all_code += SubstituteTemplate(code_part5, value_dict) tile_id += 1 @@ -610,12 +617,17 @@ if __name__ == "__main__": f.close() # Compile parallelization generate_launch_gemm_cus( - "gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen", inputs_type, - outputs_type, fuse_gemm_configs, sm_dict[sm]) + "gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen", + 
inputs_type, + outputs_type, + fuse_gemm_configs, + sm_dict[sm], + ) # hard code for act_tag - file_name = (f"gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen/" - f"fp8_fp8_block_gemm_scale_bias_act_sm{sm}.cu") + file_name = ( + f"gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen/" f"fp8_fp8_block_gemm_scale_bias_act_sm{sm}.cu" + ) all_code = generate_dispatch_gemm_cu( inputs_type, outputs_type, diff --git a/custom_ops/utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels.py b/custom_ops/utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels.py index bf319d2f9..105ed5bac 100644 --- a/custom_ops/utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels.py +++ b/custom_ops/utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels.py @@ -24,27 +24,28 @@ def get_candidate_tiles(): """ base_configs = [("<64, 64, 64>", "<32, 32, 64>", "<16, 8, 32>")] - base_configs.extend([ - ("<16, 32, 64>", "<16, 32, 64>", "<16, 8, 32>"), - ("<16, 64, 64>", "<16, 32, 64>", "<16, 8, 32>"), - ("<32, 128, 64>", "<32, 32, 64>", "<16, 8, 32>"), - ("<64, 128, 64>", "<32, 64, 64>", "<16, 8, 32>"), - ("<64, 64, 128>", "<32, 64, 64>", "<16, 8, 32>"), - ("<64, 128, 64>", "<64, 32, 64>", "<16, 8, 32>"), - ("<128, 64, 64>", "<64, 32, 64>", "<16, 8, 32>"), - ("<128, 128, 64>", "<64, 32, 64>", "<16, 8, 32>"), - ("<128, 128, 64>", "<64, 64, 64>", "<16, 8, 32>"), - ("<128, 128, 64>", "<128, 32, 64>", "<16, 8, 32>"), - ("<128, 256, 64>", "<64, 64, 64>", "<16, 8, 32>"), - ("<256, 128, 64>", "<64, 64, 64>", "<16, 8, 32>"), - ("<16, 256, 128>", "<16, 64, 128>", "<16, 8, 32>"), - ]) + base_configs.extend( + [ + ("<16, 32, 64>", "<16, 32, 64>", "<16, 8, 32>"), + ("<16, 64, 64>", "<16, 32, 64>", "<16, 8, 32>"), + ("<32, 128, 64>", "<32, 32, 64>", "<16, 8, 32>"), + ("<64, 128, 64>", "<32, 64, 64>", "<16, 8, 32>"), + ("<64, 64, 128>", "<32, 64, 64>", "<16, 8, 32>"), + ("<64, 128, 64>", "<64, 32, 64>", "<16, 8, 32>"), + ("<128, 64, 64>", "<64, 32, 64>", "<16, 8, 32>"), + ("<128, 128, 64>", "<64, 32, 64>", "<16, 8, 32>"), + ("<128, 128, 64>", "<64, 64, 64>", "<16, 8, 32>"), + ("<128, 128, 64>", "<128, 32, 64>", "<16, 8, 32>"), + ("<128, 256, 64>", "<64, 64, 64>", "<16, 8, 32>"), + ("<256, 128, 64>", "<64, 64, 64>", "<16, 8, 32>"), + ("<16, 256, 128>", "<16, 64, 128>", "<16, 8, 32>"), + ] + ) return base_configs -def get_dual_gemm_candidate_configs(sm, min_split_k, max_split_k, min_stages, - max_stages): +def get_dual_gemm_candidate_configs(sm, min_split_k, max_split_k, min_stages, max_stages): """ get_dual_gemm_candidate_configs returns a list of candidate configs for the dual_gemm_fused_kernel. """ @@ -299,8 +300,7 @@ def check_min_split_k(value): """ ivalue = int(value) if ivalue > 1: - raise argparse.ArgumentTypeError( - "Dual gemm split_k mode is not support.") + raise argparse.ArgumentTypeError("Dual gemm split_k mode is not support.") return ivalue @@ -310,8 +310,7 @@ def check_max_split_k(value): """ ivalue = int(value) if ivalue > 1: - raise argparse.ArgumentTypeError( - "Dual gemm split_k mode is not support..") + raise argparse.ArgumentTypeError("Dual gemm split_k mode is not support..") return ivalue @@ -320,8 +319,7 @@ def parse_args(): parse_args """ parser = argparse.ArgumentParser( - description= - "The argument for generating the generic_mixed_gemm_kernelLauncher instance." + description="The argument for generating the generic_mixed_gemm_kernelLauncher instance." 
) parser.add_argument( "--cuda_arch", @@ -421,8 +419,7 @@ def generate_dual_gemm_source_cu( "hasbias": hasbias, "SM": sm, } - all_code += SubstituteTemplate( - GemmSplitKDeclare, value_dict) + all_code += SubstituteTemplate(GemmSplitKDeclare, value_dict) all_code += CommonTail return all_code @@ -449,12 +446,12 @@ def generate_launch_dual_gemm_cus( head_path = os.path.join(generate_dir, "launch_dual_gemm_kernel.h") head_all_code = LaunchGemmHead for tile in tiles: - blocks, warps, mmas = [ - s.replace(" ", "").strip("<>").split(",") for s in tile - ] - gemm_config = (f"block{blocks[0]}x{blocks[1]}x{blocks[2]}_" - f"warp{warps[0]}x{warps[1]}x{warps[2]}_" - f"mma{mmas[0]}x{mmas[1]}x{mmas[2]}") + blocks, warps, mmas = [s.replace(" ", "").strip("<>").split(",") for s in tile] + gemm_config = ( + f"block{blocks[0]}x{blocks[1]}x{blocks[2]}_" + f"warp{warps[0]}x{warps[1]}x{warps[2]}_" + f"mma{mmas[0]}x{mmas[1]}x{mmas[2]}" + ) for stage in stages: gemm_config_str = gemm_config + f"_stage{stage}" value_dict = { @@ -467,12 +464,12 @@ def generate_launch_dual_gemm_cus( f.close() for tile in tiles: - blocks, warps, mmas = [ - s.replace(" ", "").strip("<>").split(",") for s in tile - ] - gemm_config = (f"block{blocks[0]}x{blocks[1]}x{blocks[2]}_" - f"warp{warps[0]}x{warps[1]}x{warps[2]}_" - f"mma{mmas[0]}x{mmas[1]}x{mmas[2]}") + blocks, warps, mmas = [s.replace(" ", "").strip("<>").split(",") for s in tile] + gemm_config = ( + f"block{blocks[0]}x{blocks[1]}x{blocks[2]}_" + f"warp{warps[0]}x{warps[1]}x{warps[2]}_" + f"mma{mmas[0]}x{mmas[1]}x{mmas[2]}" + ) for stage in stages: gemm_config_str = gemm_config + f"_stage{stage}" value_dict = { @@ -498,16 +495,14 @@ def generate_launch_dual_gemm_cus( "num_stages": str(stage), "SM": sm, } - source_all_code += SubstituteTemplate( - LaunchGemmPart1, value_dict) + source_all_code += SubstituteTemplate(LaunchGemmPart1, value_dict) # split_k_code += SubstituteTemplate(LaunchGemmPart3, value_dict) type_id += 1 source_all_code += LaunchGemmPart2 # source_all_code += split_k_code # source_all_code += LaunchGemmPart4 code_map[gemm_config_str] = source_all_code - source_path = os.path.join( - generate_dir, f"launch_dual_gemm_kernel_{gemm_config_str}.cu") + source_path = os.path.join(generate_dir, f"launch_dual_gemm_kernel_{gemm_config_str}.cu") with open(source_path, "w") as f: f.write(source_all_code) f.close() @@ -566,12 +561,12 @@ def generate_dispatch_dual_gemm_cu( tile_id = 0 for tile in tiles: - blocks, warps, mmas = [ - s.replace(" ", "").strip("<>").split(",") for s in tile - ] - gemm_config = (f"block{blocks[0]}x{blocks[1]}x{blocks[2]}_" - f"warp{warps[0]}x{warps[1]}x{warps[2]}_" - f"mma{mmas[0]}x{mmas[1]}x{mmas[2]}") + blocks, warps, mmas = [s.replace(" ", "").strip("<>").split(",") for s in tile] + gemm_config = ( + f"block{blocks[0]}x{blocks[1]}x{blocks[2]}_" + f"warp{warps[0]}x{warps[1]}x{warps[2]}_" + f"mma{mmas[0]}x{mmas[1]}x{mmas[2]}" + ) for stage in stages: gemm_config_str = gemm_config + f"_stage{stage}" value_dict = { @@ -580,10 +575,12 @@ def generate_dispatch_dual_gemm_cu( } all_code += SubstituteTemplate(code_part5, value_dict) tile_id += 1 - value_dict.update({ - "min_split_k": str(min_split_k), - "max_split_k": str(max_split_k), - }) + value_dict.update( + { + "min_split_k": str(min_split_k), + "max_split_k": str(max_split_k), + } + ) all_code += SubstituteTemplate(code_part6, value_dict) return all_code @@ -602,8 +599,7 @@ if __name__ == "__main__": for sm in archs: if sm == "89": - fuse_gemm_configs = get_dual_gemm_candidate_configs( - sm, 
min_split_k, max_split_k, min_stages, max_stages) + fuse_gemm_configs = get_dual_gemm_candidate_configs(sm, min_split_k, max_split_k, min_stages, max_stages) for fuse_gemm_config in fuse_gemm_configs: file_name = ( f"gpu_ops/cutlass_kernels/fp8_gemm_fused/" diff --git a/custom_ops/utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels_sm90.py b/custom_ops/utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels_sm90.py index 018e4eead..b2ef38f40 100644 --- a/custom_ops/utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels_sm90.py +++ b/custom_ops/utils/auto_gen_fp8_fp8_dual_gemm_fused_kernels_sm90.py @@ -19,8 +19,7 @@ import re def get_candidate_tiles(): - """ - """ + """ """ cta_shape = [ ("<_64, _16, _128>"), ("<_64, _32, _128>"), @@ -45,8 +44,7 @@ def get_candidate_tiles(): def get_dual_gemm_candidate_configs(sm): - """ - """ + """ """ tiles = get_candidate_tiles() candidate_configs = list() @@ -64,35 +62,27 @@ def get_dual_gemm_candidate_configs(sm): ("swiglu", "SiLu"), ("geglu", "GELU"), ]: - candidate_configs.extend([(hasbias, act_tag, tiles, KernelSchedule, - EpilogueSchedule)]) + candidate_configs.extend([(hasbias, act_tag, tiles, KernelSchedule, EpilogueSchedule)]) return candidate_configs def get_shape_str(tile_shape): - """ - """ - blocks, clusters = [ - s.replace(" ", "").strip("<>").split(",") for s in tile_shape - ] + """ """ + blocks, clusters = [s.replace(" ", "").strip("<>").split(",") for s in tile_shape] blocks = [elem.strip("_") for elem in blocks] clusters = [elem.strip("_") for elem in clusters] return blocks, clusters def check_config_valid(tile_shape, kernel_schedule, epilogue_schedule): - """ - """ + """ """ blocks, clusters = get_shape_str(tile_shape) - if int( - blocks[0] - ) < 128 and kernel_schedule == "KernelTmaWarpSpecializedCooperativeFP8FastAccum": + if int(blocks[0]) < 128 and kernel_schedule == "KernelTmaWarpSpecializedCooperativeFP8FastAccum": return False if "Cooperative" in kernel_schedule and "Cooperative" not in epilogue_schedule: return False - if tile_shape[ - 0] == "<_128, _128, _128>" and kernel_schedule == "KernelTmaWarpSpecializedPingpongFP8FastAccum": + if tile_shape[0] == "<_128, _128, _128>" and kernel_schedule == "KernelTmaWarpSpecializedPingpongFP8FastAccum": return False return True @@ -302,8 +292,7 @@ bool fp8_fp8_dual_gemm_scale_bias_act(DualGemmEpilogueAllParams params) { def SubstituteTemplate(template, values): - """ - """ + """ """ text = template changed = True while changed: @@ -318,10 +307,8 @@ def SubstituteTemplate(template, values): def parse_args(): - """ - """ - parser = argparse.ArgumentParser( - description="auto generate the fp8_fp8_dual_gemm_fused_kernels_sm90.") + """ """ + parser = argparse.ArgumentParser(description="auto generate the fp8_fp8_dual_gemm_fused_kernels_sm90.") parser.add_argument( "--cuda_arch", type=str, @@ -336,17 +323,16 @@ def parse_args(): # generate source .cu def generate_dual_gemm_source_cu( - inputs_type: (str), - biases_type: (str), - hasbiases: (str), - act_tag: (str), - tiles: (str), - KernelSchedule: (str), - EpilogueSchedule: (str), - sm: str, + inputs_type: str, + biases_type: str, + hasbiases: str, + act_tag: str, + tiles: str, + KernelSchedule: str, + EpilogueSchedule: str, + sm: str, ): - """ - """ + """ """ all_code = CommonHead for input_type in inputs_type: for bias_type in biases_type: @@ -354,9 +340,7 @@ def generate_dual_gemm_source_cu( for tile_config in tiles: for kernel_schedule in KernelSchedule: for epilogue_schedule in EpilogueSchedule: - if not check_config_valid(tile_config, - kernel_schedule, - 
epilogue_schedule): + if not check_config_valid(tile_config, kernel_schedule, epilogue_schedule): continue value_dict = { "input_type": input_type, @@ -370,28 +354,29 @@ def generate_dual_gemm_source_cu( "SM": sm, "sm": sm[-2:], } - all_code += SubstituteTemplate( - GemmDeclare, value_dict) + all_code += SubstituteTemplate(GemmDeclare, value_dict) return all_code # generate gemm launch .cu def generate_launch_dual_gemm_cus( - generate_dir: (str), inputs_type: (str), biases_type: (str), - fuse_gemm_configs: tuple, sm: str): - """ - """ + generate_dir: str, + inputs_type: str, + biases_type: str, + fuse_gemm_configs: tuple, + sm: str, +): + """ """ act_tags = [single_config[1] for single_config in fuse_gemm_configs] single_config = fuse_gemm_configs[0] - hasbiases: (str) = single_config[0] - tiles: (str) = single_config[2] - KernelSchedule: (str) = single_config[3] - EpilogueSchedule: (str) = single_config[4] + hasbiases: str = single_config[0] + tiles: str = single_config[2] + KernelSchedule: str = single_config[3] + EpilogueSchedule: str = single_config[4] code_map = {} - head_path = os.path.join(generate_dir, - f"launch_dual_gemm_kernel_sm{sm[-2:]}.h") + head_path = os.path.join(generate_dir, f"launch_dual_gemm_kernel_sm{sm[-2:]}.h") head_all_code = LaunchGemmHead for tile_config in tiles: blocks, clusters = get_shape_str(tile_config) @@ -401,16 +386,14 @@ def generate_launch_dual_gemm_cus( for kernel_schedule in KernelSchedule: gemm_config_str_1 = gemm_config_str_0 + f"_{kernel_schedule}" for epilogue_schedule in EpilogueSchedule: - if not check_config_valid(tile_config, kernel_schedule, - epilogue_schedule): + if not check_config_valid(tile_config, kernel_schedule, epilogue_schedule): continue gemm_config_str = gemm_config_str_1 + f"_{epilogue_schedule}" value_dict = { "sm": sm[-2:], "gemm_config": gemm_config_str, } - head_all_code += SubstituteTemplate(LaunchGemmDeclare, - value_dict) + head_all_code += SubstituteTemplate(LaunchGemmDeclare, value_dict) os.makedirs(generate_dir, exist_ok=True) with open(head_path, "w") as f: f.write(head_all_code) @@ -422,16 +405,14 @@ def generate_launch_dual_gemm_cus( for kernel_schedule in KernelSchedule: gemm_config_str_1 = gemm_config_str_0 + f"_{kernel_schedule}" for epilogue_schedule in EpilogueSchedule: - if not check_config_valid(tile_shape, kernel_schedule, - epilogue_schedule): + if not check_config_valid(tile_shape, kernel_schedule, epilogue_schedule): continue gemm_config_str = gemm_config_str_1 + f"_{epilogue_schedule}" value_dict = { "sm": sm[-2:], "gemm_config": gemm_config_str, } - source_all_code = SubstituteTemplate(LaunchGemmPart0, - value_dict) + source_all_code = SubstituteTemplate(LaunchGemmPart0, value_dict) type_id = 0 for input_type in inputs_type: for bias_type in biases_type: @@ -450,14 +431,13 @@ def generate_launch_dual_gemm_cus( "SM": sm, "sm": sm[-2:], } - source_all_code += SubstituteTemplate( - LaunchGemmPart1, value_dict) + source_all_code += SubstituteTemplate(LaunchGemmPart1, value_dict) type_id += 1 source_all_code += LaunchGemmPart2 code_map[gemm_config_str] = source_all_code source_path = os.path.join( generate_dir, - f"launch_dual_gemm_kernel_sm{sm[-2:]}_{gemm_config_str}.cu" + f"launch_dual_gemm_kernel_sm{sm[-2:]}_{gemm_config_str}.cu", ) with open(source_path, "w") as f: f.write(source_all_code) @@ -467,16 +447,14 @@ def generate_launch_dual_gemm_cus( # generate fp8_fp8_gemm_scale_bias_act.cu -def generate_dispatch_dual_gemm_cu(inputs_type: (str), biases_type: (str), - fuse_gemm_configs: tuple, sm: str): - """ 
- """ +def generate_dispatch_dual_gemm_cu(inputs_type: str, biases_type: str, fuse_gemm_configs: tuple, sm: str): + """ """ act_tags = [single_config[1] for single_config in fuse_gemm_configs] single_config = fuse_gemm_configs[0] - hasbiases: (str) = single_config[0] - tiles: (str) = single_config[2] - KernelSchedule: (str) = single_config[3] - EpilogueSchedule: (str) = single_config[4] + hasbiases: str = single_config[0] + tiles: str = single_config[2] + KernelSchedule: str = single_config[3] + EpilogueSchedule: str = single_config[4] all_code = SubstituteTemplate(code_part0, {"sm": sm[-2:]}) type_id = 0 @@ -500,8 +478,7 @@ def generate_dispatch_dual_gemm_cu(inputs_type: (str), biases_type: (str), for tile_shape in tiles: for kernel_schedule in KernelSchedule: for epilogue_schedule in EpilogueSchedule: - if not check_config_valid(tile_shape, kernel_schedule, - epilogue_schedule): + if not check_config_valid(tile_shape, kernel_schedule, epilogue_schedule): continue value_dict = { "TileShape": tile_shape[0], @@ -520,8 +497,7 @@ def generate_dispatch_dual_gemm_cu(inputs_type: (str), biases_type: (str), for kernel_schedule in KernelSchedule: gemm_config_str_1 = gemm_config_str_0 + f"_{kernel_schedule}" for epilogue_schedule in EpilogueSchedule: - if not check_config_valid(tile_shape, kernel_schedule, - epilogue_schedule): + if not check_config_valid(tile_shape, kernel_schedule, epilogue_schedule): continue gemm_config_str = gemm_config_str_1 + f"_{epilogue_schedule}" value_dict = { @@ -570,12 +546,15 @@ if __name__ == "__main__": f.close() # Compile parallelization generate_launch_dual_gemm_cus( - "gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen", inputs_type, - biases_type, fuse_gemm_configs, sm_dict[sm]) + "gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen", + inputs_type, + biases_type, + fuse_gemm_configs, + sm_dict[sm], + ) # hard code for act_tag file_name = ( - f"gpu_ops/cutlass_kernels/fp8_gemm_fused/" - f"autogen/fp8_fp8_dual_gemm_scale_bias_act_sm{sm}.cu" + f"gpu_ops/cutlass_kernels/fp8_gemm_fused/" f"autogen/fp8_fp8_dual_gemm_scale_bias_act_sm{sm}.cu" ) all_code = generate_dispatch_dual_gemm_cu( inputs_type, diff --git a/custom_ops/utils/auto_gen_fp8_fp8_gemm_fused_kernels.py b/custom_ops/utils/auto_gen_fp8_fp8_gemm_fused_kernels.py index cb2e93a03..14f147afc 100644 --- a/custom_ops/utils/auto_gen_fp8_fp8_gemm_fused_kernels.py +++ b/custom_ops/utils/auto_gen_fp8_fp8_gemm_fused_kernels.py @@ -31,25 +31,26 @@ def get_candidate_tiles(): """ base_configs = [("<64, 64, 64>", "<32, 32, 64>", "<16, 8, 32>")] - base_configs.extend([ - ("<32, 128, 64>", "<32, 32, 64>", "<16, 8, 32>"), - ("<64, 128, 64>", "<32, 64, 64>", "<16, 8, 32>"), - ("<64, 64, 128>", "<32, 64, 64>", "<16, 8, 32>"), - ("<64, 128, 64>", "<64, 32, 64>", "<16, 8, 32>"), - ("<128, 64, 64>", "<64, 32, 64>", "<16, 8, 32>"), - ("<128, 128, 64>", "<64, 32, 64>", "<16, 8, 32>"), - ("<128, 128, 64>", "<64, 64, 64>", "<16, 8, 32>"), - ("<128, 128, 64>", "<128, 32, 64>", "<16, 8, 32>"), - ("<128, 256, 64>", "<64, 64, 64>", "<16, 8, 32>"), - ("<256, 128, 64>", "<64, 64, 64>", "<16, 8, 32>"), - ("<16, 256, 128>", "<16, 64, 128>", "<16, 8, 32>"), - ]) + base_configs.extend( + [ + ("<32, 128, 64>", "<32, 32, 64>", "<16, 8, 32>"), + ("<64, 128, 64>", "<32, 64, 64>", "<16, 8, 32>"), + ("<64, 64, 128>", "<32, 64, 64>", "<16, 8, 32>"), + ("<64, 128, 64>", "<64, 32, 64>", "<16, 8, 32>"), + ("<128, 64, 64>", "<64, 32, 64>", "<16, 8, 32>"), + ("<128, 128, 64>", "<64, 32, 64>", "<16, 8, 32>"), + ("<128, 128, 64>", "<64, 64, 64>", "<16, 8, 32>"), + 
("<128, 128, 64>", "<128, 32, 64>", "<16, 8, 32>"), + ("<128, 256, 64>", "<64, 64, 64>", "<16, 8, 32>"), + ("<256, 128, 64>", "<64, 64, 64>", "<16, 8, 32>"), + ("<16, 256, 128>", "<16, 64, 128>", "<16, 8, 32>"), + ] + ) return base_configs -def get_candidate_configs(sm, min_split_k, max_split_k, min_stages, - max_stages): +def get_candidate_configs(sm, min_split_k, max_split_k, min_stages, max_stages): """ 获取候选的gemm算子配置列表。 @@ -353,8 +354,7 @@ def parse_args(): 代码参数解析 """ parser = argparse.ArgumentParser( - description= - "The argument for generating the generic_mixed_gemm_kernelLauncher instance." + description="The argument for generating the generic_mixed_gemm_kernelLauncher instance." ) parser.add_argument( "--cuda_arch", @@ -448,8 +448,7 @@ def generate_source_cu( "hasbias": hasbias, "SM": sm, } - all_code += SubstituteTemplate(GemmSplitKDeclare, - value_dict) + all_code += SubstituteTemplate(GemmSplitKDeclare, value_dict) all_code += CommonTail return all_code @@ -473,9 +472,7 @@ def generate_launch_gemm_cus( head_path = os.path.join(generate_dir, "launch_gemm_kernel.h") head_all_code = LaunchGemmHead for tile in tiles: - blocks, warps, mmas = [ - s.replace(" ", "").strip("<>").split(",") for s in tile - ] + blocks, warps, mmas = [s.replace(" ", "").strip("<>").split(",") for s in tile] gemm_config = f"block{blocks[0]}x{blocks[1]}x{blocks[2]}_warp{warps[0]}x{warps[1]}x{warps[2]}_mma{mmas[0]}x{mmas[1]}x{mmas[2]}" for stage in stages: gemm_config_str = gemm_config + f"_stage{stage}" @@ -489,9 +486,7 @@ def generate_launch_gemm_cus( f.close() for tile in tiles: - blocks, warps, mmas = [ - s.replace(" ", "").strip("<>").split(",") for s in tile - ] + blocks, warps, mmas = [s.replace(" ", "").strip("<>").split(",") for s in tile] gemm_config = f"block{blocks[0]}x{blocks[1]}x{blocks[2]}_warp{warps[0]}x{warps[1]}x{warps[2]}_mma{mmas[0]}x{mmas[1]}x{mmas[2]}" for stage in stages: gemm_config_str = gemm_config + f"_stage{stage}" @@ -517,17 +512,14 @@ def generate_launch_gemm_cus( "num_stages": str(stage), "SM": sm, } - source_all_code += SubstituteTemplate( - LaunchGemmPart1, value_dict) - split_k_code += SubstituteTemplate( - LaunchGemmPart3, value_dict) + source_all_code += SubstituteTemplate(LaunchGemmPart1, value_dict) + split_k_code += SubstituteTemplate(LaunchGemmPart3, value_dict) type_id += 1 source_all_code += LaunchGemmPart2 source_all_code += split_k_code source_all_code += LaunchGemmPart4 code_map[gemm_config_str] = source_all_code - source_path = os.path.join( - generate_dir, f"launch_gemm_kernel_{gemm_config_str}.cu") + source_path = os.path.join(generate_dir, f"launch_gemm_kernel_{gemm_config_str}.cu") with open(source_path, "w") as f: f.write(source_all_code) f.close() @@ -581,9 +573,7 @@ def generate_dispatch_gemm_cu( all_code += code_part4 tile_id = 0 for tile in tiles: - blocks, warps, mmas = [ - s.replace(" ", "").strip("<>").split(",") for s in tile - ] + blocks, warps, mmas = [s.replace(" ", "").strip("<>").split(",") for s in tile] gemm_config = f"block{blocks[0]}x{blocks[1]}x{blocks[2]}_warp{warps[0]}x{warps[1]}x{warps[2]}_mma{mmas[0]}x{mmas[1]}x{mmas[2]}" for stage in stages: gemm_config_str = gemm_config + f"_stage{stage}" @@ -593,10 +583,12 @@ def generate_dispatch_gemm_cu( } all_code += SubstituteTemplate(code_part5, value_dict) tile_id += 1 - value_dict.update({ - "min_split_k": str(min_split_k), - "max_split_k": str(max_split_k), - }) + value_dict.update( + { + "min_split_k": str(min_split_k), + "max_split_k": str(max_split_k), + } + ) all_code += 
SubstituteTemplate(code_part6, value_dict) return all_code @@ -614,9 +606,7 @@ if __name__ == "__main__": for sm in archs: if sm == "89": - fuse_gemm_configs = get_candidate_configs(sm, min_split_k, - max_split_k, min_stages, - max_stages) + fuse_gemm_configs = get_candidate_configs(sm, min_split_k, max_split_k, min_stages, max_stages) for fuse_gemm_config in fuse_gemm_configs: file_name = f"gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen/generic_gemm_kernel_sm{sm}_{fuse_gemm_config[3][0]}.cu" all_code = generate_source_cu( @@ -654,9 +644,7 @@ if __name__ == "__main__": # hard code for act_tag - file_name = ( - "gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen/fp8_fp8_gemm_scale_bias_act.cu" - ) + file_name = "gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen/fp8_fp8_gemm_scale_bias_act.cu" all_code = generate_dispatch_gemm_cu( inputs_type, outputs_type, diff --git a/custom_ops/utils/auto_gen_fp8_fp8_gemm_fused_kernels_sm90.py b/custom_ops/utils/auto_gen_fp8_fp8_gemm_fused_kernels_sm90.py index 2268fa3a4..6c9efea21 100644 --- a/custom_ops/utils/auto_gen_fp8_fp8_gemm_fused_kernels_sm90.py +++ b/custom_ops/utils/auto_gen_fp8_fp8_gemm_fused_kernels_sm90.py @@ -20,44 +20,44 @@ import re def get_candidate_tiles(): - """ - """ + """ """ base_configs = [ ("<_64, _64, _128>", "<_1, _8, _1>"), ("<_64, _128, _128>", "<_2, _1, _1>"), ("<_128, _128, _128>", "<_2, _1, _1>"), ] - base_configs.extend([ - ("<_64, _64, _128>", "<_1, _1, _1>"), - ("<_64, _64, _128>", "<_1, _2, _1>"), - ("<_64, _64, _128>", "<_2, _1, _1>"), - ("<_64, _64, _64>", "<_1, _1, _1>"), - ("<_64, _64, _64>", "<_1, _2, _1>"), - ("<_64, _64, _64>", "<_2, _1, _1>"), - ("<_64, _128, _128>", "<_1, _2, _1>"), - ("<_64, _128, _128>", "<_1, _1, _1>"), - ("<_128, _128, _64>", "<_2, _1, _1>"), - ("<_256, _128, _128>", "<_1, _2, _1>"), - ("<_256, _128, _128>", "<_1, _1, _1>"), - # The following configurations are rarely selected in Qwen2-7B-model. - # ("<_256, _128, _128>", "<_4, _1, _1>"), - # ("<_256, _128, _128>", "<_1, _4, _1>"), - # ("<_256, _128, _128>", "<_2, _4, _1>"), - # ("<_128, _128, _256>", "<_1, _2, _1>"), - # ("<_128, _128, _128>", "<_4, _1, _1>"), - # ("<_128, _128, _128>", "<_2, _4, _1>"), - # ("<_128, _128, _128>", "<_1, _2, _1>"), - # ("<_128, _128, _128>", "<_1, _1, _1>"), - # ("<_128, _128, _128>", "<_1, _4, _1>"), - # ("<_128, _128, _64>", "<_2, _2, _1>"), - ]) + base_configs.extend( + [ + ("<_64, _64, _128>", "<_1, _1, _1>"), + ("<_64, _64, _128>", "<_1, _2, _1>"), + ("<_64, _64, _128>", "<_2, _1, _1>"), + ("<_64, _64, _64>", "<_1, _1, _1>"), + ("<_64, _64, _64>", "<_1, _2, _1>"), + ("<_64, _64, _64>", "<_2, _1, _1>"), + ("<_64, _128, _128>", "<_1, _2, _1>"), + ("<_64, _128, _128>", "<_1, _1, _1>"), + ("<_128, _128, _64>", "<_2, _1, _1>"), + ("<_256, _128, _128>", "<_1, _2, _1>"), + ("<_256, _128, _128>", "<_1, _1, _1>"), + # The following configurations are rarely selected in Qwen2-7B-model. 
+ # ("<_256, _128, _128>", "<_4, _1, _1>"), + # ("<_256, _128, _128>", "<_1, _4, _1>"), + # ("<_256, _128, _128>", "<_2, _4, _1>"), + # ("<_128, _128, _256>", "<_1, _2, _1>"), + # ("<_128, _128, _128>", "<_4, _1, _1>"), + # ("<_128, _128, _128>", "<_2, _4, _1>"), + # ("<_128, _128, _128>", "<_1, _2, _1>"), + # ("<_128, _128, _128>", "<_1, _1, _1>"), + # ("<_128, _128, _128>", "<_1, _4, _1>"), + # ("<_128, _128, _64>", "<_2, _2, _1>"), + ] + ) return base_configs def get_candidate_configs(sm): - """ - """ + """ """ tiles = get_candidate_tiles() candidate_configs = list() @@ -73,36 +73,31 @@ def get_candidate_configs(sm): ("relu", "ReLu"), ("gelu", "GELU"), ]: - candidate_configs.extend([(hasbias, act_tag, tiles, KernelSchedule, - EpilogueSchedule)]) + candidate_configs.extend([(hasbias, act_tag, tiles, KernelSchedule, EpilogueSchedule)]) return candidate_configs def get_shape_str(tile_shape): - """ - """ - blocks, clusters = [ - s.replace(" ", "").strip("<>").split(",") for s in tile_shape - ] + """ """ + blocks, clusters = [s.replace(" ", "").strip("<>").split(",") for s in tile_shape] blocks = [elem.strip("_") for elem in blocks] clusters = [elem.strip("_") for elem in clusters] return blocks, clusters def check_config_valid(tile_shape, kernel_schedule, epilogue_schedule): - """ - """ + """ """ blocks, clusters = get_shape_str(tile_shape) - if int( - blocks[0] - ) < 128 and kernel_schedule == "KernelTmaWarpSpecializedCooperativeFP8FastAccum": + if int(blocks[0]) < 128 and kernel_schedule == "KernelTmaWarpSpecializedCooperativeFP8FastAccum": return False if "Cooperative" in kernel_schedule and "Cooperative" not in epilogue_schedule: return False - if (tile_shape[0] == "<_256, _128, _128>" - and "Cooperative" not in kernel_schedule - and "Cooperative" not in epilogue_schedule): + if ( + tile_shape[0] == "<_256, _128, _128>" + and "Cooperative" not in kernel_schedule + and "Cooperative" not in epilogue_schedule + ): return False return True @@ -321,16 +316,12 @@ bool fp8_fp8_gemm_scale_bias_act(GemmEpilogueAllParams params) { def SubstituteTemplate(template, values_base): - """ - """ + """ """ values = copy.deepcopy(values_base) - if values.get("KernelSchedule" - ) is not None and "Auto" in values["KernelSchedule"]: + if values.get("KernelSchedule") is not None and "Auto" in values["KernelSchedule"]: values["KernelSchedule"] = "collective::" + values["KernelSchedule"] - if values.get("EpilogueSchedule" - ) is not None and "Auto" in values["EpilogueSchedule"]: - values[ - "EpilogueSchedule"] = "collective::" + values["EpilogueSchedule"] + if values.get("EpilogueSchedule") is not None and "Auto" in values["EpilogueSchedule"]: + values["EpilogueSchedule"] = "collective::" + values["EpilogueSchedule"] text = template changed = True while changed: @@ -345,10 +336,8 @@ def SubstituteTemplate(template, values_base): def parse_args(): - """ - """ - parser = argparse.ArgumentParser( - description="auto generate fp8_fp8_gemm_fused_kernels_sm90.") + """ """ + parser = argparse.ArgumentParser(description="auto generate fp8_fp8_gemm_fused_kernels_sm90.") parser.add_argument( "--cuda_arch", type=str, @@ -363,17 +352,16 @@ def parse_args(): # generate source .cu def generate_source_cu( - inputs_type: (str), - outputs_type: (str), - hasbiases: (str), - act_tag: (str), - tiles: (str), - KernelSchedule: (str), - EpilogueSchedule: (str), - sm: str, + inputs_type: str, + outputs_type: str, + hasbiases: str, + act_tag: str, + tiles: str, + KernelSchedule: str, + EpilogueSchedule: str, + sm: str, ): - """ - """ + 
""" """ all_code = CommonHead for input_type in inputs_type: @@ -382,9 +370,7 @@ def generate_source_cu( for tile_config in tiles: for kernel_schedule in KernelSchedule: for epilogue_schedule in EpilogueSchedule: - if not check_config_valid(tile_config, - kernel_schedule, - epilogue_schedule): + if not check_config_valid(tile_config, kernel_schedule, epilogue_schedule): continue value_dict = { "input_type": input_type, @@ -398,25 +384,27 @@ def generate_source_cu( "SM": sm, "sm": sm[-2:], } - all_code += SubstituteTemplate( - GemmDeclare, value_dict) + all_code += SubstituteTemplate(GemmDeclare, value_dict) return all_code # generate gemm launch .cu def generate_launch_gemm_cus( - generate_dir: (str), inputs_type: (str), outputs_type: (str), - fuse_gemm_configs: tuple, sm: str): - """ - """ + generate_dir: str, + inputs_type: str, + outputs_type: str, + fuse_gemm_configs: tuple, + sm: str, +): + """ """ act_tags = [single_config[1] for single_config in fuse_gemm_configs] single_config = fuse_gemm_configs[0] - hasbiases: (str) = single_config[0] - tiles: (str) = single_config[2] - KernelSchedule: (str) = single_config[3] - EpilogueSchedule: (str) = single_config[4] + hasbiases: str = single_config[0] + tiles: str = single_config[2] + KernelSchedule: str = single_config[3] + EpilogueSchedule: str = single_config[4] code_map = {} head_path = os.path.join(generate_dir, f"launch_gemm_kernel_sm{sm[-2:]}.h") head_all_code = LaunchGemmHead @@ -426,16 +414,14 @@ def generate_launch_gemm_cus( for kernel_schedule in KernelSchedule: gemm_config_str_1 = gemm_config_str_0 + f"_{kernel_schedule}" for epilogue_schedule in EpilogueSchedule: - if not check_config_valid(tile_config, kernel_schedule, - epilogue_schedule): + if not check_config_valid(tile_config, kernel_schedule, epilogue_schedule): continue gemm_config_str = gemm_config_str_1 + f"_{epilogue_schedule}" value_dict = { "sm": sm[-2:], "gemm_config": gemm_config_str, } - head_all_code += SubstituteTemplate(LaunchGemmDeclare, - value_dict) + head_all_code += SubstituteTemplate(LaunchGemmDeclare, value_dict) os.makedirs(generate_dir, exist_ok=True) with open(head_path, "w") as f: f.write(head_all_code) @@ -447,16 +433,14 @@ def generate_launch_gemm_cus( for kernel_schedule in KernelSchedule: gemm_config_str_1 = gemm_config_str_0 + f"_{kernel_schedule}" for epilogue_schedule in EpilogueSchedule: - if not check_config_valid(tile_shape, kernel_schedule, - epilogue_schedule): + if not check_config_valid(tile_shape, kernel_schedule, epilogue_schedule): continue gemm_config_str = gemm_config_str_1 + f"_{epilogue_schedule}" value_dict = { "sm": sm[-2:], "gemm_config": gemm_config_str, } - source_all_code = SubstituteTemplate(LaunchGemmPart0, - value_dict) + source_all_code = SubstituteTemplate(LaunchGemmPart0, value_dict) type_id = 0 for input_type in inputs_type: for output_type in outputs_type: @@ -475,14 +459,14 @@ def generate_launch_gemm_cus( "SM": sm, "sm": sm[-2:], } - source_all_code += SubstituteTemplate( - LaunchGemmPart1, value_dict) + source_all_code += SubstituteTemplate(LaunchGemmPart1, value_dict) type_id += 1 source_all_code += LaunchGemmPart2 code_map[gemm_config_str] = source_all_code source_path = os.path.join( generate_dir, - f"launch_gemm_kernel_sm{sm[-2:]}_{gemm_config_str}.cu") + f"launch_gemm_kernel_sm{sm[-2:]}_{gemm_config_str}.cu", + ) with open(source_path, "w") as f: f.write(source_all_code) f.close() @@ -491,17 +475,15 @@ def generate_launch_gemm_cus( # generate fp8_fp8_gemm_scale_bias_act_sm90.cu -def 
generate_dispatch_gemm_cu(inputs_type: (str), outputs_type: (str), - fuse_gemm_configs: tuple, sm: str): - """ - """ +def generate_dispatch_gemm_cu(inputs_type: str, outputs_type: str, fuse_gemm_configs: tuple, sm: str): + """ """ act_tags = [single_config[1] for single_config in fuse_gemm_configs] single_config = fuse_gemm_configs[0] - hasbiases: (str) = single_config[0] - tiles: (str) = single_config[2] - KernelSchedule: (str) = single_config[3] - EpilogueSchedule: (str) = single_config[4] + hasbiases: str = single_config[0] + tiles: str = single_config[2] + KernelSchedule: str = single_config[3] + EpilogueSchedule: str = single_config[4] all_code = SubstituteTemplate(code_part0, {"sm": sm[-2:]}) type_id = 0 @@ -524,8 +506,7 @@ def generate_dispatch_gemm_cu(inputs_type: (str), outputs_type: (str), for tile_shape in tiles: for kernel_schedule in KernelSchedule: for epilogue_schedule in EpilogueSchedule: - if not check_config_valid(tile_shape, kernel_schedule, - epilogue_schedule): + if not check_config_valid(tile_shape, kernel_schedule, epilogue_schedule): continue value_dict = { "TileShape": tile_shape[0], @@ -544,8 +525,7 @@ def generate_dispatch_gemm_cu(inputs_type: (str), outputs_type: (str), for kernel_schedule in KernelSchedule: gemm_config_str_1 = gemm_config_str_0 + f"_{kernel_schedule}" for epilogue_schedule in EpilogueSchedule: - if not check_config_valid(tile_shape, kernel_schedule, - epilogue_schedule): + if not check_config_valid(tile_shape, kernel_schedule, epilogue_schedule): continue gemm_config_str = gemm_config_str_1 + f"_{epilogue_schedule}" value_dict = { @@ -576,7 +556,8 @@ if __name__ == "__main__": for fuse_gemm_config in fuse_gemm_configs: file_name = ( f"gpu_ops/cutlass_kernels/fp8_gemm_fused/" - f"autogen/generic_gemm_kernel_sm{sm}_{fuse_gemm_config[1][0]}.cu") + f"autogen/generic_gemm_kernel_sm{sm}_{fuse_gemm_config[1][0]}.cu" + ) all_code = generate_source_cu( inputs_type, outputs_type, @@ -594,8 +575,12 @@ if __name__ == "__main__": f.close() # Compile parallelization generate_launch_gemm_cus( - "gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen", inputs_type, - outputs_type, fuse_gemm_configs, sm_dict[sm]) + "gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen", + inputs_type, + outputs_type, + fuse_gemm_configs, + sm_dict[sm], + ) # hard code for act_tag file_name = f"gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen/fp8_fp8_gemm_scale_bias_act_sm{sm}.cu" diff --git a/custom_ops/utils/auto_gen_visitor_fp8_gemm_fused_kernels.py b/custom_ops/utils/auto_gen_visitor_fp8_gemm_fused_kernels.py index f234f7290..d9a53f87a 100644 --- a/custom_ops/utils/auto_gen_visitor_fp8_gemm_fused_kernels.py +++ b/custom_ops/utils/auto_gen_visitor_fp8_gemm_fused_kernels.py @@ -30,22 +30,24 @@ def get_candidate_tiles(): """ base_configs = [("<64, 64, 64>", "<32, 32, 64>", "<16, 8, 32>")] - base_configs.extend([ - ("<16, 32, 64>", "<16, 32, 64>", "<16, 8, 32>"), - ("<16, 64, 64>", "<16, 32, 64>", "<16, 8, 32>"), - ("<32, 128, 64>", "<32, 32, 64>", "<16, 8, 32>"), - ("<64, 128, 64>", "<32, 64, 64>", "<16, 8, 32>"), - ("<64, 64, 128>", "<32, 64, 64>", "<16, 8, 32>"), - ("<64, 128, 64>", "<64, 32, 64>", "<16, 8, 32>"), - ("<128, 64, 64>", "<64, 32, 64>", "<16, 8, 32>"), - ("<128, 128, 64>", "<64, 32, 64>", "<16, 8, 32>"), - ("<128, 128, 64>", "<64, 64, 64>", "<16, 8, 32>"), - ("<128, 128, 64>", "<128, 32, 64>", "<16, 8, 32>"), - ("<128, 256, 64>", "<64, 64, 64>", "<16, 8, 32>"), - ("<256, 128, 64>", "<64, 64, 64>", "<16, 8, 32>"), - ("<128, 64, 128>", "<64, 32, 128>", "<16, 8, 32>"), - ("<16, 
256, 128>", "<16, 64, 128>", "<16, 8, 32>"), - ]) + base_configs.extend( + [ + ("<16, 32, 64>", "<16, 32, 64>", "<16, 8, 32>"), + ("<16, 64, 64>", "<16, 32, 64>", "<16, 8, 32>"), + ("<32, 128, 64>", "<32, 32, 64>", "<16, 8, 32>"), + ("<64, 128, 64>", "<32, 64, 64>", "<16, 8, 32>"), + ("<64, 64, 128>", "<32, 64, 64>", "<16, 8, 32>"), + ("<64, 128, 64>", "<64, 32, 64>", "<16, 8, 32>"), + ("<128, 64, 64>", "<64, 32, 64>", "<16, 8, 32>"), + ("<128, 128, 64>", "<64, 32, 64>", "<16, 8, 32>"), + ("<128, 128, 64>", "<64, 64, 64>", "<16, 8, 32>"), + ("<128, 128, 64>", "<128, 32, 64>", "<16, 8, 32>"), + ("<128, 256, 64>", "<64, 64, 64>", "<16, 8, 32>"), + ("<256, 128, 64>", "<64, 64, 64>", "<16, 8, 32>"), + ("<128, 64, 128>", "<64, 32, 128>", "<16, 8, 32>"), + ("<16, 256, 128>", "<16, 64, 128>", "<16, 8, 32>"), + ] + ) return base_configs @@ -278,8 +280,7 @@ def parse_args(): 代码参数解析 """ parser = argparse.ArgumentParser( - description= - "The argument for generating the generic_mixed_gemm_kernelLauncher instance." + description="The argument for generating the generic_mixed_gemm_kernelLauncher instance." ) parser.add_argument( "--cuda_arch", @@ -370,13 +371,10 @@ def generate_launch_gemm_cus( - dict (code_map) - 包含每个Gemm配置对应的源代码的字典,格式为{"gemm_config": source_code}。 """ code_map = {} - head_path = os.path.join(generate_dir, - "launch_visitor_gemm_fused_kernel.h") + head_path = os.path.join(generate_dir, "launch_visitor_gemm_fused_kernel.h") head_all_code = LaunchGemmHead for tile in tiles: - blocks, warps, mmas = [ - s.replace(" ", "").strip("<>").split(",") for s in tile - ] + blocks, warps, mmas = [s.replace(" ", "").strip("<>").split(",") for s in tile] gemm_config = f"block{blocks[0]}x{blocks[1]}x{blocks[2]}_warp{warps[0]}x{warps[1]}x{warps[2]}_mma{mmas[0]}x{mmas[1]}x{mmas[2]}" for stage in stages: gemm_config_str = gemm_config + f"_stage{stage}" @@ -390,9 +388,7 @@ def generate_launch_gemm_cus( f.close() for tile in tiles: - blocks, warps, mmas = [ - s.replace(" ", "").strip("<>").split(",") for s in tile - ] + blocks, warps, mmas = [s.replace(" ", "").strip("<>").split(",") for s in tile] gemm_config = f"block{blocks[0]}x{blocks[1]}x{blocks[2]}_warp{warps[0]}x{warps[1]}x{warps[2]}_mma{mmas[0]}x{mmas[1]}x{mmas[2]}" for stage in stages: gemm_config_str = gemm_config + f"_stage{stage}" @@ -415,14 +411,14 @@ def generate_launch_gemm_cus( "num_stages": str(stage), "SM": sm, } - source_all_code += SubstituteTemplate( - LaunchGemmPart1, value_dict) + source_all_code += SubstituteTemplate(LaunchGemmPart1, value_dict) type_id += 1 source_all_code += LaunchGemmPart2 code_map[gemm_config_str] = source_all_code source_path = os.path.join( generate_dir, - f"launch_visitor_gemm_fused_kernel_{gemm_config_str}.cu") + f"launch_visitor_gemm_fused_kernel_{gemm_config_str}.cu", + ) with open(source_path, "w") as f: f.write(source_all_code) f.close() @@ -485,9 +481,7 @@ def generate_dispatch_gemm_cu( all_code += code_part4 tile_id = 0 for tile in tiles: - blocks, warps, mmas = [ - s.replace(" ", "").strip("<>").split(",") for s in tile - ] + blocks, warps, mmas = [s.replace(" ", "").strip("<>").split(",") for s in tile] gemm_config = f"block{blocks[0]}x{blocks[1]}x{blocks[2]}_warp{warps[0]}x{warps[1]}x{warps[2]}_mma{mmas[0]}x{mmas[1]}x{mmas[2]}" for stage in stages: gemm_config_str = gemm_config + f"_stage{stage}" @@ -512,10 +506,11 @@ if __name__ == "__main__": for sm in archs: if sm == "89": - fuse_gemm_configs = get_candidate_configs(sm, min_stages, - max_stages) + fuse_gemm_configs = get_candidate_configs(sm, 
min_stages, max_stages) for fuse_gemm_config in fuse_gemm_configs: - file_name = f"gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen/generic_visitor_gemm_fused_kernel_sm{sm}.cu" + file_name = ( + f"gpu_ops/cutlass_kernels/fp8_gemm_fused/autogen/generic_visitor_gemm_fused_kernel_sm{sm}.cu" + ) all_code = generate_source_cu( inputs_type, outputs_type, @@ -544,9 +539,7 @@ if __name__ == "__main__": sm_dict[sm], ) - file_name = ( - "gpu_ops/cutlass_kernels/fp8_gemm_fused/visitor_fp8_gemm_fused.cu" - ) + file_name = "gpu_ops/cutlass_kernels/fp8_gemm_fused/visitor_fp8_gemm_fused.cu" all_code = generate_dispatch_gemm_cu( inputs_type, outputs_type, diff --git a/custom_ops/xpu_ops/src/ops/block_attn.cc b/custom_ops/xpu_ops/src/ops/block_attn.cc index c136851f4..04eb0c568 100644 --- a/custom_ops/xpu_ops/src/ops/block_attn.cc +++ b/custom_ops/xpu_ops/src/ops/block_attn.cc @@ -113,7 +113,7 @@ std::vector BlockAttnKernel( vsl.kv_lod_vp = { const_cast(encoder_seq_lod_cpu.data()), enc_batch + 1, nullptr}; - + baidu::xpu::api::VectorParam prefix_lens_vp{ nullptr, 0, diff --git a/custom_ops/xpu_ops/src/setup_ops.py b/custom_ops/xpu_ops/src/setup_ops.py index 4b2bc19f4..c819cf9d9 100755 --- a/custom_ops/xpu_ops/src/setup_ops.py +++ b/custom_ops/xpu_ops/src/setup_ops.py @@ -30,8 +30,7 @@ current_file = Path(__file__).resolve() base_dir = current_file.parent -def build_plugin(CLANG_PATH, XRE_INC_DIR, XRE_LIB_DIR, XDNN_INC_DIR, - XDNN_LIB_DIR): +def build_plugin(CLANG_PATH, XRE_INC_DIR, XRE_LIB_DIR, XDNN_INC_DIR, XDNN_LIB_DIR): """ build xpu plugin """ @@ -49,7 +48,10 @@ def build_plugin(CLANG_PATH, XRE_INC_DIR, XRE_LIB_DIR, XDNN_INC_DIR, # 删除指定目录 dirs_to_remove = [ - "dist", "fastdeploy_ops.egg-info", "build", "plugin/build" + "dist", + "fastdeploy_ops.egg-info", + "build", + "plugin/build", ] for dir_name in dirs_to_remove: if os.path.exists(dir_name): @@ -58,8 +60,7 @@ def build_plugin(CLANG_PATH, XRE_INC_DIR, XRE_LIB_DIR, XDNN_INC_DIR, # 在 plugin 目录中执行构建脚本 plugin_dir = "plugin" - build_script = os.path.join(current_working_directory, plugin_dir, - "build.sh") + build_script = os.path.join(current_working_directory, plugin_dir, "build.sh") print("build_script: ", build_script) @@ -74,14 +75,16 @@ def build_plugin(CLANG_PATH, XRE_INC_DIR, XRE_LIB_DIR, XDNN_INC_DIR, # 执行构建脚本 try: print("Running build script...") - subprocess.run([build_script], - check=True, - cwd=os.path.join(current_working_directory, plugin_dir)) + subprocess.run( + [build_script], + check=True, + cwd=os.path.join(current_working_directory, plugin_dir), + ) print("Build completed successfully.") except subprocess.CalledProcessError as e: print(f"Build failed with error: {e}") except Exception as e: - print(f"Unexpected error: {str(e)}") + print(f"Unexpected error: {e!s}") def xpu_setup_ops(): @@ -124,17 +127,14 @@ def xpu_setup_ops(): XVLLM_PATH = os.getenv("XVLLM_PATH") assert XVLLM_PATH is not None, "XVLLM_PATH is not set." 
XVLLM_KERNEL_INC_PATH = os.path.join(XVLLM_PATH, "infer_ops", "include") - XVLLM_KERNEL_LIB_PATH = os.path.join(XVLLM_PATH, "infer_ops", "so", - "libapiinfer.so") + XVLLM_KERNEL_LIB_PATH = os.path.join(XVLLM_PATH, "infer_ops", "so", "libapiinfer.so") XVLLM_KERNEL_LIB_DIR = os.path.join(XVLLM_PATH, "infer_ops", "so") XVLLM_OP_INC_PATH = os.path.join(XVLLM_PATH, "xft_blocks", "include") - XVLLM_OP_LIB_PATH = os.path.join(XVLLM_PATH, "xft_blocks", "so", - "libxft_blocks.so") + XVLLM_OP_LIB_PATH = os.path.join(XVLLM_PATH, "xft_blocks", "so", "libxft_blocks.so") XVLLM_OP_LIB_DIR = os.path.join(XVLLM_PATH, "xft_blocks", "so") # build plugin - build_plugin(CLANG_PATH, XRE_INC_PATH, XRE_LIB_DIR, XDNN_INC_PATH, - XDNN_LIB_DIR) + build_plugin(CLANG_PATH, XRE_INC_PATH, XRE_LIB_DIR, XDNN_INC_PATH, XDNN_LIB_DIR) ops = [ # custom ops @@ -152,7 +152,6 @@ def xpu_setup_ops(): "./ops/block_attn.cc", "./ops/moe_layer.cc", "./ops/weight_quantize_xpu.cc", - # device manage ops "./ops/device/get_context_gm_max_mem_demand.cc", "./ops/device/get_free_global_memory.cc", diff --git a/custom_ops/xpu_ops/test/python/ops/test_get_padding_offset.py b/custom_ops/xpu_ops/test/python/ops/test_get_padding_offset.py index 35e38e478..441912a6d 100644 --- a/custom_ops/xpu_ops/test/python/ops/test_get_padding_offset.py +++ b/custom_ops/xpu_ops/test/python/ops/test_get_padding_offset.py @@ -29,7 +29,7 @@ for i in range(bs): ids_len = seq_lens[i, 0] input_ids[i, 0:ids_len] = np.random.randint(1, 10, seq_lens[i, 0], "int64") -x_remove_padding, cum_offsets_out, padding_offset, cu_seqlens_q, cu_seqlens_k = get_padding_offset( +(x_remove_padding, cum_offsets_out, padding_offset, cu_seqlens_q, cu_seqlens_k,) = get_padding_offset( paddle.to_tensor(input_ids), paddle.to_tensor(cum_offset), paddle.to_tensor(token_num), @@ -46,19 +46,14 @@ print("padding_offset:\n", padding_offset) print("cu_seqlens_q:\n", cu_seqlens_q) print("cu_seqlens_k:\n", cu_seqlens_k) -ref_x_remove_padding = np.array([8, 7, 8, 2, 4, 5, 5, 7, 6, 1, 7, 2, 6], - "int64") +ref_x_remove_padding = np.array([8, 7, 8, 2, 4, 5, 5, 7, 6, 1, 7, 2, 6], "int64") ref_cum_offsets_out = np.array([0, 6, 13], "int32") -ref_padding_offset = np.array([0, 0, 0, 0, 6, 6, 6, 13, 13, 13, 13, 13, 13], - "int32") +ref_padding_offset = np.array([0, 0, 0, 0, 6, 6, 6, 13, 13, 13, 13, 13, 13], "int32") ref_cu_seqlens_q = np.array([0, 4, 7, 13], "int32") ref_cu_seqlens_k = np.array([0, 4, 7, 13], "int32") -assert sum(ref_x_remove_padding - - x_remove_padding) == 0, 'Check x_remove_padding failed.' -assert sum(ref_cum_offsets_out - - cum_offsets_out) == 0, 'Check cum_offsets_out failed.' -assert sum(ref_padding_offset - - padding_offset) == 0, 'Check padding_offset failed.' -assert sum(ref_cu_seqlens_q - cu_seqlens_q) == 0, 'Check cu_seqlens_q failed.' -assert sum(ref_cu_seqlens_k - cu_seqlens_k) == 0, 'Check cu_seqlens_k failed.' +assert sum(ref_x_remove_padding - x_remove_padding) == 0, "Check x_remove_padding failed." +assert sum(ref_cum_offsets_out - cum_offsets_out) == 0, "Check cum_offsets_out failed." +assert sum(ref_padding_offset - padding_offset) == 0, "Check padding_offset failed." +assert sum(ref_cu_seqlens_q - cu_seqlens_q) == 0, "Check cu_seqlens_q failed." +assert sum(ref_cu_seqlens_k - cu_seqlens_k) == 0, "Check cu_seqlens_k failed." 
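As a side note on the `test_get_padding_offset.py` hunk above: the expected `cum_offsets_out`, `padding_offset`, and `cu_seqlens_*` arrays can be reproduced with a few lines of NumPy. The sketch below is only an inference from those reference values (assuming a padded length of 10 and per-request lengths `[4, 3, 6]`, which is what `ref_cu_seqlens_q = [0, 4, 7, 13]` implies); it is not the XPU kernel itself.

```python
import numpy as np

seq_lens = np.array([4, 3, 6], dtype="int32")   # inferred from ref_cu_seqlens_q = [0, 4, 7, 13]
max_len = 10                                    # assumed padded sequence length

# exclusive prefix sum of the padding removed per request
cum_offsets_out = np.concatenate([[0], np.cumsum(max_len - seq_lens)[:-1]]).astype("int32")
# each unpadded token inherits the cumulative offset of its request
padding_offset = np.concatenate(
    [np.full(n, off, dtype="int32") for n, off in zip(seq_lens, cum_offsets_out)]
)
# cumulative sequence lengths with a leading zero
cu_seqlens = np.concatenate([[0], np.cumsum(seq_lens)]).astype("int32")

print(cum_offsets_out)  # [ 0  6 13]
print(padding_offset)   # [ 0  0  0  0  6  6  6 13 13 13 13 13 13]
print(cu_seqlens)       # [ 0  4  7 13]
```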
diff --git a/custom_ops/xpu_ops/test/python/ops/test_get_token_penalty_multi_scores.py b/custom_ops/xpu_ops/test/python/ops/test_get_token_penalty_multi_scores.py index 5bce2d352..39a05b5aa 100644 --- a/custom_ops/xpu_ops/test/python/ops/test_get_token_penalty_multi_scores.py +++ b/custom_ops/xpu_ops/test/python/ops/test_get_token_penalty_multi_scores.py @@ -21,10 +21,15 @@ paddle.seed(2023) pre_ids = paddle.to_tensor( [[1, 9, 3, 4, 5, 6, 7, -1, -1, -1], [1, 9, 7, 6, 5, 4, -1, -1, -1, -1]], - "int64") -logits = paddle.to_tensor([[0.1, 0.9, 0.3, 0.4, 0.5, 0.6, 0.7, 0.1, 0.1, 0.1], - [0.1, 0.9, 0.7, 0.6, 0.5, 0.4, 0.1, 0.1, 0.1, 0.1]], - "float32") + "int64", +) +logits = paddle.to_tensor( + [ + [0.1, 0.9, 0.3, 0.4, 0.5, 0.6, 0.7, 0.1, 0.1, 0.1], + [0.1, 0.9, 0.7, 0.6, 0.5, 0.4, 0.1, 0.1, 0.1, 0.1], + ], + "float32", +) penalty_scores = paddle.to_tensor([1.0, 1.0], "float32") frequency_scores = paddle.to_tensor([0.1, 0.1], "float32") presence_scores = paddle.to_tensor([0.0, 0.0], "float32") @@ -88,78 +93,536 @@ ref_logits = np.array( ) diff_logits = np.sum(np.abs(ref_logits - logits.numpy())) print("diff_logits\n", diff_logits) -assert diff_logits < 1e-6, 'Check failed.' +assert diff_logits < 1e-6, "Check failed." pre_ids = paddle.to_tensor( - [[ - 2, 3, 3, 5, 8, 9, 3, 9, 1, 8, 9, 2, 3, 8, 8, 9, 9, 1, 4, 2, 6, 2, 6, 8, - 7, 2, 2, 3, 8, 1, 5, 7, 9, 2, 2, 9, 1, 4, 9, 8, 5, 8, 5, 7, 3, 6, 4, 4, - 9, 9, 8, 5, 5, 2, 2, 9, 4, 8, 1, 9, 6, 9, 2, 2, 7, 2, 2, 9, 4, 6, 4, 6, - 1, 4, 1, 9, 1, 8, 8, 5, 7, 9, 4, 2, 5, 1, 1, 4, 1, 5, 5, 4, 4, 2, 1, 8, - 7, 1, 2, 9, 6, 7, 9, 6, 7, 7, 4, 9, 9, 7, 5, 1, 8, 9, 8, 8, 5, 4, 6, 4, - 7, 5, 5, 7, 6, 9, 3, 9 - ], - [ - 7, 8, 1, 3, 1, 7, 6, 3, 5, 3, 8, 3, 1, 9, 7, 1, 1, 9, 5, 4, 9, 6, 1, - 9, 3, 8, 3, 9, 9, 6, 4, 2, 8, 5, 3, 1, 6, 9, 1, 3, 9, 8, 1, 7, 5, 1, - 5, 1, 8, 7, 4, 5, 9, 8, 7, 4, 7, 3, 6, 4, 6, 6, 5, 5, 2, 9, 9, 5, 8, - 8, 4, 8, 2, 8, 1, 3, 9, 1, 8, 5, 8, 3, 8, 8, 2, 7, 3, 7, 5, 7, 2, 6, - 3, 5, 1, 4, 6, 1, 9, 8, 2, 2, 3, 6, 7, 6, 2, 6, 5, 1, 5, 6, 2, 1, 6, - 4, 7, 7, 3, 8, 5, 1, 9, 1, 2, 8, 6, 8 - ]]) + [ + [ + 2, + 3, + 3, + 5, + 8, + 9, + 3, + 9, + 1, + 8, + 9, + 2, + 3, + 8, + 8, + 9, + 9, + 1, + 4, + 2, + 6, + 2, + 6, + 8, + 7, + 2, + 2, + 3, + 8, + 1, + 5, + 7, + 9, + 2, + 2, + 9, + 1, + 4, + 9, + 8, + 5, + 8, + 5, + 7, + 3, + 6, + 4, + 4, + 9, + 9, + 8, + 5, + 5, + 2, + 2, + 9, + 4, + 8, + 1, + 9, + 6, + 9, + 2, + 2, + 7, + 2, + 2, + 9, + 4, + 6, + 4, + 6, + 1, + 4, + 1, + 9, + 1, + 8, + 8, + 5, + 7, + 9, + 4, + 2, + 5, + 1, + 1, + 4, + 1, + 5, + 5, + 4, + 4, + 2, + 1, + 8, + 7, + 1, + 2, + 9, + 6, + 7, + 9, + 6, + 7, + 7, + 4, + 9, + 9, + 7, + 5, + 1, + 8, + 9, + 8, + 8, + 5, + 4, + 6, + 4, + 7, + 5, + 5, + 7, + 6, + 9, + 3, + 9, + ], + [ + 7, + 8, + 1, + 3, + 1, + 7, + 6, + 3, + 5, + 3, + 8, + 3, + 1, + 9, + 7, + 1, + 1, + 9, + 5, + 4, + 9, + 6, + 1, + 9, + 3, + 8, + 3, + 9, + 9, + 6, + 4, + 2, + 8, + 5, + 3, + 1, + 6, + 9, + 1, + 3, + 9, + 8, + 1, + 7, + 5, + 1, + 5, + 1, + 8, + 7, + 4, + 5, + 9, + 8, + 7, + 4, + 7, + 3, + 6, + 4, + 6, + 6, + 5, + 5, + 2, + 9, + 9, + 5, + 8, + 8, + 4, + 8, + 2, + 8, + 1, + 3, + 9, + 1, + 8, + 5, + 8, + 3, + 8, + 8, + 2, + 7, + 3, + 7, + 5, + 7, + 2, + 6, + 3, + 5, + 1, + 4, + 6, + 1, + 9, + 8, + 2, + 2, + 3, + 6, + 7, + 6, + 2, + 6, + 5, + 1, + 5, + 6, + 2, + 1, + 6, + 4, + 7, + 7, + 3, + 8, + 5, + 1, + 9, + 1, + 2, + 8, + 6, + 8, + ], + ] +) logits = paddle.to_tensor( - [[ - 0.16274983, 0.61470598, 0.94366980, 0.82005417, 0.50752640, 0.38316748, - 0.92648441, 0.24050158, 0.05461595, 0.42218581, 0.36270225, 
0.15464807, - 0.13614719, 0.67509544, 0.40315166, 0.10671722, 0.24832056, 0.76091218, - 0.11598995, 0.10962527, 0.04688513, 0.81536716, 0.72259802, 0.60476679, - 0.16701800, 0.84160781, 0.79649884, 0.78021604, 0.75329530, 0.98587888, - 0.13421868, 0.16027625, 0.15269397, 0.06228730, 0.73856270, 0.34721911, - 0.73683006, 0.78178608, 0.32068327, 0.79906309, 0.44214272, 0.63330448, - 0.08016958, 0.63367140, 0.19788943, 0.55346787, 0.11142531, 0.90518415, - 0.21236691, 0.81587470, 0.83752930, 0.70979482, 0.35684183, 0.28715104, - 0.87162822, 0.17679396, 0.98725849, 0.76129991, 0.04090235, 0.37181064, - 0.63317049, 0.24689502, 0.21126501, 0.57617670, 0.74346697, 0.40613672, - 0.56907010, 0.68556929, 0.29032683, 0.17866278, 0.35165095, 0.97015840, - 0.70785582, 0.54259878, 0.14712237, 0.90483177, 0.02094105, 0.36411613, - 0.02495066, 0.88874054, 0.88895452, 0.86216462, 0.58062190, 0.95583254, - 0.20553111, 0.29870346, 0.69652933, 0.36861244, 0.85316223, 0.50240189, - 0.17566244, 0.61080140, 0.88203174, 0.98675215, 0.24344546, 0.17213407, - 0.78160852, 0.25165486, 0.48188508, 0.82812423, 0.10199814, 0.90475923, - 0.66907483, 0.71910626, 0.40660757, 0.59460294, 0.70212913, 0.90841550, - 0.00329034, 0.11290466, 0.89654654, 0.69114941, 0.29473618, 0.62027222, - 0.37333879, 0.98911142, 0.46510187, 0.65914583, 0.73022646, 0.12790845, - 0.12817244, 0.43015456, 0.75011456, 0.43562204, 0.48086026, 0.75587070, - 0.98481447, 0.77367836 - ], - [ - 0.12336024, 0.74152875, 0.09191196, 0.99301219, 0.44764417, - 0.01848883, 0.78326035, 0.99228370, 0.81447607, 0.02627683, - 0.51033205, 0.98703283, 0.15247856, 0.77640921, 0.60799915, - 0.87518770, 0.76818430, 0.86542630, 0.31795895, 0.04829503, - 0.85567141, 0.30271924, 0.67515039, 0.59728831, 0.78710967, - 0.75111693, 0.56837374, 0.49085775, 0.91510201, 0.59545547, - 0.99482232, 0.59036905, 0.58267909, 0.28770933, 0.53237396, - 0.95318258, 0.93987304, 0.61142951, 0.26737869, 0.52285451, - 0.03479086, 0.61631846, 0.66777998, 0.15736090, 0.00447258, - 0.37035006, 0.15281211, 0.95372260, 0.25963321, 0.61036694, - 0.15020694, 0.19171195, 0.55252832, 0.00391038, 0.31052542, - 0.96495175, 0.42586124, 0.05630261, 0.99728668, 0.01856293, - 0.83201504, 0.10701843, 0.56434178, 0.38009524, 0.51095045, - 0.13202040, 0.07133843, 0.75313550, 0.17111187, 0.80716974, - 0.00172165, 0.83906764, 0.73240769, 0.85843354, 0.11042888, - 0.07912333, 0.33689004, 0.22334915, 0.59059596, 0.52789515, - 0.29831955, 0.39515004, 0.55602801, 0.83818001, 0.05865780, - 0.25654668, 0.76624149, 0.35190639, 0.04158346, 0.59157544, - 0.30779791, 0.94609004, 0.10759670, 0.65575141, 0.37828529, - 0.29571742, 0.76361233, 0.72476572, 0.18568406, 0.85430276, - 0.02057583, 0.76195669, 0.65507215, 0.69129735, 0.25084621, - 0.75223947, 0.06064088, 0.20287007, 0.35887691, 0.75043523, - 0.47575447, 0.40021798, 0.44464844, 0.67975360, 0.40443239, - 0.71052992, 0.21782248, 0.50568426, 0.89037591, 0.06661721, - 0.28788096, 0.70773387, 0.42428264, 0.80419677, 0.42710736, - 0.87317258, 0.88229448, 0.79217333 - ]]) + [ + [ + 0.16274983, + 0.61470598, + 0.94366980, + 0.82005417, + 0.50752640, + 0.38316748, + 0.92648441, + 0.24050158, + 0.05461595, + 0.42218581, + 0.36270225, + 0.15464807, + 0.13614719, + 0.67509544, + 0.40315166, + 0.10671722, + 0.24832056, + 0.76091218, + 0.11598995, + 0.10962527, + 0.04688513, + 0.81536716, + 0.72259802, + 0.60476679, + 0.16701800, + 0.84160781, + 0.79649884, + 0.78021604, + 0.75329530, + 0.98587888, + 0.13421868, + 0.16027625, + 0.15269397, + 0.06228730, + 0.73856270, + 
0.34721911, + 0.73683006, + 0.78178608, + 0.32068327, + 0.79906309, + 0.44214272, + 0.63330448, + 0.08016958, + 0.63367140, + 0.19788943, + 0.55346787, + 0.11142531, + 0.90518415, + 0.21236691, + 0.81587470, + 0.83752930, + 0.70979482, + 0.35684183, + 0.28715104, + 0.87162822, + 0.17679396, + 0.98725849, + 0.76129991, + 0.04090235, + 0.37181064, + 0.63317049, + 0.24689502, + 0.21126501, + 0.57617670, + 0.74346697, + 0.40613672, + 0.56907010, + 0.68556929, + 0.29032683, + 0.17866278, + 0.35165095, + 0.97015840, + 0.70785582, + 0.54259878, + 0.14712237, + 0.90483177, + 0.02094105, + 0.36411613, + 0.02495066, + 0.88874054, + 0.88895452, + 0.86216462, + 0.58062190, + 0.95583254, + 0.20553111, + 0.29870346, + 0.69652933, + 0.36861244, + 0.85316223, + 0.50240189, + 0.17566244, + 0.61080140, + 0.88203174, + 0.98675215, + 0.24344546, + 0.17213407, + 0.78160852, + 0.25165486, + 0.48188508, + 0.82812423, + 0.10199814, + 0.90475923, + 0.66907483, + 0.71910626, + 0.40660757, + 0.59460294, + 0.70212913, + 0.90841550, + 0.00329034, + 0.11290466, + 0.89654654, + 0.69114941, + 0.29473618, + 0.62027222, + 0.37333879, + 0.98911142, + 0.46510187, + 0.65914583, + 0.73022646, + 0.12790845, + 0.12817244, + 0.43015456, + 0.75011456, + 0.43562204, + 0.48086026, + 0.75587070, + 0.98481447, + 0.77367836, + ], + [ + 0.12336024, + 0.74152875, + 0.09191196, + 0.99301219, + 0.44764417, + 0.01848883, + 0.78326035, + 0.99228370, + 0.81447607, + 0.02627683, + 0.51033205, + 0.98703283, + 0.15247856, + 0.77640921, + 0.60799915, + 0.87518770, + 0.76818430, + 0.86542630, + 0.31795895, + 0.04829503, + 0.85567141, + 0.30271924, + 0.67515039, + 0.59728831, + 0.78710967, + 0.75111693, + 0.56837374, + 0.49085775, + 0.91510201, + 0.59545547, + 0.99482232, + 0.59036905, + 0.58267909, + 0.28770933, + 0.53237396, + 0.95318258, + 0.93987304, + 0.61142951, + 0.26737869, + 0.52285451, + 0.03479086, + 0.61631846, + 0.66777998, + 0.15736090, + 0.00447258, + 0.37035006, + 0.15281211, + 0.95372260, + 0.25963321, + 0.61036694, + 0.15020694, + 0.19171195, + 0.55252832, + 0.00391038, + 0.31052542, + 0.96495175, + 0.42586124, + 0.05630261, + 0.99728668, + 0.01856293, + 0.83201504, + 0.10701843, + 0.56434178, + 0.38009524, + 0.51095045, + 0.13202040, + 0.07133843, + 0.75313550, + 0.17111187, + 0.80716974, + 0.00172165, + 0.83906764, + 0.73240769, + 0.85843354, + 0.11042888, + 0.07912333, + 0.33689004, + 0.22334915, + 0.59059596, + 0.52789515, + 0.29831955, + 0.39515004, + 0.55602801, + 0.83818001, + 0.05865780, + 0.25654668, + 0.76624149, + 0.35190639, + 0.04158346, + 0.59157544, + 0.30779791, + 0.94609004, + 0.10759670, + 0.65575141, + 0.37828529, + 0.29571742, + 0.76361233, + 0.72476572, + 0.18568406, + 0.85430276, + 0.02057583, + 0.76195669, + 0.65507215, + 0.69129735, + 0.25084621, + 0.75223947, + 0.06064088, + 0.20287007, + 0.35887691, + 0.75043523, + 0.47575447, + 0.40021798, + 0.44464844, + 0.67975360, + 0.40443239, + 0.71052992, + 0.21782248, + 0.50568426, + 0.89037591, + 0.06661721, + 0.28788096, + 0.70773387, + 0.42428264, + 0.80419677, + 0.42710736, + 0.87317258, + 0.88229448, + 0.79217333, + ], + ] +) # pre_ids = paddle.to_tensor(np.float32(np.random.random([2, 1024]))) # logits = paddle.to_tensor(np.float32(np.random.random([2, 1024]))) penalty_scores = paddle.to_tensor([1.0, 1.0], "float32") @@ -195,60 +658,270 @@ print("min_len\n", min_len) print("eos_token_id\n", eos_token_id) ref_logits = np.array( - [[ - -10000000000., -10000000000., 1.88733959, 1.64010835, 1.01505280, - 0.76633495, 1.85296881, 0.48100317, 0.10923190, 
0.84437162, 0.72540450, - 0.30929613, 0.27229437, 1.35019088, 0.80630332, 0.21343444, 0.49664113, - 1.52182436, 0.23197991, 0.21925054, 0.09377026, 1.63073432, 1.44519603, - 1.20953357, 0.33403599, 1.68321562, 1.59299767, 1.56043208, 1.50659060, - 1.97175777, 0.26843736, 0.32055250, 0.30538794, 0.12457460, 1.47712541, - 0.69443822, 1.47366011, 1.56357217, 0.64136654, 1.59812617, 0.88428545, - 1.26660895, 0.16033916, 1.26734281, 0.39577886, 1.10693574, 0.22285062, - 1.81036830, 0.42473382, 1.63174939, 1.67505860, 1.41958964, 0.71368366, - 0.57430208, 1.74325645, 0.35358793, 1.97451699, 1.52259982, 0.08180470, - 0.74362129, 1.26634097, 0.49379003, 0.42253003, 1.15235341, 1.48693395, - 0.81227344, 1.13814020, 1.37113857, 0.58065367, 0.35732555, 0.70330191, - 1.94031680, 1.41571164, 1.08519757, 0.29424474, 1.80966353, 0.04188210, - 0.72823226, 0.04990132, 1.77748108, 1.77790904, 1.72432923, 1.16124380, - 1.91166508, 0.41106221, 0.59740692, 1.39305866, 0.73722488, 1.70632446, - 1.00480378, 0.35132489, 1.22160280, 1.76406348, 1.97350430, 0.48689091, - 0.34426814, 1.56321704, 0.50330973, 0.96377015, 1.65624845, 0.20399629, - 1.80951846, 1.33814967, 1.43821251, 0.81321514, 1.18920588, 1.40425825, - 1.81683099, 0.00658068, 0.22580932, 1.79309309, 1.38229883, 0.58947235, - 1.24054444, 0.74667758, 1.97822285, 0.93020374, 1.31829166, 1.46045291, - 0.25581691, 0.25634488, 0.86030912, 1.50022912, 0.87124407, 0.96172053, - 1.51174140, 1.96962893, 1.54735672 + [ + [ + -10000000000.0, + -10000000000.0, + 1.88733959, + 1.64010835, + 1.01505280, + 0.76633495, + 1.85296881, + 0.48100317, + 0.10923190, + 0.84437162, + 0.72540450, + 0.30929613, + 0.27229437, + 1.35019088, + 0.80630332, + 0.21343444, + 0.49664113, + 1.52182436, + 0.23197991, + 0.21925054, + 0.09377026, + 1.63073432, + 1.44519603, + 1.20953357, + 0.33403599, + 1.68321562, + 1.59299767, + 1.56043208, + 1.50659060, + 1.97175777, + 0.26843736, + 0.32055250, + 0.30538794, + 0.12457460, + 1.47712541, + 0.69443822, + 1.47366011, + 1.56357217, + 0.64136654, + 1.59812617, + 0.88428545, + 1.26660895, + 0.16033916, + 1.26734281, + 0.39577886, + 1.10693574, + 0.22285062, + 1.81036830, + 0.42473382, + 1.63174939, + 1.67505860, + 1.41958964, + 0.71368366, + 0.57430208, + 1.74325645, + 0.35358793, + 1.97451699, + 1.52259982, + 0.08180470, + 0.74362129, + 1.26634097, + 0.49379003, + 0.42253003, + 1.15235341, + 1.48693395, + 0.81227344, + 1.13814020, + 1.37113857, + 0.58065367, + 0.35732555, + 0.70330191, + 1.94031680, + 1.41571164, + 1.08519757, + 0.29424474, + 1.80966353, + 0.04188210, + 0.72823226, + 0.04990132, + 1.77748108, + 1.77790904, + 1.72432923, + 1.16124380, + 1.91166508, + 0.41106221, + 0.59740692, + 1.39305866, + 0.73722488, + 1.70632446, + 1.00480378, + 0.35132489, + 1.22160280, + 1.76406348, + 1.97350430, + 0.48689091, + 0.34426814, + 1.56321704, + 0.50330973, + 0.96377015, + 1.65624845, + 0.20399629, + 1.80951846, + 1.33814967, + 1.43821251, + 0.81321514, + 1.18920588, + 1.40425825, + 1.81683099, + 0.00658068, + 0.22580932, + 1.79309309, + 1.38229883, + 0.58947235, + 1.24054444, + 0.74667758, + 1.97822285, + 0.93020374, + 1.31829166, + 1.46045291, + 0.25581691, + 0.25634488, + 0.86030912, + 1.50022912, + 0.87124407, + 0.96172053, + 1.51174140, + 1.96962893, + 1.54735672, + ], + [ + -10000000000.0, + -10000000000.0, + -40000.0, + 3.97204876, + 1.79057670, + 0.07395532, + 3.13304138, + 3.96913481, + 3.25790429, + -40000.0, + 2.04132819, + 3.94813132, + 0.60991424, + 3.10563684, + 2.43199658, + 3.50075078, + 3.07273722, + 3.46170521, + 1.27183580, 
+ 0.19318011, + 3.42268562, + 1.21087694, + 2.70060158, + 2.38915324, + 3.14843869, + 3.00446773, + 2.27349496, + 1.96343100, + 3.66040802, + 2.38182187, + 3.97928929, + 2.36147618, + 2.33071637, + 1.15083730, + 2.12949586, + 3.81273031, + 3.75949216, + 2.44571805, + 1.06951475, + 2.09141803, + 0.13916343, + 2.46527386, + 2.67111993, + 0.62944359, + 0.01789032, + 1.48140025, + 0.61124843, + 3.81489038, + 1.03853285, + 2.44146776, + 0.60082775, + 0.76684779, + 2.21011329, + 0.01564152, + 1.24210167, + 3.85980701, + 1.70344496, + 0.22521044, + 3.98914671, + 0.07425172, + 3.32806015, + 0.42807373, + 2.25736713, + 1.52038097, + 2.04380178, + 0.52808160, + 0.28535372, + 3.01254201, + 0.68444747, + 3.22867894, + 0.00688660, + 3.35627055, + 2.92963076, + 3.43373418, + 0.44171551, + 0.31649333, + 1.34756017, + 0.89339662, + 2.36238384, + 2.11158061, + 1.19327819, + 1.58060014, + 2.22411203, + 3.35272002, + 0.23463120, + 1.02618670, + 3.06496596, + 1.40762556, + 0.16633384, + 2.36630177, + 1.23119164, + 3.78436017, + 0.43038681, + 2.62300563, + 1.51314116, + 1.18286967, + 3.05444932, + 2.89906287, + 0.74273622, + 3.41721106, + 0.08230332, + 3.04782677, + 2.62028861, + 2.76518941, + 1.00338483, + 3.00895786, + 0.24256352, + 0.81148028, + 1.43550766, + 3.00174093, + 1.90301788, + 1.60087192, + 1.77859378, + 2.71901441, + 1.61772954, + 2.84211969, + 0.87128991, + 2.02273703, + 3.56150365, + 0.26646885, + 1.15152383, + 2.83093548, + 1.69713056, + 3.21678710, + 1.70842946, + 3.49269032, + 3.52917790, + 3.16869330, + ], ], - [ - -10000000000., -10000000000., -40000., 3.97204876, 1.79057670, - 0.07395532, 3.13304138, 3.96913481, 3.25790429, -40000., 2.04132819, - 3.94813132, 0.60991424, 3.10563684, 2.43199658, 3.50075078, - 3.07273722, 3.46170521, 1.27183580, 0.19318011, 3.42268562, - 1.21087694, 2.70060158, 2.38915324, 3.14843869, 3.00446773, - 2.27349496, 1.96343100, 3.66040802, 2.38182187, 3.97928929, - 2.36147618, 2.33071637, 1.15083730, 2.12949586, 3.81273031, - 3.75949216, 2.44571805, 1.06951475, 2.09141803, 0.13916343, - 2.46527386, 2.67111993, 0.62944359, 0.01789032, 1.48140025, - 0.61124843, 3.81489038, 1.03853285, 2.44146776, 0.60082775, - 0.76684779, 2.21011329, 0.01564152, 1.24210167, 3.85980701, - 1.70344496, 0.22521044, 3.98914671, 0.07425172, 3.32806015, - 0.42807373, 2.25736713, 1.52038097, 2.04380178, 0.52808160, - 0.28535372, 3.01254201, 0.68444747, 3.22867894, 0.00688660, - 3.35627055, 2.92963076, 3.43373418, 0.44171551, 0.31649333, - 1.34756017, 0.89339662, 2.36238384, 2.11158061, 1.19327819, - 1.58060014, 2.22411203, 3.35272002, 0.23463120, 1.02618670, - 3.06496596, 1.40762556, 0.16633384, 2.36630177, 1.23119164, - 3.78436017, 0.43038681, 2.62300563, 1.51314116, 1.18286967, - 3.05444932, 2.89906287, 0.74273622, 3.41721106, 0.08230332, - 3.04782677, 2.62028861, 2.76518941, 1.00338483, 3.00895786, - 0.24256352, 0.81148028, 1.43550766, 3.00174093, 1.90301788, - 1.60087192, 1.77859378, 2.71901441, 1.61772954, 2.84211969, - 0.87128991, 2.02273703, 3.56150365, 0.26646885, 1.15152383, - 2.83093548, 1.69713056, 3.21678710, 1.70842946, 3.49269032, - 3.52917790, 3.16869330 - ]], "float32", ) diff_logits = np.sum(np.abs(ref_logits - logits.numpy())) print("diff_logits\n", diff_logits) -assert diff_logits < 1e-6, 'Check failed.' +assert diff_logits < 1e-6, "Check failed." 
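For context on the `test_get_token_penalty_multi_scores.py` hunk above: the op combines several sampling penalties (repetition, frequency, presence) plus min-length masking. The snippet below is a generic, OpenAI-style illustration using a hypothetical `apply_penalties` helper; it is not the formula implemented by the XPU op and is included only to make the test's `penalty_scores` / `frequency_scores` / `presence_scores` inputs easier to read.

```python
import numpy as np

def apply_penalties(logits, generated_ids, repetition=1.0, frequency=0.1, presence=0.0):
    """Hypothetical helper: OpenAI-style penalties on previously generated tokens."""
    logits = logits.astype(np.float32).copy()
    ids, counts = np.unique(generated_ids[generated_ids >= 0], return_counts=True)
    seen = logits[ids]
    # repetition penalty shrinks positive logits and grows negative ones
    logits[ids] = np.where(seen > 0, seen / repetition, seen * repetition)
    # frequency penalty scales with occurrence count, presence penalty is flat
    logits[ids] -= counts * frequency + presence
    return logits

row = apply_penalties(
    np.array([0.1, 0.9, 0.3, 0.4, 0.5, 0.6, 0.7, 0.1, 0.1, 0.1], np.float32),
    np.array([1, 9, 3, 4, 5, 6, 7, -1, -1, -1]),
)
print(row)
```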
diff --git a/custom_ops/xpu_ops/test/python/ops/test_set_value_by_flags_and_idx.py b/custom_ops/xpu_ops/test/python/ops/test_set_value_by_flags_and_idx.py index 70e4901ac..966ec5de2 100644 --- a/custom_ops/xpu_ops/test/python/ops/test_set_value_by_flags_and_idx.py +++ b/custom_ops/xpu_ops/test/python/ops/test_set_value_by_flags_and_idx.py @@ -21,19 +21,30 @@ paddle.seed(2023) pre_ids_all = paddle.to_tensor( [[1, 9, 3, 4, 5, 6, 7, -1, -1, -1], [1, 9, 7, 6, 5, 4, -1, -1, -1, -1]], - "int64") -input_ids = paddle.to_tensor([[1, 9, 3, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1], - [1, 9, 7, 6, 5, 4, -1, -1, -1, -1, -1, -1, -1]], - "int64") + "int64", +) +input_ids = paddle.to_tensor( + [ + [1, 9, 3, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1], + [1, 9, 7, 6, 5, 4, -1, -1, -1, -1, -1, -1, -1], + ], + "int64", +) seq_lens_this_time = paddle.to_tensor([1, 1], "int32") seq_lens_encoder = paddle.to_tensor([1, 1], "int32") seq_lens_decoder = paddle.to_tensor([1, 1], "int32") step_idx = paddle.to_tensor([1, 1], "int64") stop_flags = paddle.to_tensor([0, 1], "bool") print("pre_ids_all\n", pre_ids_all) -set_value_by_flags_and_idx(pre_ids_all, input_ids, seq_lens_this_time, - seq_lens_encoder, seq_lens_decoder, step_idx, - stop_flags) +set_value_by_flags_and_idx( + pre_ids_all, + input_ids, + seq_lens_this_time, + seq_lens_encoder, + seq_lens_decoder, + step_idx, + stop_flags, +) print("pre_ids_all\n", pre_ids_all) print("input_ids\n", input_ids) print("seq_lens_this_time\n", seq_lens_this_time) @@ -73,4 +84,4 @@ ref_pre_ids_all = np.array( ) diff_pre_ids_all = np.sum(np.abs(ref_pre_ids_all - pre_ids_all.numpy())) print("diff_pre_ids_all\n", diff_pre_ids_all) -assert diff_pre_ids_all == 0, 'Check failed.' +assert diff_pre_ids_all == 0, "Check failed." diff --git a/custom_ops/xpu_ops/test/python/ops/test_step.py b/custom_ops/xpu_ops/test/python/ops/test_step.py index 5334c316c..9d9eaf7e4 100644 --- a/custom_ops/xpu_ops/test/python/ops/test_step.py +++ b/custom_ops/xpu_ops/test/python/ops/test_step.py @@ -41,10 +41,7 @@ step_idx = (seq_lens_decoder - ori_seq_lens_encoder).astype("int64") max_block_num = block_bs * max_seq_len // block_size free_list_len = int(max_block_num * (1 - block_ratio)) free_list_len = np.full([1], free_list_len, "int32") -free_list = np.arange(max_block_num - 1, - max_block_num - free_list_len - 1, - -1, - dtype="int32") +free_list = np.arange(max_block_num - 1, max_block_num - free_list_len - 1, -1, dtype="int32") encoder_block_lens = np.zeros([max_bs], "int32") used_list_len = np.zeros([max_bs], "int32") @@ -53,19 +50,15 @@ encoder_block_id = 0 for i in range(bs): enc_block_num = (ori_seq_lens_encoder[i] + block_size - 1) // block_size encoder_block_lens[i] = enc_block_num - dec_block_num = (seq_lens_decoder[i] + block_size - - 1) // block_size - enc_block_num + dec_block_num = (seq_lens_decoder[i] + block_size - 1) // block_size - enc_block_num used_list_len[i] = dec_block_num - block_tables[i, :enc_block_num] = np.arange( - encoder_block_id, encoder_block_id + enc_block_num, 1, "int32") + block_tables[i, :enc_block_num] = np.arange(encoder_block_id, encoder_block_id + enc_block_num, 1, "int32") encoder_block_id += enc_block_num if dec_block_num > 0: - block_tables[ - i, enc_block_num:enc_block_num + - dec_block_num] = free_list[free_list_len[0] - 1 - - dec_block_num:free_list_len[0] - 1] - free_list[free_list_len[0] - 1 - dec_block_num:free_list_len[0] - - 1] = -1 + block_tables[i, enc_block_num : enc_block_num + dec_block_num] = free_list[ + free_list_len[0] - 1 - dec_block_num : 
free_list_len[0] - 1 + ] + free_list[free_list_len[0] - 1 - dec_block_num : free_list_len[0] - 1] = -1 free_list_len[0] -= dec_block_num assert free_list_len[0] >= 0 @@ -137,13 +130,32 @@ first_token_ids = paddle.to_tensor(first_token_ids) # print("step_idx: ", step_idx) # print("next_tokens: ", next_tokens) -step_paddle(stop_flags, seq_lens_this_time, ori_seq_lens_encoder, - seq_lens_encoder, seq_lens_decoder, block_tables, - encoder_block_lens, is_block_step, step_block_list, step_lens, - recover_block_list, recover_lens, need_block_list, need_block_len, - used_list_len, free_list, free_list_len, input_ids, pre_ids, - step_idx, next_tokens, first_token_ids, block_size, - encoder_decoder_block_num) +step_paddle( + stop_flags, + seq_lens_this_time, + ori_seq_lens_encoder, + seq_lens_encoder, + seq_lens_decoder, + block_tables, + encoder_block_lens, + is_block_step, + step_block_list, + step_lens, + recover_block_list, + recover_lens, + need_block_list, + need_block_len, + used_list_len, + free_list, + free_list_len, + input_ids, + pre_ids, + step_idx, + next_tokens, + first_token_ids, + block_size, + encoder_decoder_block_num, +) print("-" * 50 + "after step op" + "-" * 50) print("stop_flags: ", stop_flags) diff --git a/custom_ops/xpu_ops/test/python/ops/test_stop_generation_multi_ends.py b/custom_ops/xpu_ops/test/python/ops/test_stop_generation_multi_ends.py index cbe4c48bf..537e41f5e 100644 --- a/custom_ops/xpu_ops/test/python/ops/test_stop_generation_multi_ends.py +++ b/custom_ops/xpu_ops/test/python/ops/test_stop_generation_multi_ends.py @@ -30,8 +30,7 @@ end_ids = paddle.to_tensor([0, 1, 2, 3, 4, 5], "int64") print("topk_ids\n", topk_ids) print("next_tokens\n", next_tokens) print("stop_flags\n", stop_flags) -set_stop_value_multi_ends(topk_ids, stop_flags, seq_lens, end_ids, next_tokens, - False) +set_stop_value_multi_ends(topk_ids, stop_flags, seq_lens, end_ids, next_tokens, False) print("topk_ids\n", topk_ids) print("next_tokens\n", next_tokens) print("stop_flags\n", stop_flags) @@ -40,44 +39,220 @@ print("end_ids\n", end_ids) ref_topk_ids = np.array( [ - 0, 0, 2, 3, -1, 0, 0, 0, 0, 9, 10, 0, 12, 0, -1, 15, 16, 0, 18, 19, 20, - 0, 22, 23, 0, 25, 26, 27, -1, 29, 30, 31, 0, 0, 0, -1, -1, 37, 38, 39, - -1, -1, 0, 0, 0, 0, 46, -1, 0, 49, 50, 0, 52, 53, 0, -1, 0, 57, -1, 59, - 60, 0, 0, 63 + 0, + 0, + 2, + 3, + -1, + 0, + 0, + 0, + 0, + 9, + 10, + 0, + 12, + 0, + -1, + 15, + 16, + 0, + 18, + 19, + 20, + 0, + 22, + 23, + 0, + 25, + 26, + 27, + -1, + 29, + 30, + 31, + 0, + 0, + 0, + -1, + -1, + 37, + 38, + 39, + -1, + -1, + 0, + 0, + 0, + 0, + 46, + -1, + 0, + 49, + 50, + 0, + 52, + 53, + 0, + -1, + 0, + 57, + -1, + 59, + 60, + 0, + 0, + 63, ], "int64", ) ref_next_tokens = np.array( [ - 0, 0, 2, 3, 0, 0, 0, 0, 0, 9, 10, 0, 12, 0, 0, 15, 16, 0, 18, 19, 20, - 0, 22, 23, 0, 25, 26, 27, 0, 29, 30, 31, 0, 0, 0, 0, 0, 37, 38, 39, 0, - 0, 0, 0, 0, 0, 46, 0, 0, 49, 50, 0, 52, 53, 0, 0, 0, 57, 0, 59, 60, 0, - 0, 63 + 0, + 0, + 2, + 3, + 0, + 0, + 0, + 0, + 0, + 9, + 10, + 0, + 12, + 0, + 0, + 15, + 16, + 0, + 18, + 19, + 20, + 0, + 22, + 23, + 0, + 25, + 26, + 27, + 0, + 29, + 30, + 31, + 0, + 0, + 0, + 0, + 0, + 37, + 38, + 39, + 0, + 0, + 0, + 0, + 0, + 0, + 46, + 0, + 0, + 49, + 50, + 0, + 52, + 53, + 0, + 0, + 0, + 57, + 0, + 59, + 60, + 0, + 0, + 63, ], "int64", ) ref_stop_flags = np.array( [ - True, True, True, True, True, True, True, True, True, False, False, - True, False, True, True, False, False, True, False, False, False, True, - False, False, True, False, False, False, True, False, 
False, False, - True, True, True, True, True, False, False, False, True, True, True, - True, True, True, False, True, True, False, False, True, False, False, - True, True, True, False, True, False, False, True, True, False + True, + True, + True, + True, + True, + True, + True, + True, + True, + False, + False, + True, + False, + True, + True, + False, + False, + True, + False, + False, + False, + True, + False, + False, + True, + False, + False, + False, + True, + False, + False, + False, + True, + True, + True, + True, + True, + False, + False, + False, + True, + True, + True, + True, + True, + True, + False, + True, + True, + False, + False, + True, + False, + False, + True, + True, + True, + False, + True, + False, + False, + True, + True, + False, ], "bool", ) diff_topk_ids = np.sum(np.abs(ref_topk_ids - topk_ids.numpy())) print("diff_topk_ids\n", diff_topk_ids) -assert diff_topk_ids == 0, 'Check failed.' +assert diff_topk_ids == 0, "Check failed." diff_next_tokens = np.sum(np.abs(ref_next_tokens - next_tokens.numpy())) print("diff_next_tokens\n", diff_next_tokens) -assert diff_next_tokens == 0, 'Check failed.' -diff_stop_flags = np.sum( - np.abs( - ref_stop_flags.astype(np.int32) - stop_flags.numpy().astype(np.int32))) +assert diff_next_tokens == 0, "Check failed." +diff_stop_flags = np.sum(np.abs(ref_stop_flags.astype(np.int32) - stop_flags.numpy().astype(np.int32))) print("diff_stop_flags\n", diff_stop_flags) -assert diff_stop_flags == 0, 'Check failed.' +assert diff_stop_flags == 0, "Check failed." # test beam_search=True topk_ids = paddle.arange(0, bs, dtype="int64") @@ -88,8 +263,7 @@ end_ids = paddle.to_tensor([0, 1, 2, 3, 4, 5], "int64") print("topk_ids\n", topk_ids) print("next_tokens\n", next_tokens) print("stop_flags\n", stop_flags) -set_stop_value_multi_ends(topk_ids, stop_flags, seq_lens, end_ids, next_tokens, - True) +set_stop_value_multi_ends(topk_ids, stop_flags, seq_lens, end_ids, next_tokens, True) print("topk_ids\n", topk_ids) print("next_tokens\n", next_tokens) print("stop_flags\n", stop_flags) @@ -98,42 +272,217 @@ print("end_ids\n", end_ids) ref_topk_ids = np.array( [ - 0, 1, 2, 3, 4, 0, 6, 7, -1, 9, 10, 0, -1, 13, 14, 15, 0, 17, 18, 19, - 20, 0, 22, 23, 24, 25, -1, -1, 28, 29, 0, 0, -1, 33, 34, 35, 36, 37, 0, - -1, 0, 41, -1, 0, 44, 45, 46, 0, 0, 49, 0, 0, 0, 53, 0, 0, 0, 0, 58, - -1, 60, 61, -1, 63 + 0, + 1, + 2, + 3, + 4, + 0, + 6, + 7, + -1, + 9, + 10, + 0, + -1, + 13, + 14, + 15, + 0, + 17, + 18, + 19, + 20, + 0, + 22, + 23, + 24, + 25, + -1, + -1, + 28, + 29, + 0, + 0, + -1, + 33, + 34, + 35, + 36, + 37, + 0, + -1, + 0, + 41, + -1, + 0, + 44, + 45, + 46, + 0, + 0, + 49, + 0, + 0, + 0, + 53, + 0, + 0, + 0, + 0, + 58, + -1, + 60, + 61, + -1, + 63, ], "int64", ) ref_next_tokens = np.array( [ - 0, 1, 2, 3, 4, 0, 6, 7, 0, 9, 10, 0, 0, 13, 14, 15, 0, 17, 18, 19, 20, - 0, 22, 23, 24, 25, 0, 0, 28, 29, 0, 0, 0, 33, 34, 35, 36, 37, 0, 0, 0, - 41, 0, 0, 44, 45, 46, 0, 0, 49, 0, 0, 0, 53, 0, 0, 0, 0, 58, 0, 60, 61, - 0, 63 + 0, + 1, + 2, + 3, + 4, + 0, + 6, + 7, + 0, + 9, + 10, + 0, + 0, + 13, + 14, + 15, + 0, + 17, + 18, + 19, + 20, + 0, + 22, + 23, + 24, + 25, + 0, + 0, + 28, + 29, + 0, + 0, + 0, + 33, + 34, + 35, + 36, + 37, + 0, + 0, + 0, + 41, + 0, + 0, + 44, + 45, + 46, + 0, + 0, + 49, + 0, + 0, + 0, + 53, + 0, + 0, + 0, + 0, + 58, + 0, + 60, + 61, + 0, + 63, ], "int64", ) ref_stop_flags = np.array( [ - False, False, False, False, False, True, False, False, True, False, - False, True, True, False, False, False, True, False, False, False, - False, True, False, 
False, False, False, True, True, False, False, - True, True, True, False, False, False, False, False, True, True, True, - False, True, True, False, False, False, True, True, False, True, True, - True, False, True, True, True, True, False, True, False, False, True, - False + False, + False, + False, + False, + False, + True, + False, + False, + True, + False, + False, + True, + True, + False, + False, + False, + True, + False, + False, + False, + False, + True, + False, + False, + False, + False, + True, + True, + False, + False, + True, + True, + True, + False, + False, + False, + False, + False, + True, + True, + True, + False, + True, + True, + False, + False, + False, + True, + True, + False, + True, + True, + True, + False, + True, + True, + True, + True, + False, + True, + False, + False, + True, + False, ], "bool", ) diff_topk_ids = np.sum(np.abs(ref_topk_ids - topk_ids.numpy())) print("diff_topk_ids\n", diff_topk_ids) -assert diff_topk_ids == 0, 'Check failed.' +assert diff_topk_ids == 0, "Check failed." diff_next_tokens = np.sum(np.abs(ref_next_tokens - next_tokens.numpy())) print("diff_next_tokens\n", diff_next_tokens) -assert diff_next_tokens == 0, 'Check failed.' -diff_stop_flags = np.sum( - np.abs( - ref_stop_flags.astype(np.int32) - stop_flags.numpy().astype(np.int32))) +assert diff_next_tokens == 0, "Check failed." +diff_stop_flags = np.sum(np.abs(ref_stop_flags.astype(np.int32) - stop_flags.numpy().astype(np.int32))) print("diff_stop_flags\n", diff_stop_flags) -assert diff_stop_flags == 0, 'Check failed.' +assert diff_stop_flags == 0, "Check failed." diff --git a/custom_ops/xpu_ops/test/python/ops/test_update_inputs.py b/custom_ops/xpu_ops/test/python/ops/test_update_inputs.py index d1e8e36dd..037429b22 100644 --- a/custom_ops/xpu_ops/test/python/ops/test_update_inputs.py +++ b/custom_ops/xpu_ops/test/python/ops/test_update_inputs.py @@ -60,9 +60,17 @@ print("stop_nums:\n", stop_nums) print("next_tokens:\n", next_tokens) print("is_block_step:\n", is_block_step) -update_inputs(stop_flags, not_need_stop, seq_lens_this_time, seq_lens_encoder, - seq_lens_decoder, input_ids, stop_nums, next_tokens, - is_block_step) +update_inputs( + stop_flags, + not_need_stop, + seq_lens_this_time, + seq_lens_encoder, + seq_lens_decoder, + input_ids, + stop_nums, + next_tokens, + is_block_step, +) print("-" * 50) print("stop_flags:\n", stop_flags) @@ -75,32 +83,269 @@ print("stop_nums:\n", stop_nums) print("next_tokens:\n", next_tokens) ref_not_need_stop_out = np.array([True]) -ref_seq_lens_this_time_out = np.array([ - 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, - 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1 -], "int32") -ref_seq_lens_encoder_out = np.array([ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -], "int32") -ref_seq_lens_decoder_out = np.array([ - 0, 0, 2, 0, 0, 6, 0, 8, 8, 10, 0, 12, 12, 0, 0, 0, 0, 0, 0, 0, 20, 22, 0, - 24, 24, 0, 26, 28, 0, 0, 0, 32, 32, 0, 34, 0, 0, 38, 0, 40, 0, 0, 42, 0, 0, - 46, 46, 48, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -], "int32") -input_ids_np[:, 0] = np.array([ - 6, 5, 9, 8, 6, 2, 8, 1, 3, 1, 3, 6, 9, 8, 1, 9, 1, 8, 8, 6, 7, 6, 5, 3, 5, - 9, 3, 6, 3, 9, 8, 8, 8, 8, 4, 8, 7, 4, 2, 3, 5, 8, 4, 2, 5, 6, 8, 9, 6, 7, - 4, 2, 4, 6, 2, 3, 4, 9, 7, 2, 1, 8, 7, 8 -], "int64") +ref_seq_lens_this_time_out = np.array( + [ + 0, + 0, + 1, + 0, + 0, + 1, + 0, 
+ 1, + 1, + 1, + 0, + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 1, + 1, + 0, + 0, + 0, + 1, + 1, + 0, + 1, + 0, + 0, + 1, + 0, + 1, + 0, + 0, + 1, + 0, + 0, + 1, + 1, + 1, + ], + "int32", +) +ref_seq_lens_encoder_out = np.array( + [ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ], + "int32", +) +ref_seq_lens_decoder_out = np.array( + [ + 0, + 0, + 2, + 0, + 0, + 6, + 0, + 8, + 8, + 10, + 0, + 12, + 12, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 20, + 22, + 0, + 24, + 24, + 0, + 26, + 28, + 0, + 0, + 0, + 32, + 32, + 0, + 34, + 0, + 0, + 38, + 0, + 40, + 0, + 0, + 42, + 0, + 0, + 46, + 46, + 48, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + ], + "int32", +) +input_ids_np[:, 0] = np.array( + [ + 6, + 5, + 9, + 8, + 6, + 2, + 8, + 1, + 3, + 1, + 3, + 6, + 9, + 8, + 1, + 9, + 1, + 8, + 8, + 6, + 7, + 6, + 5, + 3, + 5, + 9, + 3, + 6, + 3, + 9, + 8, + 8, + 8, + 8, + 4, + 8, + 7, + 4, + 2, + 3, + 5, + 8, + 4, + 2, + 5, + 6, + 8, + 9, + 6, + 7, + 4, + 2, + 4, + 6, + 2, + 3, + 4, + 9, + 7, + 2, + 1, + 8, + 7, + 8, + ], + "int64", +) -assert not_need_stop.numpy( -) == ref_not_need_stop_out, 'Check not_need_stop failed.' -assert np.all(seq_lens_this_time.numpy() == - ref_seq_lens_this_time_out), 'Check seq_lens_this_time failed.' -assert np.all(seq_lens_encoder.numpy() == - ref_seq_lens_encoder_out), 'Check seq_lens_encoder failed.' -assert np.all(seq_lens_decoder.numpy() == - ref_seq_lens_decoder_out), 'Check seq_lens_decoder failed.' -assert np.all(input_ids.numpy() == input_ids_np), 'Check input_ids failed.' +assert not_need_stop.numpy() == ref_not_need_stop_out, "Check not_need_stop failed." +assert np.all(seq_lens_this_time.numpy() == ref_seq_lens_this_time_out), "Check seq_lens_this_time failed." +assert np.all(seq_lens_encoder.numpy() == ref_seq_lens_encoder_out), "Check seq_lens_encoder failed." +assert np.all(seq_lens_decoder.numpy() == ref_seq_lens_decoder_out), "Check seq_lens_decoder failed." +assert np.all(input_ids.numpy() == input_ids_np), "Check input_ids failed." 
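Before the next hunk, which reformats `test_weight_quantize_xpu.py`, a quick illustration of the int4 packing expression that appears there: two signed 4-bit values per byte, with the even column in the low nibble and the odd column in the high nibble. The `unpack_int4` helper is an assumption added here purely for illustration of the layout; it does not exist in the repository.

```python
import numpy as np

def pack_int4(q):
    # q holds signed int4 values in [-7, 7]; pack pairs of columns into one byte,
    # matching (q[:, 1::2] & 0xF) << 4 | (q[:, ::2] & 0xF) from the test above
    q = q.astype(np.int32)
    packed = ((q[:, 1::2] & 0xF) << 4) | (q[:, ::2] & 0xF)
    return packed.astype(np.uint8)

def unpack_int4(packed):
    # assumed inverse, added for illustration only
    packed = packed.astype(np.int32)
    lo = packed & 0xF
    hi = (packed >> 4) & 0xF
    lo = np.where(lo > 7, lo - 16, lo)   # sign-extend 4-bit two's complement
    hi = np.where(hi > 7, hi - 16, hi)
    out = np.empty((packed.shape[0], packed.shape[1] * 2), dtype=np.int8)
    out[:, ::2] = lo
    out[:, 1::2] = hi
    return out

q = np.array([[-7, 3, 0, 7]], dtype=np.int8)
assert np.array_equal(unpack_int4(pack_int4(q)), q)
```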
diff --git a/custom_ops/xpu_ops/test/python/ops/test_weight_quantize_xpu.py b/custom_ops/xpu_ops/test/python/ops/test_weight_quantize_xpu.py index e946d4069..59312c95d 100644 --- a/custom_ops/xpu_ops/test/python/ops/test_weight_quantize_xpu.py +++ b/custom_ops/xpu_ops/test/python/ops/test_weight_quantize_xpu.py @@ -29,16 +29,15 @@ def np_quant_weight_int4(weight_np): weight = np.transpose(weight_np, [1, 0]) # n,k max_value = np.max(np.abs(weight), axis=1).reshape(-1, 1) # k => k,1 quanted_weight = np_clip_and_round(weight / max_value * 7.0, 7) # n,k - quanted_weight = (quanted_weight[:, 1::2] & 0xF) << 4 | ( - quanted_weight[:, ::2] & 0xF) # pack int4, [n,k//2] + quanted_weight = (quanted_weight[:, 1::2] & 0xF) << 4 | (quanted_weight[:, ::2] & 0xF) # pack int4, [n,k//2] weight_scales = (max_value).astype(weight_np.dtype).reshape(-1) return quanted_weight, weight_scales.astype(np.float32) -def np_quant_weight(weight_np, algo='weight_only_int8'): +def np_quant_weight(weight_np, algo="weight_only_int8"): assert weight_np.dtype == np.float32 - if algo == 'weight_only_int4': + if algo == "weight_only_int4": return np_quant_weight_int4(weight_np) weight = np.transpose(weight_np, [1, 0]) @@ -56,7 +55,7 @@ def int8_to_bin_np(value): def int8_to_bin(value): if not -128 <= value <= 127: raise ValueError("int8 值必须在 -128 到 127 之间") - return format(value & 0xFF, '08b') # '08b' 表示 8 位二进制,高位补零 + return format(value & 0xFF, "08b") # '08b' 表示 8 位二进制,高位补零 # 1) preparation @@ -70,7 +69,7 @@ w_np = (np.random.random((k, n)).astype(np.float32) - 0.5) * 10 qw_np, wscale_np = np_quant_weight(w_np, algo) # 3) xpu calculation -dtype = 'float32' +dtype = "float32" x_pd = paddle.to_tensor(w_np, dtype=dtype) qw_pd, wscale_pd = weight_quantize_xpu(x_pd, algo, -1, -1) qw_pd_trans = paddle.transpose(qw_pd, [1, 0]) @@ -83,12 +82,7 @@ qw_pd_trans = paddle.transpose(qw_pd, [1, 0]) # comparation print(f"wscale_pd, mean={wscale_pd.mean()}, std={wscale_pd.std()}") print(f"wscale_np, mean={wscale_np.mean()}, std={wscale_np.std()}") -print( - f"qw_np, mean={qw_np.astype(np.float32).mean()}, std={qw_np.astype(np.float32).std()}" -) -print( - f"qw_pd_trans, mean={qw_pd_trans.astype('float32').mean()}, std={qw_pd_trans.astype('float32').std()}" -) -sum_diff = np.sum( - np.abs(qw_pd_trans.astype("float32").numpy() - qw_np.astype("float32"))) +print(f"qw_np, mean={qw_np.astype(np.float32).mean()}, std={qw_np.astype(np.float32).std()}") +print(f"qw_pd_trans, mean={qw_pd_trans.astype('float32').mean()}, std={qw_pd_trans.astype('float32').std()}") +sum_diff = np.sum(np.abs(qw_pd_trans.astype("float32").numpy() - qw_np.astype("float32"))) print(f"sum_diff: {sum_diff}") diff --git a/docs/benchmark.md b/docs/benchmark.md index 67f2a8c05..46283b627 100644 --- a/docs/benchmark.md +++ b/docs/benchmark.md @@ -37,4 +37,4 @@ python benchmark_serving.py \ --num-prompts 1 \ --max-concurrency 1 \ --save-result -``` \ No newline at end of file +``` diff --git a/docs/features/disaggregated.md b/docs/features/disaggregated.md index 4fddfc84a..e5e20dcae 100644 --- a/docs/features/disaggregated.md +++ b/docs/features/disaggregated.md @@ -15,7 +15,7 @@ We provide two transmission methods for KV Cache, targeting intra-machine and in Uses cudaMemcpyPeer for KV Cache transmission between two GPUs within a single machine, offering low latency and high throughput. ### Inter-machine Transmission -For transmission between multiple machines, uses high-speed RDMA network for KV Cache transmission. 
We provide the `rdma_comm` high-speed transmission network library for cross-machine KV Cache transmission. +For transmission between multiple machines, uses high-speed RDMA network for KV Cache transmission. We provide the `rdma_comm` high-speed transmission network library for cross-machine KV Cache transmission. ## PD Disaggregated Scheduling ![Splitwise Scheduler](./images/disaggregated.png) @@ -60,7 +60,7 @@ python -m fastdeploy.entrypoints.openai.api_server \ --cache-queue-port 8187 \ --tensor-parallel-size 4 \ --quantization wint4 \ - --innode-prefill-ports 8182 \ + --innode-prefill-ports 8182 \ --splitwise-role "decode" ``` @@ -72,7 +72,8 @@ Refer to the example code `offline_disaggregated_demo.py` in the `fastdeploy/dem ### Multi-machine Disaggregated Deployment #### Prerequisite: Redis -- Installation via `conda` +* Installation via `conda` + ```bash # Install conda install redis @@ -80,7 +81,8 @@ conda install redis nohup redis-server > redis.log 2>&1 & ``` -- Installation via `apt` +* Installation via `apt` + ```bash # Install sudo apt install redis-server -y @@ -88,7 +90,8 @@ sudo apt install redis-server -y sudo systemctl start redis-server ``` -- Installation via `yum` +* Installation via `yum` + ```bash # Install sudo yum install redis -y diff --git a/docs/features/load_balance.md b/docs/features/load_balance.md index a022470d1..78f832e7c 100644 --- a/docs/features/load_balance.md +++ b/docs/features/load_balance.md @@ -38,6 +38,7 @@ conda install redis # Launch nohup redis-server > redis.log 2>&1 & ``` + ### apt installation (Debian/Ubuntu) ```bash @@ -57,6 +58,7 @@ sudo systemctl start redis ``` ## Launching FastDeploy + ```bash python -m fastdeploy.entrypoints.openai.api_server \ --port 8801 \ @@ -72,6 +74,7 @@ python -m fastdeploy.entrypoints.openai.api_server \ --scheduler-min-load_score 3 \ --scheduler-load-shards-num 1 ``` + [Scheduler Launching Parameter](../online_serving/scheduler.md) ### Deployment notes: diff --git a/docs/features/prefix_caching.md b/docs/features/prefix_caching.md index 1e2148135..0a58336de 100644 --- a/docs/features/prefix_caching.md +++ b/docs/features/prefix_caching.md @@ -36,4 +36,4 @@ python -m fastdeploy.entrypoints.openai.api_server \ Set `enable_prefix_caching=True` when launching FastDeploy. Enable CPU caching via `swap_space` based on available machine memory. -A test example is provided: `demo/offline_prefix_caching_demo.py` \ No newline at end of file +A test example is provided: `demo/offline_prefix_caching_demo.py` diff --git a/docs/features/reasoning_output.md b/docs/features/reasoning_output.md index 78ea9de2e..5f23e65d5 100644 --- a/docs/features/reasoning_output.md +++ b/docs/features/reasoning_output.md @@ -18,8 +18,9 @@ Interfaces that support toggling the reasoning mode: For reasoning models, the length of the reasoning content can be controlled via `reasoning_max_tokens`. Add `metadata={"reasoning_max_tokens": 1024}` to the request. ### Quick Start -When launching the model service, specify the parser name using the `--reasoning-parser` argument. +When launching the model service, specify the parser name using the `--reasoning-parser` argument. This parser will process the model's output and extract the `reasoning_content` field. 
+ ```bash python -m fastdeploy.entrypoints.openai.api_server \ --model /path/to/your/model \ @@ -29,7 +30,9 @@ python -m fastdeploy.entrypoints.openai.api_server \ --quantization wint4 \ --reasoning-parser ernie-45-vl ``` + Next, make a request to the model that should return the reasoning content in the response. + ```bash curl -X POST "http://0.0.0.0:8192/v1/chat/completions" \ -H "Content-Type: application/json" \ @@ -43,10 +46,12 @@ curl -X POST "http://0.0.0.0:8192/v1/chat/completions" \ "metadata": {"enable_thinking": true} }' ``` + The `reasoning_content` field contains the reasoning steps to reach the final conclusion, while the `content` field holds the conclusion itself. ### Streaming chat completions Streaming chat completions are also supported for reasoning models. The `reasoning_content` field is available in the `delta` field in `chat completion response chunks` + ```python from openai import OpenAI # Set OpenAI's API key and API base to use vLLM's API server. @@ -69,4 +74,4 @@ for chunk in chat_response: if chunk.choices[0].delta is not None: print(chunk.choices[0].delta, end='') print("\n") -``` \ No newline at end of file +``` diff --git a/docs/features/speculative_decoding.md b/docs/features/speculative_decoding.md index 0e6da2283..4093dcca5 100644 --- a/docs/features/speculative_decoding.md +++ b/docs/features/speculative_decoding.md @@ -10,22 +10,22 @@ This project implements an efficient **Speculative Decoding** inference framewor - **Ngram** -- **MTP (Multi-Token Prediction)** - - ✅ Supported: TP Sharding - - ✅ Supported: Shared Prefix - - ✅ Supported: TP Sharding + PD Separation +- **MTP (Multi-Token Prediction)** + - ✅ Supported: TP Sharding + - ✅ Supported: Shared Prefix + - ✅ Supported: TP Sharding + PD Separation - ⏳ Coming Soon: EP + DP + PD Separation - ⏳ Coming Soon: Support Chunk-prefill - - ⏳ Coming Soon: Multi-layer MTP Layer + - ⏳ Coming Soon: Multi-layer MTP Layer --- ### Coming Soon -- Draft Model -- Eagle -- Hydra -- Medusa +- Draft Model +- Eagle +- Hydra +- Medusa - ... 
--- @@ -54,7 +54,7 @@ This project implements an efficient **Speculative Decoding** inference framewor ## 🚀 Using Multi-Token Prediction (MTP) -For detailed theory, refer to: +For detailed theory, refer to: 📄 [DeepSeek-V3 Paper](https://arxiv.org/pdf/2412.19437) ### TP Sharding Mode @@ -147,4 +147,4 @@ python -m fastdeploy.entrypoints.openai.api_server \ --config ${path_to_FastDeploy}benchmarks/yaml/eb45t-32k-wint4-mtp-h100-tp4.yaml \ --speculative-config '{"method": "mtp", "num_speculative_tokens": 1, "model": "${mtp_model_path}"}' -``` \ No newline at end of file +``` diff --git a/docs/get_started/installation/Enflame_gcu.md b/docs/get_started/installation/Enflame_gcu.md index 844c38626..46d7f0d84 100644 --- a/docs/get_started/installation/Enflame_gcu.md +++ b/docs/get_started/installation/Enflame_gcu.md @@ -132,4 +132,3 @@ Upon completion, accuracy results are saved in ```result.jsonl```, e.g.: ```json {"task": "gsm8k", "backend": "paddlepaddle", "num_gpus": 1, "latency": 13446.01, "accuracy": 0.956, "num_requests": 1319, "other": {"num_questions": 1319, "parallel": 8}} ``` - diff --git a/docs/get_started/installation/README.md b/docs/get_started/installation/README.md index 5fb4ab6b9..ba7042e26 100644 --- a/docs/get_started/installation/README.md +++ b/docs/get_started/installation/README.md @@ -6,4 +6,4 @@ FastDeploy currently supports installation on the following hardware platforms: - [Kunlun XPU Installation](kunlunxin_xpu.md) - [Enflame S60 GCU Installation](Enflame_gcu.md) - [Iluvatar GPU Installation](iluvatar_gpu.md) -- [Hygon DCU Installation](hygon_dcu.md) \ No newline at end of file +- [Hygon DCU Installation](hygon_dcu.md) diff --git a/docs/get_started/installation/hygon_dcu.md b/docs/get_started/installation/hygon_dcu.md index e5e3eea67..245ee4457 100644 --- a/docs/get_started/installation/hygon_dcu.md +++ b/docs/get_started/installation/hygon_dcu.md @@ -37,6 +37,7 @@ image.sourcefind.cn:5000/dcu/admin/base/custom:fastdeploy2.0.0-kylinv10-dtk25.04 ``` ## 2. Start service + ```bash export FD_ATTENTION_BACKEND="BLOCK_ATTN" python -m fastdeploy.entrypoints.openai.api_server \ @@ -47,7 +48,7 @@ python -m fastdeploy.entrypoints.openai.api_server \ --gpu-memory-utilization=0.8 ``` -#### Send requests +### Send requests Send requests using either curl or Python @@ -78,4 +79,4 @@ response = client.chat.completions.create( stream=False, ) print(response) -``` \ No newline at end of file +``` diff --git a/docs/get_started/installation/iluvatar_gpu.md b/docs/get_started/installation/iluvatar_gpu.md index 5284d08d5..754cc7c0f 100644 --- a/docs/get_started/installation/iluvatar_gpu.md +++ b/docs/get_started/installation/iluvatar_gpu.md @@ -1,115 +1,120 @@ -# Run ERNIE-4.5-300B-A47B & ERNIE-4.5-21B-A3B model on iluvatar machine -The current version of the software merely serves as a demonstration demo for the Iluvatar CoreX combined with the Fastdeploy inference framework for large models. There may be issues when running the latest ERNIE4.5 model, and we will conduct repairs and performance optimization in the future. Subsequent versions will provide customers with a more stable version. - -## Machine Preparation -First, you need to prepare a machine with the following configurations: - -| CPU | Memory | Card | Hard Disk| -| :---: | :---: | :---: | :---: | -| x86 | 1TB| 8xBI150| 1TB| - -Currently, the entire model needs to be loaded into the host memory, which requires more than 600GB of host memory. This issue will be optimized in subsequent versions. 
- -## Image Preparation -Pull the Docker image - -```bash -docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest -``` - -## Container Preparation -1. Start Container -```bash -docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest -docker exec -it paddle_infer bash -``` -/home/paddle contains the model files, *.whl packages, and scripts. - -2. Install packages - -```bash -pip3 install paddlepaddle==3.1.0a0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/ -pip3 install paddle-iluvatar-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ -pip3 install fastdeploy_iluvatar_gpu -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simplels -``` - -## Prepare the inference demo script - -script list below: - -`run_demo.sh`: -```bash -#!/bin/bash -export PADDLE_XCCL_BACKEND=iluvatar_gpu -export INFERENCE_MSG_QUEUE_ID=232132 -export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1 -export FD_DEBUG=1 -python3 run_demo.py -``` - -`run_demo.py`: - -```python -from fastdeploy import LLM, SamplingParams - -prompts = [ - "Hello, my name is", - "The largest ocean is", -] - -# sampling parameters -sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256) - -# load the model -llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, static_decode_blocks=0, quantization='wint8') - -# Perform batch inference -outputs = llm.generate(prompts, sampling_params) -# Note:Replace `/home/paddle/ernie-4_5-21b-a3b-bf16-paddle` in it with the path to the ERNIE model you have downloaded. - -for output in outputs: - prompt = output.prompt - generated_text = output.outputs.text - print(prompt, generated_text) -``` - -## run demo - -```bash -./run_demo.sh -``` -The following logs will be printed: Loading the model took approximately 74 seconds, and running the demo took approximately 240 seconds. -``` -/usr/local/lib/python3.10/site-packages/paddle/utils/cpp_extension/extension_utils.py:715: UserWarning: No ccache found. Please be aware that recompiling all source files may be required. You can download and install ccache from: https://github.com/ccache/ccache/blob/master/doc/INSTALL.md - warnings.warn(warning_message) -/usr/local/lib/python3.10/site-packages/_distutils_hack/__init__.py:31: UserWarning: Setuptools is replacing distutils. Support for replacing an already imported distutils is deprecated. In the future, this condition will fail. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml - warnings.warn( -[2025-07-02 11:07:42,393] [ INFO] - Loading configuration file /home/paddle/ernie-4_5-21b-a3b-bf16-paddle/generation_config.json -/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:250: UserWarning: using greedy search strategy. However, `temperature` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or unset `temperature`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed. 
- warnings.warn( -/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:255: UserWarning: using greedy search strategy. However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or unset `top_p`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed. - warnings.warn( -INFO 2025-07-02 11:07:43,589 577964 engine.py[line:207] Waitting worker processes ready... -Loading Weights: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:57<00:00, 1.75it/s] -Loading Layers: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:08<00:00, 11.73it/s] -INFO 2025-07-02 11:08:55,261 577964 engine.py[line:277] Worker processes are launched with 73.76574492454529 seconds. -Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [03:59<00:00, 119.96s/it, est. speed input: 0.00 toks/s, output: 0.00 toks/s] -Hello, my name is Christopher. Today, I'm going to teach you how to draw a cute cartoon ghost. Let's get started! - (1) First, draw a big circle for the ghost's head. - (2) Then, add two small circles for the eyes, making sure they're not too big. - (3) Next, draw a wide, open mouth that looks like a big "U". - (4) After that, create the body by drawing a slightly smaller circle below the head. - (5) Now, let's add some arms. Draw two short, curly lines on each side of the body. - (6) Finally, give the ghost a wavy line at the bottom to represent its floating appearance. - -Now, let's break down each step: - -**Step 1: Drawing the Head** -- Start with a big circle to form the head of the ghost. This will be the foundation of your drawing. - -**Step 2: Adding Eyes** -- On the head, place two small circles for the eyes. They should be centered and not too big, to give the ghost a cute and innocent look. - -**Step 3: Drawing the -The largest ocean is the Pacific Ocean, covering an area of approximately ⦠[3], The first scientific expeditions to determine the ocean's depth were the Challenger expedition (1872â1876) and the U.S. Navy Hydrographic Office survey (1877â1879). The oceanic crust is thin and irregular, consisting of upward moving magma from the mantle below, and cooling and solidifying on the surface. The shallowest parts of the ocean are called the continental shelves. Large tides are caused mainly by the alignment of the Sun, Moon, and Earth during new or full moons. The origin of the word "ocean" is not clear. The first global oceanic topography survey was completed by the Challenger expedition (1872â1876). [57] The sound speed in the ocean is primarily a function of water temperature and salinity, and varies with depth. 
The deep-ocean floor is mostly flat and devoid of life, with the exception of seamounts and various underwater volcanic features, including seamounts and hydrothermal vents. [73] Today, the five ocean
-```
+# Run ERNIE-4.5-300B-A47B & ERNIE-4.5-21B-A3B model on Iluvatar machine
+The current software release only serves as a demonstration of running large models with the FastDeploy inference framework on Iluvatar CoreX hardware. Issues may occur when running the latest ERNIE 4.5 models; fixes and performance optimizations will follow, and subsequent releases will provide a more stable version.
+
+## Machine Preparation
+First, prepare a machine with the following configuration:
+
+| CPU | Memory | Card | Hard Disk |
+| :---: | :---: | :---: | :---: |
+| x86 | 1TB | 8xBI150 | 1TB |
+
+Currently, the entire model must be loaded into host memory, which requires more than 600GB of host memory. This will be optimized in subsequent versions.
+
+## Image Preparation
+Pull the Docker image:
+
+```bash
+docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
+```
+
+## Container Preparation
+1. Start Container
+
+```bash
+docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
+docker exec -it paddle_infer bash
+```
+
+/home/paddle contains the model files, *.whl packages, and scripts.
+
+1. Install packages
+
+```bash
+pip3 install paddlepaddle==3.1.0a0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
+pip3 install paddle-iluvatar-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/
+pip3 install fastdeploy_iluvatar_gpu -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simplels
+```
+
+## Prepare the inference demo script
+
+The scripts are listed below:
+
+`run_demo.sh`:
+
+```bash
+#!/bin/bash
+export PADDLE_XCCL_BACKEND=iluvatar_gpu
+export INFERENCE_MSG_QUEUE_ID=232132
+export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
+export FD_DEBUG=1
+python3 run_demo.py
+```
+
+`run_demo.py`:
+
+```python
+from fastdeploy import LLM, SamplingParams
+
+prompts = [
+    "Hello, my name is",
+    "The largest ocean is",
+]
+
+# sampling parameters
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256)
+
+# load the model
+llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, static_decode_blocks=0, quantization='wint8')
+
+# Perform batch inference
+outputs = llm.generate(prompts, sampling_params)
+# Note: Replace `/home/paddle/ernie-4_5-21b-a3b-bf16-paddle` with the path to the ERNIE model you have downloaded.
+
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs.text
+    print(prompt, generated_text)
+```
+
+## Run the demo
+
+```bash
+./run_demo.sh
+```
+
+Logs similar to the following will be printed. Loading the model takes approximately 74 seconds, and running the demo takes approximately 240 seconds.
+
+```
+/usr/local/lib/python3.10/site-packages/paddle/utils/cpp_extension/extension_utils.py:715: UserWarning: No ccache found. Please be aware that recompiling all source files may be required.
You can download and install ccache from: https://github.com/ccache/ccache/blob/master/doc/INSTALL.md + warnings.warn(warning_message) +/usr/local/lib/python3.10/site-packages/_distutils_hack/__init__.py:31: UserWarning: Setuptools is replacing distutils. Support for replacing an already imported distutils is deprecated. In the future, this condition will fail. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml + warnings.warn( +[2025-07-02 11:07:42,393] [ INFO] - Loading configuration file /home/paddle/ernie-4_5-21b-a3b-bf16-paddle/generation_config.json +/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:250: UserWarning: using greedy search strategy. However, `temperature` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or unset `temperature`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed. + warnings.warn( +/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:255: UserWarning: using greedy search strategy. However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or unset `top_p`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed. + warnings.warn( +INFO 2025-07-02 11:07:43,589 577964 engine.py[line:207] Waitting worker processes ready... +Loading Weights: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:57<00:00, 1.75it/s] +Loading Layers: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:08<00:00, 11.73it/s] +INFO 2025-07-02 11:08:55,261 577964 engine.py[line:277] Worker processes are launched with 73.76574492454529 seconds. +Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [03:59<00:00, 119.96s/it, est. speed input: 0.00 toks/s, output: 0.00 toks/s] +Hello, my name is Christopher. Today, I'm going to teach you how to draw a cute cartoon ghost. Let's get started! + (1) First, draw a big circle for the ghost's head. + (2) Then, add two small circles for the eyes, making sure they're not too big. + (3) Next, draw a wide, open mouth that looks like a big "U". + (4) After that, create the body by drawing a slightly smaller circle below the head. + (5) Now, let's add some arms. Draw two short, curly lines on each side of the body. + (6) Finally, give the ghost a wavy line at the bottom to represent its floating appearance. 
+ +Now, let's break down each step: + +**Step 1: Drawing the Head** +- Start with a big circle to form the head of the ghost. This will be the foundation of your drawing. + +**Step 2: Adding Eyes** +- On the head, place two small circles for the eyes. They should be centered and not too big, to give the ghost a cute and innocent look. + +**Step 3: Drawing the +The largest ocean is the Pacific Ocean, covering an area of approximately ⦠[3], The first scientific expeditions to determine the ocean's depth were the Challenger expedition (1872â1876) and the U.S. Navy Hydrographic Office survey (1877â1879). The oceanic crust is thin and irregular, consisting of upward moving magma from the mantle below, and cooling and solidifying on the surface. The shallowest parts of the ocean are called the continental shelves. Large tides are caused mainly by the alignment of the Sun, Moon, and Earth during new or full moons. The origin of the word "ocean" is not clear. The first global oceanic topography survey was completed by the Challenger expedition (1872â1876). [57] The sound speed in the ocean is primarily a function of water temperature and salinity, and varies with depth. The deep-ocean floor is mostly flat and devoid of life, with the exception of seamounts and various underwater volcanic features, including seamounts and hydrothermal vents. [73] Today, the five ocean +``` diff --git a/docs/get_started/quick_start.md b/docs/get_started/quick_start.md index 5368941a3..a9d2331ee 100644 --- a/docs/get_started/quick_start.md +++ b/docs/get_started/quick_start.md @@ -25,9 +25,9 @@ python -m fastdeploy.entrypoints.openai.api_server \ --max-num-seqs 32 ``` -> 💡 Note: In the path specified by ```--model```, if the subdirectory corresponding to the path does not exist in the current directory, it will try to query whether AIStudio has a preset model based on the specified model name (such as ```baidu/ERNIE-4.5-0.3B-Paddle```). If it exists, it will automatically start downloading. The default download path is: ```~/xx```. For instructions and configuration on automatic model download, see [Model Download](../supported_models.md). -```--max-model-len``` indicates the maximum number of tokens supported by the currently deployed service. -```--max-num-seqs``` indicates the maximum number of concurrent processing supported by the currently deployed service. +> 💡 Note: In the path specified by ```--model```, if the subdirectory corresponding to the path does not exist in the current directory, it will try to query whether AIStudio has a preset model based on the specified model name (such as ```baidu/ERNIE-4.5-0.3B-Paddle```). If it exists, it will automatically start downloading. The default download path is: ```~/xx```. For instructions and configuration on automatic model download, see [Model Download](../supported_models.md). +```--max-model-len``` indicates the maximum number of tokens supported by the currently deployed service. +```--max-num-seqs``` indicates the maximum number of concurrent processing supported by the currently deployed service. 
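+
+For example, if the model has already been downloaded manually, ```--model``` can also point at a local directory instead of the AIStudio model name; the path below is only an illustration and should be replaced with the directory you actually downloaded to, while the remaining flags stay the same as in the command above:
+
+```shell
+python -m fastdeploy.entrypoints.openai.api_server \
+    --model ./ERNIE-4.5-0.3B-Paddle \
+    --max-model-len 32768 \
+    --max-num-seqs 32
+```
+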
**Related Documents** - [Service Deployment](../online_serving/README.md) diff --git a/docs/get_started/quick_start_vl.md b/docs/get_started/quick_start_vl.md index acd805a11..82bc609b1 100644 --- a/docs/get_started/quick_start_vl.md +++ b/docs/get_started/quick_start_vl.md @@ -30,10 +30,10 @@ python -m fastdeploy.entrypoints.openai.api_server \ --enable-mm ``` -> 💡 Note: In the path specified by ```--model```, if the subdirectory corresponding to the path does not exist in the current directory, it will try to query whether AIStudio has a preset model based on the specified model name (such as ```baidu/ERNIE-4.5-0.3B-Base-Paddle```). If it exists, it will automatically start downloading. The default download path is: ```~/xx```. For instructions and configuration on automatic model download, see [Model Download](../supported_models.md). -```--max-model-len``` indicates the maximum number of tokens supported by the currently deployed service. -```--max-num-seqs``` indicates the maximum number of concurrent processing supported by the currently deployed service. -```--reasoning-parser``` specifies the thinking content parser. +> 💡 Note: In the path specified by ```--model```, if the subdirectory corresponding to the path does not exist in the current directory, it will try to query whether AIStudio has a preset model based on the specified model name (such as ```baidu/ERNIE-4.5-0.3B-Base-Paddle```). If it exists, it will automatically start downloading. The default download path is: ```~/xx```. For instructions and configuration on automatic model download, see [Model Download](../supported_models.md). +```--max-model-len``` indicates the maximum number of tokens supported by the currently deployed service. +```--max-num-seqs``` indicates the maximum number of concurrent processing supported by the currently deployed service. +```--reasoning-parser``` specifies the thinking content parser. ```--enable-mm``` indicates whether to enable multi-modal support. 
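+
+As a minimal sanity check (the port below is only a placeholder and should match the ```--port``` value used when starting the service), the per-request thinking switch can be exercised through the ```metadata``` field of a chat completion request:
+
+```shell
+curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "messages": [
+      {"role": "user", "content": "Briefly introduce yourself."}
+    ],
+    "metadata": {"enable_thinking": false}
+  }'
+```
+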
**Related Documents** diff --git a/docs/online_serving/README.md b/docs/online_serving/README.md index 2e5ff98fb..8062fe76a 100644 --- a/docs/online_serving/README.md +++ b/docs/online_serving/README.md @@ -36,6 +36,7 @@ curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \ ] }' ``` + Here's an example curl command demonstrating how to include the logprobs parameter in a user request: ```bash @@ -49,6 +50,7 @@ curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \ ``` Here is an example of sending a user request using a Python script: + ```python import openai host = "0.0.0.0" @@ -87,10 +89,10 @@ The differences in request parameters between FastDeploy and the OpenAI protocol - `temperature`: Optional[float] = None - `top_p`: Optional[float] = None - `metadata`: Optional[dict] = None (supported only in `v1/chat/completions` for configuring additional parameters, e.g., `metadata={"enable_thinking": True}`) - - `min_tokens`: Optional[int] = 1 (minimum number of tokens generated) - - `reasoning_max_tokens`: Optional[int] = None (maximum number of tokens for reasoning content, defaults to the same as `max_tokens`) - - `enable_thinking`: Optional[bool] = True (whether to enable reasoning for models that support deep thinking) - - `repetition_penalty`: Optional[float] = None (coefficient for directly penalizing repeated token generation (>1 penalizes repetition, <1 encourages repetition)) + - `min_tokens`: Optional[int] = 1 (minimum number of tokens generated) + - `reasoning_max_tokens`: Optional[int] = None (maximum number of tokens for reasoning content, defaults to the same as `max_tokens`) + - `enable_thinking`: Optional[bool] = True (whether to enable reasoning for models that support deep thinking) + - `repetition_penalty`: Optional[float] = None (coefficient for directly penalizing repeated token generation (>1 penalizes repetition, <1 encourages repetition)) > Note: For multimodal models, since the reasoning chain is enabled by default, resulting in overly long outputs, `max_tokens` can be set to the model's maximum output length or the default value can be used. diff --git a/docs/online_serving/metrics.md b/docs/online_serving/metrics.md index 6eee4f47d..c5c16ee81 100644 --- a/docs/online_serving/metrics.md +++ b/docs/online_serving/metrics.md @@ -24,4 +24,4 @@ After FastDeploy is launched, it supports continuous monitoring of the FastDeplo ## Accessing Metrics - Access URL: `http://localhost:8000/metrics` -- Metric Type: Prometheus format \ No newline at end of file +- Metric Type: Prometheus format diff --git a/docs/online_serving/scheduler.md b/docs/online_serving/scheduler.md index f985de05a..8ce9fa4cd 100644 --- a/docs/online_serving/scheduler.md +++ b/docs/online_serving/scheduler.md @@ -11,9 +11,9 @@ The Local Scheduler functions similarly to a memory manager, performing eviction The Global Scheduler is implemented using Redis. Each node actively steals tasks from others when its GPU is idle, then pushes the execution results back to the originating node. ### PD-Separated Scheduler -Building upon the Global Scheduler, FastDeploy introduces the **PD-Separated Scheduling Strategy**, specifically optimized for large language model inference scenarios. It decouples the inference pipeline into two distinct phases: -- **Prefill Phase**: Builds KV cache, which is compute-intensive with high memory usage but low latency. -- **Decode Phase**: Performs autoregressive decoding, which is sequential and time-consuming but requires less memory. 
+Building upon the Global Scheduler, FastDeploy introduces the **PD-Separated Scheduling Strategy**, specifically optimized for large language model inference scenarios. It decouples the inference pipeline into two distinct phases: +- **Prefill Phase**: Builds KV cache, which is compute-intensive with high memory usage but low latency. +- **Decode Phase**: Performs autoregressive decoding, which is sequential and time-consuming but requires less memory. By separating roles (prefill nodes handle request processing while decode nodes manage generation), this strategy enables finer-grained resource allocation, improving throughput and GPU utilization. @@ -36,4 +36,4 @@ By separating roles (prefill nodes handle request processing while decode nodes | scheduler_reader_parallel | int | No | 4 | splitwise | Number of output reader threads | | scheduler_writer_parallel | int | No | 4 | splitwise | Number of writer threads | | scheduler_reader_batch_size | int | No | 200 | splitwise | Batch size for fetching results from Redis | -| scheduler_writer_batch_size | int | No | 200 | splitwise | Batch size for writing results to Redis | \ No newline at end of file +| scheduler_writer_batch_size | int | No | 200 | splitwise | Batch size for writing results to Redis | diff --git a/docs/parameters.md b/docs/parameters.md index 8c9e3cbee..c52fc9ac6 100644 --- a/docs/parameters.md +++ b/docs/parameters.md @@ -45,7 +45,6 @@ When using FastDeploy to deploy models (including offline inference and service | ```enable_expert_parallel``` | `bool` | Whether to enable expert parallel | | ```enable_logprob``` | `bool` | Whether to enable return log probabilities of the output tokens or not. If true, returns the log probabilities of each output token returned in the content of message.If logrpob is not used, this parameter can be omitted when starting | - ## 1. Relationship between KVCache allocation, ```num_gpu_blocks_override``` and ```block_size```? During FastDeploy inference, GPU memory is occupied by ```model weights```, ```preallocated KVCache blocks``` and ```model computation intermediate activation values```. The preallocated KVCache blocks are determined by ```num_gpu_blocks_override```, with ```block_size``` (default: 64) as its unit, meaning one block can store KVCache for 64 Tokens. @@ -55,14 +54,14 @@ In actual inference, it's difficult for users to know how to properly configure - Load the model, after completing model loading, record current memory usage ```total_memory_after_load``` and FastDeploy framework memory usage ```fd_memory_after_load```; note the former is actual GPU memory usage (may include other processes), the latter is memory used by FD framework itself; - According to user-configured ```max_num_batched_tokens``` (default: ```max_model_len```), perform fake prefill computation with corresponding length input data, record current maximum FastDeploy framework memory allocation ```fd_memory_after_prefill```, thus ```model computation intermediate activation values``` can be considered as ```fd_memory_after_prefill - fd_memory_after_load```; - - At this point, available GPU memory for KVCache allocation (taking A800 80G as example) is ```80GB * gpu_memory_utilization - total_memory_after_load - (fd_memory_after_prefill - fd_memory_after_load)``` - - Based on model KVCache precision (e.g. 
8bit/16bit), calculate memory size per block, then calculate total allocatable blocks, assign to ```num_gpu_blocks_override``` + - At this point, available GPU memory for KVCache allocation (taking A800 80G as example) is ```80GB * gpu_memory_utilization - total_memory_after_load - (fd_memory_after_prefill - fd_memory_after_load)``` + - Based on model KVCache precision (e.g. 8bit/16bit), calculate memory size per block, then calculate total allocatable blocks, assign to ```num_gpu_blocks_override``` > In service startup logs, we can find ```Reset block num, the total_block_num:17220, prefill_kvcache_block_num:12915``` in log/fastdeploy.log, where ```total_block_num``` is the automatically calculated KVCache block count, multiply by ```block_size``` to get total cacheable Tokens. ## 2. Relationship between ```kv_cache_ratio```, ```block_size``` and ```max_num_seqs```? - - FastDeploy divides KVCache between Prefill and Decode phases according to ```kv_cache_ratio```. When configuring this parameter, you can use ```kv_cache_ratio = average input Tokens / (average input + average output Tokens)```. Typically input is 3x output, so can be configured as 0.75. - - ```max_num_seqs``` is the maximum concurrency in Decode phase, generally can be set to maximum 128, but users can also configure based on KVCache situation, e.g. output KVCache Token amount is ```decode_token_cache = total_block_num * (1 - kv_cache_ratio) * block_size```, to prevent extreme OOM situations, can configure ```max_num_seqs = decode_token_cache / average output Tokens```, not exceeding 128. +- FastDeploy divides KVCache between Prefill and Decode phases according to ```kv_cache_ratio```. When configuring this parameter, you can use ```kv_cache_ratio = average input Tokens / (average input + average output Tokens)```. Typically input is 3x output, so can be configured as 0.75. +- ```max_num_seqs``` is the maximum concurrency in Decode phase, generally can be set to maximum 128, but users can also configure based on KVCache situation, e.g. output KVCache Token amount is ```decode_token_cache = total_block_num * (1 - kv_cache_ratio) * block_size```, to prevent extreme OOM situations, can configure ```max_num_seqs = decode_token_cache / average output Tokens```, not exceeding 128. ## 3. ```enable_chunked_prefill``` parameter description @@ -74,24 +73,24 @@ To optimize scheduling priority for short requests, new `max_long_partial_prefil Currently, only user configuration of the following parameters is supported: - `use_cudagraph` : bool = False - `graph_optimization_config` : Dict[str, Any] - - `graph_opt_level`: int = 0 - - `use_cudagraph`: bool = False - - `cudagraph_capture_sizes` : List[int] = None + - `graph_opt_level`: int = 0 + - `use_cudagraph`: bool = False + - `cudagraph_capture_sizes` : List[int] = None CudaGrpah can be enabled by setting `--use-cudagraph` or `--graph-optimization-config '{"use_cudagraph":true}'`. Using two different methods to set the use graph simultaneously may cause conflicts. 
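+
+For illustration, either of the following launch commands enables CudaGraph (the model name is just the example used elsewhere in the docs; keep the rest of your serving flags unchanged), and only one of the two forms should be used at a time:
+
+```
+# Enable CudaGraph via the dedicated switch
+python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-0.3B-Paddle --use-cudagraph
+
+# Equivalent: enable CudaGraph through the graph optimization config
+python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-0.3B-Paddle --graph-optimization-config '{"use_cudagraph": true}'
+```
+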
- The `graph_opt_level` parameter within `--graph-optimization-config` is used to configure the graph optimization level, with the following available options: - `0`: Use Dynamic compute graph, default to 0 - `1`: Use Static compute graph, during the initialization phase, Paddle API will be used to convert the dynamic image into a static image - `2`: Base on Static compute graph, use the complier(CINN, Compiler Infrastructure for Neural Networks) of Paddle to compile and optimize In general, static graphs have lower Kernel Launch overhead than dynamic graphs, and it is recommended to use static graphs. -For adapted models, FastDeploy's CudaGraph * * can support both dynamic and static graphs * * simultaneously. +For adapted models, FastDeploy's CudaGraph *can support both dynamic and static graphs* simultaneously. When CudaGraph is enabled in the default configuration, a list of Batch Sizes that CudaGraph needs to capture will be automatically set based on the 'max_num_deqs' parameter. The logic for generating the list of Batch Sizes that need to be captured is as follows: 1. Generate a candidate list with a range of [1,1024] Batch Size. + ``` # Batch Size [1, 2, 4, 8, 16, ... 120, 128] candidate_capture_sizes = [1, 2, 4] + [8 * i for i in range(1, 17)] @@ -100,24 +99,25 @@ When CudaGraph is enabled in the default configuration, a list of Batch Sizes th # Batch Size (256, 288, ... 992, 1024] candidate_capture_sizes += [32 * i for i in range(17, 33)] ``` + 2. Crop the candidate list based on the user set 'max_num_deqs' to obtain a CudaGraph capture list with a range of [1,' max_num_deqs']. Users can also customize the batch size list that needs to be captured by CudaGraph through the parameter `cudagraph_capture_sizes` in`--graph-optimization-config`: + ``` --graph-optimization-config '{"cudagraph_capture_sizes": [1, 3, 5, 7, 9]}' ``` - ### CudaGraph related parameters Using CudaGraph incurs some additional memory overhead, divided into two categories in FastDeploy: -* Additional input Buffer overhead -* CudaGraph uses dedicated memory pool, thus holding some intermediate activation memory isolated from main framework +- Additional input Buffer overhead +- CudaGraph uses dedicated memory pool, thus holding some intermediate activation memory isolated from main framework FastDeploy initialization sequence first uses `gpu_memory_utilization` parameter to calculate available memory for `KVCache`, after initializing `KVCache` then uses remaining memory to initialize CudaGraph. Since CudaGraph is not enabled by default currently, using default startup parameters may encounter `Out of memory` errors, can try following solutions: -* Lower `gpu_memory_utilization` value, reserve more memory for CudaGraph. -* Lower `max_num_seqs` to decrease the maximum concurrency. -* Customize the batch size list that CudaGraph needs to capture through `graph_optimization_config`, and reduce the number of captured graphs by using `cudagraph_capture_sizes` +- Lower `gpu_memory_utilization` value, reserve more memory for CudaGraph. +- Lower `max_num_seqs` to decrease the maximum concurrency. +- Customize the batch size list that CudaGraph needs to capture through `graph_optimization_config`, and reduce the number of captured graphs by using `cudagraph_capture_sizes` - Before use, must ensure loaded model is properly decorated with ```@support_graph_optimization```. 
@@ -148,5 +148,6 @@ FastDeploy initialization sequence first uses `gpu_memory_utilization` parameter class Ernie45TModel(nn.Layer): # Note decorator is added to nn.Layer subclass ... ``` + - When ```use_cudagraph``` is enabled, currently only supports single-GPU inference, i.e. ```tensor_parallel_size``` set to 1. - When ```use_cudagraph``` is enabled, cannot enable ```enable_prefix_caching``` or ```enable_chunked_prefill```. diff --git a/docs/quantization/README.md b/docs/quantization/README.md index 96cb6c684..d564223b1 100644 --- a/docs/quantization/README.md +++ b/docs/quantization/README.md @@ -24,7 +24,7 @@ FastDeploy supports various quantization inference precisions including FP8, INT ## 2. Model Support List -| Model Name | Supported Quantization Precision | +| Model Name | Supported Quantization Precision | |---------|---------| | ERNIE-4.5-300B-A47B | WINT8, WINT4, Block-wise FP8, MixQuant| @@ -43,4 +43,4 @@ Examples: - **W4A16C16 / WInt4 / weight-only int4**: 4 defaults to INT4 - **WNF4A8C8**: NF4 refers to 4bits norm-float numerical type - **Wfp8Afp8**: Both weights and activations are FP8 precision -- **W4Afp8**: Weights are INT4, activations are FP8 +- **W4Afp8**: Weights are INT4, activations are FP8 diff --git a/docs/quantization/online_quantization.md b/docs/quantization/online_quantization.md index 3e3f24df9..bf8b9a536 100644 --- a/docs/quantization/online_quantization.md +++ b/docs/quantization/online_quantization.md @@ -24,7 +24,7 @@ python -m fastdeploy.entrypoints.openai.api_server \ - By specifying `--model baidu/ERNIE-4.5-300B-A47B-Paddle`, the model can be automatically downloaded from AIStudio. FastDeploy depends on Paddle format models. For more information, please refer to [Supported Model List](../supported_models.md). - By setting `--quantization` to `wint8` or `wint4`, online INT8/INT4 quantization can be selected. -- Deploying ERNIE-4.5-300B-A47B-Paddle WINT8 requires at least 80G * 8 cards, while WINT4 requires 80GB * 4 cards. +- Deploying ERNIE-4.5-300B-A47B-Paddle WINT8 requires at least 80G *8 cards, while WINT4 requires 80GB* 4 cards. - For more deployment tutorials, please refer to [get_started](../get_started/ernie-4.5.md). ## 2. Block-wise FP8 @@ -51,4 +51,4 @@ python -m fastdeploy.entrypoints.openai.api_server \ - By specifying `--model baidu/ERNIE-4.5-300B-A47B-Paddle`, the model can be automatically downloaded from AIStudio. FastDeploy depends on Paddle format models. For more information, please refer to [Supported Model List](../supported_models.md). - By setting `--quantization` to `block_wise_fp8`, online Block-wise FP8 quantization can be selected. - Deploying ERNIE-4.5-300B-A47B-Paddle Block-wise FP8 requires at least 80G * 8 cards. 
-- For more deployment tutorials, please refer to [get_started](../get_started/ernie-4.5.md) +- For more deployment tutorials, please refer to [get_started](../get_started/ernie-4.5.md) diff --git a/docs/quantization/wint2.md b/docs/quantization/wint2.md index cbec8aff5..82dd60609 100644 --- a/docs/quantization/wint2.md +++ b/docs/quantization/wint2.md @@ -59,4 +59,4 @@ On the ERNIE-4.5-300B-A47B model, comparison of WINT2 vs WINT4 performance: |DROP|9536|91.17|89.97| |GSM8K|1319|96.21|95.98| |CMath|600|96.50|96.00| -|CMMLU|11477|89.92|86.22| \ No newline at end of file +|CMMLU|11477|89.92|86.22| diff --git a/docs/usage/code_overview.md b/docs/usage/code_overview.md index fb8e70615..506a51680 100644 --- a/docs/usage/code_overview.md +++ b/docs/usage/code_overview.md @@ -22,4 +22,4 @@ Below is an overview of the FastDeploy code structure and functionality organize - ```metrics```: Core component for collecting, managing, and exporting Prometheus metrics, tracking key runtime performance data (e.g., request latency, resource utilization, successful request counts). - ```splitwise```: Modules related to PD disaggragation deployment. - ```scripts```/```tools```: Utility scripts for FastDeploy operations (e.g., compilation, unit testing, code style fixes). -- ```test```: Code for unit testing and validation. \ No newline at end of file +- ```test```: Code for unit testing and validation. diff --git a/docs/usage/log.md b/docs/usage/log.md index 7afa9bf6c..60e658a5b 100644 --- a/docs/usage/log.md +++ b/docs/usage/log.md @@ -1,6 +1,6 @@ # Log Description -FastDeploy generates the following log files during deployment. Below is an explanation of each log's purpose. +FastDeploy generates the following log files during deployment. Below is an explanation of each log's purpose. By default, logs are stored in the `log` directory under the execution path. To specify a custom directory, set the environment variable `FD_LOG_DIR`. 
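+
+For example, to collect the logs under a dedicated directory (the path below is only an illustration), export the variable before launching the service:
+
+```bash
+export FD_LOG_DIR=/data/fastdeploy_logs
+mkdir -p "$FD_LOG_DIR"
+python -m fastdeploy.entrypoints.openai.api_server \
+    --model baidu/ERNIE-4.5-0.3B-Paddle \
+    --max-model-len 32768
+```
+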
## Inference Service Logs diff --git a/docs/zh/features/disaggregated.md b/docs/zh/features/disaggregated.md index c23cd75dd..ac895639c 100644 --- a/docs/zh/features/disaggregated.md +++ b/docs/zh/features/disaggregated.md @@ -25,13 +25,10 @@ 多实例情况下,每收到一条请求需要根据不同的策略将请求分配到不同的Prefill实例和Decode实例。通过角色分离(prefill 节点负责接收并处理请求,decode节点完成后续生成),可以更细粒度地控制资源分配、提高吞吐量与 GPU 利用率。 - ## 使用说明 - ### 单机分离式部署 - #### 在线推理服务 使用如下命令进行服务部署 @@ -63,7 +60,7 @@ python -m fastdeploy.entrypoints.openai.api_server \ --cache-queue-port 8187 \ --tensor-parallel-size 4 \ --quantization wint4 \ - --innode-prefill-ports 8182 \ + --innode-prefill-ports 8182 \ --splitwise-role "decode" ``` @@ -75,9 +72,9 @@ python -m fastdeploy.entrypoints.openai.api_server \ ### 多机分离式部署 - #### 前置依赖 Redis -- 使用`conda`安装 +* 使用`conda`安装 + ```bash # 安装 conda install redis @@ -85,7 +82,8 @@ conda install redis nohup redis-server > redis.log 2>&1 & ``` -- 使用`apt`安装 +* 使用`apt`安装 + ```bash # 安装 sudo apt install redis-server -y @@ -93,7 +91,8 @@ sudo apt install redis-server -y sudo systemctl start redis-server ``` -- 使用`yum`安装 +* 使用`yum`安装 + ```bash # 安装 sudo yum install redis -y diff --git a/docs/zh/features/load_balance.md b/docs/zh/features/load_balance.md index 6626269f6..7e1bab1df 100644 --- a/docs/zh/features/load_balance.md +++ b/docs/zh/features/load_balance.md @@ -23,6 +23,7 @@ ### 前置依赖 Redis - 使用`conda`安装 + ```bash # 安装 conda install redis @@ -31,6 +32,7 @@ nohup redis-server > redis.log 2>&1 & ``` - 使用`apt`安装 + ```bash # 安装 sudo apt install redis-server -y @@ -39,6 +41,7 @@ sudo systemctl start redis-server ``` - 使用`yum`安装 + ```bash # 安装 sudo yum install redis -y @@ -47,6 +50,7 @@ sudo systemctl start redis ``` ### 启动FastDeploy + ```bash python -m fastdeploy.entrypoints.openai.api_server \ --port 8801 \ @@ -62,6 +66,7 @@ python -m fastdeploy.entrypoints.openai.api_server \ --scheduler-min-load_score 3 \ --scheduler-load-shards-num 1 ``` + [启动参数说明](../online_serving/scheduler.md) 可以将上述启动命令在多个机器执行,启动多个推理实例(如果是在一个机器中启动多个推理实例,注意端口不要冲突)。 diff --git a/docs/zh/features/prefix_caching.md b/docs/zh/features/prefix_caching.md index 3eff20b63..b6020483f 100644 --- a/docs/zh/features/prefix_caching.md +++ b/docs/zh/features/prefix_caching.md @@ -8,7 +8,6 @@ Prefix Caching(前缀缓存)是一种优化生成式模型推理效率的技 增量计算:对于后续请求,只需计算新增部分(如用户追加的输入)并复用缓存的中间结果,显著减少计算量。 - ## 服务化部署开启 Prefix Caching 启动服务增加以下参数 `enable-prefix-caching`,默认只开启一级缓存(GPU 缓存)。 @@ -37,4 +36,4 @@ python -m fastdeploy.entrypoints.openai.api_server \ FastDeploy 启动时设置 `enable_prefix_caching=True`,CPU Cache 根据机器内存选择开启 `swap_space`。 -提供了测试示例 `demo/offline_prefix_caching_demo.py`。 \ No newline at end of file +提供了测试示例 `demo/offline_prefix_caching_demo.py`。 diff --git a/docs/zh/features/reasoning_output.md b/docs/zh/features/reasoning_output.md index 1090facb6..5417f66d7 100644 --- a/docs/zh/features/reasoning_output.md +++ b/docs/zh/features/reasoning_output.md @@ -8,7 +8,7 @@ | baidu/ERNIE-4.5-VL-424B-A47B-Paddle | ernie-45-vl | ✓ | | baidu/ERNIE-4.5-VL-28B-A3B-Paddle | ernie-45-vl | ✓ | -思考模型需要指定解析器,以便于对思考内容进行解析. 通过`enable_thinking=False` 参数可以关闭模型思考模式. +思考模型需要指定解析器,以便于对思考内容进行解析. 通过`enable_thinking=False` 参数可以关闭模型思考模式. 可以支持思考模式开关的接口: 1. OpenAI 服务中 `/v1/chat/completions` 请求. @@ -17,10 +17,10 @@ 同时在思考模型中,支持通过```reasoning_max_tokens```控制思考内容的长度,在请求中添加```metadata={"reasoning_max_tokens": 1024}```即可。 - -### 快速使用 +## 快速使用 在启动模型服务时, 通过`--reasoning-parser`参数指定解析器名称. 该解析器会解析思考模型的输出, 提取`reasoning_content`字段. 
+ ```bash python -m fastdeploy.entrypoints.openai.api_server \ --model /path/to/your/model \ @@ -30,7 +30,9 @@ python -m fastdeploy.entrypoints.openai.api_server \ --quantization wint4 \ --reasoning-parser ernie-45-vl ``` + 接下来, 向模型发送 `chat completion` 请求 + ```bash curl -X POST "http://0.0.0.0:8192/v1/chat/completions" \ -H "Content-Type: application/json" \ @@ -45,10 +47,12 @@ curl -X POST "http://0.0.0.0:8192/v1/chat/completions" \ }' ``` + 字段`reasoning_content`包含得出最终结论的思考步骤,而`content`字段包含最终结论。 ### 流式会话 在流式会话中, `reasoning_content`字段会可以在`chat completion response chunks`中的 `delta` 中获取 + ```python from openai import OpenAI # Set OpenAI's API key and API base to use vLLM's API server. @@ -73,4 +77,3 @@ for chunk in chat_response: print("\n") ``` - diff --git a/docs/zh/features/speculative_decoding.md b/docs/zh/features/speculative_decoding.md index 38cb02ad2..eb898e873 100644 --- a/docs/zh/features/speculative_decoding.md +++ b/docs/zh/features/speculative_decoding.md @@ -6,10 +6,10 @@ - **Ngram** -- **MTP (Multi-Token Prediction)** - - ✅ 已支持:TP 切分 - - ✅ 已支持:共享前缀 - - ✅ 已支持:单机 TP 切分 + PD 分离 +- **MTP (Multi-Token Prediction)** + - ✅ 已支持:TP 切分 + - ✅ 已支持:共享前缀 + - ✅ 已支持:单机 TP 切分 + PD 分离 - ⏳ 即将支持:EP + DP + PD 分离 - ⏳ 即将支持:兼容 Chunk Prefill - ⏳ 即将支持:多层 MTP layer @@ -18,10 +18,10 @@ ### ⏳ 规划中 -- Draft Model -- Eagle -- Hydra -- Medusa +- Draft Model +- Eagle +- Hydra +- Medusa - ... ## ⚙️ 高效投机解码框架设计 @@ -40,7 +40,7 @@ ## 🚀 使用 Multi-Token-Prediction(MTP) 解码 详见论文:[DeepSeek-V3](https://arxiv.org/pdf/2412.19437) ### TP 并行部署 -> 使用 4×H100,量化方式选择 WINT4 +> 使用 4×H100,量化方式选择 WINT4 > 配置文件:`benchmarks/yaml/eb45t-32k-wint4-mtp-h100-tp4.yaml` ``` @@ -50,13 +50,15 @@ python -m fastdeploy.entrypoints.openai.api_server \ --config ${path_to_FastDeploy}benchmarks/yaml/eb45t-32k-wint4-mtp-h100-tp4.yaml \ --speculative-config '{"method": "mtp", "num_speculative_tokens": 1, "model": "${path_to_mtp_model}"}' ``` + ### PD 分离式部署(1P1D) -> 在8×H100上部署1P1D,P、D节点 分别使用 4×H100;量化方式选择 WINT4 -> 与常规 PD 分离部署一致,仅需替换配置文件并新增 speculative_config +> 在8×H100上部署1P1D,P、D节点 分别使用 4×H100;量化方式选择 WINT4 +> 与常规 PD 分离部署一致,仅需替换配置文件并新增 speculative_config 详情请参考[PD分离式部署](./disaggregated.md)。 - P 节点(Prefill) > 配置文件: `benchmarks/yaml/eb45t-32k-wint4-mtp-tp4-prefill.yaml` + ``` export FD_LOG_DIR="log_prefill" rm -rf ${FD_LOG_DIR} @@ -80,9 +82,11 @@ python -m fastdeploy.entrypoints.openai.api_server \ --scheduler-password "scheduler_mtp" \ --speculative-config '{"method": "mtp", "num_speculative_tokens": 1, "model": ""${path_to_mtp_model}"}' & ``` + - D 节点(Decode) > 配置文件: `benchmarks/yaml/eb45t-32k-wint4-mtp-tp4-decode.yaml` + ``` export FD_LOG_DIR="log_prefill" rm -rf ${FD_LOG_DIR} @@ -109,8 +113,9 @@ python -m fastdeploy.entrypoints.openai.api_server \ ## 🧠 使用 Ngram 解码 该算法通过 n-gram 窗口从 prompt 和已生成的 Token 中进行匹配生成草稿 Token,适合输入和输出有很大 overlap 的场景,如代码续写、文档查询等。 -> 使用 4×H100;量化方式选择 WINT4 +> 使用 4×H100;量化方式选择 WINT4 > 配置文件:benchmarks/yaml/eb45t-32k-wint4-mtp-h100-tp4.yaml + ``` python -m fastdeploy.entrypoints.openai.api_server \ --model ${path_to_main_model} \ diff --git a/docs/zh/get_started/installation/Enflame_gcu.md b/docs/zh/get_started/installation/Enflame_gcu.md index f47212dc6..b71a97a8a 100644 --- a/docs/zh/get_started/installation/Enflame_gcu.md +++ b/docs/zh/get_started/installation/Enflame_gcu.md @@ -131,4 +131,3 @@ python -u bench_gsm8k.py --port 8188 --num-questions 1319 --num-shots 5 --parall ```json {"task": "gsm8k", "backend": "paddlepaddle", "num_gpus": 1, "latency": 13446.01, "accuracy": 0.956, "num_requests": 1319, "other": {"num_questions": 1319, 
"parallel": 8}} ``` - diff --git a/docs/zh/get_started/installation/README.md b/docs/zh/get_started/installation/README.md index 014c092f5..80638604b 100644 --- a/docs/zh/get_started/installation/README.md +++ b/docs/zh/get_started/installation/README.md @@ -2,8 +2,8 @@ FastDeploy currently supports installation on the following hardware platforms: -- [NVIDIA GPU Installation](nvidia_gpu.md) +- [NVIDIA GPU Installation](nvidia_gpu.md) - [Kunlunxin XPU Installation](kunlunxin_xpu.md) - [Enflame S60 GCU Installation](Enflame_gcu.md) - [Iluvatar GPU Installation](iluvatar_gpu.md) -- [Hygon DCU Installation](hygon_dcu.md) \ No newline at end of file +- [Hygon DCU Installation](hygon_dcu.md) diff --git a/docs/zh/get_started/installation/hygon_dcu.md b/docs/zh/get_started/installation/hygon_dcu.md index 7408ec733..d9bdae0dd 100644 --- a/docs/zh/get_started/installation/hygon_dcu.md +++ b/docs/zh/get_started/installation/hygon_dcu.md @@ -37,6 +37,7 @@ image.sourcefind.cn:5000/dcu/admin/base/custom:fastdeploy2.0.0-kylinv10-dtk25.04 ``` ## 2. 启动服务 + ```bash export FD_ATTENTION_BACKEND="BLOCK_ATTN" python -m fastdeploy.entrypoints.openai.api_server \ @@ -47,7 +48,7 @@ python -m fastdeploy.entrypoints.openai.api_server \ --gpu-memory-utilization=0.8 ``` -#### 请求服务 +### 请求服务 您可以基于 OpenAI 协议,通过 curl 和 python 两种方式请求服务。 @@ -78,4 +79,4 @@ response = client.chat.completions.create( stream=False, ) print(response) -``` \ No newline at end of file +``` diff --git a/docs/zh/get_started/installation/iluvatar_gpu.md b/docs/zh/get_started/installation/iluvatar_gpu.md index aa045c7bb..f1ab2b38d 100644 --- a/docs/zh/get_started/installation/iluvatar_gpu.md +++ b/docs/zh/get_started/installation/iluvatar_gpu.md @@ -1,115 +1,120 @@ -# 如何在天数机器上运行 ERNIE-4.5-300B-A47B-BF16 & ERNIE-4.5-21B-A3B -当前版本软件只是作为天数芯片 + Fastdeploy 推理大模型的一个演示 demo,跑最新ERNIE4.5模型可能存在问题,后续进行修复和性能优化,给客户提供一个更稳定的版本。 - -## 准备机器 -首先您需要准备以下配置的机器 -| CPU | 内存 | 天数 | 硬盘| -|-----|------|-----|-----| -| x86 | 1TB| 8xBI150| 1TB| - -目前需要将完整模型 load 到 host memory 中,需要需要大于 600GB 的 host memory,后续版本会优化。 - -## 镜像 -从官网获取: - -```bash -docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest -``` - -## 准备容器 -1. 启动容器 -```bash -docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest -docker exec -it paddle_infer bash -``` -/home/paddle 为模型文件、whl包、脚本所在目录 - -2. 
安装whl包 - -```bash -pip3 install paddlepaddle==3.1.0a0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/ -pip3 install paddle-iluvatar-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ -pip3 install fastdeploy_iluvatar_gpu -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simplels -``` - -## 准备推理demo脚本 -推理 demo 路径:/home/paddle/scripts -脚本内容如下 - -`run_demo.sh`: -```bash -#!/bin/bash -export PADDLE_XCCL_BACKEND=iluvatar_gpu -export INFERENCE_MSG_QUEUE_ID=232132 -export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1 -export FD_DEBUG=1 -python3 run_demo.py -``` - -run_demo.py - - -```python -from fastdeploy import LLM, SamplingParams - -prompts = [ - "Hello, my name is", - "The largest ocean is", -] - -# 采样参数 -sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256) - -# 加载模型 -llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, static_decode_blocks=0, quantization='wint8') - -# 批量进行推理(llm内部基于资源情况进行请求排队、动态插入处理) -outputs = llm.generate(prompts, sampling_params) -# 注意将其中`/home/paddle/ernie-4_5-21b-a3b-bf16-paddle`替换为您下载的ERNIE模型的路径。 -# 输出结果 -for output in outputs: - prompt = output.prompt - generated_text = output.outputs.text - print(prompt, generated_text) -``` - -## 运行demo -执行 -```bash -./run_demo.sh -``` -会有如下 log 打印;load 模型耗时约74s,demo 运行约240s。 -``` -/usr/local/lib/python3.10/site-packages/paddle/utils/cpp_extension/extension_utils.py:715: UserWarning: No ccache found. Please be aware that recompiling all source files may be required. You can download and install ccache from: https://github.com/ccache/ccache/blob/master/doc/INSTALL.md - warnings.warn(warning_message) -/usr/local/lib/python3.10/site-packages/_distutils_hack/__init__.py:31: UserWarning: Setuptools is replacing distutils. Support for replacing an already imported distutils is deprecated. In the future, this condition will fail. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml - warnings.warn( -[2025-07-02 11:07:42,393] [ INFO] - Loading configuration file /home/paddle/ernie-4_5-21b-a3b-bf16-paddle/generation_config.json -/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:250: UserWarning: using greedy search strategy. However, `temperature` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or unset `temperature`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed. - warnings.warn( -/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:255: UserWarning: using greedy search strategy. However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or unset `top_p`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed. - warnings.warn( -INFO 2025-07-02 11:07:43,589 577964 engine.py[line:207] Waitting worker processes ready... 
-Loading Weights: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:57<00:00, 1.75it/s] -Loading Layers: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:08<00:00, 11.73it/s] -INFO 2025-07-02 11:08:55,261 577964 engine.py[line:277] Worker processes are launched with 73.76574492454529 seconds. -Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [03:59<00:00, 119.96s/it, est. speed input: 0.00 toks/s, output: 0.00 toks/s] -Hello, my name is Christopher. Today, I'm going to teach you how to draw a cute cartoon ghost. Let's get started! - (1) First, draw a big circle for the ghost's head. - (2) Then, add two small circles for the eyes, making sure they're not too big. - (3) Next, draw a wide, open mouth that looks like a big "U". - (4) After that, create the body by drawing a slightly smaller circle below the head. - (5) Now, let's add some arms. Draw two short, curly lines on each side of the body. - (6) Finally, give the ghost a wavy line at the bottom to represent its floating appearance. - -Now, let's break down each step: - -**Step 1: Drawing the Head** -- Start with a big circle to form the head of the ghost. This will be the foundation of your drawing. - -**Step 2: Adding Eyes** -- On the head, place two small circles for the eyes. They should be centered and not too big, to give the ghost a cute and innocent look. - -**Step 3: Drawing the -The largest ocean is the Pacific Ocean, covering an area of approximately ⦠[3], The first scientific expeditions to determine the ocean's depth were the Challenger expedition (1872â1876) and the U.S. Navy Hydrographic Office survey (1877â1879). The oceanic crust is thin and irregular, consisting of upward moving magma from the mantle below, and cooling and solidifying on the surface. The shallowest parts of the ocean are called the continental shelves. Large tides are caused mainly by the alignment of the Sun, Moon, and Earth during new or full moons. The origin of the word "ocean" is not clear. The first global oceanic topography survey was completed by the Challenger expedition (1872â1876). [57] The sound speed in the ocean is primarily a function of water temperature and salinity, and varies with depth. The deep-ocean floor is mostly flat and devoid of life, with the exception of seamounts and various underwater volcanic features, including seamounts and hydrothermal vents. 
[73] Today, the five ocean -``` +# 如何在天数机器上运行 ERNIE-4.5-300B-A47B-BF16 & ERNIE-4.5-21B-A3B +当前版本软件只是作为天数芯片 + Fastdeploy 推理大模型的一个演示 demo,跑最新ERNIE4.5模型可能存在问题,后续进行修复和性能优化,给客户提供一个更稳定的版本。 + +## 准备机器 +首先您需要准备以下配置的机器 +| CPU | 内存 | 天数 | 硬盘| +|-----|------|-----|-----| +| x86 | 1TB| 8xBI150| 1TB| + +目前需要将完整模型 load 到 host memory 中,需要需要大于 600GB 的 host memory,后续版本会优化。 + +## 镜像 +从官网获取: + +```bash +docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest +``` + +## 准备容器 +1. 启动容器 + +```bash +docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest +docker exec -it paddle_infer bash +``` + +/home/paddle 为模型文件、whl包、脚本所在目录 + +1. 安装whl包 + +```bash +pip3 install paddlepaddle==3.1.0a0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/ +pip3 install paddle-iluvatar-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ +pip3 install fastdeploy_iluvatar_gpu -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simplels +``` + +## 准备推理demo脚本 +推理 demo 路径:/home/paddle/scripts +脚本内容如下 + +`run_demo.sh`: + +```bash +#!/bin/bash +export PADDLE_XCCL_BACKEND=iluvatar_gpu +export INFERENCE_MSG_QUEUE_ID=232132 +export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1 +export FD_DEBUG=1 +python3 run_demo.py +``` + +run_demo.py + +```python +from fastdeploy import LLM, SamplingParams + +prompts = [ + "Hello, my name is", + "The largest ocean is", +] + +# 采样参数 +sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=256) + +# 加载模型 +llm = LLM(model="/home/paddle/ernie-4_5-21b-a3b-bf16-paddle", tensor_parallel_size=4, max_model_len=8192, static_decode_blocks=0, quantization='wint8') + +# 批量进行推理(llm内部基于资源情况进行请求排队、动态插入处理) +outputs = llm.generate(prompts, sampling_params) +# 注意将其中`/home/paddle/ernie-4_5-21b-a3b-bf16-paddle`替换为您下载的ERNIE模型的路径。 +# 输出结果 +for output in outputs: + prompt = output.prompt + generated_text = output.outputs.text + print(prompt, generated_text) +``` + +## 运行demo +执行 + +```bash +./run_demo.sh +``` + +会有如下 log 打印;load 模型耗时约74s,demo 运行约240s。 + +``` +/usr/local/lib/python3.10/site-packages/paddle/utils/cpp_extension/extension_utils.py:715: UserWarning: No ccache found. Please be aware that recompiling all source files may be required. You can download and install ccache from: https://github.com/ccache/ccache/blob/master/doc/INSTALL.md + warnings.warn(warning_message) +/usr/local/lib/python3.10/site-packages/_distutils_hack/__init__.py:31: UserWarning: Setuptools is replacing distutils. Support for replacing an already imported distutils is deprecated. In the future, this condition will fail. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml + warnings.warn( +[2025-07-02 11:07:42,393] [ INFO] - Loading configuration file /home/paddle/ernie-4_5-21b-a3b-bf16-paddle/generation_config.json +/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:250: UserWarning: using greedy search strategy. However, `temperature` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or unset `temperature`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed. 
+ warnings.warn( +/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:255: UserWarning: using greedy search strategy. However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or unset `top_p`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed. + warnings.warn( +INFO 2025-07-02 11:07:43,589 577964 engine.py[line:207] Waitting worker processes ready... +Loading Weights: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:57<00:00, 1.75it/s] +Loading Layers: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:08<00:00, 11.73it/s] +INFO 2025-07-02 11:08:55,261 577964 engine.py[line:277] Worker processes are launched with 73.76574492454529 seconds. +Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [03:59<00:00, 119.96s/it, est. speed input: 0.00 toks/s, output: 0.00 toks/s] +Hello, my name is Christopher. Today, I'm going to teach you how to draw a cute cartoon ghost. Let's get started! + (1) First, draw a big circle for the ghost's head. + (2) Then, add two small circles for the eyes, making sure they're not too big. + (3) Next, draw a wide, open mouth that looks like a big "U". + (4) After that, create the body by drawing a slightly smaller circle below the head. + (5) Now, let's add some arms. Draw two short, curly lines on each side of the body. + (6) Finally, give the ghost a wavy line at the bottom to represent its floating appearance. + +Now, let's break down each step: + +**Step 1: Drawing the Head** +- Start with a big circle to form the head of the ghost. This will be the foundation of your drawing. + +**Step 2: Adding Eyes** +- On the head, place two small circles for the eyes. They should be centered and not too big, to give the ghost a cute and innocent look. + +**Step 3: Drawing the +The largest ocean is the Pacific Ocean, covering an area of approximately ⦠[3], The first scientific expeditions to determine the ocean's depth were the Challenger expedition (1872â1876) and the U.S. Navy Hydrographic Office survey (1877â1879). The oceanic crust is thin and irregular, consisting of upward moving magma from the mantle below, and cooling and solidifying on the surface. The shallowest parts of the ocean are called the continental shelves. Large tides are caused mainly by the alignment of the Sun, Moon, and Earth during new or full moons. The origin of the word "ocean" is not clear. The first global oceanic topography survey was completed by the Challenger expedition (1872â1876). [57] The sound speed in the ocean is primarily a function of water temperature and salinity, and varies with depth. 
The deep-ocean floor is mostly flat and devoid of life, with the exception of seamounts and various underwater volcanic features, including seamounts and hydrothermal vents. [73] Today, the five ocean +``` diff --git a/docs/zh/get_started/installation/nvidia_gpu.md b/docs/zh/get_started/installation/nvidia_gpu.md index 348e350b7..94c111fe1 100644 --- a/docs/zh/get_started/installation/nvidia_gpu.md +++ b/docs/zh/get_started/installation/nvidia_gpu.md @@ -21,6 +21,7 @@ docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12 ## 2. 预编译Pip安装 首先安装 paddlepaddle-gpu,详细安装方式参考 [PaddlePaddle安装](https://www.paddlepaddle.org.cn/en/install/quick?docurl=/documentation/docs/en/develop/install/pip/linux-pip_en.html) + ``` shell python -m pip install paddlepaddle-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ ``` @@ -28,6 +29,7 @@ python -m pip install paddlepaddle-gpu==3.1.0 -i https://www.paddlepaddle.org.cn 再安装 fastdeploy,**注意不要通过pypi源安装**,需要通过如下方式安装 如你的 GPU 是 SM80/90 架构(A100/H100等),按如下方式安装 + ``` # 安装稳定版本fastdeploy python -m pip install fastdeploy-gpu -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-gpu-80_90/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple @@ -37,6 +39,7 @@ python -m pip install fastdeploy-gpu -i https://www.paddlepaddle.org.cn/packages ``` 如你的 GPU 是 SM86/89 架构(4090/L20/L40等),按如下方式安装 + ``` # 安装稳定版本fastdeploy python -m pip install fastdeploy-gpu -i https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-gpu-86_89/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple @@ -59,11 +62,13 @@ docker build -f dockerfiles/Dockerfile.gpu -t fastdeploy:gpu . ## 4. Wheel包源码编译 首先安装 paddlepaddle-gpu,详细安装方式参考 [PaddlePaddle安装](https://www.paddlepaddle.org.cn/) + ``` shell python -m pip install paddlepaddle-gpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ ``` 接着克隆源代码,编译安装 + ``` shell git clone https://github.com/PaddlePaddle/FastDeploy cd FastDeploy @@ -74,11 +79,13 @@ cd FastDeploy # 第4个参数: 编译的GPU架构 bash build.sh 1 python false [80,90] ``` + 编译后的产物在```FastDeploy/dist```目录下。 ## 环境检查 在安装 FastDeploy 后,通过如下 Python 代码检查环境的可用性 + ``` python import paddle from paddle.jit.marker import unified @@ -87,4 +94,5 @@ paddle.utils.run_check() # 检查FastDeploy自定义算子编译成功与否 from fastdeploy.model_executor.ops.gpu import beam_search_softmax ``` + 如上代码执行成功,则认为环境可用。 diff --git a/docs/zh/get_started/quick_start.md b/docs/zh/get_started/quick_start.md index 36ac0e855..46da9fa05 100644 --- a/docs/zh/get_started/quick_start.md +++ b/docs/zh/get_started/quick_start.md @@ -15,6 +15,7 @@ ## 1. 
启动服务 安装FastDeploy后,在终端执行如下命令,启动服务,其中启动命令配置方式参考[参数说明](../parameters.md) + ```shell python -m fastdeploy.entrypoints.openai.api_server \ --model baidu/ERNIE-4.5-0.3B-Paddle \ @@ -24,9 +25,10 @@ python -m fastdeploy.entrypoints.openai.api_server \ --max-model-len 32768 \ --max-num-seqs 32 ``` ->💡 注意:在 ```--model``` 指定的路径中,若当前目录下不存在该路径对应的子目录,则会尝试根据指定的模型名称(如 ```baidu/ERNIE-4.5-0.3B-Paddle```)查询AIStudio是否存在预置模型,若存在,则自动启动下载。默认的下载路径为:```~/xx```。关于模型自动下载的说明和配置参阅[模型下载](../supported_models.md)。 -```--max-model-len``` 表示当前部署的服务所支持的最长Token数量。 -```--max-num-seqs``` 表示当前部署的服务所支持的最大并发处理数量。 + +>💡 注意:在 ```--model``` 指定的路径中,若当前目录下不存在该路径对应的子目录,则会尝试根据指定的模型名称(如 ```baidu/ERNIE-4.5-0.3B-Paddle```)查询AIStudio是否存在预置模型,若存在,则自动启动下载。默认的下载路径为:```~/xx```。关于模型自动下载的说明和配置参阅[模型下载](../supported_models.md)。 +```--max-model-len``` 表示当前部署的服务所支持的最长Token数量。 +```--max-num-seqs``` 表示当前部署的服务所支持的最大并发处理数量。 **相关文档** @@ -36,6 +38,7 @@ python -m fastdeploy.entrypoints.openai.api_server \ ## 2. 用户发起服务请求 执行启动服务指令后,当终端打印如下信息,说明服务已经启动成功。 + ``` api_server.py[line:91] Launching metrics service at http://0.0.0.0:8181/metrics api_server.py[line:94] Launching chat completion service at http://0.0.0.0:8180/v1/chat/completions @@ -47,11 +50,13 @@ INFO: Uvicorn running on http://0.0.0.0:8180 (Press CTRL+C to quit) ``` FastDeploy提供服务探活接口,用以判断服务的启动状态,执行如下命令返回 ```HTTP/1.1 200 OK``` 即表示服务启动成功。 + ```shell curl -i http://0.0.0.0:8180/health ``` 通过如下命令发起服务请求 + ```shell curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \ -H "Content-Type: application/json" \ diff --git a/docs/zh/get_started/quick_start_vl.md b/docs/zh/get_started/quick_start_vl.md index 11f9133b0..deaf3e10d 100644 --- a/docs/zh/get_started/quick_start_vl.md +++ b/docs/zh/get_started/quick_start_vl.md @@ -30,11 +30,11 @@ python -m fastdeploy.entrypoints.openai.api_server \ --enable-mm ``` ->💡 注意:在 ```--model``` 指定的路径中,若当前目录下不存在该路径对应的子目录,则会尝试根据指定的模型名称(如 ```baidu/ERNIE-4.5-0.3B-Base-Paddle```)查询AIStudio是否存在预置模型,若存在,则自动启动下载。默认的下载路径为:```~/xx```。关于模型自动下载的说明和配置参阅[模型下载](../supported_models.md)。 -```--max-model-len``` 表示当前部署的服务所支持的最长Token数量。 -```--max-num-seqs``` 表示当前部署的服务所支持的最大并发处理数量。 -```--reasoning-parser``` 指定思考内容解析器。 -```--enable-mm``` 表示是否开启多模态支持。 +>💡 注意:在 ```--model``` 指定的路径中,若当前目录下不存在该路径对应的子目录,则会尝试根据指定的模型名称(如 ```baidu/ERNIE-4.5-0.3B-Base-Paddle```)查询AIStudio是否存在预置模型,若存在,则自动启动下载。默认的下载路径为:```~/xx```。关于模型自动下载的说明和配置参阅[模型下载](../supported_models.md)。 +```--max-model-len``` 表示当前部署的服务所支持的最长Token数量。 +```--max-num-seqs``` 表示当前部署的服务所支持的最大并发处理数量。 +```--reasoning-parser``` 指定思考内容解析器。 +```--enable-mm``` 表示是否开启多模态支持。 **相关文档** diff --git a/docs/zh/index.md b/docs/zh/index.md index 0e98a53b3..40417db2c 100644 --- a/docs/zh/index.md +++ b/docs/zh/index.md @@ -2,11 +2,11 @@ **FastDeploy** 是基于飞桨(PaddlePaddle)的大语言模型(LLM)与视觉语言模型(VLM)推理部署工具包,提供**开箱即用的生产级部署方案**,核心技术特性包括: -🚀 **负载均衡式PD分解**:工业级解决方案,支持上下文缓存与动态实例角色切换,在保障SLO达标和吞吐量的同时优化资源利用率 -🔄 **统一KV缓存传输**:轻量级高性能传输库,支持智能NVLink/RDMA选择 -🤝 **OpenAI API服务与vLLM兼容**:单命令部署,兼容[vLLM](https://github.com/vllm-project/vllm/)接口 -🧮 **全量化格式支持**:W8A16、W8A8、W4A16、W4A8、W2A16、FP8等 -⏩ **高级加速技术**:推测解码、多令牌预测(MTP)及分块预填充 +🚀 **负载均衡式PD分解**:工业级解决方案,支持上下文缓存与动态实例角色切换,在保障SLO达标和吞吐量的同时优化资源利用率 +🔄 **统一KV缓存传输**:轻量级高性能传输库,支持智能NVLink/RDMA选择 +🤝 **OpenAI API服务与vLLM兼容**:单命令部署,兼容[vLLM](https://github.com/vllm-project/vllm/)接口 +🧮 **全量化格式支持**:W8A16、W8A8、W4A16、W4A8、W2A16、FP8等 +⏩ **高级加速技术**:推测解码、多令牌预测(MTP)及分块预填充 🖥️ **多硬件支持**:NVIDIA GPU、昆仑芯XPU、海光DCU、昇腾NPU、天数智芯GPU、燧原GCU、沐曦GPU等 ## 支持模型 @@ -24,6 +24,7 @@ ## 文档说明 本项目文档基于mkdocs支持编译可视化查看,参考如下命令进行编译预览, + ``` pip install requirements.txt @@ -32,4 +33,5 @@ mkdocs build 
mkdocs serve ``` + 根据提示打开相应地址即可。 diff --git a/docs/zh/online_serving/README.md b/docs/zh/online_serving/README.md index 26c985883..d2c001037 100644 --- a/docs/zh/online_serving/README.md +++ b/docs/zh/online_serving/README.md @@ -19,7 +19,6 @@ python -m fastdeploy.entrypoints.openai.api_server \ --enable-logprob ``` - 服务部署时的命令行更多使用方式参考[参数说明](../parameters.md)。 ## 发送用户请求 @@ -51,6 +50,7 @@ curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \ ``` 使用 Python 脚本发送用户请求示例如下: + ```python import openai host = "0.0.0.0" @@ -88,10 +88,10 @@ FastDeploy 与 OpenAI 协议的请求参数差异如下,其余请求参数会 - `temperature`: Optional[float] = None - `top_p`: Optional[float] = None - `metadata`: Optional[dict] = None (仅在v1/chat/compeltions中支持,用于配置额外参数, 如metadata={"enable_thinking": True}) - - `min_tokens`: Optional[int] = 1 最小生成的Token个数 - - `reasoning_max_tokens`: Optional[int] = None 思考内容最大Token数,默认与max_tokens一致 - - `enable_thinking`: Optional[bool] = True 支持深度思考的模型是否打开思考 - - `repetition_penalty`: Optional[float] = None: 直接对重复生成的token进行惩罚的系数(>1时惩罚重复,<1时鼓励重复) + - `min_tokens`: Optional[int] = 1 最小生成的Token个数 + - `reasoning_max_tokens`: Optional[int] = None 思考内容最大Token数,默认与max_tokens一致 + - `enable_thinking`: Optional[bool] = True 支持深度思考的模型是否打开思考 + - `repetition_penalty`: Optional[float] = None: 直接对重复生成的token进行惩罚的系数(>1时惩罚重复,<1时鼓励重复) > 注: 若为多模态模型 由于思考链默认打开导致输出过长,max tokens 可以设置为模型最长输出,或使用默认值。 @@ -103,6 +103,7 @@ FastDeploy 增加的返回字段如下: - `reasoning_content`: 思考链的返回结果 返回参数总览: + ```python ChatCompletionStreamResponse: id: str diff --git a/docs/zh/online_serving/scheduler.md b/docs/zh/online_serving/scheduler.md index 9f92ac0b0..afbd819ba 100644 --- a/docs/zh/online_serving/scheduler.md +++ b/docs/zh/online_serving/scheduler.md @@ -14,11 +14,10 @@ FastDeploy 目前支持两种调度器: **本地调度器** 和 **全局调度 基于全局调度器,FastDeploy 引入了专为大语言模型推理场景优化的 **PD 分离调度策略**。该策略将推理流程解耦为两个独立阶段: - **Prefill 阶段** :构建 KV 缓存,该过程计算密集度高、显存占用大,但延迟低; -- **Decode 阶段**:进行自回归解码,该过程串行执行、时延高,但显存占用低。 +- **Decode 阶段**:进行自回归解码,该过程串行执行、时延高,但显存占用低。 通过角色分离(prefill 节点负责接收并处理请求,decode节点完成后续生成),可以更细粒度地控制资源分配、提高吞吐量与 GPU 利用率。 - ## 配置参数 | 字段名 | 字段类型 | 是否必填 | 默认值 | 生效范围 | 说明 | | ------------------------------------ | -------- | -------- | --------- |------------------------|-----------------------------------| diff --git a/docs/zh/parameters.md b/docs/zh/parameters.md index b6865f554..fbf57a971 100644 --- a/docs/zh/parameters.md +++ b/docs/zh/parameters.md @@ -2,7 +2,6 @@ 在使用FastDeploy部署模型(包括离线推理、服务化部署),涉及如下参数配置,其实需要注意,在使用离线推理时,各参数配置即为如下参数名;而在使用命令行启动服务时,相应参数中的分隔符需要从```_```修改为```-```,如```max_model_len```在命令行中则为```--max-model-len```。 - | 参数名 | 类型 | 说明 | |:-----------------------------------|:----------| :----- | | ```port``` | `int` | 仅服务化部署需配置,服务HTTP请求端口号,默认8000 | @@ -44,7 +43,6 @@ | ```enable_expert_parallel``` | `bool` | 是否启用专家并行 | | ```enable_logprob``` | `bool` | 是否启用输出token返回logprob。如果未使用 logrpob,则在启动时可以省略此参数。 | - ## 1. KVCache分配与```num_gpu_blocks_override```、```block_size```的关系? 
FastDeploy在推理过程中,显存被```模型权重```、```预分配KVCache块```和```模型计算中间激活值```占用。其中预分配KVCache块由```num_gpu_blocks_override```决定,其单位为```block_size```(默认64),即一个块可以存储64个Token的KVCache。 @@ -53,14 +51,14 @@ FastDeploy在推理过程中,显存被```模型权重```、```预分配KVCache - 加载模型,在完成模型加载后,记录当前显存占用情况```total_memory_after_load```和FastDeploy框架占用的显存值```fd_memory_after_load```; 注意前者为GPU实际被占用显存(可能有其它进程也占用),后者是FD框架本身占用显存; - 根据用户配置的```max_num_batched_tokens```(默认为```max_model_len```),Fake相应长度的输入数据进行Prefill计算,记录当前FastDeploy框架显存最大分配值```fd_memory_after_prefill```,因此可以认为```模型计算中间激活值```为```fd_memory_after_prefill - fd_memory_after_load```; - - 截止当前,认为GPU卡可以剩分配KVCache的显存(以A800 80G为例)为```80GB * gpu_memory_utilization - total_memory_after_load - (fd_memory_after_prefill - fd_memory_after_load)``` - - 根据模型KVCache的精度(如8bit/16bit),计算一个block占用的KVCache大小,从而计算出总共可分配的block数量,赋值给```num_gpu_blocks_override``` + - 截止当前,认为GPU卡可以剩分配KVCache的显存(以A800 80G为例)为```80GB * gpu_memory_utilization - total_memory_after_load - (fd_memory_after_prefill - fd_memory_after_load)``` + - 根据模型KVCache的精度(如8bit/16bit),计算一个block占用的KVCache大小,从而计算出总共可分配的block数量,赋值给```num_gpu_blocks_override``` > 在服务启动日志中,我们可以在log/fastdeploy.log中找到```Reset block num, the total_block_num:17220, prefill_kvcache_block_num:12915```,其中```total_block_num```即为自动计算出来的KVCache block数量,将其乘以```block_size```即可知道整个服务可以缓存多少Token的KV值。 ## 2. ```kv_cache_ratio```、```block_size```、```max_num_seqs```的关系? - - FastDeploy里面将KVCache按照```kv_cache_ratio```分为Prefill阶段使用和Decode阶段使用,在配置这个参数时,可以按照```kv_cache_ratio = 平均输入Token数/(平均输入+平均输出Token数)```进行配置,常规情况输入是输出的3倍,因此可以配置成0.75 - - ```max_num_seqs```是Decode阶段的最大并发数,一般而言可以配置成最大值128,但用户也可以根据KVCache情况作调用,例如输出的KVCache Token量为```decode_token_cache = total_block_num * (1 - kv_cache_ratio) * block_size```,为了防止极端情况下的显存不足问题,可以配置```max_num_seqs = decode_token_cache / 平均输出Token数```,不高于128即可。 +- FastDeploy里面将KVCache按照```kv_cache_ratio```分为Prefill阶段使用和Decode阶段使用,在配置这个参数时,可以按照```kv_cache_ratio = 平均输入Token数/(平均输入+平均输出Token数)```进行配置,常规情况输入是输出的3倍,因此可以配置成0.75 +- ```max_num_seqs```是Decode阶段的最大并发数,一般而言可以配置成最大值128,但用户也可以根据KVCache情况作调用,例如输出的KVCache Token量为```decode_token_cache = total_block_num * (1 - kv_cache_ratio) * block_size```,为了防止极端情况下的显存不足问题,可以配置```max_num_seqs = decode_token_cache / 平均输出Token数```,不高于128即可。 ## 3. ```enable_chunked_prefill```参数配置说明 @@ -72,9 +70,9 @@ FastDeploy在推理过程中,显存被```模型权重```、```预分配KVCache 当前仅支持用户配置以下参数: - `use_cudagraph` : bool = False - `graph_optimization_config` : Dict[str, Any] - - `graph_opt_level`: int = 0 - - `use_cudagraph`: bool = False - - `cudagraph_capture_sizes` : List[int] = None + - `graph_opt_level`: int = 0 + - `use_cudagraph`: bool = False + - `cudagraph_capture_sizes` : List[int] = None 可以通过设置 `--use-cudagraph` 或 `--graph-optimization-config '{"use_cudagraph":true}'` 开启 CudaGrpah。 @@ -88,6 +86,7 @@ FastDeploy在推理过程中,显存被```模型权重```、```预分配KVCache 在默认配置下开启 CudaGraph 时,会根据 `max_num_seqs` 参数自动设置 CudaGraph 需要捕获的 Batch Size 列表,需要捕获的 Batch Size 的列表自动生成逻辑如下: 1. 生成一个范围为 [1,1024] Batch Size 的候选列表 + ``` # Batch Size [1, 2, 4, 8, 16, ... 120, 128] candidate_capture_sizes = [1, 2, 4] + [8 * i for i in range(1, 17)] @@ -96,24 +95,24 @@ FastDeploy在推理过程中,显存被```模型权重```、```预分配KVCache # Batch Size (256, 288, ... 992, 1024] candidate_capture_sizes += [32 * i for i in range(17, 33)] ``` + 2. 
根据用户设置的 `max_num_seqs` 裁剪候选列表,得到范围为 [1, `max_num_seqs`] 的 CudaGraph 捕获列表。 用户也可以通过 `--graph-optimization-config` 中的 `cudagraph_capture_sizes` 参数自定义需要被 CudaGraph 捕获的 Batch Size 列表: + ``` --graph-optimization-config '{"cudagraph_capture_sizes": [1, 3, 5, 7, 9]}' ``` - ### CudaGraph相关参数说明 使用 CudaGraph 会产生一些额外的显存开销,在FastDeploy中分为下面两类: -* 额外的输入 Buffer 开销 -* CudaGraph 使用了专用的显存池,因此会持有一部分与主框架隔离的中间激活显存 +- 额外的输入 Buffer 开销 +- CudaGraph 使用了专用的显存池,因此会持有一部分与主框架隔离的中间激活显存 FastDeploy 的初始化顺序为先使用 `gpu_memory_utilization` 参数计算 `KVCache` 可用的显存,初始化完 `KVCache` 之后才会使用剩余显存初始化 CudaGraph。由于 CudaGraph 目前还不是默认开启的,因此使用默认启动参数可能会遇到 `Out Of Memory` 错误,可以尝试使用下面三种方式解决: -* 调低 `gpu_memory_utilization` 的值,多预留一些显存给CudaGraph使用。 -* 调低 `max_num_seqs` 的值,降低最大并发数。 -* 通过 `graph_optimization_config` 自定义需要 CudaGraph 捕获的 Batch Size 列表 `cudagraph_capture_sizes`,减少捕获的图的数量 - +- 调低 `gpu_memory_utilization` 的值,多预留一些显存给CudaGraph使用。 +- 调低 `max_num_seqs` 的值,降低最大并发数。 +- 通过 `graph_optimization_config` 自定义需要 CudaGraph 捕获的 Batch Size 列表 `cudagraph_capture_sizes`,减少捕获的图的数量 使用CudaGraph之前,需要确保加载的模型被装饰器 ```@support_graph_optimization```正确修饰。 @@ -144,5 +143,6 @@ FastDeploy 的初始化顺序为先使用 `gpu_memory_utilization` 参数计算 class Ernie45TModel(nn.Layer): # 注意 decorator 加在 nn.Layer 的子类上 ... ``` + - 当开启 ```use_cudagraph``` 时,暂时只支持单卡推理,即 ```tensor_parallel_size``` 设为1。 - 当开启 ```use_cudagraph``` 时,暂不支持开启 ```enable_prefix_caching``` 或 ```enable_chunked_prefill``` 。 diff --git a/docs/zh/quantization/README.md b/docs/zh/quantization/README.md index 7b85c094d..77705c1e0 100644 --- a/docs/zh/quantization/README.md +++ b/docs/zh/quantization/README.md @@ -24,7 +24,7 @@ FastDeploy支持FP8、INT8、INT4、2-bit等多种量化推理精度,支持模 ## 2. 模型支持列表 -| 模型名称 | 支持量化精度 | +| 模型名称 | 支持量化精度 | |---------|---------| | ERNIE-4.5-300B-A47B | WINT8, WINT4, Block_wise= FP8, MixQuant| @@ -37,11 +37,10 @@ FastDeploy 按以下格式命名各种量化精度: ``` 部分示例如下: - + - **W8A8C8**:W=weights,A=activations,C=CacheKV;8默认为INT8 - **W8A8C16**:16默认为BF16,其它同上 - **W4A16C16 / WInt4 / weight-only int4**:4默认为INT4 - **WNF4A8C8**:NF4指4bit norm-float数值类型 - **Wfp8Afp8**:权重和激活均为FP8精度 - **W4Afp8**:权重为INT4, 激活为FP8 - diff --git a/docs/zh/quantization/online_quantization.md b/docs/zh/quantization/online_quantization.md index f487f8ac8..2e5040239 100644 --- a/docs/zh/quantization/online_quantization.md +++ b/docs/zh/quantization/online_quantization.md @@ -23,8 +23,8 @@ python -m fastdeploy.entrypoints.openai.api_server \ ``` - 通过指定 `--model baidu/ERNIE-4.5-300B-A47B-Paddle` 可自动从AIStudio下载模型。FastDeploy依赖Paddle格式的模型,更多说明参考[支持模型列表](../supported_models.md)。 -- 通过设置 `--quantization` 为 `wint8` 或 `wint4` 选择在线 INT8/INT4 量化。 -- 部署 ERNIE-4.5-300B-A47B-Paddle WINT8 最少需要 80G * 8卡, WINT4 则需要 80GB * 4卡。 +- 通过设置 `--quantization` 为 `wint8` 或 `wint4` 选择在线 INT8/INT4 量化。 +- 部署 ERNIE-4.5-300B-A47B-Paddle WINT8 最少需要 80G *8卡, WINT4 则需要 80GB* 4卡。 - 更多部署教程请参考[get_started](../get_started/ernie-4.5.md). ## 2. 
Block-wise FP8 @@ -49,9 +49,6 @@ python -m fastdeploy.entrypoints.openai.api_server \ ``` - 通过指定 `--model baidu/ERNIE-4.5-300B-A47B-Paddle` 可自动从AIStudio下载模型。FastDeploy依赖Paddle格式的模型,更多说明参考[支持模型列表](../supported_models.md)。 -- 通过设置 `--quantization` 为 `block_wise_fp8` 选择在线 Block-wise FP8 量化。 +- 通过设置 `--quantization` 为 `block_wise_fp8` 选择在线 Block-wise FP8 量化。 - 部署 ERNIE-4.5-300B-A47B-Paddle Block-wise FP8 最少需要 80G * 8卡。 - 更多部署教程请参考[get_started](../get_started/ernie-4.5.md) - - - diff --git a/docs/zh/quantization/wint2.md b/docs/zh/quantization/wint2.md index 79da233e8..91c1441bf 100644 --- a/docs/zh/quantization/wint2.md +++ b/docs/zh/quantization/wint2.md @@ -48,7 +48,6 @@ python -m fastdeploy.entrypoints.openai.api_server \ - 更多部署教程请参考[get_started](../get_started/ernie-4.5.md); - 更多模型说明请参考[支持模型列表](../supported_models.md)。 - ## WINT2效果 在ERNIE-4.5-300B-A47B模型上,WINT2与WINT4效果对比: diff --git a/docs/zh/usage/code_overview.md b/docs/zh/usage/code_overview.md index 2fda9caef..170652a5e 100644 --- a/docs/zh/usage/code_overview.md +++ b/docs/zh/usage/code_overview.md @@ -22,4 +22,3 @@ - ```splitwise```: 分离式部署相关模块 - ```scripts```/```tools```:FastDeploy 用于执行功能的辅助脚本,比如编译,单测执行,代码风格纠正等 - ```test```:项目单测验证使用到的代码 - diff --git a/docs/zh/usage/log.md b/docs/zh/usage/log.md index 5e521f1a1..c9b287523 100644 --- a/docs/zh/usage/log.md +++ b/docs/zh/usage/log.md @@ -19,14 +19,12 @@ FastDeploy 在部署过程中,会产生如下日志文件,各日志含义说 ## 在线推理客户端日志 * `api_server.log` : 记录启动参数,及接收到的请求信息 - ## 调度器日志 * `scheduler.log` : 记录调度器的信息包含当前结点的信息,每条请求分配的信息 ## 投机解码日志 * `speculate.log` : 投机解码相关信息 - ## Prefix Caching 相关日志 * `cache_queue_manager.log` : 记录启动参数,及接收到的请求信息 diff --git a/fastdeploy/__init__.py b/fastdeploy/__init__.py index e511eb6c4..15186dfb7 100644 --- a/fastdeploy/__init__.py +++ b/fastdeploy/__init__.py @@ -22,14 +22,14 @@ import sys os.environ["GLOG_minloglevel"] = "2" # suppress log from aistudio os.environ["AISTUDIO_LOG"] = "critical" -from fastdeploy.utils import version from fastdeploy.engine.sampling_params import SamplingParams from fastdeploy.entrypoints.llm import LLM -__all__ = ['LLM', 'SamplingParams'] +__all__ = ["LLM", "SamplingParams"] try: import use_triton_in_paddle + use_triton_in_paddle.make_triton_compatible_with_paddle() except ImportError: pass @@ -38,13 +38,21 @@ except ImportError: def _patch_fastsafetensors(): try: - file_path = subprocess.check_output([ - sys.executable, "-c", "import fastsafetensors, os; \ + file_path = ( + subprocess.check_output( + [ + sys.executable, + "-c", + "import fastsafetensors, os; \ print(os.path.join(os.path.dirname(fastsafetensors.__file__), \ - 'frameworks', '_paddle.py'))" - ]).decode().strip() + 'frameworks', '_paddle.py'))", + ] + ) + .decode() + .strip() + ) - with open(file_path, 'r') as f: + with open(file_path, "r") as f: content = f.read() if "DType.U16: DType.BF16," in content and "DType.U8: paddle.uint8," in content: return @@ -56,21 +64,20 @@ def _patch_fastsafetensors(): inside_block = False for line in lines: new_lines.append(line) - if 'need_workaround_dtypes: Dict[DType, DType] = {' in line: + if "need_workaround_dtypes: Dict[DType, DType] = {" in line: inside_block = True - elif inside_block and '}' in line: - new_lines.insert(-1, ' DType.U16: DType.BF16,') + elif inside_block and "}" in line: + new_lines.insert(-1, " DType.U16: DType.BF16,") inside_block = False modified = True content = "\n".join(new_lines) if "DType.I8: paddle.uint8," in content: - content = content.replace("DType.I8: paddle.uint8,", - "DType.U8: paddle.uint8,") + content = 
content.replace("DType.I8: paddle.uint8,", "DType.U8: paddle.uint8,") modified = True if modified: - with open(file_path, 'w') as f: + with open(file_path, "w") as f: f.write(content + "\n") except Exception as e: diff --git a/fastdeploy/cache_manager/__init__.py b/fastdeploy/cache_manager/__init__.py index c40559bc8..f4ede9062 100644 --- a/fastdeploy/cache_manager/__init__.py +++ b/fastdeploy/cache_manager/__init__.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" \ No newline at end of file +""" diff --git a/fastdeploy/cache_manager/cache_data.py b/fastdeploy/cache_manager/cache_data.py index aeb58d55f..638da70bc 100644 --- a/fastdeploy/cache_manager/cache_data.py +++ b/fastdeploy/cache_manager/cache_data.py @@ -109,13 +109,12 @@ class BlockNode: parent_node_id = None return ( f"node_id {self.node_id}: depth {self.depth} hash_value {self.hash_value}" - + - f" shared_count {self.shared_count} is_gpu_leaf_node {self.is_gpu_leaf_node}" - + - f" is_cpu_leaf_node {self.is_cpu_leaf_node} block_id {self.block_id} " - + f"has_in_gpu {self.has_in_gpu} " + - f"cache_status {self.cache_status} parent {parent_node_id} with children number " - + f"{len(self.children)} req_id_set {self.req_id_set}") + + f" shared_count {self.shared_count} is_gpu_leaf_node {self.is_gpu_leaf_node}" + + f" is_cpu_leaf_node {self.is_cpu_leaf_node} block_id {self.block_id} " + + f"has_in_gpu {self.has_in_gpu} " + + f"cache_status {self.cache_status} parent {parent_node_id} with children number " + + f"{len(self.children)} req_id_set {self.req_id_set}" + ) @property def has_in_gpu(self): @@ -141,8 +140,7 @@ class BlockNode: """ check if the node is a leaf node in CPU """ - if (self.cache_status == CacheStatus.CPU) and (len(self.children) - == 0): + if (self.cache_status == CacheStatus.CPU) and (len(self.children) == 0): return True return False diff --git a/fastdeploy/cache_manager/cache_messager.py b/fastdeploy/cache_manager/cache_messager.py index 641f44bb1..f11c40690 100644 --- a/fastdeploy/cache_manager/cache_messager.py +++ b/fastdeploy/cache_manager/cache_messager.py @@ -21,51 +21,54 @@ import time import numpy as np import paddle -from fastdeploy.cache_manager.transfer_factory import (IPCCommManager, - RDMACommManager) +from fastdeploy.cache_manager.transfer_factory import IPCCommManager, RDMACommManager from fastdeploy.inter_communicator import EngineWorkerQueue, IPCSignal from fastdeploy.utils import get_logger logger = get_logger("cache_messager", "cache_messager.log") -class CacheMessager(object): +class CacheMessager: """ CacheMessager is used to send the cache data between the engine worker and the cache server. """ - def __init__(self, - splitwise_role, - transfer_protocol, - pod_ip, - engine_worker_queue_port, - local_data_parallel_id, - gpu_cache_kvs, - rank, - nranks, - num_layers, - gpu_id=0, - rdma_port=None): + def __init__( + self, + splitwise_role, + transfer_protocol, + pod_ip, + engine_worker_queue_port, + local_data_parallel_id, + gpu_cache_kvs, + rank, + nranks, + num_layers, + gpu_id=0, + rdma_port=None, + ): """ - Initialize the CacheMessager object. + Initialize the CacheMessager object. - Args: - splitwise_role (str): splitwise_role only can be 'prefill' or 'decode'. 
- transfer_protocol (str): support ipc and rdma - engine_worker_queue_port (int): engine_worker_queue port - gpu_cache_kvs (dict): GPU kv cache - rank (int): current rank - nranks (int): global rank number - num_layers (int): model layer number - gpu_id (int, optional): GPU ID - rdma_port (int, optional): RDMA port + Args: + splitwise_role (str): splitwise_role only can be 'prefill' or 'decode'. + transfer_protocol (str): support ipc and rdma + engine_worker_queue_port (int): engine_worker_queue port + gpu_cache_kvs (dict): GPU kv cache + rank (int): current rank + nranks (int): global rank number + num_layers (int): model layer number + gpu_id (int, optional): GPU ID + rdma_port (int, optional): RDMA port - Returns: - None + Returns: + None """ - assert splitwise_role in ["prefill", "decode"], \ - "splitwise_role must be prefill or decode" + assert splitwise_role in [ + "prefill", + "decode", + ], "splitwise_role must be prefill or decode" self.splitwise_role = splitwise_role self.gpu_cache_kvs = gpu_cache_kvs self.rank = rank @@ -76,11 +79,11 @@ class CacheMessager(object): is_server=False, num_client=self.nranks, client_id=self.rank, - local_data_parallel_id=local_data_parallel_id) + local_data_parallel_id=local_data_parallel_id, + ) transfer_protocol = transfer_protocol.split(",") - logger.info(f"splitwise role: {splitwise_role}, {transfer_protocol}" - f"rank: {rank}") + logger.info(f"splitwise role: {splitwise_role}, {transfer_protocol}" f"rank: {rank}") # 1. initialize the cache_k_ptr_list and cache_v_ptr_list self.num_layers = num_layers @@ -90,10 +93,8 @@ class CacheMessager(object): cache_v = [] self.messager = {} for layer_idx in range(self.num_layers): - key_cache = self.gpu_cache_kvs[ - f'key_caches_{layer_idx}_rank{self.rank}_device{gpu_id}'] - val_cache = self.gpu_cache_kvs[ - f'value_caches_{layer_idx}_rank{self.rank}_device{gpu_id}'] + key_cache = self.gpu_cache_kvs[f"key_caches_{layer_idx}_rank{self.rank}_device{gpu_id}"] + val_cache = self.gpu_cache_kvs[f"value_caches_{layer_idx}_rank{self.rank}_device{gpu_id}"] cache_k.append(key_cache) cache_v.append(val_cache) cache_k_ptr_list.append(key_cache.data_ptr()) @@ -109,7 +110,8 @@ class CacheMessager(object): block_bytes *= 2 logger.info( f"layers {num_layers} cache_shape: {cache_shape}, max_block_num: {max_block_num}, " - f"block_bytes: {block_bytes}, dtype: {key_cache.dtype}") + f"block_bytes: {block_bytes}, dtype: {key_cache.dtype}" + ) self.block_bytes = block_bytes # 3. 
initialize the messager @@ -122,24 +124,26 @@ class CacheMessager(object): cache_v, ) local_device_id = int(str(cache_k[0].place)[-2]) - logger.info( - f"done create ipc_comm with local_device_id:{local_device_id}, " - ) + logger.info(f"done create ipc_comm with local_device_id:{local_device_id}, ") elif protocol == "rdma": - logger.info( - f"splitwise_role rdma: {self.splitwise_role}, rank: {self.rank}, gpu_id: {gpu_id}" - ) + logger.info(f"splitwise_role rdma: {self.splitwise_role}, rank: {self.rank}, gpu_id: {gpu_id}") self.messager[protocol] = RDMACommManager( - splitwise_role, rank, gpu_id, cache_k_ptr_list, - cache_v_ptr_list, max_block_num, block_bytes, rdma_port) + splitwise_role, + rank, + gpu_id, + cache_k_ptr_list, + cache_v_ptr_list, + max_block_num, + block_bytes, + rdma_port, + ) self.gpu_id = gpu_id self.cache_info = dict() - layerwise_send_cache_thread = threading.Thread( - target=self._prefill_layerwise_send_cache_thread) + layerwise_send_cache_thread = threading.Thread(target=self._prefill_layerwise_send_cache_thread) layerwise_send_cache_thread.daemon = True layerwise_send_cache_thread.start() @@ -159,26 +163,30 @@ class CacheMessager(object): array=prefilled_step_idx_data, dtype=np.int32, suffix=self.gpu_id, - create=True) + create=True, + ) layer_shm_value = IPCSignal( name=f"splitwise_complete_prefilled_layer_{self.rank}", array=prefilled_layer_idx_data, dtype=np.int32, suffix=self.gpu_id, - create=True) + create=True, + ) except: step_shm_value = IPCSignal( name=f"splitwise_complete_prefilled_step_{self.rank}", array=prefilled_step_idx_data, dtype=np.int32, suffix=self.gpu_id, - create=False) + create=False, + ) layer_shm_value = IPCSignal( name=f"splitwise_complete_prefilled_layer_{self.rank}", array=prefilled_layer_idx_data, dtype=np.int32, suffix=self.gpu_id, - create=False) + create=False, + ) step_shm_value.value[0] = -1 layer_shm_value.value[0] = -1 @@ -193,21 +201,19 @@ class CacheMessager(object): if cache_info: logger.debug(f"cache info {cache_info}") for info in cache_info: - if info['request_id'] in self.cache_info: + if info["request_id"] in self.cache_info: self.cache_info[info["request_id"]].update(info) current_info = self.cache_info[info["request_id"]] if "dest_block_ids" in current_info and "src_block_ids" in current_info: - current_src_blocks = current_info[ - "src_block_ids"][-len(current_info["dest_block_ids"]):] - current_info[ - "src_block_ids"] = current_src_blocks + current_src_blocks = current_info["src_block_ids"][ + -len(current_info["dest_block_ids"]) : + ] + current_info["src_block_ids"] = current_src_blocks current_info["current_layer_ids"] = 0 current_info["status"] = "init" - logger.info( - f"start cache_infos: {current_info}") + logger.info(f"start cache_infos: {current_info}") self.cache_info[info["request_id"]] = current_info - self.last_step_idx = min( - self.last_step_idx, current_info['current_id']) + self.last_step_idx = min(self.last_step_idx, current_info["current_id"]) else: self.cache_info[info["request_id"]] = info prefilled_layer_idx = layer_shm_value.value[0] @@ -223,64 +229,53 @@ class CacheMessager(object): if not self.cache_info: time.sleep(0.001) continue - logger.debug( - f"prefilled_layer_idx: {prefilled_layer_idx}, prefilled_step_idx: {prefilled_step_idx}" - ) + logger.debug(f"prefilled_layer_idx: {prefilled_layer_idx}, prefilled_step_idx: {prefilled_step_idx}") for req_id, item in list(self.cache_info.items()): if "status" not in item: continue if "layer_idx" not in item: item["layer_idx"] = 0 - if item['status'] 
== 'error': + if item["status"] == "error": del self.cache_info[req_id] continue - if item['current_id'] > prefilled_step_idx: + if item["current_id"] > prefilled_step_idx: continue current_transfer_protocol = item["transfer_protocol"] if item["transfer_protocol"] == "rdma": - target_ip = item['ip'] - target_id = int(item['rdma_ports'][self.rank]) - status = self.messager[ - current_transfer_protocol].connect( - target_ip, target_id) + target_ip = item["ip"] + target_id = int(item["rdma_ports"][self.rank]) + status = self.messager[current_transfer_protocol].connect(target_ip, target_id) if not status: - logger.error( - f"connect to {target_ip}:{target_id} failed") + logger.error(f"connect to {target_ip}:{target_id} failed") item["status"] = "error" self.engine_worker_queue.finish_request_barrier.wait() if self.rank == 0: - self.engine_worker_queue.put_finished_req([ - (item['request_id'], "connect error") - ]) + self.engine_worker_queue.put_finished_req([(item["request_id"], "connect error")]) continue elif item["transfer_protocol"] == "ipc": target_ip = "0.0.0.0" - target_id = int(item['device_ids'][self.rank]) - src_block_ids = paddle.to_tensor(item['src_block_ids'], - dtype='int32', - place='cpu') - dest_block_ids = paddle.to_tensor(item['dest_block_ids'], - dtype='int32', - place='cpu') - if item['current_id'] < prefilled_step_idx: + target_id = int(item["device_ids"][self.rank]) + src_block_ids = paddle.to_tensor(item["src_block_ids"], dtype="int32", place="cpu") + dest_block_ids = paddle.to_tensor(item["dest_block_ids"], dtype="int32", place="cpu") + if item["current_id"] < prefilled_step_idx: current_layer_idx = self.num_layers else: current_layer_idx = prefilled_layer_idx + 1 - for layer_idx in range(item["layer_idx"], - current_layer_idx): + for layer_idx in range(item["layer_idx"], current_layer_idx): tic = time.time() - return_code = self.messager[ - current_transfer_protocol].write_cache( - target_ip, target_id, src_block_ids, - dest_block_ids, layer_idx) + return_code = self.messager[current_transfer_protocol].write_cache( + target_ip, + target_id, + src_block_ids, + dest_block_ids, + layer_idx, + ) if return_code != 0: item["status"] = "error" self.engine_worker_queue.finish_request_barrier.wait() if self.rank == 0: - self.engine_worker_queue.put_finished_req([ - (item['request_id'], "write cache error") - ]) + self.engine_worker_queue.put_finished_req([(item["request_id"], "write cache error")]) logger.error( f"write cache failed, layer_idx: {layer_idx}, " f"req_id: {item['request_id']}, dest_ip: {target_ip}" @@ -298,16 +293,14 @@ class CacheMessager(object): f"block_num: {block_num}, send_cache_speed(GB/s): {round(send_cache_speed, 5)}," f"avg_time per block(ms): {round(avg_time_per_block, 5)}" ) - item['layer_idx'] = current_layer_idx - if item['layer_idx'] == self.num_layers: + item["layer_idx"] = current_layer_idx + if item["layer_idx"] == self.num_layers: if item["transfer_protocol"] == "ipc": self.messager["ipc"].write_block_by_sync(target_id) logger.info(f"finish write cache {item['request_id']}") self.engine_worker_queue.finish_request_barrier.wait() if self.rank == 0: - self.engine_worker_queue.put_finished_req([ - (item['request_id'], "finished") - ]) + self.engine_worker_queue.put_finished_req([(item["request_id"], "finished")]) logger.info(f"put write cache {item['request_id']}") del self.cache_info[req_id] @@ -315,5 +308,4 @@ class CacheMessager(object): self.last_layer_idx = prefilled_layer_idx except Exception as e: - logger.error( - f"prefill layerwise send 
cache thread has exception: {e}") + logger.error(f"prefill layerwise send cache thread has exception: {e}") diff --git a/fastdeploy/cache_manager/cache_metrics.py b/fastdeploy/cache_manager/cache_metrics.py index 212b5c2dd..2f5acf36a 100644 --- a/fastdeploy/cache_manager/cache_metrics.py +++ b/fastdeploy/cache_manager/cache_metrics.py @@ -14,52 +14,45 @@ # limitations under the License. """ - from fastdeploy.utils import get_logger logger = get_logger("prefix_cache_manager", "prefix_cache_manager.log") - - class CacheMetrics: """ - Cache Metrics used to record the cache hit time, token num, request num, etc. + Cache Metrics used to record the cache hit time, token num, request num, etc. """ + def __init__(self): - self.total_match_time = 0.0 - self.avg_match_time = 0.0 + self.total_match_time = 0.0 + self.avg_match_time = 0.0 self.min_match_time = 1e9 self.max_match_time = 0.0 # request level - self.req_count = 0 - self.hit_req_count = 0 - self.hit_req_ratio = 0.0 + self.req_count = 0 + self.hit_req_count = 0 + self.hit_req_ratio = 0.0 # token level - self.total_gpu_matched_token_num = 0 + self.total_gpu_matched_token_num = 0 self.total_cpu_matched_token_num = 0 self.matched_token_num = 0 - self.total_token_num = 0 - self.hit_token_ratio = 0.0 + self.total_token_num = 0 + self.hit_token_ratio = 0.0 self.cpu_hit_token_ratio = 0.0 self.gpu_hit_token_ratio = 0.0 - def _update_history_hit_metrics(self): """ update hit ratio """ self.hit_req_ratio = self.hit_req_count / self.req_count self.hit_token_ratio = self.matched_token_num / self.total_token_num - self.cpu_hit_token_ratio = ( - self.total_cpu_matched_token_num / self.total_token_num - ) - self.gpu_hit_token_ratio = ( - self.total_gpu_matched_token_num / self.total_token_num - ) + self.cpu_hit_token_ratio = self.total_cpu_matched_token_num / self.total_token_num + self.gpu_hit_token_ratio = self.total_gpu_matched_token_num / self.total_token_num logger.info( f"Metrics for all requests: req_count {self.req_count} hit_req_count {self.hit_req_count}" @@ -82,31 +75,17 @@ class CacheMetrics: """ calculate hit metrics for current query """ - - cpu_cache_match_ratio = ( - current_query_cpu_match_token_num / current_query_token_num - ) - gpu_cache_match_ratio = ( - current_query_gpu_match_token_num / current_query_token_num - ) - total_match_ratio = ( - cpu_cache_match_ratio + gpu_cache_match_ratio - ) + cpu_cache_match_ratio = current_query_cpu_match_token_num / current_query_token_num + gpu_cache_match_ratio = current_query_gpu_match_token_num / current_query_token_num - - self.total_cpu_matched_token_num += ( - current_query_cpu_match_token_num - ) - self.total_gpu_matched_token_num += ( - current_query_gpu_match_token_num - ) + total_match_ratio = cpu_cache_match_ratio + gpu_cache_match_ratio - self.matched_token_num += ( - current_query_cpu_match_token_num - + current_query_gpu_match_token_num - ) - self.total_token_num += current_query_token_num + self.total_cpu_matched_token_num += current_query_cpu_match_token_num + self.total_gpu_matched_token_num += current_query_gpu_match_token_num + + self.matched_token_num += current_query_cpu_match_token_num + current_query_gpu_match_token_num + self.total_token_num += current_query_token_num logger.info( f"Metrics for req_id {req_id}: token_num {current_query_token_num}" + f" cpu_cache_match_ratio {cpu_cache_match_ratio}" @@ -134,4 +113,4 @@ class CacheMetrics: self.total_token_num = 0 self.hit_token_ratio = 0.0 self.cpu_hit_token_ratio = 0.0 - self.gpu_hit_token_ratio = 0.0 \ No newline at end of 
file + self.gpu_hit_token_ratio = 0.0 diff --git a/fastdeploy/cache_manager/cache_transfer_manager.py b/fastdeploy/cache_manager/cache_transfer_manager.py index 912624512..678819723 100644 --- a/fastdeploy/cache_manager/cache_transfer_manager.py +++ b/fastdeploy/cache_manager/cache_transfer_manager.py @@ -26,8 +26,11 @@ import paddle from fastdeploy.cache_manager.cache_data import CacheStatus from fastdeploy.engine.config import SpeculativeConfig from fastdeploy.inter_communicator import EngineCacheQueue, IPCSignal -from fastdeploy.model_executor.ops.gpu import (cuda_host_alloc, set_data_ipc, - swap_cache_all_layers) +from fastdeploy.model_executor.ops.gpu import ( + cuda_host_alloc, + set_data_ipc, + swap_cache_all_layers, +) from fastdeploy.utils import get_logger @@ -36,79 +39,58 @@ def parse_args(): 从命令行解析参数 """ parser = argparse.ArgumentParser("Cache transfer manager") - parser.add_argument("--splitwise_role", - type=str, - default="mixed", - help="splitwise role, can be decode, prefill or mixed") + parser.add_argument( + "--splitwise_role", + type=str, + default="mixed", + help="splitwise role, can be decode, prefill or mixed", + ) parser.add_argument("--rank", type=int, default=0, help="current rank") parser.add_argument("--device_id", type=int, default=0, help="device id") - parser.add_argument("--num_layers", - type=int, - default=1, - help="model num layers") - parser.add_argument("--head_dim", - type=int, - default=1, - help="model head dim") - parser.add_argument("--kv_num_head", - type=int, - default=1, - help="model kv num head") + parser.add_argument("--num_layers", type=int, default=1, help="model num layers") + parser.add_argument("--head_dim", type=int, default=1, help="model head dim") + parser.add_argument("--kv_num_head", type=int, default=1, help="model kv num head") parser.add_argument("--rdma_port", type=str, default="", help="rmda port") - parser.add_argument("--mp_num", - type=int, - default=1, - help="number of model parallel") - parser.add_argument("--protocol", - type=str, - default="ipc", - help="cache transfer protocol, only surport ipc now") - parser.add_argument("--enable_splitwise", - type=int, - default=0, - help="enable splitwise ") - parser.add_argument("--cache_queue_port", - type=int, - default=9923, - help="cache queue port") - parser.add_argument("--pod_ip", - type=str, - default="0.0.0.0", - help="pod ip") - parser.add_argument("--engine_worker_queue_port", - type=int, - default=9923, - help="engine worker queue port") - parser.add_argument("--engine_pid", - type=str, - default=None, - help="engine pid") + parser.add_argument("--mp_num", type=int, default=1, help="number of model parallel") + parser.add_argument( + "--protocol", + type=str, + default="ipc", + help="cache transfer protocol, only surport ipc now", + ) + parser.add_argument("--enable_splitwise", type=int, default=0, help="enable splitwise ") + parser.add_argument("--cache_queue_port", type=int, default=9923, help="cache queue port") + parser.add_argument("--pod_ip", type=str, default="0.0.0.0", help="pod ip") + parser.add_argument( + "--engine_worker_queue_port", + type=int, + default=9923, + help="engine worker queue port", + ) + parser.add_argument("--engine_pid", type=str, default=None, help="engine pid") - parser.add_argument("--num_gpu_blocks", - type=int, - default=1, - help="gpu cache block number") - parser.add_argument("--num_cpu_blocks", - type=int, - default=4, - help="cpu cache block number") - parser.add_argument("--block_size", - type=int, - default=64, - help="cache 
block size(tokens)") - parser.add_argument("--bytes_per_layer_per_block", - type=int, - default=1024, - help="per layer per block bytes") - parser.add_argument("--cache_dtype", - type=str, - default="bfloat16", - choices=["uint8", "bfloat16"], - help="cache dtype") - parser.add_argument("--speculative_config", - type=json.loads, - default="{}", - help="speculative config") + parser.add_argument("--num_gpu_blocks", type=int, default=1, help="gpu cache block number") + parser.add_argument("--num_cpu_blocks", type=int, default=4, help="cpu cache block number") + parser.add_argument("--block_size", type=int, default=64, help="cache block size(tokens)") + parser.add_argument( + "--bytes_per_layer_per_block", + type=int, + default=1024, + help="per layer per block bytes", + ) + parser.add_argument( + "--cache_dtype", + type=str, + default="bfloat16", + choices=["uint8", "bfloat16"], + help="cache dtype", + ) + parser.add_argument( + "--speculative_config", + type=json.loads, + default="{}", + help="speculative config", + ) parser.add_argument("--local_data_parallel_id", type=int, default=0) args = parser.parse_args() @@ -134,14 +116,10 @@ class CacheTransferManager: self.gpu_cache_v_tensors = [] self.speculative_config = SpeculativeConfig(**args.speculative_config) self.num_extra_layers = self.speculative_config.num_extra_cache_layer - self.num_extra_layer_gpu_blocks = \ - int(args.num_gpu_blocks * \ - self.speculative_config.num_gpu_block_expand_ratio) + self.num_extra_layer_gpu_blocks = int(args.num_gpu_blocks * self.speculative_config.num_gpu_block_expand_ratio) - self.swap_to_cpu_thread_pool = concurrent.futures.ThreadPoolExecutor( - max_workers=1) - self.swap_to_gpu_thread_pool = concurrent.futures.ThreadPoolExecutor( - max_workers=1) + self.swap_to_cpu_thread_pool = concurrent.futures.ThreadPoolExecutor(max_workers=1) + self.swap_to_gpu_thread_pool = concurrent.futures.ThreadPoolExecutor(max_workers=1) self.transfer_task_queue = queue.Queue() # 用来接收传输任务 self.tansfer_done_queue = queue.Queue() # 用来告知任务执行完毕 self.n_ranks = args.mp_num @@ -154,81 +132,72 @@ class CacheTransferManager: is_server=False, num_client=args.mp_num, client_id=rank, - local_data_parallel_id=args.local_data_parallel_id) + local_data_parallel_id=args.local_data_parallel_id, + ) self.num_cpu_blocks = args.num_cpu_blocks cache_type = args.cache_dtype for i in range(args.num_layers + self.num_extra_layers): - num_gpu_blocks = args.num_gpu_blocks if i < args.num_layers else \ - self.num_extra_layer_gpu_blocks + num_gpu_blocks = args.num_gpu_blocks if i < args.num_layers else self.num_extra_layer_gpu_blocks - self.gpu_cache_kvs["key_caches_{}_rank{}_device{}".format( - i, rank, device)] = paddle.full( - shape=[ - num_gpu_blocks, - args.kv_num_head, - args.block_size, - args.head_dim, - ], - fill_value=0, - dtype=cache_type, - ) - self.gpu_cache_k_tensors.append( - self.gpu_cache_kvs["key_caches_{}_rank{}_device{}".format( - i, rank, device)]) - self.gpu_cache_kvs["value_caches_{}_rank{}_device{}".format( - i, rank, device)] = paddle.full( - shape=[ - num_gpu_blocks, - args.kv_num_head, - args.block_size, - args.head_dim, - ], - fill_value=0, - dtype=cache_type, - ) - self.gpu_cache_v_tensors.append( - self.gpu_cache_kvs["value_caches_{}_rank{}_device{}".format( - i, rank, device)]) + self.gpu_cache_kvs[f"key_caches_{i}_rank{rank}_device{device}"] = paddle.full( + shape=[ + num_gpu_blocks, + args.kv_num_head, + args.block_size, + args.head_dim, + ], + fill_value=0, + dtype=cache_type, + ) + 
self.gpu_cache_k_tensors.append(self.gpu_cache_kvs[f"key_caches_{i}_rank{rank}_device{device}"]) + self.gpu_cache_kvs[f"value_caches_{i}_rank{rank}_device{device}"] = paddle.full( + shape=[ + num_gpu_blocks, + args.kv_num_head, + args.block_size, + args.head_dim, + ], + fill_value=0, + dtype=cache_type, + ) + self.gpu_cache_v_tensors.append(self.gpu_cache_kvs[f"value_caches_{i}_rank{rank}_device{device}"]) set_data_ipc( - self.gpu_cache_kvs["key_caches_{}_rank{}_device{}".format( - i, rank, device)], - "key_caches_{}_rank{}.device{}".format(i, rank, device)) + self.gpu_cache_kvs[f"key_caches_{i}_rank{rank}_device{device}"], + f"key_caches_{i}_rank{rank}.device{device}", + ) set_data_ipc( - self.gpu_cache_kvs["value_caches_{}_rank{}_device{}".format( - i, rank, device)], - "value_caches_{}_rank{}.device{}".format(i, rank, device)) - cache_kv_size_byte = sum( - [tmp.numel() * 1 for key, tmp in self.gpu_cache_kvs.items()]) + self.gpu_cache_kvs[f"value_caches_{i}_rank{rank}_device{device}"], + f"value_caches_{i}_rank{rank}.device{device}", + ) + cache_kv_size_byte = sum([tmp.numel() * 1 for key, tmp in self.gpu_cache_kvs.items()]) logger.info(f"device :{self.device}") logger.info(f"cache_kv_size_byte : {cache_kv_size_byte}") - logger.info( - f"done init cache (full) gmem alloc : {paddle.device.cuda.memory_allocated()}" - ) + logger.info(f"done init cache (full) gmem alloc : {paddle.device.cuda.memory_allocated()}") paddle.set_device("cpu") self.k_dst_ptrs = [] self.v_dst_ptrs = [] for i in range(args.num_layers + self.num_extra_layers): - self.cpu_cache_kvs["key_caches_{}_rank{}".format( - i, rank)] = cuda_host_alloc(args.num_cpu_blocks * - args.bytes_per_layer_per_block) - self.k_dst_ptrs.append( - self.cpu_cache_kvs["key_caches_{}_rank{}".format(i, rank)]) - self.cpu_cache_kvs["value_caches_{}_rank{}".format( - i, rank)] = cuda_host_alloc(args.num_cpu_blocks * - args.bytes_per_layer_per_block) - self.v_dst_ptrs.append( - self.cpu_cache_kvs["value_caches_{}_rank{}".format(i, rank)]) + self.cpu_cache_kvs[f"key_caches_{i}_rank{rank}"] = cuda_host_alloc( + args.num_cpu_blocks * args.bytes_per_layer_per_block + ) + self.k_dst_ptrs.append(self.cpu_cache_kvs[f"key_caches_{i}_rank{rank}"]) + self.cpu_cache_kvs[f"value_caches_{i}_rank{rank}"] = cuda_host_alloc( + args.num_cpu_blocks * args.bytes_per_layer_per_block + ) + self.v_dst_ptrs.append(self.cpu_cache_kvs[f"value_caches_{i}_rank{rank}"]) cache_ready_signal_data = np.zeros(shape=[args.mp_num], dtype=np.int32) - self.cache_ready_signal = IPCSignal(name="cache_ready_signal", - array=cache_ready_signal_data, - dtype=np.int32, - suffix=args.engine_pid, - create=False) + self.cache_ready_signal = IPCSignal( + name="cache_ready_signal", + array=cache_ready_signal_data, + dtype=np.int32, + suffix=args.engine_pid, + create=False, + ) self.cache_ready_signal.value[self.rank] = 1 paddle.set_device(f"gpu:{device}") @@ -251,9 +220,7 @@ class CacheTransferManager: rdma_port=args.rdma_port, ) logger.info("successfully create cache messager") - logger.info( - f"done init CacheMessager gmem alloc : {paddle.device.cuda.memory_allocated()}" - ) + logger.info(f"done init CacheMessager gmem alloc : {paddle.device.cuda.memory_allocated()}") cache_task_broadcast_data = np.zeros(shape=[1], dtype=np.int32) self.cache_task_broadcast_signal = IPCSignal( @@ -261,10 +228,17 @@ class CacheTransferManager: array=cache_task_broadcast_data, dtype=np.int32, suffix=args.engine_pid, - create=False) + create=False, + ) - def _do_swap_to_cpu_task(self, swap_node_ids, gpu_block_id, 
cpu_block_id, - event_type, transfer_task_id): + def _do_swap_to_cpu_task( + self, + swap_node_ids, + gpu_block_id, + cpu_block_id, + event_type, + transfer_task_id, + ): """ swap cache GPU->CPU """ @@ -282,14 +256,17 @@ class CacheTransferManager: if self.rank == 0: self.cache_task_queue.swap_to_cpu_barrier2.reset() self.cache_task_queue.put_transfer_done_signal(result) - logger.debug( - f"_do_swap_to_cpu_task: put_transfer_done_signal {result}") - logger.info( - f"_do_swap_to_cpu_task: put_transfer_done_signal for transfer_task_id {transfer_task_id}" - ) + logger.debug(f"_do_swap_to_cpu_task: put_transfer_done_signal {result}") + logger.info(f"_do_swap_to_cpu_task: put_transfer_done_signal for transfer_task_id {transfer_task_id}") - def _do_swap_to_gpu_task(self, swap_node_ids, gpu_block_id, cpu_block_id, - event_type, transfer_task_id): + def _do_swap_to_gpu_task( + self, + swap_node_ids, + gpu_block_id, + cpu_block_id, + event_type, + transfer_task_id, + ): """ swap cache CPU->GPU """ @@ -307,11 +284,8 @@ class CacheTransferManager: if self.rank == 0: self.cache_task_queue.swap_to_gpu_barrier2.reset() self.cache_task_queue.put_transfer_done_signal(result) - logger.debug( - f"_do_swap_to_gpu_task: put_transfer_done_signal {result}") - logger.info( - f"_do_swap_to_gpu_task: put_transfer_done_signal for transfer_task_id {transfer_task_id}" - ) + logger.debug(f"_do_swap_to_gpu_task: put_transfer_done_signal {result}") + logger.info(f"_do_swap_to_gpu_task: put_transfer_done_signal for transfer_task_id {transfer_task_id}") def do_data_transfer(self): """ @@ -327,8 +301,7 @@ class CacheTransferManager: if self.rank == 0: self.cache_task_queue.barrier1.reset() if self.cache_task_broadcast_signal.value[0] == 1: - data, read_finish = self.cache_task_queue.get_transfer_task( - ) + data, read_finish = self.cache_task_queue.get_transfer_task() logger.debug(f"transfer data: get_transfer_task {data}") if read_finish: self.cache_task_broadcast_signal.value[0] = 0 @@ -386,8 +359,7 @@ class CacheTransferManager: """ logger.debug( f"transfer data: transfer_task_id {transfer_task_id}: swap_node_ids {swap_node_ids}" - + - f"task_gpu_block_id {task_gpu_block_id} task_cpu_block_id {task_cpu_block_id} event_type {event_type}" + + f"task_gpu_block_id {task_gpu_block_id} task_cpu_block_id {task_cpu_block_id} event_type {event_type}" ) start_time = time.time() try: @@ -446,8 +418,7 @@ class CacheTransferManager: elasped_time = end_time - start_time logger.info( f"transfer data: transfer_task_id {transfer_task_id} event_type {event_type}: " - + - f"transfer {len(gpu_block_ids)} blocks done elapsed_time {elasped_time:.4f}" + + f"transfer {len(gpu_block_ids)} blocks done elapsed_time {elasped_time:.4f}" ) return ( swap_node_ids, diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py index 10e463cd0..e64dbb5ae 100644 --- a/fastdeploy/cache_manager/prefix_cache_manager.py +++ b/fastdeploy/cache_manager/prefix_cache_manager.py @@ -41,11 +41,13 @@ class PrefixCacheManager: PrefixCacheManager is used to manage the prefix tree and the cache. 
""" - def __init__(self, - config, - tensor_parallel_size, - splitwise_role="mixed", - local_data_parallel_id=0): + def __init__( + self, + config, + tensor_parallel_size, + splitwise_role="mixed", + local_data_parallel_id=0, + ): """ initialize the PrefixCacheManager """ @@ -66,14 +68,12 @@ class PrefixCacheManager: self.num_cpu_blocks = self.cache_config.num_cpu_blocks self.gpu_free_block_list = list(range(self.num_gpu_blocks - 1, -1, -1)) if self.num_cpu_blocks > 0: - self.cpu_free_block_list = list( - range(self.num_cpu_blocks - 1, -1, -1)) + self.cpu_free_block_list = list(range(self.num_cpu_blocks - 1, -1, -1)) else: self.cpu_free_block_list = [] heapq.heapify(self.gpu_free_block_list) heapq.heapify(self.cpu_free_block_list) - self.node_id_pool = list( - range(self.num_gpu_blocks + self.num_cpu_blocks)) + self.node_id_pool = list(range(self.num_gpu_blocks + self.num_cpu_blocks)) self.radix_tree_root = BlockNode(-1, [], 0, 0, -1, 0, None, None, None) @@ -90,7 +90,7 @@ class PrefixCacheManager: self.task_swapping_event = {} self.node_map = {} - self.req_leaf_map = ({}) # {request_id: leaf node} + self.req_leaf_map = {} # {request_id: leaf node} self.leaf_req_map = defaultdict(set) self.unfilled_req_block_map = defaultdict(list) @@ -102,14 +102,18 @@ class PrefixCacheManager: logger.info( f"num_gpu_blocks_server_owned {self.num_gpu_blocks} num_cpu_blocks " - + - f"{self.num_cpu_blocks}, bytes_per_layer_per_block {self.cache_config.bytes_per_layer_per_block}" + + f"{self.num_cpu_blocks}, bytes_per_layer_per_block {self.cache_config.bytes_per_layer_per_block}" ) - - - def launch_cache_manager(self, cache_config, tensor_parallel_size, \ - device_ids, pod_ip, engine_worker_queue_port, pid_suffix): + def launch_cache_manager( + self, + cache_config, + tensor_parallel_size, + device_ids, + pod_ip, + engine_worker_queue_port, + pid_suffix, + ): """ launch_cache_manager function used to initialize the cache manager. 
""" @@ -120,70 +124,72 @@ class PrefixCacheManager: array=broadcast_cache_task_flag_array, dtype=np.int32, suffix=pid_suffix, - create=True) + create=True, + ) self.cache_task_queue = EngineCacheQueue( address=(pod_ip, cache_config.cache_queue_port), - authkey=b'cache_queue_service', + authkey=b"cache_queue_service", is_server=False, num_client=tensor_parallel_size, client_id=0, - local_data_parallel_id=self.local_data_parallel_id) + local_data_parallel_id=self.local_data_parallel_id, + ) current_dir_path = os.path.split(os.path.abspath(__file__))[0] filename = "cache_transfer_manager.py" py_path = os.path.join(current_dir_path, filename) - if (hasattr(cache_config.model_cfg, "num_key_value_heads") - and hasattr(cache_config.model_cfg, "num_key_value_heads") - and cache_config.model_cfg.num_key_value_heads is not None - and int(cache_config.model_cfg.num_key_value_heads) > 0): - kv_num_head = int(cache_config.model_cfg.num_key_value_heads - ) // tensor_parallel_size + if ( + hasattr(cache_config.model_cfg, "num_key_value_heads") + and hasattr(cache_config.model_cfg, "num_key_value_heads") + and cache_config.model_cfg.num_key_value_heads is not None + and int(cache_config.model_cfg.num_key_value_heads) > 0 + ): + kv_num_head = int(cache_config.model_cfg.num_key_value_heads) // tensor_parallel_size else: kv_num_head = cache_config.model_cfg.num_attention_heads // tensor_parallel_size - cache_ready_signal_data = np.zeros(shape=[tensor_parallel_size], - dtype=np.int32) - self.cache_ready_signal = IPCSignal(name="cache_ready_signal", - array=cache_ready_signal_data, - dtype=np.int32, - suffix=pid_suffix, - create=True) + cache_ready_signal_data = np.zeros(shape=[tensor_parallel_size], dtype=np.int32) + self.cache_ready_signal = IPCSignal( + name="cache_ready_signal", + array=cache_ready_signal_data, + dtype=np.int32, + suffix=pid_suffix, + create=True, + ) log_dir = envs.FD_LOG_DIR cache_manager_processes = [] for i in range(tensor_parallel_size): launch_cmd = ( "FLAGS_allocator_strategy=auto_growth CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7" - + " NCCL_MAX_NCHANNELS=1 NCCL_BUFFSIZE=0" + - f" {sys.executable} {py_path}" + - f" --device_id {int(device_ids[i])}" + f" --rank {i}" + - f" --splitwise_role {self.splitwise_role}" + - f" --num_layers {cache_config.model_cfg.num_layers}" + - f" --head_dim {cache_config.model_cfg.head_dim}" + - f" --kv_num_head {kv_num_head}" + - f" --mp_num {tensor_parallel_size}" + - f" --cache_dtype {cache_config.cache_dtype}" + - f" --cache_queue_port {cache_config.cache_queue_port}" + - f" --enable_splitwise {int(self.enable_splitwise)}" + - f" --pod_ip {pod_ip}" + - f" --engine_worker_queue_port {engine_worker_queue_port}" + - f" --num_gpu_blocks {cache_config.total_block_num}" + - f" --num_cpu_blocks {cache_config.num_cpu_blocks}" + - f" --bytes_per_layer_per_block {cache_config.bytes_per_layer_per_block}" - + f" --block_size {cache_config.block_size}" + - f" --engine_pid {pid_suffix}" + - f" --protocol {cache_config.cache_transfer_protocol}" + - f" --local_data_parallel_id {self.local_data_parallel_id}" + - f" --rdma_port {cache_config.rdma_comm_ports[i] if cache_config.rdma_comm_ports is not None else '0'}" - + - f" --speculative_config '{self.speculative_config.to_json_string()}'" - + - f" >{log_dir}/launch_cache_manager_{int(device_ids[i])}.log 2>&1" + + " NCCL_MAX_NCHANNELS=1 NCCL_BUFFSIZE=0" + + f" {sys.executable} {py_path}" + + f" --device_id {int(device_ids[i])}" + + f" --rank {i}" + + f" --splitwise_role {self.splitwise_role}" + + f" --num_layers 
{cache_config.model_cfg.num_layers}" + + f" --head_dim {cache_config.model_cfg.head_dim}" + + f" --kv_num_head {kv_num_head}" + + f" --mp_num {tensor_parallel_size}" + + f" --cache_dtype {cache_config.cache_dtype}" + + f" --cache_queue_port {cache_config.cache_queue_port}" + + f" --enable_splitwise {int(self.enable_splitwise)}" + + f" --pod_ip {pod_ip}" + + f" --engine_worker_queue_port {engine_worker_queue_port}" + + f" --num_gpu_blocks {cache_config.total_block_num}" + + f" --num_cpu_blocks {cache_config.num_cpu_blocks}" + + f" --bytes_per_layer_per_block {cache_config.bytes_per_layer_per_block}" + + f" --block_size {cache_config.block_size}" + + f" --engine_pid {pid_suffix}" + + f" --protocol {cache_config.cache_transfer_protocol}" + + f" --local_data_parallel_id {self.local_data_parallel_id}" + + f" --rdma_port {cache_config.rdma_comm_ports[i] if cache_config.rdma_comm_ports is not None else '0'}" + + f" --speculative_config '{self.speculative_config.to_json_string()}'" + + f" >{log_dir}/launch_cache_manager_{int(device_ids[i])}.log 2>&1" ) logger.info(f"Launch cache transfer manager, command:{launch_cmd}") - cache_manager_processes.append( - subprocess.Popen(launch_cmd, shell=True, preexec_fn=os.setsid)) + cache_manager_processes.append(subprocess.Popen(launch_cmd, shell=True, preexec_fn=os.setsid)) # 等待cache初始化完毕 logger.info("Waiting for cache transfer manager ready...") while np.sum(self.cache_ready_signal.value) != tensor_parallel_size: @@ -192,9 +198,7 @@ class PrefixCacheManager: if exit_code is None: logger.info("Launch cache transfer manager successful") else: - logger.info( - "Launch cache transfer manager failed, see launch_cache_manager.log for more information" - ) + logger.info("Launch cache transfer manager failed, see launch_cache_manager.log for more information") if cache_config.enable_hierarchical_cache and self.num_cpu_blocks > 0: logger.info("Enable hierarchical cache.") @@ -207,12 +211,10 @@ class PrefixCacheManager: """ self.cache_config = cache_config self.num_gpu_blocks = cache_config.prefill_kvcache_block_num - self.gpu_free_block_list = list(range(self.num_gpu_blocks - 1, -1, - -1)) # 服务端管理的GPU上剩余的block id + self.gpu_free_block_list = list(range(self.num_gpu_blocks - 1, -1, -1)) # 服务端管理的GPU上剩余的block id heapq.heapify(self.gpu_free_block_list) - self.node_id_pool = list( - range(self.num_gpu_blocks + self.num_cpu_blocks)) + self.node_id_pool = list(range(self.num_gpu_blocks + self.num_cpu_blocks)) def _enable_cpu_cache(self): """ @@ -226,8 +228,7 @@ class PrefixCacheManager: # port=ipc_cache_queue_port, # ) # 开启获取传输任务结果的监听线程 - self.transfer_recv_thread = threading.Thread( - target=self.recv_data_transfer_result) + self.transfer_recv_thread = threading.Thread(target=self.recv_data_transfer_result) self.transfer_recv_thread.start() def allocate_gpu_blocks(self, num_blocks): @@ -237,9 +238,7 @@ class PrefixCacheManager: assert num_blocks <= len( self.gpu_free_block_list ), f"gpu free block num: {len(self.gpu_free_block_list)} < needed number {num_blocks}" - allocated_block_ids = [ - heapq.heappop(self.gpu_free_block_list) for i in range(num_blocks) - ] + allocated_block_ids = [heapq.heappop(self.gpu_free_block_list) for i in range(num_blocks)] logger.info( f"allocate_gpu_blocks: {allocated_block_ids}, len(self.gpu_free_block_list) {len(self.gpu_free_block_list)}" ) @@ -265,9 +264,7 @@ class PrefixCacheManager: assert num_blocks <= len( self.cpu_free_block_list ), f"cpu free block num: {len(self.cpu_free_block_list)} < needed number {num_blocks}" - 
allocated_block_ids = [ - heapq.heappop(self.cpu_free_block_list) for i in range(num_blocks) - ] + allocated_block_ids = [heapq.heappop(self.cpu_free_block_list) for i in range(num_blocks)] logger.info( f"allocate_cpu_blocks: {allocated_block_ids}, len(self.cpu_free_block_list) {len(self.cpu_free_block_list)}" ) @@ -307,16 +304,17 @@ class PrefixCacheManager: """ self.task_swapping_event[transfer_task_id] = Event() - self.cache_task_queue.put_transfer_task(( - swap_node_ids, - gpu_block_ids, - cpu_block_ids, - event_type, - transfer_task_id, - )) + self.cache_task_queue.put_transfer_task( + ( + swap_node_ids, + gpu_block_ids, + cpu_block_ids, + event_type, + transfer_task_id, + ) + ) if is_sync: self.sync_swap_task(transfer_task_id) - return def sync_swap_task(self, transfer_task_id): """ @@ -325,26 +323,27 @@ class PrefixCacheManager: self.task_swapping_event[transfer_task_id].wait() del self.task_swapping_event[transfer_task_id] - def _check_validity(self, req_id, match_gpu_blocks_num, - expected_block_num): + def _check_validity(self, req_id, match_gpu_blocks_num, expected_block_num): """ check enough gpu memory to allocate cache """ - if expected_block_num - match_gpu_blocks_num > len( - self.gpu_free_block_list): + if expected_block_num - match_gpu_blocks_num > len(self.gpu_free_block_list): msg = ( f"request_block_ids: request block for req_id {req_id} failed. " - + - f"matched gpu block num: {match_gpu_blocks_num} require extra gpu block num: " - + - f"{expected_block_num - match_gpu_blocks_num} > free block num: {len(self.gpu_free_block_list)}" + + f"matched gpu block num: {match_gpu_blocks_num} require extra gpu block num: " + + f"{expected_block_num - match_gpu_blocks_num} > free block num: {len(self.gpu_free_block_list)}" ) logger.info(msg) raise Exception("Not enough GPU memory to allocate cache") - - def _prepare_cpu_cache(self, req_id, swap_node_ids, gpu_recv_block_ids, \ - cpu_recv_block_ids, match_cpu_block_ids): + def _prepare_cpu_cache( + self, + req_id, + swap_node_ids, + gpu_recv_block_ids, + cpu_recv_block_ids, + match_cpu_block_ids, + ): """ 将cpu cache转移到GPU """ @@ -357,11 +356,8 @@ class PrefixCacheManager: for tmp_cpu_block_id in match_cpu_block_ids: need_transfer_task_cpu_block_ids.append(tmp_cpu_block_id) - assert len(need_transfer_task_gpu_block_ids) == len( - need_transfer_task_cpu_block_ids) - logger.info( - f"request_block_ids: req_id {req_id} issue_swap_task transfer_task_id {transfer_task_id}" - ) + assert len(need_transfer_task_gpu_block_ids) == len(need_transfer_task_cpu_block_ids) + logger.info(f"request_block_ids: req_id {req_id} issue_swap_task transfer_task_id {transfer_task_id}") self.issue_swap_task( transfer_task_id, swap_node_ids, @@ -371,8 +367,16 @@ class PrefixCacheManager: True, ) - def _prepare_cache(self, req_id, input_ids, block_size, \ - expected_block_num, match_gpu_block_ids, match_cpu_block_ids, match_node_ids): + def _prepare_cache( + self, + req_id, + input_ids, + block_size, + expected_block_num, + match_gpu_block_ids, + match_cpu_block_ids, + match_node_ids, + ): """ prepare cache for request """ @@ -394,26 +398,31 @@ class PrefixCacheManager: gpu_extra_block_ids = self.allocate_gpu_blocks(gpu_extra_block_num) if len(gpu_recv_block_ids) > 0: - self._prepare_cpu_cache(req_id, match_node_ids, gpu_recv_block_ids, \ - cpu_recv_block_ids, match_cpu_block_ids) + self._prepare_cpu_cache( + req_id, + match_node_ids, + gpu_recv_block_ids, + cpu_recv_block_ids, + match_cpu_block_ids, + ) return gpu_recv_block_ids, gpu_extra_block_ids def 
request_block_ids(self, task, block_size, dec_token_num, *args): """ - Allocate blocks for a task. - This is a synchronous interface. If CPU-to-GPU data transfer occurs, - it will block until synchronization completes. - Callers requiring asynchronous behavior should invoke this via a thread pool. + Allocate blocks for a task. + This is a synchronous interface. If CPU-to-GPU data transfer occurs, + it will block until synchronization completes. + Callers requiring asynchronous behavior should invoke this via a thread pool. - Parameters: - - task: Task dictionary - - block_size: Size per block (in tokens) - - dec_token_num: Number of tokens reserved for decoding on the server side + Parameters: + - task: Task dictionary + - block_size: Size per block (in tokens) + - dec_token_num: Number of tokens reserved for decoding on the server side - Returns: - - common_block_ids: List of matched shared blocks - - unique_block_ids: List of exclusively allocated blocks + Returns: + - common_block_ids: List of matched shared blocks + - unique_block_ids: List of exclusively allocated blocks """ with self.request_release_lock: try: @@ -423,9 +432,7 @@ class PrefixCacheManager: self.metrics.req_count += 1 input_ids = task.prompt_token_ids req_id = task.request_id - logger.info( - f"request_block_ids: start to allocate blocks for req_id {req_id}" - ) + logger.info(f"request_block_ids: start to allocate blocks for req_id {req_id}") input_token_num = len(input_ids) common_block_ids = [] unique_block_ids = [] @@ -443,34 +450,43 @@ class PrefixCacheManager: matched_block_num = match_gpu_blocks_num + match_cpu_blocks_num matched_token_num_in_cpu_and_gpu = gpu_match_token_num + cpu_match_token_num # check enough gpu memory to allocate cache - block_num = (input_token_num + block_size - 1 + - dec_token_num) // block_size + block_num = (input_token_num + block_size - 1 + dec_token_num) // block_size self._check_validity(req_id, matched_block_num, block_num) # update matched node info current_time = time.time() - self._update_matched_node_info(req_id, match_block_node, - current_time) + self._update_matched_node_info(req_id, match_block_node, current_time) # 2. prepare cache - gpu_recv_block_ids, gpu_extra_block_ids, = self._prepare_cache(req_id, \ - input_ids, block_size, block_num, match_gpu_block_ids, match_cpu_block_ids, swap_node_ids) + (gpu_recv_block_ids, gpu_extra_block_ids,) = self._prepare_cache( + req_id, + input_ids, + block_size, + block_num, + match_gpu_block_ids, + match_cpu_block_ids, + swap_node_ids, + ) # update matched token num - matched_block_num = (gpu_match_token_num + cpu_match_token_num) + matched_block_num = gpu_match_token_num + cpu_match_token_num common_block_ids = match_gpu_block_ids + gpu_recv_block_ids unique_block_ids = gpu_extra_block_ids dec_block_num = dec_token_num // block_size - left_input_ids = input_ids[ - matched_token_num_in_cpu_and_gpu:] # 没在前缀树中的token + left_input_ids = input_ids[matched_token_num_in_cpu_and_gpu:] # 没在前缀树中的token gpu_build_path_block_ids = [] gpu_build_path_block_ids = gpu_extra_block_ids - leaf_node = self.build_path(req_id, current_time, input_ids, - left_input_ids, - gpu_build_path_block_ids, - block_size, match_block_node, - dec_block_num) + leaf_node = self.build_path( + req_id, + current_time, + input_ids, + left_input_ids, + gpu_build_path_block_ids, + block_size, + match_block_node, + dec_block_num, + ) self.req_leaf_map[req_id] = leaf_node self.leaf_req_map[leaf_node].add(req_id) # 3. 
update metrics @@ -482,17 +498,15 @@ class PrefixCacheManager: gpu_match_token_num, input_token_num, ) - hit_info[ - "gpu_cache_blocks"] = gpu_match_token_num // block_size - hit_info[ - "cpu_cache_blocks"] = cpu_match_token_num // block_size + hit_info["gpu_cache_blocks"] = gpu_match_token_num // block_size + hit_info["cpu_cache_blocks"] = cpu_match_token_num // block_size self.metrics._update_history_hit_metrics() if self.metrics.req_count % 10000 == 0: self.metrics.reset_metrics() logger.info( f"request_block_ids: request block for req_id {req_id}: common_block_ids " - + - f"{common_block_ids}, unique_block_ids {unique_block_ids}") + + f"{common_block_ids}, unique_block_ids {unique_block_ids}" + ) return common_block_ids, unique_block_ids, hit_info except Exception as e: logger.error(f"request_block_ids: error: {type(e)} {e}") @@ -523,25 +537,21 @@ class PrefixCacheManager: node.decrement_shared_count() node = node.parent - logger.info( - f"release_block_ids: req_id {req_id} leaf_node {leaf_node}" - ) + logger.info(f"release_block_ids: req_id {req_id} leaf_node {leaf_node}") if leaf_node == self.radix_tree_root: - self.recycle_gpu_blocks( - self.unfilled_req_block_map[req_id]) + self.recycle_gpu_blocks(self.unfilled_req_block_map[req_id]) del self.unfilled_req_block_map[req_id] return if leaf_node in self.gpu_lru_leaf_set: return - if (leaf_node.shared_count == 0 and leaf_node.is_gpu_leaf_node - and leaf_node.is_persistent is False): + if leaf_node.shared_count == 0 and leaf_node.is_gpu_leaf_node and leaf_node.is_persistent is False: self.gpu_lru_leaf_set.add(leaf_node) heapq.heappush(self.gpu_lru_leaf_heap, leaf_node) logger.info( - f"release_block_ids: req_id {req_id} has been finished, " + - f"current gpu_lru_leaf_heap length {len(self.gpu_lru_leaf_heap)}" + f"release_block_ids: req_id {req_id} has been finished, " + + f"current gpu_lru_leaf_heap length {len(self.gpu_lru_leaf_heap)}" ) return except Exception as e: @@ -563,8 +573,15 @@ class PrefixCacheManager: node.reverved_dec_block_ids = [] self.recycle_gpu_blocks(node.block_id) - def _handle_free_gpu_node_with_cpu(self, node, hash_value_input_ids_map, \ - hash_value_depth_map, need_recycle_gpu_block_ids, hash_value_gpu_block_ids_map, hash_value_swap_node_ids_map): + def _handle_free_gpu_node_with_cpu( + self, + node, + hash_value_input_ids_map, + hash_value_depth_map, + need_recycle_gpu_block_ids, + hash_value_gpu_block_ids_map, + hash_value_swap_node_ids_map, + ): """ GPU node eviction in hierarchical cache layers """ @@ -573,14 +590,19 @@ class PrefixCacheManager: node.reverved_dec_block_ids = [] need_recycle_gpu_block_ids.append(node.block_id) - hash_value_gpu_block_ids_map[node.input_hash_value].append( - node.block_id) - hash_value_swap_node_ids_map[node.input_hash_value].append( - node.node_id) + hash_value_gpu_block_ids_map[node.input_hash_value].append(node.block_id) + hash_value_swap_node_ids_map[node.input_hash_value].append(node.node_id) - def _evict_cache_async(self, future, total_gpu_free_count, \ - hash_value_gpu_block_ids_map, hash_value_block_ids_map, \ - hash_value_swap_node_ids_map, hash_value_input_ids_map, hash_value_depth_map): + def _evict_cache_async( + self, + future, + total_gpu_free_count, + hash_value_gpu_block_ids_map, + hash_value_block_ids_map, + hash_value_swap_node_ids_map, + hash_value_input_ids_map, + hash_value_depth_map, + ): """ evict cache async (GPU --> CPU) """ @@ -592,23 +614,21 @@ class PrefixCacheManager: need_transfer_task_cpu_block_ids = [] cpu_block_ids = 
self.allocate_cpu_blocks(total_gpu_free_count) for input_hash_value in hash_value_gpu_block_ids_map.keys(): - need_transfer_task_gpu_block_ids.extend( - reversed(hash_value_gpu_block_ids_map[input_hash_value])) + need_transfer_task_gpu_block_ids.extend(reversed(hash_value_gpu_block_ids_map[input_hash_value])) all_allocated_cpu_block_ids = [] for _ in reversed(hash_value_gpu_block_ids_map[input_hash_value]): cpu_block_id_t = cpu_block_ids.pop(0) all_allocated_cpu_block_ids.append(cpu_block_id_t) need_transfer_task_cpu_block_ids.append(cpu_block_id_t) - swap_node_ids.extend( - reversed(hash_value_swap_node_ids_map[input_hash_value])) + swap_node_ids.extend(reversed(hash_value_swap_node_ids_map[input_hash_value])) logger.info( - "free_block_ids_async: issue transfer task: " + - f"transfer_task_id {transfer_task_id}: " + - f"swap_node_ids {swap_node_ids} need_transfer_task_gpu_block_ids " - + - f"{need_transfer_task_gpu_block_ids}, need_transfer_task_cpu_block_ids " - + f"{need_transfer_task_cpu_block_ids}, CacheStatus.SWAP2CPU") + "free_block_ids_async: issue transfer task: " + + f"transfer_task_id {transfer_task_id}: " + + f"swap_node_ids {swap_node_ids} need_transfer_task_gpu_block_ids " + + f"{need_transfer_task_gpu_block_ids}, need_transfer_task_cpu_block_ids " + + f"{need_transfer_task_cpu_block_ids}, CacheStatus.SWAP2CPU" + ) self.issue_swap_task( transfer_task_id, swap_node_ids, @@ -619,9 +639,8 @@ class PrefixCacheManager: ) logger.info( - "free_block_ids_async: after free, " + - f"len(self.gpu_free_block_list) {len(self.gpu_free_block_list)}") - return + "free_block_ids_async: after free, " + f"len(self.gpu_free_block_list) {len(self.gpu_free_block_list)}" + ) def free_block_ids_async(self, need_block_num): """ @@ -654,8 +673,10 @@ class PrefixCacheManager: break node = heapq.heappop(self.gpu_lru_leaf_heap) self.gpu_lru_leaf_set.remove(node) - if not self.cache_config.enable_hierarchical_cache or \ - self.cache_config.num_cpu_blocks < need_block_num: + if ( + not self.cache_config.enable_hierarchical_cache + or self.cache_config.num_cpu_blocks < need_block_num + ): if node.shared_count == 0 and node.is_gpu_leaf_node: # 直接回收 self._handle_free_gpu_node_without_cpu(node) total_gpu_free_count += 1 @@ -666,12 +687,13 @@ class PrefixCacheManager: if not node.children: if node in self.gpu_lru_leaf_set: continue - if (node != self.radix_tree_root - and node.shared_count == 0 - and node.is_gpu_leaf_node - and node.is_persistent is False): - heapq.heappush(self.gpu_lru_leaf_heap, - node) + if ( + node != self.radix_tree_root + and node.shared_count == 0 + and node.is_gpu_leaf_node + and node.is_persistent is False + ): + heapq.heappush(self.gpu_lru_leaf_heap, node) self.gpu_lru_leaf_set.add(node) else: continue @@ -680,18 +702,25 @@ class PrefixCacheManager: node.cache_status = CacheStatus.SWAP2CPU else: continue - self._handle_free_gpu_node_with_cpu(node, hash_value_input_ids_map, \ - hash_value_depth_map, need_recycle_gpu_block_ids, \ - hash_value_gpu_block_ids_map, hash_value_swap_node_ids_map) + self._handle_free_gpu_node_with_cpu( + node, + hash_value_input_ids_map, + hash_value_depth_map, + need_recycle_gpu_block_ids, + hash_value_gpu_block_ids_map, + hash_value_swap_node_ids_map, + ) total_gpu_free_count += 1 node = node.parent if node in self.gpu_lru_leaf_set: continue - if (node != self.radix_tree_root - and node.shared_count == 0 - and node.is_gpu_leaf_node - and node.is_persistent is False): + if ( + node != self.radix_tree_root + and node.shared_count == 0 + and 
node.is_gpu_leaf_node + and node.is_persistent is False + ): heapq.heappush(self.gpu_lru_leaf_heap, node) self.gpu_lru_leaf_set.add(node) @@ -702,12 +731,16 @@ class PrefixCacheManager: cpu_free_count = total_gpu_free_count if cpu_free_count < need_block_num: cpu_free_count = need_block_num - cpu_free_future = self.free_cpu_executor_pool.submit( - self.free_cpu_block_ids, cpu_free_count) + cpu_free_future = self.free_cpu_executor_pool.submit(self.free_cpu_block_ids, cpu_free_count) self.gpu_free_task_future = self.free_gpu_executor_pool.submit( - self._evict_cache_async, cpu_free_future, total_gpu_free_count, \ - hash_value_gpu_block_ids_map, hash_value_block_ids_map, \ - hash_value_swap_node_ids_map, hash_value_input_ids_map, hash_value_depth_map + self._evict_cache_async, + cpu_free_future, + total_gpu_free_count, + hash_value_gpu_block_ids_map, + hash_value_block_ids_map, + hash_value_swap_node_ids_map, + hash_value_input_ids_map, + hash_value_depth_map, ) else: self.gpu_free_task_future = None @@ -717,17 +750,14 @@ class PrefixCacheManager: def free_cpu_block_ids(self, need_block_num): """ - Evict CPU blocks (at least need_block_num blocks) - Parameters: - - need_block_num: Number of CPU blocks required to evict + Evict CPU blocks (at least need_block_num blocks) + Parameters: + - need_block_num: Number of CPU blocks required to evict - Returns: - - freed_block_num: Number of CPU blocks successfully evicted + Returns: + - freed_block_num: Number of CPU blocks successfully evicted """ - hash_value_input_ids_map = {} hash_value_block_ids_map = defaultdict(list) - hash_value_depth_map = {} - need_recycle_cpu_block_ids = [] total_cpu_free_count = 0 with self.request_release_lock: while True: @@ -739,13 +769,10 @@ class PrefixCacheManager: node = heapq.heappop(self.cpu_lru_leaf_heap) self.cpu_lru_leaf_set.remove(node) tmp_block_ids = [] - if (node.shared_count == 0 - and node.cache_status == CacheStatus.CPU - and node.is_cpu_leaf_node): + if node.shared_count == 0 and node.cache_status == CacheStatus.CPU and node.is_cpu_leaf_node: self.recycle_cpu_blocks(node.block_id) - hash_value_block_ids_map[node.input_hash_value].extend( - reversed(tmp_block_ids)) + hash_value_block_ids_map[node.input_hash_value].extend(reversed(tmp_block_ids)) logger.info(f"free_cpu_block_ids: free node {node}") self.node_id_pool.append(node.node_id) @@ -759,15 +786,17 @@ class PrefixCacheManager: if not node.children: if node in self.cpu_lru_leaf_set: continue - if (node != self.radix_tree_root - and node.shared_count == 0 - and node.is_cpu_leaf_node - and node.cache_status == CacheStatus.CPU): + if ( + node != self.radix_tree_root + and node.shared_count == 0 + and node.is_cpu_leaf_node + and node.cache_status == CacheStatus.CPU + ): heapq.heappush(self.cpu_lru_leaf_heap, node) self.cpu_lru_leaf_set.add(node) logger.info( - "free_cpu_block_ids: after free, " + - f"len(self.cpu_free_block_list) {len(self.cpu_free_block_list)}") + "free_cpu_block_ids: after free, " + f"len(self.cpu_free_block_list) {len(self.cpu_free_block_list)}" + ) return total_cpu_free_count def cal_block_hash(self, block): @@ -778,18 +807,18 @@ class PrefixCacheManager: def match_block(self, req_id, input_ids, block_size): """ - Args: - req_id: Task request ID - input_ids: Input token IDs - block_size: Size of each block + Args: + req_id: Task request ID + input_ids: Input token IDs + block_size: Size of each block - Returns: - match_gpu_block_ids: List of matched GPU block IDs - match_cpu_block_ids: List of matched CPU block IDs - swap_node_ids: 
List of node IDs requiring swap operations - match_block_node: Last matched node in the path - gpu_match_token_num: Number of tokens matched in GPU blocks - cpu_match_token_num: Number of tokens matched in CPU blocks + Returns: + match_gpu_block_ids: List of matched GPU block IDs + match_cpu_block_ids: List of matched CPU block IDs + swap_node_ids: List of node IDs requiring swap operations + match_block_node: Last matched node in the path + gpu_match_token_num: Number of tokens matched in GPU blocks + cpu_match_token_num: Number of tokens matched in CPU blocks """ total_token_num = len(input_ids) @@ -807,8 +836,7 @@ class PrefixCacheManager: with self.cache_status_lock: while match_token_num < total_token_num: - token_block = input_ids[match_token_num:match_token_num + - block_size] + token_block = input_ids[match_token_num : match_token_num + block_size] token_num = len(token_block) if token_num != block_size: break @@ -817,11 +845,11 @@ class PrefixCacheManager: child = current_match_node.children[hash_value] matche_nodes.append(child) match_node_ids.append(child.node_id) - if (child in self.gpu_lru_leaf_set): + if child in self.gpu_lru_leaf_set: self.gpu_lru_leaf_set.remove(child) self.gpu_lru_leaf_heap.remove(child) has_modified_gpu_lru_leaf_heap = True - elif (child in self.cpu_lru_leaf_set): + elif child in self.cpu_lru_leaf_set: self.cpu_lru_leaf_set.remove(child) self.cpu_lru_leaf_heap.remove(child) has_modified_cpu_lru_leaf_heap = True @@ -831,8 +859,9 @@ class PrefixCacheManager: else: if child.cache_status == CacheStatus.SWAP2CPU: logger.info( - f"match_block: req_id {req_id} matched node" + - f" {child.node_id} which is being SWAP2CPU") + f"match_block: req_id {req_id} matched node" + + f" {child.node_id} which is being SWAP2CPU" + ) child.cache_status = CacheStatus.GPU match_gpu_block_ids.append(child.block_id) gpu_match_token_num += block_size @@ -851,8 +880,7 @@ class PrefixCacheManager: if has_modified_cpu_lru_leaf_heap: heapq.heapify(self.cpu_lru_leaf_heap) - logger.info( - f"match_block: req_id {req_id} matched nodes: {match_node_ids}") + logger.info(f"match_block: req_id {req_id} matched nodes: {match_node_ids}") return ( match_gpu_block_ids, match_cpu_block_ids, @@ -873,9 +901,17 @@ class PrefixCacheManager: node.req_id_set.add(req_id) node = node.parent - def build_path(self, req_id, current_time, input_ids, left_input_ids, - gpu_block_ids, block_size, last_node, - reverved_dec_block_num): + def build_path( + self, + req_id, + current_time, + input_ids, + left_input_ids, + gpu_block_ids, + block_size, + last_node, + reverved_dec_block_num, + ): """ Build path for blocks beyond the common prefix Parameters: @@ -906,7 +942,7 @@ class PrefixCacheManager: has_unfilled_block = False for i in range(0, token_num, block_size): - current_block = left_input_ids[i:i + block_size] + current_block = left_input_ids[i : i + block_size] current_block_size = len(current_block) # 最后一个block可能没填满 if current_block_size != block_size: has_unfilled_block = True @@ -915,17 +951,19 @@ class PrefixCacheManager: allocated_block_id = gpu_block_ids.pop(0) node_id = self.node_id_pool.pop() unique_node_ids.append(node_id) - new_last_node = BlockNode(node_id, - input_ids, - input_hash_value, - node.depth + 1, - allocated_block_id, - current_block_size, - hash_value, - current_time, - parent=node, - shared_count=1, - reverved_dec_block_ids=[]) + new_last_node = BlockNode( + node_id, + input_ids, + input_hash_value, + node.depth + 1, + allocated_block_id, + current_block_size, + hash_value, + 
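To make the matching walk above easier to follow, here is a small standalone sketch of the same idea: the prompt is cut into fixed-size blocks, each full block is hashed, and the hash selects a child node in the tree. The hash function and node layout below are simplified stand-ins, not the classes from this file:

```python
from dataclasses import dataclass, field


def block_hash(block):
    # Stand-in for cal_block_hash; any stable hash of the token block works here.
    return hash(tuple(block))


@dataclass
class Node:
    block_id: int = -1
    children: dict = field(default_factory=dict)


def match_prefix(root, input_ids, block_size):
    """Walk the tree block by block and return the matched block ids."""
    matched, node, pos = [], root, 0
    while pos + block_size <= len(input_ids):          # only full blocks can match
        key = block_hash(input_ids[pos : pos + block_size])
        child = node.children.get(key)
        if child is None:
            break
        matched.append(child.block_id)
        node, pos = child, pos + block_size
    return matched, node


# Tiny demo: a tree that caches the first two 64-token blocks of [0..127].
root = Node()
a, b = Node(block_id=10), Node(block_id=11)
root.children[block_hash(list(range(0, 64)))] = a
a.children[block_hash(list(range(64, 128)))] = b
print(match_prefix(root, list(range(0, 130)), 64))     # ([10, 11], <node b>)
```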
current_time, + parent=node, + shared_count=1, + reverved_dec_block_ids=[], + ) new_last_node.req_id_set.add(req_id) self.node_map[node_id] = new_last_node node.children[hash_value] = new_last_node @@ -939,46 +977,44 @@ class PrefixCacheManager: self.unfilled_req_block_map[req_id] = reverved_dec_block_ids else: new_last_node.reverved_dec_block_ids.extend(reverved_dec_block_ids) - logger.info( - f"build_path: allocate unique node ids {unique_node_ids} for req_id {req_id}" - ) + logger.info(f"build_path: allocate unique node ids {unique_node_ids} for req_id {req_id}") return new_last_node - def _handle_swap_result(self, swap_node_id, task_gpu_block_id, - task_cpu_block_id, event_type): + def _handle_swap_result(self, swap_node_id, task_gpu_block_id, task_cpu_block_id, event_type): """ handle swap resuha """ if swap_node_id is None: return with self.cache_status_lock: - if (event_type.value == CacheStatus.SWAP2CPU.value): + if event_type.value == CacheStatus.SWAP2CPU.value: gpu_block_id = task_gpu_block_id cpu_block_id = task_cpu_block_id node = self.node_map[swap_node_id] if node.cache_status.value == CacheStatus.GPU.value: logger.info( - f"recv_data_transfer_result: node {node.node_id} " + - f"has been reused when SWAP2CPU, recycle cpu block id {cpu_block_id}" + f"recv_data_transfer_result: node {node.node_id} " + + f"has been reused when SWAP2CPU, recycle cpu block id {cpu_block_id}" ) self.recycle_cpu_blocks(cpu_block_id) else: node.cache_status = CacheStatus.CPU node.block_id = cpu_block_id - if (node != self.radix_tree_root and node.shared_count == 0 - and node.is_cpu_leaf_node - and node.cache_status == CacheStatus.CPU): + if ( + node != self.radix_tree_root + and node.shared_count == 0 + and node.is_cpu_leaf_node + and node.cache_status == CacheStatus.CPU + ): if node not in self.cpu_lru_leaf_set: heapq.heappush(self.cpu_lru_leaf_heap, node) self.cpu_lru_leaf_set.add(node) self.recycle_gpu_blocks(gpu_block_id) - logger.info( - f"recv_data_transfer_result: after SWAP2CPU, node {node}" - ) + logger.info(f"recv_data_transfer_result: after SWAP2CPU, node {node}") - elif (event_type.value == CacheStatus.SWAP2GPU.value): + elif event_type.value == CacheStatus.SWAP2GPU.value: gpu_block_id = task_gpu_block_id cpu_block_id = task_cpu_block_id @@ -987,12 +1023,12 @@ class PrefixCacheManager: node.block_id = gpu_block_id self.recycle_cpu_blocks(cpu_block_id) - logger.info( - f"recv_data_transfer_result: after SWAP2GPU, node {node}") + logger.info(f"recv_data_transfer_result: after SWAP2GPU, node {node}") else: logger.warning( f"recv_data_transfer_result: Get unexpected event type {event_type}" - + ", only SWAP2CPU and SWAP2GPU supported") + + ", only SWAP2CPU and SWAP2GPU supported" + ) def recv_data_transfer_result(self): """ @@ -1024,10 +1060,8 @@ class PrefixCacheManager: self.task_swapping_event[transfer_task_id].set() logger.info( f"recv_data_transfer_result: transfer_task_id {transfer_task_id}: " - + - f"task_node_ids {swap_node_ids} task_gpu_block_id {task_gpu_block_id} " - + - f"task_cpu_block_id {task_cpu_block_id} event_type {event_type} done" + + f"task_node_ids {swap_node_ids} task_gpu_block_id {task_gpu_block_id} " + + f"task_cpu_block_id {task_cpu_block_id} event_type {event_type} done" ) except Exception as e: logger.warning(f"recv_data_transfer_result: error: {e}") diff --git a/fastdeploy/cache_manager/transfer_factory/__init__.py b/fastdeploy/cache_manager/transfer_factory/__init__.py index c5270bbdd..31298a918 100644 --- a/fastdeploy/cache_manager/transfer_factory/__init__.py 
+++ b/fastdeploy/cache_manager/transfer_factory/__init__.py @@ -13,5 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + from .ipc_cache_transfer import IPCCommManager from .rdma_cache_transfer import RDMACommManager + +__all__ = ["IPCCommManager", "RDMACommManager"] diff --git a/fastdeploy/cache_manager/transfer_factory/ipc_cache_transfer.py b/fastdeploy/cache_manager/transfer_factory/ipc_cache_transfer.py index 2f7bcffb5..61a4fa10b 100644 --- a/fastdeploy/cache_manager/transfer_factory/ipc_cache_transfer.py +++ b/fastdeploy/cache_manager/transfer_factory/ipc_cache_transfer.py @@ -14,13 +14,13 @@ # limitations under the License. """ -import os - import paddle from fastdeploy.model_executor.ops.gpu import ( - get_data_ptr_ipc, ipc_sent_key_value_cache_by_remote_ptr, - ipc_sent_key_value_cache_by_remote_ptr_block_sync) + get_data_ptr_ipc, + ipc_sent_key_value_cache_by_remote_ptr, + ipc_sent_key_value_cache_by_remote_ptr_block_sync, +) from fastdeploy.utils import get_logger logger = get_logger("cache_messager", "cache_messager.log") @@ -44,17 +44,13 @@ class IPCConnector: self.rank_id = rank_id_ self.local_gpu_id = int(local_gpu_id_) tmp = paddle.ones([1, 1]) - logger.info( - f"init ipc rank{self.rank_id} with remote {self.remote_gpu_id} {self.local_gpu_id}" - ) + logger.info(f"init ipc rank{self.rank_id} with remote {self.remote_gpu_id} {self.local_gpu_id}") for layer_id in range(layer_num): key_unique_name = f"key_caches_{layer_id}_rank{self.rank_id}.device{self.remote_gpu_id}" value_unique_name = f"value_caches_{layer_id}_rank{self.rank_id}.device{self.remote_gpu_id}" - self.remote_key_tensor_ptr_list.append( - get_data_ptr_ipc(tmp, key_unique_name)) - self.remote_value_tensor_ptr_list.append( - get_data_ptr_ipc(tmp, value_unique_name)) - self.write_stream = paddle.device.Stream(f'gpu:{self.local_gpu_id}') + self.remote_key_tensor_ptr_list.append(get_data_ptr_ipc(tmp, key_unique_name)) + self.remote_value_tensor_ptr_list.append(get_data_ptr_ipc(tmp, value_unique_name)) + self.write_stream = paddle.device.Stream(f"gpu:{self.local_gpu_id}") self.finish_event = paddle.device.Event() @@ -64,11 +60,11 @@ class IPCCommManager: """ def __init__( - self, - rank_id_, - gpu_idx_, - local_key_cache_tensor_list, # tensor list - local_value_cache_tensor_list, # tensor + self, + rank_id_, + gpu_idx_, + local_key_cache_tensor_list, # tensor list + local_value_cache_tensor_list, # tensor ): self.rank_id = rank_id_ self.gpu_idx = gpu_idx_ @@ -83,14 +79,11 @@ class IPCCommManager: """ Connect to remote gpu. """ - logger.info( - f"{self.rank_id}: connect to remote_gpu_id:{remote_gpu_id_} {self.layer_num} {self.gpu_idx}" - ) + logger.info(f"{self.rank_id}: connect to remote_gpu_id:{remote_gpu_id_} {self.layer_num} {self.gpu_idx}") if self.is_connected(remote_gpu_id_): return True else: - self.comm_map[remote_gpu_id_] = IPCConnector( - self.rank_id, remote_gpu_id_, self.layer_num, self.gpu_idx) + self.comm_map[remote_gpu_id_] = IPCConnector(self.rank_id, remote_gpu_id_, self.layer_num, self.gpu_idx) return True def is_connected(self, remote_gpu_id_=0): @@ -102,8 +95,7 @@ class IPCCommManager: else: return False - def write_cache(self, ip, remote_gpu_id, local_block_ids, remote_block_ids, - layer_idx): + def write_cache(self, ip, remote_gpu_id, local_block_ids, remote_block_ids, layer_idx): """ Connect to remote gpu and write cache. 
""" @@ -114,20 +106,26 @@ class IPCCommManager: with paddle.device.stream_guard(comm.write_stream): ipc_sent_key_value_cache_by_remote_ptr( self.local_key_cache_tensor_list[layer_idx], - self.local_value_cache_tensor_list[layer_idx], local_block_ids, - remote_block_ids, comm.remote_key_tensor_ptr_list[layer_idx], - comm.remote_value_tensor_ptr_list[layer_idx], block_num, - self.gpu_idx, comm.remote_gpu_id, - comm.write_stream.stream_base.cuda_stream) + self.local_value_cache_tensor_list[layer_idx], + local_block_ids, + remote_block_ids, + comm.remote_key_tensor_ptr_list[layer_idx], + comm.remote_value_tensor_ptr_list[layer_idx], + block_num, + self.gpu_idx, + comm.remote_gpu_id, + comm.write_stream.stream_base.cuda_stream, + ) return 0 def write_block_by_sync(self, remote_gpu_id): """ check finish event and wait for it """ - paddle.set_device(f'gpu:{self.gpu_idx}') + paddle.set_device(f"gpu:{self.gpu_idx}") comm = self.comm_map[remote_gpu_id] ipc_sent_key_value_cache_by_remote_ptr_block_sync( - self.local_key_cache_tensor_list[0], #tensor no use - self.local_value_cache_tensor_list[0], #tensor no use - comm.write_stream.stream_base.cuda_stream) + self.local_key_cache_tensor_list[0], # tensor no use + self.local_value_cache_tensor_list[0], # tensor no use + comm.write_stream.stream_base.cuda_stream, + ) diff --git a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/CMakeLists.txt b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/CMakeLists.txt index c241538c8..7bed564e9 100644 --- a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/CMakeLists.txt +++ b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/CMakeLists.txt @@ -25,7 +25,7 @@ find_package(pybind11 CONFIG REQUIRED) include_directories("${PROJECT_SOURCE_DIR}/include") add_library(rdma_comm MODULE ${PROJECT_SOURCE_DIR}/src/pybind.cpp ${PROJECT_SOURCE_DIR}/src/kvcache_rdma.cpp ${PROJECT_SOURCE_DIR}/src/kvcache_connection.cpp ${PROJECT_SOURCE_DIR}/src/log.cpp) -set_target_properties(rdma_comm PROPERTIES +set_target_properties(rdma_comm PROPERTIES OUTPUT_NAME "rdma_comm" PREFIX "" SUFFIX ".so" diff --git a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/README.md b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/README.md index b16ab460a..700a045fe 100644 --- a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/README.md +++ b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/README.md @@ -11,7 +11,7 @@ A dedicated component for transferring KV Cache between Prefill and Decode nodes - Single Mellanox ConnectX-7 400G NIC (single port) - Tested with BATCH_SIZE = 1538 and block size = 1K - 256K - Single pressure thread (threads = 1) - + - **Comparison Baseline**: - Mooncake performance measured using transfer_engine_bench from example directory - Same hardware configuration and test parameters applied to KVTransferManager @@ -42,11 +42,13 @@ Bandwidth Saturation Capability: Under multi-threaded high-pressure scenarios, b ### Dependencies Installation #### Python Packages + ```bash pip install pyzmq pybind11[global] ``` #### System Libraries (Linux) + ```bash # Ubuntu/Debian sudo apt-get install -y libibverbs-dev librdmacm-dev @@ -62,10 +64,10 @@ sudo yum install -y libibverbs-devel librdmacm-devel #### Ampere Architecture Note To support Ampere GPUs, enable the environment variable KVCACHE_GDRCOPY_FLUSH_ENABLE. - What it does: - Forces memory flushing after a GDRCopy write operation to ensure data consistency on the Ampere architecture. 
Here if KVCACHE_GDRCOPY_FLUSH_ENABLE is enable we trigger an RDMA read operation after the last RDMA write. + Forces memory flushing after a GDRCopy write operation to ensure data consistency on the Ampere architecture. Here if KVCACHE_GDRCOPY_FLUSH_ENABLE is enable we trigger an RDMA read operation after the last RDMA write. - Why it’s needed: When the NIC delivers a completion to the CPU, it indicates that the data has reach the GPU. However, it doesn't mean that the GPU can read that data yet. To make sure the data has gone all the way down to the GPU memory and the GPU can read it, we need to perform a read. -[NCCL Issue #683](https://github.com/NVIDIA/nccl/issues/683) | +[NCCL Issue #683](https://github.com/NVIDIA/nccl/issues/683) | [NCCL Issue #1702](https://github.com/NVIDIA/nccl/issues/1702) Since the upper layer typically issues a cache arrival notification only after polling a Completion Queue Entry (CQE), this prevents the application from being notified before the data is actually written back to memory. Therefore, the potential race condition where the cache has not yet been flushed but the application assumes completion is considered a rare event in practice. - How to enable: @@ -75,14 +77,14 @@ To support Ampere GPUs, enable the environment variable KVCACHE_GDRCOPY_FLUSH_EN ```bash # Build and make symbolic links for SO files -python setup.py bdist_wheel +python setup.py bdist_wheel pip install dist/*.whl ``` ## Environment Variables Configuration -### RDMA Settings +### RDMA Settings | Variable | Default | Description | |----------|---------|-------------| | `KVCACHE_RDMA_GID_INDEX` | 3 | RDMA GID index | @@ -90,25 +92,23 @@ pip install dist/*.whl | `KVCACHE_IB_TIMEOUT` | 18 | InfiniBand communication timeout (14-31), where timeout = 4.096μs * 2^value (default 18 ≈ 1.07s).| | `KVCACHE_RELAX_ORDERING` | false | Enable RDMA relaxed ordering to improve performance in multi-GPU scenarios. Recommended when multiple GPUs share the same NIC to mitigate TX pause issues. | -### Network Settings +### Network Settings | Variable | Default | Description | |----------|---------|-------------| | `KVCACHE_SOCKET_IFNAME` | auto | Network interface for socket comm (e.g. "eth0") | -### Debugging +### Debugging | Variable | Default | Description | |----------|---------|-------------| | `KVCACHE_DEBUG` | false | Enable debug logging | | `KVCACHE_DEBUG_FILE` | - | Debug log file path | | `KVCACHE_ERROR_FILE` | - | Error log file path | -### Performance Tuning +### Performance Tuning | Variable | Default | Description | |----------|---------|-------------| | `KVCACHE_GDRCOPY_FLUSH_ENABLE` | false | Enable GDRCopy flush for Ampere GPUs | - - # Set RDMA GID index export KVCACHE_RDMA_GID_INDEX=3 @@ -125,7 +125,6 @@ export KVCACHE_DEBUG=1 export KVCACHE_DEBUG_FILE=/var/log/kvcache_debug.log export KVCACHE_ERROR_FILE=/var/log/kvcache_error.log - ## Network configurations kvcache transfer is fully tested with RDMA over Converged Ethernet (RoCE) networks. However, it is theoretically compatible with Infiniband as well. @@ -164,14 +163,14 @@ comm.write_cache( **Parameter Details**: -1. `role`: +1. `role`: - "prefill": Prefill node role - "decode": Decode node role -2. `gpu_idx`: +2. `gpu_idx`: - GPU device index to use -3. `port`: +3. `port`: - RDMA communication port number 4. 
`local_key_cache`/`local_value_cache`: @@ -216,7 +215,7 @@ comm = RDMACommunicator( if comm.connect("192.168.1.100", "12345"): print("Connection established") - + # Write cache comm.write_cache( ip="192.168.1.100", # Target server IP @@ -229,4 +228,4 @@ if comm.connect("192.168.1.100", "12345"): ## Citation -If you use this codebase, or otherwise found our work valuable, please cite: \ No newline at end of file +If you use this codebase, or otherwise found our work valuable, please cite: diff --git a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/README_CN.md b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/README_CN.md index bed94d860..b2a2be91a 100644 --- a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/README_CN.md +++ b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/README_CN.md @@ -11,7 +11,7 @@ - 单张Mellanox ConnectX-7 400G网卡(单端口) - 测试参数: BATCH_SIZE = 1538, 块大小 = 1K - 256K - 单压力线程(threads = 1) - + - **对比基准**: - Mooncake性能使用example目录中的transfer_engine_bench测量 - KVTransferManager使用相同的硬件配置和测试参数 @@ -43,11 +43,13 @@ ### 依赖安装 #### Python包 + ```bash pip install pyzmq pybind11[global] ``` #### 系统库(Linux) + ```bash # Ubuntu/Debian sudo apt-get install -y libibverbs-dev librdmacm-dev @@ -66,7 +68,7 @@ sudo yum install -y libibverbs-devel librdmacm-devel 在GDRCopy写操作后强制内存刷新,确保Ampere架构上的数据一致性。启用后会在最后一个RDMA写操作后触发一个RDMA读操作。 - 原因: 当网卡向CPU发送完成通知时,仅表示数据已到达GPU,但不保证GPU可以立即读取该数据。为确保数据已完全写入GPU内存且可被GPU读取,需要执行读操作。 -[NCCL Issue #683](https://github.com/NVIDIA/nccl/issues/683) | +[NCCL Issue #683](https://github.com/NVIDIA/nccl/issues/683) | [NCCL Issue #1702](https://github.com/NVIDIA/nccl/issues/1702) 由于上层通常只在轮询完成队列条目(CQE)后发出缓存到达通知,这避免了应用在数据实际写回内存前收到通知的情况。因此,缓存未刷新但应用认为已完成这种潜在问题在实践中被认为是罕见情况。 - 启用方式: @@ -76,7 +78,7 @@ sudo yum install -y libibverbs-devel librdmacm-devel ```bash # 构建并创建SO文件的符号链接 -python setup.py bdist_wheel +python setup.py bdist_wheel pip install dist/*.whl ``` @@ -108,7 +110,6 @@ pip install dist/*.whl |------|--------|------| | `KVCACHE_GDRCOPY_FLUSH_ENABLE` | false | 为Ampere GPU启用GDRCopy刷新 | - # 设置RDMA GID索引 export KVCACHE_RDMA_GID_INDEX=3 @@ -125,7 +126,6 @@ export KVCACHE_DEBUG=1 export KVCACHE_DEBUG_FILE=/var/log/kvcache_debug.log export KVCACHE_ERROR_FILE=/var/log/kvcache_error.log - ## 网络配置 kvcache transfer已通过RDMA over Converged Ethernet (RoCE)网络全面测试。理论上也兼容Infiniband。 @@ -145,7 +145,7 @@ comm = RDMACommunicator( gpu_idx, # GPU设备索引(0~7) port, # 通信端口 local_key_cache, # 本地key缓存指针列表 - local_value_cache, # 本地value缓存指针列表 + local_value_cache, # 本地value缓存指针列表 block_number, # 块数量 block_bytes # 每块字节数 ) @@ -159,19 +159,19 @@ comm.write_cache( local_block_ids, # 本地缓存块ID列表,指定要传输的本地块 remote_block_ids, # 远程缓存块ID列表,指定要写入的远程块 layer_idx # 模型层索引,用于多层模型场景 -) +) ``` **参数说明**: -1. `role`: +1. `role`: - "prefill" - "decode" -2. `gpu_idx`: +2. `gpu_idx`: - 使用的GPU设备索引 -3. `port`: +3. `port`: - RDMA通信端口号 4. 
`local_key_cache`/`local_value_cache`: @@ -216,7 +216,7 @@ comm = RDMACommunicator( if comm.connect("192.168.1.100", "12345"): print("连接成功") - + # 写入缓存 comm.write_cache( ip="192.168.1.100", # 目标服务器IP @@ -229,4 +229,4 @@ if comm.connect("192.168.1.100", "12345"): ## 引用 -如果您使用此代码库,或认为我们的工作有价值,请引用: \ No newline at end of file +如果您使用此代码库,或认为我们的工作有价值,请引用: diff --git a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/kvcache_connection.h b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/kvcache_connection.h index 28877ea65..596e3b2e6 100644 --- a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/kvcache_connection.h +++ b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/kvcache_connection.h @@ -3,13 +3,13 @@ * @brief RDMA connection management for key-value cache * @version 1.0.0 * @copyright Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. - * + * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -32,22 +32,22 @@ #include #include #include -#include -#include -#include -#include -#include -#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include "kvcache_rdma.h" #include "util.h" @@ -88,8 +88,8 @@ struct QpInfo { intBuffer[0] = htonl(lid); intBuffer[1] = htonl(qpn); intBuffer[2] = htonl(psn); - memcpy(buffer + 12, gid.raw, sizeof(gid.raw)); - intBuffer[7] = htonl(static_cast(mtu)); + memcpy(buffer + 12, gid.raw, sizeof(gid.raw)); + intBuffer[7] = htonl(static_cast(mtu)); } /// @brief Deserialize QP info from buffer @@ -102,7 +102,7 @@ struct QpInfo { mtu = static_cast(ntohl(intBuffer[7])); } - static const size_t size = 12 + sizeof(gid.raw) + 4; + static const size_t size = 12 + sizeof(gid.raw) + 4; }; /// @brief RDMA connection context @@ -137,13 +137,13 @@ struct Connection { std::vector send_write_cache_key_remote_ptr_list; std::vector send_write_cache_key_remote_rkey_list; - std::vector send_write_cache_value_remote_ptr_list; + std::vector send_write_cache_value_remote_ptr_list; std::vector send_write_cache_value_remote_rkey_list; // For rdma read operations std::vector read_bufs; std::vector read_mrs; - + // Work completion tracking int wc_count; int wc_target_count; @@ -208,4 +208,4 @@ int setup_listening_socket(int port); int configure_epoll(int sockfd); std::vector get_net_ifname(); -#endif // FASTDEPLOY_KVCACHE_CONNECTION_H \ No newline at end of file +#endif // FASTDEPLOY_KVCACHE_CONNECTION_H diff --git a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/kvcache_rdma.h b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/kvcache_rdma.h index 73df757fd..de759e909 100644 --- a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/kvcache_rdma.h +++ b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/kvcache_rdma.h @@ -61,30 +61,30 @@ private: uint32_t rkey, const std::string &ip, const std::string &port); - bool execute_rdma_writes(struct 
RdmaContext* ctx, int layer_idx, - const std::vector& local_block_ids, - bool is_key, std::vector& remote_addr, + bool execute_rdma_writes(struct RdmaContext* ctx, int layer_idx, + const std::vector& local_block_ids, + bool is_key, std::vector& remote_addr, uint32_t rkey); - - void prepare_write_requests(struct ibv_sge* sge_list, + + void prepare_write_requests(struct ibv_sge* sge_list, struct ibv_send_wr* send_wr_list, - int layer_idx, + int layer_idx, const std::vector& local_block_ids, - bool is_key, - std::vector& remote_addr, + bool is_key, + std::vector& remote_addr, uint32_t rkey); - - bool execute_read_verification(struct RdmaContext* ctx, - size_t block_idx, - uint64_t remote_addr, + + bool execute_read_verification(struct RdmaContext* ctx, + size_t block_idx, + uint64_t remote_addr, uint32_t rkey, int layer_idx, - const std::string& ip, + const std::string& ip, const std::string& port); - - bool post_send_with_retry(struct RdmaContext* ctx, - struct ibv_send_wr* wr_list, - size_t inflight_wr, + + bool post_send_with_retry(struct RdmaContext* ctx, + struct ibv_send_wr* wr_list, + size_t inflight_wr, bool need_poll); // Connection management @@ -119,7 +119,7 @@ private: std::map conn_map; // Active connections map std::mutex mutex_; // Thread synchronization mutex int rdma_event_channel_epoll_fd; // Epoll file descriptor - struct ibv_pd *g_pd = NULL; // fd + struct ibv_pd *g_pd = NULL; // fd int RDMACommunicator_status; // Communicator status flag bool start_client_listener = false; // Client listener flag }; diff --git a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/log.h b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/log.h index 923a0316d..d0bf18ae2 100644 --- a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/log.h +++ b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/log.h @@ -5,13 +5,13 @@ * @brief Logging module for key-value cache system * @version 1.0.0 * @copyright Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. - * + * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -43,7 +43,7 @@ typedef enum { KV_LOG_LEVEL_ERROR = 3 } KVLogLevel; -void debug_log(KVLogLevel level, bool enable_to_terminal, const char *filefunc, +void debug_log(KVLogLevel level, bool enable_to_terminal, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6))); /** @@ -107,11 +107,11 @@ void debug_log(KVLogLevel level, bool enable_to_terminal, const char *filefunc, LOGD(fmt, __VA_ARGS__); \ } while (0) -#define LOGD_RAW(fmt, arg...) do { \ +#define LOGD_RAW(fmt, arg...) 
do { \ if (ENV_ENABLE_RAW("KV_IS_DEBUG_ENABLED")) { \ GET_CURRENT_TIME(); \ fprintf(stdout, "[%s][DBG][KV_CACHE][%s:%d] " \ fmt "\n", str, \ FILE_NAME(__FILE__), __LINE__, ## arg); \ } \ - } while (0) \ No newline at end of file + } while (0) diff --git a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/util.h b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/util.h index d2149a6dc..c040b2a62 100644 --- a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/util.h +++ b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/include/util.h @@ -15,12 +15,12 @@ #include #include #include -#include +#include #include "log.h" #define PATH_MAX 4096 /* # chars in a path name including nul */ #define RDMA_WR_LIST_MAX_SIZE 32 -#define RDMA_SQ_MAX_SIZE 1024 +#define RDMA_SQ_MAX_SIZE 1024 #define RDMA_DEFAULT_PORT 20001 #define RDMA_TCP_CONNECT_SIZE 1024 @@ -54,19 +54,19 @@ enum class QpStatus { inline void busid_to_int64(const char *busId, int64_t *id) { char hexStr[17] = {0}; int hexOffset = 0; - + // Filter valid hex characters for (int i = 0; hexOffset < sizeof(hexStr) - 1 && busId[i] != '\0'; i++) { char c = busId[i]; if (c == '.' || c == ':') continue; - - if ((c >= '0' && c <= '9') || - (c >= 'A' && c <= 'F') || + + if ((c >= '0' && c <= '9') || + (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f')) { hexStr[hexOffset++] = c; } } - + *id = strtol(hexStr, NULL, 16); } @@ -78,45 +78,45 @@ public: bool is_up; bool is_running; bool is_loopback; - + bool isUsable() const { return is_up && is_running && !is_loopback; } }; - + static std::vector getAllInterfaces() { std::vector interfaces; struct ifaddrs *ifaddrs_ptr = nullptr; - + if (getifaddrs(&ifaddrs_ptr) == -1) { return interfaces; } - + for (struct ifaddrs *ifa = ifaddrs_ptr; ifa != nullptr; ifa = ifa->ifa_next) { if (ifa->ifa_addr == nullptr) continue; if (ifa->ifa_addr->sa_family != AF_INET) continue; - + InterfaceInfo info; info.name = ifa->ifa_name; info.is_up = (ifa->ifa_flags & IFF_UP) != 0; info.is_running = (ifa->ifa_flags & IFF_RUNNING) != 0; info.is_loopback = (ifa->ifa_flags & IFF_LOOPBACK) != 0; - + struct sockaddr_in* sa = (struct sockaddr_in*)ifa->ifa_addr; char ip_str[INET_ADDRSTRLEN]; inet_ntop(AF_INET, &sa->sin_addr, ip_str, INET_ADDRSTRLEN); info.ip = ip_str; - + interfaces.push_back(info); } - + freeifaddrs(ifaddrs_ptr); return interfaces; } - + static std::string getFirstUsableInterface() { auto interfaces = getAllInterfaces(); - + for (const auto& iface : interfaces) { if (iface.isUsable()) { return iface.name; @@ -124,14 +124,14 @@ public: } return ""; } - + static void displayAllInterfaces() { auto interfaces = getAllInterfaces(); - + printf("Available network interfaces:\n"); for (const auto& iface : interfaces) { - printf(" %s: %s [%s%s%s]\n", - iface.name.c_str(), + printf(" %s: %s [%s%s%s]\n", + iface.name.c_str(), iface.ip.c_str(), iface.is_up ? "UP" : "DOWN", iface.is_running ? 
",RUNNING" : "", @@ -157,13 +157,13 @@ private: bool relax_ordering_enabled_; int ib_timeout_; const char* rdma_nics_; - + // Private constructor for singleton pattern KVCacheConfig() { // Initialize configuration from environment variables rdma_gid_index_ = parse_int_value( std::getenv("KVCACHE_RDMA_GID_INDEX"), 3, "KVCACHE_RDMA_GID_INDEX"); - + // Parse optional RDMA port override const char* port_value = std::getenv("SET_RDMA_DEST_PORT"); has_rdma_dest_port_override_ = false; // 默认为false @@ -177,7 +177,7 @@ private: } const char* env_interface = std::getenv("KVCACHE_SOCKET_IFNAME"); - + if (env_interface && env_interface[0] != '\0') { socket_interface_ = env_interface; printf("Using specified interface: %s\n", socket_interface_); @@ -194,14 +194,14 @@ private: } NetworkInterfaceManager::displayAllInterfaces(); } - + socket_interface_ = std::getenv("KVCACHE_SOCKET_IFNAME"); debug_file_path_ = std::getenv("KVCACHE_DEBUG_FILE"); error_file_path_ = std::getenv("KVCACHE_ERROR_FILE"); - + gdrcopy_flush_enabled_ = parse_bool_value(std::getenv("KVCACHE_GDRCOPY_FLUSH_ENABLE")); verify_read_enabled_ = parse_bool_value(std::getenv("KVCACHE_VERIFY_READ")); - debug_mode_enabled_ = parse_bool_value(std::getenv("KVCACHE_DEBUG")) || + debug_mode_enabled_ = parse_bool_value(std::getenv("KVCACHE_DEBUG")) || parse_bool_value(std::getenv("KV_IS_DEBUG_ENABLED")); debug_output_enabled_ = parse_bool_value(std::getenv("KVCACHE_DEBUG_OUTPUT")); @@ -215,29 +215,29 @@ private: rdma_nics_ = std::getenv("KVCACHE_RDMA_NICS"); } - + // Helper methods bool parse_bool_value(const char* value) { if (!value) return false; - + std::string str_value(value); std::transform(str_value.begin(), str_value.end(), str_value.begin(), ::tolower); - - return (str_value == "1" || str_value == "true" || + + return (str_value == "1" || str_value == "true" || str_value == "on" || str_value == "yes"); } - + int parse_int_value(const char* value, int default_value, const char* env_name) { if (!value) return default_value; - + try { return std::stoi(std::string(value)); } catch (const std::invalid_argument& e) { - fprintf(stderr, "Invalid value for %s: '%s', using default: %d\n", + fprintf(stderr, "Invalid value for %s: '%s', using default: %d\n", env_name, value, default_value); return default_value; } catch (const std::out_of_range& e) { - fprintf(stderr, "%s value out of range: '%s', using default: %d\n", + fprintf(stderr, "%s value out of range: '%s', using default: %d\n", env_name, value, default_value); return default_value; } @@ -247,7 +247,7 @@ public: // Prevent copying and assignment KVCacheConfig(const KVCacheConfig&) = delete; KVCacheConfig& operator=(const KVCacheConfig&) = delete; - + // Get singleton instance static KVCacheConfig& getInstance() { static KVCacheConfig instance; @@ -255,14 +255,14 @@ public: } int get_ib_timeout() const { return ib_timeout_; } - + // Configuration retrieval methods int get_rdma_gid_index() const { return rdma_gid_index_; } - + int resolve_rdma_dest_port(int default_port) const { return has_rdma_dest_port_override_ ? 
rdma_dest_port_override_ : default_port; } - + int resolve_rdma_dest_port(const std::string& default_port) const { try { return resolve_rdma_dest_port(std::stoi(default_port)); @@ -271,45 +271,45 @@ public: return 0; } } - + const char* get_socket_interface() const { return socket_interface_; } const char* get_debug_file_path() const { return debug_file_path_; } const char* get_error_file_path() const { return error_file_path_; } const char* get_rdma_nics() const { return rdma_nics_; } - + // Feature check methods bool is_gdrcopy_flush_enabled() const { return gdrcopy_flush_enabled_; } bool is_verify_read_enabled() const { return verify_read_enabled_; } bool is_debug_mode_enabled() const { return debug_mode_enabled_; } bool is_debug_output_enabled() const { return debug_output_enabled_; } bool is_relax_ordering_enabled() const { return relax_ordering_enabled_; } - + // Display configuration void displayConfiguration() const { INFO("KVCache Configuration:\n"); INFO("Init KVCacheConfig RDMA GID Index: %d\n", rdma_gid_index_); - + if (has_rdma_dest_port_override_) { INFO("Init KVCacheConfig RDMA Destination Port Override: %d\n", rdma_dest_port_override_); } - + if (socket_interface_) { INFO("Init KVCacheConfig Socket Interface: %s\n", socket_interface_); } - + INFO("Init KVCacheConfig GDRCopy Flush: %s\n", gdrcopy_flush_enabled_ ? "enabled" : "disabled"); INFO("Init KVCacheConfig Verify Read: %s\n", verify_read_enabled_ ? "enabled" : "disabled"); INFO("Init KVCacheConfig Debug Mode: %s\n", debug_mode_enabled_ ? "enabled" : "disabled"); INFO("Init KVCacheConfig Debug Output: %s\n", debug_output_enabled_ ? "enabled" : "disabled"); - + if (debug_file_path_) { INFO("Init KVCacheConfig Debug File: %s\n", debug_file_path_); } - + if (error_file_path_) { INFO("Init KVCacheConfig Error File: %s\n", error_file_path_); } } }; -#endif \ No newline at end of file +#endif diff --git a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/src/kvcache_connection.cpp b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/src/kvcache_connection.cpp index 1551e7c78..d49a8271c 100644 --- a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/src/kvcache_connection.cpp +++ b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/src/kvcache_connection.cpp @@ -3,13 +3,13 @@ * @brief RDMA connection implementation for key-value cache * @version 1.0.0 * @copyright Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. - * + * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
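A Python rendering of the environment-variable parsing rules that `KVCacheConfig` above implements in C++ (`parse_bool_value` / `parse_int_value`); the helper names here are ad hoc, not part of the patch:

```python
import os


def parse_bool(name: str) -> bool:
    # Mirrors parse_bool_value: case-insensitive "1", "true", "on", "yes" mean enabled.
    value = os.getenv(name)
    return bool(value) and value.lower() in ("1", "true", "on", "yes")


def parse_int(name: str, default: int) -> int:
    # Mirrors parse_int_value: fall back to the default on missing or invalid values.
    value = os.getenv(name)
    if not value:
        return default
    try:
        return int(value)
    except ValueError:
        print(f"Invalid value for {name}: {value!r}, using default: {default}")
        return default


rdma_gid_index = parse_int("KVCACHE_RDMA_GID_INDEX", 3)     # default 3, as in the header
gdrcopy_flush = parse_bool("KVCACHE_GDRCOPY_FLUSH_ENABLE")  # off unless explicitly enabled
```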
@@ -32,7 +32,7 @@ std::vector g_ib_all_devs; static int64_t get_ib_busid(const char *dev_name) { char dev_path[PATH_MAX]; snprintf(dev_path, PATH_MAX, "/sys/class/infiniband/%s/device", dev_name); - + char *p = realpath(dev_path, NULL); if (p == NULL) { WARN("Failed to get realpath for device %s: %s", dev_name, strerror(errno)); @@ -63,7 +63,7 @@ static int64_t get_ib_busid(const char *dev_name) { /** * @brief Parse and cache IB device information * @return Number of IB devices found, negative on error - * + * * @note This function is thread-safe and will only parse once */ int parse_port_ib_info() { @@ -448,7 +448,7 @@ bool poll_cq_with_timeout(struct RdmaContext *ctx, int timeout_seconds, int cqe_ if ((current_time.tv_sec - start_time.tv_sec) >= timeout_seconds) { ERR("Timeout occurred after %d seconds", timeout_seconds); free(wc_array); - return false; + return false; } } return true; @@ -468,7 +468,7 @@ bool clear_qp_info(struct RdmaContext* ctx) { success = false; } } - + if (ctx->cq) { if (ibv_destroy_cq(ctx->cq)) { ERR("Failed to deallocate cq Domain."); @@ -565,7 +565,7 @@ struct RdmaContext* create_qp(struct IbDeviceInfo* ib_dev, struct ibv_pd** g_pd) return NULL; } - INFO("Successfully created QP 0x%x on device %s", + INFO("Successfully created QP 0x%x on device %s", ctx->qp->qp_num, ib_dev->devName); return ctx; @@ -601,10 +601,10 @@ bool client_exchange_destinations( ERR("Failed to get port info for port %d", ib_port); return false; } - + my_dest.lid = ctx->portinfo.lid; my_dest.mtu = ctx->portinfo.active_mtu; - + // Validate LID for InfiniBand if (ctx->portinfo.link_layer != IBV_LINK_LAYER_ETHERNET && !my_dest.lid) { ERR("Invalid LID 0x%04x for non-Ethernet link layer", my_dest.lid); @@ -722,24 +722,24 @@ bool server_exchange_mr(struct RdmaContext *ctx) { auto layer_num = ctx->conn.layer_number; auto& key_mrs = ctx->conn.write_cache_key_server_mr_list; auto& val_mrs = ctx->conn.write_cache_value_server_mr_list; - + // Verify that server memory regions are properly initialized if (key_mrs.size() != layer_num || val_mrs.size() != layer_num) { ERR("server write cache memory region size error"); return false; } - + // Prepare memory region information to send std::vector send_key_ptrs; std::vector send_key_rkeys; std::vector send_val_ptrs; std::vector send_val_rkeys; - + send_key_ptrs.reserve(layer_num); send_key_rkeys.reserve(layer_num); send_val_ptrs.reserve(layer_num); send_val_rkeys.reserve(layer_num); - + // Collect memory region information from local MRs for (int i = 0; i < layer_num; ++i) { send_key_ptrs.push_back(reinterpret_cast(key_mrs[i]->addr)); @@ -753,13 +753,13 @@ bool server_exchange_mr(struct RdmaContext *ctx) { if (!exchange_mr_vector(ctx, send_key_rkeys, false)) return false; if (!exchange_mr_vector(ctx, send_val_ptrs, false)) return false; if (!exchange_mr_vector(ctx, send_val_rkeys, false)) return false; - + return true; } /** * Send memory region information from server to client - * + * * @param ctx The RDMA context * @param local_mr Pointer to the local memory region to be sent * @param byte_num Size of the memory region in bytes @@ -796,16 +796,16 @@ bool server_send_memory_region(struct RdmaContext *ctx, void *local_mr, int byte ibv_dereg_mr(ctx->conn.send_mr); return false; } - + // Wait for completion struct ibv_wc wc; ctx->conn.wc_count = 0; ctx->conn.wc_target_count = 0; - + if (!poll_cq_with_timeout(ctx, RDMA_POLL_CQE_TIMEOUT, 1)) { return false; } - + // Deregister the memory region ibv_dereg_mr(ctx->conn.send_mr); return true; @@ -813,7 +813,7 @@ 
bool server_send_memory_region(struct RdmaContext *ctx, void *local_mr, int byte /** * Receive memory region information on the client side - * + * * @param ctx The RDMA context * @param remote_mr Pointer to the buffer where remote memory region info will be stored * @param byte_num Size of the memory region in bytes @@ -863,17 +863,17 @@ bool client_receive_memory_region(struct RdmaContext *ctx, void *remote_mr, int /** * Sets up a listening socket on the specified port - * + * * @param port The port number to listen on * @return The socket file descriptor on success, -1 on failure */ int setup_listening_socket(int port) { int sockfd = -1; struct addrinfo hints = {0}; - + // Set up hints for getaddrinfo hints.ai_flags = AI_PASSIVE; - hints.ai_family = AF_UNSPEC; + hints.ai_family = AF_UNSPEC; hints.ai_socktype = SOCK_STREAM; struct addrinfo *res = nullptr; @@ -881,14 +881,14 @@ int setup_listening_socket(int port) { // Convert port to string for getaddrinfo std::ostringstream service; service << port; - + // Get address info for the specified port int n = getaddrinfo(nullptr, service.str().c_str(), &hints, &res); if (n != 0) { ERR("getaddrinfo failed for port %d: %s", port, gai_strerror(n)); return -1; } - + // Check if a specific network interface is specified const char *ifname = KVCacheConfig::getInstance().get_socket_interface(); // Try each address until we successfully bind to one @@ -913,7 +913,7 @@ int setup_listening_socket(int port) { // Enable address reuse n = 1; setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &n, sizeof(n)); - + // Attempt to bind to the address if (bind(sockfd, t->ai_addr, t->ai_addrlen) == 0) { break; // Successful bind @@ -948,7 +948,7 @@ int setup_listening_socket(int port) { close(sockfd); return -1; } - + // Enable TCP keep-alive int enable = 1; if (setsockopt(sockfd, SOL_SOCKET, SO_KEEPALIVE, &enable, sizeof(enable)) < 0) { diff --git a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/src/kvcache_rdma.cpp b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/src/kvcache_rdma.cpp index 16df80701..3f2d21016 100644 --- a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/src/kvcache_rdma.cpp +++ b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/src/kvcache_rdma.cpp @@ -3,13 +3,13 @@ * @brief RDMA-based Key-Value Cache Communication Implementation * @version 1.0.0 * @copyright Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. - * + * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
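For readers less familiar with the verbs-side socket code, a rough Python equivalent of the listener set up by `setup_listening_socket` above (IPv4 only; the original also honours `KVCACHE_SOCKET_IFNAME` and walks the full getaddrinfo result list):

```python
import socket


def setup_listening_socket(port: int) -> socket.socket:
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)  # allow fast restarts, as in the C++ code
    sock.bind(("", port))
    sock.listen(128)
    sock.setsockopt(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1)  # TCP keep-alive on the listener
    return sock
```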
@@ -34,15 +34,15 @@ /** * @brief Construct a new RDMACommunicator object - * + * * @param role Role in distributed system ("decode" or "prefill") * @param gpu_idx GPU device index to use * @param port Communication port number * @param local_key_cache Vector of local key cache pointers - * @param local_value_cache Vector of local value cache pointers + * @param local_value_cache Vector of local value cache pointers * @param block_number Number of blocks in cache * @param block_bytes Size of each block in bytes - * + * * @throws std::runtime_error If initialization fails */ RDMACommunicator::RDMACommunicator(std::string &role, int gpu_idx, @@ -50,16 +50,16 @@ RDMACommunicator::RDMACommunicator(std::string &role, int gpu_idx, std::vector local_key_cache, std::vector local_value_cache, int block_number, int block_bytes) - : splitwise_role(role), - gpu_idx(gpu_idx), + : splitwise_role(role), + gpu_idx(gpu_idx), port(port), local_cache_key_ptr_layer_head_(std::move(local_key_cache)), local_cache_value_ptr_layer_head_(std::move(local_value_cache)), - block_number(block_number), + block_number(block_number), block_size_byte(block_bytes), RDMACommunicator_status(0), rdma_event_channel_epoll_fd(-1) { - + try { WARN("Initializing RDMA communicator for role: %s", role.c_str()); @@ -80,7 +80,7 @@ RDMACommunicator::RDMACommunicator(std::string &role, int gpu_idx, // Step 3:Initialize the event channel rdma_event_channel_epoll_fd = epoll_create1(EPOLL_CLOEXEC); if (rdma_event_channel_epoll_fd < 0) { - throw std::runtime_error("Failed to create epoll fd: " + + throw std::runtime_error("Failed to create epoll fd: " + std::string(strerror(errno))); } @@ -112,7 +112,7 @@ void RDMACommunicator::resize_vectors() { if (layer_number <= 0) { throw std::runtime_error("Invalid layer number"); } - + local_cache_key_ptr_per_layer.resize(layer_number); local_cache_value_ptr_per_layer.resize(layer_number); } @@ -126,9 +126,9 @@ void RDMACommunicator::assign_pointers() { // Assign pointers for each layer and block for (int layer_idx = 0; layer_idx < layer_number; ++layer_idx) { // Validate layer head pointers - if (local_cache_key_ptr_layer_head_[layer_idx] == 0 || + if (local_cache_key_ptr_layer_head_[layer_idx] == 0 || local_cache_value_ptr_layer_head_[layer_idx] == 0) { - throw std::runtime_error("Invalid cache pointer for layer " + + throw std::runtime_error("Invalid cache pointer for layer " + std::to_string(layer_idx)); } @@ -140,12 +140,12 @@ void RDMACommunicator::assign_pointers() { for (int block_idx = 0; block_idx < block_number; ++block_idx) { local_cache_key_ptr_per_layer[layer_idx][block_idx] = reinterpret_cast( - local_cache_key_ptr_layer_head_[layer_idx] + + local_cache_key_ptr_layer_head_[layer_idx] + block_idx * block_size_byte); - + local_cache_value_ptr_per_layer[layer_idx][block_idx] = reinterpret_cast( - local_cache_value_ptr_layer_head_[layer_idx] + + local_cache_value_ptr_layer_head_[layer_idx] + block_idx * block_size_byte); } } @@ -214,7 +214,7 @@ RDMACommunicator::~RDMACommunicator() { int RDMACommunicator::start_server(int sport, int sgid_idx, int gpu_index) { WARN("verbs server starting …"); - + int sockfd = setup_listening_socket(sport); if (sockfd < 0) { ERR("Failed to set up listening socket"); @@ -244,7 +244,7 @@ int RDMACommunicator::start_server(int sport, int sgid_idx, int gpu_index) { struct RdmaContext* contexts[RDMA_TCP_CONNECT_SIZE] = {nullptr}; while (RDMACommunicator_status == 1) { - int nfds = epoll_wait(epollfd, events, 10, -1); + int nfds = epoll_wait(epollfd, events, 10, 
-1); if (nfds < 0) { if (errno == EINTR) continue; ERR("epoll_wait failed: %s", strerror(errno)); @@ -292,7 +292,7 @@ int RDMACommunicator::start_server(int sport, int sgid_idx, int gpu_index) { ctx->conn.block_byte_size = block_size_byte; ctx->conn.local_cache_key_ptr_per_layer = local_cache_key_ptr_per_layer; ctx->conn.local_cache_value_ptr_per_layer = local_cache_value_ptr_per_layer; - + std::lock_guard lock(mutex_); if(!server_mr_register_per_layer(ctx)){ ERR("server_mr_register_per_layer failed"); @@ -394,7 +394,7 @@ void RDMACommunicator::close_client_connection(int fd, struct RdmaContext* ctx, } conn_map.erase(ctx->conn.url); - + for (size_t i = 0; i < ctx->conn.read_bufs.size(); ++i) { if (ctx->conn.read_mrs[i]) ibv_dereg_mr(ctx->conn.read_mrs[i]); if (ctx->conn.read_bufs[i]) free(ctx->conn.read_bufs[i]); @@ -402,7 +402,7 @@ void RDMACommunicator::close_client_connection(int fd, struct RdmaContext* ctx, ctx->conn.read_bufs.clear(); ctx->conn.read_mrs.clear(); - + ctx->conn.connected = 0; if (!clear_qp_info(ctx)) { LOGD("Failed to clear memory regions for Connection fd %d", fd); @@ -465,7 +465,7 @@ std::string RDMACommunicator::fetch_local_ip() { * Connect to a remote RDMA endpoint * * Establishes an RDMA connection with the specified destination IP and port. - * + * * @param dst_ip Destination IP address * @param dst_port Destination port * @return ConnStatus::kConnected ConnStatus::kError; @@ -503,7 +503,7 @@ int RDMACommunicator::connect(const std::string &dst_ip, ctx->conn.layer_number = layer_number; ctx->conn.block_number = block_number; ctx->conn.block_byte_size = block_size_byte; - + // Get port information for the connection if (get_port_info(ctx->context, ib_dev->port, &ctx->portinfo)) { ERR("Couldn't get port info"); @@ -516,7 +516,7 @@ int RDMACommunicator::connect(const std::string &dst_ip, } // Exchange connection information with remote peer - if (!client_exchange_destinations(ctx, ib_dev->port, KVCacheConfig::getInstance().resolve_rdma_dest_port(dst_port), + if (!client_exchange_destinations(ctx, ib_dev->port, KVCacheConfig::getInstance().resolve_rdma_dest_port(dst_port), KVCacheConfig::getInstance().get_rdma_gid_index(), dst_ip)) { ERR("Couldn't getexchange port infodestinations"); return static_cast(ConnStatus::kError); @@ -641,7 +641,7 @@ void RDMACommunicator::remove_conn(const std::string& url) { } struct RdmaContext *RDMACommunicator::get_conn(const std::string &ip, - const std::string &port) { + const std::string &port) { std::string url = ip + ":" + port; if (conn_map.find(url) == conn_map.end()) { return NULL; @@ -660,9 +660,9 @@ struct RdmaContext *RDMACommunicator::get_conn(const std::string &ip, * @throws std::runtime_error Throws an exception if registration fails */ struct ibv_mr* RDMACommunicator::register_memory_region( - ibv_pd* pd, void* addr, size_t size, + ibv_pd* pd, void* addr, size_t size, const std::string& desc, uint32_t access_flags) { - + if (!pd || !addr || size == 0) { throw std::invalid_argument("Invalid memory region parameters"); } @@ -675,11 +675,11 @@ struct ibv_mr* RDMACommunicator::register_memory_region( struct ibv_mr* mr = ibv_reg_mr(pd, addr, size, access_flags); if (!mr) { - throw std::runtime_error("Failed to register memory region " + desc + + throw std::runtime_error("Failed to register memory region " + desc + ": " + strerror(errno)); } - LOGD("Registered %s MR: addr=%p, size=%zu, flags=0x%x, lkey=0x%x", + LOGD("Registered %s MR: addr=%p, size=%zu, flags=0x%x, lkey=0x%x", desc.c_str(), addr, size, access_flags, mr->lkey); 
return mr; } @@ -744,7 +744,7 @@ fail: /** * @brief Register server-side memory regions for RDMA operations * @param ctx RDMA context containing protection domain and other resources - * + * * @details This method registers memory regions for both keys and values * for each layer, enabling remote read/write access. */ @@ -850,7 +850,7 @@ int RDMACommunicator::write_cache(const std::string &ip, for (size_t block_index = 0; block_index < block_num; ++block_index) { char* char_ptr = static_cast(ctx->conn.write_cache_key_remote_ptr_list[layer_idx]); - cache_key_remote_addr[block_index] = + cache_key_remote_addr[block_index] = (uint64_t(char_ptr + remote_block_ids[block_index] * block_size_byte)); char_ptr = static_cast(ctx->conn.write_cache_value_remote_ptr_list[layer_idx]); cache_value_remote_addr[block_index] = @@ -869,28 +869,28 @@ int RDMACommunicator::write_cache(const std::string &ip, if (KVCacheConfig::getInstance().is_debug_mode_enabled()) { auto duration_us = std::chrono::duration_cast( std::chrono::steady_clock::now() - start_time).count(); - + DEBUG("Write cache completed - IP: %s, Port: %s, Layer: %d, BlockSize: %d, Blocks: %lu, Duration: %ld us", ip.c_str(), port.c_str(), layer_idx, block_size_byte, block_num, duration_us); } - return 0; + return 0; } -bool RDMACommunicator::post_block_send(struct RdmaContext* ctx, int layer_idx, - const std::vector& local_block_ids, - bool is_key, std::vector& remote_addr, - uint32_t rkey, const std::string &ip, +bool RDMACommunicator::post_block_send(struct RdmaContext* ctx, int layer_idx, + const std::vector& local_block_ids, + bool is_key, std::vector& remote_addr, + uint32_t rkey, const std::string &ip, const std::string &port) { auto block_num = local_block_ids.size(); assert(block_num > 0 && "block_num must be > 0"); - bool success = execute_rdma_writes(ctx, layer_idx, local_block_ids, + bool success = execute_rdma_writes(ctx, layer_idx, local_block_ids, is_key, remote_addr, rkey); - + if (success) { if (KVCacheConfig::getInstance().is_gdrcopy_flush_enabled()) { const size_t last_idx = block_num - 1; - success = execute_read_verification(ctx, last_idx, remote_addr[last_idx], + success = execute_read_verification(ctx, last_idx, remote_addr[last_idx], rkey, layer_idx, ip, port); } } @@ -905,22 +905,22 @@ bool RDMACommunicator::execute_rdma_writes(struct RdmaContext* ctx, int layer_id auto block_num = local_block_ids.size(); struct ibv_sge* sge_list = new ibv_sge[block_num]; struct ibv_send_wr* send_wr_list = new ibv_send_wr[block_num]; - - prepare_write_requests(sge_list, send_wr_list, layer_idx, + + prepare_write_requests(sge_list, send_wr_list, layer_idx, local_block_ids, is_key, remote_addr, rkey); - + bool success = true; size_t inflight_wr = 0; - + for (size_t scnt = 0; scnt < block_num; ++scnt) { size_t idx = scnt % RDMA_WR_LIST_MAX_SIZE; inflight_wr++; - + bool is_batch_end = (idx == RDMA_WR_LIST_MAX_SIZE - 1 || scnt == block_num - 1); bool need_poll = (inflight_wr >= RDMA_SQ_MAX_SIZE || scnt == block_num - 1); - + if (is_batch_end) { - if (!post_send_with_retry(ctx, &send_wr_list[scnt - idx], + if (!post_send_with_retry(ctx, &send_wr_list[scnt - idx], inflight_wr, need_poll)) { success = false; break; @@ -930,7 +930,7 @@ bool RDMACommunicator::execute_rdma_writes(struct RdmaContext* ctx, int layer_id } } } - + delete[] sge_list; delete[] send_wr_list; return success; @@ -944,19 +944,19 @@ void RDMACommunicator::prepare_write_requests(struct ibv_sge* sge_list, std::vector& remote_addr, uint32_t rkey) { auto block_num = 
local_block_ids.size(); - + for (size_t i = 0; i < block_num; ++i) { - sge_list[i].addr = (uintptr_t)(is_key ? - local_cache_key_ptr_per_layer[layer_idx][local_block_ids[i]] : + sge_list[i].addr = (uintptr_t)(is_key ? + local_cache_key_ptr_per_layer[layer_idx][local_block_ids[i]] : local_cache_value_ptr_per_layer[layer_idx][local_block_ids[i]]); sge_list[i].length = block_size_byte; - sge_list[i].lkey = (is_key ? - write_mr_key_list[layer_idx]->lkey : + sge_list[i].lkey = (is_key ? + write_mr_key_list[layer_idx]->lkey : write_mr_value_list[layer_idx]->lkey); - + size_t idx = i % RDMA_WR_LIST_MAX_SIZE; send_wr_list[i].wr_id = i; - send_wr_list[i].next = (idx == RDMA_WR_LIST_MAX_SIZE - 1 || i == block_num - 1) ? + send_wr_list[i].next = (idx == RDMA_WR_LIST_MAX_SIZE - 1 || i == block_num - 1) ? nullptr : &send_wr_list[i + 1]; send_wr_list[i].sg_list = &sge_list[i]; send_wr_list[i].num_sge = 1; @@ -975,7 +975,7 @@ bool RDMACommunicator::post_send_with_retry(struct RdmaContext* ctx, int retries = 0; int ret = 0; struct ibv_send_wr* bad_wr = nullptr; - + if (inflight_wr >= RDMA_SQ_MAX_SIZE && wr_list) { struct ibv_send_wr* last_wr = wr_list; while (last_wr->next) { @@ -983,7 +983,7 @@ bool RDMACommunicator::post_send_with_retry(struct RdmaContext* ctx, } last_wr->send_flags |= IBV_SEND_SIGNALED; } - + do { ret = ibv_post_send(ctx->qp, wr_list, &bad_wr); if (ret == 0) { @@ -997,14 +997,14 @@ bool RDMACommunicator::post_send_with_retry(struct RdmaContext* ctx, } return true; } else { - ERR("ibv_post_send failed: %s (errno: %d), retry %d/%d", + ERR("ibv_post_send failed: %s (errno: %d), retry %d/%d", strerror(errno), errno, retries + 1, max_retries); usleep(1000); retries++; } } while (retries < max_retries); - - ERR("ibv_post_send failed after %d retries: %s (errno: %d)", + + ERR("ibv_post_send failed after %d retries: %s (errno: %d)", retries, strerror(errno), errno); return false; } @@ -1053,4 +1053,4 @@ bool RDMACommunicator::execute_read_verification(struct RdmaContext* ctx, } return true; -} \ No newline at end of file +} diff --git a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/src/log.cpp b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/src/log.cpp index 3b48b316f..603ff6595 100644 --- a/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/src/log.cpp +++ b/fastdeploy/cache_manager/transfer_factory/kvcache_transfer/src/log.cpp @@ -3,13 +3,13 @@ * @brief Logging module implementation for key-value cache system * @version 1.0.0 * @copyright Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. - * + * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
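The per-block pointer setup in assign_pointers and the remote-address computation in write_cache above rely on the same arithmetic: every block of a layer sits at a fixed offset from that layer's head pointer. The sketch below restates that arithmetic in Python, assuming pointers are plain integers; the helper names are illustrative and not part of the module.

def block_addr(layer_head_ptr: int, block_idx: int, block_size_byte: int) -> int:
    # Block block_idx starts block_idx * block_size_byte bytes past the layer head.
    return layer_head_ptr + block_idx * block_size_byte

def build_block_table(layer_heads, block_number, block_size_byte):
    # Local side: the (layer, block) pointer table that assign_pointers fills in.
    return [
        [block_addr(head, b, block_size_byte) for b in range(block_number)]
        for head in layer_heads
    ]

# Remote side: write_cache resolves each remote_block_id against the remote head
# pointer exchanged at connection time using the same formula.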
@@ -134,7 +134,7 @@ void debug_init() { buffer[len++] = '\n'; if (global_error_file != NULL) { fwrite(buffer, 1, len, global_error_file); - } + } } __atomic_store_n(&global_debug_level, tempg_kv_cache_debug_level, __ATOMIC_RELEASE); pthread_mutex_unlock(&global_debug_lock); diff --git a/fastdeploy/cache_manager/transfer_factory/rdma_cache_transfer.py b/fastdeploy/cache_manager/transfer_factory/rdma_cache_transfer.py index 281548f8f..f90abe798 100644 --- a/fastdeploy/cache_manager/transfer_factory/rdma_cache_transfer.py +++ b/fastdeploy/cache_manager/transfer_factory/rdma_cache_transfer.py @@ -24,13 +24,24 @@ class RDMACommManager: RDMACommManager to manage rdma communication """ - def __init__(self, splitwise_role, rank, gpu_id, cache_k_ptr_list, \ - cache_v_ptr_list, max_block_num, block_bytes, rdma_port): + def __init__( + self, + splitwise_role, + rank, + gpu_id, + cache_k_ptr_list, + cache_v_ptr_list, + max_block_num, + block_bytes, + rdma_port, + ): try: import rdma_comm except: - logger.error(f"The installation of the RDMA library failed." \ - "Confirm whether your network card supports RDMA transmission.") + logger.error( + "The installation of the RDMA library failed." + "Confirm whether your network card supports RDMA transmission." + ) return self.messager = rdma_comm.RDMACommunicator( splitwise_role, @@ -50,7 +61,7 @@ class RDMACommManager: Connect to remote gpu and write cache. """ assert self.splitwise_role == "prefill", "only prefill can call this method" - addr = f"{ip}:{str(port)}" + addr = f"{ip}:{port!s}" if addr in self.connected_rdma: return True ret = self.messager.is_connected(ip, str(port)) @@ -59,18 +70,13 @@ class RDMACommManager: return True ret = self.messager.connect(ip, str(port)) - logger.info( - f"connect to remote rdma address {ip}:{port} status is {ret}") + logger.info(f"connect to remote rdma address {ip}:{port} status is {ret}") if ret == 0: self.connected_rdma.add(addr) return ret == 0 - def write_cache(self, ip, port, local_block_ids, remote_block_ids, - layer_idx): + def write_cache(self, ip, port, local_block_ids, remote_block_ids, layer_idx): """ Connect to remote gpu and write cache. """ - return self.messager.write_cache(ip, str(port), local_block_ids, - remote_block_ids, layer_idx) - - + return self.messager.write_cache(ip, str(port), local_block_ids, remote_block_ids, layer_idx) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index f0d72bf15..98d118896 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -24,12 +24,12 @@ from typing import Literal, Optional from paddleformers.transformers.configuration_utils import PretrainedConfig from fastdeploy import envs -from fastdeploy.model_executor.layers.quantization.quant_base import \ - QuantConfigBase +from fastdeploy.model_executor.layers.quantization.quant_base import QuantConfigBase from fastdeploy.utils import get_logger logger = get_logger("config", "config.log") + class MoEPhase(Enum): """ The generation phase of the moe. 
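The reworked RDMACommManager.connect_rdma in rdma_cache_transfer.py follows a connect-once pattern: an address is dialed at most once and then remembered in a set. A minimal Python sketch of that pattern, with ConnectionCache and the messager argument as illustrative stand-ins, is shown here; the 0-means-success convention mirrors the binding used in the diff.

class ConnectionCache:
    def __init__(self, messager):
        self.messager = messager          # e.g. the rdma_comm.RDMACommunicator binding
        self.connected = set()

    def connect(self, ip: str, port: int) -> bool:
        addr = f"{ip}:{port}"
        if addr in self.connected:        # already established, skip the handshake
            return True
        if self.messager.is_connected(ip, str(port)):
            self.connected.add(addr)
            return True
        ret = self.messager.connect(ip, str(port))
        if ret == 0:                      # the underlying connect() returns 0 on success
            self.connected.add(addr)
        return ret == 0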
@@ -38,13 +38,14 @@ class MoEPhase(Enum): PREFILL = 1 DECODER = 2 + class ErnieArchitectures: """Helper class for ERNIE architecture check.""" - + ARCHITECTURES = { "Ernie4_5_ForCausalLM", - "Ernie4_5_MoeForCausalLM", - "Ernie4_5_VLMoeForConditionalGeneration" + "Ernie4_5_MoeForCausalLM", + "Ernie4_5_VLMoeForConditionalGeneration", } @classmethod @@ -57,23 +58,24 @@ class ErnieArchitectures: """Check if the given architecture is an ERNIE architecture.""" return architecture in cls.ARCHITECTURES + PRETRAINED_INIT_CONFIGURATION = { - "rope_theta" : 10000.0, - "num_key_value_heads" : -1, - "start_layer_index" : 0, - "moe_num_shared_experts" : 0, - "moe_layer_start_index" : 0, - "num_max_dispatch_tokens_per_rank" : 256, - "moe_use_aux_free" : False, - "vocab_size" : -1, - "hidden_dropout_prob" : 0.0, - "initializer_range" : 0.02, - "max_position_embeddings" : 512, - "quantization_config" : None, - "tie_word_embeddings" : False, - "rms_norm_eps" : 1e-5, - "moe_num_experts" : None, - "moe_layer_end_index" : None, + "rope_theta": 10000.0, + "num_key_value_heads": -1, + "start_layer_index": 0, + "moe_num_shared_experts": 0, + "moe_layer_start_index": 0, + "num_max_dispatch_tokens_per_rank": 256, + "moe_use_aux_free": False, + "vocab_size": -1, + "hidden_dropout_prob": 0.0, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "quantization_config": None, + "tie_word_embeddings": False, + "rms_norm_eps": 1e-5, + "moe_num_experts": None, + "moe_layer_end_index": None, } @@ -81,6 +83,7 @@ class ModelConfig: """ The configuration class to store the configuration of a `LLM`. """ + def __init__( self, args, @@ -134,6 +137,7 @@ class ModelConfig: class ParallelConfig: """Configuration for the distributed execution.""" + def __init__( self, args, @@ -213,10 +217,8 @@ class ParallelConfig: self.enable_custom_all_reduce: bool = False # pd_disaggregation - use_pd_disaggregation: int = int( - os.getenv("FLAGS_use_pd_disaggregation", 0)) - use_pd_disaggregation_per_chunk: int = int( - os.getenv("FLAGS_use_pd_disaggregation_per_chunk", 0)) + use_pd_disaggregation: int = int(os.getenv("FLAGS_use_pd_disaggregation", 0)) + use_pd_disaggregation_per_chunk: int = int(os.getenv("FLAGS_use_pd_disaggregation_per_chunk", 0)) if use_pd_disaggregation_per_chunk: self.pd_disaggregation_mode = "per_chunk" elif use_pd_disaggregation: @@ -224,10 +226,12 @@ class ParallelConfig: else: self.pd_disaggregation_mode = "None" + class SpeculativeConfig: """ Configuration for speculative decoding. """ + def __init__( self, args, @@ -261,22 +265,26 @@ class SpeculativeConfig: # This ensures that the specified simulation acceptance rate is not affected. self.benchmark_mode: bool = False - #TODO(YuanRisheng): The name of the server args is different from the name of the SpeculativeConfig. - #We temperately add the name map here and will delete it in future. - name_map = {"speculative_method": "method", - "speculative_max_draft_token_num": "num_speculative_tokens", - "speculative_model_name_or_path": "model_name_or_path", - "speculative_model_quantization": "quantization", - "speculative_benchmark_mode": "benchmark_mode"} + # TODO(YuanRisheng): The name of the server args is different from the name of the SpeculativeConfig. + # We temperately add the name map here and will delete it in future. 
+ name_map = { + "speculative_method": "method", + "speculative_max_draft_token_num": "num_speculative_tokens", + "speculative_model_name_or_path": "model_name_or_path", + "speculative_model_quantization": "quantization", + "speculative_benchmark_mode": "benchmark_mode", + } for key, value in args.items(): if key in name_map.keys() and hasattr(self, name_map[key]): setattr(self, name_map[key], value) + class DeviceConfig: """ Configuration for device settings. """ + def __init__( self, args, @@ -286,6 +294,7 @@ class DeviceConfig: if hasattr(self, key): setattr(self, key, value) + @dataclass class GraphOptimizationConfig: """ @@ -336,15 +345,10 @@ class GraphOptimizationConfig: full_cuda_graph: bool = True max_capture_size: int = field(default=None, init=False) # type: ignore - batch_size_to_captured_size: dict[int, - int] = field(default=None, - init=False) # type: ignore + batch_size_to_captured_size: dict[int, int] = field(default=None, init=False) # type: ignore # CINN Config ... - def init_with_cudagrpah_size( - self, - max_num_seqs:int = 0 - ) -> None: + def init_with_cudagrpah_size(self, max_num_seqs: int = 0) -> None: """ Initialize cuda graph capture sizes and pre-compute the mapping from batch size to padded graph size @@ -353,32 +357,28 @@ class GraphOptimizationConfig: self.cudagraph_capture_sizes = [size for size in self.cudagraph_capture_sizes if size <= max_num_seqs] dedup_sizes = list(set(self.cudagraph_capture_sizes)) if len(dedup_sizes) < len(self.cudagraph_capture_sizes): - logger.info(("cudagraph sizes specified by model runner" - " %s is overridden by config %s"), - self.cudagraph_capture_sizes, dedup_sizes) + logger.info( + ("cudagraph sizes specified by model runner" " %s is overridden by config %s"), + self.cudagraph_capture_sizes, + dedup_sizes, + ) self.cudagraph_capture_sizes = dedup_sizes # Sort to make sure cudagraph capture sizes are in descending order self.cudagraph_capture_sizes.sort(reverse=True) - self.max_capture_size = self.cudagraph_capture_sizes[ - 0] if self.cudagraph_capture_sizes else 0 + self.max_capture_size = self.cudagraph_capture_sizes[0] if self.cudagraph_capture_sizes else 0 # Pre-compute the mapping from batch size to padded graph size self.batch_size_to_captured_size = {} - for end, start in zip(self.cudagraph_capture_sizes, - self.cudagraph_capture_sizes[1:] + [0]): + for end, start in zip(self.cudagraph_capture_sizes, self.cudagraph_capture_sizes[1:] + [0]): for bs in range(start, end): if bs == start: self.batch_size_to_captured_size[bs] = start else: self.batch_size_to_captured_size[bs] = end - self.batch_size_to_captured_size[ - self.max_capture_size] = self.max_capture_size + self.batch_size_to_captured_size[self.max_capture_size] = self.max_capture_size - def _set_cudagraph_sizes( - self, - max_num_seqs:int = 0 - ): + def _set_cudagraph_sizes(self, max_num_seqs: int = 0): """ Calculate a series of candidate capture batch sizes, and then extract a portion of them as the capture list for the CUDA graph based on user input. 
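init_with_cudagrpah_size above builds batch_size_to_captured_size by deduplicating the capture list, sorting it in descending order, and padding every intermediate batch size up to the next capture size. A standalone sketch of that padding-map logic follows; the helper name is illustrative.

def build_padding_map(capture_sizes, max_num_seqs):
    # Keep only sizes that fit, drop duplicates, sort descending (as the config does).
    sizes = sorted({s for s in capture_sizes if s <= max_num_seqs}, reverse=True)
    max_capture = sizes[0] if sizes else 0
    mapping = {}
    for end, start in zip(sizes, sizes[1:] + [0]):
        for bs in range(start, end):
            # Exact capture sizes map to themselves; anything in between pads up to `end`.
            mapping[bs] = start if bs == start else end
    mapping[max_capture] = max_capture
    return mapping

# build_padding_map([1, 2, 4, 8], 8) pads batch size 3 up to 4 and 5..7 up to 8.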
@@ -405,24 +405,28 @@ class LoadConfig: - 'ipc_snapshot': Load from disk snapshot of IPC weights - None: No dynamic loading """ + def __init__( self, args, ): self.use_fastsafetensor = int(envs.FD_USE_FASTSAFETENSOR) == 1 self.dynamic_load_weight: bool = False - self.load_strategy: Optional[Literal['ipc', 'ipc_snapshot']] = None + self.load_strategy: Optional[Literal["ipc", "ipc_snapshot"]] = None for key, value in args.items(): if hasattr(self, key): setattr(self, key, value) + class LoRAConfig: - """ LoRA Config """ + """LoRA Config""" + pass class KVCacheConfig: - """ KV Cache Config """ + """KV Cache Config""" + cache_quant_dtype: str = "none" @@ -430,6 +434,7 @@ class DecodingConfig: """ Configuration for decoding """ + def __init__( self, args, @@ -439,26 +444,24 @@ class DecodingConfig: if hasattr(self, key): setattr(self, key, value) + @dataclass class FDConfig: """ The configuration class which contains all fastdeploy-related configuration. This simplifies passing around the distinct configurations in the codebase. """ + model_config: ModelConfig = field(default=None, init=True) # type: ignore parallel_config: ParallelConfig = field(default=None, init=True) - speculative_config: SpeculativeConfig = field(default=None, - init=True) # type: ignore - device_config: DeviceConfig = field(default=None, - init=True) # type: ignore + speculative_config: SpeculativeConfig = field(default=None, init=True) # type: ignore + device_config: DeviceConfig = field(default=None, init=True) # type: ignore load_config: LoadConfig = field(default=None, init=True) quant_config: Optional[QuantConfigBase] = None graph_opt_config: Optional[GraphOptimizationConfig] = None - decoding_config: DecodingConfig = field(default=None, - init=True) # type: ignore - kv_cache_config: KVCacheConfig = field(default=None, - init=True) # type: ignore + decoding_config: DecodingConfig = field(default=None, init=True) # type: ignore + kv_cache_config: KVCacheConfig = field(default=None, init=True) # type: ignore def __post_init__(self): # Initialize cuda graph capture list @@ -466,6 +469,6 @@ class FDConfig: self.graph_opt_config._set_cudagraph_sizes(max_num_seqs=self.parallel_config.max_num_seqs) self.graph_opt_config.init_with_cudagrpah_size(max_num_seqs=self.parallel_config.max_num_seqs) - #TODO(wangmingkai02): change graph_opt_level=2 when using static mode with cinn + # TODO(wangmingkai02): change graph_opt_level=2 when using static mode with cinn if self.graph_opt_config.graph_opt_level == 2: self.graph_opt_config.graph_opt_level = 1 diff --git a/fastdeploy/demo/offline_demo.py b/fastdeploy/demo/offline_demo.py index 856757aa0..c02bdb45c 100644 --- a/fastdeploy/demo/offline_demo.py +++ b/fastdeploy/demo/offline_demo.py @@ -22,8 +22,6 @@ model_name_or_path = "./models/llama-7b" # 超参设置 sampling_params = SamplingParams(temperature=0.1, max_tokens=30) llm = LLM(model=model_name_or_path, tensor_parallel_size=1) -output = llm.generate(prompts="who are you?", - use_tqdm=True, - sampling_params=sampling_params) +output = llm.generate(prompts="who are you?", use_tqdm=True, sampling_params=sampling_params) print(output) diff --git a/fastdeploy/demo/offline_disaggregated_demo.py b/fastdeploy/demo/offline_disaggregated_demo.py index 82831649c..9dbb53655 100644 --- a/fastdeploy/demo/offline_disaggregated_demo.py +++ b/fastdeploy/demo/offline_disaggregated_demo.py @@ -14,55 +14,50 @@ # limitations under the License. 
""" -import time -import os import multiprocessing +import os +import time from fastdeploy.entrypoints.llm import LLM -from fastdeploy.engine.sampling_params import SamplingParams - - model_name_or_path = "baidu/ERNIE-4.5-21B-A3B-Paddle" - def start_decode(model_name_or_path): - os.environ["CUDA_VISIBLE_DEVICES"] = "1" + os.environ["CUDA_VISIBLE_DEVICES"] = "1" os.environ["FD_LOG_DIR"] = "log_decode" llm_decode = LLM( - model=model_name_or_path, - tensor_parallel_size=1, + model=model_name_or_path, + tensor_parallel_size=1, splitwise_role="decode", engine_worker_queue_port=6678, innode_prefill_ports=[6676], - cache_queue_port=55668 - ) + cache_queue_port=55668, + ) return llm_decode + def start_prefill(model_name_or_path): os.environ["CUDA_VISIBLE_DEVICES"] = "0" os.environ["FD_LOG_DIR"] = "log_prefill" - llm_prefill = LLM( - model=model_name_or_path, - tensor_parallel_size=1, + LLM( + model=model_name_or_path, + tensor_parallel_size=1, splitwise_role="prefill", - engine_worker_queue_port=6677, + engine_worker_queue_port=6677, cache_queue_port=55667, - ) + ) def main(): - prefill = multiprocessing.Process( - target=start_prefill, - args=(model_name_or_path,)).start() + prefill = multiprocessing.Process(target=start_prefill, args=(model_name_or_path,)).start() time.sleep(10) llm_decode = start_decode(model_name_or_path) output = llm_decode.generate(prompts=["who are you?", "what can you do?"], use_tqdm=True) print(output) - decode.join() + prefill.join() if __name__ == "__main__": diff --git a/fastdeploy/demo/offline_prefix_caching_demo.py b/fastdeploy/demo/offline_prefix_caching_demo.py index 3465d2402..16e660b13 100644 --- a/fastdeploy/demo/offline_prefix_caching_demo.py +++ b/fastdeploy/demo/offline_prefix_caching_demo.py @@ -40,10 +40,10 @@ sampling_params = SamplingParams(temperature=1, top_p=0.0) model = "baidu/ERNIE-4.5-21B-A3B-Paddle" prefix_cached_llm = LLM( - model=model, - quantization="wint4", - enable_prefix_caching=True, - ) + model=model, + quantization="wint4", + enable_prefix_caching=True, +) prefix_outputs = prefix_cached_llm.generate(generating_prompts, sampling_params) diff --git a/fastdeploy/demo/openai_demo.py b/fastdeploy/demo/openai_demo.py index 1b8b5862a..308fa440f 100644 --- a/fastdeploy/demo/openai_demo.py +++ b/fastdeploy/demo/openai_demo.py @@ -14,11 +14,10 @@ # limitations under the License. """ - import openai ip = "0.0.0.0" -service_http_port = "9809" # 服务配置的 +service_http_port = "9809" # 服务配置的 client = openai.Client(base_url=f"http://{ip}:{service_http_port}/v1", api_key="EMPTY_API_KEY") @@ -42,7 +41,7 @@ response = client.completions.create( ) for chunk in response: - print(chunk.choices[0].text, end='') + print(chunk.choices[0].text, end="") print("\n") # Chat completion @@ -78,5 +77,5 @@ response = client.chat.completions.create( for chunk in response: if chunk.choices[0].delta is not None: - print(chunk.choices[0].delta.content, end='') + print(chunk.choices[0].delta.content, end="") print("\n") diff --git a/fastdeploy/demo/openai_vl_demo.py b/fastdeploy/demo/openai_vl_demo.py index 52c1095c8..9b7e68cac 100644 --- a/fastdeploy/demo/openai_vl_demo.py +++ b/fastdeploy/demo/openai_vl_demo.py @@ -14,14 +14,12 @@ # limitations under the License. 
""" - import openai print("hello") ip = "0.0.0.0" service_http_port = "9809" -client = openai.Client(base_url=f"http://{ip}:{service_http_port}/v1", - api_key="EMPTY_API_KEY") +client = openai.Client(base_url=f"http://{ip}:{service_http_port}/v1", api_key="EMPTY_API_KEY") print("world") # 非流式对话 @@ -30,23 +28,21 @@ response = client.chat.completions.create( messages=[ { "role": "system", - "content": "You are a helpful AI assistant." + "content": "You are a helpful AI assistant.", }, # system不是必需,可选 { - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": - "https://ku.baidu-int.com/vk-assets-ltd/space/2024/09/13/933d1e0a0760498e94ec0f2ccee865e0", - "detail": "high" - } - }, { - "type": "text", - "text": "请描述图片内容" - }] - } + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://ku.baidu-int.com/vk-assets-ltd/space/2024/09/13/933d1e0a0760498e94ec0f2ccee865e0", + "detail": "high", + }, + }, + {"type": "text", "text": "请描述图片内容"}, + ], + }, ], temperature=1, max_tokens=53, @@ -60,30 +56,25 @@ response = client.chat.completions.create( messages=[ { "role": "system", - "content": "You are a helpful AI assistant." + "content": "You are a helpful AI assistant.", }, # system不是必需,可选 - { - "role": "user", - "content": "List 3 countries and their capitals." - }, + {"role": "user", "content": "List 3 countries and their capitals."}, { "role": "assistant", - "content": "China(Beijing), France(Paris), Australia(Canberra)." + "content": "China(Beijing), France(Paris), Australia(Canberra).", }, { - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": - "https://ku.baidu-int.com/vk-assets-ltd/space/2024/09/13/933d1e0a0760498e94ec0f2ccee865e0", - "detail": "high" - } - }, { - "type": "text", - "text": "请描述图片内容" - }] + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://ku.baidu-int.com/vk-assets-ltd/space/2024/09/13/933d1e0a0760498e94ec0f2ccee865e0", + "detail": "high", + }, + }, + {"type": "text", "text": "请描述图片内容"}, + ], }, ], temperature=1, @@ -94,5 +85,5 @@ for chunk in response: if chunk.choices[0].delta is not None: # print(chunk.choices[0].delta, end='') # print("\n") - print(chunk.choices[0].delta.content, end='') + print(chunk.choices[0].delta.content, end="") print(response) diff --git a/fastdeploy/distributed/__init__.py b/fastdeploy/distributed/__init__.py index c40559bc8..f4ede9062 100644 --- a/fastdeploy/distributed/__init__.py +++ b/fastdeploy/distributed/__init__.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" \ No newline at end of file +""" diff --git a/fastdeploy/distributed/communication_op.py b/fastdeploy/distributed/communication_op.py index fb397df0f..a54e58f87 100644 --- a/fastdeploy/distributed/communication_op.py +++ b/fastdeploy/distributed/communication_op.py @@ -17,24 +17,31 @@ import paddle import paddle.distributed as dist from paddle.distributed import fleet + from fastdeploy.distributed.parallel_state import get_tensor_model_parallel_world_size _TP_AR = None + def use_custom_allreduce(custom_all_reduce_max_bytes: int = 8192 * 1024): hcg = fleet.get_hybrid_communicate_group() model_parallel_group = hcg.get_model_parallel_group() global _TP_AR if get_tensor_model_parallel_world_size() > 1 and paddle.is_compiled_with_cuda(): from fastdeploy.distributed.custom_all_reduce import CustomAllreduce + _TP_AR = CustomAllreduce(model_parallel_group, custom_all_reduce_max_bytes) + try: + @paddle.jit.marker.unified - def tensor_model_parallel_all_reduce(input_: paddle.Tensor) -> paddle.Tensor: + def tensor_model_parallel_all_reduce( + input_: paddle.Tensor, + ) -> paddle.Tensor: """All-reduce the input tensor across model parallel group.""" global _TP_AR - if _TP_AR is not None and _TP_AR.should_custom_ar(input_) : + if _TP_AR is not None and _TP_AR.should_custom_ar(input_): _TP_AR.all_reduce(input_, input_) elif paddle.in_dynamic_mode(): hcg = fleet.get_hybrid_communicate_group() @@ -42,5 +49,6 @@ try: dist.all_reduce(input_, group=mp_group) else: dist.all_reduce(input_) + except: - tensor_model_parallel_all_reduce=None \ No newline at end of file + tensor_model_parallel_all_reduce = None diff --git a/fastdeploy/distributed/custom_all_reduce/__init__.py b/fastdeploy/distributed/custom_all_reduce/__init__.py index 054074cf9..ec2758e29 100644 --- a/fastdeploy/distributed/custom_all_reduce/__init__.py +++ b/fastdeploy/distributed/custom_all_reduce/__init__.py @@ -14,4 +14,4 @@ from .custom_all_reduce import CustomAllreduce -__all__ = ["CustomAllreduce"] \ No newline at end of file +__all__ = ["CustomAllreduce"] diff --git a/fastdeploy/distributed/custom_all_reduce/cuda_wrapper.py b/fastdeploy/distributed/custom_all_reduce/cuda_wrapper.py index af5cc487d..22195364b 100644 --- a/fastdeploy/distributed/custom_all_reduce/cuda_wrapper.py +++ b/fastdeploy/distributed/custom_all_reduce/cuda_wrapper.py @@ -41,7 +41,7 @@ def find_loaded_library(lib_name) -> Optional[str]: the file `/proc/self/maps` contains the memory maps of the process, which includes the shared libraries loaded by the process. We can use this file to find the path of the a loaded library. 
- """ # noqa + """ found = False with open("/proc/self/maps") as f: for line in f: @@ -73,18 +73,40 @@ class CudaRTLibrary: # const char* cudaGetErrorString ( cudaError_t error ) Function("cudaGetErrorString", ctypes.c_char_p, [cudaError_t]), # ​cudaError_t cudaMalloc ( void** devPtr, size_t size ) - Function("cudaMalloc", cudaError_t, [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t]), + Function( + "cudaMalloc", + cudaError_t, + [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t], + ), # ​cudaError_t cudaFree ( void* devPtr ) Function("cudaFree", cudaError_t, [ctypes.c_void_p]), # ​cudaError_t cudaMemset ( void* devPtr, int value, size_t count ) - Function("cudaMemset", cudaError_t, [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t]), - # ​cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind ) # noqa - Function("cudaMemcpy", cudaError_t, [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, cudaMemcpyKind]), - # cudaError_t cudaIpcGetMemHandle ( cudaIpcMemHandle_t* handle, void* devPtr ) # noqa - Function("cudaIpcGetMemHandle", cudaError_t, [ctypes.POINTER(cudaIpcMemHandle_t), ctypes.c_void_p]), - # ​cudaError_t cudaIpcOpenMemHandle ( void** devPtr, cudaIpcMemHandle_t handle, unsigned int flags ) # noqa Function( - "cudaIpcOpenMemHandle", cudaError_t, [ctypes.POINTER(ctypes.c_void_p), cudaIpcMemHandle_t, ctypes.c_uint] + "cudaMemset", + cudaError_t, + [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t], + ), + # ​cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind ) + Function( + "cudaMemcpy", + cudaError_t, + [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, cudaMemcpyKind], + ), + # cudaError_t cudaIpcGetMemHandle ( cudaIpcMemHandle_t* handle, void* devPtr ) + Function( + "cudaIpcGetMemHandle", + cudaError_t, + [ctypes.POINTER(cudaIpcMemHandle_t), ctypes.c_void_p], + ), + # ​cudaError_t cudaIpcOpenMemHandle ( void** devPtr, cudaIpcMemHandle_t handle, unsigned int flags ) + Function( + "cudaIpcOpenMemHandle", + cudaError_t, + [ + ctypes.POINTER(ctypes.c_void_p), + cudaIpcMemHandle_t, + ctypes.c_uint, + ], ), ] diff --git a/fastdeploy/distributed/custom_all_reduce/custom_all_reduce.py b/fastdeploy/distributed/custom_all_reduce/custom_all_reduce.py index 3b6de6ea9..818b8bd98 100644 --- a/fastdeploy/distributed/custom_all_reduce/custom_all_reduce.py +++ b/fastdeploy/distributed/custom_all_reduce/custom_all_reduce.py @@ -13,26 +13,26 @@ # limitations under the License. -from contextlib import contextmanager import atexit import ctypes +from contextlib import contextmanager from typing import List, Optional import paddle import paddle.distributed as dist from paddle.distributed.communication.group import Group + +from fastdeploy.distributed.custom_all_reduce import cuda_wrapper from fastdeploy.model_executor.ops.gpu import ( all_reduce, dispose, + get_graph_buffer_ipc_meta, init_custom_all_reduce, meta_size, register_buffer, - get_graph_buffer_ipc_meta, register_graph_buffers, ) -from fastdeploy.distributed.custom_all_reduce import cuda_wrapper - try: meta_size() custom_ar = True @@ -47,7 +47,7 @@ class CustomAllreduce: _SUPPORTED_WORLD_SIZES = [2, 4, 6, 8] # max_size: max supported allreduce size - def __init__(self, group: Group, max_size: int=8192 * 1024) -> None: + def __init__(self, group: Group, max_size: int = 8192 * 1024) -> None: """ Args: device: the device to bind the CustomAllreduce to. 
If None, @@ -147,7 +147,12 @@ class CustomAllreduce: return inp_size < self.max_size return False - def all_reduce(self, inp: paddle.Tensor, out: paddle.Tensor = None, registered: bool = False): + def all_reduce( + self, + inp: paddle.Tensor, + out: paddle.Tensor = None, + registered: bool = False, + ): """Performs an out-of-place all reduce. If registered is True, this assumes inp's pointer is already @@ -165,7 +170,7 @@ class CustomAllreduce: @contextmanager def capture(self): """ - The main responsibility of this context manager is the + The main responsibility of this context manager is the `register_graph_buffers` call at the end of the context. It records all the buffer addresses used in the CUDA graph. """ @@ -179,22 +184,18 @@ class CustomAllreduce: def register_graph_buffers(self): handle, offset = get_graph_buffer_ipc_meta(self._ptr) - all_data = [[None, None] - for _ in range(dist.get_world_size(group=self.group))] + all_data = [[None, None] for _ in range(dist.get_world_size(group=self.group))] all_data[self.rank] = [handle, offset] ranks = sorted(dist.get_process_group_ranks(group=self.group)) for i, rank in enumerate(ranks): - dist.broadcast_object_list(all_data[i], - src=rank, - group=self.group, - device="cpu") + dist.broadcast_object_list(all_data[i], src=rank, group=self.group, device="cpu") # Unpack list of tuples to tuple of lists. handles = [d[0] for d in all_data] # type: ignore offsets = [d[1] for d in all_data] # type: ignore register_graph_buffers(self._ptr, handles, offsets) - + def custom_all_reduce(self, input: paddle.Tensor) -> Optional[paddle.Tensor]: """The main allreduce API that provides support for cuda graph.""" # When custom allreduce is disabled, this will be None. diff --git a/fastdeploy/engine/__init__.py b/fastdeploy/engine/__init__.py index c40559bc8..f4ede9062 100644 --- a/fastdeploy/engine/__init__.py +++ b/fastdeploy/engine/__init__.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" \ No newline at end of file +""" diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py index cdd9e81d9..ba3ce8312 100644 --- a/fastdeploy/engine/args_utils.py +++ b/fastdeploy/engine/args_utils.py @@ -13,15 +13,21 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" + import json from dataclasses import asdict, dataclass from dataclasses import fields as dataclass_fields from typing import Any, Dict, List, Optional -from fastdeploy.engine.config import (CacheConfig, Config, - GraphOptimizationConfig, ModelConfig, - ParallelConfig, SpeculativeConfig, - TaskOption) +from fastdeploy.engine.config import ( + CacheConfig, + Config, + GraphOptimizationConfig, + ModelConfig, + ParallelConfig, + SpeculativeConfig, + TaskOption, +) from fastdeploy.scheduler.config import SchedulerConfig from fastdeploy.utils import FlexibleArgumentParser @@ -323,365 +329,429 @@ class EngineArgs: """ # Model parameters group model_group = parser.add_argument_group("Model Configuration") - model_group.add_argument("--model", - type=str, - default=EngineArgs.model, - help="Model name or path to be used.") - model_group.add_argument("--model-config-name", - type=nullable_str, - default=EngineArgs.model_config_name, - help="The model configuration file name.") + model_group.add_argument( + "--model", + type=str, + default=EngineArgs.model, + help="Model name or path to be used.", + ) + model_group.add_argument( + "--model-config-name", + type=nullable_str, + default=EngineArgs.model_config_name, + help="The model configuration file name.", + ) model_group.add_argument( "--tokenizer", type=nullable_str, default=EngineArgs.tokenizer, - help= - "Tokenizer name or path (defaults to model path if not specified)." + help="Tokenizer name or path (defaults to model path if not specified).", ) model_group.add_argument( "--max-model-len", type=int, default=EngineArgs.max_model_len, - help="Maximum context length supported by the model.") + help="Maximum context length supported by the model.", + ) model_group.add_argument( "--block-size", type=int, default=EngineArgs.block_size, - help="Number of tokens processed in one block.") - model_group.add_argument("--task", - type=str, - default=EngineArgs.task, - help="Task to be executed by the model.") + help="Number of tokens processed in one block.", + ) + model_group.add_argument( + "--task", + type=str, + default=EngineArgs.task, + help="Task to be executed by the model.", + ) model_group.add_argument( "--use-warmup", type=int, default=EngineArgs.use_warmup, - help="Flag to indicate whether to use warm-up before inference.") + help="Flag to indicate whether to use warm-up before inference.", + ) model_group.add_argument( "--limit-mm-per-prompt", default=EngineArgs.limit_mm_per_prompt, type=json.loads, - help="Limitation of numbers of multi-modal data.") + help="Limitation of numbers of multi-modal data.", + ) model_group.add_argument( "--mm-processor-kwargs", default=EngineArgs.mm_processor_kwargs, type=json.loads, - help="Additional keyword arguments for the multi-modal processor.") - model_group.add_argument("--enable-mm", - action='store_true', - default=EngineArgs.enable_mm, - help="Flag to enable multi-modal model.") - model_group.add_argument("--reasoning-parser", - type=str, - default=EngineArgs.reasoning_parser, - help="Flag specifies the reasoning parser to use for extracting "\ - "reasoning content from the model output") + help="Additional keyword arguments for the multi-modal processor.", + ) + model_group.add_argument( + "--enable-mm", + action="store_true", + default=EngineArgs.enable_mm, + help="Flag to enable multi-modal model.", + ) + model_group.add_argument( + "--reasoning-parser", + type=str, + default=EngineArgs.reasoning_parser, + help="Flag specifies the reasoning parser to use for extracting " + "reasoning content 
from the model output", + ) model_group.add_argument( "--speculative-config", type=json.loads, default=EngineArgs.speculative_config, - help="Configuration for speculative execution.") + help="Configuration for speculative execution.", + ) model_group.add_argument( "--dynamic-load-weight", - action='store_true', + action="store_true", default=EngineArgs.dynamic_load_weight, - help="Flag to indicate whether to load weight dynamically.") + help="Flag to indicate whether to load weight dynamically.", + ) model_group.add_argument( "--load-strategy", type=str, default=EngineArgs.load_strategy, - help="Flag to dynamic load strategy.") - model_group.add_argument("--engine-worker-queue-port", - type=int, - default=EngineArgs.engine_worker_queue_port, - help="port for engine worker queue") - model_group.add_argument("--quantization", - type=str, - default=EngineArgs.quantization, - help="Quantization name for the model, currentlly support " \ - "'wint8', 'wint4'," \ - "default is None. The priority of this configuration "\ - "is lower than that of the config file. " \ - "More complex quantization methods need to be configured via the config file.") - model_group.add_argument("--use-cudagraph", - action='store_true', - default=EngineArgs.use_cudagraph, - help="Flags to enable cuda graph.") - model_group.add_argument("--graph-optimization-config", - type=json.loads, - default=EngineArgs.graph_optimization_config, - help="") - model_group.add_argument("--guided-decoding-backend", - type=str, - default=EngineArgs.guided_decoding_backend, - help="Guided Decoding Backend") + help="Flag to dynamic load strategy.", + ) + model_group.add_argument( + "--engine-worker-queue-port", + type=int, + default=EngineArgs.engine_worker_queue_port, + help="port for engine worker queue", + ) + model_group.add_argument( + "--quantization", + type=str, + default=EngineArgs.quantization, + help="Quantization name for the model, currentlly support " + "'wint8', 'wint4'," + "default is None. The priority of this configuration " + "is lower than that of the config file. " + "More complex quantization methods need to be configured via the config file.", + ) + model_group.add_argument( + "--use-cudagraph", + action="store_true", + default=EngineArgs.use_cudagraph, + help="Flags to enable cuda graph.", + ) + model_group.add_argument( + "--graph-optimization-config", + type=json.loads, + default=EngineArgs.graph_optimization_config, + help="", + ) + model_group.add_argument( + "--guided-decoding-backend", + type=str, + default=EngineArgs.guided_decoding_backend, + help="Guided Decoding Backend", + ) model_group.add_argument( "--guided-decoding-disable-any-whitespace", type=str, default=EngineArgs.guided_decoding_disable_any_whitespace, - help= - "Disabled any whitespaces when using guided decoding backend XGrammar." + help="Disabled any whitespaces when using guided decoding backend XGrammar.", + ) + model_group.add_argument( + "--enable-logprob", + action="store_true", + default=EngineArgs.enable_logprob, + help="Enable output of token-level log probabilities.", ) - model_group.add_argument("--enable-logprob", - action="store_true", - default=EngineArgs.enable_logprob, - help="Enable output of token-level log probabilities." 
- ) # Parallel processing parameters group parallel_group = parser.add_argument_group("Parallel Configuration") - parallel_group.add_argument("--tensor-parallel-size", - "-tp", - type=int, - default=EngineArgs.tensor_parallel_size, - help="Degree of tensor parallelism.") - parallel_group.add_argument("--enable-custom-all-reduce", - action='store_true', - default=EngineArgs.enable_custom_all_reduce, - help="Flag to enable custom all-reduce.") + parallel_group.add_argument( + "--tensor-parallel-size", + "-tp", + type=int, + default=EngineArgs.tensor_parallel_size, + help="Degree of tensor parallelism.", + ) + parallel_group.add_argument( + "--enable-custom-all-reduce", + action="store_true", + default=EngineArgs.enable_custom_all_reduce, + help="Flag to enable custom all-reduce.", + ) parallel_group.add_argument( "--max-num-seqs", type=int, default=EngineArgs.max_num_seqs, - help="Maximum number of sequences per iteration.") + help="Maximum number of sequences per iteration.", + ) parallel_group.add_argument( "--num-gpu-blocks-override", type=int, default=EngineArgs.num_gpu_blocks_override, - help="Override for the number of GPU blocks.") + help="Override for the number of GPU blocks.", + ) parallel_group.add_argument( "--max-num-batched-tokens", type=int, default=EngineArgs.max_num_batched_tokens, - help="Maximum number of tokens to batch together.") + help="Maximum number of tokens to batch together.", + ) parallel_group.add_argument( "--gpu-memory-utilization", type=float, default=EngineArgs.gpu_memory_utilization, - help="Fraction of GPU memory to be utilized.") + help="Fraction of GPU memory to be utilized.", + ) - parallel_group.add_argument("--data-parallel-size", - type=int, - default=EngineArgs.data_parallel_size, - help="Degree of data parallelism.") - parallel_group.add_argument("--enable-expert-parallel", - action='store_true', - default=EngineArgs.enable_expert_parallel, - help="Enable expert parallelism.") + parallel_group.add_argument( + "--data-parallel-size", + type=int, + default=EngineArgs.data_parallel_size, + help="Degree of data parallelism.", + ) + parallel_group.add_argument( + "--enable-expert-parallel", + action="store_true", + default=EngineArgs.enable_expert_parallel, + help="Enable expert parallelism.", + ) # CacheConfig parameters group cache_group = parser.add_argument_group("Cache Configuration") - cache_group.add_argument("--kv-cache-ratio", - type=float, - default=EngineArgs.kv_cache_ratio, - help="Ratio of tokens to process in a block.") + cache_group.add_argument( + "--kv-cache-ratio", + type=float, + default=EngineArgs.kv_cache_ratio, + help="Ratio of tokens to process in a block.", + ) cache_group.add_argument( "--swap-space", type=float, default=EngineArgs.swap_space, - help="The amount of CPU memory to offload to.") + help="The amount of CPU memory to offload to.", + ) - cache_group.add_argument("--cache-queue-port", - type=int, - default=EngineArgs.cache_queue_port, - help="port for cache queue") - cache_group.add_argument("--static-decode-blocks", - type=int, - default=EngineArgs.static_decode_blocks, - help="Static decoding blocks num.") + cache_group.add_argument( + "--cache-queue-port", + type=int, + default=EngineArgs.cache_queue_port, + help="port for cache queue", + ) + cache_group.add_argument( + "--static-decode-blocks", + type=int, + default=EngineArgs.static_decode_blocks, + help="Static decoding blocks num.", + ) # Cluster system parameters group system_group = parser.add_argument_group("System Configuration") system_group.add_argument( 
"--dist-init-ip", default=EngineArgs.dist_init_ip, - help= - "IP addresses of master node.") + help="IP addresses of master node.", + ) system_group.add_argument( "--nnodes", type=int, default=EngineArgs.nnodes, - help= - "The number of all nodes.") + help="The number of all nodes.", + ) system_group.add_argument( "--node-rank", type=int, default=EngineArgs.node_rank, - help= - "node rank id (range [0, nnodes)).") - - + help="node rank id (range [0, nnodes)).", + ) # Performance tuning parameters group perf_group = parser.add_argument_group("Performance Tuning") - perf_group.add_argument("--enable-prefix-caching", - action='store_true', - default=EngineArgs.enable_prefix_caching, - help="Flag to enable prefix caching.") + perf_group.add_argument( + "--enable-prefix-caching", + action="store_true", + default=EngineArgs.enable_prefix_caching, + help="Flag to enable prefix caching.", + ) - perf_group.add_argument("--splitwise-role", - type=str, - default=EngineArgs.splitwise_role, - help="Role of splitwise. Default is \ - 'mixed'. (prefill, decode, mixed)") + perf_group.add_argument( + "--splitwise-role", + type=str, + default=EngineArgs.splitwise_role, + help="Role of splitwise. Default is \ + 'mixed'. (prefill, decode, mixed)", + ) - perf_group.add_argument("--innode-prefill-ports", - type=lambda s: s.split(",") if s else None, - default=EngineArgs.innode_prefill_ports, - help="port for innode prefill") + perf_group.add_argument( + "--innode-prefill-ports", + type=lambda s: s.split(",") if s else None, + default=EngineArgs.innode_prefill_ports, + help="port for innode prefill", + ) - perf_group.add_argument("--enable-chunked-prefill", - action='store_true', - default=EngineArgs.enable_chunked_prefill, - help="Flag to enable chunked prefill.") - perf_group.add_argument("--max-num-partial-prefills", - type=int, - default=EngineArgs.max_num_partial_prefills, - help="For chunked prefill, Maximum number \ - of concurrent partial prefill requests.") + perf_group.add_argument( + "--enable-chunked-prefill", + action="store_true", + default=EngineArgs.enable_chunked_prefill, + help="Flag to enable chunked prefill.", + ) + perf_group.add_argument( + "--max-num-partial-prefills", + type=int, + default=EngineArgs.max_num_partial_prefills, + help="For chunked prefill, Maximum number \ + of concurrent partial prefill requests.", + ) perf_group.add_argument( "--max-long-partial-prefills", type=int, default=EngineArgs.max_long_partial_prefills, - help= - ("For chunked prefill, the maximum number of prompts longer than long-prefill-token-threshold" - "that will be prefilled concurrently.")) + help=( + "For chunked prefill, the maximum number of prompts longer than long-prefill-token-threshold" + "that will be prefilled concurrently." 
+ ), + ) perf_group.add_argument( "--long-prefill-token-threshold", type=int, default=EngineArgs.long_prefill_token_threshold, - help=("For chunked prefill, the threshold number of" - " tokens for a prompt to be considered long.")) + help=("For chunked prefill, the threshold number of" " tokens for a prompt to be considered long."), + ) perf_group.add_argument( "--cache-transfer-protocol", type=str, default=EngineArgs.cache_transfer_protocol, - help="support protocol list, comma separated, default is ipc") + help="support protocol list, comma separated, default is ipc", + ) - perf_group.add_argument("--pd-comm-port", - type=lambda s: s.split(",") if s else None, - default=EngineArgs.pd_comm_port, - help="port for splitwise communication.") + perf_group.add_argument( + "--pd-comm-port", + type=lambda s: s.split(",") if s else None, + default=EngineArgs.pd_comm_port, + help="port for splitwise communication.", + ) - perf_group.add_argument("--rdma-comm-ports", - type=lambda s: s.split(",") if s else None, - default=EngineArgs.rdma_comm_ports, - help="ports for rdma communication.") + perf_group.add_argument( + "--rdma-comm-ports", + type=lambda s: s.split(",") if s else None, + default=EngineArgs.rdma_comm_ports, + help="ports for rdma communication.", + ) # Scheduler parameters group scheduler_group = parser.add_argument_group("Scheduler") scheduler_group.add_argument( "--scheduler-name", default=EngineArgs.scheduler_name, - help= - f"Scheduler name to be used. Default is {EngineArgs.scheduler_name}. (local,global)" + help=f"Scheduler name to be used. Default is {EngineArgs.scheduler_name}. (local,global)", ) scheduler_group.add_argument( "--scheduler-max-size", type=int, default=EngineArgs.scheduler_max_size, - help= - f"Size of scheduler. Default is {EngineArgs.scheduler_max_size}. (Local)" + help=f"Size of scheduler. Default is {EngineArgs.scheduler_max_size}. (Local)", ) scheduler_group.add_argument( "--scheduler-ttl", type=int, default=EngineArgs.scheduler_ttl, - help= - f"TTL of request. Default is {EngineArgs.scheduler_ttl} seconds. (local,global)" + help=f"TTL of request. Default is {EngineArgs.scheduler_ttl} seconds. (local,global)", ) scheduler_group.add_argument( "--scheduler-host", default=EngineArgs.scheduler_host, - help= - f"Host address of redis. Default is {EngineArgs.scheduler_host}. (global)" + help=f"Host address of redis. Default is {EngineArgs.scheduler_host}. (global)", ) scheduler_group.add_argument( "--scheduler-port", type=int, default=EngineArgs.scheduler_port, - help= - f"Port of redis. Default is {EngineArgs.scheduler_port}. (global)") + help=f"Port of redis. Default is {EngineArgs.scheduler_port}. (global)", + ) scheduler_group.add_argument( "--scheduler-db", type=int, default=EngineArgs.scheduler_db, - help=f"DB of redis. Default is {EngineArgs.scheduler_db}. (global)" + help=f"DB of redis. Default is {EngineArgs.scheduler_db}. (global)", ) scheduler_group.add_argument( "--scheduler-password", default=EngineArgs.scheduler_password, - help= - f"Password of redis. Default is {EngineArgs.scheduler_password}. (global)" + help=f"Password of redis. Default is {EngineArgs.scheduler_password}. (global)", ) scheduler_group.add_argument( "--scheduler-topic", default=EngineArgs.scheduler_topic, - help= - f"Topic of scheduler. Defaule is {EngineArgs.scheduler_topic}. (global)" + help=f"Topic of scheduler. Defaule is {EngineArgs.scheduler_topic}. 
(global)", ) scheduler_group.add_argument( "--scheduler-min-load-score", type=float, default=EngineArgs.scheduler_min_load_score, - help= - f"Minimum load score for task assignment. Default is {EngineArgs.scheduler_min_load_score} (global)" + help=f"Minimum load score for task assignment. Default is {EngineArgs.scheduler_min_load_score} (global)", ) scheduler_group.add_argument( "--scheduler-load-shards-num", type=int, default=EngineArgs.scheduler_load_shards_num, - help=("Number of shards for load balancing table. Default is " - f"{EngineArgs.scheduler_load_shards_num} (global)")) + help=( + "Number of shards for load balancing table. Default is " + f"{EngineArgs.scheduler_load_shards_num} (global)" + ), + ) scheduler_group.add_argument( "--scheduler-sync-period", type=int, default=EngineArgs.scheduler_sync_period, help=f"SplitWise Use, node load sync period, " - f"Default is {EngineArgs.scheduler_sync_period}ms. (global)") + f"Default is {EngineArgs.scheduler_sync_period}ms. (global)", + ) scheduler_group.add_argument( "--scheduler-expire-period", type=int, default=EngineArgs.scheduler_expire_period, help=f"SplitWise Use, node will not be scheduled after " f"expire-period ms not sync load, Default is " - f"{EngineArgs.scheduler_expire_period}ms. (global)") + f"{EngineArgs.scheduler_expire_period}ms. (global)", + ) scheduler_group.add_argument( "--scheduler-release-load-expire-period", type=int, default=EngineArgs.scheduler_release_load_expire_period, help=f"SplitWise Use, scheduler will release req load after " f"expire period(s). Default is " - f"{EngineArgs.scheduler_release_load_expire_period}. (global)") + f"{EngineArgs.scheduler_release_load_expire_period}. (global)", + ) scheduler_group.add_argument( "--scheduler-reader-parallel", type=int, default=EngineArgs.scheduler_reader_parallel, help=f"SplitWise Use, Results Reader Sync Parallel, " - f"Default is {EngineArgs.scheduler_reader_parallel}. (global)") + f"Default is {EngineArgs.scheduler_reader_parallel}. (global)", + ) scheduler_group.add_argument( "--scheduler-writer-parallel", type=int, default=EngineArgs.scheduler_writer_parallel, help=f"SplitWise Use, Results Writer Sync Parallel, " - f"Default is {EngineArgs.scheduler_writer_parallel}. (global)") + f"Default is {EngineArgs.scheduler_writer_parallel}. (global)", + ) scheduler_group.add_argument( "--scheduler-reader-batch-size", type=int, default=EngineArgs.scheduler_reader_batch_size, help=f"SplitWise Use, Results Reader Batch Size, " - f"Default is {EngineArgs.scheduler_reader_batch_size}. (global)") + f"Default is {EngineArgs.scheduler_reader_batch_size}. (global)", + ) scheduler_group.add_argument( "--scheduler-writer-batch-size", type=int, default=EngineArgs.scheduler_writer_batch_size, help=f"SplitWise Use, Results Writer Batch Size, " - f"Default is {EngineArgs.scheduler_writer_batch_size}. (global)") + f"Default is {EngineArgs.scheduler_writer_batch_size}. (global)", + ) return parser @@ -690,21 +760,19 @@ class EngineArgs: """ Create an instance of EngineArgs from command line arguments. """ - return cls( - **{ - field.name: getattr(args, field.name) - for field in dataclass_fields(cls) - }) + return cls(**{field.name: getattr(args, field.name) for field in dataclass_fields(cls)}) def create_model_config(self) -> ModelConfig: """ Create and return a ModelConfig object based on the current settings. 
""" - return ModelConfig(model_name_or_path=self.model, - config_json_file=self.model_config_name, - quantization=self.quantization, - dynamic_load_weight=self.dynamic_load_weight, - load_strategy=self.load_strategy) + return ModelConfig( + model_name_or_path=self.model, + config_json_file=self.model_config_name, + quantization=self.quantization, + dynamic_load_weight=self.dynamic_load_weight, + load_strategy=self.load_strategy, + ) def create_cache_config(self, model_cfg) -> CacheConfig: """ @@ -728,8 +796,7 @@ class EngineArgs: ) def create_speculative_config(self) -> SpeculativeConfig: - """ - """ + """ """ if self.speculative_config is not None: return SpeculativeConfig(**self.speculative_config) else: @@ -742,9 +809,11 @@ class EngineArgs: prefix = "scheduler_" prefix_len = len(prefix) extra_params = [ - "max_model_len", "enable_chunked_prefill", - "max_num_partial_prefills", "max_long_partial_prefills", - "long_prefill_token_threshold" + "max_model_len", + "enable_chunked_prefill", + "max_num_partial_prefills", + "max_long_partial_prefills", + "long_prefill_token_threshold", ] all = asdict(self) @@ -765,7 +834,7 @@ class EngineArgs: tensor_parallel_size=self.tensor_parallel_size, enable_expert_parallel=self.enable_expert_parallel, data_parallel_size=self.data_parallel_size, - enable_custom_all_reduce=self.enable_custom_all_reduce + enable_custom_all_reduce=self.enable_custom_all_reduce, ) def create_graph_optimization_config(self) -> GraphOptimizationConfig: @@ -782,8 +851,7 @@ class EngineArgs: Create and return a Config object based on the current settings. """ model_cfg = self.create_model_config() - if not model_cfg.is_unified_ckpt and hasattr(model_cfg, - 'tensor_parallel_size'): + if not model_cfg.is_unified_ckpt and hasattr(model_cfg, "tensor_parallel_size"): self.tensor_parallel_size = model_cfg.tensor_parallel_size if self.max_num_batched_tokens is None: if self.enable_chunked_prefill: @@ -795,11 +863,11 @@ class EngineArgs: graph_opt_cfg = self.create_graph_optimization_config() graph_opt_cfg.update_use_cudagraph(self.use_cudagraph) - assert not (self.use_cudagraph and self.enable_prefix_caching), \ - "Prefix caching cannot be used with CUDA graph" + assert not (self.use_cudagraph and self.enable_prefix_caching), "Prefix caching cannot be used with CUDA graph" - assert not (self.tensor_parallel_size<=1 and self.enable_custom_all_reduce), \ - "enable_custom_all_reduce must be used with tensor_parallel_size>1" + assert not ( + self.tensor_parallel_size <= 1 and self.enable_custom_all_reduce + ), "enable_custom_all_reduce must be used with tensor_parallel_size>1" return Config( model_name_or_path=self.model, @@ -831,5 +899,5 @@ class EngineArgs: guided_decoding_backend=self.guided_decoding_backend, disable_any_whitespace=self.guided_decoding_disable_any_whitespace, enable_custom_all_reduce=self.enable_custom_all_reduce, - enable_logprob = self.enable_logprob, + enable_logprob=self.enable_logprob, ) diff --git a/fastdeploy/engine/config.py b/fastdeploy/engine/config.py index 3326e8321..96c860d87 100644 --- a/fastdeploy/engine/config.py +++ b/fastdeploy/engine/config.py @@ -23,8 +23,14 @@ from typing import Any, Dict, List, Literal, Optional from fastdeploy import envs from fastdeploy.platforms import current_platform from fastdeploy.scheduler import SchedulerConfig -from fastdeploy.utils import (ceil_div, check_unified_ckpt, get_host_ip, - is_port_available, get_random_port, llm_logger) +from fastdeploy.utils import ( + ceil_div, + check_unified_ckpt, + get_host_ip, + 
get_random_port, + is_port_available, + llm_logger, +) TaskOption = Literal["generate"] @@ -39,13 +45,15 @@ class ModelConfig: model_name_or_path (str): Name or path of the model. """ - def __init__(self, - model_name_or_path: str, - config_json_file: str = "config.json", - dynamic_load_weight: bool = False, - load_strategy: str = "ipc_snapshot", - quantization: str = None, - download_dir: Optional[str] = None): + def __init__( + self, + model_name_or_path: str, + config_json_file: str = "config.json", + dynamic_load_weight: bool = False, + load_strategy: str = "ipc_snapshot", + quantization: str = None, + download_dir: Optional[str] = None, + ): """ Initialize the ModelConfig class. @@ -64,11 +72,9 @@ class ModelConfig: if os.path.isfile(model_name_or_path): try: from paddleformers.transformers import AutoConfig + config = AutoConfig.from_pretrained(model_name_or_path) - config_dict = { - k: v - for k, v in vars(config).items() if not k.startswith('_') - } + config_dict = {k: v for k, v in vars(config).items() if not k.startswith("_")} for key, value in config_dict.items(): setattr(self, key, value) except Exception: @@ -115,8 +121,7 @@ class ModelConfig: if not hasattr(self, "mla_use_absorb"): self.mla_use_absorb = False if not hasattr(self, "head_dim"): - assert hasattr(self, "hidden_size") and hasattr( - self, "num_attention_heads") + assert hasattr(self, "hidden_size") and hasattr(self, "num_attention_heads") self.head_dim = self.hidden_size // self.num_attention_heads def read_from_env(self): @@ -132,11 +137,9 @@ class ModelConfig: if not hasattr(self, key.lower()): if os.getenv(key, None): value = eval(os.getenv(key)) - llm_logger.info( - f"Get parameter `{key}` = {value} from environment.") + llm_logger.info(f"Get parameter `{key}` = {value} from environment.") else: - llm_logger.info( - f"Parameter `{key}` will use default value {value}.") + llm_logger.info(f"Parameter `{key}` will use default value {value}.") setattr(self, key.lower(), value) reset_config_value("COMPRESSION_RATIO", 1.0) @@ -153,8 +156,7 @@ class ModelConfig: llm_logger.info("Model Configuration Information :") for k, v in self.__dict__.items(): llm_logger.info("{:<20}:{:<6}{}".format(k, "", v)) - llm_logger.info( - "=============================================================") + llm_logger.info("=============================================================") class CacheConfig: @@ -211,8 +213,7 @@ class CacheConfig: self.enc_dec_block_num = enc_dec_block_num self.cache_dtype = cache_dtype if hasattr(model_cfg, "quantization_config"): - self.cache_dtype = model_cfg.quantization_config.get( - "kv_cache_quant_type", cache_dtype) + self.cache_dtype = model_cfg.quantization_config.get("kv_cache_quant_type", cache_dtype) self.enable_chunked_prefill = enable_chunked_prefill self.rdma_comm_ports = rdma_comm_ports @@ -220,7 +221,7 @@ class CacheConfig: self.pd_comm_port = pd_comm_port if rdma_comm_ports is not None and isinstance(rdma_comm_ports, str): - self.rdma_comm_ports = rdma_comm_ports.split(',') + self.rdma_comm_ports = rdma_comm_ports.split(",") if pd_comm_port is not None and isinstance(pd_comm_port, str): self.pd_comm_port = [int(port) for port in pd_comm_port.split(",")] @@ -236,41 +237,39 @@ class CacheConfig: self.cache_queue_port = cache_queue_port self.swap_space = swap_space - if (hasattr(self.model_cfg, "num_key_value_heads") - and hasattr(self.model_cfg, "num_key_value_heads") - and self.model_cfg.num_key_value_heads is not None - and int(self.model_cfg.num_key_value_heads) > 0): + if ( + 
hasattr(self.model_cfg, "num_key_value_heads") + and hasattr(self.model_cfg, "num_key_value_heads") + and self.model_cfg.num_key_value_heads is not None + and int(self.model_cfg.num_key_value_heads) > 0 + ): kv_num_head = int(self.model_cfg.num_key_value_heads) else: kv_num_head = self.model_cfg.num_attention_heads self.model_cfg.kv_num_head = kv_num_head # TODO check name - if "int4" in self.cache_dtype.lower( - ) or "float4" in self.cache_dtype.lower(): + if "int4" in self.cache_dtype.lower() or "float4" in self.cache_dtype.lower(): byte_size = 0.5 self.cache_dtype = "uint8" - elif "int8" in self.cache_dtype.lower( - ) or "float8" in self.cache_dtype.lower(): + elif "int8" in self.cache_dtype.lower() or "float8" in self.cache_dtype.lower(): self.cache_dtype = "uint8" byte_size = 1 else: byte_size = 2 self.each_token_cache_space = int( - self.model_cfg.num_layers * kv_num_head * self.model_cfg.head_dim * - byte_size) - self.bytes_per_block = int(self.each_token_cache_space * - self.block_size) + self.model_cfg.num_layers * kv_num_head * self.model_cfg.head_dim * byte_size + ) + self.bytes_per_block = int(self.each_token_cache_space * self.block_size) self.bytes_per_layer_per_block = int( - self.block_size * self.model_cfg.kv_num_head * - self.model_cfg.head_dim // tensor_parallel_size * byte_size) + self.block_size * self.model_cfg.kv_num_head * self.model_cfg.head_dim // tensor_parallel_size * byte_size + ) if self.swap_space is None: self.num_cpu_blocks = 0 else: - self.num_cpu_blocks = int(self.swap_space * 1024**3 / - self.bytes_per_block) + self.num_cpu_blocks = int(self.swap_space * 1024**3 / self.bytes_per_block) self._verify_args() def metrics_info(self): @@ -279,12 +278,9 @@ class CacheConfig: def _verify_args(self): if self.gpu_memory_utilization > 1.0: - raise ValueError( - "GPU memory utilization must be less than 1.0. Got " - f"{self.gpu_memory_utilization}.") + raise ValueError("GPU memory utilization must be less than 1.0. Got " f"{self.gpu_memory_utilization}.") if self.kv_cache_ratio > 1.0: - raise ValueError("KV cache ratio must be less than 1.0. Got " - f"{self.kv_cache_ratio}.") + raise ValueError("KV cache ratio must be less than 1.0. 
Got " f"{self.kv_cache_ratio}.") def postprocess(self, num_total_tokens, number_of_tasks): """ @@ -293,27 +289,24 @@ class CacheConfig: self.dec_token_num = self.enc_dec_block_num * self.block_size if self.num_gpu_blocks_override is not None: self.total_block_num = self.num_gpu_blocks_override - self.prefill_kvcache_block_num = int(self.total_block_num * - self.kv_cache_ratio) + self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio) else: length = num_total_tokens // number_of_tasks - block_num = (length + self.block_size - 1 + - self.dec_token_num) // self.block_size + block_num = (length + self.block_size - 1 + self.dec_token_num) // self.block_size self.total_block_num = block_num * number_of_tasks self.prefill_kvcache_block_num = self.total_block_num - llm_logger.info( - f"Doing profile, the total_block_num:{self.total_block_num}") + llm_logger.info(f"Doing profile, the total_block_num:{self.total_block_num}") def reset(self, num_gpu_blocks): """ reset gpu block number """ self.total_block_num = num_gpu_blocks - self.prefill_kvcache_block_num = int(self.total_block_num * - self.kv_cache_ratio) + self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio) llm_logger.info( - (f"Reset block num, the total_block_num:{self.total_block_num}," - f" prefill_kvcache_block_num:{self.prefill_kvcache_block_num}")) + f"Reset block num, the total_block_num:{self.total_block_num}," + f" prefill_kvcache_block_num:{self.prefill_kvcache_block_num}" + ) def print(self): """ @@ -323,8 +316,7 @@ class CacheConfig: llm_logger.info("Cache Configuration Information :") for k, v in self.__dict__.items(): llm_logger.info("{:<20}:{:<6}{}".format(k, "", v)) - llm_logger.info( - "=============================================================") + llm_logger.info("=============================================================") class SpeculativeConfig: @@ -340,14 +332,16 @@ class SpeculativeConfig: benchmark_mode (bool): Whether to use benchmark mode. """ - def __init__(self, - method: Optional[str] = None, - num_speculative_tokens: Optional[int] = 1, - model: Optional[str] = None, - quantization: Optional[str] = "WINT8", - max_model_len: Optional[int] = None, - benchmark_mode: bool = False, - **kwargs): + def __init__( + self, + method: Optional[str] = None, + num_speculative_tokens: Optional[int] = 1, + model: Optional[str] = None, + quantization: Optional[str] = "WINT8", + max_model_len: Optional[int] = None, + benchmark_mode: bool = False, + **kwargs, + ): self.model_name_or_path = model self.method = method self.num_speculative_tokens = num_speculative_tokens @@ -381,8 +375,7 @@ class SpeculativeConfig: self.config_path = os.path.join(self.model_name_or_path, "config.json") if os.path.exists(self.config_path): - self.model_config = json.load( - open(self.config_path, 'r', encoding='utf-8')) + self.model_config = json.load(open(self.config_path, "r", encoding="utf-8")) def reset(self): """ @@ -414,10 +407,7 @@ class SpeculativeConfig: """ Convert speculative_config to json string. 
""" - return json.dumps({ - key: value - for key, value in self.__dict__.items() if value is not None - }) + return json.dumps({key: value for key, value in self.__dict__.items() if value is not None}) def print(self): """ @@ -427,8 +417,7 @@ class SpeculativeConfig: llm_logger.info("Speculative Decoding Configuration Information :") for k, v in self.__dict__.items(): llm_logger.info("{:<20}:{:<6}{}".format(k, "", v)) - llm_logger.info( - "=============================================================") + llm_logger.info("=============================================================") def __str__(self) -> str: return self.to_json_string() @@ -440,7 +429,7 @@ class GraphOptimizationConfig: graph_opt_level: Optional[int] = 0, use_cudagraph: Optional[bool] = None, cudagraph_capture_sizes: Optional[List[int]] = None, - **kwargs + **kwargs, ): """ Graph Optimization Configuration class. @@ -460,10 +449,7 @@ class GraphOptimizationConfig: """ Convert speculative_config to json string. """ - return json.dumps({ - key: value - for key, value in self.__dict__.items() - }) + return json.dumps({key: value for key, value in self.__dict__.items()}) def __str__(self) -> str: return self.to_json_string() @@ -473,22 +459,30 @@ class GraphOptimizationConfig: graph_opt_level: Optional[int] = None, use_cudagraph: Optional[bool] = None, cudagraph_capture_sizes: Optional[List[int]] = None, - **kwargs + **kwargs, ) -> None: - """ Check the legality of parameters passed in from the command line """ + """Check the legality of parameters passed in from the command line""" if graph_opt_level is not None: - assert graph_opt_level in [0, 1, 2], "In graph optimization config, graph_opt_level can only take the values of 0, 1 and 2." + assert graph_opt_level in [ + 0, + 1, + 2, + ], "In graph optimization config, graph_opt_level can only take the values of 0, 1 and 2." if use_cudagraph is not None: assert type(use_cudagraph) is bool, "In graph optimization config, type of use_cudagraph must is bool." if cudagraph_capture_sizes is not None: - assert type(cudagraph_capture_sizes) is list, "In graph optimization config, type of cudagraph_capture_sizes must is list." - assert len(cudagraph_capture_sizes) > 0, "In graph optimization config, When opening the CUDA graph, it is forbidden to set the capture sizes to an empty list." + assert ( + type(cudagraph_capture_sizes) is list + ), "In graph optimization config, type of cudagraph_capture_sizes must is list." + assert ( + len(cudagraph_capture_sizes) > 0 + ), "In graph optimization config, When opening the CUDA graph, it is forbidden to set the capture sizes to an empty list." for key, value in kwargs.items(): raise ValueError(f"Invalid --graph-optimization-config parameter {key}") - def update_use_cudagraph(self, argument:bool): + def update_use_cudagraph(self, argument: bool): """ Unified user specifies the use_cudagraph parameter through two methods, '--use-cudagraph' and '--graph-optimization-config' @@ -499,9 +493,12 @@ class GraphOptimizationConfig: else: # User both set '--use-cudagraph' and '--graph-optimization-config' if self.use_cudagraph is False and argument is True: - raise ValueError("Invalid parameter: Cannot set --use-cudagraph and --graph-optimization-config '{\"use_cudagraph\":false}' simultaneously.") + raise ValueError( + "Invalid parameter: Cannot set --use-cudagraph and --graph-optimization-config '{\"use_cudagraph\":false}' simultaneously." + ) argument = self.use_cudagraph + class ParallelConfig: """ Configuration for parallelism. 
@@ -544,8 +541,7 @@ class ParallelConfig: llm_logger.info("Parallel Configuration Information :") for k, v in self.__dict__.items(): llm_logger.info("{:<20}:{:<6}{}".format(k, "", v)) - llm_logger.info( - "=============================================================") + llm_logger.info("=============================================================") @dataclass @@ -560,6 +556,7 @@ class CommitConfig: cuda_version: CUDA version string compiler_version: CXX compiler version string """ + fastdeploy_commit: str = "" paddle_version: str = "" paddle_commit: str = "" @@ -573,7 +570,7 @@ class CommitConfig: def _load_from_version_file(self, file_path: str = "fastdeploy/version.txt"): """Internal method to load version info from file""" try: - with open(file_path, 'r') as f: + with open(file_path, "r") as f: for line in f: line = line.strip() if line.startswith("fastdeploy GIT COMMIT ID:"): @@ -589,7 +586,7 @@ class CommitConfig: except FileNotFoundError: llm_logger.info(f"Warning: Version file not found at {file_path}") except Exception as e: - llm_logger.info(f"Warning: Could not read version file - {str(e)}") + llm_logger.info(f"Warning: Could not read version file - {e!s}") def print(self): """ @@ -599,8 +596,7 @@ class CommitConfig: llm_logger.info("Fasedeploy Commit Information :") for k, v in self.__dict__.items(): llm_logger.info("{:<20}:{:<6}{}".format(k, "", v)) - llm_logger.info( - "=============================================================") + llm_logger.info("=============================================================") class Config: @@ -701,7 +697,7 @@ class Config: self.max_num_batched_tokens = max_num_batched_tokens self.tensor_parallel_size = tensor_parallel_size self.dist_init_ip = dist_init_ip - + self.nnode = nnodes self.node_rank = node_rank if self.dist_init_ip is None: @@ -728,7 +724,6 @@ class Config: self.disable_any_whitespace = disable_any_whitespace self._str_to_list("innode_prefill_ports", int) - assert self.splitwise_role in ["mixed", "prefill", "decode"] # TODO @@ -739,19 +734,16 @@ class Config: self.max_prefill_batch = 1 # TODO:当前多模prefill阶段只支持并行度为1,待优化 # TODO(@wufeisheng): TP and EP need to be supported simultaneously. 
- assert (self.tensor_parallel_size == 1 - and self.parallel_config.expert_parallel_size - >= 1) or (self.tensor_parallel_size >= 1 - and self.parallel_config.expert_parallel_size - == 1), "TP and EP cannot be enabled at the same time" + assert (self.tensor_parallel_size == 1 and self.parallel_config.expert_parallel_size >= 1) or ( + self.tensor_parallel_size >= 1 and self.parallel_config.expert_parallel_size == 1 + ), "TP and EP cannot be enabled at the same time" num_ranks = self.tensor_parallel_size * self.parallel_config.expert_parallel_size self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8 if num_ranks > self.max_chips_per_node: self.worker_num_per_node = self.max_chips_per_node nnode = ceil_div(num_ranks, self.worker_num_per_node) - assert nnode == self.nnode, \ - f"nnode: {nnode}, but got {self.nnode}" + assert nnode == self.nnode, f"nnode: {nnode}, but got {self.nnode}" else: self.worker_num_per_node = num_ranks @@ -772,13 +764,14 @@ class Config: """ calculate some parameters """ - assert self.device_ids.split(',').__len__() == self.worker_num_per_node, \ - f"invalid CUDA_VISIBLE_DEVICES, should be equal to {self.worker_num_per_node}" + assert ( + self.device_ids.split(",").__len__() == self.worker_num_per_node + ), f"invalid CUDA_VISIBLE_DEVICES, should be equal to {self.worker_num_per_node}" - assert self.worker_num_per_node % self.tensor_parallel_size == 0, \ - f"tensor_parallel_size: {self.tensor_parallel_size} should be divisible by worker_num_per_node: {self.worker_num_per_node}" - self.local_device_ids = self.device_ids.split( - ',')[:self.tensor_parallel_size] + assert ( + self.worker_num_per_node % self.tensor_parallel_size == 0 + ), f"tensor_parallel_size: {self.tensor_parallel_size} should be divisible by worker_num_per_node: {self.worker_num_per_node}" + self.local_device_ids = self.device_ids.split(",")[: self.tensor_parallel_size] self.host_ip = get_host_ip() @@ -788,6 +781,7 @@ class Config: self.is_master = False import paddle + self.paddle_commit_id = paddle.version.commit if self.max_num_batched_tokens is None: @@ -799,10 +793,8 @@ class Config: if self.long_prefill_token_threshold == 0: self.long_prefill_token_threshold = int(self.max_model_len * 0.04) - self.cache_config.postprocess(self.max_num_batched_tokens, - self.max_num_seqs) - self.cache_config.max_block_num_per_seq = int( - self.max_model_len // self.cache_config.block_size) + self.cache_config.postprocess(self.max_num_batched_tokens, self.max_num_seqs) + self.cache_config.max_block_num_per_seq = int(self.max_model_len // self.cache_config.block_size) if self.guided_decoding_backend == "auto": if self.enable_mm: @@ -814,30 +806,26 @@ class Config: """ check the legality of config """ - assert ( - self.max_num_seqs <= 256 - ), "The parameter `max_num_seqs` is not allowed to exceed 256, " "but now it's {}.".format( - self.max_num_seqs) - assert ( - is_port_available('0.0.0.0', self.engine_worker_queue_port) + assert self.max_num_seqs <= 256, ( + "The parameter `max_num_seqs` is not allowed to exceed 256, " f"but now it's {self.max_num_seqs}." + ) + assert is_port_available( + "0.0.0.0", self.engine_worker_queue_port ), f"The parameter `engine_worker_queue_port`:{self.engine_worker_queue_port} is already in use." 
assert ( self.max_chips_per_node >= self.tensor_parallel_size > 0 ), f"tensor_parallel_size: {self.tensor_parallel_size} should be between 1 and {self.max_chips_per_node}" - assert (self.nnode >= 1), f"nnode: {self.nnode} should no less than 1" - assert ( - self.max_model_len >= 16 - ), f"max_model_len: {self.max_model_len} should be larger than 16" - assert ( - self.max_num_seqs - >= 1), f"max_num_seqs: {self.max_num_seqs} should be larger than 1" - assert ( - self.max_num_batched_tokens >= self.max_num_seqs - ), f"max_num_batched_tokens: {self.max_num_batched_tokens} " \ + assert self.nnode >= 1, f"nnode: {self.nnode} should be no less than 1" + assert self.max_model_len >= 16, f"max_model_len: {self.max_model_len} should be larger than 16" + assert self.max_num_seqs >= 1, f"max_num_seqs: {self.max_num_seqs} should be larger than 1" + assert self.max_num_batched_tokens >= self.max_num_seqs, ( + f"max_num_batched_tokens: {self.max_num_batched_tokens} " f"should be larger than or equal to max_num_seqs: {self.max_num_seqs}" - assert (self.max_num_batched_tokens <= self.max_model_len * self.max_num_seqs), \ - f"max_num_batched_tokens: {self.max_num_batched_tokens} should be larger" \ - f"than or equal to max_num_seqs: {self.max_num_seqs} * max_model_len: {self.max_model_len}" + ) + assert self.max_num_batched_tokens <= self.max_model_len * self.max_num_seqs, ( + f"max_num_batched_tokens: {self.max_num_batched_tokens} should be smaller " + f"than or equal to max_num_seqs: {self.max_num_seqs} * max_model_len: {self.max_model_len}" + ) assert ( self.max_num_partial_prefills >= 1 ), f"max_num_partial_prefills: {self.max_num_partial_prefills} should be larger than or equal to 1" @@ -845,31 +833,38 @@ class Config: assert ( self.max_long_partial_prefills >= 1 ), f"max_long_partial_prefills: {self.max_long_partial_prefills} should be larger than or equal to 1" - assert (self.max_long_partial_prefills <= self.max_num_partial_prefills), \ - f"max_long_partial_prefills: {self.max_long_partial_prefills} should " \ - f"be less than or equal to max_num_partial_prefills: {self.max_num_partial_prefills}" + assert self.max_long_partial_prefills <= self.max_num_partial_prefills, ( + f"max_long_partial_prefills: {self.max_long_partial_prefills} should " + f"be less than or equal to max_num_partial_prefills: {self.max_num_partial_prefills}" + ) if not self.cache_config.enable_chunked_prefill: - assert ( - self.max_num_batched_tokens >= self.max_model_len - ), f"max_num_batched_tokens: {self.max_num_batched_tokens} " \ + assert self.max_num_batched_tokens >= self.max_model_len, ( + f"max_num_batched_tokens: {self.max_num_batched_tokens} " f"should be larger than or equal to max_model_len: {self.max_model_len}" + ) else: - assert ( - self.max_num_batched_tokens >= self.cache_config.block_size - ), f"max_num_batched_tokens: {self.max_num_batched_tokens} " \ + assert self.max_num_batched_tokens >= self.cache_config.block_size, ( + f"max_num_batched_tokens: {self.max_num_batched_tokens} " f"should be larger than or equal to block_size: {self.cache_config.block_size}" + ) if self.max_num_partial_prefills > 1: - assert (self.cache_config.enable_chunked_prefill is True), \ - "Chunked prefill must be enabled to set max_num_partial_prefills > 1" - assert (self.long_prefill_token_threshold < self.max_model_len), \ - f"long_prefill_token_threshold: {self.long_prefill_token_threshold} should be less than"\ - f" max_model_len: {self.max_model_len}" + assert ( + self.cache_config.enable_chunked_prefill is True + ), "Chunked 
prefill must be enabled to set max_num_partial_prefills > 1" + assert self.long_prefill_token_threshold < self.max_model_len, ( + f"long_prefill_token_threshold: {self.long_prefill_token_threshold} should be less than" + f" max_model_len: {self.max_model_len}" + ) if self.guided_decoding_backend is not None: - assert self.guided_decoding_backend in ["xgrammar", "XGrammar", "auto", "off"], \ - f"Only support xgrammar、auto guided decoding backend, but got {self.guided_decoding_backend}." + assert self.guided_decoding_backend in [ + "xgrammar", + "XGrammar", + "auto", + "off", + ], f"Only support xgrammar、auto guided decoding backend, but got {self.guided_decoding_backend}." if self.guided_decoding_backend != "off": # TODO: mm support guided_decoding @@ -878,8 +873,7 @@ class Config: # TODO: speculative decoding support guided_decoding # TODO: xpu support guided_decoding - assert not current_platform.is_xpu( - ), "XPU currently do not support guided_decoding" + assert not current_platform.is_xpu(), "XPU currently do not support guided_decoding" try: import xgrammar # noqa @@ -897,22 +891,22 @@ class Config: Args: file (str): the path of file to save config """ - llm_logger.info( - "=================== Configuration Information ===============") + llm_logger.info("=================== Configuration Information ===============") for k, v in self.__dict__.items(): if k == "generation_config" and v is not None: for gck, gcv in v.to_dict().items(): llm_logger.info("{:<20}:{:<6}{}".format(gck, "", gcv)) - elif (k == "cache_config" or - k == "model_config" or - k == "scheduler_config" or - k == "parallel_config" or - k == "commit_config"): + elif ( + k == "cache_config" + or k == "model_config" + or k == "scheduler_config" + or k == "parallel_config" + or k == "commit_config" + ): v.print() else: llm_logger.info("{:<20}:{:<6}{}".format(k, "", v)) - llm_logger.info( - "=============================================================") + llm_logger.info("=============================================================") if file is not None: f = open(file, "a") now_time = datetime.now() @@ -929,15 +923,14 @@ class Config: if self.splitwise_role != "mixed": disaggregate_info["role"] = self.splitwise_role disaggregate_info["cache_info"] = dict() - current_protocol = self.cache_config.cache_transfer_protocol.split( - ",") + current_protocol = self.cache_config.cache_transfer_protocol.split(",") disaggregate_info["transfer_protocol"] = current_protocol for protocol in current_protocol: if protocol == "ipc": disaggregate_info["cache_info"][protocol] = { "ip": self.host_ip, "port": self.engine_worker_queue_port, - "device_ids": self.local_device_ids + "device_ids": self.local_device_ids, } elif protocol == "rdma": disaggregate_info["cache_info"][protocol] = { @@ -957,13 +950,14 @@ class Config: if hasattr(cls, key): value = getattr(cls, key) setattr(cls, value_name, value) - llm_logger.info( - f"Reset parameter {value_name} = {value} from configuration." 
- ) + llm_logger.info(f"Reset parameter {value_name} = {value} from configuration.") reset_value(self.cache_config, "block_size", "infer_model_block_size") - reset_value(self.model_config, "return_full_hidden_states", - "return_full_hidden_states") + reset_value( + self.model_config, + "return_full_hidden_states", + "return_full_hidden_states", + ) reset_value(self.cache_config, "cache_dtype", "infer_model_dtype") def _check_master(self): diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index 414a7b209..bc0b98212 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + from __future__ import annotations import copy @@ -40,18 +41,21 @@ from fastdeploy.engine.expert_service import start_expert_service from fastdeploy.engine.request import Request, RequestOutput from fastdeploy.engine.resource_manager import ResourceManager from fastdeploy.input.preprocess import InputPreprocessor -from fastdeploy.inter_communicator import (EngineCacheQueue, EngineWorkerQueue, - IPCSignal, ZmqClient) +from fastdeploy.inter_communicator import ( + EngineCacheQueue, + EngineWorkerQueue, + IPCSignal, + ZmqClient, +) from fastdeploy.metrics.metrics import main_process_metrics from fastdeploy.metrics.trace_util import start_span, start_span_request from fastdeploy.model_executor.guided_decoding import schema_checker -from fastdeploy.output.token_processor import (TokenProcessor, - WarmUpTokenProcessor) +from fastdeploy.output.token_processor import TokenProcessor, WarmUpTokenProcessor from fastdeploy.splitwise.splitwise_connector import SplitwiseConnector from fastdeploy.utils import EngineError, console_logger, llm_logger -class LLMEngine(object): +class LLMEngine: """ Engine class responsible for managing the Large Language Model (LLM) operations. 
@@ -94,30 +98,28 @@ class LLMEngine(object): self.running = True self.scheduler = cfg.scheduler_config.scheduler() - self.input_processor = InputPreprocessor(cfg.tokenizer, - cfg.reasoning_parser, - cfg.limit_mm_per_prompt, - cfg.mm_processor_kwargs, - cfg.enable_mm) + self.input_processor = InputPreprocessor( + cfg.tokenizer, + cfg.reasoning_parser, + cfg.limit_mm_per_prompt, + cfg.mm_processor_kwargs, + cfg.enable_mm, + ) self.start_queue_service() - self.resource_manager = ResourceManager(cfg.max_num_seqs, cfg, - cfg.tensor_parallel_size, - cfg.splitwise_role) + self.resource_manager = ResourceManager(cfg.max_num_seqs, cfg, cfg.tensor_parallel_size, cfg.splitwise_role) - os.environ['INFERENCE_MSG_QUEUE_ID'] = str( - self.cfg.engine_worker_queue_port) + os.environ["INFERENCE_MSG_QUEUE_ID"] = str(self.cfg.engine_worker_queue_port) - self.split_connector = SplitwiseConnector(cfg, self.scheduler, - self.engine_worker_queue, - self.resource_manager) + self.split_connector = SplitwiseConnector(cfg, self.scheduler, self.engine_worker_queue, self.resource_manager) self.token_processor = TokenProcessor( cfg=self.cfg, cached_generated_tokens=self.scheduler, engine_worker_queue=self.engine_worker_queue, - split_connector=self.split_connector) + split_connector=self.split_connector, + ) self.token_processor.set_resource_manager(self.resource_manager) self.is_started = False @@ -129,11 +131,13 @@ class LLMEngine(object): else: self.do_profile = 0 - self.partial_chunked_tokens = [0] * ( - self.cfg.max_num_partial_prefills + 1) + self.partial_chunked_tokens = [0] * (self.cfg.max_num_partial_prefills + 1) for idx in range(1, self.cfg.max_num_partial_prefills + 1): - self.partial_chunked_tokens[idx] = (self.cfg.max_num_batched_tokens // idx) \ - // self.cfg.cache_config.block_size * self.cfg.cache_config.block_size + self.partial_chunked_tokens[idx] = ( + (self.cfg.max_num_batched_tokens // idx) + // self.cfg.cache_config.block_size + * self.cfg.cache_config.block_size + ) self.partial_chunked_tokens[idx] = max(1, self.partial_chunked_tokens[idx]) self._finalizer = weakref.finalize(self, self._exit_sub_services) @@ -168,8 +172,8 @@ class LLMEngine(object): time.sleep(3) if self.do_profile == 0 and ( - self.cfg.cache_config.enable_prefix_caching \ - or self.cfg.splitwise_role != "mixed"): + self.cfg.cache_config.enable_prefix_caching or self.cfg.splitwise_role != "mixed" + ): device_ids = self.cfg.device_ids.split(",") self.cache_manager_processes = self.resource_manager.cache_manager.launch_cache_manager( cache_config=self.cfg.cache_config, @@ -177,16 +181,15 @@ class LLMEngine(object): device_ids=device_ids, pod_ip=self.cfg.master_ip, engine_worker_queue_port=self.cfg.engine_worker_queue_port, - pid_suffix=self.ipc_signal_suffix) + pid_suffix=self.ipc_signal_suffix, + ) self.worker_proc = self._start_worker_service() console_logger.info("Waitting worker processes ready...") time.sleep(5) self.worker_init_status = dict() if not self.check_worker_initialize_status(): - console_logger.error( - "Failed to launch worker processes, check log/workerlog.* for more details." 
- ) + console_logger.error("Failed to launch worker processes, check log/workerlog.* for more details.") return False # Start warmup if enabled @@ -199,17 +202,16 @@ class LLMEngine(object): self.token_processor.tasks_queue = self.engine_worker_queue - self.insert_task_to_worker_thread = threading.Thread( - target=self._insert_task_to_worker, daemon=True) + self.insert_task_to_worker_thread = threading.Thread(target=self._insert_task_to_worker, daemon=True) self.insert_task_to_worker_thread.start() if self.api_server_pid is not None: self.insert_task_to_scheduler_thread = threading.Thread( - target=self._insert_zmq_task_to_scheduler, daemon=True) + target=self._insert_zmq_task_to_scheduler, daemon=True + ) self.insert_task_to_scheduler_thread.start() - self.receive_output_thread = threading.Thread( - target=self._zmq_send_generated_tokens, daemon=True) + self.receive_output_thread = threading.Thread(target=self._zmq_send_generated_tokens, daemon=True) self.receive_output_thread.start() # Start TokenProcessor thread @@ -223,8 +225,7 @@ class LLMEngine(object): self.engine_worker_queue.available_prefill_instances.put(1) self.split_mode_get_tasks() if self.cfg.scheduler_config.name == "splitwise": - self.splitwise_receive_thread = threading.Thread( - target=self.split_connector.start_receiver, args=()) + self.splitwise_receive_thread = threading.Thread(target=self.split_connector.start_receiver, args=()) self.splitwise_receive_thread.daemon = True self.splitwise_receive_thread.start() @@ -240,20 +241,28 @@ class LLMEngine(object): if self.cfg.parallel_config.enable_expert_parallel and self.cfg.parallel_config.data_parallel_size > 1: self.dp_processed = [] - for i in range(1, self.cfg.parallel_config.data_parallel_size // self.cfg.nnode): + for i in range( + 1, + self.cfg.parallel_config.data_parallel_size // self.cfg.nnode, + ): time.sleep(1) self.dp_processed.append( - multiprocessing.Process(target=start_expert_service, - args=(self.cfg, - i + self.cfg.node_rank * self.cfg.worker_num_per_node, - self.ipc_signal_suffix))) - llm_logger.info(f"Engine is initialized successfully with {self.cfg.tensor_parallel_size}" \ - + " data parallel id {}".format(i)) + multiprocessing.Process( + target=start_expert_service, + args=( + self.cfg, + i + self.cfg.node_rank * self.cfg.worker_num_per_node, + self.ipc_signal_suffix, + ), + ) + ) + llm_logger.info( + f"Engine is initialized successfully with {self.cfg.tensor_parallel_size}" + + f" data parallel id {i}" + ) self.dp_processed[-1].start() - console_logger.info( - "Worker processes are launched with {} seconds.".format( - time.time() - start_time)) + console_logger.info(f"Worker processes are launched in {time.time() - start_time} seconds.") return True def _zmq_send_generated_tokens(self): @@ -271,8 +280,7 @@ class LLMEngine(object): self.zmq_server.send_multipart(request_id, contents) except Exception as e: - llm_logger.error("Unexcepted error happend: {}, {}".format( - e, str(traceback.format_exc()))) + llm_logger.error(f"Unexpected error happened: {e}, {traceback.format_exc()!s}") def _get_generated_result(self): """ @@ -296,8 +304,7 @@ class LLMEngine(object): time.sleep(0.001) continue if self.exist_prefill_task_signal.value[0] > 0: - if self.cfg.splitwise_role == "mixed" or \ - self.split_connector.has_splitwise_tasks(): + if self.cfg.splitwise_role == "mixed" or self.split_connector.has_splitwise_tasks(): time.sleep(0.005) continue if self.engine_worker_queue.num_cache_infos() > 0: @@ -309,17 +316,17 @@ class LLMEngine(object): 
num_prefill_batch = min( int(self.resource_manager.available_batch()), - self.cfg.max_prefill_batch) + self.cfg.max_prefill_batch, + ) self.resource_manager.check_and_free_block_tables() tasks = self.scheduler.get_requests( - available_blocks=self.resource_manager.available_block_num( - ), + available_blocks=self.resource_manager.available_block_num(), block_size=self.cfg.cache_config.block_size, - reserved_output_blocks=self.cfg.cache_config. - enc_dec_block_num, + reserved_output_blocks=self.cfg.cache_config.enc_dec_block_num, max_num_batched_tokens=self.cfg.max_num_batched_tokens, - batch=num_prefill_batch) + batch=num_prefill_batch, + ) if len(tasks) == 0: time.sleep(0.001) @@ -328,16 +335,14 @@ class LLMEngine(object): current_id = (current_id + 1) % 100003 if self.cfg.splitwise_role != "mixed": llm_logger.info("Inserting splitwise tasks") - self.split_connector.send_splitwise_tasks( - tasks, current_id) + self.split_connector.send_splitwise_tasks(tasks, current_id) self.insert_tasks(tasks, current_id) main_process_metrics.num_requests_waiting.dec(len(tasks)) main_process_metrics.num_requests_running.inc(len(tasks)) except Exception as e: - err_msg = "Error happend while insert task to engine: {}, {}.".format( - e, str(traceback.format_exc())) + err_msg = f"Error happend while insert task to engine: {e}, {traceback.format_exc()!s}." llm_logger.error(err_msg) def _insert_zmq_task_to_scheduler(self): @@ -353,8 +358,7 @@ class LLMEngine(object): else: err, data = self.zmq_server.receive_pyobj_once(block) if err is not None: - llm_logger.error( - "Engine stops inserting zmq task into scheduler") + llm_logger.error("Engine stops inserting zmq task into scheduler") break request, insert_task = None, [] @@ -363,13 +367,11 @@ class LLMEngine(object): request = Request.from_dict(data) start_span("ENQUEUE_ZMQ", data, trace.SpanKind.PRODUCER) - llm_logger.debug(f"Receive request: {request}") err_msg = None if self.guided_decoding_checker is not None: - request, err_msg = self.guided_decoding_checker.schema_format( - request) + request, err_msg = self.guided_decoding_checker.schema_format(request) if err_msg is not None: llm_logger.error(err_msg) @@ -394,17 +396,20 @@ class LLMEngine(object): main_process_metrics.num_requests_waiting.inc(1) continue - error_result = RequestOutput(request_id=request_id, - finished=True, - error_code=500, - error_msg=failed) + error_result = RequestOutput( + request_id=request_id, + finished=True, + error_code=500, + error_msg=failed, + ) # Since the request is not in scheduler # Send result by zmq directly self.zmq_server.send_multipart(request_id, error_result) except Exception as e: llm_logger.error( f"Error happend while receving new request from zmq, details={e}, " - f"traceback={traceback.format_exc()}") + f"traceback={traceback.format_exc()}" + ) def add_requests(self, task, sampling_params=None, **kwargs): """ @@ -428,23 +433,25 @@ class LLMEngine(object): enable_thinking = None if kwargs is not None: enable_thinking = kwargs.get("enable_thinking", None) - request = self.data_processor.process_request( - request, self.cfg.max_model_len, enable_thinking=enable_thinking) + request = self.data_processor.process_request(request, self.cfg.max_model_len, enable_thinking=enable_thinking) request.prompt_token_ids_len = len(request.prompt_token_ids) input_ids_len = request.prompt_token_ids_len request.set( "max_tokens", - min(self.cfg.max_model_len - input_ids_len, - request.get("max_tokens"))) + min( + self.cfg.max_model_len - input_ids_len, + 
request.get("max_tokens"), + ), + ) if request.get("reasoning_max_tokens") is None: - default_reasoning_max_tokens = max( - int(request.get("max_tokens") * 0.8), 1) + default_reasoning_max_tokens = max(int(request.get("max_tokens") * 0.8), 1) request.set("reasoning_max_tokens", default_reasoning_max_tokens) min_tokens = request.get("min_tokens") if input_ids_len + min_tokens >= self.cfg.max_model_len: error_msg = ( f"Input text is too long, length of prompt token({input_ids_len}) " - f"+ min_dec_len ({min_tokens}) >= max_model_len ") + f"+ min_dec_len ({min_tokens}) >= max_model_len " + ) llm_logger.error(error_msg) raise EngineError(error_msg, error_code=400) @@ -456,16 +463,14 @@ class LLMEngine(object): raise EngineError(error_msg, error_code=400) if self.guided_decoding_checker is not None: - request, err_msg = self.guided_decoding_checker.schema_format( - request) + request, err_msg = self.guided_decoding_checker.schema_format(request) if err_msg is not None: llm_logger.error(err_msg) raise EngineError(err_msg, error_code=400) request.preprocess_end_time = time.time() self.scheduler.put_requests([request]) - llm_logger.info( - f"Cache task with request_id ({request.get('request_id')})") + llm_logger.info(f"Cache task with request_id ({request.get('request_id')})") llm_logger.debug(f"cache task: {request}") def warmup(self): @@ -486,25 +491,19 @@ class LLMEngine(object): processed_indices = [] for idx, task in enumerate(self.waiting_requests): - if self.resource_manager.is_resource_sufficient( - task.prompt_token_ids_len): + if self.resource_manager.is_resource_sufficient(task.prompt_token_ids_len): self.insert_tasks([task]) - llm_logger.info( - f"Resource available, processing task {task.request_id}" - ) + llm_logger.info(f"Resource available, processing task {task.request_id}") processed_indices.append(idx) else: - llm_logger.debug( - f"Still waiting for resources {task.request_id}" - ) + llm_logger.debug(f"Still waiting for resources {task.request_id}") break for idx in sorted(processed_indices, reverse=True): self.waiting_requests.pop(idx) if not self.engine_worker_queue.disaggregate_queue_empty(): - items = self.engine_worker_queue.get_disaggregated_tasks( - ) + items = self.engine_worker_queue.get_disaggregated_tasks() for item in items: role = item[0] tasks = item[1] @@ -515,7 +514,7 @@ class LLMEngine(object): self.insert_tasks(tasks) elif role == "decode": - if hasattr(tasks[0], 'finished'): + if hasattr(tasks[0], "finished"): if not isinstance(tasks, list): tasks = [tasks] for task in tasks: @@ -527,25 +526,19 @@ class LLMEngine(object): else: if len(self.waiting_requests): - llm_logger.info( - f"Waiting for resource for task {tasks[0].request_id}" - ) + llm_logger.info(f"Waiting for resource for task {tasks[0].request_id}") self.waiting_requests.extend(tasks) else: new_waiting = [] for task in tasks: - if self.resource_manager.is_resource_sufficient( - task.prompt_token_ids_len): + if self.resource_manager.is_resource_sufficient(task.prompt_token_ids_len): self.insert_tasks([task]) else: new_waiting.append(task) if new_waiting: - self.waiting_requests.extend( - new_waiting) - llm_logger.info( - f"Added {len(new_waiting)} tasks to waiting queue" - ) + self.waiting_requests.extend(new_waiting) + llm_logger.info(f"Added {len(new_waiting)} tasks to waiting queue") else: time.sleep(0.001) @@ -572,13 +565,10 @@ class LLMEngine(object): if current_request_size[idx] <= 0: chunk_request_num -= 1 - if not self.cfg.cache_config.enable_chunked_prefill or len( - requests) == 0: + if not 
self.cfg.cache_config.enable_chunked_prefill or len(requests) == 0: return - current_request_size = [ - request.prompt_token_ids_len for request in requests - ] + current_request_size = [request.prompt_token_ids_len for request in requests] requests_chunk = [[] for _ in range(len(requests))] chunk_request_num = len(current_request_size) while chunk_request_num >= 1: @@ -588,25 +578,25 @@ class LLMEngine(object): continue chunk_size = min( current_request_size[idx], - self.partial_chunked_tokens[chunk_request_num]) + self.partial_chunked_tokens[chunk_request_num], + ) update_tokens(idx, chunk_size) while remain_batched_tokens >= self.cfg.cache_config.block_size: # 当前 max_num_batched_tokens 还有剩余时,优先分配给较短的请求 - waiting_requests = [ - input_lens for input_lens in current_request_size - if input_lens > 0 - ] + waiting_requests = [input_lens for input_lens in current_request_size if input_lens > 0] if len(waiting_requests) == 0: break - available_tokens = remain_batched_tokens // self.cfg.cache_config.block_size * \ - self.cfg.cache_config.block_size + available_tokens = ( + remain_batched_tokens // self.cfg.cache_config.block_size * self.cfg.cache_config.block_size + ) append_idx = current_request_size.index(min(waiting_requests)) chunk_size = min( current_request_size[append_idx], self.partial_chunked_tokens[chunk_request_num], - available_tokens) + available_tokens, + ) update_tokens(append_idx, chunk_size, update_chunk=True) for idx in range(len(requests)): @@ -616,8 +606,7 @@ class LLMEngine(object): """ update each multimodal request's chunk size info """ - if not self.cfg.cache_config.enable_chunked_prefill or len( - requests) == 0: + if not self.cfg.cache_config.enable_chunked_prefill or len(requests) == 0: return for request in requests: @@ -628,12 +617,9 @@ class LLMEngine(object): inputs["grid_thw"] = np.array([], dtype="int64") inputs["images"] = np.array([], dtype="uint8") input_ids = paddle.to_tensor(inputs["input_ids"], dtype="int64") - image_type_ids = paddle.to_tensor(inputs["image_type_ids"], - dtype="int32") + image_type_ids = paddle.to_tensor(inputs["image_type_ids"], dtype="int32") image_mask = input_ids == self.data_processor.image_patch_id - image_token_sum = paddle.full(shape=[len(input_ids) + 1], - fill_value=0, - dtype="int32") + image_token_sum = paddle.full(shape=[len(input_ids) + 1], fill_value=0, dtype="int32") image_token_sum[1:] = paddle.cumsum(image_mask.cast("int32")) grid_thw = [] for one in inputs["grid_thw"]: @@ -644,45 +630,46 @@ class LLMEngine(object): grid_thw = paddle.to_tensor(grid_thw, dtype="int64") from fastdeploy.model_executor.ops.gpu import get_mm_split_fuse + chunk_image_num, chunk_seq_len = get_mm_split_fuse( - input_ids, image_type_ids, image_token_sum, grid_thw, - self.data_processor.image_patch_id, len(grid_thw), 0, - len(input_ids), 0, self.partial_chunked_tokens[1], 2048) + input_ids, + image_type_ids, + image_token_sum, + grid_thw, + self.data_processor.image_patch_id, + len(grid_thw), + 0, + len(input_ids), + 0, + self.partial_chunked_tokens[1], + 2048, + ) grid_thw = grid_thw.numpy().reshape([-1, 3]) num_chunks = len(chunk_image_num) chunks_info = [] input_ids_st, image_type_ids_st, grid_thw_st, patch_st = 0, 0, 0, 0 for idx in range(num_chunks): - chunk_input_ids = inputs["input_ids"][ - input_ids_st:input_ids_st + chunk_seq_len[idx]] - chunk_token_type_ids = inputs["token_type_ids"][ - input_ids_st:input_ids_st + chunk_seq_len[idx]] - actual_image_num = np.sum(grid_thw[grid_thw_st:grid_thw_st + - chunk_image_num[idx], 0]) + 
chunk_input_ids = inputs["input_ids"][input_ids_st : input_ids_st + chunk_seq_len[idx]] + chunk_token_type_ids = inputs["token_type_ids"][input_ids_st : input_ids_st + chunk_seq_len[idx]] + actual_image_num = np.sum(grid_thw[grid_thw_st : grid_thw_st + chunk_image_num[idx], 0]) chunk_image_type_ids = inputs["image_type_ids"][ - image_type_ids_st:image_type_ids_st + actual_image_num] - chunk_grid_thw = grid_thw[grid_thw_st:grid_thw_st + - chunk_image_num[idx]] + image_type_ids_st : image_type_ids_st + actual_image_num + ] + chunk_grid_thw = grid_thw[grid_thw_st : grid_thw_st + chunk_image_num[idx]] chunk_patch_num = np.sum(np.prod(chunk_grid_thw, axis=1)) - chunk_images = inputs["images"][patch_st:patch_st + - chunk_patch_num] + chunk_images = inputs["images"][patch_st : patch_st + chunk_patch_num] - chunks_info.append({ - "input_ids": - chunk_input_ids, - "token_type_ids": - chunk_token_type_ids, - "image_type_ids": - chunk_image_type_ids - if chunk_image_type_ids.shape[0] else None, - "grid_thw": - chunk_grid_thw if chunk_grid_thw.shape[0] else None, - "images": - chunk_images if chunk_images.shape[0] else None, - "position_ids": - None - }) + chunks_info.append( + { + "input_ids": chunk_input_ids, + "token_type_ids": chunk_token_type_ids, + "image_type_ids": (chunk_image_type_ids if chunk_image_type_ids.shape[0] else None), + "grid_thw": (chunk_grid_thw if chunk_grid_thw.shape[0] else None), + "images": (chunk_images if chunk_images.shape[0] else None), + "position_ids": None, + } + ) input_ids_st += chunk_seq_len[idx] image_type_ids_st += actual_image_num @@ -704,18 +691,14 @@ class LLMEngine(object): del self.resource_manager.req_dict[task.request_id] cur_task = self.resource_manager.tasks_list[cur_task_idx] cur_task.prompt_token_ids[0] = task.outputs.token_ids[0] - if self.cfg.speculative_config.method in [ - "mtp" - ] and self.cfg.splitwise_role == "decode": - cur_task.draft_token_ids = copy.deepcopy( - task.outputs.draft_token_ids) + if self.cfg.speculative_config.method in ["mtp"] and self.cfg.splitwise_role == "decode": + cur_task.draft_token_ids = copy.deepcopy(task.outputs.draft_token_ids) if task.error_code != 200: self.resource_manager.stop_flags[cur_task_idx] = True self.resource_manager.tasks_list[cur_task_idx] = None self.resource_manager._recycle_block_tables(cur_task) if task.request_id in self.token_processor.tokens_counter: - del self.token_processor.tokens_counter[ - task.request_id] + del self.token_processor.tokens_counter[task.request_id] self.scheduler.put_results([task]) llm_logger.warning( f"{task.request_id} prefill failed with msg:{task.error_msg}, recycle resource." 
@@ -723,8 +706,7 @@ class LLMEngine(object): continue self.token_processor.tokens_counter[task.request_id] = 1 current_tasks.append(cur_task) - self.engine_worker_queue.put_tasks( - (current_tasks, self.resource_manager.real_bsz)) + self.engine_worker_queue.put_tasks((current_tasks, self.resource_manager.real_bsz)) return True self.resource_manager.check_and_free_block_tables() @@ -737,9 +719,7 @@ class LLMEngine(object): available_batch = np.sum(self.resource_manager.stop_flags) if len(tasks) > available_batch: - llm_logger.error( - "Inserting batch:{} exceeds the available batch:{}.".format( - len(tasks), available_batch)) + llm_logger.error(f"Inserting batch:{len(tasks)} exceeds the available batch:{available_batch}.") llm_logger.error("The exceeded part will be ignored!") tasks = tasks[:available_batch] @@ -763,8 +743,7 @@ class LLMEngine(object): is_decode = True else: is_prefill = True - self.token_processor.number_of_input_tokens += tasks[ - i].prompt_token_ids_len + self.token_processor.number_of_input_tokens += tasks[i].prompt_token_ids_len self.split_connector.send_cache_infos(tasks, current_id) if not is_decode: @@ -776,8 +755,7 @@ class LLMEngine(object): self.update_requests_chunk_size(tasks) else: self.update_mm_requests_chunk_size(tasks) - self.engine_worker_queue.put_tasks( - (tasks, self.resource_manager.real_bsz)) + self.engine_worker_queue.put_tasks((tasks, self.resource_manager.real_bsz)) if is_prefill and self.cfg.scheduler_config.name != "splitwise": self.engine_worker_queue.available_prefill_instances.put(1) return True @@ -793,8 +771,7 @@ class LLMEngine(object): """ judge if all tasks are finished """ - return np.sum(self.resource_manager.stop_flags) == len( - self.resource_manager.stop_flags) + return np.sum(self.resource_manager.stop_flags) == len(self.resource_manager.stop_flags) def _set_warmup_token_processor(self): """ @@ -824,8 +801,7 @@ class LLMEngine(object): judge if all worker processes are ready """ - if np.sum(self.worker_ready_signal.value - ) == self.cfg.worker_num_per_node: + if np.sum(self.worker_ready_signal.value) == self.cfg.worker_num_per_node: return True return False @@ -835,30 +811,33 @@ class LLMEngine(object): """ # worker_ready_signatensor_parallel_size worker_ready_signal_data = np.zeros(shape=[self.cfg.worker_num_per_node], dtype=np.int32) - self.worker_ready_signal = IPCSignal(name="worker_ready_signal", - array=worker_ready_signal_data, - dtype=np.int32, - suffix=self.ipc_signal_suffix, - create=True) + self.worker_ready_signal = IPCSignal( + name="worker_ready_signal", + array=worker_ready_signal_data, + dtype=np.int32, + suffix=self.ipc_signal_suffix, + create=True, + ) # exist_task_signal 用于各worker进程感知是否有新Task需要处理 - exist_task_signal_data = np.zeros( - [self.cfg.parallel_config.data_parallel_size], dtype=np.int32) - self.exist_task_signal = IPCSignal(name="exist_task_signal", - array=exist_task_signal_data, - dtype=np.int32, - suffix=self.ipc_signal_suffix, - create=True) + exist_task_signal_data = np.zeros([self.cfg.parallel_config.data_parallel_size], dtype=np.int32) + self.exist_task_signal = IPCSignal( + name="exist_task_signal", + array=exist_task_signal_data, + dtype=np.int32, + suffix=self.ipc_signal_suffix, + create=True, + ) # exist_swapped_task_signal 用于engine感知worker中是否存在swapped task - exist_swapped_task_signal_data = np.zeros( - [self.cfg.parallel_config.data_parallel_size], dtype=np.int32) + exist_swapped_task_signal_data = np.zeros([self.cfg.parallel_config.data_parallel_size], dtype=np.int32) 
self.exist_swapped_task_signal = IPCSignal( name="exist_swapped_task_signal", array=exist_swapped_task_signal_data, dtype=np.int32, suffix=self.ipc_signal_suffix, - create=True) + create=True, + ) # exist_prefill_task_signal 用于各worker进程感知是否进行prefill exist_prefill_task_signal_data = np.zeros([1], dtype=np.int32) @@ -867,17 +846,18 @@ class LLMEngine(object): array=exist_prefill_task_signal_data, dtype=np.int32, suffix=self.ipc_signal_suffix, - create=True) + create=True, + ) # worker_live_signal 用于engine感知各worker进程是否存活,记录每个step 时间 - worker_healthy_live_recorded_time_array = np.zeros(shape=[self.cfg.worker_num_per_node], - dtype=np.int32) + worker_healthy_live_recorded_time_array = np.zeros(shape=[self.cfg.worker_num_per_node], dtype=np.int32) self.worker_healthy_live_signal = IPCSignal( name="worker_healthy_live_signal", array=worker_healthy_live_recorded_time_array, dtype=np.int32, suffix=self.ipc_signal_suffix, - create=True) + create=True, + ) if self.do_profile: get_profile_block_num = np.zeros([self.cfg.worker_num_per_node], dtype=np.int32) @@ -886,7 +866,8 @@ class LLMEngine(object): array=get_profile_block_num, dtype=np.int32, suffix=self.ipc_signal_suffix, - create=True) + create=True, + ) model_weights_status = np.zeros([1], dtype=np.int32) self.model_weights_status_signal = IPCSignal( @@ -894,7 +875,8 @@ class LLMEngine(object): array=model_weights_status, dtype=np.int32, suffix=self.ipc_signal_suffix, - create=True) + create=True, + ) def _exit_sub_services(self): """ @@ -903,8 +885,7 @@ class LLMEngine(object): self.running = False if hasattr(self, "cache_manager_processes"): - self.resource_manager.cache_manager.shm_cache_task_flag_broadcast.clear( - ) + self.resource_manager.cache_manager.shm_cache_task_flag_broadcast.clear() self.resource_manager.cache_manager.cache_ready_signal.clear() for p in self.cache_manager_processes: llm_logger.info(f"Killing cache manager process {p.pid}") @@ -943,7 +924,7 @@ class LLMEngine(object): "TRAINER_INSTANCES_NUM": 1, "TRAINER_INSTANCES": "0.0.0.0", "ENABLE_FASTDEPLOY_LOAD_MODEL_CONCURRENCY": 0, - "LOAD_STATE_DICT_THREAD_NUM": len(self.cfg.device_ids.split(',')), + "LOAD_STATE_DICT_THREAD_NUM": len(self.cfg.device_ids.split(",")), "PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION": "python", "FLAGS_use_append_attn": 1, "NCCL_ALGO": "Ring", @@ -951,24 +932,22 @@ class LLMEngine(object): "FLAGS_hardamard_moe_block_size": 128, } # environment variables needed by Dy2St - variables.update({ - "SOT_LOG_LEVEL": - os.getenv("SOT_LOG_LEVEL", default="0"), - "SOT_UNSAFE_CACHE_FASTPATH": - os.getenv("SOT_UNSAFE_CACHE_FASTPATH", default="1"), - "SOT_ENABLE_0_SIZE_FALLBACK": - os.getenv("SOT_ENABLE_0_SIZE_FALLBACK", default="0"), - "FLAGS_specialize_device_in_dy2st": - os.getenv("FLAGS_specialize_device_in_dy2st", default="1"), - "FLAGS_enable_async_fast_gc": - os.getenv("FLAGS_enable_async_fast_gc", default="0"), - "FLAGS_pir_interpreter_record_stream_for_gc_cache": - os.getenv("FLAGS_pir_interpreter_record_stream_for_gc_cache", - default="1"), - "FLAGS_parameters_persistent_mode_in_dy2st": - os.getenv("FLAGS_parameters_persistent_mode_in_dy2st", - default="1"), - }) + variables.update( + { + "SOT_LOG_LEVEL": os.getenv("SOT_LOG_LEVEL", default="0"), + "SOT_UNSAFE_CACHE_FASTPATH": os.getenv("SOT_UNSAFE_CACHE_FASTPATH", default="1"), + "SOT_ENABLE_0_SIZE_FALLBACK": os.getenv("SOT_ENABLE_0_SIZE_FALLBACK", default="0"), + "FLAGS_specialize_device_in_dy2st": os.getenv("FLAGS_specialize_device_in_dy2st", default="1"), + "FLAGS_enable_async_fast_gc": 
os.getenv("FLAGS_enable_async_fast_gc", default="0"), + "FLAGS_pir_interpreter_record_stream_for_gc_cache": os.getenv( + "FLAGS_pir_interpreter_record_stream_for_gc_cache", + default="1", + ), + "FLAGS_parameters_persistent_mode_in_dy2st": os.getenv( + "FLAGS_parameters_persistent_mode_in_dy2st", default="1" + ), + } + ) if self.cfg.splitwise_role != "mixed": variables["FLAGS_use_pd_disaggregation"] = 1 @@ -994,8 +973,7 @@ class LLMEngine(object): current_file_path = os.path.abspath(__file__) current_dir_path = os.path.split(current_file_path)[0] # TODO - uncache_worker_stdout = "" if os.getenv("UNCACHE_WORKER_STDOUT", - "0") == 1 else "-u" + uncache_worker_stdout = "" if os.getenv("UNCACHE_WORKER_STDOUT", "0") == 1 else "-u" pd_cmd = f"{command_prefix} {sys.executable} {uncache_worker_stdout} -m paddle.distributed.launch" pd_cmd = pd_cmd + f" --log_dir {log_dir}" @@ -1004,7 +982,7 @@ class LLMEngine(object): ori_vocab_size = ( len(self.data_processor.tokenizer.sp_model) - if hasattr(self.data_processor.tokenizer, 'sp_model') + if hasattr(self.data_processor.tokenizer, "sp_model") else len(self.data_processor.tokenizer.vocab) ) @@ -1012,10 +990,10 @@ class LLMEngine(object): f" --devices {self.cfg.device_ids} {py_script}" f" --max_num_seqs {self.cfg.max_num_seqs} --max_model_len {self.cfg.max_model_len}" f" --gpu_memory_utilization {self.cfg.cache_config.gpu_memory_utilization}" - f" --model_name_or_path {str(self.cfg.model_name_or_path)}" + f" --model_name_or_path {self.cfg.model_name_or_path!s}" f" --device_ids {self.cfg.device_ids}" f" --tensor_parallel_size {self.cfg.tensor_parallel_size}" - f" --engine_worker_queue_port {str(self.cfg.engine_worker_queue_port)}" + f" --engine_worker_queue_port {self.cfg.engine_worker_queue_port!s}" f" --pod_ip {self.cfg.master_ip}" f" --total_block_num {self.cfg.cache_config.total_block_num}" f" --block_size {self.cfg.cache_config.block_size}" @@ -1036,16 +1014,13 @@ class LLMEngine(object): f" --speculative_benchmark_mode {self.cfg.speculative_config.benchmark_mode}" f" --graph_optimization_config '{self.cfg.graph_optimization_config.to_json_string()}'" f" --guided_decoding_backend {self.cfg.guided_decoding_backend}" - f" --load_strategy {self.cfg.model_config.load_strategy}") - + f" --load_strategy {self.cfg.model_config.load_strategy}" + ) worker_append_flag = { - "enable_expert_parallel": - self.cfg.parallel_config.enable_expert_parallel, - "enable_prefix_caching": - self.cfg.cache_config.enable_prefix_caching, - "enable_chunked_prefill": - self.cfg.cache_config.enable_chunked_prefill, + "enable_expert_parallel": self.cfg.parallel_config.enable_expert_parallel, + "enable_prefix_caching": self.cfg.cache_config.enable_prefix_caching, + "enable_chunked_prefill": self.cfg.cache_config.enable_chunked_prefill, "do_profile": self.do_profile, "dynamic_load_weight": self.cfg.model_config.dynamic_load_weight, "disable_any_whitespace": self.cfg.disable_any_whitespace, @@ -1059,11 +1034,11 @@ class LLMEngine(object): if self.cfg.nnode > 1: pd_cmd = pd_cmd + ( f" --master {self.cfg.dist_init_addr}" - f" --nnodes {str(self.cfg.nnode)}" - f" --rank {str(self.cfg.node_rank)}" + f" --nnodes {self.cfg.nnode!s}" + f" --rank {self.cfg.node_rank!s}" ) pd_cmd = pd_cmd + arguments + f" 2>{log_dir}/launch_worker.log" - llm_logger.info("Launch worker service command: {}".format(pd_cmd)) + llm_logger.info(f"Launch worker service command: {pd_cmd}") p = subprocess.Popen( pd_cmd, stdout=subprocess.PIPE, @@ -1111,8 +1086,7 @@ class LLMEngine(object): try: req_id = 
self._format_and_add_data(prompts) except Exception as e: - llm_logger.error( - f"Error happend while adding request, details={e}") + llm_logger.error(f"Error happend while adding request, details={e}") raise EngineError(str(e), error_code=400) # 获取当前请求的结果 @@ -1151,8 +1125,7 @@ class LLMEngine(object): if num_gpu_blocks < 0: num_gpu_blocks = self.get_profile_block_num_signal.value[i] else: - num_gpu_blocks = min( - num_gpu_blocks, self.get_profile_block_num_signal.value[i]) + num_gpu_blocks = min(num_gpu_blocks, self.get_profile_block_num_signal.value[i]) self.cfg.cache_config.reset(num_gpu_blocks) self.resource_manager.reset_cache_config(self.cfg.cache_config) @@ -1164,15 +1137,16 @@ class LLMEngine(object): device_ids=device_ids, pod_ip=self.cfg.master_ip, engine_worker_queue_port=self.cfg.engine_worker_queue_port, - pid_suffix=self.ipc_signal_suffix) + pid_suffix=self.ipc_signal_suffix, + ) + def check_health(self, time_interval_threashold=30): """ Check the health of the model server by checking whether all workers are alive. """ if self.worker_healthy_live_signal.value[0]: - elapsed_time = time.time() - \ - self.worker_healthy_live_signal.value[0] + elapsed_time = time.time() - self.worker_healthy_live_signal.value[0] if elapsed_time > time_interval_threashold: return False, "Worker Service Not Healthy" @@ -1185,38 +1159,31 @@ class LLMEngine(object): def detect_thread(): for line in self.worker_proc.stdout: - line = line.decode('utf-8', errors='ignore') + line = line.decode("utf-8", errors="ignore") if self.worker_init_status.get("finished", False): break if match := re.search( - r'Loading (?:fastsafetensors |safetensors )?checkpoint shards:\s*(\d+)', - line): - self.worker_init_status["weight_loadding"] = eval( - match.group(1)) * 1.0 / 100 - elif (match := re.search(r'Start load layer (\d+)', - line)) or (match := re.search( - r'set state for layer (\d+)', - line)): - progress = eval(match.group( - 1)) * 1.0 / self.cfg.model_config.num_layers + r"Loading (?:fastsafetensors |safetensors )?checkpoint shards:\s*(\d+)", + line, + ): + self.worker_init_status["weight_loadding"] = eval(match.group(1)) * 1.0 / 100 + elif (match := re.search(r"Start load layer (\d+)", line)) or ( + match := re.search(r"set state for layer (\d+)", line) + ): + progress = eval(match.group(1)) * 1.0 / self.cfg.model_config.num_layers self.worker_init_status["layer_loadding"] = progress - if self.worker_init_status[ - "layer_loadding"] == self.cfg.model_config.num_layers - 1: + if self.worker_init_status["layer_loadding"] == self.cfg.model_config.num_layers - 1: self.worker_init_status["finished"] = True - self.checking_worker_status_thread = threading.Thread( - target=detect_thread, daemon=True) + self.checking_worker_status_thread = threading.Thread(target=detect_thread, daemon=True) self.checking_worker_status_thread.start() # display weight loadding progress with tqdm(total=100, desc="Loading Weights") as pbar: progress = 0 while progress < 100: - progress = int( - self.worker_init_status.get("weight_loadding", 0) * 100) - if self.worker_init_status.get( - "layer_loadding", - 0) > 0 or self._worker_processes_ready(): + progress = int(self.worker_init_status.get("weight_loadding", 0) * 100) + if self.worker_init_status.get("layer_loadding", 0) > 0 or self._worker_processes_ready(): progress = 100 pbar.update(progress - pbar.n) pbar.refresh() @@ -1228,8 +1195,7 @@ class LLMEngine(object): with tqdm(total=100, desc="Loading Layers") as pbar: progress = 0 while progress < 100: - progress = int( - 
self.worker_init_status.get("layer_loadding", 0) * 100) + progress = int(self.worker_init_status.get("layer_loadding", 0) * 100) if self._worker_processes_ready(): progress = 100 pbar.update(progress - pbar.n) @@ -1256,19 +1222,21 @@ class LLMEngine(object): address=address, is_server=True, num_client=self.cfg.tensor_parallel_size, - local_data_parallel_size=self.cfg.parallel_config. - data_parallel_size) + local_data_parallel_size=self.cfg.parallel_config.data_parallel_size, + ) - if self.cfg.cache_config.enable_prefix_caching or self.cfg.splitwise_role != 'mixed': + if self.cfg.cache_config.enable_prefix_caching or self.cfg.splitwise_role != "mixed": self.cache_task_queue = EngineCacheQueue( - address=(self.cfg.master_ip, self.cfg.cache_config.cache_queue_port), - authkey=b'cache_queue_service', + address=( + self.cfg.master_ip, + self.cfg.cache_config.cache_queue_port, + ), + authkey=b"cache_queue_service", is_server=True, num_client=self.cfg.tensor_parallel_size, client_id=-1, - local_data_parallel_size=self.cfg.parallel_config. - data_parallel_size) - + local_data_parallel_size=self.cfg.parallel_config.data_parallel_size, + ) self.engine_worker_queue = EngineWorkerQueue( address=address, @@ -1276,5 +1244,8 @@ class LLMEngine(object): num_client=self.cfg.tensor_parallel_size, client_id=0, local_data_parallel_size=self.cfg.parallel_config.data_parallel_size, - local_data_parallel_id= min(self.cfg.worker_num_per_node * self.cfg.node_rank, - self.cfg.parallel_config.data_parallel_size - 1)) + local_data_parallel_id=min( + self.cfg.worker_num_per_node * self.cfg.node_rank, + self.cfg.parallel_config.data_parallel_size - 1, + ), + ) diff --git a/fastdeploy/engine/expert_service.py b/fastdeploy/engine/expert_service.py index 66607da82..f2f5e9e17 100644 --- a/fastdeploy/engine/expert_service.py +++ b/fastdeploy/engine/expert_service.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + from __future__ import annotations import os @@ -32,7 +33,7 @@ from fastdeploy.splitwise.splitwise_connector import SplitwiseConnector from fastdeploy.utils import EngineError, console_logger, llm_logger -class ExpertService(object): +class ExpertService: """ Engine class responsible for managing the Large Language Model (LLM) operations. 
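The worker start-up progress reported above is scraped from the worker's stdout with two regular expressions. A minimal sketch of that matching logic, using made-up log lines, an assumed layer count, and `int()` in place of `eval()`:

```python
import re

# Hypothetical stdout lines a worker might emit while loading a model.
sample_lines = [
    "Loading safetensors checkpoint shards:  42",
    "Start load layer 17",
]
num_layers = 48  # assumed model depth, for illustration only

for line in sample_lines:
    if match := re.search(r"Loading (?:fastsafetensors |safetensors )?checkpoint shards:\s*(\d+)", line):
        # weight loading is reported as a percentage in the log line itself
        print(f"weight loading: {int(match.group(1)) / 100:.0%}")
    elif match := re.search(r"Start load layer (\d+)", line):
        # layer loading progress is the layer index over the total layer count
        print(f"layer loading: {int(match.group(1)) / num_layers:.0%}")
```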
@@ -51,17 +52,14 @@ class ExpertService(object): self.cfg = cfg start_pos = (local_data_parallel_id * self.cfg.tensor_parallel_size) % self.cfg.worker_num_per_node end_pos = ((local_data_parallel_id + 1) * self.cfg.tensor_parallel_size) % self.cfg.worker_num_per_node - self.cfg.cache_config.rdma_comm_ports = self.cfg.cache_config.rdma_comm_ports[ - start_pos:end_pos] - self.cfg.local_device_ids = self.cfg.device_ids.split( - ",")[start_pos:end_pos] + self.cfg.cache_config.rdma_comm_ports = self.cfg.cache_config.rdma_comm_ports[start_pos:end_pos] + self.cfg.local_device_ids = self.cfg.device_ids.split(",")[start_pos:end_pos] self.cfg.parallel_config.local_data_parallel_id = local_data_parallel_id self.cfg.disaggregate_info = None self.scheduler = cfg.scheduler_config.scheduler() - self.scheduler.reset_nodeid( - f"{self.scheduler.infer.nodeid}_{str(local_data_parallel_id)}") + self.scheduler.reset_nodeid(f"{self.scheduler.infer.nodeid}_{local_data_parallel_id!s}") self.cfg.parallel_config.local_data_parallel_id = local_data_parallel_id @@ -73,33 +71,41 @@ class ExpertService(object): num_client=cfg.tensor_parallel_size, local_data_parallel_id=local_data_parallel_id, ) - self.resource_manager = ResourceManager(cfg.max_num_seqs, cfg, \ - cfg.tensor_parallel_size, cfg.splitwise_role, local_data_parallel_id) + self.resource_manager = ResourceManager( + cfg.max_num_seqs, + cfg, + cfg.tensor_parallel_size, + cfg.splitwise_role, + local_data_parallel_id, + ) if len(self.cfg.cache_config.pd_comm_port) == 1: - self.cfg.cache_config.pd_comm_port[0] = int( - self.cfg.cache_config.pd_comm_port[0]) + local_data_parallel_id + self.cfg.cache_config.pd_comm_port[0] = int(self.cfg.cache_config.pd_comm_port[0]) + local_data_parallel_id else: - self.cfg.cache_config.pd_comm_port = [ - self.cfg.cache_config.pd_comm_port[local_data_parallel_id] - ] + self.cfg.cache_config.pd_comm_port = [self.cfg.cache_config.pd_comm_port[local_data_parallel_id]] - self.split_connector = SplitwiseConnector(self.cfg, self.scheduler, - self.engine_worker_queue, - self.resource_manager) + self.split_connector = SplitwiseConnector( + self.cfg, + self.scheduler, + self.engine_worker_queue, + self.resource_manager, + ) self.token_processor = TokenProcessor( cfg=cfg, cached_generated_tokens=self.scheduler, engine_worker_queue=self.engine_worker_queue, - split_connector=self.split_connector) + split_connector=self.split_connector, + ) self.token_processor.set_resource_manager(self.resource_manager) - self.partial_chunked_tokens = [0] * ( - self.cfg.max_num_partial_prefills + 1) + self.partial_chunked_tokens = [0] * (self.cfg.max_num_partial_prefills + 1) for idx in range(1, self.cfg.max_num_partial_prefills + 1): - self.partial_chunked_tokens[idx] = (self.cfg.max_num_batched_tokens // idx) \ - // self.cfg.cache_config.block_size * self.cfg.cache_config.block_size + self.partial_chunked_tokens[idx] = ( + (self.cfg.max_num_batched_tokens // idx) + // self.cfg.cache_config.block_size + * self.cfg.cache_config.block_size + ) self._finalizer = weakref.finalize(self, self._exit_sub_services) @@ -120,17 +126,15 @@ class ExpertService(object): device_ids=self.cfg.local_device_ids, pod_ip=self.cfg.master_ip, engine_worker_queue_port=self.cfg.engine_worker_queue_port, - pid_suffix=f"{local_data_parallel_id}_{ipc_signal_suffix}" + pid_suffix=f"{local_data_parallel_id}_{ipc_signal_suffix}", ) - self.insert_task_to_worker_thread = threading.Thread( - target=self._insert_task_to_worker, args=()) + self.insert_task_to_worker_thread = 
threading.Thread(target=self._insert_task_to_worker, args=()) self.insert_task_to_worker_thread.daemon = True self.insert_task_to_worker_thread.start() # Start TokenProcessor thread - os.environ["INFERENCE_MSG_QUEUE_ID"] = str( - local_data_parallel_id + int(self.cfg.engine_worker_queue_port)) + os.environ["INFERENCE_MSG_QUEUE_ID"] = str(local_data_parallel_id + int(self.cfg.engine_worker_queue_port)) self.token_processor.run() @@ -144,9 +148,7 @@ class ExpertService(object): self.scheduler.start(role, host_ip, disaggregate) self.cfg.print() - console_logger.info( - "Worker processes are launched with {} seconds.".format( - time.time() - start_time)) + console_logger.info(f"Worker processes are launched with {time.time() - start_time} seconds.") return True def _insert_task_to_worker(self): @@ -169,17 +171,17 @@ class ExpertService(object): num_prefill_batch = min( int(self.resource_manager.available_batch()), - self.cfg.max_prefill_batch) + self.cfg.max_prefill_batch, + ) self.resource_manager.check_and_free_block_tables() tasks = self.scheduler.get_requests( - available_blocks=self.resource_manager.available_block_num( - ), + available_blocks=self.resource_manager.available_block_num(), block_size=self.cfg.cache_config.block_size, - reserved_output_blocks=self.cfg.cache_config. - enc_dec_block_num, + reserved_output_blocks=self.cfg.cache_config.enc_dec_block_num, max_num_batched_tokens=self.cfg.max_num_batched_tokens, - batch=num_prefill_batch) + batch=num_prefill_batch, + ) if len(tasks) == 0: time.sleep(0.001) @@ -187,8 +189,7 @@ class ExpertService(object): if self.cfg.splitwise_role != "mixed": llm_logger.info("Inserting splitwise tasks") - self.split_connector.send_splitwise_tasks( - tasks, current_id) + self.split_connector.send_splitwise_tasks(tasks, current_id) current_id = (current_id + 1) % 100003 @@ -197,8 +198,7 @@ class ExpertService(object): main_process_metrics.num_requests_waiting.dec(len(tasks)) main_process_metrics.num_requests_running.inc(len(tasks)) except Exception as e: - err_msg = "Error happend while insert task to engine: {}, {}.".format( - e, str(traceback.format_exc())) + err_msg = f"Error happend while insert task to engine: {e}, {traceback.format_exc()!s}." 
llm_logger.error(err_msg) def split_mode_get_tasks(self): @@ -212,15 +212,13 @@ class ExpertService(object): try: if len(waiting_requests) > 0: for task in waiting_requests: - if self.resource_manager.is_resource_sufficient( - task.prompt_token_ids_len): + if self.resource_manager.is_resource_sufficient(task.prompt_token_ids_len): self.insert_tasks([task]) waiting_requests.remove(task) else: break if not self.engine_worker_queue.disaggregate_queue_empty(): - items = self.engine_worker_queue.get_disaggregated_tasks( - ) + items = self.engine_worker_queue.get_disaggregated_tasks() for item in items: role = item[0] tasks = item[1] @@ -231,7 +229,7 @@ class ExpertService(object): self.insert_tasks(tasks) elif role == "decode": llm_logger.info(f"get decode tasks {tasks}") - if hasattr(tasks[0], 'finished'): + if hasattr(tasks[0], "finished"): if not isinstance(tasks, list): tasks = [tasks] for task in tasks: @@ -246,7 +244,8 @@ class ExpertService(object): else: for task in tasks: if not self.resource_manager.is_resource_sufficient( - task.prompt_token_ids_len): + task.prompt_token_ids_len + ): waiting_requests.append(task) else: self.insert_tasks([task]) @@ -274,8 +273,7 @@ class ExpertService(object): self.resource_manager.tasks_list[cur_task_idx] = None self.resource_manager._recycle_block_tables(cur_task) if task.request_id in self.token_processor.tokens_counter: - del self.token_processor.tokens_counter[ - task.request_id] + del self.token_processor.tokens_counter[task.request_id] self.scheduler.put_results([task]) llm_logger.warning( f"{task.request_id} prefill failed with msg:{task.error_msg}, recycle resource." @@ -285,8 +283,7 @@ class ExpertService(object): cur_task.prompt_token_ids[0] = task.outputs.token_ids[0] self.token_processor.tokens_counter[task.request_id] = 1 current_tasks.append(cur_task) - self.engine_worker_queue.put_tasks( - (current_tasks, self.resource_manager.real_bsz)) + self.engine_worker_queue.put_tasks((current_tasks, self.resource_manager.real_bsz)) return True self.resource_manager.check_and_free_block_tables() @@ -299,9 +296,7 @@ class ExpertService(object): available_batch = np.sum(self.resource_manager.stop_flags) if len(tasks) > available_batch: - llm_logger.error( - "Inserting batch:{} exceeds the available batch:{}.".format( - len(tasks), available_batch)) + llm_logger.error(f"Inserting batch:{len(tasks)} exceeds the available batch:{available_batch}.") llm_logger.error("The exceeded part will be ignored!") tasks = tasks[:available_batch] @@ -325,8 +320,7 @@ class ExpertService(object): is_decode = True else: is_prefill = True - self.token_processor.number_of_input_tokens += tasks[ - i].prompt_token_ids_len + self.token_processor.number_of_input_tokens += tasks[i].prompt_token_ids_len self.split_connector.send_cache_infos(tasks, current_id) for task in tasks: @@ -338,8 +332,7 @@ class ExpertService(object): self.update_requests_chunk_size(tasks) else: self.update_mm_requests_chunk_size(tasks) - self.engine_worker_queue.put_tasks( - (tasks, self.resource_manager.real_bsz)) + self.engine_worker_queue.put_tasks((tasks, self.resource_manager.real_bsz)) return True def _exit_sub_services(self): @@ -348,8 +341,7 @@ class ExpertService(object): """ if hasattr(self, "cache_manager_processes"): - self.resource_manager.cache_manager.shm_cache_task_flag_broadcast.clear( - ) + self.resource_manager.cache_manager.shm_cache_task_flag_broadcast.clear() self.resource_manager.cache_manager.cache_ready_signal.clear() for p in self.cache_manager_processes: 
llm_logger.info(f"Killing cache manager process {p.pid}") diff --git a/fastdeploy/engine/kv_cache_interface.py b/fastdeploy/engine/kv_cache_interface.py index 5f9479cf5..a872fc8fa 100644 --- a/fastdeploy/engine/kv_cache_interface.py +++ b/fastdeploy/engine/kv_cache_interface.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + import copy from dataclasses import dataclass from typing import list @@ -25,6 +26,7 @@ class KVCacheSpec: """ A base class for specifying the KV cache format of one layer. """ + # number of tokens in a block block_size: int # the memory size used by each block in bytes. @@ -37,10 +39,9 @@ class KVCacheSpec: """ # check list assert all( - (spec.block_size == specs[0].block_size - and spec.block_memory_used == specs[0].block_memory_used) - for spec in specs[1:]), ( - "All layers in the model must share the same block_size.") + (spec.block_size == specs[0].block_size and spec.block_memory_used == specs[0].block_memory_used) + for spec in specs[1:] + ), "All layers in the model must share the same block_size." return copy.deepcopy(specs[0]) @@ -48,6 +49,7 @@ class KVCacheSpec: @dataclass class AttentionSpec(KVCacheSpec): """ """ + num_kv_heads: int head_size: int dtype: str diff --git a/fastdeploy/engine/request.py b/fastdeploy/engine/request.py index 8b5c7a005..541254ed6 100644 --- a/fastdeploy/engine/request.py +++ b/fastdeploy/engine/request.py @@ -29,34 +29,35 @@ from fastdeploy.worker.output import LogprobsLists @dataclass class Request: - - def __init__(self, - request_id: str, - prompt: Optional[Union[str, list[str]]], - prompt_token_ids: Optional[list[int]], - prompt_token_ids_len: Optional[int], - messages: Optional[list[list[dict[str, Any]]]], - history: Optional[list[list[str]]], - tools: Optional[list[Dict]], - system: Optional[Union[str, list[str]]], - sampling_params: SamplingParams, - eos_token_ids: Optional[list[int]], - arrival_time: float, - preprocess_start_time: Optional[float] = None, - preprocess_end_time: Optional[float] = None, - multimodal_inputs: Optional[dict] = None, - multimodal_data: Optional[dict] = None, - raw_request: bool = True, - disaggregate_info: Optional[dict] = None, - draft_token_ids: Optional[list[int]] = None, - guided_json: Optional[Any] = None, - guided_regex: Optional[Any] = None, - guided_choice: Optional[Any] = None, - guided_grammar: Optional[Any] = None, - structural_tag: Optional[Any] = None, - guided_json_object: Optional[bool] = None, - enable_thinking: Optional[bool] = True, - trace_carrier: dict = dict()) -> None: + def __init__( + self, + request_id: str, + prompt: Optional[Union[str, list[str]]], + prompt_token_ids: Optional[list[int]], + prompt_token_ids_len: Optional[int], + messages: Optional[list[list[dict[str, Any]]]], + history: Optional[list[list[str]]], + tools: Optional[list[Dict]], + system: Optional[Union[str, list[str]]], + sampling_params: SamplingParams, + eos_token_ids: Optional[list[int]], + arrival_time: float, + preprocess_start_time: Optional[float] = None, + preprocess_end_time: Optional[float] = None, + multimodal_inputs: Optional[dict] = None, + multimodal_data: Optional[dict] = None, + raw_request: bool = True, + disaggregate_info: Optional[dict] = None, + draft_token_ids: Optional[list[int]] = None, + guided_json: Optional[Any] = None, + guided_regex: Optional[Any] = None, + guided_choice: Optional[Any] = None, + guided_grammar: Optional[Any] = None, + structural_tag: Optional[Any] = None, + guided_json_object: 
Optional[bool] = None, + enable_thinking: Optional[bool] = True, + trace_carrier: dict = dict(), + ) -> None: self.request_id = request_id self.prompt = prompt self.prompt_token_ids = prompt_token_ids @@ -98,35 +99,37 @@ class Request: def from_dict(cls, d: dict): data_processor_logger.debug(f"{d}") sampling_params = SamplingParams.from_dict(d) - return cls(request_id=d["request_id"], - prompt=d.get("prompt"), - prompt_token_ids=d.get("prompt_token_ids"), - prompt_token_ids_len=d.get("prompt_token_ids_len"), - messages=d.get("messages"), - system=d.get("system"), - history=d.get("history"), - tools=d.get("tools"), - sampling_params=sampling_params, - eos_token_ids=d.get("eos_token_ids"), - arrival_time=d.get("arrival_time", time.time()), - preprocess_start_time=d.get("preprocess_start_time"), - preprocess_end_time=d.get("preprocess_end_time"), - multimodal_inputs=d.get("multimodal_inputs"), - multimodal_data=d.get("multimodal_data"), - disaggregate_info=d.get("disaggregate_info"), - draft_token_ids=d.get("draft_token_ids"), - raw_request=d.get("raw_request", True), - guided_json=d.get("guided_json", None), - guided_regex=d.get("guided_regex", None), - guided_choice=d.get("guided_choice", None), - guided_grammar=d.get("guided_grammar", None), - structural_tag=d.get("structural_tag", None), - guided_json_object=d.get("guided_json_object", None), - enable_thinking=d.get("enable_thinking", True), - trace_carrier=d.get("trace_carrier", {})) + return cls( + request_id=d["request_id"], + prompt=d.get("prompt"), + prompt_token_ids=d.get("prompt_token_ids"), + prompt_token_ids_len=d.get("prompt_token_ids_len"), + messages=d.get("messages"), + system=d.get("system"), + history=d.get("history"), + tools=d.get("tools"), + sampling_params=sampling_params, + eos_token_ids=d.get("eos_token_ids"), + arrival_time=d.get("arrival_time", time.time()), + preprocess_start_time=d.get("preprocess_start_time"), + preprocess_end_time=d.get("preprocess_end_time"), + multimodal_inputs=d.get("multimodal_inputs"), + multimodal_data=d.get("multimodal_data"), + disaggregate_info=d.get("disaggregate_info"), + draft_token_ids=d.get("draft_token_ids"), + raw_request=d.get("raw_request", True), + guided_json=d.get("guided_json", None), + guided_regex=d.get("guided_regex", None), + guided_choice=d.get("guided_choice", None), + guided_grammar=d.get("guided_grammar", None), + structural_tag=d.get("structural_tag", None), + guided_json_object=d.get("guided_json_object", None), + enable_thinking=d.get("enable_thinking", True), + trace_carrier=d.get("trace_carrier", {}), + ) def to_dict(self) -> dict: - """convert Request into a serializable dict """ + """convert Request into a serializable dict""" data = { "request_id": self.request_id, "prompt": self.prompt, @@ -146,11 +149,15 @@ class Request: "disaggregate_info": self.disaggregate_info, "draft_token_ids": self.draft_token_ids, "enable_thinking": self.enable_thinking, - "trace_carrier": self.trace_carrier + "trace_carrier": self.trace_carrier, } add_params = [ - "guided_json", "guided_regex", "guided_choice", "guided_grammar", - "structural_tag", "guided_json_object" + "guided_json", + "guided_regex", + "guided_choice", + "guided_grammar", + "structural_tag", + "guided_json_object", ] for param in add_params: if getattr(self, param, None) is not None: @@ -174,11 +181,13 @@ class Request: setattr(self, key, value) def __repr__(self) -> str: - return (f"Request(request_id={self.request_id}, " - f"prompt={self.prompt!r}, " - f"prompt_token_ids={self.prompt_token_ids}, " - 
f"draft_token_ids={self.draft_token_ids}, " - f"sampling_params={self.sampling_params})") + return ( + f"Request(request_id={self.request_id}, " + f"prompt={self.prompt!r}, " + f"prompt_token_ids={self.prompt_token_ids}, " + f"draft_token_ids={self.draft_token_ids}, " + f"sampling_params={self.sampling_params})" + ) @dataclass(slots=True) @@ -202,7 +211,7 @@ class CompletionOutput: def to_dict(self): """ - convert CompletionOutput to a serialized dict + convert CompletionOutput to a serialized dict """ return { "index": self.index, @@ -212,27 +221,28 @@ class CompletionOutput: "top_logprobs": self.top_logprobs, "draft_token_ids": self.draft_token_ids, "text": self.text, - "reasoning_content": self.reasoning_content + "reasoning_content": self.reasoning_content, } @classmethod - def from_dict(cls, req_dict: dict[str, Any]) -> 'CompletionOutput': + def from_dict(cls, req_dict: dict[str, Any]) -> CompletionOutput: """Create instance from dict arguments""" return cls( **{ - field.name: - req_dict[field.name] if field.name in - req_dict else field.default + field.name: (req_dict[field.name] if field.name in req_dict else field.default) for field in fields(cls) - }) + } + ) def __repr__(self) -> str: - return (f"CompletionOutput(index={self.index}, " - f"send_idx={self.send_idx}, " - f"text={self.text!r}, " - f"token_ids={self.token_ids}, " - f"draft_token_ids={self.draft_token_ids}, " - f"reasoning_content={self.reasoning_content!r}") + return ( + f"CompletionOutput(index={self.index}, " + f"send_idx={self.send_idx}, " + f"text={self.text!r}, " + f"token_ids={self.token_ids}, " + f"draft_token_ids={self.draft_token_ids}, " + f"reasoning_content={self.reasoning_content!r}" + ) @dataclass(slots=True) @@ -252,6 +262,7 @@ class RequestMetrics: request_start_time: Time to accept the request """ + arrival_time: float inference_start_time: Optional[float] = None first_token_time: Optional[float] = None @@ -273,19 +284,18 @@ class RequestMetrics: "preprocess_cost_time": self.preprocess_cost_time, "model_forward_time": self.model_forward_time, "model_execute_time": self.model_execute_time, - "request_start_time": self.request_start_time + "request_start_time": self.request_start_time, } @classmethod - def from_dict(cls, req_dict: dict[str, Any]) -> 'RequestMetrics': + def from_dict(cls, req_dict: dict[str, Any]) -> RequestMetrics: """Create instance from dict arguments""" return cls( **{ - field.name: - req_dict[field.name] if field.name in - req_dict else field.default + field.name: (req_dict[field.name] if field.name in req_dict else field.default) for field in fields(cls) - }) + } + ) class RequestOutput: @@ -333,13 +343,12 @@ class RequestOutput: self.error_code = error_code self.error_msg = error_msg - if prompt_token_ids is None: self.prompt_token_ids = [] elif isinstance(self.prompt_token_ids, np.ndarray): self.prompt_token_ids = self.prompt_token_ids.tolist() - def add(self, next_output: "RequestOutput") -> None: + def add(self, next_output: RequestOutput) -> None: """Merge RequestOutput into this one""" self.prompt = next_output.prompt @@ -348,19 +357,19 @@ class RequestOutput: self.outputs.index = next_output.outputs.index self.outputs.token_ids.extend(next_output.outputs.token_ids) if next_output.metrics.arrival_time is not None and self.metrics.inference_start_time is not None: - self.metrics.model_forward_time = next_output.metrics.arrival_time - \ - self.metrics.inference_start_time + self.metrics.model_forward_time = next_output.metrics.arrival_time - self.metrics.inference_start_time if 
next_output.metrics.arrival_time is not None and self.metrics.arrival_time is not None: - self.metrics.model_execute_time = next_output.metrics.arrival_time - \ - self.metrics.arrival_time + self.metrics.model_execute_time = next_output.metrics.arrival_time - self.metrics.arrival_time def __repr__(self) -> str: - return (f"RequestOutput(request_id={self.request_id}, " - f"prompt={self.prompt!r}, " - f"prompt_token_ids={self.prompt_token_ids}, " - f"outputs={self.outputs}, " - f"metrics={self.metrics}, " - f"num_cached_tokens={self.num_cached_tokens})") + return ( + f"RequestOutput(request_id={self.request_id}, " + f"prompt={self.prompt!r}, " + f"prompt_token_ids={self.prompt_token_ids}, " + f"outputs={self.outputs}, " + f"metrics={self.metrics}, " + f"num_cached_tokens={self.num_cached_tokens})" + ) @classmethod def from_dict(cls, d: dict): @@ -370,16 +379,14 @@ class RequestOutput: return RequestOutput(**d, outputs=completion_output, metrics=metrics) def to_dict(self): - """convert RequestOutput into a serializable dict """ + """convert RequestOutput into a serializable dict""" return { "request_id": self.request_id, "prompt": self.prompt, "prompt_token_ids": self.prompt_token_ids, - "outputs": - None if self.outputs is None else self.outputs.to_dict(), - "metrics": - None if self.metrics is None else self.metrics.to_dict(), + "outputs": None if self.outputs is None else self.outputs.to_dict(), + "metrics": None if self.metrics is None else self.metrics.to_dict(), "finished": self.finished, "num_cached_tokens": self.num_cached_tokens, "error_code": self.error_code, diff --git a/fastdeploy/engine/resource_manager.py b/fastdeploy/engine/resource_manager.py index 37962e0f8..3b83306de 100644 --- a/fastdeploy/engine/resource_manager.py +++ b/fastdeploy/engine/resource_manager.py @@ -25,17 +25,19 @@ from fastdeploy.metrics.metrics import main_process_metrics from fastdeploy.utils import llm_logger -class ResourceManager(object): +class ResourceManager: """ record and allocate resources for the engine """ - def __init__(self, - max_num_seqs, - config, - tensor_parallel_size, - splitwise_role, - local_data_parallel_id=0): + def __init__( + self, + max_num_seqs, + config, + tensor_parallel_size, + splitwise_role, + local_data_parallel_id=0, + ): """ Args: cfg (Config): config object containing parameters for the engine @@ -51,9 +53,7 @@ class ResourceManager(object): self.max_num_seqs = max_num_seqs self.stop_flags = [True] * max_num_seqs self.enable_prefix_cache = config.cache_config.enable_prefix_caching - self.cache_manager = PrefixCacheManager(config, tensor_parallel_size, - splitwise_role, - local_data_parallel_id) + self.cache_manager = PrefixCacheManager(config, tensor_parallel_size, splitwise_role, local_data_parallel_id) self.tasks_list = [None] * max_num_seqs self.req_dict = dict() # current batch status of the engine @@ -77,8 +77,7 @@ class ResourceManager(object): Returns: int: block number """ - block_num = (input_token_num + self.cfg.block_size - 1 + - self.cfg.dec_token_num) // self.cfg.block_size + block_num = (input_token_num + self.cfg.block_size - 1 + self.cfg.dec_token_num) // self.cfg.block_size return block_num def get_encoder_block_number(self, input_token_num): @@ -91,8 +90,7 @@ class ResourceManager(object): Returns: int: encoder block number """ - enc_block_num = (input_token_num + self.cfg.block_size - - 1) // self.cfg.block_size + enc_block_num = (input_token_num + self.cfg.block_size - 1) // self.cfg.block_size return enc_block_num def get_decoder_block_number(self): 
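The block accounting in `ResourceManager` above is plain ceiling division over the block size. A worked sketch with assumed values (`block_size=64` and `dec_token_num=32` are illustrative, not defaults from the config):

```python
# Mirrors get_required_block_number / get_encoder_block_number / get_decoder_block_number.
block_size = 64         # tokens held by one KV-cache block (assumed)
dec_token_num = 32      # tokens reserved for decoding (assumed)
input_token_num = 1000  # prompt length

total_blocks = (input_token_num + block_size - 1 + dec_token_num) // block_size  # 17
encoder_blocks = (input_token_num + block_size - 1) // block_size                # 16
decoder_blocks = (dec_token_num + block_size - 1) // block_size                  # 1

print(total_blocks, encoder_blocks, decoder_blocks)
```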
@@ -102,8 +100,7 @@ class ResourceManager(object): Returns: int: decoder block number """ - return (self.cfg.dec_token_num + self.cfg.block_size - - 1) // self.cfg.block_size + return (self.cfg.dec_token_num + self.cfg.block_size - 1) // self.cfg.block_size def total_block_number(self): """ @@ -132,13 +129,12 @@ class ResourceManager(object): elif required_type == "decoder": block_num = self.get_decoder_block_number() else: - raise ValueError('unknown required type') + raise ValueError("unknown required type") block_list = list() current_block_num = self.available_block_num() if block_num > current_block_num: - llm_logger.error("block_num:{0} > free_list len:{1}".format( - block_num, current_block_num)) + llm_logger.error(f"block_num:{block_num} > free_list len:{current_block_num}") return block_list block_list = self.cache_manager.allocate_gpu_blocks(block_num) llm_logger.debug(f"dispatch {len(block_list)} blocks.") @@ -172,10 +168,8 @@ class ResourceManager(object): ori_number = self.available_block_num() self.cache_manager.recycle_gpu_blocks(block_tables) cur_number = self.available_block_num() - main_process_metrics.gpu_cache_usage_perc.set( - self.get_gpu_cache_usage_perc()) - llm_logger.info( - f"recycle {req_id} {cur_number - ori_number} blocks.") + main_process_metrics.gpu_cache_usage_perc.set(self.get_gpu_cache_usage_perc()) + llm_logger.info(f"recycle {req_id} {cur_number - ori_number} blocks.") def available_batch(self): """ @@ -238,8 +232,7 @@ class ResourceManager(object): can_insert = False while allocated_position + 1 <= self.max_num_seqs: - if sum(self.stop_flags[allocated_position:allocated_position + - 1]) == 1: + if sum(self.stop_flags[allocated_position : allocated_position + 1]) == 1: can_insert = True break allocated_position += 1 @@ -249,72 +242,63 @@ class ResourceManager(object): task = tasks[processing_task_index] if task.get("seed") is None: - task.set("seed", - random.randint(0, 9223372036854775807)) + task.set("seed", random.randint(0, 9223372036854775807)) task.idx = allocated_position if self.enable_prefix_cache: cache_prepare_time = time.time() common_block_ids, unique_block_ids, hit_info = self.cache_manager.request_block_ids( - task, self.cfg.block_size, self.cfg.dec_token_num) + task, + self.cfg.block_size, + self.cfg.dec_token_num, + ) if unique_block_ids is None: - llm_logger.warning( - "req_id: {0} not enough blocks available". 
- format(task["req_id"])) + llm_logger.warning("req_id: {0} not enough blocks available".format(task["req_id"])) return cached_len = self._record_request_cache_info( - task, common_block_ids, unique_block_ids, hit_info) - task.cache_prepare_time = time.time( - ) - cache_prepare_time + task, common_block_ids, unique_block_ids, hit_info + ) + task.cache_prepare_time = time.time() - cache_prepare_time if task.disaggregate_info is not None: - if task.disaggregate_info['role'] == "prefill": - self.req_dict[ - task.request_id] = allocated_position - task.disaggregate_info[ - 'block_tables'] = task.block_tables + if task.disaggregate_info["role"] == "prefill": + self.req_dict[task.request_id] = allocated_position + task.disaggregate_info["block_tables"] = task.block_tables self._delete_cached_data(task, cached_len) - elif task.disaggregate_info['role'] == "decode": - self.req_dict[ - task.request_id] = allocated_position - task.disaggregate_info[ - 'block_tables'] = task.need_block_tables + elif task.disaggregate_info["role"] == "decode": + self.req_dict[task.request_id] = allocated_position + task.disaggregate_info["block_tables"] = task.need_block_tables else: self._delete_cached_data(task, cached_len) else: - block_tables = self._get_block_tables( - task.prompt_token_ids_len) + block_tables = self._get_block_tables(task.prompt_token_ids_len) if not block_tables: - llm_logger.error( - "req_id: {0} block_tables is empty".format( - task.request_id)) + llm_logger.error(f"req_id: {task.request_id} block_tables is empty") continue else: task.block_tables = block_tables task.need_block_tables = task.block_tables if task.disaggregate_info is not None: - task.disaggregate_info[ - 'block_tables'] = block_tables - if task.disaggregate_info['role'] == "prefill": - self.req_dict[ - task.request_id] = allocated_position - elif task.disaggregate_info['role'] == "decode": - self.req_dict[ - task.request_id] = allocated_position + task.disaggregate_info["block_tables"] = block_tables + if task.disaggregate_info["role"] == "prefill": + self.req_dict[task.request_id] = allocated_position + elif task.disaggregate_info["role"] == "decode": + self.req_dict[task.request_id] = allocated_position processed_tasks.append(task) self.stop_flags[allocated_position] = False task.inference_start_time = time.time() task.inference_time_cost = -1.0 - task.tokens_all_num = int(0) + task.tokens_all_num = 0 self.tasks_list[allocated_position] = task llm_logger.info( f"Allocate request: {task.request_id}, " f"allocated_position:{allocated_position}, " - f"length of prompt token: {task.prompt_token_ids_len}") + f"length of prompt token: {task.prompt_token_ids_len}" + ) allocated_position += 1 processing_task_index += 1 @@ -325,11 +309,10 @@ class ResourceManager(object): break llm_logger.info( - f"Number of allocated requests: {len(tasks)}, number of " - f"running requests in worker: {self.real_bsz}") + f"Number of allocated requests: {len(tasks)}, number of " f"running requests in worker: {self.real_bsz}" + ) llm_logger.info(f"{self.info()}") - main_process_metrics.gpu_cache_usage_perc.set( - self.get_gpu_cache_usage_perc()) + main_process_metrics.gpu_cache_usage_perc.set(self.get_gpu_cache_usage_perc()) return processed_tasks @@ -338,26 +321,22 @@ class ResourceManager(object): Delete cached data from the task's prompt token ids based on the cached length. 
""" if cached_len == len(task.prompt_token_ids): - task.prompt_token_ids = task.prompt_token_ids[cached_len - 1:] + task.prompt_token_ids = task.prompt_token_ids[cached_len - 1 :] task.seq_lens_decoder = cached_len - 1 else: task.prompt_token_ids = task.prompt_token_ids[cached_len:] task.seq_lens_decoder = cached_len task.prompt_token_ids_len = len(task.prompt_token_ids) - def _record_request_cache_info(self, task, common_block_ids, - unique_block_ids, hit_info): + def _record_request_cache_info(self, task, common_block_ids, unique_block_ids, hit_info): """ Record the cache information for a given task and its corresponding block IDs. """ cache_block_num = len(common_block_ids) - no_cache_block_num = math.ceil(len(task.prompt_token_ids) / self.cfg.block_size \ - - cache_block_num) + no_cache_block_num = math.ceil(len(task.prompt_token_ids) / self.cfg.block_size - cache_block_num) task.num_cached_tokens = cache_block_num * self.cfg.block_size - task.gpu_cache_token_num = hit_info[ - "gpu_cache_blocks"] * self.cfg.block_size - task.cpu_cache_token_num = hit_info[ - "cpu_cache_blocks"] * self.cfg.block_size + task.gpu_cache_token_num = hit_info["gpu_cache_blocks"] * self.cfg.block_size + task.cpu_cache_token_num = hit_info["cpu_cache_blocks"] * self.cfg.block_size task.cache_info = (cache_block_num, no_cache_block_num) cached_len = len(common_block_ids) * self.cfg.block_size @@ -374,9 +353,11 @@ class ResourceManager(object): Returns: str: resource manager info """ - info = f"ResourceManager info, " \ - f"total_block_number: {self.total_block_number()}, total_batch_number: {len(self.stop_flags)}, " \ - f"available_block_num: {self.available_block_num()}, available_batch: {self.available_batch()}" + info = ( + f"ResourceManager info, " + f"total_block_number: {self.total_block_number()}, total_batch_number: {len(self.stop_flags)}, " + f"available_block_num: {self.available_block_num()}, available_batch: {self.available_batch()}" + ) return info def get_gpu_cache_usage_perc(self): diff --git a/fastdeploy/engine/sampling_params.py b/fastdeploy/engine/sampling_params.py index d81f9f999..336afcc8d 100644 --- a/fastdeploy/engine/sampling_params.py +++ b/fastdeploy/engine/sampling_params.py @@ -94,54 +94,54 @@ class SamplingParams: bad_words: Optional[List[str]] = None @classmethod - def from_dict(cls, req_dict: dict[str, Any]) -> "SamplingParams": + def from_dict(cls, req_dict: dict[str, Any]) -> SamplingParams: """Create instance from command line arguments""" return cls( **{ - field.name: - req_dict[field.name] if field.name in - req_dict else field.default + field.name: (req_dict[field.name] if field.name in req_dict else field.default) for field in fields(cls) - }) + } + ) @classmethod - def from_optional(cls, - n, - best_of, - presence_penalty, - frequency_penalty, - repetition_penalty, - temperature, - top_p, - top_k, - seed=None, - stop=None, - stop_token_ids=None, - max_tokens=None, - reasoning_max_tokens=None, - min_tokens=1, - logprobs=None, - bad_words=None) -> "SamplingParams": + def from_optional( + cls, + n, + best_of, + presence_penalty, + frequency_penalty, + repetition_penalty, + temperature, + top_p, + top_k, + seed=None, + stop=None, + stop_token_ids=None, + max_tokens=None, + reasoning_max_tokens=None, + min_tokens=1, + logprobs=None, + bad_words=None, + ) -> SamplingParams: """Create instance from command line arguments""" - return cls(n=1 if n is None else n, - best_of=best_of, - presence_penalty=presence_penalty - if presence_penalty is not None else 0.0, - 
frequency_penalty=frequency_penalty - if frequency_penalty is not None else 0.0, - repetition_penalty=repetition_penalty - if repetition_penalty is not None else 1.0, - temperature=temperature if temperature is not None else 1.0, - top_p=top_p, - top_k=top_k if top_k is not None else 0, - seed=seed, - stop=stop, - stop_token_ids=stop_token_ids, - max_tokens=max_tokens if max_tokens is not None else 8192, - reasoning_max_tokens=reasoning_max_tokens, - min_tokens=min_tokens, - logprobs=logprobs, - bad_words=bad_words) + return cls( + n=1 if n is None else n, + best_of=best_of, + presence_penalty=(presence_penalty if presence_penalty is not None else 0.0), + frequency_penalty=(frequency_penalty if frequency_penalty is not None else 0.0), + repetition_penalty=(repetition_penalty if repetition_penalty is not None else 1.0), + temperature=temperature if temperature is not None else 1.0, + top_p=top_p, + top_k=top_k if top_k is not None else 0, + seed=seed, + stop=stop, + stop_token_ids=stop_token_ids, + max_tokens=max_tokens if max_tokens is not None else 8192, + reasoning_max_tokens=reasoning_max_tokens, + min_tokens=min_tokens, + logprobs=logprobs, + bad_words=bad_words, + ) def __post_init__(self): if self.seed is None: @@ -152,60 +152,44 @@ class SamplingParams: def _verify_args(self) -> None: if not isinstance(self.n, int): - raise ValueError( - f"n must be an int, but is of type {type(self.n)}") + raise ValueError(f"n must be an int, but is of type {type(self.n)}") if self.n < 1: raise ValueError(f"n must be at least 1, got {self.n}.") - if self.presence_penalty is not None and ( - not -2.0 <= self.presence_penalty <= 2.0): - raise ValueError("presence_penalty must be in [-2, 2], got " - f"{self.presence_penalty}.") - if self.frequency_penalty is not None and ( - not -2.0 <= self.frequency_penalty <= 2.0): - raise ValueError("frequency_penalty must be in [-2, 2], got " - f"{self.frequency_penalty}.") + if self.presence_penalty is not None and (not -2.0 <= self.presence_penalty <= 2.0): + raise ValueError("presence_penalty must be in [-2, 2], got " f"{self.presence_penalty}.") + if self.frequency_penalty is not None and (not -2.0 <= self.frequency_penalty <= 2.0): + raise ValueError("frequency_penalty must be in [-2, 2], got " f"{self.frequency_penalty}.") if self.repetition_penalty is not None and self.repetition_penalty <= 0.0: - raise ValueError( - "repetition_penalty must be greater than zero, got " - f"{self.repetition_penalty}.") + raise ValueError("repetition_penalty must be greater than zero, got " f"{self.repetition_penalty}.") if self.temperature is not None and self.temperature < 0.0: - raise ValueError( - f"temperature must be non-negative, got {self.temperature}.") + raise ValueError(f"temperature must be non-negative, got {self.temperature}.") if self.top_p is not None and not 0.0 <= self.top_p <= 1.0: raise ValueError(f"top_p must be in [0, 1], got {self.top_p}.") # quietly accept -1 as disabled, but prefer 0 if self.top_k < -1: - raise ValueError(f"top_k must be 0 (disable), or at least 1, " - f"got {self.top_k}.") + raise ValueError(f"top_k must be 0 (disable), or at least 1, " f"got {self.top_k}.") if not isinstance(self.top_k, int): - raise TypeError( - f"top_k must be an integer, got {type(self.top_k).__name__}") + raise TypeError(f"top_k must be an integer, got {type(self.top_k).__name__}") if self.max_tokens is not None and self.max_tokens < 1: - raise ValueError( - f"max_tokens must be at least 1, got {self.max_tokens}.") + raise ValueError(f"max_tokens must be at 
least 1, got {self.max_tokens}.") if self.reasoning_max_tokens is not None and self.reasoning_max_tokens > self.max_tokens: - raise ValueError( - f"reasoning_max_tokens must be less than max_tokens, got {self.reasoning_max_tokens}.") + raise ValueError(f"reasoning_max_tokens must be less than max_tokens, got {self.reasoning_max_tokens}.") if self.min_tokens < 0: - raise ValueError(f"min_tokens must be greater than or equal to 0, " - f"got {self.min_tokens}.") + raise ValueError(f"min_tokens must be greater than or equal to 0, " f"got {self.min_tokens}.") if self.max_tokens is not None and self.min_tokens > self.max_tokens: raise ValueError( - f"min_tokens must be less than or equal to " - f"max_tokens={self.max_tokens}, got {self.min_tokens}.") + f"min_tokens must be less than or equal to " f"max_tokens={self.max_tokens}, got {self.min_tokens}." + ) if self.logprobs is not None and self.logprobs < 0: - raise ValueError( - f"logprobs must be non-negative, got {self.logprobs}.") + raise ValueError(f"logprobs must be non-negative, got {self.logprobs}.") if self.logprobs is not None and self.logprobs > 20: - raise ValueError( - "Invalid value for 'top_logprobs': must be less than or equal to 20.") + raise ValueError("Invalid value for 'top_logprobs': must be less than or equal to 20.") if not 0 <= self.seed <= 922337203685477580: - raise ValueError("seed must be in [0, 922337203685477580], got " - f"{self.seed}.") + raise ValueError("seed must be in [0, 922337203685477580], got " f"{self.seed}.") def update_from_tokenizer(self, tokenizer): """ @@ -218,6 +202,7 @@ class SamplingParams: @dataclass class BeamSearchParams: """Beam search parameters for text generation.""" + beam_width: int max_tokens: int ignore_eos: bool = False diff --git a/fastdeploy/entrypoints/__init__.py b/fastdeploy/entrypoints/__init__.py index c40559bc8..f4ede9062 100644 --- a/fastdeploy/entrypoints/__init__.py +++ b/fastdeploy/entrypoints/__init__.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" \ No newline at end of file +""" diff --git a/fastdeploy/entrypoints/api_server.py b/fastdeploy/entrypoints/api_server.py index 9c2ce35c3..f27c00831 100644 --- a/fastdeploy/entrypoints/api_server.py +++ b/fastdeploy/entrypoints/api_server.py @@ -14,19 +14,25 @@ # limitations under the License. 
""" -import uvicorn import json + +import uvicorn from fastapi import FastAPI from fastapi.responses import Response, StreamingResponse -from fastdeploy.utils import FlexibleArgumentParser, api_server_logger, is_port_available from fastdeploy.engine.args_utils import EngineArgs from fastdeploy.engine.engine import LLMEngine +from fastdeploy.utils import ( + FlexibleArgumentParser, + api_server_logger, + is_port_available, +) app = FastAPI() llm_engine = None + def init_app(args): """ init LLMEngine @@ -39,7 +45,7 @@ def init_app(args): api_server_logger.error("Failed to initialize FastDeploy LLM engine, service exit now!") return False - api_server_logger.info(f"FastDeploy LLM engine initialized!") + api_server_logger.info("FastDeploy LLM engine initialized!") return True @@ -48,6 +54,7 @@ async def health() -> Response: """Health check.""" return Response(status_code=200) + @app.post("/generate") async def generate(request: dict): """ @@ -64,7 +71,7 @@ async def generate(request: dict): output = result except Exception as e: # 记录完整的异常堆栈信息 - api_server_logger.error(f"Error during generation: {str(e)}", exc_info=True) + api_server_logger.error(f"Error during generation: {e!s}", exc_info=True) # 返回结构化的错误消息并终止流 output = {"error": str(e), "error_type": e.__class__.__name__} return output @@ -76,12 +83,14 @@ async def generate(request: dict): yield f"data: {json.dumps(result)}\n\n" except Exception as e: # 记录完整的异常堆栈信息 - api_server_logger.error(f"Error during generation: {str(e)}", exc_info=True) + api_server_logger.error(f"Error during generation: {e!s}", exc_info=True) # 返回结构化的错误消息并终止流 error_msg = {"error": str(e), "error_type": e.__class__.__name__} - yield f"data: {json.dumps(error_msg)}\n\n" + yield f"data: {json.dumps(error_msg)}\n\n" + return StreamingResponse(event_generator(), media_type="text/event-stream") + def launch_api_server(args) -> None: """ 启动http服务 @@ -97,11 +106,13 @@ def launch_api_server(args) -> None: return try: - uvicorn.run(app=app, - host=args.host, - port=args.port, - workers=args.workers, - log_level="info") # set log level to error to avoid log + uvicorn.run( + app=app, + host=args.host, + port=args.port, + workers=args.workers, + log_level="info", + ) # set log level to error to avoid log except Exception as e: api_server_logger.error(f"launch sync http server error, {e}") @@ -115,7 +126,7 @@ def main(): parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() launch_api_server(args) - + if __name__ == "__main__": main() diff --git a/fastdeploy/entrypoints/chat_utils.py b/fastdeploy/entrypoints/chat_utils.py index 5bc3e1048..4f7357e11 100644 --- a/fastdeploy/entrypoints/chat_utils.py +++ b/fastdeploy/entrypoints/chat_utils.py @@ -14,35 +14,45 @@ # limitations under the License. 
""" -from typing import Literal, Union, List -from typing_extensions import Required, TypedDict, TypeAlias - -from openai.types.chat import ChatCompletionContentPartParam as OpenAIChatCompletionContentPartParam -from openai.types.chat import ChatCompletionMessageParam as OpenAIChatCompletionMessageParam - -from urllib.parse import urlparse -import requests from copy import deepcopy +from typing import List, Literal, Union +from urllib.parse import urlparse + +import requests +from openai.types.chat import ( + ChatCompletionContentPartParam as OpenAIChatCompletionContentPartParam, +) +from openai.types.chat import ( + ChatCompletionMessageParam as OpenAIChatCompletionMessageParam, +) +from typing_extensions import Required, TypeAlias, TypedDict -from fastdeploy.input.multimodal.video import VideoMediaIO from fastdeploy.input.multimodal.image import ImageMediaIO +from fastdeploy.input.multimodal.video import VideoMediaIO + class VideoURL(TypedDict, total=False): """Video URL object""" + url: Required[str] """Either a URL of the video or the base64 encoded video data""" + class CustomChatCompletionContentPartVideoParam(TypedDict, total=False): """Custom Video URL object""" + video_url: Required[VideoURL] type: Required[Literal["video_url"]] """The type of the content type.""" + CustomChatCompletionContentPartParam: TypeAlias = Union[ - OpenAIChatCompletionContentPartParam, CustomChatCompletionContentPartVideoParam + OpenAIChatCompletionContentPartParam, + CustomChatCompletionContentPartVideoParam, ] + class CustomChatCompletionMessageParam(TypedDict, total=False): """Custom User chat message parameter.""" @@ -58,17 +68,19 @@ class CustomChatCompletionMessageParam(TypedDict, total=False): Provides the model information to differentiate between participants of the same role. 
""" + ChatCompletionMessageParam = Union[OpenAIChatCompletionMessageParam, CustomChatCompletionMessageParam] -class MultiModalPartParser(object): +class MultiModalPartParser: """Multi Modal Part parser""" + def __init__(self): self.image_io = ImageMediaIO() self.video_io = VideoMediaIO() def parse_image(self, image_url): - """"Parse Image""" + """ "Parse Image""" return self.load_from_url(image_url, self.image_io) def parse_video(self, video_url): @@ -82,7 +94,7 @@ class MultiModalPartParser(object): if parsed.scheme.startswith("http"): media_bytes = requests.get(url).content return media_io.load_bytes(media_bytes) - + if parsed.scheme.startswith("data"): data_spec, data = parsed.path.split(",", 1) media_type, data_type = data_spec.split(";", 1) @@ -92,6 +104,7 @@ class MultiModalPartParser(object): localpath = parsed.path return media_io.load_file(localpath) + def parse_content_part(mm_parser, part): """only support openai compatible format for now""" @@ -120,8 +133,9 @@ def parse_content_part(mm_parser, part): raise ValueError(f"Unknown content part type: {part_type}") -#TODO async -#def parse_chat_messages(messages: List[ChatCompletionMessageParam]): + +# TODO async +# def parse_chat_messages(messages: List[ChatCompletionMessageParam]): def parse_chat_messages(messages): """Parse chat messages to [dict]""" @@ -141,4 +155,4 @@ def parse_chat_messages(messages): parsed_content = [parse_content_part(mm_parser, part) for part in content] conversation.append({"role": role, "content": parsed_content}) - return conversation \ No newline at end of file + return conversation diff --git a/fastdeploy/entrypoints/engine_client.py b/fastdeploy/entrypoints/engine_client.py index 9ff35d47b..8aa93e213 100644 --- a/fastdeploy/entrypoints/engine_client.py +++ b/fastdeploy/entrypoints/engine_client.py @@ -14,17 +14,15 @@ # limitations under the License. """ -import zmq import time -from random import randint import uuid + import numpy as np from fastdeploy.input.preprocess import InputPreprocessor -from fastdeploy.engine.request import Request -from fastdeploy.inter_communicator import ZmqClient, IPCSignal +from fastdeploy.inter_communicator import IPCSignal, ZmqClient from fastdeploy.metrics.work_metrics import work_process_metrics -from fastdeploy.utils import api_server_logger, EngineError +from fastdeploy.utils import EngineError, api_server_logger class EngineClient: @@ -32,23 +30,36 @@ class EngineClient: EngineClient is a class that handles the communication between the client and the server. 
""" - def __init__(self, tokenizer, max_model_len, tensor_parallel_size, pid, limit_mm_per_prompt, mm_processor_kwargs, - enable_mm=False, reasoning_parser=None): - input_processor = InputPreprocessor(tokenizer, - reasoning_parser, - limit_mm_per_prompt, - mm_processor_kwargs, - enable_mm) + def __init__( + self, + tokenizer, + max_model_len, + tensor_parallel_size, + pid, + limit_mm_per_prompt, + mm_processor_kwargs, + enable_mm=False, + reasoning_parser=None, + ): + input_processor = InputPreprocessor( + tokenizer, + reasoning_parser, + limit_mm_per_prompt, + mm_processor_kwargs, + enable_mm, + ) self.enable_mm = enable_mm self.reasoning_parser = reasoning_parser self.data_processor = input_processor.create_processor() self.max_model_len = max_model_len self.worker_healthy_live_recorded_time_array = np.zeros(shape=[tensor_parallel_size], dtype=np.int32) - self.worker_healthy_live_signal = IPCSignal(name="worker_healthy_live_signal", - array=self.worker_healthy_live_recorded_time_array, - dtype=np.int32, - suffix=pid, - create=False) + self.worker_healthy_live_signal = IPCSignal( + name="worker_healthy_live_signal", + array=self.worker_healthy_live_recorded_time_array, + dtype=np.int32, + suffix=pid, + create=False, + ) model_weights_status = np.zeros([1], dtype=np.int32) self.model_weights_status_signal = IPCSignal( @@ -56,7 +67,8 @@ class EngineClient: array=model_weights_status, dtype=np.int32, suffix=pid, - create=False) + create=False, + ) def create_zmq_client(self, model, mode): """ @@ -75,7 +87,6 @@ class EngineClient: if "request_id" not in prompts: request_id = str(uuid.uuid4()) prompts["request_id"] = request_id - query_list = [] if "max_tokens" not in prompts: prompts["max_tokens"] = self.max_model_len - 1 @@ -101,12 +112,12 @@ class EngineClient: task["prompt_token_ids_len"] = len(task["prompt_token_ids"]) input_ids_len = task["prompt_token_ids_len"] - task["max_tokens"] = min(self.max_model_len - input_ids_len , task.get("max_tokens")) + task["max_tokens"] = min(self.max_model_len - input_ids_len, task.get("max_tokens")) if task.get("reasoning_max_tokens", None) is None: task["reasoning_max_tokens"] = max(int(task["max_tokens"] * 0.8), 1) min_tokens = task.get("min_tokens", 1) - if 'messages' in task: - del task['messages'] + if "messages" in task: + del task["messages"] api_server_logger.info(f"task['max_tokens']:{task['max_tokens']}") work_process_metrics.request_params_max_tokens.observe(task["max_tokens"]) work_process_metrics.prompt_tokens_total.inc(input_ids_len) @@ -133,8 +144,7 @@ class EngineClient: task["preprocess_end_time"] = time.time() preprocess_cost_time = task["preprocess_end_time"] - task["preprocess_start_time"] api_server_logger.info( - f"Cache request with request_id ({task.get('request_id')}), " - f"cost {time.time() - preprocess_cost_time}" + f"Cache request with request_id ({task.get('request_id')}), " f"cost {time.time() - preprocess_cost_time}" ) self.vaild_parameters(task) @@ -153,7 +163,6 @@ class EngineClient: Validate stream options """ - if data.get("n"): if data["n"] != 1: raise ValueError("n only support 1.") @@ -168,34 +177,26 @@ class EngineClient: if data.get("top_p"): if data["top_p"] > 1 or data["top_p"] < 0: - raise ValueError( - "top_p value can only be defined [0, 1].") - + raise ValueError("top_p value can only be defined [0, 1].") if data.get("frequency_penalty"): - if not -2.0 <= data["frequency_penalty"] <= 2.0: + if not -2.0 <= data["frequency_penalty"] <= 2.0: raise ValueError("frequency_penalty must be in [-2, 2]") if 
data.get("temperature"): if data["temperature"] < 0: - raise ValueError(f"temperature must be non-negative") - + raise ValueError("temperature must be non-negative") if data.get("presence_penalty"): - if not -2.0 <= data["presence_penalty"] <= 2.0: + if not -2.0 <= data["presence_penalty"] <= 2.0: raise ValueError("presence_penalty must be in [-2, 2]") - - if data.get("seed"): if not 0 <= data["seed"] <= 922337203685477580: raise ValueError("seed must be in [0, 922337203685477580]") if data.get("stream_options") and not data.get("stream"): - raise ValueError( - "Stream options can only be defined when `stream=True`.") - - + raise ValueError("Stream options can only be defined when `stream=True`.") def check_health(self, time_interval_threashold=30): """ @@ -209,7 +210,6 @@ class EngineClient: return True, "" - def is_workers_alive(self): """ Check the health of the model server by checking whether all workers are alive. @@ -220,9 +220,7 @@ class EngineClient: else: return False, "No model weight enabled" - - - def update_model_weight(self, timeout = 300): + def update_model_weight(self, timeout=300): """ Update the model weight by sending a signal to the server. 1 : worker receive the signal and start to update model weight @@ -235,7 +233,7 @@ class EngineClient: self.model_weights_status_signal.value[0] = 1 api_server_logger.info(f"start update model weight {self.model_weights_status_signal.value}") - while self.model_weights_status_signal.value[0] != 0 and timeout != 0: + while self.model_weights_status_signal.value[0] != 0 and timeout != 0: time.sleep(1) timeout -= 1 continue @@ -244,9 +242,7 @@ class EngineClient: time.sleep(1) return True, "" - - - def clear_load_weight(self, timeout = 300): + def clear_load_weight(self, timeout=300): """ Clear the load weight status. -1 : worker receive the signal and start to clear model weight @@ -260,7 +256,7 @@ class EngineClient: self.model_weights_status_signal.value[0] = -1 api_server_logger.info(f"start clear model weight {self.model_weights_status_signal.value}") - while self.model_weights_status_signal.value[0] != -2 and timeout != 0: + while self.model_weights_status_signal.value[0] != -2 and timeout != 0: time.sleep(1) timeout -= 1 continue diff --git a/fastdeploy/entrypoints/llm.py b/fastdeploy/entrypoints/llm.py index 5601a7e4c..e6356981f 100644 --- a/fastdeploy/entrypoints/llm.py +++ b/fastdeploy/entrypoints/llm.py @@ -28,6 +28,7 @@ from tqdm import tqdm from fastdeploy.engine.args_utils import EngineArgs from fastdeploy.engine.engine import LLMEngine from fastdeploy.engine.sampling_params import SamplingParams + # from fastdeploy.entrypoints.chat_utils import ChatCompletionMessageParam from fastdeploy.utils import llm_logger, retrive_model_from_server @@ -78,18 +79,16 @@ class LLM: # Create the Engine self.llm_engine = LLMEngine.from_engine_args(engine_args=engine_args) - self.default_sampling_params = SamplingParams( - max_tokens=self.llm_engine.cfg.max_model_len) + self.default_sampling_params = SamplingParams(max_tokens=self.llm_engine.cfg.max_model_len) self.llm_engine.start() self.mutex = threading.Lock() self.req_output = dict() self.master_node_ip = self.llm_engine.cfg.master_ip - self._receive_output_thread = threading.Thread( - target=self._receive_output, daemon=True) + self._receive_output_thread = threading.Thread(target=self._receive_output, daemon=True) self._receive_output_thread.start() - + def _check_master(self): """ Check if the current node is the master node. 
@@ -111,15 +110,19 @@ class LLM: continue self.req_output[request_id].add(result) except Exception as e: - llm_logger.error("Unexcepted error happend: {}, {}".format( - e, str(traceback.format_exc()))) + llm_logger.error(f"Unexcepted error happend: {e}, {traceback.format_exc()!s}") def generate( self, - prompts: Union[str, list[str], list[int], list[list[int]], - dict[str, Any], list[dict[str, Any]]], - sampling_params: Optional[Union[SamplingParams, - list[SamplingParams]]] = None, + prompts: Union[ + str, + list[str], + list[int], + list[list[int]], + dict[str, Any], + list[dict[str, Any]], + ], + sampling_params: Optional[Union[SamplingParams, list[SamplingParams]]] = None, use_tqdm: bool = True, ): """ @@ -161,11 +164,9 @@ class LLM: # sampling_params = None if sampling_params_len != 1 and len(prompts) != sampling_params_len: - raise ValueError( - "prompts and sampling_params must be the same length.") + raise ValueError("prompts and sampling_params must be the same length.") - req_ids = self._add_request(prompts=prompts, - sampling_params=sampling_params) + req_ids = self._add_request(prompts=prompts, sampling_params=sampling_params) # get output outputs = self._run_engine(req_ids, use_tqdm=use_tqdm) @@ -176,8 +177,7 @@ class LLM: def chat( self, messages: Union[list[Any], list[list[Any]]], - sampling_params: Optional[Union[SamplingParams, - list[SamplingParams]]] = None, + sampling_params: Optional[Union[SamplingParams, list[SamplingParams]]] = None, use_tqdm: bool = True, chat_template_kwargs: Optional[dict[str, Any]] = None, ): @@ -198,7 +198,7 @@ class LLM: if not self._check_master(): err_msg = f"Only master node can accept completion request, please send request to master node: {self.master_node_ip}" raise ValueError(err_msg) - + if sampling_params is None: sampling_params = self.default_sampling_params @@ -211,15 +211,16 @@ class LLM: messages = [messages] if sampling_params_len != 1 and len(messages) != sampling_params_len: - raise ValueError( - "messages and sampling_params must be the same length.") + raise ValueError("messages and sampling_params must be the same length.") messages_len = len(messages) for i in range(messages_len): messages[i] = {"messages": messages[i]} - req_ids = self._add_request(prompts=messages, - sampling_params=sampling_params, - chat_template_kwargs=chat_template_kwargs) + req_ids = self._add_request( + prompts=messages, + sampling_params=sampling_params, + chat_template_kwargs=chat_template_kwargs, + ) # get output outputs = self._run_engine(req_ids, use_tqdm=use_tqdm) @@ -253,8 +254,7 @@ class LLM: "prompt": prompts[i], "request_id": request_id, } - elif isinstance(prompts[i], list) and isinstance( - prompts[i][0], int): + elif isinstance(prompts[i], list) and isinstance(prompts[i][0], int): tasks = { "prompt_token_ids": prompts[i], "request_id": request_id, @@ -273,11 +273,8 @@ class LLM: current_sampling_params = sampling_params enable_thinking = None if chat_template_kwargs is not None: - enable_thinking = chat_template_kwargs.get( - "enable_thinking", None) - self.llm_engine.add_requests(tasks, - current_sampling_params, - enable_thinking=enable_thinking) + enable_thinking = chat_template_kwargs.get("enable_thinking", None) + self.llm_engine.add_requests(tasks, current_sampling_params, enable_thinking=enable_thinking) return req_ids def _run_engine(self, req_ids: list[str], use_tqdm: bool): @@ -303,8 +300,7 @@ class LLM: total=num_requests, desc="Processed prompts", dynamic_ncols=True, - postfix=(f"est. 
speed input: {0:.2f} toks/s, " - f"output: {0:.2f} toks/s"), + postfix=(f"est. speed input: {0:.2f} toks/s, " f"output: {0:.2f} toks/s"), ) output = [None] * num_requests @@ -322,13 +318,11 @@ class LLM: continue result = self.req_output.pop(req_id) - result = self.llm_engine.data_processor.process_response( - result) + result = self.llm_engine.data_processor.process_response(result) output[pos] = result finished.append(i) - llm_logger.debug( - "Request id: {} has been completed.".format(req_id)) + llm_logger.debug(f"Request id: {req_id} has been completed.") if use_tqdm: pbar.update(1) @@ -346,24 +340,27 @@ if __name__ == "__main__": # llm = LLM(model="llama_model") # output = llm.generate(prompts="who are you?", use_tqdm=True) # print(output) - llm = LLM(model="/opt/baidu/paddle_internal/FastDeploy/Qwen2.5-7B", - tensor_parallel_size=2) + llm = LLM( + model="/opt/baidu/paddle_internal/FastDeploy/Qwen2.5-7B", + tensor_parallel_size=2, + ) sampling_params = SamplingParams(temperature=0.1, max_tokens=30) - output = llm.generate(prompts="who are you?", - use_tqdm=True, - sampling_params=sampling_params) + output = llm.generate(prompts="who are you?", use_tqdm=True, sampling_params=sampling_params) print(output) - output = llm.generate(prompts=["who are you?", "what can you do?"], - sampling_params=SamplingParams(temperature=1, - max_tokens=50), - use_tqdm=True) + output = llm.generate( + prompts=["who are you?", "what can you do?"], + sampling_params=SamplingParams(temperature=1, max_tokens=50), + use_tqdm=True, + ) print(output) - output = llm.generate(prompts=["who are you?", "I miss you"], - sampling_params=[ - SamplingParams(temperature=1, max_tokens=50), - SamplingParams(temperature=1, max_tokens=20) - ], - use_tqdm=True) + output = llm.generate( + prompts=["who are you?", "I miss you"], + sampling_params=[ + SamplingParams(temperature=1, max_tokens=50), + SamplingParams(temperature=1, max_tokens=20), + ], + use_tqdm=True, + ) print(output) diff --git a/fastdeploy/entrypoints/openai/__init__.py b/fastdeploy/entrypoints/openai/__init__.py index c40559bc8..f4ede9062 100644 --- a/fastdeploy/entrypoints/openai/__init__.py +++ b/fastdeploy/entrypoints/openai/__init__.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" \ No newline at end of file +""" diff --git a/fastdeploy/entrypoints/openai/api_server.py b/fastdeploy/entrypoints/openai/api_server.py index e2ebf925d..3e05e7367 100644 --- a/fastdeploy/entrypoints/openai/api_server.py +++ b/fastdeploy/entrypoints/openai/api_server.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" + import os import threading import time @@ -24,46 +25,41 @@ import zmq from fastapi import FastAPI, Request from fastapi.responses import JSONResponse, Response, StreamingResponse from prometheus_client import CONTENT_TYPE_LATEST -from fastdeploy.metrics.trace_util import inject_to_metadata,instrument from fastdeploy.engine.args_utils import EngineArgs from fastdeploy.engine.engine import LLMEngine from fastdeploy.entrypoints.engine_client import EngineClient -from fastdeploy.entrypoints.openai.protocol import (ChatCompletionRequest, - ChatCompletionResponse, - CompletionRequest, - CompletionResponse, - ErrorResponse, - ControlSchedulerRequest) +from fastdeploy.entrypoints.openai.protocol import ( + ChatCompletionRequest, + ChatCompletionResponse, + CompletionRequest, + CompletionResponse, + ControlSchedulerRequest, + ErrorResponse, +) from fastdeploy.entrypoints.openai.serving_chat import OpenAIServingChat -from fastdeploy.entrypoints.openai.serving_completion import \ - OpenAIServingCompletion -from fastdeploy.metrics.metrics import (EXCLUDE_LABELS, - cleanup_prometheus_files, - get_filtered_metrics, - main_process_metrics) -from fastdeploy.utils import (FlexibleArgumentParser, api_server_logger, - console_logger, is_port_available, - retrive_model_from_server) +from fastdeploy.entrypoints.openai.serving_completion import OpenAIServingCompletion +from fastdeploy.metrics.metrics import ( + EXCLUDE_LABELS, + cleanup_prometheus_files, + get_filtered_metrics, + main_process_metrics, +) +from fastdeploy.metrics.trace_util import inject_to_metadata, instrument +from fastdeploy.utils import ( + FlexibleArgumentParser, + api_server_logger, + console_logger, + is_port_available, + retrive_model_from_server, +) parser = FlexibleArgumentParser() -parser.add_argument("--port", - default=8000, - type=int, - help="port to the http server") -parser.add_argument("--host", - default="0.0.0.0", - type=str, - help="host to the http server") +parser.add_argument("--port", default=8000, type=int, help="port to the http server") +parser.add_argument("--host", default="0.0.0.0", type=str, help="host to the http server") parser.add_argument("--workers", default=1, type=int, help="number of workers") -parser.add_argument("--metrics-port", - default=8001, - type=int, - help="port for metrics server") -parser.add_argument("--controller-port", - default=-1, - type=int, - help="port for controller server") +parser.add_argument("--metrics-port", default=8001, type=int, help="port for metrics server") +parser.add_argument("--controller-port", default=-1, type=int, help="port for controller server") parser = EngineArgs.add_cli_args(parser) args = parser.parse_args() args.model = retrive_model_from_server(args.model) @@ -79,26 +75,18 @@ def load_engine(): if llm_engine is not None: return llm_engine - api_server_logger.info( - f"FastDeploy LLM API server starting... {os.getpid()}") + api_server_logger.info(f"FastDeploy LLM API server starting... 
{os.getpid()}") engine_args = EngineArgs.from_cli_args(args) engine = LLMEngine.from_engine_args(engine_args) if not engine.start(api_server_pid=os.getpid()): - api_server_logger.error( - "Failed to initialize FastDeploy LLM engine, service exit now!") + api_server_logger.error("Failed to initialize FastDeploy LLM engine, service exit now!") return None api_server_logger.info("FastDeploy LLM engine initialized!\n") - console_logger.info( - f"Launching metrics service at http://{args.host}:{args.metrics_port}/metrics" - ) - console_logger.info( - f"Launching chat completion service at http://{args.host}:{args.port}/v1/chat/completions" - ) - console_logger.info( - f"Launching completion service at http://{args.host}:{args.port}/v1/completions" - ) + console_logger.info(f"Launching metrics service at http://{args.host}:{args.metrics_port}/metrics") + console_logger.info(f"Launching chat completion service at http://{args.host}:{args.port}/v1/chat/completions") + console_logger.info(f"Launching completion service at http://{args.host}:{args.port}/v1/completions") llm_engine = engine return engine @@ -111,16 +99,21 @@ async def lifespan(app: FastAPI): if args.tokenizer is None: args.tokenizer = args.model - if current_process().name != 'MainProcess': + if current_process().name != "MainProcess": pid = os.getppid() else: pid = os.getpid() api_server_logger.info(f"{pid}") - engine_client = EngineClient(args.tokenizer, args.max_model_len, - args.tensor_parallel_size, pid, - args.limit_mm_per_prompt, - args.mm_processor_kwargs, args.enable_mm, - args.reasoning_parser) + engine_client = EngineClient( + args.tokenizer, + args.max_model_len, + args.tensor_parallel_size, + pid, + args.limit_mm_per_prompt, + args.mm_processor_kwargs, + args.enable_mm, + args.reasoning_parser, + ) app.state.dynamic_load_weight = args.dynamic_load_weight chat_handler = OpenAIServingChat(engine_client, pid, args.dist_init_ip) completion_handler = OpenAIServingCompletion(engine_client, pid, args.dist_init_ip) @@ -134,6 +127,7 @@ async def lifespan(app: FastAPI): try: engine_client.zmq_client.close() from prometheus_client import multiprocess + multiprocess.mark_process_dead(os.getpid()) api_server_logger.info(f"Closing metrics client pid: {pid}") except Exception as e: @@ -187,11 +181,7 @@ async def list_all_routes(): if route.path.startswith("/v1"): methods = sorted(route.methods) tags = getattr(route, "tags", []) or [] - routes_info.append({ - "path": route.path, - "methods": methods, - "tags": tags - }) + routes_info.append({"path": route.path, "methods": methods, "tags": tags}) return {"routes": routes_info} @@ -209,15 +199,12 @@ async def create_chat_completion(request: ChatCompletionRequest): if app.state.dynamic_load_weight: status, msg = app.state.engine_client.is_workers_alive() if not status: - return JSONResponse( - content={"error": "Worker Service Not Healthy"}, - status_code=304) + return JSONResponse(content={"error": "Worker Service Not Healthy"}, status_code=304) inject_to_metadata(request) generator = await app.state.chat_handler.create_chat_completion(request) if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), - status_code=generator.code) + return JSONResponse(content=generator.model_dump(), status_code=generator.code) elif isinstance(generator, ChatCompletionResponse): return JSONResponse(content=generator.model_dump()) @@ -233,14 +220,11 @@ async def create_completion(request: CompletionRequest): if app.state.dynamic_load_weight: status, msg = 
app.state.engine_client.is_workers_alive() if not status: - return JSONResponse( - content={"error": "Worker Service Not Healthy"}, - status_code=304) + return JSONResponse(content={"error": "Worker Service Not Healthy"}, status_code=304) generator = await app.state.completion_handler.create_completion(request) if isinstance(generator, ErrorResponse): - return JSONResponse(content=generator.model_dump(), - status_code=generator.code) + return JSONResponse(content=generator.model_dump(), status_code=generator.code) elif isinstance(generator, CompletionResponse): return JSONResponse(content=generator.model_dump()) @@ -258,8 +242,7 @@ def update_model_weight(request: Request) -> Response: return Response(content=msg, status_code=404) return Response(status_code=200) else: - return Response(content="Dynamic Load Weight Disabled.", - status_code=404) + return Response(content="Dynamic Load Weight Disabled.", status_code=404) @app.get("/clear_load_weight") @@ -273,8 +256,7 @@ def clear_load_weight(request: Request) -> Response: return Response(content=msg, status_code=404) return Response(status_code=200) else: - return Response(content="Dynamic Load Weight Disabled.", - status_code=404) + return Response(content="Dynamic Load Weight Disabled.", status_code=404) def launch_api_server() -> None: @@ -284,16 +266,17 @@ def launch_api_server() -> None: if not is_port_available(args.host, args.port): raise Exception(f"The parameter `port`:{args.port} is already in use.") - api_server_logger.info( - f"launch Fastdeploy api server... port: {args.port}") + api_server_logger.info(f"launch Fastdeploy api server... port: {args.port}") api_server_logger.info(f"args: {args.__dict__}") try: - uvicorn.run(app="fastdeploy.entrypoints.openai.api_server:app", - host=args.host, - port=args.port, - workers=args.workers, - log_level="info") # set log level to error to avoid log + uvicorn.run( + app="fastdeploy.entrypoints.openai.api_server:app", + host=args.host, + port=args.port, + workers=args.workers, + log_level="info", + ) # set log level to error to avoid log except Exception as e: api_server_logger.error(f"launch sync http server error, {e}") @@ -308,8 +291,8 @@ async def metrics(): """ metrics_text = get_filtered_metrics( EXCLUDE_LABELS, - extra_register_func=lambda reg: main_process_metrics.register_all( - reg, workers=args.workers)) + extra_register_func=lambda reg: main_process_metrics.register_all(reg, workers=args.workers), + ) return Response(metrics_text, media_type=CONTENT_TYPE_LATEST) @@ -318,23 +301,17 @@ def run_metrics_server(): run metrics server """ - uvicorn.run(metrics_app, - host="0.0.0.0", - port=args.metrics_port, - log_level="error") + uvicorn.run(metrics_app, host="0.0.0.0", port=args.metrics_port, log_level="error") def launch_metrics_server(): """Metrics server running the sub thread""" if not is_port_available(args.host, args.metrics_port): - raise Exception( - f"The parameter `metrics_port`:{args.metrics_port} is already in use." 
- ) + raise Exception(f"The parameter `metrics_port`:{args.metrics_port} is already in use.") prom_dir = cleanup_prometheus_files(True) os.environ["PROMETHEUS_MULTIPROC_DIR"] = prom_dir - metrics_server_thread = threading.Thread(target=run_metrics_server, - daemon=True) + metrics_server_thread = threading.Thread(target=run_metrics_server, daemon=True) metrics_server_thread.start() time.sleep(1) @@ -358,10 +335,10 @@ def reset_scheduler(): @controller_app.post("/controller/scheduler") def control_scheduler(request: ControlSchedulerRequest): """ - Control the scheduler behavior with the given parameters. + Control the scheduler behavior with the given parameters. """ content = ErrorResponse(object="", message="Scheduler updated successfully", code=0) - + global llm_engine if llm_engine is None: content.message = "Engine is not loaded" @@ -375,10 +352,11 @@ def control_scheduler(request: ControlSchedulerRequest): if hasattr(llm_engine.scheduler, "update_config") and callable(llm_engine.scheduler.update_config): llm_engine.scheduler.update_config( load_shards_num=request.load_shards_num, - reallocate=request.reallocate_shard) + reallocate=request.reallocate_shard, + ) else: - content.message="This scheduler doesn't support the `update_config()` method." - content.code=400 + content.message = "This scheduler doesn't support the `update_config()` method." + content.code = 400 return JSONResponse(content=content.model_dump(), status_code=400) return JSONResponse(content=content.model_dump(), status_code=200) @@ -388,10 +366,12 @@ def run_controller_server(): """ run controller server """ - uvicorn.run(controller_app, - host="0.0.0.0", - port=args.controller_port, - log_level="error") + uvicorn.run( + controller_app, + host="0.0.0.0", + port=args.controller_port, + log_level="error", + ) def launch_controller_server(): @@ -400,12 +380,9 @@ def launch_controller_server(): return if not is_port_available(args.host, args.controller_port): - raise Exception( - f"The parameter `controller_port`:{args.controller_port} is already in use." - ) + raise Exception(f"The parameter `controller_port`:{args.controller_port} is already in use.") - controller_server_thread = threading.Thread(target=run_controller_server, - daemon=True) + controller_server_thread = threading.Thread(target=run_controller_server, daemon=True) controller_server_thread.start() time.sleep(1) diff --git a/fastdeploy/entrypoints/openai/protocol.py b/fastdeploy/entrypoints/openai/protocol.py index 75c6f0711..febb3dd0c 100644 --- a/fastdeploy/entrypoints/openai/protocol.py +++ b/fastdeploy/entrypoints/openai/protocol.py @@ -22,7 +22,7 @@ from typing import Any, List, Literal, Optional, Union from pydantic import BaseModel, Field, model_validator -#from openai.types.chat import ChatCompletionMessageParam +# from openai.types.chat import ChatCompletionMessageParam # from fastdeploy.entrypoints.chat_utils import ChatCompletionMessageParam @@ -30,6 +30,7 @@ class ErrorResponse(BaseModel): """ Error response from OpenAI API. """ + object: str = "error" message: str code: int @@ -39,6 +40,7 @@ class PromptTokenUsageInfo(BaseModel): """ Prompt-related token usage info. """ + cached_tokens: Optional[int] = None @@ -46,6 +48,7 @@ class UsageInfo(BaseModel): """ Usage info for a single request. """ + prompt_tokens: int = 0 total_tokens: int = 0 completion_tokens: Optional[int] = 0 @@ -56,6 +59,7 @@ class FunctionCall(BaseModel): """ Function call. """ + name: str arguments: str @@ -64,6 +68,7 @@ class ToolCall(BaseModel): """ Tool call. 
""" + id: str = None type: Literal["function"] = "function" function: FunctionCall @@ -74,6 +79,7 @@ class DeltaFunctionCall(BaseModel): """ Delta function call. """ + name: Optional[str] = None arguments: Optional[str] = None @@ -83,6 +89,7 @@ class DeltaToolCall(BaseModel): """ Delta tool call. """ + id: Optional[str] = None type: Optional[Literal["function"]] = None index: int @@ -93,6 +100,7 @@ class FunctionDefinition(BaseModel): """ Function definition. """ + name: str description: Optional[str] = None parameters: Optional[dict[str, Any]] = None @@ -102,6 +110,7 @@ class ChatCompletionToolsParam(BaseModel): """ Chat completion tools parameter. """ + type: Literal["function"] = "function" function: FunctionDefinition @@ -110,6 +119,7 @@ class ChatMessage(BaseModel): """ Chat message. """ + role: str content: str reasoning_content: Optional[str] = None @@ -120,6 +130,7 @@ class ChatCompletionResponseChoice(BaseModel): """ Chat completion response choice. """ + index: int message: ChatMessage logprobs: Optional[LogProbs] = None @@ -130,6 +141,7 @@ class ChatCompletionResponse(BaseModel): """ Chat completion response. """ + id: str object: str = "chat.completion" created: int = Field(default_factory=lambda: int(time.time())) @@ -137,26 +149,32 @@ class ChatCompletionResponse(BaseModel): choices: List[ChatCompletionResponseChoice] usage: UsageInfo + class LogProbEntry(BaseModel): """ Log probability entry. """ + token: str logprob: float bytes: Optional[List[int]] = None - top_logprobs: Optional[List["LogProbEntry"]] = None + top_logprobs: Optional[List[LogProbEntry]] = None + class LogProbs(BaseModel): """ LogProbs. """ + content: Optional[List[LogProbEntry]] = None refusal: Optional[Union[str, None]] = None + class DeltaMessage(BaseModel): """ Delta message for chat completion stream response. """ + role: Optional[str] = None content: Optional[str] = None token_ids: Optional[List[int]] = None @@ -168,6 +186,7 @@ class ChatCompletionResponseStreamChoice(BaseModel): """ Chat completion response choice for stream response. """ + index: int delta: DeltaMessage logprobs: Optional[LogProbs] = None @@ -179,6 +198,7 @@ class ChatCompletionStreamResponse(BaseModel): """ Chat completion response for stream response. """ + id: str object: str = "chat.completion.chunk" created: int = Field(default_factory=lambda: int(time.time())) @@ -191,6 +211,7 @@ class CompletionResponseChoice(BaseModel): """ Completion response choice. """ + index: int text: str token_ids: Optional[List[int]] = None @@ -205,6 +226,7 @@ class CompletionResponse(BaseModel): """ Completion response. """ + id: str object: str = "text_completion" created: int = Field(default_factory=lambda: int(time.time())) @@ -217,6 +239,7 @@ class CompletionResponseStreamChoice(BaseModel): """ Completion response choice for stream response. """ + index: int text: str arrival_time: float = None @@ -231,6 +254,7 @@ class CompletionStreamResponse(BaseModel): """ Completion response for stream response. """ + id: str object: str = "text_completion" created: int = Field(default_factory=lambda: int(time.time())) @@ -243,6 +267,7 @@ class StreamOptions(BaseModel): """ Stream options. """ + include_usage: Optional[bool] = True continuous_usage_stats: Optional[bool] = False @@ -251,9 +276,9 @@ class StructuralTag(BaseModel): """ Structural tag. 
""" + begin: str - structural_tag_schema: Optional[dict[str, Any]] = Field(default=None, - alias="schema") + structural_tag_schema: Optional[dict[str, Any]] = Field(default=None, alias="schema") end: str @@ -261,9 +286,10 @@ class JsonSchemaResponseFormat(BaseModel): """ Json schema for ResponseFormat. """ + name: str description: Optional[str] = None - json_schema: Optional[dict[str, Any]] = Field(default=None, alias='schema') + json_schema: Optional[dict[str, Any]] = Field(default=None, alias="schema") strict: Optional[bool] = None @@ -271,6 +297,7 @@ class StructuralTagResponseFormat(BaseModel): """ Structural tag for ResponseFormat. """ + type: Literal["structural_tag"] structures: list[StructuralTag] triggers: list[str] @@ -280,6 +307,7 @@ class ResponseFormat(BaseModel): """ response_format type. """ + type: Literal["text", "json_object", "json_schema"] json_schema: Optional[JsonSchemaResponseFormat] = None @@ -291,6 +319,7 @@ class CompletionRequest(BaseModel): """ Completion request to the engine. """ + # Ordered by official OpenAI API documentation # https://platform.openai.com/docs/api-reference/completions/create model: Optional[str] = "default" @@ -333,7 +362,7 @@ class CompletionRequest(BaseModel): """ req_dict = {} if request_id is not None: - req_dict['request_id'] = request_id + req_dict["request_id"] = request_id for key, value in self.dict().items(): if value is not None: req_dict[key] = value @@ -341,7 +370,7 @@ class CompletionRequest(BaseModel): for key, value in self.suffix.items(): req_dict[key] = value if prompt is not None: - req_dict['prompt'] = prompt + req_dict["prompt"] = prompt if isinstance(prompt[0], int): req_dict["prompt_token_ids"] = prompt @@ -363,8 +392,11 @@ class CompletionRequest(BaseModel): req_dict["guided_json_object"] = guided_json_object guided_schema = [ - "guided_json", "guided_regex", "guided_choice", "guided_grammar", - "structural_tag" + "guided_json", + "guided_regex", + "guided_choice", + "guided_grammar", + "structural_tag", ] for key in guided_schema: item = getattr(self, key, None) @@ -380,15 +412,16 @@ class CompletionRequest(BaseModel): Validate stream options """ if data.get("stream_options") and not data.get("stream"): - raise ValueError( - "Stream options can only be defined when `stream=True`.") + raise ValueError("Stream options can only be defined when `stream=True`.") - guided_count = sum([ - "guided_json" in data and data["guided_json"] is not None, - "guided_regex" in data and data["guided_regex"] is not None, - "guided_choice" in data and data["guided_choice"] is not None, - "guided_grammar" in data and data["guided_grammar"] is not None - ]) + guided_count = sum( + [ + "guided_json" in data and data["guided_json"] is not None, + "guided_regex" in data and data["guided_regex"] is not None, + "guided_choice" in data and data["guided_choice"] is not None, + "guided_grammar" in data and data["guided_grammar"] is not None, + ] + ) if guided_count > 1: raise ValueError( @@ -403,6 +436,7 @@ class ChatCompletionRequest(BaseModel): """ Chat completion request to the engine. 
""" + # Ordered by official OpenAI API documentation # https://platform.openai.com/docs/api-reference/chat/create messages: Union[List[Any], List[int]] @@ -414,8 +448,8 @@ class ChatCompletionRequest(BaseModel): # remove max_tokens when field is removed from OpenAI API max_tokens: Optional[int] = Field( default=None, - deprecated= - 'max_tokens is deprecated in favor of the max_completion_tokens field') + deprecated="max_tokens is deprecated in favor of the max_completion_tokens field", + ) max_completion_tokens: Optional[int] = None n: Optional[int] = 1 presence_penalty: Optional[float] = None @@ -451,7 +485,7 @@ class ChatCompletionRequest(BaseModel): """ req_dict = {} if request_id is not None: - req_dict['request_id'] = request_id + req_dict["request_id"] = request_id req_dict["max_tokens"] = self.max_completion_tokens or self.max_tokens req_dict["logprobs"] = self.top_logprobs if self.logprobs else None @@ -483,17 +517,18 @@ class ChatCompletionRequest(BaseModel): self.guided_json = json_schema elif self.response_format.type == "structural_tag": structural_tag = self.response_format - assert structural_tag is not None and isinstance( - structural_tag, StructuralTagResponseFormat) - self.structural_tag = json.dumps( - structural_tag.model_dump(by_alias=True)) + assert structural_tag is not None and isinstance(structural_tag, StructuralTagResponseFormat) + self.structural_tag = json.dumps(structural_tag.model_dump(by_alias=True)) if guided_json_object: req_dict["guided_json_object"] = guided_json_object guided_schema = [ - "guided_json", "guided_regex", "guided_choice", "guided_grammar", - "structural_tag" + "guided_json", + "guided_regex", + "guided_choice", + "guided_grammar", + "structural_tag", ] for key in guided_schema: item = getattr(self, key, None) @@ -509,16 +544,17 @@ class ChatCompletionRequest(BaseModel): Validate stream options """ if data.get("stream_options") and not data.get("stream"): - raise ValueError( - "Stream options can only be defined when `stream=True`.") + raise ValueError("Stream options can only be defined when `stream=True`.") - guided_count = sum([ - "guided_json" in data and data["guided_json"] is not None, - "guided_regex" in data and data["guided_regex"] is not None, - "guided_choice" in data and data["guided_choice"] is not None, - "guided_grammar" in data and data["guided_grammar"] is not None, - "structural_tag" in data and data["structural_tag"] is not None - ]) + guided_count = sum( + [ + "guided_json" in data and data["guided_json"] is not None, + "guided_regex" in data and data["guided_regex"] is not None, + "guided_choice" in data and data["guided_choice"] is not None, + "guided_grammar" in data and data["guided_grammar"] is not None, + "structural_tag" in data and data["structural_tag"] is not None, + ] + ) if guided_count > 1: raise ValueError( @@ -537,17 +573,16 @@ class ChatCompletionRequest(BaseModel): raise ValueError("`top_logprobs` must be a positive value.") if top_logprobs > 0 and not data.get("logprobs"): - raise ValueError( - "when using `top_logprobs`, `logprobs` must be set to true." - ) + raise ValueError("when using `top_logprobs`, `logprobs` must be set to true.") return data - - + + class ControlSchedulerRequest(BaseModel): """ Control scheduler request to the engine. 
""" + reset: Optional[bool] = False load_shards_num: Optional[int] = None reallocate_shard: Optional[bool] = False diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py index 778061b85..653d1e171 100644 --- a/fastdeploy/entrypoints/openai/serving_chat.py +++ b/fastdeploy/entrypoints/openai/serving_chat.py @@ -15,21 +15,29 @@ """ import asyncio -import json import time import traceback import uuid from typing import List, Optional -import msgpack import aiozmq +import msgpack from aiozmq import zmq from fastdeploy.entrypoints.openai.protocol import ( - ChatCompletionRequest, ChatCompletionResponse, - ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, - ChatCompletionStreamResponse, ChatMessage, DeltaMessage, ErrorResponse, - LogProbEntry, LogProbs, PromptTokenUsageInfo, UsageInfo) + ChatCompletionRequest, + ChatCompletionResponse, + ChatCompletionResponseChoice, + ChatCompletionResponseStreamChoice, + ChatCompletionStreamResponse, + ChatMessage, + DeltaMessage, + ErrorResponse, + LogProbEntry, + LogProbs, + PromptTokenUsageInfo, + UsageInfo, +) from fastdeploy.metrics.work_metrics import work_process_metrics from fastdeploy.utils import api_server_logger, get_host_ip from fastdeploy.worker.output import LogprobsLists @@ -53,10 +61,7 @@ class OpenAIServingChat: return True return False - async def create_chat_completion( - self, - request: ChatCompletionRequest - ): + async def create_chat_completion(self, request: ChatCompletionRequest): """ Create a new chat completion using the specified parameters. """ @@ -81,16 +86,10 @@ class OpenAIServingChat: del current_req_dict if request.stream: - return self.chat_completion_stream_generator( - request, request_id, - request.model, - prompt_token_ids) + return self.chat_completion_stream_generator(request, request_id, request.model, prompt_token_ids) else: try: - return await self.chat_completion_full_generator( - request, request_id, - request.model, - prompt_token_ids) + return await self.chat_completion_full_generator(request, request_id, request.model, prompt_token_ids) except Exception as e: return ErrorResponse(code=400, message=str(e)) @@ -106,7 +105,7 @@ class OpenAIServingChat: request: ChatCompletionRequest, request_id: str, model_name: str, - prompt_token_ids: list() + prompt_token_ids: list(), ): """ Streaming chat completion generator. 
@@ -135,14 +134,11 @@ class OpenAIServingChat: object=chunk_object_type, created=created_time, choices=[], - model=model_name + model=model_name, ) try: - dealer = await aiozmq.create_zmq_stream( - zmq.DEALER, - connect=f"ipc:///dev/shm/router_{self.pid}.ipc" - ) - dealer.write([b"", request_id.encode('utf-8')]) + dealer = await aiozmq.create_zmq_stream(zmq.DEALER, connect=f"ipc:///dev/shm/router_{self.pid}.ipc") + dealer.write([b"", request_id.encode("utf-8")]) choices = [] current_waiting_time = 0 if request.metadata is not None: @@ -171,20 +167,29 @@ class OpenAIServingChat: raise ValueError("{}".format(res["error_msg"])) self.engine_client.data_processor.process_response_dict( - res, stream=True, enable_thinking=enable_thinking, include_stop_str_in_output=include_stop_str_in_output) + res, + stream=True, + enable_thinking=enable_thinking, + include_stop_str_in_output=include_stop_str_in_output, + ) - if res['metrics']['first_token_time'] is not None: - arrival_time = res['metrics']['first_token_time'] - inference_start_time = res['metrics']['inference_start_time'] + if res["metrics"]["first_token_time"] is not None: + arrival_time = res["metrics"]["first_token_time"] + inference_start_time = res["metrics"]["inference_start_time"] else: - arrival_time = res['metrics']['arrival_time'] - inference_start_time + arrival_time = res["metrics"]["arrival_time"] - inference_start_time if first_iteration: num_prompt_tokens = len(prompt_token_ids) num_cached_tokens = res.get("num_cached_tokens", 0) for i in range(num_choices): choice = ChatCompletionResponseStreamChoice( index=i, - delta=DeltaMessage(role="assistant", content="", reasoning_content="", tool_calls=None) + delta=DeltaMessage( + role="assistant", + content="", + reasoning_content="", + tool_calls=None, + ), ) if request.metadata is not None and request.metadata.get("training", False): choice.delta.token_ids = prompt_token_ids @@ -193,14 +198,14 @@ class OpenAIServingChat: object=chunk_object_type, created=created_time, choices=[choice], - model=model_name + model=model_name, ) if include_continuous_usage: chunk.usage = UsageInfo( prompt_tokens=num_prompt_tokens, completion_tokens=0, total_tokens=num_prompt_tokens, - prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cached_tokens) + prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=num_cached_tokens), ) yield f"data: {chunk.model_dump_json(exclude_unset=True)} \n\n" first_iteration = False @@ -222,24 +227,32 @@ class OpenAIServingChat: ) previous_num_tokens += len(output["token_ids"]) - delta_message = DeltaMessage(content=delta_text, reasoning_content=output.get("reasoning_content"), \ - token_ids=output.get("token_ids"), tool_calls=output.get("tool_call_content", [])) + delta_message = DeltaMessage( + content=delta_text, + reasoning_content=output.get("reasoning_content"), + token_ids=output.get("token_ids"), + tool_calls=output.get("tool_call_content", []), + ) choice = ChatCompletionResponseStreamChoice( index=0, delta=delta_message, logprobs=logprobs_res, - arrival_time=arrival_time + arrival_time=arrival_time, ) if res["finished"]: num_choices -= 1 - work_process_metrics.e2e_request_latency.observe(time.time() - res["metrics"]["request_start_time"]) + work_process_metrics.e2e_request_latency.observe( + time.time() - res["metrics"]["request_start_time"] + ) has_no_token_limit = request.max_tokens is None and request.max_completion_tokens is None max_tokens = request.max_completion_tokens or request.max_tokens if has_no_token_limit or previous_num_tokens != 
max_tokens: choice.finish_reason = "stop" - if self.engine_client.reasoning_parser == "ernie_x1" and \ - output.get("finish_reason", "") == "tool_calls": + if ( + self.engine_client.reasoning_parser == "ernie_x1" + and output.get("finish_reason", "") == "tool_calls" + ): choice.finish_reason = "tool_calls" else: choice.finish_reason = "length" @@ -253,7 +266,7 @@ class OpenAIServingChat: chunk.usage = UsageInfo( prompt_tokens=num_prompt_tokens, completion_tokens=previous_num_tokens, - total_tokens=num_prompt_tokens + previous_num_tokens + total_tokens=num_prompt_tokens + previous_num_tokens, ) choices.append(choice) @@ -267,13 +280,12 @@ class OpenAIServingChat: yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" choices = [] - if include_usage: completion_tokens = previous_num_tokens usage = UsageInfo( prompt_tokens=num_prompt_tokens, completion_tokens=completion_tokens, - total_tokens=num_prompt_tokens + completion_tokens + total_tokens=num_prompt_tokens + completion_tokens, ) chunk = ChatCompletionStreamResponse( id=request_id, @@ -281,7 +293,7 @@ class OpenAIServingChat: created=created_time, choices=[], model=model_name, - usage=usage + usage=usage, ) yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" @@ -297,7 +309,7 @@ class OpenAIServingChat: request: ChatCompletionRequest, request_id: str, model_name: str, - prompt_token_ids: list() + prompt_token_ids: list(), ): """ Full chat completion generator. @@ -307,11 +319,8 @@ class OpenAIServingChat: enable_thinking = None include_stop_str_in_output = False try: - dealer = await aiozmq.create_zmq_stream( - zmq.DEALER, - connect=f"ipc:///dev/shm/router_{self.pid}.ipc" - ) - dealer.write([b"", request_id.encode('utf-8')]) + dealer = await aiozmq.create_zmq_stream(zmq.DEALER, connect=f"ipc:///dev/shm/router_{self.pid}.ipc") + dealer.write([b"", request_id.encode("utf-8")]) final_res = None previous_num_tokens = 0 current_waiting_time = 0 @@ -340,7 +349,11 @@ class OpenAIServingChat: enable_thinking = request.metadata.get("enable_thinking") include_stop_str_in_output = request.metadata.get("include_stop_str_in_output", False) data = self.engine_client.data_processor.process_response_dict( - data, stream=False, enable_thinking=enable_thinking, include_stop_str_in_output=include_stop_str_in_output) + data, + stream=False, + enable_thinking=enable_thinking, + include_stop_str_in_output=include_stop_str_in_output, + ) # api_server_logger.debug(f"Client {request_id} received: {data}") previous_num_tokens += len(data["outputs"]["token_ids"]) # The logprob for handling the response @@ -375,26 +388,23 @@ class OpenAIServingChat: content=output["text"], reasoning_content=output.get("reasoning_content"), tool_calls=output.get("tool_call_content"), - token_ids=output.get("token_ids") + token_ids=output.get("token_ids"), ) logprobs_full_res = None if logprob_contents: - logprobs_full_res = LogProbs( - content=logprob_contents - ) + logprobs_full_res = LogProbs(content=logprob_contents) choice = ChatCompletionResponseChoice( index=0, message=message, logprobs=logprobs_full_res, - finish_reason=None + finish_reason=None, ) has_no_token_limit = request.max_tokens is None and request.max_completion_tokens is None max_tokens = request.max_completion_tokens or request.max_tokens if has_no_token_limit or previous_num_tokens != max_tokens: choice.finish_reason = "stop" - if self.engine_client.reasoning_parser == "ernie_x1" and \ - output.get("finish_reason", "") == "tool_calls": + if self.engine_client.reasoning_parser == "ernie_x1" and 
output.get("finish_reason", "") == "tool_calls": choice.finish_reason = "tool_calls" else: choice.finish_reason = "length" @@ -409,7 +419,7 @@ class OpenAIServingChat: prompt_tokens=num_prompt_tokens, completion_tokens=num_generated_tokens, total_tokens=num_prompt_tokens + num_generated_tokens, - prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=final_res.get("num_cached_tokens", 0)) + prompt_tokens_details=PromptTokenUsageInfo(cached_tokens=final_res.get("num_cached_tokens", 0)), ) work_process_metrics.e2e_request_latency.observe(time.time() - final_res["metrics"]["request_start_time"]) return ChatCompletionResponse( @@ -417,14 +427,14 @@ class OpenAIServingChat: created=created_time, model=model_name, choices=choices, - usage=usage + usage=usage, ) def build_logprobs_response( - self, - request_logprobs: bool, - response_logprobs: Optional[LogprobsLists], - request_top_logprobs: int, + self, + request_logprobs: bool, + response_logprobs: Optional[LogprobsLists], + request_top_logprobs: int, ) -> Optional[LogProbs]: """ Construct a logprobs response object in line with the OpenAI style. @@ -433,10 +443,10 @@ class OpenAIServingChat: # Parameter validation if ( - response_logprobs is None - or not request_logprobs - or request_top_logprobs is None - or request_top_logprobs < 0 + response_logprobs is None + or not request_logprobs + or request_top_logprobs is None + or request_top_logprobs < 0 ): return None @@ -446,16 +456,17 @@ class OpenAIServingChat: topk_logprobs = [] if response_logprobs.logprob_token_ids and len(response_logprobs.logprob_token_ids) > 0: - topk_token_ids = response_logprobs.logprob_token_ids[0][:request_top_logprobs + 1] + topk_token_ids = response_logprobs.logprob_token_ids[0][: request_top_logprobs + 1] if response_logprobs.logprobs and len(response_logprobs.logprobs) > 0: - topk_logprobs = response_logprobs.logprobs[0][:request_top_logprobs + 1] + topk_logprobs = response_logprobs.logprobs[0][: request_top_logprobs + 1] # Construct the candidate token structure (LogProbEntry) of topk top_logprob_entries: List[LogProbEntry] = [] for tid, lp in zip(topk_token_ids, topk_logprobs): - token_str = self.engine_client.data_processor.process_logprob_response([tid], - clean_up_tokenization_spaces=False) + token_str = self.engine_client.data_processor.process_logprob_response( + [tid], clean_up_tokenization_spaces=False + ) # token_bytes = token_str.encode("utf-8", errors="replace") entry = LogProbEntry( token=token_str, @@ -468,7 +479,7 @@ class OpenAIServingChat: token=top_logprob_entries[0].token, logprob=top_logprob_entries[0].logprob, bytes=top_logprob_entries[0].bytes, - top_logprobs=top_logprob_entries[1:] # Here are the complete topk candidates + top_logprobs=top_logprob_entries[1:], # Here are the complete topk candidates ) return LogProbs(content=[sampled_entry]) diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py index acefc3d17..648376d3d 100644 --- a/fastdeploy/entrypoints/openai/serving_completion.py +++ b/fastdeploy/entrypoints/openai/serving_completion.py @@ -15,33 +15,25 @@ """ import asyncio +import time +import uuid +from typing import List + import aiozmq -import json import msgpack from aiozmq import zmq -from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task -import time -from collections.abc import AsyncGenerator, AsyncIterator -from collections.abc import Sequence as GenericSequence -from typing import Optional, Union, cast, TypeVar, List -import uuid -from fastapi import 
Request +from fastdeploy.engine.request import RequestOutput from fastdeploy.entrypoints.openai.protocol import ( - ErrorResponse, CompletionRequest, CompletionResponse, - CompletionStreamResponse, - CompletionResponseStreamChoice, CompletionResponseChoice, + CompletionResponseStreamChoice, + CompletionStreamResponse, + ErrorResponse, UsageInfo, - DeltaToolCall, - DeltaFunctionCall, - ToolCall, - FunctionCall ) from fastdeploy.utils import api_server_logger, get_host_ip -from fastdeploy.engine.request import RequestOutput class OpenAIServingCompletion: @@ -77,7 +69,7 @@ class OpenAIServingCompletion: try: if isinstance(request.prompt, str): request_prompts = [request.prompt] - elif isinstance(request.prompt, list) and all(isinstance(item, int) for item in request.prompt): + elif isinstance(request.prompt, list) and all(isinstance(item, int) for item in request.prompt): request_prompt_ids = [request.prompt] elif isinstance(request.prompt, list) and all(isinstance(item, str) for item in request.prompt): request_prompts = request.prompt @@ -105,9 +97,7 @@ class OpenAIServingCompletion: current_req_dict = request.to_dict_for_infer(request_id_idx, prompt) try: current_req_dict["arrival_time"] = time.time() - prompt_batched_token_ids.append( - self.engine_client.format_and_add_data(current_req_dict) - ) + prompt_batched_token_ids.append(self.engine_client.format_and_add_data(current_req_dict)) except Exception as e: return ErrorResponse(message=str(e), code=400) @@ -116,11 +106,11 @@ class OpenAIServingCompletion: if request.stream: return self.completion_stream_generator( request=request, - num_choices = num_choices, + num_choices=num_choices, request_id=request_id, created_time=created_time, model_name=request.model, - prompt_batched_token_ids=prompt_batched_token_ids + prompt_batched_token_ids=prompt_batched_token_ids, ) else: try: @@ -130,7 +120,7 @@ class OpenAIServingCompletion: request_id=request_id, created_time=created_time, model_name=request.model, - prompt_batched_token_ids=prompt_batched_token_ids + prompt_batched_token_ids=prompt_batched_token_ids, ) except Exception as e: return ErrorResponse(code=400, message=str(e)) @@ -138,7 +128,6 @@ class OpenAIServingCompletion: except Exception as e: return ErrorResponse(message=str(e), code=400) - async def completion_full_generator( self, request: CompletionRequest, @@ -146,7 +135,7 @@ class OpenAIServingCompletion: request_id: str, created_time: int, model_name: str, - prompt_batched_token_ids: list() + prompt_batched_token_ids: list(), ): """ Process the full completion request with multiple choices. 
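Both completion generators below fan a single request out into `num_choices` engine requests named `f"{request_id}-{i}"` and route results back by the trailing index; a standalone sketch of that convention (the helper names here are illustrative, not part of the module):

```python
# Illustrative sketch of the per-choice request-id convention used by the
# completion generators: engine request "<request_id>-<i>" maps back to choice i.
def make_choice_request_ids(request_id: str, num_choices: int) -> list[str]:
    return [f"{request_id}-{i}" for i in range(num_choices)]

def choice_index(engine_request_id: str) -> int:
    # The generators recover the choice index from the trailing "-<i>" suffix.
    return int(engine_request_id.split("-")[-1])

req_ids = make_choice_request_ids("cmpl-1234", 3)
assert [choice_index(rid) for rid in req_ids] == [0, 1, 2]
```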
@@ -155,10 +144,7 @@ class OpenAIServingCompletion: try: request_ids = [f"{request_id}-{i}" for i in range(num_choices)] # create dealer - dealer = await aiozmq.create_zmq_stream( - zmq.DEALER, - connect=f"ipc:///dev/shm/router_{self.pid}.ipc" - ) + dealer = await aiozmq.create_zmq_stream(zmq.DEALER, connect=f"ipc:///dev/shm/router_{self.pid}.ipc") for rid in request_ids: dealer.write([b"", rid.encode("utf-8")]) @@ -186,8 +172,7 @@ class OpenAIServingCompletion: if data.get("error_code", 200) != 200: raise ValueError("{}".format(data["error_msg"])) - self.engine_client.data_processor.process_response_dict( - data, stream=False) + self.engine_client.data_processor.process_response_dict(data, stream=False) output_tokens[rid] += len(data["outputs"]["token_ids"]) if data.get("finished", False): data["output_token_ids"] = output_tokens[rid] @@ -201,18 +186,15 @@ class OpenAIServingCompletion: request_id=request_id, created_time=created_time, model_name=model_name, - prompt_batched_token_ids=prompt_batched_token_ids + prompt_batched_token_ids=prompt_batched_token_ids, ) except Exception as e: - api_server_logger.error( - f"Error in completion_full_generator: {e}", exc_info=True - ) + api_server_logger.error(f"Error in completion_full_generator: {e}", exc_info=True) raise finally: if dealer is not None: dealer.close() - async def completion_stream_generator( self, request: CompletionRequest, @@ -220,20 +202,17 @@ class OpenAIServingCompletion: request_id: str, created_time: int, model_name: str, - prompt_batched_token_ids: list() + prompt_batched_token_ids: list(), ): """ Process the stream completion request. """ try: - dealer = await aiozmq.create_zmq_stream( - zmq.DEALER, - connect=f"ipc:///dev/shm/router_{self.pid}.ipc" - ) + dealer = await aiozmq.create_zmq_stream(zmq.DEALER, connect=f"ipc:///dev/shm/router_{self.pid}.ipc") for i in range(num_choices): req_id = f"{request_id}-{i}" - dealer.write([b"", req_id.encode('utf-8')]) # 发送多路请求 + dealer.write([b"", req_id.encode("utf-8")]) # 发送多路请求 output_tokens = [0] * num_choices inference_start_time = [0] * num_choices first_iteration = [True] * num_choices @@ -245,7 +224,7 @@ class OpenAIServingCompletion: id=request_id, created=created_time, model=model_name, - choices=choices + choices=choices, ) current_waiting_time = 0 @@ -264,7 +243,6 @@ class OpenAIServingCompletion: await asyncio.sleep(0.1) continue - response = msgpack.unpackb(raw_data[-1]) for res in response: idx = int(res["request_id"].split("-")[-1]) @@ -277,39 +255,43 @@ class OpenAIServingCompletion: id=request_id, created=created_time, model=model_name, - choices=[CompletionResponseStreamChoice( - index=idx, - text="", - token_ids=list(prompt_batched_token_ids[idx]) - )] + choices=[ + CompletionResponseStreamChoice( + index=idx, + text="", + token_ids=list(prompt_batched_token_ids[idx]), + ) + ], ) yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" first_iteration[idx] = False - - self.engine_client.data_processor.process_response_dict( - res, stream=True) - if res['metrics'].get('first_token_time') is not None: - arrival_time = res['metrics']['first_token_time'] - inference_start_time[idx] = res['metrics']['inference_start_time'] + self.engine_client.data_processor.process_response_dict(res, stream=True) + if res["metrics"].get("first_token_time") is not None: + arrival_time = res["metrics"]["first_token_time"] + inference_start_time[idx] = res["metrics"]["inference_start_time"] else: - arrival_time = res['metrics']['arrival_time'] - inference_start_time[idx] + 
arrival_time = res["metrics"]["arrival_time"] - inference_start_time[idx] output = res["outputs"] - choices.append(CompletionResponseStreamChoice( - index=idx, - text=output["text"], - token_ids=output.get("token_ids"), - tool_calls=output.get("tool_call_content"), - reasoning_content=output.get("reasoning_content"), - arrival_time=arrival_time - )) + choices.append( + CompletionResponseStreamChoice( + index=idx, + text=output["text"], + token_ids=output.get("token_ids"), + tool_calls=output.get("tool_call_content"), + reasoning_content=output.get("reasoning_content"), + arrival_time=arrival_time, + ) + ) if res["finished"]: if request.max_tokens is None or output_tokens[idx] + 1 != request.max_tokens: chunk.choices[0].finish_reason = "stop" - if self.engine_client.reasoning_parser == "ernie_x1" and \ - output.get("finish_reason", "") == "tool_calls": + if ( + self.engine_client.reasoning_parser == "ernie_x1" + and output.get("finish_reason", "") == "tool_calls" + ): chunk.choices[0].finish_reason = "tool_calls" else: chunk.choices[0].finish_reason = "length" @@ -321,12 +303,11 @@ class OpenAIServingCompletion: id=request_id, created=created_time, model=model_name, - choices=choices + choices=choices, ) yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" choices = [] - if res["finished"]: num_choices -= 1 if getattr(request, "stream_options", None) and request.stream_options.include_usage: @@ -337,8 +318,8 @@ class OpenAIServingCompletion: choices=[], usage=UsageInfo( prompt_tokens=len(prompt_batched_token_ids[idx]), - completion_tokens=output_tokens[idx] - ) + completion_tokens=output_tokens[idx], + ), ) yield f"data: {usage_chunk.model_dump_json(exclude_unset=True)}\n\n" if choices: @@ -346,7 +327,6 @@ class OpenAIServingCompletion: yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n" choices = [] - except Exception as e: yield f"data: {ErrorResponse(message=str(e), code=400).model_dump_json(exclude_unset=True)}\n\n" finally: @@ -355,7 +335,6 @@ class OpenAIServingCompletion: dealer.close() yield "data: [DONE]\n\n" - def request_output_to_completion_response( self, final_res_batch: List[RequestOutput], @@ -363,7 +342,7 @@ class OpenAIServingCompletion: request_id: str, created_time: int, model_name: str, - prompt_batched_token_ids: list() + prompt_batched_token_ids: list(), ) -> CompletionResponse: choices: List[CompletionResponseChoice] = [] num_prompt_tokens = 0 @@ -389,12 +368,13 @@ class OpenAIServingCompletion: output_text = output["text"] choice_data = CompletionResponseChoice( + token_ids=token_ids, index=len(choices), text=output_text, - reasoning_content=output.get('reasoning_content'), + reasoning_content=output.get("reasoning_content"), tool_calls=output.get("tool_call_content"), logprobs=None, - finish_reason=None + finish_reason=None, ) choices.append(choice_data) diff --git a/fastdeploy/entrypoints/openai/test_openai.py b/fastdeploy/entrypoints/openai/test_openai.py index e2fff507e..3b56b2c22 100644 --- a/fastdeploy/entrypoints/openai/test_openai.py +++ b/fastdeploy/entrypoints/openai/test_openai.py @@ -17,7 +17,7 @@ import openai ip = "0.0.0.0" -service_http_port = "9908" # 服务配置的 +service_http_port = "9908" # 服务配置的 client = openai.Client(base_url=f"http://{ip}:{service_http_port}/v1", api_key="EMPTY_API_KEY") @@ -37,12 +37,12 @@ print("\n") response = client.completions.create( model="default", prompt="Hello, how are you?", - max_tokens=100, - stream=True, + max_tokens=100, + stream=True, ) for chunk in response: - print(chunk.choices[0].text, end='') 
+ print(chunk.choices[0].text, end="") print("\n") # Chat completion @@ -76,5 +76,5 @@ response = client.chat.completions.create( for chunk in response: if chunk.choices[0].delta is not None: - print(chunk.choices[0].delta, end='') + print(chunk.choices[0].delta, end="") print("\n") diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py index 8f9488ee9..58f51aa78 100644 --- a/fastdeploy/envs.py +++ b/fastdeploy/envs.py @@ -20,115 +20,62 @@ from typing import Any, Callable environment_variables: dict[str, Callable[[], Any]] = { # Whether to use BF16 on CPU. - "FD_CPU_USE_BF16": - lambda: os.getenv("FD_CPU_USE_BF16", "False"), - + "FD_CPU_USE_BF16": lambda: os.getenv("FD_CPU_USE_BF16", "False"), # Cuda architecture to build FastDeploy.This is a list of strings # such as [80,90]. - "FD_BUILDING_ARCS": - lambda: os.getenv("FD_BUILDING_ARCS", "[]"), - + "FD_BUILDING_ARCS": lambda: os.getenv("FD_BUILDING_ARCS", "[]"), # Log directory. - "FD_LOG_DIR": - lambda: os.getenv("FD_LOG_DIR", "log"), - + "FD_LOG_DIR": lambda: os.getenv("FD_LOG_DIR", "log"), # Whether to use debug mode, can set 0 or 1 - "FD_DEBUG": - lambda: os.getenv("FD_DEBUG", "0"), - + "FD_DEBUG": lambda: os.getenv("FD_DEBUG", "0"), # Number of days to keep fastdeploy logs. - "FD_LOG_BACKUP_COUNT": - lambda: os.getenv("FD_LOG_BACKUP_COUNT", "7"), - + "FD_LOG_BACKUP_COUNT": lambda: os.getenv("FD_LOG_BACKUP_COUNT", "7"), # Model download cache directory. - "FD_MODEL_CACHE": - lambda: os.getenv("FD_MODEL_CACHE", None), - + "FD_MODEL_CACHE": lambda: os.getenv("FD_MODEL_CACHE", None), # Maximum number of stop sequences. - "FD_MAX_STOP_SEQS_NUM": - lambda: os.getenv("FD_MAX_STOP_SEQS_NUM", "5"), - + "FD_MAX_STOP_SEQS_NUM": lambda: os.getenv("FD_MAX_STOP_SEQS_NUM", "5"), # Maximum length of stop sequences. - "FD_STOP_SEQS_MAX_LEN": - lambda: os.getenv("FD_STOP_SEQS_MAX_LEN", "8"), - + "FD_STOP_SEQS_MAX_LEN": lambda: os.getenv("FD_STOP_SEQS_MAX_LEN", "8"), # GPU devices that will be used. This is a string that # splited by comma, such as 0,1,2. - "CUDA_VISIBLE_DEVICES": - lambda: os.getenv("CUDA_VISIBLE_DEVICES", None), - + "CUDA_VISIBLE_DEVICES": lambda: os.getenv("CUDA_VISIBLE_DEVICES", None), # Whether to use HuggingFace tokenizer. - "FD_USE_HF_TOKENIZER": - lambda: os.getenv("FD_USE_HF_TOKENIZER", 0), - + "FD_USE_HF_TOKENIZER": lambda: os.getenv("FD_USE_HF_TOKENIZER", 0), # Set the high watermark (HWM) for receiving data during ZMQ initialization - "FD_ZMQ_SNDHWM": - lambda: os.getenv("FD_ZMQ_SNDHWM", 10000), - + "FD_ZMQ_SNDHWM": lambda: os.getenv("FD_ZMQ_SNDHWM", 10000), # cache kv quant params directory - "FD_CACHE_PARAMS": - lambda: os.getenv("FD_CACHE_PARAMS", "none"), - + "FD_CACHE_PARAMS": lambda: os.getenv("FD_CACHE_PARAMS", "none"), # Set attention backend. "NATIVE_ATTN", "APPEND_ATTN" # and "MLA_ATTN" can be set currently. - "FD_ATTENTION_BACKEND": - lambda: os.getenv("FD_ATTENTION_BACKEND", "APPEND_ATTN"), - + "FD_ATTENTION_BACKEND": lambda: os.getenv("FD_ATTENTION_BACKEND", "APPEND_ATTN"), # Set sampling class. "base", "base_non_truncated", "air" and "rejection" can be set currently. - "FD_SAMPLING_CLASS": - lambda: os.getenv("FD_SAMPLING_CLASS", "base"), - + "FD_SAMPLING_CLASS": lambda: os.getenv("FD_SAMPLING_CLASS", "base"), # Set moe backend."cutlass","marlin" and "triton" can be set currently. - "FD_MOE_BACKEND": - lambda: os.getenv("FD_MOE_BACKEND", "cutlass"), - + "FD_MOE_BACKEND": lambda: os.getenv("FD_MOE_BACKEND", "cutlass"), # Set whether to disable recompute the request when the KV cache is full. 
- "FD_DISABLED_RECOVER": - lambda: os.getenv("FD_DISABLED_RECOVER", "0"), - + "FD_DISABLED_RECOVER": lambda: os.getenv("FD_DISABLED_RECOVER", "0"), # Set triton kernel JIT compilation directory. - "FD_TRITON_KERNEL_CACHE_DIR": - lambda: os.getenv("FD_TRITON_KERNEL_CACHE_DIR", None), - + "FD_TRITON_KERNEL_CACHE_DIR": lambda: os.getenv("FD_TRITON_KERNEL_CACHE_DIR", None), # Whether transition from standalone PD decoupling to centralized inference - "FD_PD_CHANGEABLE": - lambda: os.getenv("FD_PD_CHANGEABLE", "0"), - + "FD_PD_CHANGEABLE": lambda: os.getenv("FD_PD_CHANGEABLE", "0"), # Whether to use fastsafetensor load weight (0 or 1) - "FD_USE_FASTSAFETENSOR": - lambda: os.getenv("FD_USE_FASTSAFETENSOR", "0"), - + "FD_USE_FASTSAFETENSOR": lambda: os.getenv("FD_USE_FASTSAFETENSOR", "0"), # Whether to use DeepGemm for FP8 blockwise MoE. - "FD_USE_DEEP_GEMM": - lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "1"))), - + "FD_USE_DEEP_GEMM": lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "1"))), # Whether to use aggregate send. - "FD_USE_AGGREGATE_SEND": - lambda: bool(int(os.getenv("FD_USE_AGGREGATE_SEND", "0"))), - + "FD_USE_AGGREGATE_SEND": lambda: bool(int(os.getenv("FD_USE_AGGREGATE_SEND", "0"))), # Whether to open Trace. - "TRACES_ENABLE": - lambda: os.getenv("TRACES_ENABLE", "false"), - + "TRACES_ENABLE": lambda: os.getenv("TRACES_ENABLE", "false"), # set traec Server name. - "FD_SERVICE_NAME": - lambda: os.getenv("FD_SERVICE_NAME", "FastDeploy"), - + "FD_SERVICE_NAME": lambda: os.getenv("FD_SERVICE_NAME", "FastDeploy"), # set traec host name. - "FD_HOST_NAME": - lambda: os.getenv("FD_HOST_NAME", "localhost"), - + "FD_HOST_NAME": lambda: os.getenv("FD_HOST_NAME", "localhost"), # set traec exporter. - "TRACES_EXPORTER": - lambda: os.getenv("TRACES_EXPORTER", "console"), - + "TRACES_EXPORTER": lambda: os.getenv("TRACES_EXPORTER", "console"), # set traec exporter_otlp_endpoint. - "EXPORTER_OTLP_ENDPOINT": - lambda: os.getenv("EXPORTER_OTLP_ENDPOINT"), - + "EXPORTER_OTLP_ENDPOINT": lambda: os.getenv("EXPORTER_OTLP_ENDPOINT"), # set traec exporter_otlp_headers. - "EXPORTER_OTLP_HEADERS": - lambda: os.getenv("EXPORTER_OTLP_HEADERS"), + "EXPORTER_OTLP_HEADERS": lambda: os.getenv("EXPORTER_OTLP_HEADERS"), } diff --git a/fastdeploy/import_ops.py b/fastdeploy/import_ops.py index 01fe251e9..9a3230821 100644 --- a/fastdeploy/import_ops.py +++ b/fastdeploy/import_ops.py @@ -43,8 +43,7 @@ def import_custom_ops(package, module_name, global_ns): logger.warning(f"Failed to import op {func_name}: {e}") except Exception: - logger.warning( - f"Ops of {package} import failed, it may be not compiled.") + logger.warning(f"Ops of {package} import failed, it may be not compiled.") preprocess_static_op(global_ns) diff --git a/fastdeploy/input/__init__.py b/fastdeploy/input/__init__.py index c40559bc8..f4ede9062 100644 --- a/fastdeploy/input/__init__.py +++ b/fastdeploy/input/__init__.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" \ No newline at end of file +""" diff --git a/fastdeploy/input/ernie_processor.py b/fastdeploy/input/ernie_processor.py index 85e8ccc2f..0fe996d4f 100644 --- a/fastdeploy/input/ernie_processor.py +++ b/fastdeploy/input/ernie_processor.py @@ -69,12 +69,12 @@ class ErnieProcessor(BaseDataProcessor): # Generation config try: - self.generation_config = GenerationConfig.from_pretrained( - self.model_name_or_path) + self.generation_config = GenerationConfig.from_pretrained(self.model_name_or_path) except Exception as e: data_processor_logger.warning( f"Can't find generation config, so it will not use " - f"generation_config field in the model config, details={e}") + f"generation_config field in the model config, details={e}" + ) self.generation_config = None def process_request(self, request, max_model_len=None, **kwargs): @@ -89,8 +89,7 @@ class ErnieProcessor(BaseDataProcessor): str: error message """ request = self._apply_default_parameters(request) - if request.get("eos_token_ids") is None or len( - request.eos_token_ids) == 0: + if request.get("eos_token_ids") is None or len(request.eos_token_ids) == 0: request.eos_token_ids = self.eos_token_ids stop_sequences = request.get("stop", []) if stop_sequences is not None and len(stop_sequences) != 0: @@ -98,12 +97,9 @@ class ErnieProcessor(BaseDataProcessor): request.set("stop_token_ids", stop_seqs) request.set("stop_seqs_len", stop_seqs_len) - if request.prompt_token_ids is None or len( - request.prompt_token_ids) == 0: - system = request.get("system") + if request.prompt_token_ids is None or len(request.prompt_token_ids) == 0: if request.prompt is None and request.messages is None: - raise ValueError( - f"The request should have `input_ids`, `text` or `messages`: {request}.") + raise ValueError(f"The request should have `input_ids`, `text` or `messages`: {request}.") if request.prompt is not None or not request.raw_request: prompt = request.prompt if request.prompt is not None else request.messages[0] prompt = prompt[0] if isinstance(prompt, list) else prompt @@ -114,14 +110,13 @@ class ErnieProcessor(BaseDataProcessor): else: request.prompt_token_ids = self.messages2ids(request.to_dict()) - if max_model_len is not None and len( - request.prompt_token_ids) > max_model_len: - request.prompt_token_ids = request.prompt_token_ids[: - max_model_len - - 1] + if max_model_len is not None and len(request.prompt_token_ids) > max_model_len: + request.prompt_token_ids = request.prompt_token_ids[: max_model_len - 1] if request.get("max_tokens") is None: - request.set("max_tokens", - max(1, max_model_len - len(request.prompt_token_ids))) + request.set( + "max_tokens", + max(1, max_model_len - len(request.prompt_token_ids)), + ) if request.get("temperature") < _SAMPLING_EPS: # zero temperature is equivalent to greedy sampling request.set("temperature", 1) @@ -140,45 +135,36 @@ class ErnieProcessor(BaseDataProcessor): str: error message """ request = self._apply_default_parameters(request) - if not request.get('eos_token_ids'): - request['eos_token_ids'] = self.eos_token_ids + if not request.get("eos_token_ids"): + request["eos_token_ids"] = self.eos_token_ids # 处理stop_sequences - stop_sequences = request.get('stop', []) + stop_sequences = request.get("stop", []) if stop_sequences: stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences) - request['stop_token_ids'] = stop_seqs - request['stop_seqs_len'] = stop_seqs_len + request["stop_token_ids"] = stop_seqs + request["stop_seqs_len"] = stop_seqs_len - system = request.get("system") # 
处理prompt_token_ids - if not request.get('prompt_token_ids'): - if request.get('prompt') is None and request.get( - 'messages') is None: - raise ValueError( - f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}" - ) - if request.get('prompt'): - prompt = request.get('prompt') + if not request.get("prompt_token_ids"): + if request.get("prompt") is None and request.get("messages") is None: + raise ValueError(f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}") + if request.get("prompt"): + prompt = request.get("prompt") prompt = prompt[0] if isinstance(prompt, list) else prompt tokens = self.tokenizer.tokenize(prompt) token_ids = self.tokenizer.convert_tokens_to_ids(tokens) - request['prompt_token_ids'] = token_ids + request["prompt_token_ids"] = token_ids req_id = request.get("request_id", None) - data_processor_logger.info( - f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}" - ) + data_processor_logger.info(f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}") else: - request['prompt_token_ids'] = self.messages2ids(request) + request["prompt_token_ids"] = self.messages2ids(request) # 截断超过长度限制的prompt - if max_model_len is not None and len( - request['prompt_token_ids']) > max_model_len: - request['prompt_token_ids'] = request[ - 'prompt_token_ids'][:max_model_len - 1] + if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len: + request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1] if request.get("max_tokens") is None: - request["max_tokens"] = max( - 1, max_model_len - len(request['prompt_token_ids'])) + request["max_tokens"] = max(1, max_model_len - len(request["prompt_token_ids"])) if request.get("temperature") < _SAMPLING_EPS: # zero temperature is equivalent to greedy sampling request["temperature"] = 1 @@ -200,22 +186,18 @@ class ErnieProcessor(BaseDataProcessor): req_id = response_dict.request_id token_ids = response_dict.outputs.token_ids - response_dict.usage = { - "completion_tokens": response_dict.outputs.index + 1 - } + response_dict.usage = {"completion_tokens": response_dict.outputs.index + 1} if token_ids[-1] == self.tokenizer.eos_token_id: token_ids = token_ids[:-1] full_text = self.tokenizer.decode(token_ids) if self.reasoning_parser: - reasoning_content, text = self.reasoning_parser.extract_reasoning_content( - full_text, response_dict) + reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict) response_dict.outputs.text = text response_dict.outputs.reasoning_content = reasoning_content else: response_dict.outputs.text = full_text data_processor_logger.info(f"req_id:{req_id}, token)ids: {token_ids}") - if response_dict.outputs.text == "" and \ - response_dict.outputs.reasoning_content == "": + if response_dict.outputs.text == "" and response_dict.outputs.reasoning_content == "": return None return response_dict @@ -230,8 +212,7 @@ class ErnieProcessor(BaseDataProcessor): Dict: response contain text fields """ if stream: - return self.process_response_dict_streaming( - response_dict, **kwargs) + return self.process_response_dict_streaming(response_dict, **kwargs) else: return self.process_response_dict_normal(response_dict, **kwargs) @@ -255,16 +236,12 @@ class ErnieProcessor(BaseDataProcessor): if is_end: full_text = previous_texts + delta_text if self.reasoning_parser: - reasoning_content, text = self.reasoning_parser.extract_reasoning_content( - full_text, response_dict) + reasoning_content, text = 
self.reasoning_parser.extract_reasoning_content(full_text, response_dict) response_dict["outputs"]["text"] = text - response_dict["outputs"][ - "reasoning_content"] = reasoning_content + response_dict["outputs"]["reasoning_content"] = reasoning_content else: response_dict["outputs"]["text"] = full_text - data_processor_logger.info( - f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}" - ) + data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}") del self.decode_status[req_id] return response_dict @@ -286,20 +263,22 @@ class ErnieProcessor(BaseDataProcessor): if is_end and len(token_ids) > 0 and not kwargs.get("include_stop_str_in_output"): if token_ids[-1] == self.tokenizer.eos_token_id: token_ids = token_ids[:-1] - delta_text, previous_token_ids, previous_texts = self.ids2tokens( - token_ids, req_id) + delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id) if enable_thinking and self.reasoning_parser: reasoning_content, text = self.reasoning_parser.extract_reasoning_content_streaming( - previous_texts, previous_texts + delta_text, delta_text, - previous_token_ids, previous_token_ids + token_ids, token_ids) + previous_texts, + previous_texts + delta_text, + delta_text, + previous_token_ids, + previous_token_ids + token_ids, + token_ids, + ) response_dict["outputs"]["text"] = text response_dict["outputs"]["reasoning_content"] = reasoning_content else: response_dict["outputs"]["text"] = delta_text if is_end: - data_processor_logger.info( - f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}" - ) + data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}") del self.decode_status[req_id] return response_dict @@ -320,15 +299,15 @@ class ErnieProcessor(BaseDataProcessor): request_or_messages, tokenize=False, split_special_tokens=False, - add_special_tokens=False) + add_special_tokens=False, + ) req_id = None if isinstance(request_or_messages, dict): req_id = request_or_messages.get("request_id", None) tokens = self.tokenizer.tokenize(spliced_message) token_ids = self.tokenizer.convert_tokens_to_ids(tokens) - data_processor_logger.info( - f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}") + data_processor_logger.info(f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}") return token_ids def ids2tokens(self, token_id, task_id): @@ -352,7 +331,8 @@ class ErnieProcessor(BaseDataProcessor): previous_token_ids = self.decode_status[task_id][2] previous_texts = self.decode_status[task_id][3] decode_str, prefix_offset, read_offset = self.tokenizer.decode_token( - previous_token_ids + token_id, prefix_offset, read_offset) + previous_token_ids + token_id, prefix_offset, read_offset + ) self.decode_status[task_id][0] = prefix_offset self.decode_status[task_id][1] = read_offset self.decode_status[task_id][2] += token_id @@ -368,17 +348,15 @@ class ErnieProcessor(BaseDataProcessor): tokenizer (AutoTokenizer) """ vocab_file_names = [ - "tokenizer.model", "spm.model", "ernie_token_100k.model" + "tokenizer.model", + "spm.model", + "ernie_token_100k.model", ] for i in range(len(vocab_file_names)): - if os.path.exists( - os.path.join(self.model_name_or_path, - vocab_file_names[i])): - ErnieBotTokenizer.resource_files_names[ - "vocab_file"] = vocab_file_names[i] + if os.path.exists(os.path.join(self.model_name_or_path, vocab_file_names[i])): + ErnieBotTokenizer.resource_files_names["vocab_file"] = vocab_file_names[i] break - self.tokenizer = ErnieBotTokenizer.from_pretrained( - 
self.model_name_or_path) + self.tokenizer = ErnieBotTokenizer.from_pretrained(self.model_name_or_path) def get_pad_id(self): """ @@ -391,16 +369,17 @@ class ErnieProcessor(BaseDataProcessor): # return self.tokenizer.eos_token return self.tokenizer.pad_token_id - def pad_batch_data(self, - insts, - pad_id=0, - return_seq_len=False, - return_array=True, - pad_style="right"): + def pad_batch_data( + self, + insts, + pad_id=0, + return_seq_len=False, + return_array=True, + pad_style="right", + ): """Pad the instances to the max sequence length in batch.""" if len(insts) == 0: - padded_insts = np.array([[]], - dtype=np.int64) if return_array else [[]] + padded_insts = np.array([[]], dtype=np.int64) if return_array else [[]] if return_seq_len: seq_len = np.array([], dtype=np.int64) if return_array else [] return padded_insts, seq_len @@ -408,15 +387,11 @@ class ErnieProcessor(BaseDataProcessor): max_len = max(map(len, insts)) if pad_style == "left": - padded_insts = [[pad_id] * (max_len - len(inst)) + list(inst) - for inst in insts] + padded_insts = [[pad_id] * (max_len - len(inst)) + list(inst) for inst in insts] else: - padded_insts = [ - list(inst) + [pad_id] * (max_len - len(inst)) for inst in insts - ] + padded_insts = [list(inst) + [pad_id] * (max_len - len(inst)) for inst in insts] if return_array: - padded_insts = np.array(padded_insts, - dtype=np.int64).reshape([-1, max_len]) + padded_insts = np.array(padded_insts, dtype=np.int64).reshape([-1, max_len]) if return_seq_len: seq_len = [len(inst) for inst in insts] @@ -432,15 +407,9 @@ class ErnieProcessor(BaseDataProcessor): stop_seqs = [] for seq in stop_sequences: if seq != self.tokenizer.eos_token_id: - stop_seqs.append( - self.tokenizer.convert_tokens_to_ids( - self.tokenizer.tokenize(seq))) - stop_seqs, stop_seqs_len = self.pad_batch_data(stop_seqs, - pad_id=-1, - return_seq_len=True, - return_array=False) - data_processor_logger.debug( - f"processed stop_seqs: {stop_seqs}, {stop_seqs_len}") + stop_seqs.append(self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(seq))) + stop_seqs, stop_seqs_len = self.pad_batch_data(stop_seqs, pad_id=-1, return_seq_len=True, return_array=False) + data_processor_logger.debug(f"processed stop_seqs: {stop_seqs}, {stop_seqs_len}") return stop_seqs, stop_seqs_len def process_logprob_response(self, token_ids, **kwargs): diff --git a/fastdeploy/input/ernie_tokenizer.py b/fastdeploy/input/ernie_tokenizer.py index f51c730fd..2bbc798c5 100644 --- a/fastdeploy/input/ernie_tokenizer.py +++ b/fastdeploy/input/ernie_tokenizer.py @@ -19,19 +19,14 @@ import os import re from shutil import copyfile -from typing import Dict, Optional, Tuple, List +from typing import Dict, List, Optional, Tuple + import numpy as np -import sentencepiece as spm - import paddle - - -from paddleformers.utils.log import logger +import sentencepiece as spm from paddleformers.transformers import PretrainedTokenizer -from paddleformers.transformers.tokenizer_utils_base import ( - PaddingStrategy, - TextInput, -) +from paddleformers.transformers.tokenizer_utils_base import PaddingStrategy, TextInput +from paddleformers.utils.log import logger class ErnieBotTokenizer(PretrainedTokenizer): @@ -47,7 +42,12 @@ class ErnieBotTokenizer(PretrainedTokenizer): pretrained_init_configuration = { "ernie-bot-10b": {}, } - model_input_names = ["input_ids", "position_ids", "attention_mask", "labels"] + model_input_names = [ + "input_ids", + "position_ids", + "attention_mask", + "labels", + ] padding_side = "right" def __init__( @@ -222,9 +222,7 @@ 
class ErnieBotTokenizer(PretrainedTokenizer): # TODO: should this be in the base class? if hasattr(self, "do_lower_case") and self.do_lower_case: # convert non-special tokens to lowercase - escaped_special_toks = [ - re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_spec_tok) - ] + escaped_special_toks = [re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_spec_tok)] pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)" text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text) @@ -303,7 +301,12 @@ class ErnieBotTokenizer(PretrainedTokenizer): elif not isinstance(attention_mask, np.ndarray): raise ValueError(f"Unexpected type {type(attention_mask)} of attention_mask, ") else: - attention_mask = np.tril(np.ones((len(required_input), len(required_input)), dtype=np.int64)) + attention_mask = np.tril( + np.ones( + (len(required_input), len(required_input)), + dtype=np.int64, + ) + ) attention_mask = np.expand_dims(attention_mask, axis=0) if needs_to_be_padded: difference = max_length - len(required_input) diff --git a/fastdeploy/input/ernie_vl_processor.py b/fastdeploy/input/ernie_vl_processor.py index 184abeffb..1b8669e29 100644 --- a/fastdeploy/input/ernie_vl_processor.py +++ b/fastdeploy/input/ernie_vl_processor.py @@ -17,18 +17,23 @@ import os import numpy as np -import re -from fastdeploy.input.mm_processor import DataProcessor, IDS_TYPE_FLAG -from fastdeploy.input.ernie_processor import ErnieProcessor + from fastdeploy.engine.request import Request -from fastdeploy.entrypoints.chat_utils import parse_chat_messages +from fastdeploy.input.ernie_processor import ErnieProcessor +from fastdeploy.input.mm_processor import IDS_TYPE_FLAG, DataProcessor from fastdeploy.utils import data_processor_logger class ErnieMoEVLProcessor(ErnieProcessor): """The processor class for ERNIE MoE VL models.""" - def __init__(self, model_name_or_path, limit_mm_per_prompt=None, mm_processor_kwargs=None, - reasoning_parser_obj=None): + + def __init__( + self, + model_name_or_path, + limit_mm_per_prompt=None, + mm_processor_kwargs=None, + reasoning_parser_obj=None, + ): self.use_hf_tokenizer = False if "merge_llm_model" in model_name_or_path: @@ -37,11 +42,11 @@ class ErnieMoEVLProcessor(ErnieProcessor): tokenizer_path = model_name_or_path preprocessor_path = model_name_or_path processor_kwargs = self._parse_processor_kwargs(mm_processor_kwargs) - + self.ernie_processor = DataProcessor( tokenizer_name=tokenizer_path, image_preprocessor_name=preprocessor_path, - **processor_kwargs + **processor_kwargs, ) self.ernie_processor.eval() self.image_patch_id = self.ernie_processor.image_patch_id @@ -73,12 +78,12 @@ class ErnieMoEVLProcessor(ErnieProcessor): def process_request(self, request, max_model_len=None, **kwargs): """process the input data""" task = request.to_dict() - task['enable_thinking'] = kwargs.get("enable_thinking", True) + task["enable_thinking"] = kwargs.get("enable_thinking", True) self.process_request_dict(task, max_model_len) request = Request.from_dict(task) return request - + def _parse_processor_kwargs(self, kwargs): """解析多模态处理器参数配置""" if not kwargs: @@ -101,13 +106,14 @@ class ErnieMoEVLProcessor(ErnieProcessor): "video_frames_sample": str, "video_max_frames": int, "video_min_frames": int, - "video_fps": int + "video_fps": int, } for key, value in kwargs.items(): if key in expected_types and not isinstance(value, expected_types[key]): raise ValueError( - f"Invalid type for {key}: expected {expected_types[key].__name__}, got 
{type(value).__name__}") + f"Invalid type for {key}: expected {expected_types[key].__name__}, got {type(value).__name__}" + ) return kwargs @@ -117,11 +123,7 @@ class ErnieMoEVLProcessor(ErnieProcessor): def _parse_limits(self, limits): """解析多模态限制配置""" - DEFAULT_LIMITS = { - "image": 1, - "video": 1, - "audio": 1 - } + DEFAULT_LIMITS = {"image": 1, "video": 1, "audio": 1} if not limits: return DEFAULT_LIMITS @@ -141,10 +143,7 @@ class ErnieMoEVLProcessor(ErnieProcessor): mm_data = item else: # 请求包含messages - mm_data = { - "image": [], - "video": [] - } + mm_data = {"image": [], "video": []} for message in item: if isinstance(message.get("content"), list): @@ -153,15 +152,12 @@ class ErnieMoEVLProcessor(ErnieProcessor): mm_data["image"].append(part) elif part.get("type") == "video": mm_data["video"].append(part) - + for modality, data in mm_data.items(): if modality in self.limit_mm_per_prompt: limit = self.limit_mm_per_prompt[modality] if len(data) > limit: - raise ValueError( - f"Too many {modality} items in prompt, " - f"got {len(data)} but limit is {limit}" - ) + raise ValueError(f"Too many {modality} items in prompt, " f"got {len(data)} but limit is {limit}") def process_request_dict(self, request, max_model_len=None): """process the input data""" @@ -178,7 +174,7 @@ class ErnieMoEVLProcessor(ErnieProcessor): if request.get("prompt"): multimodal_data = request.get("multimodal_data") if multimodal_data is None: - multimodal_data = {} + multimodal_data = {} self._check_mm_limits(multimodal_data) images = multimodal_data.get("image", None) videos = multimodal_data.get("video", None) @@ -189,7 +185,7 @@ class ErnieMoEVLProcessor(ErnieProcessor): outputs = self.ernie_processor.request2ids(request) else: raise ValueError(f"Request must contain 'prompt', or 'messages': {request}") - + metadata = request.get("metadata") # 如果metadata包含之前输出的token,将这些token添加到input_ids末尾 if metadata and metadata.get("generated_token_ids"): @@ -200,20 +196,17 @@ class ErnieMoEVLProcessor(ErnieProcessor): request["multimodal_inputs"] = outputs # 截断超过长度限制的prompt - if max_model_len is not None and len( - request['prompt_token_ids']) > max_model_len: - request['prompt_token_ids'] = request[ - 'prompt_token_ids'][:max_model_len - 1] + if max_model_len is not None and len(request["prompt_token_ids"]) > max_model_len: + request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1] if request.get("max_tokens") is None: - request["max_tokens"] = max( - 1, max_model_len - len(request['prompt_token_ids'])) + request["max_tokens"] = max(1, max_model_len - len(request["prompt_token_ids"])) data_processor_logger.info(f"Processed request {request}") - + return request def append_generated_tokens(self, multimodal_inputs, generated_token_ids): "append already generated tokens" - + num_tokens = len(generated_token_ids) multimodal_inputs["input_ids"].extend(generated_token_ids) multimodal_inputs["token_type_ids"].extend([IDS_TYPE_FLAG["text"]] * num_tokens) @@ -257,4 +250,4 @@ class ErnieMoEVLProcessor(ErnieProcessor): if stream: return self.process_response_dict_streaming(response_dict, enable_thinking=enable_thinking, **kwargs) else: - return self.process_response_dict_normal(response_dict, enable_thinking=enable_thinking, **kwargs) \ No newline at end of file + return self.process_response_dict_normal(response_dict, enable_thinking=enable_thinking, **kwargs) diff --git a/fastdeploy/input/mm_processor/__init__.py b/fastdeploy/input/mm_processor/__init__.py index 3001e7f56..ba59bc165 100644 --- 
a/fastdeploy/input/mm_processor/__init__.py +++ b/fastdeploy/input/mm_processor/__init__.py @@ -14,10 +14,10 @@ # limitations under the License. """ -from .process import DataProcessor, fancy_print, IDS_TYPE_FLAG +from .process import IDS_TYPE_FLAG, DataProcessor, fancy_print __all__ = [ - 'DataProcessor', - 'fancy_print', - 'IDS_TYPE_FLAG', -] + "DataProcessor", + "fancy_print", + "IDS_TYPE_FLAG", +] diff --git a/fastdeploy/input/mm_processor/image_preprocessor/__init__.py b/fastdeploy/input/mm_processor/image_preprocessor/__init__.py index 7b1c6d3e5..c11444e67 100644 --- a/fastdeploy/input/mm_processor/image_preprocessor/__init__.py +++ b/fastdeploy/input/mm_processor/image_preprocessor/__init__.py @@ -17,4 +17,4 @@ from .get_image_preprocessor import get_image_preprocessor from .image_preprocessor_adaptive import AdaptiveImageProcessor -__all__ = ['get_image_preprocessor', 'AdaptiveImageProcessor'] +__all__ = ["get_image_preprocessor", "AdaptiveImageProcessor"] diff --git a/fastdeploy/input/mm_processor/image_preprocessor/get_image_preprocessor.py b/fastdeploy/input/mm_processor/image_preprocessor/get_image_preprocessor.py index bf458a212..0ff6f7d1e 100644 --- a/fastdeploy/input/mm_processor/image_preprocessor/get_image_preprocessor.py +++ b/fastdeploy/input/mm_processor/image_preprocessor/get_image_preprocessor.py @@ -16,9 +16,10 @@ """get image preprocessor""" -from .image_preprocessor_adaptive import AdaptiveImageProcessor from fastdeploy.utils import data_processor_logger +from .image_preprocessor_adaptive import AdaptiveImageProcessor + def get_image_preprocessor(args): """ diff --git a/fastdeploy/input/mm_processor/image_preprocessor/image_preprocessor_adaptive.py b/fastdeploy/input/mm_processor/image_preprocessor/image_preprocessor_adaptive.py index d2e481a89..15b15a4d2 100644 --- a/fastdeploy/input/mm_processor/image_preprocessor/image_preprocessor_adaptive.py +++ b/fastdeploy/input/mm_processor/image_preprocessor/image_preprocessor_adaptive.py @@ -42,9 +42,7 @@ from paddleformers.transformers.image_utils import ( to_numpy_array, valid_images, ) -from paddleformers.transformers.tokenizer_utils_base import ( - TensorType, -) +from paddleformers.transformers.tokenizer_utils_base import TensorType from PIL import Image from fastdeploy.utils import data_processor_logger @@ -161,7 +159,12 @@ class AdaptiveImageProcessor(BaseImageProcessor): The merge size of the vision encoder to llm encoder. 
""" - model_input_names = ["pixel_values", "image_grid_thw", "pixel_values_videos", "video_grid_thw"] + model_input_names = [ + "pixel_values", + "image_grid_thw", + "pixel_values_videos", + "video_grid_thw", + ] def __init__( self, @@ -221,7 +224,10 @@ class AdaptiveImageProcessor(BaseImageProcessor): min_pixels=actual_min_pixels, max_pixels=actual_max_pixels, ) - return (resized_height, resized_width), (resized_height // self.patch_size, resized_width // self.patch_size) + return (resized_height, resized_width), ( + resized_height // self.patch_size, + resized_width // self.patch_size, + ) def _preprocess( self, @@ -330,7 +336,12 @@ class AdaptiveImageProcessor(BaseImageProcessor): image = rescale(image, scale=rescale_factor, data_format=input_data_format) if do_normalize: - image = normalize(image=image, mean=image_mean, std=image_std, data_format=input_data_format) + image = normalize( + image=image, + mean=image_mean, + std=image_std, + data_format=input_data_format, + ) image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) # [C, H, W] @@ -341,7 +352,10 @@ class AdaptiveImageProcessor(BaseImageProcessor): channel = patches.shape[1] # [time, C, H, W] grid_t = patches.shape[0] - grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size + grid_h, grid_w = ( + resized_height // self.patch_size, + resized_width // self.patch_size, + ) patches = patches.reshape( [ grid_t, @@ -358,7 +372,10 @@ class AdaptiveImageProcessor(BaseImageProcessor): patches = patches.transpose([0, 2, 5, 3, 6, 1, 4, 7]) flatten_patches = patches.reshape( - [grid_t * grid_h * grid_w, channel * self.patch_size * self.patch_size] + [ + grid_t * grid_h * grid_w, + channel * self.patch_size * self.patch_size, + ] ) # [grid_t * grid_h * grid_w, C * psz * psz] return flatten_patches, (grid_t, grid_h, grid_w) @@ -471,7 +488,10 @@ class AdaptiveImageProcessor(BaseImageProcessor): vision_grid_thws.append(image_grid_thw) pixel_values = np.array(pixel_values) vision_grid_thws = np.array(vision_grid_thws) - data = {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws} + data = { + "pixel_values": pixel_values, + "image_grid_thw": vision_grid_thws, + } if videos is not None: pixel_values, vision_grid_thws = [], [] @@ -495,7 +515,10 @@ class AdaptiveImageProcessor(BaseImageProcessor): pixel_values = np.array(pixel_values) vision_grid_thws = np.array(vision_grid_thws) - data = {"pixel_values_videos": pixel_values, "video_grid_thw": vision_grid_thws} + data = { + "pixel_values_videos": pixel_values, + "video_grid_thw": vision_grid_thws, + } return BatchFeature(data=data, tensor_type=return_tensors) @@ -516,7 +539,11 @@ def floor_by_factor(number: int, factor: int) -> int: def smart_resize( - height: int, width: int, factor: int = IMAGE_FACTOR, min_pixels: int = MIN_PIXELS, max_pixels: int = MAX_PIXELS + height: int, + width: int, + factor: int = IMAGE_FACTOR, + min_pixels: int = MIN_PIXELS, + max_pixels: int = MAX_PIXELS, ): """ Rescales the image so that the following conditions are met: diff --git a/fastdeploy/input/mm_processor/process.py b/fastdeploy/input/mm_processor/process.py index 26d1f8031..23c2828c0 100644 --- a/fastdeploy/input/mm_processor/process.py +++ b/fastdeploy/input/mm_processor/process.py @@ -17,7 +17,6 @@ """ process.py """ import copy -import io import os from collections import defaultdict from typing import Any, Dict, List, Union @@ -26,14 +25,12 @@ import numpy as np from paddleformers.transformers.image_utils import 
ChannelDimension from PIL import Image - +from fastdeploy.entrypoints.chat_utils import parse_chat_messages +from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer from .image_preprocessor.image_preprocessor_adaptive import AdaptiveImageProcessor from .process_video import read_frames_decord, read_video_decord -from .utils.io_utils import RAW_IMAGE_DIR, get_downloadable from .utils.render_timestamp import render_frame_timestamp -from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer -from fastdeploy.entrypoints.chat_utils import parse_chat_messages IDS_TYPE_FLAG = {"text": 0, "image": 1, "video": 2, "audio": 3} @@ -98,7 +95,7 @@ class DataProcessor: video_max_frames: int = 180, video_min_frames: int = 16, video_fps: int = 2, - **kwargs + **kwargs, ) -> None: # Tokenizer and image preprocessor self.model_name_or_path = tokenizer_name @@ -137,14 +134,23 @@ class DataProcessor: self.sep_token_id = self.tokenizer.convert_tokens_to_ids(self.sep_token) self.eos_token_id = self.tokenizer.convert_tokens_to_ids(self.eos_token) - self.token_type_mapping = self._build_token_type_mapping() self.is_training = True - self.role_prefixes = {"system": "", "user": "User: ", "bot": "Assistant: ", "assistant": "Assistant: "} + self.role_prefixes = { + "system": "", + "user": "User: ", + "bot": "Assistant: ", + "assistant": "Assistant: ", + } def _build_token_type_mapping(self) -> Dict[Any, int]: mapping = defaultdict(lambda: IDS_TYPE_FLAG["text"]) - for token in (self.IMG_START, self.IMG_END, self.VID_START, self.VID_END): + for token in ( + self.IMG_START, + self.IMG_END, + self.VID_START, + self.VID_END, + ): mapping[token] = IDS_TYPE_FLAG["image"] mapping[self.image_patch_id] = IDS_TYPE_FLAG["image"] return mapping @@ -175,7 +181,7 @@ class DataProcessor: "pic_cnt": 0, "video_cnt": 0, } - + IMAGE_PLACEHOLDER = "<|image@placeholder|>" VIDEO_PLACEHOLDER = "<|video@placeholder|>" IMAGE_PLACEHOLDER_LEN = len(IMAGE_PLACEHOLDER) @@ -206,15 +212,17 @@ class DataProcessor: self._add_video(frames, outputs) video_idx += 1 st = ed + VIDEO_PLACEHOLDER_LEN - + return outputs - def request2ids(self, request: Dict[str, Any],tgts: List[str]=None) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]: + def request2ids( + self, request: Dict[str, Any], tgts: List[str] = None + ) -> Dict[str, Union[np.ndarray, List[np.ndarray], None]]: """ Convert chat messages into model inputs. Returns a dict with input_ids, token_type_ids, position_ids, images, grid_thw, image_type_ids, labels. 
""" - + outputs = { "input_ids": [], "token_type_ids": [], @@ -237,16 +245,22 @@ class DataProcessor: if not isinstance(content_items, list): content_items = [content_items] for item in content_items: - if isinstance(item, dict) and item.get("type") in ["image", "video"]: + if isinstance(item, dict) and item.get("type") in [ + "image", + "video", + ]: image_message_list.append(item) - + prompt_token_ids = self.apply_chat_template(request) image_start_index = 0 image_message_index = 0 for i in range(len(prompt_token_ids)): - if prompt_token_ids[i] in [self.image_start_id, self.video_start_id]: - self._add_text(prompt_token_ids[image_start_index:i + 1], outputs) - image_start_index = i + 1 + if prompt_token_ids[i] in [ + self.image_start_id, + self.video_start_id, + ]: + self._add_text(prompt_token_ids[image_start_index : i + 1], outputs) + image_start_index = i + 1 image_message = image_message_list[image_message_index] if image_message["type"] == "image": img = image_message.get("image") @@ -265,8 +279,8 @@ class DataProcessor: self._add_text(prompt_token_ids[image_start_index:], outputs) if self.is_training: - assert tgts, f"training must give tgt !" - self._extract_labels(outputs,tgts) + assert tgts, "training must give tgt !" + self._extract_labels(outputs, tgts) return outputs def _add_special_token(self, token: Union[str, int], outputs: Dict) -> None: @@ -349,24 +363,22 @@ class DataProcessor: outputs["cur_position"] = np.max(pos_ids) + 1 def _extract_labels(self, outputs: Dict, tgts: List[str]) -> None: - input_ids = copy.deepcopy(outputs['input_ids']) + input_ids = copy.deepcopy(outputs["input_ids"]) labels = [self.tokenizer.ignored_index] * len(input_ids) - tgt_count=input_ids.count(self.sep_token_id) - assert tgt_count==len(tgts),f'len(tgts) != len(src) {len(tgts)} vs {tgt_count}' + tgt_count = input_ids.count(self.sep_token_id) + assert tgt_count == len(tgts), f"len(tgts) != len(src) {len(tgts)} vs {tgt_count}" - tgt_index=0 - for i,token_id in enumerate(input_ids): - if token_id==self.sep_token_id: + tgt_index = 0 + for i, token_id in enumerate(input_ids): + if token_id == self.sep_token_id: labels_token = self.tokenizer.tokenize(tgts[tgt_index]) labels_token_id = self.tokenizer.convert_tokens_to_ids(labels_token) - labels[i-len(labels_token_id):i]=labels_token_id - labels[i] = self.eos_token_id # + labels[i - len(labels_token_id) : i] = labels_token_id + labels[i] = self.eos_token_id # tgt_index += 1 - outputs['labels']=labels - - + outputs["labels"] = labels def _load_and_process_video(self, url: str, item: Dict) -> List[Image.Image]: reader, meta, path = read_video_decord(url, save_to_disk=False) @@ -455,30 +467,40 @@ class DataProcessor: Returns: tokenizer (AutoTokenizer) """ - vocab_file_names = ["tokenizer.model", "spm.model", "ernie_token_100k.model"] + vocab_file_names = [ + "tokenizer.model", + "spm.model", + "ernie_token_100k.model", + ] for i in range(len(vocab_file_names)): if os.path.exists(os.path.join(self.model_name_or_path, vocab_file_names[i])): ErnieBotTokenizer.resource_files_names["vocab_file"] = vocab_file_names[i] break self.tokenizer = ErnieBotTokenizer.from_pretrained(self.model_name_or_path) - + def apply_chat_template(self, request): """ Convert multi-turn messages into ID sequences. 
- + Args: - messages: Either a request dict containing 'messages' field, + messages: Either a request dict containing 'messages' field, or a list of message dicts directly - + Returns: List of token IDs as strings (converted from token objects) """ if self.tokenizer.chat_template is None: raise ValueError("This model does not support chat_template.") - prompt_token_str = self.tokenizer.apply_chat_template( - request, tokenize=False, add_generation_prompt=request.get("add_generation_prompt", True) - ).replace("<|image@placeholder|>", "").replace("<|video@placeholder|>", "") + prompt_token_str = ( + self.tokenizer.apply_chat_template( + request, + tokenize=False, + add_generation_prompt=request.get("add_generation_prompt", True), + ) + .replace("<|image@placeholder|>", "") + .replace("<|video@placeholder|>", "") + ) tokens = self.tokenizer.tokenize(prompt_token_str) token_ids = self.tokenizer.convert_tokens_to_ids(tokens) - return token_ids \ No newline at end of file + return token_ids diff --git a/fastdeploy/input/mm_processor/process_video.py b/fastdeploy/input/mm_processor/process_video.py index 258d0b24c..91120096c 100644 --- a/fastdeploy/input/mm_processor/process_video.py +++ b/fastdeploy/input/mm_processor/process_video.py @@ -21,17 +21,16 @@ import random import numpy as np from PIL import Image -from .utils.io_utils import EXTRACTED_FRAME_DIR, get_downloadable, get_filename -from .utils.video_utils import VideoReaderWrapper from fastdeploy.utils import data_processor_logger +from .utils.io_utils import EXTRACTED_FRAME_DIR, get_filename +from .utils.video_utils import VideoReaderWrapper + def read_video_decord(video_path, save_to_disk): """get reader and meta by decord""" - data_in_mem = False # video_path = get_downloadable(video_path, save_to_disk=save_to_disk) if isinstance(video_path, VideoReaderWrapper): - data_in_mem = True video_reader = video_path else: if isinstance(video_path, bytes): @@ -78,7 +77,7 @@ def get_frame_indices( if frames_sample == "rand": try: frame_indices = [random.choice(range(x[0], x[1])) for x in ranges] - except Exception as e: + except Exception: frame_indices = np.random.permutation(vlen)[:acc_samples] frame_indices.sort() frame_indices = list(frame_indices) @@ -161,11 +160,14 @@ def read_frames_decord( continue try: frames.append(video_reader[frame_indice - previous_counter].asnumpy()) - data_processor_logger.info(f"replace {frame_indice}-th frame with {frame_indice-previous_counter}-th frame") + data_processor_logger.info( + f"replace {frame_indice}-th frame with {frame_indice-previous_counter}-th frame" + ) frame_indices[frame_indice_index] = frame_indice - previous_counter break except Exception as e: previous_counter += 1 + data_processor_logger.info(f"error: {e}") else: if frame_indice + later_counter >= len(video_reader): later_counter += 1 @@ -173,10 +175,12 @@ def read_frames_decord( continue try: frames.append(video_reader[frame_indice + later_counter].asnumpy()) - data_processor_logger.info(f"replace {frame_indice}-th frame with {frame_indice+later_counter}-th frame") + data_processor_logger.info( + f"replace {frame_indice}-th frame with {frame_indice+later_counter}-th frame" + ) frame_indices[frame_indice_index] = frame_indice + later_counter break - except Exception as e: + except Exception: later_counter += 1 previous_after_flag = not previous_after_flag diff --git a/fastdeploy/input/mm_processor/tokenizer/__init__.py b/fastdeploy/input/mm_processor/tokenizer/__init__.py index d168a0a45..a705b4424 100644 --- 
a/fastdeploy/input/mm_processor/tokenizer/__init__.py +++ b/fastdeploy/input/mm_processor/tokenizer/__init__.py @@ -16,4 +16,4 @@ from .tokenizer_vl import ErnieVLTokenizer -__all__ = ['ErnieVLTokenizer'] +__all__ = ["ErnieVLTokenizer"] diff --git a/fastdeploy/input/mm_processor/tokenizer/tokenizer_vl.py b/fastdeploy/input/mm_processor/tokenizer/tokenizer_vl.py index 9e103912d..5797fcee9 100644 --- a/fastdeploy/input/mm_processor/tokenizer/tokenizer_vl.py +++ b/fastdeploy/input/mm_processor/tokenizer/tokenizer_vl.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + """ ErnieVLTokenizer """ @@ -25,8 +26,7 @@ import numpy as np import paddle import sentencepiece as spm from paddleformers.transformers import PretrainedTokenizer -from paddleformers.transformers.tokenizer_utils_base import (PaddingStrategy, - TextInput) +from paddleformers.transformers.tokenizer_utils_base import PaddingStrategy, TextInput from fastdeploy.utils import console_logger as logger @@ -42,7 +42,10 @@ class ErnieVLTokenizer(PretrainedTokenizer): "ernie-bot-10b": {}, } model_input_names = [ - "input_ids", "position_ids", "attention_mask", "labels" + "input_ids", + "position_ids", + "attention_mask", + "labels", ] padding_side = "right" @@ -114,10 +117,7 @@ class ErnieVLTokenizer(PretrainedTokenizer): def get_vocab(self): """doc""" - vocab = { - self.convert_ids_to_tokens(i): i - for i in range(self.vocab_size) - } + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} vocab.update(self.added_tokens_encoder) return vocab @@ -160,9 +160,7 @@ class ErnieVLTokenizer(PretrainedTokenizer): # logger.warning(f'ErnieBotTokenizer v2 does not support `add_special_tokens`') return super().prepare_for_model(*args, **kwargs) - def save_vocabulary(self, - save_directory, - filename_prefix: Optional[str] = None) -> Tuple[str]: + def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]: """ Save the vocabulary and special tokens file to a directory. Args: @@ -172,22 +170,19 @@ class ErnieVLTokenizer(PretrainedTokenizer): `Tuple(str)`: Paths to the files saved. 
""" if not os.path.isdir(save_directory): - logger.error( - f"Vocabulary path ({save_directory}) should be a directory") + logger.error(f"Vocabulary path ({save_directory}) should be a directory") return out_vocab_file = os.path.join( save_directory, - (filename_prefix + "-" if filename_prefix else "") + - self.resource_files_names["vocab_file"], + (filename_prefix + "-" if filename_prefix else "") + self.resource_files_names["vocab_file"], ) - if os.path.abspath(self.vocab_file) != os.path.abspath( - out_vocab_file) and os.path.isfile(self.vocab_file): + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): copyfile(self.vocab_file, out_vocab_file) elif not os.path.isfile(self.vocab_file): with open(out_vocab_file, "wb") as fi: content_spiece_model = self.sp_model.serialized_model_proto() fi.write(content_spiece_model) - return (out_vocab_file, ) + return (out_vocab_file,) def tokenize(self, text: TextInput, **kwargs) -> List[str]: """ @@ -211,13 +206,10 @@ class ErnieVLTokenizer(PretrainedTokenizer): if hasattr(self, "do_lower_case") and self.do_lower_case: # convert non-special tokens to lowercase escaped_special_toks = [ - re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + - self.all_special_tokens) + re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens) ] pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)" - text = re.sub(pattern, - lambda m: m.groups()[0] or m.groups()[1].lower(), - text) + text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text) no_split_token = set(self.unique_no_split_tokens) tokens = self.tokens_trie.split(text) @@ -259,27 +251,24 @@ class ErnieVLTokenizer(PretrainedTokenizer): required_input = encoded_inputs[self.model_input_names[0]] if padding_strategy == PaddingStrategy.LONGEST: max_length = len(required_input) - if max_length is not None and pad_to_multiple_of is not None and ( - max_length % pad_to_multiple_of != 0): - max_length = ((max_length // pad_to_multiple_of) + - 1) * pad_to_multiple_of - needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len( - required_input) != max_length - if "attention_mask" in encoded_inputs and encoded_inputs[ - "attention_mask"] is not None: + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + if "attention_mask" in encoded_inputs and encoded_inputs["attention_mask"] is not None: attention_mask = encoded_inputs.pop("attention_mask") if isinstance(attention_mask, paddle.Tensor): attention_mask = attention_mask.numpy() elif isinstance(attention_mask, list): attention_mask = np.array(attention_mask) elif not isinstance(attention_mask, np.ndarray): - raise ValueError( - f"Unexpected type {type(attention_mask)} of attention_mask, " - ) + raise ValueError(f"Unexpected type {type(attention_mask)} of attention_mask, ") else: attention_mask = np.tril( - np.ones((len(required_input), len(required_input)), - dtype=np.int64)) + np.ones( + (len(required_input), len(required_input)), + dtype=np.int64, + ) + ) attention_mask = np.expand_dims(attention_mask, axis=0) if needs_to_be_padded: difference = max_length - len(required_input) @@ -294,8 +283,7 @@ class ErnieVLTokenizer(PretrainedTokenizer): else: pad_width = [(0, 0), (difference, 0), 
(difference, 0)] else: - raise ValueError("Invalid padding strategy:" + - str(self.padding_side)) + raise ValueError("Invalid padding strategy:" + str(self.padding_side)) attention_mask = np.pad( attention_mask, pad_width=pad_width, @@ -362,8 +350,7 @@ def add_special_tokens( # check first_special_tokens = tokenizer.encode(special_tokens[0])["input_ids"] - assert first_special_tokens[ - 0] == special_token_ids_start, f"[ERROR] first_special_tokens={first_special_tokens}" + assert first_special_tokens[0] == special_token_ids_start, f"[ERROR] first_special_tokens={first_special_tokens}" assert ( len(tokenizer.get_vocab()) < special_token_ids_end ), f"[ERROR] vocab_size = {len(tokenizer.get_vocab())} >= {special_token_ids_end} 增加过多special token了!" diff --git a/fastdeploy/input/mm_processor/utils/io_utils.py b/fastdeploy/input/mm_processor/utils/io_utils.py index 800ddd435..43bf05d08 100644 --- a/fastdeploy/input/mm_processor/utils/io_utils.py +++ b/fastdeploy/input/mm_processor/utils/io_utils.py @@ -87,7 +87,13 @@ def get_filename(url=None): return image_filname -def get_downloadable(url, download_dir=RAW_VIDEO_DIR, save_to_disk=False, retry=0, retry_interval=3): +def get_downloadable( + url, + download_dir=RAW_VIDEO_DIR, + save_to_disk=False, + retry=0, + retry_interval=3, +): """download video and store it in the disk return downloaded **path** if save_to_disk is set to true @@ -150,7 +156,12 @@ def get_downloadable_image(download_path, need_exif_info, retry_max_time=0, retr # 由于I模式的point函数只支持加减乘,所以下面的* (1 / 256)不能改成除法 return img.point(lambda i: i * (1 / 256)).convert("L") - image = get_downloadable(download_path, save_to_disk=False, retry=retry_max_time, retry_interval=retry_interval) + image = get_downloadable( + download_path, + save_to_disk=False, + retry=retry_max_time, + retry_interval=retry_interval, + ) if isinstance(image, Image.Image): pil_image = image else: @@ -158,7 +169,7 @@ def get_downloadable_image(download_path, need_exif_info, retry_max_time=0, retr if need_exif_info: try: exif_info = get_image_exif(pil_image) - except Exception as why: + except Exception: exif_info = {} else: exif_info = {} @@ -168,7 +179,7 @@ def get_downloadable_image(download_path, need_exif_info, retry_max_time=0, retr pil_image = change_I16_to_L(pil_image) if has_transparent_background(pil_image): pil_image = add_white_background(pil_image) - except Exception as e: + except Exception: pass return pil_image.convert("RGB"), exif_info diff --git a/fastdeploy/input/mm_processor/utils/render_timestamp.py b/fastdeploy/input/mm_processor/utils/render_timestamp.py index beb58b922..9b24226ed 100644 --- a/fastdeploy/input/mm_processor/utils/render_timestamp.py +++ b/fastdeploy/input/mm_processor/utils/render_timestamp.py @@ -39,7 +39,14 @@ def render_single_image_with_timestamp(image: Image, number: str, rate: float, f y = 0 # 文本的x坐标, y坐标 # 绘制黑色的时间戳,白色的边框 - draw.text((x, y), number, font=font, fill=(0, 0, 0), stroke_width=outline_size, stroke_fill=(255, 255, 255)) + draw.text( + (x, y), + number, + font=font, + fill=(0, 0, 0), + stroke_width=outline_size, + stroke_fill=(255, 255, 255), + ) return image diff --git a/fastdeploy/input/multimodal/__init__.py b/fastdeploy/input/multimodal/__init__.py index c40559bc8..f4ede9062 100644 --- a/fastdeploy/input/multimodal/__init__.py +++ b/fastdeploy/input/multimodal/__init__.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" \ No newline at end of file +""" diff --git a/fastdeploy/input/multimodal/audio.py b/fastdeploy/input/multimodal/audio.py index 0abedf5c2..97c73b26e 100644 --- a/fastdeploy/input/multimodal/audio.py +++ b/fastdeploy/input/multimodal/audio.py @@ -21,8 +21,7 @@ from pathlib import Path import numpy as np import numpy.typing as npt -from .base import MediaIO, MultiModalPlugin -from .inputs import AudioItem, ModalityData, MultiModalKwargs +from .base import MediaIO # TODO 多模数据处理 # try: @@ -44,25 +43,24 @@ def resample_audio( ) -> npt.NDArray[np.floating]: """ 将音频数据从原始采样率(`orig_sr`)重采样到目标采样率(`target_sr`)。 - + Args: audio (npt.NDArray[np.floating]): 带有单通道浮点型音频数据的 numpy ndarray,形状为 `(samples,)`。 orig_sr (float): 音频数据的原始采样率。 target_sr (float): 需要转换到的目标采样率。 - + Returns: npt.NDArray[np.floating]: 带有单通道浮点型音频数据的 numpy ndarray,形状为 `(samples,)`,已经被重采样到目标采样率。 - + Raises: None. """ import librosa + return librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr) - class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]): - def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]: """ 加载字节数据,返回音频信号和采样率。 @@ -73,8 +71,8 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]): 如果解码失败,则返回 None。 """ import librosa - return librosa.load(BytesIO(data), sr=None) + return librosa.load(BytesIO(data), sr=None) def load_base64( self, @@ -83,16 +81,16 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]): ) -> tuple[npt.NDArray, float]: """ 将 base64 编码的字符串转换为 numpy 数组和尺度。 - + Args: media_type (str): 媒体类型,例如 'image/jpeg'、'image/png' 等。 data (str): base64 编码的字符串,表示图像或其他二进制数据。 - + Returns: tuple[npt.NDArray, float]: 包含以下两个元素: - npt.NDArray: 形状为(H,W,C)的 numpy 数组,表示图像或其他二进制数据。 - float: 图像的尺度,单位为像素。 - + Raises: ValueError: 当 media_type 不是有效的媒体类型时引发。 """ @@ -108,6 +106,7 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]): 第二个是采样率(float类型)。 """ import librosa + return librosa.load(filepath, sr=None) def encode_base64(self, media: tuple[npt.NDArray, float]) -> str: @@ -121,7 +120,8 @@ class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]): with BytesIO() as buffer: import soundfile + soundfile.write(buffer, audio, sr, format="WAV") data = buffer.getvalue() - return base64.b64encode(data).decode('utf-8') + return base64.b64encode(data).decode("utf-8") diff --git a/fastdeploy/input/multimodal/base.py b/fastdeploy/input/multimodal/base.py index f00ce84c5..962b186d2 100644 --- a/fastdeploy/input/multimodal/base.py +++ b/fastdeploy/input/multimodal/base.py @@ -15,30 +15,25 @@ """ from abc import ABC, abstractmethod -from collections import defaultdict -from collections.abc import Sequence from pathlib import Path -from typing import (TYPE_CHECKING, Any, Callable, Generic, NamedTuple, - Optional, TypeVar, Union) - +from typing import Generic, TypeVar _T = TypeVar("_T") class MediaIO(ABC, Generic[_T]): - @abstractmethod def load_bytes(self, data: bytes) -> _T: """ 将字节数据加载为对象,并返回该对象。 如果加载失败,则抛出异常。 - + Args: data (bytes): 要加载的字节数据。 - + Raises: NotImplementedError: 当前类未实现此方法。 - + Returns: _T: 加载后的对象。 """ @@ -56,13 +51,13 @@ class MediaIO(ABC, Generic[_T]): def load_file(self, filepath: Path) -> _T: """ 加载文件,返回解析后的数据。 - + Args: filepath (Path): 文件路径,必须是一个绝对路径。 - + Raises: NotImplementedError: 当前方法未被实现。 - + Returns: _T: 任意类型,表示解析后的数据。 """ diff --git a/fastdeploy/input/multimodal/image.py b/fastdeploy/input/multimodal/image.py index 33f3068be..908e55489 100644 --- a/fastdeploy/input/multimodal/image.py +++ 
b/fastdeploy/input/multimodal/image.py @@ -16,8 +16,8 @@ import base64 from io import BytesIO -from pathlib import Path -from typing import TYPE_CHECKING, Any, Optional +from typing import Any + import requests from PIL import Image @@ -25,18 +25,17 @@ from .base import MediaIO class ImageMediaIO(MediaIO[Image.Image]): - def __init__(self, *, image_mode: str = "RGB") -> None: """ Initializes the object. - + Args: image_mode (str, optional): The mode of the image, defaults to "RGB". Should be one of "L", "LA", "P", "RGB", "RGBA", "CMYK", or "YCbCr". - + Raises: ValueError: If `image_mode` is not a valid mode. - + Returns: None: This method does not return anything. It initializes the object with the given parameters. """ @@ -48,13 +47,13 @@ class ImageMediaIO(MediaIO[Image.Image]): """ 将字节数据转换为图像对象,并返回。 该方法会自动调用Image.open和Image.load方法,以及convert方法将图像转换为指定模式(默认为RGB)。 - + Args: data (bytes): 包含图像数据的字节对象。 - + Returns: Image.Image: 一个包含了原始图像数据的Image对象,已经被转换为指定模式。 - + Raises: 无。 """ @@ -65,14 +64,14 @@ class ImageMediaIO(MediaIO[Image.Image]): def load_base64(self, media_type: str, data: str) -> Image.Image: """ 将 base64 编码的字符串转换为图片对象。 - + Args: media_type (str): 媒体类型,例如 "image/jpeg"。 data (str): base64 编码的字符串数据。 - + Returns: Image.Image: PIL 中的图片对象。 - + Raises: 无。 """ @@ -82,13 +81,13 @@ class ImageMediaIO(MediaIO[Image.Image]): """ 加载文件,并转换为指定模式。 如果文件不存在或无法打开,将抛出FileNotFoundError异常。 - + Args: filepath (str): 文件路径。 - + Returns: Image.Image: 返回一个Image.Image对象,表示已经加载和转换的图像。 - + Raises: FileNotFoundError: 当文件不存在时抛出此异常。 """ @@ -101,13 +100,13 @@ class ImageMediaIO(MediaIO[Image.Image]): 从请求中加载图像文件,并返回一个PIL Image对象。 该函数需要传入一个包含图像URL的字符串或者可迭代对象(如requests库的Response对象)。 该函数会自动处理图像的格式和大小,并将其转换为指定的模式(默认为RGB)。 - + Args: request (Any): 包含图像URL的字符串或者可迭代对象(如requests库的Response对象)。 - + Returns: Image.Image: PIL Image对象,表示已经加载并转换好的图像。 - + Raises: 无。 """ @@ -123,15 +122,15 @@ class ImageMediaIO(MediaIO[Image.Image]): ) -> str: """ 将图像转换为Base64编码的字符串。 - + Args: media (Image.Image): 待处理的图像对象,支持PIL库中的Image类型。 image_format (str, optional): 指定图像格式,默认为"JPEG"。可选项包括:"PNG", "JPEG", "BMP", "TIFF"等。 PIL库中的所有图片格式都可以使用,但是不建议使用"PPM"和"XBM"格式,因为这两种格式在Python3中已经被弃用了。 - + Returns: str: Base64编码后的字符串,可以直接作为HTML或者JSON数据传输。 - + Raises: None """ @@ -142,4 +141,4 @@ class ImageMediaIO(MediaIO[Image.Image]): image.save(buffer, image_format) data = buffer.getvalue() - return base64.b64encode(data).decode('utf-8') + return base64.b64encode(data).decode("utf-8") diff --git a/fastdeploy/input/multimodal/utils.py b/fastdeploy/input/multimodal/utils.py index f8626096d..4c7f2e557 100644 --- a/fastdeploy/input/multimodal/utils.py +++ b/fastdeploy/input/multimodal/utils.py @@ -16,62 +16,76 @@ import base64 import io +import ipaddress +import mimetypes import os import random - import socket -from urllib.parse import urlparse -import ipaddress - -import requests -from PIL import Image, ImageOps -from fastdeploy.utils import data_processor_logger - -import pyheif -from pdf2image import convert_from_path -import cairosvg import subprocess import tempfile -import mimetypes +from urllib.parse import urlparse + +import cairosvg +import pyheif +import requests +from pdf2image import convert_from_path +from PIL import Image, ImageOps + +from fastdeploy.utils import data_processor_logger + def process_image_data(image_data, mime_type, url): """处理不同类型的图像数据并返回 PIL 图像对象""" - if mime_type in ['image/heif', 'image/heic'] or url.lower().endswith('.heif') or url.lower().endswith('.heic'): + if mime_type in ["image/heif", "image/heic"] or 
url.lower().endswith(".heif") or url.lower().endswith(".heic"): heif_file = pyheif.read(image_data) pil_image = Image.frombytes( - heif_file.mode, heif_file.size, heif_file.data, - "raw", heif_file.mode, heif_file.stride + heif_file.mode, + heif_file.size, + heif_file.data, + "raw", + heif_file.mode, + heif_file.stride, ) - elif mime_type == 'application/pdf' or url.lower().endswith('.pdf'): - with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf: + elif mime_type == "application/pdf" or url.lower().endswith(".pdf"): + with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf: temp_pdf.write(image_data.getvalue()) temp_pdf_path = temp_pdf.name images = convert_from_path(temp_pdf_path) pil_image = images[0] os.remove(temp_pdf_path) - elif mime_type == 'image/svg+xml' or url.lower().endswith('.svg'): + elif mime_type == "image/svg+xml" or url.lower().endswith(".svg"): png_data = cairosvg.svg2png(bytestring=image_data.getvalue()) pil_image = Image.open(io.BytesIO(png_data)) - elif mime_type in ['application/postscript', 'application/illustrator'] or url.lower().endswith('.ai'): - with tempfile.NamedTemporaryFile(delete=False, suffix='.ai') as ai_temp, tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as pdf_temp: + elif mime_type in [ + "application/postscript", + "application/illustrator", + ] or url.lower().endswith(".ai"): + with ( + tempfile.NamedTemporaryFile(delete=False, suffix=".ai") as ai_temp, + tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as pdf_temp, + ): ai_temp_path = ai_temp.name pdf_temp_path = pdf_temp.name ai_temp.write(image_data.getvalue()) ai_temp.close() - subprocess.run(['inkscape', ai_temp_path, '--export-pdf=' + pdf_temp_path], check=True) + subprocess.run( + ["inkscape", ai_temp_path, "--export-pdf=" + pdf_temp_path], + check=True, + ) images = convert_from_path(pdf_temp_path) pil_image = images[0] os.remove(ai_temp_path) os.remove(pdf_temp_path) - elif mime_type == 'image/gif' or url.lower().endswith('.gif'): + elif mime_type == "image/gif" or url.lower().endswith(".gif"): pil_image = Image.open(image_data) else: pil_image = Image.open(image_data) return pil_image + def http_to_pil_image(url): """http_to_pil_image""" if is_public_url(url) and int(os.getenv("DOWNLOAD_WITH_TP_SERVER", "0")): @@ -82,7 +96,7 @@ def http_to_pil_image(url): raise Exception("Failed to download the image from URL.") image_data = io.BytesIO(response.content) - mime_type = response.headers.get('Content-Type') + mime_type = response.headers.get("Content-Type") if mime_type is None: mime_type, _ = mimetypes.guess_type(url) @@ -91,24 +105,37 @@ def http_to_pil_image(url): return pil_image + def http_to_pil_image_with_tp_server(url, retry_time=6): """cnap平台没有外网访问权限,需要使用tp服务下载图片""" - proxies = [{"http": "http://10.229.197.142:8807"}, {"http": "http://10.229.197.161:8804"}, - {"http": "http://10.229.198.143:8804"}, {"http": "http://10.122.108.164:8807"}, - {"http": "http://10.122.108.165:8807"}, {"http": "http://10.122.108.166:8807"}, - {"http": "http://10.122.108.168:8801"}, {"http": "http://10.122.150.146:8802"}, - {"http": "http://10.122.150.158:8802"}, {"http": "http://10.122.150.164:8801"}, - {"http": "http://10.143.51.38:8813"}, {"http": "http://10.143.103.42:8810"}, - {"http": "http://10.143.194.45:8804"}, {"http": "http://10.143.226.25:8801"}, - {"http": "http://10.143.236.12:8807"}, {"http": "http://10.143.238.36:8807"}, - {"http": "http://10.144.71.30:8807"}, {"http": "http://10.144.73.16:8804"}, - {"http": 
"http://10.144.138.36:8801"}, {"http": "http://10.144.152.40:8810"}, - {"http": "http://10.144.199.29:8810"}, {"http": "http://10.144.251.29:8813"}, - ] + proxies = [ + {"http": "http://10.229.197.142:8807"}, + {"http": "http://10.229.197.161:8804"}, + {"http": "http://10.229.198.143:8804"}, + {"http": "http://10.122.108.164:8807"}, + {"http": "http://10.122.108.165:8807"}, + {"http": "http://10.122.108.166:8807"}, + {"http": "http://10.122.108.168:8801"}, + {"http": "http://10.122.150.146:8802"}, + {"http": "http://10.122.150.158:8802"}, + {"http": "http://10.122.150.164:8801"}, + {"http": "http://10.143.51.38:8813"}, + {"http": "http://10.143.103.42:8810"}, + {"http": "http://10.143.194.45:8804"}, + {"http": "http://10.143.226.25:8801"}, + {"http": "http://10.143.236.12:8807"}, + {"http": "http://10.143.238.36:8807"}, + {"http": "http://10.144.71.30:8807"}, + {"http": "http://10.144.73.16:8804"}, + {"http": "http://10.144.138.36:8801"}, + {"http": "http://10.144.152.40:8810"}, + {"http": "http://10.144.199.29:8810"}, + {"http": "http://10.144.251.29:8813"}, + ] headers = { "X-Tp-Authorization": "Basic RVJOSUVMaXRlVjpFUk5JRUxpdGVWXzFxYXo0cmZ2M2VkYzV0Z2Iyd3N4LWJmZS10cA==", - "scheme": "https" - } + "scheme": "https", + } new_url = url.replace("https://", "http://") if url.startswith("https://") else url @@ -119,7 +146,7 @@ def http_to_pil_image_with_tp_server(url, retry_time=6): if response.status_code == 200: image_data = io.BytesIO(response.content) - mime_type = response.headers.get('Content-Type') + mime_type = response.headers.get("Content-Type") if mime_type is None: mime_type, _ = mimetypes.guess_type(url) @@ -133,7 +160,6 @@ def http_to_pil_image_with_tp_server(url, retry_time=6): raise Exception(f"Failed to download the image from URL: {url}") - def base64_to_pil_image(base64_string): """base64_to_pil_image""" image_bytes = base64.b64decode(base64_string) @@ -163,22 +189,23 @@ def is_public_url(url): print(f"Error checking URL: {e}") return False + def process_transparency(image): - """ process transparency. 
""" + """process transparency.""" + def _is_transparent(image): # 检查图片是否有alpha通道 - if image.mode in ('RGBA', 'LA') or (image.mode == 'P' and 'transparency' in image.info): + if image.mode in ("RGBA", "LA") or (image.mode == "P" and "transparency" in image.info): # 获取alpha通道 - alpha = image.convert('RGBA').split()[-1] + alpha = image.convert("RGBA").split()[-1] # 如果alpha通道中存在0,说明图片有透明部分 if alpha.getextrema()[0] < 255: return True return False - def _convert_transparent_paste(image): width, height = image.size - new_image = Image.new("RGB", (width, height), (255, 255, 255)) # 生成一张白色底图 + new_image = Image.new("RGB", (width, height), (255, 255, 255)) # 生成一张白色底图 new_image.paste(image, (0, 0), image) return new_image diff --git a/fastdeploy/input/multimodal/video.py b/fastdeploy/input/multimodal/video.py index 7e13cf9f4..b1aacc2a1 100644 --- a/fastdeploy/input/multimodal/video.py +++ b/fastdeploy/input/multimodal/video.py @@ -15,42 +15,37 @@ """ from __future__ import annotations + import base64 -from functools import partial -from io import BytesIO -from pathlib import Path -from typing import Optional import numpy as np import numpy.typing as npt -from PIL import Image from .base import MediaIO -from .image import ImageMediaIO def resize_video(frames: npt.NDArray, size: tuple[int, int]) -> npt.NDArray: """ 对视频帧进行缩放,将每一帧的大小调整为指定的高度和宽度。 - + Args: frames (npt.NDArray, shape=(N, H, W, C)): 包含N个帧的三维数组,其中H是高度,W是宽度,C是通道数。 所有帧都应该具有相同的通道数。 size (tuple[int, int], required): 一个元组,包含两个整数,分别表示目标高度和宽度。 - + Returns: npt.NDArray, shape=(N, new_height, new_width, C): 返回一个新的三维数组,其中每一帧已经被缩放到指定的高度和宽度。 新数组的通道数与输入数组相同。 - + Raises: None """ num_frames, _, _, channels = frames.shape new_height, new_width = size - resized_frames = np.empty((num_frames, new_height, new_width, channels), - dtype=frames.dtype) + resized_frames = np.empty((num_frames, new_height, new_width, channels), dtype=frames.dtype) # lazy import cv2 to avoid bothering users who only use text models import cv2 + for i, frame in enumerate(frames): resized_frame = cv2.resize(frame, (new_width, new_height)) resized_frames[i] = resized_frame @@ -60,15 +55,15 @@ def resize_video(frames: npt.NDArray, size: tuple[int, int]) -> npt.NDArray: def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray: """ 对视频帧进行缩放,将每个帧的高度和宽度都乘以一个因子。 - + Args: frames (npt.NDArray): 形状为(T,H,W,C)的四维numpy数组,表示T个帧,高度为H,宽度为W,通道数为C。 size_factor (float): 用于缩放视频帧的因子,新的高度和宽度将分别是原来的高度和宽度的size_factor倍。 - + Returns: npt.NDArray: 形状为(T,new_H,new_W,C)的四维numpy数组,表示T个帧,高度为new_H,宽度为new_W,通道数为C。 其中new_H和new_W是根据size_factor计算出来的。 - + Raises: None """ @@ -79,15 +74,14 @@ def rescale_video_size(frames: npt.NDArray, size_factor: float) -> npt.NDArray: return resize_video(frames, (new_height, new_width)) -def sample_frames_from_video(frames: npt.NDArray, - num_frames: int) -> npt.NDArray: +def sample_frames_from_video(frames: npt.NDArray, num_frames: int) -> npt.NDArray: """ 从视频中随机选取指定数量的帧,并返回一个包含这些帧的numpy数组。 - + Args: frames (npt.NDArray): 形状为(T,H,W,C)的ndarray,表示视频的所有帧,其中T是帧的总数,H、W是每个帧的高度和宽度,C是通道数。 num_frames (int, optional): 要从视频中选取的帧数。如果设置为-1,则将返回所有帧。默认为-1。 - + Returns: npt.NDArray: 形状为(num_frames,H,W,C)的ndarray,表示选取的帧。如果num_frames=-1,则返回原始的frames。 """ @@ -101,17 +95,16 @@ def sample_frames_from_video(frames: npt.NDArray, class VideoMediaIO(MediaIO[bytes]): - def __init__(self) -> None: """ 初始化一个 VideoMediaIO 对象。 - + Args: 无。 - + Raises: 无。 - + Returns: 无。 """ @@ -121,13 +114,13 @@ class VideoMediaIO(MediaIO[bytes]): """ 
ERNIE-45-VL模型的前处理中包含抽帧操作,如果将视频帧加载为npt.NDArray格式会丢失FPS信息,因此目前 不对字节数据做任何操作。 - + Args: data (bytes): 包含视频帧数据的字节对象。 - + Returns: bytes,字节数据原样返回。 - + Raises: 无。 """ @@ -136,14 +129,14 @@ class VideoMediaIO(MediaIO[bytes]): def load_base64(self, media_type: str, data: str) -> bytes: """ 加载 base64 编码的数据,并返回bytes。 - + Args: media_type (str): 媒体类型,目前不支持 "video/jpeg"。 data (str): base64 编码的字符串数据。 - + Returns: bytes, optional: 如果 media_type 不为 "video/jpeg",则返回字节数据。 - + Raises: ValueError: 如果media_type是"video/jpeg"。 """ @@ -155,13 +148,13 @@ class VideoMediaIO(MediaIO[bytes]): def load_file(self, filepath: str) -> bytes: """ 读取文件内容,并返回bytes。 - + Args: filepath (str): 文件路径,表示要读取的文件。 - + Returns: bytes, optional: 返回字节数据,包含了文件内容。 - + Raises: 无。 """ diff --git a/fastdeploy/input/preprocess.py b/fastdeploy/input/preprocess.py index 5bc1de877..27fb9e12f 100644 --- a/fastdeploy/input/preprocess.py +++ b/fastdeploy/input/preprocess.py @@ -13,31 +13,32 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + from typing import Any, Dict, Optional +from fastdeploy.config import ErnieArchitectures from fastdeploy.engine.config import ModelConfig from fastdeploy.reasoning import ReasoningParserManager -from fastdeploy.config import ErnieArchitectures class InputPreprocessor: """ - Args: - model_name_or_path (str): - Model name or path to the pretrained model. If a model name is provided, it should be a - key in the Hugging Face Transformers' model registry (https://huggingface.co/models). - The model will be downloaded from the Hugging Face model hub if necessary. - If a path is provided, the model will be loaded from that path. - reasoning_parser (str, optional): - Reasoning parser type. Defaults to None. - Flag specifies the reasoning parser to use for extracting reasoning content from the model output - enable_mm (bool, optional): - Whether to use the multi-modal model processor. Defaults to False. + Args: + model_name_or_path (str): + Model name or path to the pretrained model. If a model name is provided, it should be a + key in the Hugging Face Transformers' model registry (https://huggingface.co/models). + The model will be downloaded from the Hugging Face model hub if necessary. + If a path is provided, the model will be loaded from that path. + reasoning_parser (str, optional): + Reasoning parser type. Defaults to None. + Flag specifies the reasoning parser to use for extracting reasoning content from the model output + enable_mm (bool, optional): + Whether to use the multi-modal model processor. Defaults to False. - Raises: - ValueError: - If the model name is not found in the Hugging Face Transformers' model registry and the path does not - exist. + Raises: + ValueError: + If the model name is not found in the Hugging Face Transformers' model registry and the path does not + exist. 
""" def __init__( @@ -68,30 +69,33 @@ class InputPreprocessor: """ reasoning_parser_obj = None if self.reasoning_parser: - reasoning_parser_obj = ReasoningParserManager.get_reasoning_parser( - self.reasoning_parser) + reasoning_parser_obj = ReasoningParserManager.get_reasoning_parser(self.reasoning_parser) architectures = ModelConfig(self.model_name_or_path).architectures if not self.enable_mm: if not ErnieArchitectures.contains_ernie_arch(architectures): from fastdeploy.input.text_processor import DataProcessor + self.processor = DataProcessor( - model_name_or_path=self.model_name_or_path, reasoning_parser_obj=reasoning_parser_obj) - else: - from fastdeploy.input.ernie_processor import ErnieProcessor - self.processor = ErnieProcessor( - model_name_or_path=self.model_name_or_path, reasoning_parser_obj=reasoning_parser_obj) - else: - if not architectures.startswith( - "Ernie4_5_VLMoeForConditionalGeneration"): - raise ValueError( - f"Model {self.model_name_or_path} is not a valid Ernie4_5_VLMoe model." + model_name_or_path=self.model_name_or_path, + reasoning_parser_obj=reasoning_parser_obj, ) else: - from fastdeploy.input.ernie_vl_processor import \ - ErnieMoEVLProcessor + from fastdeploy.input.ernie_processor import ErnieProcessor + + self.processor = ErnieProcessor( + model_name_or_path=self.model_name_or_path, + reasoning_parser_obj=reasoning_parser_obj, + ) + else: + if not architectures.startswith("Ernie4_5_VLMoeForConditionalGeneration"): + raise ValueError(f"Model {self.model_name_or_path} is not a valid Ernie4_5_VLMoe model.") + else: + from fastdeploy.input.ernie_vl_processor import ErnieMoEVLProcessor + self.processor = ErnieMoEVLProcessor( model_name_or_path=self.model_name_or_path, limit_mm_per_prompt=self.limit_mm_per_prompt, mm_processor_kwargs=self.mm_processor_kwargs, - reasoning_parser_obj=reasoning_parser_obj) + reasoning_parser_obj=reasoning_parser_obj, + ) return self.processor diff --git a/fastdeploy/input/text_processor.py b/fastdeploy/input/text_processor.py index 9c3c615c4..b79d469c5 100644 --- a/fastdeploy/input/text_processor.py +++ b/fastdeploy/input/text_processor.py @@ -35,23 +35,20 @@ class BaseDataProcessor(ABC): None """ self.tokenizer = self._load_tokenizer() - self.tokenizer.bos_token_id = self.tokenizer._convert_token_to_id( - self.tokenizer.bos_token) - self.tokenizer.cls_token_id = self.tokenizer._convert_token_to_id( - self.tokenizer.cls_token) - self.tokenizer.sep_token_id = self.tokenizer._convert_token_to_id( - self.tokenizer.sep_token) - self.tokenizer.eos_token_id = self.tokenizer._convert_token_to_id( - self.tokenizer.eos_token) - self.tokenizer.mask_token_id = self.tokenizer._convert_token_to_id( - self.tokenizer.mask_token) - data_processor_logger.info(( - f"tokenizer information: bos_token is {self.tokenizer.bos_token}, {self.tokenizer.bos_token_id}, ", - f"cls_token is {self.tokenizer.cls_token}, {self.tokenizer.cls_token_id}, " - f"sep_token is {self.tokenizer.sep_token}, {self.tokenizer.sep_token_id}, " - f"eos_token is {self.tokenizer.eos_token}, {self.tokenizer.eos_token_id}, " - f"mask_token is {self.tokenizer.mask_token}, {self.tokenizer.mask_token_id}" - )) + self.tokenizer.bos_token_id = self.tokenizer._convert_token_to_id(self.tokenizer.bos_token) + self.tokenizer.cls_token_id = self.tokenizer._convert_token_to_id(self.tokenizer.cls_token) + self.tokenizer.sep_token_id = self.tokenizer._convert_token_to_id(self.tokenizer.sep_token) + self.tokenizer.eos_token_id = self.tokenizer._convert_token_to_id(self.tokenizer.eos_token) + 
self.tokenizer.mask_token_id = self.tokenizer._convert_token_to_id(self.tokenizer.mask_token) + data_processor_logger.info( + ( + f"tokenizer information: bos_token is {self.tokenizer.bos_token}, {self.tokenizer.bos_token_id}, ", + f"cls_token is {self.tokenizer.cls_token}, {self.tokenizer.cls_token_id}, " + f"sep_token is {self.tokenizer.sep_token}, {self.tokenizer.sep_token_id}, " + f"eos_token is {self.tokenizer.eos_token}, {self.tokenizer.eos_token_id}, " + f"mask_token is {self.tokenizer.mask_token}, {self.tokenizer.mask_token_id}", + ) + ) def _apply_default_parameters(self, request): """ @@ -132,7 +129,7 @@ class BaseDataProcessor(ABC): Args: token_id (List[int]): token id - task_id (str): task id + task_id (str): task id Returns: List[str]: strings @@ -151,7 +148,6 @@ class BaseDataProcessor(ABC): class DataProcessor(BaseDataProcessor): - def __init__(self, model_name_or_path, reasoning_parser_obj=None): """ Initializes the DecodeStatus object. @@ -180,8 +176,7 @@ class DataProcessor(BaseDataProcessor): from paddleformers.trl.llm_utils import get_eos_token_id - self.eos_token_ids = get_eos_token_id(self.tokenizer, - self.generation_config) + self.eos_token_ids = get_eos_token_id(self.tokenizer, self.generation_config) self.eos_token_id_len = len(self.eos_token_ids) self.pad_token_id = self.get_pad_id() self.reasoning_parser = None @@ -206,8 +201,7 @@ class DataProcessor(BaseDataProcessor): # Generation config try: - self.generation_config = GenerationConfig.from_pretrained( - self.model_name_or_path) + self.generation_config = GenerationConfig.from_pretrained(self.model_name_or_path) except Exception as e: data_processor_logger.warning( f"Can't find generation config: {e}, so it will not use generation_config field in the model config" @@ -226,8 +220,7 @@ class DataProcessor(BaseDataProcessor): str: error message """ request = self._apply_default_parameters(request) - if request.get("eos_token_ids") is None or len( - request.eos_token_ids) == 0: + if request.get("eos_token_ids") is None or len(request.eos_token_ids) == 0: request.eos_token_ids = self.eos_token_ids stop_sequences = request.get("stop", []) @@ -236,25 +229,22 @@ class DataProcessor(BaseDataProcessor): request.set("stop_token_ids", stop_seqs) request.set("stop_seqs_len", stop_seqs_len) - if request.prompt_token_ids is None or len( - request.prompt_token_ids) == 0: + if request.prompt_token_ids is None or len(request.prompt_token_ids) == 0: if request.prompt is not None: - request.prompt_token_ids = self.text2ids( - request.prompt, max_model_len, request.raw_request) + request.prompt_token_ids = self.text2ids(request.prompt, max_model_len, request.raw_request) elif request.messages is not None: if self.tokenizer.chat_template is None: - raise ValueError( - "This model does not support chat_template.") + raise ValueError("This model does not support chat_template.") task = request.to_dict() - task['enable_thinking'] = kwargs.get("enable_thinking", True) + task["enable_thinking"] = kwargs.get("enable_thinking", True) request.prompt_token_ids = self.messages2ids(task) else: - raise ValueError( - f"The request should have `input_ids`, `text` or `messages`: {request}." 
- ) + raise ValueError(f"The request should have `input_ids`, `text` or `messages`: {request}.") if request.get("max_tokens") is None: - request.set("max_tokens", - max(1, max_model_len - len(request.prompt_token_ids))) + request.set( + "max_tokens", + max(1, max_model_len - len(request.prompt_token_ids)), + ) if request.get("temperature") < _SAMPLING_EPS: # zero temperature is equivalent to greedy sampling request.set("temperature", 1) @@ -273,36 +263,31 @@ class DataProcessor(BaseDataProcessor): str: error message """ request = self._apply_default_parameters(request) - if not request.get('eos_token_ids'): - request['eos_token_ids'] = self.eos_token_ids + if not request.get("eos_token_ids"): + request["eos_token_ids"] = self.eos_token_ids # 处理stop_sequences - stop_sequences = request.get('stop', []) + stop_sequences = request.get("stop", []) if stop_sequences: stop_seqs, stop_seqs_len = self.update_stop_seq(stop_sequences) - request['stop_token_ids'] = stop_seqs - request['stop_seqs_len'] = stop_seqs_len + request["stop_token_ids"] = stop_seqs + request["stop_seqs_len"] = stop_seqs_len data_processor_logger.info(f"Processing request {request}") # 处理prompt_token_ids - if not request.get('prompt_token_ids'): - if 'prompt' in request: - raw_request = request.get('raw_request', True) - request['prompt_token_ids'] = self.text2ids( - request['prompt'], max_model_len, raw_request).tolist() - elif 'messages' in request: + if not request.get("prompt_token_ids"): + if "prompt" in request: + raw_request = request.get("raw_request", True) + request["prompt_token_ids"] = self.text2ids(request["prompt"], max_model_len, raw_request).tolist() + elif "messages" in request: if self.tokenizer.chat_template is None: - raise ValueError( - "This model does not support chat_template.") - request['prompt_token_ids'] = self.messages2ids(request) + raise ValueError("This model does not support chat_template.") + request["prompt_token_ids"] = self.messages2ids(request) else: - raise ValueError( - f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}" - ) + raise ValueError(f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}") if request.get("max_tokens") is None: - request["max_tokens"] = max( - 1, max_model_len - len(request['prompt_token_ids'])) + request["max_tokens"] = max(1, max_model_len - len(request["prompt_token_ids"])) if request.get("temperature") < _SAMPLING_EPS: # zero temperature is equivalent to greedy sampling request["temperature"] = 1 @@ -331,8 +316,7 @@ class DataProcessor(BaseDataProcessor): # 模型支持思考,并且支持思考 if self.reasoning_parser: - reasoning_content, text = self.reasoning_parser.extract_reasoning_content( - full_text, response_dict) + reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict) response_dict.outputs.text = text response_dict.outputs.reasoning_content = reasoning_content else: @@ -362,16 +346,12 @@ class DataProcessor(BaseDataProcessor): if is_end: full_text = previous_texts + delta_text if self.reasoning_parser: - reasoning_content, text = self.reasoning_parser.extract_reasoning_content( - full_text, response_dict) + reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict) response_dict["outputs"]["text"] = text - response_dict["outputs"][ - "reasoning_content"] = reasoning_content + response_dict["outputs"]["reasoning_content"] = reasoning_content else: response_dict["outputs"]["text"] = full_text - data_processor_logger.info( - 
f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}" - ) + data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}") del self.decode_status[req_id] return response_dict @@ -393,21 +373,23 @@ class DataProcessor(BaseDataProcessor): if is_end and len(token_ids) > 0 and not kwargs.get("include_stop_str_in_output"): if token_ids[-1] == self.tokenizer.eos_token_id: token_ids = token_ids[:-1] - delta_text, previous_token_ids, previous_texts = self.ids2tokens( - token_ids, req_id) + delta_text, previous_token_ids, previous_texts = self.ids2tokens(token_ids, req_id) if enable_thinking and self.reasoning_parser: reasoning_content, text = self.reasoning_parser.extract_reasoning_content_streaming( - previous_texts, previous_texts + delta_text, delta_text, - previous_token_ids, previous_token_ids + token_ids, token_ids) + previous_texts, + previous_texts + delta_text, + delta_text, + previous_token_ids, + previous_token_ids + token_ids, + token_ids, + ) response_dict["outputs"]["text"] = text response_dict["outputs"]["reasoning_content"] = reasoning_content else: response_dict["outputs"]["text"] = delta_text if is_end: - data_processor_logger.info( - f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}" - ) + data_processor_logger.info(f"req_id:{req_id}, decode_status: {self.decode_status[req_id]}") del self.decode_status[req_id] return response_dict @@ -426,11 +408,13 @@ class DataProcessor(BaseDataProcessor): enable_thinking = True stream = kwargs.get("stream", True) if stream: - return self.process_response_dict_streaming( - response_dict, enable_thinking=enable_thinking, **kwargs) + return self.process_response_dict_streaming(response_dict, enable_thinking=enable_thinking, **kwargs) else: return self.process_response_dict_normal( - response_dict=response_dict, enable_thinking=enable_thinking, **kwargs) + response_dict=response_dict, + enable_thinking=enable_thinking, + **kwargs, + ) def text2ids(self, text, max_model_len, raw_request=True): """ @@ -479,14 +463,14 @@ class DataProcessor(BaseDataProcessor): tokenize=False, split_special_tokens=False, add_special_tokens=False, - return_tensors="pd") + return_tensors="pd", + ) req_id = None tokens = self.tokenizer.tokenize(spliced_message) if isinstance(request, dict): req_id = request.get("request_id", None) token_ids = self.tokenizer.convert_tokens_to_ids(tokens) - data_processor_logger.info( - f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}") + data_processor_logger.info(f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}") return token_ids def ids2tokens(self, token_id, task_id): @@ -495,7 +479,7 @@ class DataProcessor(BaseDataProcessor): Args: token_ids (List[int]): token ids - task_id (str): task id + task_id (str): task id Returns: List[str]: strings @@ -509,10 +493,10 @@ class DataProcessor(BaseDataProcessor): decode_str = self.tokenizer.batch_decode( [previous_token_ids + token_id], skip_special_tokens=True, - clean_up_tokenization_spaces=False) + clean_up_tokenization_spaces=False, + ) if isinstance(decode_str, list) and len(decode_str): - new_str = decode_str[0].replace(self.decode_status[task_id][2], - "", 1) + new_str = decode_str[0].replace(self.decode_status[task_id][2], "", 1) self.decode_status[task_id][1].append(new_str) self.decode_status[task_id][2] = decode_str[0] else: @@ -529,7 +513,8 @@ class DataProcessor(BaseDataProcessor): previous_token_ids = self.decode_status[task_id][2] previous_texts = self.decode_status[task_id][3] decode_str, prefix_offset, 
read_offset = self.tokenizer.decode_token( - previous_token_ids + token_id, prefix_offset, read_offset) + previous_token_ids + token_id, prefix_offset, read_offset + ) self.decode_status[task_id][0] = prefix_offset self.decode_status[task_id][1] = read_offset self.decode_status[task_id][2] += token_id @@ -546,13 +531,12 @@ class DataProcessor(BaseDataProcessor): """ if self.use_hf_tokenizer: from transformers import AutoTokenizer - return AutoTokenizer.from_pretrained(self.model_name_or_path, - use_fast=False) + + return AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=False) else: from paddleformers.transformers import AutoTokenizer - return AutoTokenizer.from_pretrained(self.model_name_or_path, - padding_side="left", - use_fast=True) + + return AutoTokenizer.from_pretrained(self.model_name_or_path, padding_side="left", use_fast=True) def clear_request_status(self, task_id): """ @@ -580,22 +564,21 @@ class DataProcessor(BaseDataProcessor): Returns: int: pad_token_id """ - if isinstance(self.tokenizer, - (LlamaTokenizer, - Llama3Tokenizer)) and not self.tokenizer.pad_token_id: + if isinstance(self.tokenizer, (LlamaTokenizer, Llama3Tokenizer)) and not self.tokenizer.pad_token_id: return self.tokenizer.eos_token return self.tokenizer.pad_token_id - def pad_batch_data(self, - insts, - pad_id=0, - return_seq_len=False, - return_array=True, - pad_style="right"): + def pad_batch_data( + self, + insts, + pad_id=0, + return_seq_len=False, + return_array=True, + pad_style="right", + ): """Pad the instances to the max sequence length in batch.""" if len(insts) == 0: - padded_insts = np.array([[]], - dtype=np.int64) if return_array else [[]] + padded_insts = np.array([[]], dtype=np.int64) if return_array else [[]] if return_seq_len: seq_len = np.array([], dtype=np.int64) if return_array else [] return padded_insts, seq_len @@ -603,15 +586,11 @@ class DataProcessor(BaseDataProcessor): max_len = max(map(len, insts)) if pad_style == "left": - padded_insts = [[pad_id] * (max_len - len(inst)) + list(inst) - for inst in insts] + padded_insts = [[pad_id] * (max_len - len(inst)) + list(inst) for inst in insts] else: - padded_insts = [ - list(inst) + [pad_id] * (max_len - len(inst)) for inst in insts - ] + padded_insts = [list(inst) + [pad_id] * (max_len - len(inst)) for inst in insts] if return_array: - padded_insts = np.array(padded_insts, - dtype=np.int64).reshape([-1, max_len]) + padded_insts = np.array(padded_insts, dtype=np.int64).reshape([-1, max_len]) if return_seq_len: seq_len = [len(inst) for inst in insts] @@ -627,13 +606,7 @@ class DataProcessor(BaseDataProcessor): stop_seqs = [] for seq in stop_sequences: if seq != self.tokenizer.eos_token_id: - stop_seqs.append( - self.tokenizer.convert_tokens_to_ids( - self.tokenizer.tokenize(seq))) - stop_seqs, stop_seqs_len = self.pad_batch_data(stop_seqs, - pad_id=-1, - return_seq_len=True, - return_array=False) - data_processor_logger.debug( - f"processed stop_seqs: {stop_seqs}, {stop_seqs_len}") + stop_seqs.append(self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(seq))) + stop_seqs, stop_seqs_len = self.pad_batch_data(stop_seqs, pad_id=-1, return_seq_len=True, return_array=False) + data_processor_logger.debug(f"processed stop_seqs: {stop_seqs}, {stop_seqs_len}") return stop_seqs, stop_seqs_len diff --git a/fastdeploy/inter_communicator/__init__.py b/fastdeploy/inter_communicator/__init__.py index b908b4239..0c1cc0d9f 100644 --- a/fastdeploy/inter_communicator/__init__.py +++ b/fastdeploy/inter_communicator/__init__.py @@ -14,12 
+14,9 @@ # limitations under the License. """ -from .zmq_client import ZmqClient -from .ipc_signal import IPCSignal -from .engine_worker_queue import EngineWorkerQueue from .engine_cache_queue import EngineCacheQueue +from .engine_worker_queue import EngineWorkerQueue +from .ipc_signal import IPCSignal +from .zmq_client import ZmqClient - -__all__ = [ - 'ZmqClient', 'IPCSignal', 'EngineWorkerQueue', 'CacheQueueManager' -] +__all__ = ["ZmqClient", "IPCSignal", "EngineWorkerQueue", "EngineCacheQueue"] diff --git a/fastdeploy/inter_communicator/engine_cache_queue.py b/fastdeploy/inter_communicator/engine_cache_queue.py index 70ef08ba1..03fae97d7 100644 --- a/fastdeploy/inter_communicator/engine_cache_queue.py +++ b/fastdeploy/inter_communicator/engine_cache_queue.py @@ -16,8 +16,13 @@ import threading import time -from multiprocessing.managers import (AcquirerProxy, BaseManager, ListProxy, - Value, ValueProxy) +from multiprocessing.managers import ( + AcquirerProxy, + BaseManager, + ListProxy, + Value, + ValueProxy, +) from typing import Any, List, Tuple from fastdeploy.utils import get_logger @@ -32,14 +37,14 @@ class EngineCacheQueue: """ def __init__( - self, - address: Tuple[str, int] = ('127.0.0.1', 56666), - authkey: bytes = b'cache_queue_service', - is_server: bool = False, - num_client: int = 1, # tensor parallel size - client_id: int = -1, # tensor parallel id - local_data_parallel_size: int = 1, # data parallel size - local_data_parallel_id: int = 0, # local data parallel id + self, + address: Tuple[str, int] = ("127.0.0.1", 56666), + authkey: bytes = b"cache_queue_service", + is_server: bool = False, + num_client: int = 1, # tensor parallel size + client_id: int = -1, # tensor parallel id + local_data_parallel_size: int = 1, # data parallel size + local_data_parallel_id: int = 0, # local data parallel id ) -> None: """ Initialize the cache communication queue. 
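A minimal usage sketch of the EngineCacheQueue server/client split implied by the constructor above. The address, client count, and task payload are illustrative placeholders, not values taken from this diff; only the package export added in `__init__.py` and the `put_transfer_task`/`get_transfer_task` methods shown later in this file are assumed.

    from fastdeploy.inter_communicator import EngineCacheQueue

    # Cache-queue server: started once; owns the shared task lists, locks and
    # barriers that the tensor-parallel ranks synchronize on.
    server_queue = EngineCacheQueue(
        address=("127.0.0.1", 56666),
        is_server=True,
        num_client=2,          # tensor parallel size (illustrative)
    )

    # Cache-queue client: one per tensor-parallel rank.
    client_queue = EngineCacheQueue(
        address=("127.0.0.1", 56666),
        is_server=False,
        num_client=2,
        client_id=0,           # tensor parallel rank
    )

    # A producer enqueues a swap task; each rank reads it once, and the entry is
    # dropped after the last rank has marked it as read.
    swap_task = (["block_0", "block_7"], "task_42")   # illustrative payload
    client_queue.put_transfer_task(swap_task)
    pending = client_queue.get_transfer_task()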
@@ -64,19 +69,14 @@ class EngineCacheQueue: """ Custom QueueManager for proxy object registration """ + pass if is_server: # Server-side initialization for shared resources - self.transfer_task_queue_init: List[List[Any]] = [ - list() for _ in range(self.local_data_parallel_size) - ] - self.tansfer_done_queue_init: List[List[Any]] = [ - list() for _ in range(self.local_data_parallel_size) - ] - self.cache_sync_value_init: List[Value] = [ - Value("i", 0) for _ in range(self.local_data_parallel_size) - ] + self.transfer_task_queue_init: List[List[Any]] = [list() for _ in range(self.local_data_parallel_size)] + self.tansfer_done_queue_init: List[List[Any]] = [list() for _ in range(self.local_data_parallel_size)] + self.cache_sync_value_init: List[Value] = [Value("i", 0) for _ in range(self.local_data_parallel_size)] self.transfer_task_lock_init: List[threading.Lock] = [ threading.Lock() for _ in range(self.local_data_parallel_size) ] @@ -85,84 +85,76 @@ class EngineCacheQueue: ] # Initialize barriers - self.barrier1_init = [ - threading.Barrier(self.num_client) - for _ in range(self.local_data_parallel_size) - ] - self.barrier2_init = [ - threading.Barrier(self.num_client) - for _ in range(self.local_data_parallel_size) - ] - self.barrier3_init = [ - threading.Barrier(self.num_client) - for _ in range(self.local_data_parallel_size) - ] + self.barrier1_init = [threading.Barrier(self.num_client) for _ in range(self.local_data_parallel_size)] + self.barrier2_init = [threading.Barrier(self.num_client) for _ in range(self.local_data_parallel_size)] + self.barrier3_init = [threading.Barrier(self.num_client) for _ in range(self.local_data_parallel_size)] self.swap_to_cpu_barrier1_init = [ - threading.Barrier(self.num_client) - for _ in range(self.local_data_parallel_size) + threading.Barrier(self.num_client) for _ in range(self.local_data_parallel_size) ] self.swap_to_cpu_barrier2_init = [ - threading.Barrier(self.num_client) - for _ in range(self.local_data_parallel_size) + threading.Barrier(self.num_client) for _ in range(self.local_data_parallel_size) ] self.swap_to_gpu_barrier1_init = [ - threading.Barrier(self.num_client) - for _ in range(self.local_data_parallel_size) + threading.Barrier(self.num_client) for _ in range(self.local_data_parallel_size) ] self.swap_to_gpu_barrier2_init = [ - threading.Barrier(self.num_client) - for _ in range(self.local_data_parallel_size) + threading.Barrier(self.num_client) for _ in range(self.local_data_parallel_size) ] # Register shared objects with proxy types QueueManager.register( "get_transfer_task_queue", callable=lambda idx: self.transfer_task_queue_init[idx], - proxytype=ListProxy) + proxytype=ListProxy, + ) QueueManager.register( "get_tansfer_done_queue", callable=lambda idx: self.tansfer_done_queue_init[idx], - proxytype=ListProxy) + proxytype=ListProxy, + ) QueueManager.register( "get_cache_sync_value", callable=lambda idx: self.cache_sync_value_init[idx], - proxytype=ValueProxy) + proxytype=ValueProxy, + ) QueueManager.register( "get_transfer_task_lock", callable=lambda idx: self.transfer_task_lock_init[idx], - proxytype=AcquirerProxy) + proxytype=AcquirerProxy, + ) QueueManager.register( "get_transfer_task_done_lock", callable=lambda idx: self.transfer_task_done_lock_init[idx], - proxytype=AcquirerProxy) - QueueManager.register("get_barrier1", - callable=lambda idx: self.barrier1_init[idx]) - QueueManager.register("get_barrier2", - callable=lambda idx: self.barrier2_init[idx]) - QueueManager.register("get_barrier3", - callable=lambda idx: 
self.barrier3_init[idx]) + proxytype=AcquirerProxy, + ) + QueueManager.register("get_barrier1", callable=lambda idx: self.barrier1_init[idx]) + QueueManager.register("get_barrier2", callable=lambda idx: self.barrier2_init[idx]) + QueueManager.register("get_barrier3", callable=lambda idx: self.barrier3_init[idx]) QueueManager.register( "get_swap_to_cpu_barrier1", - callable=lambda idx: self.swap_to_cpu_barrier1_init[idx]) + callable=lambda idx: self.swap_to_cpu_barrier1_init[idx], + ) QueueManager.register( "get_swap_to_cpu_barrier2", - callable=lambda idx: self.swap_to_cpu_barrier2_init[idx]) + callable=lambda idx: self.swap_to_cpu_barrier2_init[idx], + ) QueueManager.register( "get_swap_to_gpu_barrier1", - callable=lambda idx: self.swap_to_gpu_barrier1_init[idx]) + callable=lambda idx: self.swap_to_gpu_barrier1_init[idx], + ) QueueManager.register( "get_swap_to_gpu_barrier2", - callable=lambda idx: self.swap_to_gpu_barrier2_init[idx]) + callable=lambda idx: self.swap_to_gpu_barrier2_init[idx], + ) - self.manager: BaseManager = QueueManager(address=self.address, - authkey=self.authkey) + self.manager: BaseManager = QueueManager(address=self.address, authkey=self.authkey) self.manager.start() logger.info(f"EngineCacheQueue server started at {self.address}") else: # Client-side connection setup - assert 0 <= self.client_id < self.num_client, ( - f"client_id must be between 0 and {self.num_client-1}, got {self.client_id}" - ) + assert ( + 0 <= self.client_id < self.num_client + ), f"client_id must be between 0 and {self.num_client-1}, got {self.client_id}" QueueManager.register("get_transfer_task_queue") QueueManager.register("get_tansfer_done_queue") QueueManager.register("get_cache_sync_value") @@ -176,45 +168,32 @@ class EngineCacheQueue: QueueManager.register("get_swap_to_gpu_barrier1") QueueManager.register("get_swap_to_gpu_barrier2") - self.manager = QueueManager(address=self.address, - authkey=self.authkey) + self.manager = QueueManager(address=self.address, authkey=self.authkey) self._connect_with_retry() # Get proxy objects for shared resources - self.transfer_task_queue = self.manager.get_transfer_task_queue( - self.local_data_parallel_id) - self.tansfer_done_queue = self.manager.get_tansfer_done_queue( - self.local_data_parallel_id) - self.task_sync_value = self.manager.get_cache_sync_value( - self.local_data_parallel_id) - self.task_lock = self.manager.get_transfer_task_lock( - self.local_data_parallel_id) - self.task_done_lock = self.manager.get_transfer_task_done_lock( - self.local_data_parallel_id) + self.transfer_task_queue = self.manager.get_transfer_task_queue(self.local_data_parallel_id) + self.tansfer_done_queue = self.manager.get_tansfer_done_queue(self.local_data_parallel_id) + self.task_sync_value = self.manager.get_cache_sync_value(self.local_data_parallel_id) + self.task_lock = self.manager.get_transfer_task_lock(self.local_data_parallel_id) + self.task_done_lock = self.manager.get_transfer_task_done_lock(self.local_data_parallel_id) # Get barrier proxies self.barrier1 = self.manager.get_barrier1(self.local_data_parallel_id) self.barrier2 = self.manager.get_barrier2(self.local_data_parallel_id) self.barrier3 = self.manager.get_barrier3(self.local_data_parallel_id) - self.swap_to_cpu_barrier1 = self.manager.get_swap_to_cpu_barrier1( - self.local_data_parallel_id) - self.swap_to_cpu_barrier2 = self.manager.get_swap_to_cpu_barrier2( - self.local_data_parallel_id) - self.swap_to_gpu_barrier1 = self.manager.get_swap_to_gpu_barrier1( - self.local_data_parallel_id) - 
self.swap_to_gpu_barrier2 = self.manager.get_swap_to_gpu_barrier2( - self.local_data_parallel_id) + self.swap_to_cpu_barrier1 = self.manager.get_swap_to_cpu_barrier1(self.local_data_parallel_id) + self.swap_to_cpu_barrier2 = self.manager.get_swap_to_cpu_barrier2(self.local_data_parallel_id) + self.swap_to_gpu_barrier1 = self.manager.get_swap_to_gpu_barrier1(self.local_data_parallel_id) + self.swap_to_gpu_barrier2 = self.manager.get_swap_to_gpu_barrier2(self.local_data_parallel_id) self.total_num: int = (1 << self.num_client) - 1 if not is_server: # Setup position and total_num for sync operations self.position: int = 1 << self.client_id - logger.info( - f"Connected EngineCacheQueue client_id: {self.client_id}") + logger.info(f"Connected EngineCacheQueue client_id: {self.client_id}") - def _connect_with_retry(self, - max_retries: int = 5, - interval: int = 3) -> None: + def _connect_with_retry(self, max_retries: int = 5, interval: int = 3) -> None: """ Connect to the server with retry mechanism. @@ -231,8 +210,7 @@ class EngineCacheQueue: return except ConnectionRefusedError: time.sleep(interval) - raise ConnectionError( - f"EngineCacheQueue cannot connect to {self.address}") + raise ConnectionError(f"EngineCacheQueue cannot connect to {self.address}") def put_transfer_task(self, item): """ @@ -246,8 +224,7 @@ class EngineCacheQueue: self.task_lock.acquire() self.task_sync_value.set(0) self.transfer_task_queue.append(item) - logger.info( - f"put_transfer_task: put swap task {item[-1]} to queue successful") + logger.info(f"put_transfer_task: put swap task {item[-1]} to queue successful") self.task_lock.release() def get_transfer_task(self): @@ -257,15 +234,11 @@ class EngineCacheQueue: data = None read_finish = False self.task_lock.acquire() - if (self.task_sync_value.get() & self.position == 0 - and len(self.transfer_task_queue) > 0): + if self.task_sync_value.get() & self.position == 0 and len(self.transfer_task_queue) > 0: data = self.transfer_task_queue[0] - logger.debug( - f"get_transfer_task: Get {data} by {self.client_id} from queue successful" - ) + logger.debug(f"get_transfer_task: Get {data} by {self.client_id} from queue successful") set_value = self.task_sync_value.get() | self.position - logger.info("get_transfer_task: rank: {0} set_value: {1}".format( - self.client_id, set_value)) + logger.info(f"get_transfer_task: rank: {self.client_id} set_value: {set_value}") if set_value >= self.total_num: self.transfer_task_queue.pop(0) set_value = 0 @@ -281,9 +254,7 @@ class EngineCacheQueue: self.task_done_lock.acquire() self.tansfer_done_queue.append(item) self.task_done_lock.release() - logger.info( - f"put_transfer_done_signal: put swap task {item[-1]} finished signal to queue successful" - ) + logger.info(f"put_transfer_done_signal: put swap task {item[-1]} finished signal to queue successful") def get_transfer_done_signal(self): """ @@ -293,9 +264,7 @@ class EngineCacheQueue: self.task_done_lock.acquire() if len(self.tansfer_done_queue) > 0: data = self.tansfer_done_queue.pop(0) - logger.info( - f"get_transfer_done_signal: Get swap task {data[-1]} finished signal from queue successful" - ) + logger.info(f"get_transfer_done_signal: Get swap task {data[-1]} finished signal from queue successful") self.task_done_lock.release() return data diff --git a/fastdeploy/inter_communicator/engine_worker_queue.py b/fastdeploy/inter_communicator/engine_worker_queue.py index d837c6a27..da88265a2 100644 --- a/fastdeploy/inter_communicator/engine_worker_queue.py +++ 
b/fastdeploy/inter_communicator/engine_worker_queue.py @@ -16,8 +16,13 @@ import threading import time -from multiprocessing.managers import (AcquirerProxy, BaseManager, ListProxy, - Value, ValueProxy) +from multiprocessing.managers import ( + AcquirerProxy, + BaseManager, + ListProxy, + Value, + ValueProxy, +) from queue import Queue from typing import Any, List, Tuple @@ -33,14 +38,14 @@ class EngineWorkerQueue: """ def __init__( - self, - address: Tuple[str, int] = ('0.0.0.0', 5000), - authkey: bytes = b'secret_key', - is_server: bool = False, - num_client: int = 1, # tensor parallel size - client_id: int = -1, # tensor parallel id - local_data_parallel_size: int = 1, # data parallel size - local_data_parallel_id: int = 0, # local data parallel id + self, + address: Tuple[str, int] = ("0.0.0.0", 5000), + authkey: bytes = b"secret_key", + is_server: bool = False, + num_client: int = 1, # tensor parallel size + client_id: int = -1, # tensor parallel id + local_data_parallel_size: int = 1, # data parallel size + local_data_parallel_id: int = 0, # local data parallel id ) -> None: """ Initialize the communication queue. @@ -64,35 +69,24 @@ class EngineWorkerQueue: """ Custom QueueManager for proxy object registration. """ + pass if is_server: # Server-side initialization for shared resources - self.tasks_init: List[List[Any]] = [ - list() for _ in range(self.local_data_parallel_size) - ] + self.tasks_init: List[List[Any]] = [list() for _ in range(self.local_data_parallel_size)] self.client_read_flag_init: List[List[int]] = [ - [1] * self.num_client - for _ in range(self.local_data_parallel_size) - ] - self.lock_init: List[threading.Lock] = [ - threading.Lock() for _ in range(self.local_data_parallel_size) - ] - self.read_finish_flag_init: List[Value] = [ - Value("i", 0) for _ in range(self.local_data_parallel_size) + [1] * self.num_client for _ in range(self.local_data_parallel_size) ] + self.lock_init: List[threading.Lock] = [threading.Lock() for _ in range(self.local_data_parallel_size)] + self.read_finish_flag_init: List[Value] = [Value("i", 0) for _ in range(self.local_data_parallel_size)] self.connected_client_counter_init: List[Value] = [ Value("i", 0) for _ in range(self.local_data_parallel_size) ] - self.finished_req_queue = [ - Queue() for _ in range(self.local_data_parallel_size) - ] - self.cache_infos_init: List[List[Any]] = [ - list() for _ in range(self.local_data_parallel_size) - ] + self.finished_req_queue = [Queue() for _ in range(self.local_data_parallel_size)] + self.cache_infos_init: List[List[Any]] = [list() for _ in range(self.local_data_parallel_size)] self.client_read_info_flag_init: List[List[int]] = [ - [1] * self.num_client - for _ in range(self.local_data_parallel_size) + [1] * self.num_client for _ in range(self.local_data_parallel_size) ] self.lock_info_init: List[threading.Lock] = [ threading.Lock() for _ in range(self.local_data_parallel_size) @@ -103,66 +97,77 @@ class EngineWorkerQueue: ] # Register shared objects with proxy types - QueueManager.register("get_tasks", - callable=lambda idx: self.tasks_init[idx], - proxytype=ListProxy) + QueueManager.register( + "get_tasks", + callable=lambda idx: self.tasks_init[idx], + proxytype=ListProxy, + ) QueueManager.register( "get_client_read_flag", callable=lambda idx: self.client_read_flag_init[idx], - proxytype=ListProxy) - QueueManager.register("get_lock", - callable=lambda idx: self.lock_init[idx], - proxytype=AcquirerProxy) + proxytype=ListProxy, + ) + QueueManager.register( + "get_lock", + callable=lambda idx: 
self.lock_init[idx], + proxytype=AcquirerProxy, + ) QueueManager.register( "get_read_finish_flag", callable=lambda idx: self.read_finish_flag_init[idx], - proxytype=ValueProxy) + proxytype=ValueProxy, + ) QueueManager.register( "get_connected_client_counter", callable=lambda idx: self.connected_client_counter_init[idx], - proxytype=ValueProxy) + proxytype=ValueProxy, + ) QueueManager.register( - 'get_finish_request_queue', - callable=lambda idx: self.finished_req_queue[idx]) + "get_finish_request_queue", + callable=lambda idx: self.finished_req_queue[idx], + ) QueueManager.register( "get_cache_infos", callable=lambda idx: self.cache_infos_init[idx], - proxytype=ListProxy) + proxytype=ListProxy, + ) QueueManager.register( "get_client_read_info_flag", callable=lambda idx: self.client_read_info_flag_init[idx], - proxytype=ListProxy) + proxytype=ListProxy, + ) QueueManager.register( "get_lock_info", callable=lambda idx: self.lock_info_init[idx], - proxytype=AcquirerProxy) + proxytype=AcquirerProxy, + ) - self.disaggregate_requests = [ - Queue() for _ in range(self.local_data_parallel_size) - ] + self.disaggregate_requests = [Queue() for _ in range(self.local_data_parallel_size)] QueueManager.register( "get_disaggregate_requests", - callable=lambda idx: self.disaggregate_requests[idx]) + callable=lambda idx: self.disaggregate_requests[idx], + ) self.available_prefill_instances = Queue() QueueManager.register( "get_available_prefill_instances", - callable=lambda: self.available_prefill_instances) - + callable=lambda: self.available_prefill_instances, + ) + QueueManager.register( "get_finish_request_barrier", - callable=lambda idx: self.finish_request_barrier[idx]) - self.manager: BaseManager = QueueManager(address=self.address, - authkey=self.authkey) + callable=lambda idx: self.finish_request_barrier[idx], + ) + self.manager: BaseManager = QueueManager(address=self.address, authkey=self.authkey) self.manager.start() else: # Client-side connection setup - assert self.client_id >= 0 and self.client_id < self.num_client, ( - f"self.client_id={self.client_id}, self.num_client={self.num_client}" - ) + assert ( + self.client_id >= 0 and self.client_id < self.num_client + ), f"self.client_id={self.client_id}, self.num_client={self.num_client}" QueueManager.register("get_tasks") QueueManager.register("get_client_read_flag") QueueManager.register("get_lock") @@ -175,37 +180,26 @@ class EngineWorkerQueue: QueueManager.register("get_disaggregate_requests") QueueManager.register("get_available_prefill_instances") QueueManager.register("get_finish_request_barrier") - self.manager = QueueManager(address=self.address, - authkey=self.authkey) + self.manager = QueueManager(address=self.address, authkey=self.authkey) self._connect_with_retry() # Get proxy objects for shared resources - self.tasks: ListProxy = self.manager.get_tasks( - self.local_data_parallel_id) - self.client_read_flag: ListProxy = self.manager.get_client_read_flag( - self.local_data_parallel_id) - self.lock: AcquirerProxy = self.manager.get_lock( - self.local_data_parallel_id) - self.read_finish_flag: ValueProxy = self.manager.get_read_finish_flag( - self.local_data_parallel_id) - self.connected_client_counter: ValueProxy = \ - self.manager.get_connected_client_counter(self.local_data_parallel_id) - self.cache_infos: ListProxy = self.manager.get_cache_infos( - self.local_data_parallel_id) - self.client_read_info_flag: ListProxy = self.manager.get_client_read_info_flag( - self.local_data_parallel_id) - self.lock_info: AcquirerProxy = 
self.manager.get_lock_info( - self.local_data_parallel_id) - - # p/d 分离获取 - self.disaggregate_requests = self.manager.get_disaggregate_requests( - self.local_data_parallel_id) - self.available_prefill_instances = self.manager.get_available_prefill_instances() - self.finish_request_barrier = self.manager.get_finish_request_barrier( + self.tasks: ListProxy = self.manager.get_tasks(self.local_data_parallel_id) + self.client_read_flag: ListProxy = self.manager.get_client_read_flag(self.local_data_parallel_id) + self.lock: AcquirerProxy = self.manager.get_lock(self.local_data_parallel_id) + self.read_finish_flag: ValueProxy = self.manager.get_read_finish_flag(self.local_data_parallel_id) + self.connected_client_counter: ValueProxy = self.manager.get_connected_client_counter( self.local_data_parallel_id ) - self.finished_req_queue = self.manager.get_finish_request_queue( - self.local_data_parallel_id) + self.cache_infos: ListProxy = self.manager.get_cache_infos(self.local_data_parallel_id) + self.client_read_info_flag: ListProxy = self.manager.get_client_read_info_flag(self.local_data_parallel_id) + self.lock_info: AcquirerProxy = self.manager.get_lock_info(self.local_data_parallel_id) + + # p/d 分离获取 + self.disaggregate_requests = self.manager.get_disaggregate_requests(self.local_data_parallel_id) + self.available_prefill_instances = self.manager.get_available_prefill_instances() + self.finish_request_barrier = self.manager.get_finish_request_barrier(self.local_data_parallel_id) + self.finished_req_queue = self.manager.get_finish_request_queue(self.local_data_parallel_id) assert self.num_client == len(self.client_read_flag) if is_server: @@ -213,17 +207,14 @@ class EngineWorkerQueue: else: # Update client connection counter self.lock.acquire() - self.connected_client_counter.set( - self.connected_client_counter.get() + 1) + self.connected_client_counter.set(self.connected_client_counter.get() + 1) self.lock.release() - llm_logger.info(( + llm_logger.info( f"Connected EngineWorkerQueue client_id: {self.client_id}, number " f"of connected clients: {self.connected_client_counter.get()}" - )) + ) - def _connect_with_retry(self, - max_retries: int = 5, - interval: int = 3) -> None: + def _connect_with_retry(self, max_retries: int = 5, interval: int = 3) -> None: """ Connect to the server with retry mechanism. 
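Along the same lines, a hedged sketch of how an EngineWorkerQueue client exchanges cache info with the engine, using the default address from the constructor above; the `cache_info` payload and client count are illustrative, and only methods that appear in this file (`put_cache_info`, `get_cache_info`, `num_cache_infos`) are used.

    from fastdeploy.inter_communicator import EngineWorkerQueue

    # Engine side: hosts the shared task and cache-info lists for all workers.
    engine_queue = EngineWorkerQueue(
        address=("0.0.0.0", 5000),
        is_server=True,
        num_client=2,          # tensor parallel size (illustrative)
    )

    # Worker side: one client per tensor-parallel rank; the connected-client
    # counter above is incremented under the shared lock once this returns.
    worker_queue = EngineWorkerQueue(
        address=("0.0.0.0", 5000),
        is_server=False,
        num_client=2,
        client_id=0,
    )

    # Publish cache metadata once, then let every rank drain it; the shared list
    # is cleared only after all ranks have flagged it as read.
    worker_queue.put_cache_info([{"request_id": "req-0", "action": "swap"}])  # illustrative payload
    if worker_queue.num_cache_infos() > 0:
        infos = worker_queue.get_cache_info()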
@@ -272,8 +263,7 @@ class EngineWorkerQueue: self.lock.acquire() tasks.extend(self.tasks) self.client_read_flag[self.client_id] = 1 - all_client_read: bool = np.sum( - self.client_read_flag) == self.num_client + all_client_read: bool = np.sum(self.client_read_flag) == self.num_client if all_client_read: self.tasks[:] = list() self.lock.release() @@ -290,7 +280,7 @@ class EngineWorkerQueue: total_num: int = len(self.tasks) self.lock.release() return total_num - + def get_prefill_instances(self): """ check if the prefill queue is empty @@ -300,7 +290,6 @@ class EngineWorkerQueue: else: return self.available_prefill_instances.get() - def put_cache_info(self, cache_info) -> None: """ Args: @@ -316,9 +305,7 @@ class EngineWorkerQueue: self.client_read_info_flag[:] = [0] * self.num_client self.cache_infos.extend(cache_info) - llm_logger.debug( - f"cache_infos: {self.cache_infos} local_data_parallel_id:{self.local_data_parallel_id}" - ) + llm_logger.debug(f"cache_infos: {self.cache_infos} local_data_parallel_id:{self.local_data_parallel_id}") self.lock_info.release() def get_cache_info(self) -> List[Any]: @@ -335,17 +322,14 @@ class EngineWorkerQueue: return cache_infos cache_infos.extend(self.cache_infos) self.client_read_info_flag[self.client_id] = 1 - all_client_read: bool = np.sum( - self.client_read_info_flag) == self.num_client + all_client_read: bool = np.sum(self.client_read_info_flag) == self.num_client if all_client_read: self.cache_infos[:] = list() self.lock_info.release() if len(cache_infos) != 0: - llm_logger.debug( - f"get cache infos: {cache_infos} local_data_parallel_id:{self.local_data_parallel_id}" - ) + llm_logger.debug(f"get cache infos: {cache_infos} local_data_parallel_id:{self.local_data_parallel_id}") return cache_infos - + def num_cache_infos(self) -> int: """ Get current number of tasks in the queue. diff --git a/fastdeploy/inter_communicator/ipc_signal.py b/fastdeploy/inter_communicator/ipc_signal.py index ec7d98568..0ac2e3fa0 100644 --- a/fastdeploy/inter_communicator/ipc_signal.py +++ b/fastdeploy/inter_communicator/ipc_signal.py @@ -14,9 +14,11 @@ # limitations under the License. """ -import numpy as np from multiprocessing.shared_memory import SharedMemory +import numpy as np + + def shared_memory_exists(name: str) -> bool: """Check if a shared memory block with the given name exists. @@ -37,8 +39,6 @@ def shared_memory_exists(name: str) -> bool: return False - - class IPCSignal: """A shared memory wrapper for inter-process communication using numpy arrays. @@ -50,12 +50,14 @@ class IPCSignal: value: Numpy array interface to the shared memory buffer. """ - def __init__(self, - name: str, - array: np.ndarray, - dtype: np.dtype, - suffix: int = None, - create: bool = True) -> None: + def __init__( + self, + name: str, + array: np.ndarray, + dtype: np.dtype, + suffix: int = None, + create: bool = True, + ) -> None: """Initialize or connect to a shared memory block. 
Args: @@ -76,18 +78,13 @@ class IPCSignal: name = name + f".{suffix}" if create: - assert not shared_memory_exists( - name), f"ShareMemory: {name} already exists" + assert not shared_memory_exists(name), f"ShareMemory: {name} already exists" self.shm = SharedMemory(create=True, size=array.nbytes, name=name) - self.value: np.ndarray = np.ndarray(array.shape, - dtype=array.dtype, - buffer=self.shm.buf) + self.value: np.ndarray = np.ndarray(array.shape, dtype=array.dtype, buffer=self.shm.buf) self.value[:] = array # Initialize with input array data else: self.shm = SharedMemory(name=name) - self.value: np.ndarray = np.ndarray(array.shape, - dtype=array.dtype, - buffer=self.shm.buf) + self.value: np.ndarray = np.ndarray(array.shape, dtype=array.dtype, buffer=self.shm.buf) def clear(self) -> None: """Release system resources and unlink the shared memory block.""" diff --git a/fastdeploy/inter_communicator/zmq_client.py b/fastdeploy/inter_communicator/zmq_client.py index 115331c32..05e55929d 100644 --- a/fastdeploy/inter_communicator/zmq_client.py +++ b/fastdeploy/inter_communicator/zmq_client.py @@ -14,13 +14,12 @@ # limitations under the License. """ -import json import os import threading import time -import zmq import msgpack +import zmq from fastdeploy import envs from fastdeploy.utils import llm_logger @@ -104,22 +103,21 @@ class ZmqClient: for response in data[1:]: result.add(response) result = msgpack.packb([result.to_dict()]) - return result + return result + def send_multipart(self, req_id, data): """ Send a multipart message to the router socket. """ if self.router is None: - raise RuntimeError( - "Router socket not created. Call create_router() first.") + raise RuntimeError("Router socket not created. Call create_router() first.") while self.running: with self.mutex: if req_id not in self.req_dict: try: - client, _, request_id = self.router.recv_multipart( - flags=zmq.NOBLOCK) - req_id_str = request_id.decode('utf-8') + client, _, request_id = self.router.recv_multipart(flags=zmq.NOBLOCK) + req_id_str = request_id.decode("utf-8") self.req_dict[req_id_str] = client except zmq.Again: time.sleep(0.001) @@ -133,7 +131,7 @@ class ZmqClient: result = self.pack_aggregated_data(data) else: result = msgpack.packb([response.to_dict() for response in data]) - self.router.send_multipart([self.req_dict[req_id], b'', result]) + self.router.send_multipart([self.req_dict[req_id], b"", result]) llm_logger.debug(f"send_multipart result: {req_id} len {len(data)} elapse: {time.time()-start_send}") except Exception as e: @@ -143,7 +141,6 @@ class ZmqClient: with self.mutex: self.req_dict.pop(req_id, None) llm_logger.info(f"send_multipart finished, req_id: {req_id}") - def receive_json_once(self, block=False): """ @@ -197,7 +194,7 @@ class ZmqClient: self.running = False llm_logger.info("Closing ZMQ connection...") try: - if hasattr(self, 'socket') and not self.socket.closed: + if hasattr(self, "socket") and not self.socket.closed: self.socket.close() if self.router is not None and not self.router.closed: diff --git a/fastdeploy/metrics/__init__.py b/fastdeploy/metrics/__init__.py index 1680a0d6a..d997c5113 100644 --- a/fastdeploy/metrics/__init__.py +++ b/fastdeploy/metrics/__init__.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" + """ metrics """ @@ -28,7 +29,7 @@ def build_buckets(mantissa_lst: List[int], max_value: int) -> List[int]: buckets: List[int] = [] while True: for m in mantissa_lst: - value = m * 10 ** exponent + value = m * 10**exponent if value <= max_value: buckets.append(value) else: diff --git a/fastdeploy/metrics/metrics.py b/fastdeploy/metrics/metrics.py index 015c0337c..a09273fc8 100644 --- a/fastdeploy/metrics/metrics.py +++ b/fastdeploy/metrics/metrics.py @@ -19,30 +19,34 @@ metrics """ import os import shutil -from typing import Set, TYPE_CHECKING +from typing import Set -from prometheus_client import Gauge, Histogram, multiprocess, CollectorRegistry, generate_latest, Counter +from prometheus_client import ( + CollectorRegistry, + Counter, + Gauge, + Histogram, + generate_latest, + multiprocess, +) from prometheus_client.registry import Collector from fastdeploy.metrics import build_1_2_5_buckets from fastdeploy.metrics.work_metrics import work_process_metrics -if TYPE_CHECKING: - from prometheus_client import Gauge, Histogram, Counter - def cleanup_prometheus_files(is_main): """ - Cleans and recreates the Prometheus multiprocess directory. + Cleans and recreates the Prometheus multiprocess directory. - Depending on whether it's the main process or a worker, this function removes the corresponding - Prometheus multiprocess directory (/tmp/prom_main or /tmp/prom_worker) and recreates it as an empty directory. + Depending on whether it's the main process or a worker, this function removes the corresponding + Prometheus multiprocess directory (/tmp/prom_main or /tmp/prom_worker) and recreates it as an empty directory. - Args: - is_main (bool): Indicates whether the current process is the main process. + Args: + is_main (bool): Indicates whether the current process is the main process. - Returns: - str: The path to the newly created Prometheus multiprocess directory. + Returns: + str: The path to the newly created Prometheus multiprocess directory. """ PROM_DIR = "/tmp/prom_main" if is_main else "/tmp/prom_worker" if os.path.exists(PROM_DIR): @@ -53,30 +57,30 @@ def cleanup_prometheus_files(is_main): class SimpleCollector(Collector): """ - A custom Prometheus collector that filters out specific metrics by name. + A custom Prometheus collector that filters out specific metrics by name. - This collector wraps an existing registry and yields only those metrics - whose names are not in the specified exclusion set. + This collector wraps an existing registry and yields only those metrics + whose names are not in the specified exclusion set. """ def __init__(self, base_registry, exclude_names: Set[str]): """ - Initializes the SimpleCollector. + Initializes the SimpleCollector. - Args: - base_registry (CollectorRegistry): The source registry from which metrics are collected. - exclude_names (Set[str]): A set of metric names to exclude from collection. + Args: + base_registry (CollectorRegistry): The source registry from which metrics are collected. + exclude_names (Set[str]): A set of metric names to exclude from collection. """ self.base_registry = base_registry self.exclude_names = exclude_names def collect(self): """ - Collects and yields metrics not in the exclusion list. + Collects and yields metrics not in the exclusion list. - Yields: - Metric: Prometheus Metric objects that are not excluded. - """ + Yields: + Metric: Prometheus Metric objects that are not excluded. 
+ """ for metric in self.base_registry.collect(): if not any(name.startswith(metric.name) for name in self.exclude_names): yield metric @@ -102,124 +106,157 @@ def get_filtered_metrics(exclude_names: Set[str], extra_register_func=None) -> s REQUEST_LATENCY_BUCKETS = [ - 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, - 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0 + 0.3, + 0.5, + 0.8, + 1.0, + 1.5, + 2.0, + 2.5, + 5.0, + 10.0, + 15.0, + 20.0, + 30.0, + 40.0, + 50.0, + 60.0, + 120.0, + 240.0, + 480.0, + 960.0, + 1920.0, + 7680.0, ] class MetricsManager: - """Prometheus Metrics Manager handles all metric updates """ + """Prometheus Metrics Manager handles all metric updates""" _instance = None - num_requests_running: 'Gauge' - num_requests_waiting: 'Gauge' - time_to_first_token: 'Histogram' - time_per_output_token: 'Histogram' - request_inference_time: 'Histogram' - request_queue_time: 'Histogram' - gpu_cache_usage_perc: 'Gauge' - generation_tokens_total: 'Counter' - request_prefill_time: 'Histogram' - request_decode_time: 'Histogram' - request_generation_tokens: 'Histogram' - request_success_total: 'Counter' - spec_decode_draft_acceptance_rate: 'Gauge' - spec_decode_efficiency: 'Gauge' - spec_decode_num_accepted_tokens_total: 'Counter' - spec_decode_num_draft_tokens_total: 'Counter' - spec_decode_num_emitted_tokens_total: 'Counter' - spec_decode_draft_single_head_acceptance_rate: 'list[Gauge]' + num_requests_running: "Gauge" + num_requests_waiting: "Gauge" + time_to_first_token: "Histogram" + time_per_output_token: "Histogram" + request_inference_time: "Histogram" + request_queue_time: "Histogram" + gpu_cache_usage_perc: "Gauge" + generation_tokens_total: "Counter" + request_prefill_time: "Histogram" + request_decode_time: "Histogram" + request_generation_tokens: "Histogram" + request_success_total: "Counter" + spec_decode_draft_acceptance_rate: "Gauge" + spec_decode_efficiency: "Gauge" + spec_decode_num_accepted_tokens_total: "Counter" + spec_decode_num_draft_tokens_total: "Counter" + spec_decode_num_emitted_tokens_total: "Counter" + spec_decode_draft_single_head_acceptance_rate: "list[Gauge]" # 定义所有指标配置 METRICS = { - 'num_requests_running': { - 'type': Gauge, - 'name': 'fastdeploy:num_requests_running', - 'description': 'Number of requests currently running', - 'kwargs': {} + "num_requests_running": { + "type": Gauge, + "name": "fastdeploy:num_requests_running", + "description": "Number of requests currently running", + "kwargs": {}, }, - 'num_requests_waiting': { - 'type': Gauge, - 'name': 'fastdeploy:num_requests_waiting', - 'description': 'Number of requests currently waiting', - 'kwargs': {} + "num_requests_waiting": { + "type": Gauge, + "name": "fastdeploy:num_requests_waiting", + "description": "Number of requests currently waiting", + "kwargs": {}, }, - 'time_to_first_token': { - 'type': Histogram, - 'name': 'fastdeploy:time_to_first_token_seconds', - 'description': 'Time to first token in seconds', - 'kwargs': { - 'buckets': [0.001, 0.005, 0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.25, 0.5, 0.75, 1.0] - } + "time_to_first_token": { + "type": Histogram, + "name": "fastdeploy:time_to_first_token_seconds", + "description": "Time to first token in seconds", + "kwargs": { + "buckets": [ + 0.001, + 0.005, + 0.01, + 0.02, + 0.04, + 0.06, + 0.08, + 0.1, + 0.25, + 0.5, + 0.75, + 1.0, + ] + }, }, - 'time_per_output_token': { - 'type': Histogram, - 'name': 'fastdeploy:time_per_output_token_seconds', - 'description': 'Time per output token in seconds', - 'kwargs': { - 
'buckets': [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75, 1.0] - } + "time_per_output_token": { + "type": Histogram, + "name": "fastdeploy:time_per_output_token_seconds", + "description": "Time per output token in seconds", + "kwargs": { + "buckets": [ + 0.01, + 0.025, + 0.05, + 0.075, + 0.1, + 0.15, + 0.2, + 0.3, + 0.4, + 0.5, + 0.75, + 1.0, + ] + }, }, - - 'request_inference_time': { - 'type': Histogram, - 'name': 'fastdeploy:request_inference_time_seconds', - 'description': 'Time spent in inference phase (from inference start to last token)', - 'kwargs': { - 'buckets': REQUEST_LATENCY_BUCKETS - } + "request_inference_time": { + "type": Histogram, + "name": "fastdeploy:request_inference_time_seconds", + "description": "Time spent in inference phase (from inference start to last token)", + "kwargs": {"buckets": REQUEST_LATENCY_BUCKETS}, }, - 'request_queue_time': { - 'type': Histogram, - 'name': 'fastdeploy:request_queue_time_seconds', - 'description': 'Time spent in waiting queue (from preprocess end to inference start)', - 'kwargs': { - 'buckets': REQUEST_LATENCY_BUCKETS - } + "request_queue_time": { + "type": Histogram, + "name": "fastdeploy:request_queue_time_seconds", + "description": "Time spent in waiting queue (from preprocess end to inference start)", + "kwargs": {"buckets": REQUEST_LATENCY_BUCKETS}, }, - 'gpu_cache_usage_perc': { - 'type': Gauge, - 'name': 'fastdeploy:gpu_cache_usage_perc', - 'description': 'GPU KV-cache usage. 1 means 100 percent usage', - 'kwargs': {} + "gpu_cache_usage_perc": { + "type": Gauge, + "name": "fastdeploy:gpu_cache_usage_perc", + "description": "GPU KV-cache usage. 1 means 100 percent usage", + "kwargs": {}, }, - - 'generation_tokens_total': { - 'type': Counter, - 'name': 'fastdeploy:generation_tokens_total', - 'description': 'Total number of generation tokens processed', - 'kwargs': {} + "generation_tokens_total": { + "type": Counter, + "name": "fastdeploy:generation_tokens_total", + "description": "Total number of generation tokens processed", + "kwargs": {}, }, - 'request_prefill_time': { - 'type': Histogram, - 'name': 'fastdeploy:request_prefill_time_seconds', - 'description': 'Time spent in prefill phase (from preprocess start to preprocess end)', - 'kwargs': { - 'buckets': REQUEST_LATENCY_BUCKETS - } + "request_prefill_time": { + "type": Histogram, + "name": "fastdeploy:request_prefill_time_seconds", + "description": "Time spent in prefill phase (from preprocess start to preprocess end)", + "kwargs": {"buckets": REQUEST_LATENCY_BUCKETS}, }, - 'request_decode_time': { - 'type': Histogram, - 'name': 'fastdeploy:request_decode_time_seconds', - 'description': 'Time spent in decode phase (from first token to last token)', - 'kwargs': { - 'buckets': REQUEST_LATENCY_BUCKETS - } + "request_decode_time": { + "type": Histogram, + "name": "fastdeploy:request_decode_time_seconds", + "description": "Time spent in decode phase (from first token to last token)", + "kwargs": {"buckets": REQUEST_LATENCY_BUCKETS}, }, - 'request_generation_tokens': { - 'type': Histogram, - 'name': 'fastdeploy:request_generation_tokens', - 'description': 'Number of generation tokens processed.', - 'kwargs': { - 'buckets': build_1_2_5_buckets(33792) - } + "request_generation_tokens": { + "type": Histogram, + "name": "fastdeploy:request_generation_tokens", + "description": "Number of generation tokens processed.", + "kwargs": {"buckets": build_1_2_5_buckets(33792)}, }, - 'request_success_total': { - 'type': Counter, - 'name': 'fastdeploy:request_success_total', - 
'description': 'Total number of successfully processed requests', - 'kwargs': {} + "request_success_total": { + "type": Counter, + "name": "fastdeploy:request_success_total", + "description": "Total number of successfully processed requests", + "kwargs": {}, }, } SPECULATIVE_METRICS = {} @@ -228,11 +265,11 @@ class MetricsManager: """Initializes the Prometheus metrics and starts the HTTP server if not already initialized.""" # 动态创建所有指标 for metric_name, config in self.METRICS.items(): - setattr(self, metric_name, config['type']( - config['name'], - config['description'], - **config['kwargs'] - )) + setattr( + self, + metric_name, + config["type"](config["name"], config["description"], **config["kwargs"]), + ) def _init_speculative_metrics(self, speculative_method, num_speculative_tokens): self.SPECULATIVE_METRICS = { @@ -256,19 +293,19 @@ class MetricsManager: }, } if speculative_method == "mtp": - self.SPECULATIVE_METRICS["spec_decode_efficiency"]={ + self.SPECULATIVE_METRICS["spec_decode_efficiency"] = { "type": Gauge, "name": "fastdeploy:spec_decode_efficiency", "description": "Efficiency of speculative decoding", "kwargs": {}, } - self.SPECULATIVE_METRICS["spec_decode_num_draft_tokens_total"]={ + self.SPECULATIVE_METRICS["spec_decode_num_draft_tokens_total"] = { "type": Counter, "name": "fastdeploy:spec_decode_num_draft_tokens_total", "description": "Total number of speculative tokens generated by the proposal method", "kwargs": {}, } - self.SPECULATIVE_METRICS["spec_decode_draft_single_head_acceptance_rate"]={ + self.SPECULATIVE_METRICS["spec_decode_draft_single_head_acceptance_rate"] = { "type": list[Gauge], "name": "fastdeploy:spec_decode_draft_single_head_acceptance_rate", "description": "Single head acceptance rate of speculative decoding", @@ -290,7 +327,9 @@ class MetricsManager: self, metric_name, config["type"]( - config["name"], config["description"], **config["kwargs"] + config["name"], + config["description"], + **config["kwargs"], ), ) @@ -318,7 +357,7 @@ class MetricsManager: @classmethod def get_excluded_metrics(cls) -> Set[str]: """Get the set of indicator names that need to be excluded""" - return {config['name'] for config in cls.METRICS.values()} + return {config["name"] for config in cls.METRICS.values()} main_process_metrics = MetricsManager() diff --git a/fastdeploy/metrics/trace_util.py b/fastdeploy/metrics/trace_util.py index 576e284a2..e51446e77 100644 --- a/fastdeploy/metrics/trace_util.py +++ b/fastdeploy/metrics/trace_util.py @@ -1,16 +1,16 @@ -from opentelemetry.propagate import inject, extract -from opentelemetry import trace -from opentelemetry.sdk.trace import TracerProvider -from opentelemetry.sdk.trace.export import BatchSpanProcessor -from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter -from opentelemetry.sdk.trace.export import ConsoleSpanExporter -from opentelemetry.sdk.resources import Resource -from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor -from fastapi import FastAPI -from fastdeploy.utils import (llm_logger) -from fastdeploy import envs import json +from fastapi import FastAPI +from opentelemetry import trace +from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter +from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor +from opentelemetry.propagate import extract, inject +from opentelemetry.sdk.resources import Resource +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor, 
ConsoleSpanExporter + +from fastdeploy import envs +from fastdeploy.utils import llm_logger # OpenTelemetry Trace context store in metadata TRACE_CARRIER = "trace_carrier" @@ -18,6 +18,7 @@ TRACE_CARRIER = "trace_carrier" traces_enable = False tracer = trace.get_tracer(__name__) + def set_up(): try: # when TRACES_ENABLED=true start trace @@ -32,9 +33,7 @@ def set_up(): service_name = envs.FD_SERVICE_NAME host_name = envs.FD_HOST_NAME # --- set attributes (Service Name, Host Name, etc.) --- - resource_attributes = { - "service.name": service_name - } + resource_attributes = {"service.name": service_name} if host_name: resource_attributes["host.name"] = host_name @@ -43,12 +42,12 @@ def set_up(): # --- set Exporter --- exporter_type = envs.TRACES_EXPORTER.lower() if exporter_type == "otlp": - endpoint = envs.EXPORTER_OTLP_ENDPOINT # should be set + endpoint = envs.EXPORTER_OTLP_ENDPOINT # should be set headers = envs.EXPORTER_OTLP_HEADERS # e.g., "Authentication=***,k2=v2" otlp_exporter = OTLPSpanExporter( endpoint=endpoint, - headers=dict(item.split("=") for item in headers.split(",")) if headers else None + headers=(dict(item.split("=") for item in headers.split(",")) if headers else None), ) processor = BatchSpanProcessor(otlp_exporter) llm_logger.info(f"Using OTLP Exporter, sending to {endpoint} with headers {headers}") @@ -66,6 +65,7 @@ def set_up(): llm_logger.error("set_up failed") pass + def instrument(app: FastAPI): try: set_up() @@ -77,26 +77,25 @@ def instrument(app: FastAPI): pass - -def inject_to_metadata(request, metadata_attr='metadata'): +def inject_to_metadata(request, metadata_attr="metadata"): """ - Inject OpenTelemetry trace context into the metadata field of the request. + Inject OpenTelemetry trace context into the metadata field of the request. - Parameters: - request: can be a dict or object, with metadata attributes or fields. - metadata_attr: the field name of metadata, default is 'metadata'. + Parameters: + request: can be a dict or object, with metadata attributes or fields. + metadata_attr: the field name of metadata, default is 'metadata'. - Operation: - - If metadata does not exist, create a new one and mount it on the request. - - Inject the current trace context as a JSON string and store it in metadata. - - Use the key TRACE_CARRIER to store the injected content. + Operation: + - If metadata does not exist, create a new one and mount it on the request. + - Inject the current trace context as a JSON string and store it in metadata. + - Use the key TRACE_CARRIER to store the injected content. - Note: - - This function is a non-blocking operation, and errors are silently ignored. - - If there is no metadata attribute in the request, an empty dict will be created for it as its attribute + Note: + - This function is a non-blocking operation, and errors are silently ignored. + - If there is no metadata attribute in the request, an empty dict will be created for it as its attribute """ try: - if request is None or traces_enable == False: + if request is None or not traces_enable: return metadata = request.get(metadata_attr) if isinstance(request, dict) else getattr(request, metadata_attr, None) @@ -115,17 +114,17 @@ def inject_to_metadata(request, metadata_attr='metadata'): pass -def extract_from_metadata(request, metadata_attr='metadata'): +def extract_from_metadata(request, metadata_attr="metadata"): """ - Extract trace context from metadata of request object (dict or class instance). 
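# A minimal standalone sketch of the trace-carrier round trip that
# inject_to_metadata/extract_from_metadata describe: the current trace context is
# injected into a plain dict, stored as a JSON string under the "trace_carrier" key
# of the request metadata, and later turned back into an OpenTelemetry Context.
# The `req` dict below is a hypothetical stand-in for the real request object.
import json

from opentelemetry import trace
from opentelemetry.propagate import extract, inject

tracer = trace.get_tracer(__name__)


def inject_carrier(request: dict) -> None:
    carrier: dict = {}
    inject(carrier)  # writes traceparent/tracestate for the current span context
    request.setdefault("metadata", {})["trace_carrier"] = json.dumps(carrier)


def extract_carrier(request: dict):
    carrier = json.loads(request["metadata"]["trace_carrier"])
    return extract(carrier)  # returns an opentelemetry.context.Context


with tracer.start_as_current_span("api_request"):
    req = {"prompt": "hello"}
    inject_carrier(req)

# later, possibly in another process, continue the same trace
with tracer.start_as_current_span("engine_step", context=extract_carrier(req)):
    pass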
+ Extract trace context from metadata of request object (dict or class instance). - Parameters: - request: can be a dictionary or any object, containing metadata attributes or fields. - metadata_attr: metadata field name, default is 'metadata'. + Parameters: + request: can be a dictionary or any object, containing metadata attributes or fields. + metadata_attr: metadata field name, default is 'metadata'. - Returns: - - Extraction success: returns OpenTelemetry context object (Context) - - Extraction failure or exception: returns None + Returns: + - Extraction success: returns OpenTelemetry context object (Context) + - Extraction failure or exception: returns None """ try: metadata = request.get(metadata_attr) if isinstance(request, dict) else getattr(request, metadata_attr, None) @@ -145,15 +144,15 @@ def extract_from_metadata(request, metadata_attr='metadata'): def extract_from_request(request): """ - Extract trace context from trace_carrier of request object (dict or class instance). + Extract trace context from trace_carrier of request object (dict or class instance). - Parameters: - request: can be a dictionary or any object, containing metadata attributes or fields. - metadata_attr: metadata field name, default is 'metadata'. + Parameters: + request: can be a dictionary or any object, containing metadata attributes or fields. + metadata_attr: metadata field name, default is 'metadata'. - Returns: - - Extraction success: returns OpenTelemetry context object (Context) - - Extraction failure or exception: returns None + Returns: + - Extraction success: returns OpenTelemetry context object (Context) + - Extraction failure or exception: returns None """ try: trace_carrier_info = getattr(request, TRACE_CARRIER, None) @@ -170,14 +169,14 @@ def extract_from_request(request): def start_span(span_name, request, kind=trace.SpanKind.CLIENT): """ - just start a new span in request trace context + just start a new span in request trace context """ try: if not traces_enable: return # extract Trace context from request.metadata.trace_carrier ctx = extract_from_metadata(request) - with tracer.start_as_current_span(span_name, context=ctx, kind=kind) as span: + with tracer.start_as_current_span(span_name, context=ctx, kind=kind): pass except: pass @@ -185,14 +184,14 @@ def start_span(span_name, request, kind=trace.SpanKind.CLIENT): def start_span_request(span_name, request, kind=trace.SpanKind.CLIENT): """ - just start a new span in request trace context + just start a new span in request trace context """ try: if not traces_enable: return # extract Trace context from request.metadata.trace_carrier ctx = extract_from_request(request) - with tracer.start_as_current_span(span_name, context=ctx, kind=kind) as span: + with tracer.start_as_current_span(span_name, context=ctx, kind=kind): pass except: - pass \ No newline at end of file + pass diff --git a/fastdeploy/metrics/work_metrics.py b/fastdeploy/metrics/work_metrics.py index 28182bf3a..190940ff6 100644 --- a/fastdeploy/metrics/work_metrics.py +++ b/fastdeploy/metrics/work_metrics.py @@ -17,18 +17,14 @@ """ metrics """ -import os -import atexit -import shutil -from threading import Lock -from prometheus_client import Histogram, Counter +from prometheus_client import Counter, Histogram from fastdeploy.metrics.metrics import build_1_2_5_buckets -class WorkMetricsManager(object): - """Prometheus Metrics Manager handles all metric updates """ +class WorkMetricsManager: + """Prometheus Metrics Manager handles all metric updates""" _initialized = False @@ 
-39,26 +35,45 @@ class WorkMetricsManager(object): return self.e2e_request_latency = Histogram( - 'fastdeploy:e2e_request_latency_seconds', - 'End-to-end request latency (from request arrival to final response)', + "fastdeploy:e2e_request_latency_seconds", + "End-to-end request latency (from request arrival to final response)", buckets=[ - 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, - 40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0 - ] + 0.3, + 0.5, + 0.8, + 1.0, + 1.5, + 2.0, + 2.5, + 5.0, + 10.0, + 15.0, + 20.0, + 30.0, + 40.0, + 50.0, + 60.0, + 120.0, + 240.0, + 480.0, + 960.0, + 1920.0, + 7680.0, + ], ) self.request_params_max_tokens = Histogram( - name='fastdeploy:request_params_max_tokens', - documentation='Histogram of max_tokens parameter in request parameters', - buckets=build_1_2_5_buckets(33792) + name="fastdeploy:request_params_max_tokens", + documentation="Histogram of max_tokens parameter in request parameters", + buckets=build_1_2_5_buckets(33792), ) self.prompt_tokens_total = Counter( name="fastdeploy:prompt_tokens_total", documentation="Total number of prompt tokens processed", ) self.request_prompt_tokens = Histogram( - name='fastdeploy:request_prompt_tokens', - documentation='Number of prefill tokens processed.', - buckets=build_1_2_5_buckets(33792) + name="fastdeploy:request_prompt_tokens", + documentation="Number of prefill tokens processed.", + buckets=build_1_2_5_buckets(33792), ) self._initialized = True diff --git a/fastdeploy/model_executor/__init__.py b/fastdeploy/model_executor/__init__.py index c40559bc8..f4ede9062 100644 --- a/fastdeploy/model_executor/__init__.py +++ b/fastdeploy/model_executor/__init__.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" \ No newline at end of file +""" diff --git a/fastdeploy/model_executor/forward_meta.py b/fastdeploy/model_executor/forward_meta.py index 15395b419..d6108cffc 100644 --- a/fastdeploy/model_executor/forward_meta.py +++ b/fastdeploy/model_executor/forward_meta.py @@ -30,6 +30,7 @@ class ForwardMode(IntEnum): """ Forward mode used during attention. """ + # Prefill and Extend mode EXTEND = auto() # Decode mode @@ -38,23 +39,24 @@ class ForwardMode(IntEnum): MIXED = auto() def is_prefill(self): - """ Is Extend mode """ + """Is Extend mode""" return self == ForwardMode.EXTEND def is_decode(self): - """ Is Decode mode """ + """Is Decode mode""" return self == ForwardMode.DECODE def is_mixed(self): - """ Is Mixed mode """ + """Is Mixed mode""" return self == ForwardMode.MIXED @dataclass -class ForwardMeta(): +class ForwardMeta: """ ForwardMeta is used to store the global meta information of the model forward. """ + # Input tokens IDs input_ids: paddle.Tensor # Input tokens IDs of removed padding @@ -100,7 +102,7 @@ class ForwardMeta(): caches: Optional[list[paddle.Tensor]] = None def clear_caches(self): - """ Safely clean up the caches """ + """Safely clean up the caches""" if self.caches: del self.caches @@ -110,6 +112,7 @@ class XPUForwardMeta(ForwardMeta): """ XPUForwardMeta is used to store the global meta information of the forward, and some XPU specific meta info. 
""" + # TODO(wanghaitao): Supplementary notes # encoder_batch_map: Optional[paddle.Tensor] = None diff --git a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py index 730a05807..bbe95feb4 100644 --- a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py +++ b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py @@ -17,19 +17,19 @@ from dataclasses import dataclass from typing import Callable, Dict, Optional -import paddle.device.cuda.graphs as graphs import paddle.nn.layer +from paddle.device.cuda import graphs from fastdeploy.config import FDConfig from fastdeploy.utils import get_logger -logger = get_logger("cudagrpah_piecewise_backend", - "cudagraph_piecewise_backend.log") +logger = get_logger("cudagrpah_piecewise_backend", "cudagraph_piecewise_backend.log") @dataclass class ConcreteSizeEntry: - """ Record the concrete information corresponding to the current batch size """ + """Record the concrete information corresponding to the current batch size""" + # Concrete batch size runtime_bs: int # The size is in cudagraph_capture_sizes @@ -48,7 +48,7 @@ class ConcreteSizeEntry: class CudaGraphPiecewiseBackend: - """ Manage the capture and replay of CUDA graphs at the subgraph level. """ + """Manage the capture and replay of CUDA graphs at the subgraph level.""" def __init__( self, @@ -65,12 +65,10 @@ class CudaGraphPiecewiseBackend: self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {} for shape in self.cudagraph_capture_sizes: - self.concrete_size_entries[shape] = ConcreteSizeEntry( - runtime_bs=shape) + self.concrete_size_entries[shape] = ConcreteSizeEntry(runtime_bs=shape) logger.info( - f"[CUDA GRAPH] CUDAGraph capture list {self.cudagraph_capture_sizes}, " - "Created all batch sizes entry." + f"[CUDA GRAPH] CUDAGraph capture list {self.cudagraph_capture_sizes}, " "Created all batch sizes entry." ) def __call__(self, **kwargs): @@ -87,9 +85,7 @@ class CudaGraphPiecewiseBackend: assert entry is not None, f"Batch size:{padding_batch_size} is not in cuda graph capture list." 
if entry.runnable is None: entry.runnable = self.runnable - logger.debug( - f"[CUDA GRAPH] New entry lazy initialize with batch size {padding_batch_size}" - ) + logger.debug(f"[CUDA GRAPH] New entry lazy initialize with batch size {padding_batch_size}") if not entry.use_cudagraph: return entry.runnable(**kwargs) @@ -106,10 +102,7 @@ class CudaGraphPiecewiseBackend: ) # Store input addresses for debug - input_addresses = [ - x.data_ptr() for (_, x) in kwargs.items() - if isinstance(x, paddle.Tensor) - ] + input_addresses = [x.data_ptr() for (_, x) in kwargs.items() if isinstance(x, paddle.Tensor)] entry.input_addresses = input_addresses new_grpah = graphs.CUDAGraph() @@ -127,13 +120,9 @@ class CudaGraphPiecewiseBackend: output._clear paddle.device.synchronize() - logger.debug( - f"[CUDA GRAPH] CUDAGraph captured for batch size {padding_batch_size}" - ) + logger.debug(f"[CUDA GRAPH] CUDAGraph captured for batch size {padding_batch_size}") # Replay entry.cuda_graph.replay() - logger.debug( - f"[CUDA GRAPH] CUDAGraph replayed for batch size {padding_batch_size}" - ) + logger.debug(f"[CUDA GRAPH] CUDAGraph replayed for batch size {padding_batch_size}") return entry.output_buffer diff --git a/fastdeploy/model_executor/graph_optimization/decorator.py b/fastdeploy/model_executor/graph_optimization/decorator.py index 8661a7beb..49b92feb4 100644 --- a/fastdeploy/model_executor/graph_optimization/decorator.py +++ b/fastdeploy/model_executor/graph_optimization/decorator.py @@ -20,8 +20,9 @@ from typing import Callable, Optional, TypeVar import paddle.nn.layer from fastdeploy.config import FDConfig -from fastdeploy.model_executor.graph_optimization.graph_optimization_backend import \ - GraphOptBackend +from fastdeploy.model_executor.graph_optimization.graph_optimization_backend import ( + GraphOptBackend, +) _T = TypeVar("_T", bound=type[paddle.nn.Layer]) @@ -46,23 +47,21 @@ def support_graph_optimization(cls: Optional[_T] = None) -> _T: if GraphOptWrapper in cls.__bases__: return cls else: - cls.__bases__ = cls.__bases__ + (GraphOptWrapper, ) + cls.__bases__ = cls.__bases__ + (GraphOptWrapper,) origin_init = cls.__init__ def __init__(self, fd_config: FDConfig, **kwargs): - """ Decorator model.__init__() func """ + """Decorator model.__init__() func""" origin_init(self, fd_config=fd_config, **kwargs) self.use_graph_opt = fd_config.graph_opt_config.graph_opt_level > 0 or fd_config.graph_opt_config.use_cudagraph if self.use_graph_opt: - GraphOptWrapper.__init__(self, - fd_config=fd_config, - graph_opt_backend=None) + GraphOptWrapper.__init__(self, fd_config=fd_config, graph_opt_backend=None) else: # Not use graph optimization return def __call__(self, **kwargs): - """ Decorator model.__call__() func """ + """Decorator model.__call__() func""" if not self.use_graph_opt: return self.forward(**kwargs) @@ -74,7 +73,7 @@ def support_graph_optimization(cls: Optional[_T] = None) -> _T: class GraphOptWrapper: - """ The wrapper for GraphOptBackend """ + """The wrapper for GraphOptBackend""" def __init__( self, @@ -87,7 +86,7 @@ class GraphOptWrapper: @abstractmethod def forward(self, **kwargs): - """ Abstract methods for implementing model.forward() """ + """Abstract methods for implementing model.forward()""" pass def __call__(self, **kwargs): diff --git a/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py b/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py index 9ce6f7372..367c0f670 100644 --- 
a/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py +++ b/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py @@ -19,8 +19,9 @@ from typing import Callable, Optional from paddle.jit.dy2static.utils import Backend from fastdeploy.config import FDConfig -from fastdeploy.model_executor.graph_optimization.cudagraph_piecewise_backend import \ - CudaGraphPiecewiseBackend +from fastdeploy.model_executor.graph_optimization.cudagraph_piecewise_backend import ( + CudaGraphPiecewiseBackend, +) class GraphOptBackend: @@ -36,32 +37,28 @@ class GraphOptBackend: self.runnable = runnable self.fd_config = fd_config - self.max_captre_batch = fd_config.graph_opt_config.cudagraph_capture_sizes[ - 0] + self.max_captre_batch = fd_config.graph_opt_config.cudagraph_capture_sizes[0] if self.fd_config.graph_opt_config.graph_opt_level > 0: # 1. Prepare cuda grpah input buffers (contain output of subgraphs) # 2. Convert dynamic grpah to static graph from paddle.jit import sot - backend = (Backend.CINN - if self.fd_config.graph_opt_config.graph_opt_level > 1 - else Backend.PHI) - self.runnable = sot.symbolic_translate(self.runnable, - training=False, - backend=backend) + + backend = Backend.CINN if self.fd_config.graph_opt_config.graph_opt_level > 1 else Backend.PHI + self.runnable = sot.symbolic_translate(self.runnable, training=False, backend=backend) def __call__(self, **kwargs): if not self.fd_config.graph_opt_config.use_cudagraph: return self.runnable(**kwargs) if self.cudagraph_piecewise_backend is None: self.cudagraph_piecewise_backend = CudaGraphPiecewiseBackend( - fd_config=self.fd_config, runnable=self.runnable) + fd_config=self.fd_config, runnable=self.runnable + ) assert kwargs["forward_meta"].ids_remove_padding is not None batch_size = kwargs["forward_meta"].ids_remove_padding.shape[0] - if ((not kwargs["forward_meta"].step_use_cudagraph) - or (batch_size > self.max_captre_batch)): + if (not kwargs["forward_meta"].step_use_cudagraph) or (batch_size > self.max_captre_batch): return self.runnable(**kwargs) else: return self.cudagraph_piecewise_backend.__call__(**kwargs) diff --git a/fastdeploy/model_executor/guided_decoding/__init__.py b/fastdeploy/model_executor/guided_decoding/__init__.py index 53163f2c2..d6ee61199 100644 --- a/fastdeploy/model_executor/guided_decoding/__init__.py +++ b/fastdeploy/model_executor/guided_decoding/__init__.py @@ -16,7 +16,7 @@ # from fastdeploy.config import FDConfig -__all__ = ['get_guided_backend', 'schema_checker'] +__all__ = ["get_guided_backend", "schema_checker"] def get_guided_backend( @@ -37,8 +37,10 @@ def get_guided_backend( ValueError: If the specified backend is not supported """ if fd_config.parallel_config.guided_decoding_backend.lower() == "xgrammar": - from fastdeploy.model_executor.guided_decoding.xgrammar_backend import \ - XGrammarBackend + from fastdeploy.model_executor.guided_decoding.xgrammar_backend import ( + XGrammarBackend, + ) + return XGrammarBackend( fd_config=fd_config, **kwargs, @@ -46,7 +48,8 @@ def get_guided_backend( else: raise ValueError( f"Get unsupported backend {fd_config.parallel_config.guided_decoding_backend}," - f" please check your configuration.") + f" please check your configuration." 
+ ) def schema_checker(backend_name: str, **kwargs): @@ -64,10 +67,10 @@ def schema_checker(backend_name: str, **kwargs): ValueError: If the specified backend is not supported """ if backend_name.lower() == "xgrammar": - from fastdeploy.model_executor.guided_decoding.xgrammar_backend import \ - XGrammarChecker + from fastdeploy.model_executor.guided_decoding.xgrammar_backend import ( + XGrammarChecker, + ) + return XGrammarChecker(**kwargs) else: - raise ValueError( - f"Get unsupported backend {backend_name}, please check your configuration." - ) + raise ValueError(f"Get unsupported backend {backend_name}, please check your configuration.") diff --git a/fastdeploy/model_executor/guided_decoding/base_guided_decoding.py b/fastdeploy/model_executor/guided_decoding/base_guided_decoding.py index d89b9ccb4..260c1a2af 100644 --- a/fastdeploy/model_executor/guided_decoding/base_guided_decoding.py +++ b/fastdeploy/model_executor/guided_decoding/base_guided_decoding.py @@ -17,7 +17,7 @@ import os from concurrent.futures import ThreadPoolExecutor -from fastdeploy.config import FDConfig, ErnieArchitectures +from fastdeploy.config import ErnieArchitectures, FDConfig from fastdeploy.engine.request import Request from fastdeploy.utils import llm_logger @@ -48,7 +48,7 @@ class LogitsProcessorBase: Raises: NotImplementedError: This method should be implemented in subclasses. """ - raise NotImplementedError() + raise NotImplementedError def apply_token_mask(self, logits, token_bitmask): """ @@ -61,7 +61,7 @@ class LogitsProcessorBase: Raises: NotImplementedError: This method should be implemented in subclasses. """ - raise NotImplementedError() + raise NotImplementedError def allocate_token_bitmask(self, batch_size, vocab_size): """ @@ -74,7 +74,7 @@ class LogitsProcessorBase: Returns: tensor: The allocated token bitmask. """ - raise NotImplementedError() + raise NotImplementedError def accept_token(self, token): """ @@ -86,7 +86,7 @@ class LogitsProcessorBase: Raises: NotImplementedError: This method should be implemented in subclasses. """ - raise NotImplementedError() + raise NotImplementedError def is_terminated(self): """ @@ -95,13 +95,13 @@ class LogitsProcessorBase: Raises: NotImplementedError: This method should be implemented in subclasses. """ - raise NotImplementedError() + raise NotImplementedError def reset(self): """ Reset the matcher state. """ - raise NotImplementedError() + raise NotImplementedError def copy(self): """ @@ -110,7 +110,7 @@ class LogitsProcessorBase: Returns: BackendBase: A copy of the backend instance. """ - raise NotImplementedError() + raise NotImplementedError class BackendBase: @@ -146,7 +146,7 @@ class BackendBase: Raises: NotImplementedError: This method should be implemented in subclasses. """ - raise NotImplementedError() + raise NotImplementedError def _json_processor(self, schemata): """ @@ -158,7 +158,7 @@ class BackendBase: Raises: NotImplementedError: This method should be implemented in subclasses. """ - raise NotImplementedError() + raise NotImplementedError def _regex_processor(self, schemata): """ @@ -170,7 +170,7 @@ class BackendBase: Raises: NotImplementedError: This method should be implemented in subclasses. """ - raise NotImplementedError() + raise NotImplementedError def _grammar_processor(self, schemata): """ @@ -182,7 +182,7 @@ class BackendBase: Raises: NotImplementedError: This method should be implemented in subclasses. 
""" - raise NotImplementedError() + raise NotImplementedError def _structural_tag_processor(self, schemata): """ @@ -194,7 +194,7 @@ class BackendBase: Raises: NotImplementedError: This method should be implemented in subclasses. """ - raise NotImplementedError() + raise NotImplementedError def _unsupported_processor_type(self, key_type, schemata): """ @@ -206,8 +206,7 @@ class BackendBase: """ raise Exception(f"Unsupported processor type {key_type}.") - def _init_logits_processor( - self, schemata_key: tuple[str, str]) -> LogitsProcessorBase: + def _init_logits_processor(self, schemata_key: tuple[str, str]) -> LogitsProcessorBase: """ init logits processor by type and schemata. @@ -233,9 +232,7 @@ class BackendBase: llm_logger.error(f"Unsupported processor type {key_type}.") return None - def get_logits_processor( - self, - schemata_key: tuple[str, str]) -> tuple[LogitsProcessorBase, bool]: + def get_logits_processor(self, schemata_key: tuple[str, str]) -> tuple[LogitsProcessorBase, bool]: """ get logits processor by key from cache or create new one. @@ -271,39 +268,41 @@ class BackendBase: if not ErnieArchitectures.contains_ernie_arch(architectures): from transformers import AutoTokenizer, PreTrainedTokenizerFast + tokenizer = AutoTokenizer.from_pretrained( self.fd_config.parallel_config.model_name_or_path, use_fast=False, ) if not isinstance(tokenizer, PreTrainedTokenizerFast): - tokenizer = PreTrainedTokenizerFast( - __slow_tokenizer=tokenizer) + tokenizer = PreTrainedTokenizerFast(__slow_tokenizer=tokenizer) else: - from fastdeploy.model_executor.guided_decoding.ernie_tokenizer import \ - ErnieBotTokenizer + from fastdeploy.model_executor.guided_decoding.ernie_tokenizer import ( + ErnieBotTokenizer, + ) vocab_file_names = [ - "tokenizer.model", "spm.model", "ernie_token_100k.model" + "tokenizer.model", + "spm.model", + "ernie_token_100k.model", ] for i in range(len(vocab_file_names)): if os.path.exists( - os.path.join( - self.fd_config.parallel_config. - model_name_or_path, vocab_file_names[i])): - ErnieBotTokenizer.vocab_files_names[ - "vocab_file"] = vocab_file_names[i] + os.path.join( + self.fd_config.parallel_config.model_name_or_path, + vocab_file_names[i], + ) + ): + ErnieBotTokenizer.vocab_files_names["vocab_file"] = vocab_file_names[i] break - tokenizer = ErnieBotTokenizer.from_pretrained( - self.fd_config.parallel_config.model_name_or_path) + tokenizer = ErnieBotTokenizer.from_pretrained(self.fd_config.parallel_config.model_name_or_path) return tokenizer except Exception as e: raise Exception(f"Fail to initialize hf tokenizer: {e}") - def add_cache(self, schemata_key: tuple[str, str], - processor: LogitsProcessorBase) -> None: + def add_cache(self, schemata_key: tuple[str, str], processor: LogitsProcessorBase) -> None: """ add logits processor to cache. @@ -343,4 +342,4 @@ class BaseChecker: Returns: request (Request): request object with formatted schema. """ - raise NotImplementedError() + raise NotImplementedError diff --git a/fastdeploy/model_executor/guided_decoding/ernie_tokenizer.py b/fastdeploy/model_executor/guided_decoding/ernie_tokenizer.py index b78b77a4b..40d67c42a 100644 --- a/fastdeploy/model_executor/guided_decoding/ernie_tokenizer.py +++ b/fastdeploy/model_executor/guided_decoding/ernie_tokenizer.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" + import os from shutil import copyfile from typing import Any, Dict, List, Optional, Tuple @@ -63,18 +64,10 @@ class ErnieBotTokenizer(PreTrainedTokenizer): self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(vocab_file) - bos_token = AddedToken(bos_token, - lstrip=False, rstrip=False) if isinstance( - bos_token, str) else bos_token - eos_token = AddedToken(eos_token, - lstrip=False, rstrip=False) if isinstance( - eos_token, str) else eos_token - unk_token = AddedToken(unk_token, - lstrip=False, rstrip=False) if isinstance( - unk_token, str) else unk_token - pad_token = AddedToken(pad_token, - lstrip=False, rstrip=False) if isinstance( - pad_token, str) else pad_token + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token super().__init__( bos_token=bos_token, eos_token=eos_token, @@ -111,10 +104,7 @@ class ErnieBotTokenizer(PreTrainedTokenizer): def get_vocab(self): """Returns vocab as a dict""" - vocab = { - self.convert_ids_to_tokens(i): i - for i in range(self.vocab_size) - } + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} vocab.update(self.added_tokens_encoder) return vocab @@ -126,10 +116,12 @@ class ErnieBotTokenizer(PreTrainedTokenizer): """Returns a tokenized string.""" return self.sp_model.encode(text, out_type=str) - def decode(self, - tokens, - skip_special_tokens=False, - clean_up_tokenization_spaces=False): + def decode( + self, + tokens, + skip_special_tokens=False, + clean_up_tokenization_spaces=False, + ): """Returns a tokenized string.""" return self.sp_model.decode(tokens) @@ -161,9 +153,7 @@ class ErnieBotTokenizer(PreTrainedTokenizer): out_string += self.sp_model.decode(current_sub_tokens) return out_string - def save_vocabulary(self, - save_directory, - filename_prefix: Optional[str] = None) -> Tuple[str]: + def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]: """ Save the vocabulary and special tokens file to a directory. 
Args: @@ -176,18 +166,17 @@ class ErnieBotTokenizer(PreTrainedTokenizer): return out_vocab_file = os.path.join( save_directory, - (filename_prefix + "-" if filename_prefix else "") + - VOCAB_FILES_NAMES["vocab_file"]) + (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"], + ) - if os.path.abspath(self.vocab_file) != os.path.abspath( - out_vocab_file) and os.path.isfile(self.vocab_file): + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): copyfile(self.vocab_file, out_vocab_file) elif not os.path.isfile(self.vocab_file): with open(out_vocab_file, "wb") as fi: content_spiece_model = self.sp_model.serialized_model_proto() fi.write(content_spiece_model) - return (out_vocab_file, ) + return (out_vocab_file,) def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): """ @@ -204,10 +193,11 @@ class ErnieBotTokenizer(PreTrainedTokenizer): return output def get_special_tokens_mask( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None, - already_has_special_tokens: bool = False) -> List[int]: + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None, + already_has_special_tokens: bool = False, + ) -> List[int]: """ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer `prepare_for_model` method. @@ -225,20 +215,26 @@ class ErnieBotTokenizer(PreTrainedTokenizer): return super().get_special_tokens_mask( token_ids_0=token_ids_0, token_ids_1=token_ids_1, - already_has_special_tokens=True) + already_has_special_tokens=True, + ) bos_token_id = [1] if self.add_bos_token else [] eos_token_id = [1] if self.add_eos_token else [] if token_ids_1 is None: return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id - return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id + - bos_token_id + ([0] * len(token_ids_1)) + eos_token_id) + return ( + bos_token_id + + ([0] * len(token_ids_0)) + + eos_token_id + + bos_token_id + + ([0] * len(token_ids_1)) + + eos_token_id + ) def create_token_type_ids_from_sequences( - self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None) -> List[int]: + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: """ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. 
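# A small worked example of the layout produced by get_special_tokens_mask above,
# assuming add_bos_token=True and add_eos_token=False (token values are made up):
# special positions are marked 1, sequence tokens 0.
token_ids_0 = [11, 22, 33]
token_ids_1 = [44, 55]
bos, eos = [1], []  # add_bos_token=True, add_eos_token=False

single = bos + [0] * len(token_ids_0) + eos
pair = bos + [0] * len(token_ids_0) + eos + bos + [0] * len(token_ids_1) + eos
print(single)  # [1, 0, 0, 0]
print(pair)    # [1, 0, 0, 0, 1, 0, 0]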
An ALBERT sequence pair mask has the following format: diff --git a/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py b/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py index 74b1c2952..f702a1085 100644 --- a/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py +++ b/fastdeploy/model_executor/guided_decoding/xgrammar_backend.py @@ -24,16 +24,25 @@ import torch from fastdeploy.config import FDConfig from fastdeploy.engine.request import Request from fastdeploy.model_executor.guided_decoding.base_guided_decoding import ( - BackendBase, BaseChecker, LogitsProcessorBase) + BackendBase, + BaseChecker, + LogitsProcessorBase, +) from fastdeploy.utils import llm_logger try: - from xgrammar import (CompiledGrammar, Grammar, GrammarCompiler, - GrammarMatcher, StructuralTagItem, TokenizerInfo, - allocate_token_bitmask, apply_token_bitmask_inplace) + from xgrammar import ( + CompiledGrammar, + Grammar, + GrammarCompiler, + GrammarMatcher, + StructuralTagItem, + TokenizerInfo, + allocate_token_bitmask, + apply_token_bitmask_inplace, + ) except Exception as e: - raise Exception( - f"import XGrammar failed, please check your environment:\n\t {e}") + raise Exception(f"import XGrammar failed, please check your environment:\n\t {e}") class XGrammarProcessor(LogitsProcessorBase): @@ -88,8 +97,7 @@ class XGrammarProcessor(LogitsProcessorBase): """ return allocate_token_bitmask(self.batch_size, self.vocab_size) - def fill_token_bitmask(self, token_bitmask: torch.Tensor, - idx: int) -> None: + def fill_token_bitmask(self, token_bitmask: torch.Tensor, idx: int) -> None: """ Fill the token bitmask with allowed tokens for the given index. @@ -155,8 +163,7 @@ class XGrammarProcessor(LogitsProcessorBase): Raises: AssertionError: If token is not allowed by the grammar """ - assert self.matcher.accept_token( - token), f"Failed to accept token {token}" + assert self.matcher.accept_token(token), f"Failed to accept token {token}" def is_terminated(self) -> bool: """ @@ -212,10 +219,8 @@ class XGrammarBackend(BackendBase): self.splitwise_role = fd_config.parallel_config.splitwise_role try: - tokenizer_info = TokenizerInfo.from_huggingface( - self.hf_tokenizer, vocab_size=self.vocab_size) - self.grammar_compiler = GrammarCompiler( - tokenizer_info=tokenizer_info) + tokenizer_info = TokenizerInfo.from_huggingface(self.hf_tokenizer, vocab_size=self.vocab_size) + self.grammar_compiler = GrammarCompiler(tokenizer_info=tokenizer_info) except Exception as e: raise Exception(f"Failed to load XGrammar tokenizer: {e}") @@ -256,8 +261,7 @@ class XGrammarBackend(BackendBase): Optional[XGrammarProcessor]: Configured processor if successful, None on failure """ try: - compiled_grammar = self.grammar_compiler.compile_json_schema( - schemata, any_whitespace=self.any_whitespace) + compiled_grammar = self.grammar_compiler.compile_json_schema(schemata, any_whitespace=self.any_whitespace) except Exception as e: llm_logger.error(f"Failed to compile json schema: {e}") return None @@ -297,8 +301,7 @@ class XGrammarBackend(BackendBase): return None return self._create_processor(compiled_grammar) - def _structural_tag_processor( - self, schemata: str) -> Optional[XGrammarProcessor]: + def _structural_tag_processor(self, schemata: str) -> Optional[XGrammarProcessor]: """ Compile structural tags into a grammar processor. 
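# The XGrammarChecker code further below turns a guided_choice list into one EBNF
# rule: each choice is escaped, quoted, and the alternatives are joined with "|".
# A standalone sketch of that conversion (example choices are made up):
import re

guided_choice = ["yes", "no", 'say "maybe"']
escaped = (re.sub(r'(["\\])', r"\\\1", c) for c in guided_choice)
grammar = "root ::= " + " | ".join(f'"{c}"' for c in escaped)
print(grammar)  # root ::= "yes" | "no" | "say \"maybe\""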
@@ -315,11 +318,11 @@ class XGrammarBackend(BackendBase): begin=structure["begin"], schema=json.dumps(structure["schema"]), end=structure["end"], - ) for structure in structural_tag["structures"] + ) + for structure in structural_tag["structures"] ] - compiled_grammar = self.grammar_compiler.compile_structural_tag( - tags, structural_tag["triggers"]) + compiled_grammar = self.grammar_compiler.compile_structural_tag(tags, structural_tag["triggers"]) except Exception as e: llm_logger.error(f"Failed to compile structural tags schema: {e}") return None @@ -357,22 +360,32 @@ class XGrammarChecker(BaseChecker): if not isinstance(obj, dict): return False - if obj.get("type") in ("integer", "number") and ("multipleOf" - in obj): + if obj.get("type") in ("integer", "number") and ("multipleOf" in obj): return True if obj.get("type") == "array" and any( - key in obj for key in ("uniqueItems", "contains", - "minContains", "maxContains")): + key in obj + for key in ( + "uniqueItems", + "contains", + "minContains", + "maxContains", + ) + ): return True if obj.get("type") == "string" and "format" in obj: return True if obj.get("type") == "object" and any( - key in obj - for key in ("minProperties", "maxProperties", - "propertyNames", "patternProperties")): + key in obj + for key in ( + "minProperties", + "maxProperties", + "propertyNames", + "patternProperties", + ) + ): return True for value in obj.values(): @@ -398,10 +411,9 @@ class XGrammarChecker(BaseChecker): else: guided_json = request.guided_json - Grammar.from_json_schema(guided_json, - any_whitespace=self.any_whitespace) + Grammar.from_json_schema(guided_json, any_whitespace=self.any_whitespace) except RuntimeError as e: - err_msg = f"Invalid JSON format: {guided_json}, error message: {str(e)}" + err_msg = f"Invalid JSON format: {guided_json}, error message: {e!s}" return request, err_msg if self._unsupported_json_schema(guided_json): @@ -416,7 +428,7 @@ class XGrammarChecker(BaseChecker): try: Grammar.from_ebnf(guided_grammar) except RuntimeError as e: - err_msg = f"Invalid grammar format: {guided_grammar}, error message: {str(e)}" + err_msg = f"Invalid grammar format: {guided_grammar}, error message: {e!s}" return request, err_msg request.guided_grammar = guided_grammar return request, None @@ -425,14 +437,12 @@ class XGrammarChecker(BaseChecker): return request, None elif request.guided_choice: try: - escaped_choices = (re.sub(r'(["\\])', r'\\\1', c) - for c in request.guided_choice) - guided_choice = ('root ::= ' + - ' | '.join(f'"{c}"' for c in escaped_choices)) + escaped_choices = (re.sub(r'(["\\])', r"\\\1", c) for c in request.guided_choice) + guided_choice = "root ::= " + " | ".join(f'"{c}"' for c in escaped_choices) Grammar.from_ebnf(guided_choice) except RuntimeError as e: - err_msg = f"Invalid choice format: {guided_choice}, error message: {str(e)}" + err_msg = f"Invalid choice format: {guided_choice}, error message: {e!s}" return request, err_msg request.guided_grammar = guided_choice @@ -445,11 +455,12 @@ class XGrammarChecker(BaseChecker): begin=s["begin"], schema=json.dumps(s["schema"]), end=s["end"], - ) for s in structural_tag["structures"] + ) + for s in structural_tag["structures"] ] Grammar.from_structural_tag(tags, structural_tag["triggers"]) except RuntimeError as e: - err_msg = f"Invalid structural_tag format: {structural_tag}, error message: {str(e)}" + err_msg = f"Invalid structural_tag format: {structural_tag}, error message: {e!s}" return request, err_msg return request, None else: diff --git 
a/fastdeploy/model_executor/layers/activation.py b/fastdeploy/model_executor/layers/activation.py index 5f7a568ff..3c336702f 100644 --- a/fastdeploy/model_executor/layers/activation.py +++ b/fastdeploy/model_executor/layers/activation.py @@ -63,8 +63,7 @@ class SiluAndMul(nn.Layer): """ super().__init__() - if current_platform.is_cuda() or current_platform.is_xpu( - ) or current_platform.is_iluvatar(): + if current_platform.is_cuda() or current_platform.is_xpu() or current_platform.is_iluvatar(): self.forward = self.forward_cuda elif current_platform.is_gcu(): self.forward = self.forward_gcu @@ -93,8 +92,10 @@ class SiluAndMul(nn.Layer): elif self._dtype == "float32": self._fuse_kernel_compute_dtype = "fp32" else: - raise ValueError(f"Just support float32, float16 and \ - bfloat16 as default dtype, but received {self._dtype}") + raise ValueError( + f"Just support float32, float16 and \ + bfloat16 as default dtype, but received {self._dtype}" + ) # fp8 is not support smooth quantization if fd_config.quant_config and "fp8" in fd_config.quant_config.name(): diff --git a/fastdeploy/model_executor/layers/attention/__init__.py b/fastdeploy/model_executor/layers/attention/__init__.py index 5557616f0..83a21da77 100644 --- a/fastdeploy/model_executor/layers/attention/__init__.py +++ b/fastdeploy/model_executor/layers/attention/__init__.py @@ -15,16 +15,21 @@ from .append_attn_backend import AppendAttentionBackend from .attention_selecter import get_attention_backend from .base_attention_backend import AttentionBackend +from .block_multihead_attn_backend import BlockAttentionBackend from .flash_attn_backend import FlashAttentionBackend +from .iluvatar_attn_backend import IluvatarAttnBackend from .mla_attention_backend import MLAAttentionBackend from .native_paddle_backend import PaddleNativeAttnBackend from .xpu_attn_backend import XPUAttentionBackend -from .iluvatar_attn_backend import IluvatarAttnBackend -from .block_multihead_attn_backend import BlockAttentionBackend __all__ = [ - "AttentionBackend", "PaddleNativeAttnBackend", - "get_attention_backend", "AppendAttentionBackend", "XPUAttentionBackend", - "MLAAttentionBackend", "FlashAttentionBackend", "IluvatarAttnBackend", - "BlockAttentionBackend" + "AttentionBackend", + "PaddleNativeAttnBackend", + "get_attention_backend", + "AppendAttentionBackend", + "XPUAttentionBackend", + "MLAAttentionBackend", + "FlashAttentionBackend", + "IluvatarAttnBackend", + "BlockAttentionBackend", ] diff --git a/fastdeploy/model_executor/layers/attention/append_attn_backend.py b/fastdeploy/model_executor/layers/attention/append_attn_backend.py index 311fb6bce..cdea28b73 100644 --- a/fastdeploy/model_executor/layers/attention/append_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/append_attn_backend.py @@ -23,9 +23,12 @@ from typing import TYPE_CHECKING, List, Optional, Tuple import paddle from fastdeploy.model_executor.layers.attention.ops import ( - append_attention, get_block_shape_and_split_kv_block, - init_signal_layerwise, open_shm_and_get_meta_signal, - init_kv_signal_per_query) + append_attention, + get_block_shape_and_split_kv_block, + init_kv_signal_per_query, + init_signal_layerwise, + open_shm_and_get_meta_signal, +) if TYPE_CHECKING: from fastdeploy.model_executor.forward_meta import ForwardMeta @@ -33,9 +36,10 @@ if TYPE_CHECKING: from fastdeploy.config import FDConfig from fastdeploy.model_executor.layers.attention.attention import Attention from fastdeploy.model_executor.layers.attention.base_attention_backend import ( - 
AttentionBackend, AttentionMetadata) -from fastdeploy.model_executor.layers.attention.utils import \ - init_rank_and_device_id + AttentionBackend, + AttentionMetadata, +) +from fastdeploy.model_executor.layers.attention.utils import init_rank_and_device_id @dataclass @@ -43,6 +47,7 @@ class AppendAttentionMetadata(AttentionMetadata): """ AppendAttentionMetadata """ + max_len_kv: paddle.Tensor = None set_max_lengths: int = -1 encoder_batch_ids: paddle.Tensor = None @@ -75,8 +80,13 @@ class AppendAttentionBackend(AttentionBackend): AppendAttentionBackend backend implementation. """ - def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int, - head_dim: int) -> None: + def __init__( + self, + fd_config: FDConfig, + kv_num_heads: int, + num_heads: int, + head_dim: int, + ) -> None: """ AppendAttentionBackend __init__ """ @@ -84,9 +94,9 @@ class AppendAttentionBackend(AttentionBackend): self.attention_metadata: AppendAttentionMetadata = None self.block_size: int = fd_config.parallel_config.block_size self.max_seq_len: int = fd_config.parallel_config.max_model_len - self.rope_theta: float = (10000.0 - if fd_config.model_config.rope_theta is None - else fd_config.model_config.rope_theta) + self.rope_theta: float = ( + 10000.0 if fd_config.model_config.rope_theta is None else fd_config.model_config.rope_theta + ) self.rope_3d: bool = getattr(fd_config.model_config, "rope_3d", False) self.causal: bool = getattr(fd_config.model_config, "causal", True) self.speculative_method: str = fd_config.speculative_config.method @@ -99,11 +109,10 @@ class AppendAttentionBackend(AttentionBackend): self.num_heads: int = num_heads self.head_dim: int = fd_config.model_config.head_dim self.num_layers: int = fd_config.model_config.num_hidden_layers - self.max_partition_size: int = int( - os.getenv("FLAGS_max_partition_size", 32768)) + self.max_partition_size: int = int(os.getenv("FLAGS_max_partition_size", 32768)) self.pd_disaggregation_mode: str = fd_config.parallel_config.pd_disaggregation_mode - + self.start_layer_index: int = fd_config.model_config.start_layer_index if fd_config.parallel_config.expert_parallel_rank is None: @@ -137,7 +146,7 @@ class AppendAttentionBackend(AttentionBackend): metadata.kv_tile_ids_per_batch, metadata.kv_num_blocks, metadata.decoder_batch_ids, # will copy to buffer - metadata.decoder_tile_ids_per_batch, # will copy to buffer + metadata.decoder_tile_ids_per_batch, # will copy to buffer metadata.decoder_num_blocks, metadata.max_len_kv, metadata.set_max_lengths, @@ -165,12 +174,12 @@ class AppendAttentionBackend(AttentionBackend): ) elif self.pd_disaggregation_mode == "per_query": metadata.kv_signal_metadata = open_shm_and_get_meta_signal( - self.rank, int(self.device_id), self.keep_pd_step_flag) + self.rank, int(self.device_id), self.keep_pd_step_flag + ) self.attention_metadata: AttentionMetadata = metadata forward_meta.decoder_batch_ids.copy_(metadata.decoder_batch_ids, False) - forward_meta.decoder_tile_ids_per_batch.copy_( - metadata.decoder_tile_ids_per_batch, False) + forward_meta.decoder_tile_ids_per_batch.copy_(metadata.decoder_tile_ids_per_batch, False) def get_attntion_meta(self) -> AttentionMetadata: """get_attntion_meta""" @@ -183,8 +192,12 @@ class AppendAttentionBackend(AttentionBackend): """ Caculate kv cache shape """ - return (max_num_blocks, self.kv_num_heads, self.block_size, - self.head_dim) + return ( + max_num_blocks, + self.kv_num_heads, + self.block_size, + self.head_dim, + ) def forward_mixed( self, @@ -203,10 +216,10 @@ class 
AppendAttentionBackend(AttentionBackend): metadata = self.attention_metadata if self.pd_disaggregation_mode == "per_query": - metadata.kv_signal_data_list[ - layer.layer_id] = init_signal_layerwise( - metadata.kv_signal_metadata, - layer.layer_id + self.start_layer_index) + metadata.kv_signal_data_list[layer.layer_id] = init_signal_layerwise( + metadata.kv_signal_metadata, + layer.layer_id + self.start_layer_index, + ) res = append_attention( qkv, diff --git a/fastdeploy/model_executor/layers/attention/attention.py b/fastdeploy/model_executor/layers/attention/attention.py index 457e5d521..e6ae92b3f 100644 --- a/fastdeploy/model_executor/layers/attention/attention.py +++ b/fastdeploy/model_executor/layers/attention/attention.py @@ -24,8 +24,8 @@ from paddle import nn from paddleformers.utils.log import logger from fastdeploy.config import FDConfig -from fastdeploy.model_executor.layers.quantization.quant_base import \ - QuantMethodBase +from fastdeploy.model_executor.layers.quantization.quant_base import QuantMethodBase + if TYPE_CHECKING: from fastdeploy.model_executor.forward_meta import ForwardMeta @@ -67,10 +67,14 @@ class Attention(nn.Layer): ValueError: If the `v_head_dim` is less than 0. """ super().__init__() - self.num_heads: int = fd_config.model_config.num_attention_heads // fd_config.parallel_config.tensor_parallel_size + self.num_heads: int = ( + fd_config.model_config.num_attention_heads // fd_config.parallel_config.tensor_parallel_size + ) self.head_dim: int = fd_config.model_config.head_dim - self.kv_num_heads: int = \ - max(1, fd_config.model_config.num_key_value_heads // fd_config.parallel_config.tensor_parallel_size) + self.kv_num_heads: int = max( + 1, + fd_config.model_config.num_key_value_heads // fd_config.parallel_config.tensor_parallel_size, + ) self.layer_id: int = layer_id self.v_head_dim: int = v_head_dim if v_head_dim > 0 else self.head_dim self.rope_type: str = rope_type @@ -86,10 +90,8 @@ class Attention(nn.Layer): self.out_scale: float = out_scale self.use_neox_rotary_style: bool = use_neox_rotary_style - if fd_config.quant_config and hasattr(fd_config.quant_config, - "kv_cache_quant_type"): - self.kvcache_quant_method: QuantMethodBase = fd_config.quant_config.get_quant_method( - self) + if fd_config.quant_config and hasattr(fd_config.quant_config, "kv_cache_quant_type"): + self.kvcache_quant_method: QuantMethodBase = fd_config.quant_config.get_quant_method(self) else: self.kvcache_quant_method = None @@ -100,11 +102,10 @@ class Attention(nn.Layer): f"Attention is running in cache kv {self.kvcache_quant_method.cache_quant_config.quant_type} mode" ) - def load_state_dict(self, state_dict: Dict[str, - paddle.Tensor | np.ndarray]): - ''' + def load_state_dict(self, state_dict: Dict[str, paddle.Tensor | np.ndarray]): + """ Attention only have quant related scales not other parameters. 
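# A quick numeric illustration of the head partitioning in Attention.__init__ above
# (figures are examples, not a specific model config): with grouped-query attention,
# each tensor-parallel rank keeps its slice of query heads and at least one KV head.
num_attention_heads, num_key_value_heads, tp_size = 64, 8, 8
num_heads = num_attention_heads // tp_size             # 8 query heads per rank
kv_num_heads = max(1, num_key_value_heads // tp_size)  # 1 KV head per rank
print(num_heads, kv_num_heads)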
- ''' + """ if self.kvcache_quant_method is not None: self.kvcache_quant_method.create_weights(self, state_dict) diff --git a/fastdeploy/model_executor/layers/attention/attention_selecter.py b/fastdeploy/model_executor/layers/attention/attention_selecter.py index 3db03b188..3ceaf9c4f 100644 --- a/fastdeploy/model_executor/layers/attention/attention_selecter.py +++ b/fastdeploy/model_executor/layers/attention/attention_selecter.py @@ -22,22 +22,20 @@ from fastdeploy.utils import resolve_obj_from_strname def backend_name_to_enum(backend_name: str) -> _Backend: - """backend_name_to_enum """ + """backend_name_to_enum""" assert backend_name is not None return _Backend.__members__.get(backend_name) @cache def _get_attn_backend(selected_backend: str) -> object: - """_get_attn_backend """ + """_get_attn_backend""" if isinstance(selected_backend, str): selected_backend = backend_name_to_enum(selected_backend) - attention_cls = current_platform.get_attention_backend_cls( - selected_backend) + attention_cls = current_platform.get_attention_backend_cls(selected_backend) if not attention_cls: - raise ValueError( - f"Invalid attention backend for {current_platform.device_name}") + raise ValueError(f"Invalid attention backend for {current_platform.device_name}") return resolve_obj_from_strname(attention_cls) diff --git a/fastdeploy/model_executor/layers/attention/base_attention_backend.py b/fastdeploy/model_executor/layers/attention/base_attention_backend.py index 4a442e5c3..492a5790d 100644 --- a/fastdeploy/model_executor/layers/attention/base_attention_backend.py +++ b/fastdeploy/model_executor/layers/attention/base_attention_backend.py @@ -24,6 +24,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING import paddle + if TYPE_CHECKING: from fastdeploy.model_executor.forward_meta import ForwardMeta @@ -39,7 +40,7 @@ class AttentionBackend(ABC): @abstractmethod def init_attention_metadata(self, forward_meta: ForwardMeta): """Initialize the forward metadata.""" - raise NotImplementedError() + raise NotImplementedError def forward( self, @@ -109,7 +110,7 @@ class AttentionBackend(ABC): forward_meta: ForwardMeta, ) -> paddle.Tensor: """Run a forward for mix.""" - raise NotImplementedError() + raise NotImplementedError def forward_decode( self, @@ -123,7 +124,7 @@ class AttentionBackend(ABC): forward_meta: ForwardMeta, ) -> paddle.Tensor: """Run a forward for decode.""" - raise NotImplementedError() + raise NotImplementedError def forward_extend( self, @@ -137,4 +138,4 @@ class AttentionBackend(ABC): forward_meta: ForwardMeta, ) -> paddle.Tensor: """Run a forward for extend.""" - raise NotImplementedError() + raise NotImplementedError diff --git a/fastdeploy/model_executor/layers/attention/block_multihead_attn_backend.py b/fastdeploy/model_executor/layers/attention/block_multihead_attn_backend.py index 5d48f5477..400adc62b 100644 --- a/fastdeploy/model_executor/layers/attention/block_multihead_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/block_multihead_attn_backend.py @@ -16,7 +16,6 @@ from __future__ import annotations -import os from dataclasses import dataclass, field from typing import TYPE_CHECKING, List, Optional @@ -28,7 +27,9 @@ if TYPE_CHECKING: from fastdeploy.config import FDConfig from fastdeploy.model_executor.layers.attention.attention import Attention from fastdeploy.model_executor.layers.attention.base_attention_backend import ( - AttentionBackend, AttentionMetadata) + AttentionBackend, + AttentionMetadata, +) @dataclass @@ -36,6 +37,7 @@ class 
BlockAttentionMetadata(AttentionMetadata): """ BlockAttentionMetadata """ + max_len_kv: paddle.Tensor = None set_max_lengths: int = -1 encoder_batch_ids: paddle.Tensor = None @@ -68,8 +70,13 @@ class BlockAttentionBackend(AttentionBackend): BlockAttentionBackend backend implementation. """ - def __init__(self, fd_config: FDConfig, kv_num_heads: int, - num_heads: int, head_dim: int): + def __init__( + self, + fd_config: FDConfig, + kv_num_heads: int, + num_heads: int, + head_dim: int, + ): """ BlockAttentionBackend __init__ """ @@ -77,8 +84,7 @@ class BlockAttentionBackend(AttentionBackend): self.attention_metadata: BlockAttentionMetadata = None self.block_size = fd_config.parallel_config.block_size self.max_seq_len = fd_config.parallel_config.max_model_len - self.rope_theta = (10000.0 if fd_config.model_config.rope_theta - is None else fd_config.model_config.rope_theta) + self.rope_theta = 10000.0 if fd_config.model_config.rope_theta is None else fd_config.model_config.rope_theta self.rank = fd_config.parallel_config.tensor_parallel_rank self.kv_num_heads = kv_num_heads @@ -111,8 +117,12 @@ class BlockAttentionBackend(AttentionBackend): """ Caculate kv cache shape """ - return (max_num_blocks, self.kv_num_heads, self.block_size, - self.head_dim) + return ( + max_num_blocks, + self.kv_num_heads, + self.block_size, + self.head_dim, + ) def forward_mixed( self, diff --git a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py index 97b0b1bb7..c153556c7 100644 --- a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py @@ -16,9 +16,8 @@ from __future__ import annotations -import os from dataclasses import dataclass, field -from typing import List, Optional, TYPE_CHECKING +from typing import TYPE_CHECKING, List, Optional import paddle @@ -30,12 +29,19 @@ except: from fastdeploy.config import FDConfig from fastdeploy.model_executor.layers.attention.attention import Attention from fastdeploy.model_executor.layers.attention.base_attention_backend import ( - AttentionBackend, AttentionMetadata) + AttentionBackend, + AttentionMetadata, +) from fastdeploy.model_executor.layers.attention.ops import ( - get_block_shape_and_split_kv_block, gqa_rope_write_cache, - init_signal_layerwise, open_shm_and_get_meta_signal, pre_cache_len_concat, - init_kv_signal_per_query) + get_block_shape_and_split_kv_block, + gqa_rope_write_cache, + init_kv_signal_per_query, + init_signal_layerwise, + open_shm_and_get_meta_signal, + pre_cache_len_concat, +) from fastdeploy.model_executor.layers.attention.utils import init_rank_and_device_id + if TYPE_CHECKING: from fastdeploy.model_executor.forward_meta import ForwardMeta @@ -45,6 +51,7 @@ class FlashAttentionMetadata(AttentionMetadata): """ FlashAttentionMetadata """ + max_len_kv: paddle.Tensor = None set_max_lengths: int = -1 rotary_embs: Optional[paddle.Tensor] = None @@ -82,8 +89,13 @@ class FlashAttentionBackend(AttentionBackend): FlashAttentionBackend backend implementation """ - def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int, - head_dim: int): + def __init__( + self, + fd_config: FDConfig, + kv_num_heads: int, + num_heads: int, + head_dim: int, + ): """ FlashAttentionBackend __init__ """ @@ -111,7 +123,7 @@ class FlashAttentionBackend(AttentionBackend): if fd_config.parallel_config.expert_parallel_rank is None: fd_config.parallel_config.expert_parallel_rank = 0 - + self.rank, 
self.device_id = init_rank_and_device_id(fd_config) def get_attntion_meta(self): @@ -125,8 +137,12 @@ class FlashAttentionBackend(AttentionBackend): """ Caculate kv cache shape """ - return (max_num_blocks, self.kv_num_heads, self.block_size, - self.head_dim) + return ( + max_num_blocks, + self.kv_num_heads, + self.block_size, + self.head_dim, + ) def init_attention_metadata(self, forward_meta: ForwardMeta): metadata = FlashAttentionMetadata() @@ -184,11 +200,11 @@ class FlashAttentionBackend(AttentionBackend): ) elif self.pd_disaggregation_mode == "per_query": metadata.kv_signal_metadata = open_shm_and_get_meta_signal( - self.rank, int(self.device_id), self.keep_pd_step_flag) + self.rank, int(self.device_id), self.keep_pd_step_flag + ) self.attention_metadata = metadata forward_meta.decoder_batch_ids.copy_(metadata.decoder_batch_ids, False) - forward_meta.decoder_tile_ids_per_batch.copy_( - metadata.decoder_tile_ids_per_batch, False) + forward_meta.decoder_tile_ids_per_batch.copy_(metadata.decoder_tile_ids_per_batch, False) def forward_mixed( self, @@ -204,10 +220,10 @@ class FlashAttentionBackend(AttentionBackend): metadata = self.attention_metadata if self.pd_disaggregation_mode == "per_query": - metadata.kv_signal_data_list[ - layer.layer_id] = init_signal_layerwise( - metadata.kv_signal_metadata, - layer.layer_id + self.start_layer_index) + metadata.kv_signal_data_list[layer.layer_id] = init_signal_layerwise( + metadata.kv_signal_metadata, + layer.layer_id + self.start_layer_index, + ) q, k, v, _ = gqa_rope_write_cache( qkv, diff --git a/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py b/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py index 08795a881..5a4bf549e 100644 --- a/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py @@ -17,19 +17,21 @@ from __future__ import annotations import os -import paddle - from dataclasses import dataclass -from typing import Optional, TYPE_CHECKING from math import sqrt +from typing import TYPE_CHECKING, Optional +import paddle from paddle.nn.functional.flash_attention import flash_attn_unpadded -from fastdeploy.model_executor.ops.iluvatar import paged_attention from fastdeploy.config import FDConfig from fastdeploy.model_executor.layers.attention.attention import Attention from fastdeploy.model_executor.layers.attention.base_attention_backend import ( - AttentionBackend, AttentionMetadata) + AttentionBackend, + AttentionMetadata, +) +from fastdeploy.model_executor.ops.iluvatar import paged_attention + if TYPE_CHECKING: from fastdeploy.model_executor.forward_meta import ForwardMeta @@ -39,6 +41,7 @@ class IluvatarAttentionMetadata(AttentionMetadata): """ IluvatarAttentionMetadata """ + # flash_attn metadata cu_seqlens_q: Optional[paddle.Tensor] = None cu_seqlens_k: Optional[paddle.Tensor] = None @@ -72,8 +75,7 @@ def apply_rope(qk, cos, sin): paddle.stack([-qk[..., 1::2], qk[..., 0::2]], axis=-1), paddle.shape(qk), ) - out = paddle.add(paddle.multiply(qk, cos), - paddle.multiply(rotate_half, sin)) + out = paddle.add(paddle.multiply(qk, cos), paddle.multiply(rotate_half, sin)) return paddle.cast(out, qk.dtype) @@ -83,18 +85,21 @@ class IluvatarAttnBackend(AttentionBackend): Which is used only for testing purpose. 
""" - def __init__(self, llm_config: FDConfig, kv_num_heads: int, num_heads: int, - head_dim: int): + def __init__( + self, + llm_config: FDConfig, + kv_num_heads: int, + num_heads: int, + head_dim: int, + ): super().__init__() self.attention_metadata = IluvatarAttentionMetadata() self.attention_metadata.block_size = llm_config.parallel_config.block_size assert llm_config.parallel_config.enc_dec_block_num == 0, "Iluvatar does not support yet" self.attention_metadata.max_context_len = llm_config.parallel_config.max_model_len - self.attention_metadata.causal = getattr(llm_config.model_config, - "causal", True) - self.speculate_method = getattr(llm_config.parallel_config, - "speculate_method", None) + self.attention_metadata.causal = getattr(llm_config.model_config, "causal", True) + self.speculate_method = getattr(llm_config.parallel_config, "speculate_method", None) self.use_speculate = self.speculate_method is not None self.attention_metadata.num_kv_heads = kv_num_heads self.attention_metadata.dropout = llm_config.model_config.hidden_dropout_prob @@ -104,10 +109,8 @@ class IluvatarAttnBackend(AttentionBackend): self.attention_metadata.scale = 1.0 / sqrt(head_dim) self.num_layers = llm_config.model_config.num_hidden_layers self.record_block_table_metadata = {} - self.only_use_flash_attn = int( - os.getenv("FD_ILUVATAR_ONLY_USE_FLASH_ATTN", 0)) == 1 - self.do_check_kv_cache = int( - os.getenv("FD_ILUVATAR_CHECK_KV_CACHE_CORRECTNESS", 0)) == 1 + self.only_use_flash_attn = int(os.getenv("FD_ILUVATAR_ONLY_USE_FLASH_ATTN", 0)) == 1 + self.do_check_kv_cache = int(os.getenv("FD_ILUVATAR_CHECK_KV_CACHE_CORRECTNESS", 0)) == 1 if not self.only_use_flash_attn: assert self.attention_metadata.block_size == 16, "Iluvatar paged attn requires block_size must be 16." if self.do_check_kv_cache: @@ -133,16 +136,22 @@ class IluvatarAttnBackend(AttentionBackend): """ Caculate kv cache shape """ - return (max_num_blocks, self.attention_metadata.num_kv_heads, - self.attention_metadata.block_size, self.head_dim) + return ( + max_num_blocks, + self.attention_metadata.num_kv_heads, + self.attention_metadata.block_size, + self.head_dim, + ) - def get_new_kv(self, - k, - v, - k_cache_id: int, - v_cache_id: int, - forward_meta: ForwardMeta, - debug_paged_attn=False): + def get_new_kv( + self, + k, + v, + k_cache_id: int, + v_cache_id: int, + forward_meta: ForwardMeta, + debug_paged_attn=False, + ): new_k = [] new_v = [] tensor_start = 0 @@ -163,39 +172,31 @@ class IluvatarAttnBackend(AttentionBackend): # decode assert seq_len == 1 cur_block_tables = forward_meta.block_tables[batch_idx] - cur_used_block_tables = cur_block_tables[cur_block_tables != - -1] - assert batch_idx in self.record_block_table_metadata, \ - f"Key error: {batch_idx} vs {self.record_block_table_metadata}." - cur_block_table_metadata = self.record_block_table_metadata[ - batch_idx] + cur_used_block_tables = cur_block_tables[cur_block_tables != -1] + assert ( + batch_idx in self.record_block_table_metadata + ), f"Key error: {batch_idx} vs {self.record_block_table_metadata}." 
+ cur_block_table_metadata = self.record_block_table_metadata[batch_idx] record_last_block_id = cur_block_table_metadata["block_id"] assert record_last_block_id != -1 for block_id in cur_used_block_tables: if block_id == record_last_block_id: cache_end = cur_block_table_metadata["cache_end"] - block_k_cache = forward_meta.caches[k_cache_id][ - block_id, :, 0:cache_end, :] - block_v_cache = forward_meta.caches[v_cache_id][ - block_id, :, 0:cache_end, :] + block_k_cache = forward_meta.caches[k_cache_id][block_id, :, 0:cache_end, :] + block_v_cache = forward_meta.caches[v_cache_id][block_id, :, 0:cache_end, :] else: - block_k_cache = forward_meta.caches[k_cache_id][ - block_id] - block_v_cache = forward_meta.caches[v_cache_id][ - block_id] + block_k_cache = forward_meta.caches[k_cache_id][block_id] + block_v_cache = forward_meta.caches[v_cache_id][block_id] # [num_kv_heads, block_size, head_dim] -> [block_size, num_kv_heads, head_dim] - new_k.append( - block_k_cache.transpose([1, 0, 2]).contiguous()) - new_v.append( - block_v_cache.transpose([1, 0, 2]).contiguous()) + new_k.append(block_k_cache.transpose([1, 0, 2]).contiguous()) + new_v.append(block_v_cache.transpose([1, 0, 2]).contiguous()) if block_id == record_last_block_id: break # as line 301 show, record_block_table_metadata updates when executing the last layer, # so slice_k and slice_v has been updated in block_k_cache and block_v_cache - if not (debug_paged_attn and - (k_cache_id / 2 == self.num_layers - 1)): + if not (debug_paged_attn and (k_cache_id / 2 == self.num_layers - 1)): new_k.append(slice_k) new_v.append(slice_v) @@ -208,15 +209,17 @@ class IluvatarAttnBackend(AttentionBackend): new_v = paddle.concat(new_v, axis=0) return new_k, new_v - def update_kv_cache(self, - k, - v, - k_cache_id: int, - v_cache_id: int, - layer_id: int, - forward_meta: ForwardMeta, - specific_batch_ids=None, - debug_paged_attn=False): + def update_kv_cache( + self, + k, + v, + k_cache_id: int, + v_cache_id: int, + layer_id: int, + forward_meta: ForwardMeta, + specific_batch_ids=None, + debug_paged_attn=False, + ): # [num_tokens, num_kv_heads, head_dim] -> [num_kv_heads, num_tokens, head_dim] trans_k = k.transpose([1, 0, 2]).contiguous() trans_v = v.transpose([1, 0, 2]).contiguous() @@ -244,39 +247,33 @@ class IluvatarAttnBackend(AttentionBackend): if i == cur_used_num_blocks - 1: cache_end = seq_len - cache_start assert cache_end <= self.attention_metadata.block_size - forward_meta.caches[k_cache_id][ - block_id, :, - 0:cache_end, :] = slice_trans_k[:, cache_start: - seq_len, :] - forward_meta.caches[v_cache_id][ - block_id, :, - 0:cache_end, :] = slice_trans_v[:, cache_start: - seq_len, :] + forward_meta.caches[k_cache_id][block_id, :, 0:cache_end, :] = slice_trans_k[ + :, cache_start:seq_len, : + ] + forward_meta.caches[v_cache_id][block_id, :, 0:cache_end, :] = slice_trans_v[ + :, cache_start:seq_len, : + ] if layer_id == self.num_layers - 1: self.record_block_table_metadata[batch_idx] = { "block_id": block_id.item(), - "cache_end": cache_end + "cache_end": cache_end, } # non last block: seq_lens_this_time > block_size else: assert seq_len > self.attention_metadata.block_size cache_end = cache_start + self.attention_metadata.block_size - forward_meta.caches[k_cache_id][ - block_id] = slice_trans_k[:, - cache_start:cache_end, :] - forward_meta.caches[v_cache_id][ - block_id] = slice_trans_v[:, - cache_start:cache_end, :] + forward_meta.caches[k_cache_id][block_id] = slice_trans_k[:, cache_start:cache_end, :] + 
forward_meta.caches[v_cache_id][block_id] = slice_trans_v[:, cache_start:cache_end, :] cache_start += self.attention_metadata.block_size else: # decode assert seq_len == 1 cur_last_block_id = cur_used_block_tables[-1].item() assert cur_last_block_id != -1 - assert batch_idx in self.record_block_table_metadata, \ - f"Key error: {batch_idx} vs {self.record_block_table_metadata}." - cur_block_table_metadata = self.record_block_table_metadata[ - batch_idx] + assert ( + batch_idx in self.record_block_table_metadata + ), f"Key error: {batch_idx} vs {self.record_block_table_metadata}." + cur_block_table_metadata = self.record_block_table_metadata[batch_idx] record_last_block_id = cur_block_table_metadata["block_id"] if cur_last_block_id == record_last_block_id: @@ -291,34 +288,25 @@ class IluvatarAttnBackend(AttentionBackend): # paged attn API will update kv cache with inplace mode if not debug_paged_attn: - forward_meta.caches[k_cache_id][ - cur_last_block_id, :, - cache_start:cache_end, :] = slice_trans_k - forward_meta.caches[v_cache_id][ - cur_last_block_id, :, - cache_start:cache_end, :] = slice_trans_v + forward_meta.caches[k_cache_id][cur_last_block_id, :, cache_start:cache_end, :] = slice_trans_k + forward_meta.caches[v_cache_id][cur_last_block_id, :, cache_start:cache_end, :] = slice_trans_v # update record_block_table_metadata if layer_id == self.num_layers - 1: - self.record_block_table_metadata[batch_idx][ - "block_id"] = cur_last_block_id - self.record_block_table_metadata[batch_idx][ - "cache_end"] = cache_end + self.record_block_table_metadata[batch_idx]["block_id"] = cur_last_block_id + self.record_block_table_metadata[batch_idx]["cache_end"] = cache_end tensor_start = tensor_end - def _check_new_kv_correctness(self, k, v, new_k, new_v, layer_id: int, - forward_meta: ForwardMeta): + def _check_new_kv_correctness(self, k, v, new_k, new_v, layer_id: int, forward_meta: ForwardMeta): tensor_start = 0 - for batch_idx, seq_lens_this_time in enumerate( - forward_meta.seq_lens_this_time): + for batch_idx, seq_lens_this_time in enumerate(forward_meta.seq_lens_this_time): if seq_lens_this_time == 0: continue # note: the second request will also use the batch_idx 0 instead of 1 in # the streaming inference mode, so use seq_lens_this_time > 1 with the same # batch_idx represents the second request comes. 
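# The note above describes slot reuse in streaming inference: a finished request's batch_idx is
# handed to the next request, so a prefill-sized step (seq_lens_this_time > 1) arriving for a
# batch_idx that already has recorded K/V means a new request owns that slot and the old record
# is dropped before recording resumes. A small sketch with made-up values; `record_batched_k`
# stands in for the per-layer bookkeeping dict consulted by the check that follows.
record_batched_k = {0: {0: ["k_chunks_of_previous_request"]}}  # layer_id -> batch_idx -> chunks
layer_id, batch_idx, seq_lens_this_time = 0, 0, 8
if seq_lens_this_time > 1 and batch_idx in record_batched_k[layer_id]:
    record_batched_k[layer_id].pop(batch_idx)  # start recording the new request from scratch
assert batch_idx not in record_batched_k[layer_id]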
- if seq_lens_this_time > 1 and batch_idx in self.record_batched_k[ - layer_id]: + if seq_lens_this_time > 1 and batch_idx in self.record_batched_k[layer_id]: print( f"clear self.record_batched_batched_k: " f"layer_id={layer_id}, batch_id={batch_idx}, " @@ -337,8 +325,7 @@ class IluvatarAttnBackend(AttentionBackend): tensor_start = tensor_end ref_k, ref_v = [], [] - for batch_idx, seq_lens_this_time in enumerate( - forward_meta.seq_lens_this_time): + for batch_idx, seq_lens_this_time in enumerate(forward_meta.seq_lens_this_time): if seq_lens_this_time == 0: continue bached_k_list = self.record_batched_k[layer_id][batch_idx] @@ -359,30 +346,30 @@ class IluvatarAttnBackend(AttentionBackend): f"ref_k[-2:, 0:2, 0:2]={ref_k[-2:, 0:2, 0:2]}, " f"ref_v[-2:, 0:2, 0:2]={ref_v[-2:, 0:2, 0:2]}, " f"new_k[-2:, 0:2, 0:2]={new_k[-2:, 0:2, 0:2]}, " - f"new_v[-2:, 0:2, 0:2]={new_v[-2:, 0:2, 0:2]}") + f"new_v[-2:, 0:2, 0:2]={new_v[-2:, 0:2, 0:2]}" + ) assert paddle.allclose( ref_k.to("cpu").to(paddle.float32), - new_k.to("cpu").to(paddle.float32)) + new_k.to("cpu").to(paddle.float32), + ) assert paddle.allclose( ref_v.to("cpu").to(paddle.float32), - new_v.to("cpu").to(paddle.float32)) + new_v.to("cpu").to(paddle.float32), + ) def get_splited_qkv(self, qkv: paddle.Tensor, forward_meta: ForwardMeta): q_end = self.num_heads * self.head_dim k_end = q_end + self.attention_metadata.num_kv_heads * self.head_dim v_end = k_end + self.attention_metadata.num_kv_heads * self.head_dim - assert v_end == qkv.shape[ - -1], f"Shape mistach: {v_end} vs {qkv.shape[-1]}" + assert v_end == qkv.shape[-1], f"Shape mistach: {v_end} vs {qkv.shape[-1]}" assert qkv.shape[0] == forward_meta.cu_seqlens_q[-1] q = qkv[..., 0:q_end] k = qkv[..., q_end:k_end] v = qkv[..., k_end:v_end] q = q.view([-1, self.num_heads, self.head_dim]).contiguous() - k = k.view([-1, self.attention_metadata.num_kv_heads, - self.head_dim]).contiguous() - v = v.view([-1, self.attention_metadata.num_kv_heads, - self.head_dim]).contiguous() + k = k.view([-1, self.attention_metadata.num_kv_heads, self.head_dim]).contiguous() + v = v.view([-1, self.attention_metadata.num_kv_heads, self.head_dim]).contiguous() # forward_meta.seq_lens_this_time [max_batch,] for batch_idx in range(forward_meta.seq_lens_this_time.shape[0]): seq_len_i = forward_meta.seq_lens_this_time[batch_idx] @@ -393,16 +380,10 @@ class IluvatarAttnBackend(AttentionBackend): cu_seq_end_q = forward_meta.cu_seqlens_q[batch_idx + 1] # forward_meta.rotary_embs is [2, 1, S, 1, D] if forward_meta.rotary_embs is not None: - cos = forward_meta.rotary_embs[0, 0, - cached_kv_len:cached_kv_len + - seq_len_i, :, :] - sin = forward_meta.rotary_embs[1, 0, - cached_kv_len:cached_kv_len + - seq_len_i, :, :] - q[cu_seq_start_q:cu_seq_end_q] = apply_rope( - q[cu_seq_start_q:cu_seq_end_q], cos, sin) - k[cu_seq_start_q:cu_seq_end_q] = apply_rope( - k[cu_seq_start_q:cu_seq_end_q], cos, sin) + cos = forward_meta.rotary_embs[0, 0, cached_kv_len : cached_kv_len + seq_len_i, :, :] + sin = forward_meta.rotary_embs[1, 0, cached_kv_len : cached_kv_len + seq_len_i, :, :] + q[cu_seq_start_q:cu_seq_end_q] = apply_rope(q[cu_seq_start_q:cu_seq_end_q], cos, sin) + k[cu_seq_start_q:cu_seq_end_q] = apply_rope(k[cu_seq_start_q:cu_seq_end_q], cos, sin) return q, k, v @@ -410,8 +391,7 @@ class IluvatarAttnBackend(AttentionBackend): prefill_info_dict = {"q": [], "k": [], "v": [], "batch_ids": []} decode_info_dict = {"q": [], "k": [], "v": [], "batch_ids": []} tensor_start = 0 - for batch_idx, seq_lens_this_time in enumerate( - 
forward_meta.seq_lens_this_time): + for batch_idx, seq_lens_this_time in enumerate(forward_meta.seq_lens_this_time): if seq_lens_this_time == 0: continue tensor_end = tensor_start + seq_lens_this_time @@ -432,29 +412,21 @@ class IluvatarAttnBackend(AttentionBackend): tensor_start = tensor_end if len(prefill_info_dict["batch_ids"]) > 0: - prefill_info_dict["q"] = paddle.concat(prefill_info_dict["q"], - axis=0) - prefill_info_dict["k"] = paddle.concat(prefill_info_dict["k"], - axis=0) - prefill_info_dict["v"] = paddle.concat(prefill_info_dict["v"], - axis=0) - cu_seq_ids = list( - map(lambda x: x + 1, prefill_info_dict["batch_ids"])) + prefill_info_dict["q"] = paddle.concat(prefill_info_dict["q"], axis=0) + prefill_info_dict["k"] = paddle.concat(prefill_info_dict["k"], axis=0) + prefill_info_dict["v"] = paddle.concat(prefill_info_dict["v"], axis=0) + cu_seq_ids = list(map(lambda x: x + 1, prefill_info_dict["batch_ids"])) prefill_info_dict["cu_seq_ids"] = [0, *cu_seq_ids] if len(decode_info_dict["batch_ids"]) > 0: - decode_info_dict["q"] = paddle.concat(decode_info_dict["q"], - axis=0) - decode_info_dict["k"] = paddle.concat(decode_info_dict["k"], - axis=0) - decode_info_dict["v"] = paddle.concat(decode_info_dict["v"], - axis=0) + decode_info_dict["q"] = paddle.concat(decode_info_dict["q"], axis=0) + decode_info_dict["k"] = paddle.concat(decode_info_dict["k"], axis=0) + decode_info_dict["v"] = paddle.concat(decode_info_dict["v"], axis=0) return prefill_info_dict, decode_info_dict def merge_output(self, prefill_out, decode_out, forward_meta: ForwardMeta): - assert not (prefill_out is None and decode_out - is None), "prefill and decode output cannot both be None" + assert not (prefill_out is None and decode_out is None), "prefill and decode output cannot both be None" if prefill_out is None: return decode_out elif decode_out is None: @@ -468,20 +440,20 @@ class IluvatarAttnBackend(AttentionBackend): continue if seq_lens_this_time > 1: tensor_end = prefill_tensor_start + seq_lens_this_time - merged_output.append( - prefill_out[prefill_tensor_start:tensor_end, :, :]) + merged_output.append(prefill_out[prefill_tensor_start:tensor_end, :, :]) prefill_tensor_start = tensor_end else: assert seq_lens_this_time == 1 tensor_end = decode_tensor_start + seq_lens_this_time - merged_output.append( - decode_out[decode_tensor_start:tensor_end, :, :]) + merged_output.append(decode_out[decode_tensor_start:tensor_end, :, :]) decode_tensor_start = tensor_end - assert prefill_tensor_start == prefill_out.shape[0], \ - f"prefill merged unfinished: {prefill_tensor_start} vs {prefill_out.shape[0]}" - assert decode_tensor_start == decode_out.shape[0], \ - f"decode merged unfinished: {decode_tensor_start} vs {decode_out.shape[0]}" + assert ( + prefill_tensor_start == prefill_out.shape[0] + ), f"prefill merged unfinished: {prefill_tensor_start} vs {prefill_out.shape[0]}" + assert ( + decode_tensor_start == decode_out.shape[0] + ), f"decode merged unfinished: {decode_tensor_start} vs {decode_out.shape[0]}" merged_output = paddle.concat(merged_output, axis=0) return merged_output @@ -509,11 +481,9 @@ class IluvatarAttnBackend(AttentionBackend): q, k, v = self.get_splited_qkv(qkv, forward_meta) if self.only_use_flash_attn: - new_k, new_v = self.get_new_kv(k, v, k_cache_id, v_cache_id, - forward_meta) + new_k, new_v = self.get_new_kv(k, v, k_cache_id, v_cache_id, forward_meta) if self.do_check_kv_cache: - self._check_new_kv_correctness(k, v, new_k, new_v, layer_id, - forward_meta) + self._check_new_kv_correctness(k, v, 
new_k, new_v, layer_id, forward_meta) out = flash_attn_unpadded( q, @@ -526,13 +496,12 @@ class IluvatarAttnBackend(AttentionBackend): scale=self.attention_metadata.scale, dropout=self.attention_metadata.dropout, causal=self.attention_metadata.causal, - return_softmax=self.attention_metadata.return_softmax)[0] + return_softmax=self.attention_metadata.return_softmax, + )[0] - self.update_kv_cache(k, v, k_cache_id, v_cache_id, layer_id, - forward_meta) + self.update_kv_cache(k, v, k_cache_id, v_cache_id, layer_id, forward_meta) else: - prefill_info_dict, decode_info_dict = self.get_splited_info_by_stage( - q, k, v, forward_meta) + prefill_info_dict, decode_info_dict = self.get_splited_info_by_stage(q, k, v, forward_meta) prefill_out, decode_out = None, None if len(prefill_info_dict["batch_ids"]) > 0: @@ -540,16 +509,15 @@ class IluvatarAttnBackend(AttentionBackend): prefill_info_dict["q"], prefill_info_dict["k"], prefill_info_dict["v"], - cu_seqlens_q=forward_meta.cu_seqlens_q[ - prefill_info_dict["cu_seq_ids"]], - cu_seqlens_k=forward_meta.cu_seqlens_k[ - prefill_info_dict["cu_seq_ids"]], + cu_seqlens_q=forward_meta.cu_seqlens_q[prefill_info_dict["cu_seq_ids"]], + cu_seqlens_k=forward_meta.cu_seqlens_k[prefill_info_dict["cu_seq_ids"]], max_seqlen_q=self.attention_metadata.max_context_len, max_seqlen_k=self.attention_metadata.max_context_len, scale=self.attention_metadata.scale, dropout=self.attention_metadata.dropout, causal=self.attention_metadata.causal, - return_softmax=self.attention_metadata.return_softmax)[0] + return_softmax=self.attention_metadata.return_softmax, + )[0] self.update_kv_cache( prefill_info_dict["k"], prefill_info_dict["v"], @@ -557,7 +525,8 @@ class IluvatarAttnBackend(AttentionBackend): v_cache_id, layer_id, forward_meta, - specific_batch_ids=prefill_info_dict['batch_ids']) + specific_batch_ids=prefill_info_dict["batch_ids"], + ) if len(decode_info_dict["batch_ids"]) > 0: k_cache = forward_meta.caches[k_cache_id] @@ -567,10 +536,8 @@ class IluvatarAttnBackend(AttentionBackend): decode_info_dict["q"], k_cache, v_cache, - block_tables=forward_meta.block_tables[ - decode_info_dict["batch_ids"], :], - seq_lens=forward_meta.seq_lens_decoder[ - decode_info_dict["batch_ids"], 0] + 1, + block_tables=forward_meta.block_tables[decode_info_dict["batch_ids"], :], + seq_lens=forward_meta.seq_lens_decoder[decode_info_dict["batch_ids"], 0] + 1, num_kv_heads=self.attention_metadata.num_kv_heads, scale=self.attention_metadata.scale, block_size=self.attention_metadata.block_size, @@ -583,28 +550,31 @@ class IluvatarAttnBackend(AttentionBackend): use_cuda_graph=self.attention_metadata.use_cuda_graph, use_sqrt_alibi=self.attention_metadata.use_sqrt_alibi, k=decode_info_dict["k"], - v=decode_info_dict["v"]) + v=decode_info_dict["v"], + ) if self.do_check_kv_cache: self.update_kv_cache( - decode_info_dict['k'], - decode_info_dict['v'], + decode_info_dict["k"], + decode_info_dict["v"], k_cache_id, v_cache_id, layer_id, forward_meta, - specific_batch_ids=decode_info_dict['batch_ids'], - debug_paged_attn=True) + specific_batch_ids=decode_info_dict["batch_ids"], + debug_paged_attn=True, + ) if self.do_check_kv_cache: - new_k, new_v = self.get_new_kv(k, - v, - k_cache_id, - v_cache_id, - forward_meta, - debug_paged_attn=True) - self._check_new_kv_correctness(k, v, new_k, new_v, layer_id, - forward_meta) + new_k, new_v = self.get_new_kv( + k, + v, + k_cache_id, + v_cache_id, + forward_meta, + debug_paged_attn=True, + ) + self._check_new_kv_correctness(k, v, new_k, new_v, layer_id, forward_meta) 
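# A short illustrative note on the varlen convention consumed by the flash_attn_unpadded calls
# above: q/k/v are packed along the token axis and addressed through cumulative sequence lengths.
# The numbers below are made up purely to show the convention.
seq_lens_this_time = [5, 1, 3]             # per-request token counts in one step
cu_seqlens = [0]
for n in seq_lens_this_time:
    cu_seqlens.append(cu_seqlens[-1] + n)  # running prefix sum -> [0, 5, 6, 9]
assert cu_seqlens == [0, 5, 6, 9]
# Tokens of request i occupy rows cu_seqlens[i]:cu_seqlens[i+1] of the packed tensors, which is
# also the per-request slicing rule get_splited_info_by_stage() applies above.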
out = self.merge_output(prefill_out, decode_out, forward_meta) diff --git a/fastdeploy/model_executor/layers/attention/mla_attention_backend.py b/fastdeploy/model_executor/layers/attention/mla_attention_backend.py index e11469e96..12e36b73e 100644 --- a/fastdeploy/model_executor/layers/attention/mla_attention_backend.py +++ b/fastdeploy/model_executor/layers/attention/mla_attention_backend.py @@ -25,14 +25,19 @@ import paddle from paddle.nn.functional.flash_attention import flash_attn_unpadded from fastdeploy.model_executor.layers.attention.ops import ( - get_block_shape_and_split_kv_block, init_signal_layerwise, - open_shm_and_get_meta_signal, init_kv_signal_per_query) + get_block_shape_and_split_kv_block, + init_kv_signal_per_query, + init_signal_layerwise, + open_shm_and_get_meta_signal, +) from fastdeploy.platforms import current_platform if current_platform.is_cuda() and not current_platform.is_dcu(): - from fastdeploy.model_executor.ops.gpu import (decode_mla_write_cache, - multi_head_latent_attention, - prefill_mla_write_cache) + from fastdeploy.model_executor.ops.gpu import ( + decode_mla_write_cache, + multi_head_latent_attention, + prefill_mla_write_cache, + ) if TYPE_CHECKING: from fastdeploy.model_executor.forward_meta import ForwardMeta @@ -40,13 +45,14 @@ if TYPE_CHECKING: from fastdeploy.config import FDConfig from fastdeploy.model_executor.layers.attention.attention import Attention from fastdeploy.model_executor.layers.attention.base_attention_backend import ( - AttentionBackend, AttentionMetadata) + AttentionBackend, + AttentionMetadata, +) from fastdeploy.model_executor.layers.attention.utils import init_rank_and_device_id def yarn_get_mscale(scale=1, mscale=1): - """ - """ + """ """ if scale <= 1: return 1.0 return 0.1 * mscale * math.log(scale) + 1.0 @@ -57,6 +63,7 @@ class MLAAttentionMetadata(AttentionMetadata): """ MLAAttentionMetadata for Multi-Layer Attention """ + max_len_kv: paddle.Tensor = None set_max_lengths: int = -1 encoder_batch_ids: paddle.Tensor = None @@ -89,8 +96,13 @@ class MLAAttentionBackend(AttentionBackend): MLA Attention Backend implementation. 
""" - def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int, - head_dim: int) -> None: + def __init__( + self, + fd_config: FDConfig, + kv_num_heads: int, + num_heads: int, + head_dim: int, + ) -> None: """ MLAAttentionBackend __init__ """ @@ -100,9 +112,9 @@ class MLAAttentionBackend(AttentionBackend): # 基础配置 self.block_size: int = fd_config.parallel_config.block_size self.max_seq_len: int = fd_config.parallel_config.max_model_len - self.rope_theta: float = (10000.0 - if fd_config.model_config.rope_theta is None - else fd_config.model_config.rope_theta) + self.rope_theta: float = ( + 10000.0 if fd_config.model_config.rope_theta is None else fd_config.model_config.rope_theta + ) self.rope_3d: bool = getattr(fd_config.model_config, "rope_3d", False) self.causal: bool = getattr(fd_config.model_config, "causal", True) self.speculative_method: str = fd_config.speculative_config.method @@ -119,14 +131,11 @@ class MLAAttentionBackend(AttentionBackend): # For Multi Head Latent Attention self.kv_lora_rank: int = fd_config.model_config.kv_lora_rank self.qk_rope_head_dim: int = fd_config.model_config.qk_rope_head_dim - self.qk_head_dim: int = fd_config.model_config.qk_nope_head_dim \ - + fd_config.model_config.qk_rope_head_dim + self.qk_head_dim: int = fd_config.model_config.qk_nope_head_dim + fd_config.model_config.qk_rope_head_dim self.attn_softmax_scale: float = self.qk_head_dim**-0.5 if fd_config.model_config.rope_scaling: - mscale_all_dim = fd_config.model_config.rope_scaling.get( - "mscale_all_dim", False) # 1.0 - scaling_factor = fd_config.model_config.rope_scaling[ - "factor"] # 40 + mscale_all_dim = fd_config.model_config.rope_scaling.get("mscale_all_dim", False) # 1.0 + scaling_factor = fd_config.model_config.rope_scaling["factor"] # 40 mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim)) self.attn_softmax_scale = self.attn_softmax_scale * mscale * mscale @@ -134,7 +143,7 @@ class MLAAttentionBackend(AttentionBackend): self.start_layer_index: int = fd_config.model_config.start_layer_index self.device_id: int = os.getenv("CUDA_VISIBLE_DEVICES", None) - + self.rank, self.device_id = init_rank_and_device_id(fd_config) def init_attention_metadata(self, forward_meta: ForwardMeta): @@ -199,7 +208,8 @@ class MLAAttentionBackend(AttentionBackend): ) elif self.pd_disaggregation_mode == "per_query": metadata.kv_signal_metadata = open_shm_and_get_meta_signal( - self.rank, int(self.device_id), self.keep_pd_step_flag) + self.rank, int(self.device_id), self.keep_pd_step_flag + ) self.attention_metadata: AttentionMetadata = metadata @@ -207,13 +217,16 @@ class MLAAttentionBackend(AttentionBackend): """get_attntion_meta""" return self.attention_metadata - def get_kv_cache_shape(self, - max_num_blocks: int) -> Tuple[int, int, int, int]: + def get_kv_cache_shape(self, max_num_blocks: int) -> Tuple[int, int, int, int]: """ Calculate kv cache shape for MLA """ - return (max_num_blocks, 1, self.block_size, - self.kv_lora_rank + self.qk_rope_head_dim) + return ( + max_num_blocks, + 1, + self.block_size, + self.kv_lora_rank + self.qk_rope_head_dim, + ) def forward_extend( self, @@ -232,13 +245,12 @@ class MLAAttentionBackend(AttentionBackend): metadata = self.attention_metadata if self.pd_disaggregation_mode == "per_query": - metadata.kv_signal_data_list[ - layer.layer_id] = init_signal_layerwise( - metadata.kv_signal_metadata, - layer.layer_id + self.start_layer_index) + metadata.kv_signal_data_list[layer.layer_id] = init_signal_layerwise( + metadata.kv_signal_metadata, + 
layer.layer_id + self.start_layer_index, + ) - latent_cache = forward_meta.caches[layer.layer_id] if hasattr( - forward_meta, 'caches') else None + latent_cache = forward_meta.caches[layer.layer_id] if hasattr(forward_meta, "caches") else None # 写入缓存 prefill_mla_write_cache( @@ -251,7 +263,7 @@ class MLAAttentionBackend(AttentionBackend): forward_meta.cum_offsets, metadata.block_tables, "none", - getattr(forward_meta, 'max_input_length', -1), + getattr(forward_meta, "max_input_length", -1), ) # Flash注意力计算 @@ -287,13 +299,12 @@ class MLAAttentionBackend(AttentionBackend): metadata = self.attention_metadata if self.use_pd_disaggregation: - metadata.kv_signal_data_list[ - layer.layer_id] = init_signal_layerwise( - metadata.kv_signal_metadata, - layer.layer_id + self.start_layer_index) + metadata.kv_signal_data_list[layer.layer_id] = init_signal_layerwise( + metadata.kv_signal_metadata, + layer.layer_id + self.start_layer_index, + ) - latent_cache = forward_meta.caches[layer.layer_id] if hasattr( - forward_meta, 'caches') else None + latent_cache = forward_meta.caches[layer.layer_id] if hasattr(forward_meta, "caches") else None # 获取推测解码参数 speculate_decoder = self.speculative_method is not None @@ -335,8 +346,7 @@ class MLAAttentionBackend(AttentionBackend): metadata.decoder_batch_ids, metadata.decoder_tile_ids_per_batch, metadata.decoder_num_blocks, - metadata. - decoder_num_blocks, # PaddleNLP 传入的是 decoder_num_blocks_cpu + metadata.decoder_num_blocks, # PaddleNLP 传入的是 decoder_num_blocks_cpu metadata.max_enc_len_this_time, metadata.max_dec_len_this_time, metadata.max_len_kv, @@ -385,13 +395,12 @@ class MLAAttentionBackend(AttentionBackend): speculate_max_tokens = self.speculate_max_draft_token_num if self.use_pd_disaggregation: - metadata.kv_signal_data_list[ - layer.layer_id] = init_signal_layerwise( - metadata.kv_signal_metadata, - layer.layer_id + self.start_layer_index) + metadata.kv_signal_data_list[layer.layer_id] = init_signal_layerwise( + metadata.kv_signal_metadata, + layer.layer_id + self.start_layer_index, + ) - latent_cache = forward_meta.caches[layer.layer_id] if hasattr( - forward_meta, 'caches') else None + latent_cache = forward_meta.caches[layer.layer_id] if hasattr(forward_meta, "caches") else None if k is not None: prefill_mla_write_cache( @@ -460,8 +469,7 @@ class MLAAttentionBackend(AttentionBackend): metadata.decoder_batch_ids, metadata.decoder_tile_ids_per_batch, metadata.decoder_num_blocks, - metadata. 
- decoder_num_blocks, # PaddleNLP 传入的是 decoder_num_blocks_cpu + metadata.decoder_num_blocks, # PaddleNLP 传入的是 decoder_num_blocks_cpu metadata.max_enc_len_this_time, metadata.max_dec_len_this_time, metadata.max_len_kv, diff --git a/fastdeploy/model_executor/layers/attention/native_paddle_backend.py b/fastdeploy/model_executor/layers/attention/native_paddle_backend.py index b8f5db6a1..f92df9724 100644 --- a/fastdeploy/model_executor/layers/attention/native_paddle_backend.py +++ b/fastdeploy/model_executor/layers/attention/native_paddle_backend.py @@ -18,11 +18,14 @@ from __future__ import annotations from typing import TYPE_CHECKING + import paddle from paddle.nn.functional import scaled_dot_product_attention -from fastdeploy.model_executor.layers.attention.base_attention_backend import \ - AttentionBackend +from fastdeploy.model_executor.layers.attention.base_attention_backend import ( + AttentionBackend, +) + if TYPE_CHECKING: from fastdeploy.model_executor.forward_meta import ForwardMeta @@ -104,19 +107,20 @@ class PaddleNativeAttnBackend(AttentionBackend): per_req_tokens = req_to_token[req_pool_idx, :seq_len_kv] # per_req_key = k_cache[per_req_tokens].movedim(0, query.dim() - 2) # per_req_value = v_cache[per_req_tokens].movedim(0, query.dim() - 2) - per_req_key = k_cache[per_req_tokens].transpose( - [query.dim() - 2, 0]) - per_req_value = v_cache[per_req_tokens].transpose( - [query.dim() - 2, 0]) + per_req_key = k_cache[per_req_tokens].transpose([query.dim() - 2, 0]) + per_req_value = v_cache[per_req_tokens].transpose([query.dim() - 2, 0]) - per_req_out_redudant = (scaled_dot_product_attention( - per_req_query_redudant.unsqueeze(0), - per_req_key.unsqueeze(0), - per_req_value.unsqueeze(0), - is_causal=causal, - ).squeeze(0).transpose([query.dim() - 2, 0])) - output[start_q:end_q, :, :] = per_req_out_redudant[ - prefill_seq_len_q:, :, :] + per_req_out_redudant = ( + scaled_dot_product_attention( + per_req_query_redudant.unsqueeze(0), + per_req_key.unsqueeze(0), + per_req_value.unsqueeze(0), + is_causal=causal, + ) + .squeeze(0) + .transpose([query.dim() - 2, 0]) + ) + output[start_q:end_q, :, :] = per_req_out_redudant[prefill_seq_len_q:, :, :] start_q, start_kv = end_q, end_kv return output @@ -132,8 +136,7 @@ class PaddleNativeAttnBackend(AttentionBackend): d_k = query.shape[-1] scores = paddle.matmul(query, key.transpose([0, 1, 3, 2])) # QK^T - scores = scores / \ - paddle.sqrt(paddle.to_tensor(d_k, dtype=scores.dtype)) + scores = scores / paddle.sqrt(paddle.to_tensor(d_k, dtype=scores.dtype)) if is_causal: # Apply causal mask q_len, k_len = scores.shape[-2], scores.shape[-1] @@ -192,17 +195,19 @@ class PaddleNativeAttnBackend(AttentionBackend): per_req_tokens = req_to_token[req_pool_idx, :seq_len_kv] # [seq_len_kv, num_heads, head_size] -> [num_heads, seq_len_kv, head_size] - per_req_key = k_cache[per_req_tokens].transpose( - [query.dim() - 2, 0]) - per_req_value = v_cache[per_req_tokens].transpose( - [query.dim() - 2, 0]) + per_req_key = k_cache[per_req_tokens].transpose([query.dim() - 2, 0]) + per_req_value = v_cache[per_req_tokens].transpose([query.dim() - 2, 0]) - per_req_out = (self._scaled_dot_product_attention( - per_req_query.unsqueeze(0), - per_req_key.unsqueeze(0), - per_req_value.unsqueeze(0), - is_causal=causal, - ).squeeze(0).transpose([query.dim() - 2, 0])) + per_req_out = ( + self._scaled_dot_product_attention( + per_req_query.unsqueeze(0), + per_req_key.unsqueeze(0), + per_req_value.unsqueeze(0), + is_causal=causal, + ) + .squeeze(0) + .transpose([query.dim() - 2, 0]) + 
) output[start_q:end_q, :, :] = per_req_out start_q, start_kv = end_q, end_kv @@ -218,17 +223,15 @@ class PaddleNativeAttnBackend(AttentionBackend): save_kv_cache: bool = True, ) -> paddle.Tensor: """ - Run the prefill and extend(prompt cache) attention forward by using paddle native sdpa op. + Run the prefill and extend(prompt cache) attention forward by using paddle native sdpa op. """ if layer.qk_head_dim != layer.v_head_dim: - o = q.new_empty( - (q.shape[0], layer.self.num_heads * layer.v_head_dim)) + o = q.new_empty((q.shape[0], layer.self.num_heads * layer.v_head_dim)) else: o = paddle.empty_like(q) if save_kv_cache: - forward_meta.token_to_kv_pool.set_kv_buffer( - layer, forward_meta.out_cache_loc, k, v) + forward_meta.token_to_kv_pool.set_kv_buffer(layer, forward_meta.out_cache_loc, k, v) q_ = q.view([-1, layer.self.num_heads, layer.qk_head_dim]) o_ = o.view([-1, layer.self.num_heads, layer.v_head_dim]) @@ -258,19 +261,16 @@ class PaddleNativeAttnBackend(AttentionBackend): forward_meta: ForwardMeta, ) -> paddle.Tensor: """ - Run the decoding attention forward by using paddle native sdpa op. + Run the decoding attention forward by using paddle native sdpa op. """ q = q.reshape([-1, layer.self.num_heads * layer.qk_head_dim]) if layer.qk_head_dim != layer.v_head_dim: - o = q.new_empty( - (q.shape[0], layer.self.num_heads * layer.v_head_dim)) + o = q.new_empty((q.shape[0], layer.self.num_heads * layer.v_head_dim)) else: o = paddle.empty_like(q) - forward_meta.token_to_kv_pool.set_kv_buffer(layer, - forward_meta.out_cache_loc, - k, v) + forward_meta.token_to_kv_pool.set_kv_buffer(layer, forward_meta.out_cache_loc, k, v) q_ = q.view([-1, layer.self.num_heads, layer.qk_head_dim]) o_ = o.view([-1, layer.self.num_heads, layer.v_head_dim]) diff --git a/fastdeploy/model_executor/layers/attention/ops/__init__.py b/fastdeploy/model_executor/layers/attention/ops/__init__.py index a44ca7cbf..f2f629d94 100644 --- a/fastdeploy/model_executor/layers/attention/ops/__init__.py +++ b/fastdeploy/model_executor/layers/attention/ops/__init__.py @@ -15,13 +15,12 @@ """ from .append_attention import append_attention -from .get_block_shape_and_split_kv_block import \ - get_block_shape_and_split_kv_block +from .get_block_shape_and_split_kv_block import get_block_shape_and_split_kv_block from .gqa_rope_write_cache import gqa_rope_write_cache +from .init_kv_signal_per_query import init_kv_signal_per_query from .init_signal_layerwise import init_signal_layerwise from .open_shm_and_get_meta_signal import open_shm_and_get_meta_signal from .pre_cache_len_concat import pre_cache_len_concat -from .init_kv_signal_per_query import init_kv_signal_per_query __all__ = [ "get_block_shape_and_split_kv_block", @@ -30,5 +29,5 @@ __all__ = [ "init_signal_layerwise", "gqa_rope_write_cache", "pre_cache_len_concat", - "init_kv_signal_per_query" + "init_kv_signal_per_query", ] diff --git a/fastdeploy/model_executor/layers/attention/ops/append_attention.py b/fastdeploy/model_executor/layers/attention/ops/append_attention.py index 979e8fd64..e3d2aee3c 100644 --- a/fastdeploy/model_executor/layers/attention/ops/append_attention.py +++ b/fastdeploy/model_executor/layers/attention/ops/append_attention.py @@ -21,8 +21,9 @@ import paddle from fastdeploy.platforms import current_platform if current_platform.is_cuda() and not current_platform.is_dcu(): - from fastdeploy.model_executor.ops.gpu import \ - append_attention as append_attention_gpu + from fastdeploy.model_executor.ops.gpu import ( + append_attention as append_attention_gpu, + ) 
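# The op wrappers in this package share one dispatch shape, visible in the hunks above and below:
# resolve the compiled kernel only on supported platforms and raise NotImplementedError otherwise.
# A minimal sketch of that shape; `my_custom_op` is a hypothetical kernel name used purely for
# illustration and does not exist under fastdeploy.model_executor.ops.gpu.
from fastdeploy.platforms import current_platform


def my_custom_op(*args, **kwargs):
    """Hypothetical wrapper mirroring append_attention / gqa_rope_write_cache in this package."""
    if current_platform.is_cuda() and not current_platform.is_dcu():
        from fastdeploy.model_executor.ops.gpu import my_custom_op as op  # hypothetical kernel

        return op(*args, **kwargs)
    raise NotImplementedError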
def append_attention( @@ -131,4 +132,4 @@ def append_attention( ) return out else: - raise NotImplementedError() + raise NotImplementedError diff --git a/fastdeploy/model_executor/layers/attention/ops/get_block_shape_and_split_kv_block.py b/fastdeploy/model_executor/layers/attention/ops/get_block_shape_and_split_kv_block.py index 97c3e6f9b..f2e252a42 100644 --- a/fastdeploy/model_executor/layers/attention/ops/get_block_shape_and_split_kv_block.py +++ b/fastdeploy/model_executor/layers/attention/ops/get_block_shape_and_split_kv_block.py @@ -19,9 +19,9 @@ import paddle from fastdeploy.platforms import current_platform if current_platform.is_cuda(): - from fastdeploy.model_executor.ops.gpu import \ - get_block_shape_and_split_kv_block as \ - get_block_shape_and_split_kv_block_cuda + from fastdeploy.model_executor.ops.gpu import ( + get_block_shape_and_split_kv_block as get_block_shape_and_split_kv_block_cuda, + ) def get_block_shape_and_split_kv_block( @@ -32,7 +32,7 @@ def get_block_shape_and_split_kv_block( decoder_block_shape_q: int, group_size: int, block_size: int, - decoder_step_token_num: int + decoder_step_token_num: int, ): """ get_block_shape_and_split_kv_block @@ -58,7 +58,7 @@ def get_block_shape_and_split_kv_block( decoder_block_shape_q, group_size, block_size, - decoder_step_token_num + decoder_step_token_num, ) return ( encoder_batch_ids, @@ -74,4 +74,4 @@ def get_block_shape_and_split_kv_block( set_max_lengths, ) else: - raise NotImplementedError() + raise NotImplementedError diff --git a/fastdeploy/model_executor/layers/attention/ops/gqa_rope_write_cache.py b/fastdeploy/model_executor/layers/attention/ops/gqa_rope_write_cache.py index c012d932a..42bdd74a0 100644 --- a/fastdeploy/model_executor/layers/attention/ops/gqa_rope_write_cache.py +++ b/fastdeploy/model_executor/layers/attention/ops/gqa_rope_write_cache.py @@ -22,45 +22,68 @@ from fastdeploy.platforms import current_platform def gqa_rope_write_cache( - qkv: paddle.Tensor, - key_cache: paddle.Tensor, - value_cache: paddle.Tensor, - cu_seqlens_q: paddle.Tensor, - cu_seqlens_k: paddle.Tensor, - rotary_embs: paddle.Tensor, - seq_lens_this_time: paddle.Tensor, - seq_lens_encoder: paddle.Tensor, - seq_lens_decoder: paddle.Tensor, - padding_offsets: paddle.Tensor, - cum_offsets: paddle.Tensor, - block_tables: paddle.Tensor, - kv_batch_ids: paddle.Tensor, - kv_tile_ids_per_batch: paddle.Tensor, - kv_num_blocks: paddle.Tensor, - cache_batch_ids: paddle.Tensor, - cache_tile_ids_per_batch: paddle.Tensor, - cache_num_blocks: paddle.Tensor, - cache_k_quant_scales: Optional[paddle.Tensor] = None, - cache_v_quant_scales: Optional[paddle.Tensor] = None, - cache_k_dequant_scales: Optional[paddle.Tensor] = None, - cache_v_dequant_scales: Optional[paddle.Tensor] = None, - cache_k_zp: Optional[paddle.Tensor] = None, - cache_v_zp: Optional[paddle.Tensor] = None, - kv_signal_data: Optional[paddle.Tensor] = None, - kv_token_num: int = 1, - max_seq_len: int = 0, - cache_quant_type: str = "none"): + qkv: paddle.Tensor, + key_cache: paddle.Tensor, + value_cache: paddle.Tensor, + cu_seqlens_q: paddle.Tensor, + cu_seqlens_k: paddle.Tensor, + rotary_embs: paddle.Tensor, + seq_lens_this_time: paddle.Tensor, + seq_lens_encoder: paddle.Tensor, + seq_lens_decoder: paddle.Tensor, + padding_offsets: paddle.Tensor, + cum_offsets: paddle.Tensor, + block_tables: paddle.Tensor, + kv_batch_ids: paddle.Tensor, + kv_tile_ids_per_batch: paddle.Tensor, + kv_num_blocks: paddle.Tensor, + cache_batch_ids: paddle.Tensor, + cache_tile_ids_per_batch: paddle.Tensor, + 
cache_num_blocks: paddle.Tensor, + cache_k_quant_scales: Optional[paddle.Tensor] = None, + cache_v_quant_scales: Optional[paddle.Tensor] = None, + cache_k_dequant_scales: Optional[paddle.Tensor] = None, + cache_v_dequant_scales: Optional[paddle.Tensor] = None, + cache_k_zp: Optional[paddle.Tensor] = None, + cache_v_zp: Optional[paddle.Tensor] = None, + kv_signal_data: Optional[paddle.Tensor] = None, + kv_token_num: int = 1, + max_seq_len: int = 0, + cache_quant_type: str = "none", +): if current_platform.is_cuda(): from fastdeploy.model_executor.ops.gpu import gqa_rope_write_cache + q, k, v, qkv_ = gqa_rope_write_cache( - qkv, key_cache, value_cache, cu_seqlens_q, cu_seqlens_k, - rotary_embs, seq_lens_this_time, seq_lens_encoder, - seq_lens_decoder, padding_offsets, cum_offsets, block_tables, - kv_batch_ids, kv_tile_ids_per_batch, kv_num_blocks, - cache_batch_ids, cache_tile_ids_per_batch, cache_num_blocks, - cache_k_quant_scales, cache_v_quant_scales, cache_k_dequant_scales, - cache_v_dequant_scales, cache_k_zp, cache_v_zp, kv_signal_data, - kv_token_num, max_seq_len, cache_quant_type) + qkv, + key_cache, + value_cache, + cu_seqlens_q, + cu_seqlens_k, + rotary_embs, + seq_lens_this_time, + seq_lens_encoder, + seq_lens_decoder, + padding_offsets, + cum_offsets, + block_tables, + kv_batch_ids, + kv_tile_ids_per_batch, + kv_num_blocks, + cache_batch_ids, + cache_tile_ids_per_batch, + cache_num_blocks, + cache_k_quant_scales, + cache_v_quant_scales, + cache_k_dequant_scales, + cache_v_dequant_scales, + cache_k_zp, + cache_v_zp, + kv_signal_data, + kv_token_num, + max_seq_len, + cache_quant_type, + ) return q, k, v, qkv_ else: - raise NotImplementedError() + raise NotImplementedError diff --git a/fastdeploy/model_executor/layers/attention/ops/init_kv_signal_per_query.py b/fastdeploy/model_executor/layers/attention/ops/init_kv_signal_per_query.py index 866c0f168..3cae36bb5 100644 --- a/fastdeploy/model_executor/layers/attention/ops/init_kv_signal_per_query.py +++ b/fastdeploy/model_executor/layers/attention/ops/init_kv_signal_per_query.py @@ -31,7 +31,14 @@ def init_kv_signal_per_query( """ if current_platform.is_cuda(): from fastdeploy.model_executor.ops.gpu import init_kv_signal_per_query - out = init_kv_signal_per_query(seq_lens_encoder, seq_lens_this_time, seq_lens_decoder, rank, num_layers) + + out = init_kv_signal_per_query( + seq_lens_encoder, + seq_lens_this_time, + seq_lens_decoder, + rank, + num_layers, + ) return out else: - raise NotImplementedError() + raise NotImplementedError diff --git a/fastdeploy/model_executor/layers/attention/ops/init_signal_layerwise.py b/fastdeploy/model_executor/layers/attention/ops/init_signal_layerwise.py index f3477c133..d18e575d6 100644 --- a/fastdeploy/model_executor/layers/attention/ops/init_signal_layerwise.py +++ b/fastdeploy/model_executor/layers/attention/ops/init_signal_layerwise.py @@ -28,7 +28,8 @@ def init_signal_layerwise( """ if current_platform.is_cuda(): from fastdeploy.model_executor.ops.gpu import init_signal_layerwise + out = init_signal_layerwise(kv_signal_metadata, layer_id) return out else: - raise NotImplementedError() + raise NotImplementedError diff --git a/fastdeploy/model_executor/layers/attention/ops/open_shm_and_get_meta_signal.py b/fastdeploy/model_executor/layers/attention/ops/open_shm_and_get_meta_signal.py index bdfb1fbb4..873f537b2 100644 --- a/fastdeploy/model_executor/layers/attention/ops/open_shm_and_get_meta_signal.py +++ b/fastdeploy/model_executor/layers/attention/ops/open_shm_and_get_meta_signal.py @@ -13,6 +13,7 
@@ # See the License for the specific language governing permissions and # limitations under the License. """ + import paddle from fastdeploy.platforms import current_platform @@ -27,9 +28,9 @@ def open_shm_and_get_meta_signal( open_shm_and_get_meta_signal """ if current_platform.is_cuda(): - from fastdeploy.model_executor.ops.gpu import \ - open_shm_and_get_meta_signal + from fastdeploy.model_executor.ops.gpu import open_shm_and_get_meta_signal + out = open_shm_and_get_meta_signal(rank, device_id, keep_pd_step_flag) return out else: - raise NotImplementedError() + raise NotImplementedError diff --git a/fastdeploy/model_executor/layers/attention/ops/pre_cache_len_concat.py b/fastdeploy/model_executor/layers/attention/ops/pre_cache_len_concat.py index f0f0780a3..42a931d18 100644 --- a/fastdeploy/model_executor/layers/attention/ops/pre_cache_len_concat.py +++ b/fastdeploy/model_executor/layers/attention/ops/pre_cache_len_concat.py @@ -23,14 +23,16 @@ import paddle from fastdeploy.platforms import current_platform -def pre_cache_len_concat(seq_lens_decoder: paddle.Tensor, - seq_lens_this_time: paddle.Tensor, - max_dec_len: int = 0, - block_size: int = 64): +def pre_cache_len_concat( + seq_lens_decoder: paddle.Tensor, + seq_lens_this_time: paddle.Tensor, + max_dec_len: int = 0, + block_size: int = 64, +): if current_platform.is_cuda(): from fastdeploy.model_executor.ops.gpu import pre_cache_len_concat - out = pre_cache_len_concat(seq_lens_decoder, seq_lens_this_time, - max_dec_len, block_size) + + out = pre_cache_len_concat(seq_lens_decoder, seq_lens_this_time, max_dec_len, block_size) return out else: - raise NotImplementedError() + raise NotImplementedError diff --git a/fastdeploy/model_executor/layers/attention/utils.py b/fastdeploy/model_executor/layers/attention/utils.py index ab0923630..00665cee4 100644 --- a/fastdeploy/model_executor/layers/attention/utils.py +++ b/fastdeploy/model_executor/layers/attention/utils.py @@ -15,14 +15,16 @@ """ import os + from fastdeploy.config import FDConfig + def init_rank_and_device_id(fd_config: FDConfig): - """ - - """ - rank = (fd_config.parallel_config.expert_parallel_rank * - fd_config.parallel_config.tensor_parallel_size + fd_config.parallel_config.tensor_parallel_rank) + """ """ + rank = ( + fd_config.parallel_config.expert_parallel_rank * fd_config.parallel_config.tensor_parallel_size + + fd_config.parallel_config.tensor_parallel_rank + ) cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES", None) diff --git a/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py b/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py index 6c3cade14..0d1a69c2c 100644 --- a/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py @@ -23,7 +23,9 @@ from typing import TYPE_CHECKING, List, Optional, Tuple import paddle from fastdeploy.model_executor.layers.attention.ops import ( - init_signal_layerwise, open_shm_and_get_meta_signal) + init_signal_layerwise, + open_shm_and_get_meta_signal, +) if TYPE_CHECKING: from fastdeploy.model_executor.forward_meta import ForwardMeta @@ -31,7 +33,9 @@ if TYPE_CHECKING: from fastdeploy.config import FDConfig from fastdeploy.model_executor.layers.attention.attention import Attention from fastdeploy.model_executor.layers.attention.base_attention_backend import ( - AttentionBackend, AttentionMetadata) + AttentionBackend, + AttentionMetadata, +) @dataclass @@ -39,6 +43,7 @@ class XPUAttentionMetadata(AttentionMetadata): """ 
XPUAttentionMetadata """ + max_len_kv: paddle.Tensor = None set_max_lengths: int = -1 encoder_batch_ids: paddle.Tensor = None @@ -71,8 +76,13 @@ class XPUAttentionBackend(AttentionBackend): XPUAttentionBackend backend implementation. """ - def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int, - head_dim: int): + def __init__( + self, + fd_config: FDConfig, + kv_num_heads: int, + num_heads: int, + head_dim: int, + ): """ XPUAttentionBackend __init__ """ @@ -81,9 +91,9 @@ class XPUAttentionBackend(AttentionBackend): # TODO(gongshaotian): Use fd_config parameters in the correct location self.block_size: int = fd_config.parallel_config.block_size self.max_seq_len: int = fd_config.parallel_config.max_model_len - self.rope_theta: float = (10000.0 - if fd_config.model_config.rope_theta is None - else fd_config.model_config.rope_theta) + self.rope_theta: float = ( + 10000.0 if fd_config.model_config.rope_theta is None else fd_config.model_config.rope_theta + ) self.rope_3d: bool = getattr(fd_config.model_config, "rope_3d", False) self.causal: bool = getattr(fd_config.model_config, "causal", True) # self.speculate_method = fd_config.parallel_config.speculate_method @@ -98,8 +108,7 @@ class XPUAttentionBackend(AttentionBackend): self.num_layers: int = fd_config.model_config.num_hidden_layers # pd_disaggregation - self.use_pd_disaggregation: int = int( - os.getenv("FLAGS_use_pd_disaggregation", 0)) + self.use_pd_disaggregation: int = int(os.getenv("FLAGS_use_pd_disaggregation", 0)) self.start_layer_index: int = fd_config.model_config.start_layer_index def init_attention_metadata(self, forward_meta: ForwardMeta): @@ -124,8 +133,7 @@ class XPUAttentionBackend(AttentionBackend): # pd_disaggregation metadata.kv_signal_data_list = [None] * self.num_layers if self.use_pd_disaggregation: - metadata.kv_signal_metadata = open_shm_and_get_meta_signal( - self.rank, self.keep_pd_step_flag) + metadata.kv_signal_metadata = open_shm_and_get_meta_signal(self.rank, self.keep_pd_step_flag) self.attention_metadata: AttentionMetadata = metadata def get_attntion_meta(self) -> AttentionMetadata: @@ -139,8 +147,12 @@ class XPUAttentionBackend(AttentionBackend): """ Caculate kv cache shape """ - return (max_num_blocks, self.kv_num_heads, self.block_size, - self.head_dim) + return ( + max_num_blocks, + self.kv_num_heads, + self.block_size, + self.head_dim, + ) def forward_mixed( self, @@ -159,15 +171,16 @@ class XPUAttentionBackend(AttentionBackend): metadata = self.attention_metadata if self.use_pd_disaggregation: - metadata.kv_signal_data_list[ - layer.layer_id] = init_signal_layerwise( - metadata.kv_signal_metadata, - layer.layer_id + self.start_layer_index) + metadata.kv_signal_data_list[layer.layer_id] = init_signal_layerwise( + metadata.kv_signal_metadata, + layer.layer_id + self.start_layer_index, + ) k_quant_scale = getattr(layer, "cache_k_scale", None) v_quant_scale = getattr(layer, "cache_v_scale", None) from fastdeploy.model_executor.ops.xpu import block_attn + res = block_attn( qkv, forward_meta.caches[2 * layer.layer_id], diff --git a/fastdeploy/model_executor/layers/backends/__init__.py b/fastdeploy/model_executor/layers/backends/__init__.py index 819aae01e..18d1fccfe 100644 --- a/fastdeploy/model_executor/layers/backends/__init__.py +++ b/fastdeploy/model_executor/layers/backends/__init__.py @@ -22,24 +22,29 @@ __all__ = [] if current_platform.is_xpu(): from . 
import xpu - from .xpu import * - if hasattr(xpu, '__all__'): + + # fix: F403 `from .xpu import *` used; unable to detect undefined names + if hasattr(xpu, "__all__"): + globals().update({name: getattr(xpu, name) for name in xpu.__all__}) __all__.extend(xpu.__all__) if current_platform.is_npu(): from . import npu - from .npu import * - if hasattr(npu, '__all__'): + + if hasattr(npu, "__all__"): + globals().update({name: getattr(npu, name) for name in npu.__all__}) __all__.extend(npu.__all__) if current_platform.is_gcu(): from . import gcu - from .gcu import * - if hasattr(gcu, '__all__'): + + if hasattr(gcu, "__all__"): + globals().update({name: getattr(gcu, name) for name in gcu.__all__}) __all__.extend(gcu.__all__) if current_platform.is_dcu(): - from .dcu import * from . import dcu - if hasattr(dcu, '__all__'): - __all__.extend(dcu.__all__) \ No newline at end of file + + if hasattr(dcu, "__all__"): + globals().update({name: getattr(dcu, name) for name in dcu.__all__}) + __all__.extend(dcu.__all__) diff --git a/fastdeploy/model_executor/layers/backends/dcu/__init__.py b/fastdeploy/model_executor/layers/backends/dcu/__init__.py index 22fb31ad5..920775729 100644 --- a/fastdeploy/model_executor/layers/backends/dcu/__init__.py +++ b/fastdeploy/model_executor/layers/backends/dcu/__init__.py @@ -19,4 +19,4 @@ dcu backend methods from .fused_moe_triton_backends import DCUTritonWeightOnlyMoEMethod from .weight_only import DCUWeightOnlyLinearMethod -__all__ = ['DCUTritonWeightOnlyMoEMethod', 'DCUWeightOnlyLinearMethod'] \ No newline at end of file +__all__ = ["DCUTritonWeightOnlyMoEMethod", "DCUWeightOnlyLinearMethod"] diff --git a/fastdeploy/model_executor/layers/backends/dcu/fused_moe_triton_backends.py b/fastdeploy/model_executor/layers/backends/dcu/fused_moe_triton_backends.py index 2961d3df6..9dd45ab95 100644 --- a/fastdeploy/model_executor/layers/backends/dcu/fused_moe_triton_backends.py +++ b/fastdeploy/model_executor/layers/backends/dcu/fused_moe_triton_backends.py @@ -17,10 +17,8 @@ import paddle from paddle import nn -from fastdeploy.distributed.communication_op import \ - tensor_model_parallel_all_reduce -from fastdeploy.model_executor.layers.quantization.quant_base import \ - QuantMethodBase +from fastdeploy.distributed.communication_op import tensor_model_parallel_all_reduce +from fastdeploy.model_executor.layers.quantization.quant_base import QuantMethodBase from fastdeploy.utils import ceil_div @@ -36,7 +34,8 @@ class DCUTritonWeightOnlyMoEMethod(QuantMethodBase): self.quant_method = quant_method self.added_weight_attrs = ["up_gate_proj_weight", "down_proj_weight"] self.added_scale_attrs = [ - "up_gate_proj_weight_scale", "down_proj_weight_scale" + "up_gate_proj_weight_scale", + "down_proj_weight_scale", ] def process_prequanted_weights(self, layer: nn.Layer, state_dict) -> None: @@ -52,10 +51,12 @@ class DCUTritonWeightOnlyMoEMethod(QuantMethodBase): assert len(down_proj_weights) == layer.num_local_experts assert self.quant_method.name() == "wint8" assert up_gate_proj_weights[0].shape == [ - layer.hidden_size, layer.moe_intermediate_size * 2 + layer.hidden_size, + layer.moe_intermediate_size * 2, ] assert down_proj_weights[0].shape == [ - layer.moe_intermediate_size, layer.hidden_size + layer.moe_intermediate_size, + layer.hidden_size, ] up_gate_proj_tensor = paddle.stack(up_gate_proj_weights, axis=0) @@ -71,26 +72,29 @@ class DCUTritonWeightOnlyMoEMethod(QuantMethodBase): scale_name = self.added_scale_attrs[idx] quanted_weight_scale = weight_tensor.abs().max(axis=1) - 
quanted_weight = weight_tensor / quanted_weight_scale[:, - None, :] * max_bound + quanted_weight = weight_tensor / quanted_weight_scale[:, None, :] * max_bound quanted_weight = paddle.round(quanted_weight).astype("int8") quanted_weight_scale = quanted_weight_scale / max_bound setattr( - layer, weight_name, + layer, + weight_name, layer.create_parameter( shape=quanted_weight.shape, dtype=quanted_weight.dtype, default_initializer=paddle.nn.initializer.Constant(0), - )) + ), + ) getattr(layer, weight_name).set_value(quanted_weight) setattr( - layer, scale_name, + layer, + scale_name, layer.create_parameter( shape=quanted_weight_scale.shape, dtype=quanted_weight_scale.dtype, - )) + ), + ) getattr(layer, scale_name).set_value(quanted_weight_scale) def apply( @@ -112,10 +116,7 @@ class DCUTritonWeightOnlyMoEMethod(QuantMethodBase): gate_out = paddle.matmul(x.cast("float32"), layer.gate_weight) scores = paddle.nn.functional.softmax(gate_out, axis=-1) scores += layer.gate_correction_bias - topk_weights, topk_ids = paddle.topk(scores, - k=top_k, - axis=-1, - sorted=False) + topk_weights, topk_ids = paddle.topk(scores, k=top_k, axis=-1, sorted=False) topk_weights = topk_weights / topk_weights.sum(axis=-1, keepdim=True) intermediate_cache1 = paddle.empty( @@ -140,11 +141,15 @@ class DCUTritonWeightOnlyMoEMethod(QuantMethodBase): from fastdeploy.model_executor.ops.gpu import tritonmoe_preprocess from .triton_moe_kernels import fused_moe_kernel_paddle + sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess( - topk_ids, num_local_experts, config["BLOCK_SIZE_M"]) + topk_ids, num_local_experts, config["BLOCK_SIZE_M"] + ) max_num_tokens_padded = sorted_token_ids.shape[0] - grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) * - ceil_div(moe_intermediate_size * 2, config["BLOCK_SIZE_N"]), ) + grid = ( + ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) + * ceil_div(moe_intermediate_size * 2, config["BLOCK_SIZE_N"]), + ) fused_moe_kernel_paddle[grid]( x, @@ -188,11 +193,11 @@ class DCUTritonWeightOnlyMoEMethod(QuantMethodBase): even_Ks=hidden_size % config["BLOCK_SIZE_K"] == 0, ) - intermediate_cache2 = paddle.incubate.nn.functional.swiglu( - intermediate_cache1) + intermediate_cache2 = paddle.incubate.nn.functional.swiglu(intermediate_cache1) - grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) * - ceil_div(hidden_size, config["BLOCK_SIZE_N"]), ) + grid = ( + ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) * ceil_div(hidden_size, config["BLOCK_SIZE_N"]), + ) fused_moe_kernel_paddle[grid]( intermediate_cache2, layer.down_proj_weight, diff --git a/fastdeploy/model_executor/layers/backends/dcu/triton_moe_kernels.py b/fastdeploy/model_executor/layers/backends/dcu/triton_moe_kernels.py index 4a0c33f82..53af5ae6c 100644 --- a/fastdeploy/model_executor/layers/backends/dcu/triton_moe_kernels.py +++ b/fastdeploy/model_executor/layers/backends/dcu/triton_moe_kernels.py @@ -29,7 +29,6 @@ def fused_moe_kernel_paddle( sorted_token_ids_ptr, expert_ids_ptr, num_tokens_post_padded_ptr, - # Matrix dimensions N, K, @@ -108,16 +107,13 @@ def fused_moe_kernel_paddle( offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N offs_k = tl.arange(0, BLOCK_SIZE_K) - a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am + - offs_k[None, :] * stride_ak) + a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am + offs_k[None, :] * stride_ak) off_experts = tl.load(expert_ids_ptr + pid_m) - b_ptrs = b_ptr + off_experts * stride_be + (offs_k[:, None] * stride_bk 
+ - offs_bn[None, :] * stride_bn) + b_ptrs = b_ptr + off_experts * stride_be + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn) if use_int8_w8a16: - b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bn[ - None, :] * stride_bsn + b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bn[None, :] * stride_bsn b_scale = tl.load(b_scale_ptrs) if use_fp8_w8a8: @@ -139,19 +135,14 @@ def fused_moe_kernel_paddle( mask=token_mask[:, None], other=0.0, ) - b = tl.load(b_ptrs, - cache_modifier=".cv", - eviction_policy='evict_first') + b = tl.load(b_ptrs, cache_modifier=".cv", eviction_policy="evict_first") else: a = tl.load( a_ptrs, - mask=token_mask[:, None] & - (offs_k[None, :] < K - k * BLOCK_SIZE_K), + mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K), other=0.0, ) - b = tl.load(b_ptrs, - mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, - other=0.0) + b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) # We accumulate along the K dimension. if use_int8_w8a16: @@ -160,13 +151,14 @@ def fused_moe_kernel_paddle( if group_k > 0 and group_n > 0: k_start = k * BLOCK_SIZE_K offs_ks = k_start // group_k - a_scale = tl.load(a_scale_ptrs + offs_ks * stride_ask, - mask=token_mask, - other=0.0) + a_scale = tl.load( + a_scale_ptrs + offs_ks * stride_ask, + mask=token_mask, + other=0.0, + ) b_scale = tl.load(b_scale_ptrs + offs_ks * stride_bsk) - accumulator += tl.dot(a, b) * a_scale[:, - None] * b_scale[None, :] + accumulator += tl.dot(a, b) * a_scale[:, None] * b_scale[None, :] else: accumulator = tl.dot(a, b, acc=accumulator) else: @@ -176,9 +168,7 @@ def fused_moe_kernel_paddle( b_ptrs += BLOCK_SIZE_K * stride_bk if MUL_ROUTED_WEIGHT: - moe_weight = tl.load(topk_weights_ptr + offs_token, - mask=token_mask, - other=0) + moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0) accumulator = accumulator * moe_weight[:, None] if use_int8_w8a16: accumulator = (accumulator * b_scale).to(compute_type) @@ -191,8 +181,7 @@ def fused_moe_kernel_paddle( accumulator = accumulator.to(compute_type) # Write back the block of the output offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) - c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[ - None, :] + c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :] c_mask = token_mask[:, None] & (offs_cn[None, :] < N) tl.store(c_ptrs, accumulator, mask=c_mask) diff --git a/fastdeploy/model_executor/layers/backends/dcu/weight_only.py b/fastdeploy/model_executor/layers/backends/dcu/weight_only.py index a29403f5c..061f4ab53 100644 --- a/fastdeploy/model_executor/layers/backends/dcu/weight_only.py +++ b/fastdeploy/model_executor/layers/backends/dcu/weight_only.py @@ -13,11 +13,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" + import paddle from paddle.nn.quant import weight_dequantize from fastdeploy.model_executor.layers.quantization.weight_only import ( - GPUWeightOnlyLinearMethod, WeightOnlyConfig) + GPUWeightOnlyLinearMethod, + WeightOnlyConfig, +) class DCUWeightOnlyLinearMethod(GPUWeightOnlyLinearMethod): @@ -38,7 +41,7 @@ class DCUWeightOnlyLinearMethod(GPUWeightOnlyLinearMethod): x=layer.weight, scale=layer.weight_scale, algo=self.quant_config.algo, - out_dtype=paddle.get_default_dtype() + out_dtype=paddle.get_default_dtype(), ) linear_out = paddle.matmul(x, dequant_out) if layer.bias is not None: diff --git a/fastdeploy/model_executor/layers/backends/gcu/__init__.py b/fastdeploy/model_executor/layers/backends/gcu/__init__.py index 8de8fe8d8..128690062 100644 --- a/fastdeploy/model_executor/layers/backends/gcu/__init__.py +++ b/fastdeploy/model_executor/layers/backends/gcu/__init__.py @@ -18,14 +18,13 @@ gcu backend methods from .attention.flash_attn_backend import GCUFlashAttnBackend from .attention.mem_efficient_attn_backend import GCUMemEfficientAttnBackend -from .moe.fused_moe_method_gcu_backend import (GCUFusedMoeMethod, - GCUWeightOnlyMoEMethod) +from .moe.fused_moe_method_gcu_backend import GCUFusedMoeMethod, GCUWeightOnlyMoEMethod from .quantization.weight_only import GCUWeightOnlyLinearMethod __all__ = [ - 'GCUFlashAttnBackend', - 'GCUMemEfficientAttnBackend', - 'GCUFusedMoeMethod', - 'GCUWeightOnlyMoEMethod', - 'GCUWeightOnlyLinearMethod', + "GCUFlashAttnBackend", + "GCUMemEfficientAttnBackend", + "GCUFusedMoeMethod", + "GCUWeightOnlyMoEMethod", + "GCUWeightOnlyLinearMethod", ] diff --git a/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py index 00032e26f..f63605a79 100644 --- a/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py +++ b/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py @@ -17,31 +17,33 @@ from __future__ import annotations import os -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import TYPE_CHECKING, List, Optional -import paddle - import numpy as np +import paddle from fastdeploy.config import FDConfig from fastdeploy.model_executor.layers.attention.attention import Attention from fastdeploy.model_executor.layers.attention.base_attention_backend import ( - AttentionBackend, AttentionMetadata) + AttentionBackend, + AttentionMetadata, +) + if TYPE_CHECKING: from fastdeploy.model_executor.forward_meta import ForwardMeta, ForwardMode -from fastdeploy.model_executor.ops.gcu import (fused_rotary_embedding, - mem_efficient_attention, - flash_attn_var_len) from paddleformers.utils.log import logger +from fastdeploy.model_executor.ops.gcu import flash_attn_var_len, fused_rotary_embedding + @dataclass class GCUFlashAttnMetadata(AttentionMetadata): """ GCUFlashAttnMetadata """ + forward_mode: ForwardMode = ForwardMode.MIXED _dtype: paddle.dtype = paddle.bfloat16 @@ -63,15 +65,18 @@ class GCUFlashAttnMetadata(AttentionMetadata): pre_caches_length: int = 0 - - class GCUFlashAttnBackend(AttentionBackend): """ GCUFlashAttnBackend backend implementation. 
""" - def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int, - head_dim: int): + def __init__( + self, + fd_config: FDConfig, + kv_num_heads: int, + num_heads: int, + head_dim: int, + ): """ GCUFlashAttnBackend __init__ """ @@ -99,8 +104,6 @@ class GCUFlashAttnBackend(AttentionBackend): self.rotary_embs = None self.enable_monitor: bool = bool(os.getenv("FD_GCU_ATTN_MONITOR", False)) - - def init_attention_metadata(self, forward_meta: ForwardMeta): """Initialize attntion metadata hence all layers in the forward pass can reuse it.""" metadata = GCUFlashAttnMetadata() @@ -131,15 +134,14 @@ class GCUFlashAttnBackend(AttentionBackend): self.rotary_embs = metadata.rotary_embs.reshape((-1, self.head_dim)) # some info for attention - self.seq_lens_this_time_list = forward_meta.seq_lens_this_time.tolist() # List[int] - self.seq_lens_encoder_list = forward_meta.seq_lens_encoder.tolist() # List[List[int]] - self.seq_lens_decoder_list = forward_meta.seq_lens_decoder.tolist() # List[List[int]] + self.seq_lens_this_time_list = forward_meta.seq_lens_this_time.tolist() # List[int] + self.seq_lens_encoder_list = forward_meta.seq_lens_encoder.tolist() # List[List[int]] + self.seq_lens_decoder_list = forward_meta.seq_lens_decoder.tolist() # List[List[int]] self.seq_lens_sum = np.sum(self.seq_lens_this_time_list) self.max_seq_len_this_time = np.max(self.seq_lens_this_time_list) num_seqs = forward_meta.seq_lens_this_time.shape[0] - self.is_decoder = all(x[0] == 0 for x in self.seq_lens_encoder_list) self.is_all_prefill = all(x[0] == 0 for x in self.seq_lens_decoder_list) @@ -147,8 +149,14 @@ class GCUFlashAttnBackend(AttentionBackend): if self.all_slot_mapping is None: max_num_blocks_per_seq = (self.max_seq_len + self.block_size - 1) // self.block_size total_blocks = max_num_blocks_per_seq * self.max_num_seqs - self.all_block_tables = np.arange(0, total_blocks, dtype=np.int32).reshape((self.max_num_seqs, max_num_blocks_per_seq)).tolist() - self.all_slot_mapping = np.arange(0, total_blocks * self.block_size, dtype=np.int32).reshape((self.max_num_seqs, -1)).tolist() + self.all_block_tables = ( + np.arange(0, total_blocks, dtype=np.int32) + .reshape((self.max_num_seqs, max_num_blocks_per_seq)) + .tolist() + ) + self.all_slot_mapping = ( + np.arange(0, total_blocks * self.block_size, dtype=np.int32).reshape((self.max_num_seqs, -1)).tolist() + ) block_tables = [] slot_mapping = [] @@ -157,9 +165,9 @@ class GCUFlashAttnBackend(AttentionBackend): position_ids = [] for seq_idx in range(num_seqs): cache_len = None - if self.seq_lens_encoder_list[seq_idx][0] != 0: # prefill + if self.seq_lens_encoder_list[seq_idx][0] != 0: # prefill cache_len = 0 - elif self.seq_lens_decoder_list[seq_idx][0] != 0: # decode + elif self.seq_lens_decoder_list[seq_idx][0] != 0: # decode cache_len = self.seq_lens_decoder_list[seq_idx][0] # else: doesnot have req in this seq_idx @@ -193,7 +201,6 @@ class GCUFlashAttnBackend(AttentionBackend): self.max_seqlen_q = self.max_seq_len_this_time self.max_seqlen_k = np.max(cache_lens) - def get_attntion_meta(self): """get_attntion_meta""" return self.attention_metadata @@ -206,9 +213,11 @@ class GCUFlashAttnBackend(AttentionBackend): Caculate kv cache shape """ # [total_tokens, kv_num_heads, head_dim] - return (max_num_blocks * self.block_size, - self.kv_num_heads, - self.head_dim) + return ( + max_num_blocks * self.block_size, + self.kv_num_heads, + self.head_dim, + ) @paddle.no_grad() def forward_mixed( @@ -232,7 +241,6 @@ class GCUFlashAttnBackend(AttentionBackend): query = 
query.reshape_((1, -1, self.num_heads, self.head_dim)) key = key.reshape_((1, -1, self.kv_num_heads, self.head_dim)) - # 1. Rope if self.rotary_embs.dtype != query.dtype: self.rotary_embs = paddle.cast(self.rotary_embs, query.dtype) @@ -242,7 +250,7 @@ class GCUFlashAttnBackend(AttentionBackend): key, self.rotary_embs, self.position_ids, - layer.use_neox_rotary_style + layer.use_neox_rotary_style, ) # 2. Save kv cache @@ -281,4 +289,3 @@ class GCUFlashAttnBackend(AttentionBackend): ) res = res.reshape_((token_num, -1)) return res - diff --git a/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py b/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py index 74a726a4f..d105a41c2 100644 --- a/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py +++ b/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py @@ -16,33 +16,35 @@ from __future__ import annotations -import os -from dataclasses import dataclass, field +import math +from dataclasses import dataclass from typing import TYPE_CHECKING, List, Optional -import paddle - import numpy as np -import math +import paddle +from paddleformers.utils.log import logger from fastdeploy.config import FDConfig from fastdeploy.model_executor.layers.attention.attention import Attention from fastdeploy.model_executor.layers.attention.base_attention_backend import ( - AttentionBackend, AttentionMetadata) - -from fastdeploy.model_executor.ops.gcu import (fused_rotary_embedding, - mem_efficient_attention, - flash_attn_var_len) -from paddleformers.utils.log import logger + AttentionBackend, + AttentionMetadata, +) +from fastdeploy.model_executor.ops.gcu import ( + fused_rotary_embedding, + mem_efficient_attention, +) if TYPE_CHECKING: from fastdeploy.model_executor.forward_meta import ForwardMeta, ForwardMode + @dataclass class GCUMemEfficientAttnMetadata(AttentionMetadata): """ GCUMemEfficientAttnMetadata """ + forward_mode: ForwardMode = ForwardMode.MIXED _dtype: paddle.dtype = paddle.bfloat16 @@ -63,15 +65,18 @@ class GCUMemEfficientAttnMetadata(AttentionMetadata): pre_caches_length: int = 0 - - class GCUMemEfficientAttnBackend(AttentionBackend): """ GCUMemEfficientAttnBackend backend implementation. 
""" - def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int, - head_dim: int): + def __init__( + self, + fd_config: FDConfig, + kv_num_heads: int, + num_heads: int, + head_dim: int, + ): """ GCUMemEfficientAttnBackend __init__ """ @@ -99,8 +104,6 @@ class GCUMemEfficientAttnBackend(AttentionBackend): self.rotary_embs = None self.use_paddle_native_sdpa = False - - def init_attention_metadata(self, forward_meta: ForwardMeta): """Initialize attntion metadata hence all layers in the forward pass can reuse it.""" metadata = GCUMemEfficientAttnMetadata() @@ -125,32 +128,35 @@ class GCUMemEfficientAttnBackend(AttentionBackend): metadata.pre_caches_length = forward_meta.pre_caches_length # not inited - self.attention_metadata = metadata if self.rotary_embs is None: self.rotary_embs = metadata.rotary_embs.reshape((-1, self.head_dim)) # some info for attention - self.seq_lens_this_time_list = forward_meta.seq_lens_this_time.tolist() # List[int] - self.seq_lens_encoder_list = forward_meta.seq_lens_encoder.tolist() # List[List[int]] - self.seq_lens_decoder_list = forward_meta.seq_lens_decoder.tolist() # List[List[int]] + self.seq_lens_this_time_list = forward_meta.seq_lens_this_time.tolist() # List[int] + self.seq_lens_encoder_list = forward_meta.seq_lens_encoder.tolist() # List[List[int]] + self.seq_lens_decoder_list = forward_meta.seq_lens_decoder.tolist() # List[List[int]] self.seq_lens_sum = np.sum(self.seq_lens_this_time_list) self.max_seq_len_this_time = np.max(self.seq_lens_this_time_list) num_seqs = forward_meta.seq_lens_this_time.shape[0] - self.is_decoder = all(x[0] == 0 for x in self.seq_lens_encoder_list) self.is_all_prefill = all(x[0] == 0 for x in self.seq_lens_decoder_list) - # block_tables and slot_mapping if self.all_slot_mapping is None: max_num_blocks_per_seq = (self.max_seq_len + self.block_size - 1) // self.block_size total_blocks = max_num_blocks_per_seq * self.max_num_seqs - self.all_block_tables = np.arange(0, total_blocks, dtype=np.int32).reshape((self.max_num_seqs, max_num_blocks_per_seq)).tolist() - self.all_slot_mapping = np.arange(0, total_blocks * self.block_size, dtype=np.int32).reshape((self.max_num_seqs, -1)).tolist() + self.all_block_tables = ( + np.arange(0, total_blocks, dtype=np.int32) + .reshape((self.max_num_seqs, max_num_blocks_per_seq)) + .tolist() + ) + self.all_slot_mapping = ( + np.arange(0, total_blocks * self.block_size, dtype=np.int32).reshape((self.max_num_seqs, -1)).tolist() + ) block_tables = [] slot_mapping = [] @@ -162,9 +168,9 @@ class GCUMemEfficientAttnBackend(AttentionBackend): position_ids = [] for seq_idx in range(num_seqs): cache_len = None - if self.seq_lens_encoder_list[seq_idx][0] != 0: # prefill + if self.seq_lens_encoder_list[seq_idx][0] != 0: # prefill cache_len = 0 - elif self.seq_lens_decoder_list[seq_idx][0] != 0: # decode + elif self.seq_lens_decoder_list[seq_idx][0] != 0: # decode cache_len = self.seq_lens_decoder_list[seq_idx][0] # else: doesnot have req in this seq_idx @@ -179,9 +185,12 @@ class GCUMemEfficientAttnBackend(AttentionBackend): position_ids.extend(self.position_ids_base[start:end]) query_lens.append(lens_this_time) cached_kv_lens.append(end) - cached_kv_slot_range.append([self.all_slot_mapping[seq_idx][0], self.all_slot_mapping[seq_idx][end]]) - - + cached_kv_slot_range.append( + [ + self.all_slot_mapping[seq_idx][0], + self.all_slot_mapping[seq_idx][end], + ] + ) self.block_tables = paddle.to_tensor(block_tables, dtype="int32") self.slot_mapping = paddle.to_tensor(slot_mapping, dtype="int32") @@ 
-206,7 +215,6 @@ class GCUMemEfficientAttnBackend(AttentionBackend): self.cached_kv_lens = cached_kv_lens self.cached_kv_slot_range = cached_kv_slot_range - def get_attntion_meta(self): """get_attntion_meta""" return self.attention_metadata @@ -219,9 +227,11 @@ class GCUMemEfficientAttnBackend(AttentionBackend): Caculate kv cache shape """ # [total_tokens, kv_num_heads, head_dim] - return (max_num_blocks * self.block_size, - self.kv_num_heads, - self.head_dim) + return ( + max_num_blocks * self.block_size, + self.kv_num_heads, + self.head_dim, + ) @paddle.no_grad() def forward_mixed( @@ -245,7 +255,6 @@ class GCUMemEfficientAttnBackend(AttentionBackend): query = query.reshape_((1, -1, self.num_heads, self.head_dim)) key = key.reshape_((1, -1, self.kv_num_heads, self.head_dim)) - # 1. Rope if self.rotary_embs.dtype != query.dtype: self.rotary_embs = paddle.cast(self.rotary_embs, query.dtype) @@ -255,7 +264,7 @@ class GCUMemEfficientAttnBackend(AttentionBackend): key, self.rotary_embs, self.position_ids, - layer.use_neox_rotary_style + layer.use_neox_rotary_style, ) # 2. Save kv cache @@ -282,9 +291,7 @@ class GCUMemEfficientAttnBackend(AttentionBackend): v_ = value_caches[kv_start:kv_end, :, :] if self.use_paddle_native_sdpa: - res = self.native_sdpa_impl( - q_, k_, v_ - ) + res = self.native_sdpa_impl(q_, k_, v_) else: res = mem_efficient_attention( query=q_.unsqueeze(0), @@ -302,7 +309,6 @@ class GCUMemEfficientAttnBackend(AttentionBackend): result = result.reshape_((token_num, -1)) return result - def get_triangle_upper_mask(self, shape, dtype): # [batch_size, 1, q_seq_len, kv_seq_len] shape[1] = 1 @@ -313,7 +319,6 @@ class GCUMemEfficientAttnBackend(AttentionBackend): mask = paddle.triu(mask, diagonal=kv_seq_len - q_seq_len + 1) return mask - def native_sdpa_impl(self, query, key, value): # input shape: [num_tokens, num_heads, head_dim] -> [1, num_tokens, num_heads, head_dim] q = query.unsqueeze(0) @@ -342,13 +347,9 @@ class GCUMemEfficientAttnBackend(AttentionBackend): # matmul and devide by sqrt(head_dim) attn_weights = paddle.matmul(q / math.sqrt(head_dim), k.transpose([0, 1, 3, 2])) - attention_mask = self.get_triangle_upper_mask( - [batch, 1, q_seq_len, kv_seq_len], q.dtype - ) + attention_mask = self.get_triangle_upper_mask([batch, 1, q_seq_len, kv_seq_len], q.dtype) attn_weights = attn_weights + attention_mask - attn_weights = paddle.nn.functional.softmax( - attn_weights, axis=-1, dtype="float32" - ).astype(q.dtype) + attn_weights = paddle.nn.functional.softmax(attn_weights, axis=-1, dtype="float32").astype(q.dtype) attn_output = paddle.matmul(attn_weights, v) attn_output = attn_output.transpose([0, 2, 1, 3]) diff --git a/fastdeploy/model_executor/layers/backends/gcu/moe/__init__.py b/fastdeploy/model_executor/layers/backends/gcu/moe/__init__.py index c61a9d89f..7f0dee0e2 100644 --- a/fastdeploy/model_executor/layers/backends/gcu/moe/__init__.py +++ b/fastdeploy/model_executor/layers/backends/gcu/moe/__init__.py @@ -11,6 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""" +""" " gcu moe """ diff --git a/fastdeploy/model_executor/layers/backends/gcu/moe/fused_moe_method_gcu_backend.py b/fastdeploy/model_executor/layers/backends/gcu/moe/fused_moe_method_gcu_backend.py index 42b931956..7bc1850c7 100644 --- a/fastdeploy/model_executor/layers/backends/gcu/moe/fused_moe_method_gcu_backend.py +++ b/fastdeploy/model_executor/layers/backends/gcu/moe/fused_moe_method_gcu_backend.py @@ -1,4 +1,3 @@ - """ # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. # @@ -15,7 +14,6 @@ # limitations under the License. """ - import multiprocessing import os @@ -24,27 +22,30 @@ import paddle from paddle import nn from paddleformers.utils.log import logger -from fastdeploy.model_executor.layers.moe.fused_moe_backend_base import \ - MoEMethodBase -from fastdeploy.model_executor.layers.utils import (CpuGuard, - create_and_set_parameter, - get_tensor) -from fastdeploy.model_executor.ops.gcu import (invoke_fused_moe_kernel, - moe_align_block_size, - topk_softmax, - weight_quantize_custom_rtn, - weight_quantize_rtn) +from fastdeploy.model_executor.layers.moe.fused_moe_backend_base import MoEMethodBase +from fastdeploy.model_executor.layers.utils import ( + CpuGuard, + create_and_set_parameter, + get_tensor, +) +from fastdeploy.model_executor.ops.gcu import ( + invoke_fused_moe_kernel, + moe_align_block_size, + topk_softmax, + weight_quantize_custom_rtn, + weight_quantize_rtn, +) class GCUFusedMoeMethod(MoEMethodBase): """ Use GCU to compute Fused MoE. """ + def __init__(self, quant_config): super().__init__(quant_config) self.group_size = -1 - def create_weights(self, layer: nn.Layer, state_dict): """ Paddle gcu create weight process. @@ -53,28 +54,28 @@ class GCUFusedMoeMethod(MoEMethodBase): up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) stacked_up_gate_proj_weights = paddle.stack(up_gate_proj_weights, axis=0) stacked_down_proj_weights = paddle.stack(down_proj_weights, axis=0) - for idx, weight_tensor in enumerate( - [stacked_up_gate_proj_weights, stacked_down_proj_weights]): + for idx, weight_tensor in enumerate([stacked_up_gate_proj_weights, stacked_down_proj_weights]): # shape [E, K, N] -> [E, N, K] weight_tensor = paddle.transpose(weight_tensor, [0, 2, 1]) weight_name = self.added_weight_attrs[idx] setattr( - layer, weight_name, + layer, + weight_name, layer.create_parameter( shape=weight_tensor.shape, dtype=weight_tensor.dtype, default_initializer=paddle.nn.initializer.Constant(0), - )) + ), + ) getattr(layer, weight_name).set_value(weight_tensor) - @paddle.no_grad() def compute_ffn( self, layer: nn.Layer, x: paddle.Tensor, gate_out: paddle.Tensor, - enable_quant = False + enable_quant=False, ) -> paddle.Tensor: """ Paddle gcu compute Fused MoE. 
@@ -86,8 +87,17 @@ class GCUFusedMoeMethod(MoEMethodBase): topk_weights = paddle.empty([token_num, top_k], dtype=gate_out.dtype) topk_indices = paddle.empty([token_num, top_k], dtype="int32") - token_expert_indices = paddle.empty([token_num, top_k], dtype="int32",) - topk_softmax(topk_weights, topk_indices, token_expert_indices, gate_out, norm_topk_prob=True) + token_expert_indices = paddle.empty( + [token_num, top_k], + dtype="int32", + ) + topk_softmax( + topk_weights, + topk_indices, + token_expert_indices, + gate_out, + norm_topk_prob=True, + ) config = { "BLOCK_SIZE_M": 32, @@ -136,7 +146,7 @@ class GCUFusedMoeMethod(MoEMethodBase): top_k, config, enable_quant, # use_int4_w4a16 - [0, self.group_size], # block_shape + [0, self.group_size], # block_shape ) intermediate_cache2 = paddle.empty( @@ -144,8 +154,7 @@ class GCUFusedMoeMethod(MoEMethodBase): dtype=x.dtype, ) - intermediate_cache2 = paddle.incubate.nn.functional.swiglu( - intermediate_cache1) + intermediate_cache2 = paddle.incubate.nn.functional.swiglu(intermediate_cache1) intermediate_cache2 = intermediate_cache2.reshape([-1, moe_intermediate_size]) @@ -181,13 +190,14 @@ class GCUFusedMoeMethod(MoEMethodBase): fused_moe_out = fused_moe_out.reshape_([token_num, hidden_size]) if layer.tp_size > 1: - from fastdeploy.distributed.communication_op import \ - tensor_model_parallel_all_reduce + from fastdeploy.distributed.communication_op import ( + tensor_model_parallel_all_reduce, + ) + tensor_model_parallel_all_reduce(fused_moe_out) return fused_moe_out - def apply( self, layer: nn.Layer, @@ -199,7 +209,6 @@ class GCUFusedMoeMethod(MoEMethodBase): """ return self.compute_ffn(layer, x, gate_out, enable_quant=False) - def apply_ep_prefill( self, layer: nn.Layer, @@ -211,7 +220,6 @@ class GCUFusedMoeMethod(MoEMethodBase): """ raise NotImplementedError - def apply_ep_decode( self, layer: nn.Layer, @@ -223,7 +231,6 @@ class GCUFusedMoeMethod(MoEMethodBase): """ raise NotImplementedError - def apply_tp( self, layer: nn.Layer, @@ -247,48 +254,44 @@ class GCUWeightOnlyMoEMethod(GCUFusedMoeMethod): self.moe_quant_type = self.quant_config.algo self.pack_num = 1 - assert self.quant_config.algo == "weight_only_int4", \ - "GCUWeightOnlyMoEMethod only support weight_only_int4, but got:{self.quant_config.algo}" + assert ( + self.quant_config.algo == "weight_only_int4" + ), "GCUWeightOnlyMoEMethod only support weight_only_int4, but got:{self.quant_config.algo}" self.added_qzeros_attrs = [ - "up_gate_proj_weight_zeros", "down_proj_weight_zeros" + "up_gate_proj_weight_zeros", + "down_proj_weight_zeros", ] self.group_size = 64 - self.quant_multi_process_group_size = int( - os.getenv("FD_MOE_QUANT_MULTI_PROCESS_GROUP_SIZE", 8) - ) + self.quant_multi_process_group_size = int(os.getenv("FD_MOE_QUANT_MULTI_PROCESS_GROUP_SIZE", 8)) logger.info(f"GCUWeightOnlyMoEMethod quant_multi_process_group_size: {self.quant_multi_process_group_size}") - def process_prequanted_weights(self, layer: nn.Layer, state_dict): """ Paddle gcu process prequanted weights. 
""" - up_gate_proj_expert_weight_key = layer.weight_key_map.get( - "up_gate_proj_expert_weight_key", None) - down_proj_expert_weight_key = layer.weight_key_map.get( - "down_proj_expert_weight_key", None) - up_gate_proj_expert_weight_scale_key = layer.weight_key_map.get( - "up_gate_proj_expert_weight_scale_key", None) - down_proj_expert_weight_scale_key = layer.weight_key_map.get( - "down_proj_expert_weight_scale_key", None) + up_gate_proj_expert_weight_key = layer.weight_key_map.get("up_gate_proj_expert_weight_key", None) + down_proj_expert_weight_key = layer.weight_key_map.get("down_proj_expert_weight_key", None) + up_gate_proj_expert_weight_scale_key = layer.weight_key_map.get("up_gate_proj_expert_weight_scale_key", None) + down_proj_expert_weight_scale_key = layer.weight_key_map.get("down_proj_expert_weight_scale_key", None) up_gate_proj_weights, down_proj_weights = layer.load_experts_weight( - state_dict, up_gate_proj_expert_weight_key, down_proj_expert_weight_key) + state_dict, + up_gate_proj_expert_weight_key, + down_proj_expert_weight_key, + ) # self.check(layer, up_gate_proj_weights, down_proj_weights) up_gate_proj_weight_scale = [] down_proj_weight_scale = [] for i in range(layer.num_experts): expert_idx = layer.expert_id_offset + i up_gate_proj_weight_scale.append( - get_tensor( - state_dict.pop( - up_gate_proj_expert_weight_scale_key.format(expert_idx)))) + get_tensor(state_dict.pop(up_gate_proj_expert_weight_scale_key.format(expert_idx))) + ) down_proj_weight_scale.append( - get_tensor( - state_dict.pop( - down_proj_expert_weight_scale_key.format(expert_idx)))) + get_tensor(state_dict.pop(down_proj_expert_weight_scale_key.format(expert_idx))) + ) up_gate_proj_weight = paddle.stack(up_gate_proj_weights, axis=0) down_proj_weight = paddle.stack(down_proj_weights, axis=0) @@ -299,12 +302,11 @@ class GCUWeightOnlyMoEMethod(GCUFusedMoeMethod): "up_gate_proj_weight": up_gate_proj_weight, "down_proj_weight": down_proj_weight, "up_gate_proj_weight_scale": up_gate_proj_weight_scale, - "down_proj_weight_scale": down_proj_weight_scale + "down_proj_weight_scale": down_proj_weight_scale, } for name, tensor in name_tensor_map.items(): create_and_set_parameter(layer, name, tensor) - @paddle.no_grad() def create_weights(self, layer: nn.Layer, state_dict): """ @@ -313,7 +315,6 @@ class GCUWeightOnlyMoEMethod(GCUFusedMoeMethod): up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) self.check(layer, up_gate_proj_weights, down_proj_weights) - def quant_worker(p_group_idx, shared_dict, weights, moe_quant_type, group_size): with CpuGuard(): p_group_size = len(weights) @@ -322,13 +323,13 @@ class GCUWeightOnlyMoEMethod(GCUFusedMoeMethod): quant_weight, scale = weight_quantize_custom_rtn( weights[group_j], moe_quant_type, - group_size # group_size + group_size, # group_size ) shared_dict[p_group_size * p_group_idx + group_j] = ( - quant_weight, scale + quant_weight, + scale, ) - for idx, weight_tensor in enumerate([up_gate_proj_weights, down_proj_weights]): weight_name = self.added_weight_attrs[idx] scale_name = self.added_scale_attrs[idx] @@ -354,7 +355,13 @@ class GCUWeightOnlyMoEMethod(GCUFusedMoeMethod): p = multiprocessing.Process( target=quant_worker, - args=(i, shared_dict, w, self.moe_quant_type, self.group_size) + args=( + i, + shared_dict, + w, + self.moe_quant_type, + self.group_size, + ), ) p.start() processes.append(p) @@ -376,7 +383,7 @@ class GCUWeightOnlyMoEMethod(GCUFusedMoeMethod): quant_weight, scale = weight_quantize_rtn( weight_tensor[i], 
self.moe_quant_type, - self.group_size # group_size + self.group_size, # group_size ) weight_list.append(quant_weight) weight_scale_list.append(scale) @@ -389,7 +396,6 @@ class GCUWeightOnlyMoEMethod(GCUFusedMoeMethod): quanted_weight_zeros = quanted_weight_scale * 8 create_and_set_parameter(layer, zeros_name, quanted_weight_zeros) - def apply( self, layer: nn.Layer, diff --git a/fastdeploy/model_executor/layers/backends/gcu/quantization/__init__.py b/fastdeploy/model_executor/layers/backends/gcu/quantization/__init__.py index b5870b4dc..1c4491507 100644 --- a/fastdeploy/model_executor/layers/backends/gcu/quantization/__init__.py +++ b/fastdeploy/model_executor/layers/backends/gcu/quantization/__init__.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""" +""" " gcu quantization """ from .weight_only import GCUWeightOnlyLinearMethod diff --git a/fastdeploy/model_executor/layers/backends/gcu/quantization/weight_only.py b/fastdeploy/model_executor/layers/backends/gcu/quantization/weight_only.py index d390169fd..896c58369 100644 --- a/fastdeploy/model_executor/layers/backends/gcu/quantization/weight_only.py +++ b/fastdeploy/model_executor/layers/backends/gcu/quantization/weight_only.py @@ -17,7 +17,9 @@ import paddle from fastdeploy.model_executor.layers.quantization.weight_only import ( - WeightOnlyConfig, WeightOnlyLinearMethod) + WeightOnlyConfig, + WeightOnlyLinearMethod, +) from fastdeploy.model_executor.layers.utils import get_tensor from fastdeploy.model_executor.ops.gcu import linear_quant, weight_quantize_rtn @@ -35,7 +37,6 @@ class GCUWeightOnlyLinearMethod(WeightOnlyLinearMethod): self.quant_config = quant_config self.group_size = -1 - def create_weights(self, layer): # The scale shape should be equal to the output dim of weight using Per-Channel Quantization. 
weight_scale_shape = [layer.weight_shape[1]] @@ -50,7 +51,6 @@ class GCUWeightOnlyLinearMethod(WeightOnlyLinearMethod): is_bias=False, ) - def process_prequanted_weights(self, layer, state_dict) -> None: """ Process pre-quantized weights before applying them to the model @@ -62,9 +62,7 @@ class GCUWeightOnlyLinearMethod(WeightOnlyLinearMethod): quant_weight = get_tensor(state_dict.pop(layer.weight_key)) weight_scale = get_tensor(state_dict.pop(layer.weight_scale_key)) layer.weight.set_value(quant_weight) - layer.weight_scale.set_value( - weight_scale.astype(paddle.get_default_dtype())) - + layer.weight_scale.set_value(weight_scale.astype(paddle.get_default_dtype())) def process_loaded_weights(self, layer, weight) -> None: quanted_weight_tensor, weight_scale_tensor = weight_quantize_rtn( @@ -74,9 +72,7 @@ class GCUWeightOnlyLinearMethod(WeightOnlyLinearMethod): ) layer.weight.set_value(quanted_weight_tensor) - layer.weight_scale.set_value( - weight_scale_tensor.astype(paddle.get_default_dtype())) - + layer.weight_scale.set_value(weight_scale_tensor.astype(paddle.get_default_dtype())) @paddle.no_grad() def apply(self, layer, x): diff --git a/fastdeploy/model_executor/layers/backends/npu/__init__.py b/fastdeploy/model_executor/layers/backends/npu/__init__.py index 9aa616224..5f7a59bc8 100644 --- a/fastdeploy/model_executor/layers/backends/npu/__init__.py +++ b/fastdeploy/model_executor/layers/backends/npu/__init__.py @@ -14,4 +14,4 @@ """ npu backend methods -""" \ No newline at end of file +""" diff --git a/fastdeploy/model_executor/layers/backends/xpu/__init__.py b/fastdeploy/model_executor/layers/backends/xpu/__init__.py index 0bba8a09f..d528ebe07 100644 --- a/fastdeploy/model_executor/layers/backends/xpu/__init__.py +++ b/fastdeploy/model_executor/layers/backends/xpu/__init__.py @@ -18,4 +18,4 @@ xpu backend methods from .quantization.weight_only import XPUWeightOnlyLinearMethod -__all__ = ['XPUWeightOnlyLinearMethod'] +__all__ = ["XPUWeightOnlyLinearMethod"] diff --git a/fastdeploy/model_executor/layers/backends/xpu/quantization/weight_only.py b/fastdeploy/model_executor/layers/backends/xpu/quantization/weight_only.py index 36bd87bc0..15f93b911 100644 --- a/fastdeploy/model_executor/layers/backends/xpu/quantization/weight_only.py +++ b/fastdeploy/model_executor/layers/backends/xpu/quantization/weight_only.py @@ -13,11 +13,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" + import paddle from paddle import nn from fastdeploy.model_executor.layers.quantization.weight_only import ( - WeightOnlyConfig, WeightOnlyLinearMethod) + WeightOnlyConfig, + WeightOnlyLinearMethod, +) from fastdeploy.model_executor.ops.xpu import weight_quantize_xpu @@ -48,13 +51,10 @@ class XPUWeightOnlyLinearMethod(WeightOnlyLinearMethod): is_bias=False, ) - def process_loaded_weights(self, layer: nn.Layer, - weight: paddle.Tensor) -> None: + def process_loaded_weights(self, layer: nn.Layer, weight: paddle.Tensor) -> None: """ loaded_weights using xpu special quantization """ - quanted_weight_tensor, weight_scale_tensor = weight_quantize_xpu( - weight, self.quant_config.algo, -1, -1) - layer.weight.set_value( - paddle.transpose(quanted_weight_tensor, [1, 0])) + quanted_weight_tensor, weight_scale_tensor = weight_quantize_xpu(weight, self.quant_config.algo, -1, -1) + layer.weight.set_value(paddle.transpose(quanted_weight_tensor, [1, 0])) layer.weight_scale.set_value(weight_scale_tensor) diff --git a/fastdeploy/model_executor/layers/backends/xpu/utils.py b/fastdeploy/model_executor/layers/backends/xpu/utils.py index ddbd3e2e5..197c7f60d 100644 --- a/fastdeploy/model_executor/layers/backends/xpu/utils.py +++ b/fastdeploy/model_executor/layers/backends/xpu/utils.py @@ -36,7 +36,8 @@ def xpu_clip_and_round(x: np.ndarray) -> np.ndarray: def xpu_quant_qkv_weight( - weight_np: np.ndarray) -> Tuple[paddle.Tensor, paddle.Tensor]: + weight_np: np.ndarray, +) -> Tuple[paddle.Tensor, paddle.Tensor]: """ Quantize the query, key, and value weights for the Transformer model. @@ -65,7 +66,8 @@ def xpu_quant_qkv_weight( def xpu_quant_weight( - weight_np: np.ndarray) -> Tuple[paddle.Tensor, paddle.Tensor]: + weight_np: np.ndarray, +) -> Tuple[paddle.Tensor, paddle.Tensor]: """ Quantize the weight tensor for XPU devices. diff --git a/fastdeploy/model_executor/layers/embeddings.py b/fastdeploy/model_executor/layers/embeddings.py index f1c856604..4e650d6ad 100644 --- a/fastdeploy/model_executor/layers/embeddings.py +++ b/fastdeploy/model_executor/layers/embeddings.py @@ -75,11 +75,10 @@ class VocabParallelEmbedding(nn.Layer): self.embeddings = fleet.meta_parallel.VocabParallelEmbedding( num_embeddings, embedding_dim, - mp_group=fleet.get_hybrid_communicate_group(). - get_model_parallel_group(), + mp_group=fleet.get_hybrid_communicate_group().get_model_parallel_group(), weight_attr=paddle.ParamAttr( - initializer=nn.initializer.Normal( - mean=0.0, std=self.initializer_range), ), + initializer=nn.initializer.Normal(mean=0.0, std=self.initializer_range), + ), ) else: # column cut embedding @@ -94,8 +93,7 @@ class VocabParallelEmbedding(nn.Layer): self.prefix = prefix self.dropout = nn.Dropout(self.hidden_dropout_prob) - def load_state_dict(self, state_dict: Dict[str, - paddle.Tensor | np.ndarray]): + def load_state_dict(self, state_dict: Dict[str, paddle.Tensor | np.ndarray]): """ Load the checkpoint state dictionary into the layer. 
@@ -104,12 +102,12 @@ class VocabParallelEmbedding(nn.Layer): """ if self.tie_word_embeddings: self.embeddings.weight.set_value( - get_tensor(state_dict[self.prefix + ".weight"]).astype( - paddle.get_default_dtype())) + get_tensor(state_dict[self.prefix + ".weight"]).astype(paddle.get_default_dtype()) + ) else: self.embeddings.weight.set_value( - get_tensor(state_dict.pop(self.prefix + ".weight")).astype( - paddle.get_default_dtype())) + get_tensor(state_dict.pop(self.prefix + ".weight")).astype(paddle.get_default_dtype()) + ) def forward(self, ids_remove_padding=None) -> paddle.Tensor: """ @@ -131,8 +129,7 @@ class VocabParallelEmbedding(nn.Layer): paddle.distributed.all_gather( inputs_embeds_temp, input_embedings, - group=fleet.get_hybrid_communicate_group(). - get_model_parallel_group(), + group=fleet.get_hybrid_communicate_group().get_model_parallel_group(), sync_op=True, ) input_embedings = paddle.concat(inputs_embeds_temp, -1) diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py index 324a5eed6..970167ae6 100644 --- a/fastdeploy/model_executor/layers/linear.py +++ b/fastdeploy/model_executor/layers/linear.py @@ -18,8 +18,7 @@ import paddle from paddle import nn from fastdeploy.config import FDConfig -from fastdeploy.distributed.communication_op import \ - tensor_model_parallel_all_reduce +from fastdeploy.distributed.communication_op import tensor_model_parallel_all_reduce from fastdeploy.platforms import current_platform from .utils import _set_var_distributed, divide, get_tensor @@ -57,8 +56,12 @@ class LinearBase(nn.Layer): NotImplementedError: Raised if the current platform is not a CUDA platform. """ super().__init__() - if current_platform.is_cuda() or current_platform.is_xpu( - ) or current_platform.is_iluvatar() or current_platform.is_gcu(): + if ( + current_platform.is_cuda() + or current_platform.is_xpu() + or current_platform.is_iluvatar() + or current_platform.is_gcu() + ): self.forward = self.forward_cuda else: raise NotImplementedError @@ -147,7 +150,7 @@ class LinearBase(nn.Layer): """ # weight self.state_dict = state_dict - assert self.weight_key is not None, 'weight_key should not be None.' + assert self.weight_key is not None, "weight_key should not be None." if self.fd_config.model_config.is_quantized: self.load_prequant_weight(state_dict) else: @@ -155,8 +158,7 @@ class LinearBase(nn.Layer): # bias if self.with_bias: - bias_tensor = paddle.to_tensor( - get_tensor(state_dict.pop(self.bias_key))) + bias_tensor = paddle.to_tensor(get_tensor(state_dict.pop(self.bias_key))) self.bias.set_value(bias_tensor) def forward_cuda(self, x: paddle.Tensor) -> paddle.Tensor: @@ -210,13 +212,15 @@ class ReplicatedLinear(LinearBase): add_bias (bool): Whether to add bias in the current layer or in the pre/post layer. Defaults to False. skip_quant (bool): Whether to skip quantization. Defaults to False. """ - super().__init__(fd_config=fd_config, - prefix=prefix, - input_size=input_size, - output_size=output_size, - with_bias=with_bias, - add_bias=add_bias, - skip_quant=skip_quant) + super().__init__( + fd_config=fd_config, + prefix=prefix, + input_size=input_size, + output_size=output_size, + with_bias=with_bias, + add_bias=add_bias, + skip_quant=skip_quant, + ) self.hidden_size = fd_config.model_config.hidden_size self.weight_shape = [ @@ -259,18 +263,18 @@ class ColumnParallelLinear(LinearBase): add_bias (bool): Whether to add bias in the current layer or in the pre/post layer. Defaults to False. 
skip_quant (bool): Whether to skip quantization. Defaults to False. """ - super().__init__(fd_config=fd_config, - prefix=prefix, - input_size=input_size, - output_size=output_size, - with_bias=with_bias, - add_bias=add_bias, - skip_quant=skip_quant) + super().__init__( + fd_config=fd_config, + prefix=prefix, + input_size=input_size, + output_size=output_size, + with_bias=with_bias, + add_bias=add_bias, + skip_quant=skip_quant, + ) self.nranks = fd_config.parallel_config.tensor_parallel_size self.input_size = input_size - self.output_size = divide( - output_size, - self.nranks) # Split the output_size using TP inference. + self.output_size = divide(output_size, self.nranks) # Split the output_size using TP inference. self.hidden_size = fd_config.model_config.hidden_size self.weight_shape = [ self.input_size, @@ -350,13 +354,15 @@ class MergedColumnParallelLinear(ColumnParallelLinear): self.hidden_size = fd_config.model_config.hidden_size self.nranks = fd_config.parallel_config.tensor_parallel_size - super().__init__(fd_config=fd_config, - prefix=prefix, - input_size=input_size, - output_size=output_size, - with_bias=with_bias, - add_bias=add_bias, - skip_quant=skip_quant) + super().__init__( + fd_config=fd_config, + prefix=prefix, + input_size=input_size, + output_size=output_size, + with_bias=with_bias, + add_bias=add_bias, + skip_quant=skip_quant, + ) def load_state_dict(self, state_dict: dict): """ @@ -366,22 +372,19 @@ class MergedColumnParallelLinear(ColumnParallelLinear): state_dict (dict): A dictionary containing the checkpoint weights and biases. """ # weight - assert self.weight_key is not None, 'weight_key should not be None.' + assert self.weight_key is not None, "weight_key should not be None." if self.weight_key in state_dict.keys(): weight_tensor = get_tensor(state_dict.pop(self.weight_key)) else: - gate_weight_key = self.weight_key.replace("up_gate_proj", - "gate_proj") + gate_weight_key = self.weight_key.replace("up_gate_proj", "gate_proj") up_weight_key = self.weight_key.replace("up_gate_proj", "up_proj") gate_tensor = get_tensor(state_dict.pop(gate_weight_key)) up_tensor = get_tensor(state_dict.pop(up_weight_key)) weight_tensor = paddle.concat([gate_tensor, up_tensor], axis=-1) if self.with_bias: - gate_bias_key = self.bias_key.replace("up_gate_proj", - "gate_proj") - bias_tensor = get_tensor(state_dict.pop(gate_bias_key)).astype( - paddle.get_default_dtype()) + gate_bias_key = self.bias_key.replace("up_gate_proj", "gate_proj") + bias_tensor = get_tensor(state_dict.pop(gate_bias_key)).astype(paddle.get_default_dtype()) state_dict[self.bias_key] = bias_tensor @@ -417,15 +420,16 @@ class QKVParallelLinear(ColumnParallelLinear): output_size = (self.num_heads + 2 * self.nranks) * self.head_dim else: self.kv_num_heads_per_rank = divide(self.kv_num_heads, self.nranks) - output_size = (self.num_heads + - 2 * self.kv_num_heads) * self.head_dim + output_size = (self.num_heads + 2 * self.kv_num_heads) * self.head_dim input_size = self.hidden_size - super().__init__(fd_config=fd_config, - prefix=prefix, - input_size=input_size, - output_size=output_size, - with_bias=with_bias, - add_bias=add_bias) + super().__init__( + fd_config=fd_config, + prefix=prefix, + input_size=input_size, + output_size=output_size, + with_bias=with_bias, + add_bias=add_bias, + ) def load_weight(self, state_dict: dict): """ @@ -445,18 +449,20 @@ class QKVParallelLinear(ColumnParallelLinear): v_tensor = get_tensor(state_dict.pop(v_weight_key)) if self.kv_num_heads < self.nranks: - sharedkv_index = 
(self.fd_config.parallel_config.tensor_parallel_rank * self.kv_num_heads) // self.nranks + sharedkv_index = ( + self.fd_config.parallel_config.tensor_parallel_rank * self.kv_num_heads + ) // self.nranks sharedkv_start = sharedkv_index * self.head_dim sharedkv_end = sharedkv_start + self.head_dim - k_tensor = k_tensor[ : , sharedkv_start : sharedkv_end] - v_tensor = v_tensor[ : , sharedkv_start : sharedkv_end] - weight_tensor = paddle.concat([q_tensor, k_tensor, v_tensor], - axis=-1).transpose([1, 0]) - weight_tensor = weight_tensor.reshape([ - (self.num_heads_per_rank + 2 * self.kv_num_heads_per_rank) * - (self.head_dim), - self.hidden_size, - ]) + k_tensor = k_tensor[:, sharedkv_start:sharedkv_end] + v_tensor = v_tensor[:, sharedkv_start:sharedkv_end] + weight_tensor = paddle.concat([q_tensor, k_tensor, v_tensor], axis=-1).transpose([1, 0]) + weight_tensor = weight_tensor.reshape( + [ + (self.num_heads_per_rank + 2 * self.kv_num_heads_per_rank) * (self.head_dim), + self.hidden_size, + ] + ) weight_tensor = paddle.transpose(weight_tensor, perm=[1, 0]) if self.fd_config.quant_config: @@ -472,7 +478,7 @@ class QKVParallelLinear(ColumnParallelLinear): state_dict (dict): A dictionary containing the checkpoint weights and biases. """ # weight - assert self.weight_key is not None, 'weight_key should not be None.' + assert self.weight_key is not None, "weight_key should not be None." # qkv fused in disk if self.fd_config.model_config.is_quantized: @@ -483,8 +489,7 @@ class QKVParallelLinear(ColumnParallelLinear): # bias if self.with_bias: if self.bias_key in state_dict.keys(): - bias_tensor = paddle.to_tensor( - get_tensor(state_dict.pop(self.bias_key))) + bias_tensor = paddle.to_tensor(get_tensor(state_dict.pop(self.bias_key))) self.bias.set_value(bias_tensor) else: q_bias_key = self.bias_key.replace("qkv_proj", "q_proj") @@ -536,13 +541,15 @@ class RowParallelLinear(LinearBase): add_bias (bool): Whether to add bias in the current layer or in the pre/post layer. Defaults to False. skip_quant (bool): Whether to skip quantization. Defaults to False. 
""" - super().__init__(fd_config=fd_config, - prefix=prefix, - input_size=input_size, - output_size=output_size, - with_bias=with_bias, - add_bias=add_bias, - skip_quant=skip_quant) + super().__init__( + fd_config=fd_config, + prefix=prefix, + input_size=input_size, + output_size=output_size, + with_bias=with_bias, + add_bias=add_bias, + skip_quant=skip_quant, + ) self.fd_config = fd_config self.skip_quant = False self.nranks = fd_config.parallel_config.tensor_parallel_size @@ -672,20 +679,22 @@ class KVBatchLinear(LinearBase): kv_weight_tensor = get_tensor(state_dict[self.weight_key]) # Reshape and split the weight - w = kv_weight_tensor.reshape([ - self.kv_lora_rank, - self.num_heads_per_partition, - -1, - ]).transpose(perm=[1, 2, 0]) + w = kv_weight_tensor.reshape( + [ + self.kv_lora_rank, + self.num_heads_per_partition, + -1, + ] + ).transpose(perm=[1, 2, 0]) # Split into K and V weights # wk_b: [num_heads, qk_nope_head_dim, kv_lora_rank] - wk_b = w[:, :self.qk_nope_head_dim, :] + wk_b = w[:, : self.qk_nope_head_dim, :] if self.v_head_dim is None: raise ValueError("self.v_head_dim should not be None") # wv_b: [num_heads, kv_lora_rank, v_head_dim] - wv_b = w[:, -self.v_head_dim:, :].transpose(perm=[0, 2, 1]) + wv_b = w[:, -self.v_head_dim :, :].transpose(perm=[0, 2, 1]) # Create K projection weight self.k_b_proj_weight = self.create_parameter( @@ -733,9 +742,7 @@ class KVBatchLinear(LinearBase): out = paddle.bmm(x, self.v_b_proj_weight) return out - def forward_cuda(self, - x: paddle.Tensor, - proj_type: str = 'k') -> paddle.Tensor: + def forward_cuda(self, x: paddle.Tensor, proj_type: str = "k") -> paddle.Tensor: """ Forward function that can handle both K and V projections @@ -746,9 +753,9 @@ class KVBatchLinear(LinearBase): Returns: Projection output """ - if proj_type == 'k': + if proj_type == "k": return self.forward_k_b(x) - elif proj_type == 'v': + elif proj_type == "v": return self.forward_v_b(x) else: raise ValueError(f"proj_type must be 'k' or 'v', got {proj_type}") diff --git a/fastdeploy/model_executor/layers/lm_head.py b/fastdeploy/model_executor/layers/lm_head.py index 188c25c19..4b8b96839 100644 --- a/fastdeploy/model_executor/layers/lm_head.py +++ b/fastdeploy/model_executor/layers/lm_head.py @@ -77,11 +77,9 @@ class ParallelLMHead(nn.Layer): self.linear = ColumnParallelLinear( embedding_dim, num_embeddings, - mp_group=fleet.get_hybrid_communicate_group(). - get_model_parallel_group(), + mp_group=fleet.get_hybrid_communicate_group().get_model_parallel_group(), weight_attr=None, - has_bias=True - if self.bias_key is not None else False, + has_bias=True if self.bias_key is not None else False, gather_output=need_gather, fuse_matmul_bias=False, # False diff更小 ) @@ -89,17 +87,14 @@ class ParallelLMHead(nn.Layer): self.linear = RowParallelLinear( embedding_dim, num_embeddings, - mp_group=fleet.get_hybrid_communicate_group(). - get_model_parallel_group(), + mp_group=fleet.get_hybrid_communicate_group().get_model_parallel_group(), weight_attr=None, - has_bias=True - if self.bias_key is not None else False, + has_bias=True if self.bias_key is not None else False, input_is_parallel=False, fuse_matmul_bias=False, # False diff更小 ) - def load_state_dict(self, state_dict: Dict[str, - paddle.Tensor | np.ndarray]): + def load_state_dict(self, state_dict: Dict[str, paddle.Tensor | np.ndarray]): """ Load the checkpoint state dictionary into the layer. 
@@ -108,25 +103,20 @@ class ParallelLMHead(nn.Layer): """ if self.use_ep: - self.weight.set_value( - get_tensor(state_dict.pop(self.weight_key)).astype( - paddle.get_default_dtype())) + self.weight.set_value(get_tensor(state_dict.pop(self.weight_key)).astype(paddle.get_default_dtype())) else: if self.tie_word_embeddings: self.linear.weight.set_value( - get_tensor(state_dict.pop(self.weight_key)).astype( - paddle.get_default_dtype()).transpose([1, 0])) + get_tensor(state_dict.pop(self.weight_key)).astype(paddle.get_default_dtype()).transpose([1, 0]) + ) else: - weight_tensor = get_tensor( - state_dict.pop(self.weight_key)).astype( - paddle.get_default_dtype()) + weight_tensor = get_tensor(state_dict.pop(self.weight_key)).astype(paddle.get_default_dtype()) if self.linear.weight.shape != weight_tensor.shape: weight_tensor = weight_tensor.transpose([1, 0]) self.linear.weight.set_value(weight_tensor) if self.bias_key is not None: - bias = get_tensor(state_dict.pop(self.bias_key)).astype( - paddle.get_default_dtype()) + bias = get_tensor(state_dict.pop(self.bias_key)).astype(paddle.get_default_dtype()) self.linear.bias.set_value(bias) def forward(self, input: paddle.Tensor) -> paddle.Tensor: diff --git a/fastdeploy/model_executor/layers/moe/__init__.py b/fastdeploy/model_executor/layers/moe/__init__.py index c47eb28eb..67b56a5b2 100644 --- a/fastdeploy/model_executor/layers/moe/__init__.py +++ b/fastdeploy/model_executor/layers/moe/__init__.py @@ -12,12 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .fused_moe_cutlass_backend import (CutlassW4A8MoEMethod, - CutlassWeightOnlyMoEMethod) +from .fused_moe_cutlass_backend import CutlassW4A8MoEMethod, CutlassWeightOnlyMoEMethod from .fused_moe_triton_backend import TritonWeightOnlyMoEMethod from .moe import FusedMoE __all__ = [ - CutlassWeightOnlyMoEMethod, CutlassW4A8MoEMethod, FusedMoE, - TritonWeightOnlyMoEMethod + CutlassWeightOnlyMoEMethod, + CutlassW4A8MoEMethod, + FusedMoE, + TritonWeightOnlyMoEMethod, ] diff --git a/fastdeploy/model_executor/layers/moe/ep.py b/fastdeploy/model_executor/layers/moe/ep.py index 0590c118e..4595ec00c 100644 --- a/fastdeploy/model_executor/layers/moe/ep.py +++ b/fastdeploy/model_executor/layers/moe/ep.py @@ -20,6 +20,7 @@ import paddle from paddle import nn from paddle.base.core import Config from paddleformers.utils.log import logger + try: from paddle.distributed.communication import deep_ep except: @@ -103,10 +104,12 @@ class DeepEPEngine: self.num_experts, ) # Allocate a buffer if not existed or not enough buffer size - if (self.deepep_engine is None - or self.deepep_engine.group != self.group - or not self.deepep_engine.low_latency_mode - or self.deepep_engine.num_rdma_bytes < num_rdma_bytes): + if ( + self.deepep_engine is None + or self.deepep_engine.group != self.group + or not self.deepep_engine.low_latency_mode + or self.deepep_engine.num_rdma_bytes < num_rdma_bytes + ): # NOTES: for best performance, the QP number **must** be equal to the number of the local experts assert self.num_experts % self.ep_size == 0 self.deepep_engine = deep_ep.Buffer( @@ -140,13 +143,7 @@ class DeepEPEngine: event: the event after executing the kernel (valid only if `async_finish` is set). hook: the receiving hook function (valid only if `return_recv_hook` is set). 
""" - ( - packed_recv_x, - recv_expert_count, - handle, - _, - dispatch_hook, - ) = self.deepep_engine.low_latency_dispatch( + (packed_recv_x, recv_expert_count, handle, _, dispatch_hook,) = self.deepep_engine.low_latency_dispatch( hidden_states, topk_idx, expertwise_scale, @@ -172,15 +169,14 @@ class DeepEPEngine: combined_hidden_states: [num_tokens, hidden] """ - combined_hidden_states, _, combine_hook = ( - self.deepep_engine.low_latency_combine( - hidden_states, - topk_idx, - topk_weights, - handle, - async_finish=False, - return_recv_hook=True, - )) + combined_hidden_states, _, combine_hook = self.deepep_engine.low_latency_combine( + hidden_states, + topk_idx, + topk_weights, + handle, + async_finish=False, + return_recv_hook=True, + ) return combined_hidden_states, combine_hook def clean_low_latency_buffer(self): @@ -188,8 +184,8 @@ class DeepEPEngine: clean_low_latency_buffer """ self.deepep_engine.clean_low_latency_buffer( - self.num_max_dispatch_tokens_per_rank, self.hidden, - self.num_experts) + self.num_max_dispatch_tokens_per_rank, self.hidden, self.num_experts + ) def barrier_all(self): """ @@ -203,14 +199,16 @@ class EPRunner: EPRunnerBase """ - def __init__(self, - top_k: int, - hidden: int, - num_experts: int, - moe_phase: MoEPhase, - num_max_dispatch_tokens_per_rank: int = 1, - ep_size: int = 1, - ep_rank: int = 0): + def __init__( + self, + top_k: int, + hidden: int, + num_experts: int, + moe_phase: MoEPhase, + num_max_dispatch_tokens_per_rank: int = 1, + ep_size: int = 1, + ep_rank: int = 0, + ): self.top_k = top_k self.num_experts = num_experts self.ep_engine = DeepEPEngine( @@ -255,24 +253,38 @@ class EPPrefillRunner(EPRunner): EPPrefillRunner """ - def __init__(self, - top_k: int, - hidden: int, - num_experts: int, - ep_size: int = 1, - ep_rank: int = 0): - super().__init__(top_k, - hidden, - num_experts, - MoEPhase.PREFILL, - ep_size=ep_size, - ep_rank=ep_rank) + def __init__( + self, + top_k: int, + hidden: int, + num_experts: int, + ep_size: int = 1, + ep_rank: int = 0, + ): + super().__init__( + top_k, + hidden, + num_experts, + MoEPhase.PREFILL, + ep_size=ep_size, + ep_rank=ep_rank, + ) - def dispatch(self, x: paddle.Tensor, topk_idx: paddle.Tensor, - topk_weights: paddle.Tensor, *args, **kwargs): - (num_tokens_per_rank, _, num_tokens_per_expert, is_token_in_rank, - _) = self.ep_engine.deepep_engine.get_dispatch_layout( - topk_idx, self.num_experts) + def dispatch( + self, + x: paddle.Tensor, + topk_idx: paddle.Tensor, + topk_weights: paddle.Tensor, + *args, + **kwargs, + ): + ( + num_tokens_per_rank, + _, + num_tokens_per_expert, + is_token_in_rank, + _, + ) = self.ep_engine.deepep_engine.get_dispatch_layout(topk_idx, self.num_experts) x_scale_tensor = kwargs.get("x_scale_tensor", None) dispatch_args = { @@ -287,8 +299,12 @@ class EPPrefillRunner(EPRunner): } return self.ep_engine.deepep_engine.dispatch(**dispatch_args) - def combine(self, tmp_ffn_out: paddle.Tensor, handle: tuple, - recv_topk_weights: paddle.Tensor): + def combine( + self, + tmp_ffn_out: paddle.Tensor, + handle: tuple, + recv_topk_weights: paddle.Tensor, + ): combine_args = { "x": tmp_ffn_out, "handle": handle, @@ -296,8 +312,7 @@ class EPPrefillRunner(EPRunner): "async_finish": self.ep_engine.async_finish, "topk_weights": recv_topk_weights, } - fused_moe_out, _, _ = (self.ep_engine.deepep_engine.combine( - **combine_args)) + fused_moe_out, _, _ = self.ep_engine.deepep_engine.combine(**combine_args) return fused_moe_out @@ -307,36 +322,46 @@ class EPDecoderRunner(EPRunner): EPPrefillRunner """ 
- def __init__(self, - top_k: int, - hidden: int, - num_experts: int, - num_max_dispatch_tokens_per_rank: int, - ep_size: int = 1, - ep_rank: int = 0): - super().__init__(top_k, - hidden, - num_experts, - MoEPhase.DECODER, - num_max_dispatch_tokens_per_rank, - ep_size=ep_size, - ep_rank=ep_rank) + def __init__( + self, + top_k: int, + hidden: int, + num_experts: int, + num_max_dispatch_tokens_per_rank: int, + ep_size: int = 1, + ep_rank: int = 0, + ): + super().__init__( + top_k, + hidden, + num_experts, + MoEPhase.DECODER, + num_max_dispatch_tokens_per_rank, + ep_size=ep_size, + ep_rank=ep_rank, + ) - def dispatch(self, x: paddle.Tensor, topk_idx: paddle.Tensor, - topk_weights: paddle.Tensor, *args, **kwargs): + def dispatch( + self, + x: paddle.Tensor, + topk_idx: paddle.Tensor, + topk_weights: paddle.Tensor, + *args, + **kwargs, + ): expertwise_scale = kwargs.get("expertwise_scale", None) use_fp8 = kwargs.get("use_fp8", False) - recv_hidden_states, recv_expert_count, handle, dispatch_hook = ( - self.ep_engine.low_latency_dispatch(x, topk_idx, expertwise_scale, - use_fp8)) + recv_hidden_states, recv_expert_count, handle, dispatch_hook = self.ep_engine.low_latency_dispatch( + x, topk_idx, expertwise_scale, use_fp8 + ) if dispatch_hook is not None: dispatch_hook() return recv_hidden_states, recv_expert_count, handle def combine(self, ffn_out, topk_idx, topk_weights, handle): - # TODO(@wufeisheng): Delete them when deepep in PaddlePaddle is fixed + # TODO(@wufeisheng): Delete them when deepep in PaddlePaddle is fixed ( src_info, layout_range, @@ -353,7 +378,8 @@ class EPDecoderRunner(EPRunner): ) combined_hidden_states, combine_hook = self.ep_engine.low_latency_combine( - ffn_out, topk_idx, topk_weights, handle) + ffn_out, topk_idx, topk_weights, handle + ) if combine_hook is not None: combine_hook() diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py b/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py index 874a90cca..0f65f45d8 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py @@ -25,8 +25,7 @@ from ..quantization.quant_base import QuantMethodBase class MoEMethodBase(QuantMethodBase): - """ - """ + """ """ def __init__(self, quant_config): super().__init__() @@ -36,7 +35,8 @@ class MoEMethodBase(QuantMethodBase): self.quant_config = quant_config self.added_weight_attrs = ["up_gate_proj_weight", "down_proj_weight"] self.added_scale_attrs = [ - "up_gate_proj_weight_scale", "down_proj_weight_scale" + "up_gate_proj_weight_scale", + "down_proj_weight_scale", ] self.pack_num = 1 @@ -47,15 +47,25 @@ class MoEMethodBase(QuantMethodBase): if layer.ep_size > 1: if layer.fd_config.parallel_config.moe_phase == MoEPhase.DECODER: from .ep import EPDecoderRunner + self.ep_decoder_runner = EPDecoderRunner( - layer.top_k, layer.hidden_size, layer.num_experts, + layer.top_k, + layer.hidden_size, + layer.num_experts, layer.fd_config.model_config.num_max_dispatch_tokens_per_rank, - layer.ep_size, layer.ep_rank) + layer.ep_size, + layer.ep_rank, + ) else: from .ep import EPPrefillRunner + self.ep_prefill_runner = EPPrefillRunner( - layer.top_k, layer.hidden_size, layer.num_experts, - layer.ep_size, layer.ep_rank) + layer.top_k, + layer.hidden_size, + layer.num_experts, + layer.ep_size, + layer.ep_rank, + ) def process_loaded_weights(self, layer, weights) -> None: """ @@ -68,10 +78,12 @@ class MoEMethodBase(QuantMethodBase): check layer is valid for this method """ assert 
up_gate_proj_weights[0].shape == [ - layer.hidden_size // self.pack_num, layer.moe_intermediate_size * 2 + layer.hidden_size // self.pack_num, + layer.moe_intermediate_size * 2, ] assert down_proj_weights[0].shape == [ - layer.moe_intermediate_size // self.pack_num, layer.hidden_size + layer.moe_intermediate_size // self.pack_num, + layer.hidden_size, ] @abstractmethod diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py index 99ddb68cc..67a87cc22 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py @@ -20,25 +20,34 @@ from paddle.nn.quant import weight_quantize from paddleformers.utils.log import logger import fastdeploy -from fastdeploy.distributed.communication_op import \ - tensor_model_parallel_all_reduce +from fastdeploy.distributed.communication_op import tensor_model_parallel_all_reduce from fastdeploy.platforms import current_platform from ..utils import create_and_set_parameter, get_tensor from .fused_moe_backend_base import MoEMethodBase if current_platform.is_cuda() and not current_platform.is_dcu(): - from fastdeploy.model_executor.ops.gpu import (moe_expert_dispatch, - moe_expert_reduce, noaux_tc) + from fastdeploy.model_executor.ops.gpu import ( + moe_expert_dispatch, + moe_expert_reduce, + noaux_tc, + ) elif current_platform.is_iluvatar(): - from fastdeploy.model_executor.ops.iluvatar import (moe_expert_dispatch, - moe_expert_reduce) + from fastdeploy.model_executor.ops.iluvatar import ( + moe_expert_dispatch, + moe_expert_reduce, + ) # used for deepseek_v3 -def get_moe_scores(gating_output: paddle.Tensor, n_group, topk_group, top_k, - routed_scaling_factor, - e_score_correction_bias) -> paddle.Tensor: +def get_moe_scores( + gating_output: paddle.Tensor, + n_group, + topk_group, + top_k, + routed_scaling_factor, + e_score_correction_bias, +) -> paddle.Tensor: """ compute moe scores using e_score_correction_bias. 
""" @@ -69,16 +78,17 @@ class CutlassMoEMethod(MoEMethodBase): up_gate_proj_weights, down_proj_weights = layer.extract_moe_ffn_weights(state_dict) stacked_up_gate_proj_weights = paddle.stack(up_gate_proj_weights, axis=0) stacked_down_proj_weights = paddle.stack(down_proj_weights, axis=0) - for idx, weight_tensor in enumerate( - [stacked_up_gate_proj_weights, stacked_down_proj_weights]): + for idx, weight_tensor in enumerate([stacked_up_gate_proj_weights, stacked_down_proj_weights]): weight_name = self.added_weight_attrs[idx] setattr( - layer, weight_name, + layer, + weight_name, layer.create_parameter( shape=weight_tensor.shape, dtype=weight_tensor.dtype, default_initializer=paddle.nn.initializer.Constant(0), - )) + ), + ) getattr(layer, weight_name).set_value(weight_tensor) def compute_ffn( @@ -99,12 +109,9 @@ class CutlassMoEMethod(MoEMethodBase): layer.up_gate_proj_weight, layer.down_proj_weight, None, - (layer.up_gate_proj_weight_scale if hasattr( - layer, "up_gate_proj_weight_scale") else None), - (layer.down_proj_weight_scale if hasattr( - layer, "down_proj_weight_scale") else None), - (layer.down_proj_in_scale - if hasattr(layer, "down_proj_in_scale") else None), + (layer.up_gate_proj_weight_scale if hasattr(layer, "up_gate_proj_weight_scale") else None), + (layer.down_proj_weight_scale if hasattr(layer, "down_proj_weight_scale") else None), + (layer.down_proj_in_scale if hasattr(layer, "down_proj_in_scale") else None), expert_idx_per_token, self.moe_quant_type, used_in_ep_low_latency, @@ -115,12 +122,9 @@ class CutlassMoEMethod(MoEMethodBase): layer.up_gate_proj_weight, layer.down_proj_weight, None, - (layer.up_gate_proj_weight_scale - if hasattr(layer, "up_gate_proj_weight_scale") else None), - (layer.down_proj_weight_scale - if hasattr(layer, "down_proj_weight_scale") else None), - (layer.down_proj_in_scale - if hasattr(layer, "down_proj_in_scale") else None), + (layer.up_gate_proj_weight_scale if hasattr(layer, "up_gate_proj_weight_scale") else None), + (layer.down_proj_weight_scale if hasattr(layer, "down_proj_weight_scale") else None), + (layer.down_proj_in_scale if hasattr(layer, "down_proj_in_scale") else None), expert_idx_per_token, self.moe_quant_type, used_in_ep_low_latency, @@ -136,8 +140,7 @@ class CutlassMoEMethod(MoEMethodBase): Apply the EP prefill method. """ # 1. Select topk experts and weights - topk_idx, topk_weights = self.ep_prefill_runner.moe_select( - layer, gate_out) + topk_idx, topk_weights = self.ep_prefill_runner.moe_select(layer, gate_out) # 2. EP Dispatch ( recv_x, @@ -164,8 +167,7 @@ class CutlassMoEMethod(MoEMethodBase): recv_x, recv_topk_idx, recv_topk_weights, - (self.up_gate_proj_in_scale - if hasattr(self, "up_gate_proj_in_scale") else None), + (self.up_gate_proj_in_scale if hasattr(self, "up_gate_proj_in_scale") else None), recv_num_tokens_per_expert_list, token_all_num, self.moe_quant_type, @@ -177,9 +179,12 @@ class CutlassMoEMethod(MoEMethodBase): else: expert_idx_per_token = expert_idx_per_token.cast("int64") - ffn_out = self.compute_ffn(layer, permute_input, - recv_num_tokens_per_expert_list_cumsum, - expert_idx_per_token) + ffn_out = self.compute_ffn( + layer, + permute_input, + recv_num_tokens_per_expert_list_cumsum, + expert_idx_per_token, + ) # prmt back per rank tmp_ffn_out = fastdeploy.model_executor.ops.gpu.ep_moe_expert_combine( @@ -195,8 +200,7 @@ class CutlassMoEMethod(MoEMethodBase): tmp_ffn_out = recv_x # 4. 
EP combine - return self.ep_prefill_runner.combine(tmp_ffn_out, handle, - recv_topk_weights) + return self.ep_prefill_runner.combine(tmp_ffn_out, handle, recv_topk_weights) def apply_ep_decode( self, @@ -208,28 +212,28 @@ class CutlassMoEMethod(MoEMethodBase): Apply the EP decoder method. """ # 1. Select topk experts and weights - topk_idx, topk_weights = self.ep_decoder_runner.moe_select( - layer, gate_out) + topk_idx, topk_weights = self.ep_decoder_runner.moe_select(layer, gate_out) # 2. EP Dispatch - permute_input, token_nums_per_expert, handle = self.ep_decoder_runner.dispatch( - x, topk_idx, topk_weights) + permute_input, token_nums_per_expert, handle = self.ep_decoder_runner.dispatch(x, topk_idx, topk_weights) # 3. Compute ffn if self.moe_quant_type == "w4a8": num_local_experts, max_num, _ = permute_input.shape - expert_idx_per_token = paddle.arange( - num_local_experts)[:, None].tile([1, max_num]) + expert_idx_per_token = paddle.arange(num_local_experts)[:, None].tile([1, max_num]) elif self.moe_quant_type in ["weight_only_int8", "weight_only_int4"]: expert_idx_per_token = None else: raise NotImplementedError - ffn_out = self.compute_ffn(layer, permute_input, - token_nums_per_expert.cast("int64"), - expert_idx_per_token, True) + ffn_out = self.compute_ffn( + layer, + permute_input, + token_nums_per_expert.cast("int64"), + expert_idx_per_token, + True, + ) # 4. EP combine - return self.ep_decoder_runner.combine(ffn_out, topk_idx, topk_weights, - handle) + return self.ep_decoder_runner.combine(ffn_out, topk_idx, topk_weights, handle) def apply_tp( self, @@ -241,10 +245,14 @@ class CutlassMoEMethod(MoEMethodBase): Paddle Cutlass compute Fused MoE. """ if layer.topk_method == "noaux_tc": - gate_out = get_moe_scores(gate_out, layer.n_group, - layer.topk_group, layer.top_k, - layer.routed_scaling_factor, - layer.gate_correction_bias) + gate_out = get_moe_scores( + gate_out, + layer.n_group, + layer.topk_group, + layer.top_k, + layer.routed_scaling_factor, + layer.gate_correction_bias, + ) ( permute_input, @@ -257,8 +265,9 @@ class CutlassMoEMethod(MoEMethodBase): x, gate_out, None, # Use layer.gate_correction_bias in get_moe_scores. 
- (layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") - else None), # if set, permute_input will be int8_t + ( + layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") else None + ), # if set, permute_input will be int8_t layer.top_k, False, topk_only_mode=True, @@ -275,8 +284,9 @@ class CutlassMoEMethod(MoEMethodBase): x, gate_out, layer.gate_correction_bias, - (layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") - else None), # if set, permute_input will be int8_t + ( + layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") else None + ), # if set, permute_input will be int8_t layer.top_k, False, topk_only_mode=False, @@ -289,8 +299,7 @@ class CutlassMoEMethod(MoEMethodBase): else: expert_idx_per_token = expert_idx_per_token.cast("int64") - ffn_out = self.compute_ffn(layer, permute_input, token_nums_per_expert, - expert_idx_per_token) + ffn_out = self.compute_ffn(layer, permute_input, token_nums_per_expert, expert_idx_per_token) # reduce 中会做 topk 个 weight 的 norm 和 routed_scaling_factor fused_moe_out = moe_expert_reduce( @@ -330,17 +339,14 @@ class CutlassW4A8MoEMethod(CutlassMoEMethod): weight_name = self.added_weight_attrs[idx] weight_list = [] for i in range(layer.num_local_experts): - quant_weight, scale = weight_quantize(weight_tensor[i], - algo=self.moe_quant_type, - arch=80) + quant_weight, scale = weight_quantize(weight_tensor[i], algo=self.moe_quant_type, arch=80) weight_list.append(quant_weight) quanted_weight = paddle.stack(weight_list, axis=0) create_and_set_parameter(layer, weight_name, quanted_weight) self.create_w4a8_scale_weights(layer, layer.weight_key_map, state_dict) - def create_w4a8_scale_weights(self, layer: nn.Layer, weight_key_map: dict, - state_dict: dict): + def create_w4a8_scale_weights(self, layer: nn.Layer, weight_key_map: dict, state_dict: dict): """ Get w4a8 weights from state dict and process them. Args: @@ -357,13 +363,14 @@ class CutlassW4A8MoEMethod(CutlassMoEMethod): create_and_set_parameter(layer, name, processed_in_scale) return processed_in_scale - def _process_weight_scale(name: str, - weight_scales: list[paddle.Tensor], - processed_in_scale: paddle.Tensor): - processed_weight_scale = (paddle.stack(weight_scales, axis=0) / - (127 * 112) / - processed_in_scale[:, None]).cast( - paddle.get_default_dtype()) + def _process_weight_scale( + name: str, + weight_scales: list[paddle.Tensor], + processed_in_scale: paddle.Tensor, + ): + processed_weight_scale = ( + paddle.stack(weight_scales, axis=0) / (127 * 112) / processed_in_scale[:, None] + ).cast(paddle.get_default_dtype()) create_and_set_parameter(layer, name, processed_weight_scale) # 1. 
Init scale containers and maps @@ -379,42 +386,34 @@ class CutlassW4A8MoEMethod(CutlassMoEMethod): "down_proj_in_scale": down_proj_in_scales, } scale_key_map = { - "up_gate_proj_weight_scale": - weight_key_map.get("up_gate_proj_expert_weight_scale_key", None), - "down_proj_weight_scale": - weight_key_map.get("down_proj_expert_weight_scale_key", None), - "up_gate_proj_in_scale": - weight_key_map.get("up_gate_proj_expert_in_scale_key", None), - "down_proj_in_scale": - weight_key_map.get("down_proj_expert_in_scale_key", None), + "up_gate_proj_weight_scale": weight_key_map.get("up_gate_proj_expert_weight_scale_key", None), + "down_proj_weight_scale": weight_key_map.get("down_proj_expert_weight_scale_key", None), + "up_gate_proj_in_scale": weight_key_map.get("up_gate_proj_expert_in_scale_key", None), + "down_proj_in_scale": weight_key_map.get("down_proj_expert_in_scale_key", None), } for name, value in scale_key_map.items(): if value is None: - raise ValueError( - f"scale {name} should not be none in w4a8 mode.") + raise ValueError(f"scale {name} should not be none in w4a8 mode.") # 2. Extract scale tensor from state dict for local_expert_idx in range(layer.num_local_experts): expert_idx = local_expert_idx + layer.expert_id_offset * layer.num_local_experts for name, scale_key_template in scale_key_map.items(): - scale_tensor = _extract_scale_tensor(state_dict, - scale_key_template, - expert_idx) + scale_tensor = _extract_scale_tensor(state_dict, scale_key_template, expert_idx) scale_weight_map[name].append(scale_tensor) # 3. Process scale tensor and set to layer in_scales = [] for in_scale_name in ["up_gate_proj_in_scale", "down_proj_in_scale"]: - in_scales.append( - _process_in_scale(in_scale_name, - scale_weight_map[in_scale_name])) + in_scales.append(_process_in_scale(in_scale_name, scale_weight_map[in_scale_name])) - for i, weight_scale_name in enumerate( - ["up_gate_proj_weight_scale", "down_proj_weight_scale"]): - _process_weight_scale(weight_scale_name, - scale_weight_map[weight_scale_name], - in_scales[i]) + for i, weight_scale_name in enumerate(["up_gate_proj_weight_scale", "down_proj_weight_scale"]): + _process_weight_scale( + weight_scale_name, + scale_weight_map[weight_scale_name], + in_scales[i], + ) class CutlassWeightOnlyMoEMethod(CutlassMoEMethod): @@ -432,30 +431,27 @@ class CutlassWeightOnlyMoEMethod(CutlassMoEMethod): """ Paddle cutlass process prequanted weights. 
""" - up_gate_proj_expert_weight_key = layer.weight_key_map.get( - "up_gate_proj_expert_weight_key", None) - down_proj_expert_weight_key = layer.weight_key_map.get( - "down_proj_expert_weight_key", None) - up_gate_proj_expert_weight_scale_key = layer.weight_key_map.get( - "up_gate_proj_expert_weight_scale_key", None) - down_proj_expert_weight_scale_key = layer.weight_key_map.get( - "down_proj_expert_weight_scale_key", None) + up_gate_proj_expert_weight_key = layer.weight_key_map.get("up_gate_proj_expert_weight_key", None) + down_proj_expert_weight_key = layer.weight_key_map.get("down_proj_expert_weight_key", None) + up_gate_proj_expert_weight_scale_key = layer.weight_key_map.get("up_gate_proj_expert_weight_scale_key", None) + down_proj_expert_weight_scale_key = layer.weight_key_map.get("down_proj_expert_weight_scale_key", None) up_gate_proj_weights, down_proj_weights = layer.load_experts_weight( - state_dict, up_gate_proj_expert_weight_key, down_proj_expert_weight_key) + state_dict, + up_gate_proj_expert_weight_key, + down_proj_expert_weight_key, + ) # self.check(layer, up_gate_proj_weights, down_proj_weights) up_gate_proj_weight_scale = [] down_proj_weight_scale = [] for i in range(layer.num_local_experts): expert_idx = layer.expert_id_offset + i up_gate_proj_weight_scale.append( - get_tensor( - state_dict.pop( - up_gate_proj_expert_weight_scale_key.format(expert_idx)))) + get_tensor(state_dict.pop(up_gate_proj_expert_weight_scale_key.format(expert_idx))) + ) down_proj_weight_scale.append( - get_tensor( - state_dict.pop( - down_proj_expert_weight_scale_key.format(expert_idx)))) + get_tensor(state_dict.pop(down_proj_expert_weight_scale_key.format(expert_idx))) + ) up_gate_proj_weight = paddle.stack(up_gate_proj_weights, axis=0) down_proj_weight = paddle.stack(down_proj_weights, axis=0) @@ -466,7 +462,7 @@ class CutlassWeightOnlyMoEMethod(CutlassMoEMethod): "up_gate_proj_weight": up_gate_proj_weight, "down_proj_weight": down_proj_weight, "up_gate_proj_weight_scale": up_gate_proj_weight_scale, - "down_proj_weight_scale": down_proj_weight_scale + "down_proj_weight_scale": down_proj_weight_scale, } for name, tensor in name_tensor_map.items(): create_and_set_parameter(layer, name, tensor) @@ -485,8 +481,7 @@ class CutlassWeightOnlyMoEMethod(CutlassMoEMethod): weight_list = [] weight_scale_list = [] for i in range(layer.num_local_experts): - quant_weight, scale = weight_quantize(weight_tensor[i], - algo=self.moe_quant_type) + quant_weight, scale = weight_quantize(weight_tensor[i], algo=self.moe_quant_type) weight_list.append(quant_weight) weight_scale_list.append(scale) quanted_weight = paddle.stack(weight_list, axis=0) diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py index 62f795b54..b80db3114 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py @@ -19,11 +19,9 @@ from paddle import nn from paddleformers.utils.log import logger import fastdeploy -import fastdeploy.model_executor.ops.gpu.deep_gemm as deep_gemm -from fastdeploy.distributed.communication_op import \ - tensor_model_parallel_all_reduce +from fastdeploy.distributed.communication_op import tensor_model_parallel_all_reduce from fastdeploy.model_executor.layers.utils import get_tensor -from fastdeploy.model_executor.ops.gpu import count_tokens_per_expert_func +from fastdeploy.model_executor.ops.gpu import count_tokens_per_expert_func, 
deep_gemm from ..utils import create_and_set_parameter from .fused_moe_backend_base import MoEMethodBase @@ -50,10 +48,9 @@ class DeepGemmFusedMoeMethod(MoEMethodBase): weight_list = [] weight_scale_list = [] for i in range(layer.num_local_experts): - from fastdeploy.model_executor.layers.utils import \ - per_block_cast_to_fp8 - quant_weight, scale = per_block_cast_to_fp8( - weight_tensor[i], self.quant_config.weight_block_size) + from fastdeploy.model_executor.layers.utils import per_block_cast_to_fp8 + + quant_weight, scale = per_block_cast_to_fp8(weight_tensor[i], self.quant_config.weight_block_size) weight_list.append(quant_weight) weight_scale_list.append(scale) @@ -62,41 +59,41 @@ class DeepGemmFusedMoeMethod(MoEMethodBase): create_and_set_parameter(layer, weight_name, quanted_weight) quanted_weight_scale = paddle.stack(weight_scale_list, axis=0) - quanted_weight_scale = quanted_weight_scale.transpose( - [0, 2, 1]).contiguous() + quanted_weight_scale = quanted_weight_scale.transpose([0, 2, 1]).contiguous() create_and_set_parameter(layer, scale_name, quanted_weight_scale) def process_prequanted_weights(self, layer: nn.Layer, state_dict): """ Paddle cutlass process prequanted weights. """ - up_gate_proj_expert_weight_key = layer.weight_key_map.get( - "up_gate_proj_expert_weight_key", None) - down_proj_expert_weight_key = layer.weight_key_map.get( - "down_proj_expert_weight_key", None) - up_gate_proj_expert_weight_scale_key = layer.weight_key_map.get( - "up_gate_proj_expert_weight_scale_key", None) - down_proj_expert_weight_scale_key = layer.weight_key_map.get( - "down_proj_expert_weight_scale_key", None) + up_gate_proj_expert_weight_key = layer.weight_key_map.get("up_gate_proj_expert_weight_key", None) + down_proj_expert_weight_key = layer.weight_key_map.get("down_proj_expert_weight_key", None) + up_gate_proj_expert_weight_scale_key = layer.weight_key_map.get("up_gate_proj_expert_weight_scale_key", None) + down_proj_expert_weight_scale_key = layer.weight_key_map.get("down_proj_expert_weight_scale_key", None) up_gate_proj_weights, down_proj_weights = layer.load_experts_weight( - state_dict, up_gate_proj_expert_weight_key, down_proj_expert_weight_key) + state_dict, + up_gate_proj_expert_weight_key, + down_proj_expert_weight_key, + ) # self.check(layer, up_gate_proj_weights, down_proj_weights) up_gate_proj_weight_scale = [] down_proj_weight_scale = [] for i in range(layer.num_local_experts): expert_idx = layer.expert_id_offset + i up_gate_proj_weight_scale.append( - get_tensor( - state_dict.pop( - up_gate_proj_expert_weight_scale_key.format(expert_idx)))) + get_tensor(state_dict.pop(up_gate_proj_expert_weight_scale_key.format(expert_idx))) + ) down_proj_weight_scale.append( - get_tensor( - state_dict.pop( - down_proj_expert_weight_scale_key.format(expert_idx)))) + get_tensor(state_dict.pop(down_proj_expert_weight_scale_key.format(expert_idx))) + ) - up_gate_proj_weight = paddle.stack(up_gate_proj_weights, axis=0).transpose([0, 2, 1]).contiguous().view("float8_e4m3fn") - down_proj_weight = paddle.stack(down_proj_weights, axis=0).transpose([0, 2, 1]).contiguous().view("float8_e4m3fn") + up_gate_proj_weight = ( + paddle.stack(up_gate_proj_weights, axis=0).transpose([0, 2, 1]).contiguous().view("float8_e4m3fn") + ) + down_proj_weight = ( + paddle.stack(down_proj_weights, axis=0).transpose([0, 2, 1]).contiguous().view("float8_e4m3fn") + ) up_gate_proj_weight_scale = paddle.stack(up_gate_proj_weight_scale, axis=0).transpose([0, 2, 1]).contiguous() down_proj_weight_scale = 
paddle.stack(down_proj_weight_scale, axis=0).transpose([0, 2, 1]).contiguous() @@ -104,7 +101,7 @@ class DeepGemmFusedMoeMethod(MoEMethodBase): "up_gate_proj_weight": up_gate_proj_weight, "down_proj_weight": down_proj_weight, "up_gate_proj_weight_scale": up_gate_proj_weight_scale, - "down_proj_weight_scale": down_proj_weight_scale + "down_proj_weight_scale": down_proj_weight_scale, } for name, tensor in name_tensor_map.items(): create_and_set_parameter(layer, name, tensor) @@ -119,11 +116,11 @@ class DeepGemmFusedMoeMethod(MoEMethodBase): Apply the EP prefill method. """ # 1. Select topk experts and weights - topk_idx, topk_weights = self.ep_prefill_runner.moe_select( - layer, gate_out) + topk_idx, topk_weights = self.ep_prefill_runner.moe_select(layer, gate_out) # 2. Dynamic compute blockwise quantization scales x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant( - x, self.quant_config.weight_block_size[0]) + x, self.quant_config.weight_block_size[0] + ) # 3. EP Dispatch ( recv_x, @@ -132,10 +129,7 @@ class DeepGemmFusedMoeMethod(MoEMethodBase): recv_num_tokens_per_expert_list, handle, _, - ) = self.ep_prefill_runner.dispatch(x, - topk_idx, - topk_weights, - x_scale_tensor=x_scale_tensor) + ) = self.ep_prefill_runner.dispatch(x, topk_idx, topk_weights, x_scale_tensor=x_scale_tensor) token_all_num = sum(recv_num_tokens_per_expert_list) @@ -187,14 +181,15 @@ class DeepGemmFusedMoeMethod(MoEMethodBase): # down_proj ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant( - ffn_out, self.quant_config.weight_block_size[0]) - ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose( - [1, 0]).contiguous() + ffn_out, self.quant_config.weight_block_size[0] + ) + ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0]).contiguous() ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0]) ffn_out = paddle.empty( (ffn_out.shape[0], layer.down_proj_weight.shape[1]), - dtype=paddle.bfloat16) + dtype=paddle.bfloat16, + ) deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous( (ffn_in_x, ffn_in_x_scale_tensor), (layer.down_proj_weight, layer.down_proj_weight_scale), @@ -216,8 +211,7 @@ class DeepGemmFusedMoeMethod(MoEMethodBase): tmp_ffn_out = paddle.cast(recv_x[0], paddle.bfloat16) # 5. EP combine - return self.ep_prefill_runner.combine(tmp_ffn_out, handle, - recv_topk_weights) + return self.ep_prefill_runner.combine(tmp_ffn_out, handle, recv_topk_weights) def apply_ep_decode( self, @@ -229,19 +223,18 @@ class DeepGemmFusedMoeMethod(MoEMethodBase): Apply the EP decoder method. """ # 1. Select topk experts and weights - topk_idx, topk_weights = self.ep_decoder_runner.moe_select( - layer, gate_out) + topk_idx, topk_weights = self.ep_decoder_runner.moe_select(layer, gate_out) # 2. EP Dispatch permute_input, token_nums_per_expert, handle = self.ep_decoder_runner.dispatch( - x, topk_idx, topk_weights, use_fp8=True) + x, topk_idx, topk_weights, use_fp8=True + ) # 3. 
Compute ffn assert isinstance(permute_input, tuple) up_gate_proj_out = paddle.empty( [ layer.num_local_experts, - layer.ep_size * - layer.fd_config.model_config.num_max_dispatch_tokens_per_rank, + layer.ep_size * layer.fd_config.model_config.num_max_dispatch_tokens_per_rank, layer.moe_intermediate_size * 2, ], dtype=paddle.bfloat16, @@ -250,8 +243,7 @@ class DeepGemmFusedMoeMethod(MoEMethodBase): ffn_out = paddle.empty( [ layer.num_local_experts, - layer.ep_size * - layer.fd_config.model_config.num_max_dispatch_tokens_per_rank, + layer.ep_size * layer.fd_config.model_config.num_max_dispatch_tokens_per_rank, layer.hidden_size, ], dtype=paddle.bfloat16, @@ -269,12 +261,13 @@ class DeepGemmFusedMoeMethod(MoEMethodBase): expected_m, ) - act_out = fastdeploy.model_executor.ops.gpu.group_swiglu_with_masked( - up_gate_proj_out, token_nums_per_expert) + act_out = fastdeploy.model_executor.ops.gpu.group_swiglu_with_masked(up_gate_proj_out, token_nums_per_expert) act_out_fp8, scale = fastdeploy.model_executor.ops.gpu.masked_per_token_quant( - act_out, token_nums_per_expert, - self.quant_config.weight_block_size[0]) + act_out, + token_nums_per_expert, + self.quant_config.weight_block_size[0], + ) deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_masked( (act_out_fp8, scale), @@ -288,8 +281,7 @@ class DeepGemmFusedMoeMethod(MoEMethodBase): ) # 4. EP combine - return self.ep_decoder_runner.combine(ffn_out, topk_idx, topk_weights, - handle) + return self.ep_decoder_runner.combine(ffn_out, topk_idx, topk_weights, handle) def apply_tp( self, @@ -312,8 +304,7 @@ class DeepGemmFusedMoeMethod(MoEMethodBase): tmp = count_tokens_per_expert_func(topk_ids, layer.num_experts) - recv_x, recv_x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant( - x, 128) + recv_x, recv_x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(x, 128) ( permute_input, @@ -332,7 +323,7 @@ class DeepGemmFusedMoeMethod(MoEMethodBase): topk_weights, tmp[0], tmp[1], - False, # use_in_ep + False, # use_in_ep -1, ) @@ -355,15 +346,16 @@ class DeepGemmFusedMoeMethod(MoEMethodBase): # down_proj ffn_in_x, ffn_in_x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant( - ffn_out, self.quant_config.weight_block_size[0]) + ffn_out, self.quant_config.weight_block_size[0] + ) - ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose( - [1, 0]).contiguous() + ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0]).contiguous() ffn_in_x_scale_tensor = ffn_in_x_scale_tensor.transpose([1, 0]) ffn_out = paddle.empty( (ffn_out.shape[0], layer.down_proj_weight.shape[1]), - dtype=paddle.bfloat16) + dtype=paddle.bfloat16, + ) deep_gemm.m_grouped_gemm_fp8_fp8_bf16_nt_contiguous( (ffn_in_x, ffn_in_x_scale_tensor), (layer.down_proj_weight, layer.down_proj_weight_scale), diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_marlin_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_marlin_backend.py index da308a0b8..69c58a549 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_marlin_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_marlin_backend.py @@ -18,29 +18,35 @@ import paddle from paddle import nn import fastdeploy -from fastdeploy.distributed.communication_op import \ - tensor_model_parallel_all_reduce -from fastdeploy.model_executor.ops.gpu import (MoeWna16MarlinGemmApi, - tritonmoe_preprocess_func) +from fastdeploy.distributed.communication_op import tensor_model_parallel_all_reduce +from fastdeploy.model_executor.ops.gpu import ( + MoeWna16MarlinGemmApi, + tritonmoe_preprocess_func, +) from 
..quantization.quant_base import QuantMethodBase -def gptq_marlin_moe_repack(b_q_weight: paddle.Tensor, perm: paddle.Tensor, - size_k: int, size_n: int, - num_bits: int) -> paddle.Tensor: +def gptq_marlin_moe_repack( + b_q_weight: paddle.Tensor, + perm: paddle.Tensor, + size_k: int, + size_n: int, + num_bits: int, +) -> paddle.Tensor: """ Util function. """ from fastdeploy.model_executor.ops.gpu import gptq_marlin_repack + num_experts = b_q_weight.shape[0] assert size_k % 16 == 0 output = paddle.empty( [num_experts, size_k // 16, size_n * (num_bits // 2)], - dtype=b_q_weight.dtype) + dtype=b_q_weight.dtype, + ) for e in range(num_experts): - output[e] = gptq_marlin_repack(b_q_weight[e], perm[e], size_k, size_n, - num_bits) + output[e] = gptq_marlin_repack(b_q_weight[e], perm[e], size_k, size_n, num_bits) return output @@ -53,13 +59,11 @@ def get_scale_perms(): scale_perm.extend([i + 8 * j for j in range(8)]) scale_perm_single: list[int] = [] for i in range(4): - scale_perm_single.extend( - [2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) + scale_perm_single.extend([2 * i + j for j in [0, 1, 8, 9, 16, 17, 24, 25]]) return scale_perm, scale_perm_single -def marlin_permute_scales(s: paddle.Tensor, size_k: int, size_n: int, - group_size: int) -> paddle.Tensor: +def marlin_permute_scales(s: paddle.Tensor, size_k: int, size_n: int, group_size: int) -> paddle.Tensor: """ Util function. """ @@ -105,7 +109,8 @@ class MarlinWeightOnlyMoEMethod(QuantMethodBase): self.quant_method = quant_method self.added_weight_attrs = ["up_gate_proj_weight", "down_proj_weight"] self.added_scale_attrs = [ - "up_gate_proj_weight_scale", "down_proj_weight_scale" + "up_gate_proj_weight_scale", + "down_proj_weight_scale", ] self.added_zeros_attrs = ["zeros0", "zeros1"] @@ -117,10 +122,12 @@ class MarlinWeightOnlyMoEMethod(QuantMethodBase): assert len(up_gate_proj_weights) == layer.num_local_experts assert len(down_proj_weights) == layer.num_local_experts assert up_gate_proj_weights[0].shape == [ - layer.hidden_size, layer.moe_intermediate_size * 2 + layer.hidden_size, + layer.moe_intermediate_size * 2, ] assert down_proj_weights[0].shape == [ - layer.moe_intermediate_size, layer.hidden_size + layer.moe_intermediate_size, + layer.hidden_size, ] up_gate_proj_tensor = paddle.stack(up_gate_proj_weights, axis=0) @@ -133,8 +140,7 @@ class MarlinWeightOnlyMoEMethod(QuantMethodBase): scale_name = self.added_scale_attrs[idx] weight_scale = weight_tensor.abs().max(axis=1) - quanted_weight = weight_tensor / weight_scale[:, - None, :] * max_bound + quanted_weight = weight_tensor / weight_scale[:, None, :] * max_bound quanted_weight = paddle.round(quanted_weight).astype("int32") quanted_weight[quanted_weight > 7] = 7 @@ -143,7 +149,7 @@ class MarlinWeightOnlyMoEMethod(QuantMethodBase): E, K, N = quanted_weight.shape quanted_weight = quanted_weight.reshape([0, K // 8, 8, N]) - res = paddle.zeros([E, K // 8, N], dtype='int32') + res = paddle.zeros([E, K // 8, N], dtype="int32") for j in range(8): tmp = quanted_weight[:, :, j, :] res = res | (tmp << (j * 4)) @@ -164,19 +170,24 @@ class MarlinWeightOnlyMoEMethod(QuantMethodBase): weight_scale = marlin_moe_permute_scales( weight_scale, - size_k=layer.moe_intermediate_size, #useless + size_k=layer.moe_intermediate_size, # useless size_n=N, - group_size=group_size) + group_size=group_size, + ) - for (name, tensor) in [(weight_name, quanted_weight), - (scale_name, weight_scale)]: + for name, tensor in [ + (weight_name, quanted_weight), + (scale_name, weight_scale), + ]: setattr( - layer, 
name, + layer, + name, layer.create_parameter( shape=tensor.shape, dtype=tensor.dtype, default_initializer=paddle.nn.initializer.Constant(0), - )) + ), + ) getattr(layer, name).set_value(tensor) def apply( @@ -216,7 +227,8 @@ class MarlinWeightOnlyMoEMethod(QuantMethodBase): workspace = paddle.empty([528], dtype="int32") sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess_func( - topk_ids, num_experts, block_size_m) + topk_ids, num_experts, block_size_m + ) ffn_out = MoeWna16MarlinGemmApi( x, @@ -243,7 +255,8 @@ class MarlinWeightOnlyMoEMethod(QuantMethodBase): is_k_full=True, use_atomic_add=True, use_fp32_reduce=True, - is_zp_float=False)[0] + is_zp_float=False, + )[0] swiglu_out = paddle.incubate.nn.functional.swiglu(ffn_out) @@ -272,7 +285,8 @@ class MarlinWeightOnlyMoEMethod(QuantMethodBase): is_k_full=True, use_atomic_add=True, use_fp32_reduce=True, - is_zp_float=False)[0] + is_zp_float=False, + )[0] ffn_out.reshape_([token_num, -1, hidden_size]) ffn_out = ffn_out.sum(axis=1) diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py index 512f76c81..1715cd60a 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py @@ -18,10 +18,8 @@ import paddle from paddle import nn import fastdeploy -from fastdeploy.distributed.communication_op import \ - tensor_model_parallel_all_reduce -from fastdeploy.model_executor.layers.utils import (create_and_set_parameter, - get_tensor) +from fastdeploy.distributed.communication_op import tensor_model_parallel_all_reduce +from fastdeploy.model_executor.layers.utils import create_and_set_parameter, get_tensor from fastdeploy.utils import ceil_div from ..quantization.quant_base import QuantMethodBase @@ -46,7 +44,8 @@ class TritonWeightOnlyMoEMethod(QuantMethodBase): self.quant_config = quant_config self.added_weight_attrs = ["up_gate_proj_weight", "down_proj_weight"] self.added_scale_attrs = [ - "up_gate_proj_weight_scale", "down_proj_weight_scale" + "up_gate_proj_weight_scale", + "down_proj_weight_scale", ] def process_prequanted_weights(self, layer: nn.Layer, state_dict) -> None: @@ -66,10 +65,12 @@ class TritonWeightOnlyMoEMethod(QuantMethodBase): assert algo == "wint8" assert up_gate_proj_weights[0].shape == [ - layer.hidden_size, layer.moe_intermediate_size * 2 + layer.hidden_size, + layer.moe_intermediate_size * 2, ] assert down_proj_weights[0].shape == [ - layer.moe_intermediate_size, layer.hidden_size + layer.moe_intermediate_size, + layer.hidden_size, ] up_gate_proj_tensor = paddle.stack(up_gate_proj_weights, axis=0) @@ -85,26 +86,29 @@ class TritonWeightOnlyMoEMethod(QuantMethodBase): scale_name = self.added_scale_attrs[idx] quanted_weight_scale = weight_tensor.abs().max(axis=1) - quanted_weight = weight_tensor / quanted_weight_scale[:, - None, :] * max_bound + quanted_weight = weight_tensor / quanted_weight_scale[:, None, :] * max_bound quanted_weight = paddle.round(quanted_weight).astype("int8") quanted_weight_scale = quanted_weight_scale / max_bound setattr( - layer, weight_name, + layer, + weight_name, layer.create_parameter( shape=quanted_weight.shape, dtype=quanted_weight.dtype, default_initializer=paddle.nn.initializer.Constant(0), - )) + ), + ) getattr(layer, weight_name).set_value(quanted_weight) setattr( - layer, scale_name, + layer, + scale_name, layer.create_parameter( shape=quanted_weight_scale.shape, dtype=quanted_weight_scale.dtype, 
- )) + ), + ) getattr(layer, scale_name).set_value(quanted_weight_scale) def apply( @@ -142,11 +146,13 @@ class TritonWeightOnlyMoEMethod(QuantMethodBase): "GROUP_SIZE_M": 1, } sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess_func( - topk_ids, num_local_experts, config["BLOCK_SIZE_M"]) + topk_ids, num_local_experts, config["BLOCK_SIZE_M"] + ) max_possible_num_post_padded = sorted_token_ids.shape[0] grid = ( - ceil_div(max_possible_num_post_padded, config["BLOCK_SIZE_M"]) * - ceil_div(moe_intermediate_size * 2, config["BLOCK_SIZE_N"]), ) + ceil_div(max_possible_num_post_padded, config["BLOCK_SIZE_M"]) + * ceil_div(moe_intermediate_size * 2, config["BLOCK_SIZE_N"]), + ) fused_moe_kernel_paddle[grid]( x, @@ -190,8 +196,7 @@ class TritonWeightOnlyMoEMethod(QuantMethodBase): even_Ks=hidden_size % config["BLOCK_SIZE_K"] == 0, ) - down_proj_input = paddle.incubate.nn.functional.swiglu( - up_gate_proj_out) + down_proj_input = paddle.incubate.nn.functional.swiglu(up_gate_proj_out) down_proj_out = paddle.empty( (token_num * top_k, hidden_size), @@ -199,8 +204,9 @@ class TritonWeightOnlyMoEMethod(QuantMethodBase): ) grid = ( - ceil_div(max_possible_num_post_padded, config["BLOCK_SIZE_M"]) * - ceil_div(hidden_size, config["BLOCK_SIZE_N"]), ) + ceil_div(max_possible_num_post_padded, config["BLOCK_SIZE_M"]) + * ceil_div(hidden_size, config["BLOCK_SIZE_N"]), + ) fused_moe_kernel_paddle[grid]( down_proj_input, layer.down_proj_weight, @@ -263,49 +269,58 @@ class TensorWiseFP8MoEMethod(QuantMethodBase): up_gate_proj_tensor, down_proj_tensor = layer.extract_moe_ffn_weights(state_dict) assert up_gate_proj_tensor[0].shape == [ - layer.hidden_size, layer.moe_intermediate_size * 2 + layer.hidden_size, + layer.moe_intermediate_size * 2, ] assert down_proj_tensor[0].shape == [ - layer.moe_intermediate_size, layer.hidden_size + layer.moe_intermediate_size, + layer.hidden_size, ] up_gate_proj_tensor = paddle.stack(up_gate_proj_tensor, axis=0).view(paddle.float8_e4m3fn) down_proj_tensor = paddle.stack(down_proj_tensor, axis=0).view(paddle.float8_e4m3fn) added_wfp8afp8_attrs = [ - "up_gate_proj_weight", "down_proj_weight", "up_gate_proj_weight_scale", - "down_proj_weight_scale", "up_gate_proj_in_scale", "down_proj_in_scale" + "up_gate_proj_weight", + "down_proj_weight", + "up_gate_proj_weight_scale", + "down_proj_weight_scale", + "up_gate_proj_in_scale", + "down_proj_in_scale", ] def _extract_scale_tensor(key_template): result = [] for i in range(layer.num_experts): - result.append( - get_tensor(state_dict.pop(key_template.format(i)))) + result.append(get_tensor(state_dict.pop(key_template.format(i)))) return paddle.concat(result).cast("float32") weight_key_map = layer.weight_key_map - up_gate_proj_weight_scale = _extract_scale_tensor( - weight_key_map["up_gate_proj_expert_weight_scale_key"]) - down_proj_weight_scale = _extract_scale_tensor( - weight_key_map["down_proj_expert_weight_scale_key"]) - up_gate_proj_in_scale = _extract_scale_tensor( - weight_key_map["up_gate_proj_expert_in_scale_key"]) - down_proj_in_scale = _extract_scale_tensor( - weight_key_map["down_proj_expert_in_scale_key"]) + up_gate_proj_weight_scale = _extract_scale_tensor(weight_key_map["up_gate_proj_expert_weight_scale_key"]) + down_proj_weight_scale = _extract_scale_tensor(weight_key_map["down_proj_expert_weight_scale_key"]) + up_gate_proj_in_scale = _extract_scale_tensor(weight_key_map["up_gate_proj_expert_in_scale_key"]) + down_proj_in_scale = _extract_scale_tensor(weight_key_map["down_proj_expert_in_scale_key"]) - 
for idx, weight_tensor in enumerate([ - up_gate_proj_tensor, down_proj_tensor, up_gate_proj_weight_scale, - down_proj_weight_scale, up_gate_proj_in_scale, down_proj_in_scale - ]): + for idx, weight_tensor in enumerate( + [ + up_gate_proj_tensor, + down_proj_tensor, + up_gate_proj_weight_scale, + down_proj_weight_scale, + up_gate_proj_in_scale, + down_proj_in_scale, + ] + ): name = added_wfp8afp8_attrs[idx] setattr( - layer, name, + layer, + name, layer.create_parameter( shape=weight_tensor.shape, dtype=weight_tensor.dtype, default_initializer=paddle.nn.initializer.Constant(0), - )) + ), + ) if weight_tensor.dtype == paddle.float8_e4m3fn: getattr(layer, name).copy_(weight_tensor, False) else: @@ -354,11 +369,16 @@ class TensorWiseFP8MoEMethod(QuantMethodBase): } sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess_func( - topk_ids, num_local_experts, config_up_gate_proj["BLOCK_SIZE_M"]) + topk_ids, num_local_experts, config_up_gate_proj["BLOCK_SIZE_M"] + ) max_possible_num_post_padded = sorted_token_ids.shape[0] grid = ( - ceil_div(max_possible_num_post_padded, config_up_gate_proj["BLOCK_SIZE_M"]) * - ceil_div(moe_intermediate_size * 2, config_up_gate_proj["BLOCK_SIZE_N"]), ) + ceil_div( + max_possible_num_post_padded, + config_up_gate_proj["BLOCK_SIZE_M"], + ) + * ceil_div(moe_intermediate_size * 2, config_up_gate_proj["BLOCK_SIZE_N"]), + ) permute_x = fastdeploy.model_executor.ops.gpu.moe_fused_hadamard_quant_fp8( x, @@ -366,7 +386,8 @@ class TensorWiseFP8MoEMethod(QuantMethodBase): topk_ids=topk_ids, top_k=top_k, intermediate_size=hidden_size, - tiled=False) + tiled=False, + ) fused_moe_kernel_paddle[grid]( permute_x, @@ -410,8 +431,7 @@ class TensorWiseFP8MoEMethod(QuantMethodBase): even_Ks=hidden_size % config_up_gate_proj["BLOCK_SIZE_K"] == 0, ) - down_proj_input = paddle.incubate.nn.functional.swiglu( - up_gate_proj_out) + down_proj_input = paddle.incubate.nn.functional.swiglu(up_gate_proj_out) down_proj_input = fastdeploy.model_executor.ops.gpu.moe_fused_hadamard_quant_fp8( down_proj_input, @@ -419,7 +439,8 @@ class TensorWiseFP8MoEMethod(QuantMethodBase): topk_ids=topk_ids, top_k=top_k, intermediate_size=moe_intermediate_size, - tiled=True) + tiled=True, + ) config_down_proj = { "BLOCK_SIZE_M": 32, @@ -434,8 +455,9 @@ class TensorWiseFP8MoEMethod(QuantMethodBase): ) grid = ( - ceil_div(max_possible_num_post_padded, config_down_proj["BLOCK_SIZE_M"]) * - ceil_div(hidden_size, config_down_proj["BLOCK_SIZE_N"]), ) + ceil_div(max_possible_num_post_padded, config_down_proj["BLOCK_SIZE_M"]) + * ceil_div(hidden_size, config_down_proj["BLOCK_SIZE_N"]), + ) fused_moe_kernel_paddle[grid]( down_proj_input, @@ -486,6 +508,7 @@ class TensorWiseFP8MoEMethod(QuantMethodBase): return out + class BlockWiseFP8MoEMethod(QuantMethodBase): """ Use Triton Group Gemm to compute Fused BlockWise FP8 Quant MoE. 
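The weight-only MoE paths touched above (Marlin wint4 and Triton wint8) derive their scales the same way: a per-expert, per-output-channel max-abs scale, division onto the integer grid, rounding, and storing `scale / max_bound` for dequantization. The NumPy sketch below reproduces only that arithmetic on a toy tensor; it is not the FastDeploy kernel, and the int4 variant differs only in its `max_bound` and clipping range.

```python
# Illustrative NumPy sketch of the per-channel max-abs weight quantization used
# by the weight-only MoE backends above (wint8 grid shown). Toy code, not a kernel.
import numpy as np


def quantize_weight_only_int8(w: np.ndarray, max_bound: float = 127.0):
    """w: [num_experts, K, N] -> (int8 weights, scales of shape [num_experts, N])."""
    scale = np.abs(w).max(axis=1)                    # per expert, per output channel
    q = np.round(w / scale[:, None, :] * max_bound)  # map onto the symmetric int grid
    q = np.clip(q, -max_bound, max_bound).astype(np.int8)
    return q, (scale / max_bound).astype(w.dtype)    # stored so that w ~= q * scale


def dequantize(q: np.ndarray, scale: np.ndarray) -> np.ndarray:
    return q.astype(scale.dtype) * scale[:, None, :]


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    w = rng.standard_normal((2, 8, 4)).astype(np.float32)  # [experts, K, N]
    q, s = quantize_weight_only_int8(w)
    print(q.dtype, s.shape, "max abs err =", float(np.abs(dequantize(q, s) - w).max()))
```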
@@ -498,13 +521,14 @@ class BlockWiseFP8MoEMethod(QuantMethodBase): self.quant_config = quant_config self.added_weight_attrs = ["up_gate_proj_weight", "down_proj_weight"] self.added_scale_attrs = [ - "up_gate_proj_weight_scale", "down_proj_weight_scale" + "up_gate_proj_weight_scale", + "down_proj_weight_scale", ] def process_prequanted_weights(self, layer: nn.Layer, state_dict) -> None: """process_prequanted_weights""" - raise NotImplementedError() + raise NotImplementedError def create_weights(self, layer: nn.Layer, state_dict): """ @@ -521,10 +545,9 @@ class BlockWiseFP8MoEMethod(QuantMethodBase): weight_list = [] weight_scale_list = [] for i in range(layer.num_local_experts): - from fastdeploy.model_executor.layers.utils import \ - per_block_cast_to_fp8 - quant_weight, scale = per_block_cast_to_fp8( - weight_tensor[i], self.quant_config.weight_block_size) + from fastdeploy.model_executor.layers.utils import per_block_cast_to_fp8 + + quant_weight, scale = per_block_cast_to_fp8(weight_tensor[i], self.quant_config.weight_block_size) weight_list.append(quant_weight) weight_scale_list.append(scale) @@ -533,8 +556,7 @@ class BlockWiseFP8MoEMethod(QuantMethodBase): create_and_set_parameter(layer, weight_name, quanted_weight) quanted_weight_scale = paddle.stack(weight_scale_list, axis=0) - quanted_weight_scale = quanted_weight_scale.transpose( - [0, 2, 1]).contiguous() + quanted_weight_scale = quanted_weight_scale.transpose([0, 2, 1]).contiguous() create_and_set_parameter(layer, scale_name, quanted_weight_scale) def check(self, layer: nn.Layer, up_gate_proj_weights, down_proj_weights): @@ -542,10 +564,12 @@ class BlockWiseFP8MoEMethod(QuantMethodBase): check layer is valid for this method """ assert up_gate_proj_weights[0].shape == [ - layer.hidden_size, layer.moe_intermediate_size * 2 + layer.hidden_size, + layer.moe_intermediate_size * 2, ] assert down_proj_weights[0].shape == [ - layer.moe_intermediate_size, layer.hidden_size + layer.moe_intermediate_size, + layer.hidden_size, ] def apply( @@ -585,23 +609,22 @@ class BlockWiseFP8MoEMethod(QuantMethodBase): from fastdeploy.model_executor.ops.gpu import tritonmoe_preprocess sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess( - topk_ids, num_local_experts, config["BLOCK_SIZE_M"]) + topk_ids, num_local_experts, config["BLOCK_SIZE_M"] + ) max_num_tokens_padded = sorted_token_ids.shape[0] - grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) * - ceil_div(moe_intermediate_size * 2, config["BLOCK_SIZE_N"]), ) + grid = ( + ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) + * ceil_div(moe_intermediate_size * 2, config["BLOCK_SIZE_N"]), + ) from .triton_moe_kernels import fused_moe_kernel_paddle - x_q, x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant( - x, self.quant_config.weight_block_size[0]) + x_q, x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(x, self.quant_config.weight_block_size[0]) - cache13 = paddle.empty([token_num * top_k * max(N1, N2)], - dtype=x.dtype) - intermediate_cache1 = cache13[:token_num * top_k * N1].view( - [token_num * top_k, N1]) - intermediate_cache3 = cache13[:token_num * top_k * N2].view( - [token_num * top_k, N2]) + cache13 = paddle.empty([token_num * top_k * max(N1, N2)], dtype=x.dtype) + intermediate_cache1 = cache13[: token_num * top_k * N1].view([token_num * top_k, N1]) + intermediate_cache3 = cache13[: token_num * top_k * N2].view([token_num * top_k, N2]) fused_moe_kernel_paddle[grid]( x_q, @@ -645,14 +668,15 @@ class 
BlockWiseFP8MoEMethod(QuantMethodBase): even_Ks=hidden_size % config["BLOCK_SIZE_K"] == 0, ) - intermediate_cache2 = paddle.incubate.nn.functional.swiglu( - intermediate_cache1) + intermediate_cache2 = paddle.incubate.nn.functional.swiglu(intermediate_cache1) - grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) * - ceil_div(hidden_size, config["BLOCK_SIZE_N"]), ) + grid = ( + ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) * ceil_div(hidden_size, config["BLOCK_SIZE_N"]), + ) x_q, x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant( - intermediate_cache2, self.quant_config.weight_block_size[0]) + intermediate_cache2, self.quant_config.weight_block_size[0] + ) fused_moe_kernel_paddle[grid]( x_q, diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_wint2_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_wint2_backend.py index 5ec8c31af..e54734901 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_wint2_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_wint2_backend.py @@ -18,8 +18,7 @@ import paddle from paddle import nn import fastdeploy -from fastdeploy.distributed.communication_op import \ - tensor_model_parallel_all_reduce +from fastdeploy.distributed.communication_op import tensor_model_parallel_all_reduce from fastdeploy.utils import ceil_div from ..quantization.quant_base import QuantMethodBase @@ -45,12 +44,12 @@ class Wint2MoeMethod(QuantMethodBase): """ check layer is valid for this method """ - assert len( - up_gate_proj_weights - ) == layer.num_local_experts, "up_gate_proj_weights length should be equal to num_local_experts." - assert len( - down_proj_weights - ) == layer.num_local_experts, "down_proj_weights length should be equal to num_local_experts." + assert ( + len(up_gate_proj_weights) == layer.num_local_experts + ), "up_gate_proj_weights length should be equal to num_local_experts." + assert ( + len(down_proj_weights) == layer.num_local_experts + ), "down_proj_weights length should be equal to num_local_experts." def create_weights(self, layer: nn.Layer, state_dict): """ @@ -78,29 +77,22 @@ class CutlassWint2FusedMoeMethod(Wint2MoeMethod): """ Paddle cutlass process prequanted weights. 
""" - up_gate_proj_expert_weight_key = layer.weight_key_map.get( - "up_gate_proj_expert_weight_key", None) - down_proj_expert_weight_key = layer.weight_key_map.get( - "down_proj_expert_weight_key", None) - up_gate_proj_expert_weight_scale_key = layer.weight_key_map.get( - "up_gate_proj_expert_weight_scale_key", None) - down_proj_expert_weight_scale_key = layer.weight_key_map.get( - "down_proj_expert_weight_scale_key", None) - up_gate_proj_expert_super_scales_key = layer.weight_key_map.get( - "up_gate_proj_expert_super_scales_key", None) - down_proj_expert_super_scales_key = layer.weight_key_map.get( - "down_proj_expert_super_scales_key", None) - up_gate_proj_expert_code_scale_key = layer.weight_key_map.get( - "up_gate_proj_expert_code_scale_key", None) - down_proj_expert_code_scale_key = layer.weight_key_map.get( - "down_proj_expert_code_scale_key", None) - up_gate_proj_expert_code_zp_key = layer.weight_key_map.get( - "up_gate_proj_expert_code_zp_key", None) - down_proj_expert_code_zp_key = layer.weight_key_map.get( - "down_proj_expert_code_zp_key", None) + up_gate_proj_expert_weight_key = layer.weight_key_map.get("up_gate_proj_expert_weight_key", None) + down_proj_expert_weight_key = layer.weight_key_map.get("down_proj_expert_weight_key", None) + up_gate_proj_expert_weight_scale_key = layer.weight_key_map.get("up_gate_proj_expert_weight_scale_key", None) + down_proj_expert_weight_scale_key = layer.weight_key_map.get("down_proj_expert_weight_scale_key", None) + up_gate_proj_expert_super_scales_key = layer.weight_key_map.get("up_gate_proj_expert_super_scales_key", None) + down_proj_expert_super_scales_key = layer.weight_key_map.get("down_proj_expert_super_scales_key", None) + up_gate_proj_expert_code_scale_key = layer.weight_key_map.get("up_gate_proj_expert_code_scale_key", None) + down_proj_expert_code_scale_key = layer.weight_key_map.get("down_proj_expert_code_scale_key", None) + up_gate_proj_expert_code_zp_key = layer.weight_key_map.get("up_gate_proj_expert_code_zp_key", None) + down_proj_expert_code_zp_key = layer.weight_key_map.get("down_proj_expert_code_zp_key", None) up_gate_proj_weights, down_proj_weights = layer.load_experts_weight( - state_dict, up_gate_proj_expert_weight_key, down_proj_expert_weight_key) + state_dict, + up_gate_proj_expert_weight_key, + down_proj_expert_weight_key, + ) # self.check(layer, up_gate_proj_weights, down_proj_weights) up_gate_proj_weight_scale = [] @@ -114,37 +106,23 @@ class CutlassWint2FusedMoeMethod(Wint2MoeMethod): for i in range(layer.num_experts): expert_idx = layer.expert_id_offset + i up_gate_proj_weight_scale.append( - get_tensor( - state_dict.pop( - up_gate_proj_expert_weight_scale_key.format(expert_idx)))) + get_tensor(state_dict.pop(up_gate_proj_expert_weight_scale_key.format(expert_idx))) + ) down_proj_weight_scale.append( - get_tensor( - state_dict.pop( - down_proj_expert_weight_scale_key.format(expert_idx)))) + get_tensor(state_dict.pop(down_proj_expert_weight_scale_key.format(expert_idx))) + ) up_gate_proj_super_scales.append( - get_tensor( - state_dict.pop( - up_gate_proj_expert_super_scales_key.format(expert_idx)))) + get_tensor(state_dict.pop(up_gate_proj_expert_super_scales_key.format(expert_idx))) + ) down_proj_super_scales.append( - get_tensor( - state_dict.pop( - down_proj_expert_super_scales_key.format(expert_idx)))) + get_tensor(state_dict.pop(down_proj_expert_super_scales_key.format(expert_idx))) + ) up_gate_proj_code_scale.append( - get_tensor( - state_dict.pop( - up_gate_proj_expert_code_scale_key.format(expert_idx)))) - 
down_proj_code_scale.append( - get_tensor( - state_dict.pop( - down_proj_expert_code_scale_key.format(expert_idx)))) - up_gate_proj_code_zp.append( - get_tensor( - state_dict.pop( - up_gate_proj_expert_code_zp_key.format(expert_idx)))) - down_proj_code_zp.append( - get_tensor( - state_dict.pop( - down_proj_expert_code_zp_key.format(expert_idx)))) + get_tensor(state_dict.pop(up_gate_proj_expert_code_scale_key.format(expert_idx))) + ) + down_proj_code_scale.append(get_tensor(state_dict.pop(down_proj_expert_code_scale_key.format(expert_idx)))) + up_gate_proj_code_zp.append(get_tensor(state_dict.pop(up_gate_proj_expert_code_zp_key.format(expert_idx)))) + down_proj_code_zp.append(get_tensor(state_dict.pop(down_proj_expert_code_zp_key.format(expert_idx)))) up_gate_proj_weight = paddle.stack(up_gate_proj_weights, axis=0) down_proj_weight = paddle.stack(down_proj_weights, axis=0) @@ -167,7 +145,7 @@ class CutlassWint2FusedMoeMethod(Wint2MoeMethod): "up_gate_proj_code_scale": up_gate_proj_code_scale, "down_proj_code_scale": down_proj_code_scale, "up_gate_proj_code_zp": up_gate_proj_code_zp, - "down_proj_code_zp": down_proj_code_zp + "down_proj_code_zp": down_proj_code_zp, } for name, tensor in name_tensor_map.items(): create_and_set_parameter(layer, name, tensor) @@ -189,6 +167,7 @@ class CutlassWint2FusedMoeMethod(Wint2MoeMethod): """ from fastdeploy.model_executor.ops.gpu import moe_expert_dispatch + ( permute_input, token_nums_per_expert, @@ -200,8 +179,9 @@ class CutlassWint2FusedMoeMethod(Wint2MoeMethod): x, gate_out, layer.gate_correction_bias, - (layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") - else None), # if set, permute_input will be int8_t + ( + layer.up_gate_proj_in_scale if hasattr(layer, "up_gate_proj_in_scale") else None + ), # if set, permute_input will be int8_t layer.top_k, False, topk_only_mode=False, @@ -243,12 +223,10 @@ class CutlassWint2FusedMoeMethod(Wint2MoeMethod): class TritonWint2FusedMoeMethod(CutlassWint2FusedMoeMethod): - def __init__(self, quant_config): super().__init__(quant_config) self.moe_quant_type = quant_config.moe_quant_type - def apply( self, layer: nn.Layer, @@ -259,8 +237,7 @@ class TritonWint2FusedMoeMethod(CutlassWint2FusedMoeMethod): Use Wint2 Triton Fusedmoe compute Fused MoE. 
""" - from fastdeploy.model_executor.ops.triton_ops import \ - moe_wint2_ffn_kernel + from fastdeploy.model_executor.ops.triton_ops import moe_wint2_ffn_kernel topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select( gate_out, @@ -288,7 +265,6 @@ class TritonWint2FusedMoeMethod(CutlassWint2FusedMoeMethod): double_quant = True num_valid_tokens = topk_ids.shape[0] * topk_ids.shape[1] - config = { "BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 512, @@ -300,11 +276,11 @@ class TritonWint2FusedMoeMethod(CutlassWint2FusedMoeMethod): from fastdeploy.model_executor.ops.gpu import tritonmoe_preprocess sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess( - topk_ids, E, config["BLOCK_SIZE_M"]) + topk_ids, E, config["BLOCK_SIZE_M"] + ) max_possible_num_post_padded = sorted_token_ids.shape[0] - grid = (ceil_div(max_possible_num_post_padded, config["BLOCK_SIZE_M"]) * - ceil_div(N, config["BLOCK_SIZE_N"]), ) + grid = (ceil_div(max_possible_num_post_padded, config["BLOCK_SIZE_M"]) * ceil_div(N, config["BLOCK_SIZE_N"]),) moe_wint2_ffn_kernel[grid]( x, @@ -360,9 +336,10 @@ class TritonWint2FusedMoeMethod(CutlassWint2FusedMoeMethod): "num_stages": 8, } - grid = (ceil_div(max_possible_num_post_padded, config["BLOCK_SIZE_M"]) * - ceil_div(layer.down_proj_weight.shape[-1], config["BLOCK_SIZE_N"]), ) - + grid = ( + ceil_div(max_possible_num_post_padded, config["BLOCK_SIZE_M"]) + * ceil_div(layer.down_proj_weight.shape[-1], config["BLOCK_SIZE_N"]), + ) moe_wint2_ffn_kernel[grid]( intermediate_cache2, @@ -409,7 +386,6 @@ class TritonWint2FusedMoeMethod(CutlassWint2FusedMoeMethod): fused_moe_out = paddle.sum(intermediate_cache3, axis=1) - if layer.tp_size > 1: tensor_model_parallel_all_reduce(fused_moe_out) diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_xpu_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_xpu_backend.py index 6f74acdff..03331e46b 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_xpu_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_xpu_backend.py @@ -19,10 +19,8 @@ from typing import Dict import paddle from paddle import nn -from fastdeploy.model_executor.layers.quantization.quant_base import \ - QuantMethodBase -from fastdeploy.model_executor.layers.quantization.weight_only import \ - WeightOnlyConfig +from fastdeploy.model_executor.layers.quantization.quant_base import QuantMethodBase +from fastdeploy.model_executor.layers.quantization.weight_only import WeightOnlyConfig from fastdeploy.model_executor.ops.xpu import weight_quantize_xpu from .fused_moe_backend_base import MoEMethodBase @@ -44,16 +42,17 @@ class XPUMoEMethod(MoEMethodBase): weights[idx] = weight.transpose([1, 0]) stacked_up_gate_proj_weights = paddle.stack(up_gate_proj_weights, axis=0) stacked_down_proj_weights = paddle.stack(down_proj_weights, axis=0) - for idx, weight_tensor in enumerate( - [stacked_up_gate_proj_weights, stacked_down_proj_weights]): + for idx, weight_tensor in enumerate([stacked_up_gate_proj_weights, stacked_down_proj_weights]): weight_name = self.added_weight_attrs[idx] setattr( - layer, weight_name, + layer, + weight_name, layer.create_parameter( shape=weight_tensor.shape, dtype=weight_tensor.dtype, default_initializer=paddle.nn.initializer.Constant(0), - )) + ), + ) getattr(layer, weight_name).set_value(weight_tensor) def apply_tp( @@ -77,14 +76,16 @@ class XPUMoEMethod(MoEMethodBase): None, # down_proj bias None, # up_gate_proj scale None, # down_proj scale - None, # up_gate_proj_in_scale - "", # moe_quant_type + None, # 
up_gate_proj_in_scale + "", # moe_quant_type layer.top_k, False, # moe group, used in deepseek ) if layer.tp_size > 1: - from fastdeploy.distributed.communication_op import \ - tensor_model_parallel_all_reduce + from fastdeploy.distributed.communication_op import ( + tensor_model_parallel_all_reduce, + ) + tensor_model_parallel_all_reduce(fused_moe_out) return fused_moe_out @@ -111,6 +112,7 @@ class XPUMoEMethod(MoEMethodBase): """ raise NotImplementedError + class XPUWeightOnlyMoEMethod(QuantMethodBase): """ XPU Fused MoE Method. @@ -124,8 +126,7 @@ class XPUWeightOnlyMoEMethod(QuantMethodBase): self.quant_config = quant_config self.moe_quant_type = self.quant_config.algo - def create_weights(self, layer: nn.Layer, state_dict: Dict[str, - paddle.Tensor]): + def create_weights(self, layer: nn.Layer, state_dict: Dict[str, paddle.Tensor]): """ Paddle cutlass create weight process. """ @@ -133,14 +134,19 @@ class XPUWeightOnlyMoEMethod(QuantMethodBase): assert len(up_gate_proj_weights) == layer.num_local_experts assert len(down_proj_weights) == layer.num_local_experts assert up_gate_proj_weights[0].shape == [ - layer.hidden_size, layer.moe_intermediate_size * 2 + layer.hidden_size, + layer.moe_intermediate_size * 2, ] assert down_proj_weights[0].shape == [ - layer.moe_intermediate_size, layer.hidden_size + layer.moe_intermediate_size, + layer.hidden_size, ] added_weight_attrs = ["up_gate_proj_weight", "down_proj_weight"] - added_scale_attrs = ["up_gate_proj_weight_scale", "down_proj_weight_scale"] + added_scale_attrs = [ + "up_gate_proj_weight_scale", + "down_proj_weight_scale", + ] for idx, weight_tensor in enumerate([up_gate_proj_weights, down_proj_weights]): weight_name = added_weight_attrs[idx] @@ -150,28 +156,31 @@ class XPUWeightOnlyMoEMethod(QuantMethodBase): weight_scale_list = [] for i in range(layer.num_local_experts): quant_weight, scale = weight_quantize_xpu( - weight_tensor[i], self.moe_quant_type, -1, - -1) # weight is [k,n] - weight_list.append(quant_weight.transpose( - [1, 0])) # transpose weight to [n,k] + weight_tensor[i], self.moe_quant_type, -1, -1 + ) # weight is [k,n] + weight_list.append(quant_weight.transpose([1, 0])) # transpose weight to [n,k] weight_scale_list.append(scale) quanted_weight = paddle.stack(weight_list, axis=0) setattr( - layer, weight_name, + layer, + weight_name, layer.create_parameter( shape=quanted_weight.shape, dtype=quanted_weight.dtype, default_initializer=paddle.nn.initializer.Constant(0), - )) + ), + ) getattr(layer, weight_name).set_value(quanted_weight) quanted_weight_scale = paddle.stack(weight_scale_list, axis=0) setattr( - layer, scale_name, + layer, + scale_name, layer.create_parameter( shape=quanted_weight_scale.shape, dtype=quanted_weight_scale.dtype, - )) + ), + ) getattr(layer, scale_name).set_value(quanted_weight_scale) def apply( @@ -193,19 +202,18 @@ class XPUWeightOnlyMoEMethod(QuantMethodBase): layer.down_proj_weight, None, # up_gate_proj bias None, # down_proj bias - (layer.up_gate_proj_weight_scale - if hasattr(layer, "up_gate_proj_weight_scale") else None), - (layer.down_proj_weight_scale - if hasattr(layer, "down_proj_weight_scale") else None), - (layer.down_proj_in_scale - if hasattr(layer, "down_proj_in_scale") else None), + (layer.up_gate_proj_weight_scale if hasattr(layer, "up_gate_proj_weight_scale") else None), + (layer.down_proj_weight_scale if hasattr(layer, "down_proj_weight_scale") else None), + (layer.down_proj_in_scale if hasattr(layer, "down_proj_in_scale") else None), self.moe_quant_type, layer.top_k, False, # 
moe group, used in deepseek ) if layer.tp_size > 1: - from fastdeploy.distributed.communication_op import \ - tensor_model_parallel_all_reduce + from fastdeploy.distributed.communication_op import ( + tensor_model_parallel_all_reduce, + ) + tensor_model_parallel_all_reduce(fused_moe_out) return fused_moe_out diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py index 2494f298a..6ea31642f 100644 --- a/fastdeploy/model_executor/layers/moe/moe.py +++ b/fastdeploy/model_executor/layers/moe/moe.py @@ -27,16 +27,21 @@ def get_moe_method(): return moe method based on device platform """ from fastdeploy.platforms import current_platform + if current_platform.is_cuda(): from .fused_moe_cutlass_backend import CutlassMoEMethod + return CutlassMoEMethod(None) elif current_platform.is_xpu(): from .fused_moe_xpu_backend import XPUMoEMethod + return XPUMoEMethod(None) elif current_platform.is_gcu(): from fastdeploy.model_executor.layers.backends import GCUFusedMoeMethod + return GCUFusedMoeMethod(None) - raise NotImplementedError() + raise NotImplementedError + class FusedMoE(nn.Layer): """ @@ -76,9 +81,9 @@ class FusedMoE(nn.Layer): self.ep_size = fd_config.parallel_config.expert_parallel_size self.ep_rank = fd_config.parallel_config.expert_parallel_rank - assert (self.tp_size >= 1 and self.ep_size == 1) or \ - (self.tp_size == 1 and self.ep_size > 1), \ - 'MoE only support parallelism on TP or EP dimension.' + assert (self.tp_size >= 1 and self.ep_size == 1) or ( + self.tp_size == 1 and self.ep_size > 1 + ), "MoE only support parallelism on TP or EP dimension." self.hidden_size = fd_config.model_config.hidden_size self.num_experts = num_experts @@ -123,7 +128,8 @@ class FusedMoE(nn.Layer): f"{moe_tag}MoE config is {num_experts=}[{expert_id_offset}, {expert_id_offset+self.num_local_experts}), \ {top_k=}, hidden_size={self.hidden_size}, {moe_intermediate_size=}, \ , ep_size={self.ep_size}, \ - tp_size={self.tp_size}.") + tp_size={self.tp_size}." 
+ ) def init_moe_weights(self): """ @@ -147,15 +153,31 @@ class FusedMoE(nn.Layer): ) up_gate_proj_output_dim = self.moe_intermediate_size * 2 if self.moe_quant_type in ["fp8", "wint8"]: - up_gate_proj_weight_shape = [self.num_local_experts, up_gate_proj_output_dim, self.hidden_size] - down_proj_weight_shape = [self.num_local_experts, self.hidden_size, self.moe_intermediate_size] + up_gate_proj_weight_shape = [ + self.num_local_experts, + up_gate_proj_output_dim, + self.hidden_size, + ] + down_proj_weight_shape = [ + self.num_local_experts, + self.hidden_size, + self.moe_intermediate_size, + ] else: - up_gate_proj_weight_shape = [self.num_local_experts, self.hidden_size, up_gate_proj_output_dim] - down_proj_weight_shape = [self.num_local_experts, self.moe_intermediate_size, self.hidden_size] + up_gate_proj_weight_shape = [ + self.num_local_experts, + self.hidden_size, + up_gate_proj_output_dim, + ] + down_proj_weight_shape = [ + self.num_local_experts, + self.moe_intermediate_size, + self.hidden_size, + ] # Create parameters if self.moe_quant_type == "fp8": - #(TODO:gaoziyuan) + # (TODO:gaoziyuan) pass elif self.moe_quant_type == "wint8": self.weight_dtype = "int8" @@ -187,9 +209,12 @@ class FusedMoE(nn.Layer): dtype=self._dtype, ) - def load_experts_weight(self, state_dict: dict, - up_gate_proj_expert_weight_key: str, - down_proj_expert_weight_key: str): + def load_experts_weight( + self, + state_dict: dict, + up_gate_proj_expert_weight_key: str, + down_proj_expert_weight_key: str, + ): """ Load experts weight from state_dict. Args: @@ -199,35 +224,23 @@ class FusedMoE(nn.Layer): """ up_gate_proj_weights = [] down_proj_weights = [] - is_ffn_merged = up_gate_proj_expert_weight_key.format( - self.expert_id_offset) in state_dict + is_ffn_merged = up_gate_proj_expert_weight_key.format(self.expert_id_offset) in state_dict if is_ffn_merged: for i in range(self.num_local_experts): expert_idx = self.expert_id_offset + i up_gate_proj_weights.append( - get_tensor( - state_dict.pop( - up_gate_proj_expert_weight_key.format(expert_idx)))) - down_proj_weights.append( - get_tensor( - state_dict.pop( - down_proj_expert_weight_key.format(expert_idx)))) + get_tensor(state_dict.pop(up_gate_proj_expert_weight_key.format(expert_idx))) + ) + down_proj_weights.append(get_tensor(state_dict.pop(down_proj_expert_weight_key.format(expert_idx)))) else: - gate_expert_weight_key = up_gate_proj_expert_weight_key.replace( - "up_gate_proj", "gate_proj") - up_expert_weight_key = up_gate_proj_expert_weight_key.replace( - "up_gate_proj", "up_proj") + gate_expert_weight_key = up_gate_proj_expert_weight_key.replace("up_gate_proj", "gate_proj") + up_expert_weight_key = up_gate_proj_expert_weight_key.replace("up_gate_proj", "up_proj") for j in range(self.num_local_experts): expert_idx = self.expert_id_offset + j - gate = get_tensor( - state_dict.pop(gate_expert_weight_key.format(expert_idx))) - up = get_tensor( - state_dict.pop(up_expert_weight_key.format(expert_idx))) + gate = get_tensor(state_dict.pop(gate_expert_weight_key.format(expert_idx))) + up = get_tensor(state_dict.pop(up_expert_weight_key.format(expert_idx))) up_gate_proj_weights.append(paddle.concat([gate, up], axis=-1)) - down_proj_weights.append( - get_tensor( - state_dict.pop( - down_proj_expert_weight_key.format(expert_idx)))) + down_proj_weights.append(get_tensor(state_dict.pop(down_proj_expert_weight_key.format(expert_idx)))) return up_gate_proj_weights, down_proj_weights def extract_moe_ffn_weights(self, state_dict: dict): @@ -246,46 +259,43 @@ class 
FusedMoE(nn.Layer): AssertionError: If required weight keys are missing or number of weights doesn't match number of local experts. """ - up_gate_proj_expert_weight_key = self.weight_key_map.get( - "up_gate_proj_expert_weight_key", None) - down_proj_expert_weight_key = self.weight_key_map.get( - "down_proj_expert_weight_key", None) + up_gate_proj_expert_weight_key = self.weight_key_map.get("up_gate_proj_expert_weight_key", None) + down_proj_expert_weight_key = self.weight_key_map.get("down_proj_expert_weight_key", None) assert up_gate_proj_expert_weight_key is not None, "up_gate_proj_expert_weight_key should not be none." assert down_proj_expert_weight_key is not None, "down_proj_expert_weight_key should not be none." up_gate_proj_weights, down_proj_weights = self.load_experts_weight( - state_dict, up_gate_proj_expert_weight_key, down_proj_expert_weight_key) - assert len( - up_gate_proj_weights - ) == self.num_local_experts, "up_gate_proj_weights length should be equal to num_local_experts." - assert len( - down_proj_weights - ) == self.num_local_experts, "down_proj_weights length should be equal to num_local_experts." + state_dict, + up_gate_proj_expert_weight_key, + down_proj_expert_weight_key, + ) + assert ( + len(up_gate_proj_weights) == self.num_local_experts + ), "up_gate_proj_weights length should be equal to num_local_experts." + assert ( + len(down_proj_weights) == self.num_local_experts + ), "down_proj_weights length should be equal to num_local_experts." return up_gate_proj_weights, down_proj_weights - def extract_gate_correction_bias(self, gate_correction_bias_key, - state_dict): + def extract_gate_correction_bias(self, gate_correction_bias_key, state_dict): """ extract_gate_correction_bias function. """ - gate_correction_bias_tensor = get_tensor( - state_dict.pop(gate_correction_bias_key)).astype("float32") + gate_correction_bias_tensor = get_tensor(state_dict.pop(gate_correction_bias_key)).astype("float32") return gate_correction_bias_tensor def load_state_dict(self, state_dict): """ load_state_dict function. """ - self.gate_correction_bias_key = self.weight_key_map.get( - "gate_correction_bias_key", None) + self.gate_correction_bias_key = self.weight_key_map.get("gate_correction_bias_key", None) if self.gate_correction_bias_key is not None and self.gate_correction_bias_key in state_dict: self.moe_use_gate_correction_bias = True else: self.moe_use_gate_correction_bias = False if self.moe_use_gate_correction_bias: - gate_correction_bias_tensor = self.extract_gate_correction_bias( - self.gate_correction_bias_key, state_dict) + gate_correction_bias_tensor = self.extract_gate_correction_bias(self.gate_correction_bias_key, state_dict) self.gate_correction_bias = self.create_parameter( shape=gate_correction_bias_tensor.shape, dtype="float32", diff --git a/fastdeploy/model_executor/layers/moe/triton_moe_kernels.py b/fastdeploy/model_executor/layers/moe/triton_moe_kernels.py index ff289524f..1e146c306 100644 --- a/fastdeploy/model_executor/layers/moe/triton_moe_kernels.py +++ b/fastdeploy/model_executor/layers/moe/triton_moe_kernels.py @@ -14,9 +14,11 @@ # limitations under the License. 
""" -import triton import triton.language as tl -from fastdeploy.model_executor.ops.triton_ops.triton_utils_v2 import paddle_use_triton_v2 + +from fastdeploy.model_executor.ops.triton_ops.triton_utils_v2 import ( + paddle_use_triton_v2, +) @paddle_use_triton_v2() @@ -30,7 +32,6 @@ def fused_moe_kernel_paddle( sorted_token_ids_ptr, expert_ids_ptr, num_tokens_post_padded_ptr, - # Matrix dimensions max_possible_num_post_padded, num_valid_tokens, @@ -109,16 +110,13 @@ def fused_moe_kernel_paddle( offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N offs_k = tl.arange(0, BLOCK_SIZE_K) - a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am + - offs_k[None, :] * stride_ak) + a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am + offs_k[None, :] * stride_ak) off_experts = tl.load(expert_ids_ptr + pid_m) - b_ptrs = b_ptr + off_experts * stride_be + (offs_k[:, None] * stride_bk + - offs_bn[None, :] * stride_bn) + b_ptrs = b_ptr + off_experts * stride_be + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn) if use_int8_w8a16: - b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bn[ - None, :] * stride_bsn + b_scale_ptrs = b_scale_ptr + off_experts * stride_bse + offs_bn[None, :] * stride_bsn b_scale = tl.load(b_scale_ptrs) if use_fp8_w8a8: @@ -140,19 +138,14 @@ def fused_moe_kernel_paddle( mask=token_mask[:, None], other=0.0, ) - b = tl.load(b_ptrs, - cache_modifier=".cv", - eviction_policy='evict_first') + b = tl.load(b_ptrs, cache_modifier=".cv", eviction_policy="evict_first") else: a = tl.load( a_ptrs, - mask=token_mask[:, None] & - (offs_k[None, :] < K - k * BLOCK_SIZE_K), + mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K), other=0.0, ) - b = tl.load(b_ptrs, - mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, - other=0.0) + b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) # We accumulate along the K dimension. 
if use_int8_w8a16: @@ -161,13 +154,14 @@ def fused_moe_kernel_paddle( if group_k > 0 and group_n > 0: k_start = k * BLOCK_SIZE_K offs_ks = k_start // group_k - a_scale = tl.load(a_scale_ptrs + offs_ks * stride_ask, - mask=token_mask, - other=0.0) + a_scale = tl.load( + a_scale_ptrs + offs_ks * stride_ask, + mask=token_mask, + other=0.0, + ) b_scale = tl.load(b_scale_ptrs + offs_ks * stride_bsk) - accumulator += tl.dot(a, b) * a_scale[:, - None] * b_scale[None, :] + accumulator += tl.dot(a, b) * a_scale[:, None] * b_scale[None, :] else: accumulator = tl.dot(a, b, acc=accumulator) else: @@ -177,9 +171,7 @@ def fused_moe_kernel_paddle( b_ptrs += BLOCK_SIZE_K * stride_bk if MUL_ROUTED_WEIGHT: - moe_weight = tl.load(topk_weights_ptr + offs_token, - mask=token_mask, - other=0) + moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0) accumulator = accumulator * moe_weight[:, None] if use_int8_w8a16: accumulator = (accumulator * b_scale).to(compute_type) @@ -192,8 +184,7 @@ def fused_moe_kernel_paddle( accumulator = accumulator.to(compute_type) # Write back the block of the output offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) - c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[ - None, :] + c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :] c_mask = token_mask[:, None] & (offs_cn[None, :] < N) tl.store(c_ptrs, accumulator, mask=c_mask) diff --git a/fastdeploy/model_executor/layers/mtp_linear.py b/fastdeploy/model_executor/layers/mtp_linear.py index 80a8835ea..e7e427522 100644 --- a/fastdeploy/model_executor/layers/mtp_linear.py +++ b/fastdeploy/model_executor/layers/mtp_linear.py @@ -69,11 +69,9 @@ class ParallelEHProjection(nn.Layer): self.linear = ColumnParallelLinear( embedding_dim, num_embeddings, - mp_group=fleet.get_hybrid_communicate_group(). - get_model_parallel_group(), + mp_group=fleet.get_hybrid_communicate_group().get_model_parallel_group(), weight_attr=None, - has_bias=True - if self.bias_key is not None else False, + has_bias=True if self.bias_key is not None else False, gather_output=need_gather, fuse_matmul_bias=False, # False diff更小 ) @@ -81,11 +79,9 @@ class ParallelEHProjection(nn.Layer): self.linear = RowParallelLinear( embedding_dim, num_embeddings, - mp_group=fleet.get_hybrid_communicate_group(). 
- get_model_parallel_group(), + mp_group=fleet.get_hybrid_communicate_group().get_model_parallel_group(), weight_attr=None, - has_bias=True - if self.bias_key is not None else False, + has_bias=True if self.bias_key is not None else False, input_is_parallel=False, fuse_matmul_bias=False, # False diff更小 ) @@ -99,20 +95,15 @@ class ParallelEHProjection(nn.Layer): """ if self.use_ep: - self.weight.set_value( - get_tensor(state_dict.pop(self.weight_key)).astype( - paddle.get_default_dtype())) + self.weight.set_value(get_tensor(state_dict.pop(self.weight_key)).astype(paddle.get_default_dtype())) else: - weight_tensor = get_tensor( - state_dict.pop(self.weight_key)).astype( - paddle.get_default_dtype()) + weight_tensor = get_tensor(state_dict.pop(self.weight_key)).astype(paddle.get_default_dtype()) if self.linear.weight.shape != weight_tensor.shape: weight_tensor = weight_tensor.transpose([1, 0]) self.linear.weight.set_value(weight_tensor) if self.bias_key is not None: - bias = get_tensor(state_dict.pop(self.bias_key)).astype( - paddle.get_default_dtype()) + bias = get_tensor(state_dict.pop(self.bias_key)).astype(paddle.get_default_dtype()) self.linear.bias.set_value(bias) def forward(self, input): diff --git a/fastdeploy/model_executor/layers/normalization.py b/fastdeploy/model_executor/layers/normalization.py index c91e74173..dff17321b 100644 --- a/fastdeploy/model_executor/layers/normalization.py +++ b/fastdeploy/model_executor/layers/normalization.py @@ -102,8 +102,7 @@ class RMSNorm(nn.Layer): dtype=self._norm_weight_dtype, ) - def load_state_dict(self, state_dict: Dict[str, - paddle.Tensor | np.ndarray]): + def load_state_dict(self, state_dict: Dict[str, paddle.Tensor | np.ndarray]): """ Load the checkpoint state dictionary into the layer. @@ -112,15 +111,10 @@ class RMSNorm(nn.Layer): """ # weight - weight_tensor = paddle.cast( - get_tensor(state_dict.pop(self.weight_key)), - self._norm_weight_dtype) + weight_tensor = paddle.cast(get_tensor(state_dict.pop(self.weight_key)), self._norm_weight_dtype) self.weight.set_value(weight_tensor) - def forward( - self, - x, - residual_input: Optional[paddle.Tensor] = None) -> paddle.Tensor: + def forward(self, x, residual_input: Optional[paddle.Tensor] = None) -> paddle.Tensor: """ Defines the forward computation of the layer. @@ -140,9 +134,7 @@ class RMSNorm(nn.Layer): if current_platform.is_gcu(): if residual_input is None: return rms_norm(x, self.weight, self.eps) - norm_out = self.norm_func( - x, residual_input, self.weight, self.eps - ) + norm_out = self.norm_func(x, residual_input, self.weight, self.eps) else: norm_out = self.norm_func( x, @@ -152,7 +144,7 @@ class RMSNorm(nn.Layer): begin_norm_axis=self.begin_norm_axis, bias=self.bias, residual=residual_input, - quant_scale=-1 if self.quant_scale is None else self.quant_scale, + quant_scale=(-1 if self.quant_scale is None else self.quant_scale), quant_round_type=self.quant_round_type, quant_max_bound=self.quant_max_bound, quant_min_bound=self.quant_min_bound, @@ -242,8 +234,7 @@ class LayerNorm(nn.Layer): dtype=self._norm_weight_dtype, ) - def load_state_dict(self, state_dict: Dict[str, - paddle.Tensor | np.ndarray]): + def load_state_dict(self, state_dict: Dict[str, paddle.Tensor | np.ndarray]): """ Load the checkpoint state dictionary into the layer. 
@@ -252,22 +243,18 @@ class LayerNorm(nn.Layer): """ # weight - weight_tensor = paddle.cast( - get_tensor(state_dict.pop(self.weight_key)), - self._norm_weight_dtype) + weight_tensor = paddle.cast(get_tensor(state_dict.pop(self.weight_key)), self._norm_weight_dtype) self.weight.set_value(weight_tensor) # bias if self.with_bias: bias_tensor = paddle.cast( get_tensor(state_dict.pop(self.bias_key)), - self._norm_weight_dtype) + self._norm_weight_dtype, + ) self.bias.set_value(bias_tensor) - def forward( - self, - x, - residual_input: Optional[paddle.Tensor] = None) -> paddle.Tensor: + def forward(self, x, residual_input: Optional[paddle.Tensor] = None) -> paddle.Tensor: """ Defines the forward computation of the layer. @@ -326,7 +313,7 @@ class LayerNorm(nn.Layer): begin_norm_axis=1, bias=self.bias, residual=residual_input, - quant_scale=-1 if self.quant_scale is None else self.quant_scale, + quant_scale=(-1 if self.quant_scale is None else self.quant_scale), quant_round_type=self.quant_round_type, quant_max_bound=self.quant_max_bound, quant_min_bound=self.quant_min_bound, diff --git a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py index 43f3bbc23..ebfc2d2a5 100644 --- a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py +++ b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + from typing import Optional import paddle @@ -49,17 +50,20 @@ class BlockWiseFP8Config(QuantConfigBase): return cls(weight_block_size) def get_quant_method(self, layer) -> Optional[QuantMethodBase]: - ''' + """ Get quantization method. - ''' + """ if isinstance(layer, FusedMoE): if self.use_deep_gemm: - from fastdeploy.model_executor.layers.moe.fused_moe_deepgemm_backend import \ - DeepGemmFusedMoeMethod + from fastdeploy.model_executor.layers.moe.fused_moe_deepgemm_backend import ( + DeepGemmFusedMoeMethod, + ) + return DeepGemmFusedMoeMethod(self) else: - from fastdeploy.model_executor.layers.moe.fused_moe_triton_backend import \ - BlockWiseFP8MoEMethod + from fastdeploy.model_executor.layers.moe.fused_moe_triton_backend import ( + BlockWiseFP8MoEMethod, + ) return BlockWiseFP8MoEMethod(self) else: return BlockWiseFP8LinearMethod(self) @@ -81,8 +85,8 @@ class BlockWiseFP8LinearMethod(QuantMethodBase): layer.weight_shape.reverse() layer.weight_scale = layer.create_parameter( shape=[ - (layer.output_size + self.quant_config.weight_block_size[0] - - 1) // self.quant_config.weight_block_size[0], + (layer.output_size + self.quant_config.weight_block_size[0] - 1) + // self.quant_config.weight_block_size[0], (layer.input_size + self.quant_config.weight_block_size[1] - 1) // self.quant_config.weight_block_size[1], ], @@ -93,8 +97,7 @@ class BlockWiseFP8LinearMethod(QuantMethodBase): def process_loaded_weights(self, layer, weights) -> None: weight_tensor = weights.transpose([1, 0]) - quanted_weight_tensor, weight_block_scale_tensor = ( - per_block_cast_to_fp8(weight_tensor)) + quanted_weight_tensor, weight_block_scale_tensor = per_block_cast_to_fp8(weight_tensor) layer.weight.copy_(quanted_weight_tensor, False) layer.weight_scale.set_value(weight_block_scale_tensor) @@ -113,10 +116,11 @@ class BlockWiseFP8LinearMethod(QuantMethodBase): def apply(self, layer, x): x, x_scale_tensor = fastdeploy.model_executor.ops.gpu.per_token_quant_padding( - x, self.quant_config.weight_block_size[0]) - 
linear_out = paddle.empty((x.shape[0], layer.output_size), - dtype=paddle.bfloat16) - import fastdeploy.model_executor.ops.gpu.deep_gemm as deep_gemm + x, self.quant_config.weight_block_size[0] + ) + linear_out = paddle.empty((x.shape[0], layer.output_size), dtype=paddle.bfloat16) + from fastdeploy.model_executor.ops.gpu import deep_gemm + deep_gemm.gemm_fp8_fp8_bf16_nt( (x, x_scale_tensor), (layer.weight, layer.weight_scale), diff --git a/fastdeploy/model_executor/layers/quantization/kv_cache.py b/fastdeploy/model_executor/layers/quantization/kv_cache.py index 54e2b8cbf..8cc77ae54 100644 --- a/fastdeploy/model_executor/layers/quantization/kv_cache.py +++ b/fastdeploy/model_executor/layers/quantization/kv_cache.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + from enum import Enum from typing import Optional @@ -29,6 +30,7 @@ class KvCacheQuantzationTypes(str, Enum): """ KvCacheQuantzationTypes """ + INT8 = "int8" FP8 = "float8_e4m3fn" INT8_ZP = "int8_zp" @@ -50,7 +52,7 @@ class KvCacheQuantConfig(QuantConfigBase): try: self.quant_type = KvCacheQuantzationTypes(kv_cache_quant_type) except ValueError: - raise ValueError(f'Invalid Kvcache type: {kv_cache_quant_type}') + raise ValueError(f"Invalid Kvcache type: {kv_cache_quant_type}") self.has_zero_point = "zp" in kv_cache_quant_type @@ -59,7 +61,7 @@ class KvCacheQuantConfig(QuantConfigBase): elif self.quant_type == KvCacheQuantzationTypes.FP8 or self.quant_type == KvCacheQuantzationTypes.FP8_ZP: self.max_bound = 448.0 else: - raise ValueError(f'Invalid Kvcache type: {kv_cache_quant_type}') + raise ValueError(f"Invalid Kvcache type: {kv_cache_quant_type}") def name(self) -> str: """ @@ -110,12 +112,12 @@ class KVCacheMethodBase(QuantMethodBase): """ load_scale """ - cache_k_scale_tensor = get_tensor( - state_dict.pop(self.cache_k_scale_name)).cast( - paddle.get_default_dtype()).reshape_([-1]) - cache_v_scale_tensor = get_tensor( - state_dict.pop(self.cache_v_scale_name)).cast( - paddle.get_default_dtype()).reshape_([-1]) + cache_k_scale_tensor = ( + get_tensor(state_dict.pop(self.cache_k_scale_name)).cast(paddle.get_default_dtype()).reshape_([-1]) + ) + cache_v_scale_tensor = ( + get_tensor(state_dict.pop(self.cache_v_scale_name)).cast(paddle.get_default_dtype()).reshape_([-1]) + ) cache_k_scale = self.cache_quant_config.max_bound / cache_k_scale_tensor cache_v_scale = self.cache_quant_config.max_bound / cache_v_scale_tensor @@ -138,13 +140,13 @@ class KVCacheMethodBase(QuantMethodBase): self.cache_v_zp_name = layer.prefix + ".cachev_matmul.activation_zero_point" if self.cache_quant_config.quant_type == KvCacheQuantzationTypes.INT8: - setattr(layer, "cache_quant_type_str", "cache_int8") - setattr(layer, "quant_max_bound", 127.0) - setattr(layer, "quant_min_bound", -127.0) + layer.cache_quant_type_str = "cache_int8" + layer.quant_max_bound = 127.0 + layer.quant_min_bound = -127.0 elif self.cache_quant_config.quant_type == KvCacheQuantzationTypes.FP8: - setattr(layer, "cache_quant_type_str", "cache_fp8") - setattr(layer, "quant_max_bound", 448.0) - setattr(layer, "quant_min_bound", -448.0) + layer.cache_quant_type_str = "cache_fp8" + layer.quant_max_bound = 448.0 + layer.quant_min_bound = -448.0 else: raise NotImplementedError(f"{self.cache_quant_config.quant_type} is not implemented") @@ -156,5 +158,4 @@ class KVCacheMethodBase(QuantMethodBase): """ apply """ - raise RuntimeError( - f"{self.__class__.__name__}.apply should not be called.") + raise 
RuntimeError(f"{self.__class__.__name__}.apply should not be called.") diff --git a/fastdeploy/model_executor/layers/quantization/mix_quant.py b/fastdeploy/model_executor/layers/quantization/mix_quant.py index 4868b346b..0c39cbc63 100644 --- a/fastdeploy/model_executor/layers/quantization/mix_quant.py +++ b/fastdeploy/model_executor/layers/quantization/mix_quant.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + from typing import Optional from fastdeploy.model_executor.layers.attention.attention import Attention @@ -51,26 +52,23 @@ class MixQuantConfig(QuantConfigBase): @classmethod def from_config(cls, config: dict) -> "MixQuantConfig": - return cls(config['dense_quant_type'], config['moe_quant_type'], - config.get('kv_cache_quant_type', None), - config.get('image_moe_quant_type', None)) + return cls( + config["dense_quant_type"], + config["moe_quant_type"], + config.get("kv_cache_quant_type", None), + config.get("image_moe_quant_type", None), + ) def get_quant_method(self, layer) -> Optional[QuantMethodBase]: if isinstance(layer, FusedMoE): if layer.moe_tag == "Image": - return get_quantization_config( - self.image_moe_quant_type).from_config( - {}).get_quant_method(layer) + return get_quantization_config(self.image_moe_quant_type).from_config({}).get_quant_method(layer) else: - return get_quantization_config( - self.moe_quant_type).from_config( - {}).get_quant_method(layer) + return get_quantization_config(self.moe_quant_type).from_config({}).get_quant_method(layer) elif isinstance(layer, Attention): if self.kv_cache_quant_type is not None: - return (get_quantization_config("kvcache").from_config( - self.kv_cache_quant_type).get_quant_method(layer)) + return get_quantization_config("kvcache").from_config(self.kv_cache_quant_type).get_quant_method(layer) else: return None else: - return get_quantization_config(self.dense_quant_type).from_config( - {}).get_quant_method(layer) + return get_quantization_config(self.dense_quant_type).from_config({}).get_quant_method(layer) diff --git a/fastdeploy/model_executor/layers/quantization/ops/__init__.py b/fastdeploy/model_executor/layers/quantization/ops/__init__.py index 082226713..63924f0bb 100644 --- a/fastdeploy/model_executor/layers/quantization/ops/__init__.py +++ b/fastdeploy/model_executor/layers/quantization/ops/__init__.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + from .cutlass_scaled_mm import cutlass_scaled_mm from .scaled_fp8_quant import scaled_fp8_quant diff --git a/fastdeploy/model_executor/layers/quantization/ops/cutlass_scaled_mm.py b/fastdeploy/model_executor/layers/quantization/ops/cutlass_scaled_mm.py index 984c4df2d..43ebba7b2 100644 --- a/fastdeploy/model_executor/layers/quantization/ops/cutlass_scaled_mm.py +++ b/fastdeploy/model_executor/layers/quantization/ops/cutlass_scaled_mm.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" + from typing import Optional import paddle @@ -20,12 +21,14 @@ import paddle import fastdeploy -def cutlass_scaled_mm(a: paddle.Tensor, - b: paddle.Tensor, - scale_a: paddle.Tensor, - scale_b: paddle.Tensor, - out_dtype: paddle.dtype, - bias: Optional[paddle.Tensor] = None) -> paddle.Tensor: +def cutlass_scaled_mm( + a: paddle.Tensor, + b: paddle.Tensor, + scale_a: paddle.Tensor, + scale_b: paddle.Tensor, + out_dtype: paddle.dtype, + bias: Optional[paddle.Tensor] = None, +) -> paddle.Tensor: """ `cutlass_scaled_mm` implements a fused version of `output = paddle.mm((scale_a * a), (scale_b * b)).to(out_dtype)` @@ -48,9 +51,8 @@ def cutlass_scaled_mm(a: paddle.Tensor, scale_a.shape * [1, 128] == a.shape scale_b.shape * [128, 128] == b.shape """ - assert (out_dtype == paddle.bfloat16 or out_dtype == paddle.float16) - assert bias is None or bias.shape[0] == b.shape[ - 0] and bias.dtype == out_dtype + assert out_dtype == paddle.bfloat16 or out_dtype == paddle.float16 + assert bias is None or bias.shape[0] == b.shape[0] and bias.dtype == out_dtype # Ensure input tensors have valid shapes # assert a.numel() > 0, "Input tensor 'a' must not be empty" # assert b.numel() > 0, "Input tensor 'b' must not be empty" @@ -59,12 +61,11 @@ def cutlass_scaled_mm(a: paddle.Tensor, m = a.shape[0] n = b.shape[0] - cutlass_compatible_b = (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0) + cutlass_compatible_b = b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0 assert cutlass_compatible_b out = paddle.empty([m, n], dtype=out_dtype) - fastdeploy.model_executor.ops.gpu.cutlass_scaled_mm( - out, a, b, scale_a, scale_b, bias) + fastdeploy.model_executor.ops.gpu.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias) return out @@ -100,7 +101,7 @@ def scaled_fp8_quant( scaling factor. """ # This code assumes batch_dim and num_tokens are flattened - assert (input.ndim == 2) + assert input.ndim == 2 shape = input.shape if num_token_padding: shape = (max(num_token_padding, input.shape[0]), shape[1]) @@ -109,18 +110,21 @@ def scaled_fp8_quant( if scale is None: if use_per_token_if_dynamic: scale = paddle.empty([shape[0], 1], dtype=paddle.float32) - from fastdeploy.model_executor.ops.gpu import \ - dynamic_per_token_scaled_fp8_quant + from fastdeploy.model_executor.ops.gpu import ( + dynamic_per_token_scaled_fp8_quant, + ) + dynamic_per_token_scaled_fp8_quant(output, input, scale, scale_ub) else: scale = paddle.zeros([1], dtype=paddle.float32) - from fastdeploy.model_executor.ops.gpu import \ - dynamic_scaled_fp8_quant + from fastdeploy.model_executor.ops.gpu import dynamic_scaled_fp8_quant + dynamic_scaled_fp8_quant(output, input, scale) else: # num_token_padding not implemented for this case # assert (scale.numel() == 1 or num_token_padding is None) from fastdeploy.model_executor.ops.gpu import static_scaled_fp8_quant + static_scaled_fp8_quant(output, input, scale) return output, scale diff --git a/fastdeploy/model_executor/layers/quantization/ops/scaled_fp8_quant.py b/fastdeploy/model_executor/layers/quantization/ops/scaled_fp8_quant.py index 3588f2bc2..50c3c6b43 100644 --- a/fastdeploy/model_executor/layers/quantization/ops/scaled_fp8_quant.py +++ b/fastdeploy/model_executor/layers/quantization/ops/scaled_fp8_quant.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + from typing import Optional import paddle @@ -49,7 +50,7 @@ def scaled_fp8_quant( scaling factor. 
""" # This code assumes batch_dim and num_tokens are flattened - assert (input.ndim == 2) + assert input.ndim == 2 shape = input.shape if num_token_padding: shape = (max(num_token_padding, input.shape[0]), shape[1]) @@ -58,18 +59,21 @@ def scaled_fp8_quant( if scale is None: if use_per_token_if_dynamic: scale = paddle.empty([shape[0], 1], dtype=paddle.float32) - from fastdeploy.model_executor.ops.gpu import \ - dynamic_per_token_scaled_fp8_quant + from fastdeploy.model_executor.ops.gpu import ( + dynamic_per_token_scaled_fp8_quant, + ) + dynamic_per_token_scaled_fp8_quant(output, input, scale, scale_ub) else: scale = paddle.zeros([1], dtype=paddle.float32) - from fastdeploy.model_executor.ops.gpu import \ - dynamic_scaled_fp8_quant + from fastdeploy.model_executor.ops.gpu import dynamic_scaled_fp8_quant + dynamic_scaled_fp8_quant(output, input, scale) else: # num_token_padding not implemented for this case # assert (scale.numel() == 1 or num_token_padding is None) from fastdeploy.model_executor.ops.gpu import static_scaled_fp8_quant + static_scaled_fp8_quant(output, input, scale) return output, scale diff --git a/fastdeploy/model_executor/layers/quantization/quant_base.py b/fastdeploy/model_executor/layers/quantization/quant_base.py index 40df4aaf9..aa7e065f4 100644 --- a/fastdeploy/model_executor/layers/quantization/quant_base.py +++ b/fastdeploy/model_executor/layers/quantization/quant_base.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + from abc import ABC, abstractmethod from typing import Any, Optional @@ -65,8 +66,7 @@ class QuantConfigBase(ABC): for key in keys: if key in config: return config[key] - raise ValueError(f"Cannot find any of {keys} in the model's " - "quantization config.") + raise ValueError(f"Cannot find any of {keys} in the model's " "quantization config.") @abstractmethod def get_quant_method(self, layer, prefix) -> Optional[QuantMethodBase]: diff --git a/fastdeploy/model_executor/layers/quantization/tensor_wise_fp8.py b/fastdeploy/model_executor/layers/quantization/tensor_wise_fp8.py index e2845af36..5841e9f35 100644 --- a/fastdeploy/model_executor/layers/quantization/tensor_wise_fp8.py +++ b/fastdeploy/model_executor/layers/quantization/tensor_wise_fp8.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + from typing import Optional from fastdeploy.model_executor.layers.moe import FusedMoE @@ -50,8 +51,10 @@ class TensorWiseFP8Config(QuantConfigBase): return method according to this config! """ if isinstance(layer, FusedMoE): - from fastdeploy.model_executor.layers.moe.fused_moe_triton_backend import \ - TensorWiseFP8MoEMethod + from fastdeploy.model_executor.layers.moe.fused_moe_triton_backend import ( + TensorWiseFP8MoEMethod, + ) + return TensorWiseFP8MoEMethod(self) else: return TensorWiseFP8LinearMethod(self) @@ -112,7 +115,9 @@ class TensorWiseFP8LinearMethod(QuantMethodBase): compute! 
""" from fastdeploy.model_executor.ops.gpu import ( - cutlass_fp8_fp8_half_gemm_fused, fused_hadamard_quant_fp8) + cutlass_fp8_fp8_half_gemm_fused, + fused_hadamard_quant_fp8, + ) fp8_x = fused_hadamard_quant_fp8(x, scale=self.act_scale) @@ -124,5 +129,6 @@ class TensorWiseFP8LinearMethod(QuantMethodBase): bias=None, scale=self.total_scale, output_dtype="bfloat16", - activation_type="identity") + activation_type="identity", + ) return linear_out diff --git a/fastdeploy/model_executor/layers/quantization/w4a8.py b/fastdeploy/model_executor/layers/quantization/w4a8.py index f8776d6c1..86ec0f405 100644 --- a/fastdeploy/model_executor/layers/quantization/w4a8.py +++ b/fastdeploy/model_executor/layers/quantization/w4a8.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + from typing import Optional from ..moe import FusedMoE @@ -36,7 +37,10 @@ class W4A8Config(QuantConfigBase): def get_quant_method(self, layer) -> Optional[QuantMethodBase]: if isinstance(layer, FusedMoE): - from fastdeploy.model_executor.layers.moe.fused_moe_cutlass_backend import CutlassW4A8MoEMethod + from fastdeploy.model_executor.layers.moe.fused_moe_cutlass_backend import ( + CutlassW4A8MoEMethod, + ) + return CutlassW4A8MoEMethod(self) else: raise ValueError(f"Unsupported layer type {type(layer)} for w4a8") diff --git a/fastdeploy/model_executor/layers/quantization/w4afp8.py b/fastdeploy/model_executor/layers/quantization/w4afp8.py index 0785f4ab9..cf8e19a68 100644 --- a/fastdeploy/model_executor/layers/quantization/w4afp8.py +++ b/fastdeploy/model_executor/layers/quantization/w4afp8.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + from typing import Optional import paddle @@ -69,13 +70,14 @@ class W4AFP8LinearMethod(QuantMethodBase): pass def process_loaded_weights(self, layer, weights) -> None: - quanted_weight_tensor, weight_scale_tensor = ( - fastdeploy.model_executor.ops.gpu. 
- scaled_gemm_f8_i4_f16_weight_quantize( - paddle.cast(weights, "float32").cpu(), - groupsize=-1, - scale_dtype="float16", - )) + ( + quanted_weight_tensor, + weight_scale_tensor, + ) = fastdeploy.model_executor.ops.gpu.scaled_gemm_f8_i4_f16_weight_quantize( + paddle.cast(weights, "float32").cpu(), + groupsize=-1, + scale_dtype="float16", + ) weight_scale_tensor = paddle.view(weight_scale_tensor, layer._dtype) layer.weight.set_value(quanted_weight_tensor) layer.weight_scale.set_value(weight_scale_tensor) @@ -87,11 +89,12 @@ class W4AFP8LinearMethod(QuantMethodBase): layer.weight_scale, zero_points=None, bias=layer.bias if layer.add_bias else None, - out_scale=self.quant_config.weight_scale_dict.get(layer.prefix + - ".weight_scale") - / (self.quant_config.act_scale_dict.get(layer.prefix + - ".activation_scale") * - QUANT_SCALING_FACTOR * QUANT_SCALING_FACTOR), + out_scale=self.quant_config.weight_scale_dict.get(layer.prefix + ".weight_scale") + / ( + self.quant_config.act_scale_dict.get(layer.prefix + ".activation_scale") + * QUANT_SCALING_FACTOR + * QUANT_SCALING_FACTOR + ), groupsize=0, out_dtype=layer._dtype, ) diff --git a/fastdeploy/model_executor/layers/quantization/w8a8.py b/fastdeploy/model_executor/layers/quantization/w8a8.py index 0d86789e0..3a4298528 100644 --- a/fastdeploy/model_executor/layers/quantization/w8a8.py +++ b/fastdeploy/model_executor/layers/quantization/w8a8.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + from typing import Optional import paddle @@ -30,8 +31,13 @@ class W8A8Config(QuantConfigBase): quantization config for weight 8bits and activation 8bits """ - def __init__(self, weight_scale_dict, act_scale_dict, use_gemm_dequant, - use_smooth_quant) -> None: + def __init__( + self, + weight_scale_dict, + act_scale_dict, + use_gemm_dequant, + use_smooth_quant, + ) -> None: super().__init__() self.weight_scale_dict = weight_scale_dict self.act_scale_dict = act_scale_dict @@ -73,27 +79,22 @@ class W8A8LinearMethod(QuantMethodBase): layer.weight_dtype = "int8" if self.quant_config.use_smooth_quant: self.smooth_quant_method.create_weights(layer) - weight_scale = self.quant_config.weight_scale_dict.get(layer.prefix + - ".weight_scale") - in_scale = self.quant_config.act_scale_dict.get(layer.prefix + - ".activation_scale") + weight_scale = self.quant_config.weight_scale_dict.get(layer.prefix + ".weight_scale") + in_scale = self.quant_config.act_scale_dict.get(layer.prefix + ".activation_scale") self.skip_quant = False if weight_scale is None or in_scale is None: self.skip_quant = True return max_range = 127.0 - linear_out_scale = paddle.to_tensor( - weight_scale / - (max_range * max_range * in_scale)).astype("float32") + linear_out_scale = paddle.to_tensor(weight_scale / (max_range * max_range * in_scale)).astype("float32") layer.linear_out_scale = layer.create_parameter( shape=[layer.embed_dim], dtype="float32", is_bias=False, default_initializer=paddle.nn.initializer.Constant(0), ) - layer.linear_out_scale.set_value( - convert_to_npu_dequant_scale(linear_out_scale)) + layer.linear_out_scale.set_value(convert_to_npu_dequant_scale(linear_out_scale)) def process_loaded_weights(self, layer, weights) -> None: if self.quant_config.use_smooth_quant: @@ -113,11 +114,13 @@ class W8A8LinearMethod(QuantMethodBase): return linear_out if self.quant_config.use_gemm_dequant: linear_out = fastdeploy.model_executor.ops.gpu.gemm_dequant( - x, layer.weight, layer.linear_out_scale, layer._dtype) + x, 
layer.weight, layer.linear_out_scale, layer._dtype + ) else: linear_out = paddle.matmul(x, layer.weight, False, True) linear_out = fastdeploy.model_executor.ops.gpu.dequant_int8( - linear_out, layer.linear_out_scale, layer._dtype) + linear_out, layer.linear_out_scale, layer._dtype + ) return linear_out @@ -149,8 +152,7 @@ class SmoothQuantLinearMethod(QuantMethodBase): def process_loaded_weights(self, layer, weights) -> None: if layer.shift_key in layer.state_dict: - shift_tensor = get_tensor(layer.state_dict.pop( - layer.shift_key)).astype(paddle.get_default_dtype()) + shift_tensor = get_tensor(layer.state_dict.pop(layer.shift_key)).astype(paddle.get_default_dtype()) else: shift_tensor = paddle.zeros( shape=layer.linear_shift_shape, @@ -158,8 +160,7 @@ class SmoothQuantLinearMethod(QuantMethodBase): ) layer.linear_shift.set_value(shift_tensor) if layer.smooth_key in layer.state_dict: - smooth_tensor = get_tensor(layer.state_dict.pop( - layer.smooth_key)).astype(paddle.get_default_dtype()) + smooth_tensor = get_tensor(layer.state_dict.pop(layer.smooth_key)).astype(paddle.get_default_dtype()) else: smooth_tensor = paddle.ones( shape=[layer.linear_smooth_shape], diff --git a/fastdeploy/model_executor/layers/quantization/weight_only.py b/fastdeploy/model_executor/layers/quantization/weight_only.py index 0a48c60f3..60756f7d0 100644 --- a/fastdeploy/model_executor/layers/quantization/weight_only.py +++ b/fastdeploy/model_executor/layers/quantization/weight_only.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + import os from abc import abstractmethod from typing import Optional @@ -42,8 +43,7 @@ class WeightOnlyConfig(QuantConfigBase): self.algo = algo # arch (int): The compute arch for target device. For example, A100 is 80, v100 is 70, # if you do not assign arch, we will get arch from your device, default: None. 
- self.weight_only_linear_arch = os.getenv( - "FLAGS_weight_only_linear_arch") + self.weight_only_linear_arch = os.getenv("FLAGS_weight_only_linear_arch") if self.weight_only_linear_arch is not None: self.weight_only_linear_arch = int(self.weight_only_linear_arch) self.quant_max_bound = 0 @@ -60,47 +60,62 @@ class WeightOnlyConfig(QuantConfigBase): def get_quant_method(self, layer) -> Optional[QuantMethodBase]: if current_platform.is_xpu(): - from fastdeploy.model_executor.layers.backends import \ - XPUWeightOnlyLinearMethod - from fastdeploy.model_executor.layers.moe.fused_moe_xpu_backend import \ - XPUWeightOnlyMoEMethod + from fastdeploy.model_executor.layers.backends import ( + XPUWeightOnlyLinearMethod, + ) + from fastdeploy.model_executor.layers.moe.fused_moe_xpu_backend import ( + XPUWeightOnlyMoEMethod, + ) + if isinstance(layer, FusedMoE): return XPUWeightOnlyMoEMethod(self) else: return XPUWeightOnlyLinearMethod(self) elif current_platform.is_gcu(): from fastdeploy.model_executor.layers.backends import ( - GCUWeightOnlyLinearMethod, GCUWeightOnlyMoEMethod) + GCUWeightOnlyLinearMethod, + GCUWeightOnlyMoEMethod, + ) + if isinstance(layer, FusedMoE): return GCUWeightOnlyMoEMethod(self) else: return GCUWeightOnlyLinearMethod(self) elif current_platform.is_dcu(): if isinstance(layer, FusedMoE): - from fastdeploy.model_executor.layers.backends import \ - DCUTritonWeightOnlyMoEMethod + from fastdeploy.model_executor.layers.backends import ( + DCUTritonWeightOnlyMoEMethod, + ) + return DCUTritonWeightOnlyMoEMethod(self) else: - from fastdeploy.model_executor.layers.backends import \ - DCUWeightOnlyLinearMethod + from fastdeploy.model_executor.layers.backends import ( + DCUWeightOnlyLinearMethod, + ) + return DCUWeightOnlyLinearMethod(self) else: if isinstance(layer, FusedMoE): if layer.use_method == "cutlass": - from fastdeploy.model_executor.layers.moe.fused_moe_cutlass_backend import \ - CutlassWeightOnlyMoEMethod + from fastdeploy.model_executor.layers.moe.fused_moe_cutlass_backend import ( + CutlassWeightOnlyMoEMethod, + ) + return CutlassWeightOnlyMoEMethod(self) elif layer.use_method == "triton": - from fastdeploy.model_executor.layers.moe.fused_moe_triton_backend import \ - TritonWeightOnlyMoEMethod + from fastdeploy.model_executor.layers.moe.fused_moe_triton_backend import ( + TritonWeightOnlyMoEMethod, + ) + return TritonWeightOnlyMoEMethod(self) elif layer.use_method == "marlin": - from fastdeploy.model_executor.layers.moe.fused_moe_marlin_backend import \ - MarlinWeightOnlyMoEMethod + from fastdeploy.model_executor.layers.moe.fused_moe_marlin_backend import ( + MarlinWeightOnlyMoEMethod, + ) + return MarlinWeightOnlyMoEMethod(self) else: - raise ValueError( - f"Unsupported MOE backend {layer.use_method}") + raise ValueError(f"Unsupported MOE backend {layer.use_method}") else: return GPUWeightOnlyLinearMethod(self) @@ -110,7 +125,9 @@ class WINT8Config(WeightOnlyConfig): weight only int8 config """ - def __init__(self, ) -> None: + def __init__( + self, + ) -> None: super().__init__("weight_only_int8") @classmethod @@ -126,7 +143,9 @@ class WINT4Config(WeightOnlyConfig): weight only int4 config """ - def __init__(self, ) -> None: + def __init__( + self, + ) -> None: super().__init__("weight_only_int4") @classmethod @@ -174,8 +193,7 @@ class WeightOnlyLinearMethod(QuantMethodBase): weight=layer.weight, bias=layer.bias if layer.add_bias else None, weight_scale=layer.weight_scale, - weight_dtype="int8" - if self.quant_config.name() == "wint8" else "int4", + weight_dtype=("int8" if 
self.quant_config.name() == "wint8" else "int4"), arch=self.quant_config.weight_only_linear_arch, ) return linear_out @@ -205,8 +223,7 @@ class GPUWeightOnlyLinearMethod(WeightOnlyLinearMethod): quant_weight = get_tensor(state_dict.pop(layer.weight_key)) weight_scale = get_tensor(state_dict.pop(layer.weight_scale_key)) layer.weight.set_value(quant_weight) - layer.weight_scale.set_value( - weight_scale.astype(paddle.get_default_dtype())) + layer.weight_scale.set_value(weight_scale.astype(paddle.get_default_dtype())) def process_loaded_weights(self, layer, weight) -> None: @@ -217,5 +234,4 @@ class GPUWeightOnlyLinearMethod(WeightOnlyLinearMethod): ) layer.weight.set_value(quanted_weight_tensor) - layer.weight_scale.set_value( - weight_scale_tensor.astype(paddle.get_default_dtype())) + layer.weight_scale.set_value(weight_scale_tensor.astype(paddle.get_default_dtype())) diff --git a/fastdeploy/model_executor/layers/quantization/wfp8afp8.py b/fastdeploy/model_executor/layers/quantization/wfp8afp8.py index 34e2b7845..60339b2ae 100644 --- a/fastdeploy/model_executor/layers/quantization/wfp8afp8.py +++ b/fastdeploy/model_executor/layers/quantization/wfp8afp8.py @@ -13,14 +13,19 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + from typing import Optional import paddle from fastdeploy.model_executor.layers.quantization.ops import ( - cutlass_scaled_mm, scaled_fp8_quant) + cutlass_scaled_mm, + scaled_fp8_quant, +) from fastdeploy.model_executor.layers.quantization.quant_base import ( - QuantConfigBase, QuantMethodBase) + QuantConfigBase, + QuantMethodBase, +) class WFP8AFP8Config(QuantConfigBase): @@ -37,21 +42,18 @@ class WFP8AFP8Config(QuantConfigBase): self.quant_round_type = 1 def name(self) -> str: - """ - """ + """ """ return "wfp8afp8" @classmethod def from_config(cls, config: dict) -> "WFP8AFP8Config": - """ - """ + """ """ weight_scale_dict = config.get("weight_scale_dict", None) act_scale_dict = config.get("act_scale_dict", None) return cls(weight_scale_dict, act_scale_dict) def get_quant_method(self, layer) -> Optional[QuantMethodBase]: - """ - """ + """ """ return WFP8AFP8LinearMethod(self) @@ -68,8 +70,7 @@ class WFP8AFP8LinearMethod(QuantMethodBase): self.quant_config = quant_config def create_weights(self, layer): - """ - """ + """ """ layer.weight_shape.reverse() layer.weight_dtype = "float8_e4m3fn" # TODO(YuanRisheng): set weight logic should be moved to process_loaded_weights func @@ -82,8 +83,7 @@ class WFP8AFP8LinearMethod(QuantMethodBase): ) def process_loaded_weights(self, layer, weights) -> None: - """ - """ + """ """ if self.skip_quant: weight_tensor = weights.cast(layer._dtype) layer.weight.set_value(weight_tensor) @@ -99,18 +99,21 @@ class WFP8AFP8LinearMethod(QuantMethodBase): layer.weight_scale.set_value(weight_scale) def apply(self, layer, x): - """ - """ + """ """ if self.skip_quant: linear_out = paddle.matmul(x, layer.weight, False, True) return linear_out if self.use_per_token_if_dynamic: out_type = x.dtype - a_q, a_scales = scaled_fp8_quant( - x, use_per_token_if_dynamic=self.use_per_token_if_dynamic) - linear_out = cutlass_scaled_mm(a_q, layer.weight, a_scales, - layer.weight_scale, out_type, - layer.bias) + a_q, a_scales = scaled_fp8_quant(x, use_per_token_if_dynamic=self.use_per_token_if_dynamic) + linear_out = cutlass_scaled_mm( + a_q, + layer.weight, + a_scales, + layer.weight_scale, + out_type, + layer.bias, + ) else: raise NotImplementedError return linear_out diff --git 
a/fastdeploy/model_executor/layers/quantization/wint2.py b/fastdeploy/model_executor/layers/quantization/wint2.py index 97d676f4b..2586f719f 100644 --- a/fastdeploy/model_executor/layers/quantization/wint2.py +++ b/fastdeploy/model_executor/layers/quantization/wint2.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + from typing import Optional from ..moe import FusedMoE @@ -79,29 +80,22 @@ class WINT2Config(QuantConfigBase): """ dense_quant_type = config.get("dense_quant_config", "wint8") - dense_quant_granularity = config.get("dense_quant_granularity", - "per_channel") + dense_quant_granularity = config.get("dense_quant_granularity", "per_channel") moe_quant_config = config.get("moe_quant_config", {}) moe_quant_type = moe_quant_config.get("quant_type", "w4w2") moe_w4_quant_config = moe_quant_config.get("moe_w4_quant_config", {}) - moe_w4_quant_type = moe_w4_quant_config.get("quant_type", - "wint4") - moe_w4_quant_granularity = moe_w4_quant_config.get( - "quant_granularity", "per_channel") - moe_w4_quant_start_layer = moe_w4_quant_config.get( - "quant_start_layer", 0) + moe_w4_quant_type = moe_w4_quant_config.get("quant_type", "wint4") + moe_w4_quant_granularity = moe_w4_quant_config.get("quant_granularity", "per_channel") + moe_w4_quant_start_layer = moe_w4_quant_config.get("quant_start_layer", 0) moe_w4_quant_end_layer = moe_w4_quant_config.get("quant_end_layer", 6) moe_w2_quant_config = moe_quant_config.get("moe_w2_quant_config", {}) moe_w2_quant_type = moe_w2_quant_config.get("quant_type", "wint2") - moe_w2_quant_granularity = moe_w2_quant_config.get( - "quant_granularity", "pp_acc") - moe_w2_quant_group_size = moe_w2_quant_config.get( - "quant_group_size", 0) - moe_w2_quant_start_layer = moe_w2_quant_config.get( - "quant_start_layer", 0) + moe_w2_quant_granularity = moe_w2_quant_config.get("quant_granularity", "pp_acc") + moe_w2_quant_group_size = moe_w2_quant_config.get("quant_group_size", 0) + moe_w2_quant_start_layer = moe_w2_quant_config.get("quant_start_layer", 0) moe_w2_quant_end_layer = moe_w2_quant_config.get("quant_end_layer", 0) return cls( @@ -130,13 +124,12 @@ class WINT2Config(QuantConfigBase): """ if isinstance(layer, FusedMoE): if layer.layer_idx <= self.moe_w4_quant_end_layer: - return get_quantization_config( - self.moe_w4_quant_type).from_config( - {}).get_quant_method(layer) + return get_quantization_config(self.moe_w4_quant_type).from_config({}).get_quant_method(layer) else: - from fastdeploy.model_executor.layers.moe.fused_moe_wint2_backend import \ - CutlassWint2FusedMoeMethod + from fastdeploy.model_executor.layers.moe.fused_moe_wint2_backend import ( + CutlassWint2FusedMoeMethod, + ) + return CutlassWint2FusedMoeMethod(self) else: - return get_quantization_config(self.dense_quant_type).from_config( - {}).get_quant_method(layer) + return get_quantization_config(self.dense_quant_type).from_config({}).get_quant_method(layer) diff --git a/fastdeploy/model_executor/layers/rotary_embedding.py b/fastdeploy/model_executor/layers/rotary_embedding.py index 18bb1be33..4c06feeab 100644 --- a/fastdeploy/model_executor/layers/rotary_embedding.py +++ b/fastdeploy/model_executor/layers/rotary_embedding.py @@ -18,7 +18,7 @@ import math from typing import Optional, Tuple import paddle -import paddle.nn as nn +from paddle import nn from fastdeploy.config import ModelConfig from fastdeploy.platforms import current_platform @@ -30,7 +30,6 @@ from .utils import CpuGuard class ErnieRotaryEmbedding: - def 
__init__(self, rotary_dim, base, partial_rotary_factor): """ Pre-calculate rotary position embedding for position_ids. @@ -41,45 +40,36 @@ class ErnieRotaryEmbedding: def __call__(self, position_ids): bsz, max_seq_len = position_ids.shape[:2] - inv_freq = self.base**( - -paddle.arange(0, self.rotary_dim, 2, dtype="float32") / - self.rotary_dim) + inv_freq = self.base ** (-paddle.arange(0, self.rotary_dim, 2, dtype="float32") / self.rotary_dim) partial_rotary_position_ids = position_ids / self.partial_rotary_factor - freqs = paddle.einsum("ij,k->ijk", - partial_rotary_position_ids.cast("float32"), - inv_freq) - if paddle.is_compiled_with_xpu( - ) or paddle.is_compiled_with_custom_device("iluvatar_gpu"): + freqs = paddle.einsum("ij,k->ijk", partial_rotary_position_ids.cast("float32"), inv_freq) + if paddle.is_compiled_with_xpu() or paddle.is_compiled_with_custom_device("iluvatar_gpu"): # shape: [B, S, D] - rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, self.rotary_dim), - dtype="float32") - emb = paddle.stack([freqs, freqs], axis=-1).reshape( - (bsz, max_seq_len, self.rotary_dim)) + rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, self.rotary_dim), dtype="float32") + emb = paddle.stack([freqs, freqs], axis=-1).reshape((bsz, max_seq_len, self.rotary_dim)) elif current_platform.is_gcu(): # shape: [B, S, D] rot_emb = paddle.concat([freqs.cos(), freqs.sin()], axis=-1) return rot_emb else: # shape: [B, S, D/2] - rot_emb = paddle.zeros( - (2, bsz, max_seq_len, 1, self.rotary_dim // 2), - dtype="float32") - emb = paddle.stack([freqs], axis=-1).reshape( - (bsz, max_seq_len, self.rotary_dim // 2)) + rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, self.rotary_dim // 2), dtype="float32") + emb = paddle.stack([freqs], axis=-1).reshape((bsz, max_seq_len, self.rotary_dim // 2)) # shape: [B, S, 1, D] emb = paddle.unsqueeze(emb, 2) rot_emb[0] = paddle.cos(emb) rot_emb[1] = paddle.sin(emb) if paddle.is_compiled_with_custom_device("npu"): - return (paddle.concat([rot_emb, rot_emb], axis=3).transpose( - [0, 1, 2, 4, - 3]).reshape([2, bsz, max_seq_len, 1, self.rotary_dim])) + return ( + paddle.concat([rot_emb, rot_emb], axis=3) + .transpose([0, 1, 2, 4, 3]) + .reshape([2, bsz, max_seq_len, 1, self.rotary_dim]) + ) else: return rot_emb class QwenRotaryEmbedding: - def __init__(self, rotary_dim, base, partial_rotary_factor): """ Pre-calculate rotary position embedding for position_ids. 
@@ -90,22 +80,17 @@ class QwenRotaryEmbedding: def __call__(self, position_ids): bsz, max_seq_len = position_ids.shape[:2] - rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, self.rotary_dim), - dtype="float32") - inv_freq = self.base**( - -paddle.arange(0, self.rotary_dim, 2, dtype="float32") / - self.rotary_dim) + rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, self.rotary_dim), dtype="float32") + inv_freq = self.base ** (-paddle.arange(0, self.rotary_dim, 2, dtype="float32") / self.rotary_dim) # shape: [B, S, D/2] - freqs = paddle.einsum("ij,k->ijk", position_ids.cast("float32"), - inv_freq) + freqs = paddle.einsum("ij,k->ijk", position_ids.cast("float32"), inv_freq) if current_platform.is_gcu(): # shape: [B, S, D] rot_emb = paddle.concat([freqs.cos(), freqs.sin()], axis=-1) return rot_emb # shape: [B, S, 1, D] - emb = paddle.concat([freqs, freqs], axis=-1).reshape( - (bsz, max_seq_len, 1, self.rotary_dim)) + emb = paddle.concat([freqs, freqs], axis=-1).reshape((bsz, max_seq_len, 1, self.rotary_dim)) rot_emb[0] = paddle.cos(emb) rot_emb[1] = paddle.sin(emb) @@ -114,46 +99,30 @@ class QwenRotaryEmbedding: def yarn_get_mscale(scale=1, mscale=1): - """ - """ + """ """ if scale <= 1: return 1.0 return 0.1 * mscale * math.log(scale) + 1.0 -def yarn_find_correction_dim(num_rotations, - dim, - base=10000, - max_position_embeddings=2048): - """ - """ - return (dim * math.log(max_position_embeddings / - (num_rotations * 2 * math.pi))) / (2 * - math.log(base)) +def yarn_find_correction_dim(num_rotations, dim, base=10000, max_position_embeddings=2048): + """ """ + return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base)) -def yarn_find_correction_range(low_rot, - high_rot, - dim, - base=10000, - max_position_embeddings=2048): - """ - """ - low = math.floor( - yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)) - high = math.ceil( - yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)) +def yarn_find_correction_range(low_rot, high_rot, dim, base=10000, max_position_embeddings=2048): + """ """ + low = math.floor(yarn_find_correction_dim(low_rot, dim, base, max_position_embeddings)) + high = math.ceil(yarn_find_correction_dim(high_rot, dim, base, max_position_embeddings)) return max(low, 0), min(high, dim - 1) # Clamp values just in case def yarn_linear_ramp_mask(min, max, dim): - """ - """ + """ """ if min == max: max += 0.001 # Prevent singularity - linear_func = (paddle.arange(dim, dtype=paddle.float32) - min) / (max - - min) + linear_func = (paddle.arange(dim, dtype=paddle.float32) - min) / (max - min) ramp_func = paddle.clip(linear_func, 0, 1) return ramp_func @@ -205,9 +174,10 @@ class DeepseekScalingRotaryEmbedding(nn.Layer): self.beta_slow = beta_slow # Get n-d magnitude scaling corrected for interpolation. 
self.mscale = float( - yarn_get_mscale(self.scaling_factor, float(mscale)) / - yarn_get_mscale(self.scaling_factor, float(mscale_all_dim)) * - attn_factor) + yarn_get_mscale(self.scaling_factor, float(mscale)) + / yarn_get_mscale(self.scaling_factor, float(mscale_all_dim)) + * attn_factor + ) cache = self._compute_cos_sin_cache() @@ -215,27 +185,29 @@ class DeepseekScalingRotaryEmbedding(nn.Layer): self.register_buffer("cos_sin_cache", cache, persistable=True) def _compute_inv_freq(self, scaling_factor: float) -> paddle.Tensor: - pos_freqs = self.base**( - paddle.arange(0, self.rotary_dim, 2, dtype=paddle.float32) / - self.rotary_dim) + pos_freqs = self.base ** (paddle.arange(0, self.rotary_dim, 2, dtype=paddle.float32) / self.rotary_dim) inv_freq_extrapolation = 1.0 / pos_freqs inv_freq_interpolation = 1.0 / (scaling_factor * pos_freqs) - low, high = yarn_find_correction_range(self.beta_fast, self.beta_slow, - self.rotary_dim, self.base, - self.max_position_embeddings) + low, high = yarn_find_correction_range( + self.beta_fast, + self.beta_slow, + self.rotary_dim, + self.base, + self.max_position_embeddings, + ) # Get n-d rotational scaling corrected for extrapolation - inv_freq_mask = (1 - yarn_linear_ramp_mask( - low, high, self.rotary_dim // 2)) * self.extrapolation_factor - inv_freq = inv_freq_interpolation * ( - 1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask + inv_freq_mask = (1 - yarn_linear_ramp_mask(low, high, self.rotary_dim // 2)) * self.extrapolation_factor + inv_freq = inv_freq_interpolation * (1 - inv_freq_mask) + inv_freq_extrapolation * inv_freq_mask return inv_freq def _compute_cos_sin_cache(self) -> paddle.Tensor: inv_freq = self._compute_inv_freq(self.scaling_factor) - t = paddle.arange(self.max_position_embeddings * self.scaling_factor, - dtype=paddle.float32) + t = paddle.arange( + self.max_position_embeddings * self.scaling_factor, + dtype=paddle.float32, + ) freqs = paddle.einsum("i,j->ij", t, inv_freq) cos = freqs.cos() * self.mscale sin = freqs.sin() * self.mscale @@ -248,12 +220,9 @@ class DeepseekScalingRotaryEmbedding(nn.Layer): query: paddle.Tensor, key: paddle.Tensor, ) -> Tuple[paddle.Tensor, paddle.Tensor]: - """ - """ + """ """ # In-place operations that update the query and key tensors. - fused_rotary_position_encoding(query, key, position_ids, - self.cos_sin_cache, self.rotary_dim, - False) + fused_rotary_position_encoding(query, key, position_ids, self.cos_sin_cache, self.rotary_dim, False) return query, key @@ -271,12 +240,10 @@ def get_rope_impl( architecture = model_config.architectures[0] if model_config is None or architecture.startswith("Qwen"): - rotary_emb_layer = QwenRotaryEmbedding(rotary_dim, base, - partial_rotary_factor) + rotary_emb_layer = QwenRotaryEmbedding(rotary_dim, base, partial_rotary_factor) rotary_emb = rotary_emb_layer(position_ids) else: - rotary_emb_layer = ErnieRotaryEmbedding(rotary_dim, base, - partial_rotary_factor) + rotary_emb_layer = ErnieRotaryEmbedding(rotary_dim, base, partial_rotary_factor) rotary_emb = rotary_emb_layer(position_ids) return rotary_emb @@ -293,9 +260,8 @@ def get_rope_xpu( """ with CpuGuard(): position_ids = position_ids.cpu() - rotary_emb = get_rope_impl(rotary_dim, base, position_ids, - model_config, partial_rotary_factor) - return rotary_emb.to('xpu') + rotary_emb = get_rope_impl(rotary_dim, base, position_ids, model_config, partial_rotary_factor) + return rotary_emb.to("xpu") def get_rope( @@ -324,17 +290,20 @@ def get_rope( Default: 1 (apply to all dimensions). 
""" if current_platform.is_xpu(): - return get_rope_xpu(rotary_dim, base, position_ids, model_config, - partial_rotary_factor) + return get_rope_xpu(rotary_dim, base, position_ids, model_config, partial_rotary_factor) else: - return get_rope_impl(rotary_dim, base, position_ids, model_config, - partial_rotary_factor) + return get_rope_impl(rotary_dim, base, position_ids, model_config, partial_rotary_factor) class ErnieVlRotaryEmbedding3D: - - def __init__(self, rotary_dim, base, partial_rotary_factor, max_position, - freq_allocation): + def __init__( + self, + rotary_dim, + base, + partial_rotary_factor, + max_position, + freq_allocation, + ): self.rotary_dim = rotary_dim self.base = base self.paritial_rotary_factor = partial_rotary_factor @@ -342,36 +311,31 @@ class ErnieVlRotaryEmbedding3D: self.freq_allocation = freq_allocation def __call__(self, position_ids): - rot_emb = paddle.zeros( - (2, 1, self.max_position, 1, self.rotary_dim // 2), - dtype="float32") + rot_emb = paddle.zeros((2, 1, self.max_position, 1, self.rotary_dim // 2), dtype="float32") # position_ids_3d: [bsz, seq_len, 3] position_ids_3d = paddle.tile( - paddle.arange(self.max_position, - dtype="int64").unsqueeze(0).unsqueeze(-1), [1, 1, 3]) + paddle.arange(self.max_position, dtype="int64").unsqueeze(0).unsqueeze(-1), + [1, 1, 3], + ) - position_ids_3d[:, :position_ids.shape[1], :] = position_ids + position_ids_3d[:, : position_ids.shape[1], :] = position_ids # import pdb;pdb.set_trace() # position_ids: [bsz, seq_len] - position_ids = paddle.arange(0, self.max_position, 1, - dtype="float32").reshape((1, -1)) + position_ids = paddle.arange(0, self.max_position, 1, dtype="float32").reshape((1, -1)) position_ids = position_ids / self.paritial_rotary_factor indices = paddle.arange(0, self.rotary_dim, 2, dtype="float32") - indices = 1 / self.base**(indices / self.rotary_dim) + indices = 1 / self.base ** (indices / self.rotary_dim) # sinusoid_inp: [bsz, seq_len, 1, head_dim // 2] sinusoid_inp = position_ids.unsqueeze(-1) * indices.unsqueeze(0) # pos_emb: [bsz, seq_len, 1, head_dim] - pos_emb = paddle.concat( - [paddle.sin(sinusoid_inp), - paddle.cos(sinusoid_inp)], axis=-1) + pos_emb = paddle.concat([paddle.sin(sinusoid_inp), paddle.cos(sinusoid_inp)], axis=-1) # pos_emb: [bsz, 1, seq_len, head_dim] - pos_emb = paddle.reshape(pos_emb, - (-1, 1, self.max_position, self.rotary_dim)) + pos_emb = paddle.reshape(pos_emb, (-1, 1, self.max_position, self.rotary_dim)) # pos_emb: [bsz, seq_len, 1, head_dim] pos_emb = pos_emb.transpose([0, 2, 1, 3]) # sin: [bsz, seq_len, 1, head_dim // 2] @@ -388,35 +352,29 @@ class ErnieVlRotaryEmbedding3D: tmp_pos_id_2 = position_ids_3d[..., 2].squeeze().astype("int64") sin_bsz = paddle.index_select(sin, index=batch_indices, axis=0) - sin_t = paddle.index_select(sin_bsz, index=tmp_pos_id_0, - axis=1)[:, :, :, -self.freq_allocation:] - sin_h = paddle.index_select(sin_bsz, index=tmp_pos_id_1, - axis=1)[:, :, :, :self.rotary_dim // 2 - - self.freq_allocation:2] - sin_w = paddle.index_select(sin_bsz, index=tmp_pos_id_2, - axis=1)[:, :, :, 1:self.rotary_dim // 2 - - self.freq_allocation:2] - sin_hw = paddle.stack([sin_h, sin_w], - axis=-1).reshape(sin_h.shape[:-1] + - [sin_h.shape[-1] * 2]) - sin_thw = paddle.concat([sin_hw, sin_t], axis=-1) # noqa + sin_t = paddle.index_select(sin_bsz, index=tmp_pos_id_0, axis=1)[:, :, :, -self.freq_allocation :] + sin_h = paddle.index_select(sin_bsz, index=tmp_pos_id_1, axis=1)[ + :, :, :, : self.rotary_dim // 2 - self.freq_allocation : 2 + ] + sin_w = 
paddle.index_select(sin_bsz, index=tmp_pos_id_2, axis=1)[ + :, :, :, 1 : self.rotary_dim // 2 - self.freq_allocation : 2 + ] + sin_hw = paddle.stack([sin_h, sin_w], axis=-1).reshape(sin_h.shape[:-1] + [sin_h.shape[-1] * 2]) + sin_thw = paddle.concat([sin_hw, sin_t], axis=-1) cos_bsz = paddle.index_select(cos, index=batch_indices, axis=0) - cos_t = paddle.index_select(cos_bsz, index=tmp_pos_id_0, - axis=1)[:, :, :, -self.freq_allocation:] - cos_h = paddle.index_select(cos_bsz, index=tmp_pos_id_1, - axis=1)[:, :, :, :self.rotary_dim // 2 - - self.freq_allocation:2] - cos_w = paddle.index_select(cos_bsz, index=tmp_pos_id_2, - axis=1)[:, :, :, 1:self.rotary_dim // 2 - - self.freq_allocation:2] - cos_hw = paddle.stack([cos_h, cos_w], - axis=-1).reshape(cos_h.shape[:-1] + - [cos_h.shape[-1] * 2]) - cos_thw = paddle.concat([cos_hw, cos_t], axis=-1) # noqa + cos_t = paddle.index_select(cos_bsz, index=tmp_pos_id_0, axis=1)[:, :, :, -self.freq_allocation :] + cos_h = paddle.index_select(cos_bsz, index=tmp_pos_id_1, axis=1)[ + :, :, :, : self.rotary_dim // 2 - self.freq_allocation : 2 + ] + cos_w = paddle.index_select(cos_bsz, index=tmp_pos_id_2, axis=1)[ + :, :, :, 1 : self.rotary_dim // 2 - self.freq_allocation : 2 + ] + cos_hw = paddle.stack([cos_h, cos_w], axis=-1).reshape(cos_h.shape[:-1] + [cos_h.shape[-1] * 2]) + cos_thw = paddle.concat([cos_hw, cos_t], axis=-1) - rot_emb[0] = cos_thw # noqa - rot_emb[1] = sin_thw # noqa + rot_emb[0] = cos_thw + rot_emb[1] = sin_thw return rot_emb @@ -446,9 +404,8 @@ def get_rope_3d( max_position: Maximum position index to precompute. freq_allocation: Number of rotary dimensions allocated to temporal axis """ - rotary_emb3d_layer = ErnieVlRotaryEmbedding3D(rotary_dim, base, - partial_rotary_factor, - max_position, - freq_allocation) + rotary_emb3d_layer = ErnieVlRotaryEmbedding3D( + rotary_dim, base, partial_rotary_factor, max_position, freq_allocation + ) rotary_emb_3d = rotary_emb3d_layer(position_ids) return rotary_emb_3d diff --git a/fastdeploy/model_executor/layers/sample/__init__.py b/fastdeploy/model_executor/layers/sample/__init__.py index 373e64947..387091e47 100644 --- a/fastdeploy/model_executor/layers/sample/__init__.py +++ b/fastdeploy/model_executor/layers/sample/__init__.py @@ -11,6 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""" +""" " sample """ diff --git a/fastdeploy/model_executor/layers/sample/ops/__init__.py b/fastdeploy/model_executor/layers/sample/ops/__init__.py index 37c803ca3..16eb320b4 100644 --- a/fastdeploy/model_executor/layers/sample/ops/__init__.py +++ b/fastdeploy/model_executor/layers/sample/ops/__init__.py @@ -15,7 +15,9 @@ """ from .apply_penalty_multi_scores import ( - apply_penalty_multi_scores, apply_speculative_penalty_multi_scores) + apply_penalty_multi_scores, + apply_speculative_penalty_multi_scores, +) from .top_k_top_p_sampling import top_k_top_p_sampling __all__ = [ diff --git a/fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py b/fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py index 392b13398..1123f112b 100644 --- a/fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py +++ b/fastdeploy/model_executor/layers/sample/ops/apply_penalty_multi_scores.py @@ -37,8 +37,8 @@ def apply_penalty_multi_scores( apply_penalty_multi_scores """ if current_platform.is_cuda(): - from fastdeploy.model_executor.ops.gpu import \ - get_token_penalty_multi_scores + from fastdeploy.model_executor.ops.gpu import get_token_penalty_multi_scores + logits = get_token_penalty_multi_scores( pre_token_ids, prompt_ids, @@ -54,8 +54,8 @@ def apply_penalty_multi_scores( eos_token_ids, ) elif current_platform.is_xpu(): - from fastdeploy.model_executor.ops.xpu import \ - get_token_penalty_multi_scores + from fastdeploy.model_executor.ops.xpu import get_token_penalty_multi_scores + logits = get_token_penalty_multi_scores( pre_token_ids, logits, @@ -69,8 +69,10 @@ def apply_penalty_multi_scores( eos_token_ids, ) elif current_platform.is_iluvatar(): - from fastdeploy.model_executor.ops.iluvatar import \ - get_token_penalty_multi_scores + from fastdeploy.model_executor.ops.iluvatar import ( + get_token_penalty_multi_scores, + ) + logits = get_token_penalty_multi_scores( pre_token_ids, prompt_ids, @@ -86,8 +88,8 @@ def apply_penalty_multi_scores( eos_token_ids, ) elif current_platform.is_gcu(): - from fastdeploy.model_executor.ops.gcu import \ - get_token_penalty_multi_scores + from fastdeploy.model_executor.ops.gcu import get_token_penalty_multi_scores + logits = get_token_penalty_multi_scores( pre_token_ids, logits, @@ -101,7 +103,7 @@ def apply_penalty_multi_scores( eos_token_ids, ) else: - raise NotImplementedError() + raise NotImplementedError return logits @@ -126,8 +128,9 @@ def apply_speculative_penalty_multi_scores( apply_speculative_penalty_multi_scores """ if current_platform.is_cuda(): - from fastdeploy.model_executor.ops.gpu import \ - speculate_get_token_penalty_multi_scores + from fastdeploy.model_executor.ops.gpu import ( + speculate_get_token_penalty_multi_scores, + ) speculate_get_token_penalty_multi_scores( pre_token_ids, @@ -146,6 +149,6 @@ def apply_speculative_penalty_multi_scores( max_len, ) else: - raise NotImplementedError() + raise NotImplementedError # inplace return logits diff --git a/fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py b/fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py index e364b13f2..63da37802 100644 --- a/fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py +++ b/fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py @@ -22,8 +22,8 @@ from fastdeploy import envs from fastdeploy.platforms import current_platform if current_platform.is_gcu(): - from fastdeploy.model_executor.ops.gcu import \ - top_p_sampling as gcu_top_p_sampling + from 
fastdeploy.model_executor.ops.gcu import top_p_sampling as gcu_top_p_sampling + def top_k_top_p_sampling( x: paddle.Tensor, @@ -33,8 +33,8 @@ def top_k_top_p_sampling( topp_seed: Optional[paddle.Tensor] = None, seed: int = -1, k: int = 0, - mode: Literal['truncated', 'non-truncated'] = "truncated", - order: Literal['top_k_first', 'joint'] = "top_k_first", + mode: Literal["truncated", "non-truncated"] = "truncated", + order: Literal["top_k_first", "joint"] = "top_k_first", ) -> tuple[paddle.Tensor, paddle.Tensor]: """ x(Tensor): An input 2-D Tensor with type float32, float16 and bfloat16. @@ -61,35 +61,33 @@ def top_k_top_p_sampling( """ top_p_class = envs.FD_SAMPLING_CLASS.lower() if top_p_class == "air": - _, ids = air_top_p_sampling(x, - top_p, - threshold, - topp_seed, - seed=seed, - k=k, - mode=mode) + _, ids = air_top_p_sampling(x, top_p, threshold, topp_seed, seed=seed, k=k, mode=mode) elif top_p_class == "rejection": ids = rejection_top_p_sampling(x, top_p, top_k, seed, order) _ = None elif top_p_class == "base_non_truncated": - _, ids = paddle.tensor.top_p_sampling(x, - top_p, - threshold=threshold, - topp_seed=topp_seed, - seed=seed, - k=k, - mode="non-truncated") + _, ids = paddle.tensor.top_p_sampling( + x, + top_p, + threshold=threshold, + topp_seed=topp_seed, + seed=seed, + k=k, + mode="non-truncated", + ) else: if current_platform.is_gcu(): _, ids = gcu_top_p_sampling(x, top_p) else: - _, ids = paddle.tensor.top_p_sampling(x, - top_p, - threshold=threshold, - topp_seed=topp_seed, - seed=seed, - k=k, - mode="truncated") + _, ids = paddle.tensor.top_p_sampling( + x, + top_p, + threshold=threshold, + topp_seed=topp_seed, + seed=seed, + k=k, + mode="truncated", + ) return _, ids @@ -100,15 +98,15 @@ def air_top_p_sampling( topp_seed: Optional[paddle.Tensor] = None, seed: int = -1, k: int = 0, - mode: Literal['truncated', 'non-truncated'] = "truncated", + mode: Literal["truncated", "non-truncated"] = "truncated", ) -> tuple[paddle.Tensor, paddle.Tensor]: """ air_top_p_sampling """ try: from fastdeploy.model_executor.ops.gpu import air_top_p_sampling - out, ids = air_top_p_sampling(x, top_p, threshold, topp_seed, seed, k, - mode) + + out, ids = air_top_p_sampling(x, top_p, threshold, topp_seed, seed, k, mode) except ImportError: raise RuntimeError("Cannot import air_top_p_sampling op.") return out, ids @@ -119,14 +117,16 @@ def rejection_top_p_sampling( top_p: paddle.Tensor, top_k: paddle.Tensor, seed: int = -1, - order: Literal['top_k_first', 'joint'] = "top_k_first", + order: Literal["top_k_first", "joint"] = "top_k_first", ) -> paddle.Tensor: """ rejection_top_p_sampling """ try: from fastdeploy.model_executor.ops.gpu import ( - rejection_top_p_sampling, top_k_renorm_probs) + rejection_top_p_sampling, + top_k_renorm_probs, + ) if paddle.count_nonzero(top_k) == 0: ids = rejection_top_p_sampling( diff --git a/fastdeploy/model_executor/layers/sample/sampler.py b/fastdeploy/model_executor/layers/sample/sampler.py index f331f443f..e814f21dc 100644 --- a/fastdeploy/model_executor/layers/sample/sampler.py +++ b/fastdeploy/model_executor/layers/sample/sampler.py @@ -13,21 +13,25 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" + import threading from concurrent.futures import ThreadPoolExecutor from typing import Any, Dict, List, Optional import paddle -import paddle.nn as nn import paddle.nn.functional as F +from paddle import nn from fastdeploy.config import FDConfig -from fastdeploy.model_executor.guided_decoding.base_guided_decoding import \ - LogitsProcessorBase +from fastdeploy.model_executor.guided_decoding.base_guided_decoding import ( + LogitsProcessorBase, +) from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata from fastdeploy.model_executor.layers.sample.ops import ( - apply_penalty_multi_scores, apply_speculative_penalty_multi_scores, - top_k_top_p_sampling) + apply_penalty_multi_scores, + apply_speculative_penalty_multi_scores, + top_k_top_p_sampling, +) from fastdeploy.platforms import current_platform from fastdeploy.worker.output import LogprobsTensors, SamplerOutput @@ -44,11 +48,13 @@ class SamplerProcessor: self.executor = ThreadPoolExecutor() self.logits_lock = threading.Lock() - def add_logits_processor(self, - ids: int, - future: Optional[Any] = None, - prefill_tokens: List[int] = []): - """ add logits processor to SamplerProcessor """ + def add_logits_processor( + self, + ids: int, + future: Optional[Any] = None, + prefill_tokens: List[int] = [], + ): + """add logits processor to SamplerProcessor""" with self.logits_lock: if future is None: if ids in self.logits_processor: @@ -67,7 +73,7 @@ class SamplerProcessor: self.logits_processor[ids] = [future, prefill_tokens] def update_vocab_mask(self, skip_idx_list: List[int] = []): - """ update vocab mask. (cpu-heavy operation) """ + """update vocab mask. (cpu-heavy operation)""" if len(self.logits_processor) == 0: return @@ -102,10 +108,8 @@ class SamplerProcessor: processor.fill_token_bitmask(self.token_bitmask, idx) - def apply_token_mask(self, - logits: paddle.Tensor, - skip_idx_list: List[int] = []): - """ apply token mask to logits """ + def apply_token_mask(self, logits: paddle.Tensor, skip_idx_list: List[int] = []): + """apply token mask to logits""" if len(self.logits_processor) == 0 or self.token_bitmask is None: return logits @@ -121,26 +125,20 @@ class SamplerProcessor: indices = list(self.logits_processor.keys()) mask_idx = [i for i in indices if i not in skip_idx_list] - return available_processors.apply_token_mask(logits, - self.token_bitmask, - indices=mask_idx) + return available_processors.apply_token_mask(logits, self.token_bitmask, indices=mask_idx) def _accept_token(self, idx: int, token: int): - """ accept token """ + """accept token""" if idx not in self.logits_processor: - raise ValueError( - f"Invalid index, idx: {idx}, logit_processors.keys: {self.logits_processor.keys()}" - ) + raise ValueError(f"Invalid index, idx: {idx}, logit_processors.keys: {self.logits_processor.keys()}") if self.logits_processor[idx].is_terminated(): return self.logits_processor[idx].accept_token(token) - def update_output_tokens(self, - next_tokens: paddle.Tensor, - skip_idx_list: List[int] = []): - """ update output tokens """ + def update_output_tokens(self, next_tokens: paddle.Tensor, skip_idx_list: List[int] = []): + """update output tokens""" if len(self.logits_processor) == 0: return @@ -148,14 +146,13 @@ class SamplerProcessor: with self.logits_lock: for idx in self.logits_processor.keys(): token = token_ids[idx][0] - if token < 0 or self.logits_processor[ - idx] is None or idx in skip_idx_list: + if token < 0 or self.logits_processor[idx] is None or idx in skip_idx_list: continue self._accept_token(idx, 
token) def pre_process(self, skip_idx_list: List[int] = []): - """ pre process before running """ + """pre process before running""" # create async operation for guided decoding # TODO: support async self.update_vocab_mask(skip_idx_list) @@ -168,31 +165,35 @@ class Sampler(nn.Layer): """ def __init__(self): - """ - """ + """ """ super().__init__() - if current_platform.is_cuda() or current_platform.is_xpu( - ) or current_platform.is_iluvatar() or current_platform.is_gcu(): + if ( + current_platform.is_cuda() + or current_platform.is_xpu() + or current_platform.is_iluvatar() + or current_platform.is_gcu() + ): self.forward = self.forward_cuda else: - raise NotImplementedError() + raise NotImplementedError self.processor = SamplerProcessor() - def apply_logits_processor(self, - ids: int, - future: Optional[Any] = None, - prefill_tokens: List[int] = []): - """ apply logits processor to sampler """ + def apply_logits_processor( + self, + ids: int, + future: Optional[Any] = None, + prefill_tokens: List[int] = [], + ): + """apply logits processor to sampler""" self.processor.add_logits_processor(ids, future, prefill_tokens) def pre_process(self, skip_idx_list: List[int] = []): - """ pre process before running """ + """pre process before running""" self.processor.pre_process(skip_idx_list) def compute_logprobs(self, logits: paddle.Tensor) -> paddle.Tensor: - """ - """ + """ """ return F.log_softmax(logits, axis=-1) def gather_logprobs( @@ -226,9 +227,7 @@ class Sampler(nn.Layer): if num_logprobs >= 1: # Find the topK values. - topk_logprobs, topk_indices = paddle.topk(logprobs, - num_logprobs, - axis=-1) + topk_logprobs, topk_indices = paddle.topk(logprobs, num_logprobs, axis=-1) indices = paddle.concat([token_ids, topk_indices], axis=1) top_logprobs = paddle.concat([token_logprobs, topk_logprobs], axis=1) else: @@ -243,8 +242,7 @@ class Sampler(nn.Layer): sampling_metadata: SamplingMetadata, skip_idx_list: List[int] = [], ) -> SamplerOutput: - """ - """ + """ """ num_logprobs = sampling_metadata.max_num_logprobs if num_logprobs is not None: raw_logprobs = self.compute_logprobs(logits) @@ -270,8 +268,9 @@ class Sampler(nn.Layer): _, next_tokens = top_k_top_p_sampling(probs, sampling_metadata.top_p, sampling_metadata.top_k) - logprobs_tensors = None if num_logprobs is None else \ - self.gather_logprobs(raw_logprobs, num_logprobs, token_ids=next_tokens) + logprobs_tensors = ( + None if num_logprobs is None else self.gather_logprobs(raw_logprobs, num_logprobs, token_ids=next_tokens) + ) self.processor.update_output_tokens(next_tokens, skip_idx_list) @@ -291,26 +290,27 @@ class SpeculativeSampler(nn.Layer): """ def __init__(self, fd_config: FDConfig): - """ - """ + """ """ super().__init__() if current_platform.is_cuda(): self.forward = self.forward_cuda else: - raise NotImplementedError() + raise NotImplementedError self.speculative_verify_window = fd_config.speculative_config.verify_window self.speculative_max_candidate_len = fd_config.speculative_config.max_candidate_len self.speculative_benchmark_mode = fd_config.speculative_config.benchmark_mode def pre_process(self, skip_idx_list: List[int] = []): - """ pre process before running """ + """pre process before running""" pass - def apply_logits_processor(self, - ids: int, - future: Optional[Any] = None, - prefill_tokens: List[int] = []): - """ apply logits processor to sampler """ + def apply_logits_processor( + self, + ids: int, + future: Optional[Any] = None, + prefill_tokens: List[int] = [], + ): + """apply logits processor to sampler""" pass 
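Note on the sampling path: every forward_cuda in this file (Sampler above, SpeculativeSampler and MTPSampler below) funnels the penalized logits through top_k_top_p_sampling from the ops diff earlier. As a rough reference for the selection rule that combination implements — not the fused paddle / custom kernels, which operate on batched per-request tensors on device — here is a minimal single-row sketch in plain NumPy:

import numpy as np

def top_k_top_p_reference(logits: np.ndarray, top_k: int, top_p: float, rng=None) -> int:
    """Illustrative single-row top-k / nucleus sampling (sketch only).

    Keep the k most probable tokens, then the smallest prefix whose cumulative
    probability reaches top_p, renormalize, and sample from what remains.
    """
    rng = rng or np.random.default_rng()
    probs = np.exp(logits - logits.max())
    probs /= probs.sum()
    if top_k > 0:
        kth_largest = np.sort(probs)[-top_k]
        probs = np.where(probs >= kth_largest, probs, 0.0)
        probs /= probs.sum()
    order = np.argsort(-probs)
    cumulative = np.cumsum(probs[order])
    keep = order[: int(np.searchsorted(cumulative, top_p) + 1)]
    masked = np.zeros_like(probs)
    masked[keep] = probs[keep]
    masked /= masked.sum()
    return int(rng.choice(len(probs), p=masked))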
def forward_cuda( @@ -320,11 +320,9 @@ class SpeculativeSampler(nn.Layer): max_model_len: int, share_inputs: List[paddle.Tensor], ) -> paddle.Tensor: - """ - """ + """ """ - from fastdeploy.model_executor.ops.gpu import (speculate_verify, - top_p_candidates) + from fastdeploy.model_executor.ops.gpu import speculate_verify, top_p_candidates logits = apply_speculative_penalty_multi_scores( sampling_metadata.pre_token_ids, @@ -361,7 +359,8 @@ class SpeculativeSampler(nn.Layer): share_inputs["seq_lens_encoder"], share_inputs["seq_lens_decoder"], share_inputs[ - "draft_tokens"], # Both input and output, need to write the last 1 token accepted to position 0. + "draft_tokens" + ], # Both input and output, need to write the last 1 token accepted to position 0. share_inputs["seq_lens_this_time"], verify_tokens, verify_scores, @@ -382,27 +381,27 @@ class SpeculativeSampler(nn.Layer): class MTPSampler(nn.Layer): - """ - """ + """ """ def __init__(self, fd_config: FDConfig): - """ - """ + """ """ super().__init__() if current_platform.is_cuda(): self.forward = self.forward_cuda else: - raise NotImplementedError() + raise NotImplementedError def pre_process(self, skip_idx_list: List[int] = []): - """ pre process before running """ + """pre process before running""" pass - def apply_logits_processor(self, - ids: int, - future: Optional[Any] = None, - prefill_tokens: List[int] = []): - """ apply logits processor to sampler """ + def apply_logits_processor( + self, + ids: int, + future: Optional[Any] = None, + prefill_tokens: List[int] = [], + ): + """apply logits processor to sampler""" pass def forward_cuda( @@ -412,8 +411,7 @@ class MTPSampler(nn.Layer): max_model_len: int, share_inputs: List[paddle.Tensor], ) -> paddle.Tensor: - """ - """ + """ """ logits = apply_speculative_penalty_multi_scores( sampling_metadata.pre_token_ids, logits, diff --git a/fastdeploy/model_executor/layers/utils.py b/fastdeploy/model_executor/layers/utils.py index d635ef285..7ea753889 100644 --- a/fastdeploy/model_executor/layers/utils.py +++ b/fastdeploy/model_executor/layers/utils.py @@ -27,14 +27,15 @@ from fastdeploy.platforms import current_platform if current_platform.is_cuda() and current_platform.available(): try: from fastdeploy.model_executor.ops.gpu import ( - get_padding_offset, speculate_get_padding_offset) + get_padding_offset, + speculate_get_padding_offset, + ) except Exception: raise ImportError( "Verify environment consistency between compilation and FastDeploy installation. " "And ensure the Paddle version supports FastDeploy's custom operators" ) -if current_platform.is_iluvatar(): - from fastdeploy.model_executor.ops.iluvatar import get_padding_offset + import re from fastdeploy import envs @@ -44,9 +45,7 @@ if cache_params != "none": c8_state_dict = paddle.load(cache_params, return_numpy=True) -def per_block_cast_to_fp8(x: Tensor, - block_size: list = [128, - 128]) -> Tuple[Tensor, Tensor]: +def per_block_cast_to_fp8(x: Tensor, block_size: list = [128, 128]) -> Tuple[Tensor, Tensor]: """ Only used in deep_gemm block wise quant weight. copy from FastDeploy/custom_ops/gpu_ops/fp8_deep_gemm/tests/test_core.py. 
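For orientation before the next hunk: per_block_cast_to_fp8 pads the weight to 128x128 tiles, stores each tile scaled by 448 / amax as float8_e4m3fn, and returns amax / 448 per tile as the dequantization scale (448 being the largest finite float8_e4m3 magnitude). A hedged NumPy sketch of just the scale computation; the real op additionally pads, casts, and runs on GPU:

import numpy as np

def block_scales_reference(x: np.ndarray, block: int = 128) -> np.ndarray:
    """Per-(block x block) dequantization scales, mirroring x_amax / 448.0 below.

    A quantized tile is reconstructed as tile.astype(float32) * scale. Sketch only.
    """
    m, n = x.shape
    x = np.pad(x, ((0, -m % block), (0, -n % block)))
    tiles = x.reshape(x.shape[0] // block, block, x.shape[1] // block, block)
    amax = np.clip(np.abs(tiles).max(axis=(1, 3)), 1e-4, None)
    return amax / 448.0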
@@ -55,21 +54,27 @@ def per_block_cast_to_fp8(x: Tensor, assert x.dim() == 2 m, n = x.shape - x_padded = paddle.zeros((ceil_div(m, block_size[0]) * block_size[0], - ceil_div(n, block_size[1]) * block_size[1]), - dtype=x.dtype) + x_padded = paddle.zeros( + ( + ceil_div(m, block_size[0]) * block_size[0], + ceil_div(n, block_size[1]) * block_size[1], + ), + dtype=x.dtype, + ) x_padded[:m, :n] = x x_view = paddle.view( x_padded, - (-1, block_size[0], x_padded.shape[1] // block_size[1], block_size[1])) + (-1, block_size[0], x_padded.shape[1] // block_size[1], block_size[1]), + ) x_abs = paddle.abs(x_view).astype(paddle.float32) x_amax = paddle.amax(x_abs, axis=(1, 3), keepdim=True) x_amax = paddle.clip(x_amax, min=1e-4) x_scaled = (x_view * (448.0 / x_amax)).astype(paddle.float8_e4m3fn) - return x_scaled.view_as(x_padded)[:m, :n].contiguous(), (paddle.view( - x_amax / 448.0, (x_view.shape[0], x_view.shape[2]))) + return x_scaled.view_as(x_padded)[:m, :n].contiguous(), ( + paddle.view(x_amax / 448.0, (x_view.shape[0], x_view.shape[2])) + ) # for distributed tensor model parallel @@ -130,8 +135,7 @@ def get_tensor(input: Union[paddle.Tensor, np.ndarray, str]) -> paddle.Tensor: if key_name in f.keys(): weight = f.get_tensor(key_name) weight = paddle.Tensor(weight, zero_copy=True) - weight = weight._copy_to( - paddle.framework._current_expected_place(), False) + weight = weight._copy_to(paddle.framework._current_expected_place(), False) return weight else: return None @@ -160,8 +164,7 @@ def matmul_hadU(X: Tensor) -> paddle.Tensor: input = X.clone().reshape((-1, X.shape[-1], 1)) output = input.clone() while input.shape[1] > 1: - input = input.reshape( - (input.shape[0], input.shape[1] // 2, 2, input.shape[2])) + input = input.reshape((input.shape[0], input.shape[1] // 2, 2, input.shape[2])) output = output.reshape(input.shape) output[:, :, 0, :] = input[:, :, 0, :] + input[:, :, 1, :] output[:, :, 1, :] = input[:, :, 0, :] - input[:, :, 1, :] @@ -171,8 +174,7 @@ def matmul_hadU(X: Tensor) -> paddle.Tensor: return input.reshape(X.shape) -def random_hadamard_matrix(block_size: int, - dtype: Union[paddle.dtype, str]) -> paddle.Tensor: +def random_hadamard_matrix(block_size: int, dtype: Union[paddle.dtype, str]) -> paddle.Tensor: """ Generate a random Hadamard matrix. @@ -203,8 +205,7 @@ def create_hadamard_matrix(hidden_size: int) -> paddle.Tensor: hadamard_block_size = 32 h = random_hadamard_matrix(hadamard_block_size, "float32") block_num = hidden_size // hadamard_block_size - hadamard_matrix = paddle.to_tensor( - block_diag(*[h for i in range(block_num)])) + hadamard_matrix = paddle.to_tensor(block_diag(*[h for i in range(block_num)])) return hadamard_matrix @@ -231,8 +232,7 @@ def ensure_divisibility(numerator, denominator): AssertionError: If the numerator cannot be evenly divided by the denominator, an assertion error is raised. 
""" - assert numerator % denominator == 0, "{} is not divisible by {}".format( - numerator, denominator) + assert numerator % denominator == 0, f"{numerator} is not divisible by {denominator}" def divide(numerator: int, denominator: int): @@ -252,10 +252,10 @@ def divide(numerator: int, denominator: int): def remove_padding( - max_len: paddle.Tensor, input_ids: paddle.Tensor, - seq_lens_this_time: paddle.Tensor -) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor, - paddle.Tensor]: + max_len: paddle.Tensor, + input_ids: paddle.Tensor, + seq_lens_this_time: paddle.Tensor, +) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]: """ Remove padded sequences from the input. @@ -281,8 +281,7 @@ def remove_padding( padding_offset, cu_seqlens_q, cu_seqlens_k, - ) = get_padding_offset(input_ids, cum_offsets_now, token_num, - seq_lens_this_time) + ) = get_padding_offset(input_ids, cum_offsets_now, token_num, seq_lens_this_time) return ( ids_remove_padding, padding_offset, @@ -293,11 +292,12 @@ def remove_padding( def speculate_remove_padding( - max_len: paddle.Tensor, input_ids: paddle.Tensor, - seq_lens_this_time: paddle.Tensor, draft_tokens: paddle.Tensor, - seq_lens_encoder: paddle.Tensor -) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor, - paddle.Tensor]: + max_len: paddle.Tensor, + input_ids: paddle.Tensor, + seq_lens_this_time: paddle.Tensor, + draft_tokens: paddle.Tensor, + seq_lens_encoder: paddle.Tensor, +) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]: """ Remove padding from sequences. @@ -319,13 +319,7 @@ def speculate_remove_padding( if current_platform.is_cuda(): cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time) token_num = paddle.sum(seq_lens_this_time) - ( - ids_remove_padding, - cum_offsets, - padding_offset, - cu_seqlens_q, - cu_seqlens_k, - ) = speculate_get_padding_offset( + (ids_remove_padding, cum_offsets, padding_offset, cu_seqlens_q, cu_seqlens_k,) = speculate_get_padding_offset( input_ids, draft_tokens, cum_offsets_now, @@ -359,8 +353,7 @@ class CpuGuard: paddle.device.set_device(self.ori_device) -def create_and_set_parameter(layer: nn.Layer, name: str, - tensor: paddle.Tensor): +def create_and_set_parameter(layer: nn.Layer, name: str, tensor: paddle.Tensor): """ Create a parameter for a specified layer and set its value to the given tensor. 
@@ -373,10 +366,12 @@ def create_and_set_parameter(layer: nn.Layer, name: str, None """ setattr( - layer, name, + layer, + name, layer.create_parameter( shape=tensor.shape, dtype=tensor.dtype, default_initializer=paddle.nn.initializer.Constant(0), - )) + ), + ) getattr(layer, name).set_value(tensor) diff --git a/fastdeploy/model_executor/load_weight_utils.py b/fastdeploy/model_executor/load_weight_utils.py index ca06eb3b5..f5ed5543e 100644 --- a/fastdeploy/model_executor/load_weight_utils.py +++ b/fastdeploy/model_executor/load_weight_utils.py @@ -26,36 +26,35 @@ from safetensors import safe_open from tqdm import tqdm from fastdeploy.config import FDConfig -from fastdeploy.model_executor.models.tp_utils import \ - check_tensor_parallel_prerequisites +from fastdeploy.model_executor.models.tp_utils import ( + check_tensor_parallel_prerequisites, +) from fastdeploy.platforms import current_platform -def load_ep_checkpoint(model_path: str, - fd_config: FDConfig, - return_numpy: bool = False): +def load_ep_checkpoint(model_path: str, fd_config: FDConfig, return_numpy: bool = False): """ load ep checkpoint """ - with open(os.path.join(model_path, "model.safetensors.index.json"), - "r") as f: + with open(os.path.join(model_path, "model.safetensors.index.json"), "r") as f: weight_list = json.load(f)["weight_map"] filtered_map = {k: v for k, v in weight_list.items() if "experts" not in k} num_local_ffn_keys = [] from itertools import chain + def get_expert_ranges(fd_config): """ Generate expert index ranges based on configuration parameters - + This function is primarily used in Mixture-of-Experts (MoE) models to generate expert index ranges according to configuration parameters. When moe_num_experts is a list in the fd_config, it returns a chained combination of two ranges, otherwise returns a single range. 
- + Args: fd_config: FastDeploy Configuration object - + Returns: If moe_num_experts is a list: Returns a chained combination (chain object) of two ranges: @@ -66,25 +65,28 @@ def load_ep_checkpoint(model_path: str, """ base_range = range( fd_config.parallel_config.num_experts_start_offset, - fd_config.parallel_config.num_experts_start_offset + fd_config.parallel_config.num_experts_per_rank + fd_config.parallel_config.num_experts_start_offset + fd_config.parallel_config.num_experts_per_rank, ) if isinstance(fd_config.model_config.moe_num_experts, list): - return chain(base_range, - range(base_range.start + fd_config.model_config.moe_num_experts[0], base_range.stop + fd_config.model_config.moe_num_experts[0])) + return chain( + base_range, + range( + base_range.start + fd_config.model_config.moe_num_experts[0], + base_range.stop + fd_config.model_config.moe_num_experts[0], + ), + ) return base_range for i in range(fd_config.model_config.moe_layer_start_index, fd_config.model_config.num_hidden_layers): for j in get_expert_ranges(fd_config): up_gate_proj_key = f"ernie.layers.{i}.mlp.experts.{j}.up_gate_proj.weight" - down_proj_key = (f"ernie.layers.{i}.mlp.experts.{j}.down_proj.weight") + down_proj_key = f"ernie.layers.{i}.mlp.experts.{j}.down_proj.weight" up_gate_proj_quant_key = f"ernie.layers.{i}.mlp.experts.{j}.up_gate_proj.quant_weight" - down_proj_quant_key = ( - f"ernie.layers.{i}.mlp.experts.{j}.down_proj.quant_weight") + down_proj_quant_key = f"ernie.layers.{i}.mlp.experts.{j}.down_proj.quant_weight" up_gate_proj_scale_key = f"ernie.layers.{i}.mlp.experts.{j}.up_gate_proj.weight_scale" - down_proj_scale_key = ( - f"ernie.layers.{i}.mlp.experts.{j}.down_proj.weight_scale") + down_proj_scale_key = f"ernie.layers.{i}.mlp.experts.{j}.down_proj.weight_scale" num_local_ffn_keys.append(up_gate_proj_key) num_local_ffn_keys.append(down_proj_key) num_local_ffn_keys.append(up_gate_proj_quant_key) @@ -101,31 +103,32 @@ def load_ep_checkpoint(model_path: str, safetensor_paths = set(filtered_map.values()) # Open each safetensor file sequentially with progress bar - for safetensor_path in tqdm(safetensor_paths, - desc="Loading safetensor files", - unit="file"): - with safe_open(os.path.join(model_path, safetensor_path), - framework="np", - device="cpu") as f: + for safetensor_path in tqdm(safetensor_paths, desc="Loading safetensor files", unit="file"): + with safe_open( + os.path.join(model_path, safetensor_path), + framework="np", + device="cpu", + ) as f: # Check if this file contains keys from filtered_map for k in filtered_map: if filtered_map[k] == safetensor_path and k in f.keys(): weight = f.get_tensor(k) if not return_numpy: weight = paddle.Tensor(weight, zero_copy=True) - weight = weight._copy_to( - paddle.framework._current_expected_place(), False) + weight = weight._copy_to(paddle.framework._current_expected_place(), False) state_dict[k] = weight return state_dict -def safetensors_weights_iterator(safe_tensor_list: list[str], ): +def safetensors_weights_iterator( + safe_tensor_list: list[str], +): """ safetensors_weights_iterator """ for st_file in tqdm( - safe_tensor_list, - desc="Loading safetensors checkpoint shards", + safe_tensor_list, + desc="Loading safetensors checkpoint shards", ): with safe_open(st_file, framework="np") as f: for name in f.keys(): @@ -133,7 +136,9 @@ def safetensors_weights_iterator(safe_tensor_list: list[str], ): yield name, param -def fastsafetensors_weights_iterator(safetensor_list: list[str], ): +def fastsafetensors_weights_iterator( + safetensor_list: 
list[str], +): """ Return an iterator over tensors on GPU from a given safetensor_list. """ @@ -143,23 +148,17 @@ def fastsafetensors_weights_iterator(safetensor_list: list[str], ): device = f"gpu:{pg.rank}" if paddle.is_compiled_with_cuda() else "cpu" else: pg = SingleGroup() - device = f"gpu:{pg.rank()}" if paddle.is_compiled_with_cuda( - ) else "cpu" + device = f"gpu:{pg.rank()}" if paddle.is_compiled_with_cuda() else "cpu" safetensor_files_sub_lists = [ - safetensor_list[i:i + world_size] - for i in range(0, len(safetensor_list), world_size) + safetensor_list[i : i + world_size] for i in range(0, len(safetensor_list), world_size) ] for st_file in tqdm( - safetensor_files_sub_lists, - desc="Loading fastsafetensors checkpoint shards", + safetensor_files_sub_lists, + desc="Loading fastsafetensors checkpoint shards", ): - loader = SafeTensorsFileLoader(pg, - device, - nogds=True, - debug_log=False, - framework="paddle") + loader = SafeTensorsFileLoader(pg, device, nogds=True, debug_log=False, framework="paddle") rank_file_map = {i: [f] for i, f in enumerate(st_file)} loader.add_filenames(rank_file_map) try: @@ -175,15 +174,12 @@ def fastsafetensors_weights_iterator(safetensor_list: list[str], ): loader.close() -def load_pre_sharded_checkpoint(model_path: str, - local_rank: int, - use_fastsafetensor: bool = False): +def load_pre_sharded_checkpoint(model_path: str, local_rank: int, use_fastsafetensor: bool = False): """ load_pre_sharded_checkpoint """ state_dict = {} - _, safetensor_files = get_all_safetensors( - os.path.join(model_path, f"rank{local_rank}")) + _, safetensor_files = get_all_safetensors(os.path.join(model_path, f"rank{local_rank}")) weights_iterator = safetensors_weights_iterator(safetensor_files) for name, weight in weights_iterator: state_dict[name] = weight @@ -201,13 +197,11 @@ def get_all_safetensors(model_path: str): key_name_list = f.keys() return key_name_list, safetensor_list else: - with open(os.path.join(model_path, "model.safetensors.index.json"), - "r") as f: + with open(os.path.join(model_path, "model.safetensors.index.json"), "r") as f: weight_map = json.load(f)["weight_map"] weight_files_in_index = set() for weight_name in weight_map: - weight_files_in_index.add( - os.path.join(model_path, weight_map[weight_name])) + weight_files_in_index.add(os.path.join(model_path, weight_map[weight_name])) key_name_list = list(set(weight_map.keys())) safetensor_list = list(weight_files_in_index) safetensor_list.sort() @@ -256,8 +250,7 @@ def deal_state_dict(state_dict): """deal_state_dict""" device = paddle.CUDAPinnedPlace() for name, src in state_dict.items(): - if src._is_initialized() and not isinstance(src.place, - paddle.CUDAPinnedPlace): + if src._is_initialized() and not isinstance(src.place, paddle.CUDAPinnedPlace): dst = src._copy_to(device, True) dst_tensor = dst.value().get_tensor() src_tensor = src.value().get_tensor() @@ -277,22 +270,15 @@ def load_composite_checkpoint( # 2. Tensor Parallel (TP) # 3. 
Pre-sharded (pre-split) """ - if fd_config.parallel_config.use_ep and \ - fd_config.speculative_config.model_type != "mtp": - state_dict = load_ep_checkpoint(model_path, - fd_config, - return_numpy=True) + if fd_config.parallel_config.use_ep and fd_config.speculative_config.model_type != "mtp": + state_dict = load_ep_checkpoint(model_path, fd_config, return_numpy=True) else: rank_dirs = [ - f for f in os.listdir(model_path) if f.startswith("rank") - and os.path.isdir(os.path.join(model_path, f)) + f for f in os.listdir(model_path) if f.startswith("rank") and os.path.isdir(os.path.join(model_path, f)) ] if len(rank_dirs) > 1: - if fd_config.parallel_config.tensor_parallel_size != len( - rank_dirs): - raise ValueError( - f"Your model only supports loading with tp{len(rank_dirs)}" - ) + if fd_config.parallel_config.tensor_parallel_size != len(rank_dirs): + raise ValueError(f"Your model only supports loading with tp{len(rank_dirs)}") state_dict = load_pre_sharded_checkpoint( model_path, fd_config.parallel_config.tensor_parallel_rank, @@ -300,18 +286,17 @@ def load_composite_checkpoint( ) else: if fd_config.load_config.use_fastsafetensor and ( - current_platform.available() - and current_platform.is_cuda()): - state_dict = load_tp_checkpoint_v1(model_path, - cls, - fd_config, - use_fastsafetensor=True) + current_platform.available() and current_platform.is_cuda() + ): + state_dict = load_tp_checkpoint_v1(model_path, cls, fd_config, use_fastsafetensor=True) deal_state_dict(state_dict) else: - state_dict = load_tp_checkpoint(model_path, - cls, - fd_config.model_config.pretrained_config, - return_numpy=return_numpy) + state_dict = load_tp_checkpoint( + model_path, + cls, + fd_config.model_config.pretrained_config, + return_numpy=return_numpy, + ) if not state_dict: raise ValueError("weight not found in state_dict !") return state_dict diff --git a/fastdeploy/model_executor/model_loader.py b/fastdeploy/model_executor/model_loader.py index 8d1819840..cd35445ec 100644 --- a/fastdeploy/model_executor/model_loader.py +++ b/fastdeploy/model_executor/model_loader.py @@ -20,16 +20,13 @@ import paddle from paddle import nn from fastdeploy.config import FDConfig, LoadConfig, ModelConfig -from fastdeploy.model_executor.load_weight_utils import \ - load_composite_checkpoint -from fastdeploy.model_executor.models.deepseek_v3 import \ - DeepSeekV3PretrainedModel -from fastdeploy.model_executor.models.ernie4_5_moe import \ - Ernie4_5_PretrainedModel -from fastdeploy.model_executor.models.ernie4_5_mtp import \ - Ernie4_5_MTPPretrainedModel -from fastdeploy.model_executor.models.ernie4_5_vl.ernie4_5_vl_moe import \ - Ernie4_5_VLPretrainedModel +from fastdeploy.model_executor.load_weight_utils import load_composite_checkpoint +from fastdeploy.model_executor.models.deepseek_v3 import DeepSeekV3PretrainedModel +from fastdeploy.model_executor.models.ernie4_5_moe import Ernie4_5_PretrainedModel +from fastdeploy.model_executor.models.ernie4_5_mtp import Ernie4_5_MTPPretrainedModel +from fastdeploy.model_executor.models.ernie4_5_vl.ernie4_5_vl_moe import ( + Ernie4_5_VLPretrainedModel, +) from fastdeploy.model_executor.models.model_base import ModelRegistry from fastdeploy.model_executor.models.qwen2 import Qwen2PretrainedModel from fastdeploy.model_executor.models.qwen3 import Qwen3PretrainedModel @@ -49,31 +46,31 @@ MODEL_CLASSES = { def get_model_from_loader(fd_config: FDConfig) -> nn.Layer: - """ load or download model """ + """load or download model""" model_loader = DefaultModelLoader(fd_config.load_config) model = 
model_loader.load_model(fd_config) return model class BaseModelLoader(ABC): - """ Base class for model loaders. """ + """Base class for model loaders.""" def __init__(self, load_config: LoadConfig): self.load_config = load_config @abstractmethod def download_model(self, load_config: ModelConfig) -> None: - """ Download a model so that it can be immediately loaded.""" + """Download a model so that it can be immediately loaded.""" raise NotImplementedError @abstractmethod def load_model(self, fd_config: FDConfig) -> nn.Layer: - """ Load a model with the given configurations.""" + """Load a model with the given configurations.""" raise NotImplementedError class DefaultModelLoader(BaseModelLoader): - """ ModelLoader that can load registered models """ + """ModelLoader that can load registered models""" def __init__(self, load_config: LoadConfig): super().__init__(load_config) @@ -98,6 +95,7 @@ class DefaultModelLoader(BaseModelLoader): if fd_config.load_config.dynamic_load_weight: # register rl model import fastdeploy.rl # noqa + architectures = architectures + "RL" with context: diff --git a/fastdeploy/model_executor/models/__init__.py b/fastdeploy/model_executor/models/__init__.py index 366a7f4a4..e7b440b81 100644 --- a/fastdeploy/model_executor/models/__init__.py +++ b/fastdeploy/model_executor/models/__init__.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + import importlib import inspect import os @@ -28,26 +29,21 @@ def _find_py_files(root_dir): rel_path = py_file.relative_to(root_dir) if "__init__" in str(py_file): continue - dotted_path = str(rel_path).replace("/", ".").replace("\\", - ".").replace( - ".py", "") + dotted_path = str(rel_path).replace("/", ".").replace("\\", ".").replace(".py", "") py_files.append(dotted_path) return py_files -def auto_models_registry(dir_path, - register_path="fastdeploy.model_executor.models"): +def auto_models_registry(dir_path, register_path="fastdeploy.model_executor.models"): """ auto registry all models in this folder """ for module_file in _find_py_files(dir_path): try: - module = importlib.import_module(f'{register_path}.{module_file}') + module = importlib.import_module(f"{register_path}.{module_file}") for attr_name in dir(module): attr = getattr(module, attr_name) - if inspect.isclass(attr) and issubclass( - attr, - ModelForCasualLM) and attr is not ModelForCasualLM: + if inspect.isclass(attr) and issubclass(attr, ModelForCasualLM) and attr is not ModelForCasualLM: ModelRegistry.register(attr) except ImportError: raise ImportError(f"{module_file=} import error") diff --git a/fastdeploy/model_executor/models/deepseek_v3.py b/fastdeploy/model_executor/models/deepseek_v3.py index c7f573772..b1ebd98ec 100644 --- a/fastdeploy/model_executor/models/deepseek_v3.py +++ b/fastdeploy/model_executor/models/deepseek_v3.py @@ -25,26 +25,31 @@ from paddleformers.transformers import PretrainedModel from paddleformers.utils.log import logger from fastdeploy.config import FDConfig -from fastdeploy.distributed.communication_op import \ - tensor_model_parallel_all_reduce +from fastdeploy.distributed.communication_op import tensor_model_parallel_all_reduce from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.layers.activation import SiluAndMul from fastdeploy.model_executor.layers.attention.attention import Attention from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding from fastdeploy.model_executor.layers.linear 
import ( - ColumnParallelLinear, KVBatchLinear, MergedColumnParallelLinear, - ReplicatedLinear, RowParallelLinear) + ColumnParallelLinear, + KVBatchLinear, + MergedColumnParallelLinear, + ReplicatedLinear, + RowParallelLinear, +) from fastdeploy.model_executor.layers.lm_head import ParallelLMHead from fastdeploy.model_executor.layers.moe.moe import FusedMoE from fastdeploy.model_executor.layers.normalization import RMSNorm -from fastdeploy.model_executor.layers.rotary_embedding import \ - DeepseekScalingRotaryEmbedding +from fastdeploy.model_executor.layers.rotary_embedding import ( + DeepseekScalingRotaryEmbedding, +) from fastdeploy.model_executor.models.model_base import ModelForCasualLM from fastdeploy.platforms import current_platform if current_platform.is_cuda(): - from fastdeploy.model_executor.ops.gpu import \ - get_position_ids_and_mask_encoder_batch + from fastdeploy.model_executor.ops.gpu import ( + get_position_ids_and_mask_encoder_batch, + ) class DeepSeekV3MLP(nn.Layer): @@ -86,14 +91,12 @@ class DeepSeekV3MLP(nn.Layer): ) def load_state_dict(self, state_dict): - """ - """ + """ """ self.up_gate_proj.load_state_dict(state_dict) self.down_proj.load_state_dict(state_dict) def forward(self, x): - """ - """ + """ """ gate_up_out = self.up_gate_proj(x) act_out = self.act_fn(gate_up_out) down_out = self.down_proj(act_out) @@ -105,42 +108,34 @@ class DeepSeekV3MoE(nn.Layer): DeepSeekV3MoE, for MoE Layer. """ - def __init__(self, fd_config: FDConfig, layer_id: int, - prefix: str) -> None: + def __init__(self, fd_config: FDConfig, layer_id: int, prefix: str) -> None: super().__init__() self.tp_size = fd_config.parallel_config.tensor_parallel_size weight_key_map = { "gate_weight_key": f"{prefix}.gate.weight", - "gate_correction_bias_key": - f"{prefix}.gate.e_score_correction_bias", - "up_gate_proj_expert_weight_key": - f"{prefix}.experts.{{}}.up_gate_proj.weight", - "down_proj_expert_weight_key": - f"{prefix}.experts.{{}}.down_proj.weight", + "gate_correction_bias_key": f"{prefix}.gate.e_score_correction_bias", + "up_gate_proj_expert_weight_key": f"{prefix}.experts.{{}}.up_gate_proj.weight", + "down_proj_expert_weight_key": f"{prefix}.experts.{{}}.down_proj.weight", } self.fused_moe = FusedMoE( fd_config=fd_config, reduce_results=False, - moe_intermediate_size=fd_config.model_config. - moe_intermediate_size, + moe_intermediate_size=fd_config.model_config.moe_intermediate_size, num_experts=fd_config.model_config.n_routed_experts, top_k=fd_config.model_config.num_experts_per_tok, topk_method=fd_config.model_config.topk_method, topk_group=fd_config.model_config.topk_group, n_group=fd_config.model_config.n_group, - routed_scaling_factor=fd_config.model_config. 
- routed_scaling_factor, + routed_scaling_factor=fd_config.model_config.routed_scaling_factor, layer_idx=layer_id, weight_key_map=weight_key_map, ) self.num_shared_experts = fd_config.model_config.n_shared_experts - shared_experts_intermediate_size = ( - self.num_shared_experts * - fd_config.model_config.moe_intermediate_size) + shared_experts_intermediate_size = self.num_shared_experts * fd_config.model_config.moe_intermediate_size self.shared_experts = DeepSeekV3MLP( fd_config=fd_config, @@ -150,14 +145,12 @@ class DeepSeekV3MoE(nn.Layer): ) def load_state_dict(self, state_dict): - """ - """ + """ """ self.fused_moe.load_state_dict(state_dict) self.shared_experts.load_state_dict(state_dict) def forward(self, hidden_states: paddle.Tensor): - """ - """ + """ """ shared_experts_out = self.shared_experts(hidden_states) moe_out = self.fused_moe(hidden_states) moe_out = moe_out + shared_experts_out @@ -172,10 +165,7 @@ class DeepseekV3MLAAttention(nn.Layer): DeepseekV3MLAAttention """ - def __init__(self, - fd_config: FDConfig, - layer_id: int, - prefix: str = "") -> None: + def __init__(self, fd_config: FDConfig, layer_id: int, prefix: str = "") -> None: super().__init__() self.tp_size = fd_config.parallel_config.tensor_parallel_size @@ -196,16 +186,20 @@ class DeepseekV3MLAAttention(nn.Layer): self.rms_norm_eps = fd_config.model_config.rms_norm_eps if self.q_lora_rank is not None: - self.q_a_proj = ReplicatedLinear(fd_config=fd_config, - prefix=f"{prefix}.q_a_proj", - input_size=self.hidden_size, - output_size=self.q_lora_rank, - with_bias=False) + self.q_a_proj = ReplicatedLinear( + fd_config=fd_config, + prefix=f"{prefix}.q_a_proj", + input_size=self.hidden_size, + output_size=self.q_lora_rank, + with_bias=False, + ) - self.q_a_layernorm = RMSNorm(fd_config, - hidden_size=self.q_lora_rank, - eps=self.rms_norm_eps, - prefix=f"{prefix}.q_a_layernorm") + self.q_a_layernorm = RMSNorm( + fd_config, + hidden_size=self.q_lora_rank, + eps=self.rms_norm_eps, + prefix=f"{prefix}.q_a_layernorm", + ) self.q_b_proj = ColumnParallelLinear( fd_config=fd_config, @@ -215,8 +209,7 @@ class DeepseekV3MLAAttention(nn.Layer): with_bias=False, ) else: - assert (self.q_lora_rank is not None - ), "self.q_lora_rank is None, Please Check your config." + assert self.q_lora_rank is not None, "self.q_lora_rank is None, Please Check your config." 
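            # Rough per-token shape map of the MLA projections configured in this hunk and
            # the ones that follow (a sketch inferred from the attribute names and the
            # forward pass later in this file; TP splitting is handled by the parallel
            # linear layers, which use num_attention_heads_tp per rank):
            #   q_a_proj           : hidden_size            -> q_lora_rank
            #   q_b_proj           : q_lora_rank            -> num_attention_heads * (qk_nope_head_dim + qk_rope_head_dim)
            #   kv_a_proj_with_mqa : hidden_size            -> kv_lora_rank + qk_rope_head_dim
            #   kv_b_proj          : kv_lora_rank           -> num_attention_heads * (qk_nope_head_dim + v_head_dim)
            #   o_proj             : num_attention_heads * v_head_dim -> hidden_size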
# 不切TP,跑 W4A16 Gemm self.kv_a_proj_with_mqa = ReplicatedLinear( @@ -224,28 +217,31 @@ class DeepseekV3MLAAttention(nn.Layer): prefix=f"{prefix}.kv_a_proj_with_mqa", input_size=self.hidden_size, output_size=self.kv_lora_rank + self.qk_rope_head_dim, - with_bias=False) + with_bias=False, + ) - self.kv_a_layernorm = RMSNorm(fd_config, - hidden_size=self.kv_lora_rank, - eps=self.rms_norm_eps, - prefix=f"{prefix}.kv_a_layernorm") + self.kv_a_layernorm = RMSNorm( + fd_config, + hidden_size=self.kv_lora_rank, + eps=self.rms_norm_eps, + prefix=f"{prefix}.kv_a_layernorm", + ) self.kv_b_proj = ColumnParallelLinear( fd_config=fd_config, prefix=f"{prefix}.kv_b_proj", input_size=self.kv_lora_rank, - output_size=self.num_attention_heads * - (self.qk_nope_head_dim + self.v_head_dim), + output_size=self.num_attention_heads * (self.qk_nope_head_dim + self.v_head_dim), with_bias=False, ) - self.o_proj = RowParallelLinear(fd_config, - prefix=f"{prefix}.o_proj", - input_size=self.num_attention_heads * - self.v_head_dim, - output_size=self.hidden_size, - with_bias=False) + self.o_proj = RowParallelLinear( + fd_config, + prefix=f"{prefix}.o_proj", + input_size=self.num_attention_heads * self.v_head_dim, + output_size=self.hidden_size, + with_bias=False, + ) self.kv_b_proj_bmm = KVBatchLinear( fd_config=fd_config, @@ -253,14 +249,14 @@ class DeepseekV3MLAAttention(nn.Layer): kv_lora_rank=self.kv_lora_rank, num_attention_heads=self.num_attention_heads, qk_nope_head_dim=self.qk_nope_head_dim, - v_head_dim=self.v_head_dim) + v_head_dim=self.v_head_dim, + ) self.rope_scaling = fd_config.model_config.rope_scaling if self.rope_scaling: mscale_all_dim = self.rope_scaling.get("mscale_all_dim", False) scaling_factor = self.rope_scaling["factor"] - mscale = self.yarn_get_mscale(scaling_factor, - float(mscale_all_dim)) + mscale = self.yarn_get_mscale(scaling_factor, float(mscale_all_dim)) self.attn_softmax_scale = self.attn_softmax_scale * mscale * mscale rope_scaling_kwargs = { @@ -270,15 +266,14 @@ class DeepseekV3MLAAttention(nn.Layer): "beta_slow", "mscale", "mscale_all_dim", - ] if key in self.rope_scaling + ] + if key in self.rope_scaling } self.rope_scaling_factor = self.rope_scaling["factor"] - self.rope_scaling_original_max_position_embeddings = self.rope_scaling[ - "original_max_position_embeddings"] + self.rope_scaling_original_max_position_embeddings = self.rope_scaling["original_max_position_embeddings"] self.rotary_emb = DeepseekScalingRotaryEmbedding( self.qk_rope_head_dim, - max_position_embeddings=self. 
- rope_scaling_original_max_position_embeddings, + max_position_embeddings=self.rope_scaling_original_max_position_embeddings, base=self.rope_theta, scaling_factor=self.rope_scaling_factor, **rope_scaling_kwargs, @@ -295,8 +290,7 @@ class DeepseekV3MLAAttention(nn.Layer): @staticmethod def yarn_get_mscale(scale=1, mscale=1): - """ - """ + """ """ if scale <= 1: return 1.0 return 0.1 * mscale * math.log(scale) + 1.0 @@ -308,63 +302,61 @@ class DeepseekV3MLAAttention(nn.Layer): position_ids: paddle.Tensor, mask_encoder_batch: paddle.Tensor, ): - """ - """ + """ """ layernorm_out = hidden_states - fmha_out = paddle.zeros(shape=[ - layernorm_out.shape[0], - self.num_attention_heads_tp * self.v_head_dim - ], - dtype=layernorm_out.dtype) + fmha_out = paddle.zeros( + shape=[ + layernorm_out.shape[0], + self.num_attention_heads_tp * self.v_head_dim, + ], + dtype=layernorm_out.dtype, + ) if forward_meta.max_enc_len_this_time: query = self.q_a_proj(layernorm_out) query = self.q_a_layernorm(query) query = self.q_b_proj(query) - query = query.reshape( - [-1, self.num_attention_heads_tp, self.qk_head_dim]) - query_nope, query_pe = query.split( - [self.qk_nope_head_dim, self.qk_rope_head_dim], axis=-1) + query = query.reshape([-1, self.num_attention_heads_tp, self.qk_head_dim]) + query_nope, query_pe = query.split([self.qk_nope_head_dim, self.qk_rope_head_dim], axis=-1) compressed_kv = self.kv_a_proj_with_mqa(layernorm_out) - compressed_kv, key_pe = compressed_kv.split( - [self.kv_lora_rank, self.qk_rope_head_dim], axis=-1) + compressed_kv, key_pe = compressed_kv.split([self.kv_lora_rank, self.qk_rope_head_dim], axis=-1) key_pe = key_pe.reshape([-1, 1, self.qk_rope_head_dim]) compressed_kv = self.kv_a_layernorm(compressed_kv) query_pe, key_pe = self.rotary_emb(position_ids, query_pe, key_pe) key_value = self.kv_b_proj(compressed_kv) - key_value = key_value.reshape([ - -1, self.num_attention_heads_tp, - self.qk_nope_head_dim + self.v_head_dim - ]) - key_nope, value = key_value.split( - [self.qk_nope_head_dim, self.v_head_dim], axis=-1) + key_value = key_value.reshape( + [ + -1, + self.num_attention_heads_tp, + self.qk_nope_head_dim + self.v_head_dim, + ] + ) + key_nope, value = key_value.split([self.qk_nope_head_dim, self.v_head_dim], axis=-1) - query[..., self.qk_nope_head_dim:] = query_pe + query[..., self.qk_nope_head_dim :] = query_pe key = paddle.empty_like(query) - key[..., :self.qk_nope_head_dim] = key_nope - key[..., self.qk_nope_head_dim:] = key_pe - value = paddle.nn.functional.pad( - value, [0, self.qk_head_dim - self.v_head_dim], value=0) + key[..., : self.qk_nope_head_dim] = key_nope + key[..., self.qk_nope_head_dim :] = key_pe + value = paddle.nn.functional.pad(value, [0, self.qk_head_dim - self.v_head_dim], value=0) - fmha_out_prefill = self.mla_attn(q=query, - k=key, - v=value, - qkv=None, - compressed_kv=compressed_kv, - k_pe=key_pe, - forward_meta=forward_meta) + fmha_out_prefill = self.mla_attn( + q=query, + k=key, + v=value, + qkv=None, + compressed_kv=compressed_kv, + k_pe=key_pe, + forward_meta=forward_meta, + ) - fmha_out_prefill = fmha_out_prefill.reshape( - [-1, self.num_attention_heads_tp, self.qk_head_dim]) - fmha_out_prefill = fmha_out_prefill[:, :, :self.v_head_dim] - fmha_out_prefill = fmha_out_prefill.reshape( - [-1, self.num_attention_heads_tp * self.v_head_dim]) - fmha_out_prefill = fmha_out_prefill * mask_encoder_batch.cast( - fmha_out_prefill.dtype) + fmha_out_prefill = fmha_out_prefill.reshape([-1, self.num_attention_heads_tp, self.qk_head_dim]) + fmha_out_prefill = 
fmha_out_prefill[:, :, : self.v_head_dim] + fmha_out_prefill = fmha_out_prefill.reshape([-1, self.num_attention_heads_tp * self.v_head_dim]) + fmha_out_prefill = fmha_out_prefill * mask_encoder_batch.cast(fmha_out_prefill.dtype) fmha_out = fmha_out + fmha_out_prefill if forward_meta.max_dec_len_this_time: @@ -373,51 +365,51 @@ class DeepseekV3MLAAttention(nn.Layer): ln_out_or_q_c = query compressed_kv = self.kv_a_proj_with_mqa(layernorm_out) - compressed_kv, key_pe = compressed_kv.split( - [self.kv_lora_rank, self.qk_rope_head_dim], axis=-1) + compressed_kv, key_pe = compressed_kv.split([self.kv_lora_rank, self.qk_rope_head_dim], axis=-1) key_pe = key_pe.reshape([-1, 1, self.qk_rope_head_dim]) compressed_kv = self.kv_a_layernorm(compressed_kv) query = self.q_b_proj(ln_out_or_q_c) - query = query.reshape( - [-1, self.num_attention_heads_tp, self.qk_head_dim]) + query = query.reshape([-1, self.num_attention_heads_tp, self.qk_head_dim]) - query_nope, query_pe = query.split( - [self.qk_nope_head_dim, self.qk_rope_head_dim], axis=-1) + query_nope, query_pe = query.split([self.qk_nope_head_dim, self.qk_rope_head_dim], axis=-1) query_pe, key_pe = self.rotary_emb(position_ids, query_pe, key_pe) - q_nope_out = self.kv_b_proj_bmm(query_nope.transpose([1, 0, 2]), - proj_type='k').transpose([1, 0, 2]) + q_nope_out = self.kv_b_proj_bmm(query_nope.transpose([1, 0, 2]), proj_type="k").transpose([1, 0, 2]) q_input = paddle.concat([q_nope_out, query_pe], axis=-1) - q_input = q_input.reshape([ - -1, - self.num_attention_heads_tp * - (self.kv_lora_rank + self.qk_rope_head_dim), - ]) - fmha_out_decode = self.mla_attn(q=q_input, - k=None, - v=None, - qkv=None, - compressed_kv=compressed_kv, - k_pe=key_pe, - forward_meta=forward_meta) + q_input = q_input.reshape( + [ + -1, + self.num_attention_heads_tp * (self.kv_lora_rank + self.qk_rope_head_dim), + ] + ) + fmha_out_decode = self.mla_attn( + q=q_input, + k=None, + v=None, + qkv=None, + compressed_kv=compressed_kv, + k_pe=key_pe, + forward_meta=forward_meta, + ) - fmha_out_decode = fmha_out_decode.reshape( - [-1, self.num_attention_heads_tp, - self.kv_lora_rank]).transpose([1, 0, 2]) + fmha_out_decode = fmha_out_decode.reshape([-1, self.num_attention_heads_tp, self.kv_lora_rank]).transpose( + [1, 0, 2] + ) - fmha_out_decode = (self.kv_b_proj_bmm( - fmha_out_decode, proj_type='v').transpose([1, 0, 2]).reshape( - [-1, self.num_attention_heads_tp * self.v_head_dim])) + fmha_out_decode = ( + self.kv_b_proj_bmm(fmha_out_decode, proj_type="v") + .transpose([1, 0, 2]) + .reshape([-1, self.num_attention_heads_tp * self.v_head_dim]) + ) fmha_out = fmha_out + fmha_out_decode output = self.o_proj(fmha_out) return output def load_state_dict(self, state_dict): - """ - """ + """ """ self.q_a_proj.load_state_dict(state_dict) self.q_a_layernorm.load_state_dict(state_dict) self.kv_a_proj_with_mqa.load_state_dict(state_dict) @@ -441,7 +433,7 @@ class DeepSeekV3DecoderLayer(nn.Layer): prefix: str = "", ) -> None: super().__init__() - layer_id = int(prefix.split(sep='.')[-1]) + layer_id = int(prefix.split(sep=".")[-1]) self.self_attn = DeepseekV3MLAAttention( fd_config=fd_config, @@ -449,9 +441,10 @@ class DeepSeekV3DecoderLayer(nn.Layer): prefix=f"{prefix}.self_attn", ) - if (fd_config.model_config.n_routed_experts is not None - and layer_id - >= fd_config.model_config.first_k_dense_replace): + if ( + fd_config.model_config.n_routed_experts is not None + and layer_id >= fd_config.model_config.first_k_dense_replace + ): self.mlp = DeepSeekV3MoE( fd_config=fd_config, 
layer_id=layer_id, @@ -479,8 +472,7 @@ class DeepSeekV3DecoderLayer(nn.Layer): ) def load_state_dict(self, state_dict): - """ - """ + """ """ self.self_attn.load_state_dict(state_dict) self.mlp.load_state_dict(state_dict) self.input_layernorm.load_state_dict(state_dict) @@ -494,20 +486,16 @@ class DeepSeekV3DecoderLayer(nn.Layer): position_ids: paddle.Tensor, mask_encoder_batch: paddle.Tensor, ): - """ - """ + """ """ if residual is None: residual = hidden_states hidden_states = self.input_layernorm(hidden_states) else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) + hidden_states, residual = self.input_layernorm(hidden_states, residual) - hidden_states = self.self_attn(forward_meta, hidden_states, - position_ids, mask_encoder_batch) + hidden_states = self.self_attn(forward_meta, hidden_states, position_ids, mask_encoder_batch) - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) + hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) hidden_states = self.mlp(hidden_states) return hidden_states, residual @@ -536,12 +524,15 @@ class DeepSeekV3Model(nn.Layer): prefix="deepseek_v3.embed_tokens", ) - self.decoder_layers = nn.LayerList([ - DeepSeekV3DecoderLayer( - fd_config, - prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}") - for i in range(self.num_layers) - ]) + self.decoder_layers = nn.LayerList( + [ + DeepSeekV3DecoderLayer( + fd_config, + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}", + ) + for i in range(self.num_layers) + ] + ) self.norm = RMSNorm( fd_config, @@ -567,15 +558,18 @@ class DeepSeekV3Model(nn.Layer): position_ids: paddle.Tensor, mask_encoder_batch: paddle.Tensor, ): - """ - """ + """ """ hidden_states = self.embed_tokens(ids_remove_padding=ids_remove_padding) residual = None for i in range(self.num_layers): hidden_states, residual = self.decoder_layers[i]( - forward_meta, hidden_states, residual, position_ids, - mask_encoder_batch) + forward_meta, + hidden_states, + residual, + position_ids, + mask_encoder_batch, + ) hidden_states = hidden_states + residual out = self.norm(hidden_states) @@ -604,8 +598,7 @@ class DeepseekV3ForCausalLM(ModelForCasualLM): @classmethod def name(cls): - """ - """ + """ """ return "DeepseekV3ForCausalLM" @paddle.no_grad() @@ -617,31 +610,28 @@ class DeepseekV3ForCausalLM(ModelForCasualLM): self.lm_head.load_state_dict(state_dict) def compute_logits(self, hidden_states: paddle.Tensor): - """ - """ + """ """ logits = self.lm_head(hidden_states) logits = paddle.cast(logits, paddle.float32) - logits[:, self.ori_vocab_size:] = -float("inf") + logits[:, self.ori_vocab_size :] = -float("inf") return logits def pre_process(self, forward_meta): - """ - """ + """ """ seq_lens_encoder = forward_meta.seq_lens_encoder seq_lens_decoder = forward_meta.seq_lens_decoder seq_lens_this_time = forward_meta.seq_lens_this_time position_ids_shape = paddle.sum(seq_lens_this_time) - position_ids = paddle.empty(shape=position_ids_shape, - dtype=seq_lens_encoder.dtype) - mask_encoder_batch = paddle.empty( - shape=position_ids_shape, - dtype=seq_lens_encoder.dtype).unsqueeze(1) + position_ids = paddle.empty(shape=position_ids_shape, dtype=seq_lens_encoder.dtype) + mask_encoder_batch = paddle.empty(shape=position_ids_shape, dtype=seq_lens_encoder.dtype).unsqueeze(1) - get_position_ids_and_mask_encoder_batch(seq_lens_encoder, - seq_lens_decoder, - seq_lens_this_time, - position_ids, - mask_encoder_batch) + 
get_position_ids_and_mask_encoder_batch( + seq_lens_encoder, + seq_lens_decoder, + seq_lens_this_time, + position_ids, + mask_encoder_batch, + ) return position_ids, mask_encoder_batch @@ -650,11 +640,14 @@ class DeepseekV3ForCausalLM(ModelForCasualLM): ids_remove_padding: paddle.Tensor, forward_meta: ForwardMeta, ): - """ - """ + """ """ position_ids, mask_encoder_batch = self.pre_process(forward_meta) - hidden_states = self.model(ids_remove_padding=ids_remove_padding, forward_meta=forward_meta, - position_ids=position_ids, mask_encoder_batch=mask_encoder_batch) + hidden_states = self.model( + ids_remove_padding=ids_remove_padding, + forward_meta=forward_meta, + position_ids=position_ids, + mask_encoder_batch=mask_encoder_batch, + ) return hidden_states @@ -676,8 +669,8 @@ class DeepSeekV3PretrainedModel(PretrainedModel): logger.info("DeepseekV3 inference model _get_tensor_parallel_mappings") - from paddleformers.transformers.conversion_utils import \ - split_or_merge_func + from paddleformers.transformers.conversion_utils import split_or_merge_func + fn = split_or_merge_func( is_split=is_split, tensor_parallel_degree=config.tensor_parallel_degree, @@ -691,66 +684,40 @@ class DeepSeekV3PretrainedModel(PretrainedModel): base_actions = { "lm_head.weight": partial(fn, is_column=True), "embed_tokens.weight": partial(fn, is_column=False), - "layers.0.self_attn.o_proj.weight": partial(fn, - is_column=False), + "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False), } # Self Attention Layer which are need TP. - base_actions["layers.0.self_attn.q_b_proj.weight"] = partial( - fn, is_column=True) - base_actions["layers.0.self_attn.kv_b_proj.weight"] = partial( - fn, is_column=True) - base_actions[ - "layers.0.self_attn.q_b_proj.weight_scale_inv"] = partial( - fn, is_column=True) - base_actions[ - "layers.0.self_attn.kv_b_proj.weight_scale_inv"] = partial( - fn, is_column=True) + base_actions["layers.0.self_attn.q_b_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.kv_b_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.q_b_proj.weight_scale_inv"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.kv_b_proj.weight_scale_inv"] = partial(fn, is_column=True) # MLP Layer - base_actions["layers.0.mlp.gate_proj.weight"] = partial( - fn, is_column=True) - base_actions["layers.0.mlp.up_proj.weight"] = partial( - fn, is_column=True) - base_actions["layers.0.mlp.down_proj.weight"] = partial( - fn, is_column=False) + base_actions["layers.0.mlp.gate_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.mlp.up_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.mlp.down_proj.weight"] = partial(fn, is_column=False) # Moe Layer for expert_idx in range(config.n_routed_experts): - base_actions[ - f"layers.0.mlp.experts.{expert_idx}.up_proj.weight"] = partial( - fn, is_column=True) - base_actions[ - f"layers.0.mlp.experts.{expert_idx}.gate_proj.weight"] = partial( - fn, is_column=True) - base_actions[ - f"layers.0.mlp.experts.{expert_idx}.down_proj.weight"] = partial( - fn, is_column=False) + base_actions[f"layers.0.mlp.experts.{expert_idx}.up_proj.weight"] = partial(fn, is_column=True) + base_actions[f"layers.0.mlp.experts.{expert_idx}.gate_proj.weight"] = partial(fn, is_column=True) + base_actions[f"layers.0.mlp.experts.{expert_idx}.down_proj.weight"] = partial(fn, is_column=False) # Shared Expert Layer - base_actions[ - "layers.0.mlp.shared_experts.up_proj.weight"] = partial( - fn, is_column=True) - 
base_actions[ - "layers.0.mlp.shared_experts.gate_proj.weight"] = partial( - fn, is_column=True) - base_actions[ - "layers.0.mlp.shared_experts.down_proj.weight"] = partial( - fn, is_column=False) + base_actions["layers.0.mlp.shared_experts.up_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.mlp.shared_experts.gate_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.mlp.shared_experts.down_proj.weight"] = partial(fn, is_column=False) # MTP parts - base_actions["layers.61.embed_tokens.weight"] = partial( - fn, is_column=False) - base_actions["layers.61.eh_proj.weight"] = partial(fn, - is_column=True) - base_actions["layers.61.shared_head.head.weight"] = partial( - fn, is_column=True) + base_actions["layers.61.embed_tokens.weight"] = partial(fn, is_column=False) + base_actions["layers.61.eh_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.61.shared_head.head.weight"] = partial(fn, is_column=True) for key, action in base_actions.items(): if "layers.0." in key: for i in range(num_layers): - final_actions[key.replace("layers.0.", - f"layers.{i}.")] = action + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action final_actions[key] = action return final_actions diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py index 873bc041f..460170b7d 100644 --- a/fastdeploy/model_executor/models/ernie4_5_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_moe.py @@ -28,25 +28,27 @@ from paddleformers.utils.log import logger from fastdeploy.config import FDConfig from fastdeploy.model_executor.forward_meta import ForwardMeta -from fastdeploy.model_executor.graph_optimization.decorator import \ - support_graph_optimization +from fastdeploy.model_executor.graph_optimization.decorator import ( + support_graph_optimization, +) from fastdeploy.model_executor.layers.activation import SiluAndMul from fastdeploy.model_executor.layers.attention.attention import Attention from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding from fastdeploy.model_executor.layers.linear import ( - MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear, +) from fastdeploy.model_executor.layers.lm_head import ParallelLMHead from fastdeploy.model_executor.layers.moe.moe import FusedMoE from fastdeploy.model_executor.layers.normalization import RMSNorm from fastdeploy.model_executor.models.model_base import ModelForCasualLM from fastdeploy.model_executor.models.tp_utils import TensorSplitMode as tsm -from fastdeploy.model_executor.models.utils import \ - LayerIdPlaceholder as layerid +from fastdeploy.model_executor.models.utils import LayerIdPlaceholder as layerid from fastdeploy.model_executor.models.utils import WeightMeta class Ernie4_5_MLP(nn.Layer): - def __init__( self, fd_config: FDConfig, @@ -92,91 +94,57 @@ class Ernie4_5_MLP(nn.Layer): class Ernie4_5_MoE(nn.Layer): - - def __init__(self, fd_config: FDConfig, layer_id: int, - prefix: str) -> None: + def __init__(self, fd_config: FDConfig, layer_id: int, prefix: str) -> None: super().__init__() moe_quant_type = "" - if hasattr(fd_config.quant_config, 'moe_quant_type'): + if hasattr(fd_config.quant_config, "moe_quant_type"): moe_quant_type = fd_config.quant_config.moe_quant_type if moe_quant_type == "w4a8": weight_key_map = { - "gate_weight_key": - f"{prefix}.gate.weight", - "gate_correction_bias_key": - 
f"{prefix}.moe_statics.e_score_correction_bias", - "up_gate_proj_expert_weight_key": - f"{prefix}.experts.{{}}.up_gate_proj.quant_weight", - "down_proj_expert_weight_key": - f"{prefix}.experts.{{}}.down_proj.quant_weight", - "up_gate_proj_expert_weight_scale_key": - f"{prefix}.experts.{{}}.up_gate_proj.weight_scale", - "down_proj_expert_weight_scale_key": - f"{prefix}.experts.{{}}.down_proj.weight_scale", - "up_gate_proj_expert_in_scale_key": - f"{prefix}.experts.{{}}.up_gate_proj.activation_scale", - "down_proj_expert_in_scale_key": - f"{prefix}.experts.{{}}.down_proj.activation_scale", + "gate_weight_key": f"{prefix}.gate.weight", + "gate_correction_bias_key": f"{prefix}.moe_statics.e_score_correction_bias", + "up_gate_proj_expert_weight_key": f"{prefix}.experts.{{}}.up_gate_proj.quant_weight", + "down_proj_expert_weight_key": f"{prefix}.experts.{{}}.down_proj.quant_weight", + "up_gate_proj_expert_weight_scale_key": f"{prefix}.experts.{{}}.up_gate_proj.weight_scale", + "down_proj_expert_weight_scale_key": f"{prefix}.experts.{{}}.down_proj.weight_scale", + "up_gate_proj_expert_in_scale_key": f"{prefix}.experts.{{}}.up_gate_proj.activation_scale", + "down_proj_expert_in_scale_key": f"{prefix}.experts.{{}}.down_proj.activation_scale", } elif moe_quant_type == "w4w2": weight_key_map = { - "gate_weight_key": - f"{prefix}.gate.weight", - "gate_correction_bias_key": - f"{prefix}.moe_statics.e_score_correction_bias", - "up_gate_proj_expert_weight_key": - f"{prefix}.experts.{{}}.up_gate_proj.quant_weight", - "down_proj_expert_weight_key": - f"{prefix}.experts.{{}}.down_proj.quant_weight", - "up_gate_proj_expert_weight_scale_key": - f"{prefix}.experts.{{}}.up_gate_proj.weight_scale", - "down_proj_expert_weight_scale_key": - f"{prefix}.experts.{{}}.down_proj.weight_scale", - "up_gate_proj_expert_super_scales_key": - f"{prefix}.experts.{{}}.up_gate_proj.super_scales", - "down_proj_expert_super_scales_key": - f"{prefix}.experts.{{}}.down_proj.super_scales", - "up_gate_proj_expert_code_scale_key": - f"{prefix}.experts.{{}}.up_gate_proj.code_scale", - "down_proj_expert_code_scale_key": - f"{prefix}.experts.{{}}.down_proj.code_scale", - "up_gate_proj_expert_code_zp_key": - f"{prefix}.experts.{{}}.up_gate_proj.code_zp", - "down_proj_expert_code_zp_key": - f"{prefix}.experts.{{}}.down_proj.code_zp", + "gate_weight_key": f"{prefix}.gate.weight", + "gate_correction_bias_key": f"{prefix}.moe_statics.e_score_correction_bias", + "up_gate_proj_expert_weight_key": f"{prefix}.experts.{{}}.up_gate_proj.quant_weight", + "down_proj_expert_weight_key": f"{prefix}.experts.{{}}.down_proj.quant_weight", + "up_gate_proj_expert_weight_scale_key": f"{prefix}.experts.{{}}.up_gate_proj.weight_scale", + "down_proj_expert_weight_scale_key": f"{prefix}.experts.{{}}.down_proj.weight_scale", + "up_gate_proj_expert_super_scales_key": f"{prefix}.experts.{{}}.up_gate_proj.super_scales", + "down_proj_expert_super_scales_key": f"{prefix}.experts.{{}}.down_proj.super_scales", + "up_gate_proj_expert_code_scale_key": f"{prefix}.experts.{{}}.up_gate_proj.code_scale", + "down_proj_expert_code_scale_key": f"{prefix}.experts.{{}}.down_proj.code_scale", + "up_gate_proj_expert_code_zp_key": f"{prefix}.experts.{{}}.up_gate_proj.code_zp", + "down_proj_expert_code_zp_key": f"{prefix}.experts.{{}}.down_proj.code_zp", } elif moe_quant_type == "tensor_wise_fp8" or ( - moe_quant_type == "block_wise_fp8" - and fd_config.model_config.is_quantized): + moe_quant_type == "block_wise_fp8" and fd_config.model_config.is_quantized + ): weight_key_map = { - 
"gate_weight_key": - f"{prefix}.gate.weight", - "gate_correction_bias_key": - f"{prefix}.moe_statics.e_score_correction_bias", - "up_gate_proj_expert_weight_key": - f"{prefix}.experts.{{}}.up_gate_proj.quant_weight", - "down_proj_expert_weight_key": - f"{prefix}.experts.{{}}.down_proj.quant_weight", - "up_gate_proj_expert_weight_scale_key": - f"{prefix}.experts.{{}}.up_gate_proj.weight_scale", - "down_proj_expert_weight_scale_key": - f"{prefix}.experts.{{}}.down_proj.weight_scale", - "up_gate_proj_expert_in_scale_key": - f"{prefix}.experts.{{}}.up_gate_proj.activation_scale", - "down_proj_expert_in_scale_key": - f"{prefix}.experts.{{}}.down_proj.activation_scale", + "gate_weight_key": f"{prefix}.gate.weight", + "gate_correction_bias_key": f"{prefix}.moe_statics.e_score_correction_bias", + "up_gate_proj_expert_weight_key": f"{prefix}.experts.{{}}.up_gate_proj.quant_weight", + "down_proj_expert_weight_key": f"{prefix}.experts.{{}}.down_proj.quant_weight", + "up_gate_proj_expert_weight_scale_key": f"{prefix}.experts.{{}}.up_gate_proj.weight_scale", + "down_proj_expert_weight_scale_key": f"{prefix}.experts.{{}}.down_proj.weight_scale", + "up_gate_proj_expert_in_scale_key": f"{prefix}.experts.{{}}.up_gate_proj.activation_scale", + "down_proj_expert_in_scale_key": f"{prefix}.experts.{{}}.down_proj.activation_scale", } else: weight_key_map = { - "gate_weight_key": - f"{prefix}.gate.weight", - "gate_correction_bias_key": - f"{prefix}.moe_statics.e_score_correction_bias", - "up_gate_proj_expert_weight_key": - f"{prefix}.experts.{{}}.up_gate_proj.weight", - "down_proj_expert_weight_key": - f"{prefix}.experts.{{}}.down_proj.weight", + "gate_weight_key": f"{prefix}.gate.weight", + "gate_correction_bias_key": f"{prefix}.moe_statics.e_score_correction_bias", + "up_gate_proj_expert_weight_key": f"{prefix}.experts.{{}}.up_gate_proj.weight", + "down_proj_expert_weight_key": f"{prefix}.experts.{{}}.down_proj.weight", } self.fused_moe = FusedMoE( @@ -211,9 +179,7 @@ class Ernie4_5_MoE(nn.Layer): class Ernie4_5_Attention(nn.Layer): - - def __init__(self, fd_config: FDConfig, layer_id: int, - prefix: str) -> None: + def __init__(self, fd_config: FDConfig, layer_id: int, prefix: str) -> None: super().__init__() self.qkv_proj = QKVParallelLinear( @@ -224,8 +190,7 @@ class Ernie4_5_Attention(nn.Layer): self.o_proj = RowParallelLinear( fd_config=fd_config, prefix=f"{prefix}.o_proj", - input_size=fd_config.model_config.head_dim * - fd_config.model_config.num_attention_heads, + input_size=fd_config.model_config.head_dim * fd_config.model_config.num_attention_heads, output_size=fd_config.model_config.hidden_size, ) self.attn = Attention( @@ -258,14 +223,13 @@ class Ernie4_5_Attention(nn.Layer): class Ernie4_5_DecoderLayer(nn.Layer): - def __init__( self, fd_config: FDConfig, prefix: str = "", ) -> None: super().__init__() - layer_id = int(prefix.split(sep='.')[-1]) + layer_id = int(prefix.split(sep=".")[-1]) self.self_attn = Ernie4_5_Attention( fd_config=fd_config, @@ -273,8 +237,10 @@ class Ernie4_5_DecoderLayer(nn.Layer): prefix=f"{prefix}.self_attn", ) - if (getattr(fd_config.model_config, "moe_num_experts", None) is not None - and layer_id >= fd_config.model_config.moe_layer_start_index): + if ( + getattr(fd_config.model_config, "moe_num_experts", None) is not None + and layer_id >= fd_config.model_config.moe_layer_start_index + ): self.mlp = Ernie4_5_MoE( fd_config=fd_config, layer_id=layer_id, @@ -317,16 +283,14 @@ class Ernie4_5_DecoderLayer(nn.Layer): residual = hidden_states hidden_states = 
self.input_layernorm(hidden_states) else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) + hidden_states, residual = self.input_layernorm(hidden_states, residual) hidden_states = self.self_attn( hidden_states=hidden_states, forward_meta=forward_meta, ) - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) + hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) hidden_states = self.mlp(hidden_states) @@ -335,7 +299,6 @@ class Ernie4_5_DecoderLayer(nn.Layer): @support_graph_optimization class Ernie4_5_Model(nn.Layer): - def __init__( self, fd_config: FDConfig = None, @@ -356,14 +319,18 @@ class Ernie4_5_Model(nn.Layer): num_embeddings=fd_config.model_config.vocab_size, embedding_dim=fd_config.model_config.hidden_size, params_dtype=paddle.get_default_dtype(), - prefix=(f"{fd_config.model_config.pretrained_config.prefix_name}.embed_tokens")) + prefix=(f"{fd_config.model_config.pretrained_config.prefix_name}.embed_tokens"), + ) - self.layers = nn.LayerList([ - Ernie4_5_DecoderLayer( - fd_config=fd_config, - prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}") - for i in range(self.num_layers) - ]) + self.layers = nn.LayerList( + [ + Ernie4_5_DecoderLayer( + fd_config=fd_config, + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}", + ) + for i in range(self.num_layers) + ] + ) self.norm = RMSNorm( fd_config, @@ -396,9 +363,7 @@ class Ernie4_5_Model(nn.Layer): residual = None for i in range(self.num_layers): - hidden_states, residual = self.layers[i](forward_meta, - hidden_states, - residual) + hidden_states, residual = self.layers[i](forward_meta, hidden_states, residual) hidden_states = hidden_states + residual @@ -436,8 +401,7 @@ class Ernie4_5_MoeForCausalLM(ModelForCasualLM): return "Ernie4_5_MoeForCausalLM" @paddle.no_grad() - def set_state_dict(self, state_dict: Dict[str, Union[np.ndarray, - paddle.Tensor]]): + def set_state_dict(self, state_dict: Dict[str, Union[np.ndarray, paddle.Tensor]]): """ Load model parameters from a given state dictionary. 
@@ -448,15 +412,14 @@ class Ernie4_5_MoeForCausalLM(ModelForCasualLM): """ self.ernie.load_state_dict(state_dict) if self.tie_word_embeddings: - self.lm_head.linear.weight.set_value( - self.ernie.embed_tokens.embeddings.weight.transpose([1, 0])) + self.lm_head.linear.weight.set_value(self.ernie.embed_tokens.embeddings.weight.transpose([1, 0])) else: self.lm_head.load_state_dict(state_dict) def compute_logits(self, hidden_states: paddle.Tensor): logits = self.lm_head(hidden_states) logits = paddle.cast(logits, paddle.float32) - logits[:, self.ori_vocab_size:] = -float("inf") + logits[:, self.ori_vocab_size :] = -float("inf") return logits @@ -468,8 +431,10 @@ class Ernie4_5_MoeForCausalLM(ModelForCasualLM): shape=[0, self.fd_config.model_config.hidden_size], dtype=paddle.get_default_dtype(), ) - for i in range(self.fd_config.model_config.moe_layer_start_index, - self.fd_config.model_config.num_hidden_layers): + for i in range( + self.fd_config.model_config.moe_layer_start_index, + self.fd_config.model_config.num_hidden_layers, + ): self.ernie.layers[i].mlp.fused_moe(fake_hidden_states) def forward( @@ -477,8 +442,7 @@ class Ernie4_5_MoeForCausalLM(ModelForCasualLM): ids_remove_padding: paddle.Tensor, forward_meta: ForwardMeta, ): - hidden_states = self.ernie(ids_remove_padding=ids_remove_padding, - forward_meta=forward_meta) + hidden_states = self.ernie(ids_remove_padding=ids_remove_padding, forward_meta=forward_meta) return hidden_states @@ -510,54 +474,75 @@ class Ernie4_5_PretrainedModel(PretrainedModel): return None weight_infos = [ - WeightMeta(f".layers.{{{layerid.LAYER_ID}}}.self_attn.qkv_proj.weight", - True, tsm.GQA), - WeightMeta(f".layers.{{{layerid.LAYER_ID}}}.self_attn.o_proj.weight", - False), + WeightMeta( + f".layers.{{{layerid.LAYER_ID}}}.self_attn.qkv_proj.weight", + True, + tsm.GQA, + ), + WeightMeta(f".layers.{{{layerid.LAYER_ID}}}.self_attn.o_proj.weight", False), WeightMeta( f".layers.{{{layerid.FFN_LAYER_ID}}}.mlp.up_gate_proj.weight", - True, tsm.PairFused), - WeightMeta(f".layers.{{{layerid.FFN_LAYER_ID}}}.mlp.down_proj.weight", - False), + True, + tsm.PairFused, + ), + WeightMeta(f".layers.{{{layerid.FFN_LAYER_ID}}}.mlp.down_proj.weight", False), WeightMeta( f".layers.{{{layerid.MOE_LAYER_ID}}}.mlp.experts.{{{layerid.EXPERT_ID}}}.up_gate_proj.weight", - True, tsm.PairFused), + True, + tsm.PairFused, + ), WeightMeta( f".layers.{{{layerid.MOE_LAYER_ID}}}.mlp.experts.{{{layerid.EXPERT_ID}}}.down_proj.weight", - False), + False, + ), WeightMeta( f".layers.{{{layerid.MOE_LAYER_ID}}}.mlp.shared_experts.up_gate_proj.weight", - True, tsm.PairFused), + True, + tsm.PairFused, + ), WeightMeta( f".layers.{{{layerid.MOE_LAYER_ID}}}.mlp.shared_experts.down_proj.weight", - False), + False, + ), WeightMeta(".embed_tokens.weight", False), WeightMeta("lm_head.weight", True), # quant tensorwise WeightMeta( f".layers.{{{layerid.LAYER_ID}}}.self_attn.qkv_proj.quant_weight", - True, tsm.GQA), + True, + tsm.GQA, + ), WeightMeta( f".layers.{{{layerid.LAYER_ID}}}.self_attn.o_proj.quant_weight", - False), + False, + ), WeightMeta( f".layers.{{{layerid.FFN_LAYER_ID}}}.mlp.up_gate_proj.quant_weight", - True, tsm.PairFused), + True, + tsm.PairFused, + ), WeightMeta( f".layers.{{{layerid.FFN_LAYER_ID}}}.mlp.down_proj.quant_weight", - False), + False, + ), WeightMeta( f".layers.{{{layerid.MOE_LAYER_ID}}}.mlp.experts.{{{layerid.EXPERT_ID}}}.up_gate_proj.quant_weight", - True, tsm.PairFused), + True, + tsm.PairFused, + ), WeightMeta( 
f".layers.{{{layerid.MOE_LAYER_ID}}}.mlp.experts.{{{layerid.EXPERT_ID}}}.down_proj.quant_weight", - False), + False, + ), WeightMeta( f".layers.{{{layerid.MOE_LAYER_ID}}}.mlp.shared_experts.up_gate_proj.quant_weight", - True, tsm.PairFused), + True, + tsm.PairFused, + ), WeightMeta( f".layers.{{{layerid.MOE_LAYER_ID}}}.mlp.shared_experts.down_proj.quant_weight", - False), + False, + ), ] @classmethod @@ -567,7 +552,10 @@ class Ernie4_5_PretrainedModel(PretrainedModel): """ logger.info("erine inference model _get_tensor_parallel_mappings") from fastdeploy.model_executor.models.tp_utils import ( - build_expanded_keys, has_prefix, split_or_merge_func_v1) + build_expanded_keys, + has_prefix, + split_or_merge_func_v1, + ) fn = split_or_merge_func_v1( is_split=is_split, @@ -575,19 +563,16 @@ class Ernie4_5_PretrainedModel(PretrainedModel): tensor_parallel_rank=config.tensor_parallel_rank, num_attention_heads=config.num_attention_heads, num_key_value_heads=config.num_key_value_heads, - head_dim=config.head_dim) + head_dim=config.head_dim, + ) - def get_tensor_parallel_split_mappings(num_layers, moe_num_experts, - moe_layer_start_index, - prefix_name): + def get_tensor_parallel_split_mappings(num_layers, moe_num_experts, moe_layer_start_index, prefix_name): base_actions = {} weight_infos = cls.weight_infos - for (weight_name, is_column, extra) in weight_infos: + for weight_name, is_column, extra in weight_infos: params = { "is_column": is_column, - **({ - extra.value: True - } if extra else {}) + **({extra.value: True} if extra else {}), } if "lm_head.weight" in weight_name: @@ -598,12 +583,10 @@ class Ernie4_5_PretrainedModel(PretrainedModel): key = weight_name base_actions[key] = partial(fn, **params) final_actions = {} - start_layer = (moe_layer_start_index - if moe_layer_start_index > 0 else num_layers) - final_actions = build_expanded_keys( - base_actions, num_layers, start_layer, moe_num_experts - ) + start_layer = moe_layer_start_index if moe_layer_start_index > 0 else num_layers + final_actions = build_expanded_keys(base_actions, num_layers, start_layer, moe_num_experts) return final_actions + mappings = get_tensor_parallel_split_mappings( config.num_hidden_layers, getattr(config, "moe_num_experts", 0), diff --git a/fastdeploy/model_executor/models/ernie4_5_mtp.py b/fastdeploy/model_executor/models/ernie4_5_mtp.py index 47dbee48f..b52d8ed71 100644 --- a/fastdeploy/model_executor/models/ernie4_5_mtp.py +++ b/fastdeploy/model_executor/models/ernie4_5_mtp.py @@ -53,8 +53,7 @@ class Ernie4_5_MTPPretrainedModel(PretrainedModel): """ logger.info("erine inference model _get_tensor_parallel_mappings") - from paddleformers.transformers.conversion_utils import \ - split_or_merge_func + from paddleformers.transformers.conversion_utils import split_or_merge_func fn = split_or_merge_func( is_split=is_split, @@ -71,10 +70,8 @@ class Ernie4_5_MTPPretrainedModel(PretrainedModel): num_key_value_heads, head_dim, ): - def get_shape(tensor): - return (tensor.get_shape() - if hasattr(tensor, "get_shape") else tensor.shape) + return tensor.get_shape() if hasattr(tensor, "get_shape") else tensor.shape def slice_tensor(tensor, start, end): shape = get_shape(tensor) @@ -96,11 +93,7 @@ class Ernie4_5_MTPPretrainedModel(PretrainedModel): size = shape[-1] block_size = size // degree if hasattr(tensor, "get_shape"): - return [ - slice_tensor(tensor, i * block_size, - (i + 1) * block_size) - for i in range(degree) - ] + return [slice_tensor(tensor, i * block_size, (i + 1) * block_size) for i in range(degree)] else: 
return np.split(tensor, degree, axis=-1) @@ -109,10 +102,7 @@ class Ernie4_5_MTPPretrainedModel(PretrainedModel): v_list = split_tensor(v, tensor_parallel_degree) if tensor_parallel_rank is None: - return [ - np.concatenate([q_i, k_i, v_i], axis=-1) - for q_i, k_i, v_i in zip(q_list, k_list, v_list) - ] + return [np.concatenate([q_i, k_i, v_i], axis=-1) for q_i, k_i, v_i in zip(q_list, k_list, v_list)] else: return np.concatenate( [ @@ -123,8 +113,7 @@ class Ernie4_5_MTPPretrainedModel(PretrainedModel): axis=-1, ) - def gqa_qkv_merge_func(weight_list, num_attention_heads, - num_key_value_heads, head_dim): + def gqa_qkv_merge_func(weight_list, num_attention_heads, num_key_value_heads, head_dim): tensor_parallel_degree = len(weight_list) num_attention_heads = num_attention_heads // tensor_parallel_degree num_key_value_heads = num_key_value_heads // tensor_parallel_degree @@ -132,8 +121,7 @@ class Ernie4_5_MTPPretrainedModel(PretrainedModel): is_paddle_tensor = not isinstance(weight_list[0], np.ndarray) def get_shape(tensor): - return (tensor.get_shape() - if hasattr(tensor, "get_shape") else tensor.shape) + return tensor.get_shape() if hasattr(tensor, "get_shape") else tensor.shape def slice_tensor(tensor, start, end): if len(get_shape(tensor)) == 1: @@ -166,8 +154,7 @@ class Ernie4_5_MTPPretrainedModel(PretrainedModel): else: return np.concatenate(merged, axis=-1) - if (config.num_key_value_heads is not None - and config.num_key_value_heads != config.num_attention_heads): + if config.num_key_value_heads is not None and config.num_key_value_heads != config.num_attention_heads: if is_split: qkv_fn = partial( gqa_qkv_split_func, @@ -187,8 +174,7 @@ class Ernie4_5_MTPPretrainedModel(PretrainedModel): else: qkv_fn = partial(fn, is_column=True) - def get_tensor_parallel_split_mappings(num_layers, moe_num_experts, - moe_layer_start_index): + def get_tensor_parallel_split_mappings(num_layers, moe_num_experts, moe_layer_start_index): """ get tensor from parallel-split-mappings """ @@ -197,38 +183,32 @@ class Ernie4_5_MTPPretrainedModel(PretrainedModel): base_actions = {} - base_actions["ernie.mtp_linear_proj.0.weight"] = partial( - fn, is_column=True) - base_actions[ - f"{base_model_prefix}.0.self_attn.qkv_proj.weight"] = qkv_fn - base_actions[ - f"{base_model_prefix}.0.self_attn.o_proj.weight"] = partial( - fn, is_column=False) - base_actions[ - f"{base_model_prefix}.0.mlp.up_gate_proj.weight"] = partial( - fn, is_column=True, is_naive_2fuse=True) - base_actions[f"{base_model_prefix}.0.mlp.down_proj.weight"] = ( - partial(fn, is_column=False)) + base_actions["ernie.mtp_linear_proj.0.weight"] = partial(fn, is_column=True) + base_actions[f"{base_model_prefix}.0.self_attn.qkv_proj.weight"] = qkv_fn + base_actions[f"{base_model_prefix}.0.self_attn.o_proj.weight"] = partial(fn, is_column=False) + base_actions[f"{base_model_prefix}.0.mlp.up_gate_proj.weight"] = partial( + fn, is_column=True, is_naive_2fuse=True + ) + base_actions[f"{base_model_prefix}.0.mlp.down_proj.weight"] = partial(fn, is_column=False) for expert_idx in range(moe_num_experts): base_actions[ - f"{base_model_prefix}.{moe_layer_start_index}" - f".mlp.experts.{expert_idx}.up_gate_proj.weight"] = partial( - fn, is_column=True, is_naive_2fuse=True) + f"{base_model_prefix}.{moe_layer_start_index}" f".mlp.experts.{expert_idx}.up_gate_proj.weight" + ] = partial(fn, is_column=True, is_naive_2fuse=True) base_actions[ - f"{base_model_prefix}.{moe_layer_start_index}" - f".mlp.experts.{expert_idx}.down_proj.weight"] = partial( - fn, 
is_column=False) + f"{base_model_prefix}.{moe_layer_start_index}" f".mlp.experts.{expert_idx}.down_proj.weight" + ] = partial(fn, is_column=False) for key, action in base_actions.items(): - if (f"{base_model_prefix}.0.mlp.up_gate_proj.weight" in key or - f"{base_model_prefix}.0.mlp.down_proj.weight" in key): + if ( + f"{base_model_prefix}.0.mlp.up_gate_proj.weight" in key + or f"{base_model_prefix}.0.mlp.down_proj.weight" in key + ): for i in range(moe_layer_start_index): final_actions[key.replace("0.", f"{i}.")] = action elif f"{moe_layer_start_index}.mlp.experts." in key: for i in range(moe_layer_start_index, num_layers): - final_actions[key.replace(f"{moe_layer_start_index}.", - f"{i}.")] = action + final_actions[key.replace(f"{moe_layer_start_index}.", f"{i}.")] = action elif f"{base_model_prefix}.0." in key: for i in range(num_layers): final_actions[key.replace("0.", f"{i}.")] = action @@ -265,12 +245,15 @@ class Ernie4_5_MTPModel(nn.Layer): self.num_layers = fd_config.model_config.num_hidden_layers self.embed_tokens = fd_config.speculative_config.sharing_model.ernie.embed_tokens - self.layers = nn.LayerList([ - Ernie4_5_DecoderLayer( - fd_config=fd_config, - prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.{i}") - for i in range(self.num_layers) - ]) + self.layers = nn.LayerList( + [ + Ernie4_5_DecoderLayer( + fd_config=fd_config, + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.{i}", + ) + for i in range(self.num_layers) + ] + ) self.enorm = RMSNorm( fd_config, @@ -319,18 +302,15 @@ class Ernie4_5_MTPModel(nn.Layer): """ forward """ - inputs_embedding = self.embed_tokens( - ids_remove_padding=ids_remove_padding) + inputs_embedding = self.embed_tokens(ids_remove_padding=ids_remove_padding) inputs_embedding = paddle.concat( - [self.enorm(inputs_embedding), - self.hnorm(previous_hidden_states)], - axis=-1) + [self.enorm(inputs_embedding), self.hnorm(previous_hidden_states)], + axis=-1, + ) hidden_states = self.eh_proj(inputs_embedding) residual = None for i in range(self.num_layers): - hidden_states, residual = self.layers[i](forward_meta, - hidden_states, - residual) + hidden_states, residual = self.layers[i](forward_meta, hidden_states, residual) hidden_states = hidden_states + residual @@ -358,13 +338,11 @@ class Ernie4_5_MTPForCausalLM(ModelForCasualLM): @classmethod def name(self): - """ - """ + """ """ return "Ernie4_5_MTPForCausalLM" @paddle.no_grad() - def set_state_dict(self, state_dict: Dict[str, Union[np.ndarray, - paddle.Tensor]]): + def set_state_dict(self, state_dict: Dict[str, Union[np.ndarray, paddle.Tensor]]): """ Load model parameters from a given state dictionary. 
@@ -386,7 +364,7 @@ class Ernie4_5_MTPForCausalLM(ModelForCasualLM): """ logits = self.lm_head(hidden_states) logits = paddle.cast(logits, paddle.float32) - logits[:, self.ori_vocab_size:] = -float("inf") + logits[:, self.ori_vocab_size :] = -float("inf") return logits @@ -398,8 +376,10 @@ class Ernie4_5_MTPForCausalLM(ModelForCasualLM): shape=[0, self.fd_config.model_config.hidden_size], dtype=paddle.get_default_dtype(), ) - for i in range(self.fd_config.model_config.moe_layer_start_index, - self.fd_config.model_config.num_hidden_layers): + for i in range( + self.fd_config.model_config.moe_layer_start_index, + self.fd_config.model_config.num_hidden_layers, + ): self.ernie.layers[i].mlp.fused_moe(fake_hidden_states) def forward( @@ -411,7 +391,6 @@ class Ernie4_5_MTPForCausalLM(ModelForCasualLM): """ forward """ - hidden_states = self.ernie(ids_remove_padding, previous_hidden_states, - forward_meta) + hidden_states = self.ernie(ids_remove_padding, previous_hidden_states, forward_meta) return hidden_states diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/dfnrope/__init__.py b/fastdeploy/model_executor/models/ernie4_5_vl/dfnrope/__init__.py index baf7645d7..4c283de51 100644 --- a/fastdeploy/model_executor/models/ernie4_5_vl/dfnrope/__init__.py +++ b/fastdeploy/model_executor/models/ernie4_5_vl/dfnrope/__init__.py @@ -18,5 +18,6 @@ from .configuration import DFNRopeVisionTransformerConfig from .modeling import DFNRopeVisionTransformerPretrainedModel __all__ = [ - 'DFNRopeVisionTransformerConfig', 'DFNRopeVisionTransformerPretrainedModel' + "DFNRopeVisionTransformerConfig", + "DFNRopeVisionTransformerPretrainedModel", ] diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/dfnrope/activation.py b/fastdeploy/model_executor/models/ernie4_5_vl/dfnrope/activation.py index b1f87a59a..1c3b22ae1 100644 --- a/fastdeploy/model_executor/models/ernie4_5_vl/dfnrope/activation.py +++ b/fastdeploy/model_executor/models/ernie4_5_vl/dfnrope/activation.py @@ -37,9 +37,9 @@ class NewGELUActivation(nn.Layer): Returns: Tensor: _description_ """ - return (0.5 * input * (1.0 + paddle.tanh( - math.sqrt(2.0 / math.pi) * - (input + 0.044715 * paddle.pow(input, 3.0))))) + return ( + 0.5 * input * (1.0 + paddle.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * paddle.pow(input, 3.0)))) + ) class GELUActivation(nn.Layer): @@ -99,9 +99,7 @@ class FastGELUActivation(nn.Layer): Returns: Tensor: _description_ """ - return 0.5 * input * (1.0 + - paddle.tanh(input * 0.7978845608 * - (1.0 + 0.044715 * input * input))) + return 0.5 * input * (1.0 + paddle.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input))) class QuickGELUActivation(nn.Layer): @@ -136,8 +134,7 @@ class ClippedGELUActivation(nn.Layer): def __init__(self, min: float, max: float): if min > max: - raise ValueError( - f"min should be < max (got min: {min}, max: {max})") + raise ValueError(f"min should be < max (got min: {min}, max: {max})") super().__init__() self.min = min @@ -234,15 +231,10 @@ class ClassInstantier(OrderedDict): ACT2CLS = { "gelu": GELUActivation, - "gelu_10": (ClippedGELUActivation, { - "min": -10, - "max": 10 - }), + "gelu_10": (ClippedGELUActivation, {"min": -10, "max": 10}), "gelu_fast": FastGELUActivation, "gelu_new": NewGELUActivation, - "gelu_python": (GELUActivation, { - "use_gelu_python": True - }), + "gelu_python": (GELUActivation, {"use_gelu_python": True}), "linear": LinearActivation, "mish": MishActivation, "quick_gelu": QuickGELUActivation, @@ -271,9 +263,7 @@ def get_activation(activation_string): if 
activation_string in ACT2FN: return ACT2FN[activation_string] else: - raise KeyError( - f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}" - ) + raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}") # For backwards compatibility with: from activations import gelu_python diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/dfnrope/modeling.py b/fastdeploy/model_executor/models/ernie4_5_vl/dfnrope/modeling.py index ff532dd4c..2dcf07559 100644 --- a/fastdeploy/model_executor/models/ernie4_5_vl/dfnrope/modeling.py +++ b/fastdeploy/model_executor/models/ernie4_5_vl/dfnrope/modeling.py @@ -22,11 +22,14 @@ import paddle.distributed as dist import paddle.nn.functional as F from paddle import nn from paddle.distributed import fleet -from paddle.distributed.fleet.meta_parallel import (ColumnParallelLinear, - RowParallelLinear) +from paddle.distributed.fleet.meta_parallel import ( + ColumnParallelLinear, + RowParallelLinear, +) from paddle.distributed.fleet.utils import recompute -from paddle.nn.functional.flash_attention import \ - flash_attn_unpadded as flash_attn_varlen_func +from paddle.nn.functional.flash_attention import ( + flash_attn_unpadded as flash_attn_varlen_func, +) from paddleformers.transformers.model_utils import PretrainedModel from fastdeploy.model_executor.layers.utils import get_tensor @@ -49,7 +52,6 @@ def get_hcg(): class _AllToAll(paddle.autograd.PyLayer): - @staticmethod def forward( ctx, @@ -78,19 +80,20 @@ class _AllToAll(paddle.autograd.PyLayer): return input if input_split_sizes is None and output_split_sizes is None: output = paddle.empty_like(input) - task = dist.stream.alltoall_single(output, input, None, None, - group, True, True) + task = dist.stream.alltoall_single(output, input, None, None, group, True, True) task.wait() else: out_sizes = [sum(output_split_sizes)] out_sizes.extend(input.shape[1:]) output = paddle.empty(out_sizes, dtype=input.dtype) - task = dist.stream.alltoall_single(output, - input, - output_split_sizes, - input_split_sizes, - group, - sync_op=False) + task = dist.stream.alltoall_single( + output, + input, + output_split_sizes, + input_split_sizes, + group, + sync_op=False, + ) task.wait() return output @@ -104,21 +107,23 @@ class _AllToAll(paddle.autograd.PyLayer): if ctx.input_split_sizes is None and ctx.output_split_sizes is None: return _AllToAll.apply(*grad_output, ctx.group) else: - return _AllToAll.apply(*grad_output, ctx.group, - ctx.input_split_sizes, - ctx.output_split_sizes) + return _AllToAll.apply( + *grad_output, + ctx.group, + ctx.input_split_sizes, + ctx.output_split_sizes, + ) # Copied from transformers.models.llama.modeling_llama.rotate_half def rotate_half(x): """Rotates half the hidden dims of the input.""" - x1 = x[..., :x.shape[-1] // 2] - x2 = x[..., x.shape[-1] // 2:] + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] return paddle.concat([-x2, x1], axis=-1) # shape is the same as x -def apply_rotary_pos_emb_vision(tensor: paddle.Tensor, - freqs: paddle.Tensor) -> paddle.Tensor: +def apply_rotary_pos_emb_vision(tensor: paddle.Tensor, freqs: paddle.Tensor) -> paddle.Tensor: """_summary_ Args: @@ -134,10 +139,8 @@ def apply_rotary_pos_emb_vision(tensor: paddle.Tensor, tensor = tensor.astype(dtype="float32") cos = freqs.cos() sin = freqs.sin() - cos = cos.unsqueeze(1).tile( - repeat_times=[1, 1, 2]).unsqueeze(0).astype(dtype="float32") - sin = sin.unsqueeze(1).tile( - repeat_times=[1, 1, 2]).unsqueeze(0).astype(dtype="float32") + 
cos = cos.unsqueeze(1).tile(repeat_times=[1, 1, 2]).unsqueeze(0).astype(dtype="float32") + sin = sin.unsqueeze(1).tile(repeat_times=[1, 1, 2]).unsqueeze(0).astype(dtype="float32") output = tensor * cos + rotate_half(tensor) * sin output = paddle.cast(output, orig_dtype) return output @@ -150,10 +153,7 @@ class VisionFlashAttention2(nn.Layer): nn (_type_): _description_ """ - def __init__(self, - dim: int, - num_heads: int = 16, - tensor_parallel_degree: int = 1) -> None: + def __init__(self, dim: int, num_heads: int = 16, tensor_parallel_degree: int = 1) -> None: super().__init__() self.num_heads = num_heads self.tensor_parallel_degree = tensor_parallel_degree @@ -162,8 +162,7 @@ class VisionFlashAttention2(nn.Layer): self.qkv = ColumnParallelLinear( dim, dim * 3, - mp_group=fleet.get_hybrid_communicate_group(). - get_model_parallel_group(), + mp_group=fleet.get_hybrid_communicate_group().get_model_parallel_group(), weight_attr=None, has_bias=True, fuse_matmul_bias=True, @@ -172,10 +171,10 @@ class VisionFlashAttention2(nn.Layer): self.proj = RowParallelLinear( dim, dim, - mp_group=fleet.get_hybrid_communicate_group( - ).get_model_parallel_group(), + mp_group=fleet.get_hybrid_communicate_group().get_model_parallel_group(), input_is_parallel=True, - has_bias=True) + has_bias=True, + ) else: self.qkv = nn.Linear(dim, dim * 3, bias_attr=True) self.proj = nn.Linear(dim, dim) @@ -199,15 +198,22 @@ class VisionFlashAttention2(nn.Layer): paddle.Tensor: _description_ """ seq_length = hidden_states.shape[0] - qkv = self.qkv(hidden_states).reshape( - [seq_length, 3, self.num_heads // self.tensor_parallel_degree, - -1]).transpose(perm=[1, 0, 2, 3]) + qkv = ( + self.qkv(hidden_states) + .reshape( + [ + seq_length, + 3, + self.num_heads // self.tensor_parallel_degree, + -1, + ] + ) + .transpose(perm=[1, 0, 2, 3]) + ) q, k, v = qkv.unbind(axis=0) - q = apply_rotary_pos_emb_vision(q.unsqueeze(axis=0), - rotary_pos_emb).squeeze(axis=0) - k = apply_rotary_pos_emb_vision(k.unsqueeze(axis=0), - rotary_pos_emb).squeeze(axis=0) + q = apply_rotary_pos_emb_vision(q.unsqueeze(axis=0), rotary_pos_emb).squeeze(axis=0) + k = apply_rotary_pos_emb_vision(k.unsqueeze(axis=0), rotary_pos_emb).squeeze(axis=0) max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() @@ -223,7 +229,10 @@ class VisionFlashAttention2(nn.Layer): max_seqlen, max_seqlen, scale=softmax_scale, # TODO: 需要手动加上 - )[0].squeeze(0).reshape([seq_length, -1])) + )[0] + .squeeze(0) + .reshape([seq_length, -1]) + ) attn_output = attn_output.astype(paddle.float32) attn_output = self.proj(attn_output) @@ -247,9 +256,7 @@ class PatchEmbed(nn.Layer): self.patch_size = patch_size self.in_channels = in_channels self.embed_dim = embed_dim - self.proj = nn.Linear(in_channels * patch_size * patch_size, - embed_dim, - bias_attr=False) + self.proj = nn.Linear(in_channels * patch_size * patch_size, embed_dim, bias_attr=False) def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: """_summary_ @@ -262,8 +269,7 @@ class PatchEmbed(nn.Layer): """ target_dtype = self.proj.weight.dtype - hidden_states = self.proj( - paddle.cast(hidden_states, dtype=target_dtype)) + hidden_states = self.proj(paddle.cast(hidden_states, dtype=target_dtype)) return hidden_states @@ -275,11 +281,13 @@ class VisionMlp(nn.Layer): nn (_type_): _description_ """ - def __init__(self, - dim: int, - hidden_dim: int, - hidden_act: str, - tensor_parallel_degree: int = 1) -> None: + def __init__( + self, + dim: int, + hidden_dim: int, + hidden_act: str, + tensor_parallel_degree: int = 1, + 
) -> None: super().__init__() self.tensor_parallel_degree = tensor_parallel_degree @@ -287,17 +295,17 @@ class VisionMlp(nn.Layer): self.fc1 = ColumnParallelLinear( dim, hidden_dim, - mp_group=fleet.get_hybrid_communicate_group( - ).get_model_parallel_group(), + mp_group=fleet.get_hybrid_communicate_group().get_model_parallel_group(), gather_output=False, - has_bias=True) + has_bias=True, + ) self.fc2 = RowParallelLinear( hidden_dim, dim, - mp_group=fleet.get_hybrid_communicate_group( - ).get_model_parallel_group(), + mp_group=fleet.get_hybrid_communicate_group().get_model_parallel_group(), input_is_parallel=True, - has_bias=True) + has_bias=True, + ) else: self.fc1 = nn.Linear(dim, hidden_dim) self.fc2 = nn.Linear(hidden_dim, dim) @@ -330,8 +338,7 @@ class VisionRotaryEmbedding(nn.Layer): theta (float, optional): _description_. Defaults to 10000.0. """ super().__init__() - self.inv_freq = 1.0 / theta**( - paddle.arange(start=0, end=dim, step=2, dtype="float32") / dim) + self.inv_freq = 1.0 / theta ** (paddle.arange(start=0, end=dim, step=2, dtype="float32") / dim) def forward(self, seqlen: int) -> paddle.Tensor: """_summary_ @@ -354,7 +361,12 @@ class DFNRopeVisionBlock(nn.Layer): nn (_type_): _description_ """ - def __init__(self, config, tensor_parallel_degree: int, attn_implementation: str = "sdpa") -> None: + def __init__( + self, + config, + tensor_parallel_degree: int, + attn_implementation: str = "sdpa", + ) -> None: """_summary_ Args: @@ -369,18 +381,17 @@ class DFNRopeVisionBlock(nn.Layer): self.attn = VisionFlashAttention2( config.embed_dim, num_heads=config.num_heads, - tensor_parallel_degree=tensor_parallel_degree) + tensor_parallel_degree=tensor_parallel_degree, + ) self.mlp = VisionMlp( dim=config.embed_dim, hidden_dim=mlp_hidden_dim, hidden_act=config.hidden_act, - tensor_parallel_degree=tensor_parallel_degree) + tensor_parallel_degree=tensor_parallel_degree, + ) self.config = config - def forward(self, - hidden_states, - cu_seqlens, - rotary_pos_emb) -> paddle.Tensor: + def forward(self, hidden_states, cu_seqlens, rotary_pos_emb) -> paddle.Tensor: """_summary_ Args: @@ -407,10 +418,7 @@ class PatchMerger(nn.Layer): nn (_type_): _description_ """ - def __init__(self, - dim: int, - context_dim: int, - spatial_merge_size: int = 2) -> None: + def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2) -> None: """_summary_ Args: @@ -466,7 +474,14 @@ class DFNRopeVisionTransformerPretrainedModel(PretrainedModel): self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2) self.blocks = nn.LayerList( - [DFNRopeVisionBlock(config.vision_config, config.pretrained_config.tensor_parallel_degree) for _ in range(config.vision_config.depth)]) + [ + DFNRopeVisionBlock( + config.vision_config, + config.pretrained_config.tensor_parallel_degree, + ) + for _ in range(config.vision_config.depth) + ] + ) assert ( config.vision_config.hidden_size == config.vision_config.embed_dim @@ -522,17 +537,13 @@ class DFNRopeVisionTransformerPretrainedModel(PretrainedModel): pos_ids = np.concatenate(pos_ids, axis=0) if num_pad > 0: - pos_ids = np.concatenate( - [pos_ids, np.zeros((num_pad, 2), dtype=pos_ids.dtype)]) + pos_ids = np.concatenate([pos_ids, np.zeros((num_pad, 2), dtype=pos_ids.dtype)]) max_grid_size = np.amax(grid_hw_array[:, 1:]) rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size) rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(start_axis=1) return rotary_pos_emb - def forward(self, - hidden_states: paddle.Tensor, - grid_thw: paddle.Tensor, - num_pad=0) -> 
paddle.Tensor: + def forward(self, hidden_states: paddle.Tensor, grid_thw: paddle.Tensor, num_pad=0) -> paddle.Tensor: """_summary_ Args: @@ -546,9 +557,9 @@ class DFNRopeVisionTransformerPretrainedModel(PretrainedModel): rotary_pos_emb = self.rot_pos_emb(grid_thw, num_pad=num_pad) - cu_seqlens = paddle.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], - grid_thw[:, 0]).cumsum( - axis=0, dtype="int32") + cu_seqlens = paddle.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum( + axis=0, dtype="int32" + ) if num_pad > 0: cu_seqlens = F.pad(cu_seqlens, (1, 1), value=0) @@ -556,14 +567,11 @@ class DFNRopeVisionTransformerPretrainedModel(PretrainedModel): else: cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0) - vit_num_recompute_layers = getattr(self.config, - "vit_num_recompute_layers", - self.config.depth) + vit_num_recompute_layers = getattr(self.config, "vit_num_recompute_layers", self.config.depth) for idx, blk in enumerate(self.blocks): if self.config.recompute and self.training and idx < vit_num_recompute_layers: - hidden_states = recompute(blk, hidden_states, cu_seqlens, - rotary_pos_emb) + hidden_states = recompute(blk, hidden_states, cu_seqlens, rotary_pos_emb) else: hidden_states = blk( hidden_states, @@ -576,8 +584,7 @@ class DFNRopeVisionTransformerPretrainedModel(PretrainedModel): ret = self.ln(hidden_states) # add norm return ret - def extract_feature(self, hidden_states: paddle.Tensor, - grid_thw: paddle.Tensor) -> paddle.Tensor: + def extract_feature(self, hidden_states: paddle.Tensor, grid_thw: paddle.Tensor) -> paddle.Tensor: """_summary_ Args: @@ -595,8 +602,8 @@ class DFNRopeVisionTransformerPretrainedModel(PretrainedModel): dummy """ - from paddleformers.transformers.conversion_utils import \ - split_or_merge_func + from paddleformers.transformers.conversion_utils import split_or_merge_func + fn = split_or_merge_func( is_split=is_split, tensor_parallel_degree=config.tensor_parallel_degree, @@ -606,37 +613,34 @@ class DFNRopeVisionTransformerPretrainedModel(PretrainedModel): def split_qkv_weight(x): head_dim = vision_config.hidden_size // vision_config.num_heads - x = x.reshape([ - vision_config.hidden_size, 3, vision_config.num_heads, head_dim - ]) - x = np.split(x, vision_config.tensor_parallel_degree, - axis=-2)[vision_config.tensor_parallel_rank] + x = x.reshape( + [ + vision_config.hidden_size, + 3, + vision_config.num_heads, + head_dim, + ] + ) + x = np.split(x, vision_config.tensor_parallel_degree, axis=-2)[vision_config.tensor_parallel_rank] x = x.reshape([vision_config.hidden_size, -1]) return x def split_qkv_bias(x): head_dim = vision_config.hidden_size // vision_config.num_heads x = x.reshape([3, vision_config.num_heads, head_dim]) - x = np.split(x, vision_config.tensor_parallel_degree, - axis=-2)[vision_config.tensor_parallel_rank] + x = np.split(x, vision_config.tensor_parallel_degree, axis=-2)[vision_config.tensor_parallel_rank] x = x.reshape([-1]) return x def get_tensor_parallel_split_mappings(depth): final_actions = {} base_actions = { - "vision_model.blocks.0.attn.proj.weight": - partial(fn, is_column=False), - "vision_model.blocks.0.fc1.weight": - partial(fn, is_column=True), - "vision_model.blocks.0.fc1.bias": - partial(fn, is_column=True), - "vision_model.blocks.0.fc2.weight": - partial(fn, is_column=False), - "vision_model.blocks.0.qkv.weight": - split_qkv_weight, - "vision_model.blocks.0.qkv.bias": - split_qkv_bias, + "vision_model.blocks.0.attn.proj.weight": partial(fn, is_column=False), + "vision_model.blocks.0.fc1.weight": 
partial(fn, is_column=True), + "vision_model.blocks.0.fc1.bias": partial(fn, is_column=True), + "vision_model.blocks.0.fc2.weight": partial(fn, is_column=False), + "vision_model.blocks.0.qkv.weight": split_qkv_weight, + "vision_model.blocks.0.qkv.bias": split_qkv_bias, } for key, action in base_actions.items(): @@ -654,13 +658,9 @@ class DFNRopeVisionTransformerPretrainedModel(PretrainedModel): for param_name, param in params_dict.items(): state_dict_key = f"{self.prefix_name}.{param_name}" if state_dict_key not in state_dict: - raise ValueError( - f"The key {state_dict_key} does not exist in state_dict. " - ) + raise ValueError(f"The key {state_dict_key} does not exist in state_dict. ") tensor = get_tensor(state_dict.pop(state_dict_key)) if param.shape != tensor.shape: - raise ValueError( - f"{state_dict_key} param.shape={param.shape} tensor.shape={tensor.shape}" - ) + raise ValueError(f"{state_dict_key} param.shape={param.shape} tensor.shape={tensor.shape}") else: param.copy_(tensor, False) diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/dist_utils.py b/fastdeploy/model_executor/models/ernie4_5_vl/dist_utils.py index 1e4661363..4d1c9e250 100644 --- a/fastdeploy/model_executor/models/ernie4_5_vl/dist_utils.py +++ b/fastdeploy/model_executor/models/ernie4_5_vl/dist_utils.py @@ -17,12 +17,15 @@ import paddle from paddle import distributed as dist from paddle.distributed import fleet -from paddle.distributed.fleet.utils.sequence_parallel_utils import \ - RowSequenceParallelLinear +from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + RowSequenceParallelLinear, +) __all__ = [ - "scatter_axis", "all_gather_group", "reduce_scatter_group", - "RowSequenceParallelLinear" + "scatter_axis", + "all_gather_group", + "reduce_scatter_group", + "RowSequenceParallelLinear", ] @@ -40,13 +43,15 @@ def scatter_axis(input, group=None, axis=0): rank = group.rank seq_len = input.shape[axis] assert seq_len % parallelism == 0, ( - f"Input sequence length {seq_len} can't be divided exactly" - f" by sequence parallelism {parallelism}") + f"Input sequence length {seq_len} can't be divided exactly" f" by sequence parallelism {parallelism}" + ) interval = seq_len // parallelism - input = paddle.slice(input, - axes=[axis], - starts=[interval * rank], - ends=[interval * (rank + 1)]) + input = paddle.slice( + input, + axes=[axis], + starts=[interval * rank], + ends=[interval * (rank + 1)], + ) # slice use stride, so we maintain the memory of whole input, use assign to free the whole input # which can avoid OOM. 
input = paddle.assign(input) @@ -81,15 +86,9 @@ def all_gather_group(input, group=None, axis=0): if axis == 0: output_shape[axis] = output_shape[axis] * parallelism output = paddle.empty(shape=output_shape, dtype=input.dtype) - dist.stream.all_gather(output, - input, - group=group, - use_calc_stream=True) + dist.stream.all_gather(output, input, group=group, use_calc_stream=True) return output - outputs = [ - paddle.empty(output_shape, dtype=input.dtype) - for _ in range(parallelism) - ] + outputs = [paddle.empty(output_shape, dtype=input.dtype) for _ in range(parallelism)] dist.stream.all_gather(outputs, input, group=group, use_calc_stream=True) output = paddle.concat(outputs, axis=axis) return output @@ -122,9 +121,5 @@ def reduce_scatter_group(input, group=None): ), f"Input sequence length {input.shape[0]} can't be divided exactly by sequence parallelism {parallelism}" output_shape[0] = output_shape[0] // parallelism output = paddle.empty(shape=output_shape, dtype=input.dtype) - dist.stream.reduce_scatter(output, - input, - op=dist.ReduceOp.SUM, - group=group, - use_calc_stream=True) + dist.stream.reduce_scatter(output, input, op=dist.ReduceOp.SUM, group=group, use_calc_stream=True) return output diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py index f71cb6a87..6a1499e20 100644 --- a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py @@ -28,24 +28,28 @@ from paddleformers.transformers.configuration_utils import PretrainedConfig from paddleformers.utils.log import logger from fastdeploy.config import FDConfig -from fastdeploy.distributed.communication_op import \ - tensor_model_parallel_all_reduce -from fastdeploy.model_executor.graph_optimization.decorator import \ - support_graph_optimization +from fastdeploy.distributed.communication_op import tensor_model_parallel_all_reduce +from fastdeploy.model_executor.graph_optimization.decorator import ( + support_graph_optimization, +) from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding from fastdeploy.model_executor.layers.lm_head import ParallelLMHead from fastdeploy.model_executor.layers.moe.moe import FusedMoE from fastdeploy.model_executor.layers.normalization import RMSNorm from fastdeploy.model_executor.layers.utils import get_tensor -from fastdeploy.model_executor.models.ernie4_5_moe import (Ernie4_5_Attention, - Ernie4_5_MLP) +from fastdeploy.model_executor.models.ernie4_5_moe import ( + Ernie4_5_Attention, + Ernie4_5_MLP, +) from fastdeploy.model_executor.models.model_base import ModelForCasualLM from fastdeploy.platforms import current_platform if current_platform.is_cuda() and not current_platform.is_dcu(): - from fastdeploy.model_executor.ops.gpu import (extract_text_token_output, - text_image_gather_scatter, - text_image_index_out) + from fastdeploy.model_executor.ops.gpu import ( + extract_text_token_output, + text_image_gather_scatter, + text_image_index_out, + ) from fastdeploy.model_executor.forward_meta import ForwardMeta @@ -68,9 +72,7 @@ class VLMoEMeta: class Ernie4_5_VLMoE(nn.Layer): - - def __init__(self, fd_config: FDConfig, layer_id: int, - prefix: str) -> None: + def __init__(self, fd_config: FDConfig, layer_id: int, prefix: str) -> None: super().__init__() self.tp_size = fd_config.parallel_config.tensor_parallel_size @@ -96,47 +98,34 @@ class Ernie4_5_VLMoE(nn.Layer): assert text_moe_layer_start_index <= 
text_moe_layer_end_index moe_quant_type = "" - if hasattr(fd_config, 'quant_config') and fd_config.quant_config is not None: - moe_quant_type = getattr(fd_config.quant_config, 'name', lambda: "")() + if hasattr(fd_config, "quant_config") and fd_config.quant_config is not None: + moe_quant_type = getattr(fd_config.quant_config, "name", lambda: "")() if layer_id >= text_moe_layer_start_index and layer_id <= text_moe_layer_end_index: if moe_quant_type == "tensor_wise_fp8" or ( - moe_quant_type == "block_wise_fp8" - and fd_config.model_config.is_quantized): + moe_quant_type == "block_wise_fp8" and fd_config.model_config.is_quantized + ): weight_key_map = { - "gate_weight_key": - f"{prefix}.gate.weight", - "gate_correction_bias_key": - f"{prefix}.moe_statics.e_score_correction_bias", - "up_gate_proj_expert_weight_key": - f"{prefix}.experts.{{}}.up_gate_proj.quant_weight", - "down_proj_expert_weight_key": - f"{prefix}.experts.{{}}.down_proj.quant_weight", - "up_gate_proj_expert_weight_scale_key": - f"{prefix}.experts.{{}}.up_gate_proj.weight_scale", - "down_proj_expert_weight_scale_key": - f"{prefix}.experts.{{}}.down_proj.weight_scale", - "up_gate_proj_expert_in_scale_key": - f"{prefix}.experts.{{}}.up_gate_proj.activation_scale", - "down_proj_expert_in_scale_key": - f"{prefix}.experts.{{}}.down_proj.activation_scale", + "gate_weight_key": f"{prefix}.gate.weight", + "gate_correction_bias_key": f"{prefix}.moe_statics.e_score_correction_bias", + "up_gate_proj_expert_weight_key": f"{prefix}.experts.{{}}.up_gate_proj.quant_weight", + "down_proj_expert_weight_key": f"{prefix}.experts.{{}}.down_proj.quant_weight", + "up_gate_proj_expert_weight_scale_key": f"{prefix}.experts.{{}}.up_gate_proj.weight_scale", + "down_proj_expert_weight_scale_key": f"{prefix}.experts.{{}}.down_proj.weight_scale", + "up_gate_proj_expert_in_scale_key": f"{prefix}.experts.{{}}.up_gate_proj.activation_scale", + "down_proj_expert_in_scale_key": f"{prefix}.experts.{{}}.down_proj.activation_scale", } else: weight_key_map = { - "gate_weight_key": - f"{prefix}.gate.weight", - "gate_correction_bias_key": - f"{prefix}.moe_statics.e_score_correction_bias", - "up_gate_proj_expert_weight_key": - f"{prefix}.experts.{{}}.up_gate_proj.weight", - "down_proj_expert_weight_key": - f"{prefix}.experts.{{}}.down_proj.weight", + "gate_weight_key": f"{prefix}.gate.weight", + "gate_correction_bias_key": f"{prefix}.moe_statics.e_score_correction_bias", + "up_gate_proj_expert_weight_key": f"{prefix}.experts.{{}}.up_gate_proj.weight", + "down_proj_expert_weight_key": f"{prefix}.experts.{{}}.down_proj.weight", } self.text_fused_moe = FusedMoE( fd_config=fd_config, reduce_results=False, - moe_intermediate_size=fd_config.model_config. 
- moe_intermediate_size[0], + moe_intermediate_size=fd_config.model_config.moe_intermediate_size[0], num_experts=fd_config.model_config.moe_num_experts[0], expert_id_offset=0, top_k=fd_config.model_config.moe_k, @@ -156,42 +145,29 @@ class Ernie4_5_VLMoE(nn.Layer): assert image_moe_layer_start_index <= image_moe_layer_end_index if layer_id >= image_moe_layer_start_index and layer_id <= image_moe_layer_end_index: if moe_quant_type == "tensor_wise_fp8" or ( - moe_quant_type == "block_wise_fp8" - and fd_config.model_config.is_quantized): + moe_quant_type == "block_wise_fp8" and fd_config.model_config.is_quantized + ): weight_key_map = { - "gate_weight_key": - f"{prefix}.gate.weight_1", - "gate_correction_bias_key": - f"{prefix}.moe_statics.e_score_correction_bias", - "up_gate_proj_expert_weight_key": - f"{prefix}.experts.{{}}.up_gate_proj.quant_weight", - "down_proj_expert_weight_key": - f"{prefix}.experts.{{}}.down_proj.quant_weight", - "up_gate_proj_expert_weight_scale_key": - f"{prefix}.experts.{{}}.up_gate_proj.weight_scale", - "down_proj_expert_weight_scale_key": - f"{prefix}.experts.{{}}.down_proj.weight_scale", - "up_gate_proj_expert_in_scale_key": - f"{prefix}.experts.{{}}.up_gate_proj.activation_scale", - "down_proj_expert_in_scale_key": - f"{prefix}.experts.{{}}.down_proj.activation_scale", + "gate_weight_key": f"{prefix}.gate.weight_1", + "gate_correction_bias_key": f"{prefix}.moe_statics.e_score_correction_bias", + "up_gate_proj_expert_weight_key": f"{prefix}.experts.{{}}.up_gate_proj.quant_weight", + "down_proj_expert_weight_key": f"{prefix}.experts.{{}}.down_proj.quant_weight", + "up_gate_proj_expert_weight_scale_key": f"{prefix}.experts.{{}}.up_gate_proj.weight_scale", + "down_proj_expert_weight_scale_key": f"{prefix}.experts.{{}}.down_proj.weight_scale", + "up_gate_proj_expert_in_scale_key": f"{prefix}.experts.{{}}.up_gate_proj.activation_scale", + "down_proj_expert_in_scale_key": f"{prefix}.experts.{{}}.down_proj.activation_scale", } else: weight_key_map = { - "gate_weight_key": - f"{prefix}.gate.weight_1", - "gate_correction_bias_key": - f"{prefix}.moe_statics.e_score_correction_bias", - "up_gate_proj_expert_weight_key": - f"{prefix}.experts.{{}}.up_gate_proj.weight", - "down_proj_expert_weight_key": - f"{prefix}.experts.{{}}.down_proj.weight", + "gate_weight_key": f"{prefix}.gate.weight_1", + "gate_correction_bias_key": f"{prefix}.moe_statics.e_score_correction_bias", + "up_gate_proj_expert_weight_key": f"{prefix}.experts.{{}}.up_gate_proj.weight", + "down_proj_expert_weight_key": f"{prefix}.experts.{{}}.down_proj.weight", } self.image_fused_moe = FusedMoE( fd_config=fd_config, reduce_results=False, - moe_intermediate_size=fd_config.model_config. 
- moe_intermediate_size[1], + moe_intermediate_size=fd_config.model_config.moe_intermediate_size[1], num_experts=fd_config.model_config.moe_num_experts[1], expert_id_offset=fd_config.model_config.moe_num_experts[0], top_k=fd_config.model_config.moe_k, @@ -212,28 +188,23 @@ class Ernie4_5_VLMoE(nn.Layer): if self.num_shared_experts > 0: self.shared_experts = Ernie4_5_VLMLP( fd_config=fd_config, - intermediate_size=self.num_shared_experts * - fd_config.model_config.moe_intermediate_size[0], + intermediate_size=self.num_shared_experts * fd_config.model_config.moe_intermediate_size[0], prefix=f"{prefix}.shared_experts", reduce_results=False, ) - def extract_gate_correction_bias_text(self, gate_correction_bias_key, - state_dict): + def extract_gate_correction_bias_text(self, gate_correction_bias_key, state_dict): """ extract_gate_correction_bias function. """ - gate_correction_bias_tensor = get_tensor( - state_dict[gate_correction_bias_key]).astype("float32") + gate_correction_bias_tensor = get_tensor(state_dict[gate_correction_bias_key]).astype("float32") return gate_correction_bias_tensor[0].unsqueeze(0) - def extract_gate_correction_bias_image(self, gate_correction_bias_key, - state_dict): + def extract_gate_correction_bias_image(self, gate_correction_bias_key, state_dict): """ extract_gate_correction_bias function. """ - gate_correction_bias_tensor = get_tensor( - state_dict[gate_correction_bias_key]).astype("float32") + gate_correction_bias_tensor = get_tensor(state_dict[gate_correction_bias_key]).astype("float32") return gate_correction_bias_tensor[1].unsqueeze(0) def load_state_dict(self, state_dict): @@ -278,14 +249,13 @@ class Ernie4_5_VLMoE(nn.Layer): class Ernie4_5_VLDecoderLayer(nn.Layer): - def __init__( self, fd_config: FDConfig, prefix: str = "", ) -> None: super().__init__() - layer_id = int(prefix.split(sep='.')[-1]) + layer_id = int(prefix.split(sep=".")[-1]) moe_layer_start_index = fd_config.model_config.moe_layer_start_index if isinstance(moe_layer_start_index, list): @@ -309,9 +279,11 @@ class Ernie4_5_VLDecoderLayer(nn.Layer): assert min_moe_layer_start_index <= max_moe_layer_end_index - if (fd_config.model_config.moe_num_experts is not None - and layer_id >= min_moe_layer_start_index - and layer_id <= max_moe_layer_end_index): + if ( + fd_config.model_config.moe_num_experts is not None + and layer_id >= min_moe_layer_start_index + and layer_id <= max_moe_layer_end_index + ): self.mlp = Ernie4_5_VLMoE( fd_config=fd_config, layer_id=layer_id, @@ -355,16 +327,14 @@ class Ernie4_5_VLDecoderLayer(nn.Layer): residual = hidden_states hidden_states = self.input_layernorm(hidden_states) else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) + hidden_states, residual = self.input_layernorm(hidden_states, residual) hidden_states = self.self_attn( hidden_states=hidden_states, forward_meta=forward_meta, ) - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) + hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) if isinstance(self.mlp, Ernie4_5_VLMoE): hidden_states = self.mlp(hidden_states, vl_moe_meta) @@ -376,7 +346,6 @@ class Ernie4_5_VLDecoderLayer(nn.Layer): @support_graph_optimization class Ernie4_5_VLModel(nn.Layer): - def __init__( self, fd_config: FDConfig = None, @@ -402,12 +371,15 @@ class Ernie4_5_VLModel(nn.Layer): prefix=(f"{fd_config.model_config.pretrained_config.prefix_name}.embed_tokens"), ) - self.layers = nn.LayerList([ - Ernie4_5_VLDecoderLayer( - fd_config=fd_config, - 
prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}") - for i in range(self.num_layers) - ]) + self.layers = nn.LayerList( + [ + Ernie4_5_VLDecoderLayer( + fd_config=fd_config, + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}", + ) + for i in range(self.num_layers) + ] + ) self.norm = RMSNorm( fd_config, @@ -456,11 +428,13 @@ class Ernie4_5_VLModel(nn.Layer): text_input = paddle.full( shape=[text_token_num, hidden_states.shape[1]], fill_value=1, - dtype=self._dtype) + dtype=self._dtype, + ) image_input = paddle.full( shape=[image_token_num, hidden_states.shape[1]], fill_value=1, - dtype=self._dtype) + dtype=self._dtype, + ) text_index = paddle.zeros_like(token_type_ids) image_index = paddle.zeros_like(token_type_ids) text_image_index_out(token_type_ids, text_index, image_index) @@ -493,8 +467,7 @@ class Ernie4_5_VLModel(nn.Layer): token_type_ids = token_type_ids.reshape([-1]) text_pos_shifted = token_type_ids[:token_num] == 0 score_text = hidden_states[text_pos_shifted.reshape([-1])] - max_seq_len, max_seq_len_index = paddle.topk( - forward_meta.seq_lens_this_time.squeeze(-1), k=1) + max_seq_len, max_seq_len_index = paddle.topk(forward_meta.seq_lens_this_time.squeeze(-1), k=1) hidden_states = extract_text_token_output( max_seq_len, max_seq_len_index.cast("int32"), @@ -524,9 +497,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(ModelForCasualLM): # ----------- vision model ------------ self.vision_model = self._init_vision_model(fd_config.model_config) # ----------- resampler_model ------------ - self.resampler_model = self._init_resampler_model_model( - fd_config.model_config - ) + self.resampler_model = self._init_resampler_model_model(fd_config.model_config) # ernie self.ernie = Ernie4_5_VLModel(fd_config=fd_config) @@ -541,21 +512,19 @@ class Ernie4_5_VLMoeForConditionalGeneration(ModelForCasualLM): self.tie_word_embeddings = fd_config.model_config.tie_word_embeddings def _init_vision_model(self, model_config) -> nn.Layer: - from fastdeploy.model_executor.models.ernie4_5_vl.dfnrope.modeling import \ - DFNRopeVisionTransformerPretrainedModel + from fastdeploy.model_executor.models.ernie4_5_vl.dfnrope.modeling import ( + DFNRopeVisionTransformerPretrainedModel, + ) - vision_model = DFNRopeVisionTransformerPretrainedModel( - model_config, prefix_name="vision_model" - ) - vision_model = paddle.amp.decorate( - models=vision_model, level="O2", dtype="bfloat16" - ) + vision_model = DFNRopeVisionTransformerPretrainedModel(model_config, prefix_name="vision_model") + vision_model = paddle.amp.decorate(models=vision_model, level="O2", dtype="bfloat16") vision_model.eval() return vision_model def _init_resampler_model_model(self, model_config) -> nn.Layer: - from fastdeploy.model_executor.models.ernie4_5_vl.modeling_resampler import \ - VariableResolutionResamplerModel + from fastdeploy.model_executor.models.ernie4_5_vl.modeling_resampler import ( + VariableResolutionResamplerModel, + ) resampler_model = VariableResolutionResamplerModel( model_config.vision_config.hidden_size, @@ -565,9 +534,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(ModelForCasualLM): config=model_config, prefix_name="resampler_model", ) - resampler_model = paddle.amp.decorate( - models=resampler_model, level="O2", dtype="bfloat16" - ) + resampler_model = paddle.amp.decorate(models=resampler_model, level="O2", dtype="bfloat16") resampler_model.eval() return resampler_model @@ -576,8 +543,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(ModelForCasualLM): return 
"Ernie4_5_VLMoeForConditionalGeneration" @paddle.no_grad() - def set_state_dict(self, state_dict: Dict[str, Union[np.ndarray, - paddle.Tensor]]): + def set_state_dict(self, state_dict: Dict[str, Union[np.ndarray, paddle.Tensor]]): """ Load model parameters from a given state dictionary. @@ -590,15 +556,14 @@ class Ernie4_5_VLMoeForConditionalGeneration(ModelForCasualLM): self.vision_model.load_state_dict(state_dict) self.resampler_model.load_state_dict(state_dict) if self.tie_word_embeddings: - self.lm_head.linear.weight.set_value( - self.ernie.embed_tokens.embeddings.weight.transpose([1, 0])) + self.lm_head.linear.weight.set_value(self.ernie.embed_tokens.embeddings.weight.transpose([1, 0])) else: self.lm_head.load_state_dict(state_dict) def compute_logits(self, hidden_states: paddle.Tensor): logits = self.lm_head(hidden_states) logits = paddle.cast(logits, paddle.float32) - logits[:, self.ori_vocab_size:] = -float("inf") + logits[:, self.ori_vocab_size :] = -float("inf") return logits @@ -610,8 +575,10 @@ class Ernie4_5_VLMoeForConditionalGeneration(ModelForCasualLM): shape=[0, self.fd_config.model_config.hidden_size], dtype=paddle.get_default_dtype(), ) - for i in range(self.fd_config.model_config.moe_layer_start_index, - self.fd_config.model_config.num_hidden_layers): + for i in range( + self.fd_config.model_config.moe_layer_start_index, + self.fd_config.model_config.num_hidden_layers, + ): self.ernie.layers[i].mlp.text_fused_moe(fake_hidden_states) def forward( @@ -620,9 +587,11 @@ class Ernie4_5_VLMoeForConditionalGeneration(ModelForCasualLM): image_features: paddle.Tensor, forward_meta: ForwardMeta, ): - hidden_states = self.ernie(ids_remove_padding=ids_remove_padding, - image_features=image_features, - forward_meta=forward_meta) + hidden_states = self.ernie( + ids_remove_padding=ids_remove_padding, + image_features=image_features, + forward_meta=forward_meta, + ) return hidden_states @@ -640,10 +609,8 @@ class Ernie4_5_VLPretrainedModel(PretrainedModel): """ return None - from fastdeploy.model_executor.models.tp_utils import \ - TensorSplitMode as tsm - from fastdeploy.model_executor.models.utils import \ - LayerIdPlaceholder as layerid + from fastdeploy.model_executor.models.tp_utils import TensorSplitMode as tsm + from fastdeploy.model_executor.models.utils import LayerIdPlaceholder as layerid from fastdeploy.model_executor.models.utils import WeightMeta weight_infos = [ @@ -652,17 +619,13 @@ class Ernie4_5_VLPretrainedModel(PretrainedModel): True, tsm.GQA, ), - WeightMeta( - f".layers.{{{layerid.LAYER_ID}}}.self_attn.o_proj.weight", False - ), + WeightMeta(f".layers.{{{layerid.LAYER_ID}}}.self_attn.o_proj.weight", False), WeightMeta( f".layers.{{{layerid.FFN_LAYER_ID}}}.mlp.up_gate_proj.weight", True, tsm.PairFused, ), - WeightMeta( - f".layers.{{{layerid.FFN_LAYER_ID}}}.mlp.down_proj.weight", False - ), + WeightMeta(f".layers.{{{layerid.FFN_LAYER_ID}}}.mlp.down_proj.weight", False), WeightMeta( f".layers.{{{layerid.MOE_LAYER_ID}}}.mlp.experts.{{{layerid.TEXT_EXPERT_ID}}}.up_gate_proj.weight", True, @@ -707,15 +670,9 @@ class Ernie4_5_VLPretrainedModel(PretrainedModel): f"vision_model.blocks.{{{layerid.LAYER_ID}}}.attn.proj.weight", False, ), - WeightMeta( - f"vision_model.blocks.{{{layerid.LAYER_ID}}}.mlp.fc2.weight", False - ), - WeightMeta( - f"vision_model.blocks.{{{layerid.LAYER_ID}}}.mlp.fc1.weight", True - ), - WeightMeta( - f"vision_model.blocks.{{{layerid.LAYER_ID}}}.mlp.fc1.bias", True - ), + WeightMeta(f"vision_model.blocks.{{{layerid.LAYER_ID}}}.mlp.fc2.weight", 
False), + WeightMeta(f"vision_model.blocks.{{{layerid.LAYER_ID}}}.mlp.fc1.weight", True), + WeightMeta(f"vision_model.blocks.{{{layerid.LAYER_ID}}}.mlp.fc1.bias", True), WeightMeta( f"vision_model.blocks.{{{layerid.LAYER_ID}}}.attn.qkv.weight", True, @@ -735,7 +692,10 @@ class Ernie4_5_VLPretrainedModel(PretrainedModel): """ logger.info("erine inference model _get_tensor_parallel_mappings") from fastdeploy.model_executor.models.tp_utils import ( - build_expanded_keys, has_prefix, split_or_merge_func_v1) + build_expanded_keys, + has_prefix, + split_or_merge_func_v1, + ) fn = split_or_merge_func_v1( is_split=is_split, @@ -751,8 +711,7 @@ class Ernie4_5_VLPretrainedModel(PretrainedModel): tensor_parallel_rank=config.tensor_parallel_rank, num_attention_heads=config.vision_config.get("num_heads"), num_key_value_heads=config.vision_config.get("num_heads"), - head_dim=config.vision_config.get("hidden_size") - // config.vision_config.get("num_heads"), + head_dim=config.vision_config.get("hidden_size") // config.vision_config.get("num_heads"), ) def get_tensor_parallel_split_mappings( @@ -779,11 +738,7 @@ class Ernie4_5_VLPretrainedModel(PretrainedModel): final_actions = build_expanded_keys( base_actions, num_layers, - ( - moe_layer_start_index - if moe_layer_start_index > 0 - else num_layers - ), + (moe_layer_start_index if moe_layer_start_index > 0 else num_layers), text_num_experts=moe_num_experts[0], img_num_experts=moe_num_experts[1], ) @@ -816,8 +771,6 @@ class Ernie4_5_VLPretrainedModel(PretrainedModel): moe_layer_start_index, config.prefix_name, ) - vision_mappings = get_vison_parallel_split_mappings( - config.vision_config.get("depth") - ) + vision_mappings = get_vison_parallel_split_mappings(config.vision_config.get("depth")) return {**mappings, **vision_mappings} diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/modeling_resampler.py b/fastdeploy/model_executor/models/ernie4_5_vl/modeling_resampler.py index f85ac235c..b032747d4 100644 --- a/fastdeploy/model_executor/models/ernie4_5_vl/modeling_resampler.py +++ b/fastdeploy/model_executor/models/ernie4_5_vl/modeling_resampler.py @@ -23,11 +23,13 @@ from paddle import nn from paddle.autograd import PyLayer from paddle.distributed.fleet.utils import recompute -from fastdeploy.model_executor.layers.utils import (_set_var_distributed, - get_tensor) +from fastdeploy.model_executor.layers.utils import _set_var_distributed, get_tensor from fastdeploy.model_executor.models.ernie4_5_vl.dist_utils import ( - RowSequenceParallelLinear, all_gather_group, reduce_scatter_group, - scatter_axis) + RowSequenceParallelLinear, + all_gather_group, + reduce_scatter_group, + scatter_axis, +) class ScatterOp(PyLayer): @@ -125,10 +127,8 @@ class RMSNorm(nn.Layer): - Maintains original dtype for numerical stability during computation """ with paddle.amp.auto_cast(False): - variance = hidden_states.astype("float32").pow(2).mean( - -1, keepdim=True) - hidden_states = paddle.rsqrt(variance + - self.variance_epsilon) * hidden_states + variance = hidden_states.astype("float32").pow(2).mean(-1, keepdim=True) + hidden_states = paddle.rsqrt(variance + self.variance_epsilon) * hidden_states return hidden_states.astype(self.weight.dtype) * self.weight @@ -137,8 +137,15 @@ class VariableResolutionResamplerModel(nn.Layer): VariableResolutionResamplerModel, 支持变分, 负责空间、时间维度缩并。 """ - def __init__(self, in_dim, out_dim, spatial_conv_size, temporal_conv_size, - config, prefix_name: str = ""): + def __init__( + self, + in_dim, + out_dim, + spatial_conv_size, + 
temporal_conv_size, + config, + prefix_name: str = "", + ): super().__init__() self.in_dim = in_dim self.out_dim = out_dim @@ -158,14 +165,17 @@ class VariableResolutionResamplerModel(nn.Layer): with paddle.utils.unique_name.guard("mm_resampler_"): self.spatial_linear = nn.Sequential( - (RowSequenceParallelLinear( - self.spatial_dim, - self.spatial_dim, - input_is_parallel=True, - has_bias=True, - fuse_matmul_bias=True, - ) if self.tensor_parallel_degree > 1 else nn.Linear( - self.spatial_dim, self.spatial_dim)), + ( + RowSequenceParallelLinear( + self.spatial_dim, + self.spatial_dim, + input_is_parallel=True, + has_bias=True, + fuse_matmul_bias=True, + ) + if self.tensor_parallel_degree > 1 + else nn.Linear(self.spatial_dim, self.spatial_dim) + ), nn.GELU(), nn.Linear(self.spatial_dim, self.spatial_dim), nn.LayerNorm(self.spatial_dim, epsilon=1e-6), @@ -187,21 +197,15 @@ class VariableResolutionResamplerModel(nn.Layer): if self.tensor_parallel_degree > 1: for idx in [2, 3]: - mark_as_sequence_parallel_parameter( - self.spatial_linear[idx].weight) - mark_as_sequence_parallel_parameter( - self.spatial_linear[idx].bias) - _set_var_distributed(self.spatial_linear[idx].weight, - split_axis=0) - _set_var_distributed(self.spatial_linear[idx].bias, - split_axis=0) + mark_as_sequence_parallel_parameter(self.spatial_linear[idx].weight) + mark_as_sequence_parallel_parameter(self.spatial_linear[idx].bias) + _set_var_distributed(self.spatial_linear[idx].weight, split_axis=0) + _set_var_distributed(self.spatial_linear[idx].bias, split_axis=0) if self.use_temporal_conv: for idx in [0, 2, 3]: - mark_as_sequence_parallel_parameter( - self.temporal_linear[idx].weight) - mark_as_sequence_parallel_parameter( - self.temporal_linear[idx].bias) + mark_as_sequence_parallel_parameter(self.temporal_linear[idx].weight) + mark_as_sequence_parallel_parameter(self.temporal_linear[idx].bias) mark_as_sequence_parallel_parameter(self.mlp.weight) mark_as_sequence_parallel_parameter(self.mlp.bias) @@ -237,8 +241,7 @@ class VariableResolutionResamplerModel(nn.Layer): if self.tensor_parallel_degree > 1: num_pad = ( x.shape[0] + self.tensor_parallel_degree - 1 - ) // self.tensor_parallel_degree * self.tensor_parallel_degree - x.shape[ - 0] + ) // self.tensor_parallel_degree * self.tensor_parallel_degree - x.shape[0] if num_pad > 0: x = paddle.nn.functional.pad(x, [0, num_pad, 0, 0]) @@ -261,13 +264,10 @@ class VariableResolutionResamplerModel(nn.Layer): grid_thw_cpu = grid_thw.numpy() grid_t, grid_hw = grid_thw_cpu[:, 0], grid_thw_cpu[:, 1:] - grid_hw_after_conv = grid_hw.prod(-1) // (self.spatial_conv_size** - 2) + grid_hw_after_conv = grid_hw.prod(-1) // (self.spatial_conv_size**2) - tokens_per_img_or_vid = grid_thw_cpu.prod(-1) // ( - self.spatial_conv_size**2) - batch_offset = np.empty(tokens_per_img_or_vid.size, - dtype=tokens_per_img_or_vid.dtype) + tokens_per_img_or_vid = grid_thw_cpu.prod(-1) // (self.spatial_conv_size**2) + batch_offset = np.empty(tokens_per_img_or_vid.size, dtype=tokens_per_img_or_vid.dtype) batch_offset[0] = 0 batch_offset[1:] = tokens_per_img_or_vid.cumsum()[:-1] @@ -275,25 +275,26 @@ class VariableResolutionResamplerModel(nn.Layer): # TODO: support any temporal conv size slice_offsets = [] - for temporoal_size, spatial_size, b_offset in zip( - grid_t, grid_hw_after_conv, batch_offset): + for temporoal_size, spatial_size, b_offset in zip(grid_t, grid_hw_after_conv, batch_offset): for temp_offset in range(0, temporoal_size, 2): slice_offsets.append( - np.arange(b_offset + (temp_offset) * 
spatial_size, - b_offset + (temp_offset + 1) * spatial_size)) - slice_offsets = paddle.to_tensor( - np.concatenate(slice_offsets, axis=-1)) + np.arange( + b_offset + (temp_offset) * spatial_size, + b_offset + (temp_offset + 1) * spatial_size, + ) + ) + slice_offsets = paddle.to_tensor(np.concatenate(slice_offsets, axis=-1)) slice_offsets2 = [] - for temporoal_size, spatial_size, b_offset in zip( - grid_t, grid_hw_after_conv, batch_offset): - for temp_offset in range(1 if temporoal_size > 1 else 0, - temporoal_size, 2): + for temporoal_size, spatial_size, b_offset in zip(grid_t, grid_hw_after_conv, batch_offset): + for temp_offset in range(1 if temporoal_size > 1 else 0, temporoal_size, 2): slice_offsets2.append( - np.arange(b_offset + (temp_offset) * spatial_size, - b_offset + (temp_offset + 1) * spatial_size)) - slice_offsets2 = paddle.to_tensor( - np.concatenate(slice_offsets2, axis=-1)) + np.arange( + b_offset + (temp_offset) * spatial_size, + b_offset + (temp_offset + 1) * spatial_size, + ) + ) + slice_offsets2 = paddle.to_tensor(np.concatenate(slice_offsets2, axis=-1)) x_timestep_1 = paddle.gather(x, slice_offsets, axis=0) x_timestep_2 = paddle.gather(x, slice_offsets2, axis=0) @@ -306,8 +307,7 @@ class VariableResolutionResamplerModel(nn.Layer): if self.tensor_parallel_degree > 1: num_pad = ( x.shape[0] + self.tensor_parallel_degree - 1 - ) // self.tensor_parallel_degree * self.tensor_parallel_degree - x.shape[ - 0] + ) // self.tensor_parallel_degree * self.tensor_parallel_degree - x.shape[0] if num_pad > 0: x = paddle.nn.functional.pad(x, [0, num_pad, 0, 0]) if self.tensor_parallel_degree > 1: @@ -350,22 +350,17 @@ class VariableResolutionResamplerModel(nn.Layer): if state_dict_key not in state_dict: state_dict_key = f"ernie.{self.prefix_name}.{param_name}" if state_dict_key not in state_dict: - raise ValueError( - f"The key {state_dict_key} does not exist in state_dict. " - ) + raise ValueError(f"The key {state_dict_key} does not exist in state_dict. 
") tensor = get_tensor(state_dict.pop(state_dict_key)) if param.shape != tensor.shape: - raise ValueError( - f"{state_dict_key} param.shape={param.shape} tensor.shape={tensor.shape}" - ) + raise ValueError(f"{state_dict_key} param.shape={param.shape} tensor.shape={tensor.shape}") else: param.copy_(tensor, False) @classmethod def _get_tensor_parallel_mappings(cls, config, is_split=True): - from paddleformers.transformers.conversion_utils import \ - split_or_merge_func + from paddleformers.transformers.conversion_utils import split_or_merge_func fn = split_or_merge_func( is_split=is_split, @@ -375,17 +370,17 @@ class VariableResolutionResamplerModel(nn.Layer): ) res = {"spatial_linear.0.weight": partial(fn, is_column=False)} for k in ( - "spatial_linear.0.bias", # row linear bias - "spatial_linear.2.weight", - "spatial_linear.2.bias", # linear - "spatial_linear.3.weight", - "spatial_linear.3.bias", # layernorm - "temporal_linear.0.weight", - "temporal_linear.0.weight", # linear - "temporal_linear.2.weight", - "temporal_linear.2.bias", # linear - "temporal_linear.3.weight", - "temporal_linear.3.bias", # bias + "spatial_linear.0.bias", # row linear bias + "spatial_linear.2.weight", + "spatial_linear.2.bias", # linear + "spatial_linear.3.weight", + "spatial_linear.3.bias", # layernorm + "temporal_linear.0.weight", + "temporal_linear.0.weight", # linear + "temporal_linear.2.weight", + "temporal_linear.2.bias", # linear + "temporal_linear.3.weight", + "temporal_linear.3.bias", # bias ): res.update({k: lambda x: x}) return res diff --git a/fastdeploy/model_executor/models/model_base.py b/fastdeploy/model_executor/models/model_base.py index 4150adb98..4f4702622 100644 --- a/fastdeploy/model_executor/models/model_base.py +++ b/fastdeploy/model_executor/models/model_base.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + from abc import ABC, abstractmethod from typing import Dict, Union @@ -25,14 +26,13 @@ class ModelRegistry: """ Used to register and retrieve model classes. """ + _registry = {} @classmethod def register(cls, model_class): """register model class""" - if issubclass( - model_class, - ModelForCasualLM) and model_class is not ModelForCasualLM: + if issubclass(model_class, ModelForCasualLM) and model_class is not ModelForCasualLM: cls._registry[model_class.name()] = model_class return model_class @@ -59,8 +59,7 @@ class ModelForCasualLM(nn.Layer, ABC): self.fd_config = configs @abstractmethod - def set_state_dict(self, state_dict: Dict[str, Union[np.ndarray, - paddle.Tensor]]): + def set_state_dict(self, state_dict: Dict[str, Union[np.ndarray, paddle.Tensor]]): """ Load model parameters from a given state dictionary. 
diff --git a/fastdeploy/model_executor/models/qwen2.py b/fastdeploy/model_executor/models/qwen2.py index 3ffc7874e..418f3cdfb 100644 --- a/fastdeploy/model_executor/models/qwen2.py +++ b/fastdeploy/model_executor/models/qwen2.py @@ -25,21 +25,24 @@ from paddleformers.utils.log import logger from fastdeploy.config import FDConfig, ModelConfig from fastdeploy.model_executor.forward_meta import ForwardMeta -from fastdeploy.model_executor.graph_optimization.decorator import \ - support_graph_optimization +from fastdeploy.model_executor.graph_optimization.decorator import ( + support_graph_optimization, +) from fastdeploy.model_executor.layers.activation import SiluAndMul from fastdeploy.model_executor.layers.attention.attention import Attention from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding from fastdeploy.model_executor.layers.linear import ( - MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) + MergedColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear, +) from fastdeploy.model_executor.layers.lm_head import ParallelLMHead from fastdeploy.model_executor.layers.normalization import RMSNorm from fastdeploy.model_executor.models.model_base import ModelForCasualLM class Qwen2MLP(nn.Layer): - """ - """ + """ """ def __init__( self, @@ -72,14 +75,12 @@ class Qwen2MLP(nn.Layer): ) def load_state_dict(self, state_dict): - """ - """ + """ """ self.up_gate_proj.load_state_dict(state_dict) self.down_proj.load_state_dict(state_dict) def forward(self, x): - """ - """ + """ """ gate_up_out = self.up_gate_proj(x) act_out = self.act_fn(gate_up_out) down_out = self.down_proj(act_out) @@ -87,18 +88,12 @@ class Qwen2MLP(nn.Layer): class Qwen2Attention(nn.Layer): - """ - """ + """ """ - def __init__(self, - fd_config: FDConfig, - layer_id: int, - prefix: str = "") -> None: + def __init__(self, fd_config: FDConfig, layer_id: int, prefix: str = "") -> None: super().__init__() - self.qkv_proj = QKVParallelLinear(fd_config=fd_config, - prefix=f"{prefix}.qkv_proj", - with_bias=True) + self.qkv_proj = QKVParallelLinear(fd_config=fd_config, prefix=f"{prefix}.qkv_proj", with_bias=True) self.o_proj = RowParallelLinear( fd_config=fd_config, @@ -107,14 +102,15 @@ class Qwen2Attention(nn.Layer): output_size=fd_config.model_config.hidden_size, ) - self.attn = Attention(fd_config=fd_config, - layer_id=layer_id, - prefix=prefix, - use_neox_rotary_style=True) + self.attn = Attention( + fd_config=fd_config, + layer_id=layer_id, + prefix=prefix, + use_neox_rotary_style=True, + ) def load_state_dict(self, state_dict): - """ - """ + """ """ self.qkv_proj.load_state_dict(state_dict) self.o_proj.load_state_dict(state_dict) @@ -123,8 +119,7 @@ class Qwen2Attention(nn.Layer): forward_meta: ForwardMeta, hidden_states: paddle.Tensor, ): - """ - """ + """ """ qkv_out = self.qkv_proj(hidden_states) atten_out = self.attn( @@ -136,8 +131,7 @@ class Qwen2Attention(nn.Layer): class Qwen2DecoderLayer(nn.Layer): - """ - """ + """ """ def __init__( self, @@ -145,7 +139,7 @@ class Qwen2DecoderLayer(nn.Layer): prefix: str = "", ) -> None: super().__init__() - layer_id = int(prefix.split(sep='.')[-1]) + layer_id = int(prefix.split(sep=".")[-1]) self.self_attn = Qwen2Attention( fd_config=fd_config, @@ -173,8 +167,7 @@ class Qwen2DecoderLayer(nn.Layer): ) def load_state_dict(self, state_dict): - """ - """ + """ """ self.self_attn.load_state_dict(state_dict) self.mlp.load_state_dict(state_dict) self.input_layernorm.load_state_dict(state_dict) @@ -186,15 +179,13 @@ class 
Qwen2DecoderLayer(nn.Layer): hidden_states: paddle.Tensor, residual: paddle.Tensor = None, ): - """ - """ + """ """ # Self Attention if residual is None: residual = hidden_states hidden_states = self.input_layernorm(hidden_states) else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) + hidden_states, residual = self.input_layernorm(hidden_states, residual) hidden_states = self.self_attn( hidden_states=hidden_states, @@ -202,8 +193,7 @@ class Qwen2DecoderLayer(nn.Layer): ) # Fully Connected - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) + hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) hidden_states = self.mlp(hidden_states) @@ -212,8 +202,7 @@ class Qwen2DecoderLayer(nn.Layer): @support_graph_optimization class Qwen2Model(nn.Layer): - """ - """ + """ """ def __init__( self, @@ -238,12 +227,15 @@ class Qwen2Model(nn.Layer): prefix=(f"{fd_config.model_config.pretrained_config.prefix_name}.embed_tokens"), ) - self.layers = nn.LayerList([ - Qwen2DecoderLayer( - fd_config=fd_config, - prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}") - for i in range(self.num_layers) - ]) + self.layers = nn.LayerList( + [ + Qwen2DecoderLayer( + fd_config=fd_config, + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}", + ) + for i in range(self.num_layers) + ] + ) self.norm = RMSNorm( fd_config, @@ -272,16 +264,14 @@ class Qwen2Model(nn.Layer): ids_remove_padding: paddle.Tensor, forward_meta: ForwardMeta, ): - """ - """ + """ """ hidden_states = self.embed_tokens(ids_remove_padding=ids_remove_padding) residual = None for i in range(self.num_layers): - hidden_states, residual = self.layers[i](forward_meta, - hidden_states, residual) + hidden_states, residual = self.layers[i](forward_meta, hidden_states, residual) hidden_states = hidden_states + residual @@ -302,7 +292,7 @@ class Qwen2ForCausalLM(ModelForCasualLM): """ super(Qwen2ForCausalLM, self).__init__(fd_config) - self.fd_config =fd_config + self.fd_config = fd_config self.qwen2 = Qwen2Model(fd_config=fd_config) self.ori_vocab_size = fd_config.model_config.ori_vocab_size @@ -316,8 +306,7 @@ class Qwen2ForCausalLM(ModelForCasualLM): @classmethod def name(self): - """ - """ + """ """ return "Qwen2ForCausalLM" @paddle.no_grad() @@ -334,11 +323,10 @@ class Qwen2ForCausalLM(ModelForCasualLM): self.lm_head.load_state_dict(state_dict) def compute_logits(self, hidden_states: paddle.Tensor): - """ - """ + """ """ logits = self.lm_head(hidden_states) logits = paddle.cast(logits, paddle.float32) - logits[:, self.ori_vocab_size:] = -float("inf") + logits[:, self.ori_vocab_size :] = -float("inf") return logits @@ -347,10 +335,8 @@ class Qwen2ForCausalLM(ModelForCasualLM): ids_remove_padding: paddle.Tensor, forward_meta: ForwardMeta, ): - """ - """ - hidden_states = self.qwen2(ids_remove_padding=ids_remove_padding, - forward_meta=forward_meta) + """ """ + hidden_states = self.qwen2(ids_remove_padding=ids_remove_padding, forward_meta=forward_meta) return hidden_states @@ -371,8 +357,7 @@ class Qwen2PretrainedModel(PretrainedModel): @classmethod def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True): - from paddleformers.transformers.conversion_utils import \ - split_or_merge_func + from paddleformers.transformers.conversion_utils import split_or_merge_func fn = split_or_merge_func( is_split=is_split, @@ -388,41 +373,30 @@ class Qwen2PretrainedModel(PretrainedModel): "lm_head.weight": partial(fn, 
is_column=True), # Row Linear "embed_tokens.weight": partial(fn, is_column=False), - "layers.0.self_attn.o_proj.weight": partial(fn, - is_column=False), + "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False), "layers.0.mlp.down_proj.weight": partial(fn, is_column=False), } # Column Linear if config.fuse_attention_qkv: - base_actions["layers.0.self_attn.qkv_proj.weight"] = partial( - fn, is_column=True) + base_actions["layers.0.self_attn.qkv_proj.weight"] = partial(fn, is_column=True) else: - base_actions["layers.0.self_attn.q_proj.weight"] = partial( - fn, is_column=True) - base_actions["layers.0.self_attn.q_proj.bias"] = partial( - fn, is_column=True) + base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.q_proj.bias"] = partial(fn, is_column=True) # if we have enough num_key_value_heads to split, then split it. if config.num_key_value_heads % config.tensor_parallel_degree == 0: - base_actions["layers.0.self_attn.k_proj.weight"] = partial( - fn, is_column=True) - base_actions["layers.0.self_attn.v_proj.weight"] = partial( - fn, is_column=True) - base_actions["layers.0.self_attn.k_proj.bias"] = partial( - fn, is_column=True) - base_actions["layers.0.self_attn.v_proj.bias"] = partial( - fn, is_column=True) + base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.k_proj.bias"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.bias"] = partial(fn, is_column=True) - base_actions["layers.0.mlp.gate_proj.weight"] = partial( - fn, is_column=True) - base_actions["layers.0.mlp.up_proj.weight"] = partial( - fn, is_column=True) + base_actions["layers.0.mlp.gate_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.mlp.up_proj.weight"] = partial(fn, is_column=True) for key, action in base_actions.items(): if "layers.0." 
in key: for i in range(num_layers): - final_actions[key.replace("layers.0.", - f"layers.{i}.")] = action + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action final_actions[key] = action return final_actions diff --git a/fastdeploy/model_executor/models/qwen3.py b/fastdeploy/model_executor/models/qwen3.py index 4f7642bee..9dee0f8e4 100644 --- a/fastdeploy/model_executor/models/qwen3.py +++ b/fastdeploy/model_executor/models/qwen3.py @@ -25,12 +25,12 @@ from paddleformers.utils.log import logger from fastdeploy.config import FDConfig from fastdeploy.model_executor.forward_meta import ForwardMeta -from fastdeploy.model_executor.graph_optimization.decorator import \ - support_graph_optimization +from fastdeploy.model_executor.graph_optimization.decorator import ( + support_graph_optimization, +) from fastdeploy.model_executor.layers.attention.attention import Attention from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding -from fastdeploy.model_executor.layers.linear import (QKVParallelLinear, - RowParallelLinear) +from fastdeploy.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear from fastdeploy.model_executor.layers.lm_head import ParallelLMHead from fastdeploy.model_executor.layers.normalization import RMSNorm from fastdeploy.model_executor.models.model_base import ModelForCasualLM @@ -38,52 +38,51 @@ from fastdeploy.model_executor.models.qwen2 import Qwen2DecoderLayer, Qwen2MLP class Qwen3MLP(Qwen2MLP): - """ - """ + """ """ + pass class Qwen3Attention(nn.Layer): - """ - """ + """ """ - def __init__(self, - fd_config: FDConfig, - layer_id: int, - prefix: str = "") -> None: + def __init__(self, fd_config: FDConfig, layer_id: int, prefix: str = "") -> None: super().__init__() self.fd_config = fd_config self.head_dim = fd_config.model_config.head_dim - self.qkv_proj = QKVParallelLinear(fd_config, - prefix=f"{prefix}.qkv_proj", - with_bias=False) + self.qkv_proj = QKVParallelLinear(fd_config, prefix=f"{prefix}.qkv_proj", with_bias=False) nranks = fd_config.parallel_config.tensor_parallel_size self.o_proj = RowParallelLinear( fd_config, prefix=f"{prefix}.o_proj", - input_size=fd_config.model_config.head_dim * - fd_config.model_config.num_attention_heads, + input_size=fd_config.model_config.head_dim * fd_config.model_config.num_attention_heads, output_size=fd_config.model_config.hidden_size, ) - self.attn = Attention(fd_config, - layer_id=layer_id, - prefix=prefix, - use_neox_rotary_style=True) + self.attn = Attention( + fd_config, + layer_id=layer_id, + prefix=prefix, + use_neox_rotary_style=True, + ) - self.q_norm = RMSNorm(fd_config, - hidden_size=self.head_dim, - eps=fd_config.model_config.rms_norm_eps, - prefix=f"{prefix}.q_norm", - begin_norm_axis=2) - self.k_norm = RMSNorm(fd_config, - hidden_size=self.head_dim, - eps=fd_config.model_config.rms_norm_eps, - prefix=f"{prefix}.k_norm", - begin_norm_axis=2) + self.q_norm = RMSNorm( + fd_config, + hidden_size=self.head_dim, + eps=fd_config.model_config.rms_norm_eps, + prefix=f"{prefix}.q_norm", + begin_norm_axis=2, + ) + self.k_norm = RMSNorm( + fd_config, + hidden_size=self.head_dim, + eps=fd_config.model_config.rms_norm_eps, + prefix=f"{prefix}.k_norm", + begin_norm_axis=2, + ) nranks = fd_config.parallel_config.tensor_parallel_size num_kv_heads_replicas = max(1, nranks // fd_config.model_config.num_key_value_heads) @@ -91,8 +90,7 @@ class Qwen3Attention(nn.Layer): self.kv_size = fd_config.model_config.num_key_value_heads * self.head_dim * num_kv_heads_replicas // nranks def 
load_state_dict(self, state_dict): - """ - """ + """ """ self.qkv_proj.load_state_dict(state_dict) self.o_proj.load_state_dict(state_dict) self.q_norm.load_state_dict(state_dict) @@ -103,20 +101,16 @@ class Qwen3Attention(nn.Layer): forward_meta: ForwardMeta, hidden_states: paddle.Tensor, ): - """ - """ + """ """ qkv_out = self.qkv_proj(hidden_states) # origin_qkv_out = qkv_out - q, k, v = qkv_out.split([self.q_size, self.kv_size, self.kv_size], - axis=-1) + q, k, v = qkv_out.split([self.q_size, self.kv_size, self.kv_size], axis=-1) - q_by_head = q.reshape( - [*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim]) + q_by_head = q.reshape([*q.shape[:-1], q.shape[-1] // self.head_dim, self.head_dim]) q_by_head = self.q_norm(q_by_head) q = q_by_head.reshape(q.shape) - k_by_head = k.reshape( - [*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim]) + k_by_head = k.reshape([*k.shape[:-1], k.shape[-1] // self.head_dim, self.head_dim]) k_by_head = self.k_norm(k_by_head) k = k_by_head.reshape(k.shape) @@ -131,8 +125,7 @@ class Qwen3Attention(nn.Layer): class Qwen3DecoderLayer(Qwen2DecoderLayer): - """ - """ + """ """ def __init__( self, @@ -140,16 +133,13 @@ class Qwen3DecoderLayer(Qwen2DecoderLayer): prefix: str = "", ) -> None: super().__init__(fd_config, prefix) - layer_id = int(prefix.split(sep='.')[-1]) - self.self_attn = Qwen3Attention(fd_config=fd_config, - layer_id=layer_id, - prefix=f"{prefix}.self_attn") + layer_id = int(prefix.split(sep=".")[-1]) + self.self_attn = Qwen3Attention(fd_config=fd_config, layer_id=layer_id, prefix=f"{prefix}.self_attn") @support_graph_optimization class Qwen3Model(nn.Layer): - """ - """ + """ """ def __init__( self, @@ -174,12 +164,15 @@ class Qwen3Model(nn.Layer): prefix=(f"{fd_config.model_config.pretrained_config.prefix_name}.embed_tokens"), ) - self.layers = nn.LayerList([ - Qwen3DecoderLayer( - fd_config=fd_config, - prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}") - for i in range(self.num_layers) - ]) + self.layers = nn.LayerList( + [ + Qwen3DecoderLayer( + fd_config=fd_config, + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}", + ) + for i in range(self.num_layers) + ] + ) self.norm = RMSNorm( fd_config, @@ -208,15 +201,13 @@ class Qwen3Model(nn.Layer): ids_remove_padding: paddle.Tensor, forward_meta: ForwardMeta, ): - """ - """ + """ """ hidden_states = self.embed_tokens(ids_remove_padding=ids_remove_padding) residual = None for i in range(self.num_layers): - hidden_states, residual = self.layers[i](forward_meta, - hidden_states, residual) + hidden_states, residual = self.layers[i](forward_meta, hidden_states, residual) hidden_states = hidden_states + residual @@ -250,8 +241,7 @@ class Qwen3ForCausalLM(ModelForCasualLM): @classmethod def name(self): - """ - """ + """ """ return "Qwen3ForCausalLM" @paddle.no_grad() @@ -266,17 +256,15 @@ class Qwen3ForCausalLM(ModelForCasualLM): """ self.model.load_state_dict(state_dict) if self.tie_word_embeddings: - self.lm_head.linear.weight.set_value( - self.model.embed_tokens.embeddings.weight.transpose([1, 0])) + self.lm_head.linear.weight.set_value(self.model.embed_tokens.embeddings.weight.transpose([1, 0])) else: self.lm_head.load_state_dict(state_dict) def compute_logits(self, hidden_states: paddle.Tensor): - """ - """ + """ """ logits = self.lm_head(hidden_states) logits = paddle.cast(logits, paddle.float32) - logits[:, self.ori_vocab_size:] = -float("inf") + logits[:, self.ori_vocab_size :] = -float("inf") return logits @@ -285,10 +273,8 @@ 
class Qwen3ForCausalLM(ModelForCasualLM): ids_remove_padding: paddle.Tensor, forward_meta: ForwardMeta, ): - """ - """ - hidden_states = self.model(ids_remove_padding=ids_remove_padding, - forward_meta=forward_meta) + """ """ + hidden_states = self.model(ids_remove_padding=ids_remove_padding, forward_meta=forward_meta) return hidden_states @@ -309,8 +295,7 @@ class Qwen3PretrainedModel(PretrainedModel): @classmethod def _get_tensor_parallel_mappings(cls, config, is_split=True): - from paddleformers.transformers.conversion_utils import \ - split_or_merge_func + from paddleformers.transformers.conversion_utils import split_or_merge_func fn = split_or_merge_func( is_split=is_split, @@ -326,34 +311,26 @@ class Qwen3PretrainedModel(PretrainedModel): # Row Linear "lm_head.weight": partial(fn, is_column=True), "embed_tokens.weight": partial(fn, is_column=False), - "layers.0.self_attn.o_proj.weight": partial(fn, - is_column=False), + "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False), "layers.0.mlp.down_proj.weight": partial(fn, is_column=False), } # Column Linear - base_actions["layers.0.self_attn.q_proj.weight"] = partial( - fn, is_column=True) - base_actions["layers.0.self_attn.q_proj.bias"] = partial( - fn, is_column=True) + base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.q_proj.bias"] = partial(fn, is_column=True) # if we have enough num_key_value_heads to split, then split it. if config.num_key_value_heads % config.tensor_parallel_degree == 0: - base_actions["layers.0.self_attn.k_proj.weight"] = partial( - fn, is_column=True) - base_actions["layers.0.self_attn.v_proj.weight"] = partial( - fn, is_column=True) + base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) - base_actions["layers.0.mlp.gate_proj.weight"] = partial( - fn, is_column=True) - base_actions["layers.0.mlp.up_proj.weight"] = partial( - fn, is_column=True) + base_actions["layers.0.mlp.gate_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.mlp.up_proj.weight"] = partial(fn, is_column=True) for key, action in base_actions.items(): if "layers.0." 
in key: for i in range(num_layers): - final_actions[key.replace("layers.0.", - f"layers.{i}.")] = action + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action final_actions[key] = action return final_actions diff --git a/fastdeploy/model_executor/models/qwen3moe.py b/fastdeploy/model_executor/models/qwen3moe.py index 11d387a54..bcf9dbe6a 100644 --- a/fastdeploy/model_executor/models/qwen3moe.py +++ b/fastdeploy/model_executor/models/qwen3moe.py @@ -25,12 +25,15 @@ from paddleformers.utils.log import logger from fastdeploy.config import FDConfig from fastdeploy.model_executor.forward_meta import ForwardMeta -from fastdeploy.model_executor.graph_optimization.decorator import \ - support_graph_optimization +from fastdeploy.model_executor.graph_optimization.decorator import ( + support_graph_optimization, +) from fastdeploy.model_executor.layers.activation import SiluAndMul from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding from fastdeploy.model_executor.layers.linear import ( - MergedColumnParallelLinear, RowParallelLinear) + MergedColumnParallelLinear, + RowParallelLinear, +) from fastdeploy.model_executor.layers.lm_head import ParallelLMHead from fastdeploy.model_executor.layers.moe.moe import FusedMoE from fastdeploy.model_executor.layers.normalization import RMSNorm @@ -39,8 +42,7 @@ from fastdeploy.model_executor.models.qwen3 import Qwen3Attention class Qwen3MLP(nn.Layer): - """ - """ + """ """ def __init__( self, @@ -74,14 +76,12 @@ class Qwen3MLP(nn.Layer): ) def load_state_dict(self, state_dict): - """ - """ + """ """ self.up_gate_proj.load_state_dict(state_dict) self.down_proj.load_state_dict(state_dict) def forward(self, x): - """ - """ + """ """ gate_up_out = self.up_gate_proj(x) act_out = self.act_fn(gate_up_out) down_out = self.down_proj(act_out) @@ -89,8 +89,7 @@ class Qwen3MLP(nn.Layer): class Qwen3DecoderLayer(nn.Layer): - """ - """ + """ """ def __init__( self, @@ -98,7 +97,7 @@ class Qwen3DecoderLayer(nn.Layer): prefix: str = "", ) -> None: super().__init__() - layer_id = int(prefix.split(sep='.')[-1]) + layer_id = int(prefix.split(sep=".")[-1]) self.self_attn = Qwen3Attention( fd_config=fd_config, @@ -106,24 +105,24 @@ class Qwen3DecoderLayer(nn.Layer): prefix=f"{prefix}.self_attn", ) weight_key_map = { - "gate_weight_key": - f"{prefix}.mlp.gate.weight", - "up_gate_proj_expert_weight_key": - f"{prefix}.mlp.experts.{{}}.up_gate_proj.weight", - "down_proj_expert_weight_key": - f"{prefix}.mlp.experts.{{}}.down_proj.weight", + "gate_weight_key": f"{prefix}.mlp.gate.weight", + "up_gate_proj_expert_weight_key": f"{prefix}.mlp.experts.{{}}.up_gate_proj.weight", + "down_proj_expert_weight_key": f"{prefix}.mlp.experts.{{}}.down_proj.weight", } - if (fd_config.model_config.moe_num_experts is not None - and layer_id >= fd_config.model_config.moe_layer_start_index): + if ( + fd_config.model_config.moe_num_experts is not None + and layer_id >= fd_config.model_config.moe_layer_start_index + ): - self.mlp = FusedMoE(fd_config, - moe_intermediate_size=fd_config.model_config. 
- moe_intermediate_size, - num_experts=fd_config.model_config.moe_num_experts, - top_k=fd_config.model_config.moe_topk, - layer_idx=layer_id, - weight_key_map=weight_key_map) + self.mlp = FusedMoE( + fd_config, + moe_intermediate_size=fd_config.model_config.moe_intermediate_size, + num_experts=fd_config.model_config.moe_num_experts, + top_k=fd_config.model_config.moe_topk, + layer_idx=layer_id, + weight_key_map=weight_key_map, + ) else: self.mlp = Qwen3MLP( fd_config, @@ -145,8 +144,7 @@ class Qwen3DecoderLayer(nn.Layer): ) def load_state_dict(self, state_dict): - """ - """ + """ """ self.self_attn.load_state_dict(state_dict) self.mlp.load_state_dict(state_dict) self.input_layernorm.load_state_dict(state_dict) @@ -158,14 +156,12 @@ class Qwen3DecoderLayer(nn.Layer): hidden_states: paddle.Tensor, residual: paddle.Tensor = None, ): - """ - """ + """ """ if residual is None: residual = hidden_states hidden_states = self.input_layernorm(hidden_states) else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) + hidden_states, residual = self.input_layernorm(hidden_states, residual) hidden_states = self.self_attn( hidden_states=hidden_states, @@ -173,8 +169,7 @@ class Qwen3DecoderLayer(nn.Layer): ) # Fully Connected - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) + hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) hidden_states = self.mlp(hidden_states) @@ -183,8 +178,7 @@ class Qwen3DecoderLayer(nn.Layer): @support_graph_optimization class Qwen3MoeModel(nn.Layer): - """ - """ + """ """ def __init__( self, @@ -209,12 +203,15 @@ class Qwen3MoeModel(nn.Layer): prefix=(f"{fd_config.model_config.pretrained_config.prefix_name}.embed_tokens"), ) - self.layers = nn.LayerList([ - Qwen3DecoderLayer( - fd_config, - prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}") - for i in range(self.num_layers) - ]) + self.layers = nn.LayerList( + [ + Qwen3DecoderLayer( + fd_config, + prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}", + ) + for i in range(self.num_layers) + ] + ) self.norm = RMSNorm( fd_config, @@ -243,15 +240,13 @@ class Qwen3MoeModel(nn.Layer): ids_remove_padding: paddle.Tensor, forward_meta: ForwardMeta, ): - """ - """ + """ """ hidden_states = self.embed_tokens(ids_remove_padding=ids_remove_padding) residual = None for i in range(self.num_layers): - hidden_states, residual = self.layers[i](forward_meta, - hidden_states, residual) + hidden_states, residual = self.layers[i](forward_meta, hidden_states, residual) hidden_states = hidden_states + residual out = self.norm(hidden_states) @@ -284,8 +279,7 @@ class Qwen3MoeForCausalLM(ModelForCasualLM): @classmethod def name(self): - """ - """ + """ """ return "Qwen3MoeForCausalLM" @paddle.no_grad() @@ -302,11 +296,10 @@ class Qwen3MoeForCausalLM(ModelForCasualLM): self.lm_head.load_state_dict(state_dict) def compute_logits(self, hidden_states: paddle.Tensor): - """ - """ + """ """ logits = self.lm_head(hidden_states) logits = paddle.cast(logits, paddle.float32) - logits[:, self.ori_vocab_size:] = -float("inf") + logits[:, self.ori_vocab_size :] = -float("inf") return logits @@ -315,10 +308,8 @@ class Qwen3MoeForCausalLM(ModelForCasualLM): ids_remove_padding: paddle.Tensor, forward_meta: ForwardMeta, ): - """ - """ - hidden_states = self.model(ids_remove_padding=ids_remove_padding, - forward_meta=forward_meta) + """ """ + hidden_states = self.model(ids_remove_padding=ids_remove_padding, forward_meta=forward_meta) 
return hidden_states @@ -340,8 +331,7 @@ class Qwen3MoePretrainedModel(PretrainedModel): def _get_tensor_parallel_mappings(cls, config, is_split=True): # TODO not support TP split now, next PR will support TP. - from paddleformers.transformers.conversion_utils import \ - split_or_merge_func + from paddleformers.transformers.conversion_utils import split_or_merge_func fn = split_or_merge_func( is_split=is_split, @@ -357,45 +347,33 @@ class Qwen3MoePretrainedModel(PretrainedModel): "lm_head.weight": partial(fn, is_column=True), # Row Linear "embed_tokens.weight": partial(fn, is_column=False), - "layers.0.self_attn.o_proj.weight": partial(fn, - is_column=False), + "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False), } # Column Linear config.fuse_attention_qkv = False if config.fuse_attention_qkv: - base_actions["layers.0.self_attn.qkv_proj.weight"] = partial( - fn, is_column=True) + base_actions["layers.0.self_attn.qkv_proj.weight"] = partial(fn, is_column=True) else: - base_actions["layers.0.self_attn.q_proj.weight"] = partial( - fn, is_column=True) - base_actions["layers.0.self_attn.q_proj.bias"] = partial( - fn, is_column=True) + base_actions["layers.0.self_attn.q_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.q_proj.bias"] = partial(fn, is_column=True) # if we have enough num_key_value_heads to split, then split it. if config.num_key_value_heads % config.tensor_parallel_degree == 0: - base_actions["layers.0.self_attn.k_proj.weight"] = partial( - fn, is_column=True) - base_actions["layers.0.self_attn.v_proj.weight"] = partial( - fn, is_column=True) - base_actions["layers.0.self_attn.k_proj.bias"] = partial( - fn, is_column=True) - base_actions["layers.0.self_attn.v_proj.bias"] = partial( - fn, is_column=True) + base_actions["layers.0.self_attn.k_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.weight"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.k_proj.bias"] = partial(fn, is_column=True) + base_actions["layers.0.self_attn.v_proj.bias"] = partial(fn, is_column=True) for key, action in base_actions.items(): if "layers.0." 
in key: for i in range(num_layers): - final_actions[key.replace("layers.0.", - f"layers.{i}.")] = action + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action final_actions[key] = action base_actions = { - "layers.0.mlp.experts.0.gate_proj.weight": - partial(fn, is_column=True), - "layers.0.mlp.experts.0.down_proj.weight": - partial(fn, is_column=False), - "layers.0.mlp.experts.0.up_proj.weight": - partial(fn, is_column=True), + "layers.0.mlp.experts.0.gate_proj.weight": partial(fn, is_column=True), + "layers.0.mlp.experts.0.down_proj.weight": partial(fn, is_column=False), + "layers.0.mlp.experts.0.up_proj.weight": partial(fn, is_column=True), } for key, action in base_actions.items(): @@ -413,11 +391,8 @@ class Qwen3MoePretrainedModel(PretrainedModel): elif isinstance(config.moe_num_experts, int): num_experts = config.moe_num_experts else: - raise ValueError( - f"Not support type of num_experts [{type(config.moe_num_experts)}]" - ) + raise ValueError(f"Not support type of num_experts [{type(config.moe_num_experts)}]") - mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers, - num_experts) + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers, num_experts) return mappings diff --git a/fastdeploy/model_executor/models/tp_utils.py b/fastdeploy/model_executor/models/tp_utils.py index 001a6ce08..65d8b48fc 100644 --- a/fastdeploy/model_executor/models/tp_utils.py +++ b/fastdeploy/model_executor/models/tp_utils.py @@ -38,15 +38,14 @@ def check_tensor_parallel_prerequisites( """check_tensor_parallel_prerequisites""" if fd_config.parallel_config.tensor_parallel_size > 1: tensor_parallel_map = cls._get_tensor_parallel_mappings( - fd_config.model_config.pretrained_config, is_split=True) + fd_config.model_config.pretrained_config, is_split=True + ) if not tensor_parallel_map: logger.error( "filtered_quant_map should not be empty. \ parallel splitting required, but _get_tensor_parallel_mappings is not implemented." 
) - filtered_tp_keys = cls._resolve_prefix_keys( - tensor_parallel_map.keys(), safetensor_keys - ) + filtered_tp_keys = cls._resolve_prefix_keys(tensor_parallel_map.keys(), safetensor_keys) for k, v in filtered_tp_keys.items(): tensor_parallel_filtered_map[v] = tensor_parallel_map.pop(k) if not tensor_parallel_filtered_map: @@ -176,9 +175,7 @@ def build_expanded_keys( if start_layer < 0: continue for layer_id in range(start_layer, num_layers): - for export_id in range( - text_num_experts, text_num_experts + img_num_experts - ): + for export_id in range(text_num_experts, text_num_experts + img_num_experts): update_final_actions( { LayerIdPlaceholder.MOE_LAYER_ID.value: layer_id, @@ -188,10 +185,7 @@ def build_expanded_keys( key, action, ) - elif ( - LayerIdPlaceholder.MOE_LAYER_ID.value in placeholders - and len(placeholders) == 1 - ): + elif LayerIdPlaceholder.MOE_LAYER_ID.value in placeholders and len(placeholders) == 1: if start_layer < 0: continue for layer_id in range(start_layer, num_layers): @@ -222,8 +216,7 @@ def gqa_qkv_split_func( def get_shape(tensor): """get_shape""" - return tensor.get_shape() if hasattr(tensor, - "get_shape") else tensor.shape + return tensor.get_shape() if hasattr(tensor, "get_shape") else tensor.shape def slice_tensor(tensor, start, end): """slice_tensor""" @@ -251,10 +244,7 @@ def gqa_qkv_split_func( size = shape[-1] if is_column else shape[0] block_size = size // degree if hasattr(tensor, "get_shape"): - return [ - slice_tensor(tensor, i * block_size, (i + 1) * block_size) - for i in range(degree) - ] + return [slice_tensor(tensor, i * block_size, (i + 1) * block_size) for i in range(degree)] else: if isinstance(x, paddle.Tensor): if is_column: @@ -342,8 +332,8 @@ def gqa_qkv_merge_func(num_attention_heads, num_key_value_heads, head_dim): def fn(weight_list, is_column=True): """fn""" tensor_parallel_degree = len(weight_list) - num_attention_heads = num_attention_heads // tensor_parallel_degree # noqa: F823 - num_key_value_heads = num_key_value_heads // tensor_parallel_degree # noqa: F823 + local_num_attention_heads = num_attention_heads // tensor_parallel_degree + local_num_key_value_heads = num_key_value_heads // tensor_parallel_degree is_paddle_tensor = not isinstance(weight_list[0], np.ndarray) @@ -351,8 +341,7 @@ def gqa_qkv_merge_func(num_attention_heads, num_key_value_heads, head_dim): """ get_shape """ - return tensor.get_shape() if hasattr(tensor, - "get_shape") else tensor.shape + return tensor.get_shape() if hasattr(tensor, "get_shape") else tensor.shape def slice_tensor(tensor, start, end): """ @@ -368,9 +357,9 @@ def gqa_qkv_merge_func(num_attention_heads, num_key_value_heads, head_dim): q_list, k_list, v_list = [], [], [] for weight in weight_list: - q_end = num_attention_heads * head_dim - k_end = q_end + num_key_value_heads * head_dim - v_end = k_end + num_key_value_heads * head_dim + q_end = local_num_attention_heads * head_dim + k_end = q_end + local_num_key_value_heads * head_dim + v_end = k_end + local_num_key_value_heads * head_dim q = slice_tensor(weight, 0, q_end) k = slice_tensor(weight, q_end, k_end) diff --git a/fastdeploy/model_executor/models/utils.py b/fastdeploy/model_executor/models/utils.py index 14a9edfad..063344d19 100644 --- a/fastdeploy/model_executor/models/utils.py +++ b/fastdeploy/model_executor/models/utils.py @@ -31,10 +31,12 @@ import paddle from paddle.common_ops_import import convert_dtype from paddleformers.transformers.model_utils import _add_variant from paddleformers.transformers.utils import 
paddleformers_load -from paddleformers.utils.env import (PADDLE_WEIGHTS_INDEX_NAME, - SAFE_MASTER_WEIGHTS_INDEX_NAME, - SAFE_PEFT_WEIGHTS_INDEX_NAME, - SAFE_WEIGHTS_INDEX_NAME) +from paddleformers.utils.env import ( + PADDLE_WEIGHTS_INDEX_NAME, + SAFE_MASTER_WEIGHTS_INDEX_NAME, + SAFE_PEFT_WEIGHTS_INDEX_NAME, + SAFE_WEIGHTS_INDEX_NAME, +) from paddleformers.utils.log import logger from tqdm import tqdm @@ -44,6 +46,7 @@ MAX_DRAFT_TOKENS = 6 class LayerIdPlaceholder(str, enum.Enum): """LayerIdPlaceholder""" + LAYER_ID = "layer_id" FFN_LAYER_ID = "ffn_layer_id" MOE_LAYER_ID = "moe_layer_id" @@ -51,6 +54,7 @@ class LayerIdPlaceholder(str, enum.Enum): TEXT_EXPERT_ID = "text_export_id" IMG_EXPERT_ID = "img_export_id" + class WeightMeta(NamedTuple): """ #tensor split parameters @@ -59,6 +63,7 @@ class WeightMeta(NamedTuple): # is_column: whether to split by columns # extra: optional flags like "is_naive_2fuse", "is_gqa", "is_naive_3fuse" """ + weight_name: str is_column: bool extra: Optional[str] = None @@ -81,8 +86,7 @@ class UniqueIDGenerator: first_key = sorted_keys[0] first_parameter = state_dict[first_key].cast("float32") # 假设模型参数是唯一的,通过第一个key来获取md5sum - model_md5 = hashlib.md5(str( - first_parameter.sum()).encode("utf-8")).hexdigest() + model_md5 = hashlib.md5(str(first_parameter.sum()).encode("utf-8")).hexdigest() unique_id = f"{model_md5}-{random.randint(10000, 99999)}" return unique_id @@ -99,20 +103,16 @@ def load_sharded_checkpoint(folder, variant=None, return_numpy=False): """ # Load the index - pdparams_file = os.path.join(folder, - _add_variant("model_state.pdparams", variant)) - lora_pdparams_file = os.path.join( - folder, _add_variant("lora_model_state.pdparams", variant)) - safetensors_file = os.path.join(folder, - _add_variant("model.safetensors", variant)) + pdparams_file = os.path.join(folder, _add_variant("model_state.pdparams", variant)) + lora_pdparams_file = os.path.join(folder, _add_variant("lora_model_state.pdparams", variant)) + safetensors_file = os.path.join(folder, _add_variant("model.safetensors", variant)) if os.path.isfile(pdparams_file): return paddle.load(pdparams_file, return_numpy=return_numpy) if os.path.isfile(lora_pdparams_file): return paddle.load(lora_pdparams_file, return_numpy=return_numpy) if os.path.isfile(safetensors_file): try: - from paddleformers.utils.safetensors import \ - fast_load_file as safe_load_file + from paddleformers.utils.safetensors import fast_load_file as safe_load_file except ImportError: from safetensors.numpy import load_file as safe_load_file @@ -120,18 +120,13 @@ def load_sharded_checkpoint(folder, variant=None, return_numpy=False): if not return_numpy: for key in list(state_dict.keys()): if isinstance(state_dict[key], np.ndarray): - state_dict[key] = paddle.Tensor(state_dict.pop(key), - zero_copy=True) + state_dict[key] = paddle.Tensor(state_dict.pop(key), zero_copy=True) return state_dict - index_file = os.path.join(folder, - _add_variant(PADDLE_WEIGHTS_INDEX_NAME, variant)) - safe_index_file = os.path.join( - folder, _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant)) - safe_master_file = os.path.join( - folder, _add_variant(SAFE_MASTER_WEIGHTS_INDEX_NAME, variant)) - safe_peft_file = os.path.join( - folder, _add_variant(SAFE_PEFT_WEIGHTS_INDEX_NAME, variant)) + index_file = os.path.join(folder, _add_variant(PADDLE_WEIGHTS_INDEX_NAME, variant)) + safe_index_file = os.path.join(folder, _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant)) + safe_master_file = os.path.join(folder, _add_variant(SAFE_MASTER_WEIGHTS_INDEX_NAME, variant)) 
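# Aside on the reformatted load_sharded_checkpoint above: when only an index file is
# present, the loader reads the index, deduplicates the shard files listed in its
# weight_map, and merges the tensors from every shard into one state dict. The sketch
# below shows only that pattern; it assumes a standard model.safetensors.index.json
# layout and uses safetensors.numpy.load_file rather than paddleformers' optional
# fast_load_file fast path.
import json
import os

from safetensors.numpy import load_file as safe_load_file


def load_from_safetensors_index(folder: str) -> dict:
    """Merge all shards referenced by a safetensors index into a single state dict."""
    with open(os.path.join(folder, "model.safetensors.index.json")) as f:
        index = json.load(f)
    # weight_map maps parameter name -> shard file; each shard only needs to be read once.
    shard_files = sorted(set(index["weight_map"].values()))
    state_dict = {}
    for shard_file in shard_files:
        state_dict.update(safe_load_file(os.path.join(folder, shard_file)))
    return state_dict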
+ safe_peft_file = os.path.join(folder, _add_variant(SAFE_PEFT_WEIGHTS_INDEX_NAME, variant)) index_present = os.path.isfile(index_file) safe_index_present = os.path.isfile(safe_index_file) @@ -152,14 +147,11 @@ def load_sharded_checkpoint(folder, variant=None, return_numpy=False): load_safe = True load_index = safe_peft_file else: - raise ValueError( - f"Could not find {index_file} or {safe_index_file} or {safe_peft_file}" - ) + raise ValueError(f"Could not find {index_file} or {safe_index_file} or {safe_peft_file}") if load_safe: try: - from paddleformers.utils.safetensors import \ - fast_load_file as safe_load_file + from paddleformers.utils.safetensors import fast_load_file as safe_load_file except ImportError: from safetensors.numpy import load_file as safe_load_file @@ -167,8 +159,7 @@ def load_sharded_checkpoint(folder, variant=None, return_numpy=False): index = json.load(f) shard_files = list(set(index["weight_map"].values())) - loader = (safe_load_file if load_safe else partial( - paddleformers_load, map_location="np" if return_numpy else "cpu")) + loader = safe_load_file if load_safe else partial(paddleformers_load, map_location="np" if return_numpy else "cpu") ret = {} for shard_file in tqdm(shard_files): @@ -183,8 +174,7 @@ def load_sharded_checkpoint(folder, variant=None, return_numpy=False): return ret -def convert_ndarray_dtype(np_array: np.ndarray, - target_dtype: str) -> np.ndarray: +def convert_ndarray_dtype(np_array: np.ndarray, target_dtype: str) -> np.ndarray: """convert ndarray Args: @@ -195,8 +185,11 @@ def convert_ndarray_dtype(np_array: np.ndarray, np.ndarray: converted numpy ndarray instance """ source_dtype = convert_dtype(np_array.dtype) - if source_dtype == "uint16" and target_dtype == "bfloat16" and paddle.is_compiled_with_custom_device( - "iluvatar_gpu"): + if ( + source_dtype == "uint16" + and target_dtype == "bfloat16" + and paddle.is_compiled_with_custom_device("iluvatar_gpu") + ): return np_array.view(dtype=target_dtype) if source_dtype == "uint16" or target_dtype == "bfloat16": if paddle.is_compiled_with_xpu(): @@ -235,11 +228,9 @@ def pad_batch_data(insts, pad_id=0, return_seq_len=False, pad_style="right"): # pad to max input len # max_len = args.max_len if pad_style == "left": - inst_data = np.array([[pad_id] * (max_len - len(inst)) + list(inst) - for inst in insts]) + inst_data = np.array([[pad_id] * (max_len - len(inst)) + list(inst) for inst in insts]) else: - inst_data = np.array( - [list(inst) + [pad_id] * (max_len - len(inst)) for inst in insts]) + inst_data = np.array([list(inst) + [pad_id] * (max_len - len(inst)) for inst in insts]) if return_seq_len: seq_len = np.array([len(inst) for inst in insts]) return inst_data.astype("int64").reshape([-1, max_len]), seq_len @@ -258,8 +249,7 @@ def load_prefix_weights( Args: prefix_path (str): the path of prefix weight """ - past_key_values = paddle.to_tensor( - np.load(f"{prefix_path}/pre_caches.npy")).unsqueeze(2) + past_key_values = paddle.to_tensor(np.load(f"{prefix_path}/pre_caches.npy")).unsqueeze(2) if batch_size > 1: past_key_values = paddle.concat([past_key_values] * batch_size, axis=2) @@ -305,8 +295,7 @@ def w4a8_weight_convert(state_dict): name, w4a8_weight_bites_name_map, ) - state_dict[name] = weight_q.numpy( - ) if weight_q is not None else value + state_dict[name] = weight_q.numpy() if weight_q is not None else value del weight_q w4a8_weight_bites_layers_map = {} w4a8_weight_bites_layers_map["qkv_gemm_bits_map"] = [] @@ -319,13 +308,10 @@ def w4a8_weight_convert(state_dict): elif "out_proj" 
in name_keys: w4a8_weight_bites_layers_map["out_gemm_bits_map"].append(gemm_bits) elif "linear1" in name_keys: - w4a8_weight_bites_layers_map["up_gate_proj_gemm_bits_map"].append( - gemm_bits) + w4a8_weight_bites_layers_map["up_gate_proj_gemm_bits_map"].append(gemm_bits) elif "linear2" in name_keys: - w4a8_weight_bites_layers_map["down_proj_gemm_bits_map"].append( - gemm_bits) - logger.debug( - f"w4a8_weight_bites_layers_map:{w4a8_weight_bites_layers_map}") + w4a8_weight_bites_layers_map["down_proj_gemm_bits_map"].append(gemm_bits) + logger.debug(f"w4a8_weight_bites_layers_map:{w4a8_weight_bites_layers_map}") return state_dict, w4a8_weight_bites_layers_map @@ -415,10 +401,13 @@ def calculate_effective_tokens(training_args, train_dataset, max_seq_len): else: sharding_parallel_degree = 1 - total_batch = (training_args.max_steps * - training_args.per_device_train_batch_size * - training_args.gradient_accumulation_steps * - sharding_parallel_degree * data_parallel_degree) + total_batch = ( + training_args.max_steps + * training_args.per_device_train_batch_size + * training_args.gradient_accumulation_steps + * sharding_parallel_degree + * data_parallel_degree + ) for i, data in enumerate(train_dataset): if i == total_batch: break @@ -464,7 +453,7 @@ def parser_quant_type(quant_type): "fp8": "float8_e4m3fn", "fp16": "float16", "bf16": "bfloat16", - "fp32": "float32" + "fp32": "float32", } cache_type = default_type if "c8" in quant_type: @@ -483,8 +472,7 @@ def parser_quant_type(quant_type): pattern = f"({'|'.join(map(re.escape, ['w', 'a', 'c']))})" splited_type = re.split(pattern, quant_type) splited_type = [tmp_type for tmp_type in splited_type if tmp_type] - assert (len(splited_type) % 2 == 0 and len(splited_type) - <= 6), f"Quant type[{quant_type}] format error." + assert len(splited_type) % 2 == 0 and len(splited_type) <= 6, f"Quant type[{quant_type}] format error." quant_type_list = [] if "w" in splited_type: diff --git a/fastdeploy/model_executor/ops/__init__.py b/fastdeploy/model_executor/ops/__init__.py index 508e8707a..5e30570c9 100644 --- a/fastdeploy/model_executor/ops/__init__.py +++ b/fastdeploy/model_executor/ops/__init__.py @@ -12,11 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """fastdeploy module""" -from . import gpu -from . import cpu -from . import xpu -from . import npu -from . import iluvatar -from . import gcu +from . import cpu, gcu, gpu, iluvatar, npu, xpu __all__ = ["gpu", "cpu", "xpu", "npu", "iluvatar", "gcu"] diff --git a/fastdeploy/model_executor/ops/cpu/__init__.py b/fastdeploy/model_executor/ops/cpu/__init__.py index 8a2e14546..ae2318f5a 100644 --- a/fastdeploy/model_executor/ops/cpu/__init__.py +++ b/fastdeploy/model_executor/ops/cpu/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" fastdeploy cpu ops """ +"""fastdeploy cpu ops""" from fastdeploy.import_ops import import_custom_ops, rename_imported_op diff --git a/fastdeploy/model_executor/ops/gcu/__init__.py b/fastdeploy/model_executor/ops/gcu/__init__.py index 04dab4c85..7403d7599 100644 --- a/fastdeploy/model_executor/ops/gcu/__init__.py +++ b/fastdeploy/model_executor/ops/gcu/__init__.py @@ -12,21 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-""" fastdeploy gcu ops """ -from fastdeploy.platforms import current_platform - +"""fastdeploy gcu ops""" from fastdeploy.import_ops import import_custom_ops, rename_imported_op +from fastdeploy.platforms import current_platform PACKAGE = "fastdeploy.model_executor.ops.gcu" import_custom_ops(PACKAGE, ".fastdeploy_ops", globals()) if current_platform.is_gcu(): - from paddle_custom_device.gcu.ops import (invoke_fused_moe_kernel, # noqa: F401,E402 - moe_align_block_size, top_p_sampling, # noqa: F401 - topk_softmax, # noqa: F401 - weight_quantize_custom_rtn, # noqa: F401 - weight_quantize_rtn) # noqa: F401 + from paddle_custom_device.gcu.ops import ( # noqa: F401 + invoke_fused_moe_kernel, + moe_align_block_size, + top_p_sampling, + topk_softmax, + weight_quantize_custom_rtn, + weight_quantize_rtn, + ) # ###################### Ops from PaddleCustomDevice #################### rename_imported_op( diff --git a/fastdeploy/model_executor/ops/iluvatar/__init__.py b/fastdeploy/model_executor/ops/iluvatar/__init__.py index 7c1eeb6f2..83b42f661 100644 --- a/fastdeploy/model_executor/ops/iluvatar/__init__.py +++ b/fastdeploy/model_executor/ops/iluvatar/__init__.py @@ -20,5 +20,5 @@ PACKAGE = "fastdeploy.model_executor.ops.iluvatar" import_custom_ops(PACKAGE, "..base.fastdeploy_base_ops", globals()) import_custom_ops(PACKAGE, ".fastdeploy_ops", globals()) -from .moe_ops import iluvatar_moe_expert_ffn as moe_expert_ffn # noqa: E402, F401 -from .paged_attention import paged_attention # noqa: E402, F401 +from .moe_ops import iluvatar_moe_expert_ffn as moe_expert_ffn # noqa: F401 +from .paged_attention import paged_attention # noqa: F401 diff --git a/fastdeploy/model_executor/ops/iluvatar/moe_ops.py b/fastdeploy/model_executor/ops/iluvatar/moe_ops.py index ad77f8b69..5266b08ee 100644 --- a/fastdeploy/model_executor/ops/iluvatar/moe_ops.py +++ b/fastdeploy/model_executor/ops/iluvatar/moe_ops.py @@ -28,8 +28,13 @@ def group_gemm( scale: paddle.Tensor, output: paddle.Tensor, ): - assert (input.dim() == 2 and tokens_expert_prefix_sum.dim() == 1 - and weight.dim() == 3 and scale.dim() == 2 and output.dim() == 2) + assert ( + input.dim() == 2 + and tokens_expert_prefix_sum.dim() == 1 + and weight.dim() == 3 + and scale.dim() == 2 + and output.dim() == 2 + ) num_tokens = input.shape[0] dim_in = input.shape[1] dim_out = weight.shape[1] @@ -66,7 +71,8 @@ def group_gemm( weight_i, weight_scale=scale_i, weight_dtype="int8", - group_size=-1) + group_size=-1, + ) def iluvatar_moe_expert_ffn( @@ -90,13 +96,24 @@ def iluvatar_moe_expert_ffn( assert quant_method in ("weight_only_int8") assert not used_in_ep_low_latency tokens_expert_prefix_sum_cpu = tokens_expert_prefix_sum.to("cpu") - up_gate_proj_output = paddle.empty([permute_input.shape[0], up_gate_proj_weight.shape[1]], - dtype=permute_input.dtype) - group_gemm(permute_input, tokens_expert_prefix_sum_cpu, up_gate_proj_weight, - up_gate_proj_scale, up_gate_proj_output) + up_gate_proj_output = paddle.empty( + [permute_input.shape[0], up_gate_proj_weight.shape[1]], + dtype=permute_input.dtype, + ) + group_gemm( + permute_input, + tokens_expert_prefix_sum_cpu, + up_gate_proj_weight, + up_gate_proj_scale, + up_gate_proj_output, + ) act_out = swiglu(up_gate_proj_output) - output = paddle.empty([act_out.shape[0], down_proj_weight.shape[1]], - dtype=act_out.dtype) - group_gemm(act_out, tokens_expert_prefix_sum_cpu, down_proj_weight, down_proj_scale, - output) + output = paddle.empty([act_out.shape[0], down_proj_weight.shape[1]], dtype=act_out.dtype) + group_gemm( + act_out, + 
tokens_expert_prefix_sum_cpu, + down_proj_weight, + down_proj_scale, + output, + ) return output diff --git a/fastdeploy/model_executor/ops/iluvatar/paged_attention.py b/fastdeploy/model_executor/ops/iluvatar/paged_attention.py index f52bfe672..63819a868 100644 --- a/fastdeploy/model_executor/ops/iluvatar/paged_attention.py +++ b/fastdeploy/model_executor/ops/iluvatar/paged_attention.py @@ -15,32 +15,51 @@ """ import paddle + try: from fastdeploy.model_executor.ops.iluvatar import paged_attn except ImportError: paged_attn = None -def paged_attention(q: paddle.Tensor, - k_cache: paddle.Tensor, - v_cache: paddle.Tensor, - block_tables: paddle.Tensor, - seq_lens: paddle.Tensor, - num_kv_heads: int, - scale: float, - block_size: int, - max_context_len: int, - alibi_slopes: paddle.Tensor = None, - causal: bool = True, - window_left: int = -1, - window_right: int = -1, - softcap: float = 0.0, - use_cuda_graph: bool = False, - use_sqrt_alibi: bool = False, - k: paddle.Tensor = None, - v: paddle.Tensor = None): - output = paged_attn(q, k_cache, v_cache, block_tables, seq_lens, - alibi_slopes, k, v, num_kv_heads, scale, block_size, - max_context_len, causal, window_left, window_right, - softcap, use_cuda_graph, use_sqrt_alibi) +def paged_attention( + q: paddle.Tensor, + k_cache: paddle.Tensor, + v_cache: paddle.Tensor, + block_tables: paddle.Tensor, + seq_lens: paddle.Tensor, + num_kv_heads: int, + scale: float, + block_size: int, + max_context_len: int, + alibi_slopes: paddle.Tensor = None, + causal: bool = True, + window_left: int = -1, + window_right: int = -1, + softcap: float = 0.0, + use_cuda_graph: bool = False, + use_sqrt_alibi: bool = False, + k: paddle.Tensor = None, + v: paddle.Tensor = None, +): + output = paged_attn( + q, + k_cache, + v_cache, + block_tables, + seq_lens, + alibi_slopes, + k, + v, + num_kv_heads, + scale, + block_size, + max_context_len, + causal, + window_left, + window_right, + softcap, + use_cuda_graph, + use_sqrt_alibi, + ) return output[0] if isinstance(output, list) else output diff --git a/fastdeploy/model_executor/ops/triton_ops/__init__.py b/fastdeploy/model_executor/ops/triton_ops/__init__.py index 3a7fcd391..3b4888457 100644 --- a/fastdeploy/model_executor/ops/triton_ops/__init__.py +++ b/fastdeploy/model_executor/ops/triton_ops/__init__.py @@ -17,6 +17,7 @@ try: from .wint2_fused_moe import fused_moe_wint2_triton from .wint2_fused_moe_kernel import moe_wint2_ffn_kernel + __all__ = ["fused_moe_wint2_triton", "moe_wint2_ffn_kernel"] except: pass diff --git a/fastdeploy/model_executor/ops/triton_ops/triton_utils.py b/fastdeploy/model_executor/ops/triton_ops/triton_utils.py index 9cdcaa302..206631520 100644 --- a/fastdeploy/model_executor/ops/triton_ops/triton_utils.py +++ b/fastdeploy/model_executor/ops/triton_ops/triton_utils.py @@ -81,8 +81,7 @@ def multi_process_do(commands): i += THREADS for i in range(THREADS): - p = multiprocessing.Process(target=one_process_work, - args=(commands, i)) + p = multiprocessing.Process(target=one_process_work, args=(commands, i)) process.append(p) for p in process: p.start() @@ -118,7 +117,7 @@ def extract_triton_kernel(kernel, file_name): # assert len(re.findall("@haha()", py_script)) == 1 # py_script = py_script.replace("@haha()", "@triton.jit") - py_script = py_script[py_script.find("def "):] + py_script = py_script[py_script.find("def ") :] py_script = "import triton\nimport triton.language as tl\n\n\n@triton.jit\n" + py_script py_script = py_script.replace("if bias_ptr is not None", "if bias_ptr") @@ -245,8 +244,7 @@ def 
build_package(generated_dir, python_package_name): setup_file_path = generated_dir + "/setup_cuda.py" python_path = sys.executable with open(setup_file_path, "w") as f: - f.write( - template_install.format(python_package_name=python_package_name)) + f.write(template_install.format(python_package_name=python_package_name)) f.close() install_command = f"cd {generated_dir} && {python_path} setup_cuda.py build" re = os.system(install_command) @@ -412,12 +410,15 @@ tune_and_invoke_part = """ } """ -common_template = (""" +common_template = ( + """ std::vector ${op_name}_func(${input_and_attr}) { ${prepare_attr_for_triton_kernel} ${prepare_ptr_for_triton_kernel} auto run_stream = ${arbitary_output_name}.stream(); - """ + tune_and_invoke_part + """ + """ + + tune_and_invoke_part + + """ return {${return_tensor_names}}; } @@ -430,7 +431,8 @@ PD_BUILD_OP(${op_name}) .SetKernelFn(PD_KERNEL(${op_name}_func)) .SetInferDtypeFn(PD_INFER_DTYPE(${op_name}_InferDtype)) .SetInferShapeFn(PD_INFER_SHAPE(${op_name}_InferShape)); -""") +""" +) def rendering_common_template( @@ -500,11 +502,11 @@ def rendering_common_template( "std::vector> ${op_name}_InferShape(" "const std::vector& A_shape) {" "return {${tmp}};" - "}\n ") + "}\n " + ) tmp = ",".join(["A_shape"] * len(return_tensor_names.split(","))) tmp_dict = {"tmp": tmp} - d2s_infer_shape_part = SubstituteTemplate(d2s_infer_shape_part, - tmp_dict) + d2s_infer_shape_part = SubstituteTemplate(d2s_infer_shape_part, tmp_dict) d2s_infer_code += d2s_infer_shape_part @@ -513,11 +515,11 @@ def rendering_common_template( "std::vector ${op_name}_InferDtype(" "const paddle::DataType& A_dtype) {" "return {${tmp}};" - "}\n ") + "}\n " + ) tmp = ",".join(["A_dtype"] * len(return_tensor_names.split(","))) tmp_dict = {"tmp": tmp} - d2s_infer_dtype_part = SubstituteTemplate(d2s_infer_dtype_part, - tmp_dict) + d2s_infer_dtype_part = SubstituteTemplate(d2s_infer_dtype_part, tmp_dict) d2s_infer_code += d2s_infer_dtype_part @@ -568,13 +570,13 @@ class KernelInterface: self.annotations = dict(func.__annotations__) self.constexprs = [ - self.arg_names.index(name) for name in self.arg_names + self.arg_names.index(name) + for name in self.arg_names if self.annotations.get(name) == triton.language.core.constexpr ] self.arg_exclude_constexpr = [ - self.arg_names[i] for i in range(len(self.arg_names)) - if i not in self.constexprs + self.arg_names[i] for i in range(len(self.arg_names)) if i not in self.constexprs ] import textwrap @@ -587,7 +589,7 @@ class KernelInterface: func_begin = re.findall(pat, py_script) assert len(func_begin) == 1 func_begin = func_begin[0] - py_script = py_script[py_script.find(func_begin):] + py_script = py_script[py_script.find(func_begin) :] def decorator(*args, **kwargs): """ @@ -626,11 +628,13 @@ class KernelInterface: const_hint_dict = {} for i in range(len(all_input)): ele = all_input[i] - if (type(ele) == paddle.Tensor - or type(ele) == paddle.base.framework.EagerParamBase - or type(ele) == paddle.base.framework.Parameter - or type(ele) == paddle.base.framework.Variable - or type(ele) == paddle.base.libpaddle.pir.Value): + if ( + type(ele) == paddle.Tensor + or type(ele) == paddle.base.framework.EagerParamBase + or type(ele) == paddle.base.framework.Parameter + or type(ele) == paddle.base.framework.Variable + or type(ele) == paddle.base.libpaddle.pir.Value + ): dtypes.append(ele.dtype) modified_arg_exclude_constexpr[i] = f"input_ptrs[{i}]" elif i in self.constexprs: @@ -646,9 +650,10 @@ class KernelInterface: if generated_dir is None: generated_dir = 
f"/tmp/triton_cache/rank{tp_rank}" print("the kernel cache dir is:", generated_dir) - assert (generated_dir is not None), ( + assert generated_dir is not None, ( "TRITON_KERNEL_CACHE_DIR is None, please set it such as " - "export TRITON_KERNEL_CACHE_DIR=/tmp/triton_cache ") + "export TRITON_KERNEL_CACHE_DIR=/tmp/triton_cache " + ) generated_dir = f"{generated_dir}/{op_name}" os.makedirs(generated_dir, exist_ok=True) @@ -676,13 +681,11 @@ class KernelInterface: lanuch_grid = ",".join(lanuch_grid) op_dict = {"op_name": op_name, "reset_zero_when_tune": ""} - op_dict["triton_kernel_args"] = ",".join( - modified_arg_exclude_constexpr) + op_dict["triton_kernel_args"] = ",".join(modified_arg_exclude_constexpr) op_dict["key"] = ",".join(self.key_args) # when tunning, we need to reset the out to zero. if "reset_zero_when_tune" in other_config.keys(): - op_dict["reset_zero_when_tune"] = other_config[ - "reset_zero_when_tune"] + op_dict["reset_zero_when_tune"] = other_config["reset_zero_when_tune"] paddle_custom_op_file_path = f"{generated_dir}/{op_name}.cu" so_path = find_so_path(generated_dir, python_package_name) @@ -694,17 +697,19 @@ class KernelInterface: SubstituteTemplate( self.custom_op_template, op_dict, - )) + ) + ) f.close() # ahead of time compile command. aot_template = ( - f"""{python_path} {compile_file} {py_script_file} """ + - f""" -n {func.__name__} -o {generated_dir}/{op_name}_kernel """ - + f"""--out-name {op_name}_kernel """ + - """ -w {num_warps} -ns {num_stages} """ + - f""" -s"{address_hint} {value_hint} {const_args}" """ + - f""" -g "{lanuch_grid}" """) + f"""{python_path} {compile_file} {py_script_file} """ + + f""" -n {func.__name__} -o {generated_dir}/{op_name}_kernel """ + + f"""--out-name {op_name}_kernel """ + + """ -w {num_warps} -ns {num_stages} """ + + f""" -s"{address_hint} {value_hint} {const_args}" """ + + f""" -g "{lanuch_grid}" """ + ) all_tune_config = list(self.tune_config) if len(all_tune_config) == 0: # when user do not specify config, we use const_hint_dict as config. @@ -727,24 +732,24 @@ class KernelInterface: ) raise ValueError(message) else: - assert key in config.keys( - ), f"you must specify {key} in your config." + assert key in config.keys(), f"you must specify {key} in your config." if "num_warps" not in config.keys(): config["num_warps"] = 4 if "num_stages" not in config.keys(): config["num_stages"] = 4 for key in config: - assert config[ - key] is not None, f"{key} must be specified." - codegen_command = aot_template.format(**config, ) + assert config[key] is not None, f"{key} must be specified." 
+ codegen_command = aot_template.format( + **config, + ) print(codegen_command) codegen_commands.append(codegen_command) multi_process_do(codegen_commands) link_command = ( - f"{python_path} {link_file} " - f"{generated_dir}/*.h -o {generated_dir}/{op_name}_kernel") + f"{python_path} {link_file} " f"{generated_dir}/*.h -o {generated_dir}/{op_name}_kernel" + ) re = os.system(link_command) assert re == 0 @@ -757,8 +762,7 @@ class KernelInterface: so_path = find_so_path(generated_dir, python_package_name) print("== we find so_path: ", so_path) assert so_path is not None - paddle.utils.cpp_extension.load_op_meta_info_and_register_op( - so_path) + paddle.utils.cpp_extension.load_op_meta_info_and_register_op(so_path) self.decorator = decorator diff --git a/fastdeploy/model_executor/ops/triton_ops/triton_utils_v2.py b/fastdeploy/model_executor/ops/triton_ops/triton_utils_v2.py index 6681d752f..b8268ce88 100644 --- a/fastdeploy/model_executor/ops/triton_ops/triton_utils_v2.py +++ b/fastdeploy/model_executor/ops/triton_ops/triton_utils_v2.py @@ -23,10 +23,18 @@ import sys import paddle import triton -from .triton_utils import (SubstituteTemplate, build_package, compile_file, - extract_triton_kernel, find_so_path, - get_pointer_hint, link_file, multi_process_do, - python_path, rename_c_to_cu) +from .triton_utils import ( + SubstituteTemplate, + build_package, + compile_file, + extract_triton_kernel, + find_so_path, + get_pointer_hint, + link_file, + multi_process_do, + python_path, + rename_c_to_cu, +) def get_value_hint(x): @@ -49,7 +57,7 @@ def get_value_hint(x): return hint -common_template = (""" +common_template = """ #include "${op_name}_kernel.h" #include "paddle/extension.h" @@ -66,7 +74,7 @@ PYBIND11_MODULE(${op_name}_package, m) { m.def("${op_name}_func", ${op_name}_func, "get expert token num"); } -""") +""" class KernelInterface: @@ -98,13 +106,13 @@ class KernelInterface: self.annotations = dict(func.__annotations__) self.constexprs = [ - self.arg_names.index(name) for name in self.arg_names + self.arg_names.index(name) + for name in self.arg_names if self.annotations.get(name) == triton.language.core.constexpr ] self.arg_exclude_constexpr = [ - self.arg_names[i] for i in range(len(self.arg_names)) - if i not in self.constexprs + self.arg_names[i] for i in range(len(self.arg_names)) if i not in self.constexprs ] import textwrap @@ -115,7 +123,7 @@ class KernelInterface: func_begin = re.findall(pat, py_script) assert len(func_begin) == 1 func_begin = func_begin[0] - py_script = py_script[py_script.find(func_begin):] + py_script = py_script[py_script.find(func_begin) :] self.func_map = {} @@ -156,23 +164,22 @@ class KernelInterface: ele = all_input[i] if type(ele) in [ - paddle.Tensor, paddle.base.framework.EagerParamBase, - paddle.base.framework.Parameter, - paddle.base.framework.Variable, - paddle.base.libpaddle.pir.Value, - type(None) + paddle.Tensor, + paddle.base.framework.EagerParamBase, + paddle.base.framework.Parameter, + paddle.base.framework.Variable, + paddle.base.libpaddle.pir.Value, + type(None), ]: if ele is not None: dtypes.append(ele.dtype) - passed_arg_exclude_constexpr[ - i] = f"(CUdeviceptr)({passed_arg_exclude_constexpr[i]}->data())" + passed_arg_exclude_constexpr[i] = f"(CUdeviceptr)({passed_arg_exclude_constexpr[i]}->data())" else: dtypes.append(paddle.int8) - passed_arg_exclude_constexpr[ - i] = "(CUdeviceptr)(nullptr)" - decalare_arg_exclude_constexpr[ - i] = "const paddle::optional&" + decalare_arg_exclude_constexpr[ - i] + passed_arg_exclude_constexpr[i] = 
"(CUdeviceptr)(nullptr)" + decalare_arg_exclude_constexpr[i] = ( + "const paddle::optional&" + decalare_arg_exclude_constexpr[i] + ) elif i in self.constexprs: if isinstance(ele, bool): const_hint_dict[self.arg_names[i]] = (int)(ele) @@ -186,21 +193,16 @@ class KernelInterface: else: x_list.append(ele) if isinstance(ele, int): - decalare_arg_exclude_constexpr[ - i] = "const int64_t " + decalare_arg_exclude_constexpr[ - i] + decalare_arg_exclude_constexpr[i] = "const int64_t " + decalare_arg_exclude_constexpr[i] elif isinstance(ele, float): - decalare_arg_exclude_constexpr[ - i] = "const float " + decalare_arg_exclude_constexpr[ - i] + decalare_arg_exclude_constexpr[i] = "const float " + decalare_arg_exclude_constexpr[i] else: assert False python_package_name = f"{op_name}_package" tp_rank = paddle.distributed.get_rank() - generated_dir = os.getenv("TRITON_KERNEL_CACHE_DIR", - f"/tmp/triton_cache/rank{tp_rank}") + generated_dir = os.getenv("TRITON_KERNEL_CACHE_DIR", f"/tmp/triton_cache/rank{tp_rank}") print("the kernel cache dir is:", generated_dir) generated_dir = f"{generated_dir}/{op_name}" os.makedirs(generated_dir, exist_ok=True) @@ -231,10 +233,8 @@ class KernelInterface: lanuch_grid = ",".join(lanuch_grid) op_dict = {"op_name": op_name} - op_dict["triton_kernel_args"] = ",".join( - passed_arg_exclude_constexpr) - op_dict["tensor_and_attr"] = ",".join( - decalare_arg_exclude_constexpr) + op_dict["triton_kernel_args"] = ",".join(passed_arg_exclude_constexpr) + op_dict["tensor_and_attr"] = ",".join(decalare_arg_exclude_constexpr) paddle_custom_op_file_path = f"{generated_dir}/{op_name}.cu" so_path = find_so_path(generated_dir, python_package_name) @@ -242,20 +242,23 @@ class KernelInterface: if so_path is None: print("== we do not find so_path, we need to compile it") with open(paddle_custom_op_file_path, "w") as f: - f.write(SubstituteTemplate( - common_template, - op_dict, - )) + f.write( + SubstituteTemplate( + common_template, + op_dict, + ) + ) f.close() # ahead of time compile command. aot_template = ( - f"""{python_path} {compile_file} {py_script_file} """ + - f""" -n {func.__name__} -o {generated_dir}/{op_name}_kernel """ - + f"""--out-name {op_name}_kernel """ + - """ -w {num_warps} -ns {num_stages} """ + - f""" -s"{address_hint} {value_hint} {const_args}" """ + - f""" -g "{lanuch_grid}" """) + f"""{python_path} {compile_file} {py_script_file} """ + + f""" -n {func.__name__} -o {generated_dir}/{op_name}_kernel """ + + f"""--out-name {op_name}_kernel """ + + """ -w {num_warps} -ns {num_stages} """ + + f""" -s"{address_hint} {value_hint} {const_args}" """ + + f""" -g "{lanuch_grid}" """ + ) all_tune_config = [const_hint_dict] # reset const_hint_dict as empty. @@ -276,24 +279,24 @@ class KernelInterface: ) raise ValueError(message) else: - assert key in config.keys( - ), f"you must specify {key} in your config." + assert key in config.keys(), f"you must specify {key} in your config." if "num_warps" not in config.keys(): config["num_warps"] = 4 if "num_stages" not in config.keys(): config["num_stages"] = 4 for key in config: - assert config[ - key] is not None, f"{key} must be specified." - codegen_command = aot_template.format(**config, ) + assert config[key] is not None, f"{key} must be specified." 
+ codegen_command = aot_template.format( + **config, + ) print(codegen_command) codegen_commands.append(codegen_command) multi_process_do(codegen_commands) link_command = ( - f"{python_path} {link_file} " - f"{generated_dir}/*.h -o {generated_dir}/{op_name}_kernel") + f"{python_path} {link_file} " f"{generated_dir}/*.h -o {generated_dir}/{op_name}_kernel" + ) re = os.system(link_command) assert re == 0 @@ -325,9 +328,11 @@ class KernelInterface: Returns: the decorator function. """ - self.grid = (( - "((max_possible_num_post_padded + BLOCK_SIZE_M -1)/ BLOCK_SIZE_M) * ((N + BLOCK_SIZE_N-1) / BLOCK_SIZE_N)" - ), ) + self.grid = ( + ( + "((max_possible_num_post_padded + BLOCK_SIZE_M -1)/ BLOCK_SIZE_M) * ((N + BLOCK_SIZE_N-1) / BLOCK_SIZE_N)" + ), + ) return self.decorator diff --git a/fastdeploy/model_executor/ops/triton_ops/wint2_fused_moe.py b/fastdeploy/model_executor/ops/triton_ops/wint2_fused_moe.py index fe279a8e5..e69c34a21 100644 --- a/fastdeploy/model_executor/ops/triton_ops/wint2_fused_moe.py +++ b/fastdeploy/model_executor/ops/triton_ops/wint2_fused_moe.py @@ -21,7 +21,10 @@ from paddle.base.framework import OpProtoHolder from paddle.framework import in_dynamic_or_pir_mode from fastdeploy.model_executor.ops.triton_ops.triton_utils import ( - get_dtype_str, paddle_use_triton, rendering_common_template) + get_dtype_str, + paddle_use_triton, + rendering_common_template, +) BLOCK_SIZE_M = 16 @@ -51,8 +54,11 @@ def invoke_fused_moe_kernel( sstride_am, sstride_ak = A.shape[1], 1 sstride_be, sstride_bk, sstride_bn = B.shape[1] * B.shape[2], B.shape[2], 1 sstride_cm, sstride_cn = C.shape[-1], 1 - sstride_bse, sstride_bsk, sstride_bsn = B_scale.shape[1] * B_scale.shape[ - 2], B_scale.shape[2], 1 + sstride_bse, sstride_bsk, sstride_bsn = ( + B_scale.shape[1] * B_scale.shape[2], + B_scale.shape[2], + 1, + ) sstride_bce, sstride_bck, sstride_bcn = B_code_scale.shape[1], 1, 1 ddouble_quant = B_super_scale is not None @@ -124,9 +130,7 @@ def invoke_fused_moe_kernel( prepare_attr_for_triton_kernel, prepare_ptr_for_triton_kernel, ) - grid = ( - "(EM+BLOCK_SIZE_M-1)/BLOCK_SIZE_M * ((N+BLOCK_SIZE_N-1)/BLOCK_SIZE_N)", - ) + grid = ("(EM+BLOCK_SIZE_M-1)/BLOCK_SIZE_M * ((N+BLOCK_SIZE_N-1)/BLOCK_SIZE_N)",) moe_wint2_ffn_kernel[(op_name, template_used, grid, configs)]( A, @@ -142,8 +146,8 @@ def invoke_fused_moe_kernel( num_tokens_post_padded, NN, KK, - -1, #EEM, - -1, #nnum_valid_tokens, + -1, # EEM, + -1, # nnum_valid_tokens, sstride_am, sstride_ak, sstride_be, @@ -185,7 +189,9 @@ def invoke_fused_moe_kernel( return outs[0] -@paddle_use_triton(key=["1"], ) +@paddle_use_triton( + key=["1"], +) def moe_wint2_ffn_kernel( # Pointers to matrices a_ptr, @@ -291,17 +297,14 @@ def moe_wint2_ffn_kernel( # offs_k = tl.arange(0, BLOCK_SIZE_K) offs_bk = tl.arange(0, real_k_size) - a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am + - offs_bk[None, :] * pack_num * stride_ak) + a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am + offs_bk[None, :] * pack_num * stride_ak) off_experts = tl.load(expert_ids_ptr + pid_m) - b_ptrs = b_ptr + off_experts * stride_be + (offs_bk[:, None] * stride_bk + - offs_bn[None, :] * stride_bn) + b_ptrs = b_ptr + off_experts * stride_be + (offs_bk[:, None] * stride_bk + offs_bn[None, :] * stride_bn) accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - bs_ptrs = bs_ptr + off_experts * stride_bse + offs_bn[ - None, :] * stride_bsn # group-wise, need advanced + bs_ptrs = bs_ptr + off_experts * stride_bse + offs_bn[None, :] * stride_bsn # group-wise, need 
advanced off_set = off_experts * stride_bce + offs_bn[None, :] * stride_bcn # load channel-wise scale & zero-point @@ -324,8 +327,7 @@ def moe_wint2_ffn_kernel( bs = ((bs >> s_shift_bits) & 0xF) * super_bs # reverse to int16 - b = tl.floor((b.to(tl.float32) * code_bs + code_bzp) + 0.5).to( - tl.int16) + b = tl.floor((b.to(tl.float32) * code_bs + code_bzp) + 0.5).to(tl.int16) # dequant b1 = (((b >> 9) & w_mask) - bzp) * bs a = tl.load( @@ -369,17 +371,14 @@ def moe_wint2_ffn_kernel( bs_ptrs += stride_bsk if MUL_ROUTED_WEIGHT: - moe_weight = tl.load(topk_weights_ptr + offs_token, - mask=token_mask, - other=0) + moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0) accumulator = accumulator * moe_weight[:, None] accumulator = accumulator.to(compute_type) # ----------------------------------------------------------- # Write back the block of the output offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) - c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[ - None, :] + c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :] c_mask = token_mask[:, None] & (offs_cn[None, :] < N) tl.store(c_ptrs, accumulator, mask=c_mask) @@ -412,10 +411,8 @@ def fused_moe_wint2_impl( # f"Hidden size mismatch, {hidden_states.shape[1]} != {up_gate_proj_quant_weight.shape[1]}" assert topk_weights.shape == topk_ids.shape, "topk shape mismatch" assert hidden_states.is_contiguous(), "Hidden_states must be contiguous" - assert up_gate_proj_quant_weight.is_contiguous( - ), "Expert weights1 must be contiguous" - assert down_proj_quant_weight.is_contiguous( - ), "Expert weights2 must be contiguous" + assert up_gate_proj_quant_weight.is_contiguous(), "Expert weights1 must be contiguous" + assert down_proj_quant_weight.is_contiguous(), "Expert weights2 must be contiguous" assert group_size > 0, "Group size must be greater than 0" num_tokens, K = hidden_states.shape @@ -442,9 +439,7 @@ def fused_moe_wint2_impl( from fastdeploy.model_executor.ops.gpu import tritonmoe_preprocess - sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess( - topk_ids, E, BLOCK_SIZE_M) - + sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess(topk_ids, E, BLOCK_SIZE_M) invoke_fused_moe_kernel( A=hidden_states, @@ -464,8 +459,7 @@ def fused_moe_wint2_impl( group_size=group_size, ) - intermediate_cache2 = paddle.incubate.nn.functional.swiglu( - intermediate_cache1.reshape([-1, N])) + intermediate_cache2 = paddle.incubate.nn.functional.swiglu(intermediate_cache1.reshape([-1, N])) invoke_fused_moe_kernel( A=intermediate_cache2, diff --git a/fastdeploy/model_executor/ops/triton_ops/wint2_fused_moe_kernel.py b/fastdeploy/model_executor/ops/triton_ops/wint2_fused_moe_kernel.py index 8540f61b9..5852448ac 100644 --- a/fastdeploy/model_executor/ops/triton_ops/wint2_fused_moe_kernel.py +++ b/fastdeploy/model_executor/ops/triton_ops/wint2_fused_moe_kernel.py @@ -16,8 +16,9 @@ import triton.language as tl -from fastdeploy.model_executor.ops.triton_ops.triton_utils_v2 import \ - paddle_use_triton_v2 +from fastdeploy.model_executor.ops.triton_ops.triton_utils_v2 import ( + paddle_use_triton_v2, +) @paddle_use_triton_v2() @@ -124,17 +125,14 @@ def moe_wint2_ffn_kernel( # offs_k = tl.arange(0, BLOCK_SIZE_K) offs_bk = tl.arange(0, real_k_size) - a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am + - offs_bk[None, :] * pack_num * stride_ak) + a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am + offs_bk[None, :] * pack_num * stride_ak) 
off_experts = tl.load(expert_ids_ptr + pid_m) - b_ptrs = b_ptr + off_experts * stride_be + (offs_bk[:, None] * stride_bk + - offs_bn[None, :] * stride_bn) + b_ptrs = b_ptr + off_experts * stride_be + (offs_bk[:, None] * stride_bk + offs_bn[None, :] * stride_bn) accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - bs_ptrs = bs_ptr + off_experts * stride_bse + offs_bn[ - None, :] * stride_bsn # group-wise, need advanced + bs_ptrs = bs_ptr + off_experts * stride_bse + offs_bn[None, :] * stride_bsn # group-wise, need advanced off_set = off_experts * stride_bce + offs_bn[None, :] * stride_bcn # load channel-wise scale & zero-point @@ -157,8 +155,7 @@ def moe_wint2_ffn_kernel( bs = ((bs >> s_shift_bits) & 0xF) * super_bs # reverse to int16 - b = tl.floor((b.to(tl.float32) * code_bs + code_bzp) + 0.5).to( - tl.int16) + b = tl.floor((b.to(tl.float32) * code_bs + code_bzp) + 0.5).to(tl.int16) # dequant b1 = (((b >> 9) & w_mask) - bzp) * bs a = tl.load( @@ -202,16 +199,13 @@ def moe_wint2_ffn_kernel( bs_ptrs += stride_bsk if MUL_ROUTED_WEIGHT: - moe_weight = tl.load(topk_weights_ptr + offs_token, - mask=token_mask, - other=0) + moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0) accumulator = accumulator * moe_weight[:, None] accumulator = accumulator.to(compute_type) # ----------------------------------------------------------- # Write back the block of the output offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) - c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[ - None, :] + c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :] c_mask = token_mask[:, None] & (offs_cn[None, :] < N) tl.store(c_ptrs, accumulator, mask=c_mask) diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index a92d946ee..1e3f709cd 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" + from typing import Dict, Optional import paddle @@ -23,32 +24,51 @@ from fastdeploy.platforms import current_platform if current_platform.is_iluvatar(): from fastdeploy.model_executor.ops.iluvatar import ( - get_padding_offset, save_output, set_stop_value_multi_ends, - step_paddle, update_inputs) + get_padding_offset, + save_output, + set_stop_value_multi_ends, + step_paddle, + update_inputs, + ) elif current_platform.is_gcu(): - from fastdeploy.model_executor.ops.gcu import (get_padding_offset, - save_output, - set_stop_value_multi_ends, - update_inputs) + from fastdeploy.model_executor.ops.gcu import ( + get_padding_offset, + save_output, + set_stop_value_multi_ends, + update_inputs, + ) elif current_platform.is_dcu(): - from fastdeploy.model_executor.ops.gpu import (get_padding_offset, - save_output, - set_stop_value_multi_ends, - step_paddle, update_inputs) + from fastdeploy.model_executor.ops.gpu import ( + get_padding_offset, + save_output, + set_stop_value_multi_ends, + step_paddle, + update_inputs, + ) else: from fastdeploy.model_executor.ops.gpu import ( - get_padding_offset, save_output, save_output_topk, set_stop_value_multi_ends, - speculate_clear_accept_nums, speculate_get_output_padding_offset, - speculate_get_padding_offset, speculate_get_seq_lens_output, - speculate_save_output, speculate_set_value_by_flags_and_idx, - speculate_step_paddle, speculate_step_system_cache, - speculate_update_v3, step_paddle, step_system_cache, update_inputs, - step_reschedule) + get_padding_offset, + save_output, + save_output_topk, + set_stop_value_multi_ends, + speculate_clear_accept_nums, + speculate_get_output_padding_offset, + speculate_get_padding_offset, + speculate_get_seq_lens_output, + speculate_save_output, + speculate_set_value_by_flags_and_idx, + speculate_step_paddle, + speculate_step_system_cache, + speculate_update_v3, + step_paddle, + step_reschedule, + step_system_cache, + update_inputs, + ) -from fastdeploy.worker.output import (ModelOutputData, ModelRunnerOutput, - SamplerOutput) +from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput, SamplerOutput -DISABLE_RECOVER = (envs.FD_DISABLED_RECOVER == "1") +DISABLE_RECOVER = envs.FD_DISABLED_RECOVER == "1" def pre_process( @@ -118,47 +138,62 @@ def pre_process( batch_id_per_token, cu_seqlens_q, cu_seqlens_k, - ) = get_padding_offset(input_ids, cum_offsets_now, token_num, - seq_lens_this_time) - return (ids_remove_padding, cum_offsets, batch_id_per_token, cu_seqlens_q, - cu_seqlens_k, output_cum_offsets, output_padding_offset) + ) = get_padding_offset(input_ids, cum_offsets_now, token_num, seq_lens_this_time) + return ( + ids_remove_padding, + cum_offsets, + batch_id_per_token, + cu_seqlens_q, + cu_seqlens_k, + output_cum_offsets, + output_padding_offset, + ) -def post_process_normal(sampler_output: SamplerOutput, - model_output: ModelOutputData, - save_each_rank: bool = False, - skip_save_output: bool = False) -> ModelRunnerOutput: - """ Post-processing steps after completing a single token generation. 
""" +def post_process_normal( + sampler_output: SamplerOutput, + model_output: ModelOutputData, + save_each_rank: bool = False, + skip_save_output: bool = False, +) -> ModelRunnerOutput: + """Post-processing steps after completing a single token generation.""" # handle vl: if model_output.enable_thinking: exists_think_end = sampler_output.sampled_token_ids == model_output.think_end_id paddle.assign( - paddle.where( - exists_think_end, - model_output.need_think_end - 1, - model_output.need_think_end, - ), model_output.need_think_end) + paddle.where( + exists_think_end, + model_output.need_think_end - 1, + model_output.need_think_end, + ), + model_output.need_think_end, + ) paddle.assign( paddle.where( model_output.need_think_end.cast("bool"), model_output.reasoning_index - 1, model_output.reasoning_index, - ), model_output.reasoning_index) + ), + model_output.reasoning_index, + ) stop_wo_think = ( - (sampler_output.sampled_token_ids == model_output.eos_token_id) | - (model_output.reasoning_index == 0)) & ( - model_output.need_think_end > 0) - sampler_output.sampled_token_ids = paddle.where(stop_wo_think, - model_output.think_end_id, - sampler_output.sampled_token_ids) + (sampler_output.sampled_token_ids == model_output.eos_token_id) | (model_output.reasoning_index == 0) + ) & (model_output.need_think_end > 0) + sampler_output.sampled_token_ids = paddle.where( + stop_wo_think, + model_output.think_end_id, + sampler_output.sampled_token_ids, + ) paddle.assign( paddle.where( stop_wo_think, model_output.need_think_end - 1, model_output.need_think_end, - ), model_output.need_think_end) + ), + model_output.need_think_end, + ) # 1. Set stop value paddle.assign( paddle.where( @@ -168,17 +203,20 @@ def post_process_normal(sampler_output: SamplerOutput, ), model_output.step_idx, ) - length_cond = paddle.greater_equal(model_output.step_idx, - model_output.max_dec_len) + length_cond = paddle.greater_equal(model_output.step_idx, model_output.max_dec_len) paddle.assign( paddle.logical_or(model_output.stop_flags, length_cond), model_output.stop_flags, ) # TODO(gongshaotian): Add use_stop_seqs - set_stop_value_multi_ends(sampler_output.sampled_token_ids, model_output.stop_flags, - model_output.seq_lens_this_time, - model_output.eos_token_id, - model_output.next_tokens, False) # multi ends + set_stop_value_multi_ends( + sampler_output.sampled_token_ids, + model_output.stop_flags, + model_output.seq_lens_this_time, + model_output.eos_token_id, + model_output.next_tokens, + False, + ) # multi ends # 2. Update the input buffer of the model with paddle.framework._no_check_dy2st_diff(): @@ -239,8 +277,7 @@ def post_process_specualate(model_output, save_each_rank: bool = False, skip_sav save_each_rank, ) - speculate_clear_accept_nums(model_output.accept_num, - model_output.seq_lens_decoder) + speculate_clear_accept_nums(model_output.accept_num, model_output.seq_lens_decoder) # Update pre_ids through accept tokens @@ -256,17 +293,18 @@ def post_process_specualate(model_output, save_each_rank: bool = False, skip_sav ) -def post_process(sampler_output: SamplerOutput, - model_output: ModelOutputData, - save_each_rank: bool = False, - speculative_decoding: bool = False, - skip_save_output: bool = False) -> None: - """ Post-processing steps after completing a single token generation. 
""" +def post_process( + sampler_output: SamplerOutput, + model_output: ModelOutputData, + save_each_rank: bool = False, + speculative_decoding: bool = False, + skip_save_output: bool = False, +) -> None: + """Post-processing steps after completing a single token generation.""" if speculative_decoding: post_process_specualate(model_output, save_each_rank, skip_save_output) else: - post_process_normal(sampler_output, model_output, save_each_rank, - skip_save_output) + post_process_normal(sampler_output, model_output, save_each_rank, skip_save_output) def step_cuda( @@ -280,33 +318,32 @@ def step_cuda( TODO(gongshaotian): normalization name """ - if speculative_config.method is not None: if enable_prefix_caching: speculate_step_system_cache( - share_inputs['stop_flags'], + share_inputs["stop_flags"], share_inputs["seq_lens_this_time"], - share_inputs['step_seq_lens_encoder'], - share_inputs['step_seq_lens_decoder'], - share_inputs['seq_lens_encoder'], - share_inputs['seq_lens_decoder'], + share_inputs["step_seq_lens_encoder"], + share_inputs["step_seq_lens_decoder"], + share_inputs["seq_lens_encoder"], + share_inputs["seq_lens_decoder"], share_inputs["block_tables"], - share_inputs['encoder_block_lens'], + share_inputs["encoder_block_lens"], share_inputs["is_block_step"], - share_inputs['step_block_list'], - share_inputs['step_lens'], - share_inputs['recover_block_list'], - share_inputs['recover_lens'], - share_inputs['need_block_list'], - share_inputs['need_block_len'], - share_inputs['used_list_len'], - share_inputs['free_list'], - share_inputs['free_list_len'], - share_inputs['input_ids'], - share_inputs['pre_ids'], - share_inputs['step_idx'], - share_inputs['next_tokens'], - share_inputs['first_token_ids'], + share_inputs["step_block_list"], + share_inputs["step_lens"], + share_inputs["recover_block_list"], + share_inputs["recover_lens"], + share_inputs["need_block_list"], + share_inputs["need_block_len"], + share_inputs["used_list_len"], + share_inputs["free_list"], + share_inputs["free_list_len"], + share_inputs["input_ids"], + share_inputs["pre_ids"], + share_inputs["step_idx"], + share_inputs["next_tokens"], + share_inputs["first_token_ids"], share_inputs["accept_num"], block_size, enc_dec_block_num, @@ -314,28 +351,28 @@ def step_cuda( ) else: speculate_step_paddle( - share_inputs['stop_flags'], + share_inputs["stop_flags"], share_inputs["seq_lens_this_time"], - share_inputs['step_seq_lens_encoder'], - share_inputs['seq_lens_encoder'], - share_inputs['seq_lens_decoder'], + share_inputs["step_seq_lens_encoder"], + share_inputs["seq_lens_encoder"], + share_inputs["seq_lens_decoder"], share_inputs["block_tables"], - share_inputs['encoder_block_lens'], + share_inputs["encoder_block_lens"], share_inputs["is_block_step"], - share_inputs['step_block_list'], - share_inputs['step_lens'], - share_inputs['recover_block_list'], - share_inputs['recover_lens'], - share_inputs['need_block_list'], - share_inputs['need_block_len'], - share_inputs['used_list_len'], - share_inputs['free_list'], - share_inputs['free_list_len'], - share_inputs['input_ids'], - share_inputs['pre_ids'], - share_inputs['step_idx'], - share_inputs['next_tokens'], - share_inputs['first_token_ids'], + share_inputs["step_block_list"], + share_inputs["step_lens"], + share_inputs["recover_block_list"], + share_inputs["recover_lens"], + share_inputs["need_block_list"], + share_inputs["need_block_len"], + share_inputs["used_list_len"], + share_inputs["free_list"], + share_inputs["free_list_len"], + share_inputs["input_ids"], + 
share_inputs["pre_ids"], + share_inputs["step_idx"], + share_inputs["next_tokens"], + share_inputs["first_token_ids"], share_inputs["accept_num"], block_size, enc_dec_block_num, @@ -344,20 +381,32 @@ def step_cuda( else: if enable_prefix_caching: step_system_cache( - share_inputs["stop_flags"], share_inputs["seq_lens_this_time"], + share_inputs["stop_flags"], + share_inputs["seq_lens_this_time"], share_inputs["step_seq_lens_encoder"], share_inputs["step_seq_lens_decoder"], share_inputs["seq_lens_encoder"], - share_inputs["seq_lens_decoder"], share_inputs["block_tables"], + share_inputs["seq_lens_decoder"], + share_inputs["block_tables"], share_inputs["encoder_block_lens"], - share_inputs["is_block_step"], share_inputs["step_block_list"], - share_inputs["step_lens"], share_inputs["recover_block_list"], - share_inputs["recover_lens"], share_inputs["need_block_list"], - share_inputs["need_block_len"], share_inputs["used_list_len"], - share_inputs["free_list"], share_inputs["free_list_len"], - share_inputs["input_ids"], share_inputs["pre_ids"], - share_inputs["step_idx"], share_inputs["next_tokens"], - share_inputs["first_token_ids"], block_size, enc_dec_block_num) + share_inputs["is_block_step"], + share_inputs["step_block_list"], + share_inputs["step_lens"], + share_inputs["recover_block_list"], + share_inputs["recover_lens"], + share_inputs["need_block_list"], + share_inputs["need_block_len"], + share_inputs["used_list_len"], + share_inputs["free_list"], + share_inputs["free_list_len"], + share_inputs["input_ids"], + share_inputs["pre_ids"], + share_inputs["step_idx"], + share_inputs["next_tokens"], + share_inputs["first_token_ids"], + block_size, + enc_dec_block_num, + ) elif DISABLE_RECOVER: step_reschedule( share_inputs["stop_flags"], @@ -414,19 +463,22 @@ def step_cuda( ) -def rebuild_padding(tmp_out: paddle.Tensor, - cum_offsets: paddle.Tensor, - seq_len_this_time: paddle.Tensor, - seq_lens_decoder: paddle.Tensor, - seq_lens_encoder: paddle.Tensor, - output_padding_offset: Optional[paddle.Tensor] = None, - max_input_length: Optional[int] = None): +def rebuild_padding( + tmp_out: paddle.Tensor, + cum_offsets: paddle.Tensor, + seq_len_this_time: paddle.Tensor, + seq_lens_decoder: paddle.Tensor, + seq_lens_encoder: paddle.Tensor, + output_padding_offset: Optional[paddle.Tensor] = None, + max_input_length: Optional[int] = None, +): """ Args: Returns: """ if current_platform.is_cuda(): from fastdeploy.model_executor.ops.gpu import rebuild_padding + hidden_states = rebuild_padding( tmp_out, cum_offsets, @@ -438,6 +490,7 @@ def rebuild_padding(tmp_out: paddle.Tensor, ) elif current_platform.is_iluvatar(): from fastdeploy.model_executor.ops.iluvatar import rebuild_padding + hidden_states = rebuild_padding( tmp_out, cum_offsets, @@ -449,6 +502,7 @@ def rebuild_padding(tmp_out: paddle.Tensor, ) elif current_platform.is_gcu(): from fastdeploy.model_executor.ops.gcu import rebuild_padding + hidden_states = rebuild_padding( tmp_out, cum_offsets, @@ -460,6 +514,7 @@ def rebuild_padding(tmp_out: paddle.Tensor, ) elif current_platform.is_cpu(): from fastdeploy.model_executor.ops.cpu import rebuild_padding_cpu + hidden_states = rebuild_padding_cpu( tmp_out, cum_offsets, diff --git a/fastdeploy/output/__init__.py b/fastdeploy/output/__init__.py index c40559bc8..f4ede9062 100644 --- a/fastdeploy/output/__init__.py +++ b/fastdeploy/output/__init__.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -""" \ No newline at end of file +""" diff --git a/fastdeploy/output/token_processor.py b/fastdeploy/output/token_processor.py index 3b3b2f7b7..e2953f0af 100644 --- a/fastdeploy/output/token_processor.py +++ b/fastdeploy/output/token_processor.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + import copy import os import threading @@ -24,8 +25,7 @@ from concurrent.futures import ThreadPoolExecutor import numpy as np -from fastdeploy.engine.request import (CompletionOutput, RequestMetrics, - RequestOutput) +from fastdeploy.engine.request import CompletionOutput, RequestMetrics, RequestOutput from fastdeploy.inter_communicator import IPCSignal from fastdeploy.metrics.metrics import main_process_metrics from fastdeploy.platforms import current_platform @@ -39,13 +39,12 @@ MAX_DRAFT_TOKENS = 6 SPECULATE_MAX_BSZ = 256 -class TokenProcessor(object): +class TokenProcessor: """ get Token/Score from Paddle inference engine """ - def __init__(self, cfg, cached_generated_tokens, engine_worker_queue, - split_connector): + def __init__(self, cfg, cached_generated_tokens, engine_worker_queue, split_connector): import paddle paddle.device.set_device("cpu") @@ -59,22 +58,17 @@ class TokenProcessor(object): self.speculative_decoding = self.cfg.speculative_config.method is not None if self.speculative_decoding: - self.output_tokens = paddle.full(shape=[ - SPECULATE_MAX_BSZ * MAX_DRAFT_TOKENS + SPECULATE_MAX_BSZ + 2 - ], - fill_value=2, - dtype="int64") - elif self.cfg.enable_logprob: self.output_tokens = paddle.full( - shape=[MAX_BSZ * (K + 1) + 2, 1], fill_value=2, dtype="int64") - self.output_scores = paddle.full( - shape=[MAX_BSZ * (K + 1), 1], fill_value=0.0, dtype="float32") - self.output_ranks = paddle.full( - shape=[MAX_BSZ], fill_value=0, dtype="int64") + shape=[SPECULATE_MAX_BSZ * MAX_DRAFT_TOKENS + SPECULATE_MAX_BSZ + 2], + fill_value=2, + dtype="int64", + ) + elif self.cfg.enable_logprob: + self.output_tokens = paddle.full(shape=[MAX_BSZ * (K + 1) + 2, 1], fill_value=2, dtype="int64") + self.output_scores = paddle.full(shape=[MAX_BSZ * (K + 1), 1], fill_value=0.0, dtype="float32") + self.output_ranks = paddle.full(shape=[MAX_BSZ], fill_value=0, dtype="int64") else: - self.output_tokens = paddle.full(shape=[MAX_BSZ + 2, 1], - fill_value=2, - dtype="int64") + self.output_tokens = paddle.full(shape=[MAX_BSZ + 2, 1], fill_value=2, dtype="int64") self.worker = None self.statics_start_time = time.time() @@ -94,21 +88,23 @@ class TokenProcessor(object): 0, ] * MAX_DRAFT_TOKENS prefill_time_data = np.zeros([100], dtype=np.float32) - self.prefill_time_signal = IPCSignal(name="prefill_time_signal", - array=prefill_time_data, - dtype=np.float32, - suffix=os.getpid(), - create=True) + self.prefill_time_signal = IPCSignal( + name="prefill_time_signal", + array=prefill_time_data, + dtype=np.float32, + suffix=os.getpid(), + create=True, + ) self.executor = ThreadPoolExecutor(max_workers=1) self.prefill_result_status = dict() self._finalizer = weakref.finalize(self, self._cleanup_resources) def _cleanup_resources(self): """Cleaning up shared memory resources""" - if hasattr(self, 'prefill_time_signal'): + if hasattr(self, "prefill_time_signal"): self.prefill_time_signal.clear() - if hasattr(self, 'executor'): + if hasattr(self, "executor"): self.executor.shutdown(wait=False) def set_resource_manager(self, resource_manager): @@ 
-129,16 +125,12 @@ class TokenProcessor(object): if self.worker is not None: raise Exception("Worker is already running!") use_logprobs = ( - self.cfg.enable_logprob - and not self.speculative_decoding - and not self.cfg.parallel_config.enable_expert_parallel + self.cfg.enable_logprob + and not self.speculative_decoding + and not self.cfg.parallel_config.enable_expert_parallel ) - target_func = ( - self.process_sampling_with_logprob_results - if use_logprobs else - self.process_sampling_results - ) + target_func = self.process_sampling_with_logprob_results if use_logprobs else self.process_sampling_results self.worker = threading.Thread(target=target_func) @@ -159,7 +151,14 @@ class TokenProcessor(object): while True: try: is_blocking = True - get_output_topk(self.output_tokens, self.output_scores, self.output_ranks, K, rank_id, is_blocking) + get_output_topk( + self.output_tokens, + self.output_scores, + self.output_ranks, + K, + rank_id, + is_blocking, + ) if self.output_tokens[0, 0] == -2: continue @@ -170,8 +169,7 @@ class TokenProcessor(object): self._process_prefill_metrics() self._process_sampling_with_logprob_batch_output() except Exception as e: - llm_logger.info("while get input_data error: {0} {1}".format( - e, str(traceback.format_exc()))) + llm_logger.info(f"while get input_data error: {e} {traceback.format_exc()!s}") def process_sampling_results(self): """ @@ -186,21 +184,25 @@ class TokenProcessor(object): from fastdeploy.model_executor.ops.gcu import get_output else: from fastdeploy.model_executor.ops.gpu import ( - get_output, get_output_ep, speculate_get_output) + get_output, + get_output_ep, + speculate_get_output, + ) rank_id = self.cfg.parallel_config.local_data_parallel_id while True: try: is_blocking = True if self.speculative_decoding: - speculate_get_output(self.output_tokens, rank_id, - is_blocking, False) + speculate_get_output(self.output_tokens, rank_id, is_blocking, False) if self.output_tokens[0] == -2: continue else: - if self.cfg.parallel_config.enable_expert_parallel and \ - self.cfg.parallel_config.data_parallel_size > 1: + if ( + self.cfg.parallel_config.enable_expert_parallel + and self.cfg.parallel_config.data_parallel_size > 1 + ): get_output_ep(self.output_tokens, rank_id, is_blocking) else: @@ -208,14 +210,11 @@ class TokenProcessor(object): if self.output_tokens[0, 0] == -2: continue - llm_logger.debug( - f"rank_id {rank_id} self.output_tokens[0, 0] {self.output_tokens[0, 0]}" - ) + llm_logger.debug(f"rank_id {rank_id} self.output_tokens[0, 0] {self.output_tokens[0, 0]}") self._process_prefill_metrics() self._process_batch_output() except Exception as e: - llm_logger.info("while get input_data error: {0} {1}".format( - e, str(traceback.format_exc()))) + llm_logger.info(f"while get input_data error: {e} {traceback.format_exc()!s}") def _process_prefill_metrics(self): """Asynchronous processing prefill time indicators""" @@ -224,11 +223,9 @@ class TokenProcessor(object): try: current_index = 0 while current_index < len(self.prefill_time_signal.value): - prefill_time = self.prefill_time_signal.value[ - current_index] + prefill_time = self.prefill_time_signal.value[current_index] if prefill_time > 0: - main_process_metrics.request_prefill_time.observe( - prefill_time) + main_process_metrics.request_prefill_time.observe(prefill_time) self.prefill_time_signal.value[current_index] = 0 current_index += 1 except Exception as e: @@ -248,12 +245,7 @@ class TokenProcessor(object): except Exception as e: llm_logger.error(f"Error in TokenProcessor's postprocess: 
{e}") - def _recycle_resources(self, - task_id, - index, - task, - result=None, - is_prefill=False): + def _recycle_resources(self, task_id, index, task, result=None, is_prefill=False): """ recycle resources """ @@ -262,13 +254,10 @@ class TokenProcessor(object): finished_task_ids = self.engine_worker_queue.get_finished_req() if len(finished_task_ids) > 0: for finished_task_id in finished_task_ids: - llm_logger.info( - f"finished_task_id: {finished_task_id}") - self.prefill_result_status[ - finished_task_id[0]] = finished_task_id[1] + llm_logger.info(f"finished_task_id: {finished_task_id}") + self.prefill_result_status[finished_task_id[0]] = finished_task_id[1] if task_id in self.prefill_result_status: - self.split_connector.send_first_token( - task.disaggregate_info, [result]) + self.split_connector.send_first_token(task.disaggregate_info, [result]) self.resource_manager.stop_flags[index] = True self.resource_manager.tasks_list[index] = None self.resource_manager._recycle_block_tables(task) @@ -300,8 +289,7 @@ class TokenProcessor(object): single_head_acceptance_rates = [] for head in range(self.cfg.speculative_config.num_speculative_tokens): single_head_acceptance_rates.append( - self.num_accept_requests_per_head[head] - / self.num_rest_requests_per_head[head] + self.num_accept_requests_per_head[head] / self.num_rest_requests_per_head[head] ) spec_logger.info(f" Single head accept ratio: {single_head_acceptance_rates}") @@ -316,10 +304,8 @@ class TokenProcessor(object): """ batch = self.output_tokens[1, 0] - tokens = self.output_tokens[2:batch * (K + 1) + 2].numpy().reshape( - [batch, K + 1])[:, :(K + 1)] - scores = self.output_scores[:batch * (K + 1)].numpy().reshape( - [batch, K + 1])[:, :(K + 1)] + tokens = self.output_tokens[2 : batch * (K + 1) + 2].numpy().reshape([batch, K + 1])[:, : (K + 1)] + scores = self.output_scores[: batch * (K + 1)].numpy().reshape([batch, K + 1])[:, : (K + 1)] ranks = self.output_ranks[:batch].numpy() batch_result = list() for i in range(batch): @@ -331,8 +317,7 @@ class TokenProcessor(object): token_ids = [token_id] recovery_stop = token_id == RECOVERY_STOP_SIGNAL if recovery_stop: - llm_logger.info( - f"recovery stop signal found at task {task_id}") + llm_logger.info(f"recovery stop signal found at task {task_id}") if not recovery_stop and token_id < 0: continue @@ -350,10 +335,9 @@ class TokenProcessor(object): arrival_time=task.arrival_time, inference_start_time=task.inference_start_time, first_token_time=time.time() - task.inference_start_time, - time_in_queue=task.schedule_start_time - - task.preprocess_end_time, - preprocess_cost_time=task.preprocess_end_time - - task.preprocess_start_time) + time_in_queue=task.schedule_start_time - task.preprocess_end_time, + preprocess_cost_time=task.preprocess_end_time - task.preprocess_start_time, + ) self._record_first_token_metrics(task, current_time) @@ -364,24 +348,25 @@ class TokenProcessor(object): ) self.number_of_output_tokens += len(token_ids) self._record_metrics(task, current_time, token_ids) - result = RequestOutput(request_id=task_id, - outputs=CompletionOutput( - index=i, - send_idx=self.tokens_counter[task_id], - token_ids=[], - logprob = None, - draft_token_ids=[], - top_logprobs=None, - ), - finished=False, - metrics=metrics) + result = RequestOutput( + request_id=task_id, + outputs=CompletionOutput( + index=i, + send_idx=self.tokens_counter[task_id], + token_ids=[], + logprob=None, + draft_token_ids=[], + top_logprobs=None, + ), + finished=False, + metrics=metrics, + ) if 
self.tokens_counter[task_id] == 0: if task.messages is not None: result.prompt = task.messages result.num_cached_tokens = task.num_cached_tokens - is_prefill = task.disaggregate_info is not None and task.disaggregate_info[ - "role"] == "prefill" + is_prefill = task.disaggregate_info is not None and task.disaggregate_info["role"] == "prefill" if is_prefill and len(token_ids) > 1: result.outputs.draft_token_ids = copy.deepcopy(token_ids) @@ -399,7 +384,7 @@ class TokenProcessor(object): result.outputs.top_logprobs = LogprobsLists( logprob_token_ids=[topk_token_ids], logprobs=[topk_logprobs], - sampled_token_ranks=[sampled_rank] + sampled_token_ranks=[sampled_rank], ) if token_id in task.eos_token_ids or is_prefill or recovery_stop: result.finished = True @@ -408,8 +393,8 @@ class TokenProcessor(object): if recovery_stop: result.error_msg = "Recover is not supported, the result is incomplete!" llm_logger.info( - f"Request: {task_id} finished, number of " - f"generated tokens: {self.tokens_counter[task_id]}.") + f"Request: {task_id} finished, number of " f"generated tokens: {self.tokens_counter[task_id]}." + ) llm_logger.info( f"Request: {task_id} token ratio: {self.tokens_counter[task_id] / (time.time() - task.inference_start_time)}" ) @@ -418,8 +403,7 @@ class TokenProcessor(object): self._compute_speculative_status() if not is_prefill: self._record_completion_metrics(task, current_time) - self._recycle_resources(task_id, i, task, result, - is_prefill) + self._recycle_resources(task_id, i, task, result, is_prefill) break if not is_prefill or self.cfg.scheduler_config.name == "splitwise": batch_result.append(result) @@ -434,11 +418,11 @@ class TokenProcessor(object): tokens = self.output_tokens.numpy() if self.cfg.speculative_config.method: batch = self.output_tokens[1] - accept_num = tokens[2:batch + 2] + accept_num = tokens[2 : batch + 2] self._record_speculative_decoding_mertics(accept_num) else: batch = self.output_tokens[1, 0] - tokens = tokens[2:batch + 2] + tokens = tokens[2 : batch + 2] batch_result = list() for i in range(batch): @@ -450,10 +434,14 @@ class TokenProcessor(object): task_id = task.request_id if self.cfg.speculative_config.method: - token_ids = tokens[2 + SPECULATE_MAX_BSZ + - i * MAX_DRAFT_TOKENS:2 + SPECULATE_MAX_BSZ + - i * MAX_DRAFT_TOKENS + - accept_num[i]].tolist() + token_ids = tokens[ + 2 + + SPECULATE_MAX_BSZ + + i * MAX_DRAFT_TOKENS : 2 + + SPECULATE_MAX_BSZ + + i * MAX_DRAFT_TOKENS + + accept_num[i] + ].tolist() if len(token_ids) == 0 or token_ids[-1] <= 0: continue else: @@ -461,8 +449,7 @@ class TokenProcessor(object): token_ids = [token_id] recovery_stop = token_id == RECOVERY_STOP_SIGNAL if recovery_stop: - llm_logger.info( - f"recovery stop signal found at task {task_id}") + llm_logger.info(f"recovery stop signal found at task {task_id}") if not recovery_stop and token_id < 0: continue @@ -480,10 +467,9 @@ class TokenProcessor(object): arrival_time=task.arrival_time, inference_start_time=task.inference_start_time, first_token_time=time.time() - task.inference_start_time, - time_in_queue=task.schedule_start_time - - task.preprocess_end_time, - preprocess_cost_time=task.preprocess_end_time - - task.preprocess_start_time) + time_in_queue=task.schedule_start_time - task.preprocess_end_time, + preprocess_cost_time=task.preprocess_end_time - task.preprocess_start_time, + ) self._record_first_token_metrics(task, current_time) @@ -494,21 +480,23 @@ class TokenProcessor(object): ) self.number_of_output_tokens += len(token_ids) self._record_metrics(task, 
current_time, token_ids) - result = RequestOutput(request_id=task_id, - outputs=CompletionOutput( - index=i, - send_idx=self.tokens_counter[task_id], - token_ids=[], - draft_token_ids=[]), - finished=False, - metrics=metrics) + result = RequestOutput( + request_id=task_id, + outputs=CompletionOutput( + index=i, + send_idx=self.tokens_counter[task_id], + token_ids=[], + draft_token_ids=[], + ), + finished=False, + metrics=metrics, + ) if self.tokens_counter[task_id] == 0: if task.messages is not None: result.prompt = task.messages result.num_cached_tokens = task.num_cached_tokens - is_prefill = task.disaggregate_info is not None and task.disaggregate_info[ - "role"] == "prefill" + is_prefill = task.disaggregate_info is not None and task.disaggregate_info["role"] == "prefill" if is_prefill and len(token_ids) > 1: result.outputs.draft_token_ids = copy.deepcopy(token_ids) @@ -522,8 +510,8 @@ class TokenProcessor(object): if recovery_stop: result.error_msg = "Recover is not supported, the result is incomplete!" llm_logger.info( - f"Request: {task_id} finished, number of " - f"generated tokens: {self.tokens_counter[task_id]}.") + f"Request: {task_id} finished, number of " f"generated tokens: {self.tokens_counter[task_id]}." + ) llm_logger.info( f"Request: {task_id} token ratio: {self.tokens_counter[task_id] / (time.time() - task.inference_start_time)}" ) @@ -532,8 +520,7 @@ class TokenProcessor(object): self._compute_speculative_status() if not is_prefill: self._record_completion_metrics(task, current_time) - self._recycle_resources(task_id, i, task, result, - is_prefill) + self._recycle_resources(task_id, i, task, result, is_prefill) break if not is_prefill or self.cfg.scheduler_config.name == "splitwise": batch_result.append(result) @@ -542,8 +529,7 @@ class TokenProcessor(object): def _record_metrics(self, task, current_time, token_ids): """Record all metrics for a task""" - if hasattr(task, - 'last_token_time') and task.last_token_time is not None: + if hasattr(task, "last_token_time") and task.last_token_time is not None: token_gen_time = current_time - task.last_token_time main_process_metrics.time_per_output_token.observe(token_gen_time) task.last_token_time = current_time @@ -554,23 +540,19 @@ class TokenProcessor(object): def _record_first_token_metrics(self, task, current_time): """Record metrics for first token""" task.first_token_time = current_time - main_process_metrics.time_to_first_token.observe( - current_time - task.inference_start_time) - main_process_metrics.request_queue_time.observe( - task.schedule_start_time - task.preprocess_end_time) + main_process_metrics.time_to_first_token.observe(current_time - task.inference_start_time) + main_process_metrics.request_queue_time.observe(task.schedule_start_time - task.preprocess_end_time) def _record_completion_metrics(self, task, current_time): """Record metrics when request completes""" - if hasattr(task, 'first_token_time'): + if hasattr(task, "first_token_time"): decode_time = current_time - task.first_token_time main_process_metrics.request_decode_time.observe(decode_time) main_process_metrics.num_requests_running.dec(1) main_process_metrics.request_success_total.inc() - main_process_metrics.request_inference_time.observe( - current_time - task.inference_start_time) - main_process_metrics.request_generation_tokens.observe( - self.tokens_counter[task.request_id]) + main_process_metrics.request_inference_time.observe(current_time - task.inference_start_time) + 
main_process_metrics.request_generation_tokens.observe(self.tokens_counter[task.request_id]) def _record_speculative_decoding_mertics(self, accept_num): """Record metrics of speculative decoding""" @@ -586,12 +568,8 @@ class TokenProcessor(object): num_emitted_tokens = sum(real_accept_num) self.num_emitted_tokens += num_emitted_tokens - main_process_metrics.spec_decode_num_accepted_tokens_total.inc( - num_accepted_tokens - ) - main_process_metrics.spec_decode_num_emitted_tokens_total.inc( - num_emitted_tokens - ) + main_process_metrics.spec_decode_num_accepted_tokens_total.inc(num_accepted_tokens) + main_process_metrics.spec_decode_num_emitted_tokens_total.inc(num_emitted_tokens) if self.cfg.speculative_config.method in ["ngram"]: main_process_metrics.spec_decode_draft_acceptance_rate.set( @@ -599,10 +577,7 @@ class TokenProcessor(object): ) if self.cfg.speculative_config.method in ["mtp"]: - num_draft_tokens = ( - len(real_accept_num) - * self.cfg.speculative_config.num_speculative_tokens - ) + num_draft_tokens = len(real_accept_num) * self.cfg.speculative_config.num_speculative_tokens self.num_draft_tokens += num_draft_tokens self.max_num_emitted_tokens += len(real_accept_num) * ( @@ -612,12 +587,8 @@ class TokenProcessor(object): main_process_metrics.spec_decode_draft_acceptance_rate.set( self.num_accepted_tokens / self.num_draft_tokens ) - main_process_metrics.spec_decode_efficiency.set( - self.num_emitted_tokens / self.max_num_emitted_tokens - ) - main_process_metrics.spec_decode_num_draft_tokens_total.inc( - num_draft_tokens - ) + main_process_metrics.spec_decode_efficiency.set(self.num_emitted_tokens / self.max_num_emitted_tokens) + main_process_metrics.spec_decode_num_draft_tokens_total.inc(num_draft_tokens) num_rest_requests = len(real_accept_num) for head in range(self.cfg.speculative_config.num_speculative_tokens): @@ -629,12 +600,11 @@ class TokenProcessor(object): num_rest_requests = num_accept_requests # Calculate the acceptance rate for each head single_head_acceptance_rate = ( - self.num_accept_requests_per_head[head] - / self.num_rest_requests_per_head[head] + self.num_accept_requests_per_head[head] / self.num_rest_requests_per_head[head] + ) + main_process_metrics.spec_decode_draft_single_head_acceptance_rate[head].set( + single_head_acceptance_rate ) - main_process_metrics.spec_decode_draft_single_head_acceptance_rate[ - head - ].set(single_head_acceptance_rate) class WarmUpTokenProcessor(TokenProcessor): @@ -661,14 +631,15 @@ class WarmUpTokenProcessor(TokenProcessor): from fastdeploy.model_executor.ops.iluvatar import get_output else: from fastdeploy.model_executor.ops.gpu import ( - get_output, speculate_get_output) + get_output, + speculate_get_output, + ) while self._is_running: try: rank_id = 0 if self.speculative_decoding: - speculate_get_output(self.output_tokens, rank_id, - self._is_blocking) + speculate_get_output(self.output_tokens, rank_id, self._is_blocking) if self.output_tokens[0] == -2: continue else: @@ -678,8 +649,7 @@ class WarmUpTokenProcessor(TokenProcessor): continue self._process_batch_output() except Exception as e: - llm_logger.info("while get input_data error: {0} {1}".format( - e, str(traceback.format_exc()))) + llm_logger.info(f"while get input_data error: {e} {traceback.format_exc()!s}") def stop(self): """ diff --git a/fastdeploy/platforms/__init__.py b/fastdeploy/platforms/__init__.py index cc0938ca5..849005f48 100644 --- a/fastdeploy/platforms/__init__.py +++ b/fastdeploy/platforms/__init__.py @@ -16,14 +16,15 @@ platform module """ import 
paddle -from .cuda import CUDAPlatform -from .cpu import CPUPlatform -from .xpu import XPUPlatform -from .npu import NPUPlatform -from .dcu import DCUPlatform -from .iluvatar import IluvatarPlatform -from .gcu import GCUPlatform + from .base import _Backend # noqa: F401 +from .cpu import CPUPlatform +from .cuda import CUDAPlatform +from .dcu import DCUPlatform +from .gcu import GCUPlatform +from .iluvatar import IluvatarPlatform +from .npu import NPUPlatform +from .xpu import XPUPlatform _current_platform = None @@ -51,5 +52,4 @@ def __getattr__(name: str): elif name in globals(): return globals()[name] else: - raise AttributeError( - f"No attribute named '{name}' exists in {__name__}.") + raise AttributeError(f"No attribute named '{name}' exists in {__name__}.") diff --git a/fastdeploy/platforms/base.py b/fastdeploy/platforms/base.py index 1bfba0bbc..0c99b2593 100644 --- a/fastdeploy/platforms/base.py +++ b/fastdeploy/platforms/base.py @@ -32,6 +32,7 @@ class Platform: """ Platform base class, all device class will be derived from it """ + device_name: str def is_cuda(self) -> bool: @@ -87,9 +88,7 @@ class Platform: Verify whether the quantization is supported by the current platform. """ if self.supported_quantization and quant not in self.supported_quantization: - raise ValueError( - f"{quant} quantization is currently not supported in " - f"{self.device_name}.") + raise ValueError(f"{quant} quantization is currently not supported in " f"{self.device_name}.") @classmethod def available(self): diff --git a/fastdeploy/platforms/cuda.py b/fastdeploy/platforms/cuda.py index 294506b4b..6676d3c0f 100644 --- a/fastdeploy/platforms/cuda.py +++ b/fastdeploy/platforms/cuda.py @@ -25,6 +25,7 @@ class CUDAPlatform(Platform): """ cuda platform class """ + device_name = "gpu" @classmethod @@ -39,7 +40,8 @@ class CUDAPlatform(Platform): logger.warning( "You are using GPU version PaddlePaddle, but there is no GPU " "detected on your machine. Maybe CUDA devices is not set properly." 
- f"\n Original Error is {e}") + f"\n Original Error is {e}" + ) return False @classmethod @@ -49,24 +51,16 @@ class CUDAPlatform(Platform): """ if selected_backend == _Backend.NATIVE_ATTN: logger.info("Using NATIVE ATTN backend.") - return ( - "fastdeploy.model_executor.layers.attention.PaddleNativeAttnBackend" - ) + return "fastdeploy.model_executor.layers.attention.PaddleNativeAttnBackend" elif selected_backend == _Backend.APPEND_ATTN: logger.info("Using APPEND ATTN backend.") - return ( - "fastdeploy.model_executor.layers.attention.AppendAttentionBackend" - ) + return "fastdeploy.model_executor.layers.attention.AppendAttentionBackend" elif selected_backend == _Backend.MLA_ATTN: logger.info("Using MLA ATTN backend.") - return ( - "fastdeploy.model_executor.layers.attention.MLAAttentionBackend" - ) + return "fastdeploy.model_executor.layers.attention.MLAAttentionBackend" elif selected_backend == _Backend.FLASH_ATTN: logger.info("Using FLASH ATTN backend.") - return ( - "fastdeploy.model_executor.layers.attention.FlashAttentionBackend" - ) + return "fastdeploy.model_executor.layers.attention.FlashAttentionBackend" else: raise ValueError( "Invalid attention backend you specified.\n" diff --git a/fastdeploy/platforms/dcu.py b/fastdeploy/platforms/dcu.py index a79f04f61..bfd848335 100644 --- a/fastdeploy/platforms/dcu.py +++ b/fastdeploy/platforms/dcu.py @@ -15,14 +15,16 @@ dcu platform file """ import paddle -from .base import Platform, _Backend from paddleformers.utils.log import logger +from .base import Platform, _Backend + class DCUPlatform(Platform): """ dcu platform class """ + device_name = "dcu" @classmethod @@ -42,20 +44,15 @@ class DCUPlatform(Platform): return False @classmethod - def get_attention_backend_cls( - cls, - selected_backend - ): + def get_attention_backend_cls(cls, selected_backend): """ get_attention_backend_cls """ if selected_backend == _Backend.NATIVE_ATTN: logger.info("Using NATIVE ATTN backend.") - return ("fastdeploy.model_executor.layers.attention.PaddleNativeAttnBackend") + return "fastdeploy.model_executor.layers.attention.PaddleNativeAttnBackend" elif selected_backend == _Backend.BLOCK_ATTN: logger.info("Using BLOCK ATTN backend.") - return ("fastdeploy.model_executor.layers.attention.BlockAttentionBackend") + return "fastdeploy.model_executor.layers.attention.BlockAttentionBackend" else: - logger.warning( - "Other backends are not supported for now." - ) + logger.warning("Other backends are not supported for now.") diff --git a/fastdeploy/platforms/gcu.py b/fastdeploy/platforms/gcu.py index 42b55d641..e812113e1 100644 --- a/fastdeploy/platforms/gcu.py +++ b/fastdeploy/platforms/gcu.py @@ -25,6 +25,7 @@ class GCUPlatform(Platform): """ gcu platform class """ + device_name = "gcu" @classmethod @@ -33,7 +34,7 @@ class GCUPlatform(Platform): Check whether GCU is available. 
""" try: - assert paddle.base.core.get_custom_device_count('gcu') > 0 + assert paddle.base.core.get_custom_device_count("gcu") > 0 return True except Exception as e: logger.warning( @@ -50,10 +51,10 @@ class GCUPlatform(Platform): """ if selected_backend == _Backend.NATIVE_ATTN: logger.info("Using GCU mem_efficient ATTN backend.") - return ("fastdeploy.model_executor.layers.backends.gcu.attention.mem_efficient_attn_backend.GCUMemEfficientAttnBackend") + return "fastdeploy.model_executor.layers.backends.gcu.attention.mem_efficient_attn_backend.GCUMemEfficientAttnBackend" elif selected_backend == _Backend.APPEND_ATTN: logger.info("Using GCU ATTN backend.") - return ("fastdeploy.model_executor.layers.backends.gcu.attention.flash_attn_backend.GCUFlashAttnBackend") + return "fastdeploy.model_executor.layers.backends.gcu.attention.flash_attn_backend.GCUFlashAttnBackend" else: raise ValueError( "Invalid attention backend you specified.\n" diff --git a/fastdeploy/platforms/iluvatar.py b/fastdeploy/platforms/iluvatar.py index cd1892058..5cc8e146a 100644 --- a/fastdeploy/platforms/iluvatar.py +++ b/fastdeploy/platforms/iluvatar.py @@ -22,5 +22,4 @@ class IluvatarPlatform(Platform): """ get_attention_backend_cls """ - return ( - "fastdeploy.model_executor.layers.attention.IluvatarAttnBackend") + return "fastdeploy.model_executor.layers.attention.IluvatarAttnBackend" diff --git a/fastdeploy/platforms/utils.py b/fastdeploy/platforms/utils.py index 68d5649fa..6ad04c230 100644 --- a/fastdeploy/platforms/utils.py +++ b/fastdeploy/platforms/utils.py @@ -19,6 +19,7 @@ import numpy as np import paddle + def convert_to_npu_dequant_scale(deq_scale): """ Convert dequantization scale for NPU. @@ -39,8 +40,5 @@ def convert_to_npu_dequant_scale(deq_scale): if not paddle.is_compiled_with_custom_device("npu"): return deq_scale arr = deq_scale.numpy() - new_deq_scale = np.stack( - [arr.reshape(-1, 1), - np.zeros_like(arr).reshape(-1, 1)], axis=-1).reshape(-1) - return paddle.to_tensor( - np.frombuffer(new_deq_scale.tobytes(), dtype=np.int64)) + new_deq_scale = np.stack([arr.reshape(-1, 1), np.zeros_like(arr).reshape(-1, 1)], axis=-1).reshape(-1) + return paddle.to_tensor(np.frombuffer(new_deq_scale.tobytes(), dtype=np.int64)) diff --git a/fastdeploy/platforms/xpu.py b/fastdeploy/platforms/xpu.py index c00a1feee..2f3110742 100644 --- a/fastdeploy/platforms/xpu.py +++ b/fastdeploy/platforms/xpu.py @@ -22,6 +22,7 @@ class XPUPlatform(Platform): """ xpu platform class """ + device_name = "xpu" @classmethod @@ -37,7 +38,8 @@ class XPUPlatform(Platform): logger.warning( "You are using XPU version PaddlePaddle, but there is no XPU " "detected on your machine. Maybe CUDA devices is not set properly." 
- f"\n Original Error is {e}") + f"\n Original Error is {e}" + ) return False @classmethod @@ -46,11 +48,8 @@ class XPUPlatform(Platform): get_attention_backend_cls """ # TODO: 等支持配置 attention engine 之后再改回去 - return ( - "fastdeploy.model_executor.layers.attention.XPUAttentionBackend") + return "fastdeploy.model_executor.layers.attention.XPUAttentionBackend" if selected_backend == _Backend.NATIVE_ATTN: - return ( - "fastdeploy.model_executor.layers.attention.XPUAttentionBackend" - ) + return "fastdeploy.model_executor.layers.attention.XPUAttentionBackend" else: logger.warning("Other backends are not supported for now for XPU.") diff --git a/fastdeploy/reasoning/__init__.py b/fastdeploy/reasoning/__init__.py index ef950ef3c..aa7d65e50 100644 --- a/fastdeploy/reasoning/__init__.py +++ b/fastdeploy/reasoning/__init__.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + from .abs_reasoning_parsers import ReasoningParser, ReasoningParserManager from .ernie_vl_reasoning_parsers import ErnieVLReasoningParser from .qwen3_reasoning_parsers import Qwen3ReasoningParser @@ -21,5 +22,5 @@ __all__ = [ "ReasoningParser", "ReasoningParserManager", "ErnieVLReasoningParser", - "Qwen3ReasoningParser" -] \ No newline at end of file + "Qwen3ReasoningParser", +] diff --git a/fastdeploy/reasoning/abs_reasoning_parsers.py b/fastdeploy/reasoning/abs_reasoning_parsers.py index f989547d9..50e01e5a9 100644 --- a/fastdeploy/reasoning/abs_reasoning_parsers.py +++ b/fastdeploy/reasoning/abs_reasoning_parsers.py @@ -13,13 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + from abc import abstractmethod from collections.abc import Sequence from functools import cached_property from typing import Callable, Optional, Union -from fastdeploy.entrypoints.openai.protocol import (ChatCompletionRequest, - DeltaMessage) +from fastdeploy.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage from fastdeploy.utils import is_list_of @@ -72,7 +72,7 @@ class ReasoningParser: @abstractmethod def extract_reasoning_content( - self, model_output: str, request: ChatCompletionRequest + self, model_output: str, request: ChatCompletionRequest ) -> tuple[Optional[str], Optional[str]]: """ Extract reasoning content from a complete model-generated string. @@ -115,11 +115,11 @@ class ReasoningParserManager: """ ReasoningParserManager """ + reasoning_parsers: dict[str, type] = {} @classmethod - def get_reasoning_parser(cls, - name: Optional[str]) -> type[ReasoningParser]: + def get_reasoning_parser(cls, name: Optional[str]) -> type[ReasoningParser]: """ Get reasoning parser by name which is registered by `register_module`. 
@@ -128,8 +128,7 @@ class ReasoningParserManager: if name in cls.reasoning_parsers: return cls.reasoning_parsers[name] - raise KeyError( - f"reasoning helper: '{name}' not found in reasoning_parsers") + raise KeyError(f"reasoning helper: '{name}' not found in reasoning_parsers") @classmethod def _register_module( @@ -139,8 +138,7 @@ class ReasoningParserManager: force: bool = True, ) -> None: if not issubclass(module, ReasoningParser): - raise TypeError("module must be subclass of ReasoningParser, " - f"but got {type(module)}") + raise TypeError("module must be subclass of ReasoningParser, " f"but got {type(module)}") if module_name is None: module_name = module.__name__ if isinstance(module_name, str): @@ -148,8 +146,7 @@ class ReasoningParserManager: for name in module_name: if not force and name in cls.reasoning_parsers: existed_module = cls.reasoning_parsers[name] - raise KeyError(f"{name} is already registered " - f"at {existed_module.__module__}") + raise KeyError(f"{name} is already registered " f"at {existed_module.__module__}") cls.reasoning_parsers[name] = module @classmethod @@ -168,11 +165,8 @@ class ReasoningParserManager: raise TypeError(f"force must be a boolean, but got {type(force)}") # raise the error ahead of time - if not (name is None or isinstance(name, str) - or is_list_of(name, str)): - raise TypeError( - "name must be None, an instance of str, or a sequence of str, " - f"but got {type(name)}") + if not (name is None or isinstance(name, str) or is_list_of(name, str)): + raise TypeError("name must be None, an instance of str, or a sequence of str, " f"but got {type(name)}") # use it as a normal method: x.register_module(module=SomeClass) if module is not None: diff --git a/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py b/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py index c1814e20b..f5762b791 100644 --- a/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py +++ b/fastdeploy/reasoning/ernie_vl_reasoning_parsers.py @@ -13,11 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + from collections.abc import Sequence from typing import Optional, Union -from fastdeploy.entrypoints.openai.protocol import (ChatCompletionRequest, - DeltaMessage) +from fastdeploy.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage from fastdeploy.reasoning import ReasoningParser, ReasoningParserManager @@ -39,14 +39,12 @@ class ErnieVLReasoningParser(ReasoningParser): if not self.model_tokenizer: raise ValueError( - "The model tokenizer must be passed to the ReasoningParser " - "constructor during construction.") + "The model tokenizer must be passed to the ReasoningParser " "constructor during construction." 
+ ) self.think_end_token_id = self.vocab.get(self.think_end_token) if self.think_end_token_id is None: - raise RuntimeError( - "Ernie VL reasoning parser could not locate think end " - "tokens in the tokenizer!") + raise RuntimeError("Ernie VL reasoning parser could not locate think end " "tokens in the tokenizer!") def extract_reasoning_content_streaming( self, @@ -71,7 +69,7 @@ class ErnieVLReasoningParser(ReasoningParser): if self.think_end_token_id in delta_token_ids: end_index = delta_text.find(self.end_token) reasoning_content = delta_text[:end_index] - content = delta_text[end_index + len(self.end_token):] + content = delta_text[end_index + len(self.end_token) :] elif self.think_end_token_id in previous_token_ids: reasoning_content = "" content = delta_text @@ -80,9 +78,8 @@ class ErnieVLReasoningParser(ReasoningParser): content = "" return reasoning_content, content - def extract_reasoning_content( - self, model_output: str, request: ChatCompletionRequest + self, model_output: str, request: ChatCompletionRequest ) -> tuple[Optional[str], Optional[str]]: """ Extract reasoning content from the model output. @@ -99,8 +96,7 @@ class ErnieVLReasoningParser(ReasoningParser): if self.think_end_token not in model_output: return "", model_output # Extract reasoning content from the model output. - reasoning_content, _, content = model_output.partition( - self.think_end_token) + reasoning_content, _, content = model_output.partition(self.think_end_token) final_content = content or "" - return reasoning_content, final_content \ No newline at end of file + return reasoning_content, final_content diff --git a/fastdeploy/reasoning/qwen3_reasoning_parsers.py b/fastdeploy/reasoning/qwen3_reasoning_parsers.py index 9e3aae592..4fc565c6c 100644 --- a/fastdeploy/reasoning/qwen3_reasoning_parsers.py +++ b/fastdeploy/reasoning/qwen3_reasoning_parsers.py @@ -13,11 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + from collections.abc import Sequence from typing import Optional, Union -from fastdeploy.entrypoints.openai.protocol import (ChatCompletionRequest, - DeltaMessage) +from fastdeploy.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage from fastdeploy.reasoning import ReasoningParser, ReasoningParserManager @@ -40,15 +40,13 @@ class Qwen3ReasoningParser(ReasoningParser): if not self.model_tokenizer: raise ValueError( - "The model tokenizer must be passed to the ReasoningParser " - "constructor during construction.") + "The model tokenizer must be passed to the ReasoningParser " "constructor during construction." 
+ ) self.think_start_token_id = self.vocab.get(self.think_start_token) self.think_end_token_id = self.vocab.get(self.think_end_token) if self.think_end_token_id is None: - raise RuntimeError( - "Qwen3 reasoning parser could not locate think end " - "tokens in the tokenizer!") + raise RuntimeError("Qwen3 reasoning parser could not locate think end " "tokens in the tokenizer!") def extract_reasoning_content_streaming( self, @@ -67,27 +65,23 @@ class Qwen3ReasoningParser(ReasoningParser): - 'abc' goes to reasoning_content - 'xyz' goes to content """ - if len(delta_token_ids) == 1 and (delta_token_ids[0] in [ - self.think_start_token_id, self.think_end_token_id - ]): + if len(delta_token_ids) == 1 and (delta_token_ids[0] in [self.think_start_token_id, self.think_end_token_id]): return "", "" # in delta if self.think_end_token_id in delta_token_ids: - # in delta, in delta, extract reasoning content + # in delta, in delta, extract reasoning content if self.think_start_token_id in delta_token_ids: start_index = delta_text.find(self.think_start_token) end_index = delta_token_ids.find(self.think_end_token) - reasoning_content = delta_text[start_index + - len(self.think_start_token - ):end_index] - content = delta_text[end_index+len(self.think_end_token):] + reasoning_content = delta_text[start_index + len(self.think_start_token) : end_index] + content = delta_text[end_index + len(self.think_end_token) :] return reasoning_content, content # in previous, in delta, else: end_index = delta_text.find(self.think_end_token) reasoning_content = delta_text[:end_index] - content = delta_text[end_index + len(self.think_end_token):] + content = delta_text[end_index + len(self.think_end_token) :] content = content if content else None return reasoning_content, content # in previous reasoning content continues @@ -95,22 +89,18 @@ class Qwen3ReasoningParser(ReasoningParser): return "", delta_text # in previous elif self.think_start_token_id in previous_token_ids: - return delta_text,"" + return delta_text, "" # in delta elif self.think_start_token_id in delta_token_ids: - start_index=delta_text.find(self.think_start_token) - reasoning_content=delta_text[start_index + len(self.think_start_token):] + start_index = delta_text.find(self.think_start_token) + reasoning_content = delta_text[start_index + len(self.think_start_token) :] content = "" return reasoning_content, content else: return delta_text, "" - - - - def extract_reasoning_content( - self, model_output: str, request: ChatCompletionRequest + self, model_output: str, request: ChatCompletionRequest ) -> tuple[Optional[str], Optional[str]]: """ Extract reasoning content from the model output. @@ -130,22 +120,19 @@ class Qwen3ReasoningParser(ReasoningParser): # 检查是否有起始标签 if self.think_start_token in model_output: # 标准格式:contentanswer - if (self.think_start_token not in model_output - or self.think_end_token not in model_output): + if self.think_start_token not in model_output or self.think_end_token not in model_output: return None, model_output # Check if the is present in the model output, remove it # if it is present. model_output_parts = model_output.partition(self.think_start_token) - model_output = model_output_parts[2] if model_output_parts[ - 1] else model_output_parts[0] + model_output = model_output_parts[2] if model_output_parts[1] else model_output_parts[0] # Check if the model output contains the tokens. # If the end token is not found, return the model output as is. 
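The non-streaming `extract_reasoning_content` paths in these parser hunks boil down to a single `str.partition` call on the think-end marker. A self-contained illustration follows, with hard-coded `<think>` / `</think>` strings standing in for the parsers' `think_start_token` / `think_end_token` attributes (the real classes resolve them from the tokenizer vocabulary); note that the Ernie variant returns an empty string rather than `None` when no marker is present.

```python
# Standalone sketch of partition-based reasoning extraction; markers are assumptions.
from typing import Optional, Tuple

THINK_START = "<think>"
THINK_END = "</think>"


def extract_reasoning(model_output: str) -> Tuple[Optional[str], Optional[str]]:
    # No end marker: treat the whole output as ordinary content.
    if THINK_END not in model_output:
        return None, model_output
    # Drop a leading start marker if one is present.
    before, sep, after = model_output.partition(THINK_START)
    model_output = after if sep else before
    # Everything before the end marker is reasoning, the rest is the answer.
    reasoning, _, content = model_output.partition(THINK_END)
    return reasoning, (content or None)


print(extract_reasoning("<think>chain of thought</think>final answer"))
# ('chain of thought', 'final answer')
print(extract_reasoning("plain answer"))
# (None, 'plain answer')
```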
if self.think_end_token not in model_output: return None, model_output # Extract reasoning content from the model output. - reasoning_content, _, content = model_output.partition( - self.think_end_token) + reasoning_content, _, content = model_output.partition(self.think_end_token) final_content = content or None return reasoning_content, final_content diff --git a/fastdeploy/rl/__init__.py b/fastdeploy/rl/__init__.py index fc06329d5..55d89e8bb 100644 --- a/fastdeploy/rl/__init__.py +++ b/fastdeploy/rl/__init__.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + import os from fastdeploy.model_executor.models import auto_models_registry diff --git a/fastdeploy/rl/dynamic_weight_manager.py b/fastdeploy/rl/dynamic_weight_manager.py index 1040221e4..003957fdc 100644 --- a/fastdeploy/rl/dynamic_weight_manager.py +++ b/fastdeploy/rl/dynamic_weight_manager.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + import os import time from multiprocessing.shared_memory import SharedMemory @@ -46,15 +47,14 @@ class DynamicWeightManager: logger.info( f"✅ DynamicLoad model built successfully by {self.load_config.load_strategy}, " - f" rank={self.rank}, ranks={self.nranks}") + f" rank={self.rank}, ranks={self.nranks}" + ) @paddle.no_grad() def _capture_model_state(self): """Capture and store initial model parameters state.""" for name, param in self.model.state_dict().items(): - logger.debug( - f"Model param: {name}, shape={param.shape}, dtype={param.dtype}" - ) + logger.debug(f"Model param: {name}, shape={param.shape}, dtype={param.dtype}") self.state_dict[name] = param def update_parameters(self, pid: int = 0) -> None: @@ -73,11 +73,9 @@ class DynamicWeightManager: if handler := strategy_handlers.get(self.load_config.load_strategy): handler() else: - raise ValueError( - f"Unsupported strategy: {self.load_config.load_strategy}") + raise ValueError(f"Unsupported strategy: {self.load_config.load_strategy}") - logger.info( - f"Update parameters in {time.perf_counter()-start_time:.2f}s") + logger.info(f"Update parameters in {time.perf_counter()-start_time:.2f}s") self._finalize_update(pid) @@ -85,7 +83,8 @@ class DynamicWeightManager: """Update using IPC snapshot strategy for elastic recovery.""" model_path = os.path.join( self.parallel_config.model_name_or_path, - f"model_state.tp0{self.meta_src_id}.pdparams") + f"model_state.tp0{self.meta_src_id}.pdparams", + ) try: ipc_state_dict = paddle.load(model_path) @@ -94,16 +93,14 @@ class DynamicWeightManager: ipc_state_dict = paddle.load(fallback_path) self._update_model_from_state(ipc_state_dict, "snapshot") - logger.info( - f"IPC snapshot update parameters completed from {model_path}") + logger.info(f"IPC snapshot update parameters completed from {model_path}") def _update_ipc(self): """Update using standard IPC strategy (requires Training Worker).""" ipc_meta = paddle.load(self.ipc_path) state_dict = self._convert_ipc_meta_to_tensor(ipc_meta) self._update_model_from_state(state_dict, "raw") - logger.info( - f"IPC update parameters completed from file: {self.ipc_path}") + logger.info(f"IPC update parameters completed from file: {self.ipc_path}") def clear_parameters(self, pid: int = 0) -> None: """Clear all model parameters and free memory.""" @@ -118,8 +115,7 @@ class DynamicWeightManager: paddle.distributed.shutdown_process_group() self._update_shared_status(pid, -2) - def _update_model_from_state(self, 
state_dict: Dict[str, paddle.Tensor], - src_type: str): + def _update_model_from_state(self, state_dict: Dict[str, paddle.Tensor], src_type: str): """Update model parameters from given state dictionary.""" if len(state_dict) == 0: raise ValueError(f"No parameter found in state dict {state_dict}") @@ -133,19 +129,14 @@ class DynamicWeightManager: self._validate_parameter_match(name, new_param, target_param) new_param._share_buffer_to(target_param) update_count += 1 - logger.info( - f"🆗 Updated {update_count}/{len(state_dict)} parameters from {src_type} source" - ) + logger.info(f"🆗 Updated {update_count}/{len(state_dict)} parameters from {src_type} source") - def _validate_parameter_match(self, name: str, src: paddle.Tensor, - dst: paddle.Tensor): + def _validate_parameter_match(self, name: str, src: paddle.Tensor, dst: paddle.Tensor): """验证参数一致性""" if src.dtype != dst.dtype: - raise TypeError( - f"Type mismatch for {name}: {src.dtype} vs {dst.dtype}") + raise TypeError(f"Type mismatch for {name}: {src.dtype} vs {dst.dtype}") if src.shape != dst.shape: - raise ValueError( - f"Shape mismatch for {name}: {src.shape} vs {dst.shape}") + raise ValueError(f"Shape mismatch for {name}: {src.shape} vs {dst.shape}") def _finalize_update(self, pid: int): """Finalize update process with verification.""" @@ -163,7 +154,7 @@ class DynamicWeightManager: def _verify_parameters(self, operation: str): """Verify parameters are in expected state after operation.""" - expected_initialized = (operation == "update") + expected_initialized = operation == "update" all_valid = True for name, param in self.state_dict.items(): is_initialized = param._is_initialized() @@ -177,12 +168,12 @@ class DynamicWeightManager: if all_valid: logger.info(f"💡 Model Parameter {operation} verified successfully") else: - raise RuntimeError( - f"❌ Model Parameter {operation} verification failed") + raise RuntimeError(f"❌ Model Parameter {operation} verification failed") @staticmethod def _convert_ipc_meta_to_tensor( - ipc_meta: Dict[str, Any]) -> Dict[str, paddle.Tensor]: + ipc_meta: Dict[str, Any], + ) -> Dict[str, paddle.Tensor]: """Convert IPC metadata to tensor dictionary.""" converted = {} for name, meta in ipc_meta.items(): @@ -199,18 +190,18 @@ class DynamicWeightManager: curr_alloc = paddle.device.cuda.memory_allocated() / (1024**3) curr_reserved = paddle.device.cuda.memory_reserved() / (1024**3) - logger.warning(f"GPU memory usage {context}:" - f"max_allocated: {max_alloc:.2f}GB\n" - f"max_reserved: {max_reserved:.2f}GB\n" - f"current_allocated: {curr_alloc:.2f}GB\n" - f"current_reserved: {curr_reserved:.2f}GB") + logger.warning( + f"GPU memory usage {context}:" + f"max_allocated: {max_alloc:.2f}GB\n" + f"max_reserved: {max_reserved:.2f}GB\n" + f"current_allocated: {curr_alloc:.2f}GB\n" + f"current_reserved: {curr_reserved:.2f}GB" + ) def _update_shared_status(self, pid: int, status: int) -> None: """Update shared memory status flag for inter-process communication.""" array = np.zeros([1], dtype=np.int32) - shm = SharedMemory(create=False, - size=array.nbytes, - name=f"model_weights_status.{pid}") + shm = SharedMemory(create=False, size=array.nbytes, name=f"model_weights_status.{pid}") value = np.ndarray(array.shape, dtype=array.dtype, buffer=shm.buf) if self.rank == 0: value[self.rank] = status @@ -223,20 +214,17 @@ class DynamicWeightManager: is_stop = 0 while model_weights_status.value[0] != 0: if model_weights_status.value[0] == 1: - logger.info( - "infer engine stopped! 
start to load new checkpoint...") + logger.info("infer engine stopped! start to load new checkpoint...") model_runner.update_parameters(pid) elif model_weights_status.value[0] == -1: - logger.info( - "infer engine stopped! start to clear checkpoint...") + logger.info("infer engine stopped! start to clear checkpoint...") model_runner.clear_parameters(pid) while True: if model_weights_status.value[0] == 0: logger.info("finished loading new checkpoint") break - elif is_stop == 1 or (model_weights_status.value[0] == -2 - and is_stop == 0): + elif is_stop == 1 or (model_weights_status.value[0] == -2 and is_stop == 0): if is_stop == 0: logger.info("finished clearing checkpoint") is_stop = 1 diff --git a/fastdeploy/rl/rollout_config.py b/fastdeploy/rl/rollout_config.py index dcb95ea2d..92bf0723a 100644 --- a/fastdeploy/rl/rollout_config.py +++ b/fastdeploy/rl/rollout_config.py @@ -14,7 +14,6 @@ # limitations under the License. """ - from fastdeploy.worker.worker_process import initialize_fd_config diff --git a/fastdeploy/rl/rollout_model.py b/fastdeploy/rl/rollout_model.py index 199ec3a61..41e9589e7 100644 --- a/fastdeploy/rl/rollout_model.py +++ b/fastdeploy/rl/rollout_model.py @@ -18,19 +18,32 @@ from typing import Dict import paddle from paddle import nn -from paddleformers.utils.log import logger from fastdeploy.config import FDConfig from fastdeploy.model_executor.model_loader import ModelRegistry -from fastdeploy.model_executor.models.ernie4_5_moe import \ - Ernie4_5_MoeForCausalLM, Ernie4_5_PretrainedModel -from fastdeploy.model_executor.models.ernie4_5_vl.ernie4_5_vl_moe import \ - Ernie4_5_VLMoeForConditionalGeneration, Ernie4_5_VLPretrainedModel -from fastdeploy.model_executor.models.qwen2 import Qwen2ForCausalLM, Qwen2PretrainedModel -from fastdeploy.model_executor.models.qwen3 import Qwen3ForCausalLM, Qwen3PretrainedModel -from fastdeploy.model_executor.models.qwen3moe import Qwen3MoeForCausalLM, Qwen3MoePretrainedModel +from fastdeploy.model_executor.models.ernie4_5_moe import ( + Ernie4_5_MoeForCausalLM, + Ernie4_5_PretrainedModel, +) +from fastdeploy.model_executor.models.ernie4_5_vl.ernie4_5_vl_moe import ( + Ernie4_5_VLMoeForConditionalGeneration, + Ernie4_5_VLPretrainedModel, +) +from fastdeploy.model_executor.models.qwen2 import ( + Qwen2ForCausalLM, + Qwen2PretrainedModel, +) +from fastdeploy.model_executor.models.qwen3 import ( + Qwen3ForCausalLM, + Qwen3PretrainedModel, +) +from fastdeploy.model_executor.models.qwen3moe import ( + Qwen3MoeForCausalLM, + Qwen3MoePretrainedModel, +) from fastdeploy.rl.rollout_config import RolloutModelConfig + class RolloutModel(nn.Layer): """Main model class for rollout operations, supports multimodal components for train.""" @@ -53,7 +66,7 @@ class RolloutModel(nn.Layer): def get_name_mappings_to_training(self) -> Dict[str, str]: """Get parameter name mappings between rollout and training models.""" return getattr(self.rollout_model, "get_name_mappings_to_training", lambda: {})() - + def get_quantization_infer_keys(self) -> Dict[str, str]: """Get parameter name mappings between rollout and training models.""" return getattr(self.rollout_model, "get_quantization_infer_keys", lambda: {})() @@ -66,7 +79,10 @@ class RolloutModel(nn.Layer): class BaseRLModel(nn.Layer): """Base class for RL models with common functionality""" - def __init__(self,): + + def __init__( + self, + ): super(BaseRLModel, self).__init__() self.infer_to_train_mapping = {} self.fd_config = None @@ -74,15 +90,15 @@ class BaseRLModel(nn.Layer): @classmethod def name(cls) -> 
str: return cls.__name__ - + def _update_base_mappings(self, base_name: str) -> None: """Common static mappings""" static_mappings = { f"{base_name}.embed_tokens.embeddings.weight": f"{base_name}.embed_tokens.weight", - "lm_head.linear.weight": "lm_head.weight" + "lm_head.linear.weight": "lm_head.weight", } self.infer_to_train_mapping.update(static_mappings) - + def _complete_missing_mappings(self) -> None: """ Complete the mapping dictionary with keys that have identical names in inference and training. @@ -94,12 +110,12 @@ class BaseRLModel(nn.Layer): if getattr(self.fd_config.model_config, "tie_word_embeddings", False): self.infer_to_train_mapping.pop("lm_head.linear.weight") - + def get_quantization_infer_keys(self) -> list[str]: """Get quantization infer keys""" quant_weight_key = [] if self.fd_config.quant_config.name() == "wint8": - """ RL only support weight_only_int8 now""" + """RL only support weight_only_int8 now""" for key in self.state_dict().keys(): if "scale" in key: quant_weight_key.append(key.replace(".weight_scale", ".weight")) @@ -107,10 +123,12 @@ class BaseRLModel(nn.Layer): raise ValueError("Only 'wint8' quantization is supported in RL roullout.") return quant_weight_key + class Ernie4_5_MoeForCausalLMRL(Ernie4_5_MoeForCausalLM, BaseRLModel): """ Ernie4_5_MoeForCausalLMRL """ + _get_tensor_parallel_mappings = Ernie4_5_PretrainedModel._get_tensor_parallel_mappings def __init__(self, fd_config: FDConfig): @@ -134,15 +152,18 @@ class Ernie4_5_MoeForCausalLMRL(Ernie4_5_MoeForCausalLM, BaseRLModel): self._update_base_mappings("ernie") base_name = "ernie.layers" + # Helper function to add layer mappings def _add_layer_mappings(layer_idx: int): # MoE specific mappings - self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.fused_moe.gate_weight"] = \ - f"{base_name}.{layer_idx}.mlp.gate.weight" + self.infer_to_train_mapping[ + f"{base_name}.{layer_idx}.mlp.fused_moe.gate_weight" + ] = f"{base_name}.{layer_idx}.mlp.gate.weight" if self.fd_config.model_config.moe_use_aux_free: - self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.fused_moe.gate_correction_bias"] = \ - f"{base_name}.{layer_idx}.mlp.moe_statics.e_score_correction_bias" + self.infer_to_train_mapping[ + f"{base_name}.{layer_idx}.mlp.fused_moe.gate_correction_bias" + ] = f"{base_name}.{layer_idx}.mlp.moe_statics.e_score_correction_bias" # MoE experts mappings for expert_idx in range(self.fd_config.model_config.moe_num_experts): @@ -165,8 +186,10 @@ class Ernie4_5_MoeForCausalLMRL(Ernie4_5_MoeForCausalLM, BaseRLModel): assert isinstance(self.fd_config.model_config.moe_layer_start_index, int) # Process MoE layers - for layer_idx in range(self.fd_config.model_config.moe_layer_start_index, - self.fd_config.model_config.num_hidden_layers): + for layer_idx in range( + self.fd_config.model_config.moe_layer_start_index, + self.fd_config.model_config.num_hidden_layers, + ): _add_layer_mappings(layer_idx) self._complete_missing_mappings() @@ -178,6 +201,7 @@ class Ernie4_5_VLMoeForConditionalGenerationRL(Ernie4_5_VLMoeForConditionalGener """ Ernie4_5_VLMoeForConditionalGenerationRL """ + _get_tensor_parallel_mappings = Ernie4_5_VLPretrainedModel._get_tensor_parallel_mappings def __init__(self, fd_config: FDConfig): @@ -206,25 +230,30 @@ class Ernie4_5_VLMoeForConditionalGenerationRL(Ernie4_5_VLMoeForConditionalGener def _add_expert_mappings(layer_idx: int, moe_tag: str, expert_start: int): # MoE specific mappings gate_suffix = "" if moe_tag == "text" else "_1" - 
self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.{moe_tag}_fused_moe.gate_weight"] = \ - f"{base_name}.{layer_idx}.mlp.gate.weight{gate_suffix}" + self.infer_to_train_mapping[ + f"{base_name}.{layer_idx}.mlp.{moe_tag}_fused_moe.gate_weight" + ] = f"{base_name}.{layer_idx}.mlp.gate.weight{gate_suffix}" if self.fd_config.model_config.moe_use_aux_free: - self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.{moe_tag}_fused_moe.gate_correction_bias"] = \ - f"{base_name}.{layer_idx}.mlp.moe_statics.e_score_correction_bias" + self.infer_to_train_mapping[ + f"{base_name}.{layer_idx}.mlp.{moe_tag}_fused_moe.gate_correction_bias" + ] = f"{base_name}.{layer_idx}.mlp.moe_statics.e_score_correction_bias" # Initialize defaultdict for expert weights from collections import defaultdict from itertools import chain - + def _generate_ranges(start, end, step=16, take=8): """生成 [start, start+take), [start+step, start+step+take), ... 直到 end""" - return chain( - *(range(i, min(i + take, end)) # 防止越界 - for i in range(start, end, step))) - + return chain(*(range(i, min(i + take, end)) for i in range(start, end, step))) # 防止越界 + expert_mappings = defaultdict(list) - for expert_idx in _generate_ranges(expert_start, total_moe_num, expert_num_per_rank * 2, expert_num_per_rank): + for expert_idx in _generate_ranges( + expert_start, + total_moe_num, + expert_num_per_rank * 2, + expert_num_per_rank, + ): for ph in place_holders: expert_mappings[f"{base_name}.{layer_idx}.mlp.{moe_tag}_fused_moe.up_gate_proj_weight"].append( f"{base_name}.{layer_idx}.mlp.experts.{expert_idx}.up_gate_proj.{ph}" @@ -273,6 +302,7 @@ class Qwen2ForCausalLMRL(Qwen2ForCausalLM, BaseRLModel): """ Qwen2ForCausalLMRL """ + _get_tensor_parallel_mappings = Qwen2PretrainedModel._get_tensor_parallel_mappings def __init__(self, fd_config: FDConfig): @@ -295,15 +325,16 @@ class Qwen2ForCausalLMRL(Qwen2ForCausalLM, BaseRLModel): # Initialize mapping dictionary self._update_base_mappings("qwen2") base_name = "qwen2.layers" + # Helper function to add layer mappings def _add_layer_mappings(layer_idx): # FFN mappings for ph in place_holders: - self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.up_gate_proj.{ph}"] = \ - f"{base_name}.{layer_idx}.mlp.gate_up_fused_proj.{ph}" + self.infer_to_train_mapping[ + f"{base_name}.{layer_idx}.mlp.up_gate_proj.{ph}" + ] = f"{base_name}.{layer_idx}.mlp.gate_up_fused_proj.{ph}" - for layer_idx in range( - self.fd_config.model_config.num_hidden_layers): + for layer_idx in range(self.fd_config.model_config.num_hidden_layers): _add_layer_mappings(layer_idx) self._complete_missing_mappings() @@ -315,6 +346,7 @@ class Qwen3MoeForCausalLMRL(Qwen3MoeForCausalLM, BaseRLModel): """ Qwen3MoeForCausalLMRL """ + _get_tensor_parallel_mappings = Qwen3MoePretrainedModel._get_tensor_parallel_mappings def __init__(self, fd_config: FDConfig): @@ -343,12 +375,14 @@ class Qwen3MoeForCausalLMRL(Qwen3MoeForCausalLM, BaseRLModel): # Helper function to add layer mappings def _add_layer_mappings(layer_idx: int): # MoE specific mappings - self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.gate_weight"] = \ - f"{base_name}.{layer_idx}.mlp.gate.weight" + self.infer_to_train_mapping[ + f"{base_name}.{layer_idx}.mlp.gate_weight" + ] = f"{base_name}.{layer_idx}.mlp.gate.weight" if self.fd_config.moe_config.moe_use_aux_free: - self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.fused_moe.gate_correction_bias"] = \ - f"{base_name}.{layer_idx}.mlp.moe_statics.e_score_correction_bias" + self.infer_to_train_mapping[ + 
f"{base_name}.{layer_idx}.mlp.fused_moe.gate_correction_bias" + ] = f"{base_name}.{layer_idx}.mlp.moe_statics.e_score_correction_bias" # MoE experts mappings for expert_idx in range(self.fd_config.moe_config.num_experts): @@ -382,6 +416,7 @@ class Qwen3ForCausalLMRL(Qwen3ForCausalLM, BaseRLModel): """ Qwen3ForCausalLMRL """ + _get_tensor_parallel_mappings = Qwen3PretrainedModel._get_tensor_parallel_mappings def __init__(self, fd_config: FDConfig): @@ -395,6 +430,6 @@ class Qwen3ForCausalLMRL(Qwen3ForCausalLM, BaseRLModel): def name(self) -> str: """name""" return "Qwen3ForCausalLMRL" - + def get_name_mappings_to_training(self) -> Dict[str, str]: pass diff --git a/fastdeploy/scheduler/__init__.py b/fastdeploy/scheduler/__init__.py index 93203be9c..df31dc52f 100644 --- a/fastdeploy/scheduler/__init__.py +++ b/fastdeploy/scheduler/__init__.py @@ -14,4 +14,6 @@ # limitations under the License. """ -from .config import SchedulerConfig \ No newline at end of file +from .config import SchedulerConfig + +__all__ = ["SchedulerConfig"] diff --git a/fastdeploy/scheduler/config.py b/fastdeploy/scheduler/config.py index f6cab4b2d..cd0a72af1 100644 --- a/fastdeploy/scheduler/config.py +++ b/fastdeploy/scheduler/config.py @@ -15,13 +15,14 @@ """ import redis + from fastdeploy.utils import llm_logger + from .global_scheduler import GlobalScheduler from .local_scheduler import LocalScheduler from .splitwise_scheduler import SplitWiseScheduler, SplitWiseSchedulerConfig - class LocalSchedulerConfig: """ Configuration class for LocalScheduler. @@ -31,16 +32,17 @@ class LocalSchedulerConfig: ttl: Time-to-live in seconds for request expiration """ - def __init__(self, - max_size: int = -1, - ttl: int = 900, - max_model_len: int = 8192, - enable_chunked_prefill: bool = False, - max_num_partial_prefills: int = 1, - max_long_partial_prefills: int = 1, - long_prefill_token_threshold: int = 0, - **kwargs - ): + def __init__( + self, + max_size: int = -1, + ttl: int = 900, + max_model_len: int = 8192, + enable_chunked_prefill: bool = False, + max_num_partial_prefills: int = 1, + max_long_partial_prefills: int = 1, + long_prefill_token_threshold: int = 0, + **kwargs, + ): """ Initialize LocalScheduler configuration. 
@@ -84,8 +86,7 @@ class LocalSchedulerConfig: llm_logger.info("LocalScheduler Configuration Information :") for k, v in self.__dict__.items(): llm_logger.info("{:<20}:{:<6}{}".format(k, "", v)) - llm_logger.info( - "=============================================================") + llm_logger.info("=============================================================") class GlobalSchedulerConfig: @@ -101,22 +102,23 @@ class GlobalSchedulerConfig: ttl: Time-to-live in seconds for Redis keys """ - def __init__(self, - host: str = "127.0.0.1", - port: int = 6379, - db: int = 0, - password=None, - topic: str = "default", - ttl: int = 900, - min_load_score: float = 3, - max_model_len: int = 8192, - load_shards_num: int = 1, - enable_chunked_prefill: bool = False, - max_num_partial_prefills: int = 1, - max_long_partial_prefills: int = 1, - long_prefill_token_threshold: int = 0, - **kwargs - ): + def __init__( + self, + host: str = "127.0.0.1", + port: int = 6379, + db: int = 0, + password=None, + topic: str = "default", + ttl: int = 900, + min_load_score: float = 3, + max_model_len: int = 8192, + load_shards_num: int = 1, + enable_chunked_prefill: bool = False, + max_num_partial_prefills: int = 1, + max_long_partial_prefills: int = 1, + long_prefill_token_threshold: int = 0, + **kwargs, + ): """ Initialize GlobalScheduler (Redis-based) configuration. @@ -190,8 +192,7 @@ class GlobalSchedulerConfig: for k, v in self.__dict__.items(): llm_logger.info("{:<20}:{:<6}{}".format(k, "", v)) self.password = password - llm_logger.info( - "=============================================================") + llm_logger.info("=============================================================") class SchedulerConfig: @@ -224,7 +225,7 @@ class SchedulerConfig: if name == "global": self.config = GlobalSchedulerConfig(**kwargs) - + if name == "splitwise": self.config = SplitWiseSchedulerConfig(**kwargs) @@ -236,7 +237,7 @@ class SchedulerConfig: Exception: If invalid scheduler type is specified """ if self.name not in ["local", "global", "splitwise"]: - raise Exception(f'Unknown scheduler type {self.name}') + raise Exception(f"Unknown scheduler type {self.name}") self.config.check() @@ -255,25 +256,29 @@ class SchedulerConfig: """ if self.name == "global": - return GlobalScheduler(host=self.config.host, - port=self.config.port, - db=self.config.db, - password=self.config.password, - topic=self.config.topic, - ttl=self.config.ttl, - min_load_score=self.config.min_load_score, - load_shards_num=self.config.load_shards_num, - enable_chunked_prefill=self.config.enable_chunked_prefill, - max_num_partial_prefills=self.config.max_num_partial_prefills, - max_long_partial_prefills=self.config.max_long_partial_prefills, - long_prefill_token_threshold=self.config.long_prefill_token_threshold,) - + return GlobalScheduler( + host=self.config.host, + port=self.config.port, + db=self.config.db, + password=self.config.password, + topic=self.config.topic, + ttl=self.config.ttl, + min_load_score=self.config.min_load_score, + load_shards_num=self.config.load_shards_num, + enable_chunked_prefill=self.config.enable_chunked_prefill, + max_num_partial_prefills=self.config.max_num_partial_prefills, + max_long_partial_prefills=self.config.max_long_partial_prefills, + long_prefill_token_threshold=self.config.long_prefill_token_threshold, + ) + if self.name == "splitwise": return SplitWiseScheduler(self.config) - return LocalScheduler(max_size=self.config.max_size, - ttl=self.config.ttl, - enable_chunked_prefill=self.config.enable_chunked_prefill, - 
max_num_partial_prefills=self.config.max_num_partial_prefills, - max_long_partial_prefills=self.config.max_long_partial_prefills, - long_prefill_token_threshold=self.config.long_prefill_token_threshold,) + return LocalScheduler( + max_size=self.config.max_size, + ttl=self.config.ttl, + enable_chunked_prefill=self.config.enable_chunked_prefill, + max_num_partial_prefills=self.config.max_num_partial_prefills, + max_long_partial_prefills=self.config.max_long_partial_prefills, + long_prefill_token_threshold=self.config.long_prefill_token_threshold, + ) diff --git a/fastdeploy/scheduler/data.py b/fastdeploy/scheduler/data.py index cde2182b3..e3b2b6345 100644 --- a/fastdeploy/scheduler/data.py +++ b/fastdeploy/scheduler/data.py @@ -14,29 +14,32 @@ # limitations under the License. """ -from datetime import datetime -import time import json +import time +from datetime import datetime + from fastdeploy.engine.request import Request, RequestOutput -class ScheduledRequest(object): +class ScheduledRequest: """ A wrapper class for Request objects with scheduling metadata. - + This class extends Request objects with: - Queue information for distributed scheduling - Timestamp tracking - Serialization capabilities """ - def __init__(self, - request: Request, - request_queue_name: str = "", - response_queue_name: str = ""): + def __init__( + self, + request: Request, + request_queue_name: str = "", + response_queue_name: str = "", + ): """ Initialize a ScheduledRequest instance. - + Args: request: The original Request object request_queue_name: Name of the request queue @@ -49,17 +52,18 @@ class ScheduledRequest(object): def __repr__(self) -> str: local_time = datetime.fromtimestamp(self.schedule_time) - formatted_time = local_time.strftime( - "%Y-%m-%d %H:%M:%S") + f"{local_time.microsecond // 1000:03d}" - return (f"request_id:{self.request_id} request_queue:{self.request_queue_name} " - f"response_queue:{self.response_queue_name} " - f"schedule_time:{formatted_time}") + formatted_time = local_time.strftime("%Y-%m-%d %H:%M:%S") + f"{local_time.microsecond // 1000:03d}" + return ( + f"request_id:{self.request_id} request_queue:{self.request_queue_name} " + f"response_queue:{self.response_queue_name} " + f"schedule_time:{formatted_time}" + ) @property def request_id(self) -> str: """ Get the request ID. - + Returns: The unique request identifier """ @@ -69,7 +73,7 @@ class ScheduledRequest(object): def request_id(self, id: str): """ Set the request ID. - + Args: id: New request identifier """ @@ -79,7 +83,7 @@ class ScheduledRequest(object): def prompt_tokens_ids_len(self) -> int: """ Get the length of prompt token IDs. - + Returns: Number of tokens in the prompt """ @@ -88,7 +92,7 @@ class ScheduledRequest(object): def serialize(self) -> bytes: """ Serialize the request to bytes for storage/transmission. - + Returns: Serialized request data as bytes """ @@ -102,13 +106,13 @@ class ScheduledRequest(object): return serialized_data.encode() @classmethod - def unserialize(cls, serialized_data: bytes) -> 'ScheduledRequest': + def unserialize(cls, serialized_data: bytes) -> "ScheduledRequest": """ Deserialize bytes back into a ScheduledRequest. - + Args: serialized_data: Serialized request data - + Returns: Reconstructed ScheduledRequest object """ @@ -121,10 +125,10 @@ class ScheduledRequest(object): return scheduled_request -class ScheduledResponse(object): +class ScheduledResponse: """ A wrapper class for RequestOutput objects with scheduling metadata. 
- + This class extends RequestOutput objects with: - Timestamp tracking - Serialization capabilities @@ -134,7 +138,7 @@ class ScheduledResponse(object): def __init__(self, response: RequestOutput): """ Initialize a ScheduledResponse instance. - + Args: response: The original RequestOutput object """ @@ -148,7 +152,7 @@ class ScheduledResponse(object): def request_id(self) -> str: """ Get the request ID. - + Returns: The unique request identifier """ @@ -158,7 +162,7 @@ class ScheduledResponse(object): def request_id(self, id: str): """ Set the request ID. - + Args: id: New request identifier """ @@ -168,7 +172,7 @@ class ScheduledResponse(object): def index(self) -> int: """ Get the output index. - + Returns: Position index of this response in the sequence """ @@ -178,7 +182,7 @@ class ScheduledResponse(object): def finished(self) -> bool: """ Check if the request is complete. - + Returns: True if this is the final response for the request """ @@ -187,7 +191,7 @@ class ScheduledResponse(object): def serialize(self) -> bytes: """ Serialize the response to bytes for storage/transmission. - + Returns: Serialized response data as bytes """ @@ -199,13 +203,13 @@ class ScheduledResponse(object): return serialized_data.encode() @classmethod - def unserialize(cls, serialized_data: bytes) -> 'ScheduledResponse': + def unserialize(cls, serialized_data: bytes) -> "ScheduledResponse": """ Deserialize bytes back into a ScheduledResponse. - + Args: serialized_data: Serialized response data - + Returns: Reconstructed ScheduledResponse object """ diff --git a/fastdeploy/scheduler/global_scheduler.py b/fastdeploy/scheduler/global_scheduler.py index fb8cb3a8e..8d9b67a6a 100644 --- a/fastdeploy/scheduler/global_scheduler.py +++ b/fastdeploy/scheduler/global_scheduler.py @@ -14,24 +14,25 @@ # limitations under the License. """ - -from typing import List, Optional, Dict, Tuple -import traceback +import random import threading import time -import random +import traceback import uuid +from typing import Dict, List, Optional, Tuple + import crcmod from redis import ConnectionPool -from fastdeploy.scheduler.storage import AdaptedRedis + from fastdeploy.engine.request import Request, RequestOutput -from fastdeploy.scheduler.data import ScheduledRequest, ScheduledResponse -from fastdeploy.scheduler.workers import Workers, Task -from fastdeploy.utils import scheduler_logger from fastdeploy.scheduler import utils +from fastdeploy.scheduler.data import ScheduledRequest, ScheduledResponse +from fastdeploy.scheduler.storage import AdaptedRedis +from fastdeploy.scheduler.workers import Task, Workers +from fastdeploy.utils import scheduler_logger -class GlobalScheduler(object): +class GlobalScheduler: """ A distributed task scheduler that manages request/response queues using Redis. 
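The __init__ hunk that follows sets up the crcmod CRC16 ("ccitt-false", the same predefined name used in the patch) that _generate_scheduler_name_and_shard later reduces modulo load_shards_num to pick a load-table shard. A minimal standalone sketch of that hashing, with a hypothetical shard count:

import crcmod.predefined

crc16 = crcmod.predefined.Crc("ccitt-false")   # same predefined CRC as the scheduler

def hash_slot(data: str) -> int:
    # GlobalScheduler serializes access to one shared Crc object with a lock;
    # copying it per call with new() keeps this sketch thread-safe without one.
    c = crc16.new()
    c.update(data.encode("utf-8"))
    return c.crcValue

load_shards_num = 4                            # hypothetical cluster-wide shard count
shard = hash_slot("scheduler-10.0.0.5") % load_shards_num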
@@ -42,20 +43,21 @@ class GlobalScheduler(object): - Maintaining worker health checks """ - def __init__(self, - host: str, - port: int, - db: int, - password: Optional[str], - topic: str, - ttl: int, - min_load_score: float, - load_shards_num: int, - enable_chunked_prefill: bool, - max_num_partial_prefills: int, - max_long_partial_prefills: int, - long_prefill_token_threshold: int, - ): + def __init__( + self, + host: str, + port: int, + db: int, + password: Optional[str], + topic: str, + ttl: int, + min_load_score: float, + load_shards_num: int, + enable_chunked_prefill: bool, + max_num_partial_prefills: int, + max_long_partial_prefills: int, + long_prefill_token_threshold: int, + ): """ Initialize the GlobalScheduler with Redis connection and scheduling parameters. @@ -94,29 +96,25 @@ class GlobalScheduler(object): self.blpop_response_timeout = 10 self.crc16_mutex = threading.Lock() - self.crc16 = crcmod.predefined.Crc('ccitt-false') + self.crc16 = crcmod.predefined.Crc("ccitt-false") self.load_slot_for_getting_request = 0 - self.load_offset = 0 # const - self.load_count = 50 # const + self.load_offset = 0 # const + self.load_count = 50 # const self.load_lookup_num = 5 # const self.keep_alive_duration = 30 # const - connection_pool = ConnectionPool( - host=host, port=port, db=db, password=password, max_connections=10) + connection_pool = ConnectionPool(host=host, port=port, db=db, password=password, max_connections=10) self.client = AdaptedRedis(connection_pool=connection_pool) self.name, self.shard = self._generate_scheduler_name_and_shard() - self.keep_alive_workers = threading.Thread( - target=self._keep_alive, daemon=True) + self.keep_alive_workers = threading.Thread(target=self._keep_alive, daemon=True) self.keep_alive_workers.start() - self.put_requests_workers = Workers( - "put_requests_workers", self._put_requests_worker, 20) + self.put_requests_workers = Workers("put_requests_workers", self._put_requests_worker, 20) self.put_requests_workers.start(1) - self.put_results_workers = Workers( - "put_results_workers", self._put_results_worker, 300) + self.put_results_workers = Workers("put_results_workers", self._put_results_worker, 300) self.put_results_workers.start(1) self.mutex = threading.Lock() @@ -124,12 +122,10 @@ class GlobalScheduler(object): self.local_responses: Dict[str, List[ScheduledResponse]] = dict() self.stolen_requests: Dict[str, ScheduledRequest] = dict() - self.get_response_workers = threading.Thread( - target=self._get_results_worker, daemon=True) + self.get_response_workers = threading.Thread(target=self._get_results_worker, daemon=True) self.get_response_workers.start() - scheduler_logger.info( - f"Scheduler: name={self.name} redis_version={self.client.version}") + scheduler_logger.info(f"Scheduler: name={self.name} redis_version={self.client.version}") def _get_hash_slot(self, data: str) -> int: """ @@ -184,8 +180,8 @@ class GlobalScheduler(object): 4. Handles naming conflicts by appending incrementing suffixes Returns: - Tuple[str, int]: - - str: Unique scheduler name + Tuple[str, int]: + - str: Unique scheduler name - int: Assigned shard number (0 to load_shards_num-1) Implementation Details: @@ -202,21 +198,28 @@ class GlobalScheduler(object): try: _, name = utils.get_hostname_ip() except Exception as e: - scheduler_logger.warning( - f"Scheduler encountered an error while resolving the IP address. {e}") + scheduler_logger.warning(f"Scheduler encountered an error while resolving the IP address. 
{e}") name = str(uuid.uuid4()) size = len(name) count = 1 while True: - if self.client.set(self._instance_name(name), "", ex=self.keep_alive_duration, nx=True): + if self.client.set( + self._instance_name(name), + "", + ex=self.keep_alive_duration, + nx=True, + ): break name = f"{name[:size]}:{count}" count += 1 shard = self._get_hash_slot(name) % self.load_shards_num - self.client.set(self._instance_name(name), self._load_table_name(shard=shard), - ex=self.keep_alive_duration) + self.client.set( + self._instance_name(name), + self._load_table_name(shard=shard), + ex=self.keep_alive_duration, + ) return name, shard def _keep_alive(self): @@ -227,8 +230,11 @@ class GlobalScheduler(object): """ while True: try: - self.client.set(self._instance_name( - self.name), self._load_table_name(), ex=self.keep_alive_duration) + self.client.set( + self._instance_name(self.name), + self._load_table_name(), + ex=self.keep_alive_duration, + ) time.sleep(self.keep_alive_duration / 2) except Exception as e: scheduler_logger.error(f"Scheduler keep alive failed: {e}") @@ -324,7 +330,7 @@ class GlobalScheduler(object): mark = f"mark<{request_queue_name}>" if not response.request_id.startswith(mark): return - response.request_id = response.request_id[len(mark):] + response.request_id = response.request_id[len(mark) :] def _put_requests_worker(self, tasks: List[Task]) -> List[Task]: """ @@ -341,7 +347,10 @@ class GlobalScheduler(object): with self.mutex: for task in tasks: request = ScheduledRequest( - task.raw, self._request_queue_name(), self._response_queue_name()) + task.raw, + self._request_queue_name(), + self._response_queue_name(), + ) task.raw = None if request.request_id in self.local_responses: @@ -353,18 +362,21 @@ class GlobalScheduler(object): if len(requests) > 0: serialized_requests = [request.serialize() for request in requests] - self.client.rpush(self._request_queue_name(), * - serialized_requests, ttl=self.ttl) - self.client.zincrby(self._load_table_name(), - len(serialized_requests), self.name, - rem_amount=0, ttl=self.ttl) - scheduler_logger.info( - f"Scheduler has enqueued some requests: {requests}") + self.client.rpush(self._request_queue_name(), *serialized_requests, ttl=self.ttl) + self.client.zincrby( + self._load_table_name(), + len(serialized_requests), + self.name, + rem_amount=0, + ttl=self.ttl, + ) + scheduler_logger.info(f"Scheduler has enqueued some requests: {requests}") if duplicate: scheduler_logger.warning( "Scheduler has received some duplicated requests: " - f"{[task for task in tasks if task.reason is not None]}") + f"{[task for task in tasks if task.reason is not None]}" + ) return tasks def put_requests(self, requests: List[Request]) -> List[Tuple[str, Optional[str]]]: @@ -386,8 +398,14 @@ class GlobalScheduler(object): results = self.put_requests_workers.get_results(10, 0.001) return [(result.id, result.reason) for result in results] - def get_requests(self, available_blocks, block_size, reserved_output_blocks, - max_num_batched_tokens, batch=1) -> List[Request]: + def get_requests( + self, + available_blocks, + block_size, + reserved_output_blocks, + max_num_batched_tokens, + batch=1, + ) -> List[Request]: """ Get requests from the shared cache based on available resources. 
@@ -406,7 +424,8 @@ class GlobalScheduler(object): scheduler_logger.debug( f"Scheduler's resource are insufficient: available_blocks={available_blocks} " f"reserved_output_blocks={reserved_output_blocks} batch={batch} " - f"max_num_batched_tokens={max_num_batched_tokens}") + f"max_num_batched_tokens={max_num_batched_tokens}" + ) return [] mini_batch = (batch + 1) // 2 @@ -424,37 +443,38 @@ class GlobalScheduler(object): local_request_queue_name = self._request_queue_name() serialized_requests: List[Tuple[str, bytes]] = [] for bs in batches: - elements = self.client.lpop( - local_request_queue_name, bs, ttl=self.ttl) + elements = self.client.lpop(local_request_queue_name, bs, ttl=self.ttl) if elements is None: break - self.client.zincrby(self._load_table_name(), - - len(elements), self.name, rem_amount=0, ttl=self.ttl) - serialized_requests += [(local_request_queue_name, element) - for element in elements] + self.client.zincrby( + self._load_table_name(), + -len(elements), + self.name, + rem_amount=0, + ttl=self.ttl, + ) + serialized_requests += [(local_request_queue_name, element) for element in elements] extend_scheduler_names = [] extend_scheduler_load_table_name = "" if len(serialized_requests) == 0 and len(batches) > 0: for _ in range(min(self.load_lookup_num, self.load_shards_num)): - extend_scheduler_load_table_name = self._load_table_name( - slot=self.load_slot_for_getting_request) + extend_scheduler_load_table_name = self._load_table_name(slot=self.load_slot_for_getting_request) serialized_members = self.client.zrangebyscore( extend_scheduler_load_table_name, self.min_load_score, float("+inf"), start=self.load_offset, - num=self.load_count) + num=self.load_count, + ) self.load_slot_for_getting_request += 1 if len(serialized_members) > 0: break members = [member.decode("utf-8") for member in serialized_members] if len(members) > 0: - extend_scheduler_names = random.sample( - members, k=min(10, len(members))) - extend_scheduler_names = [ - name for name in extend_scheduler_names if name != self.name] + extend_scheduler_names = random.sample(members, k=min(10, len(members))) + extend_scheduler_names = [name for name in extend_scheduler_names if name != self.name] # find lucky one if len(extend_scheduler_names) > 0: @@ -463,40 +483,43 @@ class GlobalScheduler(object): elements = self.client.lpop(lucky_request_queue_name, batches[0]) if elements is not None and len(elements) > 0: - self.client.zincrby(extend_scheduler_load_table_name, - -len(elements), lucky, rem_amount=0, ttl=self.ttl) - serialized_requests += [(lucky_request_queue_name, element) - for element in elements] + self.client.zincrby( + extend_scheduler_load_table_name, + -len(elements), + lucky, + rem_amount=0, + ttl=self.ttl, + ) + serialized_requests += [(lucky_request_queue_name, element) for element in elements] scheduler_logger.info( f"Scheduler {self.name} has stolen some requests from another lucky one. 
" - f"(name={lucky} num={len(serialized_requests)})") + f"(name={lucky} num={len(serialized_requests)})" + ) else: exist_num = self.client.exists(self._instance_name(lucky)) if exist_num == 0: if self.client.zrem(extend_scheduler_load_table_name, lucky): - scheduler_logger.info( - f"Scheduler {lucky} has been removed") + scheduler_logger.info(f"Scheduler {lucky} has been removed") # blocked read if len(serialized_requests) == 0: request_queue_names = [local_request_queue_name] - request_queue_names += [ - self._request_queue_name(name) for name in extend_scheduler_names] + request_queue_names += [self._request_queue_name(name) for name in extend_scheduler_names] - element = self.client.blpop( - request_queue_names, self.blpop_request_timeout) + element = self.client.blpop(request_queue_names, self.blpop_request_timeout) if element is None: return [] request_queue_name = element[0].decode("utf-8") - scheduler_name = self._scheduler_name_from_request_queue( - request_queue_name) - load_table_name = extend_scheduler_load_table_name if scheduler_name != self.name else self._load_table_name() - self.client.zincrby(load_table_name, - -1, scheduler_name, rem_amount=0, ttl=self.ttl) + scheduler_name = self._scheduler_name_from_request_queue(request_queue_name) + load_table_name = ( + extend_scheduler_load_table_name if scheduler_name != self.name else self._load_table_name() + ) + self.client.zincrby(load_table_name, -1, scheduler_name, rem_amount=0, ttl=self.ttl) serialized_requests.append((request_queue_name, element[1])) if scheduler_name != self.name: scheduler_logger.info( - f"Scheduler {self.name} has stolen a request from another scheduler. (name={scheduler_name})") + f"Scheduler {self.name} has stolen a request from another scheduler. (name={scheduler_name})" + ) long_partial_requests = 0 short_partial_requests = 0 @@ -506,41 +529,34 @@ class GlobalScheduler(object): scheduled_requests: List[ScheduledRequest] = [] for request_queue_name, serialized_request in serialized_requests: if len(remaining_request) > 0: - remaining_request.append( - (request_queue_name, serialized_request)) + remaining_request.append((request_queue_name, serialized_request)) continue - request: ScheduledRequest = ScheduledRequest.unserialize( - serialized_request) - required_input_blocks = self.calc_required_blocks( - request.prompt_tokens_ids_len, block_size) + request: ScheduledRequest = ScheduledRequest.unserialize(serialized_request) + required_input_blocks = self.calc_required_blocks(request.prompt_tokens_ids_len, block_size) current_prefill_tokens += request.prompt_tokens_ids_len required_total_blocks += required_input_blocks + reserved_output_blocks if required_total_blocks > available_blocks: - remaining_request.append( - (request_queue_name, serialized_request)) + remaining_request.append((request_queue_name, serialized_request)) continue if self.enable_chunked_prefill: if request.prompt_tokens_ids_len > self.long_prefill_token_threshold: long_partial_requests += 1 if long_partial_requests > self.max_long_partial_prefills: - remaining_request.append( - (request_queue_name, serialized_request)) + remaining_request.append((request_queue_name, serialized_request)) continue else: short_partial_requests += 1 if short_partial_requests + long_partial_requests > self.max_num_partial_prefills: - remaining_request.append( - (request_queue_name, serialized_request)) + remaining_request.append((request_queue_name, serialized_request)) continue else: if current_prefill_tokens > max_num_batched_tokens: - 
remaining_request.append( - (request_queue_name, serialized_request)) + remaining_request.append((request_queue_name, serialized_request)) continue scheduled_requests.append(request) @@ -556,11 +572,9 @@ class GlobalScheduler(object): self.stolen_requests[request.request_id] = request continue - scheduler_logger.error( - f"Scheduler has received a duplicate request from others: {request}") + scheduler_logger.error(f"Scheduler has received a duplicate request from others: {request}") - requests: List[Request] = [ - request.raw for request in scheduled_requests] + requests: List[Request] = [request.raw for request in scheduled_requests] if len(remaining_request) > 0: group: Dict[str, List] = dict() for request_queue_name, serialized_request in remaining_request: @@ -569,23 +583,26 @@ class GlobalScheduler(object): group[request_queue_name].append(serialized_request) for request_queue_name, serialized_requests in group.items(): - self.client.lpush(request_queue_name, * - serialized_requests) - scheduler_name = self._scheduler_name_from_request_queue( - request_queue_name) - load_table_name = extend_scheduler_load_table_name if scheduler_name != self.name else self._load_table_name() - self.client.zincrby(load_table_name, - len(serialized_requests), scheduler_name, ttl=self.ttl) + self.client.lpush(request_queue_name, *serialized_requests) + scheduler_name = self._scheduler_name_from_request_queue(request_queue_name) + load_table_name = ( + extend_scheduler_load_table_name if scheduler_name != self.name else self._load_table_name() + ) + self.client.zincrby( + load_table_name, + len(serialized_requests), + scheduler_name, + ttl=self.ttl, + ) - scheduler_logger.info( - f"Scheduler has put remaining request into the queue: {len(remaining_request)}") + scheduler_logger.info(f"Scheduler has put remaining request into the queue: {len(remaining_request)}") if len(requests) == 0: scheduler_logger.debug( - f"Scheduler has put all just-pulled request into the queue: {len(remaining_request)}") + f"Scheduler has put all just-pulled request into the queue: {len(remaining_request)}" + ) if len(requests) > 0: - scheduler_logger.info( - f"Scheduler has pulled some request: {[request.request_id for request in requests]}") + scheduler_logger.info(f"Scheduler has pulled some request: {[request.request_id for request in requests]}") return requests def _put_results_worker(self, tasks: List[Task]): @@ -623,17 +640,15 @@ class GlobalScheduler(object): if response.request_id in stolen_request_id_request_queue: response_queue_name = stolen_request_id_response_queue[response.request_id] - request_queue_name = stolen_request_id_request_queue[response.request_id] + # request_queue_name = stolen_request_id_request_queue[response.request_id] # self._unmark_response(response, request_queue_name) if response_queue_name not in stolen_responses: stolen_responses[response_queue_name] = [] - stolen_responses[response_queue_name].append( - response.serialize()) + stolen_responses[response_queue_name].append(response.serialize()) continue - scheduler_logger.error( - f"Scheduler has recieved a non-existent response from engine: {[response]}") + scheduler_logger.error(f"Scheduler has recieved a non-existent response from engine: {[response]}") with self.mutex: for request_id, responses in local_responses.items(): @@ -648,8 +663,7 @@ class GlobalScheduler(object): self.local_response_not_empty.notify_all() if len(finished_request_ids) > 0: - scheduler_logger.info( - f"Scheduler has received some finished responses: 
{finished_request_ids}") + scheduler_logger.info(f"Scheduler has received some finished responses: {finished_request_ids}") for response_queue_name, responses in stolen_responses.items(): self.client.rpush(response_queue_name, *responses, ttl=self.ttl) @@ -663,8 +677,7 @@ class GlobalScheduler(object): Args: results: List of RequestOutput objects to return """ - tasks: List[Task] = [Task(result.request_id, result) - for result in results] + tasks: List[Task] = [Task(result.request_id, result) for result in results] self.put_results_workers.add_tasks(tasks) # ---- for test ---- @@ -684,20 +697,20 @@ class GlobalScheduler(object): """ while True: try: - serialized_responses = self.client.lpop( - self._response_queue_name(), 300, ttl=self.ttl) + serialized_responses = self.client.lpop(self._response_queue_name(), 300, ttl=self.ttl) if serialized_responses is None or len(serialized_responses) == 0: element = self.client.blpop( - [self._response_queue_name()], self.blpop_response_timeout) + [self._response_queue_name()], + self.blpop_response_timeout, + ) if element is None or len(element) == 0: continue serialized_responses = [element[1]] responses: Dict[str, List[ScheduledResponse]] = dict() for serialized_response in serialized_responses: - response = ScheduledResponse.unserialize( - serialized_response) + response = ScheduledResponse.unserialize(serialized_response) if response.request_id not in responses: responses[response.request_id] = [] responses[response.request_id].append(response) @@ -707,13 +720,15 @@ class GlobalScheduler(object): if request_id not in self.local_responses: scheduler_logger.error( "Scheduler has received some non-existent response from the queue. " - f"response:{contents} queue:{self._response_queue_name()}") + f"response:{contents} queue:{self._response_queue_name()}" + ) continue self.local_responses[request_id] += contents self.local_response_not_empty.notify_all() except Exception as e: - scheduler_logger.error(f"Scheduler get_results_worker exception: {e} " - f"traceback: {traceback.format_exc()}") + scheduler_logger.error( + f"Scheduler get_results_worker exception: {e} " f"traceback: {traceback.format_exc()}" + ) def get_results(self) -> Dict[str, List[RequestOutput]]: """ @@ -732,7 +747,7 @@ class GlobalScheduler(object): 4. 
Automatically cleans up completed request tracking Returns: - Dict[str, List[RequestOutput]]: + Dict[str, List[RequestOutput]]: A dictionary where: - Key is the request ID - Value is a list of RequestOutput objects for that request @@ -765,8 +780,7 @@ class GlobalScheduler(object): return responses with self.local_response_not_empty: - responses: Dict[str, List[ScheduledResponse]] = self.local_response_not_empty.wait_for( - _get_results, 0.001) + responses: Dict[str, List[ScheduledResponse]] = self.local_response_not_empty.wait_for(_get_results, 0.001) results: Dict[str, List[RequestOutput]] = dict() for request_id, resps in responses.items(): @@ -778,8 +792,7 @@ class GlobalScheduler(object): if finished: del self.local_responses[request_id] - scheduler_logger.info( - f"Scheduler has pulled a finished response: {[request_id]}") + scheduler_logger.info(f"Scheduler has pulled a finished response: {[request_id]}") return results def reset(self): @@ -800,14 +813,13 @@ class GlobalScheduler(object): - Clears the local_responses dictionary tracking pending responses - Clears the stolen_requests dictionary tracking requests taken from other schedulers - Note: + Note: - Uses the scheduler's mutex to ensure thread safety - Does not affect other scheduler instances in the cluster - After reset, the scheduler will need to be reinitialized to be usable again """ with self.mutex: - self.client.delete(self._request_queue_name(), - self._response_queue_name()) + self.client.delete(self._request_queue_name(), self._response_queue_name()) self.client.zrem(self._load_table_name(), self.name) self.local_responses = dict() self.stolen_requests = dict() @@ -843,9 +855,10 @@ class GlobalScheduler(object): self.load_shards_num = load_shards_num if reallocate: - self.shard = self._get_hash_slot( - self.name) % self.load_shards_num + self.shard = self._get_hash_slot(self.name) % self.load_shards_num - scheduler_logger.info("Scheduler has reload config, " - f"load_shards_num({old_load_shards_num} => {self.load_shards_num}) " - f"shard({old_shard} => {self.shard})") + scheduler_logger.info( + "Scheduler has reload config, " + f"load_shards_num({old_load_shards_num} => {self.load_shards_num}) " + f"shard({old_shard} => {self.shard})" + ) diff --git a/fastdeploy/scheduler/local_scheduler.py b/fastdeploy/scheduler/local_scheduler.py index 8f0f5e8d2..5d79e5009 100644 --- a/fastdeploy/scheduler/local_scheduler.py +++ b/fastdeploy/scheduler/local_scheduler.py @@ -23,7 +23,7 @@ from fastdeploy.scheduler.data import ScheduledRequest, ScheduledResponse from fastdeploy.utils import scheduler_logger -class LocalScheduler(object): +class LocalScheduler: """ A local in-memory task scheduler for request/response management. @@ -142,7 +142,7 @@ class LocalScheduler(object): expired_ids = [] for request_id in self.ids: request = self.requests[request_id] - if (now - request.schedule_time < self.ttl): + if now - request.schedule_time < self.ttl: break expired_ids.append(request.request_id) @@ -157,8 +157,7 @@ class LocalScheduler(object): else: self.ids_read_cursor -= len(expired_ids) - def put_requests( - self, requests: List[Request]) -> List[Tuple[str, Optional[str]]]: + def put_requests(self, requests: List[Request]) -> List[Tuple[str, Optional[str]]]: """ Add new requests to the scheduler queue. 
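As the next hunk shows, put_requests answers with one (request_id, reason) tuple per submitted request, where reason is None on success and a message such as "duplicated request_id" on rejection. A hedged sketch of consuming that contract; the scheduler and requests objects are assumed to already exist:

results = scheduler.put_requests(requests)   # scheduler: a LocalScheduler; requests: List[Request]
for request_id, reason in results:
    if reason is not None:
        # e.g. "duplicated request_id" or the max_size rejection message in the next hunk
        print(f"request {request_id} rejected: {reason}")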
@@ -171,8 +170,7 @@ class LocalScheduler(object): """ with self.mutex: self._recycle() - if self.max_size > 0 and len( - self.requests) + len(requests) > self.max_size: + if self.max_size > 0 and len(self.requests) + len(requests) > self.max_size: msg = f"Exceeding the max length of the local scheduler (max_size={self.max_size})" return [(request.request_id, msg) for request in requests] @@ -183,8 +181,7 @@ class LocalScheduler(object): duplicated_ids.append(request.request_id) else: scheduled_request = ScheduledRequest(request) - self.requests[ - scheduled_request.request_id] = scheduled_request + self.requests[scheduled_request.request_id] = scheduled_request valid_ids.append(scheduled_request.request_id) self.ids += valid_ids @@ -192,13 +189,10 @@ class LocalScheduler(object): scheduler_logger.info(f"Scheduler has enqueued some requests: {valid_ids}") if len(duplicated_ids) > 0: - scheduler_logger.warning( - f"Scheduler has received some duplicated requests: {duplicated_ids}" - ) + scheduler_logger.warning(f"Scheduler has received some duplicated requests: {duplicated_ids}") results = [(request_id, None) for request_id in valid_ids] - results += [(request_id, "duplicated request_id") - for request_id in duplicated_ids] + results += [(request_id, "duplicated request_id") for request_id in duplicated_ids] return results def calc_required_blocks(self, token_num, block_size): @@ -214,12 +208,14 @@ class LocalScheduler(object): """ return (token_num + block_size - 1) // block_size - def get_requests(self, - available_blocks, - block_size, - reserved_output_blocks, - max_num_batched_tokens, - batch=1) -> List[Request]: + def get_requests( + self, + available_blocks, + block_size, + reserved_output_blocks, + max_num_batched_tokens, + batch=1, + ) -> List[Request]: """ Retrieve requests from the scheduler based on available resources. 
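The get_requests hunk that follows blocks on threading.Condition.wait_for with a predicate returning a slice of pending ids, so the call yields either a non-empty batch or an empty list after the timeout. A minimal standalone sketch of that pattern (not FastDeploy code):

import threading

mutex = threading.Lock()
requests_not_empty = threading.Condition(mutex)
ids: list = []
cursor = 0

def put(req_id: str) -> None:
    with requests_not_empty:
        ids.append(req_id)
        requests_not_empty.notify_all()

def get_batch(batch: int, timeout: float) -> list:
    global cursor
    with requests_not_empty:
        # wait_for re-evaluates the predicate on every notify; an empty slice is
        # falsy, so after `timeout` seconds it simply returns [].
        batch_ids = requests_not_empty.wait_for(lambda: ids[cursor:cursor + batch], timeout)
        cursor += len(batch_ids)
        return batch_ids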
@@ -237,13 +233,15 @@ class LocalScheduler(object): scheduler_logger.debug( f"Scheduler's resource are insufficient: available_blocks={available_blocks} " f"reserved_output_blocks={reserved_output_blocks} batch={batch} " - f"max_num_batched_tokens={max_num_batched_tokens}") + f"max_num_batched_tokens={max_num_batched_tokens}" + ) return [] with self.requests_not_empty: batch_ids = self.requests_not_empty.wait_for( - lambda: self.ids[self.ids_read_cursor:self.ids_read_cursor + - batch], self.wait_request_timeout) + lambda: self.ids[self.ids_read_cursor : self.ids_read_cursor + batch], + self.wait_request_timeout, + ) required_total_blocks = 0 current_prefill_tokens = 0 @@ -251,8 +249,7 @@ class LocalScheduler(object): long_partial_requests, short_partial_requests = 0, 0 for request_id in batch_ids: request = self.requests[request_id] - required_input_blocks = self.calc_required_blocks( - request.prompt_tokens_ids_len, block_size) + required_input_blocks = self.calc_required_blocks(request.prompt_tokens_ids_len, block_size) current_prefill_tokens += request.prompt_tokens_ids_len required_total_blocks += required_input_blocks + reserved_output_blocks if required_total_blocks > available_blocks: @@ -277,14 +274,10 @@ class LocalScheduler(object): self.ids_read_cursor += len(requests) if len(batch_ids) > 0 and len(requests) == 0: - scheduler_logger.debug( - f"Scheduler has put all just-pulled request into the queue: {len(batch_ids)}" - ) + scheduler_logger.debug(f"Scheduler has put all just-pulled request into the queue: {len(batch_ids)}") if len(requests) > 0: - scheduler_logger.info( - f"Scheduler has pulled some request: {[request.request_id for request in requests]}" - ) + scheduler_logger.info(f"Scheduler has pulled some request: {[request.request_id for request in requests]}") return requests @@ -295,24 +288,16 @@ class LocalScheduler(object): Args: results: List of RequestOutput objects containing results """ - responses: List[ScheduledResponse] = [ - ScheduledResponse(result) for result in results - ] + responses: List[ScheduledResponse] = [ScheduledResponse(result) for result in results] - finished_responses = [ - response.request_id for response in responses if response.finished - ] + finished_responses = [response.request_id for response in responses if response.finished] if len(finished_responses) > 0: - scheduler_logger.info( - f"Scheduler has received some finished responses: {finished_responses}" - ) + scheduler_logger.info(f"Scheduler has received some finished responses: {finished_responses}") with self.mutex: for response in responses: if response.request_id not in self.requests: - scheduler_logger.warning( - f"Scheduler has received a expired response: {[response.request_id]}" - ) + scheduler_logger.warning(f"Scheduler has received a expired response: {[response.request_id]}") continue if response.request_id not in self.responses: @@ -351,8 +336,7 @@ class LocalScheduler(object): return responses with self.responses_not_empty: - responses = self.responses_not_empty.wait_for( - _get_results, self.wait_response_timeout) + responses = self.responses_not_empty.wait_for(_get_results, self.wait_response_timeout) results = dict() for request_id, resps in responses.items(): @@ -364,7 +348,5 @@ class LocalScheduler(object): if finished: self._recycle(request_id) - scheduler_logger.info( - f"Scheduler has pulled a finished response: {[request_id]}" - ) + scheduler_logger.info(f"Scheduler has pulled a finished response: {[request_id]}") return results diff --git 
a/fastdeploy/scheduler/splitwise_scheduler.py b/fastdeploy/scheduler/splitwise_scheduler.py index be4534974..61dbd2230 100644 --- a/fastdeploy/scheduler/splitwise_scheduler.py +++ b/fastdeploy/scheduler/splitwise_scheduler.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + import copy import hashlib import math @@ -25,34 +26,40 @@ from typing import List import orjson import redis -from fastdeploy.engine.request import (CompletionOutput, Request, - RequestMetrics, RequestOutput) +from fastdeploy.engine.request import ( + CompletionOutput, + Request, + RequestMetrics, + RequestOutput, +) from fastdeploy.utils import scheduler_logger as logger -class SplitWiseSchedulerConfig(object): +class SplitWiseSchedulerConfig: """SplitWise Scheduler Configuration""" def __init__( - self, - nodeid=None, - host="127.0.0.1", # redis host - port=6379, # redis port - password=None, # redis password - topic="fd", # redis topic - ttl=900, - release_load_expire_period=600, #s - sync_period=5, #ms - expire_period=3000, #ms - clear_expired_nodes_period=60, #s - reader_parallel=4, - reader_batch_size=200, - writer_parallel=4, - writer_batch_size=200, - **kwargs): + self, + nodeid=None, + host="127.0.0.1", # redis host + port=6379, # redis port + password=None, # redis password + topic="fd", # redis topic + ttl=900, + release_load_expire_period=600, # s + sync_period=5, # ms + expire_period=3000, # ms + clear_expired_nodes_period=60, # s + reader_parallel=4, + reader_batch_size=200, + writer_parallel=4, + writer_batch_size=200, + **kwargs, + ): if nodeid is None: import uuid + nodeid = str(uuid.uuid4()) self.nodeid = nodeid @@ -64,7 +71,7 @@ class SplitWiseSchedulerConfig(object): self.release_load_expire_period = release_load_expire_period self.sync_period = sync_period - self.expire_period = expire_period / 1000. 
+ self.expire_period = expire_period / 1000.0 self.clear_expired_nodes_period = clear_expired_nodes_period self.reader_parallel = reader_parallel self.reader_batch_size = reader_batch_size @@ -82,13 +89,12 @@ class SplitWiseSchedulerConfig(object): logger.info("LocalScheduler Configuration Information :") for k, v in self.__dict__.items(): logger.info("{:<20}:{:<6}{}".format(k, "", v)) - logger.info( - "=============================================================") + logger.info("=============================================================") -class SplitWiseScheduler(object): +class SplitWiseScheduler: """ - SplitWise Scheduler + SplitWise Scheduler """ def __init__(self, config): @@ -97,68 +103,73 @@ class SplitWiseScheduler(object): def start(self, role, host, disaggregated): """ - Start APIScheduler and InferScheduler backup threads + Start APIScheduler and InferScheduler backup threads """ - logger.info( - f"Scheduler Start With: role:{role}, host:{host}, disaggregated:{disaggregated}" - ) + logger.info(f"Scheduler Start With: role:{role}, host:{host}, disaggregated:{disaggregated}") self.infer.start(role, host, disaggregated) self.scheduler.start() def reset_nodeid(self, nodeid): """ - reset node id + reset node id """ self.scheduler.nodeid = nodeid self.infer.nodeid = nodeid def put_requests(self, reqs: List[Request]): """ - put requests to global splitwise scheduler + put requests to global splitwise scheduler """ return self.scheduler.put_requests(reqs) def get_results(self, request_ids=[]): """ - get results from global splitwise scheduler + get results from global splitwise scheduler """ return self.scheduler.get_results() - def get_requests(self, - available_blocks, - block_size, - reserved_output_blocks, - max_num_batched_tokens, - batch=1): + def get_requests( + self, + available_blocks, + block_size, + reserved_output_blocks, + max_num_batched_tokens, + batch=1, + ): """ - get scheduled requests from global spltiwise scheduler + get scheduled requests from global spltiwise scheduler """ if available_blocks <= reserved_output_blocks or batch < 1: logger.info( f"Scheduler's resource are insufficient: available_blocks={available_blocks} " f"reserved_output_blocks={reserved_output_blocks} batch={batch} " - f"max_num_batched_tokens={max_num_batched_tokens}") + f"max_num_batched_tokens={max_num_batched_tokens}" + ) return [] - return self.infer.get_requests(available_blocks, block_size, - reserved_output_blocks, - max_num_batched_tokens, batch) + return self.infer.get_requests( + available_blocks, + block_size, + reserved_output_blocks, + max_num_batched_tokens, + batch, + ) def put_results(self, results: List[RequestOutput]): """ - put results to global splitwise scheduler + put results to global splitwise scheduler """ return self.infer.put_results(results) -class NodeInfo(object): +class NodeInfo: """ - Infer Node Info: load, rdma/ipc info + Infer Node Info: load, rdma/ipc info """ @classmethod def load_from(self, nodeid, info): """ - load node info from seiralized string + load node info from seiralized string """ health = orjson.loads(info) ts = health["ts"] @@ -168,8 +179,7 @@ class NodeInfo(object): disaggregated = health["disaggregated"] return NodeInfo(nodeid, role, host, disaggregated, load, ts) - def __init__(self, nodeid, role, host, disaggregated, load, - ts=time.time()): + def __init__(self, nodeid, role, host, disaggregated, load, ts=time.time()): self.nodeid = nodeid self.ts = ts self.host = host @@ -184,14 +194,14 @@ class NodeInfo(object): def expired(self, 
expire_period): """ - APIScheduler used to check if the node is expired + APIScheduler used to check if the node is expired """ now = time.time() return (now - self.ts) > expire_period def serialize(self): """ - InferScheduler used to sync load + InferScheduler used to sync load """ self.ts = time.time() health = { @@ -199,7 +209,7 @@ class NodeInfo(object): "role": self.role, "load": self.load, "host": self.host, - "disaggregated": self.disaggregated + "disaggregated": self.disaggregated, } return orjson.dumps(health) @@ -208,7 +218,7 @@ class NodeInfo(object): def expire_reqs(self, ttl): """ - InferScheduler used to clear expired reqs + InferScheduler used to clear expired reqs """ cur_time = time.time() with self.lock: @@ -216,9 +226,7 @@ class NodeInfo(object): for req_id, pairs in self.reqs.items(): load, arrival_time = pairs if cur_time - arrival_time > ttl: - logger.error( - f"InferScheduler Expire Reqs({req_id}), arrival({arrival_time}), ttl({ttl})" - ) + logger.error(f"InferScheduler Expire Reqs({req_id}), arrival({arrival_time}), ttl({ttl})") expire_reqs.add((req_id, load)) for req_id, load in expire_reqs: if req_id in self.reqs: @@ -227,7 +235,7 @@ class NodeInfo(object): def add_req(self, req_id, load): """ - InferScheduler used to record scheduled reqs(waiting or running) + InferScheduler used to record scheduled reqs(waiting or running) """ with self.lock: if req_id not in self.reqs: @@ -236,7 +244,7 @@ class NodeInfo(object): def update_req_timestamp(self, req_ids): """ - InferScheduler used to update reqs timestamp + InferScheduler used to update reqs timestamp """ cur_time = time.time() with self.lock: @@ -246,7 +254,7 @@ class NodeInfo(object): def finish_req(self, req_id): """ - InferScheduler used to clear finished reqs + InferScheduler used to clear finished reqs """ with self.lock: if req_id in self.reqs: @@ -255,9 +263,9 @@ class NodeInfo(object): del self.reqs[req_id] -class ResultReader(object): +class ResultReader: """ - ResultReader use an async thread to continue get infer result from redis + ResultReader use an async thread to continue get infer result from redis """ def __init__(self, client, idx, batch=200, ttl=900, group=""): @@ -277,7 +285,7 @@ class ResultReader(object): def add_req(self, req): """ - add a req to reader, reader will async fetch infer result from redis + add a req to reader, reader will async fetch infer result from redis """ with self.lock: self.reqs[req.request_id] = {"arrival_time": req.arrival_time} @@ -285,8 +293,8 @@ class ResultReader(object): def read(self): """ - batch read infer results - returns: dict(req_id, [ResultOutput]) + batch read infer results + returns: dict(req_id, [ResultOutput]) """ items = [] size = len(self.data) @@ -335,7 +343,7 @@ class ResultReader(object): def run(self): """ - continue fetch infer results from redis + continue fetch infer results from redis """ while True: try: @@ -344,21 +352,19 @@ class ResultReader(object): with self.lock: expired_reqs = set() for req_id, req in self.reqs.items(): - if cur_time - req.get("arrival_time", - cur_time) > self.ttl: + if cur_time - req.get("arrival_time", cur_time) > self.ttl: result = RequestOutput( request_id=req_id, prompt="", prompt_token_ids=[], outputs=CompletionOutput(-1, -1, []), - metrics=RequestMetrics( - arrival_time=req["arrival_time"]), + metrics=RequestMetrics(arrival_time=req["arrival_time"]), error_code=500, - error_msg=f"Req({req_id}) is expired({self.ttl})") + error_msg=f"Req({req_id}) is expired({self.ttl})", + ) self.data.appendleft(result) - 
logger.error( - f"Req({req_id}) is expired({self.ttl})") + logger.error(f"Req({req_id}) is expired({self.ttl})") expired_reqs.add(req_id) continue keys.append(req_id) @@ -373,22 +379,21 @@ class ResultReader(object): if total == 0: time.sleep(0.01) except Exception as e: - logger.error( - f"ResultsReader{self.idx} sync results error: {str(e)}") + logger.error(f"ResultsReader{self.idx} sync results error: {e!s}") def sync_results(self, keys): """ - fetch infer results from redis for the give keys + fetch infer results from redis for the give keys """ total = 0 if self.group != "": keys = [self.group] for key in keys: - #logger.info(f"Sync Results from Redis {key}") + # logger.info(f"Sync Results from Redis {key}") results = self.client.rpop(key, self.batch) if results is None or len(results) == 0: continue - #logger.info(f"Rpop {key} {self.idx}: {len(results)}") + # logger.info(f"Rpop {key} {self.idx}: {len(results)}") total += len(results) for result in results: try: @@ -401,9 +406,9 @@ class ResultReader(object): return total -class APIScheduler(object): +class APIScheduler: """ - APIScheduler: put requests to global schedule, and get recording infer results + APIScheduler: put requests to global schedule, and get recording infer results """ def __init__(self, config): @@ -416,9 +421,11 @@ class APIScheduler(object): self.topic = config.redis_topic self.cluster_key = f"{self.topic}.cluster" - self.client = redis.Redis(host=config.redis_host, - port=config.redis_port, - password=config.redis_password) + self.client = redis.Redis( + host=config.redis_host, + port=config.redis_port, + password=config.redis_password, + ) self.req_cond = threading.Condition() self.reqs_queue = deque() @@ -426,16 +433,14 @@ class APIScheduler(object): def start(self): """ - start backup threads + start backup threads """ for i in range(self.reader_parallel): group = f"{self.nodeid}-{i}" - reader = ResultReader(self.client, i, self.reader_batch_size, - self.ttl, group) + reader = ResultReader(self.client, i, self.reader_batch_size, self.ttl, group) self.readers.append(reader) - self.clear_expired_nodes_thread = threading.Thread( - target=self.loop_clear_expired_nodes) + self.clear_expired_nodes_thread = threading.Thread(target=self.loop_clear_expired_nodes) self.clear_expired_nodes_thread.start() self.schedule_thread = threading.Thread(target=self.loop_schedule) @@ -443,7 +448,7 @@ class APIScheduler(object): def put_requests(self, reqs): """ - put requests to local req queue. reqs will be async scheduled + put requests to local req queue. reqs will be async scheduled """ ret = [] with self.req_cond: @@ -455,7 +460,7 @@ class APIScheduler(object): def get_results(self): """ - get infer results from local queue. results is async fetched from redis + get infer results from local queue. results is async fetched from redis """ outputs = dict() for reader in self.readers: @@ -465,7 +470,7 @@ class APIScheduler(object): def loop_schedule(self): """ - loop schedule req based on global load states. + loop schedule req based on global load states. 
""" reader_idx = 0 while True: @@ -493,11 +498,11 @@ class APIScheduler(object): except IndexError: continue except Exception as e: - logger.error(f"APIScheduler Schedule req error: {str(e)}") + logger.error(f"APIScheduler Schedule req error: {e!s}") def schedule(self, req, pnodes, dnodes, mnodes, group=""): """ - schedule an req to according redis node queue + schedule an req to according redis node queue """ pnodes.extend(mnodes) pnodes.sort() @@ -508,16 +513,14 @@ class APIScheduler(object): req_dict["group"] = group req_str = orjson.dumps(req_dict) pkey = f"ReqQ_{pnode.nodeid}" - #logger.info(f"Schedule Req {req_str} to Mixed") + # logger.info(f"Schedule Req {req_str} to Mixed") self.client.lpush(pkey, req_str) else: dnodes.sort() dnode = self.select_pd(req, dnodes, "decode") disaggregated = copy.deepcopy(dnode.disaggregated) transfer_protocol = disaggregated["transfer_protocol"] - if len( - transfer_protocol - ) > 1 and "ipc" in transfer_protocol and "rdma" in transfer_protocol: + if len(transfer_protocol) > 1 and "ipc" in transfer_protocol and "rdma" in transfer_protocol: if pnode.host == dnode.host: disaggregated["transfer_protocol"] = "ipc" else: @@ -529,13 +532,13 @@ class APIScheduler(object): req_dict = req.to_dict() req_dict["group"] = group req_str = orjson.dumps(req_dict) - #logger.info(f"Schedule Req {req_str}") + # logger.info(f"Schedule Req {req_str}") self.client.lpush(dkey, req_str) self.client.lpush(pkey, req_str) def sync_cluster(self): """ - fetch cluster load states from redis + fetch cluster load states from redis """ clusters = self.client.hgetall(self.cluster_key) pnodes, dnodes, mnodes = [], [], [] @@ -556,7 +559,7 @@ class APIScheduler(object): def loop_clear_expired_nodes(self): """ - loop clear expired node's dirty data in redis + loop clear expired node's dirty data in redis """ while True: try: @@ -567,16 +570,15 @@ class APIScheduler(object): if node.expired(self.clear_expired_nodes_period): expire_nodes.add(nodeid) for nodeid in expire_nodes: - #logger.info(f"clear expired nodes: {nodeid}") + # logger.info(f"clear expired nodes: {nodeid}") self.client.hdel(self.cluster_key, nodeid) time.sleep(self.clear_expired_nodes_period) except Exception: - logger.error( - "APIScheduler clear expired nodes error: {str(e)}") + logger.error("APIScheduler clear expired nodes error: {str(e)}") def select_pd(self, req, nodes, role): """ - select a prefill/decode/mixed node based on load states + select a prefill/decode/mixed node based on load states """ def select(req, nodes, blur_step): @@ -587,10 +589,8 @@ class APIScheduler(object): if node.load >= blur_max: break blur_idx = idx - node = random.choice(nodes[:blur_idx + 1]) - logger.info( - f"Schedule Req {req.request_id}(len:{req.prompt_token_ids_len}) to {node}" - ) + node = random.choice(nodes[: blur_idx + 1]) + logger.info(f"Schedule Req {req.request_id}(len:{req.prompt_token_ids_len}) to {node}") return node if role == "prefill" or role == "mixed": @@ -607,9 +607,9 @@ class APIScheduler(object): raise Exception(f"Invalid Role: {role}") -class ResultWriter(object): +class ResultWriter: """ - ResultWriter use an async thread to continue writer infer results to redis + ResultWriter use an async thread to continue writer infer results to redis """ def __init__(self, client, idx, batch, ttl=900): @@ -627,7 +627,7 @@ class ResultWriter(object): def put(self, key, items): """ - put infer results to writer + put infer results to writer """ with self.cond: for item in items: @@ -636,7 +636,7 @@ class ResultWriter(object): 
def run(self): """ - continue batch write infer results to redis + continue batch write infer results to redis """ while True: try: @@ -644,9 +644,9 @@ class ResultWriter(object): size = len(self.data) if size == 0: self.cond.wait() - #qsize = size + # qsize = size size = min(size, self.batch) - #logger.info(f"Writer {self.idx} Queue Size: {qsize}, Cur Size: {size}") + # logger.info(f"Writer {self.idx} Queue Size: {qsize}, Cur Size: {size}") groups = dict() for i in range(size): key, item = self.data.pop() @@ -654,22 +654,22 @@ class ResultWriter(object): groups[key] = [] groups[key].append(item) for key, items in groups.items(): - #s = time.time() + # s = time.time() with self.client.pipeline() as pipe: pipe.multi() pipe.lpush(key, *items) pipe.expire(key, math.ceil(self.ttl)) pipe.execute() - #self.client.lpush(key, *items) - #e = time.time() - #logger.info(f"Lpush {self.idx}: {key} used {e-s} {len(items)} items") + # self.client.lpush(key, *items) + # e = time.time() + # logger.info(f"Lpush {self.idx}: {key} used {e-s} {len(items)} items") except Exception as e: - logger.error(f"ResultWriter write error: {str(e)}") + logger.error(f"ResultWriter write error: {e!s}") -class InferScheduler(object): +class InferScheduler: """ - InferScheduler: get scheduled requests to local queue, write results to redis + InferScheduler: get scheduled requests to local queue, write results to redis """ def __init__(self, config): @@ -682,20 +682,21 @@ class InferScheduler(object): self.ttl = config.ttl self.release_load_expire_period = config.release_load_expire_period - self.client = redis.Redis(host=config.redis_host, - port=config.redis_port, - password=config.redis_password) + self.client = redis.Redis( + host=config.redis_host, + port=config.redis_port, + password=config.redis_password, + ) self.reqs_queue = deque() self.writers = [] def start(self, role, host, disaggregated): """ - start backup threads + start backup threads """ for i in range(self.writer_parallel): - writer = ResultWriter(self.client, i, self.writer_batch_size, - self.ttl) + writer = ResultWriter(self.client, i, self.writer_batch_size, self.ttl) writer.start() self.writers.append(writer) @@ -709,25 +710,24 @@ class InferScheduler(object): self.report_thread = threading.Thread(target=self.routine_report) self.report_thread.start() - self.expire_reqs_thread = threading.Thread( - target=self.loop_expire_reqs) + self.expire_reqs_thread = threading.Thread(target=self.loop_expire_reqs) self.expire_reqs_thread.start() def routine_report(self): """ - routine report node info: load, health + routine report node info: load, health """ while True: try: info = self.node.serialize() self.client.hset(self.cluster_key, self.nodeid, info) - time.sleep(self.sync_period / 1000.) 
+ time.sleep(self.sync_period / 1000.0) except Exception as e: - logger.error(f"InferScheduler routine report error: {str(e)}") + logger.error(f"InferScheduler routine report error: {e!s}") def loop_expire_reqs(self): """ - loop clear expired reqs + loop clear expired reqs """ while True: try: @@ -738,7 +738,7 @@ class InferScheduler(object): def loop_get_reqs(self): """ - loop get global scheduled reqs to local queue + loop get global scheduled reqs to local queue """ def select_writer(req): @@ -764,23 +764,26 @@ class InferScheduler(object): group = req.get("group", "") req = Request.from_dict(req) writer_idx = select_writer(req) - logger.info( - f"Infer Scheduler Get Req: {req.request_id} writer idx {writer_idx}" - ) + logger.info(f"Infer Scheduler Get Req: {req.request_id} writer idx {writer_idx}") req.request_id = f"{req.request_id}#{writer_idx}#{group}" if self.role == "prefill" or self.role == "mixed": self.reqs_queue.append(req) - self.node.add_req(req.request_id, - req.prompt_token_ids_len) + self.node.add_req(req.request_id, req.prompt_token_ids_len) else: self.node.add_req(req.request_id, 1) except Exception as e: - logger.error(f"InferScheduler loop get reqs error: {str(e)}") + logger.error(f"InferScheduler loop get reqs error: {e!s}") - def get_requests(self, available_blocks, block_size, - reserved_output_blocks, max_num_batched_tokens, batch): + def get_requests( + self, + available_blocks, + block_size, + reserved_output_blocks, + max_num_batched_tokens, + batch, + ): """ - get scheduled reqs from local reqs queue + get scheduled reqs from local reqs queue """ if len(self.reqs_queue) == 0: return [] @@ -793,19 +796,16 @@ class InferScheduler(object): try: req = self.reqs_queue.popleft() if cur_time - req.arrival_time > self.ttl: - logger.error( - f"req({req.request_id}) is expired({self.ttl}) when InferScheduler Get Requests" - ) + logger.error(f"req({req.request_id}) is expired({self.ttl}) when InferScheduler Get Requests") self.node.finish_req(req.request_id) continue current_prefill_tokens += req.prompt_token_ids_len - required_input_blocks = (req.prompt_token_ids_len + - block_size - 1) // block_size + required_input_blocks = (req.prompt_token_ids_len + block_size - 1) // block_size required_blocks += required_input_blocks + reserved_output_blocks if required_blocks > available_blocks or current_prefill_tokens > max_num_batched_tokens: self.reqs_queue.appendleft(req) return reqs - #logger.info(f"Get Requests from Scheduler: {req.request_id}") + # logger.info(f"Get Requests from Scheduler: {req.request_id}") reqs.append(req) except Exception: return reqs @@ -813,16 +813,14 @@ class InferScheduler(object): def put_results(self, results): """ - put infer results to according writer's local queue + put infer results to according writer's local queue """ groups = dict() req_ids = set() for result in results: if result.error_code != 200 or result.finished: self.node.finish_req(result.request_id) - logger.info( - f"{result.request_id} finished, node load is {self.node.load}" - ) + logger.info(f"{result.request_id} finished, node load is {self.node.load}") req_ids.add(result.request_id) @@ -837,7 +835,7 @@ class InferScheduler(object): result.finished = False result_str = orjson.dumps(result.to_dict()) - #if self.role == "prefill" or result.error_code != 200 or result.finished: + # if self.role == "prefill" or result.error_code != 200 or result.finished: # logger.info(f"Infer Put Finish Result: {result_str}") groups[key].append(result_str) diff --git 
a/fastdeploy/scheduler/storage.py b/fastdeploy/scheduler/storage.py index 7ef33cef4..51a9801ab 100644 --- a/fastdeploy/scheduler/storage.py +++ b/fastdeploy/scheduler/storage.py @@ -14,13 +14,13 @@ # limitations under the License. """ +import re +from collections.abc import Awaitable +from typing import List, Optional, Union -from typing import Optional, List, Union, Awaitable -from redis.typing import Number, FieldT, KeyT, EncodableT, ResponseT import redis from packaging import version -import re - +from redis.typing import EncodableT, FieldT, KeyT, Number, ResponseT LUA_LPOP = """ local key = KEYS[1] @@ -54,7 +54,7 @@ return currentAmount class AdaptedRedis(redis.Redis): """ A Redis client adapter that provides version-compatible operations. - + This class extends the standard Redis client to: - Handle version-specific behavior differences - Add TTL support for list operations @@ -65,7 +65,7 @@ class AdaptedRedis(redis.Redis): def __init__(self, **kwargs): """ Initialize the AdaptedRedis client. - + Args: **kwargs: Standard Redis client connection parameters """ @@ -78,14 +78,14 @@ class AdaptedRedis(redis.Redis): def _parse_version(self): """ Parse and store the Redis server version. - + Determines if the server is an older version that requires special handling for certain operations. """ - server_info = self.info(section='server') - version_string = server_info['redis_version'] + server_info = self.info(section="server") + version_string = server_info["redis_version"] - match = re.search(r'^(\d+\.\d+\.\d+)', version_string) + match = re.search(r"^(\d+\.\d+\.\d+)", version_string) if match: redis_version = match.group(1) else: @@ -102,7 +102,7 @@ class AdaptedRedis(redis.Redis): def _register_script(self): """ Register custom Lua scripts for enhanced Redis operations. - + Scripts include: - Atomic LPOP with count (for older Redis versions) - ZINCRBY with removal threshold @@ -114,12 +114,12 @@ class AdaptedRedis(redis.Redis): def rpush(self, name: str, *values: FieldT, ttl: Optional[float] = None) -> Union[Awaitable[int], int]: """ RPUSH operation with optional TTL. - + Args: name: List key *values: Values to push ttl: Optional time-to-live in seconds - + Returns: Length of the list after push """ @@ -133,22 +133,24 @@ class AdaptedRedis(redis.Redis): result = pipe.execute() return result[0] - def zincrby(self, - name: KeyT, - amount: float, - value: EncodableT, - rem_amount: Optional[float] = None, - ttl: Optional[float] = None) -> ResponseT: + def zincrby( + self, + name: KeyT, + amount: float, + value: EncodableT, + rem_amount: Optional[float] = None, + ttl: Optional[float] = None, + ) -> ResponseT: """ Atomic ZINCRBY with removal threshold and optional TTL. 
- + Args: name: Sorted set key amount: Increment amount value: Member to increment rem_amount: Optional threshold for member removal ttl: Optional time-to-live in seconds - + Returns: New score of the member """ @@ -157,7 +159,7 @@ class AdaptedRedis(redis.Redis): if ttl is None: if rem_amount is None: return super().zincrby(name, amount, value) - rem_amount = 'NIL' if rem_amount is None else str(rem_amount) + rem_amount = "NIL" if rem_amount is None else str(rem_amount) return self._zincrby(keys=[name], args=[amount, value, rem_amount]) with self.pipeline() as pipe: @@ -165,26 +167,26 @@ class AdaptedRedis(redis.Redis): if rem_amount is None: pipe.zincrby(name, amount, value) else: - rem_amount = 'NIL' if rem_amount is None else str(rem_amount) - self._zincrby(keys=[name], args=[ - amount, value, rem_amount], client=pipe) + rem_amount = "NIL" if rem_amount is None else str(rem_amount) + self._zincrby(keys=[name], args=[amount, value, rem_amount], client=pipe) pipe.expire(name, ttl) result = pipe.execute() return result[0] - def lpop(self, - name: str, - count: Optional[int] = None, - ttl: Optional[float] = None, - ) -> Union[Awaitable[Union[str, List, None]], Union[str, List, None]]: + def lpop( + self, + name: str, + count: Optional[int] = None, + ttl: Optional[float] = None, + ) -> Union[Awaitable[Union[str, List, None]], Union[str, List, None]]: """ LPOP operation with count support and optional TTL. - + Args: name: List key count: Number of elements to pop ttl: Optional time-to-live in seconds - + Returns: Popped elements (single or list) """ @@ -206,11 +208,11 @@ class AdaptedRedis(redis.Redis): def blpop(self, keys: List, timeout: Optional[Number] = 0): """ BLPOP operation with version-specific timeout handling. - + Args: keys: List of keys to pop from timeout: Maximum wait time in seconds - + Returns: Tuple of (key, value) or None if timeout """ diff --git a/fastdeploy/scheduler/utils.py b/fastdeploy/scheduler/utils.py index 723a37c7c..792570e96 100644 --- a/fastdeploy/scheduler/utils.py +++ b/fastdeploy/scheduler/utils.py @@ -20,16 +20,16 @@ import socket def get_hostname_ip(): """ Get the system's hostname and primary IP address. - + Returns: tuple: A tuple containing: - hostname (str): The system's hostname - ip_address (str): The primary IP address associated with the hostname - + Raises: socket.gaierror: If the hostname cannot be resolved to an IP address """ - + hostname = socket.gethostname() ip_address = socket.gethostbyname(hostname) return hostname, ip_address diff --git a/fastdeploy/scheduler/workers.py b/fastdeploy/scheduler/workers.py index 64be8945e..46a0f819f 100644 --- a/fastdeploy/scheduler/workers.py +++ b/fastdeploy/scheduler/workers.py @@ -14,10 +14,11 @@ # limitations under the License. """ -from typing import Callable, List, Any, Dict, Optional import functools import threading import traceback +from typing import Any, Callable, Dict, List, Optional + from fastdeploy.utils import scheduler_logger @@ -31,9 +32,7 @@ class Task: reason: Optional reason/status message for the task """ - def __init__(self, task_id: str, - task: Any, - reason: Optional[str] = None): + def __init__(self, task_id: str, task: Any, reason: Optional[str] = None): """ Initialize a Task instance. 
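The storage.py hunks above reformat `AdaptedRedis` without changing its TTL-aware signatures (`rpush(name, *values, ttl=None)`, `lpop(name, count=None, ttl=None)`, `zincrby(name, amount, value, rem_amount=None, ttl=None)`). A minimal usage sketch based on those signatures, assuming a reachable local Redis server and hypothetical key names:

```python
# Sketch only: assumes a Redis server on localhost and illustrative key names.
from fastdeploy.scheduler.storage import AdaptedRedis

client = AdaptedRedis(host="127.0.0.1", port=6379)

# Push two serialized requests and refresh a 60s TTL on the whole list.
client.rpush("reqs:prefill", b"req-1", b"req-2", ttl=60)

# Pop up to 8 entries; the registered Lua script covers older Redis servers
# that lack LPOP with a count argument.
batch = client.lpop("reqs:prefill", count=8, ttl=60)

# Decrease a node's load score; the rem_amount threshold removes the member
# once its score drops to 0, keeping the sorted set from accumulating stale nodes.
client.zincrby("cluster:load", -1.0, "node-a", rem_amount=0, ttl=120)
```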
@@ -63,11 +62,13 @@ class Workers: - Graceful shutdown """ - def __init__(self, - name: str, - work: Callable[[List[Task]], Optional[List[Task]]], - max_task_batch_size: int = 1, - task_filters: Optional[List[Callable[[Task], bool]]] = None): + def __init__( + self, + name: str, + work: Callable[[List[Task]], Optional[List[Task]]], + max_task_batch_size: int = 1, + task_filters: Optional[List[Callable[[Task], bool]]] = None, + ): """ Initialize a Workers thread pool. @@ -112,8 +113,8 @@ class Workers: return True if filter is None: - tasks = self.tasks[:self.max_task_batch_size] - del self.tasks[:self.max_task_batch_size] + tasks = self.tasks[: self.max_task_batch_size] + del self.tasks[: self.max_task_batch_size] self.running_tasks[worker_index] = tasks return tasks @@ -142,16 +143,13 @@ class Workers: self.running_tasks[worker_index] = [] task_filter = None - task_filer_size = 0 if self.task_filters is None else len( - self.task_filters) + task_filer_size = 0 if self.task_filters is None else len(self.task_filters) if task_filer_size > 0: task_filter = self.task_filters[worker_index % task_filer_size] while True: with self.tasks_not_empty: - tasks = self.tasks_not_empty.wait_for( - functools.partial( - self._get_tasks, worker_index, task_filter)) + tasks = self.tasks_not_empty.wait_for(functools.partial(self._get_tasks, worker_index, task_filter)) if self.stop: self.stopped_count += 1 @@ -163,8 +161,7 @@ class Workers: try: results = self.work(tasks) except Exception as e: - scheduler_logger.error( - f"Worker {self.name} execute error: {e}, traceback: {traceback.format_exc()}") + scheduler_logger.error(f"Worker {self.name} execute error: {e}, traceback: {traceback.format_exc()}") continue if results is not None and len(results) > 0: @@ -186,8 +183,7 @@ class Workers: for _ in range(remain): index = len(self.pool) - t = threading.Thread(target=self._worker, - args=(index,), daemon=True) + t = threading.Thread(target=self._worker, args=(index,), daemon=True) t.start() self.pool.append(t) @@ -202,8 +198,7 @@ class Workers: self.tasks_not_empty.notify_all() self.results_not_empty.notify_all() - self.not_stop.wait_for( - lambda: self.stopped_count == len(self.pool)) + self.not_stop.wait_for(lambda: self.stopped_count == len(self.pool)) self.pool = [] self.tasks = [] @@ -223,6 +218,7 @@ class Workers: Returns: List of completed tasks/results """ + def _get_results(): if self.stop: return True diff --git a/fastdeploy/spec_decode/mtp.py b/fastdeploy/spec_decode/mtp.py index 68eafa9b8..c2a5d0c4b 100644 --- a/fastdeploy/spec_decode/mtp.py +++ b/fastdeploy/spec_decode/mtp.py @@ -23,20 +23,22 @@ import paddle from fastdeploy.engine.request import Request from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.layers.attention import get_attention_backend -from fastdeploy.model_executor.layers.attention.base_attention_backend import \ - AttentionBackend +from fastdeploy.model_executor.layers.attention.base_attention_backend import ( + AttentionBackend, +) from fastdeploy.model_executor.layers.rotary_embedding import get_rope from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata from fastdeploy.model_executor.layers.sample.sampler import MTPSampler -from fastdeploy.model_executor.ops.gpu import (draft_model_postprocess, - draft_model_preprocess, - draft_model_update, - eagle_get_hidden_states, - mtp_save_first_token, - mtp_step_paddle, - share_external_data) -from fastdeploy.model_executor.pre_and_post_process import (pre_process, - 
rebuild_padding) +from fastdeploy.model_executor.ops.gpu import ( + draft_model_postprocess, + draft_model_preprocess, + draft_model_update, + eagle_get_hidden_states, + mtp_save_first_token, + mtp_step_paddle, + share_external_data, +) +from fastdeploy.model_executor.pre_and_post_process import pre_process, rebuild_padding from .base import Proposer @@ -46,8 +48,7 @@ class MTPProposer(Proposer): Proposer for Multi-Token-Prediction(MTP) """ - def __init__(self, cfg, main_model, local_rank, device_id, - main_model_inputs): + def __init__(self, cfg, main_model, local_rank, device_id, main_model_inputs): super().__init__(cfg) self.num_main_model_layers = self.model_config.num_hidden_layers self.local_rank = local_rank @@ -71,12 +72,10 @@ class MTPProposer(Proposer): self.model_config.architectures[0] = "Ernie4_5_MTPForCausalLM" self.speculative_config.sharing_model = main_model self.model_config.num_hidden_layers = 1 - self.parallel_config.model_name_or_path = ( - self.speculative_config.model_name_or_path) + self.parallel_config.model_name_or_path = self.speculative_config.model_name_or_path self.model_config.pretrained_config.prefix_name = "ernie.mtp_block" if self.speculative_config.quantization != "": - self.model_config.quantization = ( - self.speculative_config.quantization) + self.model_config.quantization = self.speculative_config.quantization self.model_config.start_layer_index = self.num_main_model_layers self.speculative_config.model_type = "mtp" @@ -84,43 +83,39 @@ class MTPProposer(Proposer): """ Load MTP Layer """ - from fastdeploy.model_executor.model_loader import \ - get_model_from_loader + from fastdeploy.model_executor.model_loader import get_model_from_loader self.model = get_model_from_loader(self.cfg) - def dummy_prefill_inputs(self, num_tokens: int, batch_size: int, - expected_decode_len: int): + def dummy_prefill_inputs(self, num_tokens: int, batch_size: int, expected_decode_len: int): """Set dummy prefill inputs to model_inputs""" max_dec_len = expected_decode_len + 1 self.num_gpu_blocks = self.parallel_config.total_block_num self.initialize_kv_cache() - full_length = min(num_tokens // batch_size, - self.parallel_config.max_model_len - max_dec_len) + full_length = min( + num_tokens // batch_size, + self.parallel_config.max_model_len - max_dec_len, + ) input_length = int(full_length * self.parallel_config.kv_cache_ratio) - block_num = ((input_length + self.parallel_config.block_size - 1) // - self.parallel_config.block_size + - self.parallel_config.enc_dec_block_num) + block_num = ( + input_length + self.parallel_config.block_size - 1 + ) // self.parallel_config.block_size + self.parallel_config.enc_dec_block_num for i in range(batch_size): idx = i - self.model_inputs["input_ids"][idx:idx + - 1, :input_length] = (np.array( - [5] * input_length)) - self.model_inputs["eos_token_id"][:] = np.array( - [2], dtype="int64").reshape(-1, 1) - self.model_inputs["seq_lens_this_time"][idx:idx + 1] = input_length - self.model_inputs["seq_lens_encoder"][idx:idx + 1] = input_length - self.model_inputs["seq_lens_decoder"][idx:idx + 1] = 0 - self.model_inputs["step_idx"][idx:idx + 1] = 0 - self.model_inputs["max_dec_len"][idx:idx + 1] = max_dec_len - self.model_inputs["stop_flags"][idx:idx + 1] = False + self.model_inputs["input_ids"][idx : idx + 1, :input_length] = np.array([5] * input_length) + self.model_inputs["eos_token_id"][:] = np.array([2], dtype="int64").reshape(-1, 1) + self.model_inputs["seq_lens_this_time"][idx : idx + 1] = input_length + 
self.model_inputs["seq_lens_encoder"][idx : idx + 1] = input_length + self.model_inputs["seq_lens_decoder"][idx : idx + 1] = 0 + self.model_inputs["step_idx"][idx : idx + 1] = 0 + self.model_inputs["max_dec_len"][idx : idx + 1] = max_dec_len + self.model_inputs["stop_flags"][idx : idx + 1] = False - self.model_inputs["encoder_block_lens"][idx:idx + 1] = block_num - self.model_inputs["block_tables"][idx:idx + - 1, :block_num] = (np.arange( - idx * block_num, - (idx + 1) * block_num, 1)) + self.model_inputs["encoder_block_lens"][idx : idx + 1] = block_num + self.model_inputs["block_tables"][idx : idx + 1, :block_num] = np.arange( + idx * block_num, (idx + 1) * block_num, 1 + ) def initialize_kv_cache(self): """ @@ -131,41 +126,41 @@ class MTPProposer(Proposer): cache_type = self.parallel_config.dtype - if (self.quant_config - and hasattr(self.quant_config, "kv_cache_quant_type") - and self.quant_config.kv_cache_quant_type is not None): - cache_type = 'uint8' + if ( + self.quant_config + and hasattr(self.quant_config, "kv_cache_quant_type") + and self.quant_config.kv_cache_quant_type is not None + ): + cache_type = "uint8" # Get kv cache shape - kv_cache_shape = self.attn_backends[0].get_kv_cache_shape( - max_num_blocks=self.num_gpu_blocks) - if (not self.parallel_config.do_profile - and (self.parallel_config.enable_prefix_caching - or self.parallel_config.splitwise_role != "mixed")): + kv_cache_shape = self.attn_backends[0].get_kv_cache_shape(max_num_blocks=self.num_gpu_blocks) + if not self.parallel_config.do_profile and ( + self.parallel_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed" + ): cache_kvs_list = [] for i in range( - self.num_main_model_layers, - self.num_main_model_layers + self.model_config.num_hidden_layers): + self.num_main_model_layers, + self.num_main_model_layers + self.model_config.num_hidden_layers, + ): key_cache = paddle.empty(shape=[], dtype=cache_type) key_cache_name = f"key_caches_{i}_rank{self.local_rank}.device{self.device_id}" val_cache_name = f"value_caches_{i}_rank{self.local_rank}.device{self.device_id}" - key_cache = share_external_data(key_cache, key_cache_name, - kv_cache_shape) + key_cache = share_external_data(key_cache, key_cache_name, kv_cache_shape) cache_kvs_list.append(key_cache) value_cache = paddle.empty(shape=[], dtype=cache_type) - value_cache = share_external_data(value_cache, val_cache_name, - kv_cache_shape) + value_cache = share_external_data(value_cache, val_cache_name, kv_cache_shape) cache_kvs_list.append(value_cache) self.model_inputs["caches"] = cache_kvs_list else: for i in range(self.model_config.num_hidden_layers): - self.cache_kvs["key_caches_{}".format(i)] = paddle.full( + self.cache_kvs[f"key_caches_{i}"] = paddle.full( shape=kv_cache_shape, fill_value=0, dtype=cache_type, ) - self.cache_kvs["value_caches_{}".format(i)] = paddle.full( + self.cache_kvs[f"value_caches_{i}"] = paddle.full( shape=kv_cache_shape, fill_value=0, dtype=cache_type, @@ -175,18 +170,19 @@ class MTPProposer(Proposer): del value paddle.device.cuda.empty_cache() - def _initialize_attn_backend(self, ) -> None: + def _initialize_attn_backend( + self, + ) -> None: """ Initialize attention backends and forward metadata """ assert len(self.attn_backends) == 0 # TODO(gongshaotian): Get rank from config - num_heads = (self.model_config.num_attention_heads // - self.parallel_config.tensor_parallel_size) + num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_size self.model_config.kv_num_heads = ( - 
int(self.model_config.num_key_value_heads) // - self.parallel_config.tensor_parallel_size) + int(self.model_config.num_key_value_heads) // self.parallel_config.tensor_parallel_size + ) head_dim = self.model_config.head_dim # Get the attention backend @@ -217,28 +213,25 @@ class MTPProposer(Proposer): """ self.main_model_num_gpu_blocks = num_gpu_blocks - self.num_gpu_blocks = int( - num_gpu_blocks * - self.speculative_config.num_gpu_block_expand_ratio) - if not (self.parallel_config.enable_prefix_caching - or self.parallel_config.splitwise_role != "mixed"): + self.num_gpu_blocks = int(num_gpu_blocks * self.speculative_config.num_gpu_block_expand_ratio) + if not (self.parallel_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed"): self.initialize_kv_cache() # Reset free list free_list = list( range( self.num_gpu_blocks - 1, - int(self.main_model_num_gpu_blocks * - self.parallel_config.kv_cache_ratio) - 1, + int(self.main_model_num_gpu_blocks * self.parallel_config.kv_cache_ratio) - 1, -1, - )) + ) + ) self.free_list_len = len(free_list) - self.model_inputs.update({ - "free_list": - paddle.to_tensor(free_list, dtype="int32"), - "free_list_len": - paddle.full([1], self.free_list_len, dtype="int32"), - }) + self.model_inputs.update( + { + "free_list": paddle.to_tensor(free_list, dtype="int32"), + "free_list_len": paddle.full([1], self.free_list_len, dtype="int32"), + } + ) self.parallel_config.do_profile = False def _init_model_inputs(self): @@ -247,44 +240,27 @@ class MTPProposer(Proposer): """ self.model_inputs = {} # Same shape/dytpe with base model - self.model_inputs["block_tables"] = paddle.clone( - self.main_model_inputs["block_tables"]) - self.model_inputs["input_ids"] = paddle.clone( - self.main_model_inputs["input_ids"]) - self.model_inputs["seq_lens_this_time"] = paddle.clone( - self.main_model_inputs["seq_lens_this_time"]) - self.model_inputs["seq_lens_encoder"] = paddle.clone( - self.main_model_inputs["seq_lens_encoder"]) - self.model_inputs["seq_lens_decoder"] = paddle.clone( - self.main_model_inputs["seq_lens_decoder"]) - self.model_inputs["step_idx"] = paddle.clone( - self.main_model_inputs["step_idx"]) - self.model_inputs["stop_flags"] = paddle.clone( - self.main_model_inputs["stop_flags"]) - self.model_inputs["stop_nums"] = paddle.clone( - self.main_model_inputs["stop_nums"]) - self.model_inputs["not_need_stop"] = paddle.to_tensor([False], - dtype="bool", - place="cpu") - self.model_inputs["pre_ids"] = paddle.clone( - self.main_model_inputs["pre_ids"]) - self.model_inputs["ids_remove_padding"] = paddle.clone( - self.main_model_inputs["ids_remove_padding"]) - self.model_inputs["cum_offsets"] = paddle.clone( - self.main_model_inputs["cum_offsets"]) - self.model_inputs["batch_id_per_token"] = paddle.clone( - self.main_model_inputs["batch_id_per_token"]) - self.model_inputs["cu_seqlens_q"] = paddle.clone( - self.main_model_inputs["cu_seqlens_q"]) - self.model_inputs["cu_seqlens_k"] = paddle.clone( - self.main_model_inputs["cu_seqlens_k"]) - self.model_inputs["decoder_batch_ids"] = paddle.clone( - self.main_model_inputs["decoder_batch_ids"]) + self.model_inputs["block_tables"] = paddle.clone(self.main_model_inputs["block_tables"]) + self.model_inputs["input_ids"] = paddle.clone(self.main_model_inputs["input_ids"]) + self.model_inputs["seq_lens_this_time"] = paddle.clone(self.main_model_inputs["seq_lens_this_time"]) + self.model_inputs["seq_lens_encoder"] = paddle.clone(self.main_model_inputs["seq_lens_encoder"]) + self.model_inputs["seq_lens_decoder"] = 
paddle.clone(self.main_model_inputs["seq_lens_decoder"]) + self.model_inputs["step_idx"] = paddle.clone(self.main_model_inputs["step_idx"]) + self.model_inputs["stop_flags"] = paddle.clone(self.main_model_inputs["stop_flags"]) + self.model_inputs["stop_nums"] = paddle.clone(self.main_model_inputs["stop_nums"]) + self.model_inputs["not_need_stop"] = paddle.to_tensor([False], dtype="bool", place="cpu") + self.model_inputs["pre_ids"] = paddle.clone(self.main_model_inputs["pre_ids"]) + self.model_inputs["ids_remove_padding"] = paddle.clone(self.main_model_inputs["ids_remove_padding"]) + self.model_inputs["cum_offsets"] = paddle.clone(self.main_model_inputs["cum_offsets"]) + self.model_inputs["batch_id_per_token"] = paddle.clone(self.main_model_inputs["batch_id_per_token"]) + self.model_inputs["cu_seqlens_q"] = paddle.clone(self.main_model_inputs["cu_seqlens_q"]) + self.model_inputs["cu_seqlens_k"] = paddle.clone(self.main_model_inputs["cu_seqlens_k"]) + self.model_inputs["decoder_batch_ids"] = paddle.clone(self.main_model_inputs["decoder_batch_ids"]) self.model_inputs["decoder_tile_ids_per_batch"] = paddle.clone( - self.main_model_inputs["decoder_tile_ids_per_batch"]) + self.main_model_inputs["decoder_tile_ids_per_batch"] + ) - tmp_position_ids = paddle.arange( - self.parallel_config.max_model_len).reshape((1, -1)) + tmp_position_ids = paddle.arange(self.parallel_config.max_model_len).reshape((1, -1)) self.model_inputs["rope_emb"] = get_rope( rotary_dim=self.model_config.head_dim, position_ids=tmp_position_ids, @@ -294,55 +270,41 @@ class MTPProposer(Proposer): # self.model_inputs["caches"] = self.cache_kvs # Inherit generation hyperparameters from the main model for consistency self.model_inputs["top_p"] = self.main_model_inputs["top_p"] - self.model_inputs["temperature"] = self.main_model_inputs[ - "temperature"] - self.model_inputs["eos_token_id"] = self.main_model_inputs[ - "eos_token_id"] - self.model_inputs["penalty_score"] = self.main_model_inputs[ - "penalty_score"] - self.model_inputs["frequency_score"] = self.main_model_inputs[ - "frequency_score"] - self.model_inputs["presence_score"] = self.main_model_inputs[ - "presence_score"] + self.model_inputs["temperature"] = self.main_model_inputs["temperature"] + self.model_inputs["eos_token_id"] = self.main_model_inputs["eos_token_id"] + self.model_inputs["penalty_score"] = self.main_model_inputs["penalty_score"] + self.model_inputs["frequency_score"] = self.main_model_inputs["frequency_score"] + self.model_inputs["presence_score"] = self.main_model_inputs["presence_score"] self.model_inputs["infer_seed"] = self.main_model_inputs["infer_seed"] - self.model_inputs["max_dec_len"] = self.main_model_inputs[ - "max_dec_len"] - self.model_inputs["min_dec_len"] = self.main_model_inputs[ - "min_dec_len"] + self.model_inputs["max_dec_len"] = self.main_model_inputs["max_dec_len"] + self.model_inputs["min_dec_len"] = self.main_model_inputs["min_dec_len"] self.model_inputs["bad_tokens"] = self.main_model_inputs["bad_tokens"] # Integrate the updated results in model forward - self.model_inputs["base_model_draft_tokens"] = self.main_model_inputs[ - "draft_tokens"] + self.model_inputs["base_model_draft_tokens"] = self.main_model_inputs["draft_tokens"] self.model_inputs["substep"] = 0 # Input tokens - self.model_inputs["draft_tokens"] = paddle.full( - shape=[self.max_num_seqs, 2], fill_value=-1, dtype="int64") + self.model_inputs["draft_tokens"] = paddle.full(shape=[self.max_num_seqs, 2], fill_value=-1, dtype="int64") - 
self.model_inputs["encoder_block_lens"] = paddle.clone( - self.main_model_inputs["encoder_block_lens"]) + self.model_inputs["encoder_block_lens"] = paddle.clone(self.main_model_inputs["encoder_block_lens"]) self.free_list = list( range( self.parallel_config.total_block_num - 1, - int(self.parallel_config.total_block_num * - self.parallel_config.kv_cache_ratio) - 1, + int(self.parallel_config.total_block_num * self.parallel_config.kv_cache_ratio) - 1, -1, - )) + ) + ) self.free_list_len = len(self.free_list) - self.model_inputs["free_list"] = paddle.to_tensor(self.free_list, - dtype="int32") - self.model_inputs["free_list_len"] = paddle.full( - shape=[1], fill_value=self.free_list_len, dtype="int32") + self.model_inputs["free_list"] = paddle.to_tensor(self.free_list, dtype="int32") + self.model_inputs["free_list_len"] = paddle.full(shape=[1], fill_value=self.free_list_len, dtype="int32") - self.model_inputs["batch_drop"] = paddle.full( - shape=[self.max_num_seqs, 1], fill_value=False, dtype="bool") - self.model_inputs["used_list_len"] = paddle.full( - shape=[self.max_num_seqs], fill_value=0, dtype="int32") + self.model_inputs["batch_drop"] = paddle.full(shape=[self.max_num_seqs, 1], fill_value=False, dtype="bool") + self.model_inputs["used_list_len"] = paddle.full(shape=[self.max_num_seqs], fill_value=0, dtype="int32") def insert_prefill_inputs(self, req_dicts: List[Request]): """ @@ -368,67 +330,56 @@ class MTPProposer(Proposer): idx = request.idx length = len(request.prompt_token_ids) - if (req_dicts[i].disaggregate_info is not None - and req_dicts[i].disaggregate_info["role"] == "decode"): + if req_dicts[i].disaggregate_info is not None and req_dicts[i].disaggregate_info["role"] == "decode": length = len(request.prompt_token_ids) - self.model_inputs["pre_ids"][idx:idx + 1] = ( - request.prompt_token_ids[-1]) + self.model_inputs["pre_ids"][idx : idx + 1] = request.prompt_token_ids[-1] prefill_token_num = self.max_draft_token_num + 1 - self.model_inputs["draft_tokens"][idx : idx + 1, \ - 0:1] = paddle.to_tensor(request.draft_token_ids[0:1], dtype='int64') + self.model_inputs["draft_tokens"][idx : idx + 1, 0:1] = paddle.to_tensor( + request.draft_token_ids[0:1], dtype="int64" + ) - self.model_inputs["seq_lens_encoder"][idx:idx + 1] = 0 - self.model_inputs["seq_lens_decoder"][idx:idx + 1] = length - self.model_inputs['seq_lens_this_time'][idx:idx + - 1] = prefill_token_num + self.model_inputs["seq_lens_encoder"][idx : idx + 1] = 0 + self.model_inputs["seq_lens_decoder"][idx : idx + 1] = length + self.model_inputs["seq_lens_this_time"][idx : idx + 1] = prefill_token_num - self.model_inputs["stop_flags"][idx:idx + 1] = False - self.model_inputs["batch_drop"][idx:idx + 1] = False - self.model_inputs["step_idx"][idx:idx + 1] = 1 + self.model_inputs["stop_flags"][idx : idx + 1] = False + self.model_inputs["batch_drop"][idx : idx + 1] = False + self.model_inputs["step_idx"][idx : idx + 1] = 1 encoder_block_num = len(request.block_tables) - self.model_inputs["encoder_block_lens"][idx:idx + - 1] = encoder_block_num - self.model_inputs["block_tables"][idx:idx + 1, :] = -1 - self.model_inputs["block_tables"][ - idx:idx + 1, :encoder_block_num] = np.array( - request.block_tables, dtype="int32") + self.model_inputs["encoder_block_lens"][idx : idx + 1] = encoder_block_num + self.model_inputs["block_tables"][idx : idx + 1, :] = -1 + self.model_inputs["block_tables"][idx : idx + 1, :encoder_block_num] = np.array( + request.block_tables, dtype="int32" + ) else: length = len(request.prompt_token_ids) if 
length > 1: - self.model_inputs["input_ids"][ - idx:idx + 1, :length - - 1] = self.main_model_inputs["input_ids"][idx:idx + 1, - 1:length] - self.model_inputs["pre_ids"][idx:idx + 1] = -1 - self.model_inputs["step_idx"][idx:idx + 1] = 0 + self.model_inputs["input_ids"][idx : idx + 1, : length - 1] = self.main_model_inputs["input_ids"][ + idx : idx + 1, 1:length + ] + self.model_inputs["pre_ids"][idx : idx + 1] = -1 + self.model_inputs["step_idx"][idx : idx + 1] = 0 if self.parallel_config.enable_chunked_prefill: token_chunk_size = request.prefill_chunk_info[0] - self.model_inputs["seq_lens_encoder"][idx:idx + - 1] = token_chunk_size - self.model_inputs["seq_lens_this_time"][ - idx:idx + 1] = token_chunk_size + self.model_inputs["seq_lens_encoder"][idx : idx + 1] = token_chunk_size + self.model_inputs["seq_lens_this_time"][idx : idx + 1] = token_chunk_size else: - self.model_inputs["seq_lens_encoder"][idx:idx + 1] = length - self.model_inputs["seq_lens_this_time"][idx:idx + - 1] = length + self.model_inputs["seq_lens_encoder"][idx : idx + 1] = length + self.model_inputs["seq_lens_this_time"][idx : idx + 1] = length - self.model_inputs["seq_lens_decoder"][idx:idx + - 1] = (request.get( - "seq_lens_decoder", - 0)) - self.model_inputs["stop_flags"][idx:idx + 1] = False - self.model_inputs["batch_drop"][idx:idx + 1] = False + self.model_inputs["seq_lens_decoder"][idx : idx + 1] = request.get("seq_lens_decoder", 0) + self.model_inputs["stop_flags"][idx : idx + 1] = False + self.model_inputs["batch_drop"][idx : idx + 1] = False encoder_block_num = len(request.get("block_tables")) - self.model_inputs["encoder_block_lens"][idx:idx + - 1] = encoder_block_num - self.model_inputs["block_tables"][idx:idx + 1, :] = -1 - self.model_inputs["block_tables"][ - idx:idx + 1, :encoder_block_num] = np.array( - request.get("block_tables"), dtype="int32") + self.model_inputs["encoder_block_lens"][idx : idx + 1] = encoder_block_num + self.model_inputs["block_tables"][idx : idx + 1, :] = -1 + self.model_inputs["block_tables"][idx : idx + 1, :encoder_block_num] = np.array( + request.get("block_tables"), dtype="int32" + ) self.model_inputs["not_need_stop"][0] = True def _initialize_forward_meta(self): @@ -451,10 +402,9 @@ class MTPProposer(Proposer): cu_seqlens_q=self.model_inputs["cu_seqlens_q"], cu_seqlens_k=self.model_inputs["cu_seqlens_k"], block_tables=self.model_inputs["block_tables"], - caches=self.model_inputs["caches"] + caches=self.model_inputs["caches"], ) - # Initialzie attention meta data for attn_backend in self.attn_backends: attn_backend.init_attention_metadata(self.forward_meta) @@ -557,17 +507,14 @@ class MTPProposer(Proposer): self.model_inputs["seq_lens_decoder"], ) # Initialize forward meta data - self.model_inputs["ids_remove_padding"].copy_( - ids_remove_padding, False) + self.model_inputs["ids_remove_padding"].copy_(ids_remove_padding, False) self.model_inputs["cum_offsets"].copy_(cum_offsets, False) - self.model_inputs["batch_id_per_token"].copy_( - batch_id_per_token, False) + self.model_inputs["batch_id_per_token"].copy_(batch_id_per_token, False) self.model_inputs["cu_seqlens_q"].copy_(cu_seqlens_q, False) self.model_inputs["cu_seqlens_k"].copy_(cu_seqlens_k, False) # for speculative decoding self.model_inputs["output_cum_offsets"] = output_cum_offsets - self.model_inputs["output_padding_offset"] = ( - output_padding_offset) + self.model_inputs["output_padding_offset"] = output_padding_offset self._initialize_forward_meta() # Get sampling metadata @@ -620,37 +567,29 @@ class 
MTPProposer(Proposer): Update single task's chunk_prefill info """ idx = task.idx - start_idx = sum(task.prefill_chunk_info[:task.chunk_idx]) + start_idx = sum(task.prefill_chunk_info[: task.chunk_idx]) if task.chunk_idx == len(task.prefill_chunk_info): - self.model_inputs['seq_lens_encoder'][idx:idx + 1] = 0 - self.model_inputs["step_idx"][idx:idx + 1] = 1 - self.model_inputs["seq_lens_decoder"][idx:idx + - 1] = start_idx + task.get( - "seq_lens_decoder", 0) + self.model_inputs["seq_lens_encoder"][idx : idx + 1] = 0 + self.model_inputs["step_idx"][idx : idx + 1] = 1 + self.model_inputs["seq_lens_decoder"][idx : idx + 1] = start_idx + task.get("seq_lens_decoder", 0) else: token_chunk_size = task.prefill_chunk_info[task.chunk_idx] if task.chunk_idx < len(task.prefill_chunk_info) - 1: - self.model_inputs['input_ids'][ - idx, :token_chunk_size] = np.array( - task.prompt_token_ids[start_idx + 1:start_idx + - token_chunk_size + 1]) + self.model_inputs["input_ids"][idx, :token_chunk_size] = np.array( + task.prompt_token_ids[start_idx + 1 : start_idx + token_chunk_size + 1] + ) # Last prefill else: - self.model_inputs['input_ids'][ - idx, :token_chunk_size - 1] = np.array( - task.prompt_token_ids[start_idx + 1:start_idx + - token_chunk_size]) + self.model_inputs["input_ids"][idx, : token_chunk_size - 1] = np.array( + task.prompt_token_ids[start_idx + 1 : start_idx + token_chunk_size] + ) - self.model_inputs["seq_lens_this_time"][idx:idx + - 1] = token_chunk_size - self.model_inputs['seq_lens_encoder'][idx:idx + - 1] = token_chunk_size - self.model_inputs["step_idx"][idx:idx + 1] = 0 - self.model_inputs["seq_lens_decoder"][idx:idx + - 1] = start_idx + task.get( - "seq_lens_decoder", 0) + self.model_inputs["seq_lens_this_time"][idx : idx + 1] = token_chunk_size + self.model_inputs["seq_lens_encoder"][idx : idx + 1] = token_chunk_size + self.model_inputs["step_idx"][idx : idx + 1] = 0 + self.model_inputs["seq_lens_decoder"][idx : idx + 1] = start_idx + task.get("seq_lens_decoder", 0) def _update_status(self): """ diff --git a/fastdeploy/spec_decode/ngram.py b/fastdeploy/spec_decode/ngram.py index 39a273271..833a45f54 100644 --- a/fastdeploy/spec_decode/ngram.py +++ b/fastdeploy/spec_decode/ngram.py @@ -32,8 +32,7 @@ class NgramProposer(Proposer): def __init__(self, cfg: FDConfig): super().__init__(cfg) self.max_ngram_size = self.speculative_config.max_ngram_size - self.input_ids_len = paddle.zeros(shape=[self.max_num_seqs, 1], - dtype="int64").cpu() + self.input_ids_len = paddle.zeros(shape=[self.max_num_seqs, 1], dtype="int64").cpu() def update(self, bid: int, seq_len: int): """ diff --git a/fastdeploy/splitwise/__init__.py b/fastdeploy/splitwise/__init__.py index c40559bc8..f4ede9062 100644 --- a/fastdeploy/splitwise/__init__.py +++ b/fastdeploy/splitwise/__init__.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
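The chunked-prefill update in mtp.py above slices the prompt with what appears to be a one-token shift (the draft model is fed the token following each position), and the final chunk carries one fewer token. A small standalone sketch of that indexing, using made-up chunk sizes:

```python
# Hypothetical illustration of the chunk slicing in _update_task_chunk_prefill.
prompt_token_ids = list(range(100, 110))   # 10 prompt tokens
prefill_chunk_info = [4, 4, 2]             # chunk sizes chosen for the example

for chunk_idx, chunk_size in enumerate(prefill_chunk_info):
    start_idx = sum(prefill_chunk_info[:chunk_idx])
    if chunk_idx < len(prefill_chunk_info) - 1:
        # Intermediate chunk: shifted by one relative to the main model's input.
        chunk = prompt_token_ids[start_idx + 1 : start_idx + chunk_size + 1]
    else:
        # Last chunk: one fewer token; the final target arrives at decode time.
        chunk = prompt_token_ids[start_idx + 1 : start_idx + chunk_size]
    print(chunk_idx, chunk)
# 0 [101, 102, 103, 104]
# 1 [105, 106, 107, 108]
# 2 [109]
```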
-""" \ No newline at end of file +""" diff --git a/fastdeploy/splitwise/splitwise_connector.py b/fastdeploy/splitwise/splitwise_connector.py index 541fb78a4..6b4c8ce04 100644 --- a/fastdeploy/splitwise/splitwise_connector.py +++ b/fastdeploy/splitwise/splitwise_connector.py @@ -68,8 +68,7 @@ class SplitwiseConnector: self.router_socket.setsockopt(zmq.LINGER, 0) self.router_socket.setsockopt(zmq.SNDHWM, 1000) self.router_socket.setsockopt(zmq.ROUTER_MANDATORY, 1) - self.router_socket.bind( - f"tcp://*:{self.cfg.cache_config.pd_comm_port[0]}") + self.router_socket.bind(f"tcp://*:{self.cfg.cache_config.pd_comm_port[0]}") logger.info(f"bind {self.cfg.cache_config.pd_comm_port}") self.poller = zmq.Poller() @@ -177,8 +176,7 @@ class SplitwiseConnector: for port in self.cfg.innode_prefill_ports: if port not in self.connect_innode_instances: self.create_connection(port) - if self.connect_innode_instances[ - port].available_prefill_instances.qsize() > 0: + if self.connect_innode_instances[port].available_prefill_instances.qsize() > 0: return False return True @@ -199,15 +197,15 @@ class SplitwiseConnector: if self.connect_innode_instances[port].get_prefill_instances() == 1: for task in tasks: task.disaggregate_info = { - "role": "prefill", + "role": "prefill", "transfer_protocol": "ipc", "cache_info": { "ipc": { "ip": "0.0.0.0", "port": self.cfg.engine_worker_queue_port, - "current_id": current_id + "current_id": current_id, }, - } + }, } self.connect_innode_instances[port].put_disaggregated_tasks(("prefill", tasks)) current_port = port @@ -229,9 +227,9 @@ class SplitwiseConnector: "ipc": { "ip": "0.0.0.0", "port": current_port, - "current_id": current_id + "current_id": current_id, }, - } + }, } def send_splitwise_tasks(self, tasks, current_id): @@ -254,21 +252,20 @@ class SplitwiseConnector: if task.disaggregate_info["transfer_protocol"] == "ipc": addr = task.disaggregate_info["cache_info"]["ipc"]["port"] - task.disaggregate_info["cache_info"]["ipc"][ - "current_id"] = current_id + task.disaggregate_info["cache_info"]["ipc"]["current_id"] = current_id self.send_splitwise_tasks_innode([task], addr) else: - addr = f"{task.disaggregate_info['cache_info']['rdma']['ip']}:"\ - + f"{task.disaggregate_info['cache_info']['rdma']['port']}" + addr = ( + f"{task.disaggregate_info['cache_info']['rdma']['ip']}:" + + f"{task.disaggregate_info['cache_info']['rdma']['port']}" + ) logger.info(f"send splitwise tasks to port {addr} decode") self.current_request_ids[task.request_id] = "init" decode_diagg = task.disaggregate_info["cache_info"] - task.disaggregate_info[ - "cache_info"] = self.cfg.disaggregate_info["cache_info"] - task.disaggregate_info["cache_info"]["rdma"][ - "current_id"] = current_id + task.disaggregate_info["cache_info"] = self.cfg.disaggregate_info["cache_info"] + task.disaggregate_info["cache_info"]["rdma"]["current_id"] = current_id self._send_message(addr, "prefill", [task]) task.disaggregate_info["cache_info"] = decode_diagg task.disaggregate_info["role"] = "prefill" @@ -288,10 +285,8 @@ class SplitwiseConnector: if port not in self.connect_innode_instances: self.create_connection(port) for task in tasks: - task.disaggregate_info["cache_info"]["ipc"][ - "port"] = self.cfg.engine_worker_queue_port - self.connect_innode_instances[port].put_disaggregated_tasks( - ("decode", tasks)) + task.disaggregate_info["cache_info"]["ipc"]["port"] = self.cfg.engine_worker_queue_port + self.connect_innode_instances[port].put_disaggregated_tasks(("decode", tasks)) for task in tasks: 
task.disaggregate_info["cache_info"]["ipc"]["port"] = port logger.info(f"send splitwise tasks to port {port} decode") @@ -309,8 +304,7 @@ class SplitwiseConnector: port = prefill_msg["cache_info"]["ipc"]["port"] if port not in self.connect_innode_instances: self.create_connection(port) - self.connect_innode_instances[port].put_disaggregated_tasks( - ("decode", tasks_list)) + self.connect_innode_instances[port].put_disaggregated_tasks(("decode", tasks_list)) else: node = f"{prefill_msg['cache_info']['rdma']['ip']}:{prefill_msg['cache_info']['rdma']['port']}" logger.info(f"send first token to port {node} decode") @@ -326,18 +320,19 @@ class SplitwiseConnector: self.connect_innode_instances[port] = EngineWorkerQueue( address=("0.0.0.0", int(port)), num_client=self.cfg.tensor_parallel_size, - client_id=0) + client_id=0, + ) def send_cache_infos(self, tasks, current_id): """ - Send cache information to specific port. + Send cache information to specific port. - Parameters: - tasks (list): List of tasks. - current_id (int): Current id to indicate the prefill number. + Parameters: + tasks (list): List of tasks. + current_id (int): Current id to indicate the prefill number. - Returns: - bool: Whether it is in decode status. + Returns: + bool: Whether it is in decode status. """ is_decode = False temp_cache_info = dict() @@ -348,38 +343,26 @@ class SplitwiseConnector: if tasks[i].disaggregate_info["role"] == "decode": if tasks[i].disaggregate_info["transfer_protocol"] == "ipc": cache_info = { - "request_id": - tasks[i].request_id, - "device_ids": - self.cfg.device_ids.split(","), - "transfer_protocol": - "ipc", - "dest_block_ids": - tasks[i].disaggregate_info["block_tables"], + "request_id": tasks[i].request_id, + "device_ids": self.cfg.device_ids.split(","), + "transfer_protocol": "ipc", + "dest_block_ids": tasks[i].disaggregate_info["block_tables"], } - if tasks[i].disaggregate_info["cache_info"]["ipc"][ - "port"] not in temp_cache_info: - temp_cache_info[tasks[i].disaggregate_info[ - "cache_info"]["ipc"]["port"]] = [] - temp_cache_info[tasks[i].disaggregate_info["cache_info"] - ["ipc"]["port"]].append(cache_info) + if tasks[i].disaggregate_info["cache_info"]["ipc"]["port"] not in temp_cache_info: + temp_cache_info[tasks[i].disaggregate_info["cache_info"]["ipc"]["port"]] = [] + temp_cache_info[tasks[i].disaggregate_info["cache_info"]["ipc"]["port"]].append(cache_info) else: - addr = f"{tasks[i].disaggregate_info['cache_info']['rdma']['ip']}:" + \ - f"{tasks[i].disaggregate_info['cache_info']['rdma']['port']}" + addr = ( + f"{tasks[i].disaggregate_info['cache_info']['rdma']['ip']}:" + + f"{tasks[i].disaggregate_info['cache_info']['rdma']['port']}" + ) cache_info = { - "request_id": - tasks[i].request_id, - "device_ids": - self.cfg.device_ids.split(","), - "ip": - self.cfg.host_ip, - "rdma_ports": - self.cfg.disaggregate_info["cache_info"]["rdma"] - ["rdma_port"], - "transfer_protocol": - "rdma", - "dest_block_ids": - tasks[i].disaggregate_info["block_tables"], + "request_id": tasks[i].request_id, + "device_ids": self.cfg.device_ids.split(","), + "ip": self.cfg.host_ip, + "rdma_ports": self.cfg.disaggregate_info["cache_info"]["rdma"]["rdma_port"], + "transfer_protocol": "rdma", + "dest_block_ids": tasks[i].disaggregate_info["block_tables"], } if addr not in temp_cache_info: temp_cache_info[addr] = [] @@ -390,7 +373,7 @@ class SplitwiseConnector: else: addr = "prefill" if current_id == -1: - current_id = tasks[i].disaggregate_info["cache_info"]["ipc"]['current_id'] + current_id = 
tasks[i].disaggregate_info["cache_info"]["ipc"]["current_id"] cache_info = { "request_id": tasks[i].request_id, "src_block_ids": tasks[i].block_tables, @@ -423,16 +406,13 @@ class SplitwiseConnector: if msg_type == "decode" or msg_type == "prefill": payload = [output.to_dict() for output in payload] - json_data = json.dumps({ - "type": msg_type, - "payload": payload - }).encode('utf-8') + json_data = json.dumps({"type": msg_type, "payload": payload}).encode("utf-8") return json_data def _deserialize_message(self, data: bytes): # JSON反序列化 - message = json.loads(data.decode('utf-8')) + message = json.loads(data.decode("utf-8")) return message["type"], message["payload"] def _process_message(self, message: bytes): @@ -461,8 +441,7 @@ class SplitwiseConnector: """ tasks_data = [Request.from_dict(task) for task in tasks] - self.engine_worker_queue.put_disaggregated_tasks( - ("decode", tasks_data)) + self.engine_worker_queue.put_disaggregated_tasks(("decode", tasks_data)) def _handle_decode(self, payload): """ @@ -471,11 +450,14 @@ class SplitwiseConnector: tasks = [] for task in payload: tasks.append( - RequestOutput(request_id=task["request_id"], - outputs=CompletionOutput( - index=task["outputs"]["index"], - send_idx=0, - token_ids=task["outputs"]["token_ids"], - ), - finished=True)) + RequestOutput( + request_id=task["request_id"], + outputs=CompletionOutput( + index=task["outputs"]["index"], + send_idx=0, + token_ids=task["outputs"]["token_ids"], + ), + finished=True, + ) + ) self.engine_worker_queue.put_disaggregated_tasks(("decode", tasks)) diff --git a/fastdeploy/stop.sh b/fastdeploy/stop.sh index 9100fe0a6..b12c068ec 100644 --- a/fastdeploy/stop.sh +++ b/fastdeploy/stop.sh @@ -18,4 +18,3 @@ for pid in $api_server_pids; do done echo 'end uvicorn multi workers' done - diff --git a/fastdeploy/utils.py b/fastdeploy/utils.py index 79ee65b77..a5cf5b3e0 100644 --- a/fastdeploy/utils.py +++ b/fastdeploy/utils.py @@ -19,6 +19,7 @@ import codecs import importlib import logging import os +import random import re import socket import tarfile @@ -27,8 +28,7 @@ from datetime import datetime from logging.handlers import BaseRotatingHandler from pathlib import Path from typing import Literal, TypeVar, Union -import random -import socket + import requests import yaml from aistudio_sdk.snapshot_download import snapshot_download @@ -50,6 +50,7 @@ class EngineError(Exception): class ColoredFormatter(logging.Formatter): """自定义日志格式器,用于控制台输出带颜色""" + COLOR_CODES = { logging.WARNING: 33, # 黄色 logging.ERROR: 31, # 红色 @@ -58,8 +59,8 @@ class ColoredFormatter(logging.Formatter): def format(self, record): color_code = self.COLOR_CODES.get(record.levelno, 0) - prefix = f'\033[{color_code}m' - suffix = '\033[0m' + prefix = f"\033[{color_code}m" + suffix = "\033[0m" message = super().format(record) if color_code: message = f"{prefix}{message}{suffix}" @@ -71,13 +72,15 @@ class DailyRotatingFileHandler(BaseRotatingHandler): like `logging.TimedRotatingFileHandler`, but this class support multi-process """ - def __init__(self, - filename, - backupCount=0, - encoding="utf-8", - delay=False, - utc=False, - **kwargs): + def __init__( + self, + filename, + backupCount=0, + encoding="utf-8", + delay=False, + utc=False, + **kwargs, + ): """ 初始化 RotatingFileHandler 对象。 @@ -99,8 +102,7 @@ class DailyRotatingFileHandler(BaseRotatingHandler): self.base_log_path = Path(filename) self.base_filename = self.base_log_path.name self.current_filename = self._compute_fn() - self.current_log_path = self.base_log_path.with_name( - 
self.current_filename) + self.current_log_path = self.base_log_path.with_name(self.current_filename) BaseRotatingHandler.__init__(self, filename, "a", encoding, delay) def shouldRollover(self, record): @@ -120,8 +122,7 @@ class DailyRotatingFileHandler(BaseRotatingHandler): self.stream = None self.current_filename = self._compute_fn() - self.current_log_path = self.base_log_path.with_name( - self.current_filename) + self.current_log_path = self.base_log_path.with_name(self.current_filename) if not self.delay: self.stream = self._open() @@ -132,8 +133,7 @@ class DailyRotatingFileHandler(BaseRotatingHandler): """ Calculate the log file name corresponding current time """ - return self.base_filename + "." + time.strftime( - self.suffix, time.localtime()) + return self.base_filename + "." + time.strftime(self.suffix, time.localtime()) def _open(self): """ @@ -142,13 +142,11 @@ class DailyRotatingFileHandler(BaseRotatingHandler): if self.encoding is None: stream = open(str(self.current_log_path), self.mode) else: - stream = codecs.open(str(self.current_log_path), self.mode, - self.encoding) + stream = codecs.open(str(self.current_log_path), self.mode, self.encoding) if self.base_log_path.exists(): try: - if (not self.base_log_path.is_symlink() or os.readlink( - self.base_log_path) != self.current_filename): + if not self.base_log_path.is_symlink() or os.readlink(self.base_log_path) != self.current_filename: os.remove(self.base_log_path) except OSError: pass @@ -179,16 +177,13 @@ class DailyRotatingFileHandler(BaseRotatingHandler): result = [] else: result.sort() - result = result[:len(result) - self.backup_count] + result = result[: len(result) - self.backup_count] for file_name in result: os.remove(str(self.base_log_path.with_name(file_name))) -def get_logger(name, - file_name, - without_formater=False, - print_to_console=False): +def get_logger(name, file_name, without_formater=False, print_to_console=False): """ get logger """ @@ -205,12 +200,10 @@ def get_logger(name, for handler in logger.handlers[:]: logger.removeHandler(handler) - LOG_FILE = "{0}/{1}".format(log_dir, file_name) + LOG_FILE = f"{log_dir}/{file_name}" backup_count = int(envs.FD_LOG_BACKUP_COUNT) handler = DailyRotatingFileHandler(LOG_FILE, backupCount=backup_count) - formatter = ColoredFormatter( - "%(levelname)-8s %(asctime)s %(process)-5s %(filename)s[line:%(lineno)d] %(message)s" - ) + formatter = ColoredFormatter("%(levelname)-8s %(asctime)s %(process)-5s %(filename)s[line:%(lineno)d] %(message)s") console_handler = logging.StreamHandler() if not without_formater: @@ -262,13 +255,15 @@ def download_file(url, save_path): response = requests.get(url, stream=True) response.raise_for_status() - total_size = int(response.headers.get('content-length', 0)) - progress_bar = tqdm(total=total_size, - unit='iB', - unit_scale=True, - desc=f"Downloading {os.path.basename(url)}") + total_size = int(response.headers.get("content-length", 0)) + progress_bar = tqdm( + total=total_size, + unit="iB", + unit_scale=True, + desc=f"Downloading {os.path.basename(url)}", + ) - with open(save_path, 'wb') as f: + with open(save_path, "wb") as f: for chunk in response.iter_content(chunk_size=1024): if chunk: # filter out keep-alive chunks f.write(chunk) @@ -279,7 +274,7 @@ def download_file(url, save_path): except Exception as e: if os.path.exists(save_path): os.remove(save_path) - raise RuntimeError(f"Download failed: {str(e)}") + raise RuntimeError(f"Download failed: {e!s}") def extract_tar(tar_path, output_dir): @@ -293,7 +288,7 @@ def 
extract_tar(tar_path, output_dir): pbar.update(1) print(f"Successfully extracted to: {output_dir}") except Exception as e: - raise RuntimeError(f"Extraction failed: {str(e)}") + raise RuntimeError(f"Extraction failed: {e!s}") def download_model(url, output_dir, temp_tar): @@ -339,15 +334,13 @@ class FlexibleArgumentParser(argparse.ArgumentParser): Extend argparse.ArgumentParser to support loading parameters from YAML files. """ - def __init__(self, *args, config_arg='--config', sep='_', **kwargs): + def __init__(self, *args, config_arg="--config", sep="_", **kwargs): super().__init__(*args, **kwargs) self.sep = sep # Create parser to prase yaml file self.tmp_parser = argparse.ArgumentParser(add_help=False) - self.tmp_parser.add_argument(config_arg, - type=str, - help='Path to YAML config file') + self.tmp_parser.add_argument(config_arg, type=str, help="Path to YAML config file") def parse_args(self, args=None, namespace=None): tmp_ns, remaining_args = self.tmp_parser.parse_known_args(args=args) @@ -355,16 +348,13 @@ class FlexibleArgumentParser(argparse.ArgumentParser): config = {} if config_path: - with open(config_path, 'r') as f: + with open(config_path, "r") as f: loaded_config = yaml.safe_load(f) config = loaded_config # Get declared parameters defined_dests = {action.dest for action in self._actions} - filtered_config = { - k: v - for k, v in config.items() if k in defined_dests - } + filtered_config = {k: v for k, v in config.items() if k in defined_dests} # Set parameters if namespace is None: @@ -374,6 +364,7 @@ class FlexibleArgumentParser(argparse.ArgumentParser): return super().parse_args(args=remaining_args, namespace=namespace) + def resolve_obj_from_strname(strname: str): module_name, obj_name = strname.rsplit(".", 1) module = importlib.import_module(module_name) @@ -399,16 +390,14 @@ def check_unified_ckpt(model_dir): try: # check all the file exists - safetensors_num = int( - model_files[0].strip(".safetensors").split("-")[-1]) + safetensors_num = int(model_files[0].strip(".safetensors").split("-")[-1]) flags = [0] * safetensors_num for x in model_files: current_index = int(x.strip(".safetensors").split("-")[1]) flags[current_index - 1] = 1 assert sum(flags) == len( model_files - ), "Number of safetensor files should be {}, but now it's {}".format( - len(model_files), sum(flags)) + ), f"Number of safetensor files should be {len(model_files)}, but now it's {sum(flags)}" except Exception as e: raise Exception(f"Failed to check unified checkpoint, details: {e}.") return is_unified_ckpt @@ -422,15 +411,13 @@ def get_host_ip(): return ip - - def get_random_port(): while True: port = random.randint(49152, 65535) with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: try: - s.bind(("0.0.0.0", port)) - return port + s.bind(("0.0.0.0", port)) + return port except OSError: continue @@ -441,12 +428,13 @@ def is_port_available(host, port): """ import errno import socket + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: try: s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) s.bind((host, port)) return True - except socket.error as e: + except OSError as e: if e.errno == errno.EADDRINUSE: return False return True @@ -467,8 +455,9 @@ def singleton(cls): def print_gpu_memory_use(gpu_id: int, title: str) -> None: - """ Print memory usage """ + """Print memory usage""" import pynvml + pynvml.nvmlInit() handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id) meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle) @@ -516,14 +505,10 @@ def 
retrive_model_from_server(model_name_or_path, revision="master"): local_path = envs.FD_MODEL_CACHE if local_path is None: local_path = f'{os.getenv("HOME")}/{repo_id}' - snapshot_download(repo_id=repo_id, - revision=revision, - local_dir=local_path) + snapshot_download(repo_id=repo_id, revision=revision, local_dir=local_path) model_name_or_path = local_path except Exception: - raise Exception( - f"The setting model_name_or_path:{model_name_or_path} is not exist." - ) + raise Exception(f"The setting model_name_or_path:{model_name_or_path} is not exist.") return model_name_or_path @@ -554,20 +539,22 @@ def is_list_of( assert_never(check) + def version(): """ Prints the contents of the version.txt file located in the parent directory of this script. """ current_dir = os.path.dirname(os.path.abspath(__file__)) - version_file_path = os.path.join(current_dir, 'version.txt') + version_file_path = os.path.join(current_dir, "version.txt") try: - with open(version_file_path, 'r') as f: + with open(version_file_path, "r") as f: content = f.read() print(content) except FileNotFoundError: llm_logger.error("[version.txt] Not Found!") + llm_logger = get_logger("fastdeploy", "fastdeploy.log") data_processor_logger = get_logger("data_processor", "data_processor.log") scheduler_logger = get_logger("scheduler", "scheduler.log") diff --git a/fastdeploy/worker/dcu_worker.py b/fastdeploy/worker/dcu_worker.py index cf2c078d1..13b5eacbb 100644 --- a/fastdeploy/worker/dcu_worker.py +++ b/fastdeploy/worker/dcu_worker.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + import time import paddle @@ -58,30 +59,28 @@ class DcuWorker(GpuWorker): start_time = time.perf_counter() paddle.device.cuda.reset_max_memory_reserved(self.local_rank) paddle.device.cuda.reset_max_memory_allocated(self.local_rank) - paddle_reserved_mem_before_run = paddle.device.cuda.max_memory_reserved( - self.local_rank) - paddle_allocated_mem_before_run = paddle.device.cuda.max_memory_allocated( - self.local_rank) # not reserved + paddle_reserved_mem_before_run = paddle.device.cuda.max_memory_reserved(self.local_rank) + paddle_allocated_mem_before_run = paddle.device.cuda.max_memory_allocated(self.local_rank) # not reserved total_gpu_memory = paddle.device.cuda.get_device_properties(self.local_rank).total_memory before_used_gpu_memory = paddle.device.cuda.memory_allocated(self.local_rank) - - logger.info(( - "Before running the profile, the memory usage info is as follows:", - f"\nDevice Total memory: {total_gpu_memory / Gb}", - f"\nDevice used memory: {before_used_gpu_memory / Gb}", - f"\nPaddle reserved memory: {paddle_reserved_mem_before_run / Gb}", - f"\nPaddle allocated memory: {paddle_allocated_mem_before_run / Gb}")) + logger.info( + ( + "Before running the profile, the memory usage info is as follows:", + f"\nDevice Total memory: {total_gpu_memory / Gb}", + f"\nDevice used memory: {before_used_gpu_memory / Gb}", + f"\nPaddle reserved memory: {paddle_reserved_mem_before_run / Gb}", + f"\nPaddle allocated memory: {paddle_allocated_mem_before_run / Gb}", + ) + ) # 2. Profile run self.model_runner.profile_run() # 3. 
Statistical memory information - paddle_reserved_mem_after_run = paddle.device.cuda.max_memory_reserved( - self.local_rank) - paddle_allocated_mem_after_run = paddle.device.cuda.max_memory_allocated( - self.local_rank) + paddle_reserved_mem_after_run = paddle.device.cuda.max_memory_reserved(self.local_rank) + paddle_allocated_mem_after_run = paddle.device.cuda.max_memory_allocated(self.local_rank) after_used_gpu_memory = paddle.device.cuda.memory_allocated(self.local_rank) @@ -89,18 +88,24 @@ class DcuWorker(GpuWorker): model_block_memory_used = self.cal_theortical_kvcache() paddle.device.cuda.empty_cache() paddle_peak_increase = paddle_reserved_mem_after_run - paddle_allocated_mem_before_run - available_kv_cache_memory = total_gpu_memory * \ - self.parallel_config.gpu_memory_utilization - after_used_gpu_memory - paddle_peak_increase + available_kv_cache_memory = ( + total_gpu_memory * self.parallel_config.gpu_memory_utilization + - after_used_gpu_memory + - paddle_peak_increase + ) available_kv_cache_memory += model_block_memory_used * self.parallel_config.total_block_num end_time = time.perf_counter() logger.info( - ("After running the profile, the memory usage info is as follows:", - f"\nDevice Total memory: {total_gpu_memory / Gb}", - f"\nDevice used memory: {after_used_gpu_memory / Gb}", - f"\nPaddle reserved memory: {paddle_reserved_mem_after_run / Gb}", - f"\nPaddle allocated memory: {paddle_allocated_mem_after_run / Gb}", - f"\nAvailable KV Cache meomory: {available_kv_cache_memory / Gb}", - f"Profile time: {end_time - start_time}")) + ( + "After running the profile, the memory usage info is as follows:", + f"\nDevice Total memory: {total_gpu_memory / Gb}", + f"\nDevice used memory: {after_used_gpu_memory / Gb}", + f"\nPaddle reserved memory: {paddle_reserved_mem_after_run / Gb}", + f"\nPaddle allocated memory: {paddle_allocated_mem_after_run / Gb}", + f"\nAvailable KV Cache meomory: {available_kv_cache_memory / Gb}", + f"Profile time: {end_time - start_time}", + ) + ) return available_kv_cache_memory # return to caculate the block num in this device diff --git a/fastdeploy/worker/eplb.py b/fastdeploy/worker/eplb.py index 45ce85eac..3d83b21a5 100644 --- a/fastdeploy/worker/eplb.py +++ b/fastdeploy/worker/eplb.py @@ -1,6 +1,7 @@ """ This file is copied from https://github.com/deepseek-ai/EPLB/blob/main/eplb.py """ + """Expert Parallelism Load Balancer (EPLB)""" from typing import Tuple @@ -8,8 +9,7 @@ from typing import Tuple import numpy as np -def balanced_packing(weight: np.ndarray, - num_packs: int) -> Tuple[np.ndarray, np.ndarray]: +def balanced_packing(weight: np.ndarray, num_packs: int) -> Tuple[np.ndarray, np.ndarray]: """ Pack n weighted objects to m packs, such that each bin contains exactly n/m objects and the weights of all packs are as balanced as possible. 
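`balanced_packing` above assigns n weighted groups to m packs of exactly n/m groups each while keeping pack weights close. A small example of calling it, with made-up token counts (shapes follow the docstring; the import path is taken from this diff):

```python
import numpy as np

from fastdeploy.worker.eplb import balanced_packing

# 1 layer, 8 expert groups with illustrative token counts, packed onto 4 nodes
# (2 groups per node).
weight = np.array([[9, 1, 8, 2, 7, 3, 6, 4]])
pack_index, rank_in_pack = balanced_packing(weight, num_packs=4)

print(pack_index)    # shape [1, 8]: which pack each group lands in
print(rank_in_pack)  # shape [1, 8]: position of the group inside its pack
```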
@@ -27,10 +27,7 @@ def balanced_packing(weight: np.ndarray, groups_per_pack = num_groups // num_packs if groups_per_pack == 1: - pack_index = np.arange(weight.shape[-1], - dtype=np.int32).reshape(1, - -1).repeat(num_layers, - axis=0) + pack_index = np.arange(weight.shape[-1], dtype=np.int32).reshape(1, -1).repeat(num_layers, axis=0) rank_in_pack = np.zeros_like(weight, dtype=np.int32) return pack_index, rank_in_pack @@ -42,9 +39,9 @@ def balanced_packing(weight: np.ndarray, pack_items = [0] * num_packs for group in indices[i]: pack = min( - (i - for i in range(num_packs) if pack_items[i] < groups_per_pack), - key=pack_weights.__getitem__) + (i for i in range(num_packs) if pack_items[i] < groups_per_pack), + key=pack_weights.__getitem__, + ) assert pack_items[pack] < groups_per_pack pack_index[i, group] = pack rank_in_pack[i, group] = pack_items[pack] @@ -53,9 +50,7 @@ def balanced_packing(weight: np.ndarray, return pack_index, rank_in_pack -def replicate_experts( - weight: np.ndarray, - num_phy: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: +def replicate_experts(weight: np.ndarray, num_phy: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: """ Replicate `num_log` experts to `num_phy` replicas, such that the maximum load of all replicas is minimized. @@ -71,8 +66,7 @@ def replicate_experts( n, num_log = weight.shape num_redundant = num_phy - num_log assert num_redundant >= 0 - phy2log = np.arange(num_phy, dtype=np.int32).reshape(1, -1).repeat(n, - axis=0) + phy2log = np.arange(num_phy, dtype=np.int32).reshape(1, -1).repeat(n, axis=0) rank = np.zeros((n, num_phy), dtype=np.int32) logcnt = np.ones((n, num_log), dtype=np.int32) arangen = np.arange(n, dtype=np.int32) @@ -85,9 +79,12 @@ def replicate_experts( def rebalance_experts_hierarchical( - weight: np.ndarray, num_physical_experts: int, num_groups: int, - num_nodes: int, - num_gpus: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + weight: np.ndarray, + num_physical_experts: int, + num_groups: int, + num_nodes: int, + num_gpus: int, +) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: """ Parameters: weight: [num_moe_layers, num_logical_experts] @@ -112,56 +109,51 @@ def rebalance_experts_hierarchical( def inverse(perm: np.ndarray) -> np.ndarray: inv = np.empty_like(perm) - inv[np.arange(perm.shape[0])[:, None], - perm] = np.arange(perm.shape[1], dtype=np.int32).reshape(1, -1) + inv[np.arange(perm.shape[0])[:, None], perm] = np.arange(perm.shape[1], dtype=np.int32).reshape(1, -1) return inv # Step 1: pack groups to nodes - tokens_per_group = weight.reshape(num_layers, num_groups, - group_size).sum(axis=-1) - group_pack_index, group_rank_in_pack = balanced_packing( - tokens_per_group, num_nodes) - log2mlog = (((group_pack_index * groups_per_node + group_rank_in_pack) * - group_size)[:, :, None] + - np.arange(group_size, dtype=np.int32)).reshape(num_layers, -1) + tokens_per_group = weight.reshape(num_layers, num_groups, group_size).sum(axis=-1) + group_pack_index, group_rank_in_pack = balanced_packing(tokens_per_group, num_nodes) + log2mlog = ( + ((group_pack_index * groups_per_node + group_rank_in_pack) * group_size)[:, :, None] + + np.arange(group_size, dtype=np.int32) + ).reshape(num_layers, -1) mlog2log = inverse(log2mlog) # Step 2: construct redundant experts within nodes - tokens_per_mlog = np.take_along_axis(weight, mlog2log, axis=-1).reshape( - -1, num_logical_experts // num_nodes) - phy2mlog, phyrank, mlogcnt = replicate_experts( - tokens_per_mlog, num_physical_experts // num_nodes) + tokens_per_mlog = 
np.take_along_axis(weight, mlog2log, axis=-1).reshape(-1, num_logical_experts // num_nodes) + phy2mlog, phyrank, mlogcnt = replicate_experts(tokens_per_mlog, num_physical_experts // num_nodes) # Step 3: pack physical_experts to GPUs - tokens_per_phy = np.take_along_axis(tokens_per_mlog / mlogcnt, - phy2mlog, - axis=-1) - pack_index, rank_in_pack = balanced_packing(tokens_per_phy, - num_gpus // num_nodes) + tokens_per_phy = np.take_along_axis(tokens_per_mlog / mlogcnt, phy2mlog, axis=-1) + pack_index, rank_in_pack = balanced_packing(tokens_per_phy, num_gpus // num_nodes) phy2pphy = pack_index * phy_experts_per_gpu + rank_in_pack pphy2phy = inverse(phy2pphy) - pphy2mlog = np.take_along_axis( - phy2mlog, pphy2phy, - axis=-1) # [num_layers * num_nodes, num_log_per_nodes] - pphy2mlog = (pphy2mlog.reshape(num_layers, num_nodes, -1) + - np.arange(0, - num_logical_experts, - num_logical_experts // num_nodes, - dtype=np.int32).reshape(1, -1, 1)).reshape( - num_layers, -1) + pphy2mlog = np.take_along_axis(phy2mlog, pphy2phy, axis=-1) # [num_layers * num_nodes, num_log_per_nodes] + pphy2mlog = ( + pphy2mlog.reshape(num_layers, num_nodes, -1) + + np.arange( + 0, + num_logical_experts, + num_logical_experts // num_nodes, + dtype=np.int32, + ).reshape(1, -1, 1) + ).reshape(num_layers, -1) pphy2log = np.take_along_axis(mlog2log, pphy2mlog, axis=-1) - pphyrank = np.take_along_axis(phyrank, pphy2phy, - axis=-1).reshape(num_layers, -1) - logcnt = np.take_along_axis(mlogcnt.reshape(num_layers, -1), - log2mlog, - axis=-1) + pphyrank = np.take_along_axis(phyrank, pphy2phy, axis=-1).reshape(num_layers, -1) + logcnt = np.take_along_axis(mlogcnt.reshape(num_layers, -1), log2mlog, axis=-1) return pphy2log, pphyrank, logcnt def rebalance_experts( - weight: np.ndarray, num_replicas: int, num_groups: int, num_nodes: int, - num_gpus: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + weight: np.ndarray, + num_replicas: int, + num_groups: int, + num_nodes: int, + num_gpus: int, +) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: """ Entry point for expert-parallelism load balancer. 
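A hedged usage sketch of this entry point with toy sizes (the sizes are invented; the shapes follow the arrays built above — every physical replica slot maps back to a logical expert, and each logical expert gets a -1-padded list of its replica slots):

```python
import numpy as np

from fastdeploy.worker.eplb import rebalance_experts  # module path as in this diff

num_layers, num_logical = 2, 16
# Tokens routed to each logical expert per MoE layer (toy statistics).
weight = np.random.randint(low=1, high=10, size=(num_layers, num_logical))

phy2log, log2phy, logcnt = rebalance_experts(
    weight, num_replicas=24, num_groups=4, num_nodes=2, num_gpus=8
)
print(phy2log.shape)  # (2, 24): logical expert id of every physical replica slot
print(log2phy.shape)  # (2, 16, logcnt.max()): replica slots per logical expert, padded with -1
print(logcnt.shape)   # (2, 16): how many replicas each logical expert received
```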
@@ -182,23 +174,23 @@ def rebalance_experts( if num_groups % num_nodes == 0: # use hierarchical load-balance policy phy2log, phyrank, logcnt = rebalance_experts_hierarchical( - weight, num_replicas, num_groups, num_nodes, num_gpus) + weight, num_replicas, num_groups, num_nodes, num_gpus + ) else: # use global load-balance policy phy2log, phyrank, logcnt = replicate_experts(weight, num_replicas) maxlogcnt = logcnt.max() - log2phy = np.full((num_layers, num_logical_experts, maxlogcnt), - -1, - dtype=np.int32) - np.put_along_axis(log2phy.reshape(num_layers, -1)[:, :, None], - (phy2log * maxlogcnt + phyrank)[:, :, None], - np.arange(num_replicas, dtype=np.int32).reshape( - 1, -1).repeat(num_layers, axis=0)[:, :, None], - axis=1) + log2phy = np.full((num_layers, num_logical_experts, maxlogcnt), -1, dtype=np.int32) + np.put_along_axis( + log2phy.reshape(num_layers, -1)[:, :, None], + (phy2log * maxlogcnt + phyrank)[:, :, None], + np.arange(num_replicas, dtype=np.int32).reshape(1, -1).repeat(num_layers, axis=0)[:, :, None], + axis=1, + ) return phy2log, log2phy, logcnt -__all__ = ['rebalance_experts'] +__all__ = ["rebalance_experts"] def main(): @@ -211,17 +203,20 @@ def main(): num_nodes = 4 num_gpus = 4 * 8 - model_tokens_per_expert_stats_list = np.random.randint( - low=1, high=10, size=(num_hidden_layers, num_expert)) + model_tokens_per_expert_stats_list = np.random.randint(low=1, high=10, size=(num_hidden_layers, num_expert)) phy2log, phyrank, logcnt = rebalance_experts( - model_tokens_per_expert_stats_list, num_replicas, num_groups, - num_nodes, num_gpus) + model_tokens_per_expert_stats_list, + num_replicas, + num_groups, + num_nodes, + num_gpus, + ) print(phy2log) print(phyrank) print(logcnt) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/fastdeploy/worker/experts_manager.py b/fastdeploy/worker/experts_manager.py index 53bc0b725..bb86e4479 100644 --- a/fastdeploy/worker/experts_manager.py +++ b/fastdeploy/worker/experts_manager.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" + """redundant expert manger.""" from typing import Optional, Tuple @@ -28,8 +29,13 @@ class RedundantExpertManger: RedundantExpertManger """ - def __init__(self, n_routed_experts: int, num_hidden_layers: int, - redundant_experts_num: int, ep_size: int) -> None: + def __init__( + self, + n_routed_experts: int, + num_hidden_layers: int, + redundant_experts_num: int, + ep_size: int, + ) -> None: """Initialize a redundant expert manager""" self.num_expert = n_routed_experts self.redundant_experts_num = redundant_experts_num @@ -41,26 +47,33 @@ class RedundantExpertManger: self.num_groups = 1 self.export_per_rank = self.num_replicas // ep_size - assert self.num_replicas % ep_size == 0, \ - f"num_replicas must be divisible by ep_size, \ + assert ( + self.num_replicas % ep_size == 0 + ), f"num_replicas must be divisible by ep_size, \ but got num_replicas = {self.num_replicas}, ep_size = {ep_size}" - self.model_ep_rank_to_expert_id_list = paddle.full(shape=[ - self.num_hidden_layers, - self.num_expert + self.redundant_experts_num - ], - fill_value=-1, - dtype="int32") - self.model_expert_id_to_ep_rank_array = paddle.full(shape=[ - self.num_hidden_layers, self.num_expert, - self.redundant_experts_num + 1 - ], - fill_value=-1, - dtype="int32") + self.model_ep_rank_to_expert_id_list = paddle.full( + shape=[ + self.num_hidden_layers, + self.num_expert + self.redundant_experts_num, + ], + fill_value=-1, + dtype="int32", + ) + self.model_expert_id_to_ep_rank_array = paddle.full( + shape=[ + self.num_hidden_layers, + self.num_expert, + self.redundant_experts_num + 1, + ], + fill_value=-1, + dtype="int32", + ) self.model_expert_in_rank_num_list = paddle.full( shape=[self.num_hidden_layers, self.num_expert], fill_value=0, - dtype="int32") + dtype="int32", + ) # self.model_ep_rank_to_expert_id_list = paddle.arange( # self.num_expert + self.redundant_experts_num, # dtype="int32").tile([self.num_hidden_layers, 1]) @@ -73,20 +86,18 @@ class RedundantExpertManger: # dtype="int32") self.model_tokens_per_expert_stats_list = paddle.ones( - shape=[self.num_hidden_layers, self.num_expert], dtype="int32") + shape=[self.num_hidden_layers, self.num_expert], dtype="int32" + ) - rank_expert_list, \ - logical_to_physical_map, \ - expert_count = rebalance_experts( - self.model_tokens_per_expert_stats_list.cpu().numpy(), - self.num_replicas, - self.num_groups, - self.num_nodes, - self.num_gpus) + rank_expert_list, logical_to_physical_map, expert_count = rebalance_experts( + self.model_tokens_per_expert_stats_list.cpu().numpy(), + self.num_replicas, + self.num_groups, + self.num_nodes, + self.num_gpus, + ) - self.update_expert_rank_table(rank_expert_list, - logical_to_physical_map, expert_count, - False) + self.update_expert_rank_table(rank_expert_list, logical_to_physical_map, expert_count, False) logger.info( f"moe experts table manager init successfully, ep_size {ep_size} \ @@ -99,10 +110,12 @@ class RedundantExpertManger: """ get_ep_rank_to_expert_id_list_by_layer """ - return self.model_ep_rank_to_expert_id_list[layer_id], \ - self.model_expert_id_to_ep_rank_array[layer_id], \ - self.model_expert_in_rank_num_list[layer_id], \ - self.model_tokens_per_expert_stats_list[layer_id] + return ( + self.model_ep_rank_to_expert_id_list[layer_id], + self.model_expert_id_to_ep_rank_array[layer_id], + self.model_expert_in_rank_num_list[layer_id], + self.model_tokens_per_expert_stats_list[layer_id], + ) def get_ep_rank_to_expert_id_list( self, layer_id: int @@ -110,28 +123,33 @@ class RedundantExpertManger: """ 
get_ep_rank_to_expert_id_list """ - return self.model_ep_rank_to_expert_id_list[layer_id], \ - self.model_expert_id_to_ep_rank_array[layer_id], \ - self.model_expert_in_rank_num_list[layer_id], \ - self.model_tokens_per_expert_stats_list[layer_id] + return ( + self.model_ep_rank_to_expert_id_list[layer_id], + self.model_expert_id_to_ep_rank_array[layer_id], + self.model_expert_in_rank_num_list[layer_id], + self.model_tokens_per_expert_stats_list[layer_id], + ) def get_expert_tokens_stats( - self, - verbose: bool = False, - clear_stat: bool = False - ) -> Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray], - Optional[np.ndarray]]: + self, verbose: bool = False, clear_stat: bool = False + ) -> Tuple[np.ndarray, Optional[np.ndarray], Optional[np.ndarray], Optional[np.ndarray]]: """ get_per_expert_tokens_stats """ try: if verbose: - return self.model_tokens_per_expert_stats_list.cpu().numpy(), \ - self.model_expert_id_to_ep_rank_array.cpu().numpy(), \ - self.model_ep_rank_to_expert_id_list.cpu().numpy(), \ - self.model_expert_in_rank_num_list.cpu().numpy() - return self.model_tokens_per_expert_stats_list.cpu().numpy( - ), None, None, None + return ( + self.model_tokens_per_expert_stats_list.cpu().numpy(), + self.model_expert_id_to_ep_rank_array.cpu().numpy(), + self.model_ep_rank_to_expert_id_list.cpu().numpy(), + self.model_expert_in_rank_num_list.cpu().numpy(), + ) + return ( + self.model_tokens_per_expert_stats_list.cpu().numpy(), + None, + None, + None, + ) finally: if clear_stat: self.model_tokens_per_expert_stats_list.zero_() @@ -142,27 +160,28 @@ class RedundantExpertManger: """ return self.model_expert_id_to_ep_rank_array.cpu().numpy() - def update_expert_rank_table(self, - rank_expert_list: np.ndarray, - logical_to_physical_map: np.ndarray, - expert_count: np.ndarray, - clear_stat: bool = True) -> None: + def update_expert_rank_table( + self, + rank_expert_list: np.ndarray, + logical_to_physical_map: np.ndarray, + expert_count: np.ndarray, + clear_stat: bool = True, + ) -> None: """ update_expert_rank_table """ - #update model info - self.model_ep_rank_to_expert_id_list.copy_( - paddle.to_tensor(rank_expert_list), True) + # update model info + self.model_ep_rank_to_expert_id_list.copy_(paddle.to_tensor(rank_expert_list), True) self.model_expert_id_to_ep_rank_array.fill_(-1) - self.model_expert_id_to_ep_rank_array[:, :, :logical_to_physical_map.shape[-1]] = \ - paddle.to_tensor(logical_to_physical_map) - self.model_expert_in_rank_num_list.copy_( - paddle.to_tensor(expert_count), True) + self.model_expert_id_to_ep_rank_array[:, :, : logical_to_physical_map.shape[-1]] = paddle.to_tensor( + logical_to_physical_map + ) + self.model_expert_in_rank_num_list.copy_(paddle.to_tensor(expert_count), True) # reset if clear_stat: self.model_tokens_per_expert_stats_list.zero_() -if __name__ == '__main__': +if __name__ == "__main__": print(RedundantExpertManger(64, 2, 8, 8).model_expert_id_to_ep_rank_array) diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py index 42cc19706..b58c2237f 100644 --- a/fastdeploy/worker/gcu_model_runner.py +++ b/fastdeploy/worker/gcu_model_runner.py @@ -13,33 +13,37 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" + import os import time from typing import List, Optional import numpy as np import paddle -import paddle.nn as nn +from paddle import nn from paddleformers.utils.log import logger from fastdeploy.config import FDConfig from fastdeploy.engine.request import Request from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.guided_decoding import get_guided_backend -from fastdeploy.model_executor.guided_decoding.base_guided_decoding import \ - LogitsProcessorBase +from fastdeploy.model_executor.guided_decoding.base_guided_decoding import ( + LogitsProcessorBase, +) from fastdeploy.model_executor.layers.attention import get_attention_backend -from fastdeploy.model_executor.layers.attention.base_attention_backend import \ - AttentionBackend +from fastdeploy.model_executor.layers.attention.base_attention_backend import ( + AttentionBackend, +) from fastdeploy.model_executor.layers.rotary_embedding import get_rope from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata -from fastdeploy.model_executor.layers.sample.sampler import ( - Sampler, SpeculativeSampler) +from fastdeploy.model_executor.layers.sample.sampler import Sampler, SpeculativeSampler from fastdeploy.model_executor.model_loader import get_model_from_loader from fastdeploy.model_executor.ops.gcu import set_value_by_flags_and_idx -from fastdeploy.model_executor.pre_and_post_process import (post_process, - pre_process, - rebuild_padding) +from fastdeploy.model_executor.pre_and_post_process import ( + post_process, + pre_process, + rebuild_padding, +) from fastdeploy.worker.model_runner_base import ModelRunnerBase from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput @@ -48,12 +52,13 @@ class GCUModelRunner(ModelRunnerBase): """ """ def __init__( - self, - fd_config: FDConfig, - device: str, # logic device - device_id: int, # physical device id - rank: int, - local_rank: int): + self, + fd_config: FDConfig, + device: str, # logic device + device_id: int, # physical device id + rank: int, + local_rank: int, + ): super().__init__(fd_config=fd_config, device=device) self.rank = rank self.local_rank = local_rank @@ -74,18 +79,17 @@ class GCUModelRunner(ModelRunnerBase): # Cuda Graph self.use_cudagraph = self.graph_opt_config.use_cudagraph - self.cudagraph_capture_sizes = list( - reversed(self.graph_opt_config.cudagraph_capture_sizes)) + self.cudagraph_capture_sizes = list(reversed(self.graph_opt_config.cudagraph_capture_sizes)) self.cudagraph_num_of_warmups = self.graph_opt_config.cudagraph_num_of_warmups - self.input_ids = paddle.zeros(self.parallel_config.max_num_seqs, - dtype='int32') + self.input_ids = paddle.zeros(self.parallel_config.max_num_seqs, dtype="int32") # Initialize share inputs self._init_share_inputs(self.parallel_config.max_num_seqs) self.infer_seed_increment = paddle.full( shape=[self.parallel_config.max_num_seqs, 1], fill_value=4, - dtype="int64") + dtype="int64", + ) self.restore_chunked_prefill_request = dict() # Initialize attention Backend @@ -98,14 +102,14 @@ class GCUModelRunner(ModelRunnerBase): # Postprocess Env params os.environ["INFERENCE_MSG_QUEUE_ID"] = str( - self.local_rank + - int(self.parallel_config.engine_worker_queue_port)) + self.local_rank + int(self.parallel_config.engine_worker_queue_port) + ) def prefill_finished(self): """ check whether prefill stage finished """ - if int(paddle.max(self.share_inputs['seq_lens_encoder'])) != 0: + if int(paddle.max(self.share_inputs["seq_lens_encoder"])) != 0: return 1 else: return 0 
@@ -115,13 +119,9 @@ class GCUModelRunner(ModelRunnerBase): Init speculative proposer """ if self.speculative_method == "ngram": - raise NotImplementedError( - "NgramProposer is not support by GCUModelRunner." - ) + raise NotImplementedError("NgramProposer is not support by GCUModelRunner.") elif self.speculative_method == "mtp": - raise NotImplementedError( - "MTPProposer is not support by GCUModelRunner." - ) + raise NotImplementedError("MTPProposer is not support by GCUModelRunner.") else: self.proposer = None @@ -129,8 +129,9 @@ class GCUModelRunner(ModelRunnerBase): """ init logits processor for guided decoding """ - assert self.guided_backend is not None, "guided_backend is None, use "\ - "--guided-decoding-backend to specify the backend at server startup." + assert self.guided_backend is not None, ( + "guided_backend is None, use " "--guided-decoding-backend to specify the backend at server startup." + ) if request.guided_json is not None: schemata_key = ("json", request.guided_json) @@ -141,8 +142,10 @@ class GCUModelRunner(ModelRunnerBase): elif request.structural_tag is not None: schemata_key = ("structural_tag", request.structural_tag) - return self.guided_backend.get_logits_processor( - schemata_key=schemata_key), schemata_key + return ( + self.guided_backend.get_logits_processor(schemata_key=schemata_key), + schemata_key, + ) def insert_prefill_inputs(self, req_dicts: List[Request]): """ @@ -151,9 +154,8 @@ class GCUModelRunner(ModelRunnerBase): if "caches" not in self.share_inputs: self.initialize_kv_cache() - if req_dicts[-1].disaggregate_info is not None and req_dicts[ - -1].disaggregate_info["role"] == "prefill": - os.environ['PREFILL_NODE_ONE_STEP_STOP'] = "1" + if req_dicts[-1].disaggregate_info is not None and req_dicts[-1].disaggregate_info["role"] == "prefill": + os.environ["PREFILL_NODE_ONE_STEP_STOP"] = "1" req_len = len(req_dicts) for i in range(req_len): @@ -162,137 +164,109 @@ class GCUModelRunner(ModelRunnerBase): length = len(request.prompt_token_ids) prefill_tokens = [] - if (request.guided_json is not None - or request.guided_regex is not None - or request.structural_tag is not None - or request.guided_grammar is not None): - logits_info, schemata_key = self._init_logits_processor( - request) + if ( + request.guided_json is not None + or request.guided_regex is not None + or request.structural_tag is not None + or request.guided_grammar is not None + ): + logits_info, schemata_key = self._init_logits_processor(request) request.logits_processor, request.logits_cached = logits_info request.schemata_key = schemata_key # Is Decode Node - if req_dicts[i].disaggregate_info is not None and req_dicts[ - i].disaggregate_info["role"] == "decode": + if req_dicts[i].disaggregate_info is not None and req_dicts[i].disaggregate_info["role"] == "decode": prefill_tokens.append(request.prompt_token_ids[0]) - self.share_inputs["pre_ids"][idx:idx + - 1] = request.prompt_token_ids[-1] - self.share_inputs["input_ids"][idx:idx + 1, - 0] = request.prompt_token_ids[0] - self.share_inputs['seq_lens_encoder'][idx:idx + 1] = 0 - self.share_inputs['seq_lens_decoder'][idx:idx + 1] = length - self.share_inputs['seq_lens_this_time'][idx:idx + 1] = 1 - self.share_inputs['step_seq_lens_encoder'][idx:idx + 1] = 0 - self.share_inputs['step_seq_lens_decoder'][idx:idx + - 1] = length - self.share_inputs['step_idx'][idx:idx + 1] = 1 + self.share_inputs["pre_ids"][idx : idx + 1] = request.prompt_token_ids[-1] + self.share_inputs["input_ids"][idx : idx + 1, 0] = request.prompt_token_ids[0] + 
self.share_inputs["seq_lens_encoder"][idx : idx + 1] = 0 + self.share_inputs["seq_lens_decoder"][idx : idx + 1] = length + self.share_inputs["seq_lens_this_time"][idx : idx + 1] = 1 + self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = 0 + self.share_inputs["step_seq_lens_decoder"][idx : idx + 1] = length + self.share_inputs["step_idx"][idx : idx + 1] = 1 if self.speculative_decoding: num_prefill_send_token = self.speculative_config.num_speculative_tokens + 1 - self.share_inputs['draft_tokens'][idx:idx + 1, 0:num_prefill_send_token] =\ - paddle.to_tensor(request.draft_token_ids[0:num_prefill_send_token], dtype="int64") - self.share_inputs['seq_lens_this_time'][ - idx:idx + 1] = num_prefill_send_token + self.share_inputs["draft_tokens"][idx : idx + 1, 0:num_prefill_send_token] = paddle.to_tensor( + request.draft_token_ids[0:num_prefill_send_token], + dtype="int64", + ) + self.share_inputs["seq_lens_this_time"][idx : idx + 1] = num_prefill_send_token else: - self.share_inputs["pre_ids"][idx:idx + 1] = -1 - self.share_inputs["step_idx"][idx:idx + 1] = 0 - self.share_inputs["input_ids"][idx:idx + - 1, :length] = np.array( - request.prompt_token_ids) + self.share_inputs["pre_ids"][idx : idx + 1] = -1 + self.share_inputs["step_idx"][idx : idx + 1] = 0 + self.share_inputs["input_ids"][idx : idx + 1, :length] = np.array(request.prompt_token_ids) # Use chunked prefill if self.parallel_config.enable_chunked_prefill: request.set("chunk_idx", 1) - logger.info( - f"prefill_chunk_info: {request.prefill_chunk_info}") + logger.info(f"prefill_chunk_info: {request.prefill_chunk_info}") token_chunk_size = request.prefill_chunk_info[0] - self.share_inputs["seq_lens_this_time"][ - idx:idx + 1] = token_chunk_size - self.share_inputs['input_ids'][ - idx, :token_chunk_size] = np.array( - request.prompt_token_ids[:token_chunk_size]) - self.share_inputs['step_seq_lens_encoder'][ - idx:idx + 1] = token_chunk_size - self.share_inputs['seq_lens_encoder'][idx:idx + - 1] = token_chunk_size - self.share_inputs['seq_lens_decoder'][ - idx:idx + 1] = request.get("seq_lens_decoder", 0) - self.share_inputs['step_seq_lens_decoder'][ - idx:idx + 1] = request.get("seq_lens_decoder", 0) + self.share_inputs["seq_lens_this_time"][idx : idx + 1] = token_chunk_size + self.share_inputs["input_ids"][idx, :token_chunk_size] = np.array( + request.prompt_token_ids[:token_chunk_size] + ) + self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = token_chunk_size + self.share_inputs["seq_lens_encoder"][idx : idx + 1] = token_chunk_size + self.share_inputs["seq_lens_decoder"][idx : idx + 1] = request.get("seq_lens_decoder", 0) + self.share_inputs["step_seq_lens_decoder"][idx : idx + 1] = request.get("seq_lens_decoder", 0) else: - self.share_inputs['seq_lens_decoder'][ - idx:idx + 1] = request.get("seq_lens_decoder", 0) - self.share_inputs['step_seq_lens_decoder'][ - idx:idx + 1] = request.get("seq_lens_decoder", 0) - self.share_inputs['seq_lens_this_time'][idx:idx + - 1] = length - self.share_inputs['step_seq_lens_encoder'][idx:idx + - 1] = length - self.share_inputs['seq_lens_encoder'][idx:idx + 1] = length + self.share_inputs["seq_lens_decoder"][idx : idx + 1] = request.get("seq_lens_decoder", 0) + self.share_inputs["step_seq_lens_decoder"][idx : idx + 1] = request.get("seq_lens_decoder", 0) + self.share_inputs["seq_lens_this_time"][idx : idx + 1] = length + self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = length + self.share_inputs["seq_lens_encoder"][idx : idx + 1] = length - if len(request.eos_token_ids - ) < 
self.parallel_config.eos_tokens_lens: + if len(request.eos_token_ids) < self.parallel_config.eos_tokens_lens: request.eos_token_ids.append(request.eos_token_ids[0]) - self.share_inputs["eos_token_id"][:] = np.array( - request.eos_token_ids, dtype="int64").reshape(-1, 1) - self.share_inputs["top_p"][idx:idx + 1] = request.get("top_p", 0.7) - self.share_inputs["top_k"][idx:idx + 1] = request.get("top_k", 0) - self.share_inputs["temperature"][idx:idx + 1] = request.get( - "temperature", 0.95) - self.share_inputs["penalty_score"][idx:idx + 1] = request.get( - "repetition_penalty", 1.0) - self.share_inputs["frequency_score"][idx:idx + 1] = request.get( - "frequency_penalty", 0.0) - self.share_inputs["presence_score"][idx:idx + 1] = request.get( - "presence_penalty", 0.0) + self.share_inputs["eos_token_id"][:] = np.array(request.eos_token_ids, dtype="int64").reshape(-1, 1) + self.share_inputs["top_p"][idx : idx + 1] = request.get("top_p", 0.7) + self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0) + self.share_inputs["temperature"][idx : idx + 1] = request.get("temperature", 0.95) + self.share_inputs["penalty_score"][idx : idx + 1] = request.get("repetition_penalty", 1.0) + self.share_inputs["frequency_score"][idx : idx + 1] = request.get("frequency_penalty", 0.0) + self.share_inputs["presence_score"][idx : idx + 1] = request.get("presence_penalty", 0.0) - self.share_inputs["min_dec_len"][idx:idx + 1] = request.get( - "min_tokens", 1) - self.share_inputs["max_dec_len"][idx:idx + 1] = request.get( - "max_tokens", self.model_config.max_length) - self.share_inputs["stop_flags"][idx:idx + 1] = False + self.share_inputs["min_dec_len"][idx : idx + 1] = request.get("min_tokens", 1) + self.share_inputs["max_dec_len"][idx : idx + 1] = request.get("max_tokens", self.model_config.max_length) + self.share_inputs["stop_flags"][idx : idx + 1] = False - self.share_inputs["first_token_ids"][ - idx:idx + 1] = self.share_inputs["input_ids"][idx:idx + 1, :1] - self.share_inputs["ori_seq_lens_encoder"][idx:idx + 1] = length + self.share_inputs["first_token_ids"][idx : idx + 1] = self.share_inputs["input_ids"][idx : idx + 1, :1] + self.share_inputs["ori_seq_lens_encoder"][idx : idx + 1] = length if request.get("seed") is not None: - self.share_inputs["infer_seed"][idx:idx + - 1] = request.get("seed") + self.share_inputs["infer_seed"][idx : idx + 1] = request.get("seed") encoder_block_num = len(request.get("block_tables")) - self.share_inputs["encoder_block_lens"][idx:idx + - 1] = encoder_block_num - self.share_inputs["block_tables"][idx:idx + 1, :] = -1 - self.share_inputs["block_tables"][ - idx:idx + 1, :encoder_block_num] = np.array( - request.block_tables, dtype="int32") + self.share_inputs["encoder_block_lens"][idx : idx + 1] = encoder_block_num + self.share_inputs["block_tables"][idx : idx + 1, :] = -1 + self.share_inputs["block_tables"][idx : idx + 1, :encoder_block_num] = np.array( + request.block_tables, dtype="int32" + ) - if request.get("stop_token_ids") is not None and request.get( - "stop_seqs_len") is not None: + if request.get("stop_token_ids") is not None and request.get("stop_seqs_len") is not None: stop_seqs_num = len(request.get("stop_seqs_len")) - for i in range(stop_seqs_num, - self.model_config.max_stop_seqs_num): + for i in range(stop_seqs_num, self.model_config.max_stop_seqs_num): request.stop_seqs_len.append(0) - self.share_inputs["stop_seqs_len"][:] = np.array( - request.stop_seqs_len, dtype="int32") - self.share_inputs["stop_seqs"][:stop_seqs_num, :len( - 
request.get("stop_token_ids")[0])] = np.array( - request.get("stop_token_ids"), dtype="int64") + self.share_inputs["stop_seqs_len"][:] = np.array(request.stop_seqs_len, dtype="int32") + self.share_inputs["stop_seqs"][:stop_seqs_num, : len(request.get("stop_token_ids")[0])] = np.array( + request.get("stop_token_ids"), dtype="int64" + ) - self.sampler.apply_logits_processor( - idx, request.get("logits_processor"), prefill_tokens) + self.sampler.apply_logits_processor(idx, request.get("logits_processor"), prefill_tokens) self.share_inputs["not_need_stop"][0] = True if self.speculative_method in ["mtp"]: self.proposer.insert_prefill_inputs(req_dicts) - def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int, - expected_decode_len: int): - """ Set dummy prefill inputs to share_inputs """ + def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int, expected_decode_len: int): + """Set dummy prefill inputs to share_inputs""" max_dec_len = expected_decode_len + 1 - full_length = min(num_tokens // batch_size, - self.parallel_config.max_model_len - max_dec_len) + full_length = min( + num_tokens // batch_size, + self.parallel_config.max_model_len - max_dec_len, + ) input_length = int(full_length * self.parallel_config.kv_cache_ratio) block_num = ( input_length + self.parallel_config.block_size - 1 @@ -300,28 +274,23 @@ class GCUModelRunner(ModelRunnerBase): for i in range(batch_size): idx = i - self.share_inputs["input_ids"][idx:idx + - 1, :input_length] = np.array( - [5] * input_length) - self.share_inputs["eos_token_id"][:] = np.array( - [2], dtype="int64").reshape(-1, 1) - self.share_inputs["seq_lens_this_time"][idx:idx + 1] = input_length - self.share_inputs["step_seq_lens_encoder"][idx:idx + - 1] = input_length - self.share_inputs["seq_lens_encoder"][idx:idx + 1] = input_length - self.share_inputs["seq_lens_decoder"][idx:idx + 1] = 0 - self.share_inputs["step_idx"][idx:idx + 1] = 0 - self.share_inputs["max_dec_len"][idx:idx + 1] = max_dec_len - self.share_inputs["stop_flags"][idx:idx + 1] = False + self.share_inputs["input_ids"][idx : idx + 1, :input_length] = np.array([5] * input_length) + self.share_inputs["eos_token_id"][:] = np.array([2], dtype="int64").reshape(-1, 1) + self.share_inputs["seq_lens_this_time"][idx : idx + 1] = input_length + self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = input_length + self.share_inputs["seq_lens_encoder"][idx : idx + 1] = input_length + self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0 + self.share_inputs["step_idx"][idx : idx + 1] = 0 + self.share_inputs["max_dec_len"][idx : idx + 1] = max_dec_len + self.share_inputs["stop_flags"][idx : idx + 1] = False - self.share_inputs["first_token_ids"][ - idx:idx + 1] = self.share_inputs["input_ids"][idx:idx + 1, :1] - self.share_inputs["ori_seq_lens_encoder"][idx:idx + - 1] = input_length + self.share_inputs["first_token_ids"][idx : idx + 1] = self.share_inputs["input_ids"][idx : idx + 1, :1] + self.share_inputs["ori_seq_lens_encoder"][idx : idx + 1] = input_length - self.share_inputs["encoder_block_lens"][idx:idx + 1] = block_num - self.share_inputs["block_tables"][idx : idx + 1, :block_num] = np.arange(idx * block_num, \ - (idx + 1) * block_num, 1) + self.share_inputs["encoder_block_lens"][idx : idx + 1] = block_num + self.share_inputs["block_tables"][idx : idx + 1, :block_num] = np.arange( + idx * block_num, (idx + 1) * block_num, 1 + ) def _init_share_inputs(self, max_num_seqs: int): """Initialize all share buffers for model inputs. 
@@ -333,203 +302,145 @@ class GCUModelRunner(ModelRunnerBase): self.share_inputs["pre_ids"] = paddle.full( [max_num_seqs, self.parallel_config.max_model_len], -1, - dtype='int64') + dtype="int64", + ) self.share_inputs["input_ids"] = paddle.full( [max_num_seqs, self.parallel_config.max_model_len], self.parallel_config.pad_token_id, - dtype='int64') - self.share_inputs["eos_token_id"] = paddle.full( - [self.parallel_config.eos_tokens_lens, 1], 0, dtype='int64') - self.share_inputs["top_p"] = paddle.full([max_num_seqs, 1], - self.model_config.top_p, - dtype='float32') - self.share_inputs["top_k"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int64') + dtype="int64", + ) + self.share_inputs["eos_token_id"] = paddle.full([self.parallel_config.eos_tokens_lens, 1], 0, dtype="int64") + self.share_inputs["top_p"] = paddle.full([max_num_seqs, 1], self.model_config.top_p, dtype="float32") + self.share_inputs["top_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int64") self.share_inputs["temperature"] = paddle.full( - [max_num_seqs, 1], self.model_config.temperature, dtype='float32') + [max_num_seqs, 1], self.model_config.temperature, dtype="float32" + ) self.share_inputs["penalty_score"] = paddle.full( - [max_num_seqs, 1], - self.model_config.penalty_score, - dtype='float32') + [max_num_seqs, 1], self.model_config.penalty_score, dtype="float32" + ) self.share_inputs["frequency_score"] = paddle.full( [max_num_seqs, 1], self.model_config.frequency_score, - dtype='float32') + dtype="float32", + ) self.share_inputs["presence_score"] = paddle.full( - [max_num_seqs, 1], - self.model_config.presence_score, - dtype='float32') + [max_num_seqs, 1], self.model_config.presence_score, dtype="float32" + ) - self.share_inputs["min_dec_len"] = paddle.full( - [max_num_seqs, 1], self.model_config.min_length, dtype='int64') - self.share_inputs["max_dec_len"] = paddle.full( - [max_num_seqs, 1], self.model_config.max_length, dtype='int64') - self.share_inputs["min_length"] = paddle.full( - [max_num_seqs, 1], self.model_config.min_length, dtype='int64') - self.share_inputs["max_length"] = paddle.full( - [max_num_seqs, 1], self.model_config.max_length, dtype='int64') - self.share_inputs["seq_lens_this_time"] = paddle.full(max_num_seqs, - 0, - dtype='int32') - self.share_inputs["seq_lens_encoder"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int32') - self.share_inputs["seq_lens_decoder"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int32') - self.share_inputs["step_seq_lens_encoder"] = paddle.full( - [max_num_seqs, 1], 0, dtype='int32') - self.share_inputs["step_seq_lens_decoder"] = paddle.full( - [max_num_seqs, 1], 0, dtype='int32') - self.share_inputs["step_idx"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int64') - self.share_inputs["not_need_stop"] = paddle.full( - [1], False, - dtype='bool').cpu() - self.share_inputs["stop_flags"] = paddle.full([max_num_seqs, 1], - True, - dtype='bool') - self.share_inputs["stop_nums"] = paddle.full([1], - max_num_seqs, - dtype='int64') + self.share_inputs["min_dec_len"] = paddle.full([max_num_seqs, 1], self.model_config.min_length, dtype="int64") + self.share_inputs["max_dec_len"] = paddle.full([max_num_seqs, 1], self.model_config.max_length, dtype="int64") + self.share_inputs["min_length"] = paddle.full([max_num_seqs, 1], self.model_config.min_length, dtype="int64") + self.share_inputs["max_length"] = paddle.full([max_num_seqs, 1], self.model_config.max_length, dtype="int64") + self.share_inputs["seq_lens_this_time"] = paddle.full(max_num_seqs, 0, dtype="int32") + 
self.share_inputs["seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["seq_lens_decoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["step_seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["step_seq_lens_decoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["step_idx"] = paddle.full([max_num_seqs, 1], 0, dtype="int64") + self.share_inputs["not_need_stop"] = paddle.full([1], False, dtype="bool").cpu() + self.share_inputs["stop_flags"] = paddle.full([max_num_seqs, 1], True, dtype="bool") + self.share_inputs["stop_nums"] = paddle.full([1], max_num_seqs, dtype="int64") - self.share_inputs["bad_tokens"] = paddle.full([1], -1, dtype='int64') - self.share_inputs["next_tokens"] = paddle.full([max_num_seqs, 1], - -1, - dtype='int64') - self.share_inputs["is_block_step"] = paddle.full([max_num_seqs], - False, - dtype='bool') - self.share_inputs["encoder_block_lens"] = paddle.full([max_num_seqs], - 0, - dtype='int32') - self.share_inputs["step_block_list"] = paddle.full([max_num_seqs], - -1, - dtype='int32') - self.share_inputs["step_lens"] = paddle.full([1], 0, dtype='int32') - self.share_inputs["recover_block_list"] = paddle.full([max_num_seqs], - -1, - dtype='int32') - self.share_inputs["recover_lens"] = paddle.full([1], 0, dtype='int32') - self.share_inputs["need_block_list"] = paddle.full([max_num_seqs], - -1, - dtype='int32') - self.share_inputs["need_block_len"] = paddle.full([1], - 0, - dtype='int32') - self.share_inputs["used_list_len"] = paddle.full([max_num_seqs], - 0, - dtype='int32') - self.share_inputs["infer_seed"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int64') - self.share_inputs["first_token_ids"] = paddle.full([max_num_seqs, 1], - -1, - dtype='int64') - self.share_inputs["ori_seq_lens_encoder"] = paddle.full( - [max_num_seqs, 1], 0, dtype='int32') - self.share_inputs["system_lens"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int32') - self.share_inputs["system_ids"] = paddle.full([max_num_seqs, 1], - -1, - dtype='int32') + self.share_inputs["bad_tokens"] = paddle.full([1], -1, dtype="int64") + self.share_inputs["next_tokens"] = paddle.full([max_num_seqs, 1], -1, dtype="int64") + self.share_inputs["is_block_step"] = paddle.full([max_num_seqs], False, dtype="bool") + self.share_inputs["encoder_block_lens"] = paddle.full([max_num_seqs], 0, dtype="int32") + self.share_inputs["step_block_list"] = paddle.full([max_num_seqs], -1, dtype="int32") + self.share_inputs["step_lens"] = paddle.full([1], 0, dtype="int32") + self.share_inputs["recover_block_list"] = paddle.full([max_num_seqs], -1, dtype="int32") + self.share_inputs["recover_lens"] = paddle.full([1], 0, dtype="int32") + self.share_inputs["need_block_list"] = paddle.full([max_num_seqs], -1, dtype="int32") + self.share_inputs["need_block_len"] = paddle.full([1], 0, dtype="int32") + self.share_inputs["used_list_len"] = paddle.full([max_num_seqs], 0, dtype="int32") + self.share_inputs["infer_seed"] = paddle.full([max_num_seqs, 1], 0, dtype="int64") + self.share_inputs["first_token_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int64") + self.share_inputs["ori_seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["system_lens"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["system_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int32") self.share_inputs["ids_remove_padding"] = paddle.full( [max_num_seqs * 
self.parallel_config.max_model_len], 0, - dtype='int64') - self.share_inputs["cum_offsets"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int32') - self.share_inputs["padding_offset"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int32') - self.share_inputs["cu_seqlens_q"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int32') - self.share_inputs["cu_seqlens_k"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int32') + dtype="int64", + ) + self.share_inputs["cum_offsets"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["padding_offset"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["cu_seqlens_q"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["cu_seqlens_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") # AttentionBackend buffers - self.share_inputs["decoder_batch_ids"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int32') - self.share_inputs["decoder_tile_ids_per_batch"] = paddle.full( - [max_num_seqs, 1], 0, dtype='int32') + self.share_inputs["decoder_batch_ids"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["decoder_tile_ids_per_batch"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") # Initialize rotary position embedding - tmp_position_ids = paddle.arange( - self.parallel_config.max_model_len).reshape((1, -1)) + tmp_position_ids = paddle.arange(self.parallel_config.max_model_len).reshape((1, -1)) self.share_inputs["rope_emb"] = get_rope( rotary_dim=self.model_config.head_dim, position_ids=tmp_position_ids, base=self.model_config.rope_theta, - model_config=self.model_config) + model_config=self.model_config, + ) # Set block tables pre_max_block_num = ( - self.parallel_config.max_model_len + - self.parallel_config.block_size - 1 + self.parallel_config.max_model_len + self.parallel_config.block_size - 1 ) // self.parallel_config.block_size + self.parallel_config.enc_dec_block_num - self.share_inputs["block_tables"] = paddle.full( - [max_num_seqs, pre_max_block_num], -1, dtype='int32') + self.share_inputs["block_tables"] = paddle.full([max_num_seqs, pre_max_block_num], -1, dtype="int32") # Initialize free list free_list = list( range( self.parallel_config.total_block_num - 1, - int(self.parallel_config.total_block_num * - self.parallel_config.kv_cache_ratio) - 1, -1)) + int(self.parallel_config.total_block_num * self.parallel_config.kv_cache_ratio) - 1, + -1, + ) + ) self.free_list_len = len(free_list) - self.share_inputs["free_list"] = paddle.to_tensor(free_list, - dtype="int32") - self.share_inputs["free_list_len"] = paddle.full([1], - self.free_list_len, - dtype="int32") + self.share_inputs["free_list"] = paddle.to_tensor(free_list, dtype="int32") + self.share_inputs["free_list_len"] = paddle.full([1], self.free_list_len, dtype="int32") # Initialize stop seqs - self.share_inputs["stop_seqs_len"] = paddle.full( - [self.model_config.max_stop_seqs_num], 0, dtype="int32") - self.share_inputs["stop_seqs"] = paddle.full([ - self.model_config.max_stop_seqs_num, - self.model_config.stop_seqs_max_len - ], - -1, - dtype="int32") + self.share_inputs["stop_seqs_len"] = paddle.full([self.model_config.max_stop_seqs_num], 0, dtype="int32") + self.share_inputs["stop_seqs"] = paddle.full( + [ + self.model_config.max_stop_seqs_num, + self.model_config.stop_seqs_max_len, + ], + -1, + dtype="int32", + ) if self.speculative_decoding: max_draft_token_num = self.speculative_config.num_speculative_tokens self.share_inputs["input_ids_cpu"] = paddle.full( shape=[max_num_seqs, 
self.parallel_config.max_model_len], fill_value=1, - dtype='int64').cpu() - self.share_inputs['accept_tokens'] = paddle.full( + dtype="int64", + ).cpu() + self.share_inputs["accept_tokens"] = paddle.full( shape=[max_num_seqs, max_draft_token_num + 1], fill_value=0, - dtype="int64") - self.share_inputs['accept_num'] = paddle.full(shape=[max_num_seqs], - fill_value=0, - dtype='int32') - self.share_inputs['draft_tokens'] = paddle.full( + dtype="int64", + ) + self.share_inputs["accept_num"] = paddle.full(shape=[max_num_seqs], fill_value=0, dtype="int32") + self.share_inputs["draft_tokens"] = paddle.full( shape=[max_num_seqs, max_draft_token_num + 1], fill_value=0, - dtype="int64") + dtype="int64", + ) - self.share_inputs['actual_draft_token_num'] = paddle.full( + self.share_inputs["actual_draft_token_num"] = paddle.full( shape=[max_num_seqs], fill_value=max_draft_token_num, - dtype="int32") - self.share_inputs["output_cum_offsets"] = paddle.full( - shape=[max_num_seqs, 1], fill_value=0, dtype='int32') + dtype="int32", + ) + self.share_inputs["output_cum_offsets"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") self.share_inputs["output_padding_offset"] = paddle.full( shape=[max_num_seqs * (max_draft_token_num + 1)], fill_value=0, - dtype="int32") + dtype="int32", + ) def _prepare_inputs(self) -> None: - """ prepare the model inputs """ + """prepare the model inputs""" # Remove padding ( ids_remove_padding, @@ -540,14 +451,16 @@ class GCUModelRunner(ModelRunnerBase): output_cum_offsets, output_padding_offset, ) = pre_process( - self.parallel_config.max_model_len, self.share_inputs["input_ids"], - self.share_inputs["seq_lens_this_time"], self.speculative_decoding, - self.share_inputs["draft_tokens"] if self.speculative_decoding else - None, self.share_inputs["seq_lens_encoder"], - self.share_inputs["seq_lens_decoder"]) + self.parallel_config.max_model_len, + self.share_inputs["input_ids"], + self.share_inputs["seq_lens_this_time"], + self.speculative_decoding, + (self.share_inputs["draft_tokens"] if self.speculative_decoding else None), + self.share_inputs["seq_lens_encoder"], + self.share_inputs["seq_lens_decoder"], + ) - self.share_inputs["ids_remove_padding"].copy_(ids_remove_padding, - False) + self.share_inputs["ids_remove_padding"].copy_(ids_remove_padding, False) self.share_inputs["cum_offsets"].copy_(cum_offsets, False) self.share_inputs["padding_offset"].copy_(padding_offset, False) self.share_inputs["cu_seqlens_q"].copy_(cu_seqlens_q, False) @@ -555,10 +468,8 @@ class GCUModelRunner(ModelRunnerBase): # For speculative decoding if self.speculative_decoding: - self.share_inputs["output_cum_offsets"].copy_( - output_cum_offsets, False) - self.share_inputs["output_padding_offset"].copy_( - output_padding_offset, False) + self.share_inputs["output_cum_offsets"].copy_(output_cum_offsets, False) + self.share_inputs["output_padding_offset"].copy_(output_padding_offset, False) # Initialize forward meta data self.initialize_forward_meta() @@ -580,16 +491,15 @@ class GCUModelRunner(ModelRunnerBase): ) def load_model(self) -> None: - """ load or download model """ - logger.info( - f"Starting to load model {self.model_config.architectures[0]}") + """load or download model""" + logger.info(f"Starting to load model {self.model_config.architectures[0]}") time_before_load = time.perf_counter() # 1. 
Load original model self.model = get_model_from_loader(fd_config=self.fd_config) # 1.1 Load RL dynamic model if self.fd_config.load_config.dynamic_load_weight: - from fastdeploy.rl.dynamic_weight_manager import \ - DynamicWeightManager + from fastdeploy.rl.dynamic_weight_manager import DynamicWeightManager + self.dynamic_weight_manager = DynamicWeightManager(self.fd_config, self.model) # 2. Load lora model @@ -597,14 +507,13 @@ class GCUModelRunner(ModelRunnerBase): # 3. Load drafter model(for speculative decoding) time_after_load = time.perf_counter() - logger.info( - f"Model loading took {time_after_load - time_before_load} seconds") + logger.info(f"Model loading took {time_after_load - time_before_load} seconds") # 4. Init proposer for speculative method self.init_speculative_proposer() def get_model(self) -> nn.Layer: - """ get current model """ + """get current model""" return self.model def initialize_forward_meta(self): @@ -627,7 +536,7 @@ class GCUModelRunner(ModelRunnerBase): cu_seqlens_q=self.share_inputs["cu_seqlens_q"], cu_seqlens_k=self.share_inputs["cu_seqlens_k"], block_tables=self.share_inputs["block_tables"], - caches=self.share_inputs["caches"] + caches=self.share_inputs["caches"], ) # Initialzie attention meta data @@ -641,13 +550,13 @@ class GCUModelRunner(ModelRunnerBase): self.forward_meta.clear_caches() def clear_parameters(self, pid): - """"dynamic model loader use to clear parameters use for RL""" + """ "dynamic model loader use to clear parameters use for RL""" self.dynamic_weight_manager.clear_parameters(pid) self.clear_cache() self.dynamic_weight_manager._log_memory("dynamic weight manager clear all memory") def update_parameters(self, pid): - """"dynamic model loader use to update parameters use for RL""" + """ "dynamic model loader use to update parameters use for RL""" self.dynamic_weight_manager.update_parameters(pid) self.initialize_kv_cache() self.dynamic_weight_manager._log_memory("dynamic weight manager update all memory") @@ -662,31 +571,30 @@ class GCUModelRunner(ModelRunnerBase): # Get kv cache dtype cache_type = self.parallel_config.dtype - if (self.quant_config - and hasattr(self.quant_config, "kv_cache_quant_type") - and self.quant_config.kv_cache_quant_type is not None): - cache_type = 'uint8' + if ( + self.quant_config + and hasattr(self.quant_config, "kv_cache_quant_type") + and self.quant_config.kv_cache_quant_type is not None + ): + cache_type = "uint8" # Get kv cache shape - kv_cache_shape = self.attn_backends[0].get_kv_cache_shape( - max_num_blocks=max_block_num) + kv_cache_shape = self.attn_backends[0].get_kv_cache_shape(max_num_blocks=max_block_num) # local_rank = self.local_rank % self.parallel_config.tensor_parallel_size if not self.parallel_config.do_profile and ( - self.parallel_config.enable_prefix_caching \ - or self.parallel_config.splitwise_role != "mixed"): - raise NotImplementedError( - "prefix_caching is not support by GCUModelRunner." 
- ) + self.parallel_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed" + ): + raise NotImplementedError("prefix_caching is not support by GCUModelRunner.") else: for i in range(self.model_config.num_hidden_layers): - cache_kvs["key_caches_{}".format(i)] = paddle.full( + cache_kvs[f"key_caches_{i}"] = paddle.full( shape=kv_cache_shape, fill_value=0, dtype=cache_type, ) - cache_kvs["value_caches_{}".format(i)] = paddle.full( + cache_kvs[f"value_caches_{i}"] = paddle.full( shape=kv_cache_shape, fill_value=0, dtype=cache_type, @@ -702,42 +610,49 @@ class GCUModelRunner(ModelRunnerBase): assert len(self.attn_backends) == 0 num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_size - self.model_config.kv_num_heads = int( - self.model_config.num_key_value_heads - ) // self.parallel_config.tensor_parallel_size + self.model_config.kv_num_heads = ( + int(self.model_config.num_key_value_heads) // self.parallel_config.tensor_parallel_size + ) head_dim = self.model_config.head_dim # Get the attention backend attn_cls = get_attention_backend() - attn_backend = attn_cls(self.fd_config, - kv_num_heads=self.model_config.kv_num_heads, - num_heads=num_heads, - head_dim=head_dim) + attn_backend = attn_cls( + self.fd_config, + kv_num_heads=self.model_config.kv_num_heads, + num_heads=num_heads, + head_dim=head_dim, + ) if attn_backend is None: raise NotImplementedError( "Attention backend which you specified is not supported, please set FD_ATTENTION_BACKEND correctly." ) self.attn_backends.append(attn_backend) - def _dummy_run(self, - num_tokens: paddle.Tensor, - batch_size: paddle.Tensor, - expected_decode_len: int = 1, - in_capturing: bool = False) -> paddle.Tensor: + def _dummy_run( + self, + num_tokens: paddle.Tensor, + batch_size: paddle.Tensor, + expected_decode_len: int = 1, + in_capturing: bool = False, + ) -> paddle.Tensor: """ Use dummy inputs to run before formal execution. Args: num_tokens: expected_decode_len: Expected number of tokens generated """ - self._dummy_prefill_inputs(num_tokens=num_tokens, - batch_size=batch_size, - expected_decode_len=expected_decode_len) + self._dummy_prefill_inputs( + num_tokens=num_tokens, + batch_size=batch_size, + expected_decode_len=expected_decode_len, + ) if self.speculative_method in ["mtp"]: self.proposer.dummy_prefill_inputs( num_tokens=num_tokens, batch_size=batch_size, - expected_decode_len=expected_decode_len) + expected_decode_len=expected_decode_len, + ) while True: # 1. Compute real num_tokens @@ -750,7 +665,8 @@ class GCUModelRunner(ModelRunnerBase): # 4. 
Run model model_output = self.model( ids_remove_padding=self.share_inputs["ids_remove_padding"], - forward_meta=self.forward_meta) + forward_meta=self.forward_meta, + ) hiddden_states = rebuild_padding( model_output, @@ -758,9 +674,9 @@ class GCUModelRunner(ModelRunnerBase): self.share_inputs["seq_lens_this_time"], self.share_inputs["seq_lens_decoder"], self.share_inputs["seq_lens_encoder"], - self.share_inputs["output_padding_offset"] - if self.speculative_decoding else - None, # speculative decoding requires + ( + self.share_inputs["output_padding_offset"] if self.speculative_decoding else None + ), # speculative decoding requires self.parallel_config.max_model_len, ) @@ -777,24 +693,22 @@ class GCUModelRunner(ModelRunnerBase): self.share_inputs["step_idx"], self.share_inputs["stop_flags"], ) - sampler_output = self.sampler(logits, - self.sampling_metadata) + sampler_output = self.sampler(logits, self.sampling_metadata) if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0) else: - self.sampler(logits, self.sampling_metadata, - self.parallel_config.max_model_len, - self.share_inputs) + self.sampler( + logits, + self.sampling_metadata, + self.parallel_config.max_model_len, + self.share_inputs, + ) sampler_output = None if self.parallel_config.tensor_parallel_size > 1: - paddle.distributed.broadcast( - self.share_inputs["accept_tokens"], 0) - paddle.distributed.broadcast( - self.share_inputs["accept_num"], 0) - paddle.distributed.broadcast(self.share_inputs["step_idx"], - 0) - paddle.distributed.broadcast( - self.share_inputs["stop_flags"], 0) + paddle.distributed.broadcast(self.share_inputs["accept_tokens"], 0) + paddle.distributed.broadcast(self.share_inputs["accept_num"], 0) + paddle.distributed.broadcast(self.share_inputs["step_idx"], 0) + paddle.distributed.broadcast(self.share_inputs["stop_flags"], 0) # 6. post process model_output_data = ModelOutputData( @@ -815,20 +729,20 @@ class GCUModelRunner(ModelRunnerBase): msg_queue_id=self.parallel_config.msg_queue_id, mp_rank=self.local_rank, use_ep=self.parallel_config.use_ep, - draft_tokens=self.share_inputs["draft_tokens"] - if self.speculative_decoding else None, - actual_draft_token_num=self. 
- share_inputs["actual_draft_token_num"] - if self.speculative_decoding else None, - accept_tokens=self.share_inputs["accept_tokens"] - if self.speculative_decoding else None, - accept_num=self.share_inputs["accept_num"] - if self.speculative_decoding else None) + draft_tokens=(self.share_inputs["draft_tokens"] if self.speculative_decoding else None), + actual_draft_token_num=( + self.share_inputs["actual_draft_token_num"] if self.speculative_decoding else None + ), + accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None), + accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None), + ) - post_process(sampler_output=sampler_output, - model_output=model_output_data, - speculative_decoding=self.speculative_decoding, - skip_save_output=True) + post_process( + sampler_output=sampler_output, + model_output=model_output_data, + speculative_decoding=self.speculative_decoding, + skip_save_output=True, + ) if self.speculative_decoding: if self.speculative_method == "mtp": @@ -840,7 +754,7 @@ class GCUModelRunner(ModelRunnerBase): self.share_inputs["infer_seed"].add_(self.infer_seed_increment) self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED - if int((self.share_inputs['seq_lens_this_time'] > 0).sum()) == 0: + if int((self.share_inputs["seq_lens_this_time"] > 0).sum()) == 0: break def _update_chunked_prefill(self, tasks): @@ -860,33 +774,25 @@ class GCUModelRunner(ModelRunnerBase): for id, task in list(self.restore_chunked_prefill_request.items()): idx = task.idx - logger.debug( - f"{task.request_id} chunked prefill {task.chunk_idx}/{len(task.prefill_chunk_info)}" - ) - start_idx = sum(task.prefill_chunk_info[:task.chunk_idx]) + logger.debug(f"{task.request_id} chunked prefill {task.chunk_idx}/{len(task.prefill_chunk_info)}") + start_idx = sum(task.prefill_chunk_info[: task.chunk_idx]) if task.chunk_idx == len(task.prefill_chunk_info): - self.share_inputs["seq_lens_this_time"][idx:idx + 1] = 1 - self.share_inputs['seq_lens_encoder'][idx:idx + 1] = 0 - self.share_inputs["step_idx"][idx:idx + 1] = 1 - self.share_inputs["seq_lens_decoder"][ - idx:idx + 1] = start_idx + task.get("seq_lens_decoder", 0) + self.share_inputs["seq_lens_this_time"][idx : idx + 1] = 1 + self.share_inputs["seq_lens_encoder"][idx : idx + 1] = 0 + self.share_inputs["step_idx"][idx : idx + 1] = 1 + self.share_inputs["seq_lens_decoder"][idx : idx + 1] = start_idx + task.get("seq_lens_decoder", 0) del self.restore_chunked_prefill_request[task.request_id] else: token_chunk_size = task.prefill_chunk_info[task.chunk_idx] - self.share_inputs["seq_lens_this_time"][idx:idx + - 1] = token_chunk_size - self.share_inputs['input_ids'][ - idx, :token_chunk_size] = np.array( - task.prompt_token_ids[start_idx:start_idx + - token_chunk_size]) - self.share_inputs['seq_lens_encoder'][idx:idx + - 1] = token_chunk_size - self.share_inputs["step_idx"][idx:idx + 1] = 0 - self.share_inputs["seq_lens_decoder"][ - idx:idx + 1] = start_idx + task.get("seq_lens_decoder", 0) - if self.speculative_decoding and self.proposer.is_chunk_prefill_enabled( - ): + self.share_inputs["seq_lens_this_time"][idx : idx + 1] = token_chunk_size + self.share_inputs["input_ids"][idx, :token_chunk_size] = np.array( + task.prompt_token_ids[start_idx : start_idx + token_chunk_size] + ) + self.share_inputs["seq_lens_encoder"][idx : idx + 1] = token_chunk_size + self.share_inputs["step_idx"][idx : idx + 1] = 0 + self.share_inputs["seq_lens_decoder"][idx : idx + 1] = start_idx + task.get("seq_lens_decoder", 0) + 
if self.speculative_decoding and self.proposer.is_chunk_prefill_enabled(): self.proposer.update_task_chunk_prefill(task) task.chunk_idx += 1 @@ -899,26 +805,22 @@ class GCUModelRunner(ModelRunnerBase): Trigger CUDA Graph capture for all shapes in 'CudaGraphConfig.cudagraph_capture_sizes' """ if not self.use_cudagraph: - logger.info( - "Skipping CUDA graph capture. Please check GraphOptimizationConfig" - ) + logger.info("Skipping CUDA graph capture. Please check GraphOptimizationConfig") return time_before_capture = time.perf_counter() expected_decode_len = 1 capture_sizes = self.cudagraph_capture_sizes.copy() for batch_size in sorted(capture_sizes, reverse=True): - self._dummy_run(num_tokens=self.parallel_config.max_model_len, - batch_size=batch_size, - in_capturing=True, - expected_decode_len=expected_decode_len) - logger.info( - f"Warm up the model with the batch size:{batch_size}, num tokens:{expected_decode_len}" + self._dummy_run( + num_tokens=self.parallel_config.max_model_len, + batch_size=batch_size, + in_capturing=True, + expected_decode_len=expected_decode_len, ) + logger.info(f"Warm up the model with the batch size:{batch_size}, num tokens:{expected_decode_len}") time_after_capture = time.perf_counter() - logger.info( - f"Cuda Graph capturing took {time_after_capture - time_before_capture} seconds" - ) + logger.info(f"Cuda Graph capturing took {time_after_capture - time_before_capture} seconds") def _get_skip_idx(self, model_forward_batch): """ @@ -933,15 +835,12 @@ class GCUModelRunner(ModelRunnerBase): return skip_idx_list for task in model_forward_batch: - if task.get("prefill_chunk_info", - None) is None or task.chunk_idx >= len( - task.prefill_chunk_info): + if task.get("prefill_chunk_info", None) is None or task.chunk_idx >= len(task.prefill_chunk_info): continue skip_idx_list.append(task.idx) for task in self.restore_chunked_prefill_request.values(): - if task.idx in skip_idx_list or task.chunk_idx >= len( - task.prefill_chunk_info): + if task.idx in skip_idx_list or task.chunk_idx >= len(task.prefill_chunk_info): continue skip_idx_list.append(task.idx) @@ -977,7 +876,8 @@ class GCUModelRunner(ModelRunnerBase): # 3. 
Execute model model_output = self.model( ids_remove_padding=self.share_inputs["ids_remove_padding"], - forward_meta=self.forward_meta) + forward_meta=self.forward_meta, + ) hiddden_states = rebuild_padding( model_output, @@ -985,8 +885,7 @@ class GCUModelRunner(ModelRunnerBase): self.share_inputs["seq_lens_this_time"], self.share_inputs["seq_lens_decoder"], self.share_inputs["seq_lens_encoder"], - self.share_inputs["output_padding_offset"] - if self.speculative_decoding else None, + (self.share_inputs["output_padding_offset"] if self.speculative_decoding else None), self.parallel_config.max_model_len, ) @@ -1012,17 +911,18 @@ class GCUModelRunner(ModelRunnerBase): paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0) else: - self.sampler(logits, self.sampling_metadata, - self.parallel_config.max_model_len, self.share_inputs) + self.sampler( + logits, + self.sampling_metadata, + self.parallel_config.max_model_len, + self.share_inputs, + ) sampler_output = None if self.parallel_config.tensor_parallel_size > 1: - paddle.distributed.broadcast( - self.share_inputs["accept_tokens"], 0) - paddle.distributed.broadcast(self.share_inputs["accept_num"], - 0) + paddle.distributed.broadcast(self.share_inputs["accept_tokens"], 0) + paddle.distributed.broadcast(self.share_inputs["accept_num"], 0) paddle.distributed.broadcast(self.share_inputs["step_idx"], 0) - paddle.distributed.broadcast(self.share_inputs["stop_flags"], - 0) + paddle.distributed.broadcast(self.share_inputs["stop_flags"], 0) # 5. Post Process model_output_data = ModelOutputData( @@ -1043,25 +943,25 @@ class GCUModelRunner(ModelRunnerBase): msg_queue_id=self.parallel_config.msg_queue_id, mp_rank=self.local_rank, use_ep=self.parallel_config.use_ep, - draft_tokens=self.share_inputs["draft_tokens"] - if self.speculative_decoding else None, - actual_draft_token_num=self.share_inputs["actual_draft_token_num"] - if self.speculative_decoding else None, - accept_tokens=self.share_inputs["accept_tokens"] - if self.speculative_decoding else None, - accept_num=self.share_inputs["accept_num"] - if self.speculative_decoding else None) + draft_tokens=(self.share_inputs["draft_tokens"] if self.speculative_decoding else None), + actual_draft_token_num=( + self.share_inputs["actual_draft_token_num"] if self.speculative_decoding else None + ), + accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None), + accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None), + ) - if self.speculative_config.method in ["mtp"] and \ - self.parallel_config.splitwise_role == "prefill": + if self.speculative_config.method in ["mtp"] and self.parallel_config.splitwise_role == "prefill": skip_save_output = True else: skip_save_output = False - post_process(sampler_output=sampler_output, - model_output=model_output_data, - save_each_rank=self.parallel_config.use_ep, - speculative_decoding=self.speculative_decoding, - skip_save_output=skip_save_output) + post_process( + sampler_output=sampler_output, + model_output=model_output_data, + save_each_rank=self.parallel_config.use_ep, + speculative_decoding=self.speculative_decoding, + skip_save_output=skip_save_output, + ) # 6. 
Speculative decode if self.speculative_decoding: @@ -1092,11 +992,9 @@ class GCUModelRunner(ModelRunnerBase): request.logits_cached = True if isinstance(request.logits_processor, LogitsProcessorBase): - self.guided_backend.add_cache(request.schemata_key, - request.logits_processor) + self.guided_backend.add_cache(request.schemata_key, request.logits_processor) else: - self.guided_backend.add_cache( - request.schemata_key, request.logits_processor.result()) + self.guided_backend.add_cache(request.schemata_key, request.logits_processor.result()) def _execute_empty_input(self) -> None: """ @@ -1107,8 +1005,7 @@ class GCUModelRunner(ModelRunnerBase): if hasattr(self.model, "empty_input_forward"): self.model.empty_input_forward() else: - raise ValueError( - f"{type(self.model)} has no attribute 'empty_input_forward") + raise ValueError(f"{type(self.model)} has no attribute 'empty_input_forward'") def profile_run(self) -> None: """Execute a forward pass with dummy inputs to profile the memory usage of the model.""" @@ -1120,8 +1017,10 @@ class GCUModelRunner(ModelRunnerBase): # 1. Profile with multimodal encoder & encoder cache # 2. Dummy run - self._dummy_run(num_tokens=self.parallel_config.max_num_batched_tokens, - batch_size=min(self.parallel_config.max_num_seqs, 3)) + self._dummy_run( + num_tokens=self.parallel_config.max_num_batched_tokens, + batch_size=min(self.parallel_config.max_num_seqs, 3), + ) # 3. gc self.clear_cache() @@ -1139,23 +1038,24 @@ class GCUModelRunner(ModelRunnerBase): self.num_gcu_blocks = num_gpu_blocks # Reset block table and kv cache with global block num - if not (self.parallel_config.enable_prefix_caching \ - or self.parallel_config.splitwise_role != "mixed"): + if not (self.parallel_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed"): self.initialize_kv_cache() # Reset free list free_list = list( range( self.num_gcu_blocks - 1, - int(self.num_gcu_blocks * self.parallel_config.kv_cache_ratio) - - 1, -1)) + int(self.num_gcu_blocks * self.parallel_config.kv_cache_ratio) - 1, + -1, + ) + ) self.free_list_len = len(free_list) - self.share_inputs.update({ - "free_list": - paddle.to_tensor(free_list, dtype="int32"), - "free_list_len": - paddle.full([1], self.free_list_len, dtype="int32"), - }) + self.share_inputs.update( + { + "free_list": paddle.to_tensor(free_list, dtype="int32"), + "free_list_len": paddle.full([1], self.free_list_len, dtype="int32"), + } + ) self.parallel_config.do_profile = False @@ -1173,9 +1073,11 @@ class GCUModelRunner(ModelRunnerBase): - cache_int4: """ cache_quant_dtype = None - if (self.quant_config - and hasattr(self.quant_config, "kv_cache_quant_type") - and self.quant_config.kv_cache_quant_type is not None): + if ( + self.quant_config + and hasattr(self.quant_config, "kv_cache_quant_type") + and self.quant_config.kv_cache_quant_type is not None + ): cache_quant_dtype = self.quant_config.kv_cache_quant_type if cache_quant_dtype is not None: # int8, int8_zp, fp8, fp8_zp @@ -1184,14 +1086,12 @@ byte_of_dtype = 2 hidden_dim = self.model_config.head_dim * self.model_config.kv_num_heads - num_layers = self.model_config.num_hidden_layers + \ - self.speculative_config.num_gpu_block_expand_ratio if \ - self.speculative_method in [ - "mtp" - ] else self.model_config.num_hidden_layers - required_memory = ( - byte_of_dtype * 2 * # k + v - (self.parallel_config.block_size * hidden_dim) * num_layers) + num_layers = ( + self.model_config.num_hidden_layers +
self.speculative_config.num_gpu_block_expand_ratio + if self.speculative_method in ["mtp"] + else self.model_config.num_hidden_layers + ) + required_memory = byte_of_dtype * 2 * (self.parallel_config.block_size * hidden_dim) * num_layers # k + v return required_memory def not_need_stop(self) -> bool: diff --git a/fastdeploy/worker/gcu_worker.py b/fastdeploy/worker/gcu_worker.py index f280084de..2e4e83885 100644 --- a/fastdeploy/worker/gcu_worker.py +++ b/fastdeploy/worker/gcu_worker.py @@ -13,11 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + import gc from typing import List, Optional import paddle -import paddle.nn as nn +from paddle import nn from fastdeploy.config import FDConfig from fastdeploy.engine.request import Request @@ -46,8 +47,7 @@ class GcuWorker(WorkerBase): pass def init_device(self): - """ Initialize device and Construct model runner - """ + """Initialize device and Construct model runner""" if paddle.is_compiled_with_custom_device("gcu"): # Set evironment variable self.device_ids = self.parallel_config.device_ids.split(",") @@ -58,8 +58,7 @@ class GcuWorker(WorkerBase): gc.collect() else: - raise RuntimeError( - f"Not support device type: {self.device_config.device}") + raise RuntimeError(f"Not support device type: {self.device_config.device}") # Construct model runner self.model_runner: GCUModelRunner = GCUModelRunner( @@ -67,7 +66,8 @@ class GcuWorker(WorkerBase): device=self.device, device_id=self.device_ids[self.local_rank], rank=self.rank, - local_rank=self.local_rank) + local_rank=self.local_rank, + ) def prefill_finished(self): """ @@ -98,8 +98,7 @@ class GcuWorker(WorkerBase): """ """ return self.model_runner.get_model() - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: """ """ pass @@ -112,7 +111,7 @@ class GcuWorker(WorkerBase): return output def preprocess_new_task(self, req_dicts: List[Request]) -> None: - """ Process new requests and then start the decode loop + """Process new requests and then start the decode loop TODO(gongshaotian):The scheduler should schedule the handling of prefill, and workers and modelrunners should not perceive it. """ @@ -138,5 +137,4 @@ class GcuWorker(WorkerBase): def reinitialize_kv_cache(self, num_gpu_blocks: int) -> None: """ """ - self.model_runner.update_share_input_block_num( - num_gpu_blocks=num_gpu_blocks) + self.model_runner.update_share_input_block_num(num_gpu_blocks=num_gpu_blocks) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 8ad834c70..6ce285081 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -13,35 +13,40 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" + import os import time from typing import List, Optional import numpy as np import paddle -import paddle.nn as nn +from paddle import nn from paddleformers.utils.log import logger from fastdeploy.config import FDConfig from fastdeploy.engine.request import Request from fastdeploy.model_executor.guided_decoding import get_guided_backend -from fastdeploy.model_executor.guided_decoding.base_guided_decoding import \ - LogitsProcessorBase +from fastdeploy.model_executor.guided_decoding.base_guided_decoding import ( + LogitsProcessorBase, +) from fastdeploy.model_executor.layers.attention import get_attention_backend -from fastdeploy.model_executor.layers.attention.base_attention_backend import \ - AttentionBackend -from fastdeploy.model_executor.layers.rotary_embedding import (get_rope, - get_rope_3d) +from fastdeploy.model_executor.layers.attention.base_attention_backend import ( + AttentionBackend, +) +from fastdeploy.model_executor.layers.rotary_embedding import get_rope, get_rope_3d from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata -from fastdeploy.model_executor.layers.sample.sampler import ( - Sampler, SpeculativeSampler) +from fastdeploy.model_executor.layers.sample.sampler import Sampler, SpeculativeSampler from fastdeploy.model_executor.model_loader import get_model_from_loader -from fastdeploy.model_executor.ops.gpu import (set_value_by_flags_and_idx, - share_external_data) -from fastdeploy.model_executor.pre_and_post_process import (post_process, - pre_process, - rebuild_padding, - step_cuda) +from fastdeploy.model_executor.ops.gpu import ( + set_value_by_flags_and_idx, + share_external_data, +) +from fastdeploy.model_executor.pre_and_post_process import ( + post_process, + pre_process, + rebuild_padding, + step_cuda, +) from fastdeploy.platforms import current_platform if not current_platform.is_dcu(): @@ -49,21 +54,20 @@ if not current_platform.is_dcu(): from fastdeploy.input.mm_processor import DataProcessor from fastdeploy.model_executor.forward_meta import ForwardMeta -from fastdeploy.model_executor.models.ernie4_5_vl.modeling_resampler import \ - ScatterOp +from fastdeploy.model_executor.models.ernie4_5_vl.modeling_resampler import ScatterOp from fastdeploy.worker.model_runner_base import ModelRunnerBase from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput class GPUModelRunner(ModelRunnerBase): - def __init__( - self, - fd_config: FDConfig, - device: str, # logic device - device_id: int, # physical device id - rank: int, - local_rank: int): + self, + fd_config: FDConfig, + device: str, # logic device + device_id: int, # physical device id + rank: int, + local_rank: int, + ): super().__init__(fd_config=fd_config, device=device) self.enable_mm = self.model_config.enable_mm self.rank = rank @@ -110,15 +114,15 @@ class GPUModelRunner(ModelRunnerBase): # Cuda Graph self.use_cudagraph = self.graph_opt_config.use_cudagraph - self.cudagraph_capture_sizes = list( - reversed(self.graph_opt_config.cudagraph_capture_sizes)) + self.cudagraph_capture_sizes = list(reversed(self.graph_opt_config.cudagraph_capture_sizes)) # Initialize share inputs self._init_share_inputs(self.parallel_config.max_num_seqs) self.infer_seed_increment = paddle.full( shape=[self.parallel_config.max_num_seqs, 1], fill_value=4, - dtype="int64") + dtype="int64", + ) self.restore_chunked_prefill_request = dict() # Initialize attention Backend @@ -133,14 +137,14 @@ class GPUModelRunner(ModelRunnerBase): # Postprocess Env params os.environ["INFERENCE_MSG_QUEUE_ID"] = 
str( - self.local_rank + - int(self.parallel_config.engine_worker_queue_port)) + self.local_rank + int(self.parallel_config.engine_worker_queue_port) + ) def prefill_finished(self): """ Check whether prefill stage finished """ - if int(paddle.max(self.share_inputs['seq_lens_encoder'])) != 0: + if int(paddle.max(self.share_inputs["seq_lens_encoder"])) != 0: return 1 else: return 0 @@ -152,9 +156,13 @@ class GPUModelRunner(ModelRunnerBase): if self.speculative_method == "ngram": self.proposer = NgramProposer(self.fd_config) elif self.speculative_method == "mtp": - self.proposer = MTPProposer(self.fd_config, self.get_model(), - self.local_rank, self.device_id, - self.share_inputs) + self.proposer = MTPProposer( + self.fd_config, + self.get_model(), + self.local_rank, + self.device_id, + self.share_inputs, + ) else: self.proposer = None @@ -162,8 +170,9 @@ class GPUModelRunner(ModelRunnerBase): """ init logits processor for guided decoding """ - assert self.guided_backend is not None, "guided_backend is None, use "\ - "--guided-decoding-backend to specify the backend at server startup." + assert self.guided_backend is not None, ( + "guided_backend is None, use " "--guided-decoding-backend to specify the backend at server startup." + ) if request.guided_json is not None: schemata_key = ("json", request.guided_json) @@ -174,8 +183,10 @@ class GPUModelRunner(ModelRunnerBase): elif request.structural_tag is not None: schemata_key = ("structural_tag", request.structural_tag) - return self.guided_backend.get_logits_processor( - schemata_key=schemata_key), schemata_key + return ( + self.guided_backend.get_logits_processor(schemata_key=schemata_key), + schemata_key, + ) def insert_prefill_inputs(self, req_dicts: List[Request]): """ @@ -187,9 +198,8 @@ class GPUModelRunner(ModelRunnerBase): self.initialize_kv_cache() # NOTE(luotingdan): Set environment variable of prefill node - if req_dicts[-1].disaggregate_info is not None and req_dicts[ - -1].disaggregate_info["role"] == "prefill": - os.environ['PREFILL_NODE_ONE_STEP_STOP'] = "1" + if req_dicts[-1].disaggregate_info is not None and req_dicts[-1].disaggregate_info["role"] == "prefill": + os.environ["PREFILL_NODE_ONE_STEP_STOP"] = "1" req_len = len(req_dicts) for i in range(req_len): @@ -199,127 +209,104 @@ class GPUModelRunner(ModelRunnerBase): assert length > 0, "The prompt requested must not be empty." 
prefill_tokens = [] - if (request.guided_json is not None - or request.guided_regex is not None - or request.structural_tag is not None - or request.guided_grammar is not None): - logits_info, schemata_key = self._init_logits_processor( - request) + if ( + request.guided_json is not None + or request.guided_regex is not None + or request.structural_tag is not None + or request.guided_grammar is not None + ): + logits_info, schemata_key = self._init_logits_processor(request) request.logits_processor, request.logits_cached = logits_info request.schemata_key = schemata_key # Is Decode Node - if req_dicts[i].disaggregate_info is not None and req_dicts[ - i].disaggregate_info["role"] == "decode": + if req_dicts[i].disaggregate_info is not None and req_dicts[i].disaggregate_info["role"] == "decode": prefill_tokens.append(request.prompt_token_ids[0]) - self.share_inputs["pre_ids"][idx:idx + - 1] = request.prompt_token_ids[-1] - self.share_inputs["input_ids"][idx:idx + 1, - 0] = request.prompt_token_ids[0] - self.share_inputs["prompt_ids"][idx:idx + 1, - :length] = np.array(request.prompt_token_ids) - self.share_inputs['seq_lens_encoder'][idx:idx + 1] = 0 - self.share_inputs['seq_lens_decoder'][idx:idx + 1] = length - self.share_inputs['seq_lens_this_time'][idx:idx + 1] = 1 - self.share_inputs['step_seq_lens_encoder'][idx:idx + 1] = 0 - self.share_inputs['step_seq_lens_decoder'][idx:idx + - 1] = length - self.share_inputs["prompt_lens"][idx:idx + 1] = length - self.share_inputs['step_idx'][idx:idx + 1] = 1 + self.share_inputs["pre_ids"][idx : idx + 1] = request.prompt_token_ids[-1] + self.share_inputs["input_ids"][idx : idx + 1, 0] = request.prompt_token_ids[0] + self.share_inputs["prompt_ids"][idx : idx + 1, :length] = np.array(request.prompt_token_ids) + self.share_inputs["seq_lens_encoder"][idx : idx + 1] = 0 + self.share_inputs["seq_lens_decoder"][idx : idx + 1] = length + self.share_inputs["seq_lens_this_time"][idx : idx + 1] = 1 + self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = 0 + self.share_inputs["step_seq_lens_decoder"][idx : idx + 1] = length + self.share_inputs["prompt_lens"][idx : idx + 1] = length + self.share_inputs["step_idx"][idx : idx + 1] = 1 if self.speculative_decoding: num_prefill_send_token = self.speculative_config.num_speculative_tokens + 1 - self.share_inputs['draft_tokens'][idx:idx + 1, 0:num_prefill_send_token] =\ - paddle.to_tensor(request.draft_token_ids[0:num_prefill_send_token], dtype="int64") - self.share_inputs['seq_lens_this_time'][ - idx:idx + 1] = num_prefill_send_token + self.share_inputs["draft_tokens"][idx : idx + 1, 0:num_prefill_send_token] = paddle.to_tensor( + request.draft_token_ids[0:num_prefill_send_token], + dtype="int64", + ) + self.share_inputs["seq_lens_this_time"][idx : idx + 1] = num_prefill_send_token else: - self.share_inputs["pre_ids"][idx:idx + 1] = -1 - self.share_inputs["step_idx"][idx:idx + 1] = 0 - self.share_inputs["input_ids"][idx:idx + - 1, :length] = np.array( - request.prompt_token_ids) - self.share_inputs["prompt_ids"][idx:idx + - 1, :length] = np.array( - request.prompt_token_ids) + self.share_inputs["pre_ids"][idx : idx + 1] = -1 + self.share_inputs["step_idx"][idx : idx + 1] = 0 + self.share_inputs["input_ids"][idx : idx + 1, :length] = np.array(request.prompt_token_ids) + self.share_inputs["prompt_ids"][idx : idx + 1, :length] = np.array(request.prompt_token_ids) # Use chunked prefill if self.parallel_config.enable_chunked_prefill: request.set("chunk_idx", 1) - logger.info( - f"prefill_chunk_info: 
{request.prefill_chunk_info}") + logger.info(f"prefill_chunk_info: {request.prefill_chunk_info}") token_chunk_size = request.prefill_chunk_info[0] if self.enable_mm: inputs = self._preprocess_mm_task(token_chunk_size) if inputs.get("images") is not None: - self.share_inputs["image_features"] = self.extract_vision_features( - inputs) + self.share_inputs["image_features"] = self.extract_vision_features(inputs) else: # Compatible with the situation that lacks images and videos self.share_inputs["image_features"] = None if request.multimodal_inputs["position_ids"] is not None: position_ids = paddle.to_tensor( request.multimodal_inputs["position_ids"], - dtype="int64").unsqueeze([0]) + dtype="int64", + ).unsqueeze([0]) else: position_ids = None token_chunk_size = inputs["input_ids"].shape[1] request.set("start_idx", token_chunk_size) - self.share_inputs["input_ids"][ - idx:idx + 1, :token_chunk_size] = inputs["input_ids"] + self.share_inputs["input_ids"][idx : idx + 1, :token_chunk_size] = inputs["input_ids"] else: - self.share_inputs['input_ids'][ - idx, :token_chunk_size] = np.array( - request.prompt_token_ids[:token_chunk_size]) - self.share_inputs['seq_lens_decoder'][ - idx:idx + 1] = request.get("seq_lens_decoder", 0) - self.share_inputs['step_seq_lens_decoder'][ - idx:idx + 1] = request.get("seq_lens_decoder", 0) - self.share_inputs["seq_lens_this_time"][ - idx:idx + 1] = token_chunk_size - self.share_inputs['step_seq_lens_encoder'][ - idx:idx + 1] = token_chunk_size - self.share_inputs['seq_lens_encoder'][idx:idx + - 1] = token_chunk_size - self.share_inputs["prompt_lens"][idx:idx + 1] = token_chunk_size + self.share_inputs["input_ids"][idx, :token_chunk_size] = np.array( + request.prompt_token_ids[:token_chunk_size] + ) + self.share_inputs["seq_lens_decoder"][idx : idx + 1] = request.get("seq_lens_decoder", 0) + self.share_inputs["step_seq_lens_decoder"][idx : idx + 1] = request.get("seq_lens_decoder", 0) + self.share_inputs["seq_lens_this_time"][idx : idx + 1] = token_chunk_size + self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = token_chunk_size + self.share_inputs["seq_lens_encoder"][idx : idx + 1] = token_chunk_size + self.share_inputs["prompt_lens"][idx : idx + 1] = token_chunk_size else: if self.enable_mm: inputs = self._preprocess_mm_task(request.multimodal_inputs) if inputs.get("images") is not None: - self.share_inputs[ - "image_features"] = self.extract_vision_features( - inputs) + self.share_inputs["image_features"] = self.extract_vision_features(inputs) else: # Compatible with the situation that lacks images and videos self.share_inputs["image_features"] = None position_ids = inputs["position_ids"] length = inputs["input_ids"].shape[1] - self.share_inputs["input_ids"][ - idx:idx + 1, :length] = inputs["input_ids"] + self.share_inputs["input_ids"][idx : idx + 1, :length] = inputs["input_ids"] else: - self.share_inputs['seq_lens_decoder'][ - idx:idx + 1] = request.get("seq_lens_decoder", 0) - self.share_inputs['step_seq_lens_decoder'][ - idx:idx + 1] = request.get("seq_lens_decoder", 0) - self.share_inputs['seq_lens_this_time'][idx:idx + - 1] = length - self.share_inputs['step_seq_lens_encoder'][idx:idx + - 1] = length - self.share_inputs['seq_lens_encoder'][idx:idx + 1] = length - self.share_inputs["prompt_lens"][idx:idx + 1] = length + self.share_inputs["seq_lens_decoder"][idx : idx + 1] = request.get("seq_lens_decoder", 0) + self.share_inputs["step_seq_lens_decoder"][idx : idx + 1] = request.get("seq_lens_decoder", 0) + self.share_inputs["seq_lens_this_time"][idx 
: idx + 1] = length + self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = length + self.share_inputs["seq_lens_encoder"][idx : idx + 1] = length + self.share_inputs["prompt_lens"][idx : idx + 1] = length if self.enable_mm: enable_thinking = request.get("enable_thinking", True) enable_thinking = enable_thinking if enable_thinking is not None else True self.share_inputs["enable_thinking"][:] = enable_thinking - self.share_inputs["need_think_end"][ - idx:idx + 1, :] = 1 if enable_thinking else 0 - self.share_inputs["reasoning_index"][ - idx:idx + 1, :] = request.get("reasoning_max_tokens", 2048) - self.share_inputs["rope_emb"][idx:idx + - 1, :] = self.prepare_rope3d( - position_ids, request.get("max_tokens", 2048)) - self.share_inputs["seq_lens_decoder"][idx:idx + 1] = 0 + self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0 + self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048) + self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d( + position_ids, request.get("max_tokens", 2048) + ) + self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0 def get_attr_from_request(request, attr, default_value=None): res = request.get(attr, default_value) @@ -328,73 +315,67 @@ class GPUModelRunner(ModelRunnerBase): else: return default_value - - if len(request.eos_token_ids - ) < self.parallel_config.eos_tokens_lens: + if len(request.eos_token_ids) < self.parallel_config.eos_tokens_lens: request.eos_token_ids.append(request.eos_token_ids[0]) - self.share_inputs["eos_token_id"][:] = np.array( - request.eos_token_ids, dtype="int64").reshape(-1, 1) - self.share_inputs["top_p"][idx:idx + 1] = get_attr_from_request(request, "top_p", 0.7) - self.share_inputs["top_k"][idx:idx + 1] = request.get("top_k", 0) - self.share_inputs["temperature"][idx:idx + 1] = get_attr_from_request(request,"temperature", 0.95) - self.share_inputs["penalty_score"][idx:idx + 1] = get_attr_from_request( - request, "repetition_penalty", 1.0) - self.share_inputs["frequency_score"][idx:idx + 1] = get_attr_from_request( - request, "frequency_penalty", 0.0) - self.share_inputs["presence_score"][idx:idx + 1] = get_attr_from_request( - request, "presence_penalty", 0.0) + self.share_inputs["eos_token_id"][:] = np.array(request.eos_token_ids, dtype="int64").reshape(-1, 1) + self.share_inputs["top_p"][idx : idx + 1] = get_attr_from_request(request, "top_p", 0.7) + self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0) + self.share_inputs["temperature"][idx : idx + 1] = get_attr_from_request(request, "temperature", 0.95) + self.share_inputs["penalty_score"][idx : idx + 1] = get_attr_from_request( + request, "repetition_penalty", 1.0 + ) + self.share_inputs["frequency_score"][idx : idx + 1] = get_attr_from_request( + request, "frequency_penalty", 0.0 + ) + self.share_inputs["presence_score"][idx : idx + 1] = get_attr_from_request( + request, "presence_penalty", 0.0 + ) - self.share_inputs["min_dec_len"][idx:idx + 1] = request.get( - "min_tokens", 1) - self.share_inputs["max_dec_len"][idx:idx + 1] = request.get( - "max_tokens", self.model_config.max_model_len) - self.share_inputs["stop_flags"][idx:idx + 1] = False + self.share_inputs["min_dec_len"][idx : idx + 1] = request.get("min_tokens", 1) + self.share_inputs["max_dec_len"][idx : idx + 1] = request.get( + "max_tokens", self.model_config.max_model_len + ) + self.share_inputs["stop_flags"][idx : idx + 1] = False - self.share_inputs["first_token_ids"][ - idx:idx + 1] = 
self.share_inputs["input_ids"][idx:idx + 1, :1] - self.share_inputs["ori_seq_lens_encoder"][idx:idx + 1] = length + self.share_inputs["first_token_ids"][idx : idx + 1] = self.share_inputs["input_ids"][idx : idx + 1, :1] + self.share_inputs["ori_seq_lens_encoder"][idx : idx + 1] = length if request.get("seed") is not None: - self.share_inputs["infer_seed"][idx:idx + - 1] = request.get("seed") + self.share_inputs["infer_seed"][idx : idx + 1] = request.get("seed") encoder_block_num = len(request.get("block_tables")) - self.share_inputs["encoder_block_lens"][idx:idx + - 1] = encoder_block_num - self.share_inputs["block_tables"][idx:idx + 1, :] = -1 - self.share_inputs["block_tables"][ - idx:idx + 1, :encoder_block_num] = np.array( - request.block_tables, dtype="int32") + self.share_inputs["encoder_block_lens"][idx : idx + 1] = encoder_block_num + self.share_inputs["block_tables"][idx : idx + 1, :] = -1 + self.share_inputs["block_tables"][idx : idx + 1, :encoder_block_num] = np.array( + request.block_tables, dtype="int32" + ) - if request.get("stop_token_ids") is not None and request.get( - "stop_seqs_len") is not None: + if request.get("stop_token_ids") is not None and request.get("stop_seqs_len") is not None: stop_seqs_num = len(request.get("stop_seqs_len")) - for i in range(stop_seqs_num, - self.model_config.max_stop_seqs_num): + for i in range(stop_seqs_num, self.model_config.max_stop_seqs_num): request.stop_seqs_len.append(0) - self.share_inputs["stop_seqs_len"][:] = np.array( - request.stop_seqs_len, dtype="int32") - self.share_inputs["stop_seqs"][:stop_seqs_num, :len( - request.get("stop_token_ids")[0])] = np.array( - request.get("stop_token_ids"), dtype="int64") + self.share_inputs["stop_seqs_len"][:] = np.array(request.stop_seqs_len, dtype="int32") + self.share_inputs["stop_seqs"][:stop_seqs_num, : len(request.get("stop_token_ids")[0])] = np.array( + request.get("stop_token_ids"), dtype="int64" + ) - self.sampler.apply_logits_processor( - idx, request.get("logits_processor"), prefill_tokens) + self.sampler.apply_logits_processor(idx, request.get("logits_processor"), prefill_tokens) self.share_inputs["not_need_stop"][0] = True if self.speculative_method in ["mtp"]: self.proposer.insert_prefill_inputs(req_dicts) - def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int, - expected_decode_len: int): - """ Set dummy prefill inputs to share_inputs """ + def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int, expected_decode_len: int): + """Set dummy prefill inputs to share_inputs""" # NOTE(gongshaotian): The maximum decoding length is equal to the expected decoded tokens plus the eos token if self.enable_mm: self.share_inputs["free_list"] = paddle.to_tensor([], dtype="int32") self.share_inputs["free_list_len"][0] = 0 max_dec_len = expected_decode_len + 1 - full_length = min(num_tokens // batch_size, - self.parallel_config.max_model_len - max_dec_len) + full_length = min( + num_tokens // batch_size, + self.parallel_config.max_model_len - max_dec_len, + ) input_length = int(full_length * self.parallel_config.kv_cache_ratio) block_num = ( input_length + self.parallel_config.block_size - 1 @@ -402,33 +383,27 @@ class GPUModelRunner(ModelRunnerBase): for i in range(batch_size): idx = i - self.share_inputs["input_ids"][idx:idx + - 1, :input_length] = np.array( - [5] * input_length) - self.share_inputs["prompt_ids"][idx:idx + 1, :input_length] = np.array( - [5] * input_length) - self.share_inputs["eos_token_id"][:] = np.array( - [2], dtype="int64").reshape(-1, 1) - 
self.share_inputs["seq_lens_this_time"][idx:idx + 1] = input_length - self.share_inputs["step_seq_lens_encoder"][idx:idx + - 1] = input_length - self.share_inputs["seq_lens_encoder"][idx:idx + 1] = input_length - self.share_inputs["seq_lens_decoder"][idx:idx + 1] = 0 - self.share_inputs["prompt_lens"][idx:idx + 1] = 0 - self.share_inputs["step_idx"][idx:idx + 1] = 0 - self.share_inputs["max_dec_len"][idx:idx + 1] = max_dec_len - self.share_inputs["min_dec_len"][idx:idx + 1] = max_dec_len - self.share_inputs["stop_flags"][idx:idx + 1] = False - self.share_inputs["temperature"][idx:idx + 1] = 1 + self.share_inputs["input_ids"][idx : idx + 1, :input_length] = np.array([5] * input_length) + self.share_inputs["prompt_ids"][idx : idx + 1, :input_length] = np.array([5] * input_length) + self.share_inputs["eos_token_id"][:] = np.array([2], dtype="int64").reshape(-1, 1) + self.share_inputs["seq_lens_this_time"][idx : idx + 1] = input_length + self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = input_length + self.share_inputs["seq_lens_encoder"][idx : idx + 1] = input_length + self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0 + self.share_inputs["prompt_lens"][idx : idx + 1] = 0 + self.share_inputs["step_idx"][idx : idx + 1] = 0 + self.share_inputs["max_dec_len"][idx : idx + 1] = max_dec_len + self.share_inputs["min_dec_len"][idx : idx + 1] = max_dec_len + self.share_inputs["stop_flags"][idx : idx + 1] = False + self.share_inputs["temperature"][idx : idx + 1] = 1 - self.share_inputs["first_token_ids"][ - idx:idx + 1] = self.share_inputs["input_ids"][idx:idx + 1, :1] - self.share_inputs["ori_seq_lens_encoder"][idx:idx + - 1] = input_length + self.share_inputs["first_token_ids"][idx : idx + 1] = self.share_inputs["input_ids"][idx : idx + 1, :1] + self.share_inputs["ori_seq_lens_encoder"][idx : idx + 1] = input_length - self.share_inputs["encoder_block_lens"][idx:idx + 1] = block_num - self.share_inputs["block_tables"][idx : idx + 1, :block_num] = np.arange(idx * block_num, \ - (idx + 1) * block_num, 1) + self.share_inputs["encoder_block_lens"][idx : idx + 1] = block_num + self.share_inputs["block_tables"][idx : idx + 1, :block_num] = np.arange( + idx * block_num, (idx + 1) * block_num, 1 + ) def _init_share_inputs(self, max_num_seqs: int): """ @@ -440,143 +415,89 @@ class GPUModelRunner(ModelRunnerBase): self.share_inputs["pre_ids"] = paddle.full( [max_num_seqs, self.parallel_config.max_model_len], -1, - dtype='int64') + dtype="int64", + ) self.share_inputs["input_ids"] = paddle.full( [max_num_seqs, self.parallel_config.max_model_len], self.parallel_config.pad_token_id, - dtype='int64') + dtype="int64", + ) self.share_inputs["prompt_ids"] = paddle.full( [max_num_seqs, self.parallel_config.max_model_len], self.parallel_config.pad_token_id, - dtype='int64') - self.share_inputs["eos_token_id"] = paddle.full( - [self.parallel_config.eos_tokens_lens, 1], 0, dtype='int64') - self.share_inputs["top_p"] = paddle.full([max_num_seqs, 1], - self.model_config.top_p, - dtype='float32') - self.share_inputs["top_k"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int64') + dtype="int64", + ) + self.share_inputs["eos_token_id"] = paddle.full([self.parallel_config.eos_tokens_lens, 1], 0, dtype="int64") + self.share_inputs["top_p"] = paddle.full([max_num_seqs, 1], self.model_config.top_p, dtype="float32") + self.share_inputs["top_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int64") self.share_inputs["temperature"] = paddle.full( - [max_num_seqs, 1], self.model_config.temperature, dtype='float32') + 
[max_num_seqs, 1], self.model_config.temperature, dtype="float32" + ) self.share_inputs["penalty_score"] = paddle.full( - [max_num_seqs, 1], - self.model_config.penalty_score, - dtype='float32') + [max_num_seqs, 1], self.model_config.penalty_score, dtype="float32" + ) self.share_inputs["frequency_score"] = paddle.full( [max_num_seqs, 1], self.model_config.frequency_score, - dtype='float32') + dtype="float32", + ) self.share_inputs["presence_score"] = paddle.full( - [max_num_seqs, 1], - self.model_config.presence_score, - dtype='float32') + [max_num_seqs, 1], self.model_config.presence_score, dtype="float32" + ) - self.share_inputs["min_dec_len"] = paddle.full( - [max_num_seqs, 1], self.model_config.min_length, dtype='int64') + self.share_inputs["min_dec_len"] = paddle.full([max_num_seqs, 1], self.model_config.min_length, dtype="int64") self.share_inputs["max_dec_len"] = paddle.full( - [max_num_seqs, 1], self.model_config.max_model_len, dtype='int64') - self.share_inputs["min_length"] = paddle.full( - [max_num_seqs, 1], self.model_config.min_length, dtype='int64') + [max_num_seqs, 1], self.model_config.max_model_len, dtype="int64" + ) + self.share_inputs["min_length"] = paddle.full([max_num_seqs, 1], self.model_config.min_length, dtype="int64") self.share_inputs["max_length"] = paddle.full( - [max_num_seqs, 1], self.model_config.max_model_len, dtype='int64') - self.share_inputs["seq_lens_this_time"] = paddle.full(max_num_seqs, - 0, - dtype='int32') - self.share_inputs["seq_lens_encoder"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int32') - self.share_inputs["seq_lens_decoder"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int32') - self.share_inputs["step_seq_lens_encoder"] = paddle.full( - [max_num_seqs, 1], 0, dtype='int32') - self.share_inputs["step_seq_lens_decoder"] = paddle.full( - [max_num_seqs, 1], 0, dtype='int32') - self.share_inputs["prompt_lens"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int64') - self.share_inputs["step_idx"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int64') + [max_num_seqs, 1], self.model_config.max_model_len, dtype="int64" + ) + self.share_inputs["seq_lens_this_time"] = paddle.full(max_num_seqs, 0, dtype="int32") + self.share_inputs["seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["seq_lens_decoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["step_seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["step_seq_lens_decoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["prompt_lens"] = paddle.full([max_num_seqs, 1], 0, dtype="int64") + self.share_inputs["step_idx"] = paddle.full([max_num_seqs, 1], 0, dtype="int64") self.share_inputs["not_need_stop"] = paddle.full( - [1], False, - dtype='bool').cpu() # TODO(gongshaotian): move to pinnd memory - self.share_inputs["stop_flags"] = paddle.full([max_num_seqs, 1], - True, - dtype='bool') - self.share_inputs["stop_nums"] = paddle.full([1], - max_num_seqs, - dtype='int64') + [1], False, dtype="bool" + ).cpu() # TODO(gongshaotian): move to pinnd memory + self.share_inputs["stop_flags"] = paddle.full([max_num_seqs, 1], True, dtype="bool") + self.share_inputs["stop_nums"] = paddle.full([1], max_num_seqs, dtype="int64") - self.share_inputs["bad_tokens"] = paddle.full([1], -1, dtype='int64') - self.share_inputs["next_tokens"] = paddle.full([max_num_seqs, 1], - -1, - dtype='int64') - self.share_inputs["is_block_step"] = paddle.full([max_num_seqs], - False, - dtype='bool') - 
self.share_inputs["encoder_block_lens"] = paddle.full([max_num_seqs], - 0, - dtype='int32') - self.share_inputs["step_block_list"] = paddle.full([max_num_seqs], - -1, - dtype='int32') - self.share_inputs["step_lens"] = paddle.full([1], 0, dtype='int32') - self.share_inputs["recover_block_list"] = paddle.full([max_num_seqs], - -1, - dtype='int32') - self.share_inputs["recover_lens"] = paddle.full([1], 0, dtype='int32') - self.share_inputs["need_block_list"] = paddle.full([max_num_seqs], - -1, - dtype='int32') - self.share_inputs["need_block_len"] = paddle.full([1], - 0, - dtype='int32') - self.share_inputs["used_list_len"] = paddle.full([max_num_seqs], - 0, - dtype='int32') - self.share_inputs["infer_seed"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int64') - self.share_inputs["first_token_ids"] = paddle.full([max_num_seqs, 1], - -1, - dtype='int64') - self.share_inputs["ori_seq_lens_encoder"] = paddle.full( - [max_num_seqs, 1], 0, dtype='int32') - self.share_inputs["system_lens"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int32') - self.share_inputs["system_ids"] = paddle.full([max_num_seqs, 1], - -1, - dtype='int32') + self.share_inputs["bad_tokens"] = paddle.full([1], -1, dtype="int64") + self.share_inputs["next_tokens"] = paddle.full([max_num_seqs, 1], -1, dtype="int64") + self.share_inputs["is_block_step"] = paddle.full([max_num_seqs], False, dtype="bool") + self.share_inputs["encoder_block_lens"] = paddle.full([max_num_seqs], 0, dtype="int32") + self.share_inputs["step_block_list"] = paddle.full([max_num_seqs], -1, dtype="int32") + self.share_inputs["step_lens"] = paddle.full([1], 0, dtype="int32") + self.share_inputs["recover_block_list"] = paddle.full([max_num_seqs], -1, dtype="int32") + self.share_inputs["recover_lens"] = paddle.full([1], 0, dtype="int32") + self.share_inputs["need_block_list"] = paddle.full([max_num_seqs], -1, dtype="int32") + self.share_inputs["need_block_len"] = paddle.full([1], 0, dtype="int32") + self.share_inputs["used_list_len"] = paddle.full([max_num_seqs], 0, dtype="int32") + self.share_inputs["infer_seed"] = paddle.full([max_num_seqs, 1], 0, dtype="int64") + self.share_inputs["first_token_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int64") + self.share_inputs["ori_seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["system_lens"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["system_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int32") self.share_inputs["ids_remove_padding"] = paddle.full( [max_num_seqs * self.parallel_config.max_model_len], 0, - dtype='int64') - self.share_inputs["cum_offsets"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int32') - self.share_inputs["batch_id_per_token"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int32') - self.share_inputs["cu_seqlens_q"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int32') - self.share_inputs["cu_seqlens_k"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int32') + dtype="int64", + ) + self.share_inputs["cum_offsets"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["batch_id_per_token"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["cu_seqlens_q"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["cu_seqlens_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") # AttentionBackend buffers - self.share_inputs["decoder_batch_ids"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int32') - self.share_inputs["decoder_tile_ids_per_batch"] = paddle.full( - 
[max_num_seqs, 1], 0, dtype='int32') + self.share_inputs["decoder_batch_ids"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["decoder_tile_ids_per_batch"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") # Initialize rotary position embedding - tmp_position_ids = paddle.arange( - self.parallel_config.max_model_len).reshape((1, -1)) + tmp_position_ids = paddle.arange(self.parallel_config.max_model_len).reshape((1, -1)) # TODO(gongshaotian): move to models if not self.enable_mm: @@ -584,87 +505,89 @@ class GPUModelRunner(ModelRunnerBase): rotary_dim=self.model_config.head_dim, position_ids=tmp_position_ids, base=self.model_config.rope_theta, - model_config=self.model_config) + model_config=self.model_config, + ) # Set block tables pre_max_block_num = ( - self.parallel_config.max_model_len + - self.parallel_config.block_size - 1 + self.parallel_config.max_model_len + self.parallel_config.block_size - 1 ) // self.parallel_config.block_size + self.parallel_config.enc_dec_block_num - self.share_inputs["block_tables"] = paddle.full( - [max_num_seqs, pre_max_block_num], -1, dtype='int32') + self.share_inputs["block_tables"] = paddle.full([max_num_seqs, pre_max_block_num], -1, dtype="int32") # Initialize free list free_list = list( range( self.parallel_config.total_block_num - 1, - int(self.parallel_config.total_block_num * - self.parallel_config.kv_cache_ratio) - 1, -1)) + int(self.parallel_config.total_block_num * self.parallel_config.kv_cache_ratio) - 1, + -1, + ) + ) self.free_list_len = len(free_list) - self.share_inputs["free_list"] = paddle.to_tensor(free_list, - dtype="int32") - self.share_inputs["free_list_len"] = paddle.full([1], - self.free_list_len, - dtype="int32") + self.share_inputs["free_list"] = paddle.to_tensor(free_list, dtype="int32") + self.share_inputs["free_list_len"] = paddle.full([1], self.free_list_len, dtype="int32") # Initialize stop seqs - self.share_inputs["stop_seqs_len"] = paddle.full( - [self.model_config.max_stop_seqs_num], 0, dtype="int32") - self.share_inputs["stop_seqs"] = paddle.full([ - self.model_config.max_stop_seqs_num, - self.model_config.stop_seqs_max_len - ], - -1, - dtype="int32") + self.share_inputs["stop_seqs_len"] = paddle.full([self.model_config.max_stop_seqs_num], 0, dtype="int32") + self.share_inputs["stop_seqs"] = paddle.full( + [ + self.model_config.max_stop_seqs_num, + self.model_config.stop_seqs_max_len, + ], + -1, + dtype="int32", + ) if self.speculative_decoding: max_draft_token_num = self.speculative_config.num_speculative_tokens self.share_inputs["input_ids_cpu"] = paddle.full( shape=[max_num_seqs, self.parallel_config.max_model_len], fill_value=1, - dtype='int64').cpu() - self.share_inputs['accept_tokens'] = paddle.full( + dtype="int64", + ).cpu() + self.share_inputs["accept_tokens"] = paddle.full( shape=[max_num_seqs, max_draft_token_num + 1], fill_value=0, - dtype="int64") - self.share_inputs['accept_num'] = paddle.full(shape=[max_num_seqs], - fill_value=0, - dtype='int32') - self.share_inputs['draft_tokens'] = paddle.full( + dtype="int64", + ) + self.share_inputs["accept_num"] = paddle.full(shape=[max_num_seqs], fill_value=0, dtype="int32") + self.share_inputs["draft_tokens"] = paddle.full( shape=[max_num_seqs, max_draft_token_num + 1], fill_value=0, - dtype="int64") + dtype="int64", + ) - self.share_inputs['actual_draft_token_num'] = paddle.full( + self.share_inputs["actual_draft_token_num"] = paddle.full( shape=[max_num_seqs], fill_value=max_draft_token_num, - dtype="int32") - 
self.share_inputs["output_cum_offsets"] = paddle.full( - shape=[max_num_seqs, 1], fill_value=0, dtype='int32') + dtype="int32", + ) + self.share_inputs["output_cum_offsets"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") self.share_inputs["output_padding_offset"] = paddle.full( shape=[max_num_seqs * (max_draft_token_num + 1)], fill_value=0, - dtype="int32") + dtype="int32", + ) if self.enable_mm: head_dim = self.model_config.head_dim - self.share_inputs["rope_emb"] = paddle.full(shape=[ - max_num_seqs, 2, 1, self.parallel_config.max_model_len, 1, head_dim // 2 - ], - fill_value=0, - dtype="float32") + self.share_inputs["rope_emb"] = paddle.full( + shape=[ + max_num_seqs, + 2, + 1, + self.parallel_config.max_model_len, + 1, + head_dim // 2, + ], + fill_value=0, + dtype="float32", + ) self.share_inputs["image_features"] = None - self.share_inputs["need_think_end"] = paddle.full(shape=[max_num_seqs, 1], - fill_value=0, - dtype="int32") - self.share_inputs["enable_thinking"] = paddle.full(shape=[1], - fill_value=True, - dtype="bool") - self.share_inputs["reasoning_index"] = paddle.full(shape=[max_num_seqs, 1], - fill_value=0, - dtype="int32") + self.share_inputs["need_think_end"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") + self.share_inputs["enable_thinking"] = paddle.full(shape=[1], fill_value=True, dtype="bool") + self.share_inputs["reasoning_index"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") def _prepare_inputs(self) -> None: - """ Prepare the model inputs """ + """Prepare the model inputs""" # Remove padding ( ids_remove_padding, @@ -675,14 +598,16 @@ class GPUModelRunner(ModelRunnerBase): output_cum_offsets, output_padding_offset, ) = pre_process( - self.parallel_config.max_model_len, self.share_inputs["input_ids"], - self.share_inputs["seq_lens_this_time"], self.speculative_decoding, - self.share_inputs["draft_tokens"] if self.speculative_decoding else - None, self.share_inputs["seq_lens_encoder"], - self.share_inputs["seq_lens_decoder"]) + self.parallel_config.max_model_len, + self.share_inputs["input_ids"], + self.share_inputs["seq_lens_this_time"], + self.speculative_decoding, + (self.share_inputs["draft_tokens"] if self.speculative_decoding else None), + self.share_inputs["seq_lens_encoder"], + self.share_inputs["seq_lens_decoder"], + ) - self.share_inputs["ids_remove_padding"].copy_(ids_remove_padding, - False) + self.share_inputs["ids_remove_padding"].copy_(ids_remove_padding, False) self.share_inputs["cum_offsets"].copy_(cum_offsets, False) self.share_inputs["batch_id_per_token"].copy_(batch_id_per_token, False) self.share_inputs["cu_seqlens_q"].copy_(cu_seqlens_q, False) @@ -690,10 +615,8 @@ class GPUModelRunner(ModelRunnerBase): # For speculative decoding if self.speculative_decoding: - self.share_inputs["output_cum_offsets"].copy_( - output_cum_offsets, False) - self.share_inputs["output_padding_offset"].copy_( - output_padding_offset, False) + self.share_inputs["output_cum_offsets"].copy_(output_cum_offsets, False) + self.share_inputs["output_padding_offset"].copy_(output_padding_offset, False) # Initialize forward meta data self.initialize_forward_meta() @@ -717,32 +640,29 @@ class GPUModelRunner(ModelRunnerBase): ) def load_model(self) -> None: - """ load or download model """ - logger.info( - f"Starting to load model {self.model_config.architectures[0]}") + """load or download model""" + logger.info(f"Starting to load model {self.model_config.architectures[0]}") time_before_load = time.perf_counter() # 
1. Load original model self.model = get_model_from_loader(fd_config=self.fd_config) # 1.1 Load RL dynamic model if self.fd_config.load_config.dynamic_load_weight: - from fastdeploy.rl.dynamic_weight_manager import \ - DynamicWeightManager - self.dynamic_weight_manager = DynamicWeightManager( - self.fd_config, self.model) + from fastdeploy.rl.dynamic_weight_manager import DynamicWeightManager + + self.dynamic_weight_manager = DynamicWeightManager(self.fd_config, self.model) # 2. Load lora model # 3. Load drafter model(for speculative decoding) time_after_load = time.perf_counter() - logger.info( - f"Model loading took {time_after_load - time_before_load} seconds") + logger.info(f"Model loading took {time_after_load - time_before_load} seconds") # 4. Init proposer for speculative method self._init_speculative_proposer() def get_model(self) -> nn.Layer: - """ Get current model """ + """Get current model""" return self.model def initialize_forward_meta(self): @@ -765,7 +685,7 @@ class GPUModelRunner(ModelRunnerBase): cu_seqlens_q=self.share_inputs["cu_seqlens_q"], cu_seqlens_k=self.share_inputs["cu_seqlens_k"], block_tables=self.share_inputs["block_tables"], - caches=self.share_inputs["caches"] + caches=self.share_inputs["caches"], ) # Update Batch type for cuda graph @@ -787,30 +707,29 @@ class GPUModelRunner(ModelRunnerBase): # Get kv cache dtype cache_type = self.parallel_config.dtype - if (self.quant_config - and hasattr(self.quant_config, "kv_cache_quant_type") - and self.quant_config.kv_cache_quant_type is not None): - cache_type = 'uint8' + if ( + self.quant_config + and hasattr(self.quant_config, "kv_cache_quant_type") + and self.quant_config.kv_cache_quant_type is not None + ): + cache_type = "uint8" # Get kv cache shape - kv_cache_shape = self.attn_backends[0].get_kv_cache_shape( - max_num_blocks=max_block_num) + kv_cache_shape = self.attn_backends[0].get_kv_cache_shape(max_num_blocks=max_block_num) local_rank = self.local_rank % self.parallel_config.tensor_parallel_size if not self.parallel_config.do_profile and ( - self.parallel_config.enable_prefix_caching \ - or self.parallel_config.splitwise_role != "mixed"): + self.parallel_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed" + ): cache_kvs_list = [] for i in range(self.model_config.num_hidden_layers): key_cache = paddle.empty(shape=[], dtype=cache_type) key_cache_name = f"key_caches_{i}_rank{local_rank}.device{self.device_id}" val_cache_name = f"value_caches_{i}_rank{local_rank}.device{self.device_id}" - key_cache = share_external_data(key_cache, key_cache_name, - kv_cache_shape) + key_cache = share_external_data(key_cache, key_cache_name, kv_cache_shape) cache_kvs_list.append(key_cache) value_cache = paddle.empty(shape=[], dtype=cache_type) - value_cache = share_external_data(value_cache, val_cache_name, - kv_cache_shape) + value_cache = share_external_data(value_cache, val_cache_name, kv_cache_shape) cache_kvs_list.append(value_cache) self.share_inputs["caches"] = cache_kvs_list @@ -818,12 +737,12 @@ class GPUModelRunner(ModelRunnerBase): else: for i in range(self.model_config.num_hidden_layers): - cache_kvs["key_caches_{}".format(i)] = paddle.full( + cache_kvs[f"key_caches_{i}"] = paddle.full( shape=kv_cache_shape, fill_value=0, dtype=cache_type, ) - cache_kvs["value_caches_{}".format(i)] = paddle.full( + cache_kvs[f"value_caches_{i}"] = paddle.full( shape=kv_cache_shape, fill_value=0, dtype=cache_type, @@ -840,25 +759,30 @@ class GPUModelRunner(ModelRunnerBase): assert len(self.attn_backends) == 0 
num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_size - self.model_config.kv_num_heads = max(1, int( - self.model_config.num_key_value_heads - ) // self.parallel_config.tensor_parallel_size) + self.model_config.kv_num_heads = max( + 1, + int(self.model_config.num_key_value_heads) // self.parallel_config.tensor_parallel_size, + ) head_dim = self.model_config.head_dim # Get the attention backend attn_cls = get_attention_backend() - attn_backend = attn_cls(self.fd_config, - kv_num_heads=self.model_config.kv_num_heads, - num_heads=num_heads, - head_dim=head_dim) + attn_backend = attn_cls( + self.fd_config, + kv_num_heads=self.model_config.kv_num_heads, + num_heads=num_heads, + head_dim=head_dim, + ) self.attn_backends.append(attn_backend) - def _dummy_run(self, - num_tokens: paddle.Tensor, - batch_size: paddle.Tensor, - expected_decode_len: int = 1, - in_capturing: bool = False) -> paddle.Tensor: + def _dummy_run( + self, + num_tokens: paddle.Tensor, + batch_size: paddle.Tensor, + expected_decode_len: int = 1, + in_capturing: bool = False, + ) -> paddle.Tensor: """ Use dummy inputs to run before formal execution. Args: @@ -866,14 +790,17 @@ class GPUModelRunner(ModelRunnerBase): expected_decode_len: Expected number of tokens generated in_capturing: Is cuda graph in capturing state """ - self._dummy_prefill_inputs(num_tokens=num_tokens, - batch_size=batch_size, - expected_decode_len=expected_decode_len) + self._dummy_prefill_inputs( + num_tokens=num_tokens, + batch_size=batch_size, + expected_decode_len=expected_decode_len, + ) if self.speculative_method in ["mtp"]: self.proposer.dummy_prefill_inputs( num_tokens=num_tokens, batch_size=batch_size, - expected_decode_len=expected_decode_len) + expected_decode_len=expected_decode_len, + ) while True: # 1. Initialize forward meta and attention meta data @@ -885,14 +812,17 @@ class GPUModelRunner(ModelRunnerBase): # 3. 
Run model if self.enable_mm: - model_output = self.model(self.share_inputs["ids_remove_padding"], - self.share_inputs["image_features"], - self.forward_meta) + model_output = self.model( + self.share_inputs["ids_remove_padding"], + self.share_inputs["image_features"], + self.forward_meta, + ) hidden_states = model_output else: model_output = self.model( ids_remove_padding=self.share_inputs["ids_remove_padding"], - forward_meta=self.forward_meta) + forward_meta=self.forward_meta, + ) hidden_states = rebuild_padding( model_output, @@ -900,9 +830,9 @@ class GPUModelRunner(ModelRunnerBase): self.share_inputs["seq_lens_this_time"], self.share_inputs["seq_lens_decoder"], self.share_inputs["seq_lens_encoder"], - self.share_inputs["output_padding_offset"] - if self.speculative_decoding else - None, # speculative decoding requires + ( + self.share_inputs["output_padding_offset"] if self.speculative_decoding else None + ), # speculative decoding requires self.parallel_config.max_model_len, ) @@ -919,24 +849,22 @@ class GPUModelRunner(ModelRunnerBase): self.share_inputs["step_idx"], self.share_inputs["stop_flags"], ) - sampler_output = self.sampler(logits, - self.sampling_metadata) + sampler_output = self.sampler(logits, self.sampling_metadata) if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0) else: - self.sampler(logits, self.sampling_metadata, - self.parallel_config.max_model_len, - self.share_inputs) + self.sampler( + logits, + self.sampling_metadata, + self.parallel_config.max_model_len, + self.share_inputs, + ) sampler_output = None if self.parallel_config.tensor_parallel_size > 1: - paddle.distributed.broadcast( - self.share_inputs["accept_tokens"], 0) - paddle.distributed.broadcast( - self.share_inputs["accept_num"], 0) - paddle.distributed.broadcast(self.share_inputs["step_idx"], - 0) - paddle.distributed.broadcast( - self.share_inputs["stop_flags"], 0) + paddle.distributed.broadcast(self.share_inputs["accept_tokens"], 0) + paddle.distributed.broadcast(self.share_inputs["accept_num"], 0) + paddle.distributed.broadcast(self.share_inputs["step_idx"], 0) + paddle.distributed.broadcast(self.share_inputs["stop_flags"], 0) # 5. post process model_output_data = ModelOutputData( @@ -957,28 +885,24 @@ class GPUModelRunner(ModelRunnerBase): msg_queue_id=self.parallel_config.msg_queue_id, mp_rank=self.local_rank, use_ep=self.parallel_config.use_ep, - draft_tokens=self.share_inputs["draft_tokens"] - if self.speculative_decoding else None, - actual_draft_token_num=self. 
- share_inputs["actual_draft_token_num"] - if self.speculative_decoding else None, - accept_tokens=self.share_inputs["accept_tokens"] - if self.speculative_decoding else None, - accept_num=self.share_inputs["accept_num"] - if self.speculative_decoding else None, - enable_thinking= self.share_inputs["enable_thinking"] - if self.enable_mm else None, - think_end_id=self.model_config.think_end_id - if self.enable_mm else -1, - need_think_end=self.share_inputs["need_think_end"] - if self.enable_mm else None, - reasoning_index=self.share_inputs["reasoning_index"] - if self.enable_mm else None) + draft_tokens=(self.share_inputs["draft_tokens"] if self.speculative_decoding else None), + actual_draft_token_num=( + self.share_inputs["actual_draft_token_num"] if self.speculative_decoding else None + ), + accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None), + accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None), + enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None), + think_end_id=(self.model_config.think_end_id if self.enable_mm else -1), + need_think_end=(self.share_inputs["need_think_end"] if self.enable_mm else None), + reasoning_index=(self.share_inputs["reasoning_index"] if self.enable_mm else None), + ) - post_process(sampler_output=sampler_output, - model_output=model_output_data, - speculative_decoding=self.speculative_decoding, - skip_save_output=True) + post_process( + sampler_output=sampler_output, + model_output=model_output_data, + speculative_decoding=self.speculative_decoding, + skip_save_output=True, + ) if self.speculative_decoding: if self.speculative_method == "mtp": @@ -989,12 +913,15 @@ class GPUModelRunner(ModelRunnerBase): # 7. Updata 'infer_seed' and step_cuda() self.share_inputs["infer_seed"].add_(self.infer_seed_increment) self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED - step_cuda(self.share_inputs, self.parallel_config.block_size, - self.parallel_config.enc_dec_block_num, - self.speculative_config, - self.parallel_config.enable_prefix_caching) + step_cuda( + self.share_inputs, + self.parallel_config.block_size, + self.parallel_config.enc_dec_block_num, + self.speculative_config, + self.parallel_config.enable_prefix_caching, + ) - if int((self.share_inputs['seq_lens_this_time'] > 0).sum()) == 0: + if int((self.share_inputs["seq_lens_this_time"] > 0).sum()) == 0: break def _update_chunked_prefill(self, tasks): @@ -1013,56 +940,49 @@ class GPUModelRunner(ModelRunnerBase): for id, task in list(self.restore_chunked_prefill_request.items()): idx = task.idx - logger.debug( - f"{task.request_id} chunked prefill {task.chunk_idx}/{len(task.prefill_chunk_info)}" - ) + logger.debug(f"{task.request_id} chunked prefill {task.chunk_idx}/{len(task.prefill_chunk_info)}") if not self.enable_mm: - start_idx = sum(task.prefill_chunk_info[:task.chunk_idx]) + start_idx = sum(task.prefill_chunk_info[: task.chunk_idx]) if task.chunk_idx == len(task.prefill_chunk_info): - self.share_inputs["seq_lens_this_time"][idx:idx + 1] = 1 - self.share_inputs['seq_lens_encoder'][idx:idx + 1] = 0 - self.share_inputs["step_idx"][idx:idx + 1] = 1 + self.share_inputs["seq_lens_this_time"][idx : idx + 1] = 1 + self.share_inputs["seq_lens_encoder"][idx : idx + 1] = 0 + self.share_inputs["step_idx"][idx : idx + 1] = 1 if self.enable_mm: - self.share_inputs["seq_lens_decoder"][idx:idx + - 1] = task.start_idx + self.share_inputs["seq_lens_decoder"][idx : idx + 1] = task.start_idx else: - 
self.share_inputs["seq_lens_decoder"][ - idx:idx + 1] = start_idx + task.get("seq_lens_decoder", 0) + self.share_inputs["seq_lens_decoder"][idx : idx + 1] = start_idx + task.get("seq_lens_decoder", 0) del self.restore_chunked_prefill_request[task.request_id] else: token_chunk_size = task.prefill_chunk_info[task.chunk_idx] if self.enable_mm: inputs = self._preprocess_mm_task(task.prefill_chunk_info[task.chunk_idx]) if inputs.get("images") is not None: - self.share_inputs[ - "image_features"] = self.extract_vision_features( - inputs) + self.share_inputs["image_features"] = self.extract_vision_features(inputs) else: # Compatible with the situation that lacks images and videos self.share_inputs["image_features"] = None token_chunk_size = inputs["input_ids"].shape[1] - self.share_inputs["input_ids"][idx:idx + 1, :token_chunk_size] = inputs["input_ids"] + self.share_inputs["input_ids"][idx : idx + 1, :token_chunk_size] = inputs["input_ids"] self.share_inputs["prompt_ids"][ - idx:idx + 1, - self.share_inputs["prompt_lens"][idx:idx + 1]: self.share_inputs["prompt_lens"][idx:idx + 1] + token_chunk_size - ] = inputs["input_ids"] - self.share_inputs["seq_lens_decoder"][idx:idx +1] = task.start_idx + idx : idx + 1, + self.share_inputs["prompt_lens"][idx : idx + 1] : self.share_inputs["prompt_lens"][ + idx : idx + 1 + ] + + token_chunk_size, + ] = inputs["input_ids"] + self.share_inputs["seq_lens_decoder"][idx : idx + 1] = task.start_idx task.start_idx += token_chunk_size else: - self.share_inputs['input_ids'][idx, :token_chunk_size] = np.array( - task.prompt_token_ids[start_idx:start_idx + - token_chunk_size]) - self.share_inputs["seq_lens_decoder"][ - idx:idx + 1] = start_idx + task.get("seq_lens_decoder", 0) - self.share_inputs["seq_lens_this_time"][idx:idx + - 1] = token_chunk_size - self.share_inputs['seq_lens_encoder'][idx:idx + - 1] = token_chunk_size - self.share_inputs["prompt_lens"][idx:idx + 1] += token_chunk_size - self.share_inputs["step_idx"][idx:idx + 1] = 0 + self.share_inputs["input_ids"][idx, :token_chunk_size] = np.array( + task.prompt_token_ids[start_idx : start_idx + token_chunk_size] + ) + self.share_inputs["seq_lens_decoder"][idx : idx + 1] = start_idx + task.get("seq_lens_decoder", 0) + self.share_inputs["seq_lens_this_time"][idx : idx + 1] = token_chunk_size + self.share_inputs["seq_lens_encoder"][idx : idx + 1] = token_chunk_size + self.share_inputs["prompt_lens"][idx : idx + 1] += token_chunk_size + self.share_inputs["step_idx"][idx : idx + 1] = 0 - if self.speculative_decoding and self.proposer.is_chunk_prefill_enabled( - ): + if self.speculative_decoding and self.proposer.is_chunk_prefill_enabled(): self.proposer.update_task_chunk_prefill(task) task.chunk_idx += 1 @@ -1071,29 +991,24 @@ class GPUModelRunner(ModelRunnerBase): Trigger CUDA Graph capture for all shapes in cuda graph capture list """ if not self.use_cudagraph: - logger.info( - "Skipping CUDA graph capture. Please check GraphOptimizationConfig" - ) + logger.info("Skipping CUDA graph capture. 
Please check GraphOptimizationConfig") return time_before_capture = time.perf_counter() expected_decode_len = 1 capture_sizes = self.cudagraph_capture_sizes.copy() for batch_size in sorted(capture_sizes, reverse=True): - self._dummy_run(num_tokens=self.parallel_config.max_model_len, - batch_size=batch_size, - in_capturing=True, - expected_decode_len=expected_decode_len) - logger.info( - f"Warm up the model with the batch size:{batch_size}, num tokens:{expected_decode_len}" + self._dummy_run( + num_tokens=self.parallel_config.max_model_len, + batch_size=batch_size, + in_capturing=True, + expected_decode_len=expected_decode_len, ) + logger.info(f"Warm up the model with the batch size:{batch_size}, num tokens:{expected_decode_len}") time_after_capture = time.perf_counter() - logger.info( - f"Cuda Graph capturing took {time_after_capture - time_before_capture} seconds" - ) + logger.info(f"Cuda Graph capturing took {time_after_capture - time_before_capture} seconds") - def _get_skip_idx(self, - model_forward_batch: Optional[List[Request]] = None): + def _get_skip_idx(self, model_forward_batch: Optional[List[Request]] = None): """ Get the index of the request that needs to be skipped during execution. Args: @@ -1106,15 +1021,12 @@ class GPUModelRunner(ModelRunnerBase): return skip_idx_list for task in model_forward_batch: - if task.get("prefill_chunk_info", - None) is None or task.chunk_idx >= len( - task.prefill_chunk_info): + if task.get("prefill_chunk_info", None) is None or task.chunk_idx >= len(task.prefill_chunk_info): continue skip_idx_list.append(task.idx) for task in self.restore_chunked_prefill_request.values(): - if task.idx in skip_idx_list or task.chunk_idx >= len( - task.prefill_chunk_info): + if task.idx in skip_idx_list or task.chunk_idx >= len(task.prefill_chunk_info): continue skip_idx_list.append(task.idx) @@ -1147,22 +1059,24 @@ class GPUModelRunner(ModelRunnerBase): # 3. 
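`_get_skip_idx` above collects batch slots that are still partway through chunked prefill so later stages can ignore them. A minimal sketch of the selection rule (dict-based tasks stand in for the real `Request` objects):

```python
def get_skip_idx(batch_tasks, restore_tasks):
    """Return slot indices that are still consuming prefill chunks."""
    skip = []
    for task in batch_tasks:
        info = task.get("prefill_chunk_info")
        if info is None or task["chunk_idx"] >= len(info):
            continue                      # not chunked, or already fully prefilled: keep it
        skip.append(task["idx"])
    for task in restore_tasks:            # restored requests follow the same rule
        if task["idx"] in skip or task["chunk_idx"] >= len(task["prefill_chunk_info"]):
            continue
        skip.append(task["idx"])
    return skip


print(get_skip_idx([{"idx": 0, "prefill_chunk_info": [512, 512], "chunk_idx": 1}], []))  # [0]
```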
Execute model if self.enable_mm: - model_output = self.model(self.share_inputs["ids_remove_padding"], - self.share_inputs["image_features"], - self.forward_meta) + model_output = self.model( + self.share_inputs["ids_remove_padding"], + self.share_inputs["image_features"], + self.forward_meta, + ) hidden_states = model_output else: model_output = self.model( ids_remove_padding=self.share_inputs["ids_remove_padding"], - forward_meta=self.forward_meta) + forward_meta=self.forward_meta, + ) hidden_states = rebuild_padding( model_output, self.share_inputs["cum_offsets"], self.share_inputs["seq_lens_this_time"], self.share_inputs["seq_lens_decoder"], self.share_inputs["seq_lens_encoder"], - self.share_inputs["output_padding_offset"] - if self.speculative_decoding else None, + (self.share_inputs["output_padding_offset"] if self.speculative_decoding else None), self.parallel_config.max_model_len, ) @@ -1188,17 +1102,18 @@ class GPUModelRunner(ModelRunnerBase): paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0) else: - self.sampler(logits, self.sampling_metadata, - self.parallel_config.max_model_len, self.share_inputs) + self.sampler( + logits, + self.sampling_metadata, + self.parallel_config.max_model_len, + self.share_inputs, + ) sampler_output = None if self.parallel_config.tensor_parallel_size > 1: - paddle.distributed.broadcast( - self.share_inputs["accept_tokens"], 0) - paddle.distributed.broadcast(self.share_inputs["accept_num"], - 0) + paddle.distributed.broadcast(self.share_inputs["accept_tokens"], 0) + paddle.distributed.broadcast(self.share_inputs["accept_num"], 0) paddle.distributed.broadcast(self.share_inputs["step_idx"], 0) - paddle.distributed.broadcast(self.share_inputs["stop_flags"], - 0) + paddle.distributed.broadcast(self.share_inputs["stop_flags"], 0) # 5. 
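After sampling, the hunk above keeps tensor-parallel ranks consistent by broadcasting rank 0's results: the sampled token ids in the normal path, or the shared accept buffers when speculative decoding is on. A condensed sketch of that branching (assuming the Paddle process group is already initialized; nothing here is executed at import time):

```python
import paddle.distributed as dist


def sync_sampling_results(share_inputs, sampler_output, tp_size, speculative_decoding):
    """Broadcast rank 0's sampling state to the other tensor-parallel ranks."""
    if tp_size <= 1:
        return
    if not speculative_decoding:
        dist.broadcast(sampler_output.sampled_token_ids, 0)
    else:
        # Speculative decoding writes its results into shared buffers instead.
        for key in ("accept_tokens", "accept_num", "step_idx", "stop_flags"):
            dist.broadcast(share_inputs[key], 0)
```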
Post Process model_output_data = ModelOutputData( @@ -1219,33 +1134,29 @@ class GPUModelRunner(ModelRunnerBase): msg_queue_id=self.parallel_config.msg_queue_id, mp_rank=self.local_rank, use_ep=self.parallel_config.use_ep, - draft_tokens=self.share_inputs["draft_tokens"] - if self.speculative_decoding else None, - actual_draft_token_num=self.share_inputs["actual_draft_token_num"] - if self.speculative_decoding else None, - accept_tokens=self.share_inputs["accept_tokens"] - if self.speculative_decoding else None, - accept_num=self.share_inputs["accept_num"] - if self.speculative_decoding else None, - enable_thinking= self.share_inputs["enable_thinking"] - if self.enable_mm else None, - think_end_id=self.model_config.think_end_id - if self.enable_mm else -1, - need_think_end=self.share_inputs["need_think_end"] - if self.enable_mm else None, - reasoning_index=self.share_inputs["reasoning_index"] - if self.enable_mm else None) + draft_tokens=(self.share_inputs["draft_tokens"] if self.speculative_decoding else None), + actual_draft_token_num=( + self.share_inputs["actual_draft_token_num"] if self.speculative_decoding else None + ), + accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None), + accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None), + enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None), + think_end_id=(self.model_config.think_end_id if self.enable_mm else -1), + need_think_end=(self.share_inputs["need_think_end"] if self.enable_mm else None), + reasoning_index=(self.share_inputs["reasoning_index"] if self.enable_mm else None), + ) - if self.speculative_config.method in ["mtp"] and \ - self.parallel_config.splitwise_role == "prefill": + if self.speculative_config.method in ["mtp"] and self.parallel_config.splitwise_role == "prefill": skip_save_output = True else: skip_save_output = False - post_process(sampler_output=sampler_output, - model_output=model_output_data, - save_each_rank=self.parallel_config.use_ep, - speculative_decoding=self.speculative_decoding, - skip_save_output=skip_save_output) + post_process( + sampler_output=sampler_output, + model_output=model_output_data, + save_each_rank=self.parallel_config.use_ep, + speculative_decoding=self.speculative_decoding, + skip_save_output=skip_save_output, + ) # 6. Speculative decode if self.speculative_decoding: @@ -1283,11 +1194,9 @@ class GPUModelRunner(ModelRunnerBase): request.logits_cached = True if isinstance(request.logits_processor, LogitsProcessorBase): - self.guided_backend.add_cache(request.schemata_key, - request.logits_processor) + self.guided_backend.add_cache(request.schemata_key, request.logits_processor) else: - self.guided_backend.add_cache( - request.schemata_key, request.logits_processor.result()) + self.guided_backend.add_cache(request.schemata_key, request.logits_processor.result()) def _execute_empty_input(self) -> None: """ @@ -1298,11 +1207,10 @@ class GPUModelRunner(ModelRunnerBase): if hasattr(self.model, "empty_input_forward"): self.model.empty_input_forward() else: - raise ValueError( - f"{type(self.model)} has no attribute 'empty_input_forward") + raise ValueError(f"{type(self.model)} has no attribute 'empty_input_forward") def profile_run(self) -> None: - """ Execute a forward pass with dummy inputs to profile the memory usage of the model """ + """Execute a forward pass with dummy inputs to profile the memory usage of the model""" # Initialize kv cache for profile run. 
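The guided-decoding hunk above caches one logits processor per schema key, resolving it first when the backend handed back a pending result rather than a ready processor (the `.result()` call suggests a future-like object; treating it as a `concurrent.futures.Future` below is an assumption):

```python
from concurrent.futures import Future


def cache_logits_processor(cache: dict, schemata_key, processor):
    """Store a ready processor, resolving future-like handles from an async compile."""
    if isinstance(processor, Future):        # assumption: pending compile result
        processor = processor.result()       # block until the grammar/schema is compiled
    cache[schemata_key] = processor


cache = {}
cache_logits_processor(cache, ("json", '{"type": "object"}'), object())
```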
After profile run kv cache will be reset. # TODO(gongshaotian): Optimize the management logic of kvcache @@ -1312,8 +1220,10 @@ class GPUModelRunner(ModelRunnerBase): # 1. Profile with multimodal encoder & encoder cache # 2. Dummy run - self._dummy_run(num_tokens=self.parallel_config.max_num_batched_tokens, - batch_size=min(self.parallel_config.max_num_seqs, 3)) + self._dummy_run( + num_tokens=self.parallel_config.max_num_batched_tokens, + batch_size=min(self.parallel_config.max_num_seqs, 3), + ) # 3. gc self.clear_cache() @@ -1330,23 +1240,24 @@ class GPUModelRunner(ModelRunnerBase): self.num_gpu_blocks = num_gpu_blocks # Reset block table and kv cache with global block num - if not (self.parallel_config.enable_prefix_caching \ - or self.parallel_config.splitwise_role != "mixed"): + if not (self.parallel_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed"): self.initialize_kv_cache() # Reset free list free_list = list( range( self.num_gpu_blocks - 1, - int(self.num_gpu_blocks * self.parallel_config.kv_cache_ratio) - - 1, -1)) + int(self.num_gpu_blocks * self.parallel_config.kv_cache_ratio) - 1, + -1, + ) + ) self.free_list_len = len(free_list) - self.share_inputs.update({ - "free_list": - paddle.to_tensor(free_list, dtype="int32"), - "free_list_len": - paddle.full([1], self.free_list_len, dtype="int32"), - }) + self.share_inputs.update( + { + "free_list": paddle.to_tensor(free_list, dtype="int32"), + "free_list_len": paddle.full([1], self.free_list_len, dtype="int32"), + } + ) self.parallel_config.do_profile = False @@ -1365,9 +1276,11 @@ class GPUModelRunner(ModelRunnerBase): - cache_int4: """ cache_quant_dtype = None - if (self.quant_config - and hasattr(self.quant_config, "kv_cache_quant_type") - and self.quant_config.kv_cache_quant_type is not None): + if ( + self.quant_config + and hasattr(self.quant_config, "kv_cache_quant_type") + and self.quant_config.kv_cache_quant_type is not None + ): cache_quant_dtype = self.quant_config.kv_cache_quant_type if cache_quant_dtype is not None: # int8, int8_zp, fp8, fp8_zp @@ -1377,40 +1290,36 @@ class GPUModelRunner(ModelRunnerBase): hidden_dim = self.model_config.head_dim * self.model_config.kv_num_heads # NOTE(liuzichang): Implement multi-layer MTP architecture in the future - num_layers = self.model_config.num_hidden_layers + \ - self.speculative_config.num_gpu_block_expand_ratio if \ - self.speculative_method in [ - "mtp" - ] else self.model_config.num_hidden_layers - required_memory = ( - byte_of_dtype * 2 * # k + v - (self.parallel_config.block_size * hidden_dim) * num_layers) + num_layers = ( + self.model_config.num_hidden_layers + self.speculative_config.num_gpu_block_expand_ratio + if self.speculative_method in ["mtp"] + else self.model_config.num_hidden_layers + ) + required_memory = byte_of_dtype * 2 * (self.parallel_config.block_size * hidden_dim) * num_layers # k + v return required_memory def not_need_stop(self) -> bool: - """ Stop decoding if the tensor meets the termination condition """ + """Stop decoding if the tensor meets the termination condition""" return self.share_inputs["not_need_stop"][0] def clear_cache(self): - """ Clear cached data from shared inputs and forward metadata """ + """Clear cached data from shared inputs and forward metadata""" self.share_inputs.pop("caches", None) if self.forward_meta is not None: self.forward_meta.clear_caches() def clear_parameters(self, pid): - """" Dynamic model loader use to clear parameters use for RL """ + """ " Dynamic model loader use to clear 
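`cal_theortical_kvcache` above works out how many bytes one KV-cache block costs: bytes per element, times two for K and V, times the tokens in a block, times the per-token hidden width, times the layer count (expanded when MTP drafting is enabled). A worked sketch of the same arithmetic with illustrative numbers:

```python
def kv_block_bytes(byte_of_dtype, block_size, head_dim, kv_num_heads,
                   num_hidden_layers, mtp_expand_layers=0):
    """Theoretical memory for a single KV-cache block (K + V across all layers)."""
    hidden_dim = head_dim * kv_num_heads
    num_layers = num_hidden_layers + mtp_expand_layers   # extra layers only for MTP drafting
    return byte_of_dtype * 2 * (block_size * hidden_dim) * num_layers


# Example: bf16 cache (2 bytes), 64-token blocks, 128-dim heads, 8 KV heads, 48 layers.
print(kv_block_bytes(2, 64, 128, 8, 48) / 2**20, "MiB per block")   # 12.0 MiB per block
```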
parameters use for RL""" self.dynamic_weight_manager.clear_parameters(pid) self.clear_cache() paddle.device.cuda.empty_cache() - self.dynamic_weight_manager._log_memory( - "dynamic weight manager clear all memory") + self.dynamic_weight_manager._log_memory("dynamic weight manager clear all memory") def update_parameters(self, pid): - """" Dynamic model loader use to update parameters use for RL """ + """ " Dynamic model loader use to update parameters use for RL""" self.dynamic_weight_manager.update_parameters(pid) self.initialize_kv_cache() - self.dynamic_weight_manager._log_memory( - "dynamic weight manager update all memory") + self.dynamic_weight_manager._log_memory("dynamic weight manager update all memory") def padding_cudagraph_inputs(self) -> None: """ @@ -1431,18 +1340,19 @@ class GPUModelRunner(ModelRunnerBase): ) processor.eval() image_preprocess = processor.image_preprocessor - image_preprocess.image_mean_tensor = paddle.to_tensor( - image_preprocess.image_mean, dtype="float32").reshape([1, 3, 1, 1]) - image_preprocess.image_std_tensor = paddle.to_tensor( - image_preprocess.image_std, dtype="float32").reshape([1, 3, 1, 1]) - image_preprocess.rescale_factor = paddle.to_tensor( - image_preprocess.rescale_factor, dtype="float32") - image_preprocess.image_mean_tensor = image_preprocess.image_mean_tensor.squeeze( - [-2, -1]).repeat_interleave(self.model_config.vision_config.patch_size**2 * 1, - -1) - image_preprocess.image_std_tensor = image_preprocess.image_std_tensor.squeeze( - [-2, -1]).repeat_interleave(self.model_config.vision_config.patch_size**2 * 1, - -1) + image_preprocess.image_mean_tensor = paddle.to_tensor(image_preprocess.image_mean, dtype="float32").reshape( + [1, 3, 1, 1] + ) + image_preprocess.image_std_tensor = paddle.to_tensor(image_preprocess.image_std, dtype="float32").reshape( + [1, 3, 1, 1] + ) + image_preprocess.rescale_factor = paddle.to_tensor(image_preprocess.rescale_factor, dtype="float32") + image_preprocess.image_mean_tensor = image_preprocess.image_mean_tensor.squeeze([-2, -1]).repeat_interleave( + self.model_config.vision_config.patch_size**2 * 1, -1 + ) + image_preprocess.image_std_tensor = image_preprocess.image_std_tensor.squeeze([-2, -1]).repeat_interleave( + self.model_config.vision_config.patch_size**2 * 1, -1 + ) self.image_preprocess = image_preprocess def _preprocess_mm_task(self, one: dict) -> None: @@ -1456,8 +1366,7 @@ class GPUModelRunner(ModelRunnerBase): if one["images"] is not None: image_type_ids = one["image_type_ids"][np.newaxis, :] images = one["images"] - image_type_ids = paddle.to_tensor(image_type_ids, - dtype=paddle.int64) + image_type_ids = paddle.to_tensor(image_type_ids, dtype=paddle.int64) images = paddle.to_tensor(images, dtype="uint8") grid_thw = paddle.to_tensor(one["grid_thw"], dtype="int64") else: @@ -1466,8 +1375,7 @@ class GPUModelRunner(ModelRunnerBase): grid_thw = None if one["position_ids"] is not None: - position_ids = paddle.to_tensor(one["position_ids"], - dtype="int64").unsqueeze([0]) + position_ids = paddle.to_tensor(one["position_ids"], dtype="int64").unsqueeze([0]) else: position_ids = None @@ -1500,20 +1408,17 @@ class GPUModelRunner(ModelRunnerBase): image_mask = input_ids == self.model_config.im_patch_id image_type_ids = inputs["image_type_ids"] with paddle.amp.auto_cast( - True, - custom_black_list=self.amp_black, - custom_white_list=self.amp_white, - level="O2", - dtype=self.parallel_config.dtype, + True, + custom_black_list=self.amp_black, + custom_white_list=self.amp_white, + level="O2", + 
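The image-preprocessing hunk above turns the per-channel mean and std into vectors that line up with flattened patches: reshape to `[1, 3, 1, 1]`, squeeze the spatial dims, then repeat each channel value `patch_size**2` times. A NumPy sketch of the resulting layout (patch size and channel values are illustrative):

```python
import numpy as np

patch_size = 14
image_mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)   # illustrative values

mean_tensor = image_mean.reshape(1, 3, 1, 1)                     # [1, C, 1, 1]
mean_tensor = mean_tensor.squeeze((-2, -1))                      # [1, C]
mean_tensor = np.repeat(mean_tensor, patch_size**2, axis=-1)     # [1, C * patch_size**2]

# Each channel's mean now repeats once per pixel of a flattened patch.
assert mean_tensor.shape == (1, 3 * patch_size**2)
```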
dtype=self.parallel_config.dtype, ): - image_features = self.model.vision_model.extract_feature( - images, grid_thw) + image_features = self.model.vision_model.extract_feature(images, grid_thw) if self.parallel_config.tensor_parallel_size > 1: S, C = image_features.shape - image_features = image_features.reshape( - [-1, C * self.model_config.spatial_conv_size**2]) - image_features = ScatterOp.apply(image_features, - axis=-1) # mp 切 Fea + image_features = image_features.reshape([-1, C * self.model_config.spatial_conv_size**2]) + image_features = ScatterOp.apply(image_features, axis=-1) # mp 切 Fea image_features = image_features.reshape([S, -1]) image_features = self.model.resampler_model( image_features, @@ -1530,11 +1435,11 @@ class GPUModelRunner(ModelRunnerBase): prefix_max_position_ids = paddle.max(position_ids) + 1 dec_pos_ids = paddle.tile( - paddle.arange(max_len, - dtype="int64").unsqueeze(0).unsqueeze(-1), [1, 1, 3]) + paddle.arange(max_len, dtype="int64").unsqueeze(0).unsqueeze(-1), + [1, 1, 3], + ) dec_pos_ids = dec_pos_ids + prefix_max_position_ids - position_ids_3d_real = paddle.concat([position_ids, dec_pos_ids], - axis=1) + position_ids_3d_real = paddle.concat([position_ids, dec_pos_ids], axis=1) rope_emb = get_rope_3d( position_ids=position_ids_3d_real, diff --git a/fastdeploy/worker/gpu_worker.py b/fastdeploy/worker/gpu_worker.py index 18c1b4302..7dcdcbe8f 100644 --- a/fastdeploy/worker/gpu_worker.py +++ b/fastdeploy/worker/gpu_worker.py @@ -13,13 +13,14 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + import gc import time from typing import List, Optional import paddle -import paddle.nn as nn import pynvml +from paddle import nn from fastdeploy.config import FDConfig from fastdeploy.engine.request import Request @@ -33,7 +34,6 @@ logger = get_logger("gpu_worker", "gpu_worker.log") class GpuWorker(WorkerBase): - def __init__( self, fd_config: FDConfig, @@ -52,8 +52,7 @@ class GpuWorker(WorkerBase): Initialize device and construct model runner """ self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8 - if self.device_config.device_type == "cuda" and paddle.device.is_compiled_with_cuda( - ): + if self.device_config.device_type == "cuda" and paddle.device.is_compiled_with_cuda(): # Set evironment variable self.device_ids = self.parallel_config.device_ids.split(",") self.device = f"gpu:{self.local_rank % self.max_chips_per_node}" @@ -63,12 +62,11 @@ class GpuWorker(WorkerBase): gc.collect() paddle.device.cuda.empty_cache() if self.parallel_config.enable_custom_all_reduce: - from fastdeploy.distributed.communication_op import \ - use_custom_allreduce + from fastdeploy.distributed.communication_op import use_custom_allreduce + use_custom_allreduce() else: - raise RuntimeError( - f"Not support device type: {self.device_config.device}") + raise RuntimeError(f"Not support device type: {self.device_config.device}") # Construct model runner self.model_runner: GPUModelRunner = GPUModelRunner( @@ -76,7 +74,8 @@ class GpuWorker(WorkerBase): device=self.device, device_id=self.device_ids[self.local_rank % self.max_chips_per_node], rank=self.rank, - local_rank=self.local_rank) + local_rank=self.local_rank, + ) def prefill_finished(self): """ @@ -102,33 +101,30 @@ class GpuWorker(WorkerBase): Gb = 1024**3 paddle.device.cuda.reset_max_memory_reserved(self.local_rank) paddle.device.cuda.reset_max_memory_allocated(self.local_rank) - paddle_reserved_mem_before_run = paddle.device.cuda.max_memory_reserved( - 
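The 3-D rope hunk above extends the prompt's position ids with decoder positions: the maximum prefill position plus one becomes the starting offset, an `arange` over the decode length is tiled across the three rotary axes, and the result is concatenated after the prefill ids before `get_rope_3d` is called. A NumPy sketch of the shape bookkeeping (sizes are illustrative):

```python
import numpy as np

position_ids = np.arange(6).reshape(1, 6, 1).repeat(3, axis=-1)   # [1, prefill_len, 3]
max_dec_len = 4

prefix_max = position_ids.max() + 1                                # first decode position
dec_pos_ids = np.tile(np.arange(max_dec_len).reshape(1, -1, 1), (1, 1, 3)) + prefix_max
position_ids_3d = np.concatenate([position_ids, dec_pos_ids], axis=1)

assert position_ids_3d.shape == (1, 6 + max_dec_len, 3)
print(position_ids_3d[0, :, 0])    # [0 1 2 3 4 5 6 7 8 9]
```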
self.local_rank) - paddle_allocated_mem_before_run = paddle.device.cuda.max_memory_allocated( - self.local_rank) # not reserved + paddle_reserved_mem_before_run = paddle.device.cuda.max_memory_reserved(self.local_rank) + paddle_allocated_mem_before_run = paddle.device.cuda.max_memory_allocated(self.local_rank) # not reserved pynvml.nvmlInit() - handle = pynvml.nvmlDeviceGetHandleByIndex( - int(self.device_ids[self.local_rank])) + handle = pynvml.nvmlDeviceGetHandleByIndex(int(self.device_ids[self.local_rank])) before_run_meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle) - logger.info(( - "Before running the profile, the memory usage info is as follows:", - f"\nDevice Total memory: {before_run_meminfo.total / Gb}", - f"\nDevice used memory: {before_run_meminfo.used / Gb}", - f"\nDevice free memory: {before_run_meminfo.free / Gb}", - f"\nPaddle reserved memory: {paddle_reserved_mem_before_run / Gb}", - f"\nPaddle allocated memory: {paddle_allocated_mem_before_run / Gb}" - )) + logger.info( + ( + "Before running the profile, the memory usage info is as follows:", + f"\nDevice Total memory: {before_run_meminfo.total / Gb}", + f"\nDevice used memory: {before_run_meminfo.used / Gb}", + f"\nDevice free memory: {before_run_meminfo.free / Gb}", + f"\nPaddle reserved memory: {paddle_reserved_mem_before_run / Gb}", + f"\nPaddle allocated memory: {paddle_allocated_mem_before_run / Gb}", + ) + ) # 2. Profile run self.model_runner.profile_run() # 3. Statistical memory information - paddle_reserved_mem_after_run = paddle.device.cuda.max_memory_reserved( - self.local_rank) - paddle_allocated_mem_after_run = paddle.device.cuda.max_memory_allocated( - self.local_rank) + paddle_reserved_mem_after_run = paddle.device.cuda.max_memory_reserved(self.local_rank) + paddle_allocated_mem_after_run = paddle.device.cuda.max_memory_allocated(self.local_rank) model_block_memory_used = self.cal_theortical_kvcache() paddle_peak_increase = paddle_reserved_mem_after_run - paddle_allocated_mem_before_run @@ -138,34 +134,39 @@ class GpuWorker(WorkerBase): after_run_meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle) pynvml.nvmlShutdown() - available_kv_cache_memory = after_run_meminfo.total * \ - self.parallel_config.gpu_memory_utilization - after_run_meminfo.used - paddle_peak_increase + available_kv_cache_memory = ( + after_run_meminfo.total * self.parallel_config.gpu_memory_utilization + - after_run_meminfo.used + - paddle_peak_increase + ) available_kv_cache_memory += model_block_memory_used * self.parallel_config.total_block_num end_time = time.perf_counter() - logger.info(( - "After running the profile, the memory usage info is as follows:", - f"\nDevice Total memory: {after_run_meminfo.total / Gb}", - f"\nDevice used memory: {after_run_meminfo.used / Gb}", - f"\nDevice free memory: {after_run_meminfo.free / Gb}", - f"\nPaddle reserved memory: {paddle_reserved_mem_after_run / Gb}", - f"\nPaddle allocated memory: {paddle_allocated_mem_after_run / Gb}", - f"\nAvailable KV Cache meomory: {available_kv_cache_memory / Gb}", - f"Profile time: {end_time - start_time}")) + logger.info( + ( + "After running the profile, the memory usage info is as follows:", + f"\nDevice Total memory: {after_run_meminfo.total / Gb}", + f"\nDevice used memory: {after_run_meminfo.used / Gb}", + f"\nDevice free memory: {after_run_meminfo.free / Gb}", + f"\nPaddle reserved memory: {paddle_reserved_mem_after_run / Gb}", + f"\nPaddle allocated memory: {paddle_allocated_mem_after_run / Gb}", + f"\nAvailable KV Cache meomory: {available_kv_cache_memory / 
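The profiling hunk above derives how much device memory can be handed to the KV cache: take the utilization budget of total device memory, subtract what the device uses after the profile run and the peak growth Paddle reserved during it, then add back the memory the temporary profile-run cache blocks occupied. A plain-Python sketch of the same accounting (all figures illustrative):

```python
Gb = 1024**3


def available_kv_cache_memory(total, used_after_run, gpu_memory_utilization,
                              reserved_after_run, allocated_before_run,
                              block_bytes, profile_block_num):
    paddle_peak_increase = reserved_after_run - allocated_before_run
    available = total * gpu_memory_utilization - used_after_run - paddle_peak_increase
    # The profile run held `profile_block_num` temporary cache blocks; reclaim them.
    return available + block_bytes * profile_block_num


print(available_kv_cache_memory(
    total=80 * Gb, used_after_run=46 * Gb, gpu_memory_utilization=0.9,
    reserved_after_run=44 * Gb, allocated_before_run=40 * Gb,
    block_bytes=12 * 2**20, profile_block_num=2000) / Gb, "GiB for KV cache")
```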
Gb}", + f"Profile time: {end_time - start_time}", + ) + ) return available_kv_cache_memory # return to caculate the block num in this device def load_model(self) -> None: - """ Load model """ + """Load model""" self.model_runner.load_model() def get_model(self) -> nn.Layer: - """ Get current model """ + """Get current model""" return self.model_runner.get_model() - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: - """ Initizlize the KV Cache """ + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: + """Initizlize the KV Cache""" pass def execute_model( @@ -177,7 +178,7 @@ class GpuWorker(WorkerBase): return output def preprocess_new_task(self, req_dicts: List[Request]) -> None: - """ Process new requests and then start the decode loop + """Process new requests and then start the decode loop TODO(gongshaotian):The scheduler should schedule the handling of prefill, and workers and modelrunners should not perceive it. """ @@ -195,10 +196,9 @@ class GpuWorker(WorkerBase): return True def cal_theortical_kvcache(self) -> int: - """ Calculate the block memory required """ + """Calculate the block memory required""" return self.model_runner.cal_theortical_kvcache() def reinitialize_kv_cache(self, num_gpu_blocks: int) -> None: - """ Reinitialize the kv cache using the parameters from the profile """ - self.model_runner.update_share_input_block_num( - num_gpu_blocks=num_gpu_blocks) + """Reinitialize the kv cache using the parameters from the profile""" + self.model_runner.update_share_input_block_num(num_gpu_blocks=num_gpu_blocks) diff --git a/fastdeploy/worker/iluvatar_model_runner.py b/fastdeploy/worker/iluvatar_model_runner.py index 90a72b86b..1bce9d19e 100644 --- a/fastdeploy/worker/iluvatar_model_runner.py +++ b/fastdeploy/worker/iluvatar_model_runner.py @@ -13,31 +13,34 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" + import os import time from typing import List, Optional import numpy as np import paddle -import paddle.nn as nn +from paddle import nn from paddleformers.utils.log import logger from fastdeploy.config import FDConfig from fastdeploy.engine.request import Request from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.model_executor.layers.attention import get_attention_backend -from fastdeploy.model_executor.layers.attention.base_attention_backend import \ - AttentionBackend +from fastdeploy.model_executor.layers.attention.base_attention_backend import ( + AttentionBackend, +) from fastdeploy.model_executor.layers.rotary_embedding import get_rope from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata -from fastdeploy.model_executor.layers.sample.sampler import ( - Sampler, SpeculativeSampler) +from fastdeploy.model_executor.layers.sample.sampler import Sampler, SpeculativeSampler from fastdeploy.model_executor.model_loader import get_model_from_loader from fastdeploy.model_executor.ops.iluvatar import set_value_by_flags_and_idx -from fastdeploy.model_executor.pre_and_post_process import (post_process, - pre_process, - rebuild_padding, - step_cuda) +from fastdeploy.model_executor.pre_and_post_process import ( + post_process, + pre_process, + rebuild_padding, + step_cuda, +) from fastdeploy.worker.model_runner_base import ModelRunnerBase from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput @@ -46,12 +49,13 @@ class IluvatarModelRunner(ModelRunnerBase): """ """ def __init__( - self, - fd_config: FDConfig, - device: str, # logic device - device_id: int, # physical device id - rank: int, - local_rank: int): + self, + fd_config: FDConfig, + device: str, # logic device + device_id: int, # physical device id + rank: int, + local_rank: int, + ): super().__init__(fd_config=fd_config, device=device) self.rank = rank self.local_rank = local_rank @@ -73,18 +77,17 @@ class IluvatarModelRunner(ModelRunnerBase): # Cuda Graph self.use_cudagraph = self.graph_opt_config.use_cudagraph - self.cudagraph_capture_sizes = list( - reversed(self.graph_opt_config.cudagraph_capture_sizes)) + self.cudagraph_capture_sizes = list(reversed(self.graph_opt_config.cudagraph_capture_sizes)) self.cudagraph_num_of_warmups = self.graph_opt_config.cudagraph_num_of_warmups - self.input_ids = paddle.zeros(self.parallel_config.max_num_seqs, - dtype='int32') + self.input_ids = paddle.zeros(self.parallel_config.max_num_seqs, dtype="int32") # Initialize share inputs self._init_share_inputs(self.parallel_config.max_num_seqs) self.infer_seed_increment = paddle.full( shape=[self.parallel_config.max_num_seqs, 1], fill_value=4, - dtype="int64") + dtype="int64", + ) self.restore_chunked_prefill_request = dict() # Initialize attention Backend @@ -99,14 +102,14 @@ class IluvatarModelRunner(ModelRunnerBase): # Postprocess Env params os.environ["INFERENCE_MSG_QUEUE_ID"] = str( - self.local_rank + - int(self.parallel_config.engine_worker_queue_port)) + self.local_rank + int(self.parallel_config.engine_worker_queue_port) + ) def prefill_finished(self): """ check whether prefill stage finished """ - if int(paddle.max(self.share_inputs['seq_lens_encoder'])) != 0: + if int(paddle.max(self.share_inputs["seq_lens_encoder"])) != 0: return 1 else: return 0 @@ -115,8 +118,9 @@ class IluvatarModelRunner(ModelRunnerBase): """ init logits processor for guided decoding """ - assert self.guided_backend is not None, "guided_backend is None, use "\ - "--guided-decoding-backend to specify 
the backend at server startup." + assert self.guided_backend is not None, ( + "guided_backend is None, use " "--guided-decoding-backend to specify the backend at server startup." + ) if request.guided_json is not None: schemata_key = ("json", request.guided_json) @@ -127,8 +131,10 @@ class IluvatarModelRunner(ModelRunnerBase): elif request.structural_tag is not None: schemata_key = ("structural_tag", request.structural_tag) - return self.guided_backend.get_logits_processor( - schemata_key=schemata_key), schemata_key + return ( + self.guided_backend.get_logits_processor(schemata_key=schemata_key), + schemata_key, + ) def insert_prefill_inputs(self, req_dicts: List[Request]): """ @@ -140,9 +146,8 @@ class IluvatarModelRunner(ModelRunnerBase): self.initialize_kv_cache() # NOTE(luotingdan): Set environment variable of prefill node - if req_dicts[-1].disaggregate_info is not None and req_dicts[ - -1].disaggregate_info["role"] == "prefill": - os.environ['PREFILL_NODE_ONE_STEP_STOP'] = "1" + if req_dicts[-1].disaggregate_info is not None and req_dicts[-1].disaggregate_info["role"] == "prefill": + os.environ["PREFILL_NODE_ONE_STEP_STOP"] = "1" req_len = len(req_dicts) for i in range(req_len): @@ -151,138 +156,110 @@ class IluvatarModelRunner(ModelRunnerBase): length = len(request.prompt_token_ids) prefill_tokens = [] - if (request.guided_json is not None - or request.guided_regex is not None - or request.structural_tag is not None - or request.guided_grammar is not None): - logits_info, schemata_key = self._init_logits_processor( - request) + if ( + request.guided_json is not None + or request.guided_regex is not None + or request.structural_tag is not None + or request.guided_grammar is not None + ): + logits_info, schemata_key = self._init_logits_processor(request) request.logits_processor, request.logits_cached = logits_info request.schemata_key = schemata_key # Is Decode Node - if req_dicts[i].disaggregate_info is not None and req_dicts[ - i].disaggregate_info["role"] == "decode": + if req_dicts[i].disaggregate_info is not None and req_dicts[i].disaggregate_info["role"] == "decode": prefill_tokens.append(request.prompt_token_ids[0]) - self.share_inputs["pre_ids"][idx:idx + - 1] = request.prompt_token_ids[-1] - self.share_inputs["input_ids"][idx:idx + 1, - 0] = request.prompt_token_ids[0] - self.share_inputs['seq_lens_encoder'][idx:idx + 1] = 0 - self.share_inputs['seq_lens_decoder'][idx:idx + 1] = length - self.share_inputs['seq_lens_this_time'][idx:idx + 1] = 1 - self.share_inputs['step_seq_lens_encoder'][idx:idx + 1] = 0 - self.share_inputs['step_seq_lens_decoder'][idx:idx + - 1] = length - self.share_inputs["prompt_lens"][idx:idx + 1] = length - self.share_inputs['step_idx'][idx:idx + 1] = 1 + self.share_inputs["pre_ids"][idx : idx + 1] = request.prompt_token_ids[-1] + self.share_inputs["input_ids"][idx : idx + 1, 0] = request.prompt_token_ids[0] + self.share_inputs["seq_lens_encoder"][idx : idx + 1] = 0 + self.share_inputs["seq_lens_decoder"][idx : idx + 1] = length + self.share_inputs["seq_lens_this_time"][idx : idx + 1] = 1 + self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = 0 + self.share_inputs["step_seq_lens_decoder"][idx : idx + 1] = length + self.share_inputs["prompt_lens"][idx : idx + 1] = length + self.share_inputs["step_idx"][idx : idx + 1] = 1 if self.speculative_decoding: num_prefill_send_token = self.speculative_config.num_speculative_tokens + 1 - self.share_inputs['draft_tokens'][idx:idx + 1, 0:num_prefill_send_token] =\ - 
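`_init_logits_processor` above builds a cache key from whichever guided-decoding constraint the request carries. Only the JSON and structural-tag branches are visible in this hunk, so the key strings used for the regex and grammar branches below are assumptions:

```python
def build_schemata_key(request) -> tuple:
    """Pick a (kind, spec) cache key for the request's guided-decoding constraint."""
    if request.guided_json is not None:
        return ("json", request.guided_json)
    if request.guided_regex is not None:
        return ("regex", request.guided_regex)          # key name assumed
    if request.guided_grammar is not None:
        return ("grammar", request.guided_grammar)      # key name assumed
    if request.structural_tag is not None:
        return ("structural_tag", request.structural_tag)
    raise ValueError("request carries no guided-decoding constraint")
```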
paddle.to_tensor(request.draft_token_ids[0:num_prefill_send_token], dtype="int64") - self.share_inputs['seq_lens_this_time'][ - idx:idx + 1] = num_prefill_send_token + self.share_inputs["draft_tokens"][idx : idx + 1, 0:num_prefill_send_token] = paddle.to_tensor( + request.draft_token_ids[0:num_prefill_send_token], + dtype="int64", + ) + self.share_inputs["seq_lens_this_time"][idx : idx + 1] = num_prefill_send_token else: - self.share_inputs["pre_ids"][idx:idx + 1] = -1 - self.share_inputs["step_idx"][idx:idx + 1] = 0 - self.share_inputs["input_ids"][idx:idx + - 1, :length] = np.array( - request.prompt_token_ids) + self.share_inputs["pre_ids"][idx : idx + 1] = -1 + self.share_inputs["step_idx"][idx : idx + 1] = 0 + self.share_inputs["input_ids"][idx : idx + 1, :length] = np.array(request.prompt_token_ids) # Use chunked prefill if self.parallel_config.enable_chunked_prefill: request.set("chunk_idx", 1) - logger.info( - f"prefill_chunk_info: {request.prefill_chunk_info}") + logger.info(f"prefill_chunk_info: {request.prefill_chunk_info}") token_chunk_size = request.prefill_chunk_info[0] - self.share_inputs["seq_lens_this_time"][ - idx:idx + 1] = token_chunk_size - self.share_inputs['input_ids'][ - idx, :token_chunk_size] = np.array( - request.prompt_token_ids[:token_chunk_size]) - self.share_inputs['step_seq_lens_encoder'][ - idx:idx + 1] = token_chunk_size - self.share_inputs['seq_lens_encoder'][idx:idx + - 1] = token_chunk_size - self.share_inputs['seq_lens_decoder'][ - idx:idx + 1] = request.get("seq_lens_decoder", 0) - self.share_inputs['step_seq_lens_decoder'][ - idx:idx + 1] = request.get("seq_lens_decoder", 0) - self.share_inputs["prompt_lens"][idx:idx + 1] = token_chunk_size + self.share_inputs["seq_lens_this_time"][idx : idx + 1] = token_chunk_size + self.share_inputs["input_ids"][idx, :token_chunk_size] = np.array( + request.prompt_token_ids[:token_chunk_size] + ) + self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = token_chunk_size + self.share_inputs["seq_lens_encoder"][idx : idx + 1] = token_chunk_size + self.share_inputs["seq_lens_decoder"][idx : idx + 1] = request.get("seq_lens_decoder", 0) + self.share_inputs["step_seq_lens_decoder"][idx : idx + 1] = request.get("seq_lens_decoder", 0) + self.share_inputs["prompt_lens"][idx : idx + 1] = token_chunk_size else: - self.share_inputs['seq_lens_decoder'][ - idx:idx + 1] = request.get("seq_lens_decoder", 0) - self.share_inputs['step_seq_lens_decoder'][ - idx:idx + 1] = request.get("seq_lens_decoder", 0) - self.share_inputs['seq_lens_this_time'][idx:idx + - 1] = length - self.share_inputs['step_seq_lens_encoder'][idx:idx + - 1] = length - self.share_inputs['seq_lens_encoder'][idx:idx + 1] = length - self.share_inputs["prompt_lens"][idx:idx + 1] = length + self.share_inputs["seq_lens_decoder"][idx : idx + 1] = request.get("seq_lens_decoder", 0) + self.share_inputs["step_seq_lens_decoder"][idx : idx + 1] = request.get("seq_lens_decoder", 0) + self.share_inputs["seq_lens_this_time"][idx : idx + 1] = length + self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = length + self.share_inputs["seq_lens_encoder"][idx : idx + 1] = length + self.share_inputs["prompt_lens"][idx : idx + 1] = length - if len(request.eos_token_ids - ) < self.parallel_config.eos_tokens_lens: + if len(request.eos_token_ids) < self.parallel_config.eos_tokens_lens: request.eos_token_ids.append(request.eos_token_ids[0]) - self.share_inputs["eos_token_id"][:] = np.array( - request.eos_token_ids, dtype="int64").reshape(-1, 1) - 
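When chunked prefill is enabled, the insertion hunk above only schedules the first chunk of the prompt and sets `chunk_idx = 1` so later steps feed the remainder. A condensed sketch of the per-slot values written for that first chunk (a dict stands in for the shared tensors):

```python
def first_chunk_slot_state(prompt_token_ids, prefill_chunk_info, seq_lens_decoder=0):
    token_chunk_size = prefill_chunk_info[0]
    return {
        "chunk_idx": 1,                                    # next chunk to schedule
        "input_ids": prompt_token_ids[:token_chunk_size],
        "seq_lens_this_time": token_chunk_size,
        "seq_lens_encoder": token_chunk_size,
        "step_seq_lens_encoder": token_chunk_size,
        "seq_lens_decoder": seq_lens_decoder,              # carried over from the request, default 0
        "prompt_lens": token_chunk_size,
    }


print(first_chunk_slot_state(list(range(1300)), [512, 512, 276])["seq_lens_this_time"])  # 512
```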
self.share_inputs["top_p"][idx:idx + 1] = request.get("top_p", 0.7) - self.share_inputs["top_k"][idx:idx + 1] = request.get("top_k", 0) - self.share_inputs["temperature"][idx:idx + 1] = request.get( - "temperature", 0.95) - self.share_inputs["penalty_score"][idx:idx + 1] = request.get( - "repetition_penalty", 1.0) - self.share_inputs["frequency_score"][idx:idx + 1] = request.get( - "frequency_penalty", 0.0) - self.share_inputs["presence_score"][idx:idx + 1] = request.get( - "presence_penalty", 0.0) + self.share_inputs["eos_token_id"][:] = np.array(request.eos_token_ids, dtype="int64").reshape(-1, 1) + self.share_inputs["top_p"][idx : idx + 1] = request.get("top_p", 0.7) + self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0) + self.share_inputs["temperature"][idx : idx + 1] = request.get("temperature", 0.95) + self.share_inputs["penalty_score"][idx : idx + 1] = request.get("repetition_penalty", 1.0) + self.share_inputs["frequency_score"][idx : idx + 1] = request.get("frequency_penalty", 0.0) + self.share_inputs["presence_score"][idx : idx + 1] = request.get("presence_penalty", 0.0) - self.share_inputs["min_dec_len"][idx:idx + 1] = request.get( - "min_tokens", 1) - self.share_inputs["max_dec_len"][idx:idx + 1] = request.get( - "max_tokens", self.model_config.max_length) - self.share_inputs["stop_flags"][idx:idx + 1] = False + self.share_inputs["min_dec_len"][idx : idx + 1] = request.get("min_tokens", 1) + self.share_inputs["max_dec_len"][idx : idx + 1] = request.get("max_tokens", self.model_config.max_length) + self.share_inputs["stop_flags"][idx : idx + 1] = False - self.share_inputs["first_token_ids"][ - idx:idx + 1] = self.share_inputs["input_ids"][idx:idx + 1, :1] - self.share_inputs["ori_seq_lens_encoder"][idx:idx + 1] = length + self.share_inputs["first_token_ids"][idx : idx + 1] = self.share_inputs["input_ids"][idx : idx + 1, :1] + self.share_inputs["ori_seq_lens_encoder"][idx : idx + 1] = length if request.get("seed") is not None: - self.share_inputs["infer_seed"][idx:idx + - 1] = request.get("seed") + self.share_inputs["infer_seed"][idx : idx + 1] = request.get("seed") encoder_block_num = len(request.get("block_tables")) - self.share_inputs["encoder_block_lens"][idx:idx + - 1] = encoder_block_num - self.share_inputs["block_tables"][idx:idx + 1, :] = -1 - self.share_inputs["block_tables"][ - idx:idx + 1, :encoder_block_num] = np.array( - request.block_tables, dtype="int32") + self.share_inputs["encoder_block_lens"][idx : idx + 1] = encoder_block_num + self.share_inputs["block_tables"][idx : idx + 1, :] = -1 + self.share_inputs["block_tables"][idx : idx + 1, :encoder_block_num] = np.array( + request.block_tables, dtype="int32" + ) - if request.get("stop_token_ids") is not None and request.get( - "stop_seqs_len") is not None: + if request.get("stop_token_ids") is not None and request.get("stop_seqs_len") is not None: stop_seqs_num = len(request.get("stop_seqs_len")) - for i in range(stop_seqs_num, - self.model_config.max_stop_seqs_num): + for i in range(stop_seqs_num, self.model_config.max_stop_seqs_num): request.stop_seqs_len.append(0) - self.share_inputs["stop_seqs_len"][:] = np.array( - request.stop_seqs_len, dtype="int32") - self.share_inputs["stop_seqs"][:stop_seqs_num, :len( - request.get("stop_token_ids")[0])] = np.array( - request.get("stop_token_ids"), dtype="int64") + self.share_inputs["stop_seqs_len"][:] = np.array(request.stop_seqs_len, dtype="int32") + self.share_inputs["stop_seqs"][:stop_seqs_num, : len(request.get("stop_token_ids")[0])] = np.array( + 
request.get("stop_token_ids"), dtype="int64" + ) - self.sampler.apply_logits_processor( - idx, request.get("logits_processor"), prefill_tokens) + self.sampler.apply_logits_processor(idx, request.get("logits_processor"), prefill_tokens) self.share_inputs["not_need_stop"][0] = True - def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int, - expected_decode_len: int): - """ Set dummy prefill inputs to share_inputs """ + def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int, expected_decode_len: int): + """Set dummy prefill inputs to share_inputs""" # NOTE(gongshaotian): The maximum decoding length is equal to the expected decoded tokens plus the eos token max_dec_len = expected_decode_len + 1 - full_length = min(num_tokens // batch_size, - self.parallel_config.max_model_len - max_dec_len) + full_length = min( + num_tokens // batch_size, + self.parallel_config.max_model_len - max_dec_len, + ) input_length = int(full_length * self.parallel_config.kv_cache_ratio) block_num = ( input_length + self.parallel_config.block_size - 1 @@ -290,31 +267,25 @@ class IluvatarModelRunner(ModelRunnerBase): for i in range(batch_size): idx = i - self.share_inputs["input_ids"][idx:idx + - 1, :input_length] = np.array( - [5] * input_length) - self.share_inputs["prompt_ids"][idx:idx + 1, :input_length] = np.array( - [5] * input_length) - self.share_inputs["eos_token_id"][:] = np.array( - [2], dtype="int64").reshape(-1, 1) - self.share_inputs["seq_lens_this_time"][idx:idx + 1] = input_length - self.share_inputs["step_seq_lens_encoder"][idx:idx + - 1] = input_length - self.share_inputs["seq_lens_encoder"][idx:idx + 1] = input_length - self.share_inputs["seq_lens_decoder"][idx:idx + 1] = 0 - self.share_inputs["prompt_lens"][idx:idx + 1] = 0 - self.share_inputs["step_idx"][idx:idx + 1] = 0 - self.share_inputs["max_dec_len"][idx:idx + 1] = max_dec_len - self.share_inputs["stop_flags"][idx:idx + 1] = False + self.share_inputs["input_ids"][idx : idx + 1, :input_length] = np.array([5] * input_length) + self.share_inputs["prompt_ids"][idx : idx + 1, :input_length] = np.array([5] * input_length) + self.share_inputs["eos_token_id"][:] = np.array([2], dtype="int64").reshape(-1, 1) + self.share_inputs["seq_lens_this_time"][idx : idx + 1] = input_length + self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = input_length + self.share_inputs["seq_lens_encoder"][idx : idx + 1] = input_length + self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0 + self.share_inputs["prompt_lens"][idx : idx + 1] = 0 + self.share_inputs["step_idx"][idx : idx + 1] = 0 + self.share_inputs["max_dec_len"][idx : idx + 1] = max_dec_len + self.share_inputs["stop_flags"][idx : idx + 1] = False - self.share_inputs["first_token_ids"][ - idx:idx + 1] = self.share_inputs["input_ids"][idx:idx + 1, :1] - self.share_inputs["ori_seq_lens_encoder"][idx:idx + - 1] = input_length + self.share_inputs["first_token_ids"][idx : idx + 1] = self.share_inputs["input_ids"][idx : idx + 1, :1] + self.share_inputs["ori_seq_lens_encoder"][idx : idx + 1] = input_length - self.share_inputs["encoder_block_lens"][idx:idx + 1] = block_num - self.share_inputs["block_tables"][idx : idx + 1, :block_num] = np.arange(idx * block_num, \ - (idx + 1) * block_num, 1) + self.share_inputs["encoder_block_lens"][idx : idx + 1] = block_num + self.share_inputs["block_tables"][idx : idx + 1, :block_num] = np.arange( + idx * block_num, (idx + 1) * block_num, 1 + ) def _init_share_inputs(self, max_num_seqs: int): """Initialize all share buffers for model inputs. 
@@ -326,211 +297,154 @@ class IluvatarModelRunner(ModelRunnerBase): self.share_inputs["pre_ids"] = paddle.full( [max_num_seqs, self.parallel_config.max_model_len], -1, - dtype='int64') + dtype="int64", + ) self.share_inputs["input_ids"] = paddle.full( [max_num_seqs, self.parallel_config.max_model_len], self.parallel_config.pad_token_id, - dtype='int64') + dtype="int64", + ) self.share_inputs["prompt_ids"] = paddle.full( [max_num_seqs, self.parallel_config.max_model_len], self.parallel_config.pad_token_id, - dtype='int64') - self.share_inputs["eos_token_id"] = paddle.full( - [self.parallel_config.eos_tokens_lens, 1], 0, dtype='int64') - self.share_inputs["top_p"] = paddle.full([max_num_seqs, 1], - self.model_config.top_p, - dtype='float32') - self.share_inputs["top_k"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int64') + dtype="int64", + ) + self.share_inputs["eos_token_id"] = paddle.full([self.parallel_config.eos_tokens_lens, 1], 0, dtype="int64") + self.share_inputs["top_p"] = paddle.full([max_num_seqs, 1], self.model_config.top_p, dtype="float32") + self.share_inputs["top_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int64") self.share_inputs["temperature"] = paddle.full( - [max_num_seqs, 1], self.model_config.temperature, dtype='float32') + [max_num_seqs, 1], self.model_config.temperature, dtype="float32" + ) self.share_inputs["penalty_score"] = paddle.full( - [max_num_seqs, 1], - self.model_config.penalty_score, - dtype='float32') + [max_num_seqs, 1], self.model_config.penalty_score, dtype="float32" + ) self.share_inputs["frequency_score"] = paddle.full( [max_num_seqs, 1], self.model_config.frequency_score, - dtype='float32') + dtype="float32", + ) self.share_inputs["presence_score"] = paddle.full( - [max_num_seqs, 1], - self.model_config.presence_score, - dtype='float32') + [max_num_seqs, 1], self.model_config.presence_score, dtype="float32" + ) - self.share_inputs["min_dec_len"] = paddle.full( - [max_num_seqs, 1], self.model_config.min_length, dtype='int64') - self.share_inputs["max_dec_len"] = paddle.full( - [max_num_seqs, 1], self.model_config.max_length, dtype='int64') - self.share_inputs["min_length"] = paddle.full( - [max_num_seqs, 1], self.model_config.min_length, dtype='int64') - self.share_inputs["max_length"] = paddle.full( - [max_num_seqs, 1], self.model_config.max_length, dtype='int64') - self.share_inputs["seq_lens_this_time"] = paddle.full(max_num_seqs, - 0, - dtype='int32') - self.share_inputs["seq_lens_encoder"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int32') - self.share_inputs["seq_lens_decoder"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int32') - self.share_inputs["step_seq_lens_encoder"] = paddle.full( - [max_num_seqs, 1], 0, dtype='int32') - self.share_inputs["step_seq_lens_decoder"] = paddle.full( - [max_num_seqs, 1], 0, dtype='int32') - self.share_inputs["prompt_lens"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int64') - self.share_inputs["step_idx"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int64') + self.share_inputs["min_dec_len"] = paddle.full([max_num_seqs, 1], self.model_config.min_length, dtype="int64") + self.share_inputs["max_dec_len"] = paddle.full([max_num_seqs, 1], self.model_config.max_length, dtype="int64") + self.share_inputs["min_length"] = paddle.full([max_num_seqs, 1], self.model_config.min_length, dtype="int64") + self.share_inputs["max_length"] = paddle.full([max_num_seqs, 1], self.model_config.max_length, dtype="int64") + self.share_inputs["seq_lens_this_time"] = paddle.full(max_num_seqs, 0, dtype="int32") + 
self.share_inputs["seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["seq_lens_decoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["step_seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["step_seq_lens_decoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["prompt_lens"] = paddle.full([max_num_seqs, 1], 0, dtype="int64") + self.share_inputs["step_idx"] = paddle.full([max_num_seqs, 1], 0, dtype="int64") self.share_inputs["not_need_stop"] = paddle.full( - [1], False, - dtype='bool').cpu() # TODO(gongshaotian): move to pinnd memory - self.share_inputs["stop_flags"] = paddle.full([max_num_seqs, 1], - True, - dtype='bool') - self.share_inputs["stop_nums"] = paddle.full([1], - max_num_seqs, - dtype='int64') + [1], False, dtype="bool" + ).cpu() # TODO(gongshaotian): move to pinnd memory + self.share_inputs["stop_flags"] = paddle.full([max_num_seqs, 1], True, dtype="bool") + self.share_inputs["stop_nums"] = paddle.full([1], max_num_seqs, dtype="int64") - self.share_inputs["bad_tokens"] = paddle.full([1], -1, dtype='int64') - self.share_inputs["next_tokens"] = paddle.full([max_num_seqs, 1], - -1, - dtype='int64') - self.share_inputs["is_block_step"] = paddle.full([max_num_seqs], - False, - dtype='bool') - self.share_inputs["encoder_block_lens"] = paddle.full([max_num_seqs], - 0, - dtype='int32') - self.share_inputs["step_block_list"] = paddle.full([max_num_seqs], - -1, - dtype='int32') - self.share_inputs["step_lens"] = paddle.full([1], 0, dtype='int32') - self.share_inputs["recover_block_list"] = paddle.full([max_num_seqs], - -1, - dtype='int32') - self.share_inputs["recover_lens"] = paddle.full([1], 0, dtype='int32') - self.share_inputs["need_block_list"] = paddle.full([max_num_seqs], - -1, - dtype='int32') - self.share_inputs["need_block_len"] = paddle.full([1], - 0, - dtype='int32') - self.share_inputs["used_list_len"] = paddle.full([max_num_seqs], - 0, - dtype='int32') - self.share_inputs["infer_seed"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int64') - self.share_inputs["first_token_ids"] = paddle.full([max_num_seqs, 1], - -1, - dtype='int64') - self.share_inputs["ori_seq_lens_encoder"] = paddle.full( - [max_num_seqs, 1], 0, dtype='int32') - self.share_inputs["system_lens"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int32') - self.share_inputs["system_ids"] = paddle.full([max_num_seqs, 1], - -1, - dtype='int32') + self.share_inputs["bad_tokens"] = paddle.full([1], -1, dtype="int64") + self.share_inputs["next_tokens"] = paddle.full([max_num_seqs, 1], -1, dtype="int64") + self.share_inputs["is_block_step"] = paddle.full([max_num_seqs], False, dtype="bool") + self.share_inputs["encoder_block_lens"] = paddle.full([max_num_seqs], 0, dtype="int32") + self.share_inputs["step_block_list"] = paddle.full([max_num_seqs], -1, dtype="int32") + self.share_inputs["step_lens"] = paddle.full([1], 0, dtype="int32") + self.share_inputs["recover_block_list"] = paddle.full([max_num_seqs], -1, dtype="int32") + self.share_inputs["recover_lens"] = paddle.full([1], 0, dtype="int32") + self.share_inputs["need_block_list"] = paddle.full([max_num_seqs], -1, dtype="int32") + self.share_inputs["need_block_len"] = paddle.full([1], 0, dtype="int32") + self.share_inputs["used_list_len"] = paddle.full([max_num_seqs], 0, dtype="int32") + self.share_inputs["infer_seed"] = paddle.full([max_num_seqs, 1], 0, dtype="int64") + self.share_inputs["first_token_ids"] = paddle.full([max_num_seqs, 
1], -1, dtype="int64") + self.share_inputs["ori_seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["system_lens"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["system_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int32") self.share_inputs["ids_remove_padding"] = paddle.full( [max_num_seqs * self.parallel_config.max_model_len], 0, - dtype='int64') - self.share_inputs["cum_offsets"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int32') - self.share_inputs["padding_offset"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int32') - self.share_inputs["cu_seqlens_q"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int32') - self.share_inputs["cu_seqlens_k"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int32') + dtype="int64", + ) + self.share_inputs["cum_offsets"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["padding_offset"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["cu_seqlens_q"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["cu_seqlens_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") # AttentionBackend buffers - self.share_inputs["decoder_batch_ids"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int32') - self.share_inputs["decoder_tile_ids_per_batch"] = paddle.full( - [max_num_seqs, 1], 0, dtype='int32') + self.share_inputs["decoder_batch_ids"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["decoder_tile_ids_per_batch"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") # Initialize rotary position embedding - tmp_position_ids = paddle.arange( - self.parallel_config.max_model_len).reshape((1, -1)) + tmp_position_ids = paddle.arange(self.parallel_config.max_model_len).reshape((1, -1)) # TODO(gongshaotian): move to models self.share_inputs["rope_emb"] = get_rope( rotary_dim=self.model_config.head_dim, position_ids=tmp_position_ids, base=self.model_config.rope_theta, - model_config=self.model_config) + model_config=self.model_config, + ) # Set block tables pre_max_block_num = ( - self.parallel_config.max_model_len + - self.parallel_config.block_size - 1 + self.parallel_config.max_model_len + self.parallel_config.block_size - 1 ) // self.parallel_config.block_size + self.parallel_config.enc_dec_block_num - self.share_inputs["block_tables"] = paddle.full( - [max_num_seqs, pre_max_block_num], -1, dtype='int32') + self.share_inputs["block_tables"] = paddle.full([max_num_seqs, pre_max_block_num], -1, dtype="int32") # Initialize free list free_list = list( range( self.parallel_config.total_block_num - 1, - int(self.parallel_config.total_block_num * - self.parallel_config.kv_cache_ratio) - 1, -1)) + int(self.parallel_config.total_block_num * self.parallel_config.kv_cache_ratio) - 1, + -1, + ) + ) self.free_list_len = len(free_list) - self.share_inputs["free_list"] = paddle.to_tensor(free_list, - dtype="int32") - self.share_inputs["free_list_len"] = paddle.full([1], - self.free_list_len, - dtype="int32") + self.share_inputs["free_list"] = paddle.to_tensor(free_list, dtype="int32") + self.share_inputs["free_list_len"] = paddle.full([1], self.free_list_len, dtype="int32") # Initialize stop seqs - self.share_inputs["stop_seqs_len"] = paddle.full( - [self.model_config.max_stop_seqs_num], 0, dtype="int32") - self.share_inputs["stop_seqs"] = paddle.full([ - self.model_config.max_stop_seqs_num, - self.model_config.stop_seqs_max_len - ], - -1, - dtype="int32") + self.share_inputs["stop_seqs_len"] = 
paddle.full([self.model_config.max_stop_seqs_num], 0, dtype="int32") + self.share_inputs["stop_seqs"] = paddle.full( + [ + self.model_config.max_stop_seqs_num, + self.model_config.stop_seqs_max_len, + ], + -1, + dtype="int32", + ) if self.speculative_decoding: max_draft_token_num = self.speculative_config.num_speculative_tokens self.share_inputs["input_ids_cpu"] = paddle.full( shape=[max_num_seqs, self.parallel_config.max_model_len], fill_value=1, - dtype='int64').cpu() - self.share_inputs['accept_tokens'] = paddle.full( + dtype="int64", + ).cpu() + self.share_inputs["accept_tokens"] = paddle.full( shape=[max_num_seqs, max_draft_token_num + 1], fill_value=0, - dtype="int64") - self.share_inputs['accept_num'] = paddle.full(shape=[max_num_seqs], - fill_value=0, - dtype='int32') - self.share_inputs['draft_tokens'] = paddle.full( + dtype="int64", + ) + self.share_inputs["accept_num"] = paddle.full(shape=[max_num_seqs], fill_value=0, dtype="int32") + self.share_inputs["draft_tokens"] = paddle.full( shape=[max_num_seqs, max_draft_token_num + 1], fill_value=0, - dtype="int64") + dtype="int64", + ) - self.share_inputs['actual_draft_token_num'] = paddle.full( + self.share_inputs["actual_draft_token_num"] = paddle.full( shape=[max_num_seqs], fill_value=max_draft_token_num, - dtype="int32") - self.share_inputs["output_cum_offsets"] = paddle.full( - shape=[max_num_seqs, 1], fill_value=0, dtype='int32') + dtype="int32", + ) + self.share_inputs["output_cum_offsets"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") self.share_inputs["output_padding_offset"] = paddle.full( shape=[max_num_seqs * (max_draft_token_num + 1)], fill_value=0, - dtype="int32") + dtype="int32", + ) def _prepare_inputs(self) -> None: - """ prepare the model inputs """ + """prepare the model inputs""" # Remove padding ( ids_remove_padding, @@ -541,19 +455,22 @@ class IluvatarModelRunner(ModelRunnerBase): output_cum_offsets, output_padding_offset, ) = pre_process( - self.parallel_config.max_model_len, self.share_inputs["input_ids"], - self.share_inputs["seq_lens_this_time"], self.speculative_decoding, - self.share_inputs["draft_tokens"] if self.speculative_decoding else - None, self.share_inputs["seq_lens_encoder"], - self.share_inputs["seq_lens_decoder"]) - cu_seqlens_k = paddle.concat([ - paddle.to_tensor([0], dtype=paddle.int32), - paddle.cumsum(self.share_inputs["seq_lens_this_time"] + - self.share_inputs["seq_lens_decoder"][:, 0]) - ]) + self.parallel_config.max_model_len, + self.share_inputs["input_ids"], + self.share_inputs["seq_lens_this_time"], + self.speculative_decoding, + (self.share_inputs["draft_tokens"] if self.speculative_decoding else None), + self.share_inputs["seq_lens_encoder"], + self.share_inputs["seq_lens_decoder"], + ) + cu_seqlens_k = paddle.concat( + [ + paddle.to_tensor([0], dtype=paddle.int32), + paddle.cumsum(self.share_inputs["seq_lens_this_time"] + self.share_inputs["seq_lens_decoder"][:, 0]), + ] + ) - self.share_inputs["ids_remove_padding"].copy_(ids_remove_padding, - False) + self.share_inputs["ids_remove_padding"].copy_(ids_remove_padding, False) self.share_inputs["cum_offsets"].copy_(cum_offsets, False) self.share_inputs["padding_offset"].copy_(padding_offset, False) self.share_inputs["cu_seqlens_q"].copy_(cu_seqlens_q, False) @@ -561,10 +478,8 @@ class IluvatarModelRunner(ModelRunnerBase): # For speculative decoding if self.speculative_decoding: - self.share_inputs["output_cum_offsets"].copy_( - output_cum_offsets, False) - self.share_inputs["output_padding_offset"].copy_( - 
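`_prepare_inputs` above builds `cu_seqlens_k` as a prefix sum over each slot's key length, i.e. the tokens processed this step plus what already sits in the decoder cache, with a leading zero. A NumPy sketch of the same construction:

```python
import numpy as np

seq_lens_this_time = np.array([5, 1, 1], dtype=np.int32)   # new tokens per slot
seq_lens_decoder = np.array([0, 12, 7], dtype=np.int32)    # cached tokens per slot

key_lens = seq_lens_this_time + seq_lens_decoder
cu_seqlens_k = np.concatenate([[0], np.cumsum(key_lens)])

print(cu_seqlens_k)   # [ 0  5 18 26]
```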
output_padding_offset, False) + self.share_inputs["output_cum_offsets"].copy_(output_cum_offsets, False) + self.share_inputs["output_padding_offset"].copy_(output_padding_offset, False) # Initialize forward meta data self.initialize_forward_meta() @@ -587,9 +502,8 @@ class IluvatarModelRunner(ModelRunnerBase): ) def load_model(self) -> None: - """ load or download model """ - logger.info( - f"Starting to load model {self.model_config.architectures[0]}") + """load or download model""" + logger.info(f"Starting to load model {self.model_config.architectures[0]}") time_before_load = time.perf_counter() # 1. Load original model self.model = get_model_from_loader(fd_config=self.fd_config) @@ -599,11 +513,10 @@ class IluvatarModelRunner(ModelRunnerBase): # 3. Load drafter model(for speculative decoding) time_after_load = time.perf_counter() - logger.info( - f"Model loading took {time_after_load - time_before_load} seconds") + logger.info(f"Model loading took {time_after_load - time_before_load} seconds") def get_model(self) -> nn.Layer: - """ get current model """ + """get current model""" return self.model def initialize_forward_meta(self): @@ -626,7 +539,7 @@ class IluvatarModelRunner(ModelRunnerBase): cu_seqlens_q=self.share_inputs["cu_seqlens_q"], cu_seqlens_k=self.share_inputs["cu_seqlens_k"], block_tables=self.share_inputs["block_tables"], - caches=self.share_inputs["caches"] + caches=self.share_inputs["caches"], ) # Initialzie attention meta data @@ -649,28 +562,29 @@ class IluvatarModelRunner(ModelRunnerBase): # Get kv cache dtype cache_type = self.parallel_config.dtype - if (self.quant_config - and hasattr(self.quant_config, "kv_cache_quant_type") - and self.quant_config.kv_cache_quant_type is not None): - cache_type = 'uint8' + if ( + self.quant_config + and hasattr(self.quant_config, "kv_cache_quant_type") + and self.quant_config.kv_cache_quant_type is not None + ): + cache_type = "uint8" # Get kv cache shape - kv_cache_shape = self.attn_backends[0].get_kv_cache_shape( - max_num_blocks=max_block_num) + kv_cache_shape = self.attn_backends[0].get_kv_cache_shape(max_num_blocks=max_block_num) if not self.parallel_config.do_profile and ( - self.parallel_config.enable_prefix_caching \ - or self.parallel_config.splitwise_role != "mixed"): + self.parallel_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed" + ): raise NotImplementedError("Iluvatar does not support yet") else: for i in range(self.model_config.num_hidden_layers): - cache_kvs["key_caches_{}".format(i)] = paddle.full( + cache_kvs[f"key_caches_{i}"] = paddle.full( shape=kv_cache_shape, fill_value=0, dtype=cache_type, ) - cache_kvs["value_caches_{}".format(i)] = paddle.full( + cache_kvs[f"value_caches_{i}"] = paddle.full( shape=kv_cache_shape, fill_value=0, dtype=cache_type, @@ -690,36 +604,40 @@ class IluvatarModelRunner(ModelRunnerBase): num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_size self.model_config.kv_num_heads = max( 1, - int(self.model_config.num_key_value_heads) // - self.parallel_config.tensor_parallel_size) + int(self.model_config.num_key_value_heads) // self.parallel_config.tensor_parallel_size, + ) head_dim = self.model_config.head_dim # Get the attention backend attn_cls = get_attention_backend() - attn_backend = attn_cls(self.fd_config, - kv_num_heads=self.model_config.kv_num_heads, - num_heads=num_heads, - head_dim=head_dim) + attn_backend = attn_cls( + self.fd_config, + kv_num_heads=self.model_config.kv_num_heads, + num_heads=num_heads, + 
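The attention-backend setup above splits query heads evenly across tensor-parallel ranks and clamps the per-rank KV head count at one for heavily grouped GQA models; the KV cache itself is one key tensor and one value tensor per layer. A sketch of the head partitioning and the cache naming scheme (counts illustrative):

```python
def partition_heads(num_attention_heads, num_key_value_heads, tp_size):
    num_heads = num_attention_heads // tp_size
    kv_num_heads = max(1, num_key_value_heads // tp_size)   # never drop below one KV head
    return num_heads, kv_num_heads


def cache_names(num_hidden_layers):
    # One key cache and one value cache tensor per transformer layer.
    return ([f"key_caches_{i}" for i in range(num_hidden_layers)]
            + [f"value_caches_{i}" for i in range(num_hidden_layers)])


print(partition_heads(64, 8, 8))   # (8, 1)
print(cache_names(2))
```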
head_dim=head_dim, + ) if attn_backend is None: - raise NotImplementedError( - "Attention backend which you chose is not support by GPUModelRunner" - ) + raise NotImplementedError("Attention backend which you chose is not support by GPUModelRunner") self.attn_backends.append(attn_backend) - def _dummy_run(self, - num_tokens: paddle.Tensor, - batch_size: paddle.Tensor, - expected_decode_len: int = 1, - in_capturing: bool = False) -> paddle.Tensor: + def _dummy_run( + self, + num_tokens: paddle.Tensor, + batch_size: paddle.Tensor, + expected_decode_len: int = 1, + in_capturing: bool = False, + ) -> paddle.Tensor: """ Use dummy inputs to run before formal execution. Args: num_tokens: expected_decode_len: Expected number of tokens generated """ - self._dummy_prefill_inputs(num_tokens=num_tokens, - batch_size=batch_size, - expected_decode_len=expected_decode_len) + self._dummy_prefill_inputs( + num_tokens=num_tokens, + batch_size=batch_size, + expected_decode_len=expected_decode_len, + ) while True: # 1. Compute real num_tokens @@ -732,7 +650,8 @@ class IluvatarModelRunner(ModelRunnerBase): # 4. Run model model_output = self.model( ids_remove_padding=self.share_inputs["ids_remove_padding"], - forward_meta=self.forward_meta) + forward_meta=self.forward_meta, + ) hiddden_states = rebuild_padding( model_output, @@ -757,24 +676,22 @@ class IluvatarModelRunner(ModelRunnerBase): self.share_inputs["step_idx"], self.share_inputs["stop_flags"], ) - sampled_token_ids = self.sampler(logits, - self.sampling_metadata) + sampled_token_ids = self.sampler(logits, self.sampling_metadata) if self.parallel_config.tensor_parallel_size > 1: paddle.distributed.broadcast(sampled_token_ids, 0) else: - self.sampler(logits, self.sampling_metadata, - self.parallel_config.max_model_len, - self.share_inputs) + self.sampler( + logits, + self.sampling_metadata, + self.parallel_config.max_model_len, + self.share_inputs, + ) sampled_token_ids = None if self.parallel_config.tensor_parallel_size > 1: - paddle.distributed.broadcast( - self.share_inputs["accept_tokens"], 0) - paddle.distributed.broadcast( - self.share_inputs["accept_num"], 0) - paddle.distributed.broadcast(self.share_inputs["step_idx"], - 0) - paddle.distributed.broadcast( - self.share_inputs["stop_flags"], 0) + paddle.distributed.broadcast(self.share_inputs["accept_tokens"], 0) + paddle.distributed.broadcast(self.share_inputs["accept_num"], 0) + paddle.distributed.broadcast(self.share_inputs["step_idx"], 0) + paddle.distributed.broadcast(self.share_inputs["stop_flags"], 0) # 6. post process model_output_data = ModelOutputData( @@ -795,30 +712,33 @@ class IluvatarModelRunner(ModelRunnerBase): msg_queue_id=self.parallel_config.msg_queue_id, mp_rank=self.local_rank, use_ep=self.parallel_config.use_ep, - draft_tokens=self.share_inputs["draft_tokens"] - if self.speculative_decoding else None, - actual_draft_token_num=self. 
- share_inputs["actual_draft_token_num"] - if self.speculative_decoding else None, - accept_tokens=self.share_inputs["accept_tokens"] - if self.speculative_decoding else None, - accept_num=self.share_inputs["accept_num"] - if self.speculative_decoding else None) + draft_tokens=(self.share_inputs["draft_tokens"] if self.speculative_decoding else None), + actual_draft_token_num=( + self.share_inputs["actual_draft_token_num"] if self.speculative_decoding else None + ), + accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None), + accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None), + ) - post_process(sampled_token_ids=sampled_token_ids, - model_output=model_output_data, - speculative_decoding=self.speculative_decoding, - skip_save_output=True) + post_process( + sampled_token_ids=sampled_token_ids, + model_output=model_output_data, + speculative_decoding=self.speculative_decoding, + skip_save_output=True, + ) # 7. Updata 'infer_seed' and step_cuda() self.share_inputs["infer_seed"].add_(self.infer_seed_increment) self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED - step_cuda(self.share_inputs, self.parallel_config.block_size, - self.parallel_config.enc_dec_block_num, - self.speculative_config, - self.parallel_config.enable_prefix_caching) + step_cuda( + self.share_inputs, + self.parallel_config.block_size, + self.parallel_config.enc_dec_block_num, + self.speculative_config, + self.parallel_config.enable_prefix_caching, + ) - if int((self.share_inputs['seq_lens_this_time'] > 0).sum()) == 0: + if int((self.share_inputs["seq_lens_this_time"] > 0).sum()) == 0: break def _update_chunked_prefill(self, tasks): @@ -838,32 +758,25 @@ class IluvatarModelRunner(ModelRunnerBase): for id, task in list(self.restore_chunked_prefill_request.items()): idx = task.idx - logger.debug( - f"{task.request_id} chunked prefill {task.chunk_idx}/{len(task.prefill_chunk_info)}" - ) - start_idx = sum(task.prefill_chunk_info[:task.chunk_idx]) + logger.debug(f"{task.request_id} chunked prefill {task.chunk_idx}/{len(task.prefill_chunk_info)}") + start_idx = sum(task.prefill_chunk_info[: task.chunk_idx]) if task.chunk_idx == len(task.prefill_chunk_info): - self.share_inputs["seq_lens_this_time"][idx:idx + 1] = 1 - self.share_inputs['seq_lens_encoder'][idx:idx + 1] = 0 - self.share_inputs["step_idx"][idx:idx + 1] = 1 - self.share_inputs["seq_lens_decoder"][ - idx:idx + 1] = start_idx + task.get("seq_lens_decoder", 0) + self.share_inputs["seq_lens_this_time"][idx : idx + 1] = 1 + self.share_inputs["seq_lens_encoder"][idx : idx + 1] = 0 + self.share_inputs["step_idx"][idx : idx + 1] = 1 + self.share_inputs["seq_lens_decoder"][idx : idx + 1] = start_idx + task.get("seq_lens_decoder", 0) del self.restore_chunked_prefill_request[task.request_id] else: token_chunk_size = task.prefill_chunk_info[task.chunk_idx] - self.share_inputs["seq_lens_this_time"][idx:idx + - 1] = token_chunk_size - self.share_inputs['input_ids'][ - idx, :token_chunk_size] = np.array( - task.prompt_token_ids[start_idx:start_idx + - token_chunk_size]) - self.share_inputs['seq_lens_encoder'][idx:idx + - 1] = token_chunk_size - self.share_inputs["prompt_lens"][idx:idx + 1] += token_chunk_size - self.share_inputs["step_idx"][idx:idx + 1] = 0 - self.share_inputs["seq_lens_decoder"][ - idx:idx + 1] = start_idx + task.get("seq_lens_decoder", 0) + self.share_inputs["seq_lens_this_time"][idx : idx + 1] = token_chunk_size + self.share_inputs["input_ids"][idx, :token_chunk_size] = np.array( + 
task.prompt_token_ids[start_idx : start_idx + token_chunk_size] + ) + self.share_inputs["seq_lens_encoder"][idx : idx + 1] = token_chunk_size + self.share_inputs["prompt_lens"][idx : idx + 1] += token_chunk_size + self.share_inputs["step_idx"][idx : idx + 1] = 0 + self.share_inputs["seq_lens_decoder"][idx : idx + 1] = start_idx + task.get("seq_lens_decoder", 0) task.chunk_idx += 1 def _dummy_sampler_run(self) -> paddle.Tensor: @@ -875,26 +788,22 @@ class IluvatarModelRunner(ModelRunnerBase): Trigger CUDA Graph capture for all shapes in 'CudaGraphConfig.cudagraph_capture_sizes' """ if not self.use_cudagraph: - logger.info( - "Skipping CUDA graph capture. Please check GraphOptimizationConfig" - ) + logger.info("Skipping CUDA graph capture. Please check GraphOptimizationConfig") return time_before_capture = time.perf_counter() expected_decode_len = 1 capture_sizes = self.cudagraph_capture_sizes.copy() for batch_size in sorted(capture_sizes, reverse=True): - self._dummy_run(num_tokens=self.parallel_config.max_model_len, - batch_size=batch_size, - in_capturing=True, - expected_decode_len=expected_decode_len) - logger.info( - f"Warm up the model with the batch size:{batch_size}, num tokens:{expected_decode_len}" + self._dummy_run( + num_tokens=self.parallel_config.max_model_len, + batch_size=batch_size, + in_capturing=True, + expected_decode_len=expected_decode_len, ) + logger.info(f"Warm up the model with the batch size:{batch_size}, num tokens:{expected_decode_len}") time_after_capture = time.perf_counter() - logger.info( - f"Cuda Graph capturing took {time_after_capture - time_before_capture} seconds" - ) + logger.info(f"Cuda Graph capturing took {time_after_capture - time_before_capture} seconds") def _get_skip_idx(self, model_forward_batch): """ @@ -909,15 +818,12 @@ class IluvatarModelRunner(ModelRunnerBase): return skip_idx_list for task in model_forward_batch: - if task.get("prefill_chunk_info", - None) is None or task.chunk_idx >= len( - task.prefill_chunk_info): + if task.get("prefill_chunk_info", None) is None or task.chunk_idx >= len(task.prefill_chunk_info): continue skip_idx_list.append(task.idx) for task in self.restore_chunked_prefill_request.values(): - if task.idx in skip_idx_list or task.chunk_idx >= len( - task.prefill_chunk_info): + if task.idx in skip_idx_list or task.chunk_idx >= len(task.prefill_chunk_info): continue skip_idx_list.append(task.idx) @@ -953,7 +859,8 @@ class IluvatarModelRunner(ModelRunnerBase): # 3. 
Execute model model_output = self.model( ids_remove_padding=self.share_inputs["ids_remove_padding"], - forward_meta=self.forward_meta) + forward_meta=self.forward_meta, + ) hiddden_states = rebuild_padding( model_output, @@ -961,8 +868,7 @@ class IluvatarModelRunner(ModelRunnerBase): self.share_inputs["seq_lens_this_time"], self.share_inputs["seq_lens_decoder"], self.share_inputs["seq_lens_encoder"], - self.share_inputs["output_padding_offset"] - if self.speculative_decoding else None, + (self.share_inputs["output_padding_offset"] if self.speculative_decoding else None), self.parallel_config.max_model_len, ) @@ -988,17 +894,18 @@ class IluvatarModelRunner(ModelRunnerBase): paddle.distributed.broadcast(sampled_token_ids, 0) else: - self.sampler(logits, self.sampling_metadata, - self.parallel_config.max_model_len, self.share_inputs) + self.sampler( + logits, + self.sampling_metadata, + self.parallel_config.max_model_len, + self.share_inputs, + ) sampled_token_ids = None if self.parallel_config.tensor_parallel_size > 1: - paddle.distributed.broadcast( - self.share_inputs["accept_tokens"], 0) - paddle.distributed.broadcast(self.share_inputs["accept_num"], - 0) + paddle.distributed.broadcast(self.share_inputs["accept_tokens"], 0) + paddle.distributed.broadcast(self.share_inputs["accept_num"], 0) paddle.distributed.broadcast(self.share_inputs["step_idx"], 0) - paddle.distributed.broadcast(self.share_inputs["stop_flags"], - 0) + paddle.distributed.broadcast(self.share_inputs["stop_flags"], 0) # 5. Post Process model_output_data = ModelOutputData( @@ -1019,25 +926,25 @@ class IluvatarModelRunner(ModelRunnerBase): msg_queue_id=self.parallel_config.msg_queue_id, mp_rank=self.local_rank, use_ep=self.parallel_config.use_ep, - draft_tokens=self.share_inputs["draft_tokens"] - if self.speculative_decoding else None, - actual_draft_token_num=self.share_inputs["actual_draft_token_num"] - if self.speculative_decoding else None, - accept_tokens=self.share_inputs["accept_tokens"] - if self.speculative_decoding else None, - accept_num=self.share_inputs["accept_num"] - if self.speculative_decoding else None) + draft_tokens=(self.share_inputs["draft_tokens"] if self.speculative_decoding else None), + actual_draft_token_num=( + self.share_inputs["actual_draft_token_num"] if self.speculative_decoding else None + ), + accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None), + accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None), + ) - if self.speculative_config.method in ["mtp"] and \ - self.parallel_config.splitwise_role == "prefill": + if self.speculative_config.method in ["mtp"] and self.parallel_config.splitwise_role == "prefill": skip_save_output = True else: skip_save_output = False - post_process(sampled_token_ids=sampled_token_ids, - model_output=model_output_data, - save_each_rank=self.parallel_config.use_ep, - speculative_decoding=self.speculative_decoding, - skip_save_output=skip_save_output) + post_process( + sampled_token_ids=sampled_token_ids, + model_output=model_output_data, + save_each_rank=self.parallel_config.use_ep, + speculative_decoding=self.speculative_decoding, + skip_save_output=skip_save_output, + ) # 7. 
Updata 'infer_seed' and step_cuda() self.share_inputs["infer_seed"].add_(self.infer_seed_increment) @@ -1077,8 +984,7 @@ class IluvatarModelRunner(ModelRunnerBase): if hasattr(self.model, "empty_input_forward"): self.model.empty_input_forward() else: - raise ValueError( - f"{type(self.model)} has no attribute 'empty_input_forward") + raise ValueError(f"{type(self.model)} has no attribute 'empty_input_forward") def profile_run(self) -> None: """Execute a forward pass with dummy inputs to profile the memory usage of the model.""" @@ -1091,8 +997,10 @@ class IluvatarModelRunner(ModelRunnerBase): # 1. Profile with multimodal encoder & encoder cache # 2. Dummy run - self._dummy_run(num_tokens=self.parallel_config.max_num_batched_tokens, - batch_size=min(self.parallel_config.max_num_seqs, 3)) + self._dummy_run( + num_tokens=self.parallel_config.max_num_batched_tokens, + batch_size=min(self.parallel_config.max_num_seqs, 3), + ) # 3. gc self.clear_cache() @@ -1108,23 +1016,24 @@ class IluvatarModelRunner(ModelRunnerBase): self.num_gpu_blocks = num_gpu_blocks # Reset block table and kv cache with global block num - if not (self.parallel_config.enable_prefix_caching \ - or self.parallel_config.splitwise_role != "mixed"): + if not (self.parallel_config.enable_prefix_caching or self.parallel_config.splitwise_role != "mixed"): self.initialize_kv_cache() # Reset free list free_list = list( range( self.num_gpu_blocks - 1, - int(self.num_gpu_blocks * self.parallel_config.kv_cache_ratio) - - 1, -1)) + int(self.num_gpu_blocks * self.parallel_config.kv_cache_ratio) - 1, + -1, + ) + ) self.free_list_len = len(free_list) - self.share_inputs.update({ - "free_list": - paddle.to_tensor(free_list, dtype="int32"), - "free_list_len": - paddle.full([1], self.free_list_len, dtype="int32"), - }) + self.share_inputs.update( + { + "free_list": paddle.to_tensor(free_list, dtype="int32"), + "free_list_len": paddle.full([1], self.free_list_len, dtype="int32"), + } + ) self.parallel_config.do_profile = False @@ -1140,9 +1049,11 @@ class IluvatarModelRunner(ModelRunnerBase): - cache_int4: """ cache_quant_dtype = None - if (self.quant_config - and hasattr(self.quant_config, "kv_cache_quant_type") - and self.quant_config.kv_cache_quant_type is not None): + if ( + self.quant_config + and hasattr(self.quant_config, "kv_cache_quant_type") + and self.quant_config.kv_cache_quant_type is not None + ): cache_quant_dtype = self.quant_config.kv_cache_quant_type if cache_quant_dtype is not None: # int8, int8_zp, fp8, fp8_zp @@ -1152,14 +1063,12 @@ class IluvatarModelRunner(ModelRunnerBase): hidden_dim = self.model_config.head_dim * self.model_config.kv_num_heads # NOTE(liuzichang): Implement multi-layer MTP architecture in the future - num_layers = self.model_config.num_hidden_layers + \ - self.speculative_config.num_gpu_block_expand_ratio if \ - self.speculative_method in [ - "mtp" - ] else self.model_config.num_hidden_layers - required_memory = ( - byte_of_dtype * 2 * # k + v - (self.parallel_config.block_size * hidden_dim) * num_layers) + num_layers = ( + self.model_config.num_hidden_layers + self.speculative_config.num_gpu_block_expand_ratio + if self.speculative_method in ["mtp"] + else self.model_config.num_hidden_layers + ) + required_memory = byte_of_dtype * 2 * (self.parallel_config.block_size * hidden_dim) * num_layers # k + v return required_memory def not_need_stop(self) -> bool: diff --git a/fastdeploy/worker/iluvatar_worker.py b/fastdeploy/worker/iluvatar_worker.py index 590e7e662..f855466ff 100644 --- 
a/fastdeploy/worker/iluvatar_worker.py +++ b/fastdeploy/worker/iluvatar_worker.py @@ -13,12 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + import gc import os from typing import List, Optional import paddle -import paddle.nn as nn +from paddle import nn from fastdeploy.config import FDConfig from fastdeploy.engine.request import Request @@ -47,8 +48,7 @@ class IluvatarWorker(WorkerBase): pass def init_device(self): - """ Initialize device and Construct model runner - """ + """Initialize device and Construct model runner""" if paddle.is_compiled_with_custom_device("iluvatar_gpu"): # Set evironment variable self.device = f"iluvatar_gpu:{self.local_rank}" @@ -58,8 +58,7 @@ class IluvatarWorker(WorkerBase): gc.collect() else: - raise RuntimeError( - f"Not support device type: {self.device_config.device}") + raise RuntimeError(f"Not support device type: {self.device_config.device}") # Construct model runner self.model_runner: IluvatarModelRunner = IluvatarModelRunner( @@ -67,7 +66,8 @@ class IluvatarWorker(WorkerBase): device=self.device, device_id=self.device_ids[self.local_rank], rank=self.rank, - local_rank=self.local_rank) + local_rank=self.local_rank, + ) def prefill_finished(self): """ @@ -99,8 +99,7 @@ class IluvatarWorker(WorkerBase): """ """ return self.model_runner.get_model() - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: """ """ pass @@ -113,7 +112,7 @@ class IluvatarWorker(WorkerBase): return output def preprocess_new_task(self, req_dicts: List[Request]) -> None: - """ Process new requests and then start the decode loop + """Process new requests and then start the decode loop TODO(gongshaotian):The scheduler should schedule the handling of prefill, and workers and modelrunners should not perceive it. """ @@ -139,5 +138,4 @@ class IluvatarWorker(WorkerBase): def reinitialize_kv_cache(self, num_gpu_blocks: int) -> None: """ """ - self.model_runner.update_share_input_block_num( - num_gpu_blocks=num_gpu_blocks) + self.model_runner.update_share_input_block_num(num_gpu_blocks=num_gpu_blocks) diff --git a/fastdeploy/worker/model_runner_base.py b/fastdeploy/worker/model_runner_base.py index ebbc552da..8bd008650 100644 --- a/fastdeploy/worker/model_runner_base.py +++ b/fastdeploy/worker/model_runner_base.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + from abc import ABC, abstractmethod from paddle import nn @@ -26,14 +27,14 @@ logger = get_logger("model_runner_base", "model_runner_base.log") class ModelRunnerBase(ABC): """ - Engine -> (WIP)Executor -> Worker -> ModelRunner -> Model - ModelRunner interface abstracts the model execution logic that - contain input preparation, token generation, and tokenprocessing. + Engine -> (WIP)Executor -> Worker -> ModelRunner -> Model + ModelRunner interface abstracts the model execution logic that + contain input preparation, token generation, and tokenprocessing. 
""" def __init__(self, fd_config: FDConfig, device: str) -> None: """ - Initialize FDConfig + Initialize FDConfig """ self.fd_config = fd_config self.model_config = fd_config.model_config @@ -50,27 +51,29 @@ class ModelRunnerBase(ABC): @abstractmethod def load_model(self) -> None: """ - Load model from local path or remote(will download) path + Load model from local path or remote(will download) path """ raise NotImplementedError @abstractmethod def get_model(self) -> nn.Layer: """ - Get current model + Get current model """ raise NotImplementedError @abstractmethod - def execute_model(self, ) -> ModelRunnerOutput: + def execute_model( + self, + ) -> ModelRunnerOutput: """ - Execute model with and get output + Execute model with and get output """ raise NotImplementedError @abstractmethod def profile_run(self) -> None: """ - Execute a forward pass with dummy inputs to profile the memory usage of the model." + Execute a forward pass with dummy inputs to profile the memory usage of the model." """ raise NotImplementedError diff --git a/fastdeploy/worker/output.py b/fastdeploy/worker/output.py index 8d8dd2ad7..73ecbcbde 100644 --- a/fastdeploy/worker/output.py +++ b/fastdeploy/worker/output.py @@ -21,8 +21,7 @@ import paddle class LogprobsLists(NamedTuple): - """ - """ + """ """ # [num_reqs, max_num_logprobs + 1] logprob_token_ids: list[list[int]] @@ -41,8 +40,7 @@ class LogprobsLists(NamedTuple): class LogprobsTensors(NamedTuple): - """ - """ + """ """ # [num_reqs, max_num_logprobs + 1] logprob_token_ids: paddle.Tensor @@ -60,16 +58,12 @@ class LogprobsTensors(NamedTuple): ) @staticmethod - def empty_cpu(num_positions: int, - num_tokens_per_position: int) -> "LogprobsTensors": + def empty_cpu(num_positions: int, num_tokens_per_position: int) -> "LogprobsTensors": """Create empty LogprobsTensors on CPU.""" - logprob_token_ids = paddle.empty( - [num_positions, num_tokens_per_position], - dtype=paddle.int64).cpu() + logprob_token_ids = paddle.empty([num_positions, num_tokens_per_position], dtype=paddle.int64).cpu() logprobs = paddle.empty_like(logprob_token_ids, dtype=paddle.float32) - selected_token_ranks = paddle.empty([num_positions], - dtype=paddle.int64).cpu() + selected_token_ranks = paddle.empty([num_positions], dtype=paddle.int64).cpu() return LogprobsTensors( logprob_token_ids=logprob_token_ids, logprobs=logprobs, @@ -79,8 +73,7 @@ class LogprobsTensors(NamedTuple): @dataclass class SamplerOutput: - """ - """ + """ """ # [num_reqs, max_num_generated_tokens] # Different requests can have different number of generated tokens. @@ -89,10 +82,11 @@ class SamplerOutput: sampled_token_ids: paddle.Tensor logprobs_tensors: Optional[LogprobsTensors] + @dataclass class ModelOutputData: """ - OutputData by execute_model + OutputData by execute_model """ """ @@ -222,11 +216,10 @@ class ModelOutputData: reasoning_index: paddle.Tensor = None - @dataclass class ModelRunnerOutput: """ - [WIP] ModelRunnerOutput is serialized and sent to the scheduler process. + [WIP] ModelRunnerOutput is serialized and sent to the scheduler process. """ """ diff --git a/fastdeploy/worker/utils.py b/fastdeploy/worker/utils.py index 626c33c9e..bf727c3bb 100644 --- a/fastdeploy/worker/utils.py +++ b/fastdeploy/worker/utils.py @@ -13,13 +13,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" + import os def check_safetensors_model(model_dir: str): """ - model_dir : the directory of the model - Check whther the model is safetensors format + model_dir : the directory of the model + Check whther the model is safetensors format """ model_files = list() all_files = os.listdir(model_dir) @@ -35,8 +36,7 @@ def check_safetensors_model(model_dir: str): return True try: # check all the file exists - safetensors_num = int( - model_files[0].strip(".safetensors").split("-")[-1]) + safetensors_num = int(model_files[0].strip(".safetensors").split("-")[-1]) flags = [0] * safetensors_num for x in model_files: current_index = int(x.strip(".safetensors").split("-")[1]) diff --git a/fastdeploy/worker/worker_base.py b/fastdeploy/worker/worker_base.py index 9d9e1bf00..0d604c2e0 100644 --- a/fastdeploy/worker/worker_base.py +++ b/fastdeploy/worker/worker_base.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + from abc import ABC, abstractmethod from typing import Optional @@ -25,8 +26,8 @@ from fastdeploy.worker.output import ModelRunnerOutput class WorkerBase(ABC): """ - Engine -> (WIP)Executor -> Worker -> ModelRunner -> Model - Worker interface that allows inference framwork to cleanly separate implementations for different harware. + Engine -> (WIP)Executor -> Worker -> ModelRunner -> Model + Worker interface that allows inference framwork to cleanly separate implementations for different harware. """ def __init__( @@ -59,18 +60,17 @@ class WorkerBase(ABC): @abstractmethod def init_device(self) -> None: - """ Initialize the device state.""" + """Initialize the device state.""" raise NotImplementedError @abstractmethod - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: """Initizlize the KV Cache with the given size in blocks.""" raise NotImplementedError @abstractmethod def get_model(self) -> nn.Layer: - """ Get the model loaded by worker.""" + """Get the model loaded by worker.""" raise NotImplementedError @abstractmethod diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index 7ed29a71c..72ea36c24 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" + import argparse import json import time @@ -21,17 +22,23 @@ from typing import List import numpy as np import paddle import paddle.distributed as dist -import paddle.distributed.fleet as fleet +from paddle.distributed import fleet -from fastdeploy.config import (DecodingConfig, DeviceConfig, FDConfig, - GraphOptimizationConfig, LoadConfig, - ModelConfig, ParallelConfig, SpeculativeConfig, - ErnieArchitectures) +from fastdeploy.config import ( + DecodingConfig, + DeviceConfig, + ErnieArchitectures, + FDConfig, + GraphOptimizationConfig, + LoadConfig, + ModelConfig, + ParallelConfig, + SpeculativeConfig, +) from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue from fastdeploy.inter_communicator import IPCSignal -from fastdeploy.model_executor.layers.quantization import \ - get_quantization_config +from fastdeploy.model_executor.layers.quantization import get_quantization_config from fastdeploy.platforms import current_platform from fastdeploy.utils import get_logger, none_or_str from fastdeploy.worker.worker_base import WorkerBase @@ -47,24 +54,28 @@ def get_worker(fd_config: FDConfig, local_rank: int, rank: int) -> WorkerBase: raise NotImplementedError("Only CUDA platform supports logprob.") if current_platform.is_dcu(): from fastdeploy.worker.dcu_worker import DcuWorker + return DcuWorker(fd_config=fd_config, local_rank=local_rank, rank=rank) if current_platform.is_cuda(): from fastdeploy.worker.gpu_worker import GpuWorker + return GpuWorker(fd_config=fd_config, local_rank=local_rank, rank=rank) if current_platform.is_xpu(): from fastdeploy.worker.xpu_worker import XpuWorker + return XpuWorker(fd_config=fd_config, local_rank=local_rank, rank=rank) if current_platform.is_iluvatar(): from fastdeploy.worker.iluvatar_worker import IluvatarWorker - return IluvatarWorker(fd_config=fd_config, - local_rank=local_rank, - rank=rank) + + return IluvatarWorker(fd_config=fd_config, local_rank=local_rank, rank=rank) if current_platform.is_gcu(): from fastdeploy.worker.gcu_worker import GcuWorker + return GcuWorker(fd_config=fd_config, local_rank=local_rank, rank=rank) + def init_distributed_environment(seed: int = 20) -> List[int]: - """ Initialize Paddle Fleet and get rank of worker """ + """Initialize Paddle Fleet and get rank of worker""" # Global rank ranks = dist.get_world_size() dist_strategy = fleet.DistributedStrategy() @@ -85,6 +96,7 @@ def init_distributed_environment(seed: int = 20) -> List[int]: return ranks, local_rank + def update_fd_config_for_mm(fd_config: FDConfig) -> None: if fd_config.model_config.enable_mm: tokenizer = ErnieBotTokenizer.from_pretrained( @@ -103,13 +115,12 @@ def update_fd_config_for_mm(fd_config: FDConfig) -> None: vision_config.dtype = fd_config.model_config.dtype # vision_config.tensor_parallel_degree = fd_config.parallel_config.tensor_parallel_size # vision_config.tensor_parallel_rank = fd_config.parallel_config.tensor_parallel_rank - fd_config.model_config.im_patch_id = tokenizer.get_vocab()[ - "<|IMAGE_PLACEHOLDER|>" - ] + fd_config.model_config.im_patch_id = tokenizer.get_vocab()["<|IMAGE_PLACEHOLDER|>"] fd_config.model_config.think_end_id = tokenizer.get_vocab()[""] fd_config.model_config.sequence_parallel = fd_config.parallel_config.sequence_parallel -class PaddleDisWorkerProc(): + +class PaddleDisWorkerProc: """ Paddle Distrubuted wrapper for fastdeploy.worker.Worker, for handling single-node multi-GPU tensor parallel. 
@@ -117,12 +128,7 @@ class PaddleDisWorkerProc(): in the task queue. Control flow is transmitted by IPC. """ - def __init__( - self, - fd_config: FDConfig, - ranks: int = 1, - local_rank: int = 0 - ) -> None: + def __init__(self, fd_config: FDConfig, ranks: int = 1, local_rank: int = 0) -> None: """ Initialize a distributed worker and task queue for single-node multi-GPU setup. Args: @@ -136,20 +142,21 @@ class PaddleDisWorkerProc(): self.parallel_config = fd_config.parallel_config # TODO(gongshaotian): Use worker factory to get worker - self.worker = get_worker(fd_config=fd_config, - local_rank=self.local_rank, - rank=self.ranks) + self.worker = get_worker(fd_config=fd_config, local_rank=self.local_rank, rank=self.ranks) # Initialize task queue - task_address = (self.parallel_config.pod_ip, - self.parallel_config.engine_worker_queue_port) + task_address = ( + self.parallel_config.pod_ip, + self.parallel_config.engine_worker_queue_port, + ) self.task_queue = TaskQueue( address=task_address, is_server=False, num_client=self.parallel_config.tensor_parallel_size, client_id=self.parallel_config.tensor_parallel_rank, - local_data_parallel_id=self.parallel_config.expert_parallel_rank) + local_data_parallel_id=self.parallel_config.expert_parallel_rank, + ) def init_health_status(self) -> None: """ @@ -164,17 +171,18 @@ class PaddleDisWorkerProc(): # init worker_ready_signal self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8 array_size = min( - self.max_chips_per_node, self.parallel_config.tensor_parallel_size * - self.parallel_config.expert_parallel_size) + self.max_chips_per_node, + self.parallel_config.tensor_parallel_size * self.parallel_config.expert_parallel_size, + ) workers_ready = np.zeros(shape=[array_size], dtype=np.int32) self.worker_ready_signal = IPCSignal( name="worker_ready_signal", array=workers_ready, dtype=np.int32, suffix=self.parallel_config.engine_pid, - create=False) - self.worker_ready_signal.value[self.local_rank % - self.max_chips_per_node] = 1 + create=False, + ) + self.worker_ready_signal.value[self.local_rank % self.max_chips_per_node] = 1 # init worker_healthy_live_signal workers_alive = np.zeros(shape=[array_size], dtype=np.int32) @@ -183,9 +191,9 @@ class PaddleDisWorkerProc(): array=workers_alive, dtype=np.int32, suffix=self.parallel_config.engine_pid, - create=False) - self.worker_healthy_live_signal.value[self.local_rank % 8] = int( - time.time()) + create=False, + ) + self.worker_healthy_live_signal.value[self.local_rank % 8] = int(time.time()) # init model_weights_status workers_model_weights = np.zeros(shape=[1], dtype=np.int32) @@ -194,28 +202,28 @@ class PaddleDisWorkerProc(): array=workers_model_weights, dtype=np.int32, suffix=self.parallel_config.engine_pid, - create=False) + create=False, + ) # init exist_task_signal - workers_exist_task = np.zeros( - [self.parallel_config.expert_parallel_size], dtype=np.int32) + workers_exist_task = np.zeros([self.parallel_config.expert_parallel_size], dtype=np.int32) self.exist_task_signal = IPCSignal( name="exist_task_signal", array=workers_exist_task, dtype=np.int32, suffix=self.parallel_config.engine_pid, - create=False) + create=False, + ) # init exist_swapped_task_signal - workers_swapped_task = np.zeros( - shape=[self.parallel_config.expert_parallel_size], - dtype=np.int32) + workers_swapped_task = np.zeros(shape=[self.parallel_config.expert_parallel_size], dtype=np.int32) self.exist_swapped_task_signal = IPCSignal( name="exist_swapped_task_signal", array=workers_swapped_task, dtype=np.int32, 
suffix=self.parallel_config.engine_pid, - create=False) + create=False, + ) # init exist_prefill_task_signal exist_prefill_task_signal_data = np.zeros([1], dtype=np.int32) @@ -224,26 +232,27 @@ class PaddleDisWorkerProc(): array=exist_prefill_task_signal_data, dtype=np.int32, suffix=self.parallel_config.engine_pid, - create=False) + create=False, + ) def event_loop_ep(self) -> None: """ Tmp loop function for ep utill DP is supported """ while True: - self.worker_healthy_live_signal.value[self.local_rank % self.max_chips_per_node] = int( - time.time()) + self.worker_healthy_live_signal.value[self.local_rank % self.max_chips_per_node] = int(time.time()) - if self.fd_config.parallel_config.tensor_parallel_rank == 0 and self.task_queue.num_tasks( - ) > 0: + if self.fd_config.parallel_config.tensor_parallel_rank == 0 and self.task_queue.num_tasks() > 0: tasks, read_finish = self.task_queue.get_tasks() req_dicts = [] for req_dict, bsz in tasks: num_running_requests = int(bsz) req_dicts.extend(req_dict) - logger.info(f"Rank: {self.local_rank}, num_running_requests: {num_running_requests}, " \ - f"num_insert_requests: {len(req_dicts)}") + logger.info( + f"Rank: {self.local_rank}, num_running_requests: {num_running_requests}, " + f"num_insert_requests: {len(req_dicts)}" + ) # Process prefill inputs self.worker.preprocess_new_task(req_dicts) @@ -252,7 +261,7 @@ class PaddleDisWorkerProc(): self.worker.execute_model() def event_loop_normal(self) -> None: - """ Main event loop for Paddle Distrubuted Workers. + """Main event loop for Paddle Distrubuted Workers. TODO(gongshaotian): support remote calling of functions that control worker. """ # Currently, only support single node @@ -271,18 +280,15 @@ class PaddleDisWorkerProc(): paddle.distributed.barrier() self.insert_step = False - self.worker_healthy_live_signal.value[self.local_rank] = int( - time.time()) + self.worker_healthy_live_signal.value[self.local_rank] = int(time.time()) # The first worker detects whether there are tasks in the task queue - if self.local_rank % mp_num_per_node == 0: + if self.local_rank % mp_num_per_node == 0: if self.task_queue.num_tasks() > 0: if self.nnode > 1: self.task_queue.read_finish_flag.set(1) else: - self.exist_task_signal.value[ - self.fd_config.parallel_config. 
- expert_parallel_rank] = 1 + self.exist_task_signal.value[self.fd_config.parallel_config.expert_parallel_rank] = 1 if self.parallel_config.tensor_parallel_size > 1: # Synchronize the signal for other workers @@ -291,23 +297,27 @@ class PaddleDisWorkerProc(): if self.fd_config.load_config.dynamic_load_weight: if self.exist_task_signal.value[0] == 2: - from fastdeploy.rl.dynamic_weight_manager import \ - DynamicWeightManager - DynamicWeightManager.check_model_weights_status( - self.model_weights_status, self.worker.model_runner, - self.parallel_config.engine_pid) + from fastdeploy.rl.dynamic_weight_manager import ( + DynamicWeightManager, + ) - if self.exist_task_signal.value[ - self.fd_config.parallel_config.expert_parallel_rank] == 1 or \ - self.task_queue.read_finish_flag.get() == 1: + DynamicWeightManager.check_model_weights_status( + self.model_weights_status, + self.worker.model_runner, + self.parallel_config.engine_pid, + ) + + if ( + self.exist_task_signal.value[self.fd_config.parallel_config.expert_parallel_rank] == 1 + or self.task_queue.read_finish_flag.get() == 1 + ): logger.info(f"Rank: {self.local_rank} Detected new requests.") self.insert_step = True tasks, read_finish = self.task_queue.get_tasks() if read_finish: # Ensure that every worker get the task - self.exist_task_signal.value[self.fd_config.parallel_config - .expert_parallel_rank] = 0 + self.exist_task_signal.value[self.fd_config.parallel_config.expert_parallel_rank] = 0 self.task_queue.read_finish_flag.set(0) req_dicts = [] @@ -316,8 +326,10 @@ class PaddleDisWorkerProc(): req_dicts.extend(req_dict) req_ids = [req.request_id for req in req_dicts] - logger.info(f"Rank: {self.local_rank}, num_running_requests: {num_running_requests}, " \ - f"num_insert_requests: {len(req_dicts)}, req_ids: {req_ids}") + logger.info( + f"Rank: {self.local_rank}, num_running_requests: {num_running_requests}, " + f"num_insert_requests: {len(req_dicts)}, req_ids: {req_ids}" + ) # Process prefill inputs self.worker.preprocess_new_task(req_dicts) @@ -333,9 +345,7 @@ class PaddleDisWorkerProc(): # These generated tokens can be obtained through get_output op. self.worker.execute_model(req_dicts) - self.exist_prefill_task_signal.value[ - 0] = self.worker.prefill_finished() - + self.exist_prefill_task_signal.value[0] = self.worker.prefill_finished() def determine_num_available_blocks(self) -> None: """Profiles the peak memory usage of the model to determine how many @@ -351,64 +361,51 @@ class PaddleDisWorkerProc(): """ if self.fd_config.parallel_config.do_profile: # 1. Get available memory(bytes) - available_kv_cache_memory = self.worker.determine_available_memory( - ) - logger.info( - f"------- available_kv_cache_memory:{available_kv_cache_memory / 1024**3} GB --------" - ) + available_kv_cache_memory = self.worker.determine_available_memory() + logger.info(f"------- available_kv_cache_memory:{available_kv_cache_memory / 1024**3} GB --------") # 2. Calculate the appropriate number of blocks model_block_memory_used = self.worker.cal_theortical_kvcache() - num_blocks_local = int(available_kv_cache_memory // - model_block_memory_used) + num_blocks_local = int(available_kv_cache_memory // model_block_memory_used) # NOTE(liuzichang): Too many block will lead to illegal memory access # We will develop dynamic limits in future. 
if num_blocks_local > 40000: - logger.info( - f"------- Reset num_blocks_local {num_blocks_local} to 40000" - ) + logger.info(f"------- Reset num_blocks_local {num_blocks_local} to 40000") num_blocks_local = min(40000, num_blocks_local) - logger.info( - f"------- model_block_memory_used:{model_block_memory_used} --------" - ) - logger.info( - f"------- num_blocks_local:{num_blocks_local} --------") + logger.info(f"------- model_block_memory_used:{model_block_memory_used} --------") + logger.info(f"------- num_blocks_local:{num_blocks_local} --------") - logger.info( - f"self.fd_config.parallel_config.do_profile:{self.fd_config.parallel_config.do_profile}" - ) + logger.info(f"self.fd_config.parallel_config.do_profile:{self.fd_config.parallel_config.do_profile}") # 3. Send IPCSignal - get_profile_block_num = np.zeros(shape=[self.ranks], - dtype=np.int32) + get_profile_block_num = np.zeros(shape=[self.ranks], dtype=np.int32) self.get_profile_block_num_signal = IPCSignal( name="get_profile_block_num", array=get_profile_block_num, dtype=np.int32, suffix=self.parallel_config.engine_pid, - create=False) - self.get_profile_block_num_signal.value[ - self.local_rank] = num_blocks_local + create=False, + ) + self.get_profile_block_num_signal.value[self.local_rank] = num_blocks_local # Wait all worker send the signal while np.any(self.get_profile_block_num_signal.value <= 0): time.sleep(0.01) - num_blocks_global = self.get_profile_block_num_signal.value.min( - ).item() + num_blocks_global = self.get_profile_block_num_signal.value.min().item() if num_blocks_global < 0: logger.error( "The total number of blocks cannot be less than zero." "Please increase gpu_memory_utilization" - "Or decrease max_num_batched_tokens(max model length) ") + "Or decrease max_num_batched_tokens(max model length) " + ) raise ValueError( "The total number of blocks cannot be less than zero." 
"Please increase gpu_memory_utilization" - "Or decrease max_num_batched_tokens(max model length) ") + "Or decrease max_num_batched_tokens(max model length) " + ) - - self.get_profile_block_num_signal.value[ - self.local_rank] = num_blocks_global + self.get_profile_block_num_signal.value[self.local_rank] = num_blocks_global else: num_blocks_global = self.fd_config.parallel_config.total_block_num # NOTE(liuzichang): Too big num_blocks_global will lead to error 700 @@ -416,11 +413,11 @@ class PaddleDisWorkerProc(): self.worker.reinitialize_kv_cache(num_gpu_blocks=num_blocks_global) def init_device(self) -> None: - """ Initialize device and Construct model runner """ + """Initialize device and Construct model runner""" self.worker.init_device() def load_model(self) -> None: - """ Load weights and create model """ + """Load weights and create model""" self.worker.load_model() @@ -429,66 +426,44 @@ def parse_args(): Parse args from command line """ parser = argparse.ArgumentParser("FastDeploy LLM Inference") - parser.add_argument("-m", - "--model_name_or_path", - type=str, - default="./output", - help="model dir") - parser.add_argument("-mbs", - "--max_num_seqs", - type=int, - default=34, - help="max batch size") + parser.add_argument( + "-m", + "--model_name_or_path", + type=str, + default="./output", + help="model dir", + ) + parser.add_argument("-mbs", "--max_num_seqs", type=int, default=34, help="max batch size") parser.add_argument("--total_block_num", type=int, default=2000) parser.add_argument("--block_size", type=int, default=64) parser.add_argument("--pod_ip", type=str, default="127.0.0.1") parser.add_argument("--engine_worker_queue_port", type=int, default=9923) - parser.add_argument("--max_model_len", - type=int, - default=3072, - help="max model len") - parser.add_argument("--device_ids", - type=str, - default="0", - help="cuda visible devices") - parser.add_argument("--dtype", - type=str, - default="bfloat16", - help="input dtype") - parser.add_argument("--enc_dec_block_num", - type=int, - default=1, - help="encoder's decoder num") - parser.add_argument("--kv_cache_ratio", - type=float, - default=0.7, - help="kv cache ratio for input") - parser.add_argument("--first_token_id", - type=int, - default=1, - help="first token id") - parser.add_argument("--gpu_memory_utilization", - type=float, - default=0.9, - help="gpu memory utilization") - parser.add_argument("--engine_pid", - type=int, - default=None, - help="Process ID of engine") - parser.add_argument("--do_profile", - action='store_true', - help="do profile or not") - parser.add_argument("--pad_token_id", - type=int, - default=-1, - help="pad token id") - parser.add_argument("--eos_tokens_lens", - type=int, - default=2, - help="eos token lens") - parser.add_argument("--enable_chunked_prefill", - action='store_true', - help="enable chunked prefill") + parser.add_argument("--max_model_len", type=int, default=3072, help="max model len") + parser.add_argument("--device_ids", type=str, default="0", help="cuda visible devices") + parser.add_argument("--dtype", type=str, default="bfloat16", help="input dtype") + parser.add_argument("--enc_dec_block_num", type=int, default=1, help="encoder's decoder num") + parser.add_argument( + "--kv_cache_ratio", + type=float, + default=0.7, + help="kv cache ratio for input", + ) + parser.add_argument("--first_token_id", type=int, default=1, help="first token id") + parser.add_argument( + "--gpu_memory_utilization", + type=float, + default=0.9, + help="gpu memory utilization", + ) + 
parser.add_argument("--engine_pid", type=int, default=None, help="Process ID of engine") + parser.add_argument("--do_profile", action="store_true", help="do profile or not") + parser.add_argument("--pad_token_id", type=int, default=-1, help="pad token id") + parser.add_argument("--eos_tokens_lens", type=int, default=2, help="eos token lens") + parser.add_argument( + "--enable_chunked_prefill", + action="store_true", + help="enable chunked prefill", + ) parser.add_argument( "--speculative_method", default=None, @@ -519,71 +494,90 @@ def parse_args(): default=False, type=bool, ) - parser.add_argument("--max_num_batched_tokens", - type=int, - default=2048, - help="max num batched tokens") + parser.add_argument( + "--max_num_batched_tokens", + type=int, + default=2048, + help="max num batched tokens", + ) - parser.add_argument("--enable_prefix_caching", - action='store_true', - help="enable prefix cache") - parser.add_argument("--enable-custom-all-reduce", - action='store_true', - help="enable custom all-reduce") - parser.add_argument("--splitwise_role", - type=str, - default="mixed", - help="splitwise role") - parser.add_argument("--tensor_parallel_size", - type=int, - default=1, - help="tensor parallel size") - parser.add_argument("--expert_parallel_size", - type=int, - default=1, - help="expert parallel size") - parser.add_argument("--enable_expert_parallel", - action='store_true', - help="enable expert parallel") + parser.add_argument( + "--enable_prefix_caching", + action="store_true", + help="enable prefix cache", + ) + parser.add_argument( + "--enable-custom-all-reduce", + action="store_true", + help="enable custom all-reduce", + ) + parser.add_argument("--splitwise_role", type=str, default="mixed", help="splitwise role") + parser.add_argument( + "--tensor_parallel_size", + type=int, + default=1, + help="tensor parallel size", + ) + parser.add_argument( + "--expert_parallel_size", + type=int, + default=1, + help="expert parallel size", + ) + parser.add_argument( + "--enable_expert_parallel", + action="store_true", + help="enable expert parallel", + ) parser.add_argument("--ori_vocab_size", type=int, default=None) - parser.add_argument("--quantization", - type=str, - default="None", - help="Quantization name for the model, currentlly support " \ - "'wint4', 'wint8'," \ - "default is None. The priority of this configuration "\ - "is lower than that of the config file. " \ - "More complex quantization methods need to be configured via the config file.") - parser.add_argument("--graph_optimization_config", - type=json.loads, - default=None, - help=" Configation of Graph optimization backend. " + parser.add_argument( + "--quantization", + type=str, + default="None", + help="Quantization name for the model, currentlly support " + "'wint4', 'wint8'," + "default is None. The priority of this configuration " + "is lower than that of the config file. " + "More complex quantization methods need to be configured via the config file.", + ) + parser.add_argument( + "--graph_optimization_config", + type=json.loads, + default=None, + help=" Configation of Graph optimization backend. 
", + ) + parser.add_argument( + "--guided_decoding_backend", + type=str, + default="off", + help="guided decoding backend", + ) + parser.add_argument( + "--disable_any_whitespace", + action="store_false", + help="Disable any whitespace for guided decoding.", + ) + parser.add_argument( + "--dynamic_load_weight", + action="store_true", + help="Enable dynamic weight loading strategy", ) - parser.add_argument("--guided_decoding_backend", - type=str, - default="off", - help="guided decoding backend") - parser.add_argument("--disable_any_whitespace", - action='store_false', - help="Disable any whitespace for guided decoding.") - parser.add_argument("--dynamic_load_weight", - action='store_true', - help="Enable dynamic weight loading strategy") parser.add_argument( "--load_strategy", type=str, - choices=['ipc', 'ipc_snapshot'], + choices=["ipc", "ipc_snapshot"], default="ipc_snapshot", help="Weight loading method when dynamic loading is enabled: " "'ipc': real-time IPC streaming with automatic resharding, " - "'ipc_snapshot': load from disk snapshot of IPC weights.") - parser.add_argument("--enable_mm", - action='store_true', - help="Whether to enable vl model") - parser.add_argument("--enable_logprob", - action='store_true', - help="Enable output of token-level log probabilities.") + "'ipc_snapshot': load from disk snapshot of IPC weights.", + ) + parser.add_argument("--enable_mm", action="store_true", help="Whether to enable vl model") + parser.add_argument( + "--enable_logprob", + action="store_true", + help="Enable output of token-level log probabilities.", + ) args = parser.parse_args() return args @@ -615,7 +609,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig: else: num_experts = model_config.moe_num_experts - num_experts_per_rank = num_experts // args.expert_parallel_size + num_experts_per_rank = num_experts // args.expert_parallel_size num_experts_start_offset = expert_parallel_rank * num_experts_per_rank parallel_config.expert_parallel_rank = expert_parallel_rank @@ -629,7 +623,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig: graph_opt_config = GraphOptimizationConfig( use_cudagraph=args.graph_optimization_config["use_cudagraph"], graph_opt_level=args.graph_optimization_config["graph_opt_level"], - cudagraph_capture_sizes=args.graph_optimization_config["cudagraph_capture_sizes"] + cudagraph_capture_sizes=args.graph_optimization_config["cudagraph_capture_sizes"], ) # Note(tangbinhan): used for load_checkpoint @@ -639,14 +633,10 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig: model_config.pretrained_config.head_dim = model_config.head_dim logger.info(f"parallel_config.use_ep {parallel_config.use_ep}") - logger.info( - f"parallel_config.tensor_parallel_size {parallel_config.tensor_parallel_size}" - ) - logger.info( - f"parallel_config.tensor_parallel_rank {parallel_config.tensor_parallel_rank}" - ) + logger.info(f"parallel_config.tensor_parallel_size {parallel_config.tensor_parallel_size}") + logger.info(f"parallel_config.tensor_parallel_rank {parallel_config.tensor_parallel_rank}") - if getattr(model_config, 'num_hidden_layers', None) is None: + if getattr(model_config, "num_hidden_layers", None) is None: raise ValueError("num_hidden_layers is None") quantization_config = model_config.quantization_config @@ -656,11 +646,8 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig: model_config.is_quantized = True quant_config_name = None - if 
quantization_config is not None and quantization_config.get( - "quantization", None) is None: - raise ValueError( - "quantization_config should have a key named 'quantization' for specify quant config." - ) + if quantization_config is not None and quantization_config.get("quantization", None) is None: + raise ValueError("quantization_config should have a key named 'quantization' for specify quant config.") if quantization_config is not None: quant_config_name = quantization_config["quantization"] @@ -688,32 +675,29 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig: logger.info("===========quantization_config==============") if quant_config is not None: if model_config.is_quantized: - logger.info( - "Model Status: Offline Quantized (pre-quantized weights loaded)" - ) + logger.info("Model Status: Offline Quantized (pre-quantized weights loaded)") else: - logger.info( - "Model Status: Original (will apply online quantization)") + logger.info("Model Status: Original (will apply online quantization)") logger.info(f"{quantization_config}") else: - logger.info( - "No quantization config found and use original weight and act dtype." - ) + logger.info("No quantization config found and use original weight and act dtype.") # Set VL tag model_config.enable_mm = args.enable_mm logger.info(f"- Dynamic load weight: {load_config.dynamic_load_weight}") logger.info(f"- Load strategy: {load_config.load_strategy}") - fd_config = FDConfig(model_config=model_config, - parallel_config=parallel_config, - speculative_config=speculative_config, - device_config=device_config, - load_config=load_config, - decoding_config=decoding_config, - quant_config=quant_config, - graph_opt_config=graph_opt_config) + fd_config = FDConfig( + model_config=model_config, + parallel_config=parallel_config, + speculative_config=speculative_config, + device_config=device_config, + load_config=load_config, + decoding_config=decoding_config, + quant_config=quant_config, + graph_opt_config=graph_opt_config, + ) update_fd_config_for_mm(fd_config) return fd_config diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py index 50b1a3b9d..a7e30d6fe 100644 --- a/fastdeploy/worker/xpu_model_runner.py +++ b/fastdeploy/worker/xpu_model_runner.py @@ -13,20 +13,22 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" + import random import time from typing import Dict, List, Optional import numpy as np import paddle -import paddle.nn as nn +from paddle import nn from fastdeploy.config import FDConfig from fastdeploy.engine.request import Request from fastdeploy.model_executor.forward_meta import ForwardMeta, XPUForwardMeta from fastdeploy.model_executor.layers.attention import get_attention_backend -from fastdeploy.model_executor.layers.attention.base_attention_backend import \ - AttentionBackend +from fastdeploy.model_executor.layers.attention.base_attention_backend import ( + AttentionBackend, +) from fastdeploy.model_executor.layers.rotary_embedding import get_rope from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata from fastdeploy.model_executor.layers.sample.sampler import Sampler @@ -39,30 +41,31 @@ logger = get_logger("xpu_model_runner", "xpu_model_runner.log") def xpu_pre_process( - max_len: int, - input_ids: paddle.Tensor, - seq_lens_this_time: int, - share_inputs: Dict, - use_speculate_method: bool, - draft_tokens: Optional[paddle.Tensor] = None, - seq_lens_encoder: Optional[paddle.Tensor] = None, - seq_lens_decoder: Optional[paddle.Tensor] = None) -> XPUForwardMeta: - """ - - """ + max_len: int, + input_ids: paddle.Tensor, + seq_lens_this_time: int, + share_inputs: Dict, + use_speculate_method: bool, + draft_tokens: Optional[paddle.Tensor] = None, + seq_lens_encoder: Optional[paddle.Tensor] = None, + seq_lens_decoder: Optional[paddle.Tensor] = None, +) -> XPUForwardMeta: + """ """ cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time) token_num = paddle.sum(seq_lens_this_time) - from fastdeploy.model_executor.ops.xpu import (adjust_batch, - get_infer_param, - get_padding_offset) + from fastdeploy.model_executor.ops.xpu import ( + adjust_batch, + get_infer_param, + get_padding_offset, + ) + ( ids_remove_padding, cum_offsets, batch_id_per_token, cu_seqlens_q, cu_seqlens_k, - ) = get_padding_offset(input_ids, cum_offsets_now, token_num, - seq_lens_this_time) + ) = get_padding_offset(input_ids, cum_offsets_now, token_num, seq_lens_this_time) share_inputs["ids_remove_padding"] = None # set this after adjust batch share_inputs["cum_offsets"] = cum_offsets @@ -83,7 +86,7 @@ def xpu_pre_process( cu_seqlens_q=share_inputs["cu_seqlens_q"], cu_seqlens_k=share_inputs["cu_seqlens_k"], block_tables=share_inputs["block_tables"], - caches=share_inputs["caches"] + caches=share_inputs["caches"], ) # Get xpu extra param @@ -134,10 +137,9 @@ def xpu_process_output( cum_offsets: paddle.Tensor, xpu_forward_meta: XPUForwardMeta, ) -> paddle.Tensor: - """ - - """ + """ """ from fastdeploy.model_executor.ops.xpu import gather_next_token + hiddden_states = gather_next_token( forward_output, cum_offsets, @@ -155,15 +157,17 @@ def xpu_process_output( return hiddden_states -def xpu_post_process(sampled_token_ids: paddle.Tensor, - model_output: ModelOutputData, - skip_save_output: bool) -> None: - """ - - """ - from fastdeploy.model_executor.ops.xpu import (save_output, - set_stop_value_multi_ends, - update_inputs) +def xpu_post_process( + sampled_token_ids: paddle.Tensor, + model_output: ModelOutputData, + skip_save_output: bool, +) -> None: + """ """ + from fastdeploy.model_executor.ops.xpu import ( + save_output, + set_stop_value_multi_ends, + update_inputs, + ) # 1. 
Set stop value paddle.assign( @@ -174,16 +178,19 @@ def xpu_post_process(sampled_token_ids: paddle.Tensor, ), model_output.step_idx, ) - length_cond = paddle.greater_equal(model_output.step_idx, - model_output.max_dec_len) + length_cond = paddle.greater_equal(model_output.step_idx, model_output.max_dec_len) paddle.assign( paddle.logical_or(model_output.stop_flags, length_cond), model_output.stop_flags, ) - set_stop_value_multi_ends(sampled_token_ids, model_output.stop_flags, - model_output.seq_lens_this_time, - model_output.eos_token_id, - model_output.next_tokens, False) # multi ends + set_stop_value_multi_ends( + sampled_token_ids, + model_output.stop_flags, + model_output.seq_lens_this_time, + model_output.eos_token_id, + model_output.next_tokens, + False, + ) # multi ends # 2. Update the input buffer of the model with paddle.framework._no_check_dy2st_diff(): @@ -209,12 +216,16 @@ def xpu_post_process(sampled_token_ids: paddle.Tensor, ) -def step_paddle(share_inputs: Dict[str, paddle.Tensor], block_size: int, - enc_dec_block_num: int) -> None: +def step_paddle( + share_inputs: Dict[str, paddle.Tensor], + block_size: int, + enc_dec_block_num: int, +) -> None: """ TODO(gongshaotian): normalization name """ from fastdeploy.model_executor.ops.xpu import step_paddle + step_paddle( share_inputs["stop_flags"], share_inputs["seq_lens_this_time"], @@ -246,8 +257,7 @@ def step_paddle(share_inputs: Dict[str, paddle.Tensor], block_size: int, class XPUModelRunner(ModelRunnerBase): """ """ - def __init__(self, fd_config: FDConfig, device: str, rank: int, - local_rank: int): + def __init__(self, fd_config: FDConfig, device: str, rank: int, local_rank: int): super().__init__(fd_config=fd_config, device=device) self.rank = rank self.local_rank = local_rank @@ -260,15 +270,15 @@ class XPUModelRunner(ModelRunnerBase): # Cuda Graph self.use_cudagraph = False - self.input_ids = paddle.zeros(self.parallel_config.max_num_seqs, - dtype='int32') + self.input_ids = paddle.zeros(self.parallel_config.max_num_seqs, dtype="int32") # Initialize share inputs self._init_share_inputs(self.fd_config.parallel_config.max_num_seqs) self.infer_seed_increment = paddle.full( shape=[self.parallel_config.max_num_seqs, 1], fill_value=4, - dtype="int64") + dtype="int64", + ) # Initialize attention Backend # Note(gonshaotian): Currently, all attention layers share one attention backend instance. 
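The next hunk reformats process_prefill_inputs, which writes each request's parameters into per-request rows of the pre-allocated share_inputs buffers (top_p, temperature, seq_lens_this_time, and so on). A minimal sketch of that slot-filling pattern follows, with a reduced, hypothetical set of fields; the buffer names and defaults mirror the diff, but this is not the runner's full input set.

    # Minimal sketch of the pre-allocated slot-filling pattern: buffers are created once
    # with a fixed batch dimension and each request overwrites only its own row.
    # The fields below are a reduced, illustrative subset of the real share_inputs.
    import paddle

    max_num_seqs = 4
    share_inputs = {
        "top_p": paddle.full([max_num_seqs, 1], 0.7, dtype="float32"),
        "temperature": paddle.full([max_num_seqs, 1], 0.95, dtype="float32"),
        "seq_lens_this_time": paddle.full([max_num_seqs, 1], 0, dtype="int32"),
    }


    def insert_prefill_request(idx: int, prompt_len: int, top_p: float = 0.7, temperature: float = 0.95) -> None:
        """Write one request's sampling parameters and prompt length into its slot."""
        share_inputs["top_p"][idx : idx + 1] = top_p
        share_inputs["temperature"][idx : idx + 1] = temperature
        share_inputs["seq_lens_this_time"][idx : idx + 1] = prompt_len


    insert_prefill_request(idx=0, prompt_len=128)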
@@ -281,68 +291,55 @@ class XPUModelRunner(ModelRunnerBase): self.forward_meta: ForwardMeta = None def process_prefill_inputs(self, req_dicts: List[Request]): - """ Process inputs for prefill tasks and update share_inputs buffer """ + """Process inputs for prefill tasks and update share_inputs buffer""" req_len = len(req_dicts) for i in range(req_len): request = req_dicts[i] idx = request.idx length = request.prompt_token_ids_len - self.share_inputs["input_ids"][idx:idx + 1, :length] = np.array( - request.prompt_token_ids) - if len(request.eos_token_ids - ) < self.parallel_config.eos_tokens_lens: + self.share_inputs["input_ids"][idx : idx + 1, :length] = np.array(request.prompt_token_ids) + if len(request.eos_token_ids) < self.parallel_config.eos_tokens_lens: request.eos_token_ids.append(request.eos_token_ids[0]) - self.share_inputs["eos_token_id"][:] = np.array( - request.eos_token_ids, dtype="int64").reshape(-1, 1) - self.share_inputs["pre_ids"][idx:idx + 1] = -1 - self.share_inputs["top_p"][idx:idx + 1] = request.get("top_p", 0.7) - self.share_inputs["top_k"][idx:idx + 1] = request.get("top_k", 0) - self.share_inputs["temperature"][idx:idx + 1] = request.get( - "temperature", 0.95) - self.share_inputs["penalty_score"][idx:idx + 1] = request.get( - "repetition_penalty", 1.0) - self.share_inputs["frequency_score"][idx:idx + 1] = request.get( - "frequency_penalty", 0.0) - self.share_inputs["presence_score"][idx:idx + 1] = request.get( - "presence_penalty", 0.0) - self.share_inputs["seq_lens_this_time"][idx:idx + 1] = length - self.share_inputs["step_seq_lens_encoder"][idx:idx + 1] = length - self.share_inputs["seq_lens_encoder"][idx:idx + 1] = length - self.share_inputs["seq_lens_decoder"][idx:idx + 1] = 0 - self.share_inputs["step_idx"][idx:idx + 1] = 0 - self.share_inputs["min_dec_len"][idx:idx + 1] = request.get( - "min_tokens", 1) + self.share_inputs["eos_token_id"][:] = np.array(request.eos_token_ids, dtype="int64").reshape(-1, 1) + self.share_inputs["pre_ids"][idx : idx + 1] = -1 + self.share_inputs["top_p"][idx : idx + 1] = request.get("top_p", 0.7) + self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0) + self.share_inputs["temperature"][idx : idx + 1] = request.get("temperature", 0.95) + self.share_inputs["penalty_score"][idx : idx + 1] = request.get("repetition_penalty", 1.0) + self.share_inputs["frequency_score"][idx : idx + 1] = request.get("frequency_penalty", 0.0) + self.share_inputs["presence_score"][idx : idx + 1] = request.get("presence_penalty", 0.0) + self.share_inputs["seq_lens_this_time"][idx : idx + 1] = length + self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = length + self.share_inputs["seq_lens_encoder"][idx : idx + 1] = length + self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0 + self.share_inputs["step_idx"][idx : idx + 1] = 0 + self.share_inputs["min_dec_len"][idx : idx + 1] = request.get("min_tokens", 1) - self.share_inputs["max_dec_len"][idx:idx + 1] = request.get( - "max_tokens", self.model_config.max_model_len) - self.share_inputs["stop_flags"][idx:idx + 1] = False + self.share_inputs["max_dec_len"][idx : idx + 1] = request.get( + "max_tokens", self.model_config.max_model_len + ) + self.share_inputs["stop_flags"][idx : idx + 1] = False - self.share_inputs["first_token_ids"][ - idx:idx + 1] = self.share_inputs["input_ids"][idx:idx + 1, :1] - self.share_inputs["ori_seq_lens_encoder"][idx:idx + 1] = length + self.share_inputs["first_token_ids"][idx : idx + 1] = self.share_inputs["input_ids"][idx : idx + 1, :1] + 
self.share_inputs["ori_seq_lens_encoder"][idx : idx + 1] = length if request.get("seed") is not None: - self.share_inputs["infer_seed"][idx:idx + - 1] = request.get("seed") + self.share_inputs["infer_seed"][idx : idx + 1] = request.get("seed") encoder_block_num = len(request.get("block_tables")) - self.share_inputs["encoder_block_lens"][idx:idx + - 1] = encoder_block_num - self.share_inputs["block_tables"][idx:idx + 1, :] = -1 - self.share_inputs["block_tables"][ - idx:idx + 1, :encoder_block_num] = np.array( - request.block_tables, dtype="int32") + self.share_inputs["encoder_block_lens"][idx : idx + 1] = encoder_block_num + self.share_inputs["block_tables"][idx : idx + 1, :] = -1 + self.share_inputs["block_tables"][idx : idx + 1, :encoder_block_num] = np.array( + request.block_tables, dtype="int32" + ) - if request.get("stop_token_ids") is not None and request.get( - "stop_seqs_len") is not None: + if request.get("stop_token_ids") is not None and request.get("stop_seqs_len") is not None: stop_seqs_num = len(request.get("stop_seqs_len")) - for i in range(stop_seqs_num, - self.model_config.max_stop_seqs_num): + for i in range(stop_seqs_num, self.model_config.max_stop_seqs_num): request.stop_seqs_len.append(0) - self.share_inputs["stop_seqs_len"][:] = np.array( - request.stop_seqs_len, dtype="int32") - self.share_inputs["stop_seqs"][:stop_seqs_num, :len( - request.get("stop_token_ids")[0])] = np.array( - request.get("stop_token_ids"), dtype="int64") + self.share_inputs["stop_seqs_len"][:] = np.array(request.stop_seqs_len, dtype="int32") + self.share_inputs["stop_seqs"][:stop_seqs_num, : len(request.get("stop_token_ids")[0])] = np.array( + request.get("stop_token_ids"), dtype="int64" + ) self.share_inputs["not_need_stop"][0] = True @@ -356,151 +353,108 @@ class XPUModelRunner(ModelRunnerBase): self.share_inputs["pre_ids"] = paddle.full( [max_num_seqs, self.parallel_config.max_model_len], -1, - dtype='int64') + dtype="int64", + ) self.share_inputs["input_ids"] = paddle.full( [max_num_seqs, self.parallel_config.max_model_len], self.parallel_config.pad_token_id, - dtype='int64') - self.share_inputs["eos_token_id"] = paddle.full( - [self.parallel_config.eos_tokens_lens, 1], 0, dtype='int64') - self.share_inputs["top_p"] = paddle.full([max_num_seqs, 1], - self.model_config.top_p, - dtype='float32') - self.share_inputs["top_k"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int64') + dtype="int64", + ) + self.share_inputs["eos_token_id"] = paddle.full([self.parallel_config.eos_tokens_lens, 1], 0, dtype="int64") + self.share_inputs["top_p"] = paddle.full([max_num_seqs, 1], self.model_config.top_p, dtype="float32") + self.share_inputs["top_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int64") self.share_inputs["temperature"] = paddle.full( - [max_num_seqs, 1], self.model_config.temperature, dtype='float32') + [max_num_seqs, 1], self.model_config.temperature, dtype="float32" + ) self.share_inputs["penalty_score"] = paddle.full( - [max_num_seqs, 1], - self.model_config.penalty_score, - dtype='float32') + [max_num_seqs, 1], self.model_config.penalty_score, dtype="float32" + ) self.share_inputs["frequency_score"] = paddle.full( [max_num_seqs, 1], self.model_config.frequency_score, - dtype='float32') + dtype="float32", + ) self.share_inputs["presence_score"] = paddle.full( - [max_num_seqs, 1], - self.model_config.presence_score, - dtype='float32') + [max_num_seqs, 1], self.model_config.presence_score, dtype="float32" + ) - self.share_inputs["min_dec_len"] = paddle.full( - [max_num_seqs, 1], 
self.model_config.min_length, dtype='int64') + self.share_inputs["min_dec_len"] = paddle.full([max_num_seqs, 1], self.model_config.min_length, dtype="int64") self.share_inputs["max_dec_len"] = paddle.full( - [max_num_seqs, 1], self.model_config.max_model_len, dtype='int64') - self.share_inputs["min_length"] = paddle.full( - [max_num_seqs, 1], self.model_config.min_length, dtype='int64') + [max_num_seqs, 1], self.model_config.max_model_len, dtype="int64" + ) + self.share_inputs["min_length"] = paddle.full([max_num_seqs, 1], self.model_config.min_length, dtype="int64") self.share_inputs["max_length"] = paddle.full( - [max_num_seqs, 1], self.model_config.max_model_len, dtype='int64') - self.share_inputs["seq_lens_this_time"] = paddle.full(max_num_seqs, - 0, - dtype='int32') - self.share_inputs["seq_lens_encoder"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int32') - self.share_inputs["seq_lens_decoder"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int32') - self.share_inputs["step_seq_lens_encoder"] = paddle.full( - [max_num_seqs, 1], 0, dtype='int32') - self.share_inputs["step_idx"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int64') + [max_num_seqs, 1], self.model_config.max_model_len, dtype="int64" + ) + self.share_inputs["seq_lens_this_time"] = paddle.full(max_num_seqs, 0, dtype="int32") + self.share_inputs["seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["seq_lens_decoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["step_seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["step_idx"] = paddle.full([max_num_seqs, 1], 0, dtype="int64") self.share_inputs["not_need_stop"] = paddle.full( - [1], False, - dtype='bool').cpu() # TODO(gongshaotian): move to pinnd memory - self.share_inputs["stop_flags"] = paddle.full([max_num_seqs, 1], - True, - dtype='bool') - self.share_inputs["stop_nums"] = paddle.full([1], - max_num_seqs, - dtype='int64') + [1], False, dtype="bool" + ).cpu() # TODO(gongshaotian): move to pinnd memory + self.share_inputs["stop_flags"] = paddle.full([max_num_seqs, 1], True, dtype="bool") + self.share_inputs["stop_nums"] = paddle.full([1], max_num_seqs, dtype="int64") - self.share_inputs["bad_tokens"] = paddle.full([1], -1, dtype='int64') - self.share_inputs["next_tokens"] = paddle.full([max_num_seqs, 1], - -1, - dtype='int64') - self.share_inputs["is_block_step"] = paddle.full([max_num_seqs], - False, - dtype='bool') - self.share_inputs["encoder_block_lens"] = paddle.full([max_num_seqs], - 0, - dtype='int32') - self.share_inputs["step_block_list"] = paddle.full([max_num_seqs], - -1, - dtype='int32') - self.share_inputs["step_lens"] = paddle.full([1], 0, dtype='int32') - self.share_inputs["recover_block_list"] = paddle.full([max_num_seqs], - -1, - dtype='int32') - self.share_inputs["recover_lens"] = paddle.full([1], 0, dtype='int32') - self.share_inputs["need_block_list"] = paddle.full([max_num_seqs], - -1, - dtype='int32') - self.share_inputs["need_block_len"] = paddle.full([1], - 0, - dtype='int32') - self.share_inputs["used_list_len"] = paddle.full([max_num_seqs], - 0, - dtype='int32') - self.share_inputs["infer_seed"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int64') - self.share_inputs["first_token_ids"] = paddle.full([max_num_seqs, 1], - -1, - dtype='int64') - self.share_inputs["ori_seq_lens_encoder"] = paddle.full( - [max_num_seqs, 1], 0, dtype='int32') - self.share_inputs["system_lens"] = paddle.full([max_num_seqs, 1], - 0, - dtype='int32') - 
self.share_inputs["system_ids"] = paddle.full([max_num_seqs, 1], - -1, - dtype='int32') + self.share_inputs["bad_tokens"] = paddle.full([1], -1, dtype="int64") + self.share_inputs["next_tokens"] = paddle.full([max_num_seqs, 1], -1, dtype="int64") + self.share_inputs["is_block_step"] = paddle.full([max_num_seqs], False, dtype="bool") + self.share_inputs["encoder_block_lens"] = paddle.full([max_num_seqs], 0, dtype="int32") + self.share_inputs["step_block_list"] = paddle.full([max_num_seqs], -1, dtype="int32") + self.share_inputs["step_lens"] = paddle.full([1], 0, dtype="int32") + self.share_inputs["recover_block_list"] = paddle.full([max_num_seqs], -1, dtype="int32") + self.share_inputs["recover_lens"] = paddle.full([1], 0, dtype="int32") + self.share_inputs["need_block_list"] = paddle.full([max_num_seqs], -1, dtype="int32") + self.share_inputs["need_block_len"] = paddle.full([1], 0, dtype="int32") + self.share_inputs["used_list_len"] = paddle.full([max_num_seqs], 0, dtype="int32") + self.share_inputs["infer_seed"] = paddle.full([max_num_seqs, 1], 0, dtype="int64") + self.share_inputs["first_token_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int64") + self.share_inputs["ori_seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["system_lens"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") + self.share_inputs["system_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int32") # Initialize rotary position embedding - tmp_position_ids = paddle.arange( - self.parallel_config.max_model_len).reshape((1, -1)) + tmp_position_ids = paddle.arange(self.parallel_config.max_model_len).reshape((1, -1)) # TODO(gongshaotian): move to models self.share_inputs["rope_emb"] = get_rope( rotary_dim=self.model_config.head_dim, position_ids=tmp_position_ids, base=self.model_config.rope_theta, - model_config=self.model_config) + model_config=self.model_config, + ) # Set block tables pre_max_block_num = ( - self.parallel_config.max_model_len + - self.parallel_config.block_size - 1 + self.parallel_config.max_model_len + self.parallel_config.block_size - 1 ) // self.parallel_config.block_size + self.parallel_config.enc_dec_block_num - self.share_inputs["block_tables"] = paddle.full( - [max_num_seqs, pre_max_block_num], -1, dtype='int32') + self.share_inputs["block_tables"] = paddle.full([max_num_seqs, pre_max_block_num], -1, dtype="int32") # Initialize free list free_list = list( range( self.parallel_config.total_block_num - 1, - int(self.parallel_config.total_block_num * - self.parallel_config.kv_cache_ratio) - 1, -1)) + int(self.parallel_config.total_block_num * self.parallel_config.kv_cache_ratio) - 1, + -1, + ) + ) self.free_list_len = len(free_list) - self.share_inputs["free_list"] = paddle.to_tensor(free_list, - dtype="int32") - self.share_inputs["free_list_len"] = paddle.full([1], - self.free_list_len, - dtype="int32") + self.share_inputs["free_list"] = paddle.to_tensor(free_list, dtype="int32") + self.share_inputs["free_list_len"] = paddle.full([1], self.free_list_len, dtype="int32") # Initialize stop seqs - self.share_inputs["stop_seqs_len"] = paddle.full( - [self.model_config.max_stop_seqs_num], 0, dtype="int32") - self.share_inputs["stop_seqs"] = paddle.full([ - self.model_config.max_stop_seqs_num, - self.model_config.stop_seqs_max_len - ], - -1, - dtype="int32") + self.share_inputs["stop_seqs_len"] = paddle.full([self.model_config.max_stop_seqs_num], 0, dtype="int32") + self.share_inputs["stop_seqs"] = paddle.full( + [ + self.model_config.max_stop_seqs_num, + 
self.model_config.stop_seqs_max_len, + ], + -1, + dtype="int32", + ) def _prepare_inputs(self) -> None: - """ prepare the model inputs """ + """prepare the model inputs""" self.forward_meta = xpu_pre_process( self.parallel_config.max_model_len, self.share_inputs["input_ids"], @@ -530,9 +484,8 @@ class XPUModelRunner(ModelRunnerBase): ) def load_model(self) -> None: - """ load or download model """ - logger.info( - f"Starting to load model {self.model_config.architectures[0]}") + """load or download model""" + logger.info(f"Starting to load model {self.model_config.architectures[0]}") time_before_load = time.perf_counter() # 1. Load original model self.model = get_model_from_loader(fd_config=self.fd_config) @@ -542,11 +495,10 @@ class XPUModelRunner(ModelRunnerBase): # 3. Load drafter model(for speculative decoding) time_after_load = time.perf_counter() - logger.info( - f"Model loading took {time_after_load - time_before_load} seconds") + logger.info(f"Model loading took {time_after_load - time_before_load} seconds") def get_model(self) -> nn.Layer: - """ get current model """ + """get current model""" return self.model def initialize_attention_backend(self): @@ -566,21 +518,22 @@ class XPUModelRunner(ModelRunnerBase): cache_type = self.parallel_config.dtype - if (self.quant_config - and hasattr(self.quant_config, "kv_cache_quant_type") - and self.quant_config.kv_cache_quant_type is not None): - cache_type = 'uint8' + if ( + self.quant_config + and hasattr(self.quant_config, "kv_cache_quant_type") + and self.quant_config.kv_cache_quant_type is not None + ): + cache_type = "uint8" - kv_cache_shape = self.attn_backends[0].get_kv_cache_shape( - max_num_blocks=max_block_num) + kv_cache_shape = self.attn_backends[0].get_kv_cache_shape(max_num_blocks=max_block_num) for i in range(self.model_config.num_hidden_layers): - cache_kvs["key_caches_{}".format(i)] = paddle.full( + cache_kvs[f"key_caches_{i}"] = paddle.full( shape=kv_cache_shape, fill_value=0, dtype=cache_type, ) - cache_kvs["value_caches_{}".format(i)] = paddle.full( + cache_kvs[f"value_caches_{i}"] = paddle.full( shape=kv_cache_shape, fill_value=0, dtype=cache_type, @@ -598,17 +551,19 @@ class XPUModelRunner(ModelRunnerBase): # TODO(gongshaotian): Get rank from config num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_size - self.model_config.kv_num_heads = int( - self.model_config.num_key_value_heads - ) // self.parallel_config.tensor_parallel_size + self.model_config.kv_num_heads = ( + int(self.model_config.num_key_value_heads) // self.parallel_config.tensor_parallel_size + ) head_dim = self.model_config.head_dim # Get the attention backend attn_cls = get_attention_backend() - attn_backend = attn_cls(self.fd_config, - kv_num_heads=self.model_config.kv_num_heads, - num_heads=num_heads, - head_dim=head_dim) + attn_backend = attn_cls( + self.fd_config, + kv_num_heads=self.model_config.kv_num_heads, + num_heads=num_heads, + head_dim=head_dim, + ) if attn_backend is None: raise NotImplementedError( "Attention backend which you specified is not supported, please set FD_ATTENTION_BACKEND correctly." 
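
Reviewer note: the KV-cache hunks above switch to f-string cache keys and a `"uint8"` cache dtype whenever `kv_cache_quant_type` is set. As a back-of-the-envelope check, the sketch below applies the same per-block estimate that `cal_theortical_kvcache` uses further down in this file; the model dimensions are made-up placeholders, not FastDeploy defaults.

```python
# Illustrative per-block KV cache memory estimate, following the formula in
# cal_theortical_kvcache: byte_of_dtype * 2 (K and V) * block_size * hidden_dim * num_layers.
def kv_cache_bytes_per_block(byte_of_dtype, block_size, head_dim, kv_num_heads, num_layers):
    hidden_dim = head_dim * kv_num_heads  # per-token KV width for one layer
    return byte_of_dtype * 2 * block_size * hidden_dim * num_layers

# bfloat16 cache (2 bytes/elem) vs. the uint8 cache used when kv_cache_quant_type is set.
# Dimensions below are hypothetical.
bf16 = kv_cache_bytes_per_block(2, block_size=64, head_dim=128, kv_num_heads=8, num_layers=28)
int8 = kv_cache_bytes_per_block(1, block_size=64, head_dim=128, kv_num_heads=8, num_layers=28)
print(f"{bf16 / 2**20:.1f} MiB vs {int8 / 2**20:.1f} MiB per block")
```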
@@ -626,15 +581,14 @@ class XPUModelRunner(ModelRunnerBase): """ check whether prefill stage finished """ - if int(paddle.max(self.share_inputs['seq_lens_encoder'])) != 0: + if int(paddle.max(self.share_inputs["seq_lens_encoder"])) != 0: return 1 else: return 0 def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int): - """ Set dummy prefill inputs to share_inputs """ - full_length = min(num_tokens // batch_size, - self.parallel_config.max_model_len - 10) + """Set dummy prefill inputs to share_inputs""" + full_length = min(num_tokens // batch_size, self.parallel_config.max_model_len - 10) input_length = int(full_length - 512) block_num = ( input_length + self.parallel_config.block_size - 1 @@ -642,35 +596,31 @@ class XPUModelRunner(ModelRunnerBase): for i in range(batch_size): idx = i - self.share_inputs["input_ids"][idx:idx + - 1, :input_length] = np.array( - [5] * input_length) - self.share_inputs["eos_token_id"][:] = np.array( - [2], dtype="int64").reshape(-1, 1) - self.share_inputs["seq_lens_this_time"][idx:idx + 1] = input_length - self.share_inputs["step_seq_lens_encoder"][idx:idx + - 1] = input_length - self.share_inputs["seq_lens_encoder"][idx:idx + 1] = input_length - self.share_inputs["seq_lens_decoder"][idx:idx + 1] = 0 - self.share_inputs["step_idx"][idx:idx + 1] = 0 - self.share_inputs["max_dec_len"][idx:idx + 1] = 10 - self.share_inputs["stop_flags"][idx:idx + 1] = False + self.share_inputs["input_ids"][idx : idx + 1, :input_length] = np.array([5] * input_length) + self.share_inputs["eos_token_id"][:] = np.array([2], dtype="int64").reshape(-1, 1) + self.share_inputs["seq_lens_this_time"][idx : idx + 1] = input_length + self.share_inputs["step_seq_lens_encoder"][idx : idx + 1] = input_length + self.share_inputs["seq_lens_encoder"][idx : idx + 1] = input_length + self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0 + self.share_inputs["step_idx"][idx : idx + 1] = 0 + self.share_inputs["max_dec_len"][idx : idx + 1] = 10 + self.share_inputs["stop_flags"][idx : idx + 1] = False - self.share_inputs["first_token_ids"][ - idx:idx + 1] = self.share_inputs["input_ids"][idx:idx + 1, :1] - self.share_inputs["ori_seq_lens_encoder"][idx:idx + - 1] = input_length + self.share_inputs["first_token_ids"][idx : idx + 1] = self.share_inputs["input_ids"][idx : idx + 1, :1] + self.share_inputs["ori_seq_lens_encoder"][idx : idx + 1] = input_length - self.share_inputs["infer_seed"][idx:idx + 1] = random.randint( - 0, 922337203685477580) - self.share_inputs["encoder_block_lens"][idx:idx + 1] = block_num - self.share_inputs["block_tables"][idx : idx + 1, :block_num] = np.arange(idx * block_num, \ - (idx + 1) * block_num, 1) + self.share_inputs["infer_seed"][idx : idx + 1] = random.randint(0, 922337203685477580) + self.share_inputs["encoder_block_lens"][idx : idx + 1] = block_num + self.share_inputs["block_tables"][idx : idx + 1, :block_num] = np.arange( + idx * block_num, (idx + 1) * block_num, 1 + ) - def _dummy_run(self, - num_tokens: paddle.Tensor, - batch_size: paddle.Tensor, - in_capturing: bool = False) -> paddle.Tensor: + def _dummy_run( + self, + num_tokens: paddle.Tensor, + batch_size: paddle.Tensor, + in_capturing: bool = False, + ) -> paddle.Tensor: """ Use dummy inputs to run before formal execution. 
Args: @@ -681,7 +631,7 @@ class XPUModelRunner(ModelRunnerBase): while True: self.execute_model(None, True) - if int((self.share_inputs['seq_lens_this_time'] > 0).sum()) == 0: + if int((self.share_inputs["seq_lens_this_time"] > 0).sum()) == 0: break def execute_model( @@ -703,12 +653,9 @@ class XPUModelRunner(ModelRunnerBase): # 2. Padding inputs for cuda grph # 3. Execute model - model_output = self.model(self.share_inputs["ids_remove_padding"], - self.forward_meta) + model_output = self.model(self.share_inputs["ids_remove_padding"], self.forward_meta) - hiddden_states = xpu_process_output(model_output, - self.share_inputs["cum_offsets"], - self.forward_meta) + hiddden_states = xpu_process_output(model_output, self.share_inputs["cum_offsets"], self.forward_meta) # 4. Compute logits, Sample logits = self.model.compute_logits(hiddden_states) @@ -742,15 +689,20 @@ class XPUModelRunner(ModelRunnerBase): accept_tokens=None, accept_num=None, ) - xpu_post_process(sampled_token_ids=sampler_output.sampled_token_ids, - model_output=model_output_data, - skip_save_output=is_dummy_run) + xpu_post_process( + sampled_token_ids=sampler_output.sampled_token_ids, + model_output=model_output_data, + skip_save_output=is_dummy_run, + ) # 7. Updata 'infer_seed' and step_paddle() self.share_inputs["infer_seed"].add_(self.infer_seed_increment) self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED - step_paddle(self.share_inputs, self.parallel_config.block_size, - self.parallel_config.enc_dec_block_num) + step_paddle( + self.share_inputs, + self.parallel_config.block_size, + self.parallel_config.enc_dec_block_num, + ) return None @@ -763,9 +715,10 @@ class XPUModelRunner(ModelRunnerBase): def profile_run(self) -> None: """Execute a forward pass with dummy inputs to profile the memory usage of the model.""" - self._dummy_run(num_tokens=int( - self.parallel_config.max_num_batched_tokens), - batch_size=min(self.parallel_config.max_num_seqs, 1)) + self._dummy_run( + num_tokens=int(self.parallel_config.max_num_batched_tokens), + batch_size=min(self.parallel_config.max_num_seqs, 1), + ) def clear_block_table(self) -> None: """ @@ -788,9 +741,11 @@ class XPUModelRunner(ModelRunnerBase): - cache_int4: """ cache_quant_dtype = None - if (self.quant_config - and hasattr(self.quant_config, "kv_cache_quant_type") - and self.quant_config.kv_cache_quant_type is not None): + if ( + self.quant_config + and hasattr(self.quant_config, "kv_cache_quant_type") + and self.quant_config.kv_cache_quant_type is not None + ): cache_quant_dtype = self.quant_config.kv_cache_quant_type if cache_quant_dtype is not None: # int8, int8_zp, fp8, fp8_zp @@ -800,9 +755,11 @@ class XPUModelRunner(ModelRunnerBase): hidden_dim = self.model_config.head_dim * self.model_config.kv_num_heads required_memory = ( - byte_of_dtype * 2 * # k + v - (self.parallel_config.block_size * hidden_dim) * - self.model_config.num_hidden_layers) + byte_of_dtype + * 2 # k + v + * (self.parallel_config.block_size * hidden_dim) + * self.model_config.num_hidden_layers + ) return required_memory def update_share_input_block_num(self, num_gpu_blocks: int) -> None: @@ -820,15 +777,17 @@ class XPUModelRunner(ModelRunnerBase): free_list = list( range( self.num_gpu_blocks - 1, - int(self.num_gpu_blocks * self.parallel_config.kv_cache_ratio) - - 1, -1)) + int(self.num_gpu_blocks * self.parallel_config.kv_cache_ratio) - 1, + -1, + ) + ) self.free_list_len = len(free_list) - self.share_inputs.update({ - "free_list": - paddle.to_tensor(free_list, dtype="int32"), - "free_list_len": - 
paddle.full([1], self.free_list_len, dtype="int32"), - }) + self.share_inputs.update( + { + "free_list": paddle.to_tensor(free_list, dtype="int32"), + "free_list_len": paddle.full([1], self.free_list_len, dtype="int32"), + } + ) def not_need_stop(self) -> bool: """ """ diff --git a/fastdeploy/worker/xpu_worker.py b/fastdeploy/worker/xpu_worker.py index bf85762c1..8ce43b4dd 100644 --- a/fastdeploy/worker/xpu_worker.py +++ b/fastdeploy/worker/xpu_worker.py @@ -13,11 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. """ + import gc from typing import List, Optional import paddle -import paddle.nn as nn +from paddle import nn from fastdeploy.config import FDConfig from fastdeploy.engine.request import Request @@ -46,8 +47,7 @@ class XpuWorker(WorkerBase): pass def init_device(self): - """ Initialize device and Construct model runner - """ + """Initialize device and Construct model runner""" if paddle.is_compiled_with_xpu(): # Set evironment variable self.device = f"xpu:{self.local_rank}" @@ -57,19 +57,19 @@ class XpuWorker(WorkerBase): gc.collect() else: - raise RuntimeError( - f"Not support device type: {self.device_config.device}") + raise RuntimeError(f"Not support device type: {self.device_config.device}") # Construct model runner self.model_runner: XPUModelRunner = XPUModelRunner( fd_config=self.fd_config, device=self.device, rank=self.rank, - local_rank=self.local_rank) + local_rank=self.local_rank, + ) def graph_optimize_and_warm_up_model(self) -> None: """ - Optimizes the inference graph using the specified optimization options. + Optimizes the inference graph using the specified optimization options. """ logger.warn("XPU current could not graph optimize and warm up model") @@ -87,15 +87,19 @@ class XpuWorker(WorkerBase): by adjusting the `gpu_memory_utilization` parameter. 
""" from fastdeploy.model_executor.ops.xpu import ( - xpu_get_free_global_memory, xpu_get_total_global_memory, - xpu_get_used_global_memory) + xpu_get_free_global_memory, + xpu_get_total_global_memory, + xpu_get_used_global_memory, + ) total_memory = xpu_get_total_global_memory(self.local_rank) used_memory = xpu_get_used_global_memory(self.local_rank) free_memory = xpu_get_free_global_memory(self.local_rank) - logger.info(f"Before warm up, total_memory: {total_memory}, \ - used_memory: {used_memory}, free_memory: {free_memory}") + logger.info( + f"Before warm up, total_memory: {total_memory}, \ + used_memory: {used_memory}, free_memory: {free_memory}" + ) self.model_runner.prepare_profile() self.model_runner.profile_run() @@ -108,8 +112,10 @@ class XpuWorker(WorkerBase): self.model_runner.clear_block_table() - logger.info(f"After warm up, total_available_memory: {total_available_memory}, \ - used_memory: {used_memory}, available_kv_cache_memory: {available_kv_cache_memory}") + logger.info( + f"After warm up, total_available_memory: {total_available_memory}, \ + used_memory: {used_memory}, available_kv_cache_memory: {available_kv_cache_memory}" + ) paddle.device.xpu.empty_cache() return available_kv_cache_memory # approximate value @@ -125,8 +131,7 @@ class XpuWorker(WorkerBase): """ """ return self.model_runner.get_model() - def initialize_cache(self, num_gpu_blocks: int, - num_cpu_blocks: int) -> None: + def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: """ """ pass @@ -145,7 +150,7 @@ class XpuWorker(WorkerBase): return self.model_runner.prefill_finished() def preprocess_new_task(self, req_dicts: List[Request]) -> None: - """ Process new requests and then start the decode loop + """Process new requests and then start the decode loop TODO(gongshaotian):The scheduler should schedule the handling of prefill, and workers and modelrunners should not perceive it. 
""" @@ -157,5 +162,4 @@ class XpuWorker(WorkerBase): def reinitialize_kv_cache(self, num_gpu_blocks: int) -> None: """ """ - self.model_runner.update_share_input_block_num( - num_gpu_blocks=num_gpu_blocks) + self.model_runner.update_share_input_block_num(num_gpu_blocks=num_gpu_blocks) diff --git a/mkdocs.yml b/mkdocs.yml index 6777d251c..9ab270d1e 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -4,10 +4,10 @@ nav: - 'Quick Start': - Installation: - 'Nvidia GPU': get_started/installation/nvidia_gpu.md - - 'KunlunXin XPU': get_started/installation/kunlunxin_xpu.md + - 'KunlunXin XPU': get_started/installation/kunlunxin_xpu.md - 'Enflame S60': get_started/installation/Enflame_gcu.md - 'Iluvatar CoreX': get_started/installation/iluvatar_gpu.md - - 'Quick Deployment For ERNIE-4.5-0.3B-Paddle': get_started/quick_start.md + - 'Quick Deployment For ERNIE-4.5-0.3B-Paddle': get_started/quick_start.md - 'Quick Deployment for ERNIE-4.5-VL-28B-A3B': get_started/quick_start_vl.md - 'ERNIE-4.5-300B-A47B': get_started/ernie-4.5.md - 'ERNIE-4.5-VL-424B-A47B': get_started/ernie-4.5-vl.md @@ -16,11 +16,11 @@ nav: - 'Monitor Metrics': online_serving/metrics.md - 'Scheduler': online_serving/scheduler.md - 'Offline Inference': offline_inference.md - - Quantiation: + - Quantiation: - 'Overview': quantization/README.md - 'Online Quantization': quantization/online_quantization.md - 'WINT2 Quantization': quantization/wint2.md - - Features: + - Features: - 'Prefix Caching': features/prefix_caching.md - 'Disaggration': features/disaggregated.md - 'Chunked Prefill': features/chunked_prefill.md @@ -34,10 +34,10 @@ nav: - 'Log Description': usage/log.md - 'Code Overview': usage/code_overview.md - 'Environment Variables': usage/environment_variables.md -theme: +theme: name: 'material' highlightjs: true - icon: + icon: repo: fontawesome/brands/github repo_url: https://github.com/PaddlePaddle/FastDeploy repo_name: FastDeploy diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..d67200713 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,57 @@ +[tool.isort] +profile = 'black' +known_third_party = ["paddle"] + + +[tool.black] +line-length = 119 +target_version = ['py35', 'py36', 'py37', 'py38', 'py39', 'py310'] +exclude = ['.flake8'] + + + +[tool.ruff] +exclude = [ + "./build", + "custom_ops/third_party", +] +line-length = 119 +target-version = "py39" + +[tool.ruff.format] +# Prevent change to double quotes by some users use ruff format +quote-style = "preserve" + +[tool.ruff.lint] +ignore = [ + # Whitespace before ‘,’, ‘;’, or ‘:’, it is not compatible with black + "E203", + # Module level import not at top of file + "E402", + # Line too long (82 > 79 characters) + "E501", + # Do not compare types, use `isinstance()` + "E721", + # Do not use bare except, specify exception instead + "E722", + # Do not assign a lambda expression, use a def + "E731", + # Do not use variables named ‘l’, ‘O’, or ‘I’ + "E741", + # `name` may be undefined, or defined from star imports: `module` + "F405", + # Local variable name is assigned to but never used + "F841", + # It not met the "Explicit is better than implicit" rule + "UP015", + # It will cause the performance regression on python3.10 + "UP038", + # collections.namedtuple can be quickly created a inlined class + "PYI024", + # `__all__.append` is a common pattern in Paddle + "PYI056", +] + +[tool.ruff.lint.per-file-ignores] +# Ignore for re-export in __init__ files +"__init__.py" = ["PLC0414"] diff --git a/requirements_dcu.txt b/requirements_dcu.txt index 
7e6d524a9..14d2d42dd 100644 --- a/requirements_dcu.txt +++ b/requirements_dcu.txt @@ -27,4 +27,4 @@ moviepy use-triton-in-paddle crcmod fastsafetensors==0.1.14 -msgpack \ No newline at end of file +msgpack diff --git a/scripts/extract_mtp_weight_from_safetensor.py b/scripts/extract_mtp_weight_from_safetensor.py index 535ae4b93..1ac1fcfa5 100644 --- a/scripts/extract_mtp_weight_from_safetensor.py +++ b/scripts/extract_mtp_weight_from_safetensor.py @@ -28,19 +28,21 @@ from safetensors.numpy import save_file as safe_save_file def parse_args(): """""" - parser = argparse.ArgumentParser( - description="Extract and save MTP weights from safetensors.") - parser.add_argument("-i", - "--input_dir", - type=str, - required=True, - help="Path to the input safetensors model directory.") + parser = argparse.ArgumentParser(description="Extract and save MTP weights from safetensors.") + parser.add_argument( + "-i", + "--input_dir", + type=str, + required=True, + help="Path to the input safetensors model directory.", + ) parser.add_argument( "-o", "--output_dir", type=str, required=True, - help="Path to the output directory for saving processed weights.") + help="Path to the output directory for saving processed weights.", + ) return parser.parse_args() diff --git a/scripts/get_rdma_nics.sh b/scripts/get_rdma_nics.sh index db9e20c5b..4fc07a98c 100644 --- a/scripts/get_rdma_nics.sh +++ b/scripts/get_rdma_nics.sh @@ -62,7 +62,7 @@ function __JUDGE_NIC_TYPE__() { fi fi fi - + if [[ "$type" == "cpu" ]]; then for (( xgbe_no=0; xgbe_no < XGBE_NUM; xgbe_no++ )) do @@ -110,7 +110,7 @@ function __JUDGE_NIC_TYPE__() { function get_vxpu_nics() { local topo_output=$(xpu-smi topo -m) local xpu_info=$(echo "$topo_output" | grep -E '^XPU[0-9]+') - + local nic_mapping=() while IFS= read -r line; do if [[ $line =~ NIC([0-9]+):\ +(mlx[0-9_]+) ]]; then @@ -119,9 +119,9 @@ function get_vxpu_nics() { nic_mapping[$nic_idx]=$nic_name fi done < <(echo "$topo_output" | grep -E '^\s*NIC[0-9]+:') - + local nic_count=${#nic_mapping[@]} - + declare -A priority_map=([PIX]=2 [NODE]=1 [SYS]=0) local optimal_nics=() @@ -130,7 +130,7 @@ function get_vxpu_nics() { local nic_start_index=5 local max_nics=$(( ${#fields[@]} - nic_start_index )) local actual_nic_count=$(( max_nics < nic_count ? max_nics : nic_count )) - + local best_priority=-1 local best_nic="" @@ -185,7 +185,7 @@ function __main__() { for bond in $(ls -d /sys/class/net/bond* 2>/dev/null); do bond_if=$(basename "$bond") __NEW_GPU_ROOTPORT_FILE__ - + ibdev=$(ibdev2netdev 2>/dev/null | grep -w "$bond_if" | awk '{print $1}') if [ -n "$ibdev" ] && ip link show "$bond_if" | grep -q "state UP" && \ ip a show "$bond_if" | grep -q "inet "; then @@ -196,17 +196,17 @@ function __main__() { printf ",%s" "$ibdev" fi fi - + bondib=$(show_gids 2>/dev/null | grep -w "$bond_if" | awk '{print $1}' | grep "mlx.*bond" | head -1) if [ -n "$bondib" ] && ip link show "$bond_if" | grep -q "state UP" && \ ip a show "$bond_if" | grep -q "inet " && $first; then printf "KVCACHE_RDMA_NICS=%s" "$bondib" first=false fi - + __RM_GPU_ROOTPORT_FILE__ done - + ! $first && printf "\n" [ ! $first ] && return 0 fi @@ -222,4 +222,4 @@ function __main__() { done } -__main__ \ No newline at end of file +__main__ diff --git a/scripts/merge_cache_scale.py b/scripts/merge_cache_scale.py index c0d5482c1..7d46d3d52 100644 --- a/scripts/merge_cache_scale.py +++ b/scripts/merge_cache_scale.py @@ -14,9 +14,10 @@ # limitations under the License. 
""" -import os -import json import argparse +import json +import os + import numpy as np parser = argparse.ArgumentParser() diff --git a/scripts/run_ci.sh b/scripts/run_ci.sh index 2c6cadc51..7d77bccb4 100644 --- a/scripts/run_ci.sh +++ b/scripts/run_ci.sh @@ -67,4 +67,4 @@ if [ ${#failed_files[@]} -gt 0 ]; then else echo "All tests passed!" exit 0 -fi \ No newline at end of file +fi diff --git a/scripts/run_ci_xpu.sh b/scripts/run_ci_xpu.sh index 64ee63b13..cb3ad94c1 100644 --- a/scripts/run_ci_xpu.sh +++ b/scripts/run_ci_xpu.sh @@ -89,4 +89,4 @@ if [ ${exit_code} -ne 0 ]; then echo "log/workerlog.0" cat log/workerlog.0 exit 1 -fi \ No newline at end of file +fi diff --git a/scripts/tune_cublaslt_int8_gemm.py b/scripts/tune_cublaslt_int8_gemm.py index f77768d3c..5af733d03 100644 --- a/scripts/tune_cublaslt_int8_gemm.py +++ b/scripts/tune_cublaslt_int8_gemm.py @@ -36,10 +36,16 @@ def tune_cublaslt_int8_gemm( try: from fastdeploy.model_executor.ops.gpu import tune_cublaslt_gemm except ImportError: - logger.warning( - "From fastdeploy.model_executor.ops.gpu import tune_cublaslt_gemm Failed!" - ) + logger.warning("From fastdeploy.model_executor.ops.gpu import tune_cublaslt_gemm Failed!") return - tune_cublaslt_gemm(K_tensor, N_tensor, m_min, m_max, dtype, is_test, - is_read_from_file, path) + tune_cublaslt_gemm( + K_tensor, + N_tensor, + m_min, + m_max, + dtype, + is_test, + is_read_from_file, + path, + ) diff --git a/scripts/tune_cutlass_fp8_gemm.py b/scripts/tune_cutlass_fp8_gemm.py index 6fa3dcbcc..181bcf1e5 100644 --- a/scripts/tune_cutlass_fp8_gemm.py +++ b/scripts/tune_cutlass_fp8_gemm.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" UT for cutlass_fp8_fp8_half_gemm_fused """ +"""UT for cutlass_fp8_fp8_half_gemm_fused""" import paddle from fastdeploy.utils import llm_logger as logger @@ -26,14 +26,14 @@ def tune_cutlass_fp8_fp8_half_gemm_fused( """ Tune fp8 gemm. """ - assert len(ns) == len( - ks), "The length of `ns` must be equal to that of `ks`" + assert len(ns) == len(ks), "The length of `ns` must be equal to that of `ks`" try: from fastdeploy.model_executor.ops.gpu import cutlass_fp8_fp8_half_gemm_fused except ImportError: logger.warning( "From fastdeploy.model_executor.ops.gpu import cutlass_fp8_fp8_half_gemm_fused failed, \ - fp8 is only support cuda arch 89+.") + fp8 is only support cuda arch 89+." + ) return paddle.seed(2003) for m in range(m_min, m_max + 32, 32): @@ -42,10 +42,8 @@ def tune_cutlass_fp8_fp8_half_gemm_fused( for idx in range(len(ns)): n = ns[idx] k = ks[idx] - A = paddle.rand(shape=[m, k], - dtype="bfloat16").astype("float8_e4m3fn") - B = paddle.rand(shape=[n, k], - dtype="bfloat16").astype("float8_e4m3fn") + A = paddle.rand(shape=[m, k], dtype="bfloat16").astype("float8_e4m3fn") + B = paddle.rand(shape=[n, k], dtype="bfloat16").astype("float8_e4m3fn") cutlass_fp8_fp8_half_gemm_fused( A, B, @@ -68,14 +66,16 @@ def tune_cutlass_fp8_fp8_fp8_dual_gemm_fused( """ Tune fp8 dual-gemm. 
""" - assert len(ns) == len( - ks), "The length of `ns` must be equal to that of `ks`" + assert len(ns) == len(ks), "The length of `ns` must be equal to that of `ks`" try: - from fastdeploy.model_executor.ops.gpu import cutlass_fp8_fp8_fp8_dual_gemm_fused + from fastdeploy.model_executor.ops.gpu import ( + cutlass_fp8_fp8_fp8_dual_gemm_fused, + ) except ImportError: logger.warning( "From fastdeploy.model_executor.ops.gpu import cutlass_fp8_fp8_fp8_dual_gemm_fused failed, \ - fp8 is only support cuda arch 89+.") + fp8 is only support cuda arch 89+." + ) return paddle.seed(2003) for m in range(m_min, m_max + 32, 32): @@ -84,12 +84,9 @@ def tune_cutlass_fp8_fp8_fp8_dual_gemm_fused( for idx in range(len(ns)): n = ns[idx] k = ks[idx] - A = paddle.rand(shape=[m, k], - dtype="bfloat16").astype("float8_e4m3fn") - B0 = paddle.rand(shape=[n, k], - dtype="bfloat16").astype("float8_e4m3fn") - B1 = paddle.rand(shape=[n, k], - dtype="bfloat16").astype("float8_e4m3fn") + A = paddle.rand(shape=[m, k], dtype="bfloat16").astype("float8_e4m3fn") + B0 = paddle.rand(shape=[n, k], dtype="bfloat16").astype("float8_e4m3fn") + B1 = paddle.rand(shape=[n, k], dtype="bfloat16").astype("float8_e4m3fn") cutlass_fp8_fp8_fp8_dual_gemm_fused( A, B0, @@ -115,14 +112,16 @@ def tune_per_channel_fp8_gemm_fused( """ Tune per-channel quant gemm. """ - assert len(ns) == len( - ks), "The length of `ns` must be equal to that of `ks`" + assert len(ns) == len(ks), "The length of `ns` must be equal to that of `ks`" try: - from fastdeploy.model_executor.ops.gpu import per_channel_fp8_fp8_half_gemm_fused + from fastdeploy.model_executor.ops.gpu import ( + per_channel_fp8_fp8_half_gemm_fused, + ) except ImportError: logger.warning( "From fastdeploy.model_executor.ops.gpu import per_channel_fp8_fp8_half_gemm_fused failed, \ - fp8 is only support cuda arch 89+.") + fp8 is only support cuda arch 89+." + ) return paddle.seed(2003) for m in range(m_min, m_max + 32, 32): @@ -131,10 +130,8 @@ def tune_per_channel_fp8_gemm_fused( for idx in range(len(ns)): n = ns[idx] k = ks[idx] - A = paddle.rand(shape=[m, k], - dtype="bfloat16").astype("float8_e4m3fn") - B = paddle.rand(shape=[n, k], - dtype="bfloat16").astype("float8_e4m3fn") + A = paddle.rand(shape=[m, k], dtype="bfloat16").astype("float8_e4m3fn") + B = paddle.rand(shape=[n, k], dtype="bfloat16").astype("float8_e4m3fn") scalar_scale = paddle.full([1], 0.168, dtype="float32") channel_scale = paddle.rand(shape=[n], dtype="float32") @@ -160,14 +157,16 @@ def tune_blockwise_fp8_gemm_fused( """ Tune per-channel quant gemm. """ - assert len(ns) == len( - ks), "The length of `ns` must be equal to that of `ks`" + assert len(ns) == len(ks), "The length of `ns` must be equal to that of `ks`" try: - from fastdeploy.model_executor.ops.gpu import cutlass_fp8_fp8_half_block_gemm_fused + from fastdeploy.model_executor.ops.gpu import ( + cutlass_fp8_fp8_half_block_gemm_fused, + ) except ImportError: logger.warning( "From fastdeploy.model_executor.ops.gpu import cutlass_fp8_fp8_half_block_gemm_fused failed, \ - fp8 is only support cuda arch 90+.") + fp8 is only support cuda arch 90+." 
+ ) return paddle.seed(2003) for m in range(m_min, m_max + 32, 32): @@ -178,10 +177,8 @@ def tune_blockwise_fp8_gemm_fused( k = ks[idx] scale_n = (n + 128 - 1) // 128 scale_k = (k + 128 - 1) // 128 - A = paddle.rand(shape=[m, k], - dtype="bfloat16").astype("float8_e4m3fn") - B = paddle.rand(shape=[n, k], - dtype="bfloat16").astype("float8_e4m3fn") + A = paddle.rand(shape=[m, k], dtype="bfloat16").astype("float8_e4m3fn") + B = paddle.rand(shape=[n, k], dtype="bfloat16").astype("float8_e4m3fn") a_scale = paddle.randn([scale_k, m], dtype="float32") b_scale = paddle.randn([scale_n, scale_k], dtype="float32") diff --git a/scripts/tune_scaled_gemm_f8_i4_f16.py b/scripts/tune_scaled_gemm_f8_i4_f16.py index de67e1c75..d895f1458 100644 --- a/scripts/tune_scaled_gemm_f8_i4_f16.py +++ b/scripts/tune_scaled_gemm_f8_i4_f16.py @@ -14,14 +14,14 @@ """tune_cutlass_fp8int4_gemm""" import os + import paddle -from fastdeploy.model_executor.ops.gpu import scaled_gemm_f8_i4_f16 from tqdm import tqdm +from fastdeploy.model_executor.ops.gpu import scaled_gemm_f8_i4_f16 -def tune_scaled_gemm_f8_i4_f16( - ns: list, ks: list, dtype="int8", is_test=True, is_read_from_file=False -): + +def tune_scaled_gemm_f8_i4_f16(ns: list, ks: list, dtype="int8", is_test=True, is_read_from_file=False): """ Tune fp8 int4 gemm. """ diff --git a/scripts/vit_model_split.py b/scripts/vit_model_split.py index 591c9b936..2e4205795 100644 --- a/scripts/vit_model_split.py +++ b/scripts/vit_model_split.py @@ -14,12 +14,11 @@ # limitations under the License. """ -import paddle -import paddle.distributed as dist -from paddle.distributed import fleet import argparse import os +import paddle + parser = argparse.ArgumentParser() parser.add_argument( "--model_path", @@ -47,12 +46,19 @@ for i in range(args.model_degree): static_dict = {} for k, v in input_model_state_dict.items(): if "qkv.weight" in k: - static_dict[k] = input_model_state_dict[k].reshape( - [hidden_size, 3, kv_num_heads, head_dim] - ).split(args.model_degree, axis=-2)[i].reshape([hidden_size, -1]) + static_dict[k] = ( + input_model_state_dict[k] + .reshape([hidden_size, 3, kv_num_heads, head_dim]) + .split(args.model_degree, axis=-2)[i] + .reshape([hidden_size, -1]) + ) elif "qkv.bias" in k: - static_dict[k] = input_model_state_dict[k].reshape( - [3, kv_num_heads, head_dim]).split(args.model_degree, axis=-2)[i].reshape([-1]) + static_dict[k] = ( + input_model_state_dict[k] + .reshape([3, kv_num_heads, head_dim]) + .split(args.model_degree, axis=-2)[i] + .reshape([-1]) + ) elif "attn.proj.weight" in k: static_dict[k] = input_model_state_dict[k].split(args.model_degree, axis=-2)[i] elif "fc1.weight" in k: @@ -64,4 +70,7 @@ for i in range(args.model_degree): else: static_dict[k] = v - paddle.save(static_dict, os.path.join(args.model_path, f"model_state_tp0{i}.pdparams")) \ No newline at end of file + paddle.save( + static_dict, + os.path.join(args.model_path, f"model_state_tp0{i}.pdparams"), + ) diff --git a/scripts/vit_model_split.sh b/scripts/vit_model_split.sh index fa5b34818..ef4341c14 100644 --- a/scripts/vit_model_split.sh +++ b/scripts/vit_model_split.sh @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-python scripts/vit_model_split.py --model_path ./ --output_path ./ --model_degree 8 \ No newline at end of file +python scripts/vit_model_split.py --model_path ./ --output_path ./ --model_degree 8 diff --git a/setup.py b/setup.py index 85661f86d..47a556a6a 100644 --- a/setup.py +++ b/setup.py @@ -16,15 +16,14 @@ import os import re -import sys -import paddle import subprocess -from setuptools import setup -from setuptools.command.install import install -from pathlib import Path +import sys from pathlib import Path + +import paddle from setuptools import Extension, find_packages, setup from setuptools.command.build_ext import build_ext +from setuptools.command.install import install from wheel.bdist_wheel import bdist_wheel long_description = "FastDeploy: Large Language Model Serving.\n\n" @@ -47,19 +46,16 @@ class CustomBdistWheel(bdist_wheel): """Configure wheel as pure Python and platform-independent.""" super().finalize_options() self.root_is_pure = True - self.python_tag = 'py3' - self.abi_tag = 'none' + self.python_tag = "py3" + self.abi_tag = "none" self.plat_name_supplied = True - self.plat_name = 'any' + self.plat_name = "any" class CMakeExtension(Extension): """A setuptools Extension for CMake-based builds.""" - def __init__(self, - name: str, - sourcedir: str = "", - version: str = None) -> None: + def __init__(self, name: str, sourcedir: str = "", version: str = None) -> None: """ Initialize CMake extension. @@ -78,7 +74,7 @@ class CMakeBuild(build_ext): def get_ext_filename(self, ext_name): """Remove Python version tag from extension filename""" - return ext_name.split('.')[0] + '.so' + return ext_name.split(".")[0] + ".so" def build_extension(self, ext: CMakeExtension) -> None: """ @@ -94,10 +90,12 @@ class CMakeBuild(build_ext): cmake_args = [ f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}{os.sep}", f"-DPYTHON_EXECUTABLE={sys.executable}", - f"-DCMAKE_BUILD_TYPE={cfg}", "-DVERSION_INFO=", - "-DPYBIND11_PYTHON_VERSION=", "-DPYTHON_VERSION=", + f"-DCMAKE_BUILD_TYPE={cfg}", + "-DVERSION_INFO=", + "-DPYBIND11_PYTHON_VERSION=", + "-DPYTHON_VERSION=", f"-DPYTHON_INCLUDE_DIR={sys.prefix}/include/python{sys.version_info.major}.{sys.version_info.minor}", - f"-DPYTHON_LIBRARY={sys.prefix}/lib/libpython{sys.version_info.major}.{sys.version_info.minor}.so" + f"-DPYTHON_LIBRARY={sys.prefix}/lib/libpython{sys.version_info.major}.{sys.version_info.minor}.so", ] build_args = [] @@ -106,10 +104,11 @@ class CMakeBuild(build_ext): if not cmake_generator or cmake_generator == "Ninja": try: import ninja + ninja_executable_path = Path(ninja.BIN_DIR) / "ninja" cmake_args += [ "-GNinja", - f"-DCMAKE_MAKE_PROGRAM:FILEPATH={ninja_executable_path}" + f"-DCMAKE_MAKE_PROGRAM:FILEPATH={ninja_executable_path}", ] except ImportError: pass @@ -117,54 +116,44 @@ class CMakeBuild(build_ext): if "NMake" not in cmake_generator and "Ninja" not in cmake_generator: cmake_args += ["-A", PLAT_TO_CMAKE[self.plat_name]] if "NMake" not in cmake_generator and "Ninja" not in cmake_generator: - cmake_args += [ - f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{cfg.upper()}={extdir}" - ] + cmake_args += [f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{cfg.upper()}={extdir}"] build_args += ["--config", cfg] if sys.platform.startswith("darwin"): archs = re.findall(r"-arch (\S+)", os.environ.get("ARCHFLAGS", "")) if archs: - cmake_args += [ - "-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs)) - ] + cmake_args += ["-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs))] - if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ and hasattr( - self, 
"parallel") and self.parallel: + if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ and hasattr(self, "parallel") and self.parallel: build_args += [f"-j{self.parallel}"] build_temp = Path(self.build_temp) / ext.name build_temp.mkdir(parents=True, exist_ok=True) - subprocess.run(["cmake", ext.sourcedir, *cmake_args], - cwd=build_temp, - check=True) - subprocess.run(["cmake", "--build", ".", *build_args], - cwd=build_temp, - check=True) + subprocess.run(["cmake", ext.sourcedir, *cmake_args], cwd=build_temp, check=True) + subprocess.run(["cmake", "--build", ".", *build_args], cwd=build_temp, check=True) + class PostInstallCommand(install): """在标准安装完成后执行自定义命令""" + def run(self): # 先执行标准安装步骤 install.run(self) # 执行自定义命令 subprocess.check_call(["opentelemetry-bootstrap", "-a", "install"]) + def load_requirements(): """Load dependencies from requirements.txt""" - requirements_file_name = 'requirements.txt' - if paddle.is_compiled_with_custom_device('iluvatar_gpu'): - requirements_file_name = 'requirements_iluvatar.txt' + requirements_file_name = "requirements.txt" + if paddle.is_compiled_with_custom_device("iluvatar_gpu"): + requirements_file_name = "requirements_iluvatar.txt" elif paddle.is_compiled_with_rocm(): - requirements_file_name = 'requirements_dcu.txt' - requirements_path = os.path.join(os.path.dirname(__file__), - requirements_file_name) - with open(requirements_path, 'r') as f: - return [ - line.strip() for line in f - if line.strip() and not line.startswith('#') - ] + requirements_file_name = "requirements_dcu.txt" + requirements_path = os.path.join(os.path.dirname(__file__), requirements_file_name) + with open(requirements_path, "r") as f: + return [line.strip() for line in f if line.strip() and not line.startswith("#")] def get_device_type(): @@ -175,11 +164,11 @@ def get_device_type(): return "gpu" elif paddle.is_compiled_with_xpu(): return "xpu" - elif paddle.is_compiled_with_custom_device('npu'): + elif paddle.is_compiled_with_custom_device("npu"): return "npu" - elif paddle.is_compiled_with_custom_device('iluvatar_gpu'): + elif paddle.is_compiled_with_custom_device("iluvatar_gpu"): return "iluvatar-gpu" - elif paddle.is_compiled_with_custom_device('gcu'): + elif paddle.is_compiled_with_custom_device("gcu"): return "gcu" else: return "cpu" @@ -190,10 +179,10 @@ def get_name(): return "fastdeploy-" + get_device_type() -cmdclass_dict = {'bdist_wheel': CustomBdistWheel} -cmdclass_dict['build_ext'] = CMakeBuild +cmdclass_dict = {"bdist_wheel": CustomBdistWheel} +cmdclass_dict["build_ext"] = CMakeBuild FASTDEPLOY_VERSION = os.environ.get("FASTDEPLOY_VERSION", "2.0.0-dev") -cmdclass_dict['build_optl'] = PostInstallCommand +cmdclass_dict["build_optl"] = PostInstallCommand setup( name=get_name(), @@ -210,22 +199,31 @@ setup( "fastdeploy": [ "model_executor/ops/gpu/*", "model_executor/ops/gpu/deep_gemm/include/**/*", - "model_executor/ops/cpu/*", "model_executor/ops/xpu/*", - "model_executor/ops/xpu/libs/*", "model_executor/ops/npu/*", - "model_executor/ops/base/*", "model_executor/ops/iluvatar/*", - "model_executor/models/*", "model_executor/layers/*", - "input/mm_processor/utils/*", "model_executor/ops/gcu/*", - "version.txt" + "model_executor/ops/cpu/*", + "model_executor/ops/xpu/*", + "model_executor/ops/xpu/libs/*", + "model_executor/ops/npu/*", + "model_executor/ops/base/*", + "model_executor/ops/iluvatar/*", + "model_executor/models/*", + "model_executor/layers/*", + "input/mm_processor/utils/*", + "model_executor/ops/gcu/*", + "version.txt", ] }, 
install_requires=load_requirements(), - ext_modules=[ - CMakeExtension( - "rdma_comm", - sourcedir= - "fastdeploy/cache_manager/transfer_factory/kvcache_transfer", - version=None) - ] if os.getenv("ENABLE_FD_RDMA", "0") == "1" else [], + ext_modules=( + [ + CMakeExtension( + "rdma_comm", + sourcedir="fastdeploy/cache_manager/transfer_factory/kvcache_transfer", + version=None, + ) + ] + if os.getenv("ENABLE_FD_RDMA", "0") == "1" + else [] + ), cmdclass=cmdclass_dict if os.getenv("ENABLE_FD_RDMA", "0") == "1" else {}, zip_safe=False, classifiers=[ @@ -233,8 +231,7 @@ setup( "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", ], - license='Apache 2.0', + license="Apache 2.0", python_requires=">=3.7", extras_require={"test": ["pytest>=6.0"]}, ) - diff --git a/test/ci_use/EB_Lite/test_EB_Lite_serving.py b/test/ci_use/EB_Lite/test_EB_Lite_serving.py index 5413a5933..645997673 100644 --- a/test/ci_use/EB_Lite/test_EB_Lite_serving.py +++ b/test/ci_use/EB_Lite/test_EB_Lite_serving.py @@ -31,6 +31,7 @@ FD_METRICS_PORT = int(os.getenv("FD_METRICS_PORT", 8233)) # List of ports to clean before and after tests PORTS_TO_CLEAN = [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT] + def is_port_open(host: str, port: int, timeout=1.0): """ Check if a TCP port is open on the given host. @@ -42,19 +43,21 @@ def is_port_open(host: str, port: int, timeout=1.0): except Exception: return False + def kill_process_on_port(port: int): """ Kill processes that are listening on the given port. Uses `lsof` to find process ids and sends SIGKILL. """ try: - output = subprocess.check_output("lsof -i:{} -t".format(port), shell=True).decode().strip() + output = subprocess.check_output(f"lsof -i:{port} -t", shell=True).decode().strip() for pid in output.splitlines(): os.kill(int(pid), signal.SIGKILL) - print("Killed process on port {}, pid={}".format(port, pid)) + print(f"Killed process on port {port}, pid={pid}") except subprocess.CalledProcessError: pass + def clean_ports(): """ Kill all processes occupying the ports listed in PORTS_TO_CLEAN. 
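
Reviewer note: the serving tests touched above depend on `lsof`-based port cleanup before and after launching the API server. For quick reference, here is a standalone, slightly simplified version of those helpers (same behaviour as the test code, minus the pytest fixtures); it assumes `lsof` is available on the CI image.

```python
# Simplified port helpers mirroring the CI serving tests. Requires `lsof`.
import os
import signal
import socket
import subprocess


def is_port_open(host: str, port: int, timeout: float = 1.0) -> bool:
    """Return True if a TCP connection to host:port succeeds within `timeout`."""
    try:
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False


def kill_process_on_port(port: int) -> None:
    """SIGKILL every process that `lsof` reports as using `port`."""
    try:
        output = subprocess.check_output(f"lsof -i:{port} -t", shell=True).decode().strip()
    except subprocess.CalledProcessError:
        return  # nothing is listening on this port
    for pid in output.splitlines():
        os.kill(int(pid), signal.SIGKILL)


if __name__ == "__main__":
    print(is_port_open("127.0.0.1", 8188))
```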
@@ -62,6 +65,7 @@ def clean_ports(): for port in PORTS_TO_CLEAN: kill_process_on_port(port) + @pytest.fixture(scope="session", autouse=True) def setup_and_run_server(): """ @@ -82,17 +86,28 @@ def setup_and_run_server(): log_path = "server.log" cmd = [ - sys.executable, "-m", "fastdeploy.entrypoints.openai.api_server", - "--model", model_path, - "--port", str(FD_API_PORT), - "--tensor-parallel-size", "1", - "--engine-worker-queue-port", str(FD_ENGINE_QUEUE_PORT), - "--metrics-port", str(FD_METRICS_PORT), - "--max-model-len", "32768", - "--max-num-seqs", "128", - "--quantization", "wint4", + sys.executable, + "-m", + "fastdeploy.entrypoints.openai.api_server", + "--model", + model_path, + "--port", + str(FD_API_PORT), + "--tensor-parallel-size", + "1", + "--engine-worker-queue-port", + str(FD_ENGINE_QUEUE_PORT), + "--metrics-port", + str(FD_METRICS_PORT), + "--max-model-len", + "32768", + "--max-num-seqs", + "128", + "--quantization", + "wint4", "--use-cudagraph", - "--graph-optimization-config", '{"cudagraph_capture_sizes": [1]}' + "--graph-optimization-config", + '{"cudagraph_capture_sizes": [1]}', ] # Start subprocess in new process group @@ -101,13 +116,13 @@ def setup_and_run_server(): cmd, stdout=logfile, stderr=subprocess.STDOUT, - start_new_session=True # Enables killing full group via os.killpg + start_new_session=True, # Enables killing full group via os.killpg ) # Wait up to 300 seconds for API server to be ready for _ in range(300): if is_port_open("127.0.0.1", FD_API_PORT): - print("API server is up on port {}".format(FD_API_PORT)) + print(f"API server is up on port {FD_API_PORT}") break time.sleep(1) else: @@ -115,17 +130,17 @@ def setup_and_run_server(): try: os.killpg(process.pid, signal.SIGTERM) except Exception as e: - print("Failed to kill process group: {}".format(e)) - raise RuntimeError("API server did not start on port {}".format(FD_API_PORT)) + print(f"Failed to kill process group: {e}") + raise RuntimeError(f"API server did not start on port {FD_API_PORT}") yield # Run tests print("\n===== Post-test server cleanup... =====") try: os.killpg(process.pid, signal.SIGTERM) - print("API server (pid={}) terminated".format(process.pid)) + print(f"API server (pid={process.pid}) terminated") except Exception as e: - print("Failed to terminate API server: {}".format(e)) + print(f"Failed to terminate API server: {e}") @pytest.fixture(scope="session") @@ -133,7 +148,7 @@ def api_url(request): """ Returns the API endpoint URL for chat completions. """ - return "http://0.0.0.0:{}/v1/chat/completions".format(FD_API_PORT) + return f"http://0.0.0.0:{FD_API_PORT}/v1/chat/completions" @pytest.fixture(scope="session") @@ -141,7 +156,7 @@ def metrics_url(request): """ Returns the metrics endpoint URL. 
""" - return "http://0.0.0.0:{}/metrics".format(FD_METRICS_PORT) + return f"http://0.0.0.0:{FD_METRICS_PORT}/metrics" @pytest.fixture @@ -162,9 +177,10 @@ def consistent_payload(): "messages": [{"role": "user", "content": "用一句话介绍 PaddlePaddle"}], "temperature": 0.9, "top_p": 0, # fix top_p to reduce randomness - "seed": 13 # fixed random seed + "seed": 13, # fixed random seed } + # ========================== # Helper function to calculate difference rate between two texts # ========================== @@ -193,6 +209,7 @@ def calculate_diff_rate(text1, text2): max_len = max(len1, len2) return edit_distance / max_len if max_len > 0 else 0.0 + # ========================== # Consistency test for repeated runs with fixed payload # ========================== @@ -216,22 +233,25 @@ def test_consistency_between_runs(api_url, headers, consistent_payload): diff_rate = calculate_diff_rate(content1, content2) # Verify that the difference rate is below the threshold - assert diff_rate < 0.05, "Output difference too large ({:.4%})".format(diff_rate) + assert diff_rate < 0.05, f"Output difference too large ({diff_rate:.4%})" + # ========================== # OpenAI Client chat.completions Test # ========================== + @pytest.fixture def openai_client(): ip = "0.0.0.0" service_http_port = str(FD_API_PORT) client = openai.Client( - base_url="http://{}:{}/v1".format(ip, service_http_port), - api_key="EMPTY_API_KEY" + base_url=f"http://{ip}:{service_http_port}/v1", + api_key="EMPTY_API_KEY", ) return client + # Non-streaming test def test_non_streaming_chat(openai_client): """ @@ -248,10 +268,11 @@ def test_non_streaming_chat(openai_client): stream=False, ) - assert hasattr(response, 'choices') + assert hasattr(response, "choices") assert len(response.choices) > 0 - assert hasattr(response.choices[0], 'message') - assert hasattr(response.choices[0].message, 'content') + assert hasattr(response.choices[0], "message") + assert hasattr(response.choices[0].message, "content") + # Streaming test def test_streaming_chat(openai_client, capsys): @@ -263,7 +284,10 @@ def test_streaming_chat(openai_client, capsys): messages=[ {"role": "system", "content": "You are a helpful AI assistant."}, {"role": "user", "content": "List 3 countries and their capitals."}, - {"role": "assistant", "content": "China(Beijing), France(Paris), Australia(Canberra)."}, + { + "role": "assistant", + "content": "China(Beijing), France(Paris), Australia(Canberra).", + }, {"role": "user", "content": "OK, tell more."}, ], temperature=1, @@ -273,14 +297,16 @@ def test_streaming_chat(openai_client, capsys): output = [] for chunk in response: - if hasattr(chunk.choices[0], 'delta') and hasattr(chunk.choices[0].delta, 'content'): + if hasattr(chunk.choices[0], "delta") and hasattr(chunk.choices[0].delta, "content"): output.append(chunk.choices[0].delta.content) assert len(output) > 2 + # ========================== # OpenAI Client completions Test # ========================== + def test_non_streaming(openai_client): """ Test non-streaming chat functionality with the local service @@ -294,9 +320,10 @@ def test_non_streaming(openai_client): ) # Assertions to check the response structure - assert hasattr(response, 'choices') + assert hasattr(response, "choices") assert len(response.choices) > 0 + def test_streaming(openai_client, capsys): """ Test streaming functionality with the local service @@ -315,6 +342,7 @@ def test_streaming(openai_client, capsys): output.append(chunk.choices[0].text) assert len(output) > 0 + def 
test_non_streaming_with_stop_str(openai_client): """ Test non-streaming chat functionality with the local service @@ -328,7 +356,7 @@ def test_non_streaming_with_stop_str(openai_client): stream=False, ) # Assertions to check the response structure - assert hasattr(response, 'choices') + assert hasattr(response, "choices") assert len(response.choices) > 0 assert response.choices[0].message.content.endswith("") @@ -341,10 +369,11 @@ def test_non_streaming_with_stop_str(openai_client): stream=False, ) # Assertions to check the response structure - assert hasattr(response, 'choices') + assert hasattr(response, "choices") assert len(response.choices) > 0 assert not response.choices[0].message.content.endswith("") + def test_streaming_with_stop_str(openai_client): """ Test non-streaming chat functionality with the local service diff --git a/test/ci_use/EB_Lite_mtp/test_EB_Lite_serving_mtp.py b/test/ci_use/EB_Lite_mtp/test_EB_Lite_serving_mtp.py index b03a77dfc..22b79c143 100644 --- a/test/ci_use/EB_Lite_mtp/test_EB_Lite_serving_mtp.py +++ b/test/ci_use/EB_Lite_mtp/test_EB_Lite_serving_mtp.py @@ -12,16 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytest -import requests -import time import json -import subprocess -import socket import os import signal +import socket +import subprocess import sys +import time + import openai +import pytest +import requests # Read ports from environment variables; use default values if not set FD_API_PORT = int(os.getenv("FD_API_PORT", 8188)) @@ -31,6 +32,7 @@ FD_METRICS_PORT = int(os.getenv("FD_METRICS_PORT", 8233)) # List of ports to clean before and after tests PORTS_TO_CLEAN = [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT] + def is_port_open(host: str, port: int, timeout=1.0): """ Check if a TCP port is open on the given host. @@ -42,19 +44,21 @@ def is_port_open(host: str, port: int, timeout=1.0): except Exception: return False + def kill_process_on_port(port: int): """ Kill processes that are listening on the given port. Uses `lsof` to find process ids and sends SIGKILL. """ try: - output = subprocess.check_output("lsof -i:{} -t".format(port), shell=True).decode().strip() + output = subprocess.check_output(f"lsof -i:{port} -t", shell=True).decode().strip() for pid in output.splitlines(): os.kill(int(pid), signal.SIGKILL) - print("Killed process on port {}, pid={}".format(port, pid)) + print(f"Killed process on port {port}, pid={pid}") except subprocess.CalledProcessError: pass + def clean_ports(): """ Kill all processes occupying the ports listed in PORTS_TO_CLEAN. 
@@ -62,6 +66,7 @@ def clean_ports(): for port in PORTS_TO_CLEAN: kill_process_on_port(port) + @pytest.fixture(scope="session", autouse=True) def setup_and_run_server(): """ @@ -81,24 +86,31 @@ def setup_and_run_server(): model_path = "./ernie-4_5-21b-a3b-bf16-paddle" mtp_model_path = os.path.join(model_path, "mtp") - mtp_mode_str = json.dumps({ - "method": "mtp", - "num_speculative_tokens": 1, - "model": mtp_model_path - }) + mtp_mode_str = json.dumps({"method": "mtp", "num_speculative_tokens": 1, "model": mtp_model_path}) log_path = "server.log" cmd = [ - sys.executable, "-m", "fastdeploy.entrypoints.openai.api_server", - "--model", model_path, - "--port", str(FD_API_PORT), - "--tensor-parallel-size", "1", - "--engine-worker-queue-port", str(FD_ENGINE_QUEUE_PORT), - "--metrics-port", str(FD_METRICS_PORT), - "--max-model-len", "32768", - "--max-num-seqs", "128", - "--quantization", "wint4", - "--speculative-config", mtp_mode_str + sys.executable, + "-m", + "fastdeploy.entrypoints.openai.api_server", + "--model", + model_path, + "--port", + str(FD_API_PORT), + "--tensor-parallel-size", + "1", + "--engine-worker-queue-port", + str(FD_ENGINE_QUEUE_PORT), + "--metrics-port", + str(FD_METRICS_PORT), + "--max-model-len", + "32768", + "--max-num-seqs", + "128", + "--quantization", + "wint4", + "--speculative-config", + mtp_mode_str, ] # Start subprocess in new process group @@ -107,13 +119,13 @@ def setup_and_run_server(): cmd, stdout=logfile, stderr=subprocess.STDOUT, - start_new_session=True # Enables killing full group via os.killpg + start_new_session=True, # Enables killing full group via os.killpg ) # Wait up to 300 seconds for API server to be ready for _ in range(300): if is_port_open("127.0.0.1", FD_API_PORT): - print("API server is up on port {}".format(FD_API_PORT)) + print(f"API server is up on port {FD_API_PORT}") break time.sleep(1) else: @@ -121,17 +133,17 @@ def setup_and_run_server(): try: os.killpg(process.pid, signal.SIGTERM) except Exception as e: - print("Failed to kill process group: {}".format(e)) - raise RuntimeError("API server did not start on port {}".format(FD_API_PORT)) + print(f"Failed to kill process group: {e}") + raise RuntimeError(f"API server did not start on port {FD_API_PORT}") yield # Run tests print("\n===== Post-test server cleanup... =====") try: os.killpg(process.pid, signal.SIGTERM) - print("API server (pid={}) terminated".format(process.pid)) + print(f"API server (pid={process.pid}) terminated") except Exception as e: - print("Failed to terminate API server: {}".format(e)) + print(f"Failed to terminate API server: {e}") @pytest.fixture(scope="session") @@ -139,7 +151,7 @@ def api_url(request): """ Returns the API endpoint URL for chat completions. """ - return "http://0.0.0.0:{}/v1/chat/completions".format(FD_API_PORT) + return f"http://0.0.0.0:{FD_API_PORT}/v1/chat/completions" @pytest.fixture(scope="session") @@ -147,7 +159,7 @@ def metrics_url(request): """ Returns the metrics endpoint URL. 
""" - return "http://0.0.0.0:{}/metrics".format(FD_METRICS_PORT) + return f"http://0.0.0.0:{FD_METRICS_PORT}/metrics" @pytest.fixture @@ -168,9 +180,10 @@ def consistent_payload(): "messages": [{"role": "user", "content": "用一句话介绍 PaddlePaddle"}], "temperature": 0.9, "top_p": 0, # fix top_p to reduce randomness - "seed": 13 # fixed random seed + "seed": 13, # fixed random seed } + # ========================== # Helper function to calculate difference rate between two texts # ========================== @@ -199,6 +212,7 @@ def calculate_diff_rate(text1, text2): max_len = max(len1, len2) return edit_distance / max_len if max_len > 0 else 0.0 + # ========================== # Consistency test for repeated runs with fixed payload # ========================== @@ -222,22 +236,25 @@ def test_consistency_between_runs(api_url, headers, consistent_payload): diff_rate = calculate_diff_rate(content1, content2) # Verify that the difference rate is below the threshold - assert diff_rate < 0.05, "Output difference too large ({:.4%})".format(diff_rate) + assert diff_rate < 0.05, f"Output difference too large ({diff_rate:.4%})" + # ========================== # OpenAI Client chat.completions Test # ========================== + @pytest.fixture def openai_client(): ip = "0.0.0.0" service_http_port = str(FD_API_PORT) client = openai.Client( - base_url="http://{}:{}/v1".format(ip, service_http_port), - api_key="EMPTY_API_KEY" + base_url=f"http://{ip}:{service_http_port}/v1", + api_key="EMPTY_API_KEY", ) return client + # Non-streaming test def test_non_streaming_chat(openai_client): """ @@ -254,10 +271,11 @@ def test_non_streaming_chat(openai_client): stream=False, ) - assert hasattr(response, 'choices') + assert hasattr(response, "choices") assert len(response.choices) > 0 - assert hasattr(response.choices[0], 'message') - assert hasattr(response.choices[0].message, 'content') + assert hasattr(response.choices[0], "message") + assert hasattr(response.choices[0].message, "content") + # Streaming test def test_streaming_chat(openai_client, capsys): @@ -269,7 +287,10 @@ def test_streaming_chat(openai_client, capsys): messages=[ {"role": "system", "content": "You are a helpful AI assistant."}, {"role": "user", "content": "List 3 countries and their capitals."}, - {"role": "assistant", "content": "China(Beijing), France(Paris), Australia(Canberra)."}, + { + "role": "assistant", + "content": "China(Beijing), France(Paris), Australia(Canberra).", + }, {"role": "user", "content": "OK, tell more."}, ], temperature=1, @@ -279,14 +300,16 @@ def test_streaming_chat(openai_client, capsys): output = [] for chunk in response: - if hasattr(chunk.choices[0], 'delta') and hasattr(chunk.choices[0].delta, 'content'): + if hasattr(chunk.choices[0], "delta") and hasattr(chunk.choices[0].delta, "content"): output.append(chunk.choices[0].delta.content) assert len(output) > 2 + # ========================== # OpenAI Client completions Test # ========================== + def test_non_streaming(openai_client): """ Test non-streaming chat functionality with the local service @@ -300,7 +323,7 @@ def test_non_streaming(openai_client): ) # Assertions to check the response structure - assert hasattr(response, 'choices') + assert hasattr(response, "choices") assert len(response.choices) > 0 @@ -320,4 +343,4 @@ def test_streaming(openai_client, capsys): output = [] for chunk in response: output.append(chunk.choices[0].text) - assert len(output) > 0 \ No newline at end of file + assert len(output) > 0 diff --git 
a/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py index 4252d5e9a..b362ba0bd 100644 --- a/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py +++ b/test/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py @@ -12,17 +12,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytest -import requests -import time import json -import subprocess -import socket import os import signal +import socket +import subprocess import sys -import openai +import time +import openai +import pytest +import requests # Read ports from environment variables; use default values if not set FD_API_PORT = int(os.getenv("FD_API_PORT", 8188)) @@ -32,6 +32,7 @@ FD_METRICS_PORT = int(os.getenv("FD_METRICS_PORT", 8233)) # List of ports to clean before and after tests PORTS_TO_CLEAN = [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT] + def is_port_open(host: str, port: int, timeout=1.0): """ Check if a TCP port is open on the given host. @@ -43,19 +44,21 @@ def is_port_open(host: str, port: int, timeout=1.0): except Exception: return False + def kill_process_on_port(port: int): """ Kill processes that are listening on the given port. Uses `lsof` to find process ids and sends SIGKILL. """ try: - output = subprocess.check_output("lsof -i:{} -t".format(port), shell=True).decode().strip() + output = subprocess.check_output(f"lsof -i:{port} -t", shell=True).decode().strip() for pid in output.splitlines(): os.kill(int(pid), signal.SIGKILL) - print("Killed process on port {}, pid={}".format(port, pid)) + print(f"Killed process on port {port}, pid={pid}") except subprocess.CalledProcessError: pass + def clean_ports(): """ Kill all processes occupying the ports listed in PORTS_TO_CLEAN. 
@@ -63,6 +66,7 @@ def clean_ports(): for port in PORTS_TO_CLEAN: kill_process_on_port(port) + @pytest.fixture(scope="session", autouse=True) def setup_and_run_server(): """ @@ -77,28 +81,41 @@ def setup_and_run_server(): base_path = os.getenv("MODEL_PATH") if base_path: - model_path=os.path.join(base_path, "ernie-4_5-vl-28b-a3b-bf16-paddle") + model_path = os.path.join(base_path, "ernie-4_5-vl-28b-a3b-bf16-paddle") else: - model_path="./ernie-4_5-vl-28b-a3b-bf16-paddle" + model_path = "./ernie-4_5-vl-28b-a3b-bf16-paddle" log_path = "server.log" limit_mm_str = json.dumps({"image": 100, "video": 100}) cmd = [ - sys.executable, "-m", "fastdeploy.entrypoints.openai.api_server", - "--model", model_path, - "--port", str(FD_API_PORT), - "--tensor-parallel-size", "2", - "--engine-worker-queue-port", str(FD_ENGINE_QUEUE_PORT), - "--metrics-port", str(FD_METRICS_PORT), + sys.executable, + "-m", + "fastdeploy.entrypoints.openai.api_server", + "--model", + model_path, + "--port", + str(FD_API_PORT), + "--tensor-parallel-size", + "2", + "--engine-worker-queue-port", + str(FD_ENGINE_QUEUE_PORT), + "--metrics-port", + str(FD_METRICS_PORT), "--enable-mm", - "--max-model-len", "32768", - "--max-num-batched-tokens", "384", - "--max-num-seqs", "128", - "--limit-mm-per-prompt", limit_mm_str, + "--max-model-len", + "32768", + "--max-num-batched-tokens", + "384", + "--max-num-seqs", + "128", + "--limit-mm-per-prompt", + limit_mm_str, "--enable-chunked-prefill", - "--kv-cache-ratio", "0.71", - "--quantization", "wint4" + "--kv-cache-ratio", + "0.71", + "--quantization", + "wint4", ] # Start subprocess in new process group @@ -107,13 +124,13 @@ def setup_and_run_server(): cmd, stdout=logfile, stderr=subprocess.STDOUT, - start_new_session=True # Enables killing full group via os.killpg + start_new_session=True, # Enables killing full group via os.killpg ) # Wait up to 300 seconds for API server to be ready for _ in range(300): if is_port_open("127.0.0.1", FD_API_PORT): - print("API server is up on port {}".format(FD_API_PORT)) + print(f"API server is up on port {FD_API_PORT}") break time.sleep(1) else: @@ -121,17 +138,17 @@ def setup_and_run_server(): try: os.killpg(process.pid, signal.SIGTERM) except Exception as e: - print("Failed to kill process group: {}".format(e)) - raise RuntimeError("API server did not start on port {}".format(FD_API_PORT)) + print(f"Failed to kill process group: {e}") + raise RuntimeError(f"API server did not start on port {FD_API_PORT}") yield # Run tests print("\n===== Post-test server cleanup... =====") try: os.killpg(process.pid, signal.SIGTERM) - print("API server (pid={}) terminated".format(process.pid)) + print(f"API server (pid={process.pid}) terminated") except Exception as e: - print("Failed to terminate API server: {}".format(e)) + print(f"Failed to terminate API server: {e}") @pytest.fixture(scope="session") @@ -139,7 +156,7 @@ def api_url(request): """ Returns the API endpoint URL for chat completions. """ - return "http://0.0.0.0:{}/v1/chat/completions".format(FD_API_PORT) + return f"http://0.0.0.0:{FD_API_PORT}/v1/chat/completions" @pytest.fixture(scope="session") @@ -147,7 +164,7 @@ def metrics_url(request): """ Returns the metrics endpoint URL. 
""" - return "http://0.0.0.0:{}/metrics".format(FD_METRICS_PORT) + return f"http://0.0.0.0:{FD_METRICS_PORT}/metrics" @pytest.fixture @@ -166,14 +183,23 @@ def consistent_payload(): """ return { "messages": [ - {"role": "user", "content": [ - {"type": "image_url", "image_url": {"url": "https://ku.baidu-int.com/vk-assets-ltd/space/2024/09/13/933d1e0a0760498e94ec0f2ccee865e0", "detail": "high"}}, - {"type": "text", "text": "请描述图片内容"} - ]} + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://ku.baidu-int.com/vk-assets-ltd/space/2024/09/13/933d1e0a0760498e94ec0f2ccee865e0", + "detail": "high", + }, + }, + {"type": "text", "text": "请描述图片内容"}, + ], + } ], "temperature": 0.8, "top_p": 0, # fix top_p to reduce randomness - "seed": 13 # fixed random seed + "seed": 13, # fixed random seed } @@ -190,7 +216,7 @@ def test_consistency_between_runs(api_url, headers, consistent_payload): result1 = resp1.json() content1 = result1["choices"][0]["message"]["content"] file_res_temp = "ernie-4_5-vl" - f_o = open(file_res_temp, 'a') + f_o = open(file_res_temp, "a") f_o.writelines(content1) f_o.close() @@ -206,20 +232,23 @@ def test_consistency_between_runs(api_url, headers, consistent_payload): # Verify that result is same as the base result assert content1 == content2 + # ========================== # OpenAI Client Chat Completion Test # ========================== + @pytest.fixture def openai_client(): ip = "0.0.0.0" service_http_port = str(FD_API_PORT) client = openai.Client( - base_url = "http://{}:{}/v1".format(ip, service_http_port), - api_key="EMPTY_API_KEY" + base_url=f"http://{ip}:{service_http_port}/v1", + api_key="EMPTY_API_KEY", ) return client + # Non-streaming test def test_non_streaming_chat(openai_client): """Test non-streaming chat functionality with the local service""" @@ -228,33 +257,32 @@ def test_non_streaming_chat(openai_client): messages=[ { "role": "system", - "content": "You are a helpful AI assistant." + "content": "You are a helpful AI assistant.", }, # system不是必需,可选 { - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": - "https://ku.baidu-int.com/vk-assets-ltd/space/2024/09/13/933d1e0a0760498e94ec0f2ccee865e0", - "detail": "high" - } - }, { - "type": "text", - "text": "请描述图片内容" - }] - } + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://ku.baidu-int.com/vk-assets-ltd/space/2024/09/13/933d1e0a0760498e94ec0f2ccee865e0", + "detail": "high", + }, + }, + {"type": "text", "text": "请描述图片内容"}, + ], + }, ], temperature=1, max_tokens=53, stream=False, ) - assert hasattr(response, 'choices') + assert hasattr(response, "choices") assert len(response.choices) > 0 - assert hasattr(response.choices[0], 'message') - assert hasattr(response.choices[0].message, 'content') + assert hasattr(response.choices[0], "message") + assert hasattr(response.choices[0].message, "content") + # Streaming test def test_streaming_chat(openai_client, capsys): @@ -264,30 +292,25 @@ def test_streaming_chat(openai_client, capsys): messages=[ { "role": "system", - "content": "You are a helpful AI assistant." + "content": "You are a helpful AI assistant.", }, # system不是必需,可选 - { - "role": "user", - "content": "List 3 countries and their capitals." - }, + {"role": "user", "content": "List 3 countries and their capitals."}, { "role": "assistant", - "content": "China(Beijing), France(Paris), Australia(Canberra)." 
+ "content": "China(Beijing), France(Paris), Australia(Canberra).", }, { - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": - "https://ku.baidu-int.com/vk-assets-ltd/space/2024/09/13/933d1e0a0760498e94ec0f2ccee865e0", - "detail": "high" - } - }, { - "type": "text", - "text": "请描述图片内容" - }] + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://ku.baidu-int.com/vk-assets-ltd/space/2024/09/13/933d1e0a0760498e94ec0f2ccee865e0", + "detail": "high", + }, + }, + {"type": "text", "text": "请描述图片内容"}, + ], }, ], temperature=1, @@ -297,6 +320,6 @@ def test_streaming_chat(openai_client, capsys): output = [] for chunk in response: - if hasattr(chunk.choices[0], 'delta') and hasattr(chunk.choices[0].delta, 'content'): + if hasattr(chunk.choices[0], "delta") and hasattr(chunk.choices[0].delta, "content"): output.append(chunk.choices[0].delta.content) assert len(output) > 2 diff --git a/test/ci_use/Qwen2-7B-Instruct_offline/test_Qwen2-7B-Instruct_offline.py b/test/ci_use/Qwen2-7B-Instruct_offline/test_Qwen2-7B-Instruct_offline.py index dc7f97070..6fcfb42e3 100644 --- a/test/ci_use/Qwen2-7B-Instruct_offline/test_Qwen2-7B-Instruct_offline.py +++ b/test/ci_use/Qwen2-7B-Instruct_offline/test_Qwen2-7B-Instruct_offline.py @@ -12,19 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytest -import traceback -from fastdeploy import LLM, SamplingParams import os -import subprocess import signal -import time import socket +import subprocess +import time +import traceback +import pytest + +from fastdeploy import LLM, SamplingParams FD_ENGINE_QUEUE_PORT = int(os.getenv("FD_ENGINE_QUEUE_PORT", 8313)) MAX_WAIT_SECONDS = 60 + def is_port_open(host: str, port: int, timeout=1.0): """ Check if a TCP port is open on the given host. 
@@ -46,9 +48,9 @@ def format_chat_prompt(messages): for msg in messages: role, content = msg["role"], msg["content"] if role == "user": - prompt += "<|im_start|>user\n{content}<|im_end|>\n".format(content=content) + prompt += f"<|im_start|>user\n{content}<|im_end|>\n" elif role == "assistant": - prompt += "<|im_start|>assistant\n{content}<|im_end|>\n".format(content=content) + prompt += f"<|im_start|>assistant\n{content}<|im_end|>\n" prompt += "<|im_start|>assistant\n" return prompt @@ -72,10 +74,10 @@ def llm(model_path): Fixture to initialize the LLM model with a given model path """ try: - output = subprocess.check_output("lsof -i:{} -t".format(FD_ENGINE_QUEUE_PORT), shell=True).decode().strip() + output = subprocess.check_output(f"lsof -i:{FD_ENGINE_QUEUE_PORT} -t", shell=True).decode().strip() for pid in output.splitlines(): os.kill(int(pid), signal.SIGKILL) - print("Killed process on port {}, pid={}".format(FD_ENGINE_QUEUE_PORT, pid)) + print(f"Killed process on port {FD_ENGINE_QUEUE_PORT}, pid={pid}") except subprocess.CalledProcessError: pass @@ -86,23 +88,24 @@ def llm(model_path): tensor_parallel_size=1, engine_worker_queue_port=FD_ENGINE_QUEUE_PORT, max_model_len=32768, - quantization="wint8" + quantization="wint8", ) # Wait for the port to be open wait_start = time.time() while not is_port_open("127.0.0.1", FD_ENGINE_QUEUE_PORT): if time.time() - wait_start > MAX_WAIT_SECONDS: - pytest.fail("Model engine did not start within {} seconds on port {}".format( - MAX_WAIT_SECONDS, FD_ENGINE_QUEUE_PORT)) + pytest.fail( + f"Model engine did not start within {MAX_WAIT_SECONDS} seconds on port {FD_ENGINE_QUEUE_PORT}" + ) time.sleep(1) - print("Model loaded successfully from {} in {:.2f}s.".format(model_path, time.time() - start)) + print(f"Model loaded successfully from {model_path} in {time.time() - start:.2f}s.") yield llm except Exception: - print("Failed to load model from {}.".format(model_path)) + print(f"Failed to load model from {model_path}.") traceback.print_exc() - pytest.fail("Failed to initialize LLM model from {}".format(model_path)) + pytest.fail(f"Failed to initialize LLM model from {model_path}") def test_generate_prompts(llm): @@ -128,13 +131,13 @@ def test_generate_prompts(llm): assert len(outputs) == len(prompts), "Number of outputs should match number of prompts" for i, output in enumerate(outputs): - assert output.prompt == prompts[i], "Prompt mismatch for case {}".format(i + 1) - assert isinstance(output.outputs.text, str), "Output text should be string for case {}".format(i + 1) - assert len(output.outputs.text) > 0, "Generated text should not be empty for case {}".format(i + 1) - assert isinstance(output.finished, bool), "'finished' should be boolean for case {}".format(i + 1) - assert output.metrics.model_execute_time > 0, "Execution time should be positive for case {}".format(i + 1) + assert output.prompt == prompts[i], f"Prompt mismatch for case {i + 1}" + assert isinstance(output.outputs.text, str), f"Output text should be string for case {i + 1}" + assert len(output.outputs.text) > 0, f"Generated text should not be empty for case {i + 1}" + assert isinstance(output.finished, bool), f"'finished' should be boolean for case {i + 1}" + assert output.metrics.model_execute_time > 0, f"Execution time should be positive for case {i + 1}" - print("=== Prompt generation Case {} Passed ===".format(i + 1)) + print(f"=== Prompt generation Case {i + 1} Passed ===") except Exception: print("Failed during prompt generation.") @@ -180,16 +183,16 @@ def 
test_chat_completion(llm): assert len(outputs[0].outputs.text) > 0, "Generated text should not be empty" assert outputs[0].metrics.model_execute_time > 0, "Execution time should be positive" - print("=== Chat Case {} Passed ===".format(i + 1)) + print(f"=== Chat Case {i + 1} Passed ===") except Exception: - print("[ERROR] Chat Case {} failed.".format(i + 1)) + print(f"[ERROR] Chat Case {i + 1} failed.") traceback.print_exc() - pytest.fail("Chat case {} failed".format(i + 1)) + pytest.fail(f"Chat case {i + 1} failed") if __name__ == "__main__": """ Main entry point for the test script. """ - pytest.main(["-sv", __file__]) \ No newline at end of file + pytest.main(["-sv", __file__]) diff --git a/test/ci_use/Qwen2-7B-Instruct_serving/test_Qwen2-7B-Instruct_serving.py b/test/ci_use/Qwen2-7B-Instruct_serving/test_Qwen2-7B-Instruct_serving.py index 76e9bbc38..5898d332f 100644 --- a/test/ci_use/Qwen2-7B-Instruct_serving/test_Qwen2-7B-Instruct_serving.py +++ b/test/ci_use/Qwen2-7B-Instruct_serving/test_Qwen2-7B-Instruct_serving.py @@ -12,19 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytest -import requests -import time -import json -from jsonschema import validate import concurrent.futures -import subprocess -import socket +import json import os import signal +import socket +import subprocess import sys -import openai +import time +import openai +import pytest +import requests +from jsonschema import validate # Read ports from environment variables; use default values if not set FD_API_PORT = int(os.getenv("FD_API_PORT", 8188)) @@ -34,6 +34,7 @@ FD_METRICS_PORT = int(os.getenv("FD_METRICS_PORT", 8233)) # List of ports to clean before and after tests PORTS_TO_CLEAN = [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT] + def is_port_open(host: str, port: int, timeout=1.0): """ Check if a TCP port is open on the given host. @@ -45,19 +46,21 @@ def is_port_open(host: str, port: int, timeout=1.0): except Exception: return False + def kill_process_on_port(port: int): """ Kill processes that are listening on the given port. Uses `lsof` to find process ids and sends SIGKILL. """ try: - output = subprocess.check_output("lsof -i:{} -t".format(port), shell=True).decode().strip() + output = subprocess.check_output(f"lsof -i:{port} -t", shell=True).decode().strip() for pid in output.splitlines(): os.kill(int(pid), signal.SIGKILL) - print("Killed process on port {}, pid={}".format(port, pid)) + print(f"Killed process on port {port}, pid={pid}") except subprocess.CalledProcessError: pass + def clean_ports(): """ Kill all processes occupying the ports listed in PORTS_TO_CLEAN. 
@@ -65,6 +68,7 @@ def clean_ports(): for port in PORTS_TO_CLEAN: kill_process_on_port(port) + @pytest.fixture(scope="session", autouse=True) def setup_and_run_server(): """ @@ -79,21 +83,31 @@ def setup_and_run_server(): base_path = os.getenv("MODEL_PATH") if base_path: - model_path=os.path.join(base_path, "Qwen2-7B-Instruct") + model_path = os.path.join(base_path, "Qwen2-7B-Instruct") else: - model_path="./Qwen2-7B-Instruct" + model_path = "./Qwen2-7B-Instruct" log_path = "server.log" cmd = [ - sys.executable, "-m", "fastdeploy.entrypoints.openai.api_server", - "--model", model_path, - "--port", str(FD_API_PORT), - "--tensor-parallel-size", "1", - "--engine-worker-queue-port", str(FD_ENGINE_QUEUE_PORT), - "--metrics-port", str(FD_METRICS_PORT), - "--max-model-len", "32768", - "--max-num-seqs", "128", - "--quantization", "wint8" + sys.executable, + "-m", + "fastdeploy.entrypoints.openai.api_server", + "--model", + model_path, + "--port", + str(FD_API_PORT), + "--tensor-parallel-size", + "1", + "--engine-worker-queue-port", + str(FD_ENGINE_QUEUE_PORT), + "--metrics-port", + str(FD_METRICS_PORT), + "--max-model-len", + "32768", + "--max-num-seqs", + "128", + "--quantization", + "wint8", ] # Start subprocess in new process group @@ -102,13 +116,13 @@ def setup_and_run_server(): cmd, stdout=logfile, stderr=subprocess.STDOUT, - start_new_session=True # Enables killing full group via os.killpg + start_new_session=True, # Enables killing full group via os.killpg ) # Wait up to 300 seconds for API server to be ready for _ in range(300): if is_port_open("127.0.0.1", FD_API_PORT): - print("API server is up on port {}".format(FD_API_PORT)) + print(f"API server is up on port {FD_API_PORT}") break time.sleep(1) else: @@ -116,17 +130,17 @@ def setup_and_run_server(): try: os.killpg(process.pid, signal.SIGTERM) except Exception as e: - print("Failed to kill process group: {}".format(e)) - raise RuntimeError("API server did not start on port {}".format(FD_API_PORT)) + print(f"Failed to kill process group: {e}") + raise RuntimeError(f"API server did not start on port {FD_API_PORT}") yield # Run tests print("\n===== Post-test server cleanup... =====") try: os.killpg(process.pid, signal.SIGTERM) - print("API server (pid={}) terminated".format(process.pid)) + print(f"API server (pid={process.pid}) terminated") except Exception as e: - print("Failed to terminate API server: {}".format(e)) + print(f"Failed to terminate API server: {e}") @pytest.fixture(scope="session") @@ -134,7 +148,7 @@ def api_url(request): """ Returns the API endpoint URL for chat completions. """ - return "http://0.0.0.0:{}/v1/chat/completions".format(FD_API_PORT) + return f"http://0.0.0.0:{FD_API_PORT}/v1/chat/completions" @pytest.fixture(scope="session") @@ -142,7 +156,7 @@ def metrics_url(request): """ Returns the metrics endpoint URL. 
""" - return "http://0.0.0.0:{}/metrics".format(FD_METRICS_PORT) + return f"http://0.0.0.0:{FD_METRICS_PORT}/metrics" @pytest.fixture @@ -163,9 +177,10 @@ def consistent_payload(): "messages": [{"role": "user", "content": "用一句话介绍 PaddlePaddle"}], "temperature": 0.9, "top_p": 0, # fix top_p to reduce randomness - "seed": 13 # fixed random seed + "seed": 13, # fixed random seed } + # ========================== # JSON Schema for validating chat API responses # ========================== @@ -187,16 +202,16 @@ chat_response_schema = { "role": {"type": "string"}, "content": {"type": "string"}, }, - "required": ["role", "content"] + "required": ["role", "content"], }, "index": {"type": "number"}, - "finish_reason": {"type": "string"} + "finish_reason": {"type": "string"}, }, - "required": ["message", "index", "finish_reason"] - } - } + "required": ["message", "index", "finish_reason"], + }, + }, }, - "required": ["id", "object", "created", "model", "choices"] + "required": ["id", "object", "created", "model", "choices"], } @@ -228,6 +243,7 @@ def calculate_diff_rate(text1, text2): max_len = max(len1, len2) return edit_distance / max_len if max_len > 0 else 0.0 + # ========================== # Valid prompt test cases for parameterized testing # ========================== @@ -236,6 +252,7 @@ valid_prompts = [ [{"role": "user", "content": "用一句话介绍 FastDeploy"}], ] + @pytest.mark.parametrize("messages", valid_prompts) def test_valid_chat(messages, api_url, headers): """ @@ -246,6 +263,7 @@ def test_valid_chat(messages, api_url, headers): assert resp.status_code == 200 validate(instance=resp.json(), schema=chat_response_schema) + # ========================== # Consistency test for repeated runs with fixed payload # ========================== @@ -269,7 +287,8 @@ def test_consistency_between_runs(api_url, headers, consistent_payload): diff_rate = calculate_diff_rate(content1, content2) # Verify that the difference rate is below the threshold - assert diff_rate < 0.05, "Output difference too large ({:.4%})".format(diff_rate) + assert diff_rate < 0.05, f"Output difference too large ({diff_rate:.4%})" + # ========================== # Invalid prompt tests @@ -282,6 +301,7 @@ invalid_prompts = [ [{"content": "hello"}], # Missing role ] + @pytest.mark.parametrize("messages", invalid_prompts) def test_invalid_chat(messages, api_url, headers): """ @@ -295,6 +315,7 @@ def test_invalid_chat(messages, api_url, headers): # Test for input exceeding context length # ========================== + def test_exceed_context_length(api_url, headers): """ Test case for inputs that exceed the model's maximum context length. 
@@ -302,9 +323,7 @@ def test_exceed_context_length(api_url, headers): # Construct an overly long message long_content = "你好," * 20000 - messages = [ - {"role": "user", "content": long_content} - ] + messages = [{"role": "user", "content": long_content}] resp = requests.post(api_url, headers=headers, json={"messages": messages}) @@ -315,8 +334,10 @@ def test_exceed_context_length(api_url, headers): response_json = {} # Check status code and response content - assert resp.status_code != 200 or "token" in json.dumps(response_json).lower(), \ - "Expected token limit error or similar, but got a normal response: {}".format(response_json) + assert ( + resp.status_code != 200 or "token" in json.dumps(response_json).lower() + ), f"Expected token limit error or similar, but got a normal response: {response_json}" + # ========================== # Multi-turn Conversation Test @@ -328,12 +349,13 @@ def test_multi_turn_conversation(api_url, headers): messages = [ {"role": "user", "content": "你是谁?"}, {"role": "assistant", "content": "我是AI助手"}, - {"role": "user", "content": "你能做什么?"} + {"role": "user", "content": "你能做什么?"}, ] resp = requests.post(api_url, headers=headers, json={"messages": messages}) assert resp.status_code == 200 validate(instance=resp.json(), schema=chat_response_schema) + # ========================== # Concurrent Performance Test # ========================== @@ -357,17 +379,19 @@ def test_concurrent_perf(api_url, headers): print("\nResponse time for each request:", durations) + # ========================== # Metrics Endpoint Test # ========================== + def test_metrics_endpoint(metrics_url): """ Test the metrics monitoring endpoint. """ resp = requests.get(metrics_url, timeout=5) - assert resp.status_code == 200, "Unexpected status code: {}".format(resp.status_code) + assert resp.status_code == 200, f"Unexpected status code: {resp.status_code}" assert "text/plain" in resp.headers["Content-Type"], "Content-Type is not text/plain" # Parse Prometheus metrics data @@ -477,20 +501,23 @@ def test_metrics_endpoint(metrics_url): assert request_params_max_tokens_sum_found, "缺少 fastdeploy:request_params_max_tokens_sum 指标" assert request_success_total_found, "缺少 fastdeploy:request_success_total 指标" + # ========================== # OpenAI Client chat.completions Test # ========================== + @pytest.fixture def openai_client(): ip = "0.0.0.0" service_http_port = str(FD_API_PORT) client = openai.Client( - base_url = "http://{}:{}/v1".format(ip, service_http_port), - api_key="EMPTY_API_KEY" + base_url=f"http://{ip}:{service_http_port}/v1", + api_key="EMPTY_API_KEY", ) return client + # Non-streaming test def test_non_streaming_chat(openai_client): """Test non-streaming chat functionality with the local service""" @@ -505,10 +532,11 @@ def test_non_streaming_chat(openai_client): stream=False, ) - assert hasattr(response, 'choices') + assert hasattr(response, "choices") assert len(response.choices) > 0 - assert hasattr(response.choices[0], 'message') - assert hasattr(response.choices[0].message, 'content') + assert hasattr(response.choices[0], "message") + assert hasattr(response.choices[0].message, "content") + # Streaming test def test_streaming_chat(openai_client, capsys): @@ -518,7 +546,10 @@ def test_streaming_chat(openai_client, capsys): messages=[ {"role": "system", "content": "You are a helpful AI assistant."}, {"role": "user", "content": "List 3 countries and their capitals."}, - {"role": "assistant", "content": "China(Beijing), France(Paris), Australia(Canberra)."}, + { + 
"role": "assistant", + "content": "China(Beijing), France(Paris), Australia(Canberra).", + }, {"role": "user", "content": "OK, tell more."}, ], temperature=1, @@ -528,14 +559,16 @@ def test_streaming_chat(openai_client, capsys): output = [] for chunk in response: - if hasattr(chunk.choices[0], 'delta') and hasattr(chunk.choices[0].delta, 'content'): + if hasattr(chunk.choices[0], "delta") and hasattr(chunk.choices[0].delta, "content"): output.append(chunk.choices[0].delta.content) assert len(output) > 2 + # ========================== # OpenAI Client completions Test # ========================== + def test_non_streaming(openai_client): """Test non-streaming chat functionality with the local service""" response = openai_client.completions.create( @@ -547,7 +580,7 @@ def test_non_streaming(openai_client): ) # Assertions to check the response structure - assert hasattr(response, 'choices') + assert hasattr(response, "choices") assert len(response.choices) > 0 @@ -560,9 +593,9 @@ def test_streaming(openai_client, capsys): max_tokens=1024, stream=True, ) - + # Collect streaming output output = [] for chunk in response: output.append(chunk.choices[0].text) - assert len(output) > 0 \ No newline at end of file + assert len(output) > 0 diff --git a/test/ci_use/Qwen3-MoE/test_Qwen3-MoE_serving.py b/test/ci_use/Qwen3-MoE/test_Qwen3-MoE_serving.py index 092b1282f..fbe1ea48c 100644 --- a/test/ci_use/Qwen3-MoE/test_Qwen3-MoE_serving.py +++ b/test/ci_use/Qwen3-MoE/test_Qwen3-MoE_serving.py @@ -12,15 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytest -import requests -import time -import subprocess -import socket import os import signal +import socket +import subprocess import sys +import time +import pytest +import requests # Read ports from environment variables; use default values if not set FD_API_PORT = int(os.getenv("FD_API_PORT", 8188)) @@ -30,6 +30,7 @@ FD_METRICS_PORT = int(os.getenv("FD_METRICS_PORT", 8233)) # List of ports to clean before and after tests PORTS_TO_CLEAN = [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT] + def is_port_open(host: str, port: int, timeout=1.0): """ Check if a TCP port is open on the given host. @@ -41,19 +42,21 @@ def is_port_open(host: str, port: int, timeout=1.0): except Exception: return False + def kill_process_on_port(port: int): """ Kill processes that are listening on the given port. Uses `lsof` to find process ids and sends SIGKILL. """ try: - output = subprocess.check_output("lsof -i:{} -t".format(port), shell=True).decode().strip() + output = subprocess.check_output(f"lsof -i:{port} -t", shell=True).decode().strip() for pid in output.splitlines(): os.kill(int(pid), signal.SIGKILL) - print("Killed process on port {}, pid={}".format(port, pid)) + print(f"Killed process on port {port}, pid={pid}") except subprocess.CalledProcessError: pass + def clean_ports(): """ Kill all processes occupying the ports listed in PORTS_TO_CLEAN. 
@@ -61,6 +64,7 @@ def clean_ports(): for port in PORTS_TO_CLEAN: kill_process_on_port(port) + @pytest.fixture(scope="session", autouse=True) def setup_and_run_server(): """ @@ -75,21 +79,31 @@ def setup_and_run_server(): base_path = os.getenv("MODEL_PATH") if base_path: - model_path=os.path.join(base_path, "Qwen3-30B-A3B") + model_path = os.path.join(base_path, "Qwen3-30B-A3B") else: - model_path="./Qwen3-30B-A3B" + model_path = "./Qwen3-30B-A3B" log_path = "server.log" cmd = [ - sys.executable, "-m", "fastdeploy.entrypoints.openai.api_server", - "--model", model_path, - "--port", str(FD_API_PORT), - "--tensor-parallel-size", "1", - "--engine-worker-queue-port", str(FD_ENGINE_QUEUE_PORT), - "--metrics-port", str(FD_METRICS_PORT), - "--max-model-len", "32768", - "--max-num-seqs", "50", - "--quantization", "wint4" + sys.executable, + "-m", + "fastdeploy.entrypoints.openai.api_server", + "--model", + model_path, + "--port", + str(FD_API_PORT), + "--tensor-parallel-size", + "1", + "--engine-worker-queue-port", + str(FD_ENGINE_QUEUE_PORT), + "--metrics-port", + str(FD_METRICS_PORT), + "--max-model-len", + "32768", + "--max-num-seqs", + "50", + "--quantization", + "wint4", ] # Start subprocess in new process group @@ -98,13 +112,13 @@ def setup_and_run_server(): cmd, stdout=logfile, stderr=subprocess.STDOUT, - start_new_session=True # Enables killing full group via os.killpg + start_new_session=True, # Enables killing full group via os.killpg ) # Wait up to 300 seconds for API server to be ready for _ in range(300): if is_port_open("127.0.0.1", FD_API_PORT): - print("API server is up on port {}".format(FD_API_PORT)) + print(f"API server is up on port {FD_API_PORT}") break time.sleep(1) else: @@ -112,17 +126,17 @@ def setup_and_run_server(): try: os.killpg(process.pid, signal.SIGTERM) except Exception as e: - print("Failed to kill process group: {}".format(e)) - raise RuntimeError("API server did not start on port {}".format(FD_API_PORT)) + print(f"Failed to kill process group: {e}") + raise RuntimeError(f"API server did not start on port {FD_API_PORT}") yield # Run tests print("\n===== Post-test server cleanup... =====") try: os.killpg(process.pid, signal.SIGTERM) - print("API server (pid={}) terminated".format(process.pid)) + print(f"API server (pid={process.pid}) terminated") except Exception as e: - print("Failed to terminate API server: {}".format(e)) + print(f"Failed to terminate API server: {e}") @pytest.fixture(scope="session") @@ -130,7 +144,7 @@ def api_url(request): """ Returns the API endpoint URL for chat completions. """ - return "http://0.0.0.0:{}/v1/chat/completions".format(FD_API_PORT) + return f"http://0.0.0.0:{FD_API_PORT}/v1/chat/completions" @pytest.fixture(scope="session") @@ -138,7 +152,7 @@ def metrics_url(request): """ Returns the metrics endpoint URL. """ - return "http://0.0.0.0:{}/metrics".format(FD_METRICS_PORT) + return f"http://0.0.0.0:{FD_METRICS_PORT}/metrics" @pytest.fixture @@ -148,6 +162,7 @@ def headers(): """ return {"Content-Type": "application/json"} + @pytest.fixture def consistent_payload(): """ @@ -155,12 +170,18 @@ def consistent_payload(): including a fixed random seed and temperature. 
""" return { - "messages": [{"role": "user", "content": "用一句话介绍 PaddlePaddle, 30字以内 /no_think"}], + "messages": [ + { + "role": "user", + "content": "用一句话介绍 PaddlePaddle, 30字以内 /no_think", + } + ], "temperature": 0.8, "top_p": 0, # fix top_p to reduce randomness - "seed": 13 # fixed random seed + "seed": 13, # fixed random seed } + # ========================== # Helper function to calculate difference rate between two texts # ========================== @@ -189,6 +210,7 @@ def calculate_diff_rate(text1, text2): max_len = max(len1, len2) return edit_distance / max_len if max_len > 0 else 0.0 + # ========================== # Consistency test for repeated runs with fixed payload # ========================== @@ -212,65 +234,64 @@ def test_consistency_between_runs(api_url, headers, consistent_payload): diff_rate = calculate_diff_rate(content1, content2) # Verify that the difference rate is below the threshold - assert diff_rate < 0.05, "Output difference too large ({:.4%})".format(diff_rate) + assert diff_rate < 0.05, f"Output difference too large ({diff_rate:.4%})" + # ========================== # think Prompt Test # ========================== + def test_thinking_prompt(api_url, headers): """ Test case to verify normal 'thinking' behavior (no '/no_think' appended). """ - messages = [ - {"role": "user", "content": "北京天安门在哪里"} - ] + messages = [{"role": "user", "content": "北京天安门在哪里"}] payload = { "messages": messages, "max_tokens": 100, "temperature": 0.8, - "top_p": 0.01 + "top_p": 0.01, } resp = requests.post(api_url, headers=headers, json=payload) - assert resp.status_code == 200, "Unexpected status code: {}".format(resp.status_code) + assert resp.status_code == 200, f"Unexpected status code: {resp.status_code}" try: response_json = resp.json() except Exception as e: - assert False, "Response is not valid JSON: {}".format(e) - + assert False, f"Response is not valid JSON: {e}" + content = response_json.get("choices", [{}])[0].get("message", {}).get("content", "").lower() assert "天安门" in content or "北京" in content, "Expected a location-related response with reasoning" + # ========================== # no_think Prompt Test # ========================== + def test_non_thinking_prompt(api_url, headers): """ Test case to verify non-thinking behavior (with '/no_think'). 
""" - messages = [ - {"role": "user", "content": "北京天安门在哪里 /no_think"} - ] + messages = [{"role": "user", "content": "北京天安门在哪里 /no_think"}] payload = { "messages": messages, "max_tokens": 100, "temperature": 0.8, - "top_p": 0.01 + "top_p": 0.01, } resp = requests.post(api_url, headers=headers, json=payload) - assert resp.status_code == 200, "Unexpected status code: {}".format(resp.status_code) + assert resp.status_code == 200, f"Unexpected status code: {resp.status_code}" try: response_json = resp.json() except Exception as e: - assert False, "Response is not valid JSON: {}".format(e) + assert False, f"Response is not valid JSON: {e}" content = response_json.get("choices", [{}])[0].get("message", {}).get("content", "").lower() - assert not any(x in content for x in ["根据", "我认为", "推测", "可能"]), \ - "Expected no reasoning in non-thinking response" \ No newline at end of file + assert not any(x in content for x in ["根据", "我认为", "推测", "可能"]), "Expected no reasoning in non-thinking response" diff --git a/test/ci_use/XPU_45T/run_45T.py b/test/ci_use/XPU_45T/run_45T.py index 009991f19..876e7cf93 100644 --- a/test/ci_use/XPU_45T/run_45T.py +++ b/test/ci_use/XPU_45T/run_45T.py @@ -15,7 +15,7 @@ import openai ip = "0.0.0.0" -service_http_port = "8188" # 服务配置的 +service_http_port = "8188" # 服务配置的 client = openai.Client(base_url=f"http://{ip}:{service_http_port}/v1", api_key="EMPTY_API_KEY") # 非流式对话 @@ -29,4 +29,4 @@ response = client.chat.completions.create( max_tokens=64, stream=False, ) -print(response) \ No newline at end of file +print(response) diff --git a/test/layers/test_append_attention.py b/test/layers/test_append_attention.py index 2b23566ef..6a7832575 100644 --- a/test/layers/test_append_attention.py +++ b/test/layers/test_append_attention.py @@ -12,11 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import paddle -import unittest -import numpy as np import time +import unittest +import numpy as np +import paddle paddle.seed(10) @@ -25,19 +25,16 @@ class RopeEmbedding: def __init__(self, use_neox_rotary_style=False): self.use_neox_rotary_style = use_neox_rotary_style self.base = 10000 - + def get_neox_style_position_embedding(self, position_ids, head_dim): bsz, max_seq_len = position_ids.shape[:2] - rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, head_dim), - dtype="float32") - inv_freq = self.base**(-paddle.arange(0, head_dim, 2, dtype="float32") / head_dim) + rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, head_dim), dtype="float32") + inv_freq = self.base ** (-paddle.arange(0, head_dim, 2, dtype="float32") / head_dim) # shape: [B, S, D/2] - freqs = paddle.einsum("ij,k->ijk", position_ids.cast("float32"), - inv_freq) + freqs = paddle.einsum("ij,k->ijk", position_ids.cast("float32"), inv_freq) # shape: [B, S, 1, D] - emb = paddle.concat([freqs, freqs], axis=-1).reshape( - (bsz, max_seq_len, 1, head_dim)) + emb = paddle.concat([freqs, freqs], axis=-1).reshape((bsz, max_seq_len, 1, head_dim)) rot_emb[0] = paddle.cos(emb) rot_emb[1] = paddle.sin(emb) @@ -45,21 +42,13 @@ class RopeEmbedding: def get_rotary_position_embedding(self, position_ids, head_dim): bsz, max_seq_len = position_ids.shape[:2] - rot_emb = paddle.zeros( - (2, bsz, max_seq_len, 1, head_dim // 2), dtype="float32" - ) - inv_freq = self.base ** ( - -paddle.arange(0, head_dim, 2, dtype="float32") / head_dim - ) + rot_emb = paddle.zeros((2, bsz, max_seq_len, 1, head_dim // 2), dtype="float32") + inv_freq = self.base ** (-paddle.arange(0, head_dim, 2, dtype="float32") / head_dim) # shape: [B, S, D/2] - freqs = paddle.einsum( - "ij,k->ijk", position_ids.cast("float32"), inv_freq - ) + freqs = paddle.einsum("ij,k->ijk", position_ids.cast("float32"), inv_freq) # shape: [B, S, D/2] - emb = paddle.stack([freqs], axis=-1).reshape( - (bsz, max_seq_len, head_dim // 2) - ) + emb = paddle.stack([freqs], axis=-1).reshape((bsz, max_seq_len, head_dim // 2)) # shape: [B, S, 1, D] emb = paddle.unsqueeze(emb, 2) @@ -73,31 +62,39 @@ class RopeEmbedding: # sin, cos = paddle.chunk(rp, 2, axis=-1) seq, head_dim = q.shape[2], q.shape[3] cos, sin = paddle.chunk(rotary_emb, 2, axis=0) - cos = paddle.squeeze(cos, axis=0).transpose( - [0, 2, 1, 3])[:, :, :seq, :] - sin = paddle.squeeze(sin, axis=0).transpose( - [0, 2, 1, 3])[:, :, :seq, :] + cos = paddle.squeeze(cos, axis=0).transpose([0, 2, 1, 3])[:, :, :seq, :] + sin = paddle.squeeze(sin, axis=0).transpose([0, 2, 1, 3])[:, :, :seq, :] # sin [θ0,θ1,θ2......θd/2-1] -> sin_pos [θ0,θ0,θ1,θ1,θ2,θ2......θd/2-1,θd/2-1] - + if self.use_neox_rotary_style: sin_pos = sin cos_pos = cos # NeoX Stype:前后半部分分块旋转 rotate_half_q = paddle.reshape( - paddle.stack([-q[:, :, :, q.shape[-1]//2:], q[:, :, :, :q.shape[-1]//2]], axis=-1), + paddle.stack( + [ + -q[:, :, :, q.shape[-1] // 2 :], + q[:, :, :, : q.shape[-1] // 2], + ], + axis=-1, + ), paddle.shape(q), ) rotate_half_k = paddle.reshape( - paddle.stack([-k[:, :, :, k.shape[-1]//2:], k[:, :, :, :k.shape[-1]//2]], axis=-1), + paddle.stack( + [ + -k[:, :, :, k.shape[-1] // 2 :], + k[:, :, :, : k.shape[-1] // 2], + ], + axis=-1, + ), paddle.shape(k), ) else: # import pdb;pdb.set_trace() - sin_pos = paddle.reshape(paddle.stack( - [sin, sin], axis=-1), [1, 1, seq, head_dim]) + sin_pos = paddle.reshape(paddle.stack([sin, sin], axis=-1), [1, 1, seq, head_dim]) # cos [θ0,θ1,θ2......θd/2-1] -> cos_pos [θ0,θ0,θ1,θ1,θ2,θ2......θd/2-1,θd/2-1] - cos_pos = 
paddle.reshape(paddle.stack( - [cos, cos], axis=-1), [1, 1, seq, head_dim]) + cos_pos = paddle.reshape(paddle.stack([cos, cos], axis=-1), [1, 1, seq, head_dim]) # GPT Stype:奇偶位置分块旋转 rotate_half_q = paddle.reshape( paddle.stack([-q[:, :, :, 1::2], q[:, :, :, 0::2]], axis=-1), @@ -108,15 +105,9 @@ class RopeEmbedding: paddle.shape(k), ) - query = paddle.add( - paddle.multiply(q, cos_pos), paddle.multiply( - rotate_half_q, sin_pos) - ) + query = paddle.add(paddle.multiply(q, cos_pos), paddle.multiply(rotate_half_q, sin_pos)) - key = paddle.add( - paddle.multiply(k, cos_pos), paddle.multiply( - rotate_half_k, sin_pos) - ) + key = paddle.add(paddle.multiply(k, cos_pos), paddle.multiply(rotate_half_k, sin_pos)) return paddle.cast(query, q.dtype), paddle.cast(key, k.dtype) @@ -137,30 +128,19 @@ def create_attn_mask( for i in range(batch_size): seq_len = seq_lens[i] mask[i, 0, :seq_len, :seq_len] = ( - paddle.tril(paddle.ones(shape=(seq_len, seq_len), dtype=mask_type)) - - 1 + paddle.tril(paddle.ones(shape=(seq_len, seq_len), dtype=mask_type)) - 1 ) * 1e4 return mask -def block_cache_to_naive_cache( - cache_k, cache_v, bsz, block_tables, cache_seq_len -): +def block_cache_to_naive_cache(cache_k, cache_v, bsz, block_tables, cache_seq_len): _, num_head, blocksize, dim_head = cache_k.shape - out_cache_k = paddle.zeros( - shape=[bsz, num_head, cache_seq_len, dim_head], dtype=cache_k.dtype - ) - out_cache_v = paddle.zeros( - shape=[bsz, num_head, cache_seq_len, dim_head], dtype=cache_v.dtype - ) + out_cache_k = paddle.zeros(shape=[bsz, num_head, cache_seq_len, dim_head], dtype=cache_k.dtype) + out_cache_v = paddle.zeros(shape=[bsz, num_head, cache_seq_len, dim_head], dtype=cache_v.dtype) for i in range(bsz): for j in range(cache_seq_len): - out_cache_k[i, :, j, :] = cache_k[ - block_tables[i, j // blocksize], :, j % blocksize, : - ] - out_cache_v[i, :, j, :] = cache_v[ - block_tables[i, j // blocksize], :, j % blocksize, : - ] + out_cache_k[i, :, j, :] = cache_k[block_tables[i, j // blocksize], :, j % blocksize, :] + out_cache_v[i, :, j, :] = cache_v[block_tables[i, j // blocksize], :, j % blocksize, :] return out_cache_k, out_cache_v @@ -209,8 +189,7 @@ def naive_attention_impl( if mask is not None: attention = attention + mask softmax_result = paddle.nn.functional.softmax(attention, -1) - result = paddle.matmul(paddle.cast( - softmax_result, dtype=value.dtype), value) + result = paddle.matmul(paddle.cast(softmax_result, dtype=value.dtype), value) return result @@ -235,9 +214,7 @@ def get_padding_offset(bsz, max_seq_len, seq_lens_this_time): def remove_padding(seq_lens, cu_seq_lens, inputs, token_num): bsz, num_head, seq_len, dim_head = inputs.shape - output = paddle.zeros( - shape=[token_num, num_head * dim_head], dtype=inputs.dtype - ) + output = paddle.zeros(shape=[token_num, num_head * dim_head], dtype=inputs.dtype) inputs = inputs.transpose([0, 2, 1, 3]).reshape([bsz, seq_len, -1]) for i in range(bsz): seq_len_now = seq_lens[i] @@ -248,38 +225,34 @@ def remove_padding(seq_lens, cu_seq_lens, inputs, token_num): def get_qkv_and_qkv_concat_tensor(bs, q_num_head, kv_num_head, seq_len, dim_head, place, dtype): - query = np.random.random([bs, q_num_head, seq_len, dim_head])/10 - q = paddle.to_tensor( - query, place=place, dtype=dtype, stop_gradient=False - ) - key = np.random.random([bs, kv_num_head, seq_len, dim_head])/10 - k = paddle.to_tensor( - key, place=place, dtype=dtype, stop_gradient=False - ) - value = np.random.random([bs, kv_num_head, seq_len, dim_head])/10 - v = paddle.to_tensor( - value, 
place=place, dtype=dtype, stop_gradient=False - ) - token_num = bs*seq_len + query = np.random.random([bs, q_num_head, seq_len, dim_head]) / 10 + q = paddle.to_tensor(query, place=place, dtype=dtype, stop_gradient=False) + key = np.random.random([bs, kv_num_head, seq_len, dim_head]) / 10 + k = paddle.to_tensor(key, place=place, dtype=dtype, stop_gradient=False) + value = np.random.random([bs, kv_num_head, seq_len, dim_head]) / 10 + v = paddle.to_tensor(value, place=place, dtype=dtype, stop_gradient=False) + token_num = bs * seq_len qkv = paddle.concat( [ - q.transpose([0, 2, 1, 3]).reshape( - [token_num, q_num_head*dim_head] - ), - k.transpose([0, 2, 1, 3]).reshape( - [token_num, kv_num_head*dim_head] - ), - v.transpose([0, 2, 1, 3]).reshape( - [token_num, kv_num_head*dim_head] - ), + q.transpose([0, 2, 1, 3]).reshape([token_num, q_num_head * dim_head]), + k.transpose([0, 2, 1, 3]).reshape([token_num, kv_num_head * dim_head]), + v.transpose([0, 2, 1, 3]).reshape([token_num, kv_num_head * dim_head]), ], axis=1, ).reshape([token_num, -1]) return q, k, v, qkv -def split_query_by_phase(query, seq_lens_encoder, seq_lens_decoder, seq_lens_this_time, q_dim, k_dim, v_dim): +def split_query_by_phase( + query, + seq_lens_encoder, + seq_lens_decoder, + seq_lens_this_time, + q_dim, + k_dim, + v_dim, +): """ Split the query into encoder and decoder Q/K/V. """ @@ -292,8 +265,8 @@ def split_query_by_phase(query, seq_lens_encoder, seq_lens_decoder, seq_lens_thi query = paddle.reshape(query, [batch, max_seq, total_dim]) # Compute masks indicating whether each batch entry is encoder/decoder - is_encoder = (seq_lens_encoder > 0).astype('bool').reshape([-1]) # [batch] - is_decoder = (seq_lens_decoder > 0).astype('bool').reshape([-1]) # [batch] + is_encoder = (seq_lens_encoder > 0).astype("bool").reshape([-1]) # [batch] + is_decoder = (seq_lens_decoder > 0).astype("bool").reshape([-1]) # [batch] # Prepare output lists enc_qs, enc_ks, enc_vs = [], [], [] @@ -330,8 +303,8 @@ def split_query_by_phase(query, seq_lens_encoder, seq_lens_decoder, seq_lens_thi return (enc_q, enc_k, enc_v), (dec_q, dec_k, dec_v) + class TestAppendGroupQueryAttnWithRope(unittest.TestCase): - def setUp(self): paddle.disable_static() self.name = "TestAppendGroupQueryAttnWithRope" @@ -350,14 +323,11 @@ class TestAppendGroupQueryAttnWithRope(unittest.TestCase): self.max_seq_len = self.seq_len + self.max_dec_len self.softmax_scale = self.dim_head**-0.5 self.rope_theta = 10000 - self.dtype = 'float16' + self.dtype = "float16" self.init_tensor() - def init_tensor(self): - self.block_num_per_seq = ( - self.seq_len + self.max_dec_len + self.blocksize - 1 - ) // self.blocksize + self.block_num_per_seq = (self.seq_len + self.max_dec_len + self.blocksize - 1) // self.blocksize self.rope = RopeEmbedding(self.use_neox_rotary_style) self.max_block_num = self.block_num_per_seq * self.batch_size self.free_list = list(range(self.max_block_num - 1, -1, -1)) @@ -378,10 +348,8 @@ class TestAppendGroupQueryAttnWithRope(unittest.TestCase): self.seq_lens_dec, "int32", ) - self.max_enc_len_this_time = paddle.to_tensor( - [self.max_enc_len_this_time], "int32", place=paddle.CPUPlace()) - self.max_dec_len_this_time = paddle.to_tensor( - [self.max_dec_len_this_time], "int32", place=paddle.CPUPlace()) + self.max_enc_len_this_time = paddle.to_tensor([self.max_enc_len_this_time], "int32", place=paddle.CPUPlace()) + self.max_dec_len_this_time = paddle.to_tensor([self.max_dec_len_this_time], "int32", place=paddle.CPUPlace()) self.seq_lens_this_time = self.seq_lens_encoder self.cache_shape = ( @@ -390,17 +358,13 @@ class
TestAppendGroupQueryAttnWithRope(unittest.TestCase): self.blocksize, self.dim_head, ) - + self.scale = 1.0 / np.sqrt(self.dim_head) self.cache_k = paddle.zeros(shape=self.cache_shape, dtype=self.dtype) self.cache_v = paddle.zeros(shape=self.cache_shape, dtype=self.dtype) - self.block_tables = paddle.zeros( - shape=(self.batch_size, self.block_num_per_seq), dtype="int32" - ) + self.block_tables = paddle.zeros(shape=(self.batch_size, self.block_num_per_seq), dtype="int32") for i in range(self.batch_size): - need_block_num = ( - self.seq_len + self.max_dec_len + self.blocksize - 1 - ) // self.blocksize + need_block_num = (self.seq_len + self.max_dec_len + self.blocksize - 1) // self.blocksize for j in range(need_block_num): self.block_tables[i, j] = self.free_list.pop() ( @@ -408,15 +372,12 @@ class TestAppendGroupQueryAttnWithRope(unittest.TestCase): self.cum_offset, self.cu_seqlens_q, self.cu_seqlens_k, - ) = get_padding_offset( - self.batch_size, self.seq_len, self.seq_lens_this_time - ) + ) = get_padding_offset(self.batch_size, self.seq_len, self.seq_lens_this_time) self.token_num = self.padding_offset.shape[0] - def cmp_append_attention(self, naive_cache_k=None, naive_cache_v=None, attn_mask=None): paddle.disable_static() - self.token_num = self.seq_len*self.batch_size + self.token_num = self.seq_len * self.batch_size q, k, v, qkv = get_qkv_and_qkv_concat_tensor( self.batch_size, self.q_num_head, @@ -424,19 +385,27 @@ class TestAppendGroupQueryAttnWithRope(unittest.TestCase): self.seq_len, self.dim_head, self.place, - self.dtype + self.dtype, ) q, k = self.rope._apply_rope(self.rope_emb, q, k, causal=True) out_ = naive_attention_impl( - q, k, v, naive_cache_k, naive_cache_v, None, None, attn_mask, self.scale - ) - out_ = remove_padding( - self.seq_lens_this_time, self.cu_seqlens_q, out_, self.token_num + q, + k, + v, + naive_cache_k, + naive_cache_v, + None, + None, + attn_mask, + self.scale, ) + out_ = remove_padding(self.seq_lens_this_time, self.cu_seqlens_q, out_, self.token_num) speculate_max_draft_token_num = 1 - from fastdeploy.model_executor.layers.attention.ops import append_attention - from fastdeploy.model_executor.layers.attention.ops import get_block_shape_and_split_kv_block + from fastdeploy.model_executor.layers.attention.ops import ( + append_attention, + get_block_shape_and_split_kv_block, + ) ( encoder_batch_ids, @@ -457,15 +426,15 @@ class TestAppendGroupQueryAttnWithRope(unittest.TestCase): self.cum_offset, 64, 12, - (self.q_num_head + 2*self.kv_num_head) // self.kv_num_head, + (self.q_num_head + 2 * self.kv_num_head) // self.kv_num_head, self.blocksize, - speculate_max_draft_token_num+1, + speculate_max_draft_token_num + 1, ) # Warm up WARM_UP = 1 RUN_TIME = 2 - for i in range(WARM_UP+RUN_TIME): + for i in range(WARM_UP + RUN_TIME): if i == WARM_UP: paddle.device.synchronize() start_time = time.time() @@ -515,17 +484,13 @@ class TestAppendGroupQueryAttnWithRope(unittest.TestCase): 16, # decoder_block_shape_q 32768, # max_partition_size 32768, # encoder_max_partition_size - speculate_max_draft_token_num+1, # speculate_max_draft_token_num + speculate_max_draft_token_num + 1, # speculate_max_draft_token_num True, # causal False, # speculate_decoder )[0] paddle.device.synchronize() end_time = time.time() - print( - "[append-attn ut] cost_time:{}ms".format( - (end_time - start_time) / RUN_TIME * 1000 - ) - ) + print(f"[append-attn ut] cost_time:{(end_time - start_time) / RUN_TIME * 1000}ms") naive_cache_k, naive_cache_v = block_cache_to_naive_cache( self.cache_k, 
self.cache_v, @@ -541,16 +506,12 @@ class TestAppendGroupQueryAttnWithRope(unittest.TestCase): ) def test_all(self): - tmp_position_ids = paddle.arange( - self.seq_len + self.max_dec_len - ).reshape((1, -1)) + tmp_position_ids = paddle.arange(self.seq_len + self.max_dec_len).reshape((1, -1)) # appendattn 传的是最大maxseq if self.use_neox_rotary_style: self.rope_emb = self.rope.get_neox_style_position_embedding(tmp_position_ids, self.dim_head) else: - self.rope_emb = self.rope.get_rotary_position_embedding( - tmp_position_ids, self.dim_head - ) + self.rope_emb = self.rope.get_rotary_position_embedding(tmp_position_ids, self.dim_head) self.attention_mask = create_attn_mask( self.dtype, self.batch_size, @@ -582,10 +543,8 @@ class TestAppendGroupQueryAttnWithRope(unittest.TestCase): ] * self.batch_size self.max_enc_len_this_time = max(self.seq_lens_enc) self.max_dec_len_this_time = max(self.seq_lens_dec) - self.max_enc_len_this_time = paddle.to_tensor( - [self.max_enc_len_this_time], "int32", place=paddle.CPUPlace()) - self.max_dec_len_this_time = paddle.to_tensor( - [self.max_dec_len_this_time], "int32", place=paddle.CPUPlace()) + self.max_enc_len_this_time = paddle.to_tensor([self.max_enc_len_this_time], "int32", place=paddle.CPUPlace()) + self.max_dec_len_this_time = paddle.to_tensor([self.max_dec_len_this_time], "int32", place=paddle.CPUPlace()) self.seq_len = 1 ( @@ -596,6 +555,7 @@ class TestAppendGroupQueryAttnWithRope(unittest.TestCase): ) = get_padding_offset(self.batch_size, 1, self.seq_lens_this_time) self.cmp_append_attention(naive_cache_k, naive_cache_v, None) + class TestAppendGroupQueryAttnWithNeoXRope(TestAppendGroupQueryAttnWithRope): def setUp(self): paddle.disable_static() @@ -615,10 +575,9 @@ class TestAppendGroupQueryAttnWithNeoXRope(TestAppendGroupQueryAttnWithRope): self.max_seq_len = self.seq_len + self.max_dec_len self.softmax_scale = self.dim_head**-0.5 self.rope_theta = 10000 - self.dtype = 'float16' + self.dtype = "float16" self.init_tensor() - - -if __name__ == '__main__': + +if __name__ == "__main__": unittest.main() diff --git a/test/layers/test_attention.py b/test/layers/test_attention.py index 989ecd4e2..5a9816454 100644 --- a/test/layers/test_attention.py +++ b/test/layers/test_attention.py @@ -19,13 +19,14 @@ import unittest import paddle -from fastdeploy.model_executor.layers.attention import ( - Attention, PaddleNativeAttnBackend) from fastdeploy.model_executor.forward_meta import ForwardMeta, ForwardMode +from fastdeploy.model_executor.layers.attention import ( + Attention, + PaddleNativeAttnBackend, +) class MockModelRunner: - def __init__( self, page_size=1, @@ -53,19 +54,15 @@ class MockModelRunner: (), { # A typical max_bs * max_context_len for cuda graph decode - "size": - max_batch_size, + "size": max_batch_size, # Add req_to_token attribute - "req_to_token": - paddle.zeros([max_batch_size, max_context_len], - dtype=paddle.int32), + "req_to_token": paddle.zeros([max_batch_size, max_context_len], dtype=paddle.int32), }, ) self.page_size = page_size class TestNativePaddleAttentionBackend(unittest.TestCase): - def setUp(self): # Test parameters self.batch_size = 2 @@ -90,11 +87,10 @@ class TestNativePaddleAttentionBackend(unittest.TestCase): # so we need to multiply the index by page_size. 
self.req_to_token = ( paddle.arange(0, batch_size, dtype=paddle.int32)[:, None] * seq_len - + paddle.arange(0, seq_len, dtype=paddle.int32)[None, :] + - page_size) - self.model_runner.req_to_token_pool.req_to_token[:batch_size, : - seq_len] = ( - self.req_to_token) + + paddle.arange(0, seq_len, dtype=paddle.int32)[None, :] + + page_size + ) + self.model_runner.req_to_token_pool.req_to_token[:batch_size, :seq_len] = self.req_to_token def _create_attention_layer(self): """Create attention layer for testing.""" @@ -114,15 +110,12 @@ class TestNativePaddleAttentionBackend(unittest.TestCase): paddle.randn(shape, dtype=self.dtype), ) - def _run_reference_forward(self, mode, q, k, v, layer, forward_batch, - expected_shape): + def _run_reference_forward(self, mode, q, k, v, layer, forward_batch, expected_shape): """Run reference forward pass using native backend.""" if mode == ForwardMode.EXTEND: - output = self.ref_backend.forward_extend(q, k, v, layer, - forward_batch) + output = self.ref_backend.forward_extend(q, k, v, layer, forward_batch) else: # ForwardMode.DECODE - output = self.ref_backend.forward_decode(q, k, v, layer, - forward_batch) + output = self.ref_backend.forward_decode(q, k, v, layer, forward_batch) return output.view(expected_shape) def _verify_output(self, output, expected_shape, output_ref=None): @@ -133,33 +126,28 @@ class TestNativePaddleAttentionBackend(unittest.TestCase): f"Expected shape {expected_shape}, got {output.shape}", ) self.assertEqual(output.dtype, self.dtype) - self.assertEqual( - paddle.isnan(output).sum().item(), 0, "Output contains NaN values") + self.assertEqual(paddle.isnan(output).sum().item(), 0, "Output contains NaN values") if output_ref is not None: if not paddle.allclose(output, output_ref, atol=1e-1, rtol=0.0): # Check where the values differ beyond the given tolerances - diff_mask = ~paddle.isclose( - output, output_ref, atol=1e-1, rtol=0.0) + diff_mask = ~paddle.isclose(output, output_ref, atol=1e-1, rtol=0.0) # Find the first index where the difference occurs if diff_mask.any(): first_mismatch_idx = diff_mask.nonzero()[0] - print("First mismatch at index:", - tuple(first_mismatch_idx.tolist())) - print("output:", - output[tuple(first_mismatch_idx.tolist())]) - print("output_ref:", - output_ref[tuple(first_mismatch_idx.tolist())]) - raise AssertionError( - "Attention output is not close to the torch native backend output" - ) + print( + "First mismatch at index:", + tuple(first_mismatch_idx.tolist()), + ) + print("output:", output[tuple(first_mismatch_idx.tolist())]) + print( + "output_ref:", + output_ref[tuple(first_mismatch_idx.tolist())], + ) + raise AssertionError("Attention output is not close to the torch native backend output") - def _create_forward_batch(self, - mode, - q_len=None, - prefix_len=0, - page_size=1): + def _create_forward_batch(self, mode, q_len=None, prefix_len=0, page_size=1): """Create a forward batch for testing based on mode and lengths.""" self._init_model_runner(page_size=page_size) @@ -179,16 +167,11 @@ class TestNativePaddleAttentionBackend(unittest.TestCase): forward_mode=mode, req_pool_indices=paddle.arange(self.batch_size), seq_lens=paddle.to_tensor([total_len] * self.batch_size), - extend_prefix_lens=paddle.to_tensor([prefix_len] * - self.batch_size), + extend_prefix_lens=paddle.to_tensor([prefix_len] * self.batch_size), extend_seq_lens=paddle.to_tensor([q_len] * self.batch_size), - seq_lens_cpu=paddle.to_tensor([total_len] * self.batch_size, - place="cpu"), - extend_prefix_lens_cpu=paddle.to_tensor([prefix_len] * 
- self.batch_size, - place="cpu"), - extend_seq_lens_cpu=paddle.to_tensor([q_len] * self.batch_size, - place="cpu"), + seq_lens_cpu=paddle.to_tensor([total_len] * self.batch_size, place="cpu"), + extend_prefix_lens_cpu=paddle.to_tensor([prefix_len] * self.batch_size, place="cpu"), + extend_seq_lens_cpu=paddle.to_tensor([q_len] * self.batch_size, place="cpu"), attn_backend=self.backend, ) else: # ForwardMode.DECODE @@ -196,8 +179,7 @@ class TestNativePaddleAttentionBackend(unittest.TestCase): total_len = self.seq_len + decode_len if mode == ForwardMode.DECODE and page_size > 1: # Get next page_size multiple of self.seq_len - out_cache_start = (self.batch_size * self.seq_len // page_size - + 1) * page_size + out_cache_start = (self.batch_size * self.seq_len // page_size + 1) * page_size # out_cache_end is the start of the next block out_cache_end = out_cache_start + decode_len * page_size else: @@ -206,16 +188,13 @@ class TestNativePaddleAttentionBackend(unittest.TestCase): forward_batch = ForwardMeta( batch_size=self.batch_size, - input_ids=paddle.randint(0, 100, - (self.batch_size, decode_len)), - out_cache_loc=paddle.to_tensor( - [out_cache_start, out_cache_end]), + input_ids=paddle.randint(0, 100, (self.batch_size, decode_len)), + out_cache_loc=paddle.to_tensor([out_cache_start, out_cache_end]), seq_lens_sum=self.batch_size * total_len, forward_mode=mode, req_pool_indices=paddle.arange(self.batch_size), seq_lens=paddle.to_tensor([total_len] * self.batch_size), - seq_lens_cpu=paddle.to_tensor([total_len] * self.batch_size, - place="cpu"), + seq_lens_cpu=paddle.to_tensor([total_len] * self.batch_size, place="cpu"), attn_backend=self.backend, ) @@ -223,8 +202,7 @@ class TestNativePaddleAttentionBackend(unittest.TestCase): forward_batch.req_to_token_pool = self.model_runner.req_to_token_pool # Write current batch's req_to_token to req_to_token_pool - self._mock_write_to_req_to_token_pool(self.batch_size, total_len, - page_size) + self._mock_write_to_req_to_token_pool(self.batch_size, total_len, page_size) # Add kv pool for this forward batch forward_batch.token_to_kv_pool = self.model_runner.token_to_kv_pool @@ -236,10 +214,13 @@ class TestNativePaddleAttentionBackend(unittest.TestCase): [self.batch_size * cache_len, self.num_heads, self.head_dim], dtype=self.dtype, ) - cache_v = (paddle.ones( - [self.batch_size * cache_len, self.num_heads, self.head_dim], - dtype=self.dtype, - ) * 2) + cache_v = ( + paddle.ones( + [self.batch_size * cache_len, self.num_heads, self.head_dim], + dtype=self.dtype, + ) + * 2 + ) # Set the prefix KV cache forward_batch.token_to_kv_pool.set_kv_buffer( @@ -263,8 +244,7 @@ class TestNativePaddleAttentionBackend(unittest.TestCase): layer = self._create_attention_layer() # Create forward batch and set up - forward_batch = self._create_forward_batch(mode, q_len, prefix_len, - page_size) + forward_batch = self._create_forward_batch(mode, q_len, prefix_len, page_size) # Create QKV tensors for the input q, k, v = self._create_qkv_tensors(self.batch_size * q_len) @@ -291,8 +271,7 @@ class TestNativePaddleAttentionBackend(unittest.TestCase): expected_shape = [self.batch_size, self.num_heads * self.head_dim] output = self.backend.forward_decode(q, k, v, layer, forward_batch) - output_ref = self._run_reference_forward(mode, q, k, v, layer, - forward_batch, expected_shape) + output_ref = self._run_reference_forward(mode, q, k, v, layer, forward_batch, expected_shape) self._verify_output(output, expected_shape, output_ref) @@ -310,15 +289,11 @@ class 
TestNativePaddleAttentionBackend(unittest.TestCase): """Test extending from cached prefix tokens.""" prefix_len = self.seq_len // 2 extend_len = self.seq_len - prefix_len - self._run_attention_test(ForwardMode.EXTEND, - q_len=extend_len, - prefix_len=prefix_len) + self._run_attention_test(ForwardMode.EXTEND, q_len=extend_len, prefix_len=prefix_len) def test_forward_extend_with_page_size_greater_than_1(self): """Test extending from cached prefix tokens with page size greater than 1.""" - self._run_attention_test(ForwardMode.EXTEND, - q_len=self.seq_len, - page_size=64) + self._run_attention_test(ForwardMode.EXTEND, q_len=self.seq_len, page_size=64) def test_forward_decode_with_page_size_greater_than_1(self): """Test decode operation with page size greater than 1.""" diff --git a/test/layers/test_quant_layer.py b/test/layers/test_quant_layer.py index b32984b4b..31be300c1 100644 --- a/test/layers/test_quant_layer.py +++ b/test/layers/test_quant_layer.py @@ -13,5 +13,3 @@ # See the License for the specific language governing permissions and # limitations under the License. """ - -from fastdeploy.model_executor.layers.linear import Linear \ No newline at end of file diff --git a/test/layers/test_sampler.py b/test/layers/test_sampler.py index 53e8f808b..65a6bfbe6 100644 --- a/test/layers/test_sampler.py +++ b/test/layers/test_sampler.py @@ -21,26 +21,19 @@ from fastdeploy.model_executor.layers.sample.sampler import Sampler def _create_fake_logits(batch_size: int, vocab_size: int) -> paddle.Tensor: - fake_logits = paddle.full(shape=[batch_size, vocab_size], - fill_value=1e-2, - dtype="float32") + fake_logits = paddle.full(shape=[batch_size, vocab_size], fill_value=1e-2, dtype="float32") return fake_logits -def _create_penalty_tensor(batch_size: int, - penalty_value: float) -> paddle.Tensor: - return paddle.full(shape=[batch_size, 1], - fill_value=penalty_value, - dtype="float32") +def _create_penalty_tensor(batch_size: int, penalty_value: float) -> paddle.Tensor: + return paddle.full(shape=[batch_size, 1], fill_value=penalty_value, dtype="float32") def _create_tokens_tensor( batch_size: int, max_seq_len: int, ) -> paddle.Tensor: - pre_token_ids = paddle.full(shape=[batch_size, max_seq_len], - fill_value=-1, - dtype="int64") + pre_token_ids = paddle.full(shape=[batch_size, max_seq_len], fill_value=-1, dtype="int64") return pre_token_ids @@ -51,34 +44,18 @@ def _create_default_sampling_metadata( ) -> SamplingMetadata: fake_sampling_metadata = SamplingMetadata( - temperature=paddle.full(shape=[batch_size, 1], - fill_value=0.9, - dtype="float32"), - top_p=paddle.full(shape=[batch_size, 1], - fill_value=0.7, - dtype="float32"), - prompt_ids=paddle.full(shape=[batch_size, max_seq_len], - fill_value=0, - dtype="int64"), - prompt_lens=paddle.full(shape=[batch_size, 1], - fill_value=5, - dtype="int64"), - step_idx=paddle.full(shape=[batch_size, 1], - fill_value=0, - dtype="int64"), + temperature=paddle.full(shape=[batch_size, 1], fill_value=0.9, dtype="float32"), + top_p=paddle.full(shape=[batch_size, 1], fill_value=0.7, dtype="float32"), + prompt_ids=paddle.full(shape=[batch_size, max_seq_len], fill_value=0, dtype="int64"), + prompt_lens=paddle.full(shape=[batch_size, 1], fill_value=5, dtype="int64"), + step_idx=paddle.full(shape=[batch_size, 1], fill_value=0, dtype="int64"), pre_token_ids=_create_tokens_tensor(batch_size, max_seq_len), frequency_penalties=_create_penalty_tensor(batch_size, 0.0), presence_penalties=_create_penalty_tensor(batch_size, 0.0), 
repetition_penalties=_create_penalty_tensor(batch_size, 1.0), - min_dec_lens=paddle.full(shape=[batch_size, 1], - fill_value=min_seq_len, - dtype="int64"), - bad_words_token_ids=paddle.full(shape=[batch_size], - fill_value=-1, - dtype="int64"), - eos_token_ids=paddle.full(shape=[batch_size], - fill_value=-2, - dtype="int64"), + min_dec_lens=paddle.full(shape=[batch_size, 1], fill_value=min_seq_len, dtype="int64"), + bad_words_token_ids=paddle.full(shape=[batch_size], fill_value=-1, dtype="int64"), + eos_token_ids=paddle.full(shape=[batch_size], fill_value=-2, dtype="int64"), ) return fake_sampling_metadata @@ -91,8 +68,7 @@ def test_sampler(): sampler = Sampler() logits = _create_fake_logits(batch_size, vocab_size) - sampling_metadata = _create_default_sampling_metadata( - batch_size, min_seq_len, max_seq_len) + sampling_metadata = _create_default_sampling_metadata(batch_size, min_seq_len, max_seq_len) next_tokens = sampler(logits, sampling_metadata) print(next_tokens) diff --git a/test/operators/test_air_topp_sampling.py b/test/operators/test_air_topp_sampling.py index 7f87740d5..d3ec669cd 100644 --- a/test/operators/test_air_topp_sampling.py +++ b/test/operators/test_air_topp_sampling.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" UT for air_topp_sampling kernel """ +"""UT for air_topp_sampling kernel""" import subprocess import unittest @@ -23,7 +23,6 @@ import fastdeploy.model_executor.ops.gpu class Test(unittest.TestCase): - def setUp(self): """ Initialize. @@ -32,8 +31,7 @@ class Test(unittest.TestCase): np.random.seed(42) print(paddle.device.cuda.get_device_properties()) print(paddle.__git_commit__) - nvcc_output = subprocess.check_output(["nvcc", "--version"], - universal_newlines=True) + nvcc_output = subprocess.check_output(["nvcc", "--version"], universal_newlines=True) output = nvcc_output.split() release_idx = output.index("release") + 1 self.nvcc_cuda_version = float(output[release_idx].split(",")[0]) @@ -49,15 +47,15 @@ class Test(unittest.TestCase): x = paddle.randn([bsz, vocab_size]) x = paddle.nn.functional.softmax(x) x = paddle.cast(x, "float32") - top_ps = paddle.to_tensor( - np.random.uniform(0, 1, [bsz]).astype(np.float32)) + top_ps = paddle.to_tensor(np.random.uniform(0, 1, [bsz]).astype(np.float32)) _, next_tokens = fastdeploy.model_executor.ops.gpu.air_topp_sampling( - x.cuda(), top_ps.cuda(), None, None, seed=0, k=1, mode="truncated") + x.cuda(), top_ps.cuda(), None, None, seed=0, k=1, mode="truncated" + ) print(next_tokens) less_than_zero = next_tokens >= 0 greater_than_vocab_size = next_tokens <= vocab_size accuracy = paddle.logical_and(less_than_zero, greater_than_vocab_size) - print(f'Accuracy of results: {accuracy}') + print(f"Accuracy of results: {accuracy}") if __name__ == "__main__": diff --git a/test/operators/test_cutlass_scaled_mm.py b/test/operators/test_cutlass_scaled_mm.py index 7b2a2d789..d158d115d 100644 --- a/test/operators/test_cutlass_scaled_mm.py +++ b/test/operators/test_cutlass_scaled_mm.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-""" UT for air_topp_sampling kernel """ +"""UT for air_topp_sampling kernel""" import subprocess import unittest @@ -20,11 +20,12 @@ import numpy as np import paddle from fastdeploy.model_executor.layers.quantization.ops import ( - cutlass_scaled_mm, scaled_fp8_quant) + cutlass_scaled_mm, + scaled_fp8_quant, +) class Test(unittest.TestCase): - def setUp(self): """ Initialize. @@ -35,8 +36,7 @@ class Test(unittest.TestCase): self.sm_version = self.prop.major * 10 + self.prop.minor print(self.prop) print(paddle.__git_commit__) - nvcc_output = subprocess.check_output(["nvcc", "--version"], - universal_newlines=True) + nvcc_output = subprocess.check_output(["nvcc", "--version"], universal_newlines=True) output = nvcc_output.split() release_idx = output.index("release") + 1 self.nvcc_cuda_version = float(output[release_idx].split(",")[0]) @@ -46,8 +46,7 @@ class Test(unittest.TestCase): Check cutlass_scaled_mm output. """ if self.sm_version < 89: - self.skipTest( - "cutlass_scaled_mm with fp8 input only support sm89+") + self.skipTest("cutlass_scaled_mm with fp8 input only support sm89+") M = 32 N = 1024 K = 1024 @@ -59,10 +58,8 @@ class Test(unittest.TestCase): # Ensure quantized tensors and scales are valid assert a_q.numel() > 0, "Quantized tensor 'a_q' must not be empty" assert b_q.numel() > 0, "Quantized tensor 'b_q' must not be empty" - assert a_scales.numel( - ) > 0, "Scale tensor 'a_scales' must not be empty" - assert b_scales.numel( - ) > 0, "Scale tensor 'b_scales' must not be empty" + assert a_scales.numel() > 0, "Scale tensor 'a_scales' must not be empty" + assert b_scales.numel() > 0, "Scale tensor 'b_scales' must not be empty" bias = paddle.rand([N], dtype=paddle.bfloat16) baseline = paddle.matmul(a, b, transpose_x=False, transpose_y=True) diff --git a/test/operators/test_deqant_int8_cpp_extension.py b/test/operators/test_deqant_int8_cpp_extension.py index 96a3ca421..66b639d7c 100644 --- a/test/operators/test_deqant_int8_cpp_extension.py +++ b/test/operators/test_deqant_int8_cpp_extension.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" UT for air_topp_sampling kernel """ +"""UT for air_topp_sampling kernel""" import unittest @@ -20,7 +20,6 @@ import paddle class Test(unittest.TestCase): - def setUp(self): """ Initialize. @@ -50,10 +49,7 @@ class Test(unittest.TestCase): exe.run(paddle.static.default_startup_program()) op_out = exe.run(fetch_list=[op_out])[0] func_out = self.dequant_int8_test(True) - np.testing.assert_allclose(op_out, - func_out.numpy(), - rtol=1e-04, - atol=1e-04) + np.testing.assert_allclose(op_out, func_out.numpy(), rtol=1e-04, atol=1e-04) if __name__ == "__main__": diff --git a/test/operators/test_dequant.py b/test/operators/test_dequant.py index 762a057f3..1b00380e0 100644 --- a/test/operators/test_dequant.py +++ b/test/operators/test_dequant.py @@ -12,12 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import paddle -import numpy as np -from fastdeploy.model_executor.ops.gpu import gemm_dequant -from fastdeploy.model_executor.ops.gpu import dequant_int8 -from itertools import product import unittest +from itertools import product + +import numpy as np +import paddle + +from fastdeploy.model_executor.ops.gpu import dequant_int8, gemm_dequant class Test(unittest.TestCase): @@ -43,9 +44,7 @@ class Test(unittest.TestCase): act_int_tensor = (act * 128).astype("int8") weight_int_tensor = (weight * 128).astype("int8") scale = paddle.rand([n]) - linear_out = paddle.matmul( - act_int_tensor, weight_int_tensor, transpose_y=True - ) + linear_out = paddle.matmul(act_int_tensor, weight_int_tensor, transpose_y=True) result = dequant_int8(linear_out, scale, "bfloat16") result_gemm_dequant = gemm_dequant( @@ -55,7 +54,10 @@ class Test(unittest.TestCase): out_dtype="bfloat16", ) np.testing.assert_allclose( - result.numpy(), result_gemm_dequant.numpy(), rtol=1e-05, atol=1e-05 + result.numpy(), + result_gemm_dequant.numpy(), + rtol=1e-05, + atol=1e-05, ) diff --git a/test/operators/test_fp8_fp8_half_cuda_core_gemm.py b/test/operators/test_fp8_fp8_half_cuda_core_gemm.py index 590235265..4fa257287 100644 --- a/test/operators/test_fp8_fp8_half_cuda_core_gemm.py +++ b/test/operators/test_fp8_fp8_half_cuda_core_gemm.py @@ -12,14 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" UT for fp8_fp8_half_cuda_core_gemm kernel """ +"""UT for fp8_fp8_half_cuda_core_gemm kernel""" -import paddle -import numpy as np -from fastdeploy.model_executor.ops.gpu import cutlass_fp8_fp8_half_gemm_fused -from itertools import product import os import unittest +from itertools import product + +import numpy as np +import paddle + +from fastdeploy.model_executor.ops.gpu import cutlass_fp8_fp8_half_gemm_fused class Test(unittest.TestCase): @@ -47,21 +49,17 @@ class Test(unittest.TestCase): combinations = list(product(m, nks)) for m, (n, k) in combinations: - act = ( - paddle.rand([m, k]) - .clip(min=-1 * self.E4M3_MAX_POS, max=self.E4M3_MAX_POS) - .to(paddle.float8_e4m3fn) - ) + act = paddle.rand([m, k]).clip(min=-1 * self.E4M3_MAX_POS, max=self.E4M3_MAX_POS).to(paddle.float8_e4m3fn) weight = ( - paddle.rand([n, k]) - .clip(min=-1 * self.E4M3_MAX_POS, max=self.E4M3_MAX_POS) - .to(paddle.float8_e4m3fn) + paddle.rand([n, k]).clip(min=-1 * self.E4M3_MAX_POS, max=self.E4M3_MAX_POS).to(paddle.float8_e4m3fn) ) bias = (paddle.rand([n])).to(paddle.bfloat16) scale = 1.2 result = paddle.matmul( - act.astype("bfloat16"), weight.astype("bfloat16"), transpose_y=True + act.astype("bfloat16"), + weight.astype("bfloat16"), + transpose_y=True, ) result = result * scale result = result + bias @@ -77,9 +75,7 @@ class Test(unittest.TestCase): activation_type="", ) - np.testing.assert_allclose( - result.numpy(), result_cuda.numpy(), rtol=1e-04, atol=1e-04 - ) + np.testing.assert_allclose(result.numpy(), result_cuda.numpy(), rtol=1e-04, atol=1e-04) if __name__ == "__main__": diff --git a/test/operators/test_fused_moe.py b/test/operators/test_fused_moe.py index 4303eea4e..ce78e05c1 100644 --- a/test/operators/test_fused_moe.py +++ b/test/operators/test_fused_moe.py @@ -12,20 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-""" test for moe ops """ +"""test for moe ops""" import unittest -import numpy as np +import numpy as np import paddle import paddle.nn.functional as F from paddle import nn from paddle.incubate.nn.functional import swiglu + from fastdeploy.model_executor.ops.gpu import ( + fused_expert_moe, moe_expert_dispatch, moe_expert_ffn, moe_expert_reduce, - fused_expert_moe, ) # Set random seeds for reproducibility @@ -35,7 +36,7 @@ np.random.seed(42) class Expert(nn.Layer): """A single expert layer using SwiGLU activation.""" - + def __init__(self, d_model, d_feedforward): super().__init__() self.fc1 = nn.Linear(d_model, d_feedforward * 2) # *2 for SwiGLU @@ -50,7 +51,7 @@ class Expert(nn.Layer): class TestFusedMoeConsistency(unittest.TestCase): """Test case for verifying consistency between baseline and fused MoE implementations.""" - + @classmethod def setUpClass(cls): """Class-level setup that runs once before all tests.""" @@ -77,11 +78,8 @@ class TestFusedMoeConsistency(unittest.TestCase): def init_experts(self): """Initialize expert layers and gate weights.""" - self.experts = nn.LayerList([ - Expert(self.d_model, self.d_feedforward) - for _ in range(self.num_experts) - ]) - + self.experts = nn.LayerList([Expert(self.d_model, self.d_feedforward) for _ in range(self.num_experts)]) + # Initialize gate weights self.gate = nn.Linear(self.d_model, self.num_experts) self.gate_weight = self.gate.weight.cast("float32") @@ -89,18 +87,17 @@ class TestFusedMoeConsistency(unittest.TestCase): def prepare_data(self): """Prepare input data and expert parameters.""" # Input tensor - self.x = paddle.randn( - [self.batch_size, self.seq_len, self.d_model], - dtype=self.dtype - ) - + self.x = paddle.randn([self.batch_size, self.seq_len, self.d_model], dtype=self.dtype) + # Stack expert parameters for fused operations self.w0 = paddle.stack([e.fc1.weight for e in self.experts]).astype(self.dtype) - self.b0 = paddle.stack([e.fc1.bias for e in self.experts] - ).reshape([self.num_experts, 1, -1]).astype(self.dtype) + self.b0 = ( + paddle.stack([e.fc1.bias for e in self.experts]).reshape([self.num_experts, 1, -1]).astype(self.dtype) + ) self.w1 = paddle.stack([e.fc2.weight for e in self.experts]).astype(self.dtype) - self.b1 = paddle.stack([e.fc2.bias for e in self.experts] - ).reshape([self.num_experts, 1, -1]).astype(self.dtype) + self.b1 = ( + paddle.stack([e.fc2.bias for e in self.experts]).reshape([self.num_experts, 1, -1]).astype(self.dtype) + ) def baseline_forward(self, hidden_states): """Baseline implementation processing experts sequentially.""" @@ -114,10 +111,7 @@ class TestFusedMoeConsistency(unittest.TestCase): # Initialize output final_hidden_states = paddle.zeros_like(hidden_states) - expert_mask = paddle.transpose( - F.one_hot(selected_experts, num_classes=self.num_experts), - [2, 1, 0] - ) + expert_mask = paddle.transpose(F.one_hot(selected_experts, num_classes=self.num_experts), [2, 1, 0]) # Process each expert for expert_id in range(self.num_experts): @@ -127,7 +121,7 @@ class TestFusedMoeConsistency(unittest.TestCase): current_state = paddle.index_select(hidden_states, top_x, axis=0) expert_out = self.experts[expert_id](current_state) - + current_hidden_states = expert_out * routing_weights[top_x, idx].reshape([-1, 1]) paddle.index_add_( x=final_hidden_states, @@ -152,7 +146,7 @@ class TestFusedMoeConsistency(unittest.TestCase): "None", # No activation type self.top_k, False, # Not renormalizing topk - False # Not using expert capacity + False, # Not using expert capacity ) def 
split_forward(self, hidden_states): @@ -163,7 +157,7 @@ class TestFusedMoeConsistency(unittest.TestCase): # Routing computation logits = paddle.matmul(hidden_states.cast("float32"), self.gate_weight) scores = F.softmax(logits, axis=-1) - + # Dispatch tokens to experts ( permute_input, @@ -187,7 +181,7 @@ class TestFusedMoeConsistency(unittest.TestCase): "none", False, ) - + # Combine results output = moe_expert_reduce( ffn_out, @@ -198,7 +192,7 @@ class TestFusedMoeConsistency(unittest.TestCase): norm_topk_prob=False, routed_scaling_factor=1.0, ) - + return output.reshape([batch_size, seq_len, hidden_dim]) def test_consistency(self): @@ -219,18 +213,18 @@ class TestFusedMoeConsistency(unittest.TestCase): fused_out, rtol=self.rtol, atol=self.atol, - err_msg="Baseline and fused outputs differ" + err_msg="Baseline and fused outputs differ", ) - + # Compare baseline vs split np.testing.assert_allclose( base_out, split_out, rtol=self.rtol, atol=self.atol, - err_msg="Baseline and split outputs differ" + err_msg="Baseline and split outputs differ", ) if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/test/operators/test_get_token_penalty_multi_scores.py b/test/operators/test_get_token_penalty_multi_scores.py index 6677d9e6c..e2ca91a14 100644 --- a/test/operators/test_get_token_penalty_multi_scores.py +++ b/test/operators/test_get_token_penalty_multi_scores.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" UT for air_topp_sampling kernel """ +"""UT for get_token_penalty_multi_scores kernel""" import copy import unittest @@ -21,7 +21,6 @@ import paddle class Test(unittest.TestCase): - def setUp(self): """ Initialize.
@@ -31,18 +30,36 @@ class Test(unittest.TestCase): self.vocab_size = 103424 # prompt token - prompt_ids = paddle.full(shape=[self.num_seqs, self.max_model_len], fill_value=0, dtype='int64') - prompt_lens = paddle.randint(low=0, high=100, shape=[self.num_seqs, 1], dtype='int64') - fake_tokens = paddle.randint(low=3, high=self.vocab_size, shape=[self.num_seqs, self.max_model_len], dtype='int64') + prompt_ids = paddle.full( + shape=[self.num_seqs, self.max_model_len], + fill_value=0, + dtype="int64", + ) + prompt_lens = paddle.randint(low=0, high=100, shape=[self.num_seqs, 1], dtype="int64") + fake_tokens = paddle.randint( + low=3, + high=self.vocab_size, + shape=[self.num_seqs, self.max_model_len], + dtype="int64", + ) for i in range(self.num_seqs): - prompt_ids[i, :prompt_lens[i]] = fake_tokens[i, :prompt_lens[i]] + prompt_ids[i, : prompt_lens[i]] = fake_tokens[i, : prompt_lens[i]] # generated token - pre_ids = paddle.full(shape=[self.num_seqs, self.max_model_len], fill_value=-1, dtype='int64') - step_idx = paddle.randint(low=0, high=100, shape=[self.num_seqs, 1], dtype='int64') - fake_tokens = paddle.randint(low=3, high=self.vocab_size, shape=[self.num_seqs, self.max_model_len], dtype='int64') + pre_ids = paddle.full( + shape=[self.num_seqs, self.max_model_len], + fill_value=-1, + dtype="int64", + ) + step_idx = paddle.randint(low=0, high=100, shape=[self.num_seqs, 1], dtype="int64") + fake_tokens = paddle.randint( + low=3, + high=self.vocab_size, + shape=[self.num_seqs, self.max_model_len], + dtype="int64", + ) for i in range(self.num_seqs): - pre_ids[i, :step_idx[i]] = fake_tokens[i, :step_idx[i]] + pre_ids[i, : step_idx[i]] = fake_tokens[i, : step_idx[i]] logits = paddle.randn([self.num_seqs, self.vocab_size]).cast("float32") @@ -67,7 +84,7 @@ class Test(unittest.TestCase): "penalty_score": penalty_score, "frequency_score": frequency_score, "presence_score": presence_score, - "temperature": temperature + "temperature": temperature, } def get_token_penalty_multi_scores_baseline(self): @@ -92,17 +109,29 @@ class Test(unittest.TestCase): # all penalties prompt_ids = input_data["prompt_ids"] for i in range(self.num_seqs): - prompt_ids[i, input_data["prompt_lens"][i]:] = -1 + prompt_ids[i, input_data["prompt_lens"][i] :] = -1 prompt_repeat_times = paddle.zeros([self.num_seqs, self.vocab_size + 1]).cast("int64") - prompt_repeat_times = paddle.put_along_axis(prompt_repeat_times, prompt_ids, paddle.ones_like(input_data["pre_ids"]), axis=1, reduce="add") - prompt_repeat_times = prompt_repeat_times[:, :self.vocab_size] + prompt_repeat_times = paddle.put_along_axis( + prompt_repeat_times, + prompt_ids, + paddle.ones_like(input_data["pre_ids"]), + axis=1, + reduce="add", + ) + prompt_repeat_times = prompt_repeat_times[:, : self.vocab_size] prompt_mask = prompt_repeat_times > 0 pre_ids = input_data["pre_ids"] pre_ids[pre_ids == -1] = self.vocab_size out_repeat_times = paddle.zeros([self.num_seqs, self.vocab_size + 1]).cast("int64") - out_repeat_times = paddle.put_along_axis(out_repeat_times, pre_ids, paddle.ones_like(input_data["pre_ids"]), axis=1, reduce="add") - out_repeat_times = out_repeat_times[:, :self.vocab_size] + out_repeat_times = paddle.put_along_axis( + out_repeat_times, + pre_ids, + paddle.ones_like(input_data["pre_ids"]), + axis=1, + reduce="add", + ) + out_repeat_times = out_repeat_times[:, : self.vocab_size] output_mask = out_repeat_times > 0 penalty_score = penalty_score.tile(self.vocab_size) @@ -115,26 +144,25 @@ class Test(unittest.TestCase): logits /= temperature return logits - 
def test_penalty_op(self): - """ - """ + """ """ baseline_out = self.get_token_penalty_multi_scores_baseline() - from fastdeploy.model_executor.ops.gpu import \ - get_token_penalty_multi_scores + from fastdeploy.model_executor.ops.gpu import get_token_penalty_multi_scores + logits = get_token_penalty_multi_scores( - self.input_data["pre_ids"], - self.input_data["prompt_ids"], - self.input_data["prompt_lens"], - self.input_data["logits"], - self.input_data["penalty_score"], - self.input_data["frequency_score"], - self.input_data["presence_score"], - self.input_data["temperature"], - self.input_data["bad_tokens"], - self.input_data["step_idx"], - self.input_data["min_dec_len"], - self.input_data["eos_token_id"]) + self.input_data["pre_ids"], + self.input_data["prompt_ids"], + self.input_data["prompt_lens"], + self.input_data["logits"], + self.input_data["penalty_score"], + self.input_data["frequency_score"], + self.input_data["presence_score"], + self.input_data["temperature"], + self.input_data["bad_tokens"], + self.input_data["step_idx"], + self.input_data["min_dec_len"], + self.input_data["eos_token_id"], + ) np.testing.assert_allclose(baseline_out.numpy(), logits.numpy(), rtol=1e-04, atol=1e-04) diff --git a/test/operators/test_perchannel_gemm.py b/test/operators/test_perchannel_gemm.py index 26913fe99..02bc33651 100644 --- a/test/operators/test_perchannel_gemm.py +++ b/test/operators/test_perchannel_gemm.py @@ -12,13 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" UT for per_channel_fp8_fp8_half_gemm_fused kernel """ +"""UT for per_channel_fp8_fp8_half_gemm_fused kernel""" import os -import paddle -import numpy as np -from itertools import product import unittest +from itertools import product + +import numpy as np +import paddle class Test(unittest.TestCase): @@ -39,7 +40,9 @@ class Test(unittest.TestCase): if cc < 89: self.skipTest("per_channel_fp8_fp8_half_gemm_fused only support sm89+") - from fastdeploy.model_executor.ops.gpu import per_channel_fp8_fp8_half_gemm_fused + from fastdeploy.model_executor.ops.gpu import ( + per_channel_fp8_fp8_half_gemm_fused, + ) nks = [[2048, 2048], [2048, 5504], [6144, 2048]] nks = nks + [[4096, 4096], [4096, 12800], [6144, 4096]] @@ -58,12 +61,7 @@ class Test(unittest.TestCase): channel_scale = paddle.rand(shape=[n], dtype="float32") bias = paddle.rand(shape=[n], dtype="bfloat16") - result_bf16 = ( - paddle.matmul(A_bf16, B_bf16, transpose_y=True) - * scalar_scale - * channel_scale - + bias - ) + result_bf16 = paddle.matmul(A_bf16, B_bf16, transpose_y=True) * scalar_scale * channel_scale + bias result_fp8 = per_channel_fp8_fp8_half_gemm_fused( A_fp8, B_fp8, @@ -76,12 +74,13 @@ class Test(unittest.TestCase): ) # absolute_error = paddle.abs(result_bf16 - result_fp8) # mean_absolute_error = paddle.mean(absolute_error) - relative_error = paddle.abs(result_bf16 - result_fp8) / ( - paddle.abs(result_bf16) - ) + relative_error = paddle.abs(result_bf16 - result_fp8) / (paddle.abs(result_bf16)) mean_relative_error = paddle.mean(relative_error) np.testing.assert_allclose( - mean_relative_error.numpy(), np.array([0.001]), rtol=0.001, atol=0.25 + mean_relative_error.numpy(), + np.array([0.001]), + rtol=0.001, + atol=0.25, ) diff --git a/test/operators/test_rejection_top_p_sampling.py b/test/operators/test_rejection_top_p_sampling.py index 81d9b65b7..f034763c4 100644 --- a/test/operators/test_rejection_top_p_sampling.py +++ b/test/operators/test_rejection_top_p_sampling.py @@ -13,17 +13,20 @@ 
# limitations under the License. import unittest + import numpy as np import paddle + from fastdeploy.model_executor.ops.gpu import rejection_top_p_sampling + class TestRejectionTopPSampling(unittest.TestCase): def setUp(self): """Initialize common test data""" self.batch_size = 10 self.vocab_size = 103424 paddle.seed(2023) - + # Generate test data once for all tests self.pre_norm_prob_np = np.random.rand(self.batch_size, self.vocab_size).astype(np.float32) self.paddle_pre_norm_prob = paddle.to_tensor(self.pre_norm_prob_np) @@ -32,12 +35,12 @@ class TestRejectionTopPSampling(unittest.TestCase): def test_top_p_sampling_reject_case1(self): """Test with fixed top_p=0.8 and different random seeds""" top_p_paddle = paddle.full((self.batch_size,), 0.8) - + # Test with different seeds for seed in [1024, 2033, 2033]: samples = rejection_top_p_sampling(self.paddle_norm_prob, top_p_paddle, seed) self._validate_samples(samples) - + # Basic validation self.assertTrue(paddle.all(samples >= 0)) self.assertTrue(paddle.all(samples < self.vocab_size)) @@ -46,9 +49,9 @@ class TestRejectionTopPSampling(unittest.TestCase): """Test with varying top_p values across batch""" top_p_paddle = paddle.uniform(shape=[self.batch_size], min=0.1, max=1.0) samples = rejection_top_p_sampling(self.paddle_norm_prob, top_p_paddle, -1) - + self._validate_samples(samples) - + # Additional check that we're getting different results for different top_p unique_samples = len(paddle.unique(samples)) print(f"Unique samples: {unique_samples}") @@ -58,9 +61,10 @@ class TestRejectionTopPSampling(unittest.TestCase): """Common validation for all test cases""" self.assertTrue(paddle.all(samples >= 0)) self.assertTrue(paddle.all(samples < self.vocab_size)) - + # Check dtype self.assertEqual(samples.dtype, paddle.int64) + if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() diff --git a/test/operators/test_scaled_gemm_f8_i4_f16.py b/test/operators/test_scaled_gemm_f8_i4_f16.py index 70e3aab9e..a154d1df8 100644 --- a/test/operators/test_scaled_gemm_f8_i4_f16.py +++ b/test/operators/test_scaled_gemm_f8_i4_f16.py @@ -12,11 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-""" UT for fp8_int4_gemm kernel """ +"""UT for fp8_int4_gemm kernel""" -import paddle import unittest + import numpy as np +import paddle + from fastdeploy.model_executor.ops.gpu import ( scaled_gemm_f8_i4_f16, scaled_gemm_f8_i4_f16_weight_quantize, @@ -37,9 +39,7 @@ class Test(unittest.TestCase): quant_fp8_pertensor """ scale = paddle.max(paddle.abs(tensor)) - tensor = paddle.cast( - (tensor * 448 / scale).clip(-448, 448), "float8_e4m3fn" - ).astype(tensor.dtype) + tensor = paddle.cast((tensor * 448 / scale).clip(-448, 448), "float8_e4m3fn").astype(tensor.dtype) return tensor, scale def dequant_fp8_pertensor(self, tensor, scale): @@ -56,9 +56,7 @@ class Test(unittest.TestCase): A_fp8, A_fp8_scale = self.quant_fp8_pertensor(A) B_fp8, B_fp8_scale = self.quant_fp8_pertensor(B) - processed_B, w_scale = scaled_gemm_f8_i4_f16_weight_quantize( - B_fp8, groupsize=-1, scale_dtype="float16" - ) + processed_B, w_scale = scaled_gemm_f8_i4_f16_weight_quantize(B_fp8, groupsize=-1, scale_dtype="float16") w_scale = paddle.view(w_scale, dtype) out_scale = (A_fp8_scale / 448) * (B_fp8_scale / 448) diff --git a/test/operators/test_split_fuse.py b/test/operators/test_split_fuse.py index 66132552e..ee0ea9e52 100644 --- a/test/operators/test_split_fuse.py +++ b/test/operators/test_split_fuse.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" UT for set_stop_value """ +"""UT for set_stop_value""" import paddle -from fastdeploy.model_executor.ops.gpu import get_mm_split_fuse +from fastdeploy.model_executor.ops.gpu import get_mm_split_fuse input_ids = [] image_type_ids = [] diff --git a/test/operators/test_stop_generation.py b/test/operators/test_stop_generation.py index 6218180e5..2eca9b7b5 100644 --- a/test/operators/test_stop_generation.py +++ b/test/operators/test_stop_generation.py @@ -12,8 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -""" UT for set_stop_value """ +"""UT for set_stop_value""" import paddle + from fastdeploy.model_executor.ops.gpu import set_stop_value topk_ids = paddle.randint(0, 10000, (8, 1)) diff --git a/test/operators/test_token_penalty.py b/test/operators/test_token_penalty.py index 17df9a85e..6114fb175 100644 --- a/test/operators/test_token_penalty.py +++ b/test/operators/test_token_penalty.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-""" UT for get_token_penalty """ -import paddle +"""UT for get_token_penalty""" import numpy as np +import paddle + from fastdeploy.model_executor.ops.gpu import get_token_penalty_once paddle.seed(2023) @@ -29,23 +30,17 @@ penalty_scores = paddle.to_tensor(penalty_scores) print("logits[0][pre_ids[0]]: ", logits[0][pre_ids[0]]) res = get_token_penalty_once(pre_ids, logits, penalty_scores) for i in range(8): - print("res[{}]:{}".format(i, res[i][pre_ids[i]])) + print(f"res[{i}]:{res[i][pre_ids[i]]}") input_ids = pre_ids score = paddle.index_sample(logits, input_ids) score = paddle.where(score < 0, score * penalty_scores, score / penalty_scores) -bsz = paddle.shape(logits)[ - 0 -] # TODO: Bsz as input for inference with dynamic batch_size -bsz_range = paddle.arange( - start=bsz * 0, end=bsz, step=bsz / bsz, name="bsz_range", dtype="int64" -).unsqueeze(-1) +bsz = paddle.shape(logits)[0] # TODO: Bsz as input for inference with dynamic batch_size +bsz_range = paddle.arange(start=bsz * 0, end=bsz, step=bsz / bsz, name="bsz_range", dtype="int64").unsqueeze(-1) input_ids = input_ids + bsz_range * logits.shape[-1] -res2 = paddle.scatter(logits.flatten(), input_ids.flatten(), score.flatten()).reshape( - logits.shape -) +res2 = paddle.scatter(logits.flatten(), input_ids.flatten(), score.flatten()).reshape(logits.shape) print("-------------------------------------------") for i in range(8): print(res2[i][pre_ids[i]]) diff --git a/test/operators/test_topp_sampling.py b/test/operators/test_topp_sampling.py index 62b3553dd..f446b4a8e 100644 --- a/test/operators/test_topp_sampling.py +++ b/test/operators/test_topp_sampling.py @@ -11,9 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" UT for topp_sampling """ -import paddle +"""UT for topp_sampling""" import numpy as np +import paddle + from fastdeploy.model_executor.ops.gpu import topp_sampling paddle.seed(2022) diff --git a/test/worker/test_cuda_graph.py b/test/worker/test_cuda_graph.py index 30c0dca1e..597901357 100644 --- a/test/worker/test_cuda_graph.py +++ b/test/worker/test_cuda_graph.py @@ -13,23 +13,25 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" + import paddle from fastdeploy.config import FDConfig, GraphOptimizationConfig -from fastdeploy.model_executor.graph_optimization.decorator import \ - support_graph_optimization from fastdeploy.model_executor.forward_meta import ForwardMeta +from fastdeploy.model_executor.graph_optimization.decorator import ( + support_graph_optimization, +) @support_graph_optimization class TestCase1SubLayer1(paddle.nn.Layer): - """ Sub layer 1 of test case 1 """ + """Sub layer 1 of test case 1""" def __init__(self, fd_config: FDConfig, **kwargs): super().__init__() def forward(self, _, forward_meta: ForwardMeta): - """ Sub layer1 forward pass """ + """Sub layer1 forward pass""" output = paddle.add(forward_meta.input_ids, forward_meta.input_ids) print(" SubLayer1 Output: {output}") @@ -43,7 +45,7 @@ class TestCase1SubLayer2(paddle.nn.Layer): super().__init__() def forward(self, _, forward_meta: ForwardMeta): - """ Sub layer2 forward pass """ + """Sub layer2 forward pass""" x = paddle.ones_like(forward_meta.input_ids) y = paddle.ones_like(forward_meta.input_ids) output = x + y @@ -59,21 +61,21 @@ class TestCase1SubLayer3(paddle.nn.Layer): super().__init__() def forward(self, _, forward_meta: ForwardMeta): - """ Sub layer3 forward pass """ + """Sub layer3 forward pass""" output = paddle.add(forward_meta.input_ids, forward_meta.input_ids) print(" SubLayer3 Output: {output}") return output class TestModel1(paddle.nn.Layer): - """ Tast Model """ + """Tast Model""" def __init__(self, fd_config: FDConfig, **kwargs): super().__init__() self.fd_config = fd_config def forward(self, _, forward_meta: ForwardMeta): - """ Test model for ward pass """ + """Test model for ward pass""" self.sublayer1 = TestCase1SubLayer1(self.fd_config) self.sublayer2 = TestCase1SubLayer2(self.fd_config) self.sublayer3 = TestCase1SubLayer3(self.fd_config) @@ -95,18 +97,18 @@ class TestModel1(paddle.nn.Layer): @support_graph_optimization class TestModel2(paddle.nn.Layer): - """ Tast Model """ + """Tast Model""" def __init__(self, fd_config: FDConfig, **kwargs): super().__init__() def forward(self, _, forward_meta: ForwardMeta): - """ Test model for ward pass """ + """Test model for ward pass""" return forward_meta.input_ids + forward_meta.input_ids def run_test_case(): - """ Run test case """ + """Run test case""" # Set llm config1 graph_opt_config = GraphOptimizationConfig() graph_opt_config.use_cudagraph = True @@ -128,5 +130,5 @@ def run_test_case(): print(output2) -if __name__ == '__main__': +if __name__ == "__main__": run_test_case() diff --git a/tools/deep_gemm_pre-compile/generate_config.py b/tools/deep_gemm_pre-compile/generate_config.py index ef746c425..9b66285ff 100644 --- a/tools/deep_gemm_pre-compile/generate_config.py +++ b/tools/deep_gemm_pre-compile/generate_config.py @@ -19,8 +19,7 @@ import math import os from typing import Tuple -from fastdeploy.model_executor.ops.gpu.deep_gemm.jit_kernels.gemm import \ - get_smem_config +from fastdeploy.model_executor.ops.gpu.deep_gemm.jit_kernels.gemm import get_smem_config logger = logging.getLogger(__name__) console_handler = logging.StreamHandler() @@ -40,7 +39,10 @@ def generate_kn_pairs(model_cfg: dict) -> Tuple[list, list, list]: [hidden_size, intermediate_size * 2], [intermediate_size, hidden_size], [hidden_size, hidden_size], - [hidden_size, (num_attention_heads + num_key_value_heads * 2) * head_dim], + [ + hidden_size, + (num_attention_heads + num_key_value_heads * 2) * head_dim, + ], ] grouped_gemm_contiguous_kn_pairs = [ # Moe grouped gemm contiguous @@ -53,7 +55,11 @@ 
def generate_kn_pairs(model_cfg: dict) -> Tuple[list, list, list]: [moe_intermediate_size, hidden_size], ] - return gemm_kn_pairs, grouped_gemm_contiguous_kn_pairs, grouped_gemm_masked_kn_pairs + return ( + gemm_kn_pairs, + grouped_gemm_contiguous_kn_pairs, + grouped_gemm_masked_kn_pairs, + ) def generate_json( @@ -79,9 +85,7 @@ def generate_json( NUM_STAGES = [8, 7, 6, 5, 4, 3] for num_stages in NUM_STAGES: for kn_pair in kn_pairs: - smem_config = get_smem_config( - num_stages, kn_pair[0], block_m, block_n - ) + smem_config = get_smem_config(num_stages, kn_pair[0], block_m, block_n) for tma_multicast_config in TMA_MULTICAST_CONFIGS: cfg = { "N": kn_pair[1], @@ -107,9 +111,11 @@ def main(args): with open(os.path.join(args.model, "config.json"), "r") as f: model_cfg = json.load(f) - gemm_kn_pairs, grouped_gemm_contiguous_kn_pairs, grouped_gemm_masked_kn_pairs = ( - generate_kn_pairs(model_cfg) - ) + ( + gemm_kn_pairs, + grouped_gemm_contiguous_kn_pairs, + grouped_gemm_masked_kn_pairs, + ) = generate_kn_pairs(model_cfg) num_gemm = generate_json( gemm_kn_pairs, model_cfg["moe_num_experts"], @@ -129,9 +135,7 @@ def main(args): ) logger.info(f"Configurations generated and saved to {args.output}") logger.info(f"Generated {num_gemm} gemm configuration.") - logger.info( - f"Generated {num_grouped_contiguous} grouped_gemm_contiguous configuration." - ) + logger.info(f"Generated {num_grouped_contiguous} grouped_gemm_contiguous configuration.") logger.info(f"Generated {num_grouped_masked} grouped_gemm_masked configuration.") diff --git a/tools/deep_gemm_pre-compile/pre_compile.py b/tools/deep_gemm_pre-compile/pre_compile.py index 38571f5cd..4bb74f2af 100644 --- a/tools/deep_gemm_pre-compile/pre_compile.py +++ b/tools/deep_gemm_pre-compile/pre_compile.py @@ -25,15 +25,21 @@ from tqdm import tqdm from fastdeploy.model_executor.ops.gpu.deep_gemm.jit.compiler import build from fastdeploy.model_executor.ops.gpu.deep_gemm.jit.template import ( - cpp_format, generate) -from fastdeploy.model_executor.ops.gpu.deep_gemm.jit_kernels.gemm import \ - includes as gemm_includes -from fastdeploy.model_executor.ops.gpu.deep_gemm.jit_kernels.gemm import \ - template as gemm_template -from fastdeploy.model_executor.ops.gpu.deep_gemm.jit_kernels.m_grouped_gemm import \ - includes as m_grouped_includes -from fastdeploy.model_executor.ops.gpu.deep_gemm.jit_kernels.m_grouped_gemm import \ - template as m_grouped_template + cpp_format, + generate, +) +from fastdeploy.model_executor.ops.gpu.deep_gemm.jit_kernels.gemm import ( + includes as gemm_includes, +) +from fastdeploy.model_executor.ops.gpu.deep_gemm.jit_kernels.gemm import ( + template as gemm_template, +) +from fastdeploy.model_executor.ops.gpu.deep_gemm.jit_kernels.m_grouped_gemm import ( + includes as m_grouped_includes, +) +from fastdeploy.model_executor.ops.gpu.deep_gemm.jit_kernels.m_grouped_gemm import ( + template as m_grouped_template, +) logger = logging.getLogger(__name__) console_handler = logging.StreamHandler() @@ -110,9 +116,7 @@ class CompileWorker(threading.Thread): ("smem_size", int), ) if cfg["IS_GROUPED_CONTIGUOUS"] or cfg["IS_GROUPED_MASKED"]: - keys["NUM_GROUPS"] = int( - cfg["MOE_NUM_EXPERTS"] / cfg["EXPERT_PARALLEL"] - ) + keys["NUM_GROUPS"] = int(cfg["MOE_NUM_EXPERTS"] / cfg["EXPERT_PARALLEL"]) includes = m_grouped_includes template = m_grouped_template name = "m_grouped_gemm_fp8_fp8_bf16_nt" @@ -120,7 +124,7 @@ class CompileWorker(threading.Thread): code = generate(includes, arg_defs, cpp_format(template, keys)) build(name, arg_defs, code) 
except Exception as e: - logger.error(f"Failed to compile config {cfg}: {str(e)}") + logger.error(f"Failed to compile config {cfg}: {e!s}") raise RuntimeError(e) finally: self.pbar.update(1) diff --git a/tools/deep_gemm_pre-compile/pre_compile.sh b/tools/deep_gemm_pre-compile/pre_compile.sh index 8b609dfeb..37dcd3c83 100644 --- a/tools/deep_gemm_pre-compile/pre_compile.sh +++ b/tools/deep_gemm_pre-compile/pre_compile.sh @@ -28,4 +28,4 @@ python generate_config.py \ python pre_compile.py \ --config_file=./deep_gemm_pre_compile_config.jsonl \ --expert_parallel=$EXPERT_PARALLEL \ - --num_threads=$nproc \ No newline at end of file + --num_threads=$nproc diff --git a/tools/dockerfile/Dockerfile.ci b/tools/dockerfile/Dockerfile.ci index 83ae1e980..1afb1b987 100644 --- a/tools/dockerfile/Dockerfile.ci +++ b/tools/dockerfile/Dockerfile.ci @@ -1,5 +1,5 @@ FROM ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:cuda126-dev RUN apt update && apt install -y lsof -RUN wget https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/refs/heads/develop/requirements.txt +RUN wget https://raw.githubusercontent.com/PaddlePaddle/FastDeploy/refs/heads/develop/requirements.txt RUN python -m pip install -r requirements.txt -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple && python -m pip install pytest RUN apt update && apt install -y python3.10-venv diff --git a/tools/dockerfile/docker_build.sh b/tools/dockerfile/docker_build.sh index 5bed0599d..d8e5f0ab5 100644 --- a/tools/dockerfile/docker_build.sh +++ b/tools/dockerfile/docker_build.sh @@ -3,7 +3,7 @@ PRODUCT_NAME='ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:fastdeplo cp ../../requirements.txt ./ docker build -t ${PRODUCT_NAME} -f Dockerfile.ci . \ - --network host + --network host # --build-arg HTTP_PROXY=${proxy} \ # --build-arg HTTPS_PROXY=${proxy} \ # --build-arg ftp_proxy=${proxy}