update flake8 version to support pre-commit in python3.12 (#3000)

* update flake8 version to support pre-commit in python3.12

* polish code
Zero Rains
2025-07-24 16:43:31 +08:00
committed by GitHub
parent 5151bc92c8
commit 0fb37ab7e4
30 changed files with 324 additions and 275 deletions

View File

@@ -1,5 +1,5 @@
 [flake8]
-ignore = E203, E402, E501, E731, E741, W503, W605, E722
+ignore = E203, E402, E501, E731, E741, W503, W605, E722, E231, W604, E702, E226, E221, E713, E271
 max-line-length = 119
 # E402: module level import not at top of file

View File

@@ -7,7 +7,7 @@ default_stages:
   # - manual # Run in CI
 repos:
   - repo: https://github.com/psf/black.git
-    rev: 22.8.0
+    rev: 25.1.0
     hooks:
       - id: black
        files: \.(py|pyi)$
@@ -18,7 +18,7 @@ repos:
     hooks:
       - id: isort
   - repo: https://github.com/PyCQA/flake8
-    rev: 4.0.1
+    rev: 7.0.0
     hooks:
       - id: flake8
        # code check

View File

@@ -29,7 +29,13 @@ for i in range(bs):
     ids_len = seq_lens[i, 0]
     input_ids[i, 0:ids_len] = np.random.randint(1, 10, seq_lens[i, 0], "int64")
-(x_remove_padding, cum_offsets_out, padding_offset, cu_seqlens_q, cu_seqlens_k,) = get_padding_offset(
+(
+    x_remove_padding,
+    cum_offsets_out,
+    padding_offset,
+    cu_seqlens_q,
+    cu_seqlens_k,
+) = get_padding_offset(
     paddle.to_tensor(input_ids),
     paddle.to_tensor(cum_offset),
     paddle.to_tensor(token_num),

View File

@@ -473,7 +473,10 @@ class PrefixCacheManager:
         current_time = time.time()
         self._update_matched_node_info(req_id, match_block_node, current_time)
         # 2. prepare cache
-        (gpu_recv_block_ids, gpu_extra_block_ids,) = self._prepare_cache(
+        (
+            gpu_recv_block_ids,
+            gpu_extra_block_ids,
+        ) = self._prepare_cache(
             req_id,
             input_ids,
             block_size,

View File

@@ -113,10 +113,7 @@ class CudaRTLibrary:
             Function(
                 "cudaStreamIsCapturing",
                 cudaError_t,
-                [
-                    cudaStream_t,
-                    ctypes.POINTER(cudaStreamCaptureStatus)
-                ]
+                [cudaStream_t, ctypes.POINTER(cudaStreamCaptureStatus)],
             ),
         ]
@@ -197,9 +194,8 @@ class CudaRTLibrary:
             self.funcs["cudaIpcOpenMemHandle"](ctypes.byref(devPtr), handle, cudaIpcMemLazyEnablePeerAccess)
         )
         return devPtr
     def cudaStreamIsCapturing(self, stream: cudaStream_t) -> ctypes.c_int:
         is_capturing = ctypes.c_int()
-        self.CUDART_CHECK(
-            self.funcs["cudaStreamIsCapturing"](stream, is_capturing)
-        )
+        self.CUDART_CHECK(self.funcs["cudaStreamIsCapturing"](stream, is_capturing))
         return is_capturing

View File

@@ -559,8 +559,8 @@ class EngineArgs:
             "--ips",
             type=lambda s: s.split(",") if s else None,
             default=EngineArgs.ips,
-            help=
-            "IP addresses of all nodes participating in distributed inference.")
+            help="IP addresses of all nodes participating in distributed inference.",
+        )
         # Performance tuning parameters group
         perf_group = parser.add_argument_group("Performance Tuning")

View File

@@ -41,7 +41,7 @@ class EngineClient:
         mm_processor_kwargs,
         enable_mm=False,
         reasoning_parser=None,
-        data_parallel_size=1
+        data_parallel_size=1,
     ):
         input_processor = InputPreprocessor(
             tokenizer,
@@ -55,8 +55,7 @@ class EngineClient:
         self.data_processor = input_processor.create_processor()
         self.max_model_len = max_model_len
         max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
-        array_size = min(
-            max_chips_per_node, tensor_parallel_size * data_parallel_size)
+        array_size = min(max_chips_per_node, tensor_parallel_size * data_parallel_size)
         self.worker_healthy_live_recorded_time_array = np.zeros(shape=[array_size], dtype=np.int32)
         self.worker_healthy_live_signal = IPCSignal(
             name="worker_healthy_live_signal",

View File

@@ -113,7 +113,7 @@ async def lifespan(app: FastAPI):
         args.mm_processor_kwargs,
         args.enable_mm,
         args.reasoning_parser,
-        args.data_parallel_size
+        args.data_parallel_size,
     )
     app.state.dynamic_load_weight = args.dynamic_load_weight
     chat_handler = OpenAIServingChat(engine_client, pid, args.ips)

View File

@@ -19,9 +19,10 @@ import time
 import traceback
 import uuid
 from typing import List, Optional
-import numpy as np
 import aiozmq
 import msgpack
+import numpy as np
 from aiozmq import zmq
 from fastdeploy.entrypoints.openai.protocol import (
@@ -151,7 +152,9 @@ class OpenAIServingChat:
             if request.metadata is not None:
                 enable_thinking = request.metadata.get("enable_thinking")
                 include_stop_str_in_output = request.metadata.get("include_stop_str_in_output", False)
-            enable_return_token_ids = request.return_token_ids or (request.extra_body is not None and request.extra_body.get('return_token_ids', False))
+            enable_return_token_ids = request.return_token_ids or (
+                request.extra_body is not None and request.extra_body.get("return_token_ids", False)
+            )
             while num_choices > 0:
                 try:
                     raw_data = await asyncio.wait_for(dealer.read(), timeout=10)
@@ -199,7 +202,7 @@ class OpenAIServingChat:
                         tool_calls=None,
                         prompt_token_ids=None,
                         completion_token_ids=None,
-                    )
+                    ),
                 )
                 if enable_return_token_ids:
                     choice.delta.prompt_token_ids = list(prompt_token_ids)
@@ -239,7 +242,7 @@ class OpenAIServingChat:
                     previous_num_tokens += len(output["token_ids"])
                     delta_message = DeltaMessage(
                         content=delta_text,
-                        reasoning_content=output.get("reasoning_content"), \
+                        reasoning_content=output.get("reasoning_content"),
                         prompt_token_ids=None,
                         completion_token_ids=None,
                         tool_calls=output.get("tool_call_content", []),
@@ -329,7 +332,9 @@ class OpenAIServingChat:
         final_res = None
         enable_thinking = None
         include_stop_str_in_output = False
-        enable_return_token_ids = request.return_token_ids or (request.extra_body is not None and request.extra_body.get('return_token_ids', False))
+        enable_return_token_ids = request.return_token_ids or (
+            request.extra_body is not None and request.extra_body.get("return_token_ids", False)
+        )
         try:
             dealer = await aiozmq.create_zmq_stream(zmq.DEALER, connect=f"ipc:///dev/shm/router_{self.pid}.ipc")
             dealer.write([b"", request_id.encode("utf-8")])
@@ -403,7 +408,7 @@ class OpenAIServingChat:
                 reasoning_content=output.get("reasoning_content"),
                 tool_calls=output.get("tool_call_content"),
                 prompt_token_ids=prompt_token_ids if enable_return_token_ids else None,
-                completion_token_ids=completion_token_ids if enable_return_token_ids else None,
+                completion_token_ids=(completion_token_ids if enable_return_token_ids else None),
             )
             logprobs_full_res = None
             if logprob_contents:

View File

@@ -18,9 +18,10 @@ import asyncio
 import time
 import uuid
 from typing import List
-import numpy as np
 import aiozmq
 import msgpack
+import numpy as np
 from aiozmq import zmq
 from fastdeploy.engine.request import RequestOutput
@@ -48,7 +49,6 @@ class OpenAIServingCompletion:
         else:
             self.master_ip = self.master_ip.split(",")[0]
     def _check_master(self):
         if self.master_ip is None:
             return True
@@ -238,7 +238,9 @@ class OpenAIServingCompletion:
                 model=model_name,
                 choices=choices,
             )
-        enable_return_token_ids = request.return_token_ids or (request.extra_body is not None and request.extra_body.get('return_token_ids', False))
+        enable_return_token_ids = request.return_token_ids or (
+            request.extra_body is not None and request.extra_body.get("return_token_ids", False)
+        )
         current_waiting_time = 0
         while num_choices > 0:
             try:
@@ -267,12 +269,16 @@ class OpenAIServingCompletion:
                             id=request_id,
                             created=created_time,
                             model=model_name,
-                            choices=[CompletionResponseStreamChoice(
-                                index=idx,
-                                text="",
-                                prompt_token_ids=list(prompt_batched_token_ids[idx]) if enable_return_token_ids else None,
-                                completion_token_ids=None,
-                            )]
+                            choices=[
+                                CompletionResponseStreamChoice(
+                                    index=idx,
+                                    text="",
+                                    prompt_token_ids=(
+                                        list(prompt_batched_token_ids[idx]) if enable_return_token_ids else None
+                                    ),
+                                    completion_token_ids=None,
+                                )
+                            ],
                         )
                         yield f"data: {chunk.model_dump_json(exclude_unset=True)}\n\n"
                         first_iteration[idx] = False
@@ -286,15 +292,17 @@ class OpenAIServingCompletion:
                         output = res["outputs"]
-                        choices.append(CompletionResponseStreamChoice(
-                            index=idx,
-                            text=output["text"],
-                            prompt_token_ids=None,
-                            completion_token_ids=output.get("token_ids") if enable_return_token_ids else None,
-                            tool_calls=output.get("tool_call_content"),
-                            reasoning_content=output.get("reasoning_content"),
-                            arrival_time=arrival_time
-                        ))
+                        choices.append(
+                            CompletionResponseStreamChoice(
+                                index=idx,
+                                text=output["text"],
+                                prompt_token_ids=None,
+                                completion_token_ids=(output.get("token_ids") if enable_return_token_ids else None),
+                                tool_calls=output.get("tool_call_content"),
+                                reasoning_content=output.get("reasoning_content"),
+                                arrival_time=arrival_time,
+                            )
+                        )
                         if res["finished"]:
                             if request.max_tokens is None or output_tokens[idx] + 1 != request.max_tokens:
                                 chunk.choices[0].finish_reason = "stop"
@@ -353,12 +361,14 @@ class OpenAIServingCompletion:
         created_time: int,
         model_name: str,
         prompt_batched_token_ids: list(),
-        completion_batched_token_ids: list()
+        completion_batched_token_ids: list(),
     ) -> CompletionResponse:
         choices: List[CompletionResponseChoice] = []
         num_prompt_tokens = 0
         num_generated_tokens = 0
-        enable_return_token_ids = request.return_token_ids or (request.extra_body is not None and request.extra_body.get('return_token_ids', False))
+        enable_return_token_ids = request.return_token_ids or (
+            request.extra_body is not None and request.extra_body.get("return_token_ids", False)
+        )
         for idx in range(len(final_res_batch)):
             final_res = final_res_batch[idx]
@@ -385,8 +395,8 @@ class OpenAIServingCompletion:
                     index=len(choices),
                     text=output_text,
                     prompt_token_ids=prompt_token_ids if enable_return_token_ids else None,
-                    completion_token_ids=completion_token_ids if enable_return_token_ids else None,
-                    reasoning_content=output.get('reasoning_content'),
+                    completion_token_ids=(completion_token_ids if enable_return_token_ids else None),
+                    reasoning_content=output.get("reasoning_content"),
                     tool_calls=output.get("tool_call_content"),
                     logprobs=None,
                     finish_reason=None,

View File

@@ -99,8 +99,7 @@ class ErnieProcessor(BaseDataProcessor):
         if request.prompt_token_ids is None or len(request.prompt_token_ids) == 0:
             if request.prompt is None and request.messages is None:
-                raise ValueError(
-                    f"The request should have `prompt_token_ids`, `prompt` or `messages`: {request}.")
+                raise ValueError(f"The request should have `prompt_token_ids`, `prompt` or `messages`: {request}.")
             if request.prompt is not None:
                 prompt = request.prompt if request.prompt is not None else request.messages[0]
                 prompt = prompt[0] if isinstance(prompt, list) else prompt
@@ -164,8 +163,8 @@ class ErnieProcessor(BaseDataProcessor):
            req_id = request.get("request_id", None)
            data_processor_logger.info(f"req_id:{req_id}, tokens:{tokens}, token_ids: {token_ids}")
        else:
-            request['prompt_token_ids'] = self.messages2ids(request)
-            if len(request['prompt_token_ids']) == 0:
+            request["prompt_token_ids"] = self.messages2ids(request)
+            if len(request["prompt_token_ids"]) == 0:
                raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs")
        # truncate prompts that exceed the length limit
@@ -246,8 +245,7 @@ class ErnieProcessor(BaseDataProcessor):
         if is_end:
             full_text = previous_texts + delta_text
             if enable_thinking and self.reasoning_parser:
-                reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
-                    full_text, response_dict)
+                reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict)
                 response_dict["outputs"]["text"] = text
                 response_dict["outputs"]["reasoning_content"] = reasoning_content
             else:

View File

@@ -507,5 +507,6 @@ class DataProcessor:
         tokens = self.tokenizer.tokenize(prompt_token_str)
         token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
         data_processor_logger.info(
-            f"req_id:{request.get('request_id', ''),} tokens: {tokens}, token_ids: {token_ids}")
+            f"req_id:{request.get('request_id', ''), } tokens: {tokens}, token_ids: {token_ids}"
+        )
         return token_ids

View File

@@ -239,9 +239,7 @@ class DataProcessor(BaseDataProcessor):
                 task["enable_thinking"] = kwargs.get("enable_thinking", True)
                 request.prompt_token_ids = self.messages2ids(task)
             else:
-                raise ValueError(
-                    f"The request should have `input_ids`, `text` or `messages`: {request}."
-                )
+                raise ValueError(f"The request should have `input_ids`, `text` or `messages`: {request}.")
         if len(request.prompt_token_ids) == 0:
             raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs")
         if request.get("max_tokens") is None:
@@ -281,18 +279,16 @@ class DataProcessor(BaseDataProcessor):
         data_processor_logger.info(f"Processing request {request}")
         # processing prompt_token_ids
-        if not request.get('prompt_token_ids'):
-            if 'prompt' in request:
-                request['prompt_token_ids'] = self.text2ids(request['prompt'], max_model_len).tolist()
-            elif 'messages' in request:
+        if not request.get("prompt_token_ids"):
+            if "prompt" in request:
+                request["prompt_token_ids"] = self.text2ids(request["prompt"], max_model_len).tolist()
+            elif "messages" in request:
                 if self.tokenizer.chat_template is None:
                     raise ValueError("This model does not support chat_template.")
                 request["prompt_token_ids"] = self.messages2ids(request)
             else:
-                raise ValueError(
-                    f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}"
-                )
-        if len(request['prompt_token_ids']) == 0:
+                raise ValueError(f"Request must contain 'prompt_token_ids', 'prompt', or 'messages': {request}")
+        if len(request["prompt_token_ids"]) == 0:
             raise ValueError("Invalid input: prompt_token_ids must be a non-empty sequence of token IDs")
         if request.get("max_tokens") is None:
             request["max_tokens"] = max(1, max_model_len - len(request["prompt_token_ids"]))
@@ -357,8 +353,7 @@ class DataProcessor(BaseDataProcessor):
         if is_end:
             full_text = previous_texts + delta_text
             if enable_thinking and self.reasoning_parser:
-                reasoning_content, text = self.reasoning_parser.extract_reasoning_content(
-                    full_text, response_dict)
+                reasoning_content, text = self.reasoning_parser.extract_reasoning_content(full_text, response_dict)
                 response_dict["outputs"]["text"] = text
                 response_dict["outputs"]["reasoning_content"] = reasoning_content
             else:

View File

@@ -21,8 +21,8 @@ import paddle.nn.layer
 from paddle.device.cuda import graphs
 from fastdeploy.config import FDConfig
-from fastdeploy.utils import get_logger
 from fastdeploy.distributed.communication import capture_custom_allreduce
+from fastdeploy.utils import get_logger
 logger = get_logger("cudagrpah_piecewise_backend", "cudagraph_piecewise_backend.log")
@@ -99,7 +99,7 @@ class CudaGraphPiecewiseBackend:
                 entry.runnable(**kwargs)
                 logger.debug(
                     f"[CUDA GRAPH] Warm up for batch size {padding_batch_size}, "
-                    f"finished ({n+1}/{entry.num_finished_warmup}) times"
+                    f"finished ({n + 1}/{entry.num_finished_warmup}) times"
                 )
             # Store input addresses for debug
@@ -115,7 +115,6 @@ class CudaGraphPiecewiseBackend:
                 output = entry.runnable(**kwargs)
                 new_grpah.capture_end()
             # Store output buffer
             entry.cuda_graph = new_grpah
             entry.output_buffer = paddle.zeros_like(output)

View File

@@ -17,7 +17,11 @@ dcu backend methods
 """
 from .fused_moe_triton_backends import DCUTritonWeightOnlyMoEMethod
-from .weight_only import DCUWeightOnlyLinearMethod
 from .top_p_sampling import native_top_p_sampling
+from .weight_only import DCUWeightOnlyLinearMethod
-__all__ = ["DCUTritonWeightOnlyMoEMethod", "DCUWeightOnlyLinearMethod", "native_top_p_sampling"]
+__all__ = [
+    "DCUTritonWeightOnlyMoEMethod",
+    "DCUWeightOnlyLinearMethod",
+    "native_top_p_sampling",
+]

View File

@@ -13,13 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
 import paddle
-def native_top_p_sampling(
-    probs: paddle.Tensor,
-    top_p: paddle.Tensor
-) -> tuple[paddle.Tensor, paddle.Tensor]:
+def native_top_p_sampling(probs: paddle.Tensor, top_p: paddle.Tensor) -> tuple[paddle.Tensor, paddle.Tensor]:
     sorted_indices = paddle.argsort(probs, descending=True)
     sorted_probs = paddle.sort(probs, descending=True)
     cumulative_probs = paddle.cumsum(sorted_probs, axis=-1)
@@ -30,7 +28,9 @@ def native_top_p_sampling(
     sorted_indices = sorted_indices + paddle.arange(probs.shape[0], dtype="int64").unsqueeze(-1) * probs.shape[-1]
     condition = paddle.scatter(
-        sorted_indices_to_remove.flatten(), sorted_indices.flatten(), sorted_indices_to_remove.flatten()
+        sorted_indices_to_remove.flatten(),
+        sorted_indices.flatten(),
+        sorted_indices_to_remove.flatten(),
     )
     condition = paddle.cast(condition, "bool").reshape(probs.shape)

View File

@@ -143,7 +143,13 @@ class DeepEPEngine:
             event: the event after executing the kernel (valid only if `async_finish` is set).
             hook: the receiving hook function (valid only if `return_recv_hook` is set).
         """
-        (packed_recv_x, recv_expert_count, handle, _, dispatch_hook,) = self.deepep_engine.low_latency_dispatch(
+        (
+            packed_recv_x,
+            recv_expert_count,
+            handle,
+            _,
+            dispatch_hook,
+        ) = self.deepep_engine.low_latency_dispatch(
            hidden_states,
            topk_idx,
            expertwise_scale,

View File

@@ -21,15 +21,21 @@ import fastdeploy
 from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
 from fastdeploy.model_executor.ops.gpu import (
     MoeWna16MarlinGemmApi,
-    tritonmoe_preprocess_func,
     noaux_tc,
+    tritonmoe_preprocess_func,
 )
 from ..quantization.quant_base import QuantMethodBase
-def get_moe_scores(gating_output: paddle.Tensor, n_group, topk_group, top_k,
-                   routed_scaling_factor,
-                   e_score_correction_bias) -> paddle.Tensor:
+def get_moe_scores(
+    gating_output: paddle.Tensor,
+    n_group,
+    topk_group,
+    top_k,
+    routed_scaling_factor,
+    e_score_correction_bias,
+) -> paddle.Tensor:
     """
     compute moe scores using e_score_correction_bias.
     """
@@ -45,6 +51,7 @@ def get_moe_scores(gating_output: paddle.Tensor, n_group, topk_group, top_k,
     )
     return scores
 def gptq_marlin_moe_repack(
     b_q_weight: paddle.Tensor,
     perm: paddle.Tensor,
@@ -226,10 +233,14 @@ class MarlinWeightOnlyMoEMethod(QuantMethodBase):
         topk_method = layer.topk_method
         if topk_method == "noaux_tc":
-            gate_out = get_moe_scores(gate_out, layer.n_group,
-                                      layer.topk_group, layer.top_k,
-                                      layer.routed_scaling_factor,
-                                      layer.gate_correction_bias)
+            gate_out = get_moe_scores(
+                gate_out,
+                layer.n_group,
+                layer.topk_group,
+                layer.top_k,
+                layer.routed_scaling_factor,
+                layer.gate_correction_bias,
+            )
             topk_weights, topk_ids = paddle.topk(gate_out, k=layer.top_k, axis=-1, sorted=False)
         else:

View File

@@ -609,11 +609,11 @@ class BlockWiseFP8MoEMethod(QuantMethodBase):
         from fastdeploy.model_executor.ops.gpu import tritonmoe_preprocess_func
         sorted_token_ids, expert_ids, num_tokens_post_padded = tritonmoe_preprocess_func(
-            topk_ids, num_local_experts, config["BLOCK_SIZE_M"])
+            topk_ids, num_local_experts, config["BLOCK_SIZE_M"]
+        )
         # cache13 = create_empty_tensor(tuple([token_num * top_k * max(N1, N2)]), x.dtype)
         cache13 = paddle.empty([token_num * top_k * max(N1, N2)], dtype=x.dtype)
-        intermediate_cache1 = cache13[:token_num * top_k * N1].view(
-            [token_num * top_k, N1])
+        intermediate_cache1 = cache13[: token_num * top_k * N1].view([token_num * top_k, N1])
         max_num_tokens_padded = sorted_token_ids.shape[0]
         grid = (
@@ -669,11 +669,11 @@ class BlockWiseFP8MoEMethod(QuantMethodBase):
         intermediate_cache2 = paddle.incubate.nn.functional.swiglu(intermediate_cache1)
-        intermediate_cache3 = cache13[:token_num * top_k * N2].view(
-            [token_num * top_k, N2])
+        intermediate_cache3 = cache13[: token_num * top_k * N2].view([token_num * top_k, N2])
-        grid = (ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) *
-                ceil_div(hidden_size, config["BLOCK_SIZE_N"]), )
+        grid = (
+            ceil_div(max_num_tokens_padded, config["BLOCK_SIZE_M"]) * ceil_div(hidden_size, config["BLOCK_SIZE_N"]),
+        )
         x_q, x_scale = fastdeploy.model_executor.ops.gpu.per_token_quant(
             intermediate_cache2, self.quant_config.weight_block_size[0]

View File

@@ -125,7 +125,7 @@ class FusedMoE(nn.Layer):
             self.init_moe_weights()
         logger.info(
-            f"{moe_tag}MoE config is {num_experts=}[{expert_id_offset}, {expert_id_offset+self.num_local_experts}), \
+            f"{moe_tag}MoE config is {num_experts=}[{expert_id_offset}, {expert_id_offset + self.num_local_experts}), \
             {top_k=}, hidden_size={self.hidden_size}, {moe_intermediate_size=}, \
             , ep_size={self.ep_size}, \
             tp_size={self.tp_size}."
@@ -232,17 +232,21 @@ class FusedMoE(nn.Layer):
             up_gate_proj_expert_weight_key_name = up_gate_proj_expert_weight_key.format(expert_idx)
             up_gate_proj_weights.append(
                 get_tensor(
-                    state_dict.pop(up_gate_proj_expert_weight_key_name)
-                    if up_gate_proj_expert_weight_key_name in state_dict
-                    else up_gate_proj_expert_weight_key_name,
+                    (
+                        state_dict.pop(up_gate_proj_expert_weight_key_name)
+                        if up_gate_proj_expert_weight_key_name in state_dict
+                        else up_gate_proj_expert_weight_key_name
+                    ),
                     self.fd_config.parallel_config.model_name_or_path,
                 )
             )
             down_proj_weights.append(
                 get_tensor(
-                    state_dict.pop(down_proj_expert_weight_key_name)
-                    if down_proj_expert_weight_key_name in state_dict
-                    else down_proj_expert_weight_key_name,
+                    (
+                        state_dict.pop(down_proj_expert_weight_key_name)
+                        if down_proj_expert_weight_key_name in state_dict
+                        else down_proj_expert_weight_key_name
+                    ),
                     self.fd_config.parallel_config.model_name_or_path,
                 )
             )
@@ -255,23 +259,29 @@ class FusedMoE(nn.Layer):
             up_expert_weight_key_name = up_expert_weight_key.format(expert_idx)
             down_proj_expert_weight_key_name = down_proj_expert_weight_key.format(expert_idx)
             gate = get_tensor(
-                state_dict.pop(gate_expert_weight_key_name)
-                if gate_expert_weight_key_name in state_dict
-                else gate_expert_weight_key_name,
+                (
+                    state_dict.pop(gate_expert_weight_key_name)
+                    if gate_expert_weight_key_name in state_dict
+                    else gate_expert_weight_key_name
+                ),
                 self.fd_config.parallel_config.model_name_or_path,
             )
             up = get_tensor(
-                state_dict.pop(up_expert_weight_key_name)
-                if up_expert_weight_key_name in state_dict
-                else up_expert_weight_key_name,
+                (
+                    state_dict.pop(up_expert_weight_key_name)
+                    if up_expert_weight_key_name in state_dict
+                    else up_expert_weight_key_name
+                ),
                 self.fd_config.parallel_config.model_name_or_path,
             )
             up_gate_proj_weights.append(paddle.concat([gate, up], axis=-1))
             down_proj_weights.append(
                 get_tensor(
-                    state_dict.pop(down_proj_expert_weight_key_name)
-                    if down_proj_expert_weight_key_name in state_dict
-                    else down_proj_expert_weight_key_name,
+                    (
+                        state_dict.pop(down_proj_expert_weight_key_name)
+                        if down_proj_expert_weight_key_name in state_dict
+                        else down_proj_expert_weight_key_name
+                    ),
                     self.fd_config.parallel_config.model_name_or_path,
                 )
             )

View File

@@ -54,8 +54,8 @@ def apply_penalty_multi_scores(
             eos_token_ids,
         )
     elif current_platform.is_dcu():
-        from fastdeploy.model_executor.ops.gpu import \
-            get_token_penalty_multi_scores
+        from fastdeploy.model_executor.ops.gpu import get_token_penalty_multi_scores
         logits = get_token_penalty_multi_scores(
             pre_token_ids,
             prompt_ids,

View File

@@ -81,6 +81,7 @@ def top_k_top_p_sampling(
         _, ids = gcu_top_p_sampling(x, top_p)
     elif current_platform.is_dcu():
         from fastdeploy.model_executor.layers.backends import native_top_p_sampling
         _, ids = native_top_p_sampling(x, top_p)
     else:
         _, ids = paddle.tensor.top_p_sampling(

View File

@@ -300,7 +300,13 @@ def speculate_remove_padding(
     if current_platform.is_cuda():
         cum_offsets_now = paddle.cumsum(max_len - seq_lens_this_time)
         token_num = paddle.sum(seq_lens_this_time)
-        (ids_remove_padding, cum_offsets, padding_offset, cu_seqlens_q, cu_seqlens_k,) = speculate_get_padding_offset(
+        (
+            ids_remove_padding,
+            cum_offsets,
+            padding_offset,
+            cu_seqlens_q,
+            cu_seqlens_k,
+        ) = speculate_get_padding_offset(
            input_ids,
            draft_tokens,
            cum_offsets_now,

View File

@@ -103,9 +103,9 @@ def extract_triton_kernel(kernel, file_name):
     import textwrap
     fn = kernel
-    if type(kernel) == triton.runtime.jit.JITFunction:
+    if isinstance(kernel, triton.runtime.jit.JITFunction):
         fn = kernel.fn
-    elif type(kernel) == triton.runtime.autotuner.Autotuner:
+    elif isinstance(kernel, triton.runtime.autotuner.Autotuner):
         fn = kernel.fn.fn
     else:
         AssertionError("error occurs")
@@ -195,14 +195,14 @@ def get_value_hint(x):
     """
     hint = ""
     for ele in x:
-        if type(ele) == int:
+        if isinstance(ele, int):
             if ele % 16 == 0 and ele > 0:
                 hint += "i64:16,"
             elif ele == 1:
                 hint += "i64:1,"
             else:
                 hint += "i64,"
-        if type(ele) == float:
+        if isinstance(ele, float):
             hint += "fp32,"
     return hint
@@ -467,16 +467,16 @@ def rendering_common_template(
         if arg_defaults[i] is None:
             input_and_attr += f"paddle::optional<paddle::Tensor> & {arg_names[i]},"
             paddle_input_sig += f"""paddle::Optional("{arg_names[i]}"),"""
-        elif type(arg_defaults[i]) == float:
+        elif isinstance(arg_defaults[i], float):
             input_and_attr += f"float {arg_names[i]},"
             paddle_attr_sig += f""""{arg_names[i]}: float","""
-        elif type(arg_defaults[i]) == bool:
+        elif isinstance(arg_defaults[i], bool):
             input_and_attr += f"bool {arg_names[i]},"
             paddle_attr_sig += f""""{arg_names[i]}: bool","""
-        elif type(arg_defaults[i]) == int:
+        elif isinstance(arg_defaults[i], int):
             input_and_attr += f"int64_t {arg_names[i]},"
             paddle_attr_sig += f""""{arg_names[i]}: int64_t","""
-        elif type(arg_defaults[i]) == str:
+        elif isinstance(arg_defaults[i], str):
             input_and_attr += f"std::string {arg_names[i]},"
             paddle_attr_sig += f""""{arg_names[i]}: std::string","""
         elif arg_names[i] == "config":
@@ -629,11 +629,11 @@ class KernelInterface:
         for i in range(len(all_input)):
             ele = all_input[i]
             if (
-                type(ele) == paddle.Tensor
-                or type(ele) == paddle.base.framework.EagerParamBase
-                or type(ele) == paddle.base.framework.Parameter
-                or type(ele) == paddle.base.framework.Variable
-                or type(ele) == paddle.base.libpaddle.pir.Value
+                isinstance(ele, paddle.Tensor)
+                or isinstance(ele, paddle.base.framework.EagerParamBase)
+                or isinstance(ele, paddle.base.framework.Parameter)
+                or isinstance(ele, paddle.base.framework.Variable)
+                or isinstance(ele, paddle.base.libpaddle.pir.Value)
             ):
                 dtypes.append(ele.dtype)
                 modified_arg_exclude_constexpr[i] = f"input_ptrs[{i}]"
@@ -668,7 +668,7 @@ class KernelInterface:
         lanuch_grid = list(self.grid)
         for i in range(len(lanuch_grid)):
             ele = lanuch_grid[i]
-            if type(ele) == str:
+            if isinstance(ele, str):
                 for key in const_hint_dict.keys():
                     if key in ele:
                         ele = ele.replace(key, f"{{{key}}}")

View File

@@ -153,14 +153,14 @@ class Ernie4_5_MoeForCausalLMRL(Ernie4_5_MoeForCausalLM, BaseRLModel):
         # Helper function to add layer mappings
         def _add_layer_mappings(layer_idx: int):
             # MoE specific mappings
-            self.infer_to_train_mapping[
-                f"{base_name}.{layer_idx}.mlp.fused_moe.gate_weight"
-            ] = f"{base_name}.{layer_idx}.mlp.gate.weight"
+            self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.fused_moe.gate_weight"] = (
+                f"{base_name}.{layer_idx}.mlp.gate.weight"
+            )
             if self.fd_config.model_config.moe_use_aux_free:
-                self.infer_to_train_mapping[
-                    f"{base_name}.{layer_idx}.mlp.fused_moe.gate_correction_bias"
-                ] = f"{base_name}.{layer_idx}.mlp.moe_statics.e_score_correction_bias"
+                self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.fused_moe.gate_correction_bias"] = (
+                    f"{base_name}.{layer_idx}.mlp.moe_statics.e_score_correction_bias"
+                )
             # MoE experts mappings
             for expert_idx in range(self.fd_config.model_config.moe_num_experts):
@@ -184,7 +184,8 @@ class Ernie4_5_MoeForCausalLMRL(Ernie4_5_MoeForCausalLM, BaseRLModel):
         assert isinstance(self.fd_config.model_config.moe_layer_start_index, int)
         # Process MoE layers
         for layer_idx in range(
-            self.fd_config.model_config.moe_layer_start_index, self.fd_config.model_config.num_hidden_layers
+            self.fd_config.model_config.moe_layer_start_index,
+            self.fd_config.model_config.num_hidden_layers,
         ):
             _add_layer_mappings(layer_idx)
@@ -226,9 +227,9 @@ class Ernie4_5_VLMoeForConditionalGenerationRL(Ernie4_5_VLMoeForConditionalGener
         def _add_expert_mappings(layer_idx: int, moe_tag: str, expert_start: int):
             # MoE specific mappings
             gate_suffix = "" if moe_tag == "text" else "_1"
-            self.infer_to_train_mapping[
-                f"{base_name}.{layer_idx}.mlp.{moe_tag}_fused_moe.gate_weight"
-            ] = f"{base_name}.{layer_idx}.mlp.gate.weight{gate_suffix}"
+            self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.{moe_tag}_fused_moe.gate_weight"] = (
+                f"{base_name}.{layer_idx}.mlp.gate.weight{gate_suffix}"
+            )
             if self.fd_config.model_config.moe_use_aux_free:
                 self.infer_to_train_mapping[
@@ -245,7 +246,10 @@ class Ernie4_5_VLMoeForConditionalGenerationRL(Ernie4_5_VLMoeForConditionalGener
             expert_mappings = defaultdict(list)
             for expert_idx in _generate_ranges(
-                expert_start, total_moe_num, expert_num_per_rank * 2, expert_num_per_rank
+                expert_start,
+                total_moe_num,
+                expert_num_per_rank * 2,
+                expert_num_per_rank,
             ):
                 for ph in place_holders:
                     expert_mappings[f"{base_name}.{layer_idx}.mlp.{moe_tag}_fused_moe.up_gate_proj_weight"].append(
@@ -323,9 +327,9 @@ class Qwen2ForCausalLMRL(Qwen2ForCausalLM, BaseRLModel):
         def _add_layer_mappings(layer_idx):
             # FFN mappings
             for ph in place_holders:
-                self.infer_to_train_mapping[
-                    f"{base_name}.{layer_idx}.mlp.up_gate_proj.{ph}"
-                ] = f"{base_name}.{layer_idx}.mlp.gate_up_fused_proj.{ph}"
+                self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.up_gate_proj.{ph}"] = (
+                    f"{base_name}.{layer_idx}.mlp.gate_up_fused_proj.{ph}"
+                )
         for layer_idx in range(self.fd_config.model_config.num_hidden_layers):
             _add_layer_mappings(layer_idx)
@@ -368,14 +372,14 @@ class Qwen3MoeForCausalLMRL(Qwen3MoeForCausalLM, BaseRLModel):
         # Helper function to add layer mappings
         def _add_layer_mappings(layer_idx: int):
             # MoE specific mappings
-            self.infer_to_train_mapping[
-                f"{base_name}.{layer_idx}.mlp.gate_weight"
-            ] = f"{base_name}.{layer_idx}.mlp.gate.weight"
+            self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.gate_weight"] = (
+                f"{base_name}.{layer_idx}.mlp.gate.weight"
+            )
             if self.fd_config.moe_config.moe_use_aux_free:
-                self.infer_to_train_mapping[
-                    f"{base_name}.{layer_idx}.mlp.fused_moe.gate_correction_bias"
-                ] = f"{base_name}.{layer_idx}.mlp.moe_statics.e_score_correction_bias"
+                self.infer_to_train_mapping[f"{base_name}.{layer_idx}.mlp.fused_moe.gate_correction_bias"] = (
+                    f"{base_name}.{layer_idx}.mlp.moe_statics.e_score_correction_bias"
+                )
             # MoE experts mappings
             for expert_idx in range(self.fd_config.moe_config.num_experts):

View File

@@ -6,7 +6,7 @@ known_third_party = ["paddle"]
[tool.black] [tool.black]
line-length = 119 line-length = 119
target_version = ['py35', 'py36', 'py37', 'py38', 'py39', 'py310'] target_version = ['py35', 'py36', 'py37', 'py38', 'py39', 'py310']
exclude = ['.flake8'] exclude = '.flake8'

View File

@@ -342,10 +342,12 @@ def test_streaming(openai_client, capsys):
         output.append(chunk.choices[0].text)
     assert len(output) > 0
 # ==========================
 # OpenAI Client additional chat/completions test
 # ==========================
 def test_non_streaming_with_stop_str(openai_client):
     """
     Test non-streaming chat functionality with the local service
@@ -423,12 +425,12 @@ def test_non_streaming_chat_with_return_token_ids(openai_client, capsys):
         extra_body={"return_token_ids": True},
         stream=False,
     )
-    assert hasattr(response, 'choices')
+    assert hasattr(response, "choices")
     assert len(response.choices) > 0
-    assert hasattr(response.choices[0], 'message')
+    assert hasattr(response.choices[0], "message")
-    assert hasattr(response.choices[0].message, 'prompt_token_ids')
+    assert hasattr(response.choices[0].message, "prompt_token_ids")
     assert isinstance(response.choices[0].message.prompt_token_ids, list)
-    assert hasattr(response.choices[0].message, 'completion_token_ids')
+    assert hasattr(response.choices[0].message, "completion_token_ids")
     assert isinstance(response.choices[0].message.completion_token_ids, list)
     # disable return_token_ids
@@ -440,12 +442,12 @@ def test_non_streaming_chat_with_return_token_ids(openai_client, capsys):
         extra_body={"return_token_ids": False},
         stream=False,
     )
-    assert hasattr(response, 'choices')
+    assert hasattr(response, "choices")
     assert len(response.choices) > 0
-    assert hasattr(response.choices[0], 'message')
+    assert hasattr(response.choices[0], "message")
-    assert hasattr(response.choices[0].message, 'prompt_token_ids')
+    assert hasattr(response.choices[0].message, "prompt_token_ids")
     assert response.choices[0].message.prompt_token_ids is None
-    assert hasattr(response.choices[0].message, 'completion_token_ids')
+    assert hasattr(response.choices[0].message, "completion_token_ids")
     assert response.choices[0].message.completion_token_ids is None
@@ -464,11 +466,11 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys):
     )
     is_first_chunk = True
     for chunk in response:
-        assert hasattr(chunk, 'choices')
+        assert hasattr(chunk, "choices")
         assert len(chunk.choices) > 0
-        assert hasattr(chunk.choices[0], 'delta')
+        assert hasattr(chunk.choices[0], "delta")
-        assert hasattr(chunk.choices[0].delta, 'prompt_token_ids')
+        assert hasattr(chunk.choices[0].delta, "prompt_token_ids")
-        assert hasattr(chunk.choices[0].delta, 'completion_token_ids')
+        assert hasattr(chunk.choices[0].delta, "completion_token_ids")
         if is_first_chunk:
             is_first_chunk = False
             assert isinstance(chunk.choices[0].delta.prompt_token_ids, list)
@@ -487,12 +489,12 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys):
         stream=True,
     )
     for chunk in response:
-        assert hasattr(chunk, 'choices')
+        assert hasattr(chunk, "choices")
         assert len(chunk.choices) > 0
-        assert hasattr(chunk.choices[0], 'delta')
+        assert hasattr(chunk.choices[0], "delta")
-        assert hasattr(chunk.choices[0].delta, 'prompt_token_ids')
+        assert hasattr(chunk.choices[0].delta, "prompt_token_ids")
         assert chunk.choices[0].delta.prompt_token_ids is None
-        assert hasattr(chunk.choices[0].delta, 'completion_token_ids')
+        assert hasattr(chunk.choices[0].delta, "completion_token_ids")
         assert chunk.choices[0].delta.completion_token_ids is None
@@ -509,11 +511,11 @@ def test_non_streaming_completion_with_return_token_ids(openai_client, capsys):
         extra_body={"return_token_ids": True},
         stream=False,
     )
-    assert hasattr(response, 'choices')
+    assert hasattr(response, "choices")
     assert len(response.choices) > 0
-    assert hasattr(response.choices[0], 'prompt_token_ids')
+    assert hasattr(response.choices[0], "prompt_token_ids")
     assert isinstance(response.choices[0].prompt_token_ids, list)
-    assert hasattr(response.choices[0], 'completion_token_ids')
+    assert hasattr(response.choices[0], "completion_token_ids")
     assert isinstance(response.choices[0].completion_token_ids, list)
     # disable return_token_ids
@@ -525,11 +527,11 @@ def test_non_streaming_completion_with_return_token_ids(openai_client, capsys):
         extra_body={"return_token_ids": False},
         stream=False,
     )
-    assert hasattr(response, 'choices')
+    assert hasattr(response, "choices")
     assert len(response.choices) > 0
-    assert hasattr(response.choices[0], 'prompt_token_ids')
+    assert hasattr(response.choices[0], "prompt_token_ids")
     assert response.choices[0].prompt_token_ids is None
-    assert hasattr(response.choices[0], 'completion_token_ids')
+    assert hasattr(response.choices[0], "completion_token_ids")
     assert response.choices[0].completion_token_ids is None
@@ -548,10 +550,10 @@ def test_streaming_completion_with_return_token_ids(openai_client, capsys):
     )
     is_first_chunk = True
     for chunk in response:
-        assert hasattr(chunk, 'choices')
+        assert hasattr(chunk, "choices")
         assert len(chunk.choices) > 0
-        assert hasattr(chunk.choices[0], 'prompt_token_ids')
+        assert hasattr(chunk.choices[0], "prompt_token_ids")
-        assert hasattr(chunk.choices[0], 'completion_token_ids')
+        assert hasattr(chunk.choices[0], "completion_token_ids")
         if is_first_chunk:
             is_first_chunk = False
             assert isinstance(chunk.choices[0].prompt_token_ids, list)
@@ -570,11 +572,11 @@ def test_streaming_completion_with_return_token_ids(openai_client, capsys):
         stream=True,
     )
     for chunk in response:
-        assert hasattr(chunk, 'choices')
+        assert hasattr(chunk, "choices")
         assert len(chunk.choices) > 0
-        assert hasattr(chunk.choices[0], 'prompt_token_ids')
+        assert hasattr(chunk.choices[0], "prompt_token_ids")
         assert chunk.choices[0].prompt_token_ids is None
-        assert hasattr(chunk.choices[0], 'completion_token_ids')
+        assert hasattr(chunk.choices[0], "completion_token_ids")
         assert chunk.choices[0].completion_token_ids is None
@@ -587,13 +589,13 @@ def test_non_streaming_chat_with_prompt_token_ids(openai_client, capsys):
         messages=[],
         temperature=1,
         max_tokens=5,
-        extra_body={"prompt_token_ids": [5209,626,274,45954,1071,3265,3934,1869,93937]},
+        extra_body={"prompt_token_ids": [5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937]},
         stream=False,
     )
-    assert hasattr(response, 'choices')
+    assert hasattr(response, "choices")
     assert len(response.choices) > 0
-    assert hasattr(response, 'usage')
+    assert hasattr(response, "usage")
-    assert hasattr(response.usage, 'prompt_tokens')
+    assert hasattr(response.usage, "prompt_tokens")
     assert response.usage.prompt_tokens == 9
@@ -606,17 +608,17 @@ def test_streaming_chat_with_prompt_token_ids(openai_client, capsys):
         messages=[],
         temperature=1,
         max_tokens=5,
-        extra_body={"prompt_token_ids": [5209,626,274,45954,1071,3265,3934,1869,93937]},
+        extra_body={"prompt_token_ids": [5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937]},
         stream=True,
         stream_options={"include_usage": True},
     )
     for chunk in response:
-        assert hasattr(chunk, 'choices')
+        assert hasattr(chunk, "choices")
-        assert hasattr(chunk, 'usage')
+        assert hasattr(chunk, "usage")
         if len(chunk.choices) > 0:
             assert chunk.usage is None
         else:
-            assert hasattr(chunk.usage, 'prompt_tokens')
+            assert hasattr(chunk.usage, "prompt_tokens")
             assert chunk.usage.prompt_tokens == 9
@@ -629,13 +631,13 @@ def test_non_streaming_completion_with_prompt_token_ids(openai_client, capsys):
         prompt="",
         temperature=1,
         max_tokens=5,
-        extra_body={"prompt_token_ids": [5209,626,274,45954,1071,3265,3934,1869,93937]},
+        extra_body={"prompt_token_ids": [5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937]},
         stream=False,
     )
-    assert hasattr(response, 'choices')
+    assert hasattr(response, "choices")
     assert len(response.choices) > 0
-    assert hasattr(response, 'usage')
+    assert hasattr(response, "usage")
-    assert hasattr(response.usage, 'prompt_tokens')
+    assert hasattr(response.usage, "prompt_tokens")
     assert response.usage.prompt_tokens == 9
@@ -648,16 +650,15 @@ def test_streaming_completion_with_prompt_token_ids(openai_client, capsys):
         prompt="",
         temperature=1,
         max_tokens=5,
-        extra_body={"prompt_token_ids": [5209,626,274,45954,1071,3265,3934,1869,93937]},
+        extra_body={"prompt_token_ids": [5209, 626, 274, 45954, 1071, 3265, 3934, 1869, 93937]},
         stream=True,
         stream_options={"include_usage": True},
    )
    for chunk in response:
-        assert hasattr(chunk, 'choices')
+        assert hasattr(chunk, "choices")
-        assert hasattr(chunk, 'usage')
+        assert hasattr(chunk, "usage")
        if len(chunk.choices) > 0:
            assert chunk.usage is None
        else:
-            assert hasattr(chunk.usage, 'prompt_tokens')
+            assert hasattr(chunk.usage, "prompt_tokens")
            assert chunk.usage.prompt_tokens == 9

View File

@@ -325,11 +325,11 @@ def test_streaming_chat(openai_client, capsys):
     assert len(output) > 2
 # ==========================
 # OpenAI Client additional chat/completions test
 # ==========================
 def test_non_streaming_chat_with_return_token_ids(openai_client, capsys):
     """
     Test return_token_ids option in non-streaming chat functionality with the local service
@@ -340,35 +340,33 @@ def test_non_streaming_chat_with_return_token_ids(openai_client, capsys):
         messages=[
             {
                 "role": "system",
-                "content": "You are a helpful AI assistant."
+                "content": "You are a helpful AI assistant.",
             },  # the system message is optional
             {
-                "role":
-                "user",
-                "content": [{
-                    "type": "image_url",
-                    "image_url": {
-                        "url":
-                        "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
-                        "detail": "high"
-                    }
-                }, {
-                    "type": "text",
-                    "text": "请描述图片内容"
-                }]
-            }
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
+                            "detail": "high",
+                        },
+                    },
+                    {"type": "text", "text": "请描述图片内容"},
+                ],
+            },
         ],
         temperature=1,
         max_tokens=53,
         extra_body={"return_token_ids": True},
         stream=False,
     )
-    assert hasattr(response, 'choices')
+    assert hasattr(response, "choices")
     assert len(response.choices) > 0
-    assert hasattr(response.choices[0], 'message')
+    assert hasattr(response.choices[0], "message")
-    assert hasattr(response.choices[0].message, 'prompt_token_ids')
+    assert hasattr(response.choices[0].message, "prompt_token_ids")
     assert isinstance(response.choices[0].message.prompt_token_ids, list)
-    assert hasattr(response.choices[0].message, 'completion_token_ids')
+    assert hasattr(response.choices[0].message, "completion_token_ids")
     assert isinstance(response.choices[0].message.completion_token_ids, list)
     # do not set return_token_ids
@@ -377,35 +375,33 @@ def test_non_streaming_chat_with_return_token_ids(openai_client, capsys):
         messages=[
             {
                 "role": "system",
-                "content": "You are a helpful AI assistant."
+                "content": "You are a helpful AI assistant.",
             },  # the system message is optional
             {
-                "role":
-                "user",
-                "content": [{
-                    "type": "image_url",
-                    "image_url": {
-                        "url":
-                        "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
-                        "detail": "high"
-                    }
-                }, {
-                    "type": "text",
-                    "text": "请描述图片内容"
-                }]
-            }
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
+                            "detail": "high",
+                        },
+                    },
+                    {"type": "text", "text": "请描述图片内容"},
+                ],
+            },
         ],
         temperature=1,
         max_tokens=53,
         extra_body={"return_token_ids": False},
         stream=False,
     )
-    assert hasattr(response, 'choices')
+    assert hasattr(response, "choices")
     assert len(response.choices) > 0
-    assert hasattr(response.choices[0], 'message')
+    assert hasattr(response.choices[0], "message")
-    assert hasattr(response.choices[0].message, 'prompt_token_ids')
+    assert hasattr(response.choices[0].message, "prompt_token_ids")
     assert response.choices[0].message.prompt_token_ids is None
-    assert hasattr(response.choices[0].message, 'completion_token_ids')
+    assert hasattr(response.choices[0].message, "completion_token_ids")
     assert response.choices[0].message.completion_token_ids is None
@@ -419,23 +415,21 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys):
         messages=[
             {
                 "role": "system",
-                "content": "You are a helpful AI assistant."
+                "content": "You are a helpful AI assistant.",
             },  # the system message is optional
             {
-                "role":
-                "user",
-                "content": [{
-                    "type": "image_url",
-                    "image_url": {
-                        "url":
-                        "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
-                        "detail": "high"
-                    }
-                }, {
-                    "type": "text",
-                    "text": "请描述图片内容"
-                }]
-            }
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
+                            "detail": "high",
+                        },
+                    },
+                    {"type": "text", "text": "请描述图片内容"},
+                ],
+            },
         ],
         temperature=1,
@@ -444,11 +438,11 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys):
     )
     is_first_chunk = True
     for chunk in response:
-        assert hasattr(chunk, 'choices')
+        assert hasattr(chunk, "choices")
         assert len(chunk.choices) > 0
-        assert hasattr(chunk.choices[0], 'delta')
+        assert hasattr(chunk.choices[0], "delta")
-        assert hasattr(chunk.choices[0].delta, 'prompt_token_ids')
+        assert hasattr(chunk.choices[0].delta, "prompt_token_ids")
-        assert hasattr(chunk.choices[0].delta, 'completion_token_ids')
+        assert hasattr(chunk.choices[0].delta, "completion_token_ids")
         if is_first_chunk:
             is_first_chunk = False
             assert isinstance(chunk.choices[0].delta.prompt_token_ids, list)
@@ -463,23 +457,21 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys):
         messages=[
             {
                 "role": "system",
-                "content": "You are a helpful AI assistant."
+                "content": "You are a helpful AI assistant.",
             },  # the system message is optional
             {
-                "role":
-                "user",
-                "content": [{
-                    "type": "image_url",
-                    "image_url": {
-                        "url":
-                        "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
-                        "detail": "high"
-                    }
-                }, {
-                    "type": "text",
-                    "text": "请描述图片内容"
-                }]
-            }
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg",
+                            "detail": "high",
+                        },
+                    },
+                    {"type": "text", "text": "请描述图片内容"},
+                ],
+            },
         ],
         temperature=1,
         max_tokens=53,
@@ -487,10 +479,10 @@ def test_streaming_chat_with_return_token_ids(openai_client, capsys):
         stream=True,
     )
     for chunk in response:
-        assert hasattr(chunk, 'choices')
+        assert hasattr(chunk, "choices")
         assert len(chunk.choices) > 0
-        assert hasattr(chunk.choices[0], 'delta')
+        assert hasattr(chunk.choices[0], "delta")
-        assert hasattr(chunk.choices[0].delta, 'prompt_token_ids')
+        assert hasattr(chunk.choices[0].delta, "prompt_token_ids")
         assert chunk.choices[0].delta.prompt_token_ids is None
-        assert hasattr(chunk.choices[0].delta, 'completion_token_ids')
+        assert hasattr(chunk.choices[0].delta, "completion_token_ids")
         assert chunk.choices[0].delta.completion_token_ids is None

View File

@@ -294,4 +294,6 @@ def test_non_thinking_prompt(api_url, headers):
         assert False, f"Response is not valid JSON: {e}"
     content = response_json.get("choices", [{}])[0].get("message", {}).get("content", "").lower()
-    assert not any(x in content for x in ["根据", "我认为", "推测", "可能"]), "Expected no reasoning in non-thinking response"
+    assert not any(
+        x in content for x in ["根据", "我认为", "推测", "可能"]
+    ), "Expected no reasoning in non-thinking response"